1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
3
; AArch64 NEON intrinsic declarations, grouped by intrinsic family.
; None of them is called by the functions in this part of the file;
; presumably the tests further down use them.

; pmull
declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5

; smull / umull
declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)

; sqdmull / sqadd / sqsub
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)

; sabd / uabd
declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>)
declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)

; raddhn / rsubhn
declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>)
declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>)
declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>)
declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>)
declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>)
declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>)
30
; Signed widening add: both <8 x i8> inputs are sign-extended to i16 lanes
; and added; the expected lowering is a single SADDL.
define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vaddl_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <8 x i8> %a to <8 x i16>
  %b.wide = sext <8 x i8> %b to <8 x i16>
  %sum = add <8 x i16> %a.wide, %b.wide
  ret <8 x i16> %sum
}
42
; Signed widening add: both <4 x i16> inputs are sign-extended to i32 lanes
; and added; the expected lowering is a single SADDL.
define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vaddl_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <4 x i16> %a to <4 x i32>
  %b.wide = sext <4 x i16> %b to <4 x i32>
  %sum = add <4 x i32> %a.wide, %b.wide
  ret <4 x i32> %sum
}
54
; Signed widening add: both <2 x i32> inputs are sign-extended to i64 lanes
; and added; the expected lowering is a single SADDL.
define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vaddl_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <2 x i32> %a to <2 x i64>
  %b.wide = sext <2 x i32> %b to <2 x i64>
  %sum = add <2 x i64> %a.wide, %b.wide
  ret <2 x i64> %sum
}
66
; Unsigned widening add: both <8 x i8> inputs are zero-extended to i16 lanes
; and added; the expected lowering is a single UADDL.
define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vaddl_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <8 x i8> %a to <8 x i16>
  %b.wide = zext <8 x i8> %b to <8 x i16>
  %sum = add <8 x i16> %a.wide, %b.wide
  ret <8 x i16> %sum
}
78
; Unsigned widening add: both <4 x i16> inputs are zero-extended to i32 lanes
; and added; the expected lowering is a single UADDL.
define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vaddl_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <4 x i16> %a to <4 x i32>
  %b.wide = zext <4 x i16> %b to <4 x i32>
  %sum = add <4 x i32> %a.wide, %b.wide
  ret <4 x i32> %sum
}
90
; Unsigned widening add: both <2 x i32> inputs are zero-extended to i64 lanes
; and added; the expected lowering is a single UADDL.
define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vaddl_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <2 x i32> %a to <2 x i64>
  %b.wide = zext <2 x i32> %b to <2 x i64>
  %sum = add <2 x i64> %a.wide, %b.wide
  ret <2 x i64> %sum
}
102
; Unsigned widening add whose result is masked to the low 8 bits of every
; i16 lane. The expected code still forms UADDL and then clears the upper
; byte of each lane with BIC.
define <8 x i16> @test_vaddl_a8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vaddl_a8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <8 x i8> %a to <8 x i16>
  %b.wide = zext <8 x i8> %b to <8 x i16>
  %sum = add <8 x i16> %a.wide, %b.wide
  %masked = and <8 x i16> %sum, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}
116
; Unsigned widening add whose result is masked to the low 16 bits of every
; i32 lane. The expected code forms UADDL, materializes the mask with MOVI
; and applies it with AND.
define <4 x i32> @test_vaddl_a16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vaddl_a16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <4 x i16> %a to <4 x i32>
  %b.wide = zext <4 x i16> %b to <4 x i32>
  %sum = add <4 x i32> %a.wide, %b.wide
  %masked = and <4 x i32> %sum, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}
131
; Unsigned widening add whose result is masked to the low 32 bits of every
; i64 lane. The expected code forms UADDL, materializes the mask with MOVI
; and applies it with AND.
define <2 x i64> @test_vaddl_a32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vaddl_a32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <2 x i32> %a to <2 x i64>
  %b.wide = zext <2 x i32> %b to <2 x i64>
  %sum = add <2 x i64> %a.wide, %b.wide
  %masked = and <2 x i64> %sum, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}
146
; Signed widening add of the upper halves (lanes 8-15) of two <16 x i8>
; vectors; the expected lowering is a single SADDL2.
define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vaddl_high_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddl2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %a.hi.wide = sext <8 x i8> %a.hi to <8 x i16>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = sext <8 x i8> %b.hi to <8 x i16>
  %sum = add <8 x i16> %a.hi.wide, %b.hi.wide
  ret <8 x i16> %sum
}
160
; Signed widening add of the upper halves (lanes 4-7) of two <8 x i16>
; vectors; the expected lowering is a single SADDL2.
define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddl_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddl2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %a.hi.wide = sext <4 x i16> %a.hi to <4 x i32>
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = sext <4 x i16> %b.hi to <4 x i32>
  %sum = add <4 x i32> %a.hi.wide, %b.hi.wide
  ret <4 x i32> %sum
}
174
; Signed widening add of the upper halves (lanes 2-3) of two <4 x i32>
; vectors; the expected lowering is a single SADDL2.
define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddl_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddl2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %a.hi.wide = sext <2 x i32> %a.hi to <2 x i64>
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = sext <2 x i32> %b.hi to <2 x i64>
  %sum = add <2 x i64> %a.hi.wide, %b.hi.wide
  ret <2 x i64> %sum
}
188
; Unsigned widening add of the upper halves (lanes 8-15) of two <16 x i8>
; vectors; the expected lowering is a single UADDL2.
define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vaddl_high_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %a.hi.wide = zext <8 x i8> %a.hi to <8 x i16>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = zext <8 x i8> %b.hi to <8 x i16>
  %sum = add <8 x i16> %a.hi.wide, %b.hi.wide
  ret <8 x i16> %sum
}
202
; Unsigned widening add of the upper halves (lanes 4-7) of two <8 x i16>
; vectors; the expected lowering is a single UADDL2.
define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddl_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %a.hi.wide = zext <4 x i16> %a.hi to <4 x i32>
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = zext <4 x i16> %b.hi to <4 x i32>
  %sum = add <4 x i32> %a.hi.wide, %b.hi.wide
  ret <4 x i32> %sum
}
216
; Unsigned widening add of the upper halves (lanes 2-3) of two <4 x i32>
; vectors; the expected lowering is a single UADDL2.
define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddl_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %a.hi.wide = zext <2 x i32> %a.hi to <2 x i64>
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = zext <2 x i32> %b.hi to <2 x i64>
  %sum = add <2 x i64> %a.hi.wide, %b.hi.wide
  ret <2 x i64> %sum
}
230
; Unsigned widening add of the upper halves (lanes 8-15) of two <16 x i8>
; vectors, masked to the low 8 bits of every i16 lane. The expected code
; forms UADDL2 and clears the upper byte of each lane with BIC.
define <8 x i16> @test_vaddl_high_a8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vaddl_high_a8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %a.hi.wide = zext <8 x i8> %a.hi to <8 x i16>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = zext <8 x i8> %b.hi to <8 x i16>
  %sum = add <8 x i16> %a.hi.wide, %b.hi.wide
  %masked = and <8 x i16> %sum, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}
246
; Unsigned widening add of the upper halves (lanes 4-7) of two <8 x i16>
; vectors, masked to the low 16 bits of every i32 lane. The expected code
; forms UADDL2, materializes the mask with MOVI and applies it with AND.
define <4 x i32> @test_vaddl_high_a16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddl_high_a16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %a.hi.wide = zext <4 x i16> %a.hi to <4 x i32>
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = zext <4 x i16> %b.hi to <4 x i32>
  %sum = add <4 x i32> %a.hi.wide, %b.hi.wide
  %masked = and <4 x i32> %sum, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}
263
; Unsigned widening add of the upper halves (lanes 2-3) of two <4 x i32>
; vectors, masked to the low 32 bits of every i64 lane. The expected code
; forms UADDL2, materializes the mask with MOVI and applies it with AND.
define <2 x i64> @test_vaddl_high_a32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddl_high_a32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddl2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %a.hi.wide = zext <2 x i32> %a.hi to <2 x i64>
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = zext <2 x i32> %b.hi to <2 x i64>
  %sum = add <2 x i64> %a.hi.wide, %b.hi.wide
  %masked = and <2 x i64> %sum, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}
280
; Signed wide add: the <8 x i8> operand is sign-extended and added to an
; already-wide <8 x i16> operand; the expected lowering is a single SADDW.
define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vaddw_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddw v0.8h, v0.8h, v1.8b
; CHECK-NEXT:    ret
entry:
  %b.wide = sext <8 x i8> %b to <8 x i16>
  %sum = add <8 x i16> %b.wide, %a
  ret <8 x i16> %sum
}
291
; Signed wide add: the <4 x i16> operand is sign-extended and added to an
; already-wide <4 x i32> operand; the expected lowering is a single SADDW.
define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vaddw_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    ret
entry:
  %b.wide = sext <4 x i16> %b to <4 x i32>
  %sum = add <4 x i32> %b.wide, %a
  ret <4 x i32> %sum
}
302
; Signed wide add: the <2 x i32> operand is sign-extended and added to an
; already-wide <2 x i64> operand; the expected lowering is a single SADDW.
define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vaddw_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddw v0.2d, v0.2d, v1.2s
; CHECK-NEXT:    ret
entry:
  %b.wide = sext <2 x i32> %b to <2 x i64>
  %sum = add <2 x i64> %b.wide, %a
  ret <2 x i64> %sum
}
313
; Unsigned wide add: the <8 x i8> operand is zero-extended and added to an
; already-wide <8 x i16> operand; the expected lowering is a single UADDW.
define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vaddw_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw v0.8h, v0.8h, v1.8b
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <8 x i8> %b to <8 x i16>
  %sum = add <8 x i16> %b.wide, %a
  ret <8 x i16> %sum
}
324
; Unsigned wide add: the <4 x i16> operand is zero-extended and added to an
; already-wide <4 x i32> operand; the expected lowering is a single UADDW.
define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vaddw_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <4 x i16> %b to <4 x i32>
  %sum = add <4 x i32> %b.wide, %a
  ret <4 x i32> %sum
}
335
; Unsigned wide add: the <2 x i32> operand is zero-extended and added to an
; already-wide <2 x i64> operand; the expected lowering is a single UADDW.
define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vaddw_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw v0.2d, v0.2d, v1.2s
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <2 x i32> %b to <2 x i64>
  %sum = add <2 x i64> %b.wide, %a
  ret <2 x i64> %sum
}
346
; Unsigned wide add masked to the low 8 bits of every i16 lane. The expected
; code forms UADDW and clears the upper byte of each lane with BIC.
define <8 x i16> @test_vaddw_a8(<8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vaddw_a8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw v0.8h, v0.8h, v1.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <8 x i8> %b to <8 x i16>
  %sum = add <8 x i16> %b.wide, %a
  %masked = and <8 x i16> %sum, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}
359
; Unsigned wide add masked to the low 16 bits of every i32 lane. The expected
; code forms UADDW, materializes the mask with MOVI and applies it with AND.
define <4 x i32> @test_vaddw_a16(<4 x i32> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vaddw_a16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <4 x i16> %b to <4 x i32>
  %sum = add <4 x i32> %b.wide, %a
  %masked = and <4 x i32> %sum, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}
373
; Unsigned wide add masked to the low 32 bits of every i64 lane. The expected
; code forms UADDW, materializes the mask with MOVI and applies it with AND.
define <2 x i64> @test_vaddw_a32(<2 x i64> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vaddw_a32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw v0.2d, v0.2d, v1.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <2 x i32> %b to <2 x i64>
  %sum = add <2 x i64> %b.wide, %a
  %masked = and <2 x i64> %sum, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}
387
; Signed wide add using the upper half (lanes 8-15) of a <16 x i8> vector
; as the narrow operand; the expected lowering is a single SADDW2.
define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vaddw_high_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddw2 v0.8h, v0.8h, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = sext <8 x i8> %b.hi to <8 x i16>
  %sum = add <8 x i16> %b.hi.wide, %a
  ret <8 x i16> %sum
}
399
; Signed wide add using the upper half (lanes 4-7) of an <8 x i16> vector
; as the narrow operand; the expected lowering is a single SADDW2.
define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddw_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = sext <4 x i16> %b.hi to <4 x i32>
  %sum = add <4 x i32> %b.hi.wide, %a
  ret <4 x i32> %sum
}
411
; Signed wide add using the upper half (lanes 2-3) of a <4 x i32> vector
; as the narrow operand; the expected lowering is a single SADDW2.
define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddw_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddw2 v0.2d, v0.2d, v1.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = sext <2 x i32> %b.hi to <2 x i64>
  %sum = add <2 x i64> %b.hi.wide, %a
  ret <2 x i64> %sum
}
423
; Unsigned wide add using the upper half (lanes 8-15) of a <16 x i8> vector
; as the narrow operand; the expected lowering is a single UADDW2.
define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vaddw_high_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw2 v0.8h, v0.8h, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = zext <8 x i8> %b.hi to <8 x i16>
  %sum = add <8 x i16> %b.hi.wide, %a
  ret <8 x i16> %sum
}
435
; Unsigned wide add using the upper half (lanes 4-7) of an <8 x i16> vector
; as the narrow operand; the expected lowering is a single UADDW2.
define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddw_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = zext <4 x i16> %b.hi to <4 x i32>
  %sum = add <4 x i32> %b.hi.wide, %a
  ret <4 x i32> %sum
}
447
; Unsigned wide add using the upper half (lanes 2-3) of a <4 x i32> vector
; as the narrow operand; the expected lowering is a single UADDW2.
define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddw_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw2 v0.2d, v0.2d, v1.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = zext <2 x i32> %b.hi to <2 x i64>
  %sum = add <2 x i64> %b.hi.wide, %a
  ret <2 x i64> %sum
}
459
; Unsigned wide add of the upper half (lanes 8-15) of a <16 x i8> vector,
; masked to the low 8 bits of every i16 lane. The expected code forms UADDW2
; and clears the upper byte of each lane with BIC.
define <8 x i16> @test_vaddw_high_a8(<8 x i16> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vaddw_high_a8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw2 v0.8h, v0.8h, v1.16b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = zext <8 x i8> %b.hi to <8 x i16>
  %sum = add <8 x i16> %b.hi.wide, %a
  %masked = and <8 x i16> %sum, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}
473
; Unsigned wide add of the upper half (lanes 4-7) of an <8 x i16> vector,
; masked to the low 16 bits of every i32 lane. The expected code forms
; UADDW2, materializes the mask with MOVI and applies it with AND.
define <4 x i32> @test_vaddw_high_a16(<4 x i32> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddw_high_a16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = zext <4 x i16> %b.hi to <4 x i32>
  %sum = add <4 x i32> %b.hi.wide, %a
  %masked = and <4 x i32> %sum, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}
488
; Unsigned wide add of the upper half (lanes 2-3) of a <4 x i32> vector,
; masked to the low 32 bits of every i64 lane. The expected code forms
; UADDW2, materializes the mask with MOVI and applies it with AND.
define <2 x i64> @test_vaddw_high_a32(<2 x i64> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddw_high_a32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddw2 v0.2d, v0.2d, v1.4s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = zext <2 x i32> %b.hi to <2 x i64>
  %sum = add <2 x i64> %b.hi.wide, %a
  %masked = and <2 x i64> %sum, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}
503
; Signed widening subtract: both <8 x i8> inputs are sign-extended to i16
; lanes and subtracted (a - b); the expected lowering is a single SSUBL.
define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vsubl_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <8 x i8> %a to <8 x i16>
  %b.wide = sext <8 x i8> %b to <8 x i16>
  %diff = sub <8 x i16> %a.wide, %b.wide
  ret <8 x i16> %diff
}
515
; Signed widening subtract: both <4 x i16> inputs are sign-extended to i32
; lanes and subtracted (a - b); the expected lowering is a single SSUBL.
define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vsubl_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubl v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <4 x i16> %a to <4 x i32>
  %b.wide = sext <4 x i16> %b to <4 x i32>
  %diff = sub <4 x i32> %a.wide, %b.wide
  ret <4 x i32> %diff
}
527
; Signed widening subtract: both <2 x i32> inputs are sign-extended to i64
; lanes and subtracted (a - b); the expected lowering is a single SSUBL.
define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vsubl_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <2 x i32> %a to <2 x i64>
  %b.wide = sext <2 x i32> %b to <2 x i64>
  %diff = sub <2 x i64> %a.wide, %b.wide
  ret <2 x i64> %diff
}
539
; Unsigned widening subtract: both <8 x i8> inputs are zero-extended to i16
; lanes and subtracted (a - b); the expected lowering is a single USUBL.
define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vsubl_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <8 x i8> %a to <8 x i16>
  %b.wide = zext <8 x i8> %b to <8 x i16>
  %diff = sub <8 x i16> %a.wide, %b.wide
  ret <8 x i16> %diff
}
551
; Unsigned widening subtract: both <4 x i16> inputs are zero-extended to i32
; lanes and subtracted (a - b); the expected lowering is a single USUBL.
define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vsubl_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <4 x i16> %a to <4 x i32>
  %b.wide = zext <4 x i16> %b to <4 x i32>
  %diff = sub <4 x i32> %a.wide, %b.wide
  ret <4 x i32> %diff
}
563
; Unsigned widening subtract: both <2 x i32> inputs are zero-extended to i64
; lanes and subtracted (a - b); the expected lowering is a single USUBL.
define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vsubl_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <2 x i32> %a to <2 x i64>
  %b.wide = zext <2 x i32> %b to <2 x i64>
  %diff = sub <2 x i64> %a.wide, %b.wide
  ret <2 x i64> %diff
}
575
; Unsigned widening subtract whose result is masked to the low 8 bits of
; every i16 lane. The expected code forms USUBL and clears the upper byte
; of each lane with BIC.
define <8 x i16> @test_vsubl_a8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vsubl_a8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <8 x i8> %a to <8 x i16>
  %b.wide = zext <8 x i8> %b to <8 x i16>
  %diff = sub <8 x i16> %a.wide, %b.wide
  %masked = and <8 x i16> %diff, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}
589
; Unsigned widening subtract whose result is masked to the low 16 bits of
; every i32 lane. The expected code forms USUBL, materializes the mask with
; MOVI and applies it with AND.
define <4 x i32> @test_vsubl_a16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vsubl_a16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <4 x i16> %a to <4 x i32>
  %b.wide = zext <4 x i16> %b to <4 x i32>
  %diff = sub <4 x i32> %a.wide, %b.wide
  %masked = and <4 x i32> %diff, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}
604
; Unsigned widening subtract whose result is masked to the low 32 bits of
; every i64 lane. The expected code forms USUBL, materializes the mask with
; MOVI and applies it with AND.
define <2 x i64> @test_vsubl_a32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vsubl_a32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <2 x i32> %a to <2 x i64>
  %b.wide = zext <2 x i32> %b to <2 x i64>
  %diff = sub <2 x i64> %a.wide, %b.wide
  %masked = and <2 x i64> %diff, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}
619
; Signed widening subtract of the upper halves (lanes 8-15) of two
; <16 x i8> vectors; the expected lowering is a single SSUBL2.
define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vsubl_high_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubl2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %a.hi.wide = sext <8 x i8> %a.hi to <8 x i16>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = sext <8 x i8> %b.hi to <8 x i16>
  %diff = sub <8 x i16> %a.hi.wide, %b.hi.wide
  ret <8 x i16> %diff
}
633
; Signed widening subtract of the upper halves (lanes 4-7) of two
; <8 x i16> vectors; the expected lowering is a single SSUBL2.
define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubl_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubl2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %a.hi.wide = sext <4 x i16> %a.hi to <4 x i32>
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = sext <4 x i16> %b.hi to <4 x i32>
  %diff = sub <4 x i32> %a.hi.wide, %b.hi.wide
  ret <4 x i32> %diff
}
647
; Signed widening subtract of the upper halves (lanes 2-3) of two
; <4 x i32> vectors; the expected lowering is a single SSUBL2.
define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubl_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubl2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %a.hi.wide = sext <2 x i32> %a.hi to <2 x i64>
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = sext <2 x i32> %b.hi to <2 x i64>
  %diff = sub <2 x i64> %a.hi.wide, %b.hi.wide
  ret <2 x i64> %diff
}
661
; Unsigned widening subtract of the upper halves (lanes 8-15) of two
; <16 x i8> vectors; the expected lowering is a single USUBL2.
define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vsubl_high_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %a.hi.wide = zext <8 x i8> %a.hi to <8 x i16>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = zext <8 x i8> %b.hi to <8 x i16>
  %diff = sub <8 x i16> %a.hi.wide, %b.hi.wide
  ret <8 x i16> %diff
}
675
; Unsigned widening subtract of the upper halves (lanes 4-7) of two
; <8 x i16> vectors; the expected lowering is a single USUBL2.
define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubl_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %a.hi.wide = zext <4 x i16> %a.hi to <4 x i32>
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = zext <4 x i16> %b.hi to <4 x i32>
  %diff = sub <4 x i32> %a.hi.wide, %b.hi.wide
  ret <4 x i32> %diff
}
689
; Unsigned widening subtract of the upper halves (lanes 2-3) of two
; <4 x i32> vectors; the expected lowering is a single USUBL2.
define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubl_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %a.hi.wide = zext <2 x i32> %a.hi to <2 x i64>
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = zext <2 x i32> %b.hi to <2 x i64>
  %diff = sub <2 x i64> %a.hi.wide, %b.hi.wide
  ret <2 x i64> %diff
}
703
; Unsigned widening subtract of the upper halves (lanes 8-15) of two
; <16 x i8> vectors, masked to the low 8 bits of every i16 lane. The expected
; code forms USUBL2 and clears the upper byte of each lane with BIC.
define <8 x i16> @test_vsubl_high_a8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vsubl_high_a8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %a.hi.wide = zext <8 x i8> %a.hi to <8 x i16>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = zext <8 x i8> %b.hi to <8 x i16>
  %diff = sub <8 x i16> %a.hi.wide, %b.hi.wide
  %masked = and <8 x i16> %diff, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}
719
; Unsigned widening subtract of the upper halves (lanes 4-7) of two
; <8 x i16> vectors, masked to the low 16 bits of every i32 lane. The
; expected code forms USUBL2, materializes the mask with MOVI and applies
; it with AND.
define <4 x i32> @test_vsubl_high_a16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubl_high_a16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %a.hi.wide = zext <4 x i16> %a.hi to <4 x i32>
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = zext <4 x i16> %b.hi to <4 x i32>
  %diff = sub <4 x i32> %a.hi.wide, %b.hi.wide
  %masked = and <4 x i32> %diff, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}
736
; Unsigned widening subtract of the upper halves (lanes 2-3) of two
; <4 x i32> vectors, masked to the low 32 bits of every i64 lane. The
; expected code forms USUBL2, materializes the mask with MOVI and applies
; it with AND.
define <2 x i64> @test_vsubl_high_a32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubl_high_a32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubl2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %a.hi.wide = zext <2 x i32> %a.hi to <2 x i64>
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = zext <2 x i32> %b.hi to <2 x i64>
  %diff = sub <2 x i64> %a.hi.wide, %b.hi.wide
  %masked = and <2 x i64> %diff, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}
753
; Signed wide subtract: the sign-extended <8 x i8> operand is subtracted
; from an already-wide <8 x i16> operand; the expected lowering is a single
; SSUBW.
define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vsubw_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubw v0.8h, v0.8h, v1.8b
; CHECK-NEXT:    ret
entry:
  %b.wide = sext <8 x i8> %b to <8 x i16>
  %diff = sub <8 x i16> %a, %b.wide
  ret <8 x i16> %diff
}
764
; Signed wide subtract: the sign-extended <4 x i16> operand is subtracted
; from an already-wide <4 x i32> operand; the expected lowering is a single
; SSUBW.
define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vsubw_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    ret
entry:
  %b.wide = sext <4 x i16> %b to <4 x i32>
  %diff = sub <4 x i32> %a, %b.wide
  ret <4 x i32> %diff
}
775
; Signed wide subtract: the sign-extended <2 x i32> operand is subtracted
; from an already-wide <2 x i64> operand; the expected lowering is a single
; SSUBW.
define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vsubw_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubw v0.2d, v0.2d, v1.2s
; CHECK-NEXT:    ret
entry:
  %b.wide = sext <2 x i32> %b to <2 x i64>
  %diff = sub <2 x i64> %a, %b.wide
  ret <2 x i64> %diff
}
786
; Unsigned wide subtract: the zero-extended <8 x i8> operand is subtracted
; from an already-wide <8 x i16> operand; the expected lowering is a single
; USUBW.
define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vsubw_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw v0.8h, v0.8h, v1.8b
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <8 x i8> %b to <8 x i16>
  %diff = sub <8 x i16> %a, %b.wide
  ret <8 x i16> %diff
}
797
; Zero-extends the i16 lanes of %b to i32 and subtracts them from %a.
; Expected selection: a single usubw.
define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vsubw_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <4 x i16> %b to <4 x i32>
  %diff = sub <4 x i32> %a, %b.wide
  ret <4 x i32> %diff
}
808
; Zero-extends the i32 lanes of %b to i64 and subtracts them from %a.
; Expected selection: a single usubw.
define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vsubw_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw v0.2d, v0.2d, v1.2s
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <2 x i32> %b to <2 x i64>
  %diff = sub <2 x i64> %a, %b.wide
  ret <2 x i64> %diff
}
819
; Zero-extends the i8 lanes of %b to i16, subtracts them from %a and masks each
; lane with 255. Expected selection: usubw followed by a bic that clears the
; upper byte of every lane.
define <8 x i16> @test_vsubw_a8(<8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vsubw_a8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw v0.8h, v0.8h, v1.8b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <8 x i8> %b to <8 x i16>
  %diff = sub <8 x i16> %a, %b.wide
  %masked = and <8 x i16> %diff, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}
832
; Zero-extends the i16 lanes of %b to i32, subtracts them from %a and masks each
; lane with 65535. Expected selection: usubw followed by a movi/and pair for
; the mask.
define <4 x i32> @test_vsubw_a16(<4 x i32> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vsubw_a16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <4 x i16> %b to <4 x i32>
  %diff = sub <4 x i32> %a, %b.wide
  %masked = and <4 x i32> %diff, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}
846
; Zero-extends the i32 lanes of %b to i64, subtracts them from %a and masks each
; lane with 0xffffffff. Expected selection: usubw followed by a movi/and pair
; for the mask.
define <2 x i64> @test_vsubw_a32(<2 x i64> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vsubw_a32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw v0.2d, v0.2d, v1.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.wide = zext <2 x i32> %b to <2 x i64>
  %diff = sub <2 x i64> %a, %b.wide
  %masked = and <2 x i64> %diff, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}
860
; Extracts lanes 8..15 of %b, sign-extends them to i16 and subtracts them from
; %a. Expected selection: a single ssubw2.
define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vsubw_high_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubw2 v0.8h, v0.8h, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = sext <8 x i8> %b.hi to <8 x i16>
  %diff = sub <8 x i16> %a, %b.hi.wide
  ret <8 x i16> %diff
}
872
; Extracts lanes 4..7 of %b, sign-extends them to i32 and subtracts them from
; %a. Expected selection: a single ssubw2.
define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubw_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubw2 v0.4s, v0.4s, v1.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = sext <4 x i16> %b.hi to <4 x i32>
  %diff = sub <4 x i32> %a, %b.hi.wide
  ret <4 x i32> %diff
}
884
; Extracts lanes 2..3 of %b, sign-extends them to i64 and subtracts them from
; %a. Expected selection: a single ssubw2.
define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubw_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ssubw2 v0.2d, v0.2d, v1.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = sext <2 x i32> %b.hi to <2 x i64>
  %diff = sub <2 x i64> %a, %b.hi.wide
  ret <2 x i64> %diff
}
896
; Extracts lanes 8..15 of %b, zero-extends them to i16 and subtracts them from
; %a. Expected selection: a single usubw2.
define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vsubw_high_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw2 v0.8h, v0.8h, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = zext <8 x i8> %b.hi to <8 x i16>
  %diff = sub <8 x i16> %a, %b.hi.wide
  ret <8 x i16> %diff
}
908
; Extracts lanes 4..7 of %b, zero-extends them to i32 and subtracts them from
; %a. Expected selection: a single usubw2.
define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubw_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw2 v0.4s, v0.4s, v1.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = zext <4 x i16> %b.hi to <4 x i32>
  %diff = sub <4 x i32> %a, %b.hi.wide
  ret <4 x i32> %diff
}
920
; Extracts lanes 2..3 of %b, zero-extends them to i64 and subtracts them from
; %a. Expected selection: a single usubw2.
define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubw_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw2 v0.2d, v0.2d, v1.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = zext <2 x i32> %b.hi to <2 x i64>
  %diff = sub <2 x i64> %a, %b.hi.wide
  ret <2 x i64> %diff
}
932
; Extracts lanes 8..15 of %b, zero-extends them to i16, subtracts them from %a
; and masks each lane with 255. Expected selection: usubw2 followed by a bic
; that clears the upper byte of every lane.
define <8 x i16> @test_vsubw_high_a8(<8 x i16> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vsubw_high_a8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw2 v0.8h, v0.8h, v1.16b
; CHECK-NEXT:    bic v0.8h, #255, lsl #8
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi.wide = zext <8 x i8> %b.hi to <8 x i16>
  %diff = sub <8 x i16> %a, %b.hi.wide
  %masked = and <8 x i16> %diff, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}
946
; Extracts lanes 4..7 of %b, zero-extends them to i32, subtracts them from %a
; and masks each lane with 65535. Expected selection: usubw2 followed by a
; movi/and pair for the mask.
define <4 x i32> @test_vsubw_high_a16(<4 x i32> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubw_high_a16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw2 v0.4s, v0.4s, v1.8h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi.wide = zext <4 x i16> %b.hi to <4 x i32>
  %diff = sub <4 x i32> %a, %b.hi.wide
  %masked = and <4 x i32> %diff, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}
961
; Extracts lanes 2..3 of %b, zero-extends them to i64, subtracts them from %a
; and masks each lane with 0xffffffff. Expected selection: usubw2 followed by
; a movi/and pair for the mask.
define <2 x i64> @test_vsubw_high_a32(<2 x i64> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubw_high_a32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    usubw2 v0.2d, v0.2d, v1.4s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %b.hi.wide = zext <2 x i32> %b.hi to <2 x i64>
  %diff = sub <2 x i64> %a, %b.hi.wide
  %masked = and <2 x i64> %diff, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}
976
; Adds %a and %b, shifts each i16 lane right (logically) by 8 and truncates to
; i8. Expected selection: a single addhn.
define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddhn_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addhn v0.8b, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %sum = add <8 x i16> %a, %b
  %sum.hi = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %sum.hi to <8 x i8>
  ret <8 x i8> %narrow
}
988
; Adds %a and %b, shifts each i32 lane right (logically) by 16 and truncates to
; i16. Expected selection: a single addhn.
define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddhn_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addhn v0.4h, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %sum = add <4 x i32> %a, %b
  %sum.hi = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %sum.hi to <4 x i16>
  ret <4 x i16> %narrow
}
1000
; Adds %a and %b, shifts each i64 lane right (logically) by 32 and truncates to
; i32. Expected selection: a single addhn.
define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vaddhn_s64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addhn v0.2s, v0.2d, v1.2d
; CHECK-NEXT:    ret
entry:
  %sum = add <2 x i64> %a, %b
  %sum.hi = lshr <2 x i64> %sum, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %sum.hi to <2 x i32>
  ret <2 x i32> %narrow
}
1012
; Adds %a and %b, shifts each i16 lane right (logically) by 8 and truncates to
; i8. Expected selection: a single addhn.
define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddhn_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addhn v0.8b, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %sum = add <8 x i16> %a, %b
  %sum.hi = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %sum.hi to <8 x i8>
  ret <8 x i8> %narrow
}
1024
; Adds %a and %b, shifts each i32 lane right (logically) by 16 and truncates to
; i16. Expected selection: a single addhn.
define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddhn_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addhn v0.4h, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %sum = add <4 x i32> %a, %b
  %sum.hi = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %sum.hi to <4 x i16>
  ret <4 x i16> %narrow
}
1036
; Adds %a and %b, shifts each i64 lane right (logically) by 32 and truncates to
; i32. Expected selection: a single addhn.
define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vaddhn_u64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addhn v0.2s, v0.2d, v1.2d
; CHECK-NEXT:    ret
entry:
  %sum = add <2 x i64> %a, %b
  %sum.hi = lshr <2 x i64> %sum, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %sum.hi to <2 x i32>
  ret <2 x i32> %narrow
}
1048
; Computes the add/lshr-by-8/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single addhn2 writing into the register holding %r.
define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddhn_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    addhn2 v0.16b, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %sum = add <8 x i16> %a, %b
  %sum.hi = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %sum.hi to <8 x i8>
  %lo64 = bitcast <8 x i8> %r to <1 x i64>
  %hi64 = bitcast <8 x i8> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <16 x i8>
  ret <16 x i8> %result
}
1065
; Computes the add/lshr-by-16/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single addhn2 writing into the register holding %r.
define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddhn_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    addhn2 v0.8h, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %sum = add <4 x i32> %a, %b
  %sum.hi = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %sum.hi to <4 x i16>
  %lo64 = bitcast <4 x i16> %r to <1 x i64>
  %hi64 = bitcast <4 x i16> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <8 x i16>
  ret <8 x i16> %result
}
1082
; Computes the add/lshr-by-32/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single addhn2 writing into the register holding %r.
define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vaddhn_high_s64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    addhn2 v0.4s, v1.2d, v2.2d
; CHECK-NEXT:    ret
entry:
  %sum = add <2 x i64> %a, %b
  %sum.hi = lshr <2 x i64> %sum, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %sum.hi to <2 x i32>
  %lo64 = bitcast <2 x i32> %r to <1 x i64>
  %hi64 = bitcast <2 x i32> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <4 x i32>
  ret <4 x i32> %result
}
1099
; Computes the add/lshr-by-8/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single addhn2 writing into the register holding %r.
define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vaddhn_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    addhn2 v0.16b, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %sum = add <8 x i16> %a, %b
  %sum.hi = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %sum.hi to <8 x i8>
  %lo64 = bitcast <8 x i8> %r to <1 x i64>
  %hi64 = bitcast <8 x i8> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <16 x i8>
  ret <16 x i8> %result
}
1116
; Computes the add/lshr-by-16/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single addhn2 writing into the register holding %r.
define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vaddhn_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    addhn2 v0.8h, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %sum = add <4 x i32> %a, %b
  %sum.hi = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %sum.hi to <4 x i16>
  %lo64 = bitcast <4 x i16> %r to <1 x i64>
  %hi64 = bitcast <4 x i16> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <8 x i16>
  ret <8 x i16> %result
}
1133
; Computes the add/lshr-by-32/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single addhn2 writing into the register holding %r.
define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vaddhn_high_u64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    addhn2 v0.4s, v1.2d, v2.2d
; CHECK-NEXT:    ret
entry:
  %sum = add <2 x i64> %a, %b
  %sum.hi = lshr <2 x i64> %sum, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %sum.hi to <2 x i32>
  %lo64 = bitcast <2 x i32> %r to <1 x i64>
  %hi64 = bitcast <2 x i32> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <4 x i32>
  ret <4 x i32> %result
}
1150
; Calls the llvm.aarch64.neon.raddhn.v8i8 intrinsic on %a and %b.
; Expected selection: a single raddhn.
define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vraddhn_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    raddhn v0.8b, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i8> %narrow
}
1160
; Calls the llvm.aarch64.neon.raddhn.v4i16 intrinsic on %a and %b.
; Expected selection: a single raddhn.
define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vraddhn_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    raddhn v0.4h, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  ret <4 x i16> %narrow
}
1170
; Calls the llvm.aarch64.neon.raddhn.v2i32 intrinsic on %a and %b.
; Expected selection: a single raddhn.
define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vraddhn_s64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    raddhn v0.2s, v0.2d, v1.2d
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  ret <2 x i32> %narrow
}
1180
; Calls the llvm.aarch64.neon.raddhn.v8i8 intrinsic on %a and %b.
; Expected selection: a single raddhn.
define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vraddhn_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    raddhn v0.8b, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i8> %narrow
}
1190
; Calls the llvm.aarch64.neon.raddhn.v4i16 intrinsic on %a and %b.
; Expected selection: a single raddhn.
define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vraddhn_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    raddhn v0.4h, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  ret <4 x i16> %narrow
}
1200
; Calls the llvm.aarch64.neon.raddhn.v2i32 intrinsic on %a and %b.
; Expected selection: a single raddhn.
define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vraddhn_u64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    raddhn v0.2s, v0.2d, v1.2d
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  ret <2 x i32> %narrow
}
1210
; Calls llvm.aarch64.neon.raddhn.v8i8 on %a and %b, then builds a <2 x i64>
; whose element 0 is %r and whose element 1 is the intrinsic result.
; Expected selection: a single raddhn2 writing into the register holding %r.
define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vraddhn_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    raddhn2 v0.16b, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  %lo64 = bitcast <8 x i8> %r to <1 x i64>
  %hi64 = bitcast <8 x i8> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <16 x i8>
  ret <16 x i8> %result
}
1225
; Calls llvm.aarch64.neon.raddhn.v4i16 on %a and %b, then builds a <2 x i64>
; whose element 0 is %r and whose element 1 is the intrinsic result.
; Expected selection: a single raddhn2 writing into the register holding %r.
define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vraddhn_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    raddhn2 v0.8h, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  %lo64 = bitcast <4 x i16> %r to <1 x i64>
  %hi64 = bitcast <4 x i16> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <8 x i16>
  ret <8 x i16> %result
}
1240
; Calls llvm.aarch64.neon.raddhn.v2i32 on %a and %b, then builds a <2 x i64>
; whose element 0 is %r and whose element 1 is the intrinsic result.
; Expected selection: a single raddhn2 writing into the register holding %r.
define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vraddhn_high_s64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    raddhn2 v0.4s, v1.2d, v2.2d
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  %lo64 = bitcast <2 x i32> %r to <1 x i64>
  %hi64 = bitcast <2 x i32> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <4 x i32>
  ret <4 x i32> %result
}
1255
; Calls llvm.aarch64.neon.raddhn.v8i8 on %a and %b, then builds a <2 x i64>
; whose element 0 is %r and whose element 1 is the intrinsic result.
; Expected selection: a single raddhn2 writing into the register holding %r.
define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vraddhn_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    raddhn2 v0.16b, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  %lo64 = bitcast <8 x i8> %r to <1 x i64>
  %hi64 = bitcast <8 x i8> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <16 x i8>
  ret <16 x i8> %result
}
1270
; Calls llvm.aarch64.neon.raddhn.v4i16 on %a and %b, then builds a <2 x i64>
; whose element 0 is %r and whose element 1 is the intrinsic result.
; Expected selection: a single raddhn2 writing into the register holding %r.
define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vraddhn_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    raddhn2 v0.8h, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  %lo64 = bitcast <4 x i16> %r to <1 x i64>
  %hi64 = bitcast <4 x i16> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <8 x i16>
  ret <8 x i16> %result
}
1285
; Calls llvm.aarch64.neon.raddhn.v2i32 on %a and %b, then builds a <2 x i64>
; whose element 0 is %r and whose element 1 is the intrinsic result.
; Expected selection: a single raddhn2 writing into the register holding %r.
define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vraddhn_high_u64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    raddhn2 v0.4s, v1.2d, v2.2d
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  %lo64 = bitcast <2 x i32> %r to <1 x i64>
  %hi64 = bitcast <2 x i32> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <4 x i32>
  ret <4 x i32> %result
}
1300
; Subtracts %b from %a, shifts each i16 lane right (logically) by 8 and
; truncates to i8. Expected selection: a single subhn.
define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubhn_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    subhn v0.8b, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %diff = sub <8 x i16> %a, %b
  %diff.hi = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %diff.hi to <8 x i8>
  ret <8 x i8> %narrow
}
1312
; Subtracts %b from %a, shifts each i32 lane right (logically) by 16 and
; truncates to i16. Expected selection: a single subhn.
define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubhn_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    subhn v0.4h, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %diff = sub <4 x i32> %a, %b
  %diff.hi = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %diff.hi to <4 x i16>
  ret <4 x i16> %narrow
}
1324
; Subtracts %b from %a, shifts each i64 lane right (logically) by 32 and
; truncates to i32. Expected selection: a single subhn.
define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vsubhn_s64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    subhn v0.2s, v0.2d, v1.2d
; CHECK-NEXT:    ret
entry:
  %diff = sub <2 x i64> %a, %b
  %diff.hi = lshr <2 x i64> %diff, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %diff.hi to <2 x i32>
  ret <2 x i32> %narrow
}
1336
; Subtracts %b from %a, shifts each i16 lane right (logically) by 8 and
; truncates to i8. Expected selection: a single subhn.
define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubhn_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    subhn v0.8b, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %diff = sub <8 x i16> %a, %b
  %diff.hi = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %diff.hi to <8 x i8>
  ret <8 x i8> %narrow
}
1348
; Subtracts %b from %a, shifts each i32 lane right (logically) by 16 and
; truncates to i16. Expected selection: a single subhn.
define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubhn_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    subhn v0.4h, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %diff = sub <4 x i32> %a, %b
  %diff.hi = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %diff.hi to <4 x i16>
  ret <4 x i16> %narrow
}
1360
; Subtracts %b from %a, shifts each i64 lane right (logically) by 32 and
; truncates to i32. Expected selection: a single subhn.
define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vsubhn_u64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    subhn v0.2s, v0.2d, v1.2d
; CHECK-NEXT:    ret
entry:
  %diff = sub <2 x i64> %a, %b
  %diff.hi = lshr <2 x i64> %diff, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %diff.hi to <2 x i32>
  ret <2 x i32> %narrow
}
1372
; Computes the sub/lshr-by-8/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single subhn2 writing into the register holding %r.
define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubhn_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    subhn2 v0.16b, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %diff = sub <8 x i16> %a, %b
  %diff.hi = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %diff.hi to <8 x i8>
  %lo64 = bitcast <8 x i8> %r to <1 x i64>
  %hi64 = bitcast <8 x i8> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <16 x i8>
  ret <16 x i8> %result
}
1389
; Computes the sub/lshr-by-16/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single subhn2 writing into the register holding %r.
define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubhn_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    subhn2 v0.8h, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %diff = sub <4 x i32> %a, %b
  %diff.hi = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %diff.hi to <4 x i16>
  %lo64 = bitcast <4 x i16> %r to <1 x i64>
  %hi64 = bitcast <4 x i16> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <8 x i16>
  ret <8 x i16> %result
}
1406
; Computes the sub/lshr-by-32/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single subhn2 writing into the register holding %r.
define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vsubhn_high_s64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    subhn2 v0.4s, v1.2d, v2.2d
; CHECK-NEXT:    ret
entry:
  %diff = sub <2 x i64> %a, %b
  %diff.hi = lshr <2 x i64> %diff, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %diff.hi to <2 x i32>
  %lo64 = bitcast <2 x i32> %r to <1 x i64>
  %hi64 = bitcast <2 x i32> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <4 x i32>
  ret <4 x i32> %result
}
1423
; Computes the sub/lshr-by-8/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single subhn2 writing into the register holding %r.
define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vsubhn_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    subhn2 v0.16b, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %diff = sub <8 x i16> %a, %b
  %diff.hi = lshr <8 x i16> %diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %diff.hi to <8 x i8>
  %lo64 = bitcast <8 x i8> %r to <1 x i64>
  %hi64 = bitcast <8 x i8> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <16 x i8>
  ret <16 x i8> %result
}
1440
; Computes the sub/lshr-by-16/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single subhn2 writing into the register holding %r.
define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vsubhn_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    subhn2 v0.8h, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %diff = sub <4 x i32> %a, %b
  %diff.hi = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
  %narrow = trunc <4 x i32> %diff.hi to <4 x i16>
  %lo64 = bitcast <4 x i16> %r to <1 x i64>
  %hi64 = bitcast <4 x i16> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <8 x i16>
  ret <8 x i16> %result
}
1457
; Computes the sub/lshr-by-32/trunc narrowing of %a and %b, then builds a
; <2 x i64> whose element 0 is %r and whose element 1 is the narrowed result.
; Expected selection: a single subhn2 writing into the register holding %r.
define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vsubhn_high_u64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    subhn2 v0.4s, v1.2d, v2.2d
; CHECK-NEXT:    ret
entry:
  %diff = sub <2 x i64> %a, %b
  %diff.hi = lshr <2 x i64> %diff, <i64 32, i64 32>
  %narrow = trunc <2 x i64> %diff.hi to <2 x i32>
  %lo64 = bitcast <2 x i32> %r to <1 x i64>
  %hi64 = bitcast <2 x i32> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <4 x i32>
  ret <4 x i32> %result
}
1474
; Calls the llvm.aarch64.neon.rsubhn.v8i8 intrinsic on %a and %b.
; Expected selection: a single rsubhn.
define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vrsubhn_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rsubhn v0.8b, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i8> %narrow
}
1484
; Calls the llvm.aarch64.neon.rsubhn.v4i16 intrinsic on %a and %b.
; Expected selection: a single rsubhn.
define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vrsubhn_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rsubhn v0.4h, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  ret <4 x i16> %narrow
}
1494
; Calls the llvm.aarch64.neon.rsubhn.v2i32 intrinsic on %a and %b.
; Expected selection: a single rsubhn.
define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vrsubhn_s64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rsubhn v0.2s, v0.2d, v1.2d
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  ret <2 x i32> %narrow
}
1504
; Calls the llvm.aarch64.neon.rsubhn.v8i8 intrinsic on %a and %b.
; Expected selection: a single rsubhn.
define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vrsubhn_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rsubhn v0.8b, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  ret <8 x i8> %narrow
}
1514
; Calls the llvm.aarch64.neon.rsubhn.v4i16 intrinsic on %a and %b.
; Expected selection: a single rsubhn.
define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vrsubhn_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rsubhn v0.4h, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  ret <4 x i16> %narrow
}
1524
; Calls the llvm.aarch64.neon.rsubhn.v2i32 intrinsic on %a and %b.
; Expected selection: a single rsubhn.
define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vrsubhn_u64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rsubhn v0.2s, v0.2d, v1.2d
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  ret <2 x i32> %narrow
}
1534
; Calls llvm.aarch64.neon.rsubhn.v8i8 on %a and %b, then builds a <2 x i64>
; whose element 0 is %r and whose element 1 is the intrinsic result.
; Expected selection: a single rsubhn2 writing into the register holding %r.
define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vrsubhn_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    rsubhn2 v0.16b, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  %lo64 = bitcast <8 x i8> %r to <1 x i64>
  %hi64 = bitcast <8 x i8> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <16 x i8>
  ret <16 x i8> %result
}
1549
; Calls llvm.aarch64.neon.rsubhn.v4i16 on %a and %b, then builds a <2 x i64>
; whose element 0 is %r and whose element 1 is the intrinsic result.
; Expected selection: a single rsubhn2 writing into the register holding %r.
define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vrsubhn_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    rsubhn2 v0.8h, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  %lo64 = bitcast <4 x i16> %r to <1 x i64>
  %hi64 = bitcast <4 x i16> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <8 x i16>
  ret <8 x i16> %result
}
1564
; Calls llvm.aarch64.neon.rsubhn.v2i32 on %a and %b, then builds a <2 x i64>
; whose element 0 is %r and whose element 1 is the intrinsic result.
; Expected selection: a single rsubhn2 writing into the register holding %r.
define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vrsubhn_high_s64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    rsubhn2 v0.4s, v1.2d, v2.2d
; CHECK-NEXT:    ret
entry:
  %narrow = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  %lo64 = bitcast <2 x i32> %r to <1 x i64>
  %hi64 = bitcast <2 x i32> %narrow to <1 x i64>
  %joined = shufflevector <1 x i64> %lo64, <1 x i64> %hi64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <4 x i32>
  ret <4 x i32> %result
}
1579
; Calls the rsubhn.v8i8 intrinsic on %a and %b and concatenates the result
; after %r: element 0 of the <2 x i64> shuffle is %r, element 1 is the
; intrinsic result. The expected assembly is one rsubhn2 writing into v0.
define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vrsubhn_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    rsubhn2 v0.16b, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %narrowed = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
  ; Reinterpret each 64-bit vector as one i64 lane so the two can be joined.
  %r.as.i64 = bitcast <8 x i8> %r to <1 x i64>
  %narrowed.as.i64 = bitcast <8 x i8> %narrowed to <1 x i64>
  %joined = shufflevector <1 x i64> %r.as.i64, <1 x i64> %narrowed.as.i64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <16 x i8>
  ret <16 x i8> %result
}
1594
; Calls the rsubhn.v4i16 intrinsic on %a and %b and concatenates the result
; after %r: element 0 of the <2 x i64> shuffle is %r, element 1 is the
; intrinsic result. The expected assembly is one rsubhn2 writing into v0.
define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vrsubhn_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    rsubhn2 v0.8h, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %narrowed = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
  ; Reinterpret each 64-bit vector as one i64 lane so the two can be joined.
  %r.as.i64 = bitcast <4 x i16> %r to <1 x i64>
  %narrowed.as.i64 = bitcast <4 x i16> %narrowed to <1 x i64>
  %joined = shufflevector <1 x i64> %r.as.i64, <1 x i64> %narrowed.as.i64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <8 x i16>
  ret <8 x i16> %result
}
1609
; Calls the rsubhn.v2i32 intrinsic on %a and %b and concatenates the result
; after %r: element 0 of the <2 x i64> shuffle is %r, element 1 is the
; intrinsic result. The expected assembly is one rsubhn2 writing into v0.
define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vrsubhn_high_u64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    rsubhn2 v0.4s, v1.2d, v2.2d
; CHECK-NEXT:    ret
entry:
  %narrowed = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
  ; Reinterpret each 64-bit vector as one i64 lane so the two can be joined.
  %r.as.i64 = bitcast <2 x i32> %r to <1 x i64>
  %narrowed.as.i64 = bitcast <2 x i32> %narrowed to <1 x i64>
  %joined = shufflevector <1 x i64> %r.as.i64, <1 x i64> %narrowed.as.i64, <2 x i32> <i32 0, i32 1>
  %result = bitcast <2 x i64> %joined to <4 x i32>
  ret <4 x i32> %result
}
1624
; Zero-extends the sabd.v8i8 intrinsic result of %a and %b to <8 x i16>.
; The expected assembly is a single sabdl.
define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vabdl_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabdl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
  %widened = zext <8 x i8> %absdiff to <8 x i16>
  ret <8 x i16> %widened
}
1635
; Zero-extends the sabd.v4i16 intrinsic result of %a and %b to <4 x i32>.
; The expected assembly is a single sabdl.
define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vabdl_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabdl v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
  %widened = zext <4 x i16> %absdiff to <4 x i32>
  ret <4 x i32> %widened
}
1646
; Zero-extends the sabd.v2i32 intrinsic result of %a and %b to <2 x i64>.
; The expected assembly is a single sabdl.
define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vabdl_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabdl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
  %widened = zext <2 x i32> %absdiff to <2 x i64>
  ret <2 x i64> %widened
}
1657
; Zero-extends the uabd.v8i8 intrinsic result of %a and %b to <8 x i16>.
; The expected assembly is a single uabdl.
define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vabdl_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabdl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
  %widened = zext <8 x i8> %absdiff to <8 x i16>
  ret <8 x i16> %widened
}
1668
; Zero-extends the uabd.v4i16 intrinsic result of %a and %b to <4 x i32>.
; The expected assembly is a single uabdl.
define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vabdl_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabdl v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
  %widened = zext <4 x i16> %absdiff to <4 x i32>
  ret <4 x i32> %widened
}
1679
; Zero-extends the uabd.v2i32 intrinsic result of %a and %b to <2 x i64>.
; The expected assembly is a single uabdl.
define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vabdl_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabdl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
  %widened = zext <2 x i32> %absdiff to <2 x i64>
  ret <2 x i64> %widened
}
1690
; Adds %a to the zero-extended sabd.v8i8 intrinsic result of %b and %c.
; The expected assembly is a single sabal accumulating into v0.
define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vabal_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c)
  %widened = zext <8 x i8> %absdiff to <8 x i16>
  %accum = add <8 x i16> %widened, %a
  ret <8 x i16> %accum
}
1702
; Adds %a to the zero-extended sabd.v4i16 intrinsic result of %b and %c.
; The expected assembly is a single sabal accumulating into v0.
define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vabal_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c)
  %widened = zext <4 x i16> %absdiff to <4 x i32>
  %accum = add <4 x i32> %widened, %a
  ret <4 x i32> %accum
}
1714
; Adds %a to the zero-extended sabd.v2i32 intrinsic result of %b and %c.
; The expected assembly is a single sabal accumulating into v0.
define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vabal_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c)
  %widened = zext <2 x i32> %absdiff to <2 x i64>
  %accum = add <2 x i64> %widened, %a
  ret <2 x i64> %accum
}
1726
; Adds %a to the zero-extended uabd.v8i8 intrinsic result of %b and %c.
; The expected assembly is a single uabal accumulating into v0.
define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vabal_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c)
  %widened = zext <8 x i8> %absdiff to <8 x i16>
  %accum = add <8 x i16> %widened, %a
  ret <8 x i16> %accum
}
1738
; Adds %a to the zero-extended uabd.v4i16 intrinsic result of %b and %c.
; The expected assembly is a single uabal accumulating into v0.
define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vabal_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c)
  %widened = zext <4 x i16> %absdiff to <4 x i32>
  %accum = add <4 x i32> %widened, %a
  ret <4 x i32> %accum
}
1750
; Adds %a to the zero-extended uabd.v2i32 intrinsic result of %b and %c.
; The expected assembly is a single uabal accumulating into v0.
define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vabal_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
entry:
  %absdiff = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c)
  %widened = zext <2 x i32> %absdiff to <2 x i64>
  %accum = add <2 x i64> %widened, %a
  ret <2 x i64> %accum
}
1762
; Takes elements 8-15 of %a and %b, applies the sabd.v8i8 intrinsic and
; zero-extends the result to <8 x i16>. The expected assembly is a single
; sabdl2 on the full-width inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vabdl_high_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabdl2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %absdiff = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a.hi, <8 x i8> %b.hi)
  %widened = zext <8 x i8> %absdiff to <8 x i16>
  ret <8 x i16> %widened
}
1775
; Takes elements 4-7 of %a and %b, applies the sabd.v4i16 intrinsic and
; zero-extends the result to <4 x i32>. The expected assembly is a single
; sabdl2 on the full-width inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vabdl_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabdl2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %absdiff = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a.hi, <4 x i16> %b.hi)
  %widened = zext <4 x i16> %absdiff to <4 x i32>
  ret <4 x i32> %widened
}
1788
; Takes elements 2-3 of %a and %b, applies the sabd.v2i32 intrinsic and
; zero-extends the result to <2 x i64>. The expected assembly is a single
; sabdl2 on the full-width inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vabdl_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabdl2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %absdiff = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a.hi, <2 x i32> %b.hi)
  %widened = zext <2 x i32> %absdiff to <2 x i64>
  ret <2 x i64> %widened
}
1801
; Takes elements 8-15 of %a and %b, applies the uabd.v8i8 intrinsic and
; zero-extends the result to <8 x i16>. The expected assembly is a single
; uabdl2 on the full-width inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vabdl_high_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabdl2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %absdiff = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a.hi, <8 x i8> %b.hi)
  %widened = zext <8 x i8> %absdiff to <8 x i16>
  ret <8 x i16> %widened
}
1814
; Takes elements 4-7 of %a and %b, applies the uabd.v4i16 intrinsic and
; zero-extends the result to <4 x i32>. The expected assembly is a single
; uabdl2 on the full-width inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vabdl_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabdl2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %absdiff = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a.hi, <4 x i16> %b.hi)
  %widened = zext <4 x i16> %absdiff to <4 x i32>
  ret <4 x i32> %widened
}
1827
; Takes elements 2-3 of %a and %b, applies the uabd.v2i32 intrinsic and
; zero-extends the result to <2 x i64>. The expected assembly is a single
; uabdl2 on the full-width inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vabdl_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabdl2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %absdiff = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a.hi, <2 x i32> %b.hi)
  %widened = zext <2 x i32> %absdiff to <2 x i64>
  ret <2 x i64> %widened
}
1840
; Takes elements 8-15 of %b and %c, applies the sabd.v8i8 intrinsic,
; zero-extends the result to <8 x i16> and adds %a. The expected assembly is
; a single sabal2 accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vabal_high_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabal2 v0.8h, v1.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %c.hi = shufflevector <16 x i8> %c, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %absdiff = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b.hi, <8 x i8> %c.hi)
  %widened = zext <8 x i8> %absdiff to <8 x i16>
  %accum = add <8 x i16> %widened, %a
  ret <8 x i16> %accum
}
1854
; Takes elements 4-7 of %b and %c, applies the sabd.v4i16 intrinsic,
; zero-extends the result to <4 x i32> and adds %a. The expected assembly is
; a single sabal2 accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vabal_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabal2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c.hi = shufflevector <8 x i16> %c, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %absdiff = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b.hi, <4 x i16> %c.hi)
  %widened = zext <4 x i16> %absdiff to <4 x i32>
  %accum = add <4 x i32> %widened, %a
  ret <4 x i32> %accum
}
1868
; Takes elements 2-3 of %b and %c, applies the sabd.v2i32 intrinsic,
; zero-extends the result to <2 x i64> and adds %a. The expected assembly is
; a single sabal2 accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vabal_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sabal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %c.hi = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %absdiff = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b.hi, <2 x i32> %c.hi)
  %widened = zext <2 x i32> %absdiff to <2 x i64>
  %accum = add <2 x i64> %widened, %a
  ret <2 x i64> %accum
}
1882
; Takes elements 8-15 of %b and %c, applies the uabd.v8i8 intrinsic,
; zero-extends the result to <8 x i16> and adds %a. The expected assembly is
; a single uabal2 accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vabal_high_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabal2 v0.8h, v1.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %c.hi = shufflevector <16 x i8> %c, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %absdiff = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b.hi, <8 x i8> %c.hi)
  %widened = zext <8 x i8> %absdiff to <8 x i16>
  %accum = add <8 x i16> %widened, %a
  ret <8 x i16> %accum
}
1896
; Takes elements 4-7 of %b and %c, applies the uabd.v4i16 intrinsic,
; zero-extends the result to <4 x i32> and adds %a. The expected assembly is
; a single uabal2 accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vabal_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabal2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c.hi = shufflevector <8 x i16> %c, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %absdiff = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b.hi, <4 x i16> %c.hi)
  %widened = zext <4 x i16> %absdiff to <4 x i32>
  %accum = add <4 x i32> %widened, %a
  ret <4 x i32> %accum
}
1910
; Takes elements 2-3 of %b and %c, applies the uabd.v2i32 intrinsic,
; zero-extends the result to <2 x i64> and adds %a. The expected assembly is
; a single uabal2 accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vabal_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uabal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %c.hi = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %absdiff = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b.hi, <2 x i32> %c.hi)
  %widened = zext <2 x i32> %absdiff to <2 x i64>
  %accum = add <2 x i64> %widened, %a
  ret <2 x i64> %accum
}
1924
; Passes %a and %b directly to the smull.v8i16 intrinsic and returns its
; result. The expected assembly is a single smull.
define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vmull_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %product = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
  ret <8 x i16> %product
}
1934
; Passes %a and %b directly to the smull.v4i32 intrinsic and returns its
; result. The expected assembly is a single smull.
define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vmull_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
entry:
  %product = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
  ret <4 x i32> %product
}
1944
; Passes %a and %b directly to the smull.v2i64 intrinsic and returns its
; result. The expected assembly is a single smull.
define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vmull_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
entry:
  %product = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
  ret <2 x i64> %product
}
1954
; Passes %a and %b directly to the umull.v8i16 intrinsic and returns its
; result. The expected assembly is a single umull.
define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vmull_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %product = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
  ret <8 x i16> %product
}
1964
; Passes %a and %b directly to the umull.v4i32 intrinsic and returns its
; result. The expected assembly is a single umull.
define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vmull_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
entry:
  %product = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
  ret <4 x i32> %product
}
1974
; Passes %a and %b directly to the umull.v2i64 intrinsic and returns its
; result. The expected assembly is a single umull.
define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vmull_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
entry:
  %product = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
  ret <2 x i64> %product
}
1984
; Takes elements 8-15 of %a and %b and passes them to the smull.v8i16
; intrinsic. The expected assembly is a single smull2 on the full-width
; inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vmull_high_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %product = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a.hi, <8 x i8> %b.hi)
  ret <8 x i16> %product
}
1996
; Takes elements 4-7 of %a and %b and passes them to the smull.v4i32
; intrinsic. The expected assembly is a single smull2 on the full-width
; inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vmull_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %product = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a.hi, <4 x i16> %b.hi)
  ret <4 x i32> %product
}
2008
; Takes elements 2-3 of %a and %b and passes them to the smull.v2i64
; intrinsic. The expected assembly is a single smull2 on the full-width
; inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vmull_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %product = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a.hi, <2 x i32> %b.hi)
  ret <2 x i64> %product
}
2020
; Takes elements 8-15 of %a and %b and passes them to the umull.v8i16
; intrinsic. The expected assembly is a single umull2 on the full-width
; inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vmull_high_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %product = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a.hi, <8 x i8> %b.hi)
  ret <8 x i16> %product
}
2032
; Takes elements 4-7 of %a and %b and passes them to the umull.v4i32
; intrinsic. The expected assembly is a single umull2 on the full-width
; inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vmull_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %product = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a.hi, <4 x i16> %b.hi)
  ret <4 x i32> %product
}
2044
; Takes elements 2-3 of %a and %b and passes them to the umull.v2i64
; intrinsic. The expected assembly is a single umull2 on the full-width
; inputs, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vmull_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %product = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a.hi, <2 x i32> %b.hi)
  ret <2 x i64> %product
}
2056
; Adds %a to the smull.v8i16 intrinsic result of %b and %c.
; The expected assembly is a single smlal accumulating into v0.
define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vmlal_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
entry:
  %product = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
  %accum = add <8 x i16> %product, %a
  ret <8 x i16> %accum
}
2067
; Adds %a to the smull.v4i32 intrinsic result of %b and %c.
; The expected assembly is a single smlal accumulating into v0.
define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vmlal_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %product = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
  %accum = add <4 x i32> %product, %a
  ret <4 x i32> %accum
}
2078
; Adds %a to the smull.v2i64 intrinsic result of %b and %c.
; The expected assembly is a single smlal accumulating into v0.
define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vmlal_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
entry:
  %product = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
  %accum = add <2 x i64> %product, %a
  ret <2 x i64> %accum
}
2089
; Adds %a to the umull.v8i16 intrinsic result of %b and %c.
; The expected assembly is a single umlal accumulating into v0.
define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vmlal_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlal v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
entry:
  %product = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
  %accum = add <8 x i16> %product, %a
  ret <8 x i16> %accum
}
2100
; Adds %a to the umull.v4i32 intrinsic result of %b and %c.
; The expected assembly is a single umlal accumulating into v0.
define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vmlal_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %product = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
  %accum = add <4 x i32> %product, %a
  ret <4 x i32> %accum
}
2111
; Adds %a to the umull.v2i64 intrinsic result of %b and %c.
; The expected assembly is a single umlal accumulating into v0.
define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vmlal_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
entry:
  %product = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
  %accum = add <2 x i64> %product, %a
  ret <2 x i64> %accum
}
2122
; Takes elements 8-15 of %b and %c, passes them to the smull.v8i16 intrinsic
; and adds %a to the result. The expected assembly is a single smlal2
; accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vmlal_high_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlal2 v0.8h, v1.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %c.hi = shufflevector <16 x i8> %c, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %product = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b.hi, <8 x i8> %c.hi)
  %accum = add <8 x i16> %product, %a
  ret <8 x i16> %accum
}
2135
; Takes elements 4-7 of %b and %c, passes them to the smull.v4i32 intrinsic
; and adds %a to the result. The expected assembly is a single smlal2
; accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vmlal_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlal2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c.hi = shufflevector <8 x i16> %c, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %product = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.hi)
  %accum = add <4 x i32> %product, %a
  ret <4 x i32> %accum
}
2148
; Takes elements 2-3 of %b and %c, passes them to the smull.v2i64 intrinsic
; and adds %a to the result. The expected assembly is a single smlal2
; accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vmlal_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %c.hi = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %product = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.hi)
  %accum = add <2 x i64> %product, %a
  ret <2 x i64> %accum
}
2161
; Takes elements 8-15 of %b and %c, passes them to the umull.v8i16 intrinsic
; and adds %a to the result. The expected assembly is a single umlal2
; accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vmlal_high_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlal2 v0.8h, v1.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %c.hi = shufflevector <16 x i8> %c, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %product = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b.hi, <8 x i8> %c.hi)
  %accum = add <8 x i16> %product, %a
  ret <8 x i16> %accum
}
2174
; Takes elements 4-7 of %b and %c, passes them to the umull.v4i32 intrinsic
; and adds %a to the result. The expected assembly is a single umlal2
; accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vmlal_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlal2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c.hi = shufflevector <8 x i16> %c, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %product = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.hi)
  %accum = add <4 x i32> %product, %a
  ret <4 x i32> %accum
}
2187
; Takes elements 2-3 of %b and %c, passes them to the umull.v2i64 intrinsic
; and adds %a to the result. The expected assembly is a single umlal2
; accumulating into v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vmlal_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %c.hi = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %product = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.hi)
  %accum = add <2 x i64> %product, %a
  ret <2 x i64> %accum
}
2200
; Subtracts the smull.v8i16 intrinsic result of %b and %c from %a.
; The expected assembly is a single smlsl updating v0.
define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vmlsl_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
entry:
  %product = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
  %diff = sub <8 x i16> %a, %product
  ret <8 x i16> %diff
}
2211
; Subtracts the smull.v4i32 intrinsic result of %b and %c from %a.
; The expected assembly is a single smlsl updating v0.
define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vmlsl_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %product = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
  %diff = sub <4 x i32> %a, %product
  ret <4 x i32> %diff
}
2222
; Subtracts the smull.v2i64 intrinsic result of %b and %c from %a.
; The expected assembly is a single smlsl updating v0.
define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vmlsl_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
entry:
  %product = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
  %diff = sub <2 x i64> %a, %product
  ret <2 x i64> %diff
}
2233
; Subtracts the umull.v8i16 intrinsic result of %b and %c from %a.
; The expected assembly is a single umlsl updating v0.
define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
; CHECK-LABEL: test_vmlsl_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlsl v0.8h, v1.8b, v2.8b
; CHECK-NEXT:    ret
entry:
  %product = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
  %diff = sub <8 x i16> %a, %product
  ret <8 x i16> %diff
}
2244
; Subtracts the umull.v4i32 intrinsic result of %b and %c from %a.
; The expected assembly is a single umlsl updating v0.
define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vmlsl_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %product = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
  %diff = sub <4 x i32> %a, %product
  ret <4 x i32> %diff
}
2255
; Subtracts the umull.v2i64 intrinsic result of %b and %c from %a.
; The expected assembly is a single umlsl updating v0.
define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vmlsl_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
entry:
  %product = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
  %diff = sub <2 x i64> %a, %product
  ret <2 x i64> %diff
}
2266
; Takes elements 8-15 of %b and %c, passes them to the smull.v8i16 intrinsic
; and subtracts the result from %a. The expected assembly is a single smlsl2
; updating v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vmlsl_high_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlsl2 v0.8h, v1.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %c.hi = shufflevector <16 x i8> %c, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %product = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b.hi, <8 x i8> %c.hi)
  %diff = sub <8 x i16> %a, %product
  ret <8 x i16> %diff
}
2279
; Takes elements 4-7 of %b and %c, passes them to the smull.v4i32 intrinsic
; and subtracts the result from %a. The expected assembly is a single smlsl2
; updating v0, with no separate extract instruction.
; The shuffle masks never index the second operand, so it is written as
; poison rather than the deprecated undef placeholder.
define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vmlsl_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlsl2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c.hi = shufflevector <8 x i16> %c, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %product = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.hi)
  %diff = sub <4 x i32> %a, %product
  ret <4 x i32> %diff
}
2292
; Lanes 2-3 of %b and %c are extracted with shufflevector, multiplied through
; the smull.v2i64 intrinsic and subtracted from %a. The whole sequence is
; expected to be selected as a single smlsl2.
; The unused second shuffle operand is poison rather than the deprecated undef;
; the mask only selects lanes of the first operand.
define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vmlsl_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smlsl2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %c.hi = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %wide.prod = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.hi)
  %acc.out = sub <2 x i64> %a, %wide.prod
  ret <2 x i64> %acc.out
}
2305
; Lanes 8-15 of %b and %c are extracted with shufflevector, multiplied through
; the umull.v8i16 intrinsic and subtracted from %a. The whole sequence is
; expected to be selected as a single umlsl2.
; The unused second shuffle operand is poison rather than the deprecated undef;
; the mask only selects lanes of the first operand.
define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: test_vmlsl_high_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlsl2 v0.8h, v1.16b, v2.16b
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %c.hi = shufflevector <16 x i8> %c, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %wide.prod = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b.hi, <8 x i8> %c.hi)
  %acc.out = sub <8 x i16> %a, %wide.prod
  ret <8 x i16> %acc.out
}
2318
; Lanes 4-7 of %b and %c are extracted with shufflevector, multiplied through
; the umull.v4i32 intrinsic and subtracted from %a. The whole sequence is
; expected to be selected as a single umlsl2.
; The unused second shuffle operand is poison rather than the deprecated undef;
; the mask only selects lanes of the first operand.
define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vmlsl_high_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlsl2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c.hi = shufflevector <8 x i16> %c, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %wide.prod = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.hi)
  %acc.out = sub <4 x i32> %a, %wide.prod
  ret <4 x i32> %acc.out
}
2331
; Lanes 2-3 of %b and %c are extracted with shufflevector, multiplied through
; the umull.v2i64 intrinsic and subtracted from %a. The whole sequence is
; expected to be selected as a single umlsl2.
; The unused second shuffle operand is poison rather than the deprecated undef;
; the mask only selects lanes of the first operand.
define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vmlsl_high_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umlsl2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %c.hi = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %wide.prod = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.hi)
  %acc.out = sub <2 x i64> %a, %wide.prod
  ret <2 x i64> %acc.out
}
2344
; A direct call to the sqdmull.v4i32 intrinsic on %a and %b; expected to be
; selected as a single sqdmull instruction.
define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: test_vqdmull_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
entry:
  %dbl.prod = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
  ret <4 x i32> %dbl.prod
}
2354
; A direct call to the sqdmull.v2i64 intrinsic on %a and %b; expected to be
; selected as a single sqdmull instruction.
define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-LABEL: test_vqdmull_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
entry:
  %dbl.prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
  ret <2 x i64> %dbl.prod
}
2364
; The sqdmull.v4i32 intrinsic result for %b and %c is combined with %a through
; the sqadd.v4i32 intrinsic; the pair is expected to be selected as one sqdmlal.
define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vqdmlal_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %dbl.prod = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
  %sat.sum = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %dbl.prod)
  ret <4 x i32> %sat.sum
}
2375
; The sqdmull.v2i64 intrinsic result for %b and %c is combined with %a through
; the sqadd.v2i64 intrinsic; the pair is expected to be selected as one sqdmlal.
define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vqdmlal_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
entry:
  %dbl.prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
  %sat.sum = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %dbl.prod)
  ret <2 x i64> %sat.sum
}
2386
; The sqdmull.v4i32 intrinsic result for %b and %c is combined with %a through
; the sqsub.v4i32 intrinsic; the pair is expected to be selected as one sqdmlsl.
define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
; CHECK-LABEL: test_vqdmlsl_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmlsl v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %dbl.prod = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
  %sat.diff = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %dbl.prod)
  ret <4 x i32> %sat.diff
}
2397
; The sqdmull.v2i64 intrinsic result for %b and %c is combined with %a through
; the sqsub.v2i64 intrinsic; the pair is expected to be selected as one sqdmlsl.
define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
; CHECK-LABEL: test_vqdmlsl_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmlsl v0.2d, v1.2s, v2.2s
; CHECK-NEXT:    ret
entry:
  %dbl.prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
  %sat.diff = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %dbl.prod)
  ret <2 x i64> %sat.diff
}
2408
; Lanes 4-7 of %a and %b are extracted with shufflevector and passed to the
; sqdmull.v4i32 intrinsic; expected to be selected as a single sqdmull2.
; The unused second shuffle operand is poison rather than the deprecated undef;
; the mask only selects lanes of the first operand.
define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_vqdmull_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmull2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %dbl.prod = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a.hi, <4 x i16> %b.hi)
  ret <4 x i32> %dbl.prod
}
2420
; Lanes 2-3 of %a and %b are extracted with shufflevector and passed to the
; sqdmull.v2i64 intrinsic; expected to be selected as a single sqdmull2.
; The unused second shuffle operand is poison rather than the deprecated undef;
; the mask only selects lanes of the first operand.
define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_vqdmull_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmull2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %dbl.prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a.hi, <2 x i32> %b.hi)
  ret <2 x i64> %dbl.prod
}
2432
; Lanes 4-7 of %b and %c are extracted with shufflevector, passed to the
; sqdmull.v4i32 intrinsic, and the result is combined with %a through the
; sqadd.v4i32 intrinsic. Expected to be selected as a single sqdmlal2.
; The unused second shuffle operand is poison rather than the deprecated undef;
; the mask only selects lanes of the first operand.
define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vqdmlal_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmlal2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c.hi = shufflevector <8 x i16> %c, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %dbl.prod = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.hi)
  %sat.sum = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %dbl.prod)
  ret <4 x i32> %sat.sum
}
2445
; Lanes 2-3 of %b and %c are extracted with shufflevector, passed to the
; sqdmull.v2i64 intrinsic, and the result is combined with %a through the
; sqadd.v2i64 intrinsic. Expected to be selected as a single sqdmlal2.
; The unused second shuffle operand is poison rather than the deprecated undef;
; the mask only selects lanes of the first operand.
define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vqdmlal_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %c.hi = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %dbl.prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.hi)
  %sat.sum = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %dbl.prod)
  ret <2 x i64> %sat.sum
}
2458
; Lanes 4-7 of %b and %c are extracted with shufflevector, passed to the
; sqdmull.v4i32 intrinsic, and the result is combined with %a through the
; sqsub.v4i32 intrinsic. Expected to be selected as a single sqdmlsl2.
; The unused second shuffle operand is poison rather than the deprecated undef;
; the mask only selects lanes of the first operand.
define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
; CHECK-LABEL: test_vqdmlsl_high_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmlsl2 v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <8 x i16> %b, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %c.hi = shufflevector <8 x i16> %c, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %dbl.prod = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b.hi, <4 x i16> %c.hi)
  %sat.diff = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %dbl.prod)
  ret <4 x i32> %sat.diff
}
2471
; Lanes 2-3 of %b and %c are extracted with shufflevector, passed to the
; sqdmull.v2i64 intrinsic, and the result is combined with %a through the
; sqsub.v2i64 intrinsic. Expected to be selected as a single sqdmlsl2.
; The unused second shuffle operand is poison rather than the deprecated undef;
; the mask only selects lanes of the first operand.
define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: test_vqdmlsl_high_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sqdmlsl2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    ret
entry:
  %b.hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %c.hi = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %dbl.prod = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b.hi, <2 x i32> %c.hi)
  %sat.diff = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %dbl.prod)
  ret <2 x i64> %sat.diff
}
2484
; A direct call to the pmull.v8i16 intrinsic on %a and %b; expected to be
; selected as a single pmull instruction.
define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: test_vmull_p8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    pmull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
entry:
  %poly.prod = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b)
  ret <8 x i16> %poly.prod
}
2494
; Lanes 8-15 of %a and %b are extracted with shufflevector and passed to the
; pmull.v8i16 intrinsic; expected to be selected as a single pmull2.
; The unused second shuffle operand is poison rather than the deprecated undef;
; the mask only selects lanes of the first operand.
define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_vmull_high_p8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    pmull2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %a.hi = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.hi = shufflevector <16 x i8> %b, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %poly.prod = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a.hi, <8 x i8> %b.hi)
  ret <8 x i16> %poly.prod
}
2506
; Calls the pmull64 intrinsic on two scalar i64 arguments and returns the
; <16 x i8> result bitcast to i128. The expected code moves x0/x1 into d0/d1,
; issues pmull on the 1d/1q arrangement, then moves the two 64-bit halves of
; the result back into x0 and x1.
define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
; CHECK-LABEL: test_vmull_p64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fmov d0, x0
; CHECK-NEXT:    fmov d1, x1
; CHECK-NEXT:    pmull v0.1q, v0.1d, v1.1d
; CHECK-NEXT:    mov x1, v0.d[1]
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %poly.bytes = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
  %poly.wide = bitcast <16 x i8> %poly.bytes to i128
  ret i128 %poly.wide
}
2521
; Element 1 of %a and of %b is extracted and passed to the pmull64 intrinsic;
; the <16 x i8> result is returned bitcast to i128. The extracts and the call
; are expected to be selected as a single pmull2 on the 2d/1q arrangement,
; followed by moves of the two 64-bit result halves into x0 and x1.
define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
; CHECK-LABEL: test_vmull_high_p64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    pmull2 v0.1q, v0.2d, v1.2d
; CHECK-NEXT:    mov x1, v0.d[1]
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %a.hi = extractelement <2 x i64> %a, i32 1
  %b.hi = extractelement <2 x i64> %b, i32 1
  %poly.bytes = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a.hi, i64 %b.hi) #1
  %poly.wide = bitcast <16 x i8> %poly.bytes to i128
  ret i128 %poly.wide
}
2536
2537
2538