; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - -lower-interleaved-accesses=false | FileCheck %s

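; VPADD performs a pairwise add of adjacent elements taken from the
; concatenation of its two operands; VPADDL performs a widening pairwise add
; of adjacent elements within a single operand.
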
define <8 x i8> @vpaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: vpaddi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vpadd.i8 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = load <8 x i8>, <8 x i8>* %B
	%tmp3 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}

define <4 x i16> @vpaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: vpaddi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vpadd.i16 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = load <4 x i16>, <4 x i16>* %B
	%tmp3 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
	ret <4 x i16> %tmp3
}

define <2 x i32> @vpaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: vpaddi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vpadd.i32 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = load <2 x i32>, <2 x i32>* %B
	%tmp3 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
	ret <2 x i32> %tmp3
}

define <2 x float> @vpaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
; CHECK-LABEL: vpaddf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vldr d17, [r0]
; CHECK-NEXT:    vpadd.f32 d16, d17, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <2 x float>, <2 x float>* %A
	%tmp2 = load <2 x float>, <2 x float>* %B
	%tmp3 = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
	ret <2 x float> %tmp3
}

declare <8 x i8>  @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone

define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddls8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.s8 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1)
	ret <4 x i16> %tmp2
}

define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddls16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.s16 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1)
	ret <2 x i32> %tmp2
}

define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddls32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.s32 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1)
	ret <1 x i64> %tmp2
}

define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddlu8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.u8 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i8>, <8 x i8>* %A
	%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1)
	ret <4 x i16> %tmp2
}

define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddlu16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.u16 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x i16>, <4 x i16>* %A
	%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1)
	ret <2 x i32> %tmp2
}

define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddlu32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r0]
; CHECK-NEXT:    vpaddl.u32 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <2 x i32>, <2 x i32>* %A
	%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1)
	ret <1 x i64> %tmp2
}

define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddlQs8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s8 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1)
	ret <8 x i16> %tmp2
}

define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddlQs16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s16 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1)
	ret <4 x i32> %tmp2
}

define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddlQs32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s32 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1)
	ret <2 x i64> %tmp2
}

define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind {
; CHECK-LABEL: vpaddlQu8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u8 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <16 x i8>, <16 x i8>* %A
	%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1)
	ret <8 x i16> %tmp2
}

define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind {
; CHECK-LABEL: vpaddlQu16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u16 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <8 x i16>, <8 x i16>* %A
	%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1)
	ret <4 x i32> %tmp2
}

define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind {
; CHECK-LABEL: vpaddlQu32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u32 q8, q8
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
	%tmp1 = load <4 x i32>, <4 x i32>* %A
	%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
	ret <2 x i64> %tmp2
}

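; The tests below exercise the DAG combine that recognizes a deinterleaving
; shuffle (vuzp/vtrn) feeding an add, optionally through sext/zext, and folds
; the pattern into a single vpadd/vpaddl.
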
; Combine vuzp+vadd->vpadd.
define void @addCombineToVPADD_i8(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADD_i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpadd.i8 d16, d16, d17
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>

  %add = add <8 x i8> %tmp3, %tmp1
  store <8 x i8> %add, <8 x i8>* %X, align 8
  ret void
}

; Combine vuzp+vadd->vpadd.
define void @addCombineToVPADD_i16(<8 x i16> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADD_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpadd.i16 d16, d16, d17
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <8 x i16>, <8 x i16>* %cbcr
  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %add = add <4 x i16> %tmp3, %tmp1
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}

; Combine vtrn+vadd->vpadd.
define void @addCombineToVPADD_i32(<4 x i32> *%cbcr, <2 x i32> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADD_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpadd.i32 d16, d16, d17
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <4 x i32>, <4 x i32>* %cbcr
  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %add = add <2 x i32> %tmp3, %tmp1
  store <2 x i32> %add, <2 x i32>* %X, align 8
  ret void
}

; Combine vuzp+vaddl->vpaddl
define void @addCombineToVPADDLq_s8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_s8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s8 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp4 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp1 to <8 x i16>
  %add = add <8 x i16> %tmp4, %tmp5
  store <8 x i16> %add, <8 x i16>* %X, align 8
  ret void
}

; Combine vuzp+vaddl->vpaddl
; FIXME: Legalization butchers the shuffles.
define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_s8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov.i16 d16, #0x8
; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
; CHECK-NEXT:    vext.8 d17, d18, d16, #1
; CHECK-NEXT:    vneg.s16 d16, d16
; CHECK-NEXT:    vshl.i16 d18, d18, #8
; CHECK-NEXT:    vshl.i16 d17, d17, #8
; CHECK-NEXT:    vshl.s16 d18, d18, d16
; CHECK-NEXT:    vshl.s16 d16, d17, d16
; CHECK-NEXT:    vadd.i16 d16, d16, d18
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = sext <4 x i8> %tmp3 to <4 x i16>
  %tmp5 = sext <4 x i8> %tmp1 to <4 x i16>
  %add = add <4 x i16> %tmp4, %tmp5
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}

; Combine vuzp+vaddl->vpaddl
define void @addCombineToVPADDLq_u8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u8 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp1 to <8 x i16>
  %add = add <8 x i16> %tmp4, %tmp5
  store <8 x i16> %add, <8 x i16>* %X, align 8
  ret void
}

; In theory, it's possible to match this to vpaddl, but rearranging the
; shuffle is awkward, so this doesn't match at the moment.
define void @addCombineToVPADDLq_u8_early_zext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u8_early_zext:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vmovl.u8 q9, d17
; CHECK-NEXT:    vmovl.u8 q8, d16
; CHECK-NEXT:    vuzp.16 q8, q9
; CHECK-NEXT:    vadd.i16 q8, q8, q9
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = zext <16 x i8> %tmp to <16 x i16>
  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add <8 x i16> %tmp2, %tmp3
  store <8 x i16> %add, <8 x i16>* %X, align 8
  ret void
}

; Combine vuzp+vaddl->vpaddl
; FIXME: Legalization butchers the shuffle.
define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_u8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vext.8 d18, d16, d16, #1
; CHECK-NEXT:    vbic.i16 d16, #0xff00
; CHECK-NEXT:    vbic.i16 d18, #0xff00
; CHECK-NEXT:    vadd.i16 d16, d18, d16
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
  %tmp5 = zext <4 x i8> %tmp1 to <4 x i16>
  %add = add <4 x i16> %tmp4, %tmp5
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}

; Matching to vpaddl.8 requires matching shuffle(zext()).
define void @addCombineToVPADDL_u8_early_zext(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDL_u8_early_zext:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vmovl.u8 q8, d16
; CHECK-NEXT:    vpadd.i16 d16, d16, d17
; CHECK-NEXT:    vstr d16, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <16 x i8>, <16 x i8>* %cbcr
  %tmp1 = zext <16 x i8> %tmp to <16 x i16>
  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %add = add <4 x i16> %tmp2, %tmp3
  store <4 x i16> %add, <4 x i16>* %X, align 8
  ret void
}

; Combine vuzp+vaddl->vpaddl
define void @addCombineToVPADDLq_s16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_s16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <8 x i16>, <8 x i16>* %cbcr
  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp1 to <4 x i32>
  %add = add <4 x i32> %tmp4, %tmp5
  store <4 x i32> %add, <4 x i32>* %X, align 8
  ret void
}

; Combine vuzp+vaddl->vpaddl
define void @addCombineToVPADDLq_u16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u16 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <8 x i16>, <8 x i16>* %cbcr
  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp1 to <4 x i32>
  %add = add <4 x i32> %tmp4, %tmp5
  store <4 x i32> %add, <4 x i32>* %X, align 8
  ret void
}

; Combine vtrn+vaddl->vpaddl
define void @addCombineToVPADDLq_s32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_s32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.s32 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <4 x i32>, <4 x i32>* %cbcr
  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp4 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp1 to <2 x i64>
  %add = add <2 x i64> %tmp4, %tmp5
  store <2 x i64> %add, <2 x i64>* %X, align 8
  ret void
}

; Combine vtrn+vaddl->vpaddl
define void @addCombineToVPADDLq_u32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp {
; CHECK-LABEL: addCombineToVPADDLq_u32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
; CHECK-NEXT:    vpaddl.u32 q8, q8
; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
; CHECK-NEXT:    mov pc, lr
  %tmp = load <4 x i32>, <4 x i32>* %cbcr
  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp1 to <2 x i64>
  %add = add <2 x i64> %tmp4, %tmp5
  store <2 x i64> %add, <2 x i64>* %X, align 8
  ret void
}

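; Pairwise adds with illegal narrow result types. The first two cases are still
; matched to vpaddl after promotion; the <2 x i8> and <2 x i16> cases are not
; and fall back to a plain vadd.
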
; Legalization promotes the <4 x i8> to <4 x i16>.
define <4 x i8> @fromExtendingExtractVectorElt_i8(<8 x i8> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vpaddl.s8 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %x = add <4 x i8> %tmp2, %tmp1
  ret <4 x i8> %x
}

; Legalization promotes the <2 x i16> to <2 x i32>.
define <2 x i16> @fromExtendingExtractVectorElt_i16(<4 x i16> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmov d16, r0, r1
; CHECK-NEXT:    vpaddl.s16 d16, d16
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
  %tmp2 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
  %x = add <2 x i16> %tmp2, %tmp1
  ret <2 x i16> %x
}

; And <2 x i8> to <2 x i32>
define <2 x i8> @fromExtendingExtractVectorElt_2i8(<8 x i8> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_2i8:
; CHECK:    vadd.i32
  %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 0, i32 2>
  %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 1, i32 3>
  %x = add <2 x i8> %tmp2, %tmp1
  ret <2 x i8> %x
}

define <2 x i16> @fromExtendingExtractVectorElt_2i16(<8 x i16> %in) {
; CHECK-LABEL: fromExtendingExtractVectorElt_2i16:
; CHECK:    vadd.i32
 %tmp1 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 0, i32 2>
 %tmp2 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 1, i32 3>
 %x = add <2 x i16> %tmp2, %tmp1
 ret <2 x i16> %x
}


declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone

declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
