1; RUN: llc -mattr=+neon < %s | FileCheck %s
2target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32"
3target triple = "thumbv7-elf"
4
5define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
6;CHECK-LABEL: vqdmulhs16:
7;CHECK: vqdmulh.s16
8	%tmp1 = load <4 x i16>* %A
9	%tmp2 = load <4 x i16>* %B
10	%tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
11	ret <4 x i16> %tmp3
12}
13
14define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
15;CHECK-LABEL: vqdmulhs32:
16;CHECK: vqdmulh.s32
17	%tmp1 = load <2 x i32>* %A
18	%tmp2 = load <2 x i32>* %B
19	%tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
20	ret <2 x i32> %tmp3
21}
22
23define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
24;CHECK-LABEL: vqdmulhQs16:
25;CHECK: vqdmulh.s16
26	%tmp1 = load <8 x i16>* %A
27	%tmp2 = load <8 x i16>* %B
28	%tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
29	ret <8 x i16> %tmp3
30}
31
32define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
33;CHECK-LABEL: vqdmulhQs32:
34;CHECK: vqdmulh.s32
35	%tmp1 = load <4 x i32>* %A
36	%tmp2 = load <4 x i32>* %B
37	%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
38	ret <4 x i32> %tmp3
39}
40
41define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
42entry:
43; CHECK: test_vqdmulhQ_lanes16
44; CHECK: vqdmulh.s16 q0, q0, d2[1]
45  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
46  %1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
47  ret <8 x i16> %1
48}
49
50define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
51entry:
52; CHECK: test_vqdmulhQ_lanes32
53; CHECK: vqdmulh.s32 q0, q0, d2[1]
54  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
55  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
56  ret <4 x i32> %1
57}
58
59define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
60entry:
61; CHECK: test_vqdmulh_lanes16
62; CHECK: vqdmulh.s16 d0, d0, d1[1]
63  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
64  %1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
65  ret <4 x i16> %1
66}
67
68define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
69entry:
70; CHECK: test_vqdmulh_lanes32
71; CHECK: vqdmulh.s32 d0, d0, d1[1]
72  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
73  %1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
74  ret <2 x i32> %1
75}
76
77declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
78declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
79
80declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
81declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
82
83define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
84;CHECK-LABEL: vqrdmulhs16:
85;CHECK: vqrdmulh.s16
86	%tmp1 = load <4 x i16>* %A
87	%tmp2 = load <4 x i16>* %B
88	%tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
89	ret <4 x i16> %tmp3
90}
91
92define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
93;CHECK-LABEL: vqrdmulhs32:
94;CHECK: vqrdmulh.s32
95	%tmp1 = load <2 x i32>* %A
96	%tmp2 = load <2 x i32>* %B
97	%tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
98	ret <2 x i32> %tmp3
99}
100
101define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
102;CHECK-LABEL: vqrdmulhQs16:
103;CHECK: vqrdmulh.s16
104	%tmp1 = load <8 x i16>* %A
105	%tmp2 = load <8 x i16>* %B
106	%tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
107	ret <8 x i16> %tmp3
108}
109
110define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
111;CHECK-LABEL: vqrdmulhQs32:
112;CHECK: vqrdmulh.s32
113	%tmp1 = load <4 x i32>* %A
114	%tmp2 = load <4 x i32>* %B
115	%tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
116	ret <4 x i32> %tmp3
117}
118
119define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
120entry:
121; CHECK: test_vqRdmulhQ_lanes16
122; CHECK: vqrdmulh.s16 q0, q0, d2[1]
123  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1]
124  %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1]
125  ret <8 x i16> %1
126}
127
128define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
129entry:
130; CHECK: test_vqRdmulhQ_lanes32
131; CHECK: vqrdmulh.s32 q0, q0, d2[1]
132  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1]
133  %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1]
134  ret <4 x i32> %1
135}
136
137define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
138entry:
139; CHECK: test_vqRdmulh_lanes16
140; CHECK: vqrdmulh.s16 d0, d0, d1[1]
141  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
142  %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1]
143  ret <4 x i16> %1
144}
145
146define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
147entry:
148; CHECK: test_vqRdmulh_lanes32
149; CHECK: vqrdmulh.s32 d0, d0, d1[1]
150  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
151  %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1]
152  ret <2 x i32> %1
153}
154
155declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
156declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
157
158declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
159declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
160
161define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
162;CHECK-LABEL: vqdmulls16:
163;CHECK: vqdmull.s16
164	%tmp1 = load <4 x i16>* %A
165	%tmp2 = load <4 x i16>* %B
166	%tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
167	ret <4 x i32> %tmp3
168}
169
170define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
171;CHECK-LABEL: vqdmulls32:
172;CHECK: vqdmull.s32
173	%tmp1 = load <2 x i32>* %A
174	%tmp2 = load <2 x i32>* %B
175	%tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
176	ret <2 x i64> %tmp3
177}
178
179define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
180entry:
181; CHECK: test_vqdmull_lanes16
182; CHECK: vqdmull.s16 q0, d0, d1[1]
183  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
184  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
185  ret <4 x i32> %1
186}
187
188define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
189entry:
190; CHECK: test_vqdmull_lanes32
191; CHECK: vqdmull.s32 q0, d0, d1[1]
192  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
193  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
194  ret <2 x i64> %1
195}
196
197declare <4 x i32>  @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
198declare <2 x i64>  @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
199
200define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
201;CHECK-LABEL: vqdmlals16_natural:
202;CHECK: vqdmlal.s16
203        %tmp1 = load <4 x i32>* %A
204        %tmp2 = load <4 x i16>* %B
205        %tmp3 = load <4 x i16>* %C
206        %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
207        %tmp5 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
208        ret <4 x i32> %tmp5
209}
210
211define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
212;CHECK-LABEL: vqdmlals32_natural:
213;CHECK: vqdmlal.s32
214        %tmp1 = load <2 x i64>* %A
215        %tmp2 = load <2 x i32>* %B
216        %tmp3 = load <2 x i32>* %C
217        %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
218        %tmp5 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
219        ret <2 x i64> %tmp5
220}
221
222define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
223entry:
224; CHECK-LABEL: test_vqdmlal_lanes16_natural:
225; CHECK: vqdmlal.s16 q0, d2, d3[1]
226  %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
227  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
228  %2 = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
229  ret <4 x i32> %2
230}
231
232define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
233entry:
234; CHECK-LABEL: test_vqdmlal_lanes32_natural:
235; CHECK: vqdmlal.s32 q0, d2, d3[1]
236  %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
237  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
238  %2 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
239  ret <2 x i64> %2
240}
241
242declare <4 x i32>  @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
243declare <2 x i64>  @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
244
245define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
246;CHECK-LABEL: vqdmlsls16_natural:
247;CHECK: vqdmlsl.s16
248        %tmp1 = load <4 x i32>* %A
249        %tmp2 = load <4 x i16>* %B
250        %tmp3 = load <4 x i16>* %C
251        %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
252        %tmp5 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
253        ret <4 x i32> %tmp5
254}
255
256define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
257;CHECK-LABEL: vqdmlsls32_natural:
258;CHECK: vqdmlsl.s32
259        %tmp1 = load <2 x i64>* %A
260        %tmp2 = load <2 x i32>* %B
261        %tmp3 = load <2 x i32>* %C
262        %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
263        %tmp5 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
264        ret <2 x i64> %tmp5
265}
266
267define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
268entry:
269; CHECK-LABEL: test_vqdmlsl_lanes16_natural:
270; CHECK: vqdmlsl.s16 q0, d2, d3[1]
271  %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
272  %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
273  %2 = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
274  ret <4 x i32> %2
275}
276
277define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
278entry:
279; CHECK-LABEL: test_vqdmlsl_lanes32_natural:
280; CHECK: vqdmlsl.s32 q0, d2, d3[1]
281  %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
282  %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
283  %2 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
284  ret <2 x i64> %2
285}
286
287declare <4 x i32>  @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
288declare <2 x i64>  @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
289