; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s

define <8 x i16> @smull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: smull_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = sext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @smull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: smull_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = sext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @smull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: smull_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = sext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <8 x i16> @umull_v8i8_v8i16(<8 x i8>* %A, <8 x i8>* %B) nounwind {
; CHECK-LABEL: umull_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i8>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = mul <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

define <4 x i32> @umull_v4i16_v4i32(<4 x i16>* %A, <4 x i16>* %B) nounwind {
; CHECK-LABEL: umull_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i16>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = mul <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

define <2 x i64> @umull_v2i32_v2i64(<2 x i32>* %A, <2 x i32>* %B) nounwind {
; CHECK-LABEL: umull_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i32>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = mul <2 x i64> %tmp3, %tmp4
  ret <2 x i64> %tmp5
}

define <8 x i16> @smlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlal_v8i8_v8i16:
; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = load <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlal_v4i16_v4i32:
; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = load <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlal_v2i32_v2i64:
; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = load <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlal_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlal_v8i8_v8i16:
; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = load <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = add <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlal_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlal_v4i16_v4i32:
; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = load <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = add <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlal_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlal_v2i32_v2i64:
; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = load <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = add <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @smlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: smlsl_v8i8_v8i16:
; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = load <8 x i8>* %C
  %tmp4 = sext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = sext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @smlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: smlsl_v4i16_v4i32:
; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = load <4 x i16>* %C
  %tmp4 = sext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = sext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @smlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: smlsl_v2i32_v2i64:
; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = load <2 x i32>* %C
  %tmp4 = sext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = sext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

define <8 x i16> @umlsl_v8i8_v8i16(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
; CHECK-LABEL: umlsl_v8i8_v8i16:
; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp1 = load <8 x i16>* %A
  %tmp2 = load <8 x i8>* %B
  %tmp3 = load <8 x i8>* %C
  %tmp4 = zext <8 x i8> %tmp2 to <8 x i16>
  %tmp5 = zext <8 x i8> %tmp3 to <8 x i16>
  %tmp6 = mul <8 x i16> %tmp4, %tmp5
  %tmp7 = sub <8 x i16> %tmp1, %tmp6
  ret <8 x i16> %tmp7
}

define <4 x i32> @umlsl_v4i16_v4i32(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
; CHECK-LABEL: umlsl_v4i16_v4i32:
; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp1 = load <4 x i32>* %A
  %tmp2 = load <4 x i16>* %B
  %tmp3 = load <4 x i16>* %C
  %tmp4 = zext <4 x i16> %tmp2 to <4 x i32>
  %tmp5 = zext <4 x i16> %tmp3 to <4 x i32>
  %tmp6 = mul <4 x i32> %tmp4, %tmp5
  %tmp7 = sub <4 x i32> %tmp1, %tmp6
  ret <4 x i32> %tmp7
}

define <2 x i64> @umlsl_v2i32_v2i64(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
; CHECK-LABEL: umlsl_v2i32_v2i64:
; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp1 = load <2 x i64>* %A
  %tmp2 = load <2 x i32>* %B
  %tmp3 = load <2 x i32>* %C
  %tmp4 = zext <2 x i32> %tmp2 to <2 x i64>
  %tmp5 = zext <2 x i32> %tmp3 to <2 x i64>
  %tmp6 = mul <2 x i64> %tmp4, %tmp5
  %tmp7 = sub <2 x i64> %tmp1, %tmp6
  ret <2 x i64> %tmp7
}

; Test that SMULL/UMULL is formed when one operand is a BUILD_VECTOR whose
; elements are sign/zero-extended values of the narrow element type.
define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use SMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
; CHECK: movz
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
  ret <8 x i16> %tmp4
}
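
; A hedged companion sketch (not part of the original coverage; the function
; name is hypothetical): -128 is the most negative value still representable
; as a sign-extended i8, so SMULL is expected to still be formed here.
define <8 x i16> @smull_extvec_boundary_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: smull_extvec_boundary_v8i8_v8i16:
; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = sext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
  ret <8 x i16> %tmp4
}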

define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v4i16_v4i32:
; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = sext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
  ret <4 x i32> %tmp4
}

define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: smull_extvec_v2i32_v2i64:
; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = sext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
  ret <2 x i64> %tmp4
}

define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
  ret <8 x i16> %tmp4
}

define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
; Do not use UMULL if the BUILD_VECTOR element values are too big.
; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
; CHECK: movz
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
  ret <8 x i16> %tmp4
}
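
; A hedged companion sketch (not part of the original coverage; the function
; name is hypothetical): 255 is the largest value still representable as a
; zero-extended i8, so UMULL is expected to still be formed here.
define <8 x i16> @umull_extvec_boundary_v8i8_v8i16(<8 x i8> %arg) nounwind {
; CHECK-LABEL: umull_extvec_boundary_v8i8_v8i16:
; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  %tmp3 = zext <8 x i8> %arg to <8 x i16>
  %tmp4 = mul <8 x i16> %tmp3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %tmp4
}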

define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v4i16_v4i32:
; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
  %tmp3 = zext <4 x i16> %arg to <4 x i32>
  %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
  ret <4 x i32> %tmp4
}

define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
; CHECK-LABEL: umull_extvec_v2i32_v2i64:
; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
  %tmp3 = zext <2 x i32> %arg to <2 x i64>
  %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
  ret <2 x i64> %tmp4
}

define i16 @smullWithInconsistentExtensions(<8 x i8> %vec) {
; If one operand is sign-extended and the other is effectively zero-extended
; (the splat of 255 is only representable as a zero-extended i8), smull cannot
; be used and a plain widened mul is emitted instead.
; CHECK-LABEL: smullWithInconsistentExtensions:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %1 = sext <8 x i8> %vec to <8 x i16>
  %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = extractelement <8 x i16> %2, i32 0
  ret i16 %3
}
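
; A hedged companion sketch (not part of the original coverage; the function
; name is hypothetical): the same failure mode with two explicit, mismatched
; extends, so a plain widened mul is expected rather than smull or umull.
define <8 x i16> @mullWithMixedExtensions(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: mullWithMixedExtensions:
; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
  %1 = sext <8 x i8> %a to <8 x i16>
  %2 = zext <8 x i8> %b to <8 x i16>
  %3 = mul <8 x i16> %1, %2
  ret <8 x i16> %3
}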

define void @distribute(i16* %dst, i8* %src, i32 %mul) nounwind {
entry:
; CHECK-LABEL: distribute:
; CHECK: umull [[REG1:(v[0-9]+.8h)]], {{v[0-9]+}}.8b, [[REG2:(v[0-9]+.8b)]]
; CHECK: umlal [[REG1]], {{v[0-9]+}}.8b, [[REG2]]
; The multiply below has the form (%7 + %11) * %8 with every operand
; zero-extended from i8, so it can be distributed into umull + umlal.
  %0 = trunc i32 %mul to i8
  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
  %3 = tail call <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8* %src, i32 1)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  %5 = extractelement <2 x double> %4, i32 1
  %6 = bitcast double %5 to <8 x i8>
  %7 = zext <8 x i8> %6 to <8 x i16>
  %8 = zext <8 x i8> %2 to <8 x i16>
  %9 = extractelement <2 x double> %4, i32 0
  %10 = bitcast double %9 to <8 x i8>
  %11 = zext <8 x i8> %10 to <8 x i16>
  %12 = add <8 x i16> %7, %11
  %13 = mul <8 x i16> %12, %8
  %14 = bitcast i16* %dst to i8*
  tail call void @llvm.aarch64.neon.vst1.v8i16(i8* %14, <8 x i16> %13, i32 2)
  ret void
}
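
; A hedged, intrinsic-free sketch of the same distribution idea (the function
; name and the CHECK expectations are assumptions, not original coverage):
; with every operand zero-extended from i8, (a + b) * c is expected to be
; distributed into a umull followed by a umlal.
define <8 x i16> @distribute_sketch(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind {
; CHECK-LABEL: distribute_sketch:
; CHECK: umull
; CHECK: umlal
  %a.wide = zext <8 x i8> %a to <8 x i16>
  %b.wide = zext <8 x i8> %b to <8 x i16>
  %c.wide = zext <8 x i8> %c to <8 x i16>
  %sum = add <8 x i16> %a.wide, %b.wide
  %res = mul <8 x i16> %sum, %c.wide
  ret <8 x i16> %res
}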

declare <16 x i8> @llvm.aarch64.neon.vld1.v16i8(i8*, i32) nounwind readonly

declare void @llvm.aarch64.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind