1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=armv7a-eabi -mattr=+neon -float-abi=hard %s -o - | FileCheck %s
3
; Multiply-accumulate on <8 x i8>: %A + (%B * %C). The assertions below expect
; the mul/add pair to be selected as one vmla.i8 on D registers.
define <8 x i8> @vmlai8(<8 x i8> %A, <8 x i8> %B, <8 x i8>  %C) nounwind {
; CHECK-LABEL: vmlai8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i8 d0, d1, d2
; CHECK-NEXT:    bx lr
  %prod = mul <8 x i8> %B, %C
  %acc = add <8 x i8> %A, %prod
  ret <8 x i8> %acc
}
13
; Multiply-accumulate on <4 x i16>: %A + (%B * %C). The assertions below expect
; the mul/add pair to be selected as one vmla.i16 on D registers.
define <4 x i16> @vmlai16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: vmlai16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i16 d0, d1, d2
; CHECK-NEXT:    bx lr
  %prod = mul <4 x i16> %B, %C
  %acc = add <4 x i16> %A, %prod
  ret <4 x i16> %acc
}
23
; Multiply-accumulate on <2 x i32>: %A + (%B * %C). The assertions below expect
; the mul/add pair to be selected as one vmla.i32 on D registers.
define <2 x i32> @vmlai32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: vmlai32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i32 d0, d1, d2
; CHECK-NEXT:    bx lr
  %prod = mul <2 x i32> %B, %C
  %acc = add <2 x i32> %A, %prod
  ret <2 x i32> %acc
}
33
; Floating-point multiply-accumulate on <2 x float>: %A + (%B * %C), written as
; separate fmul and fadd. The assertions below expect one vmla.f32 on D
; registers.
define <2 x float> @vmlaf32(<2 x float> %A, <2 x float> %B, <2 x float> %C) nounwind {
; CHECK-LABEL: vmlaf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.f32 d0, d1, d2
; CHECK-NEXT:    bx lr
  %prod = fmul <2 x float> %B, %C
  %acc = fadd <2 x float> %A, %prod
  ret <2 x float> %acc
}
43
; Multiply-accumulate on <16 x i8>: %A + (%B * %C). The assertions below expect
; the mul/add pair to be selected as one vmla.i8 on Q registers.
define <16 x i8> @vmlaQi8(<16 x i8> %A, <16 x i8> %B, <16 x i8>  %C) nounwind {
; CHECK-LABEL: vmlaQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i8 q0, q1, q2
; CHECK-NEXT:    bx lr
  %prod = mul <16 x i8> %B, %C
  %acc = add <16 x i8> %A, %prod
  ret <16 x i8> %acc
}
53
; Multiply-accumulate on <8 x i16>: %A + (%B * %C). The assertions below expect
; the mul/add pair to be selected as one vmla.i16 on Q registers.
define <8 x i16> @vmlaQi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) nounwind {
; CHECK-LABEL: vmlaQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i16 q0, q1, q2
; CHECK-NEXT:    bx lr
  %prod = mul <8 x i16> %B, %C
  %acc = add <8 x i16> %A, %prod
  ret <8 x i16> %acc
}
63
; Multiply-accumulate on <4 x i32>: %A + (%B * %C). The assertions below expect
; the mul/add pair to be selected as one vmla.i32 on Q registers.
define <4 x i32> @vmlaQi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) nounwind {
; CHECK-LABEL: vmlaQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.i32 q0, q1, q2
; CHECK-NEXT:    bx lr
  %prod = mul <4 x i32> %B, %C
  %acc = add <4 x i32> %A, %prod
  ret <4 x i32> %acc
}
73
; Floating-point multiply-accumulate on <4 x float>: %A + (%B * %C), written as
; separate fmul and fadd. The assertions below expect one vmla.f32 on Q
; registers.
define <4 x float> @vmlaQf32(<4 x float> %A, <4 x float> %B, <4 x float> %C) nounwind {
; CHECK-LABEL: vmlaQf32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmla.f32 q0, q1, q2
; CHECK-NEXT:    bx lr
  %prod = fmul <4 x float> %B, %C
  %acc = fadd <4 x float> %A, %prod
  ret <4 x float> %acc
}
83
; Signed widening multiply-accumulate: both <8 x i8> inputs are sign-extended
; to i16, multiplied, and added to %A. The assertions below expect a single
; vmlal.s8.
define <8 x i16> @vmlals8(<8 x i16> %A, <8 x i8> %B, <8 x i8> %C) nounwind {
; CHECK-LABEL: vmlals8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.s8 q0, d2, d3
; CHECK-NEXT:    bx lr
  %lhs.wide = sext <8 x i8> %B to <8 x i16>
  %rhs.wide = sext <8 x i8> %C to <8 x i16>
  %prod = mul <8 x i16> %lhs.wide, %rhs.wide
  %acc = add <8 x i16> %A, %prod
  ret <8 x i16> %acc
}
95
; Signed widening multiply-accumulate: both <4 x i16> inputs are sign-extended
; to i32, multiplied, and added to %A. The assertions below expect a single
; vmlal.s16.
define <4 x i32> @vmlals16(<4 x i32> %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: vmlals16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.s16 q0, d2, d3
; CHECK-NEXT:    bx lr
  %lhs.wide = sext <4 x i16> %B to <4 x i32>
  %rhs.wide = sext <4 x i16> %C to <4 x i32>
  %prod = mul <4 x i32> %lhs.wide, %rhs.wide
  %acc = add <4 x i32> %A, %prod
  ret <4 x i32> %acc
}
107
; Signed widening multiply-accumulate: both <2 x i32> inputs are sign-extended
; to i64, multiplied, and added to %A. The assertions below expect a single
; vmlal.s32.
define <2 x i64> @vmlals32(<2 x i64> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: vmlals32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.s32 q0, d2, d3
; CHECK-NEXT:    bx lr
  %lhs.wide = sext <2 x i32> %B to <2 x i64>
  %rhs.wide = sext <2 x i32> %C to <2 x i64>
  %prod = mul <2 x i64> %lhs.wide, %rhs.wide
  %acc = add <2 x i64> %A, %prod
  ret <2 x i64> %acc
}
119
; Unsigned widening multiply-accumulate: both <8 x i8> inputs are zero-extended
; to i16, multiplied, and added to %A. The assertions below expect a single
; vmlal.u8.
define <8 x i16> @vmlalu8(<8 x i16> %A, <8 x i8> %B, <8 x i8> %C) nounwind {
; CHECK-LABEL: vmlalu8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u8 q0, d2, d3
; CHECK-NEXT:    bx lr
  %lhs.wide = zext <8 x i8> %B to <8 x i16>
  %rhs.wide = zext <8 x i8> %C to <8 x i16>
  %prod = mul <8 x i16> %lhs.wide, %rhs.wide
  %acc = add <8 x i16> %A, %prod
  ret <8 x i16> %acc
}
131
; Unsigned widening multiply-accumulate: both <4 x i16> inputs are
; zero-extended to i32, multiplied, and added to %A. The assertions below
; expect a single vmlal.u16.
define <4 x i32> @vmlalu16(<4 x i32> %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: vmlalu16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u16 q0, d2, d3
; CHECK-NEXT:    bx lr
  %lhs.wide = zext <4 x i16> %B to <4 x i32>
  %rhs.wide = zext <4 x i16> %C to <4 x i32>
  %prod = mul <4 x i32> %lhs.wide, %rhs.wide
  %acc = add <4 x i32> %A, %prod
  ret <4 x i32> %acc
}
143
; Unsigned widening multiply-accumulate: both <2 x i32> inputs are
; zero-extended to i64, multiplied, and added to %A. The assertions below
; expect a single vmlal.u32.
define <2 x i64> @vmlalu32(<2 x i64> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: vmlalu32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u32 q0, d2, d3
; CHECK-NEXT:    bx lr
  %lhs.wide = zext <2 x i32> %B to <2 x i64>
  %rhs.wide = zext <2 x i32> %C to <2 x i64>
  %prod = mul <2 x i64> %lhs.wide, %rhs.wide
  %acc = add <2 x i64> %A, %prod
  ret <2 x i64> %acc
}
155
; Unsigned widening multiply-accumulate whose result is then masked to the low
; 8 bits of every i16 lane. The assertions below expect vmlal.u8 to still be
; formed, followed by a vbic.i16 that clears the high byte of each lane.
define <8 x i16> @vmlala8(<8 x i16> %A, <8 x i8> %B, <8 x i8> %C) nounwind {
; CHECK-LABEL: vmlala8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u8 q0, d2, d3
; CHECK-NEXT:    vbic.i16 q0, #0xff00
; CHECK-NEXT:    bx lr
  %lhs.wide = zext <8 x i8> %B to <8 x i16>
  %rhs.wide = zext <8 x i8> %C to <8 x i16>
  %prod = mul <8 x i16> %lhs.wide, %rhs.wide
  %acc = add <8 x i16> %A, %prod
  %masked = and <8 x i16> %acc, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  ret <8 x i16> %masked
}
169
; Unsigned widening multiply-accumulate whose result is then masked to the low
; 16 bits of every i32 lane. The assertions below expect vmlal.u16 to still be
; formed, followed by a materialized 0xffff mask and a vand.
define <4 x i32> @vmlala16(<4 x i32> %A, <4 x i16> %B, <4 x i16> %C) nounwind {
; CHECK-LABEL: vmlala16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u16 q0, d2, d3
; CHECK-NEXT:    vmov.i32 q8, #0xffff
; CHECK-NEXT:    vand q0, q0, q8
; CHECK-NEXT:    bx lr
  %lhs.wide = zext <4 x i16> %B to <4 x i32>
  %rhs.wide = zext <4 x i16> %C to <4 x i32>
  %prod = mul <4 x i32> %lhs.wide, %rhs.wide
  %acc = add <4 x i32> %A, %prod
  %masked = and <4 x i32> %acc, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %masked
}
184
; Unsigned widening multiply-accumulate whose result is then masked to the low
; 32 bits of every i64 lane. The assertions below expect vmlal.u32 to still be
; formed, followed by a materialized 0xffffffff mask and a vand.
define <2 x i64> @vmlala32(<2 x i64> %A, <2 x i32> %B, <2 x i32> %C) nounwind {
; CHECK-LABEL: vmlala32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vmlal.u32 q0, d2, d3
; CHECK-NEXT:    vmov.i64 q8, #0xffffffff
; CHECK-NEXT:    vand q0, q0, q8
; CHECK-NEXT:    bx lr
  %lhs.wide = zext <2 x i32> %B to <2 x i64>
  %rhs.wide = zext <2 x i32> %C to <2 x i64>
  %prod = mul <2 x i64> %lhs.wide, %rhs.wide
  %acc = add <2 x i64> %A, %prod
  %masked = and <2 x i64> %acc, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %masked
}
199
; Signed widening multiply-accumulate by lane: lane 1 of %arg2_int16x4_t is
; broadcast with a shufflevector, both multiplicands are sign-extended to i32,
; and the product is added to %arg0_int32x4_t. The assertions below expect the
; by-lane form vmlal.s16 ..., d3[1].
; The shuffle mask only selects from the first operand, so the unused second
; operand is spelled poison instead of the deprecated undef.
define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_lanes16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.s16 q0, d2, d3[1]
; CHECK-NEXT:    bx lr
entry:
  %lane = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %lhs.wide = sext <4 x i16> %arg1_int16x4_t to <4 x i32>
  %lane.wide = sext <4 x i16> %lane to <4 x i32>
  %prod = mul <4 x i32> %lhs.wide, %lane.wide
  %acc = add <4 x i32> %arg0_int32x4_t, %prod
  ret <4 x i32> %acc
}
213
; Signed widening multiply-accumulate by lane: lane 1 of %arg2_int32x2_t is
; broadcast with a shufflevector, both multiplicands are sign-extended to i64,
; and the product is added to %arg0_int64x2_t. The assertions below expect the
; by-lane form vmlal.s32 ..., d3[1].
; The shuffle mask only selects from the first operand, so the unused second
; operand is spelled poison instead of the deprecated undef.
define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_lanes32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.s32 q0, d2, d3[1]
; CHECK-NEXT:    bx lr
entry:
  %lane = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %lhs.wide = sext <2 x i32> %arg1_int32x2_t to <2 x i64>
  %lane.wide = sext <2 x i32> %lane to <2 x i64>
  %prod = mul <2 x i64> %lhs.wide, %lane.wide
  %acc = add <2 x i64> %arg0_int64x2_t, %prod
  ret <2 x i64> %acc
}
227
; Unsigned widening multiply-accumulate by lane: lane 1 of %arg2_uint16x4_t is
; broadcast with a shufflevector, both multiplicands are zero-extended to i32,
; and the product is added to %arg0_uint32x4_t. The assertions below expect the
; by-lane form vmlal.u16 ..., d3[1].
; The shuffle mask only selects from the first operand, so the unused second
; operand is spelled poison instead of the deprecated undef.
define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_laneu16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.u16 q0, d2, d3[1]
; CHECK-NEXT:    bx lr
entry:
  %lane = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %lhs.wide = zext <4 x i16> %arg1_uint16x4_t to <4 x i32>
  %lane.wide = zext <4 x i16> %lane to <4 x i32>
  %prod = mul <4 x i32> %lhs.wide, %lane.wide
  %acc = add <4 x i32> %arg0_uint32x4_t, %prod
  ret <4 x i32> %acc
}
241
; Unsigned widening multiply-accumulate by lane: lane 1 of %arg2_uint32x2_t is
; broadcast with a shufflevector, both multiplicands are zero-extended to i64,
; and the product is added to %arg0_uint64x2_t. The assertions below expect the
; by-lane form vmlal.u32 ..., d3[1].
; The shuffle mask only selects from the first operand, so the unused second
; operand is spelled poison instead of the deprecated undef.
define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_laneu32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.u32 q0, d2, d3[1]
; CHECK-NEXT:    bx lr
entry:
  %lane = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %lhs.wide = zext <2 x i32> %arg1_uint32x2_t to <2 x i64>
  %lane.wide = zext <2 x i32> %lane to <2 x i64>
  %prod = mul <2 x i64> %lhs.wide, %lane.wide
  %acc = add <2 x i64> %arg0_uint64x2_t, %prod
  ret <2 x i64> %acc
}
255
; Unsigned widening multiply-accumulate by lane, with the result masked to the
; low 16 bits of every i32 lane. Lane 1 of %arg2_uint16x4_t is broadcast, both
; multiplicands are zero-extended, and the product is added to
; %arg0_uint32x4_t before the mask. The assertions below expect the by-lane
; vmlal.u16 to still be formed, followed by a 0xffff mask and a vand.
; The shuffle mask only selects from the first operand, so the unused second
; operand is spelled poison instead of the deprecated undef.
define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanea16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_lanea16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.u16 q0, d2, d3[1]
; CHECK-NEXT:    vmov.i32 q8, #0xffff
; CHECK-NEXT:    vand q0, q0, q8
; CHECK-NEXT:    bx lr
entry:
  %lane = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %lhs.wide = zext <4 x i16> %arg1_uint16x4_t to <4 x i32>
  %lane.wide = zext <4 x i16> %lane to <4 x i32>
  %prod = mul <4 x i32> %lhs.wide, %lane.wide
  %acc = add <4 x i32> %arg0_uint32x4_t, %prod
  %and = and <4 x i32> %acc, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %and
}
272
; Unsigned widening multiply-accumulate by lane, with the result masked to the
; low 32 bits of every i64 lane. Lane 1 of %arg2_uint32x2_t is broadcast, both
; multiplicands are zero-extended, and the product is added to
; %arg0_uint64x2_t before the mask. The assertions below expect the by-lane
; vmlal.u32 to still be formed, followed by a 0xffffffff mask and a vand.
; The shuffle mask only selects from the first operand, so the unused second
; operand is spelled poison instead of the deprecated undef.
define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanea32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone {
; CHECK-LABEL: test_vmlal_lanea32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlal.u32 q0, d2, d3[1]
; CHECK-NEXT:    vmov.i64 q8, #0xffffffff
; CHECK-NEXT:    vand q0, q0, q8
; CHECK-NEXT:    bx lr
entry:
  %lane = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> poison, <2 x i32> <i32 1, i32 1>
  %lhs.wide = zext <2 x i32> %arg1_uint32x2_t to <2 x i64>
  %lane.wide = zext <2 x i32> %lane to <2 x i64>
  %prod = mul <2 x i64> %lhs.wide, %lane.wide
  %acc = add <2 x i64> %arg0_uint64x2_t, %prod
  %and = and <2 x i64> %acc, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %and
}
289