; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
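; This file checks that a mul feeding llvm.vector.reduce.add is lowered to the
; MVE multiply-accumulate reductions VMLAV/VMLALV (and, further below, the
; accumulating VMLAVA/VMLALVA forms). Each test is, roughly, a dot product of
; the following illustrative C shape (not the actual source of the tests):
;   for (i = 0; i < N; i++) sum += (wide)x[i] * (wide)y[i];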
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %m = mul <4 x i32> %x, %y
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  ret i32 %z
}
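; Widening reductions: the products are extended to i64 and accumulated with
; vmlalv, which returns the 64-bit sum in r0/r1.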
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %yy = zext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %yy = sext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}
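; With only two i32 lanes no single vmlalv form is expected here; the product
; is computed with a vmullb long multiply and the two i64 halves are added in
; scalar registers.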
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u32 q2, q0, q1
; CHECK-NEXT:    vmov r0, r1, d5
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %yy = zext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s32 q2, q0, q1
; CHECK-NEXT:    vmov r0, r1, d5
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %yy = sext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}
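; i16 sources: full v8i16 vectors use the .u16/.s16 reduction forms directly,
; while sub-legal vectors (v4i16, and later v8i8 etc.) are first extended in
; the bottom lanes with vmovlb.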
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.u16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.s16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vmlav.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i32>
  %yy = zext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlav.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i32>
  %yy = sext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.u16 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %m = mul <8 x i16> %x, %y
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
  ret i16 %z
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %yy = zext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %yy = sext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_zext(<8 x i16> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8i16_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmlalv.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %yy = zext <8 x i8> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_sext(<8 x i16> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8i16_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmlalv.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %yy = sext <8 x i8> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i64>
  %yy = zext <4 x i16> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlalv.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i64>
  %yy = sext <4 x i16> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %ma = zext <8 x i32> %m to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %ma = sext <8 x i32> %m to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
  ret i64 %z
}
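; Squaring a sign-extended value gives a non-negative product, so the zext of
; the product below can be treated as a sext and the .s16 form still applies.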
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalv.s16 r0, r1, q0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %m = mul <8 x i32> %xx, %xx
  %ma = zext <8 x i32> %m to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
  ret i64 %z
}
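; Two-lane i16 (and later i8) cases are scalarized: the lanes are moved to
; core registers, extended (sxth, or masked with vand for zext), and combined
; with umull/smull plus umlal/smlal.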
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q2, #0xffff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    umlal r0, r1, r3, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i16> %x to <2 x i64>
  %yy = zext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    sxth r1, r1
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    sxth r2, r2
; CHECK-NEXT:    sxth r3, r3
; CHECK-NEXT:    smlal r0, r1, r3, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i16> %x to <2 x i64>
  %yy = sext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}
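; i8 sources follow the same pattern: vmlav.u8/s8 for full v16i8 vectors and
; vmovlb.u8/s8 (or vand masking) extensions for the narrower cases.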
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.u8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i32>
  %yy = zext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i32>
  %yy = sext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vmlav.u16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i32>
  %yy = zext <8 x i8> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmlav.s16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i32>
  %yy = sext <8 x i8> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_zext(<8 x i8> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i8i16_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vmlav.u16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_sext(<8 x i8> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i8i16_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmlav.s16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.u8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %ma = zext <16 x i16> %m to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = sext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %ma = sext <16 x i16> %m to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.s8 r0, q0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %m = mul <16 x i16> %xx, %xx
  %ma = zext <16 x i16> %m to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmlav.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %yy = zext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlav.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %yy = sext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_szext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i32_szext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlav.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %yy = zext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  ret i32 %z
}

define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.u8 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
  ret i16 %z
}

define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.s8 r0, q0, q1
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = sext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
  ret i16 %z
}
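; Mixing a sext operand with a zext operand leaves no single vmlav form, so
; the vectors are spilled to the stack and reloaded half at a time with
; vldrb.s16/vldrb.u16, accumulating with vmlav then vmlava.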
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_szext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    mov r1, sp
; CHECK-NEXT:    vstrw.32 q1, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
; CHECK-NEXT:    vmlav.u16 r2, q1, q0
; CHECK-NEXT:    vldrb.u16 q0, [r0]
; CHECK-NEXT:    vldrb.s16 q1, [r1]
; CHECK-NEXT:    vmlava.u16 r2, q1, q0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
  ret i16 %z
}

define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vmlav.u16 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i16>
  %yy = zext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
  ret i16 %z
}

define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmlav.u16 r0, q0, q1
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i16>
  %yy = sext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
  ret i16 %z
}

define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlav.u8 r0, q0, q1
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %m = mul <16 x i8> %x, %y
  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m)
  ret i8 %z
}
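; i8-to-i64 reductions also need two steps: the inputs are spilled and
; reloaded as two extended v8i16 halves, combined with vmlalv for the first
; half and the accumulating vmlalva for the second.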
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    add r2, sp, #16
; CHECK-NEXT:    mov r3, sp
; CHECK-NEXT:    vstrw.32 q1, [r2]
; CHECK-NEXT:    vstrw.32 q0, [r3]
; CHECK-NEXT:    vldrb.u16 q0, [r2]
; CHECK-NEXT:    vldrb.u16 q1, [r3]
; CHECK-NEXT:    vmlalv.u16 r0, r1, q1, q0
; CHECK-NEXT:    vldrb.u16 q0, [r2, #8]
; CHECK-NEXT:    vldrb.u16 q1, [r3, #8]
; CHECK-NEXT:    vmlalva.u16 r0, r1, q1, q0
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i64>
  %yy = zext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    add r2, sp, #16
; CHECK-NEXT:    mov r3, sp
; CHECK-NEXT:    vstrw.32 q1, [r2]
; CHECK-NEXT:    vstrw.32 q0, [r3]
; CHECK-NEXT:    vldrb.s16 q0, [r2]
; CHECK-NEXT:    vldrb.s16 q1, [r3]
; CHECK-NEXT:    vmlalv.s16 r0, r1, q1, q0
; CHECK-NEXT:    vldrb.s16 q0, [r2, #8]
; CHECK-NEXT:    vldrb.s16 q1, [r3, #8]
; CHECK-NEXT:    vmlalva.s16 r0, r1, q1, q0
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i64>
  %yy = sext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
  ret i64 %z
}
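; When the operands come straight from memory the extending vldrb loads feed
; vmlalv directly and the stack spill above is not needed.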
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext_load(<16 x i8> *%xp, <16 x i8> *%yp) {
; CHECK-LABEL: add_v16i8_v16i64_zext_load:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vldrb.u16 q1, [r0]
; CHECK-NEXT:    vmlalv.u16 r2, r3, q1, q0
; CHECK-NEXT:    vldrb.u16 q0, [r1, #8]
; CHECK-NEXT:    vldrb.u16 q1, [r0, #8]
; CHECK-NEXT:    vmlalva.u16 r2, r3, q1, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    bx lr
entry:
  %x = load <16 x i8>, <16 x i8>* %xp
  %y = load <16 x i8>, <16 x i8>* %yp
  %xx = zext <16 x i8> %x to <16 x i64>
  %yy = zext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext_load(<16 x i8> *%xp, <16 x i8> *%yp) {
; CHECK-LABEL: add_v16i8_v16i64_sext_load:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q0, [r1]
; CHECK-NEXT:    vldrb.s16 q1, [r0]
; CHECK-NEXT:    vmlalv.s16 r2, r3, q1, q0
; CHECK-NEXT:    vldrb.s16 q0, [r1, #8]
; CHECK-NEXT:    vldrb.s16 q1, [r0, #8]
; CHECK-NEXT:    vmlalva.s16 r2, r3, q1, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    bx lr
entry:
  %x = load <16 x i8>, <16 x i8>* %xp
  %y = load <16 x i8>, <16 x i8>* %yp
  %xx = sext <16 x i8> %x to <16 x i64>
  %yy = sext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vmlalv.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i64>
  %yy = zext <8 x i8> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmlalv.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i64>
  %yy = sext <8 x i8> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i64>
  %yy = zext <4 x i8> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlalv.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i64>
  %yy = sext <4 x i8> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_zext(<4 x i8> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i8i16_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i64>
  %yy = zext <4 x i16> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_sext(<4 x i8> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i8i16_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlalv.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i64>
  %yy = sext <4 x i16> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_zext(<4 x i8> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %yy = zext <4 x i16> %y to <4 x i32>
  %mm = mul <4 x i32> %xx, %yy
  %m = zext <4 x i32> %mm to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_sext(<4 x i8> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlalv.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %yy = sext <4 x i16> %y to <4 x i32>
  %mm = mul <4 x i32> %xx, %yy
  %m = sext <4 x i32> %mm to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: add_v2i8_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    orrs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i8> %x to <2 x i64>
  %yy = zext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov r1, s2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    sxtb r0, r0
; CHECK-NEXT:    sxtb r1, r1
; CHECK-NEXT:    smull r0, r1, r1, r0
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smlal r0, r1, r3, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i8> %x to <2 x i64>
  %yy = sext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}
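; There is no MVE vector multiply with 64-bit elements, so the v2i64 case is
; fully scalarized with umull/mla before the final 64-bit add.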
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vmov r0, lr, d3
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    umull r12, r1, r2, r0
; CHECK-NEXT:    mla r1, r2, lr, r1
; CHECK-NEXT:    mla lr, r3, r0, r1
; CHECK-NEXT:    vmov r0, r2, d2
; CHECK-NEXT:    vmov r3, r1, d0
; CHECK-NEXT:    umull r4, r5, r3, r0
; CHECK-NEXT:    mla r2, r3, r2, r5
; CHECK-NEXT:    mla r1, r1, r0, r2
; CHECK-NEXT:    adds.w r0, r4, r12
; CHECK-NEXT:    adc.w r1, r1, lr
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %m = mul <2 x i64> %x, %y
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  ret i64 %z
}
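; The *_acc variants below also take an incoming scalar accumulator; the
; extra add folds into the accumulating vmlava/vmlalva forms. Illustrative C
; shape (again not the tests' actual source):
;   acc += x[0]*y[0] + x[1]*y[1] + ... + x[N-1]*y[N-1];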
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %m = mul <4 x i32> %x, %y
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.u32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %yy = zext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.s32 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %yy = sext <4 x i32> %y to <4 x i64>
  %m = mul <4 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmullb.u32 q2, q0, q1
; CHECK-NEXT:    vmov lr, r12, d5
; CHECK-NEXT:    vmov r3, r2, d4
; CHECK-NEXT:    adds.w r3, r3, lr
; CHECK-NEXT:    adc.w r2, r2, r12
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %yy = zext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmullb.s32 q2, q0, q1
; CHECK-NEXT:    vmov lr, r12, d5
; CHECK-NEXT:    vmov r3, r2, d4
; CHECK-NEXT:    adds.w r3, r3, lr
; CHECK-NEXT:    adc.w r2, r2, r12
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %yy = sext <2 x i32> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.u16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.s16 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q1, q1
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vmlava.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i32>
  %yy = zext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlava.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i32>
  %yy = sext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, i16 %a) {
; CHECK-LABEL: add_v8i16_v8i16_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.u16 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %m = mul <8 x i16> %x, %y
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %yy = zext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %yy = sext <8 x i16> %y to <8 x i64>
  %m = mul <8 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.u16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %ma = zext <8 x i32> %m to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.s16 r0, r1, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %yy = sext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %ma = sext <8 x i32> %m to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlalva.s16 r0, r1, q0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %m = mul <8 x i32> %xx, %xx
  %ma = zext <8 x i32> %m to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i64 q2, #0xffff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r12, s4
; CHECK-NEXT:    umull r2, lr, r3, r2
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    umlal r2, lr, r3, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, lr
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <2 x i16> %x to <2 x i64>
  %yy = zext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    sxth r2, r2
; CHECK-NEXT:    sxth r3, r3
; CHECK-NEXT:    smull r2, r12, r3, r2
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    sxth.w lr, r3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    sxth r3, r3
; CHECK-NEXT:    smlal r2, r12, r3, lr
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = sext <2 x i16> %x to <2 x i64>
  %yy = sext <2 x i16> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.u8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i32>
  %yy = zext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i32>
  %yy = sext <16 x i8> %y to <16 x i32>
  %m = mul <16 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.u8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %ma = zext <16 x i16> %m to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.s8 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = sext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %ma = sext <16 x i16> %m to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.s8 r0, q0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %m = mul <16 x i16> %xx, %xx
  %ma = zext <16 x i16> %m to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmlava.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %yy = zext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmlava.u32 r0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %yy = sext <4 x i8> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.u8 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %yy = zext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.s8 r0, q0, q1
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %yy = sext <16 x i8> %y to <16 x i16>
  %m = mul <16 x i16> %xx, %yy
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vmlava.u16 r0, q0, q1
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i16>
  %yy = zext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmlava.u16 r0, q0, q1
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i16>
  %yy = sext <8 x i8> %y to <8 x i16>
  %m = mul <8 x i16> %xx, %yy
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, i8 %a) {
; CHECK-LABEL: add_v16i8_v16i8_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmlava.u8 r0, q0, q1
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %m = mul <16 x i8> %x, %y
  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m)
  %r = add i8 %z, %a
  ret i8 %r
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    add r2, sp, #16
; CHECK-NEXT:    mov r3, sp
; CHECK-NEXT:    vstrw.32 q1, [r2]
; CHECK-NEXT:    vstrw.32 q0, [r3]
; CHECK-NEXT:    vldrb.u16 q0, [r2]
; CHECK-NEXT:    vldrb.u16 q1, [r3]
; CHECK-NEXT:    vmlalva.u16 r0, r1, q1, q0
; CHECK-NEXT:    vldrb.u16 q0, [r2, #8]
; CHECK-NEXT:    vldrb.u16 q1, [r3, #8]
; CHECK-NEXT:    vmlalva.u16 r0, r1, q1, q0
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i64>
  %yy = zext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    add r2, sp, #16
; CHECK-NEXT:    mov r3, sp
; CHECK-NEXT:    vstrw.32 q1, [r2]
; CHECK-NEXT:    vstrw.32 q0, [r3]
; CHECK-NEXT:    vldrb.s16 q0, [r2]
; CHECK-NEXT:    vldrb.s16 q1, [r3]
; CHECK-NEXT:    vmlalva.s16 r0, r1, q1, q0
; CHECK-NEXT:    vldrb.s16 q0, [r2, #8]
; CHECK-NEXT:    vldrb.s16 q1, [r3, #8]
; CHECK-NEXT:    vmlalva.s16 r0, r1, q1, q0
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i64>
  %yy = sext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext_load(<16 x i8> *%xp, <16 x i8> *%yp, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext_load:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vldrb.u16 q1, [r0]
; CHECK-NEXT:    vmlalva.u16 r2, r3, q1, q0
; CHECK-NEXT:    vldrb.u16 q0, [r1, #8]
; CHECK-NEXT:    vldrb.u16 q1, [r0, #8]
; CHECK-NEXT:    vmlalva.u16 r2, r3, q1, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    bx lr
entry:
  %x = load <16 x i8>, <16 x i8>* %xp
  %y = load <16 x i8>, <16 x i8>* %yp
  %xx = zext <16 x i8> %x to <16 x i64>
  %yy = zext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext_load(<16 x i8> *%xp, <16 x i8> *%yp, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext_load:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.s16 q0, [r1]
; CHECK-NEXT:    vldrb.s16 q1, [r0]
; CHECK-NEXT:    vmlalva.s16 r2, r3, q1, q0
; CHECK-NEXT:    vldrb.s16 q0, [r1, #8]
; CHECK-NEXT:    vldrb.s16 q1, [r0, #8]
; CHECK-NEXT:    vmlalva.s16 r2, r3, q1, q0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    bx lr
entry:
  %x = load <16 x i8>, <16 x i8>* %xp
  %y = load <16 x i8>, <16 x i8>* %yp
  %xx = sext <16 x i8> %x to <16 x i64>
  %yy = sext <16 x i8> %y to <16 x i64>
  %m = mul <16 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i64 q2, #0xff
; CHECK-NEXT:    vand q1, q1, q2
; CHECK-NEXT:    vand q0, q0, q2
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    umull r12, lr, r3, r2
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    umull r2, r3, r3, r2
; CHECK-NEXT:    add r2, r12
; CHECK-NEXT:    orr.w r3, r3, lr
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <2 x i8> %x to <2 x i64>
  %yy = zext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smull r2, r12, r3, r2
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    sxtb.w lr, r3
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    smlal r2, r12, r3, lr
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = sext <2 x i8> %x to <2 x i64>
  %yy = sext <2 x i8> %y to <2 x i64>
  %m = mul <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT:    vmov r2, r12, d3
; CHECK-NEXT:    vmov r3, lr, d1
; CHECK-NEXT:    vmov r4, r6, d0
; CHECK-NEXT:    umull r8, r5, r3, r2
; CHECK-NEXT:    mla r3, r3, r12, r5
; CHECK-NEXT:    mla r12, lr, r2, r3
; CHECK-NEXT:    vmov r3, r5, d2
; CHECK-NEXT:    umull r7, r2, r4, r3
; CHECK-NEXT:    mla r2, r4, r5, r2
; CHECK-NEXT:    mla r2, r6, r3, r2
; CHECK-NEXT:    adds.w r3, r7, r8
; CHECK-NEXT:    adc.w r2, r2, r12
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
entry:
  %m = mul <2 x i64> %x, %y
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
  %r = add i64 %z, %a
  ret i64 %r
}

declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)