1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
3
; Various reductions generated from SLP vectorizing unrolled loops. Generated
5; from https://godbolt.org/z/ebxdPh1Kz with some less interesting cases removed.
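; Each addv<N>i<R>i<E> function below sums N elements of type i<E> into an
; i<R> result. For illustration, addv8i32i32 roughly corresponds to an unrolled
; loop such as the following C sketch (an assumed shape for illustration, not
; necessarily the exact source behind the godbolt link):
;
;   int addv8i32i32(int *x) {
;     int s = 0;
;     for (int i = 0; i < 8; i++)
;       s += x[i];
;     return s;
;   }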
6
7define i32 @addv2i32i32(i32* %x) {
8; CHECK-LABEL: addv2i32i32:
9; CHECK:       @ %bb.0: @ %entry
10; CHECK-NEXT:    ldrd r1, r0, [r0]
11; CHECK-NEXT:    add r0, r1
12; CHECK-NEXT:    bx lr
13entry:
14  %0 = load i32, i32* %x, align 4
15  %arrayidx.1 = getelementptr inbounds i32, i32* %x, i32 1
16  %1 = load i32, i32* %arrayidx.1, align 4
17  %add.1 = add nsw i32 %1, %0
18  ret i32 %add.1
19}
20
21define i32 @addv4i32i32(i32* %x) {
22; CHECK-LABEL: addv4i32i32:
23; CHECK:       @ %bb.0: @ %entry
24; CHECK-NEXT:    vldrw.u32 q0, [r0]
25; CHECK-NEXT:    vaddv.u32 r0, q0
26; CHECK-NEXT:    bx lr
27entry:
28  %0 = bitcast i32* %x to <4 x i32>*
29  %1 = load <4 x i32>, <4 x i32>* %0, align 4
30  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
31  ret i32 %2
32}
33
34define i32 @addv8i32i32(i32* %x) {
35; CHECK-LABEL: addv8i32i32:
36; CHECK:       @ %bb.0: @ %entry
37; CHECK-NEXT:    vldrw.u32 q1, [r0]
38; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
39; CHECK-NEXT:    vaddv.u32 r0, q1
40; CHECK-NEXT:    vaddva.u32 r0, q0
41; CHECK-NEXT:    bx lr
42entry:
43  %0 = bitcast i32* %x to <8 x i32>*
44  %1 = load <8 x i32>, <8 x i32>* %0, align 4
45  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
46  ret i32 %2
47}
48
49define i32 @addv16i32i32(i32* %x) {
50; CHECK-LABEL: addv16i32i32:
51; CHECK:       @ %bb.0: @ %entry
52; CHECK-NEXT:    vldrw.u32 q1, [r0]
53; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
54; CHECK-NEXT:    vaddv.u32 r2, q1
55; CHECK-NEXT:    vaddva.u32 r2, q0
56; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
57; CHECK-NEXT:    vaddva.u32 r2, q0
58; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
59; CHECK-NEXT:    vaddva.u32 r2, q0
60; CHECK-NEXT:    mov r0, r2
61; CHECK-NEXT:    bx lr
62entry:
63  %0 = bitcast i32* %x to <16 x i32>*
64  %1 = load <16 x i32>, <16 x i32>* %0, align 4
65  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
66  ret i32 %2
67}
68
69define i32 @addv24i32i32(i32* %x) {
70; CHECK-LABEL: addv24i32i32:
71; CHECK:       @ %bb.0: @ %entry
72; CHECK-NEXT:    vldrw.u32 q1, [r0]
73; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
74; CHECK-NEXT:    vaddv.u32 r2, q1
75; CHECK-NEXT:    vaddva.u32 r2, q0
76; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
77; CHECK-NEXT:    vaddva.u32 r2, q0
78; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
79; CHECK-NEXT:    vaddva.u32 r2, q0
80; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
81; CHECK-NEXT:    vaddva.u32 r2, q0
82; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
83; CHECK-NEXT:    vaddva.u32 r2, q0
84; CHECK-NEXT:    mov r0, r2
85; CHECK-NEXT:    bx lr
86entry:
87  %0 = bitcast i32* %x to <8 x i32>*
88  %1 = load <8 x i32>, <8 x i32>* %0, align 4
89  %arrayidx.8 = getelementptr inbounds i32, i32* %x, i32 8
90  %2 = bitcast i32* %arrayidx.8 to <16 x i32>*
91  %3 = load <16 x i32>, <16 x i32>* %2, align 4
92  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
93  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
94  %op.rdx = add nsw i32 %4, %5
95  ret i32 %op.rdx
96}
97
98define i32 @addv32i32i32(i32* %x) {
99; CHECK-LABEL: addv32i32i32:
100; CHECK:       @ %bb.0: @ %entry
101; CHECK-NEXT:    vldrw.u32 q1, [r0]
102; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
103; CHECK-NEXT:    mov r1, r0
104; CHECK-NEXT:    vaddv.u32 r0, q1
105; CHECK-NEXT:    vaddva.u32 r0, q0
106; CHECK-NEXT:    vldrw.u32 q0, [r1, #32]
107; CHECK-NEXT:    vaddva.u32 r0, q0
108; CHECK-NEXT:    vldrw.u32 q0, [r1, #48]
109; CHECK-NEXT:    vaddva.u32 r0, q0
110; CHECK-NEXT:    vldrw.u32 q0, [r1, #64]
111; CHECK-NEXT:    vaddva.u32 r0, q0
112; CHECK-NEXT:    vldrw.u32 q0, [r1, #80]
113; CHECK-NEXT:    vaddva.u32 r0, q0
114; CHECK-NEXT:    vldrw.u32 q0, [r1, #96]
115; CHECK-NEXT:    vaddva.u32 r0, q0
116; CHECK-NEXT:    vldrw.u32 q0, [r1, #112]
117; CHECK-NEXT:    vaddva.u32 r0, q0
118; CHECK-NEXT:    bx lr
119entry:
120  %0 = bitcast i32* %x to <32 x i32>*
121  %1 = load <32 x i32>, <32 x i32>* %0, align 4
122  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
123  ret i32 %2
124}
125
126define i32 @addv64i32i32(i32* %x) {
127; CHECK-LABEL: addv64i32i32:
128; CHECK:       @ %bb.0: @ %entry
129; CHECK-NEXT:    vldrw.u32 q1, [r0]
130; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
131; CHECK-NEXT:    vaddv.u32 r2, q1
132; CHECK-NEXT:    vaddva.u32 r2, q0
133; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
134; CHECK-NEXT:    vaddva.u32 r2, q0
135; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
136; CHECK-NEXT:    vaddva.u32 r2, q0
137; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
138; CHECK-NEXT:    vaddva.u32 r2, q0
139; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
140; CHECK-NEXT:    vaddva.u32 r2, q0
141; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
142; CHECK-NEXT:    vaddva.u32 r2, q0
143; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
144; CHECK-NEXT:    vaddva.u32 r2, q0
145; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
146; CHECK-NEXT:    vaddva.u32 r2, q0
147; CHECK-NEXT:    vldrw.u32 q0, [r0, #144]
148; CHECK-NEXT:    vaddva.u32 r2, q0
149; CHECK-NEXT:    vldrw.u32 q0, [r0, #160]
150; CHECK-NEXT:    vaddva.u32 r2, q0
151; CHECK-NEXT:    vldrw.u32 q0, [r0, #176]
152; CHECK-NEXT:    vaddva.u32 r2, q0
153; CHECK-NEXT:    vldrw.u32 q0, [r0, #192]
154; CHECK-NEXT:    vaddva.u32 r2, q0
155; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
156; CHECK-NEXT:    vaddva.u32 r2, q0
157; CHECK-NEXT:    vldrw.u32 q0, [r0, #224]
158; CHECK-NEXT:    vaddva.u32 r2, q0
159; CHECK-NEXT:    vldrw.u32 q0, [r0, #240]
160; CHECK-NEXT:    vaddva.u32 r2, q0
161; CHECK-NEXT:    mov r0, r2
162; CHECK-NEXT:    bx lr
163entry:
164  %0 = bitcast i32* %x to <64 x i32>*
165  %1 = load <64 x i32>, <64 x i32>* %0, align 4
166  %2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %1)
167  ret i32 %2
168}
169
170define i32 @addv128i32i32(i32* %x) {
171; CHECK-LABEL: addv128i32i32:
172; CHECK:       @ %bb.0: @ %entry
173; CHECK-NEXT:    vldrw.u32 q1, [r0]
174; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
175; CHECK-NEXT:    vaddv.u32 r2, q1
176; CHECK-NEXT:    vaddva.u32 r2, q0
177; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
178; CHECK-NEXT:    vaddva.u32 r2, q0
179; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
180; CHECK-NEXT:    vaddva.u32 r2, q0
181; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
182; CHECK-NEXT:    vaddva.u32 r2, q0
183; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
184; CHECK-NEXT:    vaddva.u32 r2, q0
185; CHECK-NEXT:    vldrw.u32 q0, [r0, #96]
186; CHECK-NEXT:    vaddva.u32 r2, q0
187; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
188; CHECK-NEXT:    vaddva.u32 r2, q0
189; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
190; CHECK-NEXT:    vaddva.u32 r2, q0
191; CHECK-NEXT:    vldrw.u32 q0, [r0, #144]
192; CHECK-NEXT:    vaddva.u32 r2, q0
193; CHECK-NEXT:    vldrw.u32 q0, [r0, #160]
194; CHECK-NEXT:    vaddva.u32 r2, q0
195; CHECK-NEXT:    vldrw.u32 q0, [r0, #176]
196; CHECK-NEXT:    vaddva.u32 r2, q0
197; CHECK-NEXT:    vldrw.u32 q0, [r0, #192]
198; CHECK-NEXT:    vaddva.u32 r2, q0
199; CHECK-NEXT:    vldrw.u32 q0, [r0, #208]
200; CHECK-NEXT:    vaddva.u32 r2, q0
201; CHECK-NEXT:    vldrw.u32 q0, [r0, #224]
202; CHECK-NEXT:    vaddva.u32 r2, q0
203; CHECK-NEXT:    vldrw.u32 q0, [r0, #240]
204; CHECK-NEXT:    vaddva.u32 r2, q0
205; CHECK-NEXT:    vldrw.u32 q0, [r0, #256]
206; CHECK-NEXT:    vaddva.u32 r2, q0
207; CHECK-NEXT:    vldrw.u32 q0, [r0, #272]
208; CHECK-NEXT:    vaddva.u32 r2, q0
209; CHECK-NEXT:    vldrw.u32 q0, [r0, #288]
210; CHECK-NEXT:    vaddva.u32 r2, q0
211; CHECK-NEXT:    vldrw.u32 q0, [r0, #304]
212; CHECK-NEXT:    vaddva.u32 r2, q0
213; CHECK-NEXT:    vldrw.u32 q0, [r0, #320]
214; CHECK-NEXT:    vaddva.u32 r2, q0
215; CHECK-NEXT:    vldrw.u32 q0, [r0, #336]
216; CHECK-NEXT:    vaddva.u32 r2, q0
217; CHECK-NEXT:    vldrw.u32 q0, [r0, #352]
218; CHECK-NEXT:    vaddva.u32 r2, q0
219; CHECK-NEXT:    vldrw.u32 q0, [r0, #368]
220; CHECK-NEXT:    vaddva.u32 r2, q0
221; CHECK-NEXT:    vldrw.u32 q0, [r0, #384]
222; CHECK-NEXT:    vaddva.u32 r2, q0
223; CHECK-NEXT:    vldrw.u32 q0, [r0, #400]
224; CHECK-NEXT:    vaddva.u32 r2, q0
225; CHECK-NEXT:    vldrw.u32 q0, [r0, #416]
226; CHECK-NEXT:    vaddva.u32 r2, q0
227; CHECK-NEXT:    vldrw.u32 q0, [r0, #432]
228; CHECK-NEXT:    vaddva.u32 r2, q0
229; CHECK-NEXT:    vldrw.u32 q0, [r0, #448]
230; CHECK-NEXT:    vaddva.u32 r2, q0
231; CHECK-NEXT:    vldrw.u32 q0, [r0, #464]
232; CHECK-NEXT:    vaddva.u32 r2, q0
233; CHECK-NEXT:    vldrw.u32 q0, [r0, #480]
234; CHECK-NEXT:    vaddva.u32 r2, q0
235; CHECK-NEXT:    vldrw.u32 q0, [r0, #496]
236; CHECK-NEXT:    vaddva.u32 r2, q0
237; CHECK-NEXT:    mov r0, r2
238; CHECK-NEXT:    bx lr
239entry:
240  %0 = bitcast i32* %x to <4 x i32>*
241  %wide.load = load <4 x i32>, <4 x i32>* %0, align 4
242  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
243  %2 = getelementptr inbounds i32, i32* %x, i32 4
244  %3 = bitcast i32* %2 to <4 x i32>*
245  %wide.load.1 = load <4 x i32>, <4 x i32>* %3, align 4
246  %4 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.1)
247  %5 = add i32 %4, %1
248  %6 = getelementptr inbounds i32, i32* %x, i32 8
249  %7 = bitcast i32* %6 to <4 x i32>*
250  %wide.load.2 = load <4 x i32>, <4 x i32>* %7, align 4
251  %8 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.2)
252  %9 = add i32 %8, %5
253  %10 = getelementptr inbounds i32, i32* %x, i32 12
254  %11 = bitcast i32* %10 to <4 x i32>*
255  %wide.load.3 = load <4 x i32>, <4 x i32>* %11, align 4
256  %12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.3)
257  %13 = add i32 %12, %9
258  %14 = getelementptr inbounds i32, i32* %x, i32 16
259  %15 = bitcast i32* %14 to <4 x i32>*
260  %wide.load.4 = load <4 x i32>, <4 x i32>* %15, align 4
261  %16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.4)
262  %17 = add i32 %16, %13
263  %18 = getelementptr inbounds i32, i32* %x, i32 20
264  %19 = bitcast i32* %18 to <4 x i32>*
265  %wide.load.5 = load <4 x i32>, <4 x i32>* %19, align 4
266  %20 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.5)
267  %21 = add i32 %20, %17
268  %22 = getelementptr inbounds i32, i32* %x, i32 24
269  %23 = bitcast i32* %22 to <4 x i32>*
270  %wide.load.6 = load <4 x i32>, <4 x i32>* %23, align 4
271  %24 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.6)
272  %25 = add i32 %24, %21
273  %26 = getelementptr inbounds i32, i32* %x, i32 28
274  %27 = bitcast i32* %26 to <4 x i32>*
275  %wide.load.7 = load <4 x i32>, <4 x i32>* %27, align 4
276  %28 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.7)
277  %29 = add i32 %28, %25
278  %30 = getelementptr inbounds i32, i32* %x, i32 32
279  %31 = bitcast i32* %30 to <4 x i32>*
280  %wide.load.8 = load <4 x i32>, <4 x i32>* %31, align 4
281  %32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.8)
282  %33 = add i32 %32, %29
283  %34 = getelementptr inbounds i32, i32* %x, i32 36
284  %35 = bitcast i32* %34 to <4 x i32>*
285  %wide.load.9 = load <4 x i32>, <4 x i32>* %35, align 4
286  %36 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.9)
287  %37 = add i32 %36, %33
288  %38 = getelementptr inbounds i32, i32* %x, i32 40
289  %39 = bitcast i32* %38 to <4 x i32>*
290  %wide.load.10 = load <4 x i32>, <4 x i32>* %39, align 4
291  %40 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.10)
292  %41 = add i32 %40, %37
293  %42 = getelementptr inbounds i32, i32* %x, i32 44
294  %43 = bitcast i32* %42 to <4 x i32>*
295  %wide.load.11 = load <4 x i32>, <4 x i32>* %43, align 4
296  %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.11)
297  %45 = add i32 %44, %41
298  %46 = getelementptr inbounds i32, i32* %x, i32 48
299  %47 = bitcast i32* %46 to <4 x i32>*
300  %wide.load.12 = load <4 x i32>, <4 x i32>* %47, align 4
301  %48 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.12)
302  %49 = add i32 %48, %45
303  %50 = getelementptr inbounds i32, i32* %x, i32 52
304  %51 = bitcast i32* %50 to <4 x i32>*
305  %wide.load.13 = load <4 x i32>, <4 x i32>* %51, align 4
306  %52 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.13)
307  %53 = add i32 %52, %49
308  %54 = getelementptr inbounds i32, i32* %x, i32 56
309  %55 = bitcast i32* %54 to <4 x i32>*
310  %wide.load.14 = load <4 x i32>, <4 x i32>* %55, align 4
311  %56 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.14)
312  %57 = add i32 %56, %53
313  %58 = getelementptr inbounds i32, i32* %x, i32 60
314  %59 = bitcast i32* %58 to <4 x i32>*
315  %wide.load.15 = load <4 x i32>, <4 x i32>* %59, align 4
316  %60 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.15)
317  %61 = add i32 %60, %57
318  %62 = getelementptr inbounds i32, i32* %x, i32 64
319  %63 = bitcast i32* %62 to <4 x i32>*
320  %wide.load.16 = load <4 x i32>, <4 x i32>* %63, align 4
321  %64 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.16)
322  %65 = add i32 %64, %61
323  %66 = getelementptr inbounds i32, i32* %x, i32 68
324  %67 = bitcast i32* %66 to <4 x i32>*
325  %wide.load.17 = load <4 x i32>, <4 x i32>* %67, align 4
326  %68 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.17)
327  %69 = add i32 %68, %65
328  %70 = getelementptr inbounds i32, i32* %x, i32 72
329  %71 = bitcast i32* %70 to <4 x i32>*
330  %wide.load.18 = load <4 x i32>, <4 x i32>* %71, align 4
331  %72 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.18)
332  %73 = add i32 %72, %69
333  %74 = getelementptr inbounds i32, i32* %x, i32 76
334  %75 = bitcast i32* %74 to <4 x i32>*
335  %wide.load.19 = load <4 x i32>, <4 x i32>* %75, align 4
336  %76 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.19)
337  %77 = add i32 %76, %73
338  %78 = getelementptr inbounds i32, i32* %x, i32 80
339  %79 = bitcast i32* %78 to <4 x i32>*
340  %wide.load.20 = load <4 x i32>, <4 x i32>* %79, align 4
341  %80 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.20)
342  %81 = add i32 %80, %77
343  %82 = getelementptr inbounds i32, i32* %x, i32 84
344  %83 = bitcast i32* %82 to <4 x i32>*
345  %wide.load.21 = load <4 x i32>, <4 x i32>* %83, align 4
346  %84 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.21)
347  %85 = add i32 %84, %81
348  %86 = getelementptr inbounds i32, i32* %x, i32 88
349  %87 = bitcast i32* %86 to <4 x i32>*
350  %wide.load.22 = load <4 x i32>, <4 x i32>* %87, align 4
351  %88 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.22)
352  %89 = add i32 %88, %85
353  %90 = getelementptr inbounds i32, i32* %x, i32 92
354  %91 = bitcast i32* %90 to <4 x i32>*
355  %wide.load.23 = load <4 x i32>, <4 x i32>* %91, align 4
356  %92 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.23)
357  %93 = add i32 %92, %89
358  %94 = getelementptr inbounds i32, i32* %x, i32 96
359  %95 = bitcast i32* %94 to <4 x i32>*
360  %wide.load.24 = load <4 x i32>, <4 x i32>* %95, align 4
361  %96 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.24)
362  %97 = add i32 %96, %93
363  %98 = getelementptr inbounds i32, i32* %x, i32 100
364  %99 = bitcast i32* %98 to <4 x i32>*
365  %wide.load.25 = load <4 x i32>, <4 x i32>* %99, align 4
366  %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.25)
367  %101 = add i32 %100, %97
368  %102 = getelementptr inbounds i32, i32* %x, i32 104
369  %103 = bitcast i32* %102 to <4 x i32>*
370  %wide.load.26 = load <4 x i32>, <4 x i32>* %103, align 4
371  %104 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.26)
372  %105 = add i32 %104, %101
373  %106 = getelementptr inbounds i32, i32* %x, i32 108
374  %107 = bitcast i32* %106 to <4 x i32>*
375  %wide.load.27 = load <4 x i32>, <4 x i32>* %107, align 4
376  %108 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.27)
377  %109 = add i32 %108, %105
378  %110 = getelementptr inbounds i32, i32* %x, i32 112
379  %111 = bitcast i32* %110 to <4 x i32>*
380  %wide.load.28 = load <4 x i32>, <4 x i32>* %111, align 4
381  %112 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.28)
382  %113 = add i32 %112, %109
383  %114 = getelementptr inbounds i32, i32* %x, i32 116
384  %115 = bitcast i32* %114 to <4 x i32>*
385  %wide.load.29 = load <4 x i32>, <4 x i32>* %115, align 4
386  %116 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.29)
387  %117 = add i32 %116, %113
388  %118 = getelementptr inbounds i32, i32* %x, i32 120
389  %119 = bitcast i32* %118 to <4 x i32>*
390  %wide.load.30 = load <4 x i32>, <4 x i32>* %119, align 4
391  %120 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.30)
392  %121 = add i32 %120, %117
393  %122 = getelementptr inbounds i32, i32* %x, i32 124
394  %123 = bitcast i32* %122 to <4 x i32>*
395  %wide.load.31 = load <4 x i32>, <4 x i32>* %123, align 4
396  %124 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load.31)
397  %125 = add i32 %124, %121
398  ret i32 %125
399}
400
401define i32 @addv2i32i16(i16* %x) {
402; CHECK-LABEL: addv2i32i16:
403; CHECK:       @ %bb.0: @ %entry
404; CHECK-NEXT:    ldrsh.w r1, [r0]
405; CHECK-NEXT:    ldrsh.w r0, [r0, #2]
406; CHECK-NEXT:    add r0, r1
407; CHECK-NEXT:    bx lr
408entry:
409  %0 = load i16, i16* %x, align 2
410  %conv = sext i16 %0 to i32
411  %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1
412  %1 = load i16, i16* %arrayidx.1, align 2
413  %conv.1 = sext i16 %1 to i32
414  %add.1 = add nsw i32 %conv, %conv.1
415  ret i32 %add.1
416}
417
418define i32 @addv4i32i16(i16* %x) {
419; CHECK-LABEL: addv4i32i16:
420; CHECK:       @ %bb.0: @ %entry
421; CHECK-NEXT:    vldrh.s32 q0, [r0]
422; CHECK-NEXT:    vaddv.u32 r0, q0
423; CHECK-NEXT:    bx lr
424entry:
425  %0 = bitcast i16* %x to <4 x i16>*
426  %1 = load <4 x i16>, <4 x i16>* %0, align 2
427  %2 = sext <4 x i16> %1 to <4 x i32>
428  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
429  ret i32 %3
430}
431
432define i32 @addv8i32i16(i16* %x) {
433; CHECK-LABEL: addv8i32i16:
434; CHECK:       @ %bb.0: @ %entry
435; CHECK-NEXT:    vldrh.u16 q0, [r0]
436; CHECK-NEXT:    vaddv.s16 r0, q0
437; CHECK-NEXT:    bx lr
438entry:
439  %0 = bitcast i16* %x to <8 x i16>*
440  %1 = load <8 x i16>, <8 x i16>* %0, align 2
441  %2 = sext <8 x i16> %1 to <8 x i32>
442  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
443  ret i32 %3
444}
445
446define i32 @addv16i32i16(i16* %x) {
447; CHECK-LABEL: addv16i32i16:
448; CHECK:       @ %bb.0: @ %entry
449; CHECK-NEXT:    vldrh.s32 q1, [r0]
450; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
451; CHECK-NEXT:    vaddv.u32 r2, q1
452; CHECK-NEXT:    vaddva.u32 r2, q0
453; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
454; CHECK-NEXT:    vaddva.u32 r2, q0
455; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
456; CHECK-NEXT:    vaddva.u32 r2, q0
457; CHECK-NEXT:    mov r0, r2
458; CHECK-NEXT:    bx lr
459entry:
460  %0 = bitcast i16* %x to <16 x i16>*
461  %1 = load <16 x i16>, <16 x i16>* %0, align 2
462  %2 = sext <16 x i16> %1 to <16 x i32>
463  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
464  ret i32 %3
465}
466
467define i32 @addv24i32i16(i16* %x) {
468; CHECK-LABEL: addv24i32i16:
469; CHECK:       @ %bb.0: @ %entry
470; CHECK-NEXT:    vldrh.s32 q1, [r0]
471; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
472; CHECK-NEXT:    vaddv.u32 r2, q1
473; CHECK-NEXT:    vaddva.u32 r2, q0
474; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
475; CHECK-NEXT:    vaddva.u32 r2, q0
476; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
477; CHECK-NEXT:    vaddva.u32 r2, q0
478; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
479; CHECK-NEXT:    vaddva.s16 r2, q0
480; CHECK-NEXT:    mov r0, r2
481; CHECK-NEXT:    bx lr
482entry:
483  %0 = bitcast i16* %x to <16 x i16>*
484  %1 = load <16 x i16>, <16 x i16>* %0, align 2
485  %2 = sext <16 x i16> %1 to <16 x i32>
486  %arrayidx.16 = getelementptr inbounds i16, i16* %x, i32 16
487  %3 = bitcast i16* %arrayidx.16 to <8 x i16>*
488  %4 = load <8 x i16>, <8 x i16>* %3, align 2
489  %5 = sext <8 x i16> %4 to <8 x i32>
490  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
491  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
492  %op.rdx = add nsw i32 %6, %7
493  ret i32 %op.rdx
494}
495
496define i32 @addv32i32i16(i16* %x) {
497; CHECK-LABEL: addv32i32i16:
498; CHECK:       @ %bb.0: @ %entry
499; CHECK-NEXT:    vldrh.s32 q1, [r0]
500; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
501; CHECK-NEXT:    vaddv.u32 r2, q1
502; CHECK-NEXT:    vaddva.u32 r2, q0
503; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
504; CHECK-NEXT:    vaddva.u32 r2, q0
505; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
506; CHECK-NEXT:    vaddva.u32 r2, q0
507; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
508; CHECK-NEXT:    vaddva.u32 r2, q0
509; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
510; CHECK-NEXT:    vaddva.u32 r2, q0
511; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
512; CHECK-NEXT:    vaddva.u32 r2, q0
513; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
514; CHECK-NEXT:    vaddva.u32 r2, q0
515; CHECK-NEXT:    mov r0, r2
516; CHECK-NEXT:    bx lr
517entry:
518  %0 = bitcast i16* %x to <32 x i16>*
519  %1 = load <32 x i16>, <32 x i16>* %0, align 2
520  %2 = sext <32 x i16> %1 to <32 x i32>
521  %3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
522  ret i32 %3
523}
524
525define i32 @addv64i32i16(i16* %x) {
526; CHECK-LABEL: addv64i32i16:
527; CHECK:       @ %bb.0: @ %entry
528; CHECK-NEXT:    vldrh.s32 q1, [r0]
529; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
530; CHECK-NEXT:    ldrsh.w r1, [r0, #120]
531; CHECK-NEXT:    vaddv.u32 r2, q1
532; CHECK-NEXT:    ldrsh.w r3, [r0, #122]
533; CHECK-NEXT:    vaddva.u32 r2, q0
534; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
535; CHECK-NEXT:    ldrsh.w r12, [r0, #124]
536; CHECK-NEXT:    vaddva.u32 r2, q0
537; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
538; CHECK-NEXT:    vaddva.u32 r2, q0
539; CHECK-NEXT:    vldrh.s32 q0, [r0, #32]
540; CHECK-NEXT:    vaddva.u32 r2, q0
541; CHECK-NEXT:    vldrh.s32 q0, [r0, #40]
542; CHECK-NEXT:    vaddva.u32 r2, q0
543; CHECK-NEXT:    vldrh.s32 q0, [r0, #48]
544; CHECK-NEXT:    vaddva.u32 r2, q0
545; CHECK-NEXT:    vldrh.s32 q0, [r0, #56]
546; CHECK-NEXT:    vaddva.u32 r2, q0
547; CHECK-NEXT:    vldrh.s32 q0, [r0, #64]
548; CHECK-NEXT:    vaddva.u32 r2, q0
549; CHECK-NEXT:    vldrh.s32 q0, [r0, #72]
550; CHECK-NEXT:    vaddva.u32 r2, q0
551; CHECK-NEXT:    vldrh.s32 q0, [r0, #80]
552; CHECK-NEXT:    vaddva.u32 r2, q0
553; CHECK-NEXT:    vldrh.s32 q0, [r0, #88]
554; CHECK-NEXT:    vaddva.u32 r2, q0
555; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
556; CHECK-NEXT:    vaddva.s16 r2, q0
557; CHECK-NEXT:    vldrh.s32 q0, [r0, #112]
558; CHECK-NEXT:    ldrsh.w r0, [r0, #126]
559; CHECK-NEXT:    vaddva.u32 r2, q0
560; CHECK-NEXT:    add r1, r2
561; CHECK-NEXT:    add r1, r3
562; CHECK-NEXT:    add r1, r12
563; CHECK-NEXT:    add r0, r1
564; CHECK-NEXT:    bx lr
565entry:
566  %0 = bitcast i16* %x to <32 x i16>*
567  %1 = load <32 x i16>, <32 x i16>* %0, align 2
568  %2 = sext <32 x i16> %1 to <32 x i32>
569  %arrayidx.32 = getelementptr inbounds i16, i16* %x, i32 32
570  %3 = bitcast i16* %arrayidx.32 to <16 x i16>*
571  %4 = load <16 x i16>, <16 x i16>* %3, align 2
572  %5 = sext <16 x i16> %4 to <16 x i32>
573  %arrayidx.48 = getelementptr inbounds i16, i16* %x, i32 48
574  %6 = bitcast i16* %arrayidx.48 to <8 x i16>*
575  %7 = load <8 x i16>, <8 x i16>* %6, align 2
576  %8 = sext <8 x i16> %7 to <8 x i32>
577  %arrayidx.56 = getelementptr inbounds i16, i16* %x, i32 56
578  %9 = bitcast i16* %arrayidx.56 to <4 x i16>*
579  %10 = load <4 x i16>, <4 x i16>* %9, align 2
580  %11 = sext <4 x i16> %10 to <4 x i32>
581  %arrayidx.60 = getelementptr inbounds i16, i16* %x, i32 60
582  %12 = load i16, i16* %arrayidx.60, align 2
583  %conv.60 = sext i16 %12 to i32
584  %arrayidx.61 = getelementptr inbounds i16, i16* %x, i32 61
585  %13 = load i16, i16* %arrayidx.61, align 2
586  %conv.61 = sext i16 %13 to i32
587  %arrayidx.62 = getelementptr inbounds i16, i16* %x, i32 62
588  %14 = load i16, i16* %arrayidx.62, align 2
589  %conv.62 = sext i16 %14 to i32
590  %15 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
591  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
592  %op.rdx = add nsw i32 %15, %16
593  %17 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
594  %op.rdx8 = add nsw i32 %op.rdx, %17
595  %18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %11)
596  %op.rdx9 = add nsw i32 %op.rdx8, %18
597  %19 = add nsw i32 %op.rdx9, %conv.60
598  %20 = add nsw i32 %19, %conv.61
599  %21 = add nsw i32 %20, %conv.62
600  %arrayidx.63 = getelementptr inbounds i16, i16* %x, i32 63
601  %22 = load i16, i16* %arrayidx.63, align 2
602  %conv.63 = sext i16 %22 to i32
603  %add.63 = add nsw i32 %21, %conv.63
604  ret i32 %add.63
605}
606
607define i32 @addv128i32i16(i16* %x) {
608; CHECK-LABEL: addv128i32i16:
609; CHECK:       @ %bb.0: @ %entry
610; CHECK-NEXT:    vldrh.u16 q1, [r0]
611; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
612; CHECK-NEXT:    vaddv.s16 r2, q1
613; CHECK-NEXT:    vaddva.s16 r2, q0
614; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
615; CHECK-NEXT:    vaddva.s16 r2, q0
616; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
617; CHECK-NEXT:    vaddva.s16 r2, q0
618; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
619; CHECK-NEXT:    vaddva.s16 r2, q0
620; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
621; CHECK-NEXT:    vaddva.s16 r2, q0
622; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
623; CHECK-NEXT:    vaddva.s16 r2, q0
624; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
625; CHECK-NEXT:    vaddva.s16 r2, q0
626; CHECK-NEXT:    vldrh.u16 q0, [r0, #128]
627; CHECK-NEXT:    vaddva.s16 r2, q0
628; CHECK-NEXT:    vldrh.u16 q0, [r0, #144]
629; CHECK-NEXT:    vaddva.s16 r2, q0
630; CHECK-NEXT:    vldrh.u16 q0, [r0, #160]
631; CHECK-NEXT:    vaddva.s16 r2, q0
632; CHECK-NEXT:    vldrh.u16 q0, [r0, #176]
633; CHECK-NEXT:    vaddva.s16 r2, q0
634; CHECK-NEXT:    vldrh.u16 q0, [r0, #192]
635; CHECK-NEXT:    vaddva.s16 r2, q0
636; CHECK-NEXT:    vldrh.u16 q0, [r0, #208]
637; CHECK-NEXT:    vaddva.s16 r2, q0
638; CHECK-NEXT:    vldrh.u16 q0, [r0, #224]
639; CHECK-NEXT:    vaddva.s16 r2, q0
640; CHECK-NEXT:    vldrh.u16 q0, [r0, #240]
641; CHECK-NEXT:    vaddva.s16 r2, q0
642; CHECK-NEXT:    mov r0, r2
643; CHECK-NEXT:    bx lr
644entry:
645  %0 = bitcast i16* %x to <8 x i16>*
646  %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
647  %1 = sext <8 x i16> %wide.load to <8 x i32>
648  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
649  %3 = getelementptr inbounds i16, i16* %x, i32 8
650  %4 = bitcast i16* %3 to <8 x i16>*
651  %wide.load.1 = load <8 x i16>, <8 x i16>* %4, align 2
652  %5 = sext <8 x i16> %wide.load.1 to <8 x i32>
653  %6 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
654  %7 = add i32 %6, %2
655  %8 = getelementptr inbounds i16, i16* %x, i32 16
656  %9 = bitcast i16* %8 to <8 x i16>*
657  %wide.load.2 = load <8 x i16>, <8 x i16>* %9, align 2
658  %10 = sext <8 x i16> %wide.load.2 to <8 x i32>
659  %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %10)
660  %12 = add i32 %11, %7
661  %13 = getelementptr inbounds i16, i16* %x, i32 24
662  %14 = bitcast i16* %13 to <8 x i16>*
663  %wide.load.3 = load <8 x i16>, <8 x i16>* %14, align 2
664  %15 = sext <8 x i16> %wide.load.3 to <8 x i32>
665  %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %15)
666  %17 = add i32 %16, %12
667  %18 = getelementptr inbounds i16, i16* %x, i32 32
668  %19 = bitcast i16* %18 to <8 x i16>*
669  %wide.load.4 = load <8 x i16>, <8 x i16>* %19, align 2
670  %20 = sext <8 x i16> %wide.load.4 to <8 x i32>
671  %21 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %20)
672  %22 = add i32 %21, %17
673  %23 = getelementptr inbounds i16, i16* %x, i32 40
674  %24 = bitcast i16* %23 to <8 x i16>*
675  %wide.load.5 = load <8 x i16>, <8 x i16>* %24, align 2
676  %25 = sext <8 x i16> %wide.load.5 to <8 x i32>
677  %26 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %25)
678  %27 = add i32 %26, %22
679  %28 = getelementptr inbounds i16, i16* %x, i32 48
680  %29 = bitcast i16* %28 to <8 x i16>*
681  %wide.load.6 = load <8 x i16>, <8 x i16>* %29, align 2
682  %30 = sext <8 x i16> %wide.load.6 to <8 x i32>
683  %31 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %30)
684  %32 = add i32 %31, %27
685  %33 = getelementptr inbounds i16, i16* %x, i32 56
686  %34 = bitcast i16* %33 to <8 x i16>*
687  %wide.load.7 = load <8 x i16>, <8 x i16>* %34, align 2
688  %35 = sext <8 x i16> %wide.load.7 to <8 x i32>
689  %36 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %35)
690  %37 = add i32 %36, %32
691  %38 = getelementptr inbounds i16, i16* %x, i32 64
692  %39 = bitcast i16* %38 to <8 x i16>*
693  %wide.load.8 = load <8 x i16>, <8 x i16>* %39, align 2
694  %40 = sext <8 x i16> %wide.load.8 to <8 x i32>
695  %41 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40)
696  %42 = add i32 %41, %37
697  %43 = getelementptr inbounds i16, i16* %x, i32 72
698  %44 = bitcast i16* %43 to <8 x i16>*
699  %wide.load.9 = load <8 x i16>, <8 x i16>* %44, align 2
700  %45 = sext <8 x i16> %wide.load.9 to <8 x i32>
701  %46 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %45)
702  %47 = add i32 %46, %42
703  %48 = getelementptr inbounds i16, i16* %x, i32 80
704  %49 = bitcast i16* %48 to <8 x i16>*
705  %wide.load.10 = load <8 x i16>, <8 x i16>* %49, align 2
706  %50 = sext <8 x i16> %wide.load.10 to <8 x i32>
707  %51 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %50)
708  %52 = add i32 %51, %47
709  %53 = getelementptr inbounds i16, i16* %x, i32 88
710  %54 = bitcast i16* %53 to <8 x i16>*
711  %wide.load.11 = load <8 x i16>, <8 x i16>* %54, align 2
712  %55 = sext <8 x i16> %wide.load.11 to <8 x i32>
713  %56 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %55)
714  %57 = add i32 %56, %52
715  %58 = getelementptr inbounds i16, i16* %x, i32 96
716  %59 = bitcast i16* %58 to <8 x i16>*
717  %wide.load.12 = load <8 x i16>, <8 x i16>* %59, align 2
718  %60 = sext <8 x i16> %wide.load.12 to <8 x i32>
719  %61 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %60)
720  %62 = add i32 %61, %57
721  %63 = getelementptr inbounds i16, i16* %x, i32 104
722  %64 = bitcast i16* %63 to <8 x i16>*
723  %wide.load.13 = load <8 x i16>, <8 x i16>* %64, align 2
724  %65 = sext <8 x i16> %wide.load.13 to <8 x i32>
725  %66 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %65)
726  %67 = add i32 %66, %62
727  %68 = getelementptr inbounds i16, i16* %x, i32 112
728  %69 = bitcast i16* %68 to <8 x i16>*
729  %wide.load.14 = load <8 x i16>, <8 x i16>* %69, align 2
730  %70 = sext <8 x i16> %wide.load.14 to <8 x i32>
731  %71 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %70)
732  %72 = add i32 %71, %67
733  %73 = getelementptr inbounds i16, i16* %x, i32 120
734  %74 = bitcast i16* %73 to <8 x i16>*
735  %wide.load.15 = load <8 x i16>, <8 x i16>* %74, align 2
736  %75 = sext <8 x i16> %wide.load.15 to <8 x i32>
737  %76 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %75)
738  %77 = add i32 %76, %72
739  ret i32 %77
740}
741
742define i32 @addv2i32i8(i8* %x) {
743; CHECK-LABEL: addv2i32i8:
744; CHECK:       @ %bb.0: @ %entry
745; CHECK-NEXT:    ldrb r1, [r0]
746; CHECK-NEXT:    ldrb r0, [r0, #1]
747; CHECK-NEXT:    add r0, r1
748; CHECK-NEXT:    bx lr
749entry:
750  %0 = load i8, i8* %x, align 1
751  %conv = zext i8 %0 to i32
752  %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1
753  %1 = load i8, i8* %arrayidx.1, align 1
754  %conv.1 = zext i8 %1 to i32
755  %add.1 = add nuw nsw i32 %conv, %conv.1
756  ret i32 %add.1
757}
758
759define i32 @addv4i32i8(i8* %x) {
760; CHECK-LABEL: addv4i32i8:
761; CHECK:       @ %bb.0: @ %entry
762; CHECK-NEXT:    vldrb.u32 q0, [r0]
763; CHECK-NEXT:    vaddv.u32 r0, q0
764; CHECK-NEXT:    bx lr
765entry:
766  %0 = bitcast i8* %x to <4 x i8>*
767  %1 = load <4 x i8>, <4 x i8>* %0, align 1
768  %2 = zext <4 x i8> %1 to <4 x i32>
769  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
770  ret i32 %3
771}
772
773define i32 @addv8i32i8(i8* %x) {
774; CHECK-LABEL: addv8i32i8:
775; CHECK:       @ %bb.0: @ %entry
776; CHECK-NEXT:    vldrb.u16 q0, [r0]
777; CHECK-NEXT:    vaddv.u16 r0, q0
778; CHECK-NEXT:    bx lr
779entry:
780  %0 = bitcast i8* %x to <8 x i8>*
781  %1 = load <8 x i8>, <8 x i8>* %0, align 1
782  %2 = zext <8 x i8> %1 to <8 x i32>
783  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
784  ret i32 %3
785}
786
787define i32 @addv16i32i8(i8* %x) {
788; CHECK-LABEL: addv16i32i8:
789; CHECK:       @ %bb.0: @ %entry
790; CHECK-NEXT:    vldrb.u8 q0, [r0]
791; CHECK-NEXT:    vaddv.u8 r0, q0
792; CHECK-NEXT:    bx lr
793entry:
794  %0 = bitcast i8* %x to <16 x i8>*
795  %1 = load <16 x i8>, <16 x i8>* %0, align 1
796  %2 = zext <16 x i8> %1 to <16 x i32>
797  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
798  ret i32 %3
799}
800
801define i32 @addv24i32i8(i8* %x) {
802; CHECK-LABEL: addv24i32i8:
803; CHECK:       @ %bb.0: @ %entry
804; CHECK-NEXT:    vldrb.u8 q1, [r0]
805; CHECK-NEXT:    vldrb.u16 q0, [r0, #16]
806; CHECK-NEXT:    vaddv.u8 r0, q1
807; CHECK-NEXT:    vaddva.u16 r0, q0
808; CHECK-NEXT:    bx lr
809entry:
810  %0 = bitcast i8* %x to <16 x i8>*
811  %1 = load <16 x i8>, <16 x i8>* %0, align 1
812  %2 = zext <16 x i8> %1 to <16 x i32>
813  %arrayidx.16 = getelementptr inbounds i8, i8* %x, i32 16
814  %3 = bitcast i8* %arrayidx.16 to <8 x i8>*
815  %4 = load <8 x i8>, <8 x i8>* %3, align 1
816  %5 = zext <8 x i8> %4 to <8 x i32>
817  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
818  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
819  %op.rdx = add nuw nsw i32 %6, %7
820  ret i32 %op.rdx
821}
822
823define i32 @addv32i32i8(i8* %x) {
824; CHECK-LABEL: addv32i32i8:
825; CHECK:       @ %bb.0: @ %entry
826; CHECK-NEXT:    vldrb.u32 q1, [r0]
827; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
828; CHECK-NEXT:    vaddv.u32 r2, q1
829; CHECK-NEXT:    vaddva.u32 r2, q0
830; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
831; CHECK-NEXT:    vaddva.u32 r2, q0
832; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
833; CHECK-NEXT:    vaddva.u32 r2, q0
834; CHECK-NEXT:    vldrb.u32 q0, [r0, #16]
835; CHECK-NEXT:    vaddva.u32 r2, q0
836; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
837; CHECK-NEXT:    vaddva.u32 r2, q0
838; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
839; CHECK-NEXT:    vaddva.u32 r2, q0
840; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
841; CHECK-NEXT:    vaddva.u32 r2, q0
842; CHECK-NEXT:    mov r0, r2
843; CHECK-NEXT:    bx lr
844entry:
845  %0 = bitcast i8* %x to <32 x i8>*
846  %1 = load <32 x i8>, <32 x i8>* %0, align 1
847  %2 = zext <32 x i8> %1 to <32 x i32>
848  %3 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
849  ret i32 %3
850}
851
852define i32 @addv64i32i8(i8* %x) {
853; CHECK-LABEL: addv64i32i8:
854; CHECK:       @ %bb.0: @ %entry
855; CHECK-NEXT:    vldrb.u32 q1, [r0]
856; CHECK-NEXT:    vldrb.u32 q0, [r0, #4]
857; CHECK-NEXT:    ldrb.w r1, [r0, #60]
858; CHECK-NEXT:    vaddv.u32 r2, q1
859; CHECK-NEXT:    ldrb.w r3, [r0, #61]
860; CHECK-NEXT:    vaddva.u32 r2, q0
861; CHECK-NEXT:    vldrb.u32 q0, [r0, #8]
862; CHECK-NEXT:    ldrb.w r12, [r0, #62]
863; CHECK-NEXT:    vaddva.u32 r2, q0
864; CHECK-NEXT:    vldrb.u32 q0, [r0, #12]
865; CHECK-NEXT:    vaddva.u32 r2, q0
866; CHECK-NEXT:    vldrb.u32 q0, [r0, #16]
867; CHECK-NEXT:    vaddva.u32 r2, q0
868; CHECK-NEXT:    vldrb.u32 q0, [r0, #20]
869; CHECK-NEXT:    vaddva.u32 r2, q0
870; CHECK-NEXT:    vldrb.u32 q0, [r0, #24]
871; CHECK-NEXT:    vaddva.u32 r2, q0
872; CHECK-NEXT:    vldrb.u32 q0, [r0, #28]
873; CHECK-NEXT:    vaddva.u32 r2, q0
874; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
875; CHECK-NEXT:    vaddva.u8 r2, q0
876; CHECK-NEXT:    vldrb.u16 q0, [r0, #48]
877; CHECK-NEXT:    vaddva.u16 r2, q0
878; CHECK-NEXT:    vldrb.u32 q0, [r0, #56]
879; CHECK-NEXT:    ldrb.w r0, [r0, #63]
880; CHECK-NEXT:    vaddva.u32 r2, q0
881; CHECK-NEXT:    add r1, r2
882; CHECK-NEXT:    add r1, r3
883; CHECK-NEXT:    add r1, r12
884; CHECK-NEXT:    add r0, r1
885; CHECK-NEXT:    bx lr
886entry:
887  %0 = bitcast i8* %x to <32 x i8>*
888  %1 = load <32 x i8>, <32 x i8>* %0, align 1
889  %2 = zext <32 x i8> %1 to <32 x i32>
890  %arrayidx.32 = getelementptr inbounds i8, i8* %x, i32 32
891  %3 = bitcast i8* %arrayidx.32 to <16 x i8>*
892  %4 = load <16 x i8>, <16 x i8>* %3, align 1
893  %5 = zext <16 x i8> %4 to <16 x i32>
894  %arrayidx.48 = getelementptr inbounds i8, i8* %x, i32 48
895  %6 = bitcast i8* %arrayidx.48 to <8 x i8>*
896  %7 = load <8 x i8>, <8 x i8>* %6, align 1
897  %8 = zext <8 x i8> %7 to <8 x i32>
898  %arrayidx.56 = getelementptr inbounds i8, i8* %x, i32 56
899  %9 = bitcast i8* %arrayidx.56 to <4 x i8>*
900  %10 = load <4 x i8>, <4 x i8>* %9, align 1
901  %11 = zext <4 x i8> %10 to <4 x i32>
902  %arrayidx.60 = getelementptr inbounds i8, i8* %x, i32 60
903  %12 = load i8, i8* %arrayidx.60, align 1
904  %conv.60 = zext i8 %12 to i32
905  %arrayidx.61 = getelementptr inbounds i8, i8* %x, i32 61
906  %13 = load i8, i8* %arrayidx.61, align 1
907  %conv.61 = zext i8 %13 to i32
908  %arrayidx.62 = getelementptr inbounds i8, i8* %x, i32 62
909  %14 = load i8, i8* %arrayidx.62, align 1
910  %conv.62 = zext i8 %14 to i32
911  %15 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %2)
912  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
913  %op.rdx = add nuw nsw i32 %15, %16
914  %17 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %8)
915  %op.rdx8 = add nuw nsw i32 %op.rdx, %17
916  %18 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %11)
917  %op.rdx9 = add nuw nsw i32 %op.rdx8, %18
918  %19 = add nuw nsw i32 %op.rdx9, %conv.60
919  %20 = add nuw nsw i32 %19, %conv.61
920  %21 = add nuw nsw i32 %20, %conv.62
921  %arrayidx.63 = getelementptr inbounds i8, i8* %x, i32 63
922  %22 = load i8, i8* %arrayidx.63, align 1
923  %conv.63 = zext i8 %22 to i32
924  %add.63 = add nuw nsw i32 %21, %conv.63
925  ret i32 %add.63
926}
927
928define i32 @addv128i32i8(i8* %x) {
929; CHECK-LABEL: addv128i32i8:
930; CHECK:       @ %bb.0: @ %entry
931; CHECK-NEXT:    vldrb.u8 q1, [r0]
932; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
933; CHECK-NEXT:    mov r1, r0
934; CHECK-NEXT:    vaddv.u8 r0, q1
935; CHECK-NEXT:    vaddva.u8 r0, q0
936; CHECK-NEXT:    vldrb.u8 q0, [r1, #32]
937; CHECK-NEXT:    vaddva.u8 r0, q0
938; CHECK-NEXT:    vldrb.u8 q0, [r1, #48]
939; CHECK-NEXT:    vaddva.u8 r0, q0
940; CHECK-NEXT:    vldrb.u8 q0, [r1, #64]
941; CHECK-NEXT:    vaddva.u8 r0, q0
942; CHECK-NEXT:    vldrb.u8 q0, [r1, #80]
943; CHECK-NEXT:    vaddva.u8 r0, q0
944; CHECK-NEXT:    vldrb.u8 q0, [r1, #96]
945; CHECK-NEXT:    vaddva.u8 r0, q0
946; CHECK-NEXT:    vldrb.u8 q0, [r1, #112]
947; CHECK-NEXT:    vaddva.u8 r0, q0
948; CHECK-NEXT:    bx lr
949entry:
950  %0 = bitcast i8* %x to <16 x i8>*
951  %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
952  %1 = zext <16 x i8> %wide.load to <16 x i32>
953  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
954  %3 = getelementptr inbounds i8, i8* %x, i32 16
955  %4 = bitcast i8* %3 to <16 x i8>*
956  %wide.load.1 = load <16 x i8>, <16 x i8>* %4, align 1
957  %5 = zext <16 x i8> %wide.load.1 to <16 x i32>
958  %6 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
959  %7 = add i32 %6, %2
960  %8 = getelementptr inbounds i8, i8* %x, i32 32
961  %9 = bitcast i8* %8 to <16 x i8>*
962  %wide.load.2 = load <16 x i8>, <16 x i8>* %9, align 1
963  %10 = zext <16 x i8> %wide.load.2 to <16 x i32>
964  %11 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %10)
965  %12 = add i32 %11, %7
966  %13 = getelementptr inbounds i8, i8* %x, i32 48
967  %14 = bitcast i8* %13 to <16 x i8>*
968  %wide.load.3 = load <16 x i8>, <16 x i8>* %14, align 1
969  %15 = zext <16 x i8> %wide.load.3 to <16 x i32>
970  %16 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %15)
971  %17 = add i32 %16, %12
972  %18 = getelementptr inbounds i8, i8* %x, i32 64
973  %19 = bitcast i8* %18 to <16 x i8>*
974  %wide.load.4 = load <16 x i8>, <16 x i8>* %19, align 1
975  %20 = zext <16 x i8> %wide.load.4 to <16 x i32>
976  %21 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %20)
977  %22 = add i32 %21, %17
978  %23 = getelementptr inbounds i8, i8* %x, i32 80
979  %24 = bitcast i8* %23 to <16 x i8>*
980  %wide.load.5 = load <16 x i8>, <16 x i8>* %24, align 1
981  %25 = zext <16 x i8> %wide.load.5 to <16 x i32>
982  %26 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %25)
983  %27 = add i32 %26, %22
984  %28 = getelementptr inbounds i8, i8* %x, i32 96
985  %29 = bitcast i8* %28 to <16 x i8>*
986  %wide.load.6 = load <16 x i8>, <16 x i8>* %29, align 1
987  %30 = zext <16 x i8> %wide.load.6 to <16 x i32>
988  %31 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %30)
989  %32 = add i32 %31, %27
990  %33 = getelementptr inbounds i8, i8* %x, i32 112
991  %34 = bitcast i8* %33 to <16 x i8>*
992  %wide.load.7 = load <16 x i8>, <16 x i8>* %34, align 1
993  %35 = zext <16 x i8> %wide.load.7 to <16 x i32>
994  %36 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %35)
995  %37 = add i32 %36, %32
996  ret i32 %37
997}
998
999define signext i16 @addv2i16i16(i16* %x) {
1000; CHECK-LABEL: addv2i16i16:
1001; CHECK:       @ %bb.0: @ %entry
1002; CHECK-NEXT:    ldrh r1, [r0]
1003; CHECK-NEXT:    ldrh r0, [r0, #2]
1004; CHECK-NEXT:    add r0, r1
1005; CHECK-NEXT:    sxth r0, r0
1006; CHECK-NEXT:    bx lr
1007entry:
1008  %0 = load i16, i16* %x, align 2
1009  %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1
1010  %1 = load i16, i16* %arrayidx.1, align 2
1011  %add.1 = add i16 %1, %0
1012  ret i16 %add.1
1013}
1014
1015define signext i16 @addv4i16i16(i16* %x) {
1016; CHECK-LABEL: addv4i16i16:
1017; CHECK:       @ %bb.0: @ %entry
1018; CHECK-NEXT:    vldrh.u32 q0, [r0]
1019; CHECK-NEXT:    vaddv.u32 r0, q0
1020; CHECK-NEXT:    sxth r0, r0
1021; CHECK-NEXT:    bx lr
1022entry:
1023  %0 = bitcast i16* %x to <4 x i16>*
1024  %1 = load <4 x i16>, <4 x i16>* %0, align 2
1025  %2 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %1)
1026  ret i16 %2
1027}
1028
1029define signext i16 @addv8i16i16(i16* %x) {
1030; CHECK-LABEL: addv8i16i16:
1031; CHECK:       @ %bb.0: @ %entry
1032; CHECK-NEXT:    vldrh.u16 q0, [r0]
1033; CHECK-NEXT:    vaddv.u16 r0, q0
1034; CHECK-NEXT:    sxth r0, r0
1035; CHECK-NEXT:    bx lr
1036entry:
1037  %0 = bitcast i16* %x to <8 x i16>*
1038  %1 = load <8 x i16>, <8 x i16>* %0, align 2
1039  %2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1)
1040  ret i16 %2
1041}
1042
1043define signext i16 @addv16i16i16(i16* %x) {
1044; CHECK-LABEL: addv16i16i16:
1045; CHECK:       @ %bb.0: @ %entry
1046; CHECK-NEXT:    vldrh.u16 q1, [r0]
1047; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
1048; CHECK-NEXT:    vaddv.u16 r0, q1
1049; CHECK-NEXT:    vaddva.u16 r0, q0
1050; CHECK-NEXT:    sxth r0, r0
1051; CHECK-NEXT:    bx lr
1052entry:
1053  %0 = bitcast i16* %x to <16 x i16>*
1054  %1 = load <16 x i16>, <16 x i16>* %0, align 2
1055  %2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %1)
1056  ret i16 %2
1057}
1058
1059define signext i16 @addv24i16i16(i16* %x) {
1060; CHECK-LABEL: addv24i16i16:
1061; CHECK:       @ %bb.0: @ %entry
1062; CHECK-NEXT:    vldrh.u16 q1, [r0]
1063; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
1064; CHECK-NEXT:    vaddv.u16 r2, q1
1065; CHECK-NEXT:    vaddva.u16 r2, q0
1066; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
1067; CHECK-NEXT:    vaddva.u16 r2, q0
1068; CHECK-NEXT:    sxth r0, r2
1069; CHECK-NEXT:    bx lr
1070entry:
1071  %0 = bitcast i16* %x to <8 x i16>*
1072  %1 = load <8 x i16>, <8 x i16>* %0, align 2
1073  %arrayidx.8 = getelementptr inbounds i16, i16* %x, i32 8
1074  %2 = bitcast i16* %arrayidx.8 to <16 x i16>*
1075  %3 = load <16 x i16>, <16 x i16>* %2, align 2
1076  %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3)
1077  %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %1)
1078  %op.rdx = add i16 %4, %5
1079  ret i16 %op.rdx
1080}
1081
1082define signext i16 @addv32i16i16(i16* %x) {
1083; CHECK-LABEL: addv32i16i16:
1084; CHECK:       @ %bb.0: @ %entry
1085; CHECK-NEXT:    vldrh.u16 q1, [r0]
1086; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
1087; CHECK-NEXT:    vaddv.u16 r2, q1
1088; CHECK-NEXT:    vaddva.u16 r2, q0
1089; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
1090; CHECK-NEXT:    vaddva.u16 r2, q0
1091; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
1092; CHECK-NEXT:    vaddva.u16 r2, q0
1093; CHECK-NEXT:    sxth r0, r2
1094; CHECK-NEXT:    bx lr
1095entry:
1096  %0 = bitcast i16* %x to <32 x i16>*
1097  %1 = load <32 x i16>, <32 x i16>* %0, align 2
1098  %2 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %1)
1099  ret i16 %2
1100}
1101
1102define signext i16 @addv64i16i16(i16* %x) {
1103; CHECK-LABEL: addv64i16i16:
1104; CHECK:       @ %bb.0: @ %entry
1105; CHECK-NEXT:    vldrh.u16 q1, [r0]
1106; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
1107; CHECK-NEXT:    vaddv.u16 r2, q1
1108; CHECK-NEXT:    vaddva.u16 r2, q0
1109; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
1110; CHECK-NEXT:    vaddva.u16 r2, q0
1111; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
1112; CHECK-NEXT:    vaddva.u16 r2, q0
1113; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
1114; CHECK-NEXT:    vaddva.u16 r2, q0
1115; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
1116; CHECK-NEXT:    vaddva.u16 r2, q0
1117; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
1118; CHECK-NEXT:    vaddva.u16 r2, q0
1119; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
1120; CHECK-NEXT:    vaddva.u16 r2, q0
1121; CHECK-NEXT:    sxth r0, r2
1122; CHECK-NEXT:    bx lr
1123entry:
1124  %0 = bitcast i16* %x to <64 x i16>*
1125  %1 = load <64 x i16>, <64 x i16>* %0, align 2
1126  %2 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %1)
1127  ret i16 %2
1128}
1129
1130define signext i16 @addv128i16i16(i16* %x) {
1131; CHECK-LABEL: addv128i16i16:
1132; CHECK:       @ %bb.0: @ %entry
1133; CHECK-NEXT:    vldrh.u16 q1, [r0]
1134; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
1135; CHECK-NEXT:    vaddv.u16 r2, q1
1136; CHECK-NEXT:    vaddva.u16 r2, q0
1137; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
1138; CHECK-NEXT:    vaddva.u16 r2, q0
1139; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
1140; CHECK-NEXT:    vaddva.u16 r2, q0
1141; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
1142; CHECK-NEXT:    vaddva.u16 r2, q0
1143; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
1144; CHECK-NEXT:    vaddva.u16 r2, q0
1145; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
1146; CHECK-NEXT:    vaddva.u16 r2, q0
1147; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
1148; CHECK-NEXT:    vaddva.u16 r2, q0
1149; CHECK-NEXT:    vldrh.u16 q0, [r0, #128]
1150; CHECK-NEXT:    vaddva.u16 r2, q0
1151; CHECK-NEXT:    vldrh.u16 q0, [r0, #144]
1152; CHECK-NEXT:    vaddva.u16 r2, q0
1153; CHECK-NEXT:    vldrh.u16 q0, [r0, #160]
1154; CHECK-NEXT:    vaddva.u16 r2, q0
1155; CHECK-NEXT:    vldrh.u16 q0, [r0, #176]
1156; CHECK-NEXT:    vaddva.u16 r2, q0
1157; CHECK-NEXT:    vldrh.u16 q0, [r0, #192]
1158; CHECK-NEXT:    vaddva.u16 r2, q0
1159; CHECK-NEXT:    vldrh.u16 q0, [r0, #208]
1160; CHECK-NEXT:    vaddva.u16 r2, q0
1161; CHECK-NEXT:    vldrh.u16 q0, [r0, #224]
1162; CHECK-NEXT:    vaddva.u16 r2, q0
1163; CHECK-NEXT:    vldrh.u16 q0, [r0, #240]
1164; CHECK-NEXT:    vaddva.u16 r2, q0
1165; CHECK-NEXT:    sxth r0, r2
1166; CHECK-NEXT:    bx lr
1167entry:
1168  %0 = bitcast i16* %x to <8 x i16>*
1169  %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
1170  %1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load)
1171  %2 = getelementptr inbounds i16, i16* %x, i32 8
1172  %3 = bitcast i16* %2 to <8 x i16>*
1173  %wide.load.1 = load <8 x i16>, <8 x i16>* %3, align 2
1174  %4 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.1)
1175  %5 = add i16 %4, %1
1176  %6 = getelementptr inbounds i16, i16* %x, i32 16
1177  %7 = bitcast i16* %6 to <8 x i16>*
1178  %wide.load.2 = load <8 x i16>, <8 x i16>* %7, align 2
1179  %8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.2)
1180  %9 = add i16 %8, %5
1181  %10 = getelementptr inbounds i16, i16* %x, i32 24
1182  %11 = bitcast i16* %10 to <8 x i16>*
1183  %wide.load.3 = load <8 x i16>, <8 x i16>* %11, align 2
1184  %12 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.3)
1185  %13 = add i16 %12, %9
1186  %14 = getelementptr inbounds i16, i16* %x, i32 32
1187  %15 = bitcast i16* %14 to <8 x i16>*
1188  %wide.load.4 = load <8 x i16>, <8 x i16>* %15, align 2
1189  %16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.4)
1190  %17 = add i16 %16, %13
1191  %18 = getelementptr inbounds i16, i16* %x, i32 40
1192  %19 = bitcast i16* %18 to <8 x i16>*
1193  %wide.load.5 = load <8 x i16>, <8 x i16>* %19, align 2
1194  %20 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.5)
1195  %21 = add i16 %20, %17
1196  %22 = getelementptr inbounds i16, i16* %x, i32 48
1197  %23 = bitcast i16* %22 to <8 x i16>*
1198  %wide.load.6 = load <8 x i16>, <8 x i16>* %23, align 2
1199  %24 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.6)
1200  %25 = add i16 %24, %21
1201  %26 = getelementptr inbounds i16, i16* %x, i32 56
1202  %27 = bitcast i16* %26 to <8 x i16>*
1203  %wide.load.7 = load <8 x i16>, <8 x i16>* %27, align 2
1204  %28 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.7)
1205  %29 = add i16 %28, %25
1206  %30 = getelementptr inbounds i16, i16* %x, i32 64
1207  %31 = bitcast i16* %30 to <8 x i16>*
1208  %wide.load.8 = load <8 x i16>, <8 x i16>* %31, align 2
1209  %32 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.8)
1210  %33 = add i16 %32, %29
1211  %34 = getelementptr inbounds i16, i16* %x, i32 72
1212  %35 = bitcast i16* %34 to <8 x i16>*
1213  %wide.load.9 = load <8 x i16>, <8 x i16>* %35, align 2
1214  %36 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.9)
1215  %37 = add i16 %36, %33
1216  %38 = getelementptr inbounds i16, i16* %x, i32 80
1217  %39 = bitcast i16* %38 to <8 x i16>*
1218  %wide.load.10 = load <8 x i16>, <8 x i16>* %39, align 2
1219  %40 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.10)
1220  %41 = add i16 %40, %37
1221  %42 = getelementptr inbounds i16, i16* %x, i32 88
1222  %43 = bitcast i16* %42 to <8 x i16>*
1223  %wide.load.11 = load <8 x i16>, <8 x i16>* %43, align 2
1224  %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.11)
1225  %45 = add i16 %44, %41
1226  %46 = getelementptr inbounds i16, i16* %x, i32 96
1227  %47 = bitcast i16* %46 to <8 x i16>*
1228  %wide.load.12 = load <8 x i16>, <8 x i16>* %47, align 2
1229  %48 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.12)
1230  %49 = add i16 %48, %45
1231  %50 = getelementptr inbounds i16, i16* %x, i32 104
1232  %51 = bitcast i16* %50 to <8 x i16>*
1233  %wide.load.13 = load <8 x i16>, <8 x i16>* %51, align 2
1234  %52 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.13)
1235  %53 = add i16 %52, %49
1236  %54 = getelementptr inbounds i16, i16* %x, i32 112
1237  %55 = bitcast i16* %54 to <8 x i16>*
1238  %wide.load.14 = load <8 x i16>, <8 x i16>* %55, align 2
1239  %56 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.14)
1240  %57 = add i16 %56, %53
1241  %58 = getelementptr inbounds i16, i16* %x, i32 120
1242  %59 = bitcast i16* %58 to <8 x i16>*
1243  %wide.load.15 = load <8 x i16>, <8 x i16>* %59, align 2
1244  %60 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide.load.15)
1245  %61 = add i16 %60, %57
1246  ret i16 %61
1247}
1248
1249define zeroext i8 @addv2i8i8(i8* %x) {
1250; CHECK-LABEL: addv2i8i8:
1251; CHECK:       @ %bb.0: @ %entry
1252; CHECK-NEXT:    ldrb r1, [r0]
1253; CHECK-NEXT:    ldrb r0, [r0, #1]
1254; CHECK-NEXT:    add r0, r1
1255; CHECK-NEXT:    uxtb r0, r0
1256; CHECK-NEXT:    bx lr
1257entry:
1258  %0 = load i8, i8* %x, align 1
1259  %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1
1260  %1 = load i8, i8* %arrayidx.1, align 1
1261  %add.1 = add i8 %1, %0
1262  ret i8 %add.1
1263}
1264
1265define zeroext i8 @addv4i8i8(i8* %x) {
1266; CHECK-LABEL: addv4i8i8:
1267; CHECK:       @ %bb.0: @ %entry
1268; CHECK-NEXT:    vldrb.u32 q0, [r0]
1269; CHECK-NEXT:    vaddv.u32 r0, q0
1270; CHECK-NEXT:    uxtb r0, r0
1271; CHECK-NEXT:    bx lr
1272entry:
1273  %0 = bitcast i8* %x to <4 x i8>*
1274  %1 = load <4 x i8>, <4 x i8>* %0, align 1
1275  %2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %1)
1276  ret i8 %2
1277}
1278
1279define zeroext i8 @addv8i8i8(i8* %x) {
1280; CHECK-LABEL: addv8i8i8:
1281; CHECK:       @ %bb.0: @ %entry
1282; CHECK-NEXT:    vldrb.u16 q0, [r0]
1283; CHECK-NEXT:    vaddv.u16 r0, q0
1284; CHECK-NEXT:    uxtb r0, r0
1285; CHECK-NEXT:    bx lr
1286entry:
1287  %0 = bitcast i8* %x to <8 x i8>*
1288  %1 = load <8 x i8>, <8 x i8>* %0, align 1
1289  %2 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %1)
1290  ret i8 %2
1291}
1292
1293define zeroext i8 @addv16i8i8(i8* %x) {
1294; CHECK-LABEL: addv16i8i8:
1295; CHECK:       @ %bb.0: @ %entry
1296; CHECK-NEXT:    vldrb.u8 q0, [r0]
1297; CHECK-NEXT:    vaddv.u8 r0, q0
1298; CHECK-NEXT:    uxtb r0, r0
1299; CHECK-NEXT:    bx lr
1300entry:
1301  %0 = bitcast i8* %x to <16 x i8>*
1302  %1 = load <16 x i8>, <16 x i8>* %0, align 1
1303  %2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %1)
1304  ret i8 %2
1305}
1306
1307define zeroext i8 @addv24i8i8(i8* %x) {
1308; CHECK-LABEL: addv24i8i8:
1309; CHECK:       @ %bb.0: @ %entry
1310; CHECK-NEXT:    vldrb.u16 q1, [r0]
1311; CHECK-NEXT:    vldrb.u8 q0, [r0, #8]
1312; CHECK-NEXT:    vaddv.u16 r0, q1
1313; CHECK-NEXT:    vaddva.u8 r0, q0
1314; CHECK-NEXT:    uxtb r0, r0
1315; CHECK-NEXT:    bx lr
1316entry:
1317  %0 = bitcast i8* %x to <8 x i8>*
1318  %1 = load <8 x i8>, <8 x i8>* %0, align 1
1319  %arrayidx.8 = getelementptr inbounds i8, i8* %x, i32 8
1320  %2 = bitcast i8* %arrayidx.8 to <16 x i8>*
1321  %3 = load <16 x i8>, <16 x i8>* %2, align 1
1322  %4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %3)
1323  %5 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %1)
1324  %op.rdx = add i8 %4, %5
1325  ret i8 %op.rdx
1326}
1327
1328define zeroext i8 @addv32i8i8(i8* %x) {
1329; CHECK-LABEL: addv32i8i8:
1330; CHECK:       @ %bb.0: @ %entry
1331; CHECK-NEXT:    vldrb.u8 q1, [r0]
1332; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
1333; CHECK-NEXT:    vaddv.u8 r0, q1
1334; CHECK-NEXT:    vaddva.u8 r0, q0
1335; CHECK-NEXT:    uxtb r0, r0
1336; CHECK-NEXT:    bx lr
1337entry:
1338  %0 = bitcast i8* %x to <32 x i8>*
1339  %1 = load <32 x i8>, <32 x i8>* %0, align 1
1340  %2 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %1)
1341  ret i8 %2
1342}
1343
1344define zeroext i8 @addv64i8i8(i8* %x) {
1345; CHECK-LABEL: addv64i8i8:
1346; CHECK:       @ %bb.0: @ %entry
1347; CHECK-NEXT:    vldrb.u8 q1, [r0]
1348; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
1349; CHECK-NEXT:    vaddv.u8 r2, q1
1350; CHECK-NEXT:    vaddva.u8 r2, q0
1351; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
1352; CHECK-NEXT:    vaddva.u8 r2, q0
1353; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
1354; CHECK-NEXT:    vaddva.u8 r2, q0
1355; CHECK-NEXT:    uxtb r0, r2
1356; CHECK-NEXT:    bx lr
1357entry:
1358  %0 = bitcast i8* %x to <64 x i8>*
1359  %1 = load <64 x i8>, <64 x i8>* %0, align 1
1360  %2 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %1)
1361  ret i8 %2
1362}
1363
1364define zeroext i8 @addv128i8i8(i8* %x) {
1365; CHECK-LABEL: addv128i8i8:
1366; CHECK:       @ %bb.0: @ %entry
1367; CHECK-NEXT:    vldrb.u8 q1, [r0]
1368; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
1369; CHECK-NEXT:    vaddv.u8 r2, q1
1370; CHECK-NEXT:    vaddva.u8 r2, q0
1371; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
1372; CHECK-NEXT:    vaddva.u8 r2, q0
1373; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
1374; CHECK-NEXT:    vaddva.u8 r2, q0
1375; CHECK-NEXT:    vldrb.u8 q0, [r0, #64]
1376; CHECK-NEXT:    vaddva.u8 r2, q0
1377; CHECK-NEXT:    vldrb.u8 q0, [r0, #80]
1378; CHECK-NEXT:    vaddva.u8 r2, q0
1379; CHECK-NEXT:    vldrb.u8 q0, [r0, #96]
1380; CHECK-NEXT:    vaddva.u8 r2, q0
1381; CHECK-NEXT:    vldrb.u8 q0, [r0, #112]
1382; CHECK-NEXT:    vaddva.u8 r2, q0
1383; CHECK-NEXT:    uxtb r0, r2
1384; CHECK-NEXT:    bx lr
1385entry:
1386  %0 = bitcast i8* %x to <16 x i8>*
1387  %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
1388  %1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load)
1389  %2 = getelementptr inbounds i8, i8* %x, i32 16
1390  %3 = bitcast i8* %2 to <16 x i8>*
1391  %wide.load.1 = load <16 x i8>, <16 x i8>* %3, align 1
1392  %4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.1)
1393  %5 = add i8 %4, %1
1394  %6 = getelementptr inbounds i8, i8* %x, i32 32
1395  %7 = bitcast i8* %6 to <16 x i8>*
1396  %wide.load.2 = load <16 x i8>, <16 x i8>* %7, align 1
1397  %8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.2)
1398  %9 = add i8 %8, %5
1399  %10 = getelementptr inbounds i8, i8* %x, i32 48
1400  %11 = bitcast i8* %10 to <16 x i8>*
1401  %wide.load.3 = load <16 x i8>, <16 x i8>* %11, align 1
1402  %12 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.3)
1403  %13 = add i8 %12, %9
1404  %14 = getelementptr inbounds i8, i8* %x, i32 64
1405  %15 = bitcast i8* %14 to <16 x i8>*
1406  %wide.load.4 = load <16 x i8>, <16 x i8>* %15, align 1
1407  %16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.4)
1408  %17 = add i8 %16, %13
1409  %18 = getelementptr inbounds i8, i8* %x, i32 80
1410  %19 = bitcast i8* %18 to <16 x i8>*
1411  %wide.load.5 = load <16 x i8>, <16 x i8>* %19, align 1
1412  %20 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.5)
1413  %21 = add i8 %20, %17
1414  %22 = getelementptr inbounds i8, i8* %x, i32 96
1415  %23 = bitcast i8* %22 to <16 x i8>*
1416  %wide.load.6 = load <16 x i8>, <16 x i8>* %23, align 1
1417  %24 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.6)
1418  %25 = add i8 %24, %21
1419  %26 = getelementptr inbounds i8, i8* %x, i32 112
1420  %27 = bitcast i8* %26 to <16 x i8>*
1421  %wide.load.7 = load <16 x i8>, <16 x i8>* %27, align 1
1422  %28 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %wide.load.7)
1423  %29 = add i8 %28, %25
1424  ret i8 %29
1425}
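; The mlav* tests below cover multiply-accumulate reductions (sums of x[i]*y[i]),
; which are expected to lower to MVE vmlav/vmlava accumulating into a GPR.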
1429define i32 @mlav2i32i32(i32* %x, i32* %y) {
1430; CHECK-LABEL: mlav2i32i32:
1431; CHECK:       @ %bb.0: @ %entry
1432; CHECK-NEXT:    ldrd r2, r0, [r0]
1433; CHECK-NEXT:    ldrd r3, r1, [r1]
1434; CHECK-NEXT:    muls r2, r3, r2
1435; CHECK-NEXT:    mla r0, r1, r0, r2
1436; CHECK-NEXT:    bx lr
1437entry:
1438  %0 = load i32, i32* %x, align 4
1439  %1 = load i32, i32* %y, align 4
1440  %mul = mul nsw i32 %1, %0
1441  %arrayidx.1 = getelementptr inbounds i32, i32* %x, i32 1
1442  %2 = load i32, i32* %arrayidx.1, align 4
1443  %arrayidx1.1 = getelementptr inbounds i32, i32* %y, i32 1
1444  %3 = load i32, i32* %arrayidx1.1, align 4
1445  %mul.1 = mul nsw i32 %3, %2
1446  %add.1 = add nsw i32 %mul.1, %mul
1447  ret i32 %add.1
1448}
1449
1450define i32 @mlav4i32i32(i32* %x, i32* %y) {
1451; CHECK-LABEL: mlav4i32i32:
1452; CHECK:       @ %bb.0: @ %entry
1453; CHECK-NEXT:    vldrw.u32 q0, [r0]
1454; CHECK-NEXT:    vldrw.u32 q1, [r1]
1455; CHECK-NEXT:    vmlav.u32 r0, q1, q0
1456; CHECK-NEXT:    bx lr
1457entry:
1458  %0 = bitcast i32* %x to <4 x i32>*
1459  %1 = load <4 x i32>, <4 x i32>* %0, align 4
1460  %2 = bitcast i32* %y to <4 x i32>*
1461  %3 = load <4 x i32>, <4 x i32>* %2, align 4
1462  %4 = mul nsw <4 x i32> %3, %1
1463  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
1464  ret i32 %5
1465}
1466
1467define i32 @mlav8i32i32(i32* %x, i32* %y) {
1468; CHECK-LABEL: mlav8i32i32:
1469; CHECK:       @ %bb.0: @ %entry
1470; CHECK-NEXT:    vldrw.u32 q0, [r0]
1471; CHECK-NEXT:    vldrw.u32 q1, [r1]
1472; CHECK-NEXT:    vmlav.u32 r2, q1, q0
1473; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
1474; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
1475; CHECK-NEXT:    vmlava.u32 r2, q1, q0
1476; CHECK-NEXT:    mov r0, r2
1477; CHECK-NEXT:    bx lr
1478entry:
1479  %0 = bitcast i32* %x to <8 x i32>*
1480  %1 = load <8 x i32>, <8 x i32>* %0, align 4
1481  %2 = bitcast i32* %y to <8 x i32>*
1482  %3 = load <8 x i32>, <8 x i32>* %2, align 4
1483  %4 = mul nsw <8 x i32> %3, %1
1484  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
1485  ret i32 %5
1486}
1487
1488define i32 @mlav16i32i32(i32* %x, i32* %y) {
1489; CHECK-LABEL: mlav16i32i32:
1490; CHECK:       @ %bb.0: @ %entry
1491; CHECK-NEXT:    vldrw.u32 q0, [r0]
1492; CHECK-NEXT:    vldrw.u32 q1, [r1]
1493; CHECK-NEXT:    vmlav.u32 r2, q1, q0
1494; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
1495; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
1496; CHECK-NEXT:    vmlava.u32 r2, q1, q0
1497; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
1498; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
1499; CHECK-NEXT:    vmlava.u32 r2, q1, q0
1500; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
1501; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
1502; CHECK-NEXT:    vmlava.u32 r2, q1, q0
1503; CHECK-NEXT:    mov r0, r2
1504; CHECK-NEXT:    bx lr
1505entry:
1506  %0 = bitcast i32* %x to <16 x i32>*
1507  %1 = load <16 x i32>, <16 x i32>* %0, align 4
1508  %2 = bitcast i32* %y to <16 x i32>*
1509  %3 = load <16 x i32>, <16 x i32>* %2, align 4
1510  %4 = mul nsw <16 x i32> %3, %1
1511  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
1512  ret i32 %5
1513}
1514
1515define i32 @mlav24i32i32(i32* %x, i32* %y) {
1516; CHECK-LABEL: mlav24i32i32:
1517; CHECK:       @ %bb.0: @ %entry
1518; CHECK-NEXT:    vldrw.u32 q0, [r0]
1519; CHECK-NEXT:    vldrw.u32 q1, [r1]
1520; CHECK-NEXT:    mov r2, r0
1521; CHECK-NEXT:    vmlav.u32 r0, q1, q0
1522; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
1523; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
1524; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1525; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
1526; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
1527; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1528; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
1529; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
1530; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1531; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
1532; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
1533; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1534; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
1535; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
1536; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1537; CHECK-NEXT:    bx lr
1538entry:
1539  %0 = bitcast i32* %x to <8 x i32>*
1540  %1 = load <8 x i32>, <8 x i32>* %0, align 4
1541  %2 = bitcast i32* %y to <8 x i32>*
1542  %3 = load <8 x i32>, <8 x i32>* %2, align 4
1543  %4 = mul nsw <8 x i32> %3, %1
1544  %arrayidx.8 = getelementptr inbounds i32, i32* %x, i32 8
1545  %arrayidx1.8 = getelementptr inbounds i32, i32* %y, i32 8
1546  %5 = bitcast i32* %arrayidx.8 to <16 x i32>*
1547  %6 = load <16 x i32>, <16 x i32>* %5, align 4
1548  %7 = bitcast i32* %arrayidx1.8 to <16 x i32>*
1549  %8 = load <16 x i32>, <16 x i32>* %7, align 4
1550  %9 = mul nsw <16 x i32> %8, %6
1551  %10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %9)
1552  %11 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
1553  %op.rdx = add nsw i32 %10, %11
1554  ret i32 %op.rdx
1555}
1556
1557define i32 @mlav32i32i32(i32* %x, i32* %y) {
1558; CHECK-LABEL: mlav32i32i32:
1559; CHECK:       @ %bb.0: @ %entry
1560; CHECK-NEXT:    vldrw.u32 q0, [r0]
1561; CHECK-NEXT:    vldrw.u32 q1, [r1]
1562; CHECK-NEXT:    mov r2, r0
1563; CHECK-NEXT:    vmlav.u32 r0, q1, q0
1564; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
1565; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
1566; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1567; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
1568; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
1569; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1570; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
1571; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
1572; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1573; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
1574; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
1575; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1576; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
1577; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
1578; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1579; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
1580; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
1581; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1582; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
1583; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
1584; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1585; CHECK-NEXT:    bx lr
1586entry:
1587  %0 = bitcast i32* %x to <32 x i32>*
1588  %1 = load <32 x i32>, <32 x i32>* %0, align 4
1589  %2 = bitcast i32* %y to <32 x i32>*
1590  %3 = load <32 x i32>, <32 x i32>* %2, align 4
1591  %4 = mul nsw <32 x i32> %3, %1
1592  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
1593  ret i32 %5
1594}
1595
1596define i32 @mlav64i32i32(i32* %x, i32* %y) {
1597; CHECK-LABEL: mlav64i32i32:
1598; CHECK:       @ %bb.0: @ %entry
1599; CHECK-NEXT:    vldrw.u32 q0, [r0]
1600; CHECK-NEXT:    vldrw.u32 q1, [r1]
1601; CHECK-NEXT:    mov r2, r0
1602; CHECK-NEXT:    vmlav.u32 r0, q1, q0
1603; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
1604; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
1605; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1606; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
1607; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
1608; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1609; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
1610; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
1611; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1612; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
1613; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
1614; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1615; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
1616; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
1617; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1618; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
1619; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
1620; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1621; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
1622; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
1623; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1624; CHECK-NEXT:    vldrw.u32 q0, [r2, #128]
1625; CHECK-NEXT:    vldrw.u32 q1, [r1, #128]
1626; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1627; CHECK-NEXT:    vldrw.u32 q0, [r2, #144]
1628; CHECK-NEXT:    vldrw.u32 q1, [r1, #144]
1629; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1630; CHECK-NEXT:    vldrw.u32 q0, [r2, #160]
1631; CHECK-NEXT:    vldrw.u32 q1, [r1, #160]
1632; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1633; CHECK-NEXT:    vldrw.u32 q0, [r2, #176]
1634; CHECK-NEXT:    vldrw.u32 q1, [r1, #176]
1635; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1636; CHECK-NEXT:    vldrw.u32 q0, [r2, #192]
1637; CHECK-NEXT:    vldrw.u32 q1, [r1, #192]
1638; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1639; CHECK-NEXT:    vldrw.u32 q0, [r2, #208]
1640; CHECK-NEXT:    vldrw.u32 q1, [r1, #208]
1641; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1642; CHECK-NEXT:    vldrw.u32 q0, [r2, #224]
1643; CHECK-NEXT:    vldrw.u32 q1, [r1, #224]
1644; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1645; CHECK-NEXT:    vldrw.u32 q0, [r2, #240]
1646; CHECK-NEXT:    vldrw.u32 q1, [r1, #240]
1647; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1648; CHECK-NEXT:    bx lr
1649entry:
1650  %0 = bitcast i32* %x to <4 x i32>*
1651  %wide.load = load <4 x i32>, <4 x i32>* %0, align 4
1652  %1 = bitcast i32* %y to <4 x i32>*
1653  %wide.load10 = load <4 x i32>, <4 x i32>* %1, align 4
1654  %2 = mul nsw <4 x i32> %wide.load10, %wide.load
1655  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
1656  %4 = getelementptr inbounds i32, i32* %x, i32 4
1657  %5 = bitcast i32* %4 to <4 x i32>*
1658  %wide.load.1 = load <4 x i32>, <4 x i32>* %5, align 4
1659  %6 = getelementptr inbounds i32, i32* %y, i32 4
1660  %7 = bitcast i32* %6 to <4 x i32>*
1661  %wide.load10.1 = load <4 x i32>, <4 x i32>* %7, align 4
1662  %8 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
1663  %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
1664  %10 = add i32 %9, %3
1665  %11 = getelementptr inbounds i32, i32* %x, i32 8
1666  %12 = bitcast i32* %11 to <4 x i32>*
1667  %wide.load.2 = load <4 x i32>, <4 x i32>* %12, align 4
1668  %13 = getelementptr inbounds i32, i32* %y, i32 8
1669  %14 = bitcast i32* %13 to <4 x i32>*
1670  %wide.load10.2 = load <4 x i32>, <4 x i32>* %14, align 4
1671  %15 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
1672  %16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %15)
1673  %17 = add i32 %16, %10
1674  %18 = getelementptr inbounds i32, i32* %x, i32 12
1675  %19 = bitcast i32* %18 to <4 x i32>*
1676  %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 4
1677  %20 = getelementptr inbounds i32, i32* %y, i32 12
1678  %21 = bitcast i32* %20 to <4 x i32>*
1679  %wide.load10.3 = load <4 x i32>, <4 x i32>* %21, align 4
1680  %22 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
1681  %23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %22)
1682  %24 = add i32 %23, %17
1683  %25 = getelementptr inbounds i32, i32* %x, i32 16
1684  %26 = bitcast i32* %25 to <4 x i32>*
1685  %wide.load.4 = load <4 x i32>, <4 x i32>* %26, align 4
1686  %27 = getelementptr inbounds i32, i32* %y, i32 16
1687  %28 = bitcast i32* %27 to <4 x i32>*
1688  %wide.load10.4 = load <4 x i32>, <4 x i32>* %28, align 4
1689  %29 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
1690  %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
1691  %31 = add i32 %30, %24
1692  %32 = getelementptr inbounds i32, i32* %x, i32 20
1693  %33 = bitcast i32* %32 to <4 x i32>*
1694  %wide.load.5 = load <4 x i32>, <4 x i32>* %33, align 4
1695  %34 = getelementptr inbounds i32, i32* %y, i32 20
1696  %35 = bitcast i32* %34 to <4 x i32>*
1697  %wide.load10.5 = load <4 x i32>, <4 x i32>* %35, align 4
1698  %36 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
1699  %37 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %36)
1700  %38 = add i32 %37, %31
1701  %39 = getelementptr inbounds i32, i32* %x, i32 24
1702  %40 = bitcast i32* %39 to <4 x i32>*
1703  %wide.load.6 = load <4 x i32>, <4 x i32>* %40, align 4
1704  %41 = getelementptr inbounds i32, i32* %y, i32 24
1705  %42 = bitcast i32* %41 to <4 x i32>*
1706  %wide.load10.6 = load <4 x i32>, <4 x i32>* %42, align 4
1707  %43 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
1708  %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %43)
1709  %45 = add i32 %44, %38
1710  %46 = getelementptr inbounds i32, i32* %x, i32 28
1711  %47 = bitcast i32* %46 to <4 x i32>*
1712  %wide.load.7 = load <4 x i32>, <4 x i32>* %47, align 4
1713  %48 = getelementptr inbounds i32, i32* %y, i32 28
1714  %49 = bitcast i32* %48 to <4 x i32>*
1715  %wide.load10.7 = load <4 x i32>, <4 x i32>* %49, align 4
1716  %50 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
1717  %51 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %50)
1718  %52 = add i32 %51, %45
1719  %53 = getelementptr inbounds i32, i32* %x, i32 32
1720  %54 = bitcast i32* %53 to <4 x i32>*
1721  %wide.load.8 = load <4 x i32>, <4 x i32>* %54, align 4
1722  %55 = getelementptr inbounds i32, i32* %y, i32 32
1723  %56 = bitcast i32* %55 to <4 x i32>*
1724  %wide.load10.8 = load <4 x i32>, <4 x i32>* %56, align 4
1725  %57 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
1726  %58 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %57)
1727  %59 = add i32 %58, %52
1728  %60 = getelementptr inbounds i32, i32* %x, i32 36
1729  %61 = bitcast i32* %60 to <4 x i32>*
1730  %wide.load.9 = load <4 x i32>, <4 x i32>* %61, align 4
1731  %62 = getelementptr inbounds i32, i32* %y, i32 36
1732  %63 = bitcast i32* %62 to <4 x i32>*
1733  %wide.load10.9 = load <4 x i32>, <4 x i32>* %63, align 4
1734  %64 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
1735  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
1736  %66 = add i32 %65, %59
1737  %67 = getelementptr inbounds i32, i32* %x, i32 40
1738  %68 = bitcast i32* %67 to <4 x i32>*
1739  %wide.load.10 = load <4 x i32>, <4 x i32>* %68, align 4
1740  %69 = getelementptr inbounds i32, i32* %y, i32 40
1741  %70 = bitcast i32* %69 to <4 x i32>*
1742  %wide.load10.10 = load <4 x i32>, <4 x i32>* %70, align 4
1743  %71 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
1744  %72 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %71)
1745  %73 = add i32 %72, %66
1746  %74 = getelementptr inbounds i32, i32* %x, i32 44
1747  %75 = bitcast i32* %74 to <4 x i32>*
1748  %wide.load.11 = load <4 x i32>, <4 x i32>* %75, align 4
1749  %76 = getelementptr inbounds i32, i32* %y, i32 44
1750  %77 = bitcast i32* %76 to <4 x i32>*
1751  %wide.load10.11 = load <4 x i32>, <4 x i32>* %77, align 4
1752  %78 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
1753  %79 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %78)
1754  %80 = add i32 %79, %73
1755  %81 = getelementptr inbounds i32, i32* %x, i32 48
1756  %82 = bitcast i32* %81 to <4 x i32>*
1757  %wide.load.12 = load <4 x i32>, <4 x i32>* %82, align 4
1758  %83 = getelementptr inbounds i32, i32* %y, i32 48
1759  %84 = bitcast i32* %83 to <4 x i32>*
1760  %wide.load10.12 = load <4 x i32>, <4 x i32>* %84, align 4
1761  %85 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
1762  %86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %85)
1763  %87 = add i32 %86, %80
1764  %88 = getelementptr inbounds i32, i32* %x, i32 52
1765  %89 = bitcast i32* %88 to <4 x i32>*
1766  %wide.load.13 = load <4 x i32>, <4 x i32>* %89, align 4
1767  %90 = getelementptr inbounds i32, i32* %y, i32 52
1768  %91 = bitcast i32* %90 to <4 x i32>*
1769  %wide.load10.13 = load <4 x i32>, <4 x i32>* %91, align 4
1770  %92 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
1771  %93 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %92)
1772  %94 = add i32 %93, %87
1773  %95 = getelementptr inbounds i32, i32* %x, i32 56
1774  %96 = bitcast i32* %95 to <4 x i32>*
1775  %wide.load.14 = load <4 x i32>, <4 x i32>* %96, align 4
1776  %97 = getelementptr inbounds i32, i32* %y, i32 56
1777  %98 = bitcast i32* %97 to <4 x i32>*
1778  %wide.load10.14 = load <4 x i32>, <4 x i32>* %98, align 4
1779  %99 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
1780  %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99)
1781  %101 = add i32 %100, %94
1782  %102 = getelementptr inbounds i32, i32* %x, i32 60
1783  %103 = bitcast i32* %102 to <4 x i32>*
1784  %wide.load.15 = load <4 x i32>, <4 x i32>* %103, align 4
1785  %104 = getelementptr inbounds i32, i32* %y, i32 60
1786  %105 = bitcast i32* %104 to <4 x i32>*
1787  %wide.load10.15 = load <4 x i32>, <4 x i32>* %105, align 4
1788  %106 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
1789  %107 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %106)
1790  %108 = add i32 %107, %101
1791  ret i32 %108
1792}
1793
1794define i32 @mlav128i32i32(i32* %x, i32* %y) {
1795; CHECK-LABEL: mlav128i32i32:
1796; CHECK:       @ %bb.0: @ %entry
1797; CHECK-NEXT:    vldrw.u32 q0, [r0]
1798; CHECK-NEXT:    vldrw.u32 q1, [r1]
1799; CHECK-NEXT:    mov r2, r0
1800; CHECK-NEXT:    vmlav.u32 r0, q1, q0
1801; CHECK-NEXT:    vldrw.u32 q0, [r2, #16]
1802; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
1803; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1804; CHECK-NEXT:    vldrw.u32 q0, [r2, #32]
1805; CHECK-NEXT:    vldrw.u32 q1, [r1, #32]
1806; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1807; CHECK-NEXT:    vldrw.u32 q0, [r2, #48]
1808; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
1809; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1810; CHECK-NEXT:    vldrw.u32 q0, [r2, #64]
1811; CHECK-NEXT:    vldrw.u32 q1, [r1, #64]
1812; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1813; CHECK-NEXT:    vldrw.u32 q0, [r2, #80]
1814; CHECK-NEXT:    vldrw.u32 q1, [r1, #80]
1815; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1816; CHECK-NEXT:    vldrw.u32 q0, [r2, #96]
1817; CHECK-NEXT:    vldrw.u32 q1, [r1, #96]
1818; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1819; CHECK-NEXT:    vldrw.u32 q0, [r2, #112]
1820; CHECK-NEXT:    vldrw.u32 q1, [r1, #112]
1821; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1822; CHECK-NEXT:    vldrw.u32 q0, [r2, #128]
1823; CHECK-NEXT:    vldrw.u32 q1, [r1, #128]
1824; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1825; CHECK-NEXT:    vldrw.u32 q0, [r2, #144]
1826; CHECK-NEXT:    vldrw.u32 q1, [r1, #144]
1827; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1828; CHECK-NEXT:    vldrw.u32 q0, [r2, #160]
1829; CHECK-NEXT:    vldrw.u32 q1, [r1, #160]
1830; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1831; CHECK-NEXT:    vldrw.u32 q0, [r2, #176]
1832; CHECK-NEXT:    vldrw.u32 q1, [r1, #176]
1833; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1834; CHECK-NEXT:    vldrw.u32 q0, [r2, #192]
1835; CHECK-NEXT:    vldrw.u32 q1, [r1, #192]
1836; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1837; CHECK-NEXT:    vldrw.u32 q0, [r2, #208]
1838; CHECK-NEXT:    vldrw.u32 q1, [r1, #208]
1839; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1840; CHECK-NEXT:    vldrw.u32 q0, [r2, #224]
1841; CHECK-NEXT:    vldrw.u32 q1, [r1, #224]
1842; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1843; CHECK-NEXT:    vldrw.u32 q0, [r2, #240]
1844; CHECK-NEXT:    vldrw.u32 q1, [r1, #240]
1845; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1846; CHECK-NEXT:    vldrw.u32 q0, [r2, #256]
1847; CHECK-NEXT:    vldrw.u32 q1, [r1, #256]
1848; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1849; CHECK-NEXT:    vldrw.u32 q0, [r2, #272]
1850; CHECK-NEXT:    vldrw.u32 q1, [r1, #272]
1851; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1852; CHECK-NEXT:    vldrw.u32 q0, [r2, #288]
1853; CHECK-NEXT:    vldrw.u32 q1, [r1, #288]
1854; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1855; CHECK-NEXT:    vldrw.u32 q0, [r2, #304]
1856; CHECK-NEXT:    vldrw.u32 q1, [r1, #304]
1857; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1858; CHECK-NEXT:    vldrw.u32 q0, [r2, #320]
1859; CHECK-NEXT:    vldrw.u32 q1, [r1, #320]
1860; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1861; CHECK-NEXT:    vldrw.u32 q0, [r2, #336]
1862; CHECK-NEXT:    vldrw.u32 q1, [r1, #336]
1863; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1864; CHECK-NEXT:    vldrw.u32 q0, [r2, #352]
1865; CHECK-NEXT:    vldrw.u32 q1, [r1, #352]
1866; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1867; CHECK-NEXT:    vldrw.u32 q0, [r2, #368]
1868; CHECK-NEXT:    vldrw.u32 q1, [r1, #368]
1869; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1870; CHECK-NEXT:    vldrw.u32 q0, [r2, #384]
1871; CHECK-NEXT:    vldrw.u32 q1, [r1, #384]
1872; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1873; CHECK-NEXT:    vldrw.u32 q0, [r2, #400]
1874; CHECK-NEXT:    vldrw.u32 q1, [r1, #400]
1875; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1876; CHECK-NEXT:    vldrw.u32 q0, [r2, #416]
1877; CHECK-NEXT:    vldrw.u32 q1, [r1, #416]
1878; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1879; CHECK-NEXT:    vldrw.u32 q0, [r2, #432]
1880; CHECK-NEXT:    vldrw.u32 q1, [r1, #432]
1881; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1882; CHECK-NEXT:    vldrw.u32 q0, [r2, #448]
1883; CHECK-NEXT:    vldrw.u32 q1, [r1, #448]
1884; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1885; CHECK-NEXT:    vldrw.u32 q0, [r2, #464]
1886; CHECK-NEXT:    vldrw.u32 q1, [r1, #464]
1887; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1888; CHECK-NEXT:    vldrw.u32 q0, [r2, #480]
1889; CHECK-NEXT:    vldrw.u32 q1, [r1, #480]
1890; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1891; CHECK-NEXT:    vldrw.u32 q0, [r2, #496]
1892; CHECK-NEXT:    vldrw.u32 q1, [r1, #496]
1893; CHECK-NEXT:    vmlava.u32 r0, q1, q0
1894; CHECK-NEXT:    bx lr
1895entry:
1896  %0 = bitcast i32* %x to <4 x i32>*
1897  %wide.load = load <4 x i32>, <4 x i32>* %0, align 4
1898  %1 = bitcast i32* %y to <4 x i32>*
1899  %wide.load10 = load <4 x i32>, <4 x i32>* %1, align 4
1900  %2 = mul nsw <4 x i32> %wide.load10, %wide.load
1901  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
1902  %4 = getelementptr inbounds i32, i32* %x, i32 4
1903  %5 = bitcast i32* %4 to <4 x i32>*
1904  %wide.load.1 = load <4 x i32>, <4 x i32>* %5, align 4
1905  %6 = getelementptr inbounds i32, i32* %y, i32 4
1906  %7 = bitcast i32* %6 to <4 x i32>*
1907  %wide.load10.1 = load <4 x i32>, <4 x i32>* %7, align 4
1908  %8 = mul nsw <4 x i32> %wide.load10.1, %wide.load.1
1909  %9 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %8)
1910  %10 = add i32 %9, %3
1911  %11 = getelementptr inbounds i32, i32* %x, i32 8
1912  %12 = bitcast i32* %11 to <4 x i32>*
1913  %wide.load.2 = load <4 x i32>, <4 x i32>* %12, align 4
1914  %13 = getelementptr inbounds i32, i32* %y, i32 8
1915  %14 = bitcast i32* %13 to <4 x i32>*
1916  %wide.load10.2 = load <4 x i32>, <4 x i32>* %14, align 4
1917  %15 = mul nsw <4 x i32> %wide.load10.2, %wide.load.2
1918  %16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %15)
1919  %17 = add i32 %16, %10
1920  %18 = getelementptr inbounds i32, i32* %x, i32 12
1921  %19 = bitcast i32* %18 to <4 x i32>*
1922  %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 4
1923  %20 = getelementptr inbounds i32, i32* %y, i32 12
1924  %21 = bitcast i32* %20 to <4 x i32>*
1925  %wide.load10.3 = load <4 x i32>, <4 x i32>* %21, align 4
1926  %22 = mul nsw <4 x i32> %wide.load10.3, %wide.load.3
1927  %23 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %22)
1928  %24 = add i32 %23, %17
1929  %25 = getelementptr inbounds i32, i32* %x, i32 16
1930  %26 = bitcast i32* %25 to <4 x i32>*
1931  %wide.load.4 = load <4 x i32>, <4 x i32>* %26, align 4
1932  %27 = getelementptr inbounds i32, i32* %y, i32 16
1933  %28 = bitcast i32* %27 to <4 x i32>*
1934  %wide.load10.4 = load <4 x i32>, <4 x i32>* %28, align 4
1935  %29 = mul nsw <4 x i32> %wide.load10.4, %wide.load.4
1936  %30 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %29)
1937  %31 = add i32 %30, %24
1938  %32 = getelementptr inbounds i32, i32* %x, i32 20
1939  %33 = bitcast i32* %32 to <4 x i32>*
1940  %wide.load.5 = load <4 x i32>, <4 x i32>* %33, align 4
1941  %34 = getelementptr inbounds i32, i32* %y, i32 20
1942  %35 = bitcast i32* %34 to <4 x i32>*
1943  %wide.load10.5 = load <4 x i32>, <4 x i32>* %35, align 4
1944  %36 = mul nsw <4 x i32> %wide.load10.5, %wide.load.5
1945  %37 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %36)
1946  %38 = add i32 %37, %31
1947  %39 = getelementptr inbounds i32, i32* %x, i32 24
1948  %40 = bitcast i32* %39 to <4 x i32>*
1949  %wide.load.6 = load <4 x i32>, <4 x i32>* %40, align 4
1950  %41 = getelementptr inbounds i32, i32* %y, i32 24
1951  %42 = bitcast i32* %41 to <4 x i32>*
1952  %wide.load10.6 = load <4 x i32>, <4 x i32>* %42, align 4
1953  %43 = mul nsw <4 x i32> %wide.load10.6, %wide.load.6
1954  %44 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %43)
1955  %45 = add i32 %44, %38
1956  %46 = getelementptr inbounds i32, i32* %x, i32 28
1957  %47 = bitcast i32* %46 to <4 x i32>*
1958  %wide.load.7 = load <4 x i32>, <4 x i32>* %47, align 4
1959  %48 = getelementptr inbounds i32, i32* %y, i32 28
1960  %49 = bitcast i32* %48 to <4 x i32>*
1961  %wide.load10.7 = load <4 x i32>, <4 x i32>* %49, align 4
1962  %50 = mul nsw <4 x i32> %wide.load10.7, %wide.load.7
1963  %51 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %50)
1964  %52 = add i32 %51, %45
1965  %53 = getelementptr inbounds i32, i32* %x, i32 32
1966  %54 = bitcast i32* %53 to <4 x i32>*
1967  %wide.load.8 = load <4 x i32>, <4 x i32>* %54, align 4
1968  %55 = getelementptr inbounds i32, i32* %y, i32 32
1969  %56 = bitcast i32* %55 to <4 x i32>*
1970  %wide.load10.8 = load <4 x i32>, <4 x i32>* %56, align 4
1971  %57 = mul nsw <4 x i32> %wide.load10.8, %wide.load.8
1972  %58 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %57)
1973  %59 = add i32 %58, %52
1974  %60 = getelementptr inbounds i32, i32* %x, i32 36
1975  %61 = bitcast i32* %60 to <4 x i32>*
1976  %wide.load.9 = load <4 x i32>, <4 x i32>* %61, align 4
1977  %62 = getelementptr inbounds i32, i32* %y, i32 36
1978  %63 = bitcast i32* %62 to <4 x i32>*
1979  %wide.load10.9 = load <4 x i32>, <4 x i32>* %63, align 4
1980  %64 = mul nsw <4 x i32> %wide.load10.9, %wide.load.9
1981  %65 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %64)
1982  %66 = add i32 %65, %59
1983  %67 = getelementptr inbounds i32, i32* %x, i32 40
1984  %68 = bitcast i32* %67 to <4 x i32>*
1985  %wide.load.10 = load <4 x i32>, <4 x i32>* %68, align 4
1986  %69 = getelementptr inbounds i32, i32* %y, i32 40
1987  %70 = bitcast i32* %69 to <4 x i32>*
1988  %wide.load10.10 = load <4 x i32>, <4 x i32>* %70, align 4
1989  %71 = mul nsw <4 x i32> %wide.load10.10, %wide.load.10
1990  %72 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %71)
1991  %73 = add i32 %72, %66
1992  %74 = getelementptr inbounds i32, i32* %x, i32 44
1993  %75 = bitcast i32* %74 to <4 x i32>*
1994  %wide.load.11 = load <4 x i32>, <4 x i32>* %75, align 4
1995  %76 = getelementptr inbounds i32, i32* %y, i32 44
1996  %77 = bitcast i32* %76 to <4 x i32>*
1997  %wide.load10.11 = load <4 x i32>, <4 x i32>* %77, align 4
1998  %78 = mul nsw <4 x i32> %wide.load10.11, %wide.load.11
1999  %79 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %78)
2000  %80 = add i32 %79, %73
2001  %81 = getelementptr inbounds i32, i32* %x, i32 48
2002  %82 = bitcast i32* %81 to <4 x i32>*
2003  %wide.load.12 = load <4 x i32>, <4 x i32>* %82, align 4
2004  %83 = getelementptr inbounds i32, i32* %y, i32 48
2005  %84 = bitcast i32* %83 to <4 x i32>*
2006  %wide.load10.12 = load <4 x i32>, <4 x i32>* %84, align 4
2007  %85 = mul nsw <4 x i32> %wide.load10.12, %wide.load.12
2008  %86 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %85)
2009  %87 = add i32 %86, %80
2010  %88 = getelementptr inbounds i32, i32* %x, i32 52
2011  %89 = bitcast i32* %88 to <4 x i32>*
2012  %wide.load.13 = load <4 x i32>, <4 x i32>* %89, align 4
2013  %90 = getelementptr inbounds i32, i32* %y, i32 52
2014  %91 = bitcast i32* %90 to <4 x i32>*
2015  %wide.load10.13 = load <4 x i32>, <4 x i32>* %91, align 4
2016  %92 = mul nsw <4 x i32> %wide.load10.13, %wide.load.13
2017  %93 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %92)
2018  %94 = add i32 %93, %87
2019  %95 = getelementptr inbounds i32, i32* %x, i32 56
2020  %96 = bitcast i32* %95 to <4 x i32>*
2021  %wide.load.14 = load <4 x i32>, <4 x i32>* %96, align 4
2022  %97 = getelementptr inbounds i32, i32* %y, i32 56
2023  %98 = bitcast i32* %97 to <4 x i32>*
2024  %wide.load10.14 = load <4 x i32>, <4 x i32>* %98, align 4
2025  %99 = mul nsw <4 x i32> %wide.load10.14, %wide.load.14
2026  %100 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %99)
2027  %101 = add i32 %100, %94
2028  %102 = getelementptr inbounds i32, i32* %x, i32 60
2029  %103 = bitcast i32* %102 to <4 x i32>*
2030  %wide.load.15 = load <4 x i32>, <4 x i32>* %103, align 4
2031  %104 = getelementptr inbounds i32, i32* %y, i32 60
2032  %105 = bitcast i32* %104 to <4 x i32>*
2033  %wide.load10.15 = load <4 x i32>, <4 x i32>* %105, align 4
2034  %106 = mul nsw <4 x i32> %wide.load10.15, %wide.load.15
2035  %107 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %106)
2036  %108 = add i32 %107, %101
2037  %109 = getelementptr inbounds i32, i32* %x, i32 64
2038  %110 = bitcast i32* %109 to <4 x i32>*
2039  %wide.load.16 = load <4 x i32>, <4 x i32>* %110, align 4
2040  %111 = getelementptr inbounds i32, i32* %y, i32 64
2041  %112 = bitcast i32* %111 to <4 x i32>*
2042  %wide.load10.16 = load <4 x i32>, <4 x i32>* %112, align 4
2043  %113 = mul nsw <4 x i32> %wide.load10.16, %wide.load.16
2044  %114 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %113)
2045  %115 = add i32 %114, %108
2046  %116 = getelementptr inbounds i32, i32* %x, i32 68
2047  %117 = bitcast i32* %116 to <4 x i32>*
2048  %wide.load.17 = load <4 x i32>, <4 x i32>* %117, align 4
2049  %118 = getelementptr inbounds i32, i32* %y, i32 68
2050  %119 = bitcast i32* %118 to <4 x i32>*
2051  %wide.load10.17 = load <4 x i32>, <4 x i32>* %119, align 4
2052  %120 = mul nsw <4 x i32> %wide.load10.17, %wide.load.17
2053  %121 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %120)
2054  %122 = add i32 %121, %115
2055  %123 = getelementptr inbounds i32, i32* %x, i32 72
2056  %124 = bitcast i32* %123 to <4 x i32>*
2057  %wide.load.18 = load <4 x i32>, <4 x i32>* %124, align 4
2058  %125 = getelementptr inbounds i32, i32* %y, i32 72
2059  %126 = bitcast i32* %125 to <4 x i32>*
2060  %wide.load10.18 = load <4 x i32>, <4 x i32>* %126, align 4
2061  %127 = mul nsw <4 x i32> %wide.load10.18, %wide.load.18
2062  %128 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %127)
2063  %129 = add i32 %128, %122
2064  %130 = getelementptr inbounds i32, i32* %x, i32 76
2065  %131 = bitcast i32* %130 to <4 x i32>*
2066  %wide.load.19 = load <4 x i32>, <4 x i32>* %131, align 4
2067  %132 = getelementptr inbounds i32, i32* %y, i32 76
2068  %133 = bitcast i32* %132 to <4 x i32>*
2069  %wide.load10.19 = load <4 x i32>, <4 x i32>* %133, align 4
2070  %134 = mul nsw <4 x i32> %wide.load10.19, %wide.load.19
2071  %135 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %134)
2072  %136 = add i32 %135, %129
2073  %137 = getelementptr inbounds i32, i32* %x, i32 80
2074  %138 = bitcast i32* %137 to <4 x i32>*
2075  %wide.load.20 = load <4 x i32>, <4 x i32>* %138, align 4
2076  %139 = getelementptr inbounds i32, i32* %y, i32 80
2077  %140 = bitcast i32* %139 to <4 x i32>*
2078  %wide.load10.20 = load <4 x i32>, <4 x i32>* %140, align 4
2079  %141 = mul nsw <4 x i32> %wide.load10.20, %wide.load.20
2080  %142 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %141)
2081  %143 = add i32 %142, %136
2082  %144 = getelementptr inbounds i32, i32* %x, i32 84
2083  %145 = bitcast i32* %144 to <4 x i32>*
2084  %wide.load.21 = load <4 x i32>, <4 x i32>* %145, align 4
2085  %146 = getelementptr inbounds i32, i32* %y, i32 84
2086  %147 = bitcast i32* %146 to <4 x i32>*
2087  %wide.load10.21 = load <4 x i32>, <4 x i32>* %147, align 4
2088  %148 = mul nsw <4 x i32> %wide.load10.21, %wide.load.21
2089  %149 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %148)
2090  %150 = add i32 %149, %143
2091  %151 = getelementptr inbounds i32, i32* %x, i32 88
2092  %152 = bitcast i32* %151 to <4 x i32>*
2093  %wide.load.22 = load <4 x i32>, <4 x i32>* %152, align 4
2094  %153 = getelementptr inbounds i32, i32* %y, i32 88
2095  %154 = bitcast i32* %153 to <4 x i32>*
2096  %wide.load10.22 = load <4 x i32>, <4 x i32>* %154, align 4
2097  %155 = mul nsw <4 x i32> %wide.load10.22, %wide.load.22
2098  %156 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %155)
2099  %157 = add i32 %156, %150
2100  %158 = getelementptr inbounds i32, i32* %x, i32 92
2101  %159 = bitcast i32* %158 to <4 x i32>*
2102  %wide.load.23 = load <4 x i32>, <4 x i32>* %159, align 4
2103  %160 = getelementptr inbounds i32, i32* %y, i32 92
2104  %161 = bitcast i32* %160 to <4 x i32>*
2105  %wide.load10.23 = load <4 x i32>, <4 x i32>* %161, align 4
2106  %162 = mul nsw <4 x i32> %wide.load10.23, %wide.load.23
2107  %163 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %162)
2108  %164 = add i32 %163, %157
2109  %165 = getelementptr inbounds i32, i32* %x, i32 96
2110  %166 = bitcast i32* %165 to <4 x i32>*
2111  %wide.load.24 = load <4 x i32>, <4 x i32>* %166, align 4
2112  %167 = getelementptr inbounds i32, i32* %y, i32 96
2113  %168 = bitcast i32* %167 to <4 x i32>*
2114  %wide.load10.24 = load <4 x i32>, <4 x i32>* %168, align 4
2115  %169 = mul nsw <4 x i32> %wide.load10.24, %wide.load.24
2116  %170 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %169)
2117  %171 = add i32 %170, %164
2118  %172 = getelementptr inbounds i32, i32* %x, i32 100
2119  %173 = bitcast i32* %172 to <4 x i32>*
2120  %wide.load.25 = load <4 x i32>, <4 x i32>* %173, align 4
2121  %174 = getelementptr inbounds i32, i32* %y, i32 100
2122  %175 = bitcast i32* %174 to <4 x i32>*
2123  %wide.load10.25 = load <4 x i32>, <4 x i32>* %175, align 4
2124  %176 = mul nsw <4 x i32> %wide.load10.25, %wide.load.25
2125  %177 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %176)
2126  %178 = add i32 %177, %171
2127  %179 = getelementptr inbounds i32, i32* %x, i32 104
2128  %180 = bitcast i32* %179 to <4 x i32>*
2129  %wide.load.26 = load <4 x i32>, <4 x i32>* %180, align 4
2130  %181 = getelementptr inbounds i32, i32* %y, i32 104
2131  %182 = bitcast i32* %181 to <4 x i32>*
2132  %wide.load10.26 = load <4 x i32>, <4 x i32>* %182, align 4
2133  %183 = mul nsw <4 x i32> %wide.load10.26, %wide.load.26
2134  %184 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %183)
2135  %185 = add i32 %184, %178
2136  %186 = getelementptr inbounds i32, i32* %x, i32 108
2137  %187 = bitcast i32* %186 to <4 x i32>*
2138  %wide.load.27 = load <4 x i32>, <4 x i32>* %187, align 4
2139  %188 = getelementptr inbounds i32, i32* %y, i32 108
2140  %189 = bitcast i32* %188 to <4 x i32>*
2141  %wide.load10.27 = load <4 x i32>, <4 x i32>* %189, align 4
2142  %190 = mul nsw <4 x i32> %wide.load10.27, %wide.load.27
2143  %191 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %190)
2144  %192 = add i32 %191, %185
2145  %193 = getelementptr inbounds i32, i32* %x, i32 112
2146  %194 = bitcast i32* %193 to <4 x i32>*
2147  %wide.load.28 = load <4 x i32>, <4 x i32>* %194, align 4
2148  %195 = getelementptr inbounds i32, i32* %y, i32 112
2149  %196 = bitcast i32* %195 to <4 x i32>*
2150  %wide.load10.28 = load <4 x i32>, <4 x i32>* %196, align 4
2151  %197 = mul nsw <4 x i32> %wide.load10.28, %wide.load.28
2152  %198 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %197)
2153  %199 = add i32 %198, %192
2154  %200 = getelementptr inbounds i32, i32* %x, i32 116
2155  %201 = bitcast i32* %200 to <4 x i32>*
2156  %wide.load.29 = load <4 x i32>, <4 x i32>* %201, align 4
2157  %202 = getelementptr inbounds i32, i32* %y, i32 116
2158  %203 = bitcast i32* %202 to <4 x i32>*
2159  %wide.load10.29 = load <4 x i32>, <4 x i32>* %203, align 4
2160  %204 = mul nsw <4 x i32> %wide.load10.29, %wide.load.29
2161  %205 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %204)
2162  %206 = add i32 %205, %199
2163  %207 = getelementptr inbounds i32, i32* %x, i32 120
2164  %208 = bitcast i32* %207 to <4 x i32>*
2165  %wide.load.30 = load <4 x i32>, <4 x i32>* %208, align 4
2166  %209 = getelementptr inbounds i32, i32* %y, i32 120
2167  %210 = bitcast i32* %209 to <4 x i32>*
2168  %wide.load10.30 = load <4 x i32>, <4 x i32>* %210, align 4
2169  %211 = mul nsw <4 x i32> %wide.load10.30, %wide.load.30
2170  %212 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %211)
2171  %213 = add i32 %212, %206
2172  %214 = getelementptr inbounds i32, i32* %x, i32 124
2173  %215 = bitcast i32* %214 to <4 x i32>*
2174  %wide.load.31 = load <4 x i32>, <4 x i32>* %215, align 4
2175  %216 = getelementptr inbounds i32, i32* %y, i32 124
2176  %217 = bitcast i32* %216 to <4 x i32>*
2177  %wide.load10.31 = load <4 x i32>, <4 x i32>* %217, align 4
2178  %218 = mul nsw <4 x i32> %wide.load10.31, %wide.load.31
2179  %219 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %218)
2180  %220 = add i32 %219, %213
2181  ret i32 %220
2182}
2183
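; The mlav*i32i16 tests multiply i16 data sign-extended to i32. Depending on the
; vector width this may use widening vldrh.s32 loads with vmlav.u32, or full
; vldrh.u16 loads with vmlav.s16/vmlava.s16.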
2184define i32 @mlav2i32i16(i16* %x, i16* %y) {
2185; CHECK-LABEL: mlav2i32i16:
2186; CHECK:       @ %bb.0: @ %entry
2187; CHECK-NEXT:    ldrsh.w r2, [r0]
2188; CHECK-NEXT:    ldrsh.w r3, [r1]
2189; CHECK-NEXT:    ldrsh.w r0, [r0, #2]
2190; CHECK-NEXT:    ldrsh.w r1, [r1, #2]
2191; CHECK-NEXT:    muls r0, r1, r0
2192; CHECK-NEXT:    smlabb r0, r3, r2, r0
2193; CHECK-NEXT:    bx lr
2194entry:
2195  %0 = load i16, i16* %x, align 2
2196  %conv = sext i16 %0 to i32
2197  %1 = load i16, i16* %y, align 2
2198  %conv2 = sext i16 %1 to i32
2199  %mul = mul nsw i32 %conv2, %conv
2200  %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1
2201  %2 = load i16, i16* %arrayidx.1, align 2
2202  %conv.1 = sext i16 %2 to i32
2203  %arrayidx1.1 = getelementptr inbounds i16, i16* %y, i32 1
2204  %3 = load i16, i16* %arrayidx1.1, align 2
2205  %conv2.1 = sext i16 %3 to i32
2206  %mul.1 = mul nsw i32 %conv2.1, %conv.1
2207  %add.1 = add nsw i32 %mul.1, %mul
2208  ret i32 %add.1
2209}
2210
2211define i32 @mlav4i32i16(i16* %x, i16* %y) {
2212; CHECK-LABEL: mlav4i32i16:
2213; CHECK:       @ %bb.0: @ %entry
2214; CHECK-NEXT:    vldrh.s32 q0, [r0]
2215; CHECK-NEXT:    vldrh.s32 q1, [r1]
2216; CHECK-NEXT:    vmlav.u32 r0, q1, q0
2217; CHECK-NEXT:    bx lr
2218entry:
2219  %0 = bitcast i16* %x to <4 x i16>*
2220  %1 = load <4 x i16>, <4 x i16>* %0, align 2
2221  %2 = sext <4 x i16> %1 to <4 x i32>
2222  %3 = bitcast i16* %y to <4 x i16>*
2223  %4 = load <4 x i16>, <4 x i16>* %3, align 2
2224  %5 = sext <4 x i16> %4 to <4 x i32>
2225  %6 = mul nsw <4 x i32> %5, %2
2226  %7 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %6)
2227  ret i32 %7
2228}
2229
2230define i32 @mlav8i32i16(i16* %x, i16* %y) {
2231; CHECK-LABEL: mlav8i32i16:
2232; CHECK:       @ %bb.0: @ %entry
2233; CHECK-NEXT:    vldrh.u16 q0, [r0]
2234; CHECK-NEXT:    vldrh.u16 q1, [r1]
2235; CHECK-NEXT:    vmlav.s16 r0, q1, q0
2236; CHECK-NEXT:    bx lr
2237entry:
2238  %0 = bitcast i16* %x to <8 x i16>*
2239  %1 = load <8 x i16>, <8 x i16>* %0, align 2
2240  %2 = sext <8 x i16> %1 to <8 x i32>
2241  %3 = bitcast i16* %y to <8 x i16>*
2242  %4 = load <8 x i16>, <8 x i16>* %3, align 2
2243  %5 = sext <8 x i16> %4 to <8 x i32>
2244  %6 = mul nsw <8 x i32> %5, %2
2245  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
2246  ret i32 %7
2247}
2248
2249define i32 @mlav16i32i16(i16* %x, i16* %y) {
2250; CHECK-LABEL: mlav16i32i16:
2251; CHECK:       @ %bb.0: @ %entry
2252; CHECK-NEXT:    vldrh.s32 q0, [r0]
2253; CHECK-NEXT:    vldrh.s32 q1, [r1]
2254; CHECK-NEXT:    vmlav.u32 r2, q1, q0
2255; CHECK-NEXT:    vldrh.s32 q0, [r0, #8]
2256; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
2257; CHECK-NEXT:    vmlava.u32 r2, q1, q0
2258; CHECK-NEXT:    vldrh.s32 q0, [r0, #16]
2259; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
2260; CHECK-NEXT:    vmlava.u32 r2, q1, q0
2261; CHECK-NEXT:    vldrh.s32 q0, [r0, #24]
2262; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
2263; CHECK-NEXT:    vmlava.u32 r2, q1, q0
2264; CHECK-NEXT:    mov r0, r2
2265; CHECK-NEXT:    bx lr
2266entry:
2267  %0 = bitcast i16* %x to <16 x i16>*
2268  %1 = load <16 x i16>, <16 x i16>* %0, align 2
2269  %2 = sext <16 x i16> %1 to <16 x i32>
2270  %3 = bitcast i16* %y to <16 x i16>*
2271  %4 = load <16 x i16>, <16 x i16>* %3, align 2
2272  %5 = sext <16 x i16> %4 to <16 x i32>
2273  %6 = mul nsw <16 x i32> %5, %2
2274  %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
2275  ret i32 %7
2276}
2277
2278define i32 @mlav24i32i16(i16* %x, i16* %y) {
2279; CHECK-LABEL: mlav24i32i16:
2280; CHECK:       @ %bb.0: @ %entry
2281; CHECK-NEXT:    vldrh.u16 q0, [r0]
2282; CHECK-NEXT:    vldrh.u16 q1, [r1]
2283; CHECK-NEXT:    mov r2, r0
2284; CHECK-NEXT:    vmlav.s16 r0, q1, q0
2285; CHECK-NEXT:    vldrh.s32 q0, [r2, #16]
2286; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
2287; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2288; CHECK-NEXT:    vldrh.s32 q0, [r2, #24]
2289; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
2290; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2291; CHECK-NEXT:    vldrh.s32 q0, [r2, #32]
2292; CHECK-NEXT:    vldrh.s32 q1, [r1, #32]
2293; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2294; CHECK-NEXT:    vldrh.s32 q0, [r2, #40]
2295; CHECK-NEXT:    vldrh.s32 q1, [r1, #40]
2296; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2297; CHECK-NEXT:    bx lr
2298entry:
2299  %0 = bitcast i16* %x to <8 x i16>*
2300  %1 = load <8 x i16>, <8 x i16>* %0, align 2
2301  %2 = sext <8 x i16> %1 to <8 x i32>
2302  %3 = bitcast i16* %y to <8 x i16>*
2303  %4 = load <8 x i16>, <8 x i16>* %3, align 2
2304  %5 = sext <8 x i16> %4 to <8 x i32>
2305  %6 = mul nsw <8 x i32> %5, %2
2306  %arrayidx.8 = getelementptr inbounds i16, i16* %x, i32 8
2307  %arrayidx1.8 = getelementptr inbounds i16, i16* %y, i32 8
2308  %7 = bitcast i16* %arrayidx.8 to <16 x i16>*
2309  %8 = load <16 x i16>, <16 x i16>* %7, align 2
2310  %9 = sext <16 x i16> %8 to <16 x i32>
2311  %10 = bitcast i16* %arrayidx1.8 to <16 x i16>*
2312  %11 = load <16 x i16>, <16 x i16>* %10, align 2
2313  %12 = sext <16 x i16> %11 to <16 x i32>
2314  %13 = mul nsw <16 x i32> %12, %9
2315  %14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %13)
2316  %15 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
2317  %op.rdx = add nsw i32 %14, %15
2318  ret i32 %op.rdx
2319}
2320
2321define i32 @mlav32i32i16(i16* %x, i16* %y) {
2322; CHECK-LABEL: mlav32i32i16:
2323; CHECK:       @ %bb.0: @ %entry
2324; CHECK-NEXT:    vldrh.s32 q0, [r0]
2325; CHECK-NEXT:    vldrh.s32 q1, [r1]
2326; CHECK-NEXT:    mov r2, r0
2327; CHECK-NEXT:    vmlav.u32 r0, q1, q0
2328; CHECK-NEXT:    vldrh.s32 q0, [r2, #8]
2329; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
2330; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2331; CHECK-NEXT:    vldrh.s32 q0, [r2, #16]
2332; CHECK-NEXT:    vldrh.s32 q1, [r1, #16]
2333; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2334; CHECK-NEXT:    vldrh.s32 q0, [r2, #24]
2335; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
2336; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2337; CHECK-NEXT:    vldrh.s32 q0, [r2, #32]
2338; CHECK-NEXT:    vldrh.s32 q1, [r1, #32]
2339; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2340; CHECK-NEXT:    vldrh.s32 q0, [r2, #40]
2341; CHECK-NEXT:    vldrh.s32 q1, [r1, #40]
2342; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2343; CHECK-NEXT:    vldrh.s32 q0, [r2, #48]
2344; CHECK-NEXT:    vldrh.s32 q1, [r1, #48]
2345; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2346; CHECK-NEXT:    vldrh.s32 q0, [r2, #56]
2347; CHECK-NEXT:    vldrh.s32 q1, [r1, #56]
2348; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2349; CHECK-NEXT:    bx lr
2350entry:
2351  %0 = bitcast i16* %x to <32 x i16>*
2352  %1 = load <32 x i16>, <32 x i16>* %0, align 2
2353  %2 = sext <32 x i16> %1 to <32 x i32>
2354  %3 = bitcast i16* %y to <32 x i16>*
2355  %4 = load <32 x i16>, <32 x i16>* %3, align 2
2356  %5 = sext <32 x i16> %4 to <32 x i32>
2357  %6 = mul nsw <32 x i32> %5, %2
2358  %7 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %6)
2359  ret i32 %7
2360}
2361
2362define i32 @mlav64i32i16(i16* %x, i16* %y) {
2363; CHECK-LABEL: mlav64i32i16:
2364; CHECK:       @ %bb.0: @ %entry
2365; CHECK-NEXT:    vldrh.u16 q0, [r0]
2366; CHECK-NEXT:    vldrh.u16 q1, [r1]
2367; CHECK-NEXT:    mov r2, r0
2368; CHECK-NEXT:    vmlav.s16 r0, q1, q0
2369; CHECK-NEXT:    vldrh.u16 q0, [r2, #16]
2370; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
2371; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2372; CHECK-NEXT:    vldrh.u16 q0, [r2, #32]
2373; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
2374; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2375; CHECK-NEXT:    vldrh.u16 q0, [r2, #48]
2376; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
2377; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2378; CHECK-NEXT:    vldrh.u16 q0, [r2, #64]
2379; CHECK-NEXT:    vldrh.u16 q1, [r1, #64]
2380; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2381; CHECK-NEXT:    vldrh.u16 q0, [r2, #80]
2382; CHECK-NEXT:    vldrh.u16 q1, [r1, #80]
2383; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2384; CHECK-NEXT:    vldrh.u16 q0, [r2, #96]
2385; CHECK-NEXT:    vldrh.u16 q1, [r1, #96]
2386; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2387; CHECK-NEXT:    vldrh.u16 q0, [r2, #112]
2388; CHECK-NEXT:    vldrh.u16 q1, [r1, #112]
2389; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2390; CHECK-NEXT:    bx lr
2391entry:
2392  %0 = bitcast i16* %x to <8 x i16>*
2393  %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
2394  %1 = sext <8 x i16> %wide.load to <8 x i32>
2395  %2 = bitcast i16* %y to <8 x i16>*
2396  %wide.load11 = load <8 x i16>, <8 x i16>* %2, align 2
2397  %3 = sext <8 x i16> %wide.load11 to <8 x i32>
2398  %4 = mul nsw <8 x i32> %3, %1
2399  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
2400  %6 = getelementptr inbounds i16, i16* %x, i32 8
2401  %7 = bitcast i16* %6 to <8 x i16>*
2402  %wide.load.1 = load <8 x i16>, <8 x i16>* %7, align 2
2403  %8 = sext <8 x i16> %wide.load.1 to <8 x i32>
2404  %9 = getelementptr inbounds i16, i16* %y, i32 8
2405  %10 = bitcast i16* %9 to <8 x i16>*
2406  %wide.load11.1 = load <8 x i16>, <8 x i16>* %10, align 2
2407  %11 = sext <8 x i16> %wide.load11.1 to <8 x i32>
2408  %12 = mul nsw <8 x i32> %11, %8
2409  %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
2410  %14 = add i32 %13, %5
2411  %15 = getelementptr inbounds i16, i16* %x, i32 16
2412  %16 = bitcast i16* %15 to <8 x i16>*
2413  %wide.load.2 = load <8 x i16>, <8 x i16>* %16, align 2
2414  %17 = sext <8 x i16> %wide.load.2 to <8 x i32>
2415  %18 = getelementptr inbounds i16, i16* %y, i32 16
2416  %19 = bitcast i16* %18 to <8 x i16>*
2417  %wide.load11.2 = load <8 x i16>, <8 x i16>* %19, align 2
2418  %20 = sext <8 x i16> %wide.load11.2 to <8 x i32>
2419  %21 = mul nsw <8 x i32> %20, %17
2420  %22 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %21)
2421  %23 = add i32 %22, %14
2422  %24 = getelementptr inbounds i16, i16* %x, i32 24
2423  %25 = bitcast i16* %24 to <8 x i16>*
2424  %wide.load.3 = load <8 x i16>, <8 x i16>* %25, align 2
2425  %26 = sext <8 x i16> %wide.load.3 to <8 x i32>
2426  %27 = getelementptr inbounds i16, i16* %y, i32 24
2427  %28 = bitcast i16* %27 to <8 x i16>*
2428  %wide.load11.3 = load <8 x i16>, <8 x i16>* %28, align 2
2429  %29 = sext <8 x i16> %wide.load11.3 to <8 x i32>
2430  %30 = mul nsw <8 x i32> %29, %26
2431  %31 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %30)
2432  %32 = add i32 %31, %23
2433  %33 = getelementptr inbounds i16, i16* %x, i32 32
2434  %34 = bitcast i16* %33 to <8 x i16>*
2435  %wide.load.4 = load <8 x i16>, <8 x i16>* %34, align 2
2436  %35 = sext <8 x i16> %wide.load.4 to <8 x i32>
2437  %36 = getelementptr inbounds i16, i16* %y, i32 32
2438  %37 = bitcast i16* %36 to <8 x i16>*
2439  %wide.load11.4 = load <8 x i16>, <8 x i16>* %37, align 2
2440  %38 = sext <8 x i16> %wide.load11.4 to <8 x i32>
2441  %39 = mul nsw <8 x i32> %38, %35
2442  %40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39)
2443  %41 = add i32 %40, %32
2444  %42 = getelementptr inbounds i16, i16* %x, i32 40
2445  %43 = bitcast i16* %42 to <8 x i16>*
2446  %wide.load.5 = load <8 x i16>, <8 x i16>* %43, align 2
2447  %44 = sext <8 x i16> %wide.load.5 to <8 x i32>
2448  %45 = getelementptr inbounds i16, i16* %y, i32 40
2449  %46 = bitcast i16* %45 to <8 x i16>*
2450  %wide.load11.5 = load <8 x i16>, <8 x i16>* %46, align 2
2451  %47 = sext <8 x i16> %wide.load11.5 to <8 x i32>
2452  %48 = mul nsw <8 x i32> %47, %44
2453  %49 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %48)
2454  %50 = add i32 %49, %41
2455  %51 = getelementptr inbounds i16, i16* %x, i32 48
2456  %52 = bitcast i16* %51 to <8 x i16>*
2457  %wide.load.6 = load <8 x i16>, <8 x i16>* %52, align 2
2458  %53 = sext <8 x i16> %wide.load.6 to <8 x i32>
2459  %54 = getelementptr inbounds i16, i16* %y, i32 48
2460  %55 = bitcast i16* %54 to <8 x i16>*
2461  %wide.load11.6 = load <8 x i16>, <8 x i16>* %55, align 2
2462  %56 = sext <8 x i16> %wide.load11.6 to <8 x i32>
2463  %57 = mul nsw <8 x i32> %56, %53
2464  %58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57)
2465  %59 = add i32 %58, %50
2466  %60 = getelementptr inbounds i16, i16* %x, i32 56
2467  %61 = bitcast i16* %60 to <8 x i16>*
2468  %wide.load.7 = load <8 x i16>, <8 x i16>* %61, align 2
2469  %62 = sext <8 x i16> %wide.load.7 to <8 x i32>
2470  %63 = getelementptr inbounds i16, i16* %y, i32 56
2471  %64 = bitcast i16* %63 to <8 x i16>*
2472  %wide.load11.7 = load <8 x i16>, <8 x i16>* %64, align 2
2473  %65 = sext <8 x i16> %wide.load11.7 to <8 x i32>
2474  %66 = mul nsw <8 x i32> %65, %62
2475  %67 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %66)
2476  %68 = add i32 %67, %59
2477  ret i32 %68
2478}
2479
2480define i32 @mlav128i32i16(i16* %x, i16* %y) {
2481; CHECK-LABEL: mlav128i32i16:
2482; CHECK:       @ %bb.0: @ %entry
2483; CHECK-NEXT:    vldrh.u16 q0, [r0]
2484; CHECK-NEXT:    vldrh.u16 q1, [r1]
2485; CHECK-NEXT:    mov r2, r0
2486; CHECK-NEXT:    vmlav.s16 r0, q1, q0
2487; CHECK-NEXT:    vldrh.u16 q0, [r2, #16]
2488; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
2489; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2490; CHECK-NEXT:    vldrh.u16 q0, [r2, #32]
2491; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
2492; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2493; CHECK-NEXT:    vldrh.u16 q0, [r2, #48]
2494; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
2495; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2496; CHECK-NEXT:    vldrh.u16 q0, [r2, #64]
2497; CHECK-NEXT:    vldrh.u16 q1, [r1, #64]
2498; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2499; CHECK-NEXT:    vldrh.u16 q0, [r2, #80]
2500; CHECK-NEXT:    vldrh.u16 q1, [r1, #80]
2501; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2502; CHECK-NEXT:    vldrh.u16 q0, [r2, #96]
2503; CHECK-NEXT:    vldrh.u16 q1, [r1, #96]
2504; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2505; CHECK-NEXT:    vldrh.u16 q0, [r2, #112]
2506; CHECK-NEXT:    vldrh.u16 q1, [r1, #112]
2507; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2508; CHECK-NEXT:    vldrh.u16 q0, [r2, #128]
2509; CHECK-NEXT:    vldrh.u16 q1, [r1, #128]
2510; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2511; CHECK-NEXT:    vldrh.u16 q0, [r2, #144]
2512; CHECK-NEXT:    vldrh.u16 q1, [r1, #144]
2513; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2514; CHECK-NEXT:    vldrh.u16 q0, [r2, #160]
2515; CHECK-NEXT:    vldrh.u16 q1, [r1, #160]
2516; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2517; CHECK-NEXT:    vldrh.u16 q0, [r2, #176]
2518; CHECK-NEXT:    vldrh.u16 q1, [r1, #176]
2519; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2520; CHECK-NEXT:    vldrh.u16 q0, [r2, #192]
2521; CHECK-NEXT:    vldrh.u16 q1, [r1, #192]
2522; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2523; CHECK-NEXT:    vldrh.u16 q0, [r2, #208]
2524; CHECK-NEXT:    vldrh.u16 q1, [r1, #208]
2525; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2526; CHECK-NEXT:    vldrh.u16 q0, [r2, #224]
2527; CHECK-NEXT:    vldrh.u16 q1, [r1, #224]
2528; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2529; CHECK-NEXT:    vldrh.u16 q0, [r2, #240]
2530; CHECK-NEXT:    vldrh.u16 q1, [r1, #240]
2531; CHECK-NEXT:    vmlava.s16 r0, q1, q0
2532; CHECK-NEXT:    bx lr
2533entry:
2534  %0 = bitcast i16* %x to <8 x i16>*
2535  %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
2536  %1 = sext <8 x i16> %wide.load to <8 x i32>
2537  %2 = bitcast i16* %y to <8 x i16>*
2538  %wide.load11 = load <8 x i16>, <8 x i16>* %2, align 2
2539  %3 = sext <8 x i16> %wide.load11 to <8 x i32>
2540  %4 = mul nsw <8 x i32> %3, %1
2541  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
2542  %6 = getelementptr inbounds i16, i16* %x, i32 8
2543  %7 = bitcast i16* %6 to <8 x i16>*
2544  %wide.load.1 = load <8 x i16>, <8 x i16>* %7, align 2
2545  %8 = sext <8 x i16> %wide.load.1 to <8 x i32>
2546  %9 = getelementptr inbounds i16, i16* %y, i32 8
2547  %10 = bitcast i16* %9 to <8 x i16>*
2548  %wide.load11.1 = load <8 x i16>, <8 x i16>* %10, align 2
2549  %11 = sext <8 x i16> %wide.load11.1 to <8 x i32>
2550  %12 = mul nsw <8 x i32> %11, %8
2551  %13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
2552  %14 = add i32 %13, %5
2553  %15 = getelementptr inbounds i16, i16* %x, i32 16
2554  %16 = bitcast i16* %15 to <8 x i16>*
2555  %wide.load.2 = load <8 x i16>, <8 x i16>* %16, align 2
2556  %17 = sext <8 x i16> %wide.load.2 to <8 x i32>
2557  %18 = getelementptr inbounds i16, i16* %y, i32 16
2558  %19 = bitcast i16* %18 to <8 x i16>*
2559  %wide.load11.2 = load <8 x i16>, <8 x i16>* %19, align 2
2560  %20 = sext <8 x i16> %wide.load11.2 to <8 x i32>
2561  %21 = mul nsw <8 x i32> %20, %17
2562  %22 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %21)
2563  %23 = add i32 %22, %14
2564  %24 = getelementptr inbounds i16, i16* %x, i32 24
2565  %25 = bitcast i16* %24 to <8 x i16>*
2566  %wide.load.3 = load <8 x i16>, <8 x i16>* %25, align 2
2567  %26 = sext <8 x i16> %wide.load.3 to <8 x i32>
2568  %27 = getelementptr inbounds i16, i16* %y, i32 24
2569  %28 = bitcast i16* %27 to <8 x i16>*
2570  %wide.load11.3 = load <8 x i16>, <8 x i16>* %28, align 2
2571  %29 = sext <8 x i16> %wide.load11.3 to <8 x i32>
2572  %30 = mul nsw <8 x i32> %29, %26
2573  %31 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %30)
2574  %32 = add i32 %31, %23
2575  %33 = getelementptr inbounds i16, i16* %x, i32 32
2576  %34 = bitcast i16* %33 to <8 x i16>*
2577  %wide.load.4 = load <8 x i16>, <8 x i16>* %34, align 2
2578  %35 = sext <8 x i16> %wide.load.4 to <8 x i32>
2579  %36 = getelementptr inbounds i16, i16* %y, i32 32
2580  %37 = bitcast i16* %36 to <8 x i16>*
2581  %wide.load11.4 = load <8 x i16>, <8 x i16>* %37, align 2
2582  %38 = sext <8 x i16> %wide.load11.4 to <8 x i32>
2583  %39 = mul nsw <8 x i32> %38, %35
2584  %40 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %39)
2585  %41 = add i32 %40, %32
2586  %42 = getelementptr inbounds i16, i16* %x, i32 40
2587  %43 = bitcast i16* %42 to <8 x i16>*
2588  %wide.load.5 = load <8 x i16>, <8 x i16>* %43, align 2
2589  %44 = sext <8 x i16> %wide.load.5 to <8 x i32>
2590  %45 = getelementptr inbounds i16, i16* %y, i32 40
2591  %46 = bitcast i16* %45 to <8 x i16>*
2592  %wide.load11.5 = load <8 x i16>, <8 x i16>* %46, align 2
2593  %47 = sext <8 x i16> %wide.load11.5 to <8 x i32>
2594  %48 = mul nsw <8 x i32> %47, %44
2595  %49 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %48)
2596  %50 = add i32 %49, %41
2597  %51 = getelementptr inbounds i16, i16* %x, i32 48
2598  %52 = bitcast i16* %51 to <8 x i16>*
2599  %wide.load.6 = load <8 x i16>, <8 x i16>* %52, align 2
2600  %53 = sext <8 x i16> %wide.load.6 to <8 x i32>
2601  %54 = getelementptr inbounds i16, i16* %y, i32 48
2602  %55 = bitcast i16* %54 to <8 x i16>*
2603  %wide.load11.6 = load <8 x i16>, <8 x i16>* %55, align 2
2604  %56 = sext <8 x i16> %wide.load11.6 to <8 x i32>
2605  %57 = mul nsw <8 x i32> %56, %53
2606  %58 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %57)
2607  %59 = add i32 %58, %50
2608  %60 = getelementptr inbounds i16, i16* %x, i32 56
2609  %61 = bitcast i16* %60 to <8 x i16>*
2610  %wide.load.7 = load <8 x i16>, <8 x i16>* %61, align 2
2611  %62 = sext <8 x i16> %wide.load.7 to <8 x i32>
2612  %63 = getelementptr inbounds i16, i16* %y, i32 56
2613  %64 = bitcast i16* %63 to <8 x i16>*
2614  %wide.load11.7 = load <8 x i16>, <8 x i16>* %64, align 2
2615  %65 = sext <8 x i16> %wide.load11.7 to <8 x i32>
2616  %66 = mul nsw <8 x i32> %65, %62
2617  %67 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %66)
2618  %68 = add i32 %67, %59
2619  %69 = getelementptr inbounds i16, i16* %x, i32 64
2620  %70 = bitcast i16* %69 to <8 x i16>*
2621  %wide.load.8 = load <8 x i16>, <8 x i16>* %70, align 2
2622  %71 = sext <8 x i16> %wide.load.8 to <8 x i32>
2623  %72 = getelementptr inbounds i16, i16* %y, i32 64
2624  %73 = bitcast i16* %72 to <8 x i16>*
2625  %wide.load11.8 = load <8 x i16>, <8 x i16>* %73, align 2
2626  %74 = sext <8 x i16> %wide.load11.8 to <8 x i32>
2627  %75 = mul nsw <8 x i32> %74, %71
2628  %76 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %75)
2629  %77 = add i32 %76, %68
2630  %78 = getelementptr inbounds i16, i16* %x, i32 72
2631  %79 = bitcast i16* %78 to <8 x i16>*
2632  %wide.load.9 = load <8 x i16>, <8 x i16>* %79, align 2
2633  %80 = sext <8 x i16> %wide.load.9 to <8 x i32>
2634  %81 = getelementptr inbounds i16, i16* %y, i32 72
2635  %82 = bitcast i16* %81 to <8 x i16>*
2636  %wide.load11.9 = load <8 x i16>, <8 x i16>* %82, align 2
2637  %83 = sext <8 x i16> %wide.load11.9 to <8 x i32>
2638  %84 = mul nsw <8 x i32> %83, %80
2639  %85 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %84)
2640  %86 = add i32 %85, %77
2641  %87 = getelementptr inbounds i16, i16* %x, i32 80
2642  %88 = bitcast i16* %87 to <8 x i16>*
2643  %wide.load.10 = load <8 x i16>, <8 x i16>* %88, align 2
2644  %89 = sext <8 x i16> %wide.load.10 to <8 x i32>
2645  %90 = getelementptr inbounds i16, i16* %y, i32 80
2646  %91 = bitcast i16* %90 to <8 x i16>*
2647  %wide.load11.10 = load <8 x i16>, <8 x i16>* %91, align 2
2648  %92 = sext <8 x i16> %wide.load11.10 to <8 x i32>
2649  %93 = mul nsw <8 x i32> %92, %89
2650  %94 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %93)
2651  %95 = add i32 %94, %86
2652  %96 = getelementptr inbounds i16, i16* %x, i32 88
2653  %97 = bitcast i16* %96 to <8 x i16>*
2654  %wide.load.11 = load <8 x i16>, <8 x i16>* %97, align 2
2655  %98 = sext <8 x i16> %wide.load.11 to <8 x i32>
2656  %99 = getelementptr inbounds i16, i16* %y, i32 88
2657  %100 = bitcast i16* %99 to <8 x i16>*
2658  %wide.load11.11 = load <8 x i16>, <8 x i16>* %100, align 2
2659  %101 = sext <8 x i16> %wide.load11.11 to <8 x i32>
2660  %102 = mul nsw <8 x i32> %101, %98
2661  %103 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %102)
2662  %104 = add i32 %103, %95
2663  %105 = getelementptr inbounds i16, i16* %x, i32 96
2664  %106 = bitcast i16* %105 to <8 x i16>*
2665  %wide.load.12 = load <8 x i16>, <8 x i16>* %106, align 2
2666  %107 = sext <8 x i16> %wide.load.12 to <8 x i32>
2667  %108 = getelementptr inbounds i16, i16* %y, i32 96
2668  %109 = bitcast i16* %108 to <8 x i16>*
2669  %wide.load11.12 = load <8 x i16>, <8 x i16>* %109, align 2
2670  %110 = sext <8 x i16> %wide.load11.12 to <8 x i32>
2671  %111 = mul nsw <8 x i32> %110, %107
2672  %112 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %111)
2673  %113 = add i32 %112, %104
2674  %114 = getelementptr inbounds i16, i16* %x, i32 104
2675  %115 = bitcast i16* %114 to <8 x i16>*
2676  %wide.load.13 = load <8 x i16>, <8 x i16>* %115, align 2
2677  %116 = sext <8 x i16> %wide.load.13 to <8 x i32>
2678  %117 = getelementptr inbounds i16, i16* %y, i32 104
2679  %118 = bitcast i16* %117 to <8 x i16>*
2680  %wide.load11.13 = load <8 x i16>, <8 x i16>* %118, align 2
2681  %119 = sext <8 x i16> %wide.load11.13 to <8 x i32>
2682  %120 = mul nsw <8 x i32> %119, %116
2683  %121 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %120)
2684  %122 = add i32 %121, %113
2685  %123 = getelementptr inbounds i16, i16* %x, i32 112
2686  %124 = bitcast i16* %123 to <8 x i16>*
2687  %wide.load.14 = load <8 x i16>, <8 x i16>* %124, align 2
2688  %125 = sext <8 x i16> %wide.load.14 to <8 x i32>
2689  %126 = getelementptr inbounds i16, i16* %y, i32 112
2690  %127 = bitcast i16* %126 to <8 x i16>*
2691  %wide.load11.14 = load <8 x i16>, <8 x i16>* %127, align 2
2692  %128 = sext <8 x i16> %wide.load11.14 to <8 x i32>
2693  %129 = mul nsw <8 x i32> %128, %125
2694  %130 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %129)
2695  %131 = add i32 %130, %122
2696  %132 = getelementptr inbounds i16, i16* %x, i32 120
2697  %133 = bitcast i16* %132 to <8 x i16>*
2698  %wide.load.15 = load <8 x i16>, <8 x i16>* %133, align 2
2699  %134 = sext <8 x i16> %wide.load.15 to <8 x i32>
2700  %135 = getelementptr inbounds i16, i16* %y, i32 120
2701  %136 = bitcast i16* %135 to <8 x i16>*
2702  %wide.load11.15 = load <8 x i16>, <8 x i16>* %136, align 2
2703  %137 = sext <8 x i16> %wide.load11.15 to <8 x i32>
2704  %138 = mul nsw <8 x i32> %137, %134
2705  %139 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %138)
2706  %140 = add i32 %139, %131
2707  ret i32 %140
2708}
2709
2710define i32 @mlav2i32i8(i8* %x, i8* %y) {
2711; CHECK-LABEL: mlav2i32i8:
2712; CHECK:       @ %bb.0: @ %entry
2713; CHECK-NEXT:    ldrb r2, [r0]
2714; CHECK-NEXT:    ldrb r3, [r1]
2715; CHECK-NEXT:    ldrb r0, [r0, #1]
2716; CHECK-NEXT:    ldrb r1, [r1, #1]
2717; CHECK-NEXT:    muls r0, r1, r0
2718; CHECK-NEXT:    smlabb r0, r3, r2, r0
2719; CHECK-NEXT:    bx lr
2720entry:
2721  %0 = load i8, i8* %x, align 1
2722  %conv = zext i8 %0 to i32
2723  %1 = load i8, i8* %y, align 1
2724  %conv2 = zext i8 %1 to i32
2725  %mul = mul nuw nsw i32 %conv2, %conv
2726  %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1
2727  %2 = load i8, i8* %arrayidx.1, align 1
2728  %conv.1 = zext i8 %2 to i32
2729  %arrayidx1.1 = getelementptr inbounds i8, i8* %y, i32 1
2730  %3 = load i8, i8* %arrayidx1.1, align 1
2731  %conv2.1 = zext i8 %3 to i32
2732  %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
2733  %add.1 = add nuw nsw i32 %mul.1, %mul
2734  ret i32 %add.1
2735}
2736
2737define i32 @mlav4i32i8(i8* %x, i8* %y) {
2738; CHECK-LABEL: mlav4i32i8:
2739; CHECK:       @ %bb.0: @ %entry
2740; CHECK-NEXT:    vldrb.u32 q0, [r0]
2741; CHECK-NEXT:    vldrb.u32 q1, [r1]
2742; CHECK-NEXT:    vmlav.u32 r0, q1, q0
2743; CHECK-NEXT:    bx lr
2744entry:
2745  %0 = bitcast i8* %x to <4 x i8>*
2746  %1 = load <4 x i8>, <4 x i8>* %0, align 1
2747  %2 = zext <4 x i8> %1 to <4 x i32>
2748  %3 = bitcast i8* %y to <4 x i8>*
2749  %4 = load <4 x i8>, <4 x i8>* %3, align 1
2750  %5 = zext <4 x i8> %4 to <4 x i32>
2751  %6 = mul nuw nsw <4 x i32> %5, %2
2752  %7 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %6)
2753  ret i32 %7
2754}
2755
2756define i32 @mlav8i32i8(i8* %x, i8* %y) {
2757; CHECK-LABEL: mlav8i32i8:
2758; CHECK:       @ %bb.0: @ %entry
2759; CHECK-NEXT:    vldrb.u16 q0, [r0]
2760; CHECK-NEXT:    vldrb.u16 q1, [r1]
2761; CHECK-NEXT:    vmlav.u16 r0, q1, q0
2762; CHECK-NEXT:    bx lr
2763entry:
2764  %0 = bitcast i8* %x to <8 x i8>*
2765  %1 = load <8 x i8>, <8 x i8>* %0, align 1
2766  %2 = zext <8 x i8> %1 to <8 x i32>
2767  %3 = bitcast i8* %y to <8 x i8>*
2768  %4 = load <8 x i8>, <8 x i8>* %3, align 1
2769  %5 = zext <8 x i8> %4 to <8 x i32>
2770  %6 = mul nuw nsw <8 x i32> %5, %2
2771  %7 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
2772  ret i32 %7
2773}
2774
2775define i32 @mlav16i32i8(i8* %x, i8* %y) {
2776; CHECK-LABEL: mlav16i32i8:
2777; CHECK:       @ %bb.0: @ %entry
2778; CHECK-NEXT:    vldrb.u8 q0, [r0]
2779; CHECK-NEXT:    vldrb.u8 q1, [r1]
2780; CHECK-NEXT:    vmlav.u8 r0, q1, q0
2781; CHECK-NEXT:    bx lr
2782entry:
2783  %0 = bitcast i8* %x to <16 x i8>*
2784  %1 = load <16 x i8>, <16 x i8>* %0, align 1
2785  %2 = zext <16 x i8> %1 to <16 x i32>
2786  %3 = bitcast i8* %y to <16 x i8>*
2787  %4 = load <16 x i8>, <16 x i8>* %3, align 1
2788  %5 = zext <16 x i8> %4 to <16 x i32>
2789  %6 = mul nuw nsw <16 x i32> %5, %2
2790  %7 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %6)
2791  ret i32 %7
2792}
2793
2794define i32 @mlav24i32i8(i8* %x, i8* %y) {
2795; CHECK-LABEL: mlav24i32i8:
2796; CHECK:       @ %bb.0: @ %entry
2797; CHECK-NEXT:    vldrb.u16 q0, [r0]
2798; CHECK-NEXT:    vldrb.u16 q1, [r1]
2799; CHECK-NEXT:    vmlav.u16 r2, q1, q0
2800; CHECK-NEXT:    vldrb.u8 q0, [r0, #8]
2801; CHECK-NEXT:    vldrb.u8 q1, [r1, #8]
2802; CHECK-NEXT:    vmlava.u8 r2, q1, q0
2803; CHECK-NEXT:    mov r0, r2
2804; CHECK-NEXT:    bx lr
2805entry:
2806  %0 = bitcast i8* %x to <8 x i8>*
2807  %1 = load <8 x i8>, <8 x i8>* %0, align 1
2808  %2 = zext <8 x i8> %1 to <8 x i32>
2809  %3 = bitcast i8* %y to <8 x i8>*
2810  %4 = load <8 x i8>, <8 x i8>* %3, align 1
2811  %5 = zext <8 x i8> %4 to <8 x i32>
2812  %6 = mul nuw nsw <8 x i32> %5, %2
2813  %arrayidx.8 = getelementptr inbounds i8, i8* %x, i32 8
2814  %arrayidx1.8 = getelementptr inbounds i8, i8* %y, i32 8
2815  %7 = bitcast i8* %arrayidx.8 to <16 x i8>*
2816  %8 = load <16 x i8>, <16 x i8>* %7, align 1
2817  %9 = zext <16 x i8> %8 to <16 x i32>
2818  %10 = bitcast i8* %arrayidx1.8 to <16 x i8>*
2819  %11 = load <16 x i8>, <16 x i8>* %10, align 1
2820  %12 = zext <16 x i8> %11 to <16 x i32>
2821  %13 = mul nuw nsw <16 x i32> %12, %9
2822  %14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %13)
2823  %15 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %6)
2824  %op.rdx = add nuw nsw i32 %14, %15
2825  ret i32 %op.rdx
2826}
2827
2828define i32 @mlav32i32i8(i8* %x, i8* %y) {
2829; CHECK-LABEL: mlav32i32i8:
2830; CHECK:       @ %bb.0: @ %entry
2831; CHECK-NEXT:    vldrb.u32 q0, [r0]
2832; CHECK-NEXT:    vldrb.u32 q1, [r1]
2833; CHECK-NEXT:    mov r2, r0
2834; CHECK-NEXT:    vmlav.u32 r0, q1, q0
2835; CHECK-NEXT:    vldrb.u32 q0, [r2, #4]
2836; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
2837; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2838; CHECK-NEXT:    vldrb.u32 q0, [r2, #8]
2839; CHECK-NEXT:    vldrb.u32 q1, [r1, #8]
2840; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2841; CHECK-NEXT:    vldrb.u32 q0, [r2, #12]
2842; CHECK-NEXT:    vldrb.u32 q1, [r1, #12]
2843; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2844; CHECK-NEXT:    vldrb.u32 q0, [r2, #16]
2845; CHECK-NEXT:    vldrb.u32 q1, [r1, #16]
2846; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2847; CHECK-NEXT:    vldrb.u32 q0, [r2, #20]
2848; CHECK-NEXT:    vldrb.u32 q1, [r1, #20]
2849; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2850; CHECK-NEXT:    vldrb.u32 q0, [r2, #24]
2851; CHECK-NEXT:    vldrb.u32 q1, [r1, #24]
2852; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2853; CHECK-NEXT:    vldrb.u32 q0, [r2, #28]
2854; CHECK-NEXT:    vldrb.u32 q1, [r1, #28]
2855; CHECK-NEXT:    vmlava.u32 r0, q1, q0
2856; CHECK-NEXT:    bx lr
2857entry:
2858  %0 = bitcast i8* %x to <32 x i8>*
2859  %1 = load <32 x i8>, <32 x i8>* %0, align 1
2860  %2 = zext <32 x i8> %1 to <32 x i32>
2861  %3 = bitcast i8* %y to <32 x i8>*
2862  %4 = load <32 x i8>, <32 x i8>* %3, align 1
2863  %5 = zext <32 x i8> %4 to <32 x i32>
2864  %6 = mul nuw nsw <32 x i32> %5, %2
2865  %7 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %6)
2866  ret i32 %7
2867}
2868
2869define i32 @mlav64i32i8(i8* %x, i8* %y) {
2870; CHECK-LABEL: mlav64i32i8:
2871; CHECK:       @ %bb.0: @ %entry
2872; CHECK-NEXT:    vldrb.u8 q0, [r0]
2873; CHECK-NEXT:    vldrb.u8 q1, [r1]
2874; CHECK-NEXT:    vmlav.u8 r2, q1, q0
2875; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
2876; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
2877; CHECK-NEXT:    vmlava.u8 r2, q1, q0
2878; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
2879; CHECK-NEXT:    vldrb.u8 q1, [r1, #32]
2880; CHECK-NEXT:    vmlava.u8 r2, q1, q0
2881; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
2882; CHECK-NEXT:    vldrb.u8 q1, [r1, #48]
2883; CHECK-NEXT:    vmlava.u8 r2, q1, q0
2884; CHECK-NEXT:    mov r0, r2
2885; CHECK-NEXT:    bx lr
2886entry:
2887  %0 = bitcast i8* %x to <16 x i8>*
2888  %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
2889  %1 = zext <16 x i8> %wide.load to <16 x i32>
2890  %2 = bitcast i8* %y to <16 x i8>*
2891  %wide.load11 = load <16 x i8>, <16 x i8>* %2, align 1
2892  %3 = zext <16 x i8> %wide.load11 to <16 x i32>
2893  %4 = mul nuw nsw <16 x i32> %3, %1
2894  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
2895  %6 = getelementptr inbounds i8, i8* %x, i32 16
2896  %7 = bitcast i8* %6 to <16 x i8>*
2897  %wide.load.1 = load <16 x i8>, <16 x i8>* %7, align 1
2898  %8 = zext <16 x i8> %wide.load.1 to <16 x i32>
2899  %9 = getelementptr inbounds i8, i8* %y, i32 16
2900  %10 = bitcast i8* %9 to <16 x i8>*
2901  %wide.load11.1 = load <16 x i8>, <16 x i8>* %10, align 1
2902  %11 = zext <16 x i8> %wide.load11.1 to <16 x i32>
2903  %12 = mul nuw nsw <16 x i32> %11, %8
2904  %13 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12)
2905  %14 = add i32 %13, %5
2906  %15 = getelementptr inbounds i8, i8* %x, i32 32
2907  %16 = bitcast i8* %15 to <16 x i8>*
2908  %wide.load.2 = load <16 x i8>, <16 x i8>* %16, align 1
2909  %17 = zext <16 x i8> %wide.load.2 to <16 x i32>
2910  %18 = getelementptr inbounds i8, i8* %y, i32 32
2911  %19 = bitcast i8* %18 to <16 x i8>*
2912  %wide.load11.2 = load <16 x i8>, <16 x i8>* %19, align 1
2913  %20 = zext <16 x i8> %wide.load11.2 to <16 x i32>
2914  %21 = mul nuw nsw <16 x i32> %20, %17
2915  %22 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %21)
2916  %23 = add i32 %22, %14
2917  %24 = getelementptr inbounds i8, i8* %x, i32 48
2918  %25 = bitcast i8* %24 to <16 x i8>*
2919  %wide.load.3 = load <16 x i8>, <16 x i8>* %25, align 1
2920  %26 = zext <16 x i8> %wide.load.3 to <16 x i32>
2921  %27 = getelementptr inbounds i8, i8* %y, i32 48
2922  %28 = bitcast i8* %27 to <16 x i8>*
2923  %wide.load11.3 = load <16 x i8>, <16 x i8>* %28, align 1
2924  %29 = zext <16 x i8> %wide.load11.3 to <16 x i32>
2925  %30 = mul nuw nsw <16 x i32> %29, %26
2926  %31 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %30)
2927  %32 = add i32 %31, %23
2928  ret i32 %32
2929}
2930
2931define i32 @mlav128i32i8(i8* %x, i8* %y) {
2932; CHECK-LABEL: mlav128i32i8:
2933; CHECK:       @ %bb.0: @ %entry
2934; CHECK-NEXT:    vldrb.u8 q0, [r0]
2935; CHECK-NEXT:    vldrb.u8 q1, [r1]
2936; CHECK-NEXT:    mov r2, r0
2937; CHECK-NEXT:    vmlav.u8 r0, q1, q0
2938; CHECK-NEXT:    vldrb.u8 q0, [r2, #16]
2939; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
2940; CHECK-NEXT:    vmlava.u8 r0, q1, q0
2941; CHECK-NEXT:    vldrb.u8 q0, [r2, #32]
2942; CHECK-NEXT:    vldrb.u8 q1, [r1, #32]
2943; CHECK-NEXT:    vmlava.u8 r0, q1, q0
2944; CHECK-NEXT:    vldrb.u8 q0, [r2, #48]
2945; CHECK-NEXT:    vldrb.u8 q1, [r1, #48]
2946; CHECK-NEXT:    vmlava.u8 r0, q1, q0
2947; CHECK-NEXT:    vldrb.u8 q0, [r2, #64]
2948; CHECK-NEXT:    vldrb.u8 q1, [r1, #64]
2949; CHECK-NEXT:    vmlava.u8 r0, q1, q0
2950; CHECK-NEXT:    vldrb.u8 q0, [r2, #80]
2951; CHECK-NEXT:    vldrb.u8 q1, [r1, #80]
2952; CHECK-NEXT:    vmlava.u8 r0, q1, q0
2953; CHECK-NEXT:    vldrb.u8 q0, [r2, #96]
2954; CHECK-NEXT:    vldrb.u8 q1, [r1, #96]
2955; CHECK-NEXT:    vmlava.u8 r0, q1, q0
2956; CHECK-NEXT:    vldrb.u8 q0, [r2, #112]
2957; CHECK-NEXT:    vldrb.u8 q1, [r1, #112]
2958; CHECK-NEXT:    vmlava.u8 r0, q1, q0
2959; CHECK-NEXT:    bx lr
2960entry:
2961  %0 = bitcast i8* %x to <16 x i8>*
2962  %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
2963  %1 = zext <16 x i8> %wide.load to <16 x i32>
2964  %2 = bitcast i8* %y to <16 x i8>*
2965  %wide.load11 = load <16 x i8>, <16 x i8>* %2, align 1
2966  %3 = zext <16 x i8> %wide.load11 to <16 x i32>
2967  %4 = mul nuw nsw <16 x i32> %3, %1
2968  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
2969  %6 = getelementptr inbounds i8, i8* %x, i32 16
2970  %7 = bitcast i8* %6 to <16 x i8>*
2971  %wide.load.1 = load <16 x i8>, <16 x i8>* %7, align 1
2972  %8 = zext <16 x i8> %wide.load.1 to <16 x i32>
2973  %9 = getelementptr inbounds i8, i8* %y, i32 16
2974  %10 = bitcast i8* %9 to <16 x i8>*
2975  %wide.load11.1 = load <16 x i8>, <16 x i8>* %10, align 1
2976  %11 = zext <16 x i8> %wide.load11.1 to <16 x i32>
2977  %12 = mul nuw nsw <16 x i32> %11, %8
2978  %13 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12)
2979  %14 = add i32 %13, %5
2980  %15 = getelementptr inbounds i8, i8* %x, i32 32
2981  %16 = bitcast i8* %15 to <16 x i8>*
2982  %wide.load.2 = load <16 x i8>, <16 x i8>* %16, align 1
2983  %17 = zext <16 x i8> %wide.load.2 to <16 x i32>
2984  %18 = getelementptr inbounds i8, i8* %y, i32 32
2985  %19 = bitcast i8* %18 to <16 x i8>*
2986  %wide.load11.2 = load <16 x i8>, <16 x i8>* %19, align 1
2987  %20 = zext <16 x i8> %wide.load11.2 to <16 x i32>
2988  %21 = mul nuw nsw <16 x i32> %20, %17
2989  %22 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %21)
2990  %23 = add i32 %22, %14
2991  %24 = getelementptr inbounds i8, i8* %x, i32 48
2992  %25 = bitcast i8* %24 to <16 x i8>*
2993  %wide.load.3 = load <16 x i8>, <16 x i8>* %25, align 1
2994  %26 = zext <16 x i8> %wide.load.3 to <16 x i32>
2995  %27 = getelementptr inbounds i8, i8* %y, i32 48
2996  %28 = bitcast i8* %27 to <16 x i8>*
2997  %wide.load11.3 = load <16 x i8>, <16 x i8>* %28, align 1
2998  %29 = zext <16 x i8> %wide.load11.3 to <16 x i32>
2999  %30 = mul nuw nsw <16 x i32> %29, %26
3000  %31 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %30)
3001  %32 = add i32 %31, %23
3002  %33 = getelementptr inbounds i8, i8* %x, i32 64
3003  %34 = bitcast i8* %33 to <16 x i8>*
3004  %wide.load.4 = load <16 x i8>, <16 x i8>* %34, align 1
3005  %35 = zext <16 x i8> %wide.load.4 to <16 x i32>
3006  %36 = getelementptr inbounds i8, i8* %y, i32 64
3007  %37 = bitcast i8* %36 to <16 x i8>*
3008  %wide.load11.4 = load <16 x i8>, <16 x i8>* %37, align 1
3009  %38 = zext <16 x i8> %wide.load11.4 to <16 x i32>
3010  %39 = mul nuw nsw <16 x i32> %38, %35
3011  %40 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %39)
3012  %41 = add i32 %40, %32
3013  %42 = getelementptr inbounds i8, i8* %x, i32 80
3014  %43 = bitcast i8* %42 to <16 x i8>*
3015  %wide.load.5 = load <16 x i8>, <16 x i8>* %43, align 1
3016  %44 = zext <16 x i8> %wide.load.5 to <16 x i32>
3017  %45 = getelementptr inbounds i8, i8* %y, i32 80
3018  %46 = bitcast i8* %45 to <16 x i8>*
3019  %wide.load11.5 = load <16 x i8>, <16 x i8>* %46, align 1
3020  %47 = zext <16 x i8> %wide.load11.5 to <16 x i32>
3021  %48 = mul nuw nsw <16 x i32> %47, %44
3022  %49 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %48)
3023  %50 = add i32 %49, %41
3024  %51 = getelementptr inbounds i8, i8* %x, i32 96
3025  %52 = bitcast i8* %51 to <16 x i8>*
3026  %wide.load.6 = load <16 x i8>, <16 x i8>* %52, align 1
3027  %53 = zext <16 x i8> %wide.load.6 to <16 x i32>
3028  %54 = getelementptr inbounds i8, i8* %y, i32 96
3029  %55 = bitcast i8* %54 to <16 x i8>*
3030  %wide.load11.6 = load <16 x i8>, <16 x i8>* %55, align 1
3031  %56 = zext <16 x i8> %wide.load11.6 to <16 x i32>
3032  %57 = mul nuw nsw <16 x i32> %56, %53
3033  %58 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %57)
3034  %59 = add i32 %58, %50
3035  %60 = getelementptr inbounds i8, i8* %x, i32 112
3036  %61 = bitcast i8* %60 to <16 x i8>*
3037  %wide.load.7 = load <16 x i8>, <16 x i8>* %61, align 1
3038  %62 = zext <16 x i8> %wide.load.7 to <16 x i32>
3039  %63 = getelementptr inbounds i8, i8* %y, i32 112
3040  %64 = bitcast i8* %63 to <16 x i8>*
3041  %wide.load11.7 = load <16 x i8>, <16 x i8>* %64, align 1
3042  %65 = zext <16 x i8> %wide.load11.7 to <16 x i32>
3043  %66 = mul nuw nsw <16 x i32> %65, %62
3044  %67 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %66)
3045  %68 = add i32 %67, %59
3046  ret i32 %68
3047}
3048
3049define signext i16 @mlav2i16i16(i16* %x, i16* %y) {
3050; CHECK-LABEL: mlav2i16i16:
3051; CHECK:       @ %bb.0: @ %entry
3052; CHECK-NEXT:    ldrh r2, [r0]
3053; CHECK-NEXT:    ldrh r3, [r1]
3054; CHECK-NEXT:    ldrh r0, [r0, #2]
3055; CHECK-NEXT:    ldrh r1, [r1, #2]
3056; CHECK-NEXT:    muls r2, r3, r2
3057; CHECK-NEXT:    mla r0, r1, r0, r2
3058; CHECK-NEXT:    sxth r0, r0
3059; CHECK-NEXT:    bx lr
3060entry:
3061  %0 = load i16, i16* %x, align 2
3062  %1 = load i16, i16* %y, align 2
3063  %mul = mul i16 %1, %0
3064  %arrayidx.1 = getelementptr inbounds i16, i16* %x, i32 1
3065  %2 = load i16, i16* %arrayidx.1, align 2
3066  %arrayidx1.1 = getelementptr inbounds i16, i16* %y, i32 1
3067  %3 = load i16, i16* %arrayidx1.1, align 2
3068  %mul.1 = mul i16 %3, %2
3069  %add.1 = add i16 %mul.1, %mul
3070  ret i16 %add.1
3071}
3072
3073define signext i16 @mlav4i16i16(i16* %x, i16* %y) {
3074; CHECK-LABEL: mlav4i16i16:
3075; CHECK:       @ %bb.0: @ %entry
3076; CHECK-NEXT:    vldrh.u32 q0, [r0]
3077; CHECK-NEXT:    vldrh.u32 q1, [r1]
3078; CHECK-NEXT:    vmlav.u32 r0, q1, q0
3079; CHECK-NEXT:    sxth r0, r0
3080; CHECK-NEXT:    bx lr
3081entry:
3082  %0 = bitcast i16* %x to <4 x i16>*
3083  %1 = load <4 x i16>, <4 x i16>* %0, align 2
3084  %2 = bitcast i16* %y to <4 x i16>*
3085  %3 = load <4 x i16>, <4 x i16>* %2, align 2
3086  %4 = mul <4 x i16> %3, %1
3087  %5 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %4)
3088  ret i16 %5
3089}
3090
3091define signext i16 @mlav8i16i16(i16* %x, i16* %y) {
3092; CHECK-LABEL: mlav8i16i16:
3093; CHECK:       @ %bb.0: @ %entry
3094; CHECK-NEXT:    vldrh.u16 q0, [r0]
3095; CHECK-NEXT:    vldrh.u16 q1, [r1]
3096; CHECK-NEXT:    vmlav.u16 r0, q1, q0
3097; CHECK-NEXT:    sxth r0, r0
3098; CHECK-NEXT:    bx lr
3099entry:
3100  %0 = bitcast i16* %x to <8 x i16>*
3101  %1 = load <8 x i16>, <8 x i16>* %0, align 2
3102  %2 = bitcast i16* %y to <8 x i16>*
3103  %3 = load <8 x i16>, <8 x i16>* %2, align 2
3104  %4 = mul <8 x i16> %3, %1
3105  %5 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4)
3106  ret i16 %5
3107}
3108
3109define signext i16 @mlav16i16i16(i16* %x, i16* %y) {
3110; CHECK-LABEL: mlav16i16i16:
3111; CHECK:       @ %bb.0: @ %entry
3112; CHECK-NEXT:    vldrh.u16 q0, [r0]
3113; CHECK-NEXT:    vldrh.u16 q1, [r1]
3114; CHECK-NEXT:    vmlav.u16 r2, q1, q0
3115; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
3116; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
3117; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3118; CHECK-NEXT:    sxth r0, r2
3119; CHECK-NEXT:    bx lr
3120entry:
3121  %0 = bitcast i16* %x to <16 x i16>*
3122  %1 = load <16 x i16>, <16 x i16>* %0, align 2
3123  %2 = bitcast i16* %y to <16 x i16>*
3124  %3 = load <16 x i16>, <16 x i16>* %2, align 2
3125  %4 = mul <16 x i16> %3, %1
3126  %5 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %4)
3127  ret i16 %5
3128}
3129
3130define signext i16 @mlav24i16i16(i16* %x, i16* %y) {
3131; CHECK-LABEL: mlav24i16i16:
3132; CHECK:       @ %bb.0: @ %entry
3133; CHECK-NEXT:    vldrh.u16 q0, [r0]
3134; CHECK-NEXT:    vldrh.u16 q1, [r1]
3135; CHECK-NEXT:    vmlav.u16 r2, q1, q0
3136; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
3137; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
3138; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3139; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
3140; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
3141; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3142; CHECK-NEXT:    sxth r0, r2
3143; CHECK-NEXT:    bx lr
3144entry:
3145  %0 = bitcast i16* %x to <8 x i16>*
3146  %1 = load <8 x i16>, <8 x i16>* %0, align 2
3147  %2 = bitcast i16* %y to <8 x i16>*
3148  %3 = load <8 x i16>, <8 x i16>* %2, align 2
3149  %4 = mul <8 x i16> %3, %1
3150  %arrayidx.8 = getelementptr inbounds i16, i16* %x, i32 8
3151  %arrayidx1.8 = getelementptr inbounds i16, i16* %y, i32 8
3152  %5 = bitcast i16* %arrayidx.8 to <16 x i16>*
3153  %6 = load <16 x i16>, <16 x i16>* %5, align 2
3154  %7 = bitcast i16* %arrayidx1.8 to <16 x i16>*
3155  %8 = load <16 x i16>, <16 x i16>* %7, align 2
3156  %9 = mul <16 x i16> %8, %6
3157  %10 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %9)
3158  %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %4)
3159  %op.rdx = add i16 %10, %11
3160  ret i16 %op.rdx
3161}
3162
3163define signext i16 @mlav32i16i16(i16* %x, i16* %y) {
3164; CHECK-LABEL: mlav32i16i16:
3165; CHECK:       @ %bb.0: @ %entry
3166; CHECK-NEXT:    vldrh.u16 q0, [r0]
3167; CHECK-NEXT:    vldrh.u16 q1, [r1]
3168; CHECK-NEXT:    vmlav.u16 r2, q1, q0
3169; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
3170; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
3171; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3172; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
3173; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
3174; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3175; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
3176; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
3177; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3178; CHECK-NEXT:    sxth r0, r2
3179; CHECK-NEXT:    bx lr
3180entry:
3181  %0 = bitcast i16* %x to <32 x i16>*
3182  %1 = load <32 x i16>, <32 x i16>* %0, align 2
3183  %2 = bitcast i16* %y to <32 x i16>*
3184  %3 = load <32 x i16>, <32 x i16>* %2, align 2
3185  %4 = mul <32 x i16> %3, %1
3186  %5 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %4)
3187  ret i16 %5
3188}
3189
3190define signext i16 @mlav64i16i16(i16* %x, i16* %y) {
3191; CHECK-LABEL: mlav64i16i16:
3192; CHECK:       @ %bb.0: @ %entry
3193; CHECK-NEXT:    vldrh.u16 q0, [r0]
3194; CHECK-NEXT:    vldrh.u16 q1, [r1]
3195; CHECK-NEXT:    vmlav.u16 r2, q1, q0
3196; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
3197; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
3198; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3199; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
3200; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
3201; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3202; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
3203; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
3204; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3205; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
3206; CHECK-NEXT:    vldrh.u16 q1, [r1, #64]
3207; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3208; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
3209; CHECK-NEXT:    vldrh.u16 q1, [r1, #80]
3210; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3211; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
3212; CHECK-NEXT:    vldrh.u16 q1, [r1, #96]
3213; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3214; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
3215; CHECK-NEXT:    vldrh.u16 q1, [r1, #112]
3216; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3217; CHECK-NEXT:    sxth r0, r2
3218; CHECK-NEXT:    bx lr
3219entry:
3220  %0 = bitcast i16* %x to <8 x i16>*
3221  %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
3222  %1 = bitcast i16* %y to <8 x i16>*
3223  %wide.load13 = load <8 x i16>, <8 x i16>* %1, align 2
3224  %2 = mul <8 x i16> %wide.load13, %wide.load
3225  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
3226  %4 = getelementptr inbounds i16, i16* %x, i32 8
3227  %5 = bitcast i16* %4 to <8 x i16>*
3228  %wide.load.1 = load <8 x i16>, <8 x i16>* %5, align 2
3229  %6 = getelementptr inbounds i16, i16* %y, i32 8
3230  %7 = bitcast i16* %6 to <8 x i16>*
3231  %wide.load13.1 = load <8 x i16>, <8 x i16>* %7, align 2
3232  %8 = mul <8 x i16> %wide.load13.1, %wide.load.1
3233  %9 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %8)
3234  %10 = add i16 %9, %3
3235  %11 = getelementptr inbounds i16, i16* %x, i32 16
3236  %12 = bitcast i16* %11 to <8 x i16>*
3237  %wide.load.2 = load <8 x i16>, <8 x i16>* %12, align 2
3238  %13 = getelementptr inbounds i16, i16* %y, i32 16
3239  %14 = bitcast i16* %13 to <8 x i16>*
3240  %wide.load13.2 = load <8 x i16>, <8 x i16>* %14, align 2
3241  %15 = mul <8 x i16> %wide.load13.2, %wide.load.2
3242  %16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %15)
3243  %17 = add i16 %16, %10
3244  %18 = getelementptr inbounds i16, i16* %x, i32 24
3245  %19 = bitcast i16* %18 to <8 x i16>*
3246  %wide.load.3 = load <8 x i16>, <8 x i16>* %19, align 2
3247  %20 = getelementptr inbounds i16, i16* %y, i32 24
3248  %21 = bitcast i16* %20 to <8 x i16>*
3249  %wide.load13.3 = load <8 x i16>, <8 x i16>* %21, align 2
3250  %22 = mul <8 x i16> %wide.load13.3, %wide.load.3
3251  %23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %22)
3252  %24 = add i16 %23, %17
3253  %25 = getelementptr inbounds i16, i16* %x, i32 32
3254  %26 = bitcast i16* %25 to <8 x i16>*
3255  %wide.load.4 = load <8 x i16>, <8 x i16>* %26, align 2
3256  %27 = getelementptr inbounds i16, i16* %y, i32 32
3257  %28 = bitcast i16* %27 to <8 x i16>*
3258  %wide.load13.4 = load <8 x i16>, <8 x i16>* %28, align 2
3259  %29 = mul <8 x i16> %wide.load13.4, %wide.load.4
3260  %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29)
3261  %31 = add i16 %30, %24
3262  %32 = getelementptr inbounds i16, i16* %x, i32 40
3263  %33 = bitcast i16* %32 to <8 x i16>*
3264  %wide.load.5 = load <8 x i16>, <8 x i16>* %33, align 2
3265  %34 = getelementptr inbounds i16, i16* %y, i32 40
3266  %35 = bitcast i16* %34 to <8 x i16>*
3267  %wide.load13.5 = load <8 x i16>, <8 x i16>* %35, align 2
3268  %36 = mul <8 x i16> %wide.load13.5, %wide.load.5
3269  %37 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %36)
3270  %38 = add i16 %37, %31
3271  %39 = getelementptr inbounds i16, i16* %x, i32 48
3272  %40 = bitcast i16* %39 to <8 x i16>*
3273  %wide.load.6 = load <8 x i16>, <8 x i16>* %40, align 2
3274  %41 = getelementptr inbounds i16, i16* %y, i32 48
3275  %42 = bitcast i16* %41 to <8 x i16>*
3276  %wide.load13.6 = load <8 x i16>, <8 x i16>* %42, align 2
3277  %43 = mul <8 x i16> %wide.load13.6, %wide.load.6
3278  %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %43)
3279  %45 = add i16 %44, %38
3280  %46 = getelementptr inbounds i16, i16* %x, i32 56
3281  %47 = bitcast i16* %46 to <8 x i16>*
3282  %wide.load.7 = load <8 x i16>, <8 x i16>* %47, align 2
3283  %48 = getelementptr inbounds i16, i16* %y, i32 56
3284  %49 = bitcast i16* %48 to <8 x i16>*
3285  %wide.load13.7 = load <8 x i16>, <8 x i16>* %49, align 2
3286  %50 = mul <8 x i16> %wide.load13.7, %wide.load.7
3287  %51 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %50)
3288  %52 = add i16 %51, %45
3289  ret i16 %52
3290}
3291
3292define signext i16 @mlav128i16i16(i16* %x, i16* %y) {
3293; CHECK-LABEL: mlav128i16i16:
3294; CHECK:       @ %bb.0: @ %entry
3295; CHECK-NEXT:    vldrh.u16 q0, [r0]
3296; CHECK-NEXT:    vldrh.u16 q1, [r1]
3297; CHECK-NEXT:    vmlav.u16 r2, q1, q0
3298; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
3299; CHECK-NEXT:    vldrh.u16 q1, [r1, #16]
3300; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3301; CHECK-NEXT:    vldrh.u16 q0, [r0, #32]
3302; CHECK-NEXT:    vldrh.u16 q1, [r1, #32]
3303; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3304; CHECK-NEXT:    vldrh.u16 q0, [r0, #48]
3305; CHECK-NEXT:    vldrh.u16 q1, [r1, #48]
3306; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3307; CHECK-NEXT:    vldrh.u16 q0, [r0, #64]
3308; CHECK-NEXT:    vldrh.u16 q1, [r1, #64]
3309; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3310; CHECK-NEXT:    vldrh.u16 q0, [r0, #80]
3311; CHECK-NEXT:    vldrh.u16 q1, [r1, #80]
3312; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3313; CHECK-NEXT:    vldrh.u16 q0, [r0, #96]
3314; CHECK-NEXT:    vldrh.u16 q1, [r1, #96]
3315; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3316; CHECK-NEXT:    vldrh.u16 q0, [r0, #112]
3317; CHECK-NEXT:    vldrh.u16 q1, [r1, #112]
3318; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3319; CHECK-NEXT:    vldrh.u16 q0, [r0, #128]
3320; CHECK-NEXT:    vldrh.u16 q1, [r1, #128]
3321; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3322; CHECK-NEXT:    vldrh.u16 q0, [r0, #144]
3323; CHECK-NEXT:    vldrh.u16 q1, [r1, #144]
3324; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3325; CHECK-NEXT:    vldrh.u16 q0, [r0, #160]
3326; CHECK-NEXT:    vldrh.u16 q1, [r1, #160]
3327; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3328; CHECK-NEXT:    vldrh.u16 q0, [r0, #176]
3329; CHECK-NEXT:    vldrh.u16 q1, [r1, #176]
3330; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3331; CHECK-NEXT:    vldrh.u16 q0, [r0, #192]
3332; CHECK-NEXT:    vldrh.u16 q1, [r1, #192]
3333; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3334; CHECK-NEXT:    vldrh.u16 q0, [r0, #208]
3335; CHECK-NEXT:    vldrh.u16 q1, [r1, #208]
3336; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3337; CHECK-NEXT:    vldrh.u16 q0, [r0, #224]
3338; CHECK-NEXT:    vldrh.u16 q1, [r1, #224]
3339; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3340; CHECK-NEXT:    vldrh.u16 q0, [r0, #240]
3341; CHECK-NEXT:    vldrh.u16 q1, [r1, #240]
3342; CHECK-NEXT:    vmlava.u16 r2, q1, q0
3343; CHECK-NEXT:    sxth r0, r2
3344; CHECK-NEXT:    bx lr
3345entry:
3346  %0 = bitcast i16* %x to <8 x i16>*
3347  %wide.load = load <8 x i16>, <8 x i16>* %0, align 2
3348  %1 = bitcast i16* %y to <8 x i16>*
3349  %wide.load13 = load <8 x i16>, <8 x i16>* %1, align 2
3350  %2 = mul <8 x i16> %wide.load13, %wide.load
3351  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
3352  %4 = getelementptr inbounds i16, i16* %x, i32 8
3353  %5 = bitcast i16* %4 to <8 x i16>*
3354  %wide.load.1 = load <8 x i16>, <8 x i16>* %5, align 2
3355  %6 = getelementptr inbounds i16, i16* %y, i32 8
3356  %7 = bitcast i16* %6 to <8 x i16>*
3357  %wide.load13.1 = load <8 x i16>, <8 x i16>* %7, align 2
3358  %8 = mul <8 x i16> %wide.load13.1, %wide.load.1
3359  %9 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %8)
3360  %10 = add i16 %9, %3
3361  %11 = getelementptr inbounds i16, i16* %x, i32 16
3362  %12 = bitcast i16* %11 to <8 x i16>*
3363  %wide.load.2 = load <8 x i16>, <8 x i16>* %12, align 2
3364  %13 = getelementptr inbounds i16, i16* %y, i32 16
3365  %14 = bitcast i16* %13 to <8 x i16>*
3366  %wide.load13.2 = load <8 x i16>, <8 x i16>* %14, align 2
3367  %15 = mul <8 x i16> %wide.load13.2, %wide.load.2
3368  %16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %15)
3369  %17 = add i16 %16, %10
3370  %18 = getelementptr inbounds i16, i16* %x, i32 24
3371  %19 = bitcast i16* %18 to <8 x i16>*
3372  %wide.load.3 = load <8 x i16>, <8 x i16>* %19, align 2
3373  %20 = getelementptr inbounds i16, i16* %y, i32 24
3374  %21 = bitcast i16* %20 to <8 x i16>*
3375  %wide.load13.3 = load <8 x i16>, <8 x i16>* %21, align 2
3376  %22 = mul <8 x i16> %wide.load13.3, %wide.load.3
3377  %23 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %22)
3378  %24 = add i16 %23, %17
3379  %25 = getelementptr inbounds i16, i16* %x, i32 32
3380  %26 = bitcast i16* %25 to <8 x i16>*
3381  %wide.load.4 = load <8 x i16>, <8 x i16>* %26, align 2
3382  %27 = getelementptr inbounds i16, i16* %y, i32 32
3383  %28 = bitcast i16* %27 to <8 x i16>*
3384  %wide.load13.4 = load <8 x i16>, <8 x i16>* %28, align 2
3385  %29 = mul <8 x i16> %wide.load13.4, %wide.load.4
3386  %30 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %29)
3387  %31 = add i16 %30, %24
3388  %32 = getelementptr inbounds i16, i16* %x, i32 40
3389  %33 = bitcast i16* %32 to <8 x i16>*
3390  %wide.load.5 = load <8 x i16>, <8 x i16>* %33, align 2
3391  %34 = getelementptr inbounds i16, i16* %y, i32 40
3392  %35 = bitcast i16* %34 to <8 x i16>*
3393  %wide.load13.5 = load <8 x i16>, <8 x i16>* %35, align 2
3394  %36 = mul <8 x i16> %wide.load13.5, %wide.load.5
3395  %37 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %36)
3396  %38 = add i16 %37, %31
3397  %39 = getelementptr inbounds i16, i16* %x, i32 48
3398  %40 = bitcast i16* %39 to <8 x i16>*
3399  %wide.load.6 = load <8 x i16>, <8 x i16>* %40, align 2
3400  %41 = getelementptr inbounds i16, i16* %y, i32 48
3401  %42 = bitcast i16* %41 to <8 x i16>*
3402  %wide.load13.6 = load <8 x i16>, <8 x i16>* %42, align 2
3403  %43 = mul <8 x i16> %wide.load13.6, %wide.load.6
3404  %44 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %43)
3405  %45 = add i16 %44, %38
3406  %46 = getelementptr inbounds i16, i16* %x, i32 56
3407  %47 = bitcast i16* %46 to <8 x i16>*
3408  %wide.load.7 = load <8 x i16>, <8 x i16>* %47, align 2
3409  %48 = getelementptr inbounds i16, i16* %y, i32 56
3410  %49 = bitcast i16* %48 to <8 x i16>*
3411  %wide.load13.7 = load <8 x i16>, <8 x i16>* %49, align 2
3412  %50 = mul <8 x i16> %wide.load13.7, %wide.load.7
3413  %51 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %50)
3414  %52 = add i16 %51, %45
3415  %53 = getelementptr inbounds i16, i16* %x, i32 64
3416  %54 = bitcast i16* %53 to <8 x i16>*
3417  %wide.load.8 = load <8 x i16>, <8 x i16>* %54, align 2
3418  %55 = getelementptr inbounds i16, i16* %y, i32 64
3419  %56 = bitcast i16* %55 to <8 x i16>*
3420  %wide.load13.8 = load <8 x i16>, <8 x i16>* %56, align 2
3421  %57 = mul <8 x i16> %wide.load13.8, %wide.load.8
3422  %58 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %57)
3423  %59 = add i16 %58, %52
3424  %60 = getelementptr inbounds i16, i16* %x, i32 72
3425  %61 = bitcast i16* %60 to <8 x i16>*
3426  %wide.load.9 = load <8 x i16>, <8 x i16>* %61, align 2
3427  %62 = getelementptr inbounds i16, i16* %y, i32 72
3428  %63 = bitcast i16* %62 to <8 x i16>*
3429  %wide.load13.9 = load <8 x i16>, <8 x i16>* %63, align 2
3430  %64 = mul <8 x i16> %wide.load13.9, %wide.load.9
3431  %65 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %64)
3432  %66 = add i16 %65, %59
3433  %67 = getelementptr inbounds i16, i16* %x, i32 80
3434  %68 = bitcast i16* %67 to <8 x i16>*
3435  %wide.load.10 = load <8 x i16>, <8 x i16>* %68, align 2
3436  %69 = getelementptr inbounds i16, i16* %y, i32 80
3437  %70 = bitcast i16* %69 to <8 x i16>*
3438  %wide.load13.10 = load <8 x i16>, <8 x i16>* %70, align 2
3439  %71 = mul <8 x i16> %wide.load13.10, %wide.load.10
3440  %72 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %71)
3441  %73 = add i16 %72, %66
3442  %74 = getelementptr inbounds i16, i16* %x, i32 88
3443  %75 = bitcast i16* %74 to <8 x i16>*
3444  %wide.load.11 = load <8 x i16>, <8 x i16>* %75, align 2
3445  %76 = getelementptr inbounds i16, i16* %y, i32 88
3446  %77 = bitcast i16* %76 to <8 x i16>*
3447  %wide.load13.11 = load <8 x i16>, <8 x i16>* %77, align 2
3448  %78 = mul <8 x i16> %wide.load13.11, %wide.load.11
3449  %79 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %78)
3450  %80 = add i16 %79, %73
3451  %81 = getelementptr inbounds i16, i16* %x, i32 96
3452  %82 = bitcast i16* %81 to <8 x i16>*
3453  %wide.load.12 = load <8 x i16>, <8 x i16>* %82, align 2
3454  %83 = getelementptr inbounds i16, i16* %y, i32 96
3455  %84 = bitcast i16* %83 to <8 x i16>*
3456  %wide.load13.12 = load <8 x i16>, <8 x i16>* %84, align 2
3457  %85 = mul <8 x i16> %wide.load13.12, %wide.load.12
3458  %86 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %85)
3459  %87 = add i16 %86, %80
3460  %88 = getelementptr inbounds i16, i16* %x, i32 104
3461  %89 = bitcast i16* %88 to <8 x i16>*
3462  %wide.load.13 = load <8 x i16>, <8 x i16>* %89, align 2
3463  %90 = getelementptr inbounds i16, i16* %y, i32 104
3464  %91 = bitcast i16* %90 to <8 x i16>*
3465  %wide.load13.13 = load <8 x i16>, <8 x i16>* %91, align 2
3466  %92 = mul <8 x i16> %wide.load13.13, %wide.load.13
3467  %93 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %92)
3468  %94 = add i16 %93, %87
3469  %95 = getelementptr inbounds i16, i16* %x, i32 112
3470  %96 = bitcast i16* %95 to <8 x i16>*
3471  %wide.load.14 = load <8 x i16>, <8 x i16>* %96, align 2
3472  %97 = getelementptr inbounds i16, i16* %y, i32 112
3473  %98 = bitcast i16* %97 to <8 x i16>*
3474  %wide.load13.14 = load <8 x i16>, <8 x i16>* %98, align 2
3475  %99 = mul <8 x i16> %wide.load13.14, %wide.load.14
3476  %100 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %99)
3477  %101 = add i16 %100, %94
3478  %102 = getelementptr inbounds i16, i16* %x, i32 120
3479  %103 = bitcast i16* %102 to <8 x i16>*
3480  %wide.load.15 = load <8 x i16>, <8 x i16>* %103, align 2
3481  %104 = getelementptr inbounds i16, i16* %y, i32 120
3482  %105 = bitcast i16* %104 to <8 x i16>*
3483  %wide.load13.15 = load <8 x i16>, <8 x i16>* %105, align 2
3484  %106 = mul <8 x i16> %wide.load13.15, %wide.load.15
3485  %107 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %106)
3486  %108 = add i16 %107, %101
3487  ret i16 %108
3488}
3489
3490define zeroext i8 @mlav2i8i8(i8* %x, i8* %y) {
3491; CHECK-LABEL: mlav2i8i8:
3492; CHECK:       @ %bb.0: @ %entry
3493; CHECK-NEXT:    ldrb r2, [r0]
3494; CHECK-NEXT:    ldrb r3, [r1]
3495; CHECK-NEXT:    ldrb r0, [r0, #1]
3496; CHECK-NEXT:    ldrb r1, [r1, #1]
3497; CHECK-NEXT:    muls r2, r3, r2
3498; CHECK-NEXT:    mla r0, r1, r0, r2
3499; CHECK-NEXT:    uxtb r0, r0
3500; CHECK-NEXT:    bx lr
3501entry:
3502  %0 = load i8, i8* %x, align 1
3503  %1 = load i8, i8* %y, align 1
3504  %mul = mul i8 %1, %0
3505  %arrayidx.1 = getelementptr inbounds i8, i8* %x, i32 1
3506  %2 = load i8, i8* %arrayidx.1, align 1
3507  %arrayidx1.1 = getelementptr inbounds i8, i8* %y, i32 1
3508  %3 = load i8, i8* %arrayidx1.1, align 1
3509  %mul.1 = mul i8 %3, %2
3510  %add.1 = add i8 %mul.1, %mul
3511  ret i8 %add.1
3512}
3513
3514define zeroext i8 @mlav4i8i8(i8* %x, i8* %y) {
3515; CHECK-LABEL: mlav4i8i8:
3516; CHECK:       @ %bb.0: @ %entry
3517; CHECK-NEXT:    vldrb.u32 q0, [r0]
3518; CHECK-NEXT:    vldrb.u32 q1, [r1]
3519; CHECK-NEXT:    vmlav.u32 r0, q1, q0
3520; CHECK-NEXT:    uxtb r0, r0
3521; CHECK-NEXT:    bx lr
3522entry:
3523  %0 = bitcast i8* %x to <4 x i8>*
3524  %1 = load <4 x i8>, <4 x i8>* %0, align 1
3525  %2 = bitcast i8* %y to <4 x i8>*
3526  %3 = load <4 x i8>, <4 x i8>* %2, align 1
3527  %4 = mul <4 x i8> %3, %1
3528  %5 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %4)
3529  ret i8 %5
3530}
3531
3532define zeroext i8 @mlav8i8i8(i8* %x, i8* %y) {
3533; CHECK-LABEL: mlav8i8i8:
3534; CHECK:       @ %bb.0: @ %entry
3535; CHECK-NEXT:    vldrb.u16 q0, [r0]
3536; CHECK-NEXT:    vldrb.u16 q1, [r1]
3537; CHECK-NEXT:    vmlav.u16 r0, q1, q0
3538; CHECK-NEXT:    uxtb r0, r0
3539; CHECK-NEXT:    bx lr
3540entry:
3541  %0 = bitcast i8* %x to <8 x i8>*
3542  %1 = load <8 x i8>, <8 x i8>* %0, align 1
3543  %2 = bitcast i8* %y to <8 x i8>*
3544  %3 = load <8 x i8>, <8 x i8>* %2, align 1
3545  %4 = mul <8 x i8> %3, %1
3546  %5 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %4)
3547  ret i8 %5
3548}
3549
3550define zeroext i8 @mlav16i8i8(i8* %x, i8* %y) {
3551; CHECK-LABEL: mlav16i8i8:
3552; CHECK:       @ %bb.0: @ %entry
3553; CHECK-NEXT:    vldrb.u8 q0, [r0]
3554; CHECK-NEXT:    vldrb.u8 q1, [r1]
3555; CHECK-NEXT:    vmlav.u8 r0, q1, q0
3556; CHECK-NEXT:    uxtb r0, r0
3557; CHECK-NEXT:    bx lr
3558entry:
3559  %0 = bitcast i8* %x to <16 x i8>*
3560  %1 = load <16 x i8>, <16 x i8>* %0, align 1
3561  %2 = bitcast i8* %y to <16 x i8>*
3562  %3 = load <16 x i8>, <16 x i8>* %2, align 1
3563  %4 = mul <16 x i8> %3, %1
3564  %5 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %4)
3565  ret i8 %5
3566}
3567
3568define zeroext i8 @mlav24i8i8(i8* %x, i8* %y) {
3569; CHECK-LABEL: mlav24i8i8:
3570; CHECK:       @ %bb.0: @ %entry
3571; CHECK-NEXT:    vldrb.u16 q0, [r0]
3572; CHECK-NEXT:    vldrb.u16 q1, [r1]
3573; CHECK-NEXT:    vmlav.u16 r2, q1, q0
3574; CHECK-NEXT:    vldrb.u8 q0, [r0, #8]
3575; CHECK-NEXT:    vldrb.u8 q1, [r1, #8]
3576; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3577; CHECK-NEXT:    uxtb r0, r2
3578; CHECK-NEXT:    bx lr
3579entry:
3580  %0 = bitcast i8* %x to <8 x i8>*
3581  %1 = load <8 x i8>, <8 x i8>* %0, align 1
3582  %2 = bitcast i8* %y to <8 x i8>*
3583  %3 = load <8 x i8>, <8 x i8>* %2, align 1
3584  %4 = mul <8 x i8> %3, %1
3585  %arrayidx.8 = getelementptr inbounds i8, i8* %x, i32 8
3586  %arrayidx1.8 = getelementptr inbounds i8, i8* %y, i32 8
3587  %5 = bitcast i8* %arrayidx.8 to <16 x i8>*
3588  %6 = load <16 x i8>, <16 x i8>* %5, align 1
3589  %7 = bitcast i8* %arrayidx1.8 to <16 x i8>*
3590  %8 = load <16 x i8>, <16 x i8>* %7, align 1
3591  %9 = mul <16 x i8> %8, %6
3592  %10 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %9)
3593  %11 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %4)
3594  %op.rdx = add i8 %10, %11
3595  ret i8 %op.rdx
3596}
3597
3598define zeroext i8 @mlav32i8i8(i8* %x, i8* %y) {
3599; CHECK-LABEL: mlav32i8i8:
3600; CHECK:       @ %bb.0: @ %entry
3601; CHECK-NEXT:    vldrb.u8 q0, [r0]
3602; CHECK-NEXT:    vldrb.u8 q1, [r1]
3603; CHECK-NEXT:    vmlav.u8 r2, q1, q0
3604; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
3605; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
3606; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3607; CHECK-NEXT:    uxtb r0, r2
3608; CHECK-NEXT:    bx lr
3609entry:
3610  %0 = bitcast i8* %x to <32 x i8>*
3611  %1 = load <32 x i8>, <32 x i8>* %0, align 1
3612  %2 = bitcast i8* %y to <32 x i8>*
3613  %3 = load <32 x i8>, <32 x i8>* %2, align 1
3614  %4 = mul <32 x i8> %3, %1
3615  %5 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %4)
3616  ret i8 %5
3617}
3618
3619define zeroext i8 @mlav64i8i8(i8* %x, i8* %y) {
3620; CHECK-LABEL: mlav64i8i8:
3621; CHECK:       @ %bb.0: @ %entry
3622; CHECK-NEXT:    vldrb.u8 q0, [r0]
3623; CHECK-NEXT:    vldrb.u8 q1, [r1]
3624; CHECK-NEXT:    vmlav.u8 r2, q1, q0
3625; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
3626; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
3627; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3628; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
3629; CHECK-NEXT:    vldrb.u8 q1, [r1, #32]
3630; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3631; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
3632; CHECK-NEXT:    vldrb.u8 q1, [r1, #48]
3633; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3634; CHECK-NEXT:    uxtb r0, r2
3635; CHECK-NEXT:    bx lr
3636entry:
3637  %0 = bitcast i8* %x to <16 x i8>*
3638  %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
3639  %1 = bitcast i8* %y to <16 x i8>*
3640  %wide.load12 = load <16 x i8>, <16 x i8>* %1, align 1
3641  %2 = mul <16 x i8> %wide.load12, %wide.load
3642  %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
3643  %4 = getelementptr inbounds i8, i8* %x, i32 16
3644  %5 = bitcast i8* %4 to <16 x i8>*
3645  %wide.load.1 = load <16 x i8>, <16 x i8>* %5, align 1
3646  %6 = getelementptr inbounds i8, i8* %y, i32 16
3647  %7 = bitcast i8* %6 to <16 x i8>*
3648  %wide.load12.1 = load <16 x i8>, <16 x i8>* %7, align 1
3649  %8 = mul <16 x i8> %wide.load12.1, %wide.load.1
3650  %9 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %8)
3651  %10 = add i8 %9, %3
3652  %11 = getelementptr inbounds i8, i8* %x, i32 32
3653  %12 = bitcast i8* %11 to <16 x i8>*
3654  %wide.load.2 = load <16 x i8>, <16 x i8>* %12, align 1
3655  %13 = getelementptr inbounds i8, i8* %y, i32 32
3656  %14 = bitcast i8* %13 to <16 x i8>*
3657  %wide.load12.2 = load <16 x i8>, <16 x i8>* %14, align 1
3658  %15 = mul <16 x i8> %wide.load12.2, %wide.load.2
3659  %16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %15)
3660  %17 = add i8 %16, %10
3661  %18 = getelementptr inbounds i8, i8* %x, i32 48
3662  %19 = bitcast i8* %18 to <16 x i8>*
3663  %wide.load.3 = load <16 x i8>, <16 x i8>* %19, align 1
3664  %20 = getelementptr inbounds i8, i8* %y, i32 48
3665  %21 = bitcast i8* %20 to <16 x i8>*
3666  %wide.load12.3 = load <16 x i8>, <16 x i8>* %21, align 1
3667  %22 = mul <16 x i8> %wide.load12.3, %wide.load.3
3668  %23 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %22)
3669  %24 = add i8 %23, %17
3670  ret i8 %24
3671}
3672
3673define zeroext i8 @mlav128i8i8(i8* %x, i8* %y) {
3674; CHECK-LABEL: mlav128i8i8:
3675; CHECK:       @ %bb.0: @ %entry
3676; CHECK-NEXT:    vldrb.u8 q0, [r0]
3677; CHECK-NEXT:    vldrb.u8 q1, [r1]
3678; CHECK-NEXT:    vmlav.u8 r2, q1, q0
3679; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
3680; CHECK-NEXT:    vldrb.u8 q1, [r1, #16]
3681; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3682; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
3683; CHECK-NEXT:    vldrb.u8 q1, [r1, #32]
3684; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3685; CHECK-NEXT:    vldrb.u8 q0, [r0, #48]
3686; CHECK-NEXT:    vldrb.u8 q1, [r1, #48]
3687; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3688; CHECK-NEXT:    vldrb.u8 q0, [r0, #64]
3689; CHECK-NEXT:    vldrb.u8 q1, [r1, #64]
3690; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3691; CHECK-NEXT:    vldrb.u8 q0, [r0, #80]
3692; CHECK-NEXT:    vldrb.u8 q1, [r1, #80]
3693; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3694; CHECK-NEXT:    vldrb.u8 q0, [r0, #96]
3695; CHECK-NEXT:    vldrb.u8 q1, [r1, #96]
3696; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3697; CHECK-NEXT:    vldrb.u8 q0, [r0, #112]
3698; CHECK-NEXT:    vldrb.u8 q1, [r1, #112]
3699; CHECK-NEXT:    vmlava.u8 r2, q1, q0
3700; CHECK-NEXT:    uxtb r0, r2
3701; CHECK-NEXT:    bx lr
3702entry:
3703  %0 = bitcast i8* %x to <16 x i8>*
3704  %wide.load = load <16 x i8>, <16 x i8>* %0, align 1
3705  %1 = bitcast i8* %y to <16 x i8>*
3706  %wide.load12 = load <16 x i8>, <16 x i8>* %1, align 1
3707  %2 = mul <16 x i8> %wide.load12, %wide.load
3708  %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
3709  %4 = getelementptr inbounds i8, i8* %x, i32 16
3710  %5 = bitcast i8* %4 to <16 x i8>*
3711  %wide.load.1 = load <16 x i8>, <16 x i8>* %5, align 1
3712  %6 = getelementptr inbounds i8, i8* %y, i32 16
3713  %7 = bitcast i8* %6 to <16 x i8>*
3714  %wide.load12.1 = load <16 x i8>, <16 x i8>* %7, align 1
3715  %8 = mul <16 x i8> %wide.load12.1, %wide.load.1
3716  %9 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %8)
3717  %10 = add i8 %9, %3
3718  %11 = getelementptr inbounds i8, i8* %x, i32 32
3719  %12 = bitcast i8* %11 to <16 x i8>*
3720  %wide.load.2 = load <16 x i8>, <16 x i8>* %12, align 1
3721  %13 = getelementptr inbounds i8, i8* %y, i32 32
3722  %14 = bitcast i8* %13 to <16 x i8>*
3723  %wide.load12.2 = load <16 x i8>, <16 x i8>* %14, align 1
3724  %15 = mul <16 x i8> %wide.load12.2, %wide.load.2
3725  %16 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %15)
3726  %17 = add i8 %16, %10
3727  %18 = getelementptr inbounds i8, i8* %x, i32 48
3728  %19 = bitcast i8* %18 to <16 x i8>*
3729  %wide.load.3 = load <16 x i8>, <16 x i8>* %19, align 1
3730  %20 = getelementptr inbounds i8, i8* %y, i32 48
3731  %21 = bitcast i8* %20 to <16 x i8>*
3732  %wide.load12.3 = load <16 x i8>, <16 x i8>* %21, align 1
3733  %22 = mul <16 x i8> %wide.load12.3, %wide.load.3
3734  %23 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %22)
3735  %24 = add i8 %23, %17
3736  %25 = getelementptr inbounds i8, i8* %x, i32 64
3737  %26 = bitcast i8* %25 to <16 x i8>*
3738  %wide.load.4 = load <16 x i8>, <16 x i8>* %26, align 1
3739  %27 = getelementptr inbounds i8, i8* %y, i32 64
3740  %28 = bitcast i8* %27 to <16 x i8>*
3741  %wide.load12.4 = load <16 x i8>, <16 x i8>* %28, align 1
3742  %29 = mul <16 x i8> %wide.load12.4, %wide.load.4
3743  %30 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %29)
3744  %31 = add i8 %30, %24
3745  %32 = getelementptr inbounds i8, i8* %x, i32 80
3746  %33 = bitcast i8* %32 to <16 x i8>*
3747  %wide.load.5 = load <16 x i8>, <16 x i8>* %33, align 1
3748  %34 = getelementptr inbounds i8, i8* %y, i32 80
3749  %35 = bitcast i8* %34 to <16 x i8>*
3750  %wide.load12.5 = load <16 x i8>, <16 x i8>* %35, align 1
3751  %36 = mul <16 x i8> %wide.load12.5, %wide.load.5
3752  %37 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %36)
3753  %38 = add i8 %37, %31
3754  %39 = getelementptr inbounds i8, i8* %x, i32 96
3755  %40 = bitcast i8* %39 to <16 x i8>*
3756  %wide.load.6 = load <16 x i8>, <16 x i8>* %40, align 1
3757  %41 = getelementptr inbounds i8, i8* %y, i32 96
3758  %42 = bitcast i8* %41 to <16 x i8>*
3759  %wide.load12.6 = load <16 x i8>, <16 x i8>* %42, align 1
3760  %43 = mul <16 x i8> %wide.load12.6, %wide.load.6
3761  %44 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %43)
3762  %45 = add i8 %44, %38
3763  %46 = getelementptr inbounds i8, i8* %x, i32 112
3764  %47 = bitcast i8* %46 to <16 x i8>*
3765  %wide.load.7 = load <16 x i8>, <16 x i8>* %47, align 1
3766  %48 = getelementptr inbounds i8, i8* %y, i32 112
3767  %49 = bitcast i8* %48 to <16 x i8>*
3768  %wide.load12.7 = load <16 x i8>, <16 x i8>* %49, align 1
3769  %50 = mul <16 x i8> %wide.load12.7, %wide.load.7
3770  %51 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %50)
3771  %52 = add i8 %51, %45
3772  ret i8 %52
3773}
3774
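; Reductions of two vectors with extra constant operands. The constants should
; be folded into a single immediate add after the vaddv/vaddva pair.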
3776define arm_aapcs_vfpcc i32 @add_two_const(<4 x i32> %x, <4 x i32> %y) {
3777; CHECK-LABEL: add_two_const:
3778; CHECK:       @ %bb.0: @ %entry
3779; CHECK-NEXT:    vaddv.u32 r0, q1
3780; CHECK-NEXT:    vaddva.u32 r0, q0
3781; CHECK-NEXT:    adds r0, #10
3782; CHECK-NEXT:    bx lr
3783entry:
3784  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
3785  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
3786  %c = add i32 %a, %b
3787  %d = add i32 %c, 10
3788  ret i32 %d
3789}
3790
3791define arm_aapcs_vfpcc i32 @add_two_const2(<4 x i32> %x, <4 x i32> %y) {
3792; CHECK-LABEL: add_two_const2:
3793; CHECK:       @ %bb.0: @ %entry
3794; CHECK-NEXT:    vaddv.u32 r0, q1
3795; CHECK-NEXT:    vaddva.u32 r0, q0
3796; CHECK-NEXT:    adds r0, #10
3797; CHECK-NEXT:    bx lr
3798entry:
3799  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
3800  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
3801  %c = add i32 %a, 10
3802  %d = add i32 %c, %b
3803  ret i32 %d
3804}
3805
3806define arm_aapcs_vfpcc i32 @add_two_const3(<4 x i32> %x, <4 x i32> %y) {
3807; CHECK-LABEL: add_two_const3:
3808; CHECK:       @ %bb.0: @ %entry
3809; CHECK-NEXT:    vaddv.u32 r0, q0
3810; CHECK-NEXT:    vaddva.u32 r0, q1
3811; CHECK-NEXT:    adds r0, #20
3812; CHECK-NEXT:    bx lr
3813entry:
3814  %a = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
3815  %b = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
3816  %c = add i32 %a, 10
3817  %d = add i32 %b, 10
3818  %e = add i32 %c, %d
3819  ret i32 %e
3820}
3821
3822declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
3823declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
3824declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
3825declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
3826declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
3827declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
3828declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
3829declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
3830declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
3831declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
3832declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)
3833declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
3834declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
3835declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
3836declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
3837