1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=enabled -verify-machineinstrs %s -o - | FileCheck %s
3
; add_i32: integer add reduction over the first %n elements of %x; the result
; is 0 when %n <= 0. The IR is already in vectorised form: a 4-wide vector loop
; followed by a scalar remainder loop. Here the reduction intrinsic is called
; inside the vector loop and the accumulator (%vec.phi) is a scalar i32, so the
; expected assembly accumulates straight into r0 with vaddva.u32.
4define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
5; CHECK-LABEL: add_i32:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r7, lr}
8; CHECK-NEXT:    push {r7, lr}
9; CHECK-NEXT:    cmp r1, #1
10; CHECK-NEXT:    blt .LBB0_3
11; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
12; CHECK-NEXT:    mov r12, r0
13; CHECK-NEXT:    cmp r1, #4
14; CHECK-NEXT:    bhs .LBB0_4
15; CHECK-NEXT:  @ %bb.2:
16; CHECK-NEXT:    movs r3, #0
17; CHECK-NEXT:    movs r0, #0
18; CHECK-NEXT:    b .LBB0_7
19; CHECK-NEXT:  .LBB0_3:
20; CHECK-NEXT:    movs r0, #0
21; CHECK-NEXT:    b .LBB0_9
22; CHECK-NEXT:  .LBB0_4: @ %vector.ph
23; CHECK-NEXT:    bic r3, r1, #3
24; CHECK-NEXT:    movs r2, #1
25; CHECK-NEXT:    subs r0, r3, #4
26; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
27; CHECK-NEXT:    movs r0, #0
28; CHECK-NEXT:    mov r2, r12
29; CHECK-NEXT:  .LBB0_5: @ %vector.body
30; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
31; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
32; CHECK-NEXT:    vaddva.u32 r0, q0
33; CHECK-NEXT:    le lr, .LBB0_5
34; CHECK-NEXT:  @ %bb.6: @ %middle.block
35; CHECK-NEXT:    cmp r3, r1
36; CHECK-NEXT:    it eq
37; CHECK-NEXT:    popeq {r7, pc}
38; CHECK-NEXT:  .LBB0_7: @ %for.body.preheader1
39; CHECK-NEXT:    sub.w lr, r1, r3
40; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
41; CHECK-NEXT:  .LBB0_8: @ %for.body
42; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
43; CHECK-NEXT:    ldr r1, [r2], #4
44; CHECK-NEXT:    add r0, r1
45; CHECK-NEXT:    le lr, .LBB0_8
46; CHECK-NEXT:  .LBB0_9: @ %for.cond.cleanup
47; CHECK-NEXT:    pop {r7, pc}
48entry:
  ; Skip everything (result 0) unless %n is strictly positive.
49  %cmp6 = icmp sgt i32 %n, 0
50  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
51
52for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and go to the scalar loop.
53  %min.iters.check = icmp ult i32 %n, 4
54  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
55
56vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n rounded down to a multiple of 4: the vector loop's element count.
57  %n.vec = and i32 %n, -4
58  br label %vector.body
59
60vector.body:                                      ; preds = %vector.body, %vector.ph
61  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Scalar running sum; the per-iteration vector reduction is folded into it.
62  %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
63  %0 = getelementptr inbounds i32, i32* %x, i32 %index
64  %1 = bitcast i32* %0 to <4 x i32>*
65  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
66  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
67  %3 = add i32 %2, %vec.phi
68  %index.next = add i32 %index, 4
69  %4 = icmp eq i32 %index.next, %n.vec
70  br i1 %4, label %middle.block, label %vector.body
71
72middle.block:                                     ; preds = %vector.body
  ; If the vector loop covered every element there is no remainder to process.
73  %cmp.n = icmp eq i32 %n.vec, %n
74  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
75
76for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Starting index and partial sum for the scalar loop, depending on entry path.
77  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
78  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %3, %middle.block ]
79  br label %for.body
80
81for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop: one element per iteration until index %n is reached.
82  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
83  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
84  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
85  %5 = load i32, i32* %arrayidx, align 4
86  %add = add nsw i32 %5, %r.07
87  %inc = add nuw nsw i32 %i.08, 1
88  %exitcond = icmp eq i32 %inc, %n
89  br i1 %exitcond, label %for.cond.cleanup, label %for.body
90
91for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Merge the three possible results: empty input, vector-only, or after remainder.
92  %r.0.lcssa = phi i32 [ 0, %entry ], [ %3, %middle.block ], [ %add, %for.body ]
93  ret i32 %r.0.lcssa
94}
95
; mul_i32: integer multiply reduction over the first %n elements of %x; the
; result is 1 when %n <= 0. The vector loop keeps a <4 x i32> accumulator
; (initialised to 1 in every lane) and the llvm.vector.reduce.mul call sits in
; %middle.block, after the loop. The expected assembly moves the four lanes to
; core registers with vmov and combines them with three scalar mul
; instructions.
96define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
97; CHECK-LABEL: mul_i32:
98; CHECK:       @ %bb.0: @ %entry
99; CHECK-NEXT:    .save {r4, lr}
100; CHECK-NEXT:    push {r4, lr}
101; CHECK-NEXT:    movs r2, #1
102; CHECK-NEXT:    cmp r1, #1
103; CHECK-NEXT:    blt .LBB1_8
104; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
105; CHECK-NEXT:    cmp r1, #4
106; CHECK-NEXT:    bhs .LBB1_3
107; CHECK-NEXT:  @ %bb.2:
108; CHECK-NEXT:    mov.w r12, #0
109; CHECK-NEXT:    b .LBB1_6
110; CHECK-NEXT:  .LBB1_3: @ %vector.ph
111; CHECK-NEXT:    bic r12, r1, #3
112; CHECK-NEXT:    vmov.i32 q0, #0x1
113; CHECK-NEXT:    sub.w r3, r12, #4
114; CHECK-NEXT:    add.w lr, r2, r3, lsr #2
115; CHECK-NEXT:    mov r2, r0
116; CHECK-NEXT:  .LBB1_4: @ %vector.body
117; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
118; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
119; CHECK-NEXT:    vmul.i32 q0, q1, q0
120; CHECK-NEXT:    le lr, .LBB1_4
121; CHECK-NEXT:  @ %bb.5: @ %middle.block
122; CHECK-NEXT:    vmov lr, r3, d1
123; CHECK-NEXT:    cmp r12, r1
124; CHECK-NEXT:    vmov r2, r4, d0
125; CHECK-NEXT:    mul r3, lr, r3
126; CHECK-NEXT:    mul r2, r4, r2
127; CHECK-NEXT:    mul r2, r3, r2
128; CHECK-NEXT:    beq .LBB1_8
129; CHECK-NEXT:  .LBB1_6: @ %for.body.preheader1
130; CHECK-NEXT:    sub.w lr, r1, r12
131; CHECK-NEXT:    add.w r0, r0, r12, lsl #2
132; CHECK-NEXT:  .LBB1_7: @ %for.body
133; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
134; CHECK-NEXT:    ldr r1, [r0], #4
135; CHECK-NEXT:    muls r2, r1, r2
136; CHECK-NEXT:    le lr, .LBB1_7
137; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
138; CHECK-NEXT:    mov r0, r2
139; CHECK-NEXT:    pop {r4, pc}
140entry:
  ; Skip everything (result 1) unless %n is strictly positive.
141  %cmp6 = icmp sgt i32 %n, 0
142  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
143
144for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and go to the scalar loop.
145  %min.iters.check = icmp ult i32 %n, 4
146  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
147
148vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n rounded down to a multiple of 4: the vector loop's element count.
149  %n.vec = and i32 %n, -4
150  br label %vector.body
151
152vector.body:                                      ; preds = %vector.body, %vector.ph
153  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Per-lane running product, seeded with the multiplicative identity.
154  %vec.phi = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ %2, %vector.body ]
155  %0 = getelementptr inbounds i32, i32* %x, i32 %index
156  %1 = bitcast i32* %0 to <4 x i32>*
157  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
158  %2 = mul <4 x i32> %wide.load, %vec.phi
159  %index.next = add i32 %index, 4
160  %3 = icmp eq i32 %index.next, %n.vec
161  br i1 %3, label %middle.block, label %vector.body
162
163middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane products into one scalar, then test for a remainder.
164  %4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2)
165  %cmp.n = icmp eq i32 %n.vec, %n
166  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
167
168for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Starting index and partial product for the scalar loop, depending on entry path.
169  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
170  %r.07.ph = phi i32 [ 1, %for.body.preheader ], [ %4, %middle.block ]
171  br label %for.body
172
173for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop: one element per iteration until index %n is reached.
174  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
175  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
176  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
177  %5 = load i32, i32* %arrayidx, align 4
  ; Despite its name, %add holds the running product here.
178  %add = mul nsw i32 %5, %r.07
179  %inc = add nuw nsw i32 %i.08, 1
180  %exitcond = icmp eq i32 %inc, %n
181  br i1 %exitcond, label %for.cond.cleanup, label %for.body
182
183for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Merge the three possible results: empty input, vector-only, or after remainder.
184  %r.0.lcssa = phi i32 [ 1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
185  ret i32 %r.0.lcssa
186}
187
; and_i32: bitwise AND reduction over the first %n elements of %x; the result
; is -1 (all bits set) when %n <= 0. The vector loop keeps a <4 x i32>
; accumulator seeded with -1 in every lane, and llvm.vector.reduce.and is
; called once in %middle.block. The expected assembly extracts the lanes with
; vmov and combines them with scalar and.w instructions.
188define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
189; CHECK-LABEL: and_i32:
190; CHECK:       @ %bb.0: @ %entry
191; CHECK-NEXT:    .save {r4, lr}
192; CHECK-NEXT:    push {r4, lr}
193; CHECK-NEXT:    cmp r1, #1
194; CHECK-NEXT:    blt .LBB2_3
195; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
196; CHECK-NEXT:    cmp r1, #4
197; CHECK-NEXT:    bhs .LBB2_4
198; CHECK-NEXT:  @ %bb.2:
199; CHECK-NEXT:    mov.w r2, #-1
200; CHECK-NEXT:    movs r3, #0
201; CHECK-NEXT:    b .LBB2_7
202; CHECK-NEXT:  .LBB2_3:
203; CHECK-NEXT:    mov.w r2, #-1
204; CHECK-NEXT:    b .LBB2_9
205; CHECK-NEXT:  .LBB2_4: @ %vector.ph
206; CHECK-NEXT:    bic r3, r1, #3
207; CHECK-NEXT:    movs r2, #1
208; CHECK-NEXT:    sub.w r12, r3, #4
209; CHECK-NEXT:    vmov.i8 q0, #0xff
210; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
211; CHECK-NEXT:    mov r2, r0
212; CHECK-NEXT:  .LBB2_5: @ %vector.body
213; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
214; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
215; CHECK-NEXT:    vand q0, q1, q0
216; CHECK-NEXT:    le lr, .LBB2_5
217; CHECK-NEXT:  @ %bb.6: @ %middle.block
218; CHECK-NEXT:    vmov lr, r12, d1
219; CHECK-NEXT:    cmp r3, r1
220; CHECK-NEXT:    vmov r2, r4, d0
221; CHECK-NEXT:    and.w r12, r12, lr
222; CHECK-NEXT:    and.w r2, r2, r4
223; CHECK-NEXT:    and.w r2, r2, r12
224; CHECK-NEXT:    beq .LBB2_9
225; CHECK-NEXT:  .LBB2_7: @ %for.body.preheader1
226; CHECK-NEXT:    sub.w lr, r1, r3
227; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
228; CHECK-NEXT:  .LBB2_8: @ %for.body
229; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
230; CHECK-NEXT:    ldr r1, [r0], #4
231; CHECK-NEXT:    ands r2, r1
232; CHECK-NEXT:    le lr, .LBB2_8
233; CHECK-NEXT:  .LBB2_9: @ %for.cond.cleanup
234; CHECK-NEXT:    mov r0, r2
235; CHECK-NEXT:    pop {r4, pc}
236entry:
  ; Skip everything (result -1) unless %n is strictly positive.
237  %cmp6 = icmp sgt i32 %n, 0
238  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
239
240for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and go to the scalar loop.
241  %min.iters.check = icmp ult i32 %n, 4
242  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
243
244vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n rounded down to a multiple of 4: the vector loop's element count.
245  %n.vec = and i32 %n, -4
246  br label %vector.body
247
248vector.body:                                      ; preds = %vector.body, %vector.ph
249  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Per-lane running AND, seeded with all-ones (the identity for AND).
250  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %2, %vector.body ]
251  %0 = getelementptr inbounds i32, i32* %x, i32 %index
252  %1 = bitcast i32* %0 to <4 x i32>*
253  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
254  %2 = and <4 x i32> %wide.load, %vec.phi
255  %index.next = add i32 %index, 4
256  %3 = icmp eq i32 %index.next, %n.vec
257  br i1 %3, label %middle.block, label %vector.body
258
259middle.block:                                     ; preds = %vector.body
  ; Collapse the four lanes into one scalar, then test for a remainder.
260  %4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2)
261  %cmp.n = icmp eq i32 %n.vec, %n
262  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
263
264for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Starting index and partial result for the scalar loop, depending on entry path.
265  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
266  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %4, %middle.block ]
267  br label %for.body
268
269for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop: one element per iteration until index %n is reached.
270  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
271  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
272  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
273  %5 = load i32, i32* %arrayidx, align 4
  ; Despite its name, %add holds the running AND here.
274  %add = and i32 %5, %r.07
275  %inc = add nuw nsw i32 %i.08, 1
276  %exitcond = icmp eq i32 %inc, %n
277  br i1 %exitcond, label %for.cond.cleanup, label %for.body
278
279for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Merge the three possible results: empty input, vector-only, or after remainder.
280  %r.0.lcssa = phi i32 [ -1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
281  ret i32 %r.0.lcssa
282}
283
; or_i32: bitwise OR reduction over the first %n elements of %x; the result is
; 0 when %n <= 0. The vector loop keeps a zero-initialised <4 x i32>
; accumulator, and llvm.vector.reduce.or is called once in %middle.block. The
; expected assembly extracts the lanes with vmov and combines them with scalar
; orr.w instructions.
284define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
285; CHECK-LABEL: or_i32:
286; CHECK:       @ %bb.0: @ %entry
287; CHECK-NEXT:    .save {r4, lr}
288; CHECK-NEXT:    push {r4, lr}
289; CHECK-NEXT:    cmp r1, #1
290; CHECK-NEXT:    blt .LBB3_3
291; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
292; CHECK-NEXT:    cmp r1, #4
293; CHECK-NEXT:    bhs .LBB3_4
294; CHECK-NEXT:  @ %bb.2:
295; CHECK-NEXT:    movs r3, #0
296; CHECK-NEXT:    movs r2, #0
297; CHECK-NEXT:    b .LBB3_7
298; CHECK-NEXT:  .LBB3_3:
299; CHECK-NEXT:    movs r2, #0
300; CHECK-NEXT:    b .LBB3_9
301; CHECK-NEXT:  .LBB3_4: @ %vector.ph
302; CHECK-NEXT:    bic r3, r1, #3
303; CHECK-NEXT:    movs r2, #1
304; CHECK-NEXT:    sub.w r12, r3, #4
305; CHECK-NEXT:    vmov.i32 q0, #0x0
306; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
307; CHECK-NEXT:    mov r2, r0
308; CHECK-NEXT:  .LBB3_5: @ %vector.body
309; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
310; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
311; CHECK-NEXT:    vorr q0, q1, q0
312; CHECK-NEXT:    le lr, .LBB3_5
313; CHECK-NEXT:  @ %bb.6: @ %middle.block
314; CHECK-NEXT:    vmov lr, r12, d1
315; CHECK-NEXT:    cmp r3, r1
316; CHECK-NEXT:    vmov r2, r4, d0
317; CHECK-NEXT:    orr.w r12, r12, lr
318; CHECK-NEXT:    orr.w r2, r2, r4
319; CHECK-NEXT:    orr.w r2, r2, r12
320; CHECK-NEXT:    beq .LBB3_9
321; CHECK-NEXT:  .LBB3_7: @ %for.body.preheader1
322; CHECK-NEXT:    sub.w lr, r1, r3
323; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
324; CHECK-NEXT:  .LBB3_8: @ %for.body
325; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
326; CHECK-NEXT:    ldr r1, [r0], #4
327; CHECK-NEXT:    orrs r2, r1
328; CHECK-NEXT:    le lr, .LBB3_8
329; CHECK-NEXT:  .LBB3_9: @ %for.cond.cleanup
330; CHECK-NEXT:    mov r0, r2
331; CHECK-NEXT:    pop {r4, pc}
332entry:
  ; Skip everything (result 0) unless %n is strictly positive.
333  %cmp6 = icmp sgt i32 %n, 0
334  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
335
336for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and go to the scalar loop.
337  %min.iters.check = icmp ult i32 %n, 4
338  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
339
340vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n rounded down to a multiple of 4: the vector loop's element count.
341  %n.vec = and i32 %n, -4
342  br label %vector.body
343
344vector.body:                                      ; preds = %vector.body, %vector.ph
345  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Per-lane running OR, seeded with zero (the identity for OR).
346  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
347  %0 = getelementptr inbounds i32, i32* %x, i32 %index
348  %1 = bitcast i32* %0 to <4 x i32>*
349  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
350  %2 = or <4 x i32> %wide.load, %vec.phi
351  %index.next = add i32 %index, 4
352  %3 = icmp eq i32 %index.next, %n.vec
353  br i1 %3, label %middle.block, label %vector.body
354
355middle.block:                                     ; preds = %vector.body
  ; Collapse the four lanes into one scalar, then test for a remainder.
356  %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2)
357  %cmp.n = icmp eq i32 %n.vec, %n
358  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
359
360for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Starting index and partial result for the scalar loop, depending on entry path.
361  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
362  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
363  br label %for.body
364
365for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop: one element per iteration until index %n is reached.
366  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
367  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
368  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
369  %5 = load i32, i32* %arrayidx, align 4
  ; Despite its name, %add holds the running OR here.
370  %add = or i32 %5, %r.07
371  %inc = add nuw nsw i32 %i.08, 1
372  %exitcond = icmp eq i32 %inc, %n
373  br i1 %exitcond, label %for.cond.cleanup, label %for.body
374
375for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Merge the three possible results: empty input, vector-only, or after remainder.
376  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
377  ret i32 %r.0.lcssa
378}
379
; xor_i32: bitwise XOR reduction over the first %n elements of %x; the result
; is 0 when %n <= 0. The vector loop keeps a zero-initialised <4 x i32>
; accumulator, and llvm.vector.reduce.xor is called once in %middle.block. The
; expected assembly extracts the lanes with vmov and combines them with scalar
; eor.w instructions.
380define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
381; CHECK-LABEL: xor_i32:
382; CHECK:       @ %bb.0: @ %entry
383; CHECK-NEXT:    .save {r4, lr}
384; CHECK-NEXT:    push {r4, lr}
385; CHECK-NEXT:    cmp r1, #1
386; CHECK-NEXT:    blt .LBB4_3
387; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
388; CHECK-NEXT:    cmp r1, #4
389; CHECK-NEXT:    bhs .LBB4_4
390; CHECK-NEXT:  @ %bb.2:
391; CHECK-NEXT:    movs r3, #0
392; CHECK-NEXT:    movs r2, #0
393; CHECK-NEXT:    b .LBB4_7
394; CHECK-NEXT:  .LBB4_3:
395; CHECK-NEXT:    movs r2, #0
396; CHECK-NEXT:    b .LBB4_9
397; CHECK-NEXT:  .LBB4_4: @ %vector.ph
398; CHECK-NEXT:    bic r3, r1, #3
399; CHECK-NEXT:    movs r2, #1
400; CHECK-NEXT:    sub.w r12, r3, #4
401; CHECK-NEXT:    vmov.i32 q0, #0x0
402; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
403; CHECK-NEXT:    mov r2, r0
404; CHECK-NEXT:  .LBB4_5: @ %vector.body
405; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
406; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
407; CHECK-NEXT:    veor q0, q1, q0
408; CHECK-NEXT:    le lr, .LBB4_5
409; CHECK-NEXT:  @ %bb.6: @ %middle.block
410; CHECK-NEXT:    vmov lr, r12, d1
411; CHECK-NEXT:    cmp r3, r1
412; CHECK-NEXT:    vmov r2, r4, d0
413; CHECK-NEXT:    eor.w r12, r12, lr
414; CHECK-NEXT:    eor.w r2, r2, r4
415; CHECK-NEXT:    eor.w r2, r2, r12
416; CHECK-NEXT:    beq .LBB4_9
417; CHECK-NEXT:  .LBB4_7: @ %for.body.preheader1
418; CHECK-NEXT:    sub.w lr, r1, r3
419; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
420; CHECK-NEXT:  .LBB4_8: @ %for.body
421; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
422; CHECK-NEXT:    ldr r1, [r0], #4
423; CHECK-NEXT:    eors r2, r1
424; CHECK-NEXT:    le lr, .LBB4_8
425; CHECK-NEXT:  .LBB4_9: @ %for.cond.cleanup
426; CHECK-NEXT:    mov r0, r2
427; CHECK-NEXT:    pop {r4, pc}
428entry:
  ; Skip everything (result 0) unless %n is strictly positive.
429  %cmp6 = icmp sgt i32 %n, 0
430  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
431
432for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and go to the scalar loop.
433  %min.iters.check = icmp ult i32 %n, 4
434  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
435
436vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n rounded down to a multiple of 4: the vector loop's element count.
437  %n.vec = and i32 %n, -4
438  br label %vector.body
439
440vector.body:                                      ; preds = %vector.body, %vector.ph
441  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Per-lane running XOR, seeded with zero (the identity for XOR).
442  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
443  %0 = getelementptr inbounds i32, i32* %x, i32 %index
444  %1 = bitcast i32* %0 to <4 x i32>*
445  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
446  %2 = xor <4 x i32> %wide.load, %vec.phi
447  %index.next = add i32 %index, 4
448  %3 = icmp eq i32 %index.next, %n.vec
449  br i1 %3, label %middle.block, label %vector.body
450
451middle.block:                                     ; preds = %vector.body
  ; Collapse the four lanes into one scalar, then test for a remainder.
452  %4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2)
453  %cmp.n = icmp eq i32 %n.vec, %n
454  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
455
456for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Starting index and partial result for the scalar loop, depending on entry path.
457  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
458  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
459  br label %for.body
460
461for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop: one element per iteration until index %n is reached.
462  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
463  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
464  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
465  %5 = load i32, i32* %arrayidx, align 4
  ; Despite its name, %add holds the running XOR here.
466  %add = xor i32 %5, %r.07
467  %inc = add nuw nsw i32 %i.08, 1
468  %exitcond = icmp eq i32 %inc, %n
469  br i1 %exitcond, label %for.cond.cleanup, label %for.body
470
471for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Merge the three possible results: empty input, vector-only, or after remainder.
472  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
473  ret i32 %r.0.lcssa
474}
475
; fadd_f32: floating-point add reduction over the first %n floats of %x; the
; result is 0.0 when %n <= 0. All fadd operations and the reduction call carry
; the `fast` flag. The vector loop keeps a zero-initialised <4 x float>
; accumulator, and llvm.vector.reduce.fadd (start value 0.0) is called once in
; %middle.block. The expected assembly reduces the four lanes with three scalar
; vadd.f32 instructions, and the scalar-only paths load 0.0 from the constant
; pool entry .LCPI5_0.
476define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
477; CHECK-LABEL: fadd_f32:
478; CHECK:       @ %bb.0: @ %entry
479; CHECK-NEXT:    .save {r7, lr}
480; CHECK-NEXT:    push {r7, lr}
481; CHECK-NEXT:    cmp r1, #1
482; CHECK-NEXT:    blt .LBB5_3
483; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
484; CHECK-NEXT:    cmp r1, #4
485; CHECK-NEXT:    bhs .LBB5_4
486; CHECK-NEXT:  @ %bb.2:
487; CHECK-NEXT:    vldr s0, .LCPI5_0
488; CHECK-NEXT:    movs r2, #0
489; CHECK-NEXT:    b .LBB5_7
490; CHECK-NEXT:  .LBB5_3:
491; CHECK-NEXT:    vldr s0, .LCPI5_0
492; CHECK-NEXT:    b .LBB5_9
493; CHECK-NEXT:  .LBB5_4: @ %vector.ph
494; CHECK-NEXT:    bic r2, r1, #3
495; CHECK-NEXT:    movs r3, #1
496; CHECK-NEXT:    sub.w r12, r2, #4
497; CHECK-NEXT:    vmov.i32 q0, #0x0
498; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
499; CHECK-NEXT:    mov r3, r0
500; CHECK-NEXT:  .LBB5_5: @ %vector.body
501; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
502; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
503; CHECK-NEXT:    vadd.f32 q0, q1, q0
504; CHECK-NEXT:    le lr, .LBB5_5
505; CHECK-NEXT:  @ %bb.6: @ %middle.block
506; CHECK-NEXT:    vadd.f32 s4, s2, s3
507; CHECK-NEXT:    cmp r2, r1
508; CHECK-NEXT:    vadd.f32 s0, s0, s1
509; CHECK-NEXT:    vadd.f32 s0, s0, s4
510; CHECK-NEXT:    beq .LBB5_9
511; CHECK-NEXT:  .LBB5_7: @ %for.body.preheader1
512; CHECK-NEXT:    sub.w lr, r1, r2
513; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
514; CHECK-NEXT:  .LBB5_8: @ %for.body
515; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
516; CHECK-NEXT:    vldmia r0!, {s2}
517; CHECK-NEXT:    vadd.f32 s0, s2, s0
518; CHECK-NEXT:    le lr, .LBB5_8
519; CHECK-NEXT:  .LBB5_9: @ %for.cond.cleanup
520; CHECK-NEXT:    vmov r0, s0
521; CHECK-NEXT:    pop {r7, pc}
522; CHECK-NEXT:    .p2align 2
523; CHECK-NEXT:  @ %bb.10:
524; CHECK-NEXT:  .LCPI5_0:
525; CHECK-NEXT:    .long 0x00000000 @ float 0
526entry:
  ; Skip everything (result 0.0) unless %n is strictly positive.
527  %cmp6 = icmp sgt i32 %n, 0
528  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
529
530for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and go to the scalar loop.
531  %min.iters.check = icmp ult i32 %n, 4
532  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
533
534vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n rounded down to a multiple of 4: the vector loop's element count.
535  %n.vec = and i32 %n, -4
536  br label %vector.body
537
538vector.body:                                      ; preds = %vector.body, %vector.ph
539  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Per-lane running sum, seeded with zero.
540  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
541  %0 = getelementptr inbounds float, float* %x, i32 %index
542  %1 = bitcast float* %0 to <4 x float>*
543  %wide.load = load <4 x float>, <4 x float>* %1, align 4
544  %2 = fadd fast <4 x float> %wide.load, %vec.phi
545  %index.next = add i32 %index, 4
546  %3 = icmp eq i32 %index.next, %n.vec
547  br i1 %3, label %middle.block, label %vector.body
548
549middle.block:                                     ; preds = %vector.body
  ; Collapse the four lanes into one scalar, then test for a remainder.
550  %4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
551  %cmp.n = icmp eq i32 %n.vec, %n
552  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
553
554for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Starting index and partial sum for the scalar loop, depending on entry path.
555  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
556  %r.07.ph = phi float [ 0.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
557  br label %for.body
558
559for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop: one element per iteration until index %n is reached.
560  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
561  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
562  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
563  %5 = load float, float* %arrayidx, align 4
564  %add = fadd fast float %5, %r.07
565  %inc = add nuw nsw i32 %i.08, 1
566  %exitcond = icmp eq i32 %inc, %n
567  br i1 %exitcond, label %for.cond.cleanup, label %for.body
568
569for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Merge the three possible results: empty input, vector-only, or after remainder.
570  %r.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
571  ret float %r.0.lcssa
572}
573
; fmul_f32: floating-point multiply reduction over the first %n floats of %x;
; the result is 1.0 when %n <= 0. All fmul operations and the reduction call
; carry the `fast` flag. The vector loop keeps a <4 x float> accumulator seeded
; with 1.0 in every lane, and llvm.vector.reduce.fmul (start value 1.0) is
; called once in %middle.block. The expected assembly reduces the four lanes
; with three scalar vmul.f32 instructions.
574define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
575; CHECK-LABEL: fmul_f32:
576; CHECK:       @ %bb.0: @ %entry
577; CHECK-NEXT:    .save {r7, lr}
578; CHECK-NEXT:    push {r7, lr}
579; CHECK-NEXT:    cmp r1, #1
580; CHECK-NEXT:    blt .LBB6_3
581; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
582; CHECK-NEXT:    cmp r1, #4
583; CHECK-NEXT:    bhs .LBB6_4
584; CHECK-NEXT:  @ %bb.2:
585; CHECK-NEXT:    vmov.f32 s0, #1.000000e+00
586; CHECK-NEXT:    movs r2, #0
587; CHECK-NEXT:    b .LBB6_7
588; CHECK-NEXT:  .LBB6_3:
589; CHECK-NEXT:    vmov.f32 s0, #1.000000e+00
590; CHECK-NEXT:    b .LBB6_9
591; CHECK-NEXT:  .LBB6_4: @ %vector.ph
592; CHECK-NEXT:    bic r2, r1, #3
593; CHECK-NEXT:    movs r3, #1
594; CHECK-NEXT:    sub.w r12, r2, #4
595; CHECK-NEXT:    vmov.f32 q0, #1.000000e+00
596; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
597; CHECK-NEXT:    mov r3, r0
598; CHECK-NEXT:  .LBB6_5: @ %vector.body
599; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
600; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
601; CHECK-NEXT:    vmul.f32 q0, q1, q0
602; CHECK-NEXT:    le lr, .LBB6_5
603; CHECK-NEXT:  @ %bb.6: @ %middle.block
604; CHECK-NEXT:    vmul.f32 s4, s2, s3
605; CHECK-NEXT:    cmp r2, r1
606; CHECK-NEXT:    vmul.f32 s0, s0, s1
607; CHECK-NEXT:    vmul.f32 s0, s0, s4
608; CHECK-NEXT:    beq .LBB6_9
609; CHECK-NEXT:  .LBB6_7: @ %for.body.preheader1
610; CHECK-NEXT:    sub.w lr, r1, r2
611; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
612; CHECK-NEXT:  .LBB6_8: @ %for.body
613; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
614; CHECK-NEXT:    vldmia r0!, {s2}
615; CHECK-NEXT:    vmul.f32 s0, s2, s0
616; CHECK-NEXT:    le lr, .LBB6_8
617; CHECK-NEXT:  .LBB6_9: @ %for.cond.cleanup
618; CHECK-NEXT:    vmov r0, s0
619; CHECK-NEXT:    pop {r7, pc}
620entry:
  ; Skip everything (result 1.0) unless %n is strictly positive.
621  %cmp6 = icmp sgt i32 %n, 0
622  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
623
624for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and go to the scalar loop.
625  %min.iters.check = icmp ult i32 %n, 4
626  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
627
628vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n rounded down to a multiple of 4: the vector loop's element count.
629  %n.vec = and i32 %n, -4
630  br label %vector.body
631
632vector.body:                                      ; preds = %vector.body, %vector.ph
633  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Per-lane running product, seeded with the multiplicative identity.
634  %vec.phi = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ %2, %vector.body ]
635  %0 = getelementptr inbounds float, float* %x, i32 %index
636  %1 = bitcast float* %0 to <4 x float>*
637  %wide.load = load <4 x float>, <4 x float>* %1, align 4
638  %2 = fmul fast <4 x float> %wide.load, %vec.phi
639  %index.next = add i32 %index, 4
640  %3 = icmp eq i32 %index.next, %n.vec
641  br i1 %3, label %middle.block, label %vector.body
642
643middle.block:                                     ; preds = %vector.body
  ; Collapse the four lanes into one scalar, then test for a remainder.
644  %4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
645  %cmp.n = icmp eq i32 %n.vec, %n
646  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
647
648for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Starting index and partial product for the scalar loop, depending on entry path.
649  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
650  %r.07.ph = phi float [ 1.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
651  br label %for.body
652
653for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop: one element per iteration until index %n is reached.
654  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
655  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
656  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
657  %5 = load float, float* %arrayidx, align 4
  ; Despite its name, %add holds the running product here.
658  %add = fmul fast float %5, %r.07
659  %inc = add nuw nsw i32 %i.08, 1
660  %exitcond = icmp eq i32 %inc, %n
661  br i1 %exitcond, label %for.cond.cleanup, label %for.body
662
663for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Merge the three possible results: empty input, vector-only, or after remainder.
664  %r.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
665  ret float %r.0.lcssa
666}
667
; smin_i32: signed minimum reduction over the first %n elements of %x; the
; result is 2147483647 (the largest signed i32) when %n <= 0. The vector loop
; computes a per-lane minimum with an icmp slt + select pair, and
; llvm.vector.reduce.smin is called once in %middle.block. The expected
; assembly uses vmin.s32 in the loop and vminv.s32 for the final reduction,
; with r2 first set to 0x7fffffff via mvn. The scalar loop is expected to use
; cmp + csel.
668define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
669; CHECK-LABEL: smin_i32:
670; CHECK:       @ %bb.0: @ %entry
671; CHECK-NEXT:    .save {r7, lr}
672; CHECK-NEXT:    push {r7, lr}
673; CHECK-NEXT:    cmp r1, #1
674; CHECK-NEXT:    blt .LBB7_3
675; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
676; CHECK-NEXT:    cmp r1, #4
677; CHECK-NEXT:    bhs .LBB7_4
678; CHECK-NEXT:  @ %bb.2:
679; CHECK-NEXT:    mvn r2, #-2147483648
680; CHECK-NEXT:    movs r3, #0
681; CHECK-NEXT:    b .LBB7_7
682; CHECK-NEXT:  .LBB7_3:
683; CHECK-NEXT:    mvn r2, #-2147483648
684; CHECK-NEXT:    b .LBB7_9
685; CHECK-NEXT:  .LBB7_4: @ %vector.ph
686; CHECK-NEXT:    bic r3, r1, #3
687; CHECK-NEXT:    movs r2, #1
688; CHECK-NEXT:    sub.w r12, r3, #4
689; CHECK-NEXT:    vmvn.i32 q0, #0x80000000
690; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
691; CHECK-NEXT:    mov r2, r0
692; CHECK-NEXT:  .LBB7_5: @ %vector.body
693; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
694; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
695; CHECK-NEXT:    vmin.s32 q0, q0, q1
696; CHECK-NEXT:    le lr, .LBB7_5
697; CHECK-NEXT:  @ %bb.6: @ %middle.block
698; CHECK-NEXT:    mvn r2, #-2147483648
699; CHECK-NEXT:    cmp r3, r1
700; CHECK-NEXT:    vminv.s32 r2, q0
701; CHECK-NEXT:    beq .LBB7_9
702; CHECK-NEXT:  .LBB7_7: @ %for.body.preheader1
703; CHECK-NEXT:    sub.w lr, r1, r3
704; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
705; CHECK-NEXT:  .LBB7_8: @ %for.body
706; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
707; CHECK-NEXT:    ldr r1, [r0], #4
708; CHECK-NEXT:    cmp r2, r1
709; CHECK-NEXT:    csel r2, r2, r1, lt
710; CHECK-NEXT:    le lr, .LBB7_8
711; CHECK-NEXT:  .LBB7_9: @ %for.cond.cleanup
712; CHECK-NEXT:    mov r0, r2
713; CHECK-NEXT:    pop {r7, pc}
714entry:
  ; Skip everything (result 2147483647) unless %n is strictly positive.
715  %cmp6 = icmp sgt i32 %n, 0
716  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
717
718for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and go to the scalar loop.
719  %min.iters.check = icmp ult i32 %n, 4
720  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
721
722vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n rounded down to a multiple of 4: the vector loop's element count.
723  %n.vec = and i32 %n, -4
724  br label %vector.body
725
726vector.body:                                      ; preds = %vector.body, %vector.ph
727  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Per-lane running minimum, seeded with the largest signed i32 value.
728  %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %vector.ph ], [ %3, %vector.body ]
729  %0 = getelementptr inbounds i32, i32* %x, i32 %index
730  %1 = bitcast i32* %0 to <4 x i32>*
731  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Keep the accumulator lane where it is smaller, otherwise take the loaded lane.
732  %2 = icmp slt <4 x i32> %vec.phi, %wide.load
733  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
734  %index.next = add i32 %index, 4
735  %4 = icmp eq i32 %index.next, %n.vec
736  br i1 %4, label %middle.block, label %vector.body
737
738middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane minima into one scalar, then test for a remainder.
739  %5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3)
740  %cmp.n = icmp eq i32 %n.vec, %n
741  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
742
743for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Starting index and partial minimum for the scalar loop, depending on entry path.
744  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
745  %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
746  br label %for.body
747
748for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar remainder loop: one element per iteration until index %n is reached.
749  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
750  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
751  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
752  %6 = load i32, i32* %arrayidx, align 4
  ; Despite its name, %add holds the running signed minimum here.
753  %c = icmp slt i32 %r.07, %6
754  %add = select i1 %c, i32 %r.07, i32 %6
755  %inc = add nuw nsw i32 %i.08, 1
756  %exitcond = icmp eq i32 %inc, %n
757  br i1 %exitcond, label %for.cond.cleanup, label %for.body
758
759for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Merge the three possible results: empty input, vector-only, or after remainder.
760  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
761  ret i32 %r.0.lcssa
762}
763
; smin_i32_inloop: signed-minimum reduction over the %n i32 elements at %x,
; with the reduction performed inside the vector loop. Each iteration reduces
; the loaded <4 x i32> to a scalar via llvm.vector.reduce.smin and folds it
; into an i32 accumulator that starts at 2147483647 (INT32_MAX). The expected
; assembly has a loop body of one load plus one accumulating
; `vminv.s32 r0, q0`, and middle.block only compares and conditionally returns.
764define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
765; CHECK-LABEL: smin_i32_inloop:
766; CHECK:       @ %bb.0: @ %entry
767; CHECK-NEXT:    .save {r7, lr}
768; CHECK-NEXT:    push {r7, lr}
769; CHECK-NEXT:    cmp r1, #1
770; CHECK-NEXT:    blt .LBB8_3
771; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
772; CHECK-NEXT:    mov r12, r0
773; CHECK-NEXT:    cmp r1, #4
774; CHECK-NEXT:    bhs .LBB8_4
775; CHECK-NEXT:  @ %bb.2:
776; CHECK-NEXT:    mvn r0, #-2147483648
777; CHECK-NEXT:    movs r3, #0
778; CHECK-NEXT:    b .LBB8_7
779; CHECK-NEXT:  .LBB8_3:
780; CHECK-NEXT:    mvn r0, #-2147483648
781; CHECK-NEXT:    b .LBB8_9
782; CHECK-NEXT:  .LBB8_4: @ %vector.ph
783; CHECK-NEXT:    bic r3, r1, #3
784; CHECK-NEXT:    movs r2, #1
785; CHECK-NEXT:    subs r0, r3, #4
786; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
787; CHECK-NEXT:    mvn r0, #-2147483648
788; CHECK-NEXT:    mov r2, r12
789; CHECK-NEXT:  .LBB8_5: @ %vector.body
790; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
791; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
792; CHECK-NEXT:    vminv.s32 r0, q0
793; CHECK-NEXT:    le lr, .LBB8_5
794; CHECK-NEXT:  @ %bb.6: @ %middle.block
795; CHECK-NEXT:    cmp r3, r1
796; CHECK-NEXT:    it eq
797; CHECK-NEXT:    popeq {r7, pc}
798; CHECK-NEXT:  .LBB8_7: @ %for.body.preheader1
799; CHECK-NEXT:    sub.w lr, r1, r3
800; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
801; CHECK-NEXT:  .LBB8_8: @ %for.body
802; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
803; CHECK-NEXT:    ldr r1, [r2], #4
804; CHECK-NEXT:    cmp r0, r1
805; CHECK-NEXT:    csel r0, r0, r1, lt
806; CHECK-NEXT:    le lr, .LBB8_8
807; CHECK-NEXT:  .LBB8_9: @ %for.cond.cleanup
808; CHECK-NEXT:    pop {r7, pc}
809entry:
  ; %n <= 0: return the identity (2147483647) without loading anything.
810  %cmp6 = icmp sgt i32 %n, 0
811  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
812
813for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
814  %min.iters.check = icmp ult i32 %n, 4
815  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
816
817vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = %n with the low two bits cleared: elements covered by whole
  ; 4-lane vectors.
818  %n.vec = and i32 %n, -4
819  br label %vector.body
820
821vector.body:                                      ; preds = %vector.body, %vector.ph
822  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; The accumulator is a scalar i32, not a vector: that is what makes this the
  ; in-loop form of the reduction.
823  %vec.phi = phi i32 [ 2147483647, %vector.ph ], [ %3, %vector.body ]
824  %0 = getelementptr inbounds i32, i32* %x, i32 %index
825  %1 = bitcast i32* %0 to <4 x i32>*
826  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Reduce the four loaded lanes, then keep the smaller of the accumulator and
  ; the lane minimum.
827  %l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
828  %2 = icmp slt i32 %vec.phi, %l5
829  %3 = select i1 %2, i32 %vec.phi, i32 %l5
830  %index.next = add i32 %index, 4
831  %4 = icmp eq i32 %index.next, %n.vec
832  br i1 %4, label %middle.block, label %vector.body
833
834middle.block:                                     ; preds = %vector.body
  ; Single-input phi that forwards the loop's accumulator out of vector.body.
835  %5 = phi i32 [ %3, %vector.body ]
836  %cmp.n = icmp eq i32 %n.vec, %n
837  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
838
839for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; The scalar loop starts at index 0 with the identity, or at %n.vec with the
  ; vector loop's result.
840  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
841  %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
842  br label %for.body
843
844for.body:                                         ; preds = %for.body.preheader1, %for.body
845  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
846  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
847  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
848  %6 = load i32, i32* %arrayidx, align 4
  ; Scalar tail: despite its name, %add is the running signed minimum.
849  %c = icmp slt i32 %r.07, %6
850  %add = select i1 %c, i32 %r.07, i32 %6
851  %inc = add nuw nsw i32 %i.08, 1
852  %exitcond = icmp eq i32 %inc, %n
853  br i1 %exitcond, label %for.cond.cleanup, label %for.body
854
855for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result from whichever path arrived: identity, vector loop only, or scalar tail.
856  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
857  ret i32 %r.0.lcssa
858}
859
; smax_i32: signed-maximum reduction over the %n i32 elements at %x, with the
; reduction performed after the vector loop. The loop keeps a <4 x i32>
; accumulator (every lane starts at -2147483648, INT32_MIN) updated by a
; per-lane compare+select; llvm.vector.reduce.smax is called once in
; middle.block. The expected assembly has `vmax.s32 q0, q0, q1` in the loop
; and a single `vmaxv.s32 r2, q0` after it.
860define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
861; CHECK-LABEL: smax_i32:
862; CHECK:       @ %bb.0: @ %entry
863; CHECK-NEXT:    .save {r7, lr}
864; CHECK-NEXT:    push {r7, lr}
865; CHECK-NEXT:    cmp r1, #1
866; CHECK-NEXT:    blt .LBB9_3
867; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
868; CHECK-NEXT:    cmp r1, #4
869; CHECK-NEXT:    bhs .LBB9_4
870; CHECK-NEXT:  @ %bb.2:
871; CHECK-NEXT:    mov.w r2, #-2147483648
872; CHECK-NEXT:    movs r3, #0
873; CHECK-NEXT:    b .LBB9_7
874; CHECK-NEXT:  .LBB9_3:
875; CHECK-NEXT:    mov.w r2, #-2147483648
876; CHECK-NEXT:    b .LBB9_9
877; CHECK-NEXT:  .LBB9_4: @ %vector.ph
878; CHECK-NEXT:    bic r3, r1, #3
879; CHECK-NEXT:    movs r2, #1
880; CHECK-NEXT:    sub.w r12, r3, #4
881; CHECK-NEXT:    vmov.i32 q0, #0x80000000
882; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
883; CHECK-NEXT:    mov r2, r0
884; CHECK-NEXT:  .LBB9_5: @ %vector.body
885; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
886; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
887; CHECK-NEXT:    vmax.s32 q0, q0, q1
888; CHECK-NEXT:    le lr, .LBB9_5
889; CHECK-NEXT:  @ %bb.6: @ %middle.block
890; CHECK-NEXT:    mov.w r2, #-2147483648
891; CHECK-NEXT:    cmp r3, r1
892; CHECK-NEXT:    vmaxv.s32 r2, q0
893; CHECK-NEXT:    beq .LBB9_9
894; CHECK-NEXT:  .LBB9_7: @ %for.body.preheader1
895; CHECK-NEXT:    sub.w lr, r1, r3
896; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
897; CHECK-NEXT:  .LBB9_8: @ %for.body
898; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
899; CHECK-NEXT:    ldr r1, [r0], #4
900; CHECK-NEXT:    cmp r2, r1
901; CHECK-NEXT:    csel r2, r2, r1, gt
902; CHECK-NEXT:    le lr, .LBB9_8
903; CHECK-NEXT:  .LBB9_9: @ %for.cond.cleanup
904; CHECK-NEXT:    mov r0, r2
905; CHECK-NEXT:    pop {r7, pc}
906entry:
  ; %n <= 0: return the identity (-2147483648) without loading anything.
907  %cmp6 = icmp sgt i32 %n, 0
908  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
909
910for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
911  %min.iters.check = icmp ult i32 %n, 4
912  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
913
914vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = %n with the low two bits cleared: elements covered by whole
  ; 4-lane vectors.
915  %n.vec = and i32 %n, -4
916  br label %vector.body
917
918vector.body:                                      ; preds = %vector.body, %vector.ph
919  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Vector accumulator: each lane tracks the maximum seen in its own column.
920  %vec.phi = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %vector.ph ], [ %3, %vector.body ]
921  %0 = getelementptr inbounds i32, i32* %x, i32 %index
922  %1 = bitcast i32* %0 to <4 x i32>*
923  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Per-lane signed max, written as compare+select.
924  %2 = icmp sgt <4 x i32> %vec.phi, %wide.load
925  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
926  %index.next = add i32 %index, 4
927  %4 = icmp eq i32 %index.next, %n.vec
928  br i1 %4, label %middle.block, label %vector.body
929
930middle.block:                                     ; preds = %vector.body
  ; Collapse the four per-lane maxima into one scalar, once, after the loop.
931  %5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3)
932  %cmp.n = icmp eq i32 %n.vec, %n
933  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
934
935for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; The scalar loop starts at index 0 with the identity, or at %n.vec with the
  ; reduced vector result.
936  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
937  %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
938  br label %for.body
939
940for.body:                                         ; preds = %for.body.preheader1, %for.body
941  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
942  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
943  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
944  %6 = load i32, i32* %arrayidx, align 4
  ; Scalar tail: despite its name, %add is the running signed maximum.
945  %c = icmp sgt i32 %r.07, %6
946  %add = select i1 %c, i32 %r.07, i32 %6
947  %inc = add nuw nsw i32 %i.08, 1
948  %exitcond = icmp eq i32 %inc, %n
949  br i1 %exitcond, label %for.cond.cleanup, label %for.body
950
951for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result from whichever path arrived: identity, vector loop only, or scalar tail.
952  %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
953  ret i32 %r.0.lcssa
954}
955
; smax_i32_inloop: signed-maximum reduction over the %n i32 elements at %x,
; with the reduction performed inside the vector loop. Each iteration reduces
; the loaded <4 x i32> to a scalar via llvm.vector.reduce.smax and folds it
; into an i32 accumulator that starts at -2147483648 (INT32_MIN). The expected
; assembly has a loop body of one load plus one accumulating
; `vmaxv.s32 r0, q0`, and middle.block only compares and conditionally returns.
956define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
957; CHECK-LABEL: smax_i32_inloop:
958; CHECK:       @ %bb.0: @ %entry
959; CHECK-NEXT:    .save {r7, lr}
960; CHECK-NEXT:    push {r7, lr}
961; CHECK-NEXT:    cmp r1, #1
962; CHECK-NEXT:    blt .LBB10_3
963; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
964; CHECK-NEXT:    mov r12, r0
965; CHECK-NEXT:    cmp r1, #4
966; CHECK-NEXT:    bhs .LBB10_4
967; CHECK-NEXT:  @ %bb.2:
968; CHECK-NEXT:    mov.w r0, #-2147483648
969; CHECK-NEXT:    movs r3, #0
970; CHECK-NEXT:    b .LBB10_7
971; CHECK-NEXT:  .LBB10_3:
972; CHECK-NEXT:    mov.w r0, #-2147483648
973; CHECK-NEXT:    b .LBB10_9
974; CHECK-NEXT:  .LBB10_4: @ %vector.ph
975; CHECK-NEXT:    bic r3, r1, #3
976; CHECK-NEXT:    movs r2, #1
977; CHECK-NEXT:    subs r0, r3, #4
978; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
979; CHECK-NEXT:    mov.w r0, #-2147483648
980; CHECK-NEXT:    mov r2, r12
981; CHECK-NEXT:  .LBB10_5: @ %vector.body
982; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
983; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
984; CHECK-NEXT:    vmaxv.s32 r0, q0
985; CHECK-NEXT:    le lr, .LBB10_5
986; CHECK-NEXT:  @ %bb.6: @ %middle.block
987; CHECK-NEXT:    cmp r3, r1
988; CHECK-NEXT:    it eq
989; CHECK-NEXT:    popeq {r7, pc}
990; CHECK-NEXT:  .LBB10_7: @ %for.body.preheader1
991; CHECK-NEXT:    sub.w lr, r1, r3
992; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
993; CHECK-NEXT:  .LBB10_8: @ %for.body
994; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
995; CHECK-NEXT:    ldr r1, [r2], #4
996; CHECK-NEXT:    cmp r0, r1
997; CHECK-NEXT:    csel r0, r0, r1, gt
998; CHECK-NEXT:    le lr, .LBB10_8
999; CHECK-NEXT:  .LBB10_9: @ %for.cond.cleanup
1000; CHECK-NEXT:    pop {r7, pc}
1001entry:
  ; %n <= 0: return the identity (-2147483648) without loading anything.
1002  %cmp6 = icmp sgt i32 %n, 0
1003  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1004
1005for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1006  %min.iters.check = icmp ult i32 %n, 4
1007  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1008
1009vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = %n with the low two bits cleared: elements covered by whole
  ; 4-lane vectors.
1010  %n.vec = and i32 %n, -4
1011  br label %vector.body
1012
1013vector.body:                                      ; preds = %vector.body, %vector.ph
1014  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; The accumulator is a scalar i32, not a vector: that is what makes this the
  ; in-loop form of the reduction.
1015  %vec.phi = phi i32 [ -2147483648, %vector.ph ], [ %3, %vector.body ]
1016  %0 = getelementptr inbounds i32, i32* %x, i32 %index
1017  %1 = bitcast i32* %0 to <4 x i32>*
1018  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Reduce the four loaded lanes, then keep the larger of the accumulator and
  ; the lane maximum.
1019  %l5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %wide.load)
1020  %2 = icmp sgt i32 %vec.phi, %l5
1021  %3 = select i1 %2, i32 %vec.phi, i32 %l5
1022  %index.next = add i32 %index, 4
1023  %4 = icmp eq i32 %index.next, %n.vec
1024  br i1 %4, label %middle.block, label %vector.body
1025
1026middle.block:                                     ; preds = %vector.body
  ; Single-input phi that forwards the loop's accumulator out of vector.body.
1027  %5 = phi i32 [ %3, %vector.body ]
1028  %cmp.n = icmp eq i32 %n.vec, %n
1029  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1030
1031for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; The scalar loop starts at index 0 with the identity, or at %n.vec with the
  ; vector loop's result.
1032  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1033  %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
1034  br label %for.body
1035
1036for.body:                                         ; preds = %for.body.preheader1, %for.body
1037  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1038  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1039  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1040  %6 = load i32, i32* %arrayidx, align 4
  ; Scalar tail: despite its name, %add is the running signed maximum.
1041  %c = icmp sgt i32 %r.07, %6
1042  %add = select i1 %c, i32 %r.07, i32 %6
1043  %inc = add nuw nsw i32 %i.08, 1
1044  %exitcond = icmp eq i32 %inc, %n
1045  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1046
1047for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result from whichever path arrived: identity, vector loop only, or scalar tail.
1048  %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1049  ret i32 %r.0.lcssa
1050}
1051
; umin_i32: unsigned-minimum reduction over the %n i32 elements at %x, with
; the reduction performed after the vector loop. The loop keeps a <4 x i32>
; accumulator (every lane starts at -1, i.e. all bits set, the largest
; unsigned value) updated by a per-lane compare+select;
; llvm.vector.reduce.umin is called once in middle.block. The expected
; assembly has `vmin.u32 q0, q0, q1` in the loop and a single
; `vminv.u32 r2, q0` after it.
1052define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
1053; CHECK-LABEL: umin_i32:
1054; CHECK:       @ %bb.0: @ %entry
1055; CHECK-NEXT:    .save {r7, lr}
1056; CHECK-NEXT:    push {r7, lr}
1057; CHECK-NEXT:    cmp r1, #1
1058; CHECK-NEXT:    blt .LBB11_3
1059; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1060; CHECK-NEXT:    cmp r1, #4
1061; CHECK-NEXT:    bhs .LBB11_4
1062; CHECK-NEXT:  @ %bb.2:
1063; CHECK-NEXT:    mov.w r2, #-1
1064; CHECK-NEXT:    movs r3, #0
1065; CHECK-NEXT:    b .LBB11_7
1066; CHECK-NEXT:  .LBB11_3:
1067; CHECK-NEXT:    mov.w r2, #-1
1068; CHECK-NEXT:    b .LBB11_9
1069; CHECK-NEXT:  .LBB11_4: @ %vector.ph
1070; CHECK-NEXT:    bic r3, r1, #3
1071; CHECK-NEXT:    movs r2, #1
1072; CHECK-NEXT:    sub.w r12, r3, #4
1073; CHECK-NEXT:    vmov.i8 q0, #0xff
1074; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
1075; CHECK-NEXT:    mov r2, r0
1076; CHECK-NEXT:  .LBB11_5: @ %vector.body
1077; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1078; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
1079; CHECK-NEXT:    vmin.u32 q0, q0, q1
1080; CHECK-NEXT:    le lr, .LBB11_5
1081; CHECK-NEXT:  @ %bb.6: @ %middle.block
1082; CHECK-NEXT:    mov.w r2, #-1
1083; CHECK-NEXT:    cmp r3, r1
1084; CHECK-NEXT:    vminv.u32 r2, q0
1085; CHECK-NEXT:    beq .LBB11_9
1086; CHECK-NEXT:  .LBB11_7: @ %for.body.preheader1
1087; CHECK-NEXT:    sub.w lr, r1, r3
1088; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
1089; CHECK-NEXT:  .LBB11_8: @ %for.body
1090; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1091; CHECK-NEXT:    ldr r1, [r0], #4
1092; CHECK-NEXT:    cmp r2, r1
1093; CHECK-NEXT:    csel r2, r2, r1, lo
1094; CHECK-NEXT:    le lr, .LBB11_8
1095; CHECK-NEXT:  .LBB11_9: @ %for.cond.cleanup
1096; CHECK-NEXT:    mov r0, r2
1097; CHECK-NEXT:    pop {r7, pc}
1098entry:
  ; %n <= 0: return the identity (-1) without loading anything.
1099  %cmp6 = icmp sgt i32 %n, 0
1100  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1101
1102for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1103  %min.iters.check = icmp ult i32 %n, 4
1104  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1105
1106vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = %n with the low two bits cleared: elements covered by whole
  ; 4-lane vectors.
1107  %n.vec = and i32 %n, -4
1108  br label %vector.body
1109
1110vector.body:                                      ; preds = %vector.body, %vector.ph
1111  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Vector accumulator: each lane tracks the minimum seen in its own column.
1112  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %3, %vector.body ]
1113  %0 = getelementptr inbounds i32, i32* %x, i32 %index
1114  %1 = bitcast i32* %0 to <4 x i32>*
1115  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Per-lane unsigned min, written as compare+select.
1116  %2 = icmp ult <4 x i32> %vec.phi, %wide.load
1117  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
1118  %index.next = add i32 %index, 4
1119  %4 = icmp eq i32 %index.next, %n.vec
1120  br i1 %4, label %middle.block, label %vector.body
1121
1122middle.block:                                     ; preds = %vector.body
  ; Collapse the four per-lane minima into one scalar, once, after the loop.
1123  %5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3)
1124  %cmp.n = icmp eq i32 %n.vec, %n
1125  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1126
1127for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; The scalar loop starts at index 0 with the identity, or at %n.vec with the
  ; reduced vector result.
1128  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1129  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
1130  br label %for.body
1131
1132for.body:                                         ; preds = %for.body.preheader1, %for.body
1133  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1134  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1135  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1136  %6 = load i32, i32* %arrayidx, align 4
  ; Scalar tail: despite its name, %add is the running unsigned minimum.
1137  %c = icmp ult i32 %r.07, %6
1138  %add = select i1 %c, i32 %r.07, i32 %6
1139  %inc = add nuw nsw i32 %i.08, 1
1140  %exitcond = icmp eq i32 %inc, %n
1141  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1142
1143for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result from whichever path arrived: identity, vector loop only, or scalar tail.
1144  %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1145  ret i32 %r.0.lcssa
1146}
1147
; umin_i32_inloop: unsigned-minimum reduction over the %n i32 elements at %x,
; with the reduction performed inside the vector loop. Each iteration reduces
; the loaded <4 x i32> to a scalar via llvm.vector.reduce.umin and folds it
; into an i32 accumulator that starts at -1 (all bits set, the largest
; unsigned value). The expected assembly has a loop body of one load plus one
; accumulating `vminv.u32 r0, q0`.
;
; The scalar remainder loop compares with `icmp ult` so that it also computes
; an unsigned minimum, matching the vector loop and the scalar tail of
; umin_i32. The corresponding expectation is `csel r0, r0, r1, lo`, mirroring
; what umin_i32 expects for the same compare+select pattern; re-run
; utils/update_llc_test_checks.py to confirm the generated output.
1148define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
1149; CHECK-LABEL: umin_i32_inloop:
1150; CHECK:       @ %bb.0: @ %entry
1151; CHECK-NEXT:    .save {r7, lr}
1152; CHECK-NEXT:    push {r7, lr}
1153; CHECK-NEXT:    cmp r1, #1
1154; CHECK-NEXT:    blt .LBB12_3
1155; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1156; CHECK-NEXT:    mov r12, r0
1157; CHECK-NEXT:    cmp r1, #4
1158; CHECK-NEXT:    bhs .LBB12_4
1159; CHECK-NEXT:  @ %bb.2:
1160; CHECK-NEXT:    mov.w r0, #-1
1161; CHECK-NEXT:    movs r3, #0
1162; CHECK-NEXT:    b .LBB12_7
1163; CHECK-NEXT:  .LBB12_3:
1164; CHECK-NEXT:    mov.w r0, #-1
1165; CHECK-NEXT:    b .LBB12_9
1166; CHECK-NEXT:  .LBB12_4: @ %vector.ph
1167; CHECK-NEXT:    bic r3, r1, #3
1168; CHECK-NEXT:    movs r2, #1
1169; CHECK-NEXT:    subs r0, r3, #4
1170; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
1171; CHECK-NEXT:    mov.w r0, #-1
1172; CHECK-NEXT:    mov r2, r12
1173; CHECK-NEXT:  .LBB12_5: @ %vector.body
1174; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1175; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
1176; CHECK-NEXT:    vminv.u32 r0, q0
1177; CHECK-NEXT:    le lr, .LBB12_5
1178; CHECK-NEXT:  @ %bb.6: @ %middle.block
1179; CHECK-NEXT:    cmp r3, r1
1180; CHECK-NEXT:    it eq
1181; CHECK-NEXT:    popeq {r7, pc}
1182; CHECK-NEXT:  .LBB12_7: @ %for.body.preheader1
1183; CHECK-NEXT:    sub.w lr, r1, r3
1184; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
1185; CHECK-NEXT:  .LBB12_8: @ %for.body
1186; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1187; CHECK-NEXT:    ldr r1, [r2], #4
1188; CHECK-NEXT:    cmp r0, r1
1189; CHECK-NEXT:    csel r0, r0, r1, lo
1190; CHECK-NEXT:    le lr, .LBB12_8
1191; CHECK-NEXT:  .LBB12_9: @ %for.cond.cleanup
1192; CHECK-NEXT:    pop {r7, pc}
1193entry:
  ; %n <= 0: return the identity (-1) without loading anything.
1194  %cmp6 = icmp sgt i32 %n, 0
1195  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1196
1197for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1198  %min.iters.check = icmp ult i32 %n, 4
1199  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1200
1201vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = %n with the low two bits cleared: elements covered by whole
  ; 4-lane vectors.
1202  %n.vec = and i32 %n, -4
1203  br label %vector.body
1204
1205vector.body:                                      ; preds = %vector.body, %vector.ph
1206  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; The accumulator is a scalar i32, not a vector: that is what makes this the
  ; in-loop form of the reduction.
1207  %vec.phi = phi i32 [ -1, %vector.ph ], [ %3, %vector.body ]
1208  %0 = getelementptr inbounds i32, i32* %x, i32 %index
1209  %1 = bitcast i32* %0 to <4 x i32>*
1210  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Reduce the four loaded lanes, then keep the smaller (unsigned) of the
  ; accumulator and the lane minimum.
1211  %l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
1212  %2 = icmp ult i32 %vec.phi, %l5
1213  %3 = select i1 %2, i32 %vec.phi, i32 %l5
1214  %index.next = add i32 %index, 4
1215  %4 = icmp eq i32 %index.next, %n.vec
1216  br i1 %4, label %middle.block, label %vector.body
1217
1218middle.block:                                     ; preds = %vector.body
  ; Single-input phi that forwards the loop's accumulator out of vector.body.
1219  %5 = phi i32 [ %3, %vector.body ]
1220  %cmp.n = icmp eq i32 %n.vec, %n
1221  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1222
1223for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; The scalar loop starts at index 0 with the identity, or at %n.vec with the
  ; vector loop's result.
1224  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1225  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
1226  br label %for.body
1227
1228for.body:                                         ; preds = %for.body.preheader1, %for.body
1229  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1230  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1231  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1232  %6 = load i32, i32* %arrayidx, align 4
  ; Scalar tail: unsigned less-than keeps the smaller value, so %add (despite
  ; its name) is the running unsigned minimum.
1233  %c = icmp ult i32 %r.07, %6
1234  %add = select i1 %c, i32 %r.07, i32 %6
1235  %inc = add nuw nsw i32 %i.08, 1
1236  %exitcond = icmp eq i32 %inc, %n
1237  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1238
1239for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result from whichever path arrived: identity, vector loop only, or scalar tail.
1240  %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1241  ret i32 %r.0.lcssa
1242}
1243
; umax_i32: unsigned-maximum reduction over the %n i32 elements at %x, with
; the reduction performed after the vector loop. The loop keeps a <4 x i32>
; accumulator (zeroinitializer, the smallest unsigned value in every lane)
; updated by a per-lane compare+select; llvm.vector.reduce.umax is called once
; in middle.block. The expected assembly has `vmax.u32 q0, q0, q1` in the loop
; and a single `vmaxv.u32 r2, q0` after it.
1244define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
1245; CHECK-LABEL: umax_i32:
1246; CHECK:       @ %bb.0: @ %entry
1247; CHECK-NEXT:    .save {r7, lr}
1248; CHECK-NEXT:    push {r7, lr}
1249; CHECK-NEXT:    cmp r1, #1
1250; CHECK-NEXT:    blt .LBB13_3
1251; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1252; CHECK-NEXT:    cmp r1, #4
1253; CHECK-NEXT:    bhs .LBB13_4
1254; CHECK-NEXT:  @ %bb.2:
1255; CHECK-NEXT:    movs r3, #0
1256; CHECK-NEXT:    movs r2, #0
1257; CHECK-NEXT:    b .LBB13_7
1258; CHECK-NEXT:  .LBB13_3:
1259; CHECK-NEXT:    movs r2, #0
1260; CHECK-NEXT:    b .LBB13_9
1261; CHECK-NEXT:  .LBB13_4: @ %vector.ph
1262; CHECK-NEXT:    bic r3, r1, #3
1263; CHECK-NEXT:    movs r2, #1
1264; CHECK-NEXT:    sub.w r12, r3, #4
1265; CHECK-NEXT:    vmov.i32 q0, #0x0
1266; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
1267; CHECK-NEXT:    mov r2, r0
1268; CHECK-NEXT:  .LBB13_5: @ %vector.body
1269; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1270; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
1271; CHECK-NEXT:    vmax.u32 q0, q0, q1
1272; CHECK-NEXT:    le lr, .LBB13_5
1273; CHECK-NEXT:  @ %bb.6: @ %middle.block
1274; CHECK-NEXT:    movs r2, #0
1275; CHECK-NEXT:    cmp r3, r1
1276; CHECK-NEXT:    vmaxv.u32 r2, q0
1277; CHECK-NEXT:    beq .LBB13_9
1278; CHECK-NEXT:  .LBB13_7: @ %for.body.preheader1
1279; CHECK-NEXT:    sub.w lr, r1, r3
1280; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
1281; CHECK-NEXT:  .LBB13_8: @ %for.body
1282; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1283; CHECK-NEXT:    ldr r1, [r0], #4
1284; CHECK-NEXT:    cmp r2, r1
1285; CHECK-NEXT:    csel r2, r2, r1, hi
1286; CHECK-NEXT:    le lr, .LBB13_8
1287; CHECK-NEXT:  .LBB13_9: @ %for.cond.cleanup
1288; CHECK-NEXT:    mov r0, r2
1289; CHECK-NEXT:    pop {r7, pc}
1290entry:
  ; %n <= 0: return the identity (0) without loading anything.
1291  %cmp6 = icmp sgt i32 %n, 0
1292  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1293
1294for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1295  %min.iters.check = icmp ult i32 %n, 4
1296  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1297
1298vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = %n with the low two bits cleared: elements covered by whole
  ; 4-lane vectors.
1299  %n.vec = and i32 %n, -4
1300  br label %vector.body
1301
1302vector.body:                                      ; preds = %vector.body, %vector.ph
1303  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Vector accumulator: each lane tracks the maximum seen in its own column.
1304  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1305  %0 = getelementptr inbounds i32, i32* %x, i32 %index
1306  %1 = bitcast i32* %0 to <4 x i32>*
1307  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Per-lane unsigned max, written as compare+select.
1308  %2 = icmp ugt <4 x i32> %vec.phi, %wide.load
1309  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
1310  %index.next = add i32 %index, 4
1311  %4 = icmp eq i32 %index.next, %n.vec
1312  br i1 %4, label %middle.block, label %vector.body
1313
1314middle.block:                                     ; preds = %vector.body
  ; Collapse the four per-lane maxima into one scalar, once, after the loop.
1315  %5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3)
1316  %cmp.n = icmp eq i32 %n.vec, %n
1317  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1318
1319for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; The scalar loop starts at index 0 with the identity, or at %n.vec with the
  ; reduced vector result.
1320  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1321  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
1322  br label %for.body
1323
1324for.body:                                         ; preds = %for.body.preheader1, %for.body
1325  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1326  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1327  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1328  %6 = load i32, i32* %arrayidx, align 4
  ; Scalar tail: despite its name, %add is the running unsigned maximum.
1329  %c = icmp ugt i32 %r.07, %6
1330  %add = select i1 %c, i32 %r.07, i32 %6
1331  %inc = add nuw nsw i32 %i.08, 1
1332  %exitcond = icmp eq i32 %inc, %n
1333  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1334
1335for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result from whichever path arrived: identity, vector loop only, or scalar tail.
1336  %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1337  ret i32 %r.0.lcssa
1338}
1339
; umax_i32_inloop: unsigned-maximum reduction over the %n i32 elements at %x,
; with the reduction performed inside the vector loop. Each iteration reduces
; the loaded <4 x i32> to a scalar via llvm.vector.reduce.umax and folds it
; into an i32 accumulator that starts at 0 (the smallest unsigned value). The
; expected assembly has a loop body of one load plus one accumulating
; `vmaxv.u32 r0, q0`, and middle.block only compares and conditionally returns.
1340define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
1341; CHECK-LABEL: umax_i32_inloop:
1342; CHECK:       @ %bb.0: @ %entry
1343; CHECK-NEXT:    .save {r7, lr}
1344; CHECK-NEXT:    push {r7, lr}
1345; CHECK-NEXT:    cmp r1, #1
1346; CHECK-NEXT:    blt .LBB14_3
1347; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1348; CHECK-NEXT:    mov r12, r0
1349; CHECK-NEXT:    cmp r1, #4
1350; CHECK-NEXT:    bhs .LBB14_4
1351; CHECK-NEXT:  @ %bb.2:
1352; CHECK-NEXT:    movs r3, #0
1353; CHECK-NEXT:    movs r0, #0
1354; CHECK-NEXT:    b .LBB14_7
1355; CHECK-NEXT:  .LBB14_3:
1356; CHECK-NEXT:    movs r0, #0
1357; CHECK-NEXT:    b .LBB14_9
1358; CHECK-NEXT:  .LBB14_4: @ %vector.ph
1359; CHECK-NEXT:    bic r3, r1, #3
1360; CHECK-NEXT:    movs r2, #1
1361; CHECK-NEXT:    subs r0, r3, #4
1362; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
1363; CHECK-NEXT:    movs r0, #0
1364; CHECK-NEXT:    mov r2, r12
1365; CHECK-NEXT:  .LBB14_5: @ %vector.body
1366; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1367; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
1368; CHECK-NEXT:    vmaxv.u32 r0, q0
1369; CHECK-NEXT:    le lr, .LBB14_5
1370; CHECK-NEXT:  @ %bb.6: @ %middle.block
1371; CHECK-NEXT:    cmp r3, r1
1372; CHECK-NEXT:    it eq
1373; CHECK-NEXT:    popeq {r7, pc}
1374; CHECK-NEXT:  .LBB14_7: @ %for.body.preheader1
1375; CHECK-NEXT:    sub.w lr, r1, r3
1376; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
1377; CHECK-NEXT:  .LBB14_8: @ %for.body
1378; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1379; CHECK-NEXT:    ldr r1, [r2], #4
1380; CHECK-NEXT:    cmp r0, r1
1381; CHECK-NEXT:    csel r0, r0, r1, hi
1382; CHECK-NEXT:    le lr, .LBB14_8
1383; CHECK-NEXT:  .LBB14_9: @ %for.cond.cleanup
1384; CHECK-NEXT:    pop {r7, pc}
1385entry:
  ; %n <= 0: return the identity (0) without loading anything.
1386  %cmp6 = icmp sgt i32 %n, 0
1387  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1388
1389for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1390  %min.iters.check = icmp ult i32 %n, 4
1391  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1392
1393vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = %n with the low two bits cleared: elements covered by whole
  ; 4-lane vectors.
1394  %n.vec = and i32 %n, -4
1395  br label %vector.body
1396
1397vector.body:                                      ; preds = %vector.body, %vector.ph
1398  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; The accumulator is a scalar i32, not a vector: that is what makes this the
  ; in-loop form of the reduction.
1399  %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
1400  %0 = getelementptr inbounds i32, i32* %x, i32 %index
1401  %1 = bitcast i32* %0 to <4 x i32>*
1402  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; Reduce the four loaded lanes, then keep the larger (unsigned) of the
  ; accumulator and the lane maximum.
1403  %l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
1404  %2 = icmp ugt i32 %vec.phi, %l5
1405  %3 = select i1 %2, i32 %vec.phi, i32 %l5
1406  %index.next = add i32 %index, 4
1407  %4 = icmp eq i32 %index.next, %n.vec
1408  br i1 %4, label %middle.block, label %vector.body
1409
1410middle.block:                                     ; preds = %vector.body
  ; Single-input phi that forwards the loop's accumulator out of vector.body.
1411  %5 = phi i32 [ %3, %vector.body ]
1412  %cmp.n = icmp eq i32 %n.vec, %n
1413  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1414
1415for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; The scalar loop starts at index 0 with the identity, or at %n.vec with the
  ; vector loop's result.
1416  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1417  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
1418  br label %for.body
1419
1420for.body:                                         ; preds = %for.body.preheader1, %for.body
1421  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1422  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1423  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1424  %6 = load i32, i32* %arrayidx, align 4
  ; Scalar tail: despite its name, %add is the running unsigned maximum.
1425  %c = icmp ugt i32 %r.07, %6
1426  %add = select i1 %c, i32 %r.07, i32 %6
1427  %inc = add nuw nsw i32 %i.08, 1
1428  %exitcond = icmp eq i32 %inc, %n
1429  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1430
1431for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result from whichever path arrived: identity, vector loop only, or scalar tail.
1432  %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1433  ret i32 %r.0.lcssa
1434}
1435
; fmin_f32: floating-point minimum reduction over x[0..n), seeded with 0.0.
; The IR is a 4-lane vector loop (fcmp ult + select) over the first n & -4
; elements, a llvm.vector.reduce.fmin across the lanes in middle.block, and a
; scalar remainder loop for the leftover elements. Returns 0.0 when n <= 0.
; The expected assembly keeps the vector compare/select as vcmp.f32 + vpsel in
; an le-terminated loop and does the lane reduction with vminnm.f32.
1436define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
1437; CHECK-LABEL: fmin_f32:
1438; CHECK:       @ %bb.0: @ %entry
1439; CHECK-NEXT:    .save {r7, lr}
1440; CHECK-NEXT:    push {r7, lr}
1441; CHECK-NEXT:    cmp r1, #1
1442; CHECK-NEXT:    blt .LBB15_3
1443; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1444; CHECK-NEXT:    cmp r1, #4
1445; CHECK-NEXT:    bhs .LBB15_4
1446; CHECK-NEXT:  @ %bb.2:
1447; CHECK-NEXT:    vldr s0, .LCPI15_0
1448; CHECK-NEXT:    movs r2, #0
1449; CHECK-NEXT:    b .LBB15_7
1450; CHECK-NEXT:  .LBB15_3:
1451; CHECK-NEXT:    vldr s0, .LCPI15_0
1452; CHECK-NEXT:    b .LBB15_9
1453; CHECK-NEXT:  .LBB15_4: @ %vector.ph
1454; CHECK-NEXT:    bic r2, r1, #3
1455; CHECK-NEXT:    movs r3, #1
1456; CHECK-NEXT:    sub.w r12, r2, #4
1457; CHECK-NEXT:    vmov.i32 q0, #0x0
1458; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
1459; CHECK-NEXT:    mov r3, r0
1460; CHECK-NEXT:  .LBB15_5: @ %vector.body
1461; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1462; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
1463; CHECK-NEXT:    vcmp.f32 lt, q0, q1
1464; CHECK-NEXT:    vpsel q0, q0, q1
1465; CHECK-NEXT:    le lr, .LBB15_5
1466; CHECK-NEXT:  @ %bb.6: @ %middle.block
1467; CHECK-NEXT:    vminnm.f32 s4, s2, s3
1468; CHECK-NEXT:    vminnm.f32 s0, s0, s1
1469; CHECK-NEXT:    vminnm.f32 s0, s0, s4
1470; CHECK-NEXT:    cmp r2, r1
1471; CHECK-NEXT:    beq .LBB15_9
1472; CHECK-NEXT:  .LBB15_7: @ %for.body.preheader1
1473; CHECK-NEXT:    sub.w lr, r1, r2
1474; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
1475; CHECK-NEXT:  .LBB15_8: @ %for.body
1476; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1477; CHECK-NEXT:    vldmia r0!, {s2}
1478; CHECK-NEXT:    vcmp.f32 s0, s2
1479; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
1480; CHECK-NEXT:    vselge.f32 s0, s2, s0
1481; CHECK-NEXT:    le lr, .LBB15_8
1482; CHECK-NEXT:  .LBB15_9: @ %for.cond.cleanup
1483; CHECK-NEXT:    vmov r0, s0
1484; CHECK-NEXT:    pop {r7, pc}
1485; CHECK-NEXT:    .p2align 2
1486; CHECK-NEXT:  @ %bb.10:
1487; CHECK-NEXT:  .LCPI15_0:
1488; CHECK-NEXT:    .long 0x00000000 @ float 0
1489entry:
  ; Nothing to reduce for n <= 0: fall straight through to the 0.0 result.
1490  %cmp6 = icmp sgt i32 %n, 0
1491  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1492
1493for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: skip the vector loop, use only the scalar loop.
1494  %min.iters.check = icmp ult i32 %n, 4
1495  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1496
1497vector.ph:                                        ; preds = %for.body.preheader
  ; n.vec = n rounded down to a multiple of 4 (the vector trip limit).
1498  %n.vec = and i32 %n, -4
1499  br label %vector.body
1500
1501vector.body:                                      ; preds = %vector.body, %vector.ph
1502  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1503  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1504  %0 = getelementptr inbounds float, float* %x, i32 %index
1505  %1 = bitcast float* %0 to <4 x float>*
1506  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  ; Per-lane running minimum: keep the accumulator lane when it compares
  ; unordered-or-less-than the loaded lane, otherwise take the loaded lane.
1507  %2 = fcmp ult <4 x float> %vec.phi, %wide.load
1508  %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
1509  %index.next = add i32 %index, 4
1510  %4 = icmp eq i32 %index.next, %n.vec
1511  br i1 %4, label %middle.block, label %vector.body
1512
1513middle.block:                                     ; preds = %vector.body
  ; Collapse the four per-lane minima into one scalar.
1514  %5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3)
  ; Skip the scalar remainder loop when n is a multiple of 4.
1515  %cmp.n = icmp eq i32 %n.vec, %n
1516  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1517
1518for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Resume from either the start (no vector loop ran) or the vector loop's end.
1519  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1520  %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
1521  br label %for.body
1522
1523for.body:                                         ; preds = %for.body.preheader1, %for.body
1524  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1525  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1526  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
1527  %6 = load float, float* %arrayidx, align 4
  ; Scalar form of the same compare/select minimum as the vector body.
1528  %c = fcmp ult float %r.07, %6
1529  %add = select i1 %c, float %r.07, float %6
1530  %inc = add nuw nsw i32 %i.08, 1
1531  %exitcond = icmp eq i32 %inc, %n
1532  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1533
1534for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1535  %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1536  ret float %r.0.lcssa
1537}
1538
; fmax_f32: floating-point maximum reduction over x[0..n), seeded with 0.0.
; Same shape as the minimum test: a 4-lane vector loop (fcmp ugt + select) over
; the first n & -4 elements, a llvm.vector.reduce.fmax across the lanes, then a
; scalar remainder loop. Returns 0.0 when n <= 0.
; The expected assembly uses vcmp.f32 + vpsel with the compare operands swapped
; relative to the minimum case, and vmaxnm.f32 for the lane reduction.
1539define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
1540; CHECK-LABEL: fmax_f32:
1541; CHECK:       @ %bb.0: @ %entry
1542; CHECK-NEXT:    .save {r7, lr}
1543; CHECK-NEXT:    push {r7, lr}
1544; CHECK-NEXT:    cmp r1, #1
1545; CHECK-NEXT:    blt .LBB16_3
1546; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1547; CHECK-NEXT:    cmp r1, #4
1548; CHECK-NEXT:    bhs .LBB16_4
1549; CHECK-NEXT:  @ %bb.2:
1550; CHECK-NEXT:    vldr s0, .LCPI16_0
1551; CHECK-NEXT:    movs r2, #0
1552; CHECK-NEXT:    b .LBB16_7
1553; CHECK-NEXT:  .LBB16_3:
1554; CHECK-NEXT:    vldr s0, .LCPI16_0
1555; CHECK-NEXT:    b .LBB16_9
1556; CHECK-NEXT:  .LBB16_4: @ %vector.ph
1557; CHECK-NEXT:    bic r2, r1, #3
1558; CHECK-NEXT:    movs r3, #1
1559; CHECK-NEXT:    sub.w r12, r2, #4
1560; CHECK-NEXT:    vmov.i32 q0, #0x0
1561; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
1562; CHECK-NEXT:    mov r3, r0
1563; CHECK-NEXT:  .LBB16_5: @ %vector.body
1564; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1565; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
1566; CHECK-NEXT:    vcmp.f32 lt, q1, q0
1567; CHECK-NEXT:    vpsel q0, q0, q1
1568; CHECK-NEXT:    le lr, .LBB16_5
1569; CHECK-NEXT:  @ %bb.6: @ %middle.block
1570; CHECK-NEXT:    vmaxnm.f32 s4, s2, s3
1571; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
1572; CHECK-NEXT:    vmaxnm.f32 s0, s0, s4
1573; CHECK-NEXT:    cmp r2, r1
1574; CHECK-NEXT:    beq .LBB16_9
1575; CHECK-NEXT:  .LBB16_7: @ %for.body.preheader1
1576; CHECK-NEXT:    sub.w lr, r1, r2
1577; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
1578; CHECK-NEXT:  .LBB16_8: @ %for.body
1579; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1580; CHECK-NEXT:    vldmia r0!, {s2}
1581; CHECK-NEXT:    vcmp.f32 s2, s0
1582; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
1583; CHECK-NEXT:    vselge.f32 s0, s2, s0
1584; CHECK-NEXT:    le lr, .LBB16_8
1585; CHECK-NEXT:  .LBB16_9: @ %for.cond.cleanup
1586; CHECK-NEXT:    vmov r0, s0
1587; CHECK-NEXT:    pop {r7, pc}
1588; CHECK-NEXT:    .p2align 2
1589; CHECK-NEXT:  @ %bb.10:
1590; CHECK-NEXT:  .LCPI16_0:
1591; CHECK-NEXT:    .long 0x00000000 @ float 0
1592entry:
  ; Nothing to reduce for n <= 0: fall straight through to the 0.0 result.
1593  %cmp6 = icmp sgt i32 %n, 0
1594  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1595
1596for.body.preheader:                               ; preds = %entry
  ; Fewer than four elements: skip the vector loop, use only the scalar loop.
1597  %min.iters.check = icmp ult i32 %n, 4
1598  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1599
1600vector.ph:                                        ; preds = %for.body.preheader
  ; n.vec = n rounded down to a multiple of 4 (the vector trip limit).
1601  %n.vec = and i32 %n, -4
1602  br label %vector.body
1603
1604vector.body:                                      ; preds = %vector.body, %vector.ph
1605  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1606  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1607  %0 = getelementptr inbounds float, float* %x, i32 %index
1608  %1 = bitcast float* %0 to <4 x float>*
1609  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  ; Per-lane running maximum: keep the accumulator lane when it compares
  ; unordered-or-greater-than the loaded lane, otherwise take the loaded lane.
1610  %2 = fcmp ugt <4 x float> %vec.phi, %wide.load
1611  %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
1612  %index.next = add i32 %index, 4
1613  %4 = icmp eq i32 %index.next, %n.vec
1614  br i1 %4, label %middle.block, label %vector.body
1615
1616middle.block:                                     ; preds = %vector.body
  ; Collapse the four per-lane maxima into one scalar.
1617  %5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %3)
  ; Skip the scalar remainder loop when n is a multiple of 4.
1618  %cmp.n = icmp eq i32 %n.vec, %n
1619  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1620
1621for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Resume from either the start (no vector loop ran) or the vector loop's end.
1622  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1623  %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
1624  br label %for.body
1625
1626for.body:                                         ; preds = %for.body.preheader1, %for.body
1627  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1628  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1629  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
1630  %6 = load float, float* %arrayidx, align 4
  ; Scalar form of the same compare/select maximum as the vector body.
1631  %c = fcmp ugt float %r.07, %6
1632  %add = select i1 %c, float %r.07, float %6
1633  %inc = add nuw nsw i32 %i.08, 1
1634  %exitcond = icmp eq i32 %inc, %n
1635  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1636
1637for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1638  %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1639  ret float %r.0.lcssa
1640}
1641
; add4i32: sum of the n i32 elements of x, accumulated in a scalar i32 phi.
; Returns 0 when n == 0. The loop handles four lanes per iteration over n
; rounded up to a multiple of 4; llvm.get.active.lane.mask switches off the
; lanes at or beyond n, so no scalar remainder loop is needed.
; The expected assembly is a tail-predicated dlstp.32/letp loop around a single
; accumulating vaddva.u32.
1642define i32 @add4i32(i32* noalias nocapture readonly %x, i32 %n) {
1643; CHECK-LABEL: add4i32:
1644; CHECK:       @ %bb.0: @ %entry
1645; CHECK-NEXT:    .save {r7, lr}
1646; CHECK-NEXT:    push {r7, lr}
1647; CHECK-NEXT:    cbz r1, .LBB17_4
1648; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1649; CHECK-NEXT:    movs r2, #0
1650; CHECK-NEXT:    dlstp.32 lr, r1
1651; CHECK-NEXT:  .LBB17_2: @ %vector.body
1652; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1653; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
1654; CHECK-NEXT:    vaddva.u32 r2, q0
1655; CHECK-NEXT:    letp lr, .LBB17_2
1656; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1657; CHECK-NEXT:    mov r0, r2
1658; CHECK-NEXT:    pop {r7, pc}
1659; CHECK-NEXT:  .LBB17_4:
1660; CHECK-NEXT:    movs r2, #0
1661; CHECK-NEXT:    mov r0, r2
1662; CHECK-NEXT:    pop {r7, pc}
1663entry:
1664  %cmp6.not = icmp eq i32 %n, 0
1665  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
1666
1667vector.ph:                                        ; preds = %entry
  ; n.vec = n rounded up to the next multiple of 4 (the vector trip limit).
1668  %n.rnd.up = add i32 %n, 3
1669  %n.vec = and i32 %n.rnd.up, -4
1670  br label %vector.body
1671
1672vector.body:                                      ; preds = %vector.body, %vector.ph
1673  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1674  %vec.phi = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ]
  ; Predicate for this iteration: lane i is active while index + i < n.
1675  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
1676  %0 = getelementptr inbounds i32, i32* %x, i32 %index
1677  %1 = bitcast i32* %0 to <4 x i32>*
1678  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Inactive lanes hold undef after the masked load; force them to 0 so they
  ; do not contribute to the sum.
1679  %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
1680  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
1681  %4 = add i32 %3, %vec.phi
1682  %index.next = add i32 %index, 4
1683  %5 = icmp eq i32 %index.next, %n.vec
1684  br i1 %5, label %for.cond.cleanup, label %vector.body
1685
1686for.cond.cleanup:                                 ; preds = %vector.body, %entry
1687  %s.0.lcssa = phi i32 [ 0, %entry ], [ %4, %vector.body ]
1688  ret i32 %s.0.lcssa
1689}
1690
; mla4i32: i32 dot product of x[0..n) and y[0..n), accumulated in a scalar i32
; phi. Returns 0 when n == 0. Four lanes per iteration, predicated by
; llvm.get.active.lane.mask so the final partial vector needs no scalar loop.
; The expected assembly is a tail-predicated dlstp.32/letp loop in which the
; multiply and the add-reduction fold into one vmlava.u32.
1691define i32 @mla4i32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
1692; CHECK-LABEL: mla4i32:
1693; CHECK:       @ %bb.0: @ %entry
1694; CHECK-NEXT:    .save {r7, lr}
1695; CHECK-NEXT:    push {r7, lr}
1696; CHECK-NEXT:    cbz r2, .LBB18_4
1697; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1698; CHECK-NEXT:    mov.w r12, #0
1699; CHECK-NEXT:    dlstp.32 lr, r2
1700; CHECK-NEXT:  .LBB18_2: @ %vector.body
1701; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1702; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
1703; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
1704; CHECK-NEXT:    vmlava.u32 r12, q1, q0
1705; CHECK-NEXT:    letp lr, .LBB18_2
1706; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1707; CHECK-NEXT:    mov r0, r12
1708; CHECK-NEXT:    pop {r7, pc}
1709; CHECK-NEXT:  .LBB18_4:
1710; CHECK-NEXT:    mov.w r12, #0
1711; CHECK-NEXT:    mov r0, r12
1712; CHECK-NEXT:    pop {r7, pc}
1713entry:
1714  %cmp8.not = icmp eq i32 %n, 0
1715  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
1716
1717vector.ph:                                        ; preds = %entry
  ; n.vec = n rounded up to the next multiple of 4 (the vector trip limit).
1718  %n.rnd.up = add i32 %n, 3
1719  %n.vec = and i32 %n.rnd.up, -4
1720  br label %vector.body
1721
1722vector.body:                                      ; preds = %vector.body, %vector.ph
1723  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1724  %vec.phi = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ]
  ; Predicate for this iteration: lane i is active while index + i < n.
1725  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
1726  %0 = getelementptr inbounds i32, i32* %x, i32 %index
1727  %1 = bitcast i32* %0 to <4 x i32>*
1728  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
1729  %2 = getelementptr inbounds i32, i32* %y, i32 %index
1730  %3 = bitcast i32* %2 to <4 x i32>*
1731  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
1732  %4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
  ; Zero the products of inactive lanes before the horizontal add.
1733  %5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer
1734  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
1735  %7 = add i32 %6, %vec.phi
1736  %index.next = add i32 %index, 4
1737  %8 = icmp eq i32 %index.next, %n.vec
1738  br i1 %8, label %for.cond.cleanup, label %vector.body
1739
1740for.cond.cleanup:                                 ; preds = %vector.body, %entry
1741  %s.0.lcssa = phi i32 [ 0, %entry ], [ %7, %vector.body ]
1742  ret i32 %s.0.lcssa
1743}
1744
; add8i32: sum of n i16 elements of x, each sign-extended to i32 before the
; add-reduction; the total is an i32. Returns 0 when n == 0. Eight lanes per
; iteration, predicated by llvm.get.active.lane.mask.
; The expected assembly is a tail-predicated dlstp.16/letp loop where the sign
; extension folds into the signed widening reduction vaddva.s16.
1745define i32 @add8i32(i16* noalias nocapture readonly %x, i32 %n) {
1746; CHECK-LABEL: add8i32:
1747; CHECK:       @ %bb.0: @ %entry
1748; CHECK-NEXT:    .save {r7, lr}
1749; CHECK-NEXT:    push {r7, lr}
1750; CHECK-NEXT:    cbz r1, .LBB19_4
1751; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1752; CHECK-NEXT:    movs r2, #0
1753; CHECK-NEXT:    dlstp.16 lr, r1
1754; CHECK-NEXT:  .LBB19_2: @ %vector.body
1755; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1756; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
1757; CHECK-NEXT:    vaddva.s16 r2, q0
1758; CHECK-NEXT:    letp lr, .LBB19_2
1759; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1760; CHECK-NEXT:    mov r0, r2
1761; CHECK-NEXT:    pop {r7, pc}
1762; CHECK-NEXT:  .LBB19_4:
1763; CHECK-NEXT:    movs r2, #0
1764; CHECK-NEXT:    mov r0, r2
1765; CHECK-NEXT:    pop {r7, pc}
1766entry:
1767  %cmp6.not = icmp eq i32 %n, 0
1768  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
1769
1770vector.ph:                                        ; preds = %entry
  ; n.vec = n rounded up to the next multiple of 8 (the vector trip limit).
1771  %n.rnd.up = add i32 %n, 7
1772  %n.vec = and i32 %n.rnd.up, -8
1773  br label %vector.body
1774
1775vector.body:                                      ; preds = %vector.body, %vector.ph
1776  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1777  %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
  ; Predicate for this iteration: lane i is active while index + i < n.
1778  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
1779  %0 = getelementptr inbounds i16, i16* %x, i32 %index
1780  %1 = bitcast i16* %0 to <8 x i16>*
1781  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  ; Widen to i32 first so the reduction is carried out at 32-bit precision.
1782  %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
  ; Zero the inactive lanes before the horizontal add.
1783  %3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer
1784  %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
1785  %5 = add i32 %4, %vec.phi
1786  %index.next = add i32 %index, 8
1787  %6 = icmp eq i32 %index.next, %n.vec
1788  br i1 %6, label %for.cond.cleanup, label %vector.body
1789
1790for.cond.cleanup:                                 ; preds = %vector.body, %entry
1791  %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
1792  ret i32 %s.0.lcssa
1793}
1794
; mla8i32: dot product of the i16 arrays x[0..n) and y[0..n); both operands are
; sign-extended to i32 before the multiply and the sum is an i32. Returns 0
; when n == 0. Eight lanes per iteration, predicated by
; llvm.get.active.lane.mask.
; The expected assembly is a tail-predicated dlstp.16/letp loop where the
; extends, multiply and add-reduction fold into one vmlava.s16.
1795define i32 @mla8i32(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
1796; CHECK-LABEL: mla8i32:
1797; CHECK:       @ %bb.0: @ %entry
1798; CHECK-NEXT:    .save {r7, lr}
1799; CHECK-NEXT:    push {r7, lr}
1800; CHECK-NEXT:    cbz r2, .LBB20_4
1801; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1802; CHECK-NEXT:    mov.w r12, #0
1803; CHECK-NEXT:    dlstp.16 lr, r2
1804; CHECK-NEXT:  .LBB20_2: @ %vector.body
1805; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1806; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
1807; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
1808; CHECK-NEXT:    vmlava.s16 r12, q1, q0
1809; CHECK-NEXT:    letp lr, .LBB20_2
1810; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1811; CHECK-NEXT:    mov r0, r12
1812; CHECK-NEXT:    pop {r7, pc}
1813; CHECK-NEXT:  .LBB20_4:
1814; CHECK-NEXT:    mov.w r12, #0
1815; CHECK-NEXT:    mov r0, r12
1816; CHECK-NEXT:    pop {r7, pc}
1817entry:
1818  %cmp9.not = icmp eq i32 %n, 0
1819  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
1820
1821vector.ph:                                        ; preds = %entry
  ; n.vec = n rounded up to the next multiple of 8 (the vector trip limit).
1822  %n.rnd.up = add i32 %n, 7
1823  %n.vec = and i32 %n.rnd.up, -8
1824  br label %vector.body
1825
1826vector.body:                                      ; preds = %vector.body, %vector.ph
1827  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1828  %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
  ; Predicate for this iteration: lane i is active while index + i < n.
1829  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
1830  %0 = getelementptr inbounds i16, i16* %x, i32 %index
1831  %1 = bitcast i16* %0 to <8 x i16>*
1832  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
1833  %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
1834  %3 = getelementptr inbounds i16, i16* %y, i32 %index
1835  %4 = bitcast i16* %3 to <8 x i16>*
1836  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
1837  %5 = sext <8 x i16> %wide.masked.load14 to <8 x i32>
  ; Multiply at i32 width, after both operands have been sign-extended.
1838  %6 = mul nsw <8 x i32> %5, %2
  ; Zero the products of inactive lanes before the horizontal add.
1839  %7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer
1840  %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
1841  %9 = add i32 %8, %vec.phi
1842  %index.next = add i32 %index, 8
1843  %10 = icmp eq i32 %index.next, %n.vec
1844  br i1 %10, label %for.cond.cleanup, label %vector.body
1845
1846for.cond.cleanup:                                 ; preds = %vector.body, %entry
1847  %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
1848  ret i32 %s.0.lcssa
1849}
1850
; add16i32: sum of n i8 elements of x, each zero-extended to i32 before the
; add-reduction; the total is an i32. Returns 0 when n == 0. Sixteen lanes per
; iteration, predicated by llvm.get.active.lane.mask.
; The expected assembly is a tail-predicated dlstp.8/letp loop where the zero
; extension folds into the unsigned widening reduction vaddva.u8.
1851define i32 @add16i32(i8* noalias nocapture readonly %x, i32 %n) {
1852; CHECK-LABEL: add16i32:
1853; CHECK:       @ %bb.0: @ %entry
1854; CHECK-NEXT:    .save {r7, lr}
1855; CHECK-NEXT:    push {r7, lr}
1856; CHECK-NEXT:    cbz r1, .LBB21_4
1857; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1858; CHECK-NEXT:    movs r2, #0
1859; CHECK-NEXT:    dlstp.8 lr, r1
1860; CHECK-NEXT:  .LBB21_2: @ %vector.body
1861; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1862; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
1863; CHECK-NEXT:    vaddva.u8 r2, q0
1864; CHECK-NEXT:    letp lr, .LBB21_2
1865; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1866; CHECK-NEXT:    mov r0, r2
1867; CHECK-NEXT:    pop {r7, pc}
1868; CHECK-NEXT:  .LBB21_4:
1869; CHECK-NEXT:    movs r2, #0
1870; CHECK-NEXT:    mov r0, r2
1871; CHECK-NEXT:    pop {r7, pc}
1872entry:
1873  %cmp6.not = icmp eq i32 %n, 0
1874  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
1875
1876vector.ph:                                        ; preds = %entry
  ; n.vec = n rounded up to the next multiple of 16 (the vector trip limit).
1877  %n.rnd.up = add i32 %n, 15
1878  %n.vec = and i32 %n.rnd.up, -16
1879  br label %vector.body
1880
1881vector.body:                                      ; preds = %vector.body, %vector.ph
1882  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1883  %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
  ; Predicate for this iteration: lane i is active while index + i < n.
1884  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
1885  %0 = getelementptr inbounds i8, i8* %x, i32 %index
1886  %1 = bitcast i8* %0 to <16 x i8>*
1887  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  ; Widen to i32 first so the reduction is carried out at 32-bit precision.
1888  %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
  ; Zero the inactive lanes before the horizontal add.
1889  %3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer
1890  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
1891  %5 = add i32 %4, %vec.phi
1892  %index.next = add i32 %index, 16
1893  %6 = icmp eq i32 %index.next, %n.vec
1894  br i1 %6, label %for.cond.cleanup, label %vector.body
1895
1896for.cond.cleanup:                                 ; preds = %vector.body, %entry
1897  %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
1898  ret i32 %s.0.lcssa
1899}
1900
; mla16i32: dot product of the i8 arrays x[0..n) and y[0..n); both operands are
; zero-extended to i32 before the multiply and the sum is an i32. Returns 0
; when n == 0. Sixteen lanes per iteration, predicated by
; llvm.get.active.lane.mask.
; The expected assembly is a tail-predicated dlstp.8/letp loop where the
; extends, multiply and add-reduction fold into one vmlava.u8.
1901define i32 @mla16i32(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
1902; CHECK-LABEL: mla16i32:
1903; CHECK:       @ %bb.0: @ %entry
1904; CHECK-NEXT:    .save {r7, lr}
1905; CHECK-NEXT:    push {r7, lr}
1906; CHECK-NEXT:    cbz r2, .LBB22_4
1907; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1908; CHECK-NEXT:    mov.w r12, #0
1909; CHECK-NEXT:    dlstp.8 lr, r2
1910; CHECK-NEXT:  .LBB22_2: @ %vector.body
1911; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1912; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
1913; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
1914; CHECK-NEXT:    vmlava.u8 r12, q1, q0
1915; CHECK-NEXT:    letp lr, .LBB22_2
1916; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1917; CHECK-NEXT:    mov r0, r12
1918; CHECK-NEXT:    pop {r7, pc}
1919; CHECK-NEXT:  .LBB22_4:
1920; CHECK-NEXT:    mov.w r12, #0
1921; CHECK-NEXT:    mov r0, r12
1922; CHECK-NEXT:    pop {r7, pc}
1923entry:
1924  %cmp9.not = icmp eq i32 %n, 0
1925  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
1926
1927vector.ph:                                        ; preds = %entry
  ; n.vec = n rounded up to the next multiple of 16 (the vector trip limit).
1928  %n.rnd.up = add i32 %n, 15
1929  %n.vec = and i32 %n.rnd.up, -16
1930  br label %vector.body
1931
1932vector.body:                                      ; preds = %vector.body, %vector.ph
1933  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1934  %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
  ; Predicate for this iteration: lane i is active while index + i < n.
1935  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
1936  %0 = getelementptr inbounds i8, i8* %x, i32 %index
1937  %1 = bitcast i8* %0 to <16 x i8>*
1938  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
1939  %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
1940  %3 = getelementptr inbounds i8, i8* %y, i32 %index
1941  %4 = bitcast i8* %3 to <16 x i8>*
1942  %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
1943  %5 = zext <16 x i8> %wide.masked.load14 to <16 x i32>
  ; Multiply at i32 width, after both operands have been zero-extended.
1944  %6 = mul nuw nsw <16 x i32> %5, %2
  ; Zero the products of inactive lanes before the horizontal add.
1945  %7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer
1946  %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
1947  %9 = add i32 %8, %vec.phi
1948  %index.next = add i32 %index, 16
1949  %10 = icmp eq i32 %index.next, %n.vec
1950  br i1 %10, label %for.cond.cleanup, label %vector.body
1951
1952for.cond.cleanup:                                 ; preds = %vector.body, %entry
1953  %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
1954  ret i32 %s.0.lcssa
1955}
1956
; add8i16: sum of n i16 elements of x, accumulated in a scalar i16 phi and
; returned as a signext i16. Returns 0 when n == 0. Eight lanes per iteration,
; predicated by llvm.get.active.lane.mask.
; The expected assembly is a tail-predicated dlstp.16/letp loop around
; vaddva.u16, with an sxth on the result to satisfy the signext return.
1957define signext i16 @add8i16(i16* noalias nocapture readonly %x, i32 %n) {
1958; CHECK-LABEL: add8i16:
1959; CHECK:       @ %bb.0: @ %entry
1960; CHECK-NEXT:    .save {r7, lr}
1961; CHECK-NEXT:    push {r7, lr}
1962; CHECK-NEXT:    cbz r1, .LBB23_4
1963; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1964; CHECK-NEXT:    movs r2, #0
1965; CHECK-NEXT:    dlstp.16 lr, r1
1966; CHECK-NEXT:  .LBB23_2: @ %vector.body
1967; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1968; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
1969; CHECK-NEXT:    vaddva.u16 r2, q0
1970; CHECK-NEXT:    letp lr, .LBB23_2
1971; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1972; CHECK-NEXT:    sxth r0, r2
1973; CHECK-NEXT:    pop {r7, pc}
1974; CHECK-NEXT:  .LBB23_4:
1975; CHECK-NEXT:    movs r2, #0
1976; CHECK-NEXT:    sxth r0, r2
1977; CHECK-NEXT:    pop {r7, pc}
1978entry:
1979  %cmp8.not = icmp eq i32 %n, 0
1980  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
1981
1982vector.ph:                                        ; preds = %entry
  ; n.vec = n rounded up to the next multiple of 8 (the vector trip limit).
1983  %n.rnd.up = add i32 %n, 7
1984  %n.vec = and i32 %n.rnd.up, -8
1985  br label %vector.body
1986
1987vector.body:                                      ; preds = %vector.body, %vector.ph
1988  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1989  %vec.phi = phi i16 [ 0, %vector.ph ], [ %4, %vector.body ]
  ; Predicate for this iteration: lane i is active while index + i < n.
1990  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
1991  %0 = getelementptr inbounds i16, i16* %x, i32 %index
1992  %1 = bitcast i16* %0 to <8 x i16>*
1993  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  ; Zero the inactive lanes before the horizontal add.
1994  %2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer
  ; The reduction and the accumulation are both carried out at i16 width.
1995  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
1996  %4 = add i16 %3, %vec.phi
1997  %index.next = add i32 %index, 8
1998  %5 = icmp eq i32 %index.next, %n.vec
1999  br i1 %5, label %for.cond.cleanup, label %vector.body
2000
2001for.cond.cleanup:                                 ; preds = %vector.body, %entry
2002  %s.0.lcssa = phi i16 [ 0, %entry ], [ %4, %vector.body ]
2003  ret i16 %s.0.lcssa
2004}
2005
; mla8i16: dot product of the i16 arrays x[0..n) and y[0..n) with the multiply,
; reduction and accumulation all at i16 width; returned as a signext i16.
; Returns 0 when n == 0. Eight lanes per iteration, predicated by
; llvm.get.active.lane.mask.
; The expected assembly is a tail-predicated dlstp.16/letp loop around
; vmlava.u16, with an sxth.w on the result to satisfy the signext return.
2006define signext i16 @mla8i16(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
2007; CHECK-LABEL: mla8i16:
2008; CHECK:       @ %bb.0: @ %entry
2009; CHECK-NEXT:    .save {r7, lr}
2010; CHECK-NEXT:    push {r7, lr}
2011; CHECK-NEXT:    cbz r2, .LBB24_4
2012; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2013; CHECK-NEXT:    mov.w r12, #0
2014; CHECK-NEXT:    dlstp.16 lr, r2
2015; CHECK-NEXT:  .LBB24_2: @ %vector.body
2016; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2017; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
2018; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
2019; CHECK-NEXT:    vmlava.u16 r12, q1, q0
2020; CHECK-NEXT:    letp lr, .LBB24_2
2021; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
2022; CHECK-NEXT:    sxth.w r0, r12
2023; CHECK-NEXT:    pop {r7, pc}
2024; CHECK-NEXT:  .LBB24_4:
2025; CHECK-NEXT:    mov.w r12, #0
2026; CHECK-NEXT:    sxth.w r0, r12
2027; CHECK-NEXT:    pop {r7, pc}
2028entry:
2029  %cmp11.not = icmp eq i32 %n, 0
2030  br i1 %cmp11.not, label %for.cond.cleanup, label %vector.ph
2031
2032vector.ph:                                        ; preds = %entry
  ; n.vec = n rounded up to the next multiple of 8 (the vector trip limit).
2033  %n.rnd.up = add i32 %n, 7
2034  %n.vec = and i32 %n.rnd.up, -8
2035  br label %vector.body
2036
2037vector.body:                                      ; preds = %vector.body, %vector.ph
2038  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2039  %vec.phi = phi i16 [ 0, %vector.ph ], [ %7, %vector.body ]
  ; Predicate for this iteration: lane i is active while index + i < n.
2040  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
2041  %0 = getelementptr inbounds i16, i16* %x, i32 %index
2042  %1 = bitcast i16* %0 to <8 x i16>*
2043  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2044  %2 = getelementptr inbounds i16, i16* %y, i32 %index
2045  %3 = bitcast i16* %2 to <8 x i16>*
2046  %wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  ; The multiply stays at i16 width; no extension is applied to the operands.
2047  %4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load
  ; Zero the products of inactive lanes before the horizontal add.
2048  %5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer
2049  %6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5)
2050  %7 = add i16 %6, %vec.phi
2051  %index.next = add i32 %index, 8
2052  %8 = icmp eq i32 %index.next, %n.vec
2053  br i1 %8, label %for.cond.cleanup, label %vector.body
2054
2055for.cond.cleanup:                                 ; preds = %vector.body, %entry
2056  %s.0.lcssa = phi i16 [ 0, %entry ], [ %7, %vector.body ]
2057  ret i16 %s.0.lcssa
2058}
2059
; add16i16: sum of n i8 elements of x, each zero-extended to i16 before the
; add-reduction; accumulated in a scalar i16 phi and returned as a signext i16.
; Returns 0 when n == 0. Sixteen lanes per iteration, predicated by
; llvm.get.active.lane.mask.
; The expected assembly is a tail-predicated dlstp.8/letp loop around
; vaddva.u8, with an sxth on the result to satisfy the signext return.
2060define signext i16 @add16i16(i8* noalias nocapture readonly %x, i32 %n) {
2061; CHECK-LABEL: add16i16:
2062; CHECK:       @ %bb.0: @ %entry
2063; CHECK-NEXT:    .save {r7, lr}
2064; CHECK-NEXT:    push {r7, lr}
2065; CHECK-NEXT:    cbz r1, .LBB25_4
2066; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2067; CHECK-NEXT:    movs r2, #0
2068; CHECK-NEXT:    dlstp.8 lr, r1
2069; CHECK-NEXT:  .LBB25_2: @ %vector.body
2070; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2071; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
2072; CHECK-NEXT:    vaddva.u8 r2, q0
2073; CHECK-NEXT:    letp lr, .LBB25_2
2074; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
2075; CHECK-NEXT:    sxth r0, r2
2076; CHECK-NEXT:    pop {r7, pc}
2077; CHECK-NEXT:  .LBB25_4:
2078; CHECK-NEXT:    movs r2, #0
2079; CHECK-NEXT:    sxth r0, r2
2080; CHECK-NEXT:    pop {r7, pc}
2081entry:
2082  %cmp8.not = icmp eq i32 %n, 0
2083  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
2084
2085vector.ph:                                        ; preds = %entry
  ; n.vec = n rounded up to the next multiple of 16 (the vector trip limit).
2086  %n.rnd.up = add i32 %n, 15
2087  %n.vec = and i32 %n.rnd.up, -16
2088  br label %vector.body
2089
2090vector.body:                                      ; preds = %vector.body, %vector.ph
2091  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2092  %vec.phi = phi i16 [ 0, %vector.ph ], [ %5, %vector.body ]
  ; Predicate for this iteration: lane i is active while index + i < n.
2093  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2094  %0 = getelementptr inbounds i8, i8* %x, i32 %index
2095  %1 = bitcast i8* %0 to <16 x i8>*
2096  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  ; Widen to i16 first so the reduction is carried out at 16-bit precision.
2097  %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
  ; Zero the inactive lanes before the horizontal add.
2098  %3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer
2099  %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3)
2100  %5 = add i16 %4, %vec.phi
2101  %index.next = add i32 %index, 16
2102  %6 = icmp eq i32 %index.next, %n.vec
2103  br i1 %6, label %for.cond.cleanup, label %vector.body
2104
2105for.cond.cleanup:                                 ; preds = %vector.body, %entry
2106  %s.0.lcssa = phi i16 [ 0, %entry ], [ %5, %vector.body ]
2107  ret i16 %s.0.lcssa
2108}
2109
; mla16i16: dot product of the i8 arrays x[0..n) and y[0..n); both operands are
; zero-extended to i16 before the multiply, and the sum is accumulated in a
; scalar i16 phi and returned as a signext i16. Returns 0 when n == 0. Sixteen
; lanes per iteration, predicated by llvm.get.active.lane.mask.
; The expected assembly is a tail-predicated dlstp.8/letp loop around
; vmlava.u8, with an sxth.w on the result to satisfy the signext return.
2110define signext i16 @mla16i16(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
2111; CHECK-LABEL: mla16i16:
2112; CHECK:       @ %bb.0: @ %entry
2113; CHECK-NEXT:    .save {r7, lr}
2114; CHECK-NEXT:    push {r7, lr}
2115; CHECK-NEXT:    cbz r2, .LBB26_4
2116; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2117; CHECK-NEXT:    mov.w r12, #0
2118; CHECK-NEXT:    dlstp.8 lr, r2
2119; CHECK-NEXT:  .LBB26_2: @ %vector.body
2120; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2121; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
2122; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
2123; CHECK-NEXT:    vmlava.u8 r12, q1, q0
2124; CHECK-NEXT:    letp lr, .LBB26_2
2125; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
2126; CHECK-NEXT:    sxth.w r0, r12
2127; CHECK-NEXT:    pop {r7, pc}
2128; CHECK-NEXT:  .LBB26_4:
2129; CHECK-NEXT:    mov.w r12, #0
2130; CHECK-NEXT:    sxth.w r0, r12
2131; CHECK-NEXT:    pop {r7, pc}
2132entry:
2133  %cmp13.not = icmp eq i32 %n, 0
2134  br i1 %cmp13.not, label %for.cond.cleanup, label %vector.ph
2135
2136vector.ph:                                        ; preds = %entry
  ; n.vec = n rounded up to the next multiple of 16 (the vector trip limit).
2137  %n.rnd.up = add i32 %n, 15
2138  %n.vec = and i32 %n.rnd.up, -16
2139  br label %vector.body
2140
2141vector.body:                                      ; preds = %vector.body, %vector.ph
2142  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2143  %vec.phi = phi i16 [ 0, %vector.ph ], [ %9, %vector.body ]
  ; Predicate for this iteration: lane i is active while index + i < n.
2144  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2145  %0 = getelementptr inbounds i8, i8* %x, i32 %index
2146  %1 = bitcast i8* %0 to <16 x i8>*
2147  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2148  %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
2149  %3 = getelementptr inbounds i8, i8* %y, i32 %index
2150  %4 = bitcast i8* %3 to <16 x i8>*
2151  %wide.masked.load18 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2152  %5 = zext <16 x i8> %wide.masked.load18 to <16 x i16>
  ; Multiply at i16 width, after both operands have been zero-extended.
2153  %6 = mul nuw <16 x i16> %5, %2
  ; Zero the products of inactive lanes before the horizontal add.
2154  %7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer
2155  %8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7)
2156  %9 = add i16 %8, %vec.phi
2157  %index.next = add i32 %index, 16
2158  %10 = icmp eq i32 %index.next, %n.vec
2159  br i1 %10, label %for.cond.cleanup, label %vector.body
2160
2161for.cond.cleanup:                                 ; preds = %vector.body, %entry
2162  %s.0.lcssa = phi i16 [ 0, %entry ], [ %9, %vector.body ]
2163  ret i16 %s.0.lcssa
2164}
2165
; add16i8: masked i8 add-reduction over %x, 16 lanes per loop iteration.
; Each iteration reduces one <16 x i8> vector with llvm.vector.reduce.add and
; adds the result to an i8 accumulator; the function returns that accumulator
; (zeroext), or 0 when %n == 0. The autogenerated assertions below expect a
; dlstp.8/letp loop built from vldrb.u8 and vaddva.u8, followed by uxtb.
2166define zeroext i8 @add16i8(i8* noalias nocapture readonly %x, i32 %n) {
2167; CHECK-LABEL: add16i8:
2168; CHECK:       @ %bb.0: @ %entry
2169; CHECK-NEXT:    .save {r7, lr}
2170; CHECK-NEXT:    push {r7, lr}
2171; CHECK-NEXT:    cbz r1, .LBB27_4
2172; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2173; CHECK-NEXT:    movs r2, #0
2174; CHECK-NEXT:    dlstp.8 lr, r1
2175; CHECK-NEXT:  .LBB27_2: @ %vector.body
2176; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2177; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
2178; CHECK-NEXT:    vaddva.u8 r2, q0
2179; CHECK-NEXT:    letp lr, .LBB27_2
2180; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
2181; CHECK-NEXT:    uxtb r0, r2
2182; CHECK-NEXT:    pop {r7, pc}
2183; CHECK-NEXT:  .LBB27_4:
2184; CHECK-NEXT:    movs r2, #0
2185; CHECK-NEXT:    uxtb r0, r2
2186; CHECK-NEXT:    pop {r7, pc}
2187entry:
; Skip the loop entirely when there are no elements.
2188  %cmp7.not = icmp eq i32 %n, 0
2189  br i1 %cmp7.not, label %for.cond.cleanup, label %vector.ph
2190
2191vector.ph:                                        ; preds = %entry
; %n.vec = %n rounded up to a multiple of 16; used as the loop exit bound.
2192  %n.rnd.up = add i32 %n, 15
2193  %n.vec = and i32 %n.rnd.up, -16
2194  br label %vector.body
2195
2196vector.body:                                      ; preds = %vector.body, %vector.ph
2197  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; Running i8 sum carried across iterations.
2198  %vec.phi = phi i8 [ 0, %vector.ph ], [ %4, %vector.body ]
; Per-lane predicate for this iteration, derived from %index and %n.
2199  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2200  %0 = getelementptr inbounds i8, i8* %x, i32 %index
2201  %1 = bitcast i8* %0 to <16 x i8>*
; Masked-off lanes take the undef passthru value.
2202  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
; Force masked-off lanes to zero so they do not contribute to the sum.
2203  %2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer
2204  %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
2205  %4 = add i8 %3, %vec.phi
2206  %index.next = add i32 %index, 16
2207  %5 = icmp eq i32 %index.next, %n.vec
2208  br i1 %5, label %for.cond.cleanup, label %vector.body
2209
2210for.cond.cleanup:                                 ; preds = %vector.body, %entry
; Result is 0 when the loop was skipped, otherwise the final accumulator.
2211  %s.0.lcssa = phi i8 [ 0, %entry ], [ %4, %vector.body ]
2212  ret i8 %s.0.lcssa
2213}
2214
; mla16i8: masked i8 multiply-accumulate reduction over %x and %y, 16 lanes
; per loop iteration. Each iteration multiplies two <16 x i8> vectors
; lane-wise, reduces the products with llvm.vector.reduce.add and adds the
; result to an i8 accumulator; the function returns that accumulator
; (zeroext), or 0 when %n == 0. The autogenerated assertions below expect a
; dlstp.8/letp loop with two vldrb.u8 loads feeding vmlava.u8, then uxtb.w.
2215define zeroext i8 @mla16i8(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
2216; CHECK-LABEL: mla16i8:
2217; CHECK:       @ %bb.0: @ %entry
2218; CHECK-NEXT:    .save {r7, lr}
2219; CHECK-NEXT:    push {r7, lr}
2220; CHECK-NEXT:    cbz r2, .LBB28_4
2221; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2222; CHECK-NEXT:    mov.w r12, #0
2223; CHECK-NEXT:    dlstp.8 lr, r2
2224; CHECK-NEXT:  .LBB28_2: @ %vector.body
2225; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2226; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
2227; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
2228; CHECK-NEXT:    vmlava.u8 r12, q1, q0
2229; CHECK-NEXT:    letp lr, .LBB28_2
2230; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
2231; CHECK-NEXT:    uxtb.w r0, r12
2232; CHECK-NEXT:    pop {r7, pc}
2233; CHECK-NEXT:  .LBB28_4:
2234; CHECK-NEXT:    mov.w r12, #0
2235; CHECK-NEXT:    uxtb.w r0, r12
2236; CHECK-NEXT:    pop {r7, pc}
2237entry:
; Skip the loop entirely when there are no elements.
2238  %cmp10.not = icmp eq i32 %n, 0
2239  br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph
2240
2241vector.ph:                                        ; preds = %entry
; %n.vec = %n rounded up to a multiple of 16; used as the loop exit bound.
2242  %n.rnd.up = add i32 %n, 15
2243  %n.vec = and i32 %n.rnd.up, -16
2244  br label %vector.body
2245
2246vector.body:                                      ; preds = %vector.body, %vector.ph
2247  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; Running i8 sum of products carried across iterations.
2248  %vec.phi = phi i8 [ 0, %vector.ph ], [ %7, %vector.body ]
; Per-lane predicate shared by both loads and the select below.
2249  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2250  %0 = getelementptr inbounds i8, i8* %x, i32 %index
2251  %1 = bitcast i8* %0 to <16 x i8>*
2252  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2253  %2 = getelementptr inbounds i8, i8* %y, i32 %index
2254  %3 = bitcast i8* %2 to <16 x i8>*
2255  %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
; The multiply is done directly at i8 width (no extension before the mul).
2256  %4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
; Force masked-off lanes to zero so they do not contribute to the sum.
2257  %5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer
2258  %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
2259  %7 = add i8 %6, %vec.phi
2260  %index.next = add i32 %index, 16
2261  %8 = icmp eq i32 %index.next, %n.vec
2262  br i1 %8, label %for.cond.cleanup, label %vector.body
2263
2264for.cond.cleanup:                                 ; preds = %vector.body, %entry
; Result is 0 when the loop was skipped, otherwise the final accumulator.
2265  %s.0.lcssa = phi i8 [ 0, %entry ], [ %7, %vector.body ]
2266  ret i8 %s.0.lcssa
2267}
2268
; add4i64: masked widening add-reduction over %x, 4 lanes per loop iteration.
; Each iteration sign-extends a <4 x i32> vector to <4 x i64>, reduces it with
; llvm.vector.reduce.add and adds the result to an i64 accumulator; the
; function returns that accumulator, or 0 when %n == 0. The autogenerated
; assertions below expect a dlstp.32/letp loop using vldrw.u32 and
; vaddlva.s32 with the 64-bit accumulator held in the r2/r3 pair.
2269define i64 @add4i64(i32* noalias nocapture readonly %x, i32 %n) {
2270; CHECK-LABEL: add4i64:
2271; CHECK:       @ %bb.0: @ %entry
2272; CHECK-NEXT:    .save {r7, lr}
2273; CHECK-NEXT:    push {r7, lr}
2274; CHECK-NEXT:    cbz r1, .LBB29_3
2275; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2276; CHECK-NEXT:    movs r2, #0
2277; CHECK-NEXT:    mov r3, r2
2278; CHECK-NEXT:    dlstp.32 lr, r1
2279; CHECK-NEXT:  .LBB29_2: @ %vector.body
2280; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2281; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
2282; CHECK-NEXT:    vaddlva.s32 r2, r3, q0
2283; CHECK-NEXT:    letp lr, .LBB29_2
2284; CHECK-NEXT:    b .LBB29_4
2285; CHECK-NEXT:  .LBB29_3:
2286; CHECK-NEXT:    movs r2, #0
2287; CHECK-NEXT:    mov r3, r2
2288; CHECK-NEXT:  .LBB29_4: @ %for.cond.cleanup
2289; CHECK-NEXT:    mov r0, r2
2290; CHECK-NEXT:    mov r1, r3
2291; CHECK-NEXT:    pop {r7, pc}
2292entry:
; Skip the loop entirely when there are no elements.
2293  %cmp6.not = icmp eq i32 %n, 0
2294  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
2295
2296vector.ph:                                        ; preds = %entry
; %n.vec = %n rounded up to a multiple of 4; used as the loop exit bound.
2297  %n.rnd.up = add i32 %n, 3
2298  %n.vec = and i32 %n.rnd.up, -4
2299  br label %vector.body
2300
2301vector.body:                                      ; preds = %vector.body, %vector.ph
2302  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; Running i64 sum carried across iterations.
2303  %vec.phi = phi i64 [ 0, %vector.ph ], [ %5, %vector.body ]
; Per-lane predicate for this iteration, derived from %index and %n.
2304  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
2305  %0 = getelementptr inbounds i32, i32* %x, i32 %index
2306  %1 = bitcast i32* %0 to <4 x i32>*
2307  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
; Widen to i64 before reducing so the sum is accumulated at 64 bits.
2308  %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
; Force masked-off lanes to zero so they do not contribute to the sum.
2309  %3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer
2310  %4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
2311  %5 = add i64 %4, %vec.phi
2312  %index.next = add i32 %index, 4
2313  %6 = icmp eq i32 %index.next, %n.vec
2314  br i1 %6, label %for.cond.cleanup, label %vector.body
2315
2316for.cond.cleanup:                                 ; preds = %vector.body, %entry
; Result is 0 when the loop was skipped, otherwise the final accumulator.
2317  %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %vector.body ]
2318  ret i64 %s.0.lcssa
2319}
2320
; mla4i64: masked widening multiply-accumulate reduction over %x and %y,
; 4 lanes per loop iteration. Each iteration sign-extends two <4 x i32>
; vectors to <4 x i64>, multiplies them lane-wise, reduces the products with
; llvm.vector.reduce.add and adds the result to an i64 accumulator; the
; function returns that accumulator, or 0 when %n == 0. The autogenerated
; assertions below expect a dlstp.32/letp loop with two vldrw.u32 loads
; feeding vmlalva.s32, with the 64-bit accumulator held in r12/r3.
2321define i64 @mla4i64(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
2322; CHECK-LABEL: mla4i64:
2323; CHECK:       @ %bb.0: @ %entry
2324; CHECK-NEXT:    .save {r7, lr}
2325; CHECK-NEXT:    push {r7, lr}
2326; CHECK-NEXT:    cbz r2, .LBB30_3
2327; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2328; CHECK-NEXT:    mov.w r12, #0
2329; CHECK-NEXT:    mov r3, r12
2330; CHECK-NEXT:    dlstp.32 lr, r2
2331; CHECK-NEXT:  .LBB30_2: @ %vector.body
2332; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2333; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
2334; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
2335; CHECK-NEXT:    vmlalva.s32 r12, r3, q1, q0
2336; CHECK-NEXT:    letp lr, .LBB30_2
2337; CHECK-NEXT:    b .LBB30_4
2338; CHECK-NEXT:  .LBB30_3:
2339; CHECK-NEXT:    mov.w r12, #0
2340; CHECK-NEXT:    mov r3, r12
2341; CHECK-NEXT:  .LBB30_4: @ %for.cond.cleanup
2342; CHECK-NEXT:    mov r0, r12
2343; CHECK-NEXT:    mov r1, r3
2344; CHECK-NEXT:    pop {r7, pc}
2345entry:
; Skip the loop entirely when there are no elements.
2346  %cmp9.not = icmp eq i32 %n, 0
2347  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
2348
2349vector.ph:                                        ; preds = %entry
; %n.vec = %n rounded up to a multiple of 4; used as the loop exit bound.
2350  %n.rnd.up = add i32 %n, 3
2351  %n.vec = and i32 %n.rnd.up, -4
2352  br label %vector.body
2353
2354vector.body:                                      ; preds = %vector.body, %vector.ph
2355  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; Running i64 sum of products carried across iterations.
2356  %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
; Per-lane predicate shared by both loads and the select below.
2357  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
2358  %0 = getelementptr inbounds i32, i32* %x, i32 %index
2359  %1 = bitcast i32* %0 to <4 x i32>*
2360  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
2361  %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
2362  %3 = getelementptr inbounds i32, i32* %y, i32 %index
2363  %4 = bitcast i32* %3 to <4 x i32>*
2364  %wide.masked.load14 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
2365  %5 = sext <4 x i32> %wide.masked.load14 to <4 x i64>
; Both operands are sign-extended first, so the multiply happens at i64.
2366  %6 = mul nsw <4 x i64> %5, %2
; Force masked-off lanes to zero so they do not contribute to the sum.
2367  %7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer
2368  %8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7)
2369  %9 = add i64 %8, %vec.phi
2370  %index.next = add i32 %index, 4
2371  %10 = icmp eq i32 %index.next, %n.vec
2372  br i1 %10, label %for.cond.cleanup, label %vector.body
2373
2374for.cond.cleanup:                                 ; preds = %vector.body, %entry
; Result is 0 when the loop was skipped, otherwise the final accumulator.
2375  %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
2376  ret i64 %s.0.lcssa
2377}
2378
; mla8i64: masked widening multiply-accumulate reduction over %x and %y,
; 8 lanes per loop iteration. Each iteration sign-extends two <8 x i16>
; vectors to <8 x i64>, multiplies them lane-wise, reduces the products with
; llvm.vector.reduce.add and adds the result to an i64 accumulator; the
; function returns that accumulator, or 0 when %n == 0. The autogenerated
; assertions below expect a dlstp.16/letp loop with two vldrh.u16 loads
; feeding vmlalva.s16, with the 64-bit accumulator held in r12/r3.
2379define i64 @mla8i64(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
2380; CHECK-LABEL: mla8i64:
2381; CHECK:       @ %bb.0: @ %entry
2382; CHECK-NEXT:    .save {r7, lr}
2383; CHECK-NEXT:    push {r7, lr}
2384; CHECK-NEXT:    cbz r2, .LBB31_3
2385; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2386; CHECK-NEXT:    mov.w r12, #0
2387; CHECK-NEXT:    mov r3, r12
2388; CHECK-NEXT:    dlstp.16 lr, r2
2389; CHECK-NEXT:  .LBB31_2: @ %vector.body
2390; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2391; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
2392; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
2393; CHECK-NEXT:    vmlalva.s16 r12, r3, q1, q0
2394; CHECK-NEXT:    letp lr, .LBB31_2
2395; CHECK-NEXT:    b .LBB31_4
2396; CHECK-NEXT:  .LBB31_3:
2397; CHECK-NEXT:    mov.w r12, #0
2398; CHECK-NEXT:    mov r3, r12
2399; CHECK-NEXT:  .LBB31_4: @ %for.cond.cleanup
2400; CHECK-NEXT:    mov r0, r12
2401; CHECK-NEXT:    mov r1, r3
2402; CHECK-NEXT:    pop {r7, pc}
2403entry:
; Skip the loop entirely when there are no elements.
2404  %cmp9.not = icmp eq i32 %n, 0
2405  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
2406
2407vector.ph:                                        ; preds = %entry
; %n.vec = %n rounded up to a multiple of 8; used as the loop exit bound.
2408  %n.rnd.up = add i32 %n, 7
2409  %n.vec = and i32 %n.rnd.up, -8
2410  br label %vector.body
2411
2412vector.body:                                      ; preds = %vector.body, %vector.ph
2413  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; Running i64 sum of products carried across iterations.
2414  %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
; Per-lane predicate shared by both loads and the select below.
2415  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
2416  %0 = getelementptr inbounds i16, i16* %x, i32 %index
2417  %1 = bitcast i16* %0 to <8 x i16>*
2418  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2419  %2 = sext <8 x i16> %wide.masked.load to <8 x i64>
2420  %3 = getelementptr inbounds i16, i16* %y, i32 %index
2421  %4 = bitcast i16* %3 to <8 x i16>*
2422  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2423  %5 = sext <8 x i16> %wide.masked.load14 to <8 x i64>
; Both operands are sign-extended first, so the multiply happens at i64.
2424  %6 = mul nsw <8 x i64> %5, %2
; Force masked-off lanes to zero so they do not contribute to the sum.
2425  %7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer
2426  %8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7)
2427  %9 = add i64 %8, %vec.phi
2428  %index.next = add i32 %index, 8
2429  %10 = icmp eq i32 %index.next, %n.vec
2430  br i1 %10, label %for.cond.cleanup, label %vector.body
2431
2432for.cond.cleanup:                                 ; preds = %vector.body, %entry
; Result is 0 when the loop was skipped, otherwise the final accumulator.
2433  %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
2434  ret i64 %s.0.lcssa
2435}
2436
; Intrinsic declarations for the masked (tail-predicated) loops: lane-mask
; generation, masked loads and integer add reductions at the vector widths
; used by the tests. These declarations reference attribute groups #1-#3.
2437declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
2438declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
2439declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
2440declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #2
2441declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3
2442declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1
2443declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #2
2444declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3
2445declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3
2446declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3
2447declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3
2448declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3
2449declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3
2450
; <4 x i32> / <4 x float> reduction intrinsics (integer add/mul/and/or/xor,
; signed and unsigned min/max, float fadd/fmul/fmin/fmax), declared without
; attribute groups. NOTE(review): presumably used by tests earlier in the
; file; confirm before removing any of them.
2451declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
2452declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
2453declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
2454declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
2455declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
2456declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
2457declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
2458declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
2459declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
2460declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
2461declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
2462declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
2463declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
2464