1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve %s -o - | FileCheck %s
3
; Signed i8 absolute difference written out in IR: both operands are
; sign-extended to i16, subtracted, and either the difference or its negation
; is selected (difference >= 0 keeps the difference) before truncating to i8.
; The assertions below expect the whole pattern to become one vabd.s8.
define arm_aapcs_vfpcc <16 x i8> @vabd_s8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vabd_s8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.s8 q0, q0, q1
; CHECK-NEXT:    bx lr
  %lhs.wide = sext <16 x i8> %src1 to <16 x i16>
  %rhs.wide = sext <16 x i8> %src2 to <16 x i16>
  %diff = sub <16 x i16> %lhs.wide, %rhs.wide
  %diff.neg = sub <16 x i16> zeroinitializer, %diff
  %nonneg = icmp sge <16 x i16> %diff, zeroinitializer
  %absdiff = select <16 x i1> %nonneg, <16 x i16> %diff, <16 x i16> %diff.neg
  %narrow = trunc <16 x i16> %absdiff to <16 x i8>
  ret <16 x i8> %narrow
}
18
; Signed i16 absolute difference: operands are sign-extended to i32,
; subtracted, and the difference or its negation is selected on
; "difference >= 0" before truncating back to i16.
; The assertions below expect a single vabd.s16.
define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vabd_s16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.s16 q0, q0, q1
; CHECK-NEXT:    bx lr
  %lhs.wide = sext <8 x i16> %src1 to <8 x i32>
  %rhs.wide = sext <8 x i16> %src2 to <8 x i32>
  %diff = sub <8 x i32> %lhs.wide, %rhs.wide
  %diff.neg = sub <8 x i32> zeroinitializer, %diff
  %nonneg = icmp sge <8 x i32> %diff, zeroinitializer
  %absdiff = select <8 x i1> %nonneg, <8 x i32> %diff, <8 x i32> %diff.neg
  %narrow = trunc <8 x i32> %absdiff to <8 x i16>
  ret <8 x i16> %narrow
}
33
; Signed i32 absolute difference: operands are sign-extended to i64,
; subtracted, and the difference or its negation is selected on
; "difference >= 0" before truncating back to i32.
; The assertions below expect a single vabd.s32.
define arm_aapcs_vfpcc <4 x i32> @vabd_s32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vabd_s32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.s32 q0, q0, q1
; CHECK-NEXT:    bx lr
  %lhs.wide = sext <4 x i32> %src1 to <4 x i64>
  %rhs.wide = sext <4 x i32> %src2 to <4 x i64>
  %diff = sub <4 x i64> %lhs.wide, %rhs.wide
  %diff.neg = sub <4 x i64> zeroinitializer, %diff
  %nonneg = icmp sge <4 x i64> %diff, zeroinitializer
  %absdiff = select <4 x i1> %nonneg, <4 x i64> %diff, <4 x i64> %diff.neg
  %narrow = trunc <4 x i64> %absdiff to <4 x i32>
  ret <4 x i32> %narrow
}
48
; Unsigned i8 absolute difference: operands are zero-extended to i16,
; subtracted, and the difference or its negation is selected with a signed
; "difference >= 0" compare before truncating back to i8.
; The assertions below expect a single vabd.u8.
define arm_aapcs_vfpcc <16 x i8> @vabd_u8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vabd_u8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.u8 q0, q0, q1
; CHECK-NEXT:    bx lr
  %lhs.wide = zext <16 x i8> %src1 to <16 x i16>
  %rhs.wide = zext <16 x i8> %src2 to <16 x i16>
  %diff = sub <16 x i16> %lhs.wide, %rhs.wide
  %diff.neg = sub <16 x i16> zeroinitializer, %diff
  %nonneg = icmp sge <16 x i16> %diff, zeroinitializer
  %absdiff = select <16 x i1> %nonneg, <16 x i16> %diff, <16 x i16> %diff.neg
  %narrow = trunc <16 x i16> %absdiff to <16 x i8>
  ret <16 x i8> %narrow
}
63
; Unsigned i16 absolute difference: operands are zero-extended to i32,
; subtracted, and the difference or its negation is selected with a signed
; "difference >= 0" compare before truncating back to i16.
; The assertions below expect a single vabd.u16.
define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vabd_u16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
  %lhs.wide = zext <8 x i16> %src1 to <8 x i32>
  %rhs.wide = zext <8 x i16> %src2 to <8 x i32>
  %diff = sub <8 x i32> %lhs.wide, %rhs.wide
  %diff.neg = sub <8 x i32> zeroinitializer, %diff
  %nonneg = icmp sge <8 x i32> %diff, zeroinitializer
  %absdiff = select <8 x i1> %nonneg, <8 x i32> %diff, <8 x i32> %diff.neg
  %narrow = trunc <8 x i32> %absdiff to <8 x i16>
  ret <8 x i16> %narrow
}
78
; Unsigned i32 absolute difference: operands are zero-extended to i64,
; subtracted, and the difference or its negation is selected with a signed
; "difference >= 0" compare before truncating back to i32.
; The assertions below expect a single vabd.u32.
define arm_aapcs_vfpcc <4 x i32> @vabd_u32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vabd_u32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
  %lhs.wide = zext <4 x i32> %src1 to <4 x i64>
  %rhs.wide = zext <4 x i32> %src2 to <4 x i64>
  %diff = sub <4 x i64> %lhs.wide, %rhs.wide
  %diff.neg = sub <4 x i64> zeroinitializer, %diff
  %nonneg = icmp sge <4 x i64> %diff, zeroinitializer
  %absdiff = select <4 x i1> %nonneg, <4 x i64> %diff, <4 x i64> %diff.neg
  %narrow = trunc <4 x i64> %absdiff to <4 x i32>
  ret <4 x i32> %narrow
}
93
; Vectorized loop form of the signed i8 absolute difference: each iteration
; loads 16 bytes from %x and %y, sign-extends them to i32, computes
; |x - y| via compare/negate/select, truncates to i8 and stores to %z.
; The index advances by 16 until it equals 1024; %n is not referenced.
; The assertions below expect a 64-iteration low-overhead loop around vabd.s8.
define void @vabd_loop_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB6_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r0], #16
; CHECK-NEXT:    vabd.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB6_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %vector.body ]
  %x.elt = getelementptr inbounds i8, i8* %x, i32 %idx
  %x.vptr = bitcast i8* %x.elt to <16 x i8>*
  %x.vec = load <16 x i8>, <16 x i8>* %x.vptr, align 1
  %x.wide = sext <16 x i8> %x.vec to <16 x i32>
  %y.elt = getelementptr inbounds i8, i8* %y, i32 %idx
  %y.vptr = bitcast i8* %y.elt to <16 x i8>*
  %y.vec = load <16 x i8>, <16 x i8>* %y.vptr, align 1
  %y.wide = sext <16 x i8> %y.vec to <16 x i32>
  %diff = sub nsw <16 x i32> %x.wide, %y.wide
  %isneg = icmp slt <16 x i32> %diff, zeroinitializer
  %diff.neg = sub nsw <16 x i32> zeroinitializer, %diff
  %absdiff = select <16 x i1> %isneg, <16 x i32> %diff.neg, <16 x i32> %diff
  %narrow = trunc <16 x i32> %absdiff to <16 x i8>
  %z.elt = getelementptr inbounds i8, i8* %z, i32 %idx
  %z.vptr = bitcast i8* %z.elt to <16 x i8>*
  store <16 x i8> %narrow, <16 x i8>* %z.vptr, align 1
  %idx.next = add i32 %idx, 16
  %done = icmp eq i32 %idx.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
137
; Vectorized loop form of the signed i16 absolute difference: each iteration
; loads 8 halfwords from %x and %y, sign-extends them to i32, computes
; |x - y| via compare/negate/select, truncates to i16 and stores to %z.
; The index advances by 8 until it equals 1024; %n is not referenced.
; The assertions below expect a 128-iteration low-overhead loop around vabd.s16.
define void @vabd_loop_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    vabd.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %vector.body ]
  %x.elt = getelementptr inbounds i16, i16* %x, i32 %idx
  %x.vptr = bitcast i16* %x.elt to <8 x i16>*
  %x.vec = load <8 x i16>, <8 x i16>* %x.vptr, align 2
  %x.wide = sext <8 x i16> %x.vec to <8 x i32>
  %y.elt = getelementptr inbounds i16, i16* %y, i32 %idx
  %y.vptr = bitcast i16* %y.elt to <8 x i16>*
  %y.vec = load <8 x i16>, <8 x i16>* %y.vptr, align 2
  %y.wide = sext <8 x i16> %y.vec to <8 x i32>
  %diff = sub nsw <8 x i32> %x.wide, %y.wide
  %isneg = icmp slt <8 x i32> %diff, zeroinitializer
  %diff.neg = sub nsw <8 x i32> zeroinitializer, %diff
  %absdiff = select <8 x i1> %isneg, <8 x i32> %diff.neg, <8 x i32> %diff
  %narrow = trunc <8 x i32> %absdiff to <8 x i16>
  %z.elt = getelementptr inbounds i16, i16* %z, i32 %idx
  %z.vptr = bitcast i16* %z.elt to <8 x i16>*
  store <8 x i16> %narrow, <8 x i16>* %z.vptr, align 2
  %idx.next = add i32 %idx, 8
  %done = icmp eq i32 %idx.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
181
; Vectorized loop form of the signed i32 absolute difference: each iteration
; loads 4 words from %x and %y, sign-extends them to i64 and subtracts.
; The sign test is done on the i64 difference, while the negation and select
; operate on the difference truncated to i32; the result is stored to %z.
; The index advances by 4 until it equals 1024; %n is not referenced.
; The assertions below record the currently generated code, which has no vabd
; instruction: the i64 subtract is done in scalar registers and a predicated
; vsubt.i32 performs the negation.
define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d9}
; CHECK-NEXT:    vpush {d9}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    mov.w r12, #1
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov r5, s8
; CHECK-NEXT:    vmov.f32 s14, s5
; CHECK-NEXT:    vmov.f32 s18, s9
; CHECK-NEXT:    vmov.f32 s4, s6
; CHECK-NEXT:    vmov.f32 s6, s7
; CHECK-NEXT:    vmov.f32 s8, s10
; CHECK-NEXT:    vmov r7, s18
; CHECK-NEXT:    asrs r4, r3, #31
; CHECK-NEXT:    subs.w r8, r3, r5
; CHECK-NEXT:    sbc.w r4, r4, r5, asr #31
; CHECK-NEXT:    asrs r5, r4, #31
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    bfi r4, r5, #0, #4
; CHECK-NEXT:    vmov r5, s14
; CHECK-NEXT:    subs.w r9, r5, r7
; CHECK-NEXT:    asr.w r6, r5, #31
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    sbc.w r6, r6, r7, asr #31
; CHECK-NEXT:    and.w r6, r12, r6, asr #31
; CHECK-NEXT:    rsbs r6, r6, #0
; CHECK-NEXT:    bfi r4, r6, #4, #4
; CHECK-NEXT:    vmov r6, s6
; CHECK-NEXT:    vmov.f32 s6, s11
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    asrs r7, r6, #31
; CHECK-NEXT:    subs.w r10, r6, r3
; CHECK-NEXT:    asr.w r6, r5, #31
; CHECK-NEXT:    sbc.w r3, r7, r3, asr #31
; CHECK-NEXT:    vmov r7, s8
; CHECK-NEXT:    asr.w r11, r3, #31
; CHECK-NEXT:    and.w r3, r12, r3, asr #31
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    subs r5, r5, r7
; CHECK-NEXT:    sbc.w r6, r6, r7, asr #31
; CHECK-NEXT:    asrs r6, r6, #31
; CHECK-NEXT:    vmov q1[2], q1[0], r6, r11
; CHECK-NEXT:    vmov r6, s4
; CHECK-NEXT:    vmov q1[2], q1[0], r8, r5
; CHECK-NEXT:    vmov q1[3], q1[1], r9, r10
; CHECK-NEXT:    and r6, r6, #1
; CHECK-NEXT:    rsbs r6, r6, #0
; CHECK-NEXT:    bfi r4, r6, #8, #4
; CHECK-NEXT:    bfi r4, r3, #12, #4
; CHECK-NEXT:    vmsr p0, r4
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vsubt.i32 q1, q0, q1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    le lr, .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    vpop {d9}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %vector.body ]
  %x.elt = getelementptr inbounds i32, i32* %x, i32 %idx
  %x.vptr = bitcast i32* %x.elt to <4 x i32>*
  %x.vec = load <4 x i32>, <4 x i32>* %x.vptr, align 4
  %x.wide = sext <4 x i32> %x.vec to <4 x i64>
  %y.elt = getelementptr inbounds i32, i32* %y, i32 %idx
  %y.vptr = bitcast i32* %y.elt to <4 x i32>*
  %y.vec = load <4 x i32>, <4 x i32>* %y.vptr, align 4
  %y.wide = sext <4 x i32> %y.vec to <4 x i64>
  %diff = sub nsw <4 x i64> %x.wide, %y.wide
  %isneg = icmp slt <4 x i64> %diff, zeroinitializer
  %diff.lo = trunc <4 x i64> %diff to <4 x i32>
  %diff.neg = sub <4 x i32> zeroinitializer, %diff.lo
  %absdiff = select <4 x i1> %isneg, <4 x i32> %diff.neg, <4 x i32> %diff.lo
  %z.elt = getelementptr inbounds i32, i32* %z, i32 %idx
  %z.vptr = bitcast i32* %z.elt to <4 x i32>*
  store <4 x i32> %absdiff, <4 x i32>* %z.vptr, align 4
  %idx.next = add i32 %idx, 4
  %done = icmp eq i32 %idx.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
279
; Vectorized loop form of the unsigned i8 absolute difference: each iteration
; loads 16 bytes from %x and %y, zero-extends them to i32, computes
; |x - y| via compare/negate/select, truncates to i8 and stores to %z.
; The index advances by 16 until it equals 1024; %n is not referenced.
; The assertions below expect a 64-iteration low-overhead loop around vabd.u8.
define void @vabd_loop_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB9_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r0], #16
; CHECK-NEXT:    vabd.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB9_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %vector.body ]
  %x.elt = getelementptr inbounds i8, i8* %x, i32 %idx
  %x.vptr = bitcast i8* %x.elt to <16 x i8>*
  %x.vec = load <16 x i8>, <16 x i8>* %x.vptr, align 1
  %x.wide = zext <16 x i8> %x.vec to <16 x i32>
  %y.elt = getelementptr inbounds i8, i8* %y, i32 %idx
  %y.vptr = bitcast i8* %y.elt to <16 x i8>*
  %y.vec = load <16 x i8>, <16 x i8>* %y.vptr, align 1
  %y.wide = zext <16 x i8> %y.vec to <16 x i32>
  %diff = sub nsw <16 x i32> %x.wide, %y.wide
  %isneg = icmp slt <16 x i32> %diff, zeroinitializer
  %diff.neg = sub nsw <16 x i32> zeroinitializer, %diff
  %absdiff = select <16 x i1> %isneg, <16 x i32> %diff.neg, <16 x i32> %diff
  %narrow = trunc <16 x i32> %absdiff to <16 x i8>
  %z.elt = getelementptr inbounds i8, i8* %z, i32 %idx
  %z.vptr = bitcast i8* %z.elt to <16 x i8>*
  store <16 x i8> %narrow, <16 x i8>* %z.vptr, align 1
  %idx.next = add i32 %idx, 16
  %done = icmp eq i32 %idx.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
323
; Vectorized loop form of the unsigned i16 absolute difference: each iteration
; loads 8 halfwords from %x and %y, zero-extends them to i32, computes
; |x - y| via compare/negate/select, truncates to i16 and stores to %z.
; The index advances by 8 until it equals 1024; %n is not referenced.
; The assertions below expect a 128-iteration low-overhead loop around vabd.u16.
define void @vabd_loop_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB10_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    vabd.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB10_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %vector.body ]
  %x.elt = getelementptr inbounds i16, i16* %x, i32 %idx
  %x.vptr = bitcast i16* %x.elt to <8 x i16>*
  %x.vec = load <8 x i16>, <8 x i16>* %x.vptr, align 2
  %x.wide = zext <8 x i16> %x.vec to <8 x i32>
  %y.elt = getelementptr inbounds i16, i16* %y, i32 %idx
  %y.vptr = bitcast i16* %y.elt to <8 x i16>*
  %y.vec = load <8 x i16>, <8 x i16>* %y.vptr, align 2
  %y.wide = zext <8 x i16> %y.vec to <8 x i32>
  %diff = sub nsw <8 x i32> %x.wide, %y.wide
  %isneg = icmp slt <8 x i32> %diff, zeroinitializer
  %diff.neg = sub nsw <8 x i32> zeroinitializer, %diff
  %absdiff = select <8 x i1> %isneg, <8 x i32> %diff.neg, <8 x i32> %diff
  %narrow = trunc <8 x i32> %absdiff to <8 x i16>
  %z.elt = getelementptr inbounds i16, i16* %z, i32 %idx
  %z.vptr = bitcast i16* %z.elt to <8 x i16>*
  store <8 x i16> %narrow, <8 x i16>* %z.vptr, align 2
  %idx.next = add i32 %idx, 8
  %done = icmp eq i32 %idx.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
367
; Vectorized loop form of the unsigned i32 absolute difference: each iteration
; loads 4 words from %x and %y, zero-extends them to i64 and subtracts.
; The sign test is done on the i64 difference, while the negation and select
; operate on the difference truncated to i32; the result is stored to %z.
; The index advances by 4 until it equals 1024; %n is not referenced.
; The assertions below record the currently generated code, which has no vabd
; instruction: the i64 subtract is done in scalar registers and a predicated
; vsubt.i32 performs the negation.
define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:  .LBB11_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmov.f32 s12, s8
; CHECK-NEXT:    vmov.f32 s14, s9
; CHECK-NEXT:    vand q4, q3, q0
; CHECK-NEXT:    vldrw.u32 q3, [r0], #16
; CHECK-NEXT:    vmov r3, r4, d8
; CHECK-NEXT:    vmov.f32 s20, s12
; CHECK-NEXT:    vmov.f32 s22, s13
; CHECK-NEXT:    vand q5, q5, q0
; CHECK-NEXT:    vmov.f32 s8, s10
; CHECK-NEXT:    vmov r5, r6, d10
; CHECK-NEXT:    vmov.f32 s10, s11
; CHECK-NEXT:    vmov.f32 s12, s14
; CHECK-NEXT:    vand q2, q2, q0
; CHECK-NEXT:    vmov.f32 s14, s15
; CHECK-NEXT:    vand q3, q3, q0
; CHECK-NEXT:    subs.w r8, r5, r3
; CHECK-NEXT:    vmov r7, r3, d11
; CHECK-NEXT:    sbc.w r4, r6, r4
; CHECK-NEXT:    asrs r5, r4, #31
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    bfi r4, r5, #0, #4
; CHECK-NEXT:    vmov r5, r6, d9
; CHECK-NEXT:    subs.w r9, r7, r5
; CHECK-NEXT:    mov.w r7, #1
; CHECK-NEXT:    sbcs r3, r6
; CHECK-NEXT:    and.w r3, r7, r3, asr #31
; CHECK-NEXT:    vmov r7, r5, d7
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r4, r3, #4, #4
; CHECK-NEXT:    vmov r3, r6, d5
; CHECK-NEXT:    subs.w r10, r7, r3
; CHECK-NEXT:    vmov r7, r3, d4
; CHECK-NEXT:    sbcs r5, r6
; CHECK-NEXT:    vmov r6, r12, d6
; CHECK-NEXT:    asr.w r11, r5, #31
; CHECK-NEXT:    subs r6, r6, r7
; CHECK-NEXT:    sbc.w r3, r12, r3
; CHECK-NEXT:    asrs r3, r3, #31
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r11
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmov q2[2], q2[0], r8, r6
; CHECK-NEXT:    vmov q2[3], q2[1], r9, r10
; CHECK-NEXT:    and r3, r3, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r4, r3, #8, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    and.w r3, r3, r5, asr #31
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r4, r3, #12, #4
; CHECK-NEXT:    vmsr p0, r4
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vsubt.i32 q2, q1, q2
; CHECK-NEXT:    vstrb.8 q2, [r2], #16
; CHECK-NEXT:    le lr, .LBB11_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %idx = phi i32 [ 0, %entry ], [ %idx.next, %vector.body ]
  %x.elt = getelementptr inbounds i32, i32* %x, i32 %idx
  %x.vptr = bitcast i32* %x.elt to <4 x i32>*
  %x.vec = load <4 x i32>, <4 x i32>* %x.vptr, align 4
  %x.wide = zext <4 x i32> %x.vec to <4 x i64>
  %y.elt = getelementptr inbounds i32, i32* %y, i32 %idx
  %y.vptr = bitcast i32* %y.elt to <4 x i32>*
  %y.vec = load <4 x i32>, <4 x i32>* %y.vptr, align 4
  %y.wide = zext <4 x i32> %y.vec to <4 x i64>
  %diff = sub nsw <4 x i64> %x.wide, %y.wide
  %isneg = icmp slt <4 x i64> %diff, zeroinitializer
  %diff.lo = trunc <4 x i64> %diff to <4 x i32>
  %diff.neg = sub <4 x i32> zeroinitializer, %diff.lo
  %absdiff = select <4 x i1> %isneg, <4 x i32> %diff.neg, <4 x i32> %diff.lo
  %z.elt = getelementptr inbounds i32, i32* %z, i32 %idx
  %z.vptr = bitcast i32* %z.elt to <4 x i32>*
  store <4 x i32> %absdiff, <4 x i32>* %z.vptr, align 4
  %idx.next = add i32 %idx, 4
  %done = icmp eq i32 %idx.next, 1024
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
469