; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve %s -o - | FileCheck %s

define arm_aapcs_vfpcc <16 x i8> @vabd_s8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vabd_s8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.s8 q0, q0, q1
; CHECK-NEXT:    bx lr
  %sextsrc1 = sext <16 x i8> %src1 to <16 x i16>
  %sextsrc2 = sext <16 x i8> %src2 to <16 x i16>
  %add1 = sub <16 x i16> %sextsrc1, %sextsrc2
  %add2 = sub <16 x i16> zeroinitializer, %add1
  %c = icmp sge <16 x i16> %add1, zeroinitializer
  %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
  %result = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %result
}

define arm_aapcs_vfpcc <8 x i16> @vabd_s16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vabd_s16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.s16 q0, q0, q1
; CHECK-NEXT:    bx lr
  %sextsrc1 = sext <8 x i16> %src1 to <8 x i32>
  %sextsrc2 = sext <8 x i16> %src2 to <8 x i32>
  %add1 = sub <8 x i32> %sextsrc1, %sextsrc2
  %add2 = sub <8 x i32> zeroinitializer, %add1
  %c = icmp sge <8 x i32> %add1, zeroinitializer
  %s = select <8 x i1> %c, <8 x i32> %add1, <8 x i32> %add2
  %result = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %result
}

define arm_aapcs_vfpcc <4 x i32> @vabd_s32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vabd_s32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.s32 q0, q0, q1
; CHECK-NEXT:    bx lr
  %sextsrc1 = sext <4 x i32> %src1 to <4 x i64>
  %sextsrc2 = sext <4 x i32> %src2 to <4 x i64>
  %add1 = sub <4 x i64> %sextsrc1, %sextsrc2
  %add2 = sub <4 x i64> zeroinitializer, %add1
  %c = icmp sge <4 x i64> %add1, zeroinitializer
  %s = select <4 x i1> %c, <4 x i64> %add1, <4 x i64> %add2
  %result = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %result
}

define arm_aapcs_vfpcc <16 x i8> @vabd_u8(<16 x i8> %src1, <16 x i8> %src2) {
; CHECK-LABEL: vabd_u8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.u8 q0, q0, q1
; CHECK-NEXT:    bx lr
  %zextsrc1 = zext <16 x i8> %src1 to <16 x i16>
  %zextsrc2 = zext <16 x i8> %src2 to <16 x i16>
  %add1 = sub <16 x i16> %zextsrc1, %zextsrc2
  %add2 = sub <16 x i16> zeroinitializer, %add1
  %c = icmp sge <16 x i16> %add1, zeroinitializer
  %s = select <16 x i1> %c, <16 x i16> %add1, <16 x i16> %add2
  %result = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %result
}

define arm_aapcs_vfpcc <8 x i16> @vabd_u16(<8 x i16> %src1, <8 x i16> %src2) {
; CHECK-LABEL: vabd_u16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
  %zextsrc1 = zext <8 x i16> %src1 to <8 x i32>
  %zextsrc2 = zext <8 x i16> %src2 to <8 x i32>
  %add1 = sub <8 x i32> %zextsrc1, %zextsrc2
  %add2 = sub <8 x i32> zeroinitializer, %add1
  %c = icmp sge <8 x i32> %add1, zeroinitializer
  %s = select <8 x i1> %c, <8 x i32> %add1, <8 x i32> %add2
  %result = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %result
}

define arm_aapcs_vfpcc <4 x i32> @vabd_u32(<4 x i32> %src1, <4 x i32> %src2) {
; CHECK-LABEL: vabd_u32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vabd.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
  %zextsrc1 = zext <4 x i32> %src1 to <4 x i64>
  %zextsrc2 = zext <4 x i32> %src2 to <4 x i64>
  %add1 = sub <4 x i64> %zextsrc1, %zextsrc2
  %add2 = sub <4 x i64> zeroinitializer, %add1
  %c = icmp sge <4 x i64> %add1, zeroinitializer
  %s = select <4 x i1> %c, <4 x i64> %add1, <4 x i64> %add2
  %result = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %result
}

define void @vabd_loop_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB6_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r0], #16
; CHECK-NEXT:    vabd.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB6_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = sext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load22 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = sext <16 x i8> %wide.load22 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp slt <16 x i32> %6, zeroinitializer
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %8, <16 x i32> %6
  %10 = trunc <16 x i32> %9 to <16 x i8>
  %11 = getelementptr inbounds i8, i8* %z, i32 %index
  %12 = bitcast i8* %11 to <16 x i8>*
  store <16 x i8> %10, <16 x i8>* %12, align 1
  %index.next = add i32 %index, 16
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vabd_loop_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    vabd.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = sext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load22 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = sext <8 x i16> %wide.load22 to <8 x i32>
  %6 = sub nsw <8 x i32> %2, %5
  %7 = icmp slt <8 x i32> %6, zeroinitializer
  %8 = sub nsw <8 x i32> zeroinitializer, %6
  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
  %10 = trunc <8 x i32> %9 to <8 x i16>
  %11 = getelementptr inbounds i16, i16* %z, i32 %index
  %12 = bitcast i16* %11 to <8 x i16>*
  store <8 x i16> %10, <8 x i16>* %12, align 2
  %index.next = add i32 %index, 8
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vabd_loop_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d9}
; CHECK-NEXT:    vpush {d9}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    mov.w r12, #1
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmov r3, s4
; CHECK-NEXT:    vmov r5, s8
; CHECK-NEXT:    vmov.f32 s14, s5
; CHECK-NEXT:    vmov.f32 s18, s9
; CHECK-NEXT:    vmov.f32 s4, s6
; CHECK-NEXT:    vmov.f32 s6, s7
; CHECK-NEXT:    vmov.f32 s8, s10
; CHECK-NEXT:    vmov r7, s18
; CHECK-NEXT:    asrs r4, r3, #31
; CHECK-NEXT:    subs.w r8, r3, r5
; CHECK-NEXT:    sbc.w r4, r4, r5, asr #31
; CHECK-NEXT:    asrs r5, r4, #31
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    bfi r4, r5, #0, #4
; CHECK-NEXT:    vmov r5, s14
; CHECK-NEXT:    subs.w r9, r5, r7
; CHECK-NEXT:    asr.w r6, r5, #31
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    sbc.w r6, r6, r7, asr #31
; CHECK-NEXT:    and.w r6, r12, r6, asr #31
; CHECK-NEXT:    rsbs r6, r6, #0
; CHECK-NEXT:    bfi r4, r6, #4, #4
; CHECK-NEXT:    vmov r6, s6
; CHECK-NEXT:    vmov.f32 s6, s11
; CHECK-NEXT:    vmov r3, s6
; CHECK-NEXT:    asrs r7, r6, #31
; CHECK-NEXT:    subs.w r10, r6, r3
; CHECK-NEXT:    asr.w r6, r5, #31
; CHECK-NEXT:    sbc.w r3, r7, r3, asr #31
; CHECK-NEXT:    vmov r7, s8
; CHECK-NEXT:    asr.w r11, r3, #31
; CHECK-NEXT:    and.w r3, r12, r3, asr #31
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    subs r5, r5, r7
; CHECK-NEXT:    sbc.w r6, r6, r7, asr #31
; CHECK-NEXT:    asrs r6, r6, #31
; CHECK-NEXT:    vmov q1[2], q1[0], r6, r11
; CHECK-NEXT:    vmov r6, s4
; CHECK-NEXT:    vmov q1[2], q1[0], r8, r5
; CHECK-NEXT:    vmov q1[3], q1[1], r9, r10
; CHECK-NEXT:    and r6, r6, #1
; CHECK-NEXT:    rsbs r6, r6, #0
; CHECK-NEXT:    bfi r4, r6, #8, #4
; CHECK-NEXT:    bfi r4, r3, #12, #4
; CHECK-NEXT:    vmsr p0, r4
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vsubt.i32 q1, q0, q1
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    le lr, .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    vpop {d9}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = sext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load23 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = sext <4 x i32> %wide.load23 to <4 x i64>
  %6 = sub nsw <4 x i64> %2, %5
  %7 = icmp slt <4 x i64> %6, zeroinitializer
  %8 = trunc <4 x i64> %6 to <4 x i32>
  %9 = sub <4 x i32> zeroinitializer, %8
  %10 = select <4 x i1> %7, <4 x i32> %9, <4 x i32> %8
  %11 = getelementptr inbounds i32, i32* %z, i32 %index
  %12 = bitcast i32* %11 to <4 x i32>*
  store <4 x i32> %10, <4 x i32>* %12, align 4
  %index.next = add i32 %index, 4
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vabd_loop_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB9_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r0], #16
; CHECK-NEXT:    vabd.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB9_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = zext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load22 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = zext <16 x i8> %wide.load22 to <16 x i32>
  %6 = sub nsw <16 x i32> %2, %5
  %7 = icmp slt <16 x i32> %6, zeroinitializer
  %8 = sub nsw <16 x i32> zeroinitializer, %6
  %9 = select <16 x i1> %7, <16 x i32> %8, <16 x i32> %6
  %10 = trunc <16 x i32> %9 to <16 x i8>
  %11 = getelementptr inbounds i8, i8* %z, i32 %index
  %12 = bitcast i8* %11 to <16 x i8>*
  store <16 x i8> %10, <16 x i8>* %12, align 1
  %index.next = add i32 %index, 16
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vabd_loop_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB10_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r0], #16
; CHECK-NEXT:    vabd.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB10_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = zext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load22 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = zext <8 x i16> %wide.load22 to <8 x i32>
  %6 = sub nsw <8 x i32> %2, %5
  %7 = icmp slt <8 x i32> %6, zeroinitializer
  %8 = sub nsw <8 x i32> zeroinitializer, %6
  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
  %10 = trunc <8 x i32> %9 to <8 x i16>
  %11 = getelementptr inbounds i16, i16* %z, i32 %index
  %12 = bitcast i16* %11 to <8 x i16>*
  store <8 x i16> %10, <8 x i16>* %12, align 2
  %index.next = add i32 %index, 8
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @vabd_loop_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vabd_loop_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:  .LBB11_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmov.f32 s12, s8
; CHECK-NEXT:    vmov.f32 s14, s9
; CHECK-NEXT:    vand q4, q3, q0
; CHECK-NEXT:    vldrw.u32 q3, [r0], #16
; CHECK-NEXT:    vmov r3, r4, d8
; CHECK-NEXT:    vmov.f32 s20, s12
; CHECK-NEXT:    vmov.f32 s22, s13
; CHECK-NEXT:    vand q5, q5, q0
; CHECK-NEXT:    vmov.f32 s8, s10
; CHECK-NEXT:    vmov r5, r6, d10
; CHECK-NEXT:    vmov.f32 s10, s11
; CHECK-NEXT:    vmov.f32 s12, s14
; CHECK-NEXT:    vand q2, q2, q0
; CHECK-NEXT:    vmov.f32 s14, s15
; CHECK-NEXT:    vand q3, q3, q0
; CHECK-NEXT:    subs.w r8, r5, r3
; CHECK-NEXT:    vmov r7, r3, d11
; CHECK-NEXT:    sbc.w r4, r6, r4
; CHECK-NEXT:    asrs r5, r4, #31
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    bfi r4, r5, #0, #4
; CHECK-NEXT:    vmov r5, r6, d9
; CHECK-NEXT:    subs.w r9, r7, r5
; CHECK-NEXT:    mov.w r7, #1
; CHECK-NEXT:    sbcs r3, r6
; CHECK-NEXT:    and.w r3, r7, r3, asr #31
; CHECK-NEXT:    vmov r7, r5, d7
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r4, r3, #4, #4
; CHECK-NEXT:    vmov r3, r6, d5
; CHECK-NEXT:    subs.w r10, r7, r3
; CHECK-NEXT:    vmov r7, r3, d4
; CHECK-NEXT:    sbcs r5, r6
; CHECK-NEXT:    vmov r6, r12, d6
; CHECK-NEXT:    asr.w r11, r5, #31
; CHECK-NEXT:    subs r6, r6, r7
; CHECK-NEXT:    sbc.w r3, r12, r3
; CHECK-NEXT:    asrs r3, r3, #31
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r11
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmov q2[2], q2[0], r8, r6
; CHECK-NEXT:    vmov q2[3], q2[1], r9, r10
; CHECK-NEXT:    and r3, r3, #1
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r4, r3, #8, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    and.w r3, r3, r5, asr #31
; CHECK-NEXT:    rsbs r3, r3, #0
; CHECK-NEXT:    bfi r4, r3, #12, #4
; CHECK-NEXT:    vmsr p0, r4
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vsubt.i32 q2, q1, q2
; CHECK-NEXT:    vstrb.8 q2, [r2], #16
; CHECK-NEXT:    le lr, .LBB11_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = zext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load23 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = zext <4 x i32> %wide.load23 to <4 x i64>
  %6 = sub nsw <4 x i64> %2, %5
  %7 = icmp slt <4 x i64> %6, zeroinitializer
  %8 = trunc <4 x i64> %6 to <4 x i32>
  %9 = sub <4 x i32> zeroinitializer, %8
  %10 = select <4 x i1> %7, <4 x i32> %9, <4 x i32> %8
  %11 = getelementptr inbounds i32, i32* %z, i32 %index
  %12 = bitcast i32* %11 to <4 x i32>*
  store <4 x i32> %10, <4 x i32>* %12, align 4
  %index.next = add i32 %index, 4
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}