; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=enabled -verify-machineinstrs %s -o - | FileCheck %s

; Codegen test for reduction loops on Thumb v8.1-M with MVE (see the llc
; invocation above). Every function here has the same shape: a guard on %n,
; a vector loop (vector.body) that consumes four elements per iteration, a
; middle block, and a scalar loop (for.body) for the elements the vector loop
; did not cover. The assertion comments inside each function pin the assembly
; that llc is expected to emit for it.

; Integer add reduction over the i32 elements of %x. The reduction is done
; inside the vector loop: each loaded <4 x i32> is reduced with
; llvm.vector.reduce.add and added to a scalar accumulator phi. The expected
; vector loop body is a vldrw.u32 followed by vaddva.u32 into r0.
define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB0_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB0_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB0_7
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB0_9
; CHECK-NEXT: .LBB0_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: .LBB0_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: le lr, .LBB0_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB0_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r2, r12, r3, lsl #2
; CHECK-NEXT: .LBB0_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: le lr, .LBB0_8
; CHECK-NEXT: .LBB0_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
  %3 = add i32 %2, %vec.phi
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %3, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = add nsw i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %3, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; Integer multiply reduction. The vector loop keeps a <4 x i32> accumulator
; that starts at all-ones (value 1 per lane) and is multiplied by each loaded
; vector; llvm.vector.reduce.mul runs once in the middle block. The expected
; assembly moves the four lanes to core registers with vmov and combines them
; with three scalar mul instructions.
define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: mul_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB1_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB1_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_3: @ %vector.ph
; CHECK-NEXT: bic r12, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x1
; CHECK-NEXT: sub.w r3, r12, #4
; CHECK-NEXT: add.w lr, r2, r3, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB1_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB1_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: vmov lr, r3, d1
; CHECK-NEXT: cmp r12, r1
; CHECK-NEXT: vmov r2, r4, d0
; CHECK-NEXT: mul r3, lr, r3
; CHECK-NEXT: mul r2, r4, r2
; CHECK-NEXT: mul r2, r3, r2
; CHECK-NEXT: beq .LBB1_8
; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r12
; CHECK-NEXT: add.w r0, r0, r12, lsl #2
; CHECK-NEXT: .LBB1_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: muls r2, r1, r2
; CHECK-NEXT: le lr, .LBB1_7
; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = mul <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 1, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = mul nsw i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; Bitwise-and reduction. The <4 x i32> accumulator starts at -1 in every lane
; and llvm.vector.reduce.and runs in the middle block. The expected assembly
; uses vand in the vector loop, then moves the lanes out with vmov and
; combines them with and.w.
define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: and_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB2_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB2_7
; CHECK-NEXT: .LBB2_3:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: b .LBB2_9
; CHECK-NEXT: .LBB2_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB2_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vand q0, q1, q0
; CHECK-NEXT: le lr, .LBB2_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, r4, d0
; CHECK-NEXT: and.w r12, r12, lr
; CHECK-NEXT: and.w r2, r2, r4
; CHECK-NEXT: and.w r2, r2, r12
; CHECK-NEXT: beq .LBB2_9
; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB2_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: ands r2, r1
; CHECK-NEXT: le lr, .LBB2_8
; CHECK-NEXT: .LBB2_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = and <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = and i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; Bitwise-or reduction. Same structure as the and case, with a zero-initialised
; <4 x i32> accumulator and llvm.vector.reduce.or in the middle block. The
; expected assembly uses vorr in the vector loop and orr.w to combine lanes.
define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: or_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB3_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB3_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB3_7
; CHECK-NEXT: .LBB3_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB3_9
; CHECK-NEXT: .LBB3_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB3_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vorr q0, q1, q0
; CHECK-NEXT: le lr, .LBB3_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, r4, d0
; CHECK-NEXT: orr.w r12, r12, lr
; CHECK-NEXT: orr.w r2, r2, r4
; CHECK-NEXT: orr.w r2, r2, r12
; CHECK-NEXT: beq .LBB3_9
; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB3_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: orrs r2, r1
; CHECK-NEXT: le lr, .LBB3_8
; CHECK-NEXT: .LBB3_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = or <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = or i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; Bitwise-xor reduction. Zero-initialised <4 x i32> accumulator and
; llvm.vector.reduce.xor in the middle block. The expected assembly uses veor
; in the vector loop and eor.w to combine lanes.
define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: xor_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB4_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB4_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB4_7
; CHECK-NEXT: .LBB4_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB4_9
; CHECK-NEXT: .LBB4_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB4_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: veor q0, q1, q0
; CHECK-NEXT: le lr, .LBB4_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, r4, d0
; CHECK-NEXT: eor.w r12, r12, lr
; CHECK-NEXT: eor.w r2, r2, r4
; CHECK-NEXT: eor.w r2, r2, r12
; CHECK-NEXT: beq .LBB4_9
; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB4_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: eors r2, r1
; CHECK-NEXT: le lr, .LBB4_8
; CHECK-NEXT: .LBB4_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = xor <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %5 = load i32, i32* %arrayidx, align 4
  %add = xor i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; Floating-point add reduction using fast-math flags. The <4 x float>
; accumulator starts at zero and llvm.vector.reduce.fadd (start value 0.0) runs
; in the middle block. The expected assembly uses a q-register vadd.f32 in the
; vector loop and three scalar vadd.f32 instructions to combine the lanes; the
; 0.0 start value for the non-vector paths is loaded from a constant pool
; entry (.LCPI5_0).
define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fadd_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB5_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB5_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI5_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB5_7
; CHECK-NEXT: .LBB5_3:
; CHECK-NEXT: vldr s0, .LCPI5_0
; CHECK-NEXT: b .LBB5_9
; CHECK-NEXT: .LBB5_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: .LBB5_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB5_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vadd.f32 s4, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: beq .LBB5_9
; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB5_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r0!, {s2}
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB5_8
; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI5_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  %2 = fadd fast <4 x float> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 0.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
  %5 = load float, float* %arrayidx, align 4
  %add = fadd fast float %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}

; Floating-point multiply reduction using fast-math flags. The <4 x float>
; accumulator starts at 1.0 in every lane and llvm.vector.reduce.fmul (start
; value 1.0) runs in the middle block. The expected assembly uses a q-register
; vmul.f32 in the vector loop and three scalar vmul.f32 instructions to
; combine the lanes.
define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmul_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB6_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB6_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB6_7
; CHECK-NEXT: .LBB6_3:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
; CHECK-NEXT: b .LBB6_9
; CHECK-NEXT: .LBB6_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.f32 q0, #1.000000e+00
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: .LBB6_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB6_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmul.f32 s4, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vmul.f32 s0, s0, s1
; CHECK-NEXT: vmul.f32 s0, s0, s4
; CHECK-NEXT: beq .LBB6_9
; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB6_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r0!, {s2}
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB6_8
; CHECK-NEXT: .LBB6_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  %2 = fmul fast <4 x float> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 1.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
  %5 = load float, float* %arrayidx, align 4
  %add = fmul fast float %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}

; Signed-minimum reduction. The vector loop keeps a <4 x i32> accumulator that
; starts at 2147483647 per lane and is updated with icmp slt + select;
; llvm.vector.reduce.smin runs in the middle block. The expected assembly uses
; vmin.s32 in the vector loop, vminv.s32 after it, and cmp + csel (lt) in the
; scalar loop.
define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB7_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB7_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB7_7
; CHECK-NEXT: .LBB7_3:
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: b .LBB7_9
; CHECK-NEXT: .LBB7_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmvn.i32 q0, #0x80000000
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB7_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmin.s32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB7_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vminv.s32 r2, q0
; CHECK-NEXT: beq .LBB7_9
; CHECK-NEXT: .LBB7_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB7_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, lt
; CHECK-NEXT: le lr, .LBB7_8
; CHECK-NEXT: .LBB7_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = icmp slt <4 x i32> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  %c = icmp slt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; Signed-minimum reduction with the reduction performed inside the vector
; loop: each loaded <4 x i32> is reduced with llvm.vector.reduce.smin and the
; result is merged into a scalar accumulator phi with icmp slt + select. The
; expected vector loop body is a vldrw.u32 followed by vminv.s32 into r0.
define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB8_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB8_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB8_7
; CHECK-NEXT: .LBB8_3:
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: b .LBB8_9
; CHECK-NEXT: .LBB8_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: .LBB8_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vminv.s32 r0, q0
; CHECK-NEXT: le lr, .LBB8_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB8_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r2, r12, r3, lsl #2
; CHECK-NEXT: .LBB8_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, lt
; CHECK-NEXT: le lr, .LBB8_8
; CHECK-NEXT: .LBB8_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 2147483647, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
  %2 = icmp slt i32 %vec.phi, %l5
  %3 = select i1 %2, i32 %vec.phi, i32 %l5
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = phi i32 [ %3, %vector.body ]
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  %c = icmp slt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smax_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB9_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB9_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB9_7
; CHECK-NEXT: .LBB9_3:
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: b .LBB9_9
; CHECK-NEXT: .LBB9_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x80000000
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB9_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmax.s32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB9_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mov.w r2, #-2147483648
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmaxv.s32 r2, q0
; CHECK-NEXT: beq .LBB9_9
; CHECK-NEXT: .LBB9_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB9_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, gt
; CHECK-NEXT: le lr, .LBB9_8
; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %vector.ph ], [ %3, %vector.body ]
921 %0 = getelementptr inbounds i32, i32* %x, i32 %index 922 %1 = bitcast i32* %0 to <4 x i32>* 923 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 924 %2 = icmp sgt <4 x i32> %vec.phi, %wide.load 925 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load 926 %index.next = add i32 %index, 4 927 %4 = icmp eq i32 %index.next, %n.vec 928 br i1 %4, label %middle.block, label %vector.body 929 930middle.block: ; preds = %vector.body 931 %5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3) 932 %cmp.n = icmp eq i32 %n.vec, %n 933 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 934 935for.body.preheader1: ; preds = %middle.block, %for.body.preheader 936 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 937 %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ] 938 br label %for.body 939 940for.body: ; preds = %for.body.preheader1, %for.body 941 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ] 942 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ] 943 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 944 %6 = load i32, i32* %arrayidx, align 4 945 %c = icmp sgt i32 %r.07, %6 946 %add = select i1 %c, i32 %r.07, i32 %6 947 %inc = add nuw nsw i32 %i.08, 1 948 %exitcond = icmp eq i32 %inc, %n 949 br i1 %exitcond, label %for.cond.cleanup, label %for.body 950 951for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 952 %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ] 953 ret i32 %r.0.lcssa 954} 955 956define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) { 957; CHECK-LABEL: smax_i32_inloop: 958; CHECK: @ %bb.0: @ %entry 959; CHECK-NEXT: .save {r7, lr} 960; CHECK-NEXT: push {r7, lr} 961; CHECK-NEXT: cmp r1, #1 962; CHECK-NEXT: blt .LBB10_3 963; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 964; CHECK-NEXT: mov r12, r0 965; CHECK-NEXT: cmp r1, #4 966; CHECK-NEXT: bhs .LBB10_4 967; CHECK-NEXT: @ 
%bb.2: 968; CHECK-NEXT: mov.w r0, #-2147483648 969; CHECK-NEXT: movs r3, #0 970; CHECK-NEXT: b .LBB10_7 971; CHECK-NEXT: .LBB10_3: 972; CHECK-NEXT: mov.w r0, #-2147483648 973; CHECK-NEXT: b .LBB10_9 974; CHECK-NEXT: .LBB10_4: @ %vector.ph 975; CHECK-NEXT: bic r3, r1, #3 976; CHECK-NEXT: movs r2, #1 977; CHECK-NEXT: subs r0, r3, #4 978; CHECK-NEXT: add.w lr, r2, r0, lsr #2 979; CHECK-NEXT: mov.w r0, #-2147483648 980; CHECK-NEXT: mov r2, r12 981; CHECK-NEXT: .LBB10_5: @ %vector.body 982; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 983; CHECK-NEXT: vldrw.u32 q0, [r2], #16 984; CHECK-NEXT: vmaxv.s32 r0, q0 985; CHECK-NEXT: le lr, .LBB10_5 986; CHECK-NEXT: @ %bb.6: @ %middle.block 987; CHECK-NEXT: cmp r3, r1 988; CHECK-NEXT: it eq 989; CHECK-NEXT: popeq {r7, pc} 990; CHECK-NEXT: .LBB10_7: @ %for.body.preheader1 991; CHECK-NEXT: sub.w lr, r1, r3 992; CHECK-NEXT: add.w r2, r12, r3, lsl #2 993; CHECK-NEXT: .LBB10_8: @ %for.body 994; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 995; CHECK-NEXT: ldr r1, [r2], #4 996; CHECK-NEXT: cmp r0, r1 997; CHECK-NEXT: csel r0, r0, r1, gt 998; CHECK-NEXT: le lr, .LBB10_8 999; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup 1000; CHECK-NEXT: pop {r7, pc} 1001entry: 1002 %cmp6 = icmp sgt i32 %n, 0 1003 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup 1004 1005for.body.preheader: ; preds = %entry 1006 %min.iters.check = icmp ult i32 %n, 4 1007 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph 1008 1009vector.ph: ; preds = %for.body.preheader 1010 %n.vec = and i32 %n, -4 1011 br label %vector.body 1012 1013vector.body: ; preds = %vector.body, %vector.ph 1014 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1015 %vec.phi = phi i32 [ -2147483648, %vector.ph ], [ %3, %vector.body ] 1016 %0 = getelementptr inbounds i32, i32* %x, i32 %index 1017 %1 = bitcast i32* %0 to <4 x i32>* 1018 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 1019 %l5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> 
%wide.load) 1020 %2 = icmp sgt i32 %vec.phi, %l5 1021 %3 = select i1 %2, i32 %vec.phi, i32 %l5 1022 %index.next = add i32 %index, 4 1023 %4 = icmp eq i32 %index.next, %n.vec 1024 br i1 %4, label %middle.block, label %vector.body 1025 1026middle.block: ; preds = %vector.body 1027 %5 = phi i32 [ %3, %vector.body ] 1028 %cmp.n = icmp eq i32 %n.vec, %n 1029 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 1030 1031for.body.preheader1: ; preds = %middle.block, %for.body.preheader 1032 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1033 %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ] 1034 br label %for.body 1035 1036for.body: ; preds = %for.body.preheader1, %for.body 1037 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ] 1038 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ] 1039 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 1040 %6 = load i32, i32* %arrayidx, align 4 1041 %c = icmp sgt i32 %r.07, %6 1042 %add = select i1 %c, i32 %r.07, i32 %6 1043 %inc = add nuw nsw i32 %i.08, 1 1044 %exitcond = icmp eq i32 %inc, %n 1045 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1046 1047for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1048 %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ] 1049 ret i32 %r.0.lcssa 1050} 1051 1052define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) { 1053; CHECK-LABEL: umin_i32: 1054; CHECK: @ %bb.0: @ %entry 1055; CHECK-NEXT: .save {r7, lr} 1056; CHECK-NEXT: push {r7, lr} 1057; CHECK-NEXT: cmp r1, #1 1058; CHECK-NEXT: blt .LBB11_3 1059; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1060; CHECK-NEXT: cmp r1, #4 1061; CHECK-NEXT: bhs .LBB11_4 1062; CHECK-NEXT: @ %bb.2: 1063; CHECK-NEXT: mov.w r2, #-1 1064; CHECK-NEXT: movs r3, #0 1065; CHECK-NEXT: b .LBB11_7 1066; CHECK-NEXT: .LBB11_3: 1067; CHECK-NEXT: mov.w r2, #-1 1068; CHECK-NEXT: b .LBB11_9 1069; 
CHECK-NEXT: .LBB11_4: @ %vector.ph 1070; CHECK-NEXT: bic r3, r1, #3 1071; CHECK-NEXT: movs r2, #1 1072; CHECK-NEXT: sub.w r12, r3, #4 1073; CHECK-NEXT: vmov.i8 q0, #0xff 1074; CHECK-NEXT: add.w lr, r2, r12, lsr #2 1075; CHECK-NEXT: mov r2, r0 1076; CHECK-NEXT: .LBB11_5: @ %vector.body 1077; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1078; CHECK-NEXT: vldrw.u32 q1, [r2], #16 1079; CHECK-NEXT: vmin.u32 q0, q0, q1 1080; CHECK-NEXT: le lr, .LBB11_5 1081; CHECK-NEXT: @ %bb.6: @ %middle.block 1082; CHECK-NEXT: mov.w r2, #-1 1083; CHECK-NEXT: cmp r3, r1 1084; CHECK-NEXT: vminv.u32 r2, q0 1085; CHECK-NEXT: beq .LBB11_9 1086; CHECK-NEXT: .LBB11_7: @ %for.body.preheader1 1087; CHECK-NEXT: sub.w lr, r1, r3 1088; CHECK-NEXT: add.w r0, r0, r3, lsl #2 1089; CHECK-NEXT: .LBB11_8: @ %for.body 1090; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1091; CHECK-NEXT: ldr r1, [r0], #4 1092; CHECK-NEXT: cmp r2, r1 1093; CHECK-NEXT: csel r2, r2, r1, lo 1094; CHECK-NEXT: le lr, .LBB11_8 1095; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup 1096; CHECK-NEXT: mov r0, r2 1097; CHECK-NEXT: pop {r7, pc} 1098entry: 1099 %cmp6 = icmp sgt i32 %n, 0 1100 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup 1101 1102for.body.preheader: ; preds = %entry 1103 %min.iters.check = icmp ult i32 %n, 4 1104 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph 1105 1106vector.ph: ; preds = %for.body.preheader 1107 %n.vec = and i32 %n, -4 1108 br label %vector.body 1109 1110vector.body: ; preds = %vector.body, %vector.ph 1111 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1112 %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %3, %vector.body ] 1113 %0 = getelementptr inbounds i32, i32* %x, i32 %index 1114 %1 = bitcast i32* %0 to <4 x i32>* 1115 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 1116 %2 = icmp ult <4 x i32> %vec.phi, %wide.load 1117 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load 1118 %index.next = add 
i32 %index, 4 1119 %4 = icmp eq i32 %index.next, %n.vec 1120 br i1 %4, label %middle.block, label %vector.body 1121 1122middle.block: ; preds = %vector.body 1123 %5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3) 1124 %cmp.n = icmp eq i32 %n.vec, %n 1125 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 1126 1127for.body.preheader1: ; preds = %middle.block, %for.body.preheader 1128 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1129 %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ] 1130 br label %for.body 1131 1132for.body: ; preds = %for.body.preheader1, %for.body 1133 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ] 1134 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ] 1135 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 1136 %6 = load i32, i32* %arrayidx, align 4 1137 %c = icmp ult i32 %r.07, %6 1138 %add = select i1 %c, i32 %r.07, i32 %6 1139 %inc = add nuw nsw i32 %i.08, 1 1140 %exitcond = icmp eq i32 %inc, %n 1141 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1142 1143for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1144 %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ] 1145 ret i32 %r.0.lcssa 1146} 1147 

; umin_i32_inloop: unsigned-minimum reduction over the %n i32 elements at %x,
; with the vector reduction performed inside the loop body.
;   * %n <= 0 returns -1 (all ones, the identity for unsigned min).
;   * vector.body consumes 4 elements per iteration for the first (%n & -4)
;     elements, folding llvm.vector.reduce.umin.v4i32 into a scalar accumulator.
;   * for.body is the scalar remainder loop; it must apply the same unsigned
;     "less than" selection as the vector loop so both halves compute a minimum.
; The CHECK lines expect VMINV.U32 accumulating into r0 inside an LE
; low-overhead loop, and CMP + CSEL ..., lo for the scalar remainder.
define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umin_i32_inloop:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r1, #1
; CHECK-NEXT:    blt .LBB12_3
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    cmp r1, #4
; CHECK-NEXT:    bhs .LBB12_4
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    mov.w r0, #-1
; CHECK-NEXT:    movs r3, #0
; CHECK-NEXT:    b .LBB12_7
; CHECK-NEXT:  .LBB12_3:
; CHECK-NEXT:    mov.w r0, #-1
; CHECK-NEXT:    b .LBB12_9
; CHECK-NEXT:  .LBB12_4: @ %vector.ph
; CHECK-NEXT:    bic r3, r1, #3
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    subs r0, r3, #4
; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
; CHECK-NEXT:    mov.w r0, #-1
; CHECK-NEXT:    mov r2, r12
; CHECK-NEXT:  .LBB12_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
; CHECK-NEXT:    vminv.u32 r0, q0
; CHECK-NEXT:    le lr, .LBB12_5
; CHECK-NEXT:  @ %bb.6: @ %middle.block
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB12_7: @ %for.body.preheader1
; CHECK-NEXT:    sub.w lr, r1, r3
; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
; CHECK-NEXT:  .LBB12_8: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r2], #4
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    csel r0, r0, r1, lo
; CHECK-NEXT:    le lr, .LBB12_8
; CHECK-NEXT:  .LBB12_9: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Nothing to reduce when %n <= 0; the result is then the identity value -1.
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: skip the vector loop and go straight to the scalar loop.
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = %n rounded down to a multiple of 4 (vector trip count in elements).
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ -1, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  ; In-loop reduction: reduce the 4 lanes, then fold into the scalar accumulator.
  %l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
  %2 = icmp ult i32 %vec.phi, %l5
  %3 = select i1 %2, i32 %vec.phi, i32 %l5
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %5 = phi i32 [ %3, %vector.body ]
  ; Done if %n was already a multiple of 4; otherwise finish in the scalar loop.
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
  %6 = load i32, i32* %arrayidx, align 4
  ; Keep the smaller value (unsigned), matching the ult selection in vector.body.
  %c = icmp ult i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}
1243 1244define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) { 1245; CHECK-LABEL: umax_i32: 1246; CHECK: @ %bb.0: @ %entry 1247; CHECK-NEXT: .save {r7, lr} 1248; CHECK-NEXT: push {r7, lr} 1249; CHECK-NEXT: cmp r1, #1 1250; CHECK-NEXT: blt .LBB13_3 1251; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1252; CHECK-NEXT: cmp r1, #4 1253; CHECK-NEXT: bhs .LBB13_4 1254; CHECK-NEXT: @ %bb.2: 1255; CHECK-NEXT: movs r3, #0 1256; CHECK-NEXT: movs r2, #0 1257; CHECK-NEXT: b .LBB13_7 1258; CHECK-NEXT: .LBB13_3: 1259; CHECK-NEXT: movs r2, #0 1260; CHECK-NEXT: b .LBB13_9 1261; CHECK-NEXT: .LBB13_4: @ %vector.ph 1262; CHECK-NEXT: bic r3, r1, #3 1263; CHECK-NEXT: movs r2, #1 1264; CHECK-NEXT: sub.w r12, r3, #4 1265; CHECK-NEXT: vmov.i32 q0, #0x0 1266; CHECK-NEXT: add.w lr, r2, r12, lsr #2 1267; CHECK-NEXT: mov r2, r0 1268; CHECK-NEXT: .LBB13_5: @ %vector.body 1269; 
CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1270; CHECK-NEXT: vldrw.u32 q1, [r2], #16 1271; CHECK-NEXT: vmax.u32 q0, q0, q1 1272; CHECK-NEXT: le lr, .LBB13_5 1273; CHECK-NEXT: @ %bb.6: @ %middle.block 1274; CHECK-NEXT: movs r2, #0 1275; CHECK-NEXT: cmp r3, r1 1276; CHECK-NEXT: vmaxv.u32 r2, q0 1277; CHECK-NEXT: beq .LBB13_9 1278; CHECK-NEXT: .LBB13_7: @ %for.body.preheader1 1279; CHECK-NEXT: sub.w lr, r1, r3 1280; CHECK-NEXT: add.w r0, r0, r3, lsl #2 1281; CHECK-NEXT: .LBB13_8: @ %for.body 1282; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1283; CHECK-NEXT: ldr r1, [r0], #4 1284; CHECK-NEXT: cmp r2, r1 1285; CHECK-NEXT: csel r2, r2, r1, hi 1286; CHECK-NEXT: le lr, .LBB13_8 1287; CHECK-NEXT: .LBB13_9: @ %for.cond.cleanup 1288; CHECK-NEXT: mov r0, r2 1289; CHECK-NEXT: pop {r7, pc} 1290entry: 1291 %cmp6 = icmp sgt i32 %n, 0 1292 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup 1293 1294for.body.preheader: ; preds = %entry 1295 %min.iters.check = icmp ult i32 %n, 4 1296 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph 1297 1298vector.ph: ; preds = %for.body.preheader 1299 %n.vec = and i32 %n, -4 1300 br label %vector.body 1301 1302vector.body: ; preds = %vector.body, %vector.ph 1303 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1304 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] 1305 %0 = getelementptr inbounds i32, i32* %x, i32 %index 1306 %1 = bitcast i32* %0 to <4 x i32>* 1307 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 1308 %2 = icmp ugt <4 x i32> %vec.phi, %wide.load 1309 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load 1310 %index.next = add i32 %index, 4 1311 %4 = icmp eq i32 %index.next, %n.vec 1312 br i1 %4, label %middle.block, label %vector.body 1313 1314middle.block: ; preds = %vector.body 1315 %5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3) 1316 %cmp.n = icmp eq i32 %n.vec, %n 1317 br i1 %cmp.n, label %for.cond.cleanup, label 
%for.body.preheader1 1318 1319for.body.preheader1: ; preds = %middle.block, %for.body.preheader 1320 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1321 %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ] 1322 br label %for.body 1323 1324for.body: ; preds = %for.body.preheader1, %for.body 1325 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ] 1326 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ] 1327 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 1328 %6 = load i32, i32* %arrayidx, align 4 1329 %c = icmp ugt i32 %r.07, %6 1330 %add = select i1 %c, i32 %r.07, i32 %6 1331 %inc = add nuw nsw i32 %i.08, 1 1332 %exitcond = icmp eq i32 %inc, %n 1333 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1334 1335for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1336 %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ] 1337 ret i32 %r.0.lcssa 1338} 1339 1340define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) { 1341; CHECK-LABEL: umax_i32_inloop: 1342; CHECK: @ %bb.0: @ %entry 1343; CHECK-NEXT: .save {r7, lr} 1344; CHECK-NEXT: push {r7, lr} 1345; CHECK-NEXT: cmp r1, #1 1346; CHECK-NEXT: blt .LBB14_3 1347; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1348; CHECK-NEXT: mov r12, r0 1349; CHECK-NEXT: cmp r1, #4 1350; CHECK-NEXT: bhs .LBB14_4 1351; CHECK-NEXT: @ %bb.2: 1352; CHECK-NEXT: movs r3, #0 1353; CHECK-NEXT: movs r0, #0 1354; CHECK-NEXT: b .LBB14_7 1355; CHECK-NEXT: .LBB14_3: 1356; CHECK-NEXT: movs r0, #0 1357; CHECK-NEXT: b .LBB14_9 1358; CHECK-NEXT: .LBB14_4: @ %vector.ph 1359; CHECK-NEXT: bic r3, r1, #3 1360; CHECK-NEXT: movs r2, #1 1361; CHECK-NEXT: subs r0, r3, #4 1362; CHECK-NEXT: add.w lr, r2, r0, lsr #2 1363; CHECK-NEXT: movs r0, #0 1364; CHECK-NEXT: mov r2, r12 1365; CHECK-NEXT: .LBB14_5: @ %vector.body 1366; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1367; CHECK-NEXT: vldrw.u32 q0, [r2], #16 1368; 
CHECK-NEXT: vmaxv.u32 r0, q0 1369; CHECK-NEXT: le lr, .LBB14_5 1370; CHECK-NEXT: @ %bb.6: @ %middle.block 1371; CHECK-NEXT: cmp r3, r1 1372; CHECK-NEXT: it eq 1373; CHECK-NEXT: popeq {r7, pc} 1374; CHECK-NEXT: .LBB14_7: @ %for.body.preheader1 1375; CHECK-NEXT: sub.w lr, r1, r3 1376; CHECK-NEXT: add.w r2, r12, r3, lsl #2 1377; CHECK-NEXT: .LBB14_8: @ %for.body 1378; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1379; CHECK-NEXT: ldr r1, [r2], #4 1380; CHECK-NEXT: cmp r0, r1 1381; CHECK-NEXT: csel r0, r0, r1, hi 1382; CHECK-NEXT: le lr, .LBB14_8 1383; CHECK-NEXT: .LBB14_9: @ %for.cond.cleanup 1384; CHECK-NEXT: pop {r7, pc} 1385entry: 1386 %cmp6 = icmp sgt i32 %n, 0 1387 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup 1388 1389for.body.preheader: ; preds = %entry 1390 %min.iters.check = icmp ult i32 %n, 4 1391 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph 1392 1393vector.ph: ; preds = %for.body.preheader 1394 %n.vec = and i32 %n, -4 1395 br label %vector.body 1396 1397vector.body: ; preds = %vector.body, %vector.ph 1398 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1399 %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ] 1400 %0 = getelementptr inbounds i32, i32* %x, i32 %index 1401 %1 = bitcast i32* %0 to <4 x i32>* 1402 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 1403 %l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load) 1404 %2 = icmp ugt i32 %vec.phi, %l5 1405 %3 = select i1 %2, i32 %vec.phi, i32 %l5 1406 %index.next = add i32 %index, 4 1407 %4 = icmp eq i32 %index.next, %n.vec 1408 br i1 %4, label %middle.block, label %vector.body 1409 1410middle.block: ; preds = %vector.body 1411 %5 = phi i32 [ %3, %vector.body ] 1412 %cmp.n = icmp eq i32 %n.vec, %n 1413 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 1414 1415for.body.preheader1: ; preds = %middle.block, %for.body.preheader 1416 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1417 
%r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ] 1418 br label %for.body 1419 1420for.body: ; preds = %for.body.preheader1, %for.body 1421 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ] 1422 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ] 1423 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08 1424 %6 = load i32, i32* %arrayidx, align 4 1425 %c = icmp ugt i32 %r.07, %6 1426 %add = select i1 %c, i32 %r.07, i32 %6 1427 %inc = add nuw nsw i32 %i.08, 1 1428 %exitcond = icmp eq i32 %inc, %n 1429 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1430 1431for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1432 %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ] 1433 ret i32 %r.0.lcssa 1434} 1435 1436define float @fmin_f32(float* nocapture readonly %x, i32 %n) { 1437; CHECK-LABEL: fmin_f32: 1438; CHECK: @ %bb.0: @ %entry 1439; CHECK-NEXT: .save {r7, lr} 1440; CHECK-NEXT: push {r7, lr} 1441; CHECK-NEXT: cmp r1, #1 1442; CHECK-NEXT: blt .LBB15_3 1443; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1444; CHECK-NEXT: cmp r1, #4 1445; CHECK-NEXT: bhs .LBB15_4 1446; CHECK-NEXT: @ %bb.2: 1447; CHECK-NEXT: vldr s0, .LCPI15_0 1448; CHECK-NEXT: movs r2, #0 1449; CHECK-NEXT: b .LBB15_7 1450; CHECK-NEXT: .LBB15_3: 1451; CHECK-NEXT: vldr s0, .LCPI15_0 1452; CHECK-NEXT: b .LBB15_9 1453; CHECK-NEXT: .LBB15_4: @ %vector.ph 1454; CHECK-NEXT: bic r2, r1, #3 1455; CHECK-NEXT: movs r3, #1 1456; CHECK-NEXT: sub.w r12, r2, #4 1457; CHECK-NEXT: vmov.i32 q0, #0x0 1458; CHECK-NEXT: add.w lr, r3, r12, lsr #2 1459; CHECK-NEXT: mov r3, r0 1460; CHECK-NEXT: .LBB15_5: @ %vector.body 1461; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1462; CHECK-NEXT: vldrw.u32 q1, [r3], #16 1463; CHECK-NEXT: vcmp.f32 lt, q0, q1 1464; CHECK-NEXT: vpsel q0, q0, q1 1465; CHECK-NEXT: le lr, .LBB15_5 1466; CHECK-NEXT: @ %bb.6: @ %middle.block 1467; CHECK-NEXT: vminnm.f32 s4, s2, s3 1468; CHECK-NEXT: 
vminnm.f32 s0, s0, s1 1469; CHECK-NEXT: vminnm.f32 s0, s0, s4 1470; CHECK-NEXT: cmp r2, r1 1471; CHECK-NEXT: beq .LBB15_9 1472; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1 1473; CHECK-NEXT: sub.w lr, r1, r2 1474; CHECK-NEXT: add.w r0, r0, r2, lsl #2 1475; CHECK-NEXT: .LBB15_8: @ %for.body 1476; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1477; CHECK-NEXT: vldmia r0!, {s2} 1478; CHECK-NEXT: vcmp.f32 s0, s2 1479; CHECK-NEXT: vmrs APSR_nzcv, fpscr 1480; CHECK-NEXT: vselge.f32 s0, s2, s0 1481; CHECK-NEXT: le lr, .LBB15_8 1482; CHECK-NEXT: .LBB15_9: @ %for.cond.cleanup 1483; CHECK-NEXT: vmov r0, s0 1484; CHECK-NEXT: pop {r7, pc} 1485; CHECK-NEXT: .p2align 2 1486; CHECK-NEXT: @ %bb.10: 1487; CHECK-NEXT: .LCPI15_0: 1488; CHECK-NEXT: .long 0x00000000 @ float 0 1489entry: 1490 %cmp6 = icmp sgt i32 %n, 0 1491 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup 1492 1493for.body.preheader: ; preds = %entry 1494 %min.iters.check = icmp ult i32 %n, 4 1495 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph 1496 1497vector.ph: ; preds = %for.body.preheader 1498 %n.vec = and i32 %n, -4 1499 br label %vector.body 1500 1501vector.body: ; preds = %vector.body, %vector.ph 1502 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1503 %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] 1504 %0 = getelementptr inbounds float, float* %x, i32 %index 1505 %1 = bitcast float* %0 to <4 x float>* 1506 %wide.load = load <4 x float>, <4 x float>* %1, align 4 1507 %2 = fcmp ult <4 x float> %vec.phi, %wide.load 1508 %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load 1509 %index.next = add i32 %index, 4 1510 %4 = icmp eq i32 %index.next, %n.vec 1511 br i1 %4, label %middle.block, label %vector.body 1512 1513middle.block: ; preds = %vector.body 1514 %5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3) 1515 %cmp.n = icmp eq i32 %n.vec, %n 1516 br i1 %cmp.n, label %for.cond.cleanup, label 
%for.body.preheader1 1517 1518for.body.preheader1: ; preds = %middle.block, %for.body.preheader 1519 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1520 %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ] 1521 br label %for.body 1522 1523for.body: ; preds = %for.body.preheader1, %for.body 1524 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ] 1525 %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ] 1526 %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08 1527 %6 = load float, float* %arrayidx, align 4 1528 %c = fcmp ult float %r.07, %6 1529 %add = select i1 %c, float %r.07, float %6 1530 %inc = add nuw nsw i32 %i.08, 1 1531 %exitcond = icmp eq i32 %inc, %n 1532 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1533 1534for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1535 %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ] 1536 ret float %r.0.lcssa 1537} 1538 1539define float @fmax_f32(float* nocapture readonly %x, i32 %n) { 1540; CHECK-LABEL: fmax_f32: 1541; CHECK: @ %bb.0: @ %entry 1542; CHECK-NEXT: .save {r7, lr} 1543; CHECK-NEXT: push {r7, lr} 1544; CHECK-NEXT: cmp r1, #1 1545; CHECK-NEXT: blt .LBB16_3 1546; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1547; CHECK-NEXT: cmp r1, #4 1548; CHECK-NEXT: bhs .LBB16_4 1549; CHECK-NEXT: @ %bb.2: 1550; CHECK-NEXT: vldr s0, .LCPI16_0 1551; CHECK-NEXT: movs r2, #0 1552; CHECK-NEXT: b .LBB16_7 1553; CHECK-NEXT: .LBB16_3: 1554; CHECK-NEXT: vldr s0, .LCPI16_0 1555; CHECK-NEXT: b .LBB16_9 1556; CHECK-NEXT: .LBB16_4: @ %vector.ph 1557; CHECK-NEXT: bic r2, r1, #3 1558; CHECK-NEXT: movs r3, #1 1559; CHECK-NEXT: sub.w r12, r2, #4 1560; CHECK-NEXT: vmov.i32 q0, #0x0 1561; CHECK-NEXT: add.w lr, r3, r12, lsr #2 1562; CHECK-NEXT: mov r3, r0 1563; CHECK-NEXT: .LBB16_5: @ %vector.body 1564; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1565; CHECK-NEXT: vldrw.u32 q1, [r3], #16 1566; 
CHECK-NEXT: vcmp.f32 lt, q1, q0 1567; CHECK-NEXT: vpsel q0, q0, q1 1568; CHECK-NEXT: le lr, .LBB16_5 1569; CHECK-NEXT: @ %bb.6: @ %middle.block 1570; CHECK-NEXT: vmaxnm.f32 s4, s2, s3 1571; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 1572; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 1573; CHECK-NEXT: cmp r2, r1 1574; CHECK-NEXT: beq .LBB16_9 1575; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1 1576; CHECK-NEXT: sub.w lr, r1, r2 1577; CHECK-NEXT: add.w r0, r0, r2, lsl #2 1578; CHECK-NEXT: .LBB16_8: @ %for.body 1579; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1580; CHECK-NEXT: vldmia r0!, {s2} 1581; CHECK-NEXT: vcmp.f32 s2, s0 1582; CHECK-NEXT: vmrs APSR_nzcv, fpscr 1583; CHECK-NEXT: vselge.f32 s0, s2, s0 1584; CHECK-NEXT: le lr, .LBB16_8 1585; CHECK-NEXT: .LBB16_9: @ %for.cond.cleanup 1586; CHECK-NEXT: vmov r0, s0 1587; CHECK-NEXT: pop {r7, pc} 1588; CHECK-NEXT: .p2align 2 1589; CHECK-NEXT: @ %bb.10: 1590; CHECK-NEXT: .LCPI16_0: 1591; CHECK-NEXT: .long 0x00000000 @ float 0 1592entry: 1593 %cmp6 = icmp sgt i32 %n, 0 1594 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup 1595 1596for.body.preheader: ; preds = %entry 1597 %min.iters.check = icmp ult i32 %n, 4 1598 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph 1599 1600vector.ph: ; preds = %for.body.preheader 1601 %n.vec = and i32 %n, -4 1602 br label %vector.body 1603 1604vector.body: ; preds = %vector.body, %vector.ph 1605 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1606 %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ] 1607 %0 = getelementptr inbounds float, float* %x, i32 %index 1608 %1 = bitcast float* %0 to <4 x float>* 1609 %wide.load = load <4 x float>, <4 x float>* %1, align 4 1610 %2 = fcmp ugt <4 x float> %vec.phi, %wide.load 1611 %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load 1612 %index.next = add i32 %index, 4 1613 %4 = icmp eq i32 %index.next, %n.vec 1614 br i1 %4, label %middle.block, label %vector.body 1615 
middle.block:                                     ; preds = %vector.body
  %5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body:                                         ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
  %6 = load float, float* %arrayidx, align 4
  %c = fcmp ugt float %r.07, %6
  %add = select i1 %c, float %r.07, float %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}

; add4i32: i32 sum of %n i32 elements. The trip count is %n rounded up to a
; multiple of 4; each iteration does a masked <4 x i32> load under
; get.active.lane.mask, zeroes the inactive lanes with a select, and adds the
; vector.reduce.add result to the running total.
; Expected assembly: a dlstp.32 / letp loop accumulating with vaddva.u32.
define i32 @add4i32(i32* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r1, .LBB17_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB17_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    letp lr, .LBB17_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB17_4:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
  %4 = add i32 %3, %vec.phi
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %4, %vector.body ]
  ret i32 %s.0.lcssa
}

; mla4i32: i32 sum of element-wise products x[i] * y[i] over i32 inputs.
; Two masked <4 x i32> loads are multiplied (mul nsw), inactive lanes are
; zeroed by a select, and the vector.reduce.add result is accumulated.
; Expected assembly: a dlstp.32 / letp loop accumulating with vmlava.u32.
define i32 @mla4i32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r2, .LBB18_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB18_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vmlava.u32 r12, q1, q0
; CHECK-NEXT:    letp lr, .LBB18_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB18_4:
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp8.not = icmp eq i32 %n, 0
  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %2 = getelementptr inbounds i32, i32* %y, i32 %index
  %3 = bitcast i32* %2 to <4 x i32>*
  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
  %5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer
  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  %7 = add i32 %6, %vec.phi
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %7, %vector.body ]
  ret i32 %s.0.lcssa
}

; add8i32: i32 sum of i16 elements. Each masked <8 x i16> load is
; sign-extended to <8 x i32> before the masked vector.reduce.add; the trip
; count is %n rounded up to a multiple of 8.
; Expected assembly: a dlstp.16 / letp loop accumulating with vaddva.s16.
define i32 @add8i32(i16* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r1, .LBB19_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    dlstp.16 lr, r1
; CHECK-NEXT:  .LBB19_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vaddva.s16 r2, q0
; CHECK-NEXT:    letp lr, .LBB19_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB19_4:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
  %3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer
  %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
  %5 = add i32 %4, %vec.phi
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
  ret i32 %s.0.lcssa
}

; mla8i32: i32 sum of products of i16 elements. Both masked <8 x i16> loads
; are sign-extended to <8 x i32>, multiplied (mul nsw), masked with a select
; and reduced with vector.reduce.add.
; Expected assembly: a dlstp.16 / letp loop accumulating with vmlava.s16.
define i32 @mla8i32(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r2, .LBB20_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.16 lr, r2
; CHECK-NEXT:  .LBB20_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmlava.s16 r12, q1, q0
; CHECK-NEXT:    letp lr, .LBB20_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB20_4:
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp9.not = icmp eq i32 %n, 0
  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %5 = sext <8 x i16> %wide.masked.load14 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer
  %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
  %9 = add i32 %8, %vec.phi
  %index.next = add i32 %index, 8
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
  ret i32 %s.0.lcssa
}

; add16i32: i32 sum of i8 elements. Each masked <16 x i8> load is
; zero-extended to <16 x i32> before the masked vector.reduce.add; the trip
; count is %n rounded up to a multiple of 16.
; Expected assembly: a dlstp.8 / letp loop accumulating with vaddva.u8.
define i32 @add16i32(i8* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r1, .LBB21_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    dlstp.8 lr, r1
; CHECK-NEXT:  .LBB21_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    letp lr, .LBB21_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB21_4:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
  %3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer
  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
  %5 = add i32 %4, %vec.phi
  %index.next = add i32 %index, 16
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
  ret i32 %s.0.lcssa
}

; mla16i32: i32 sum of products of i8 elements. Both masked <16 x i8> loads
; are zero-extended to <16 x i32>, multiplied (mul nuw nsw), masked with a
; select and reduced with vector.reduce.add.
; Expected assembly: a dlstp.8 / letp loop accumulating with vmlava.u8.
define i32 @mla16i32(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r2, .LBB22_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.8 lr, r2
; CHECK-NEXT:  .LBB22_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmlava.u8 r12, q1, q0
; CHECK-NEXT:    letp lr, .LBB22_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB22_4:
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp9.not = icmp eq i32 %n, 0
  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %5 = zext <16 x i8> %wide.masked.load14 to <16 x i32>
  %6 = mul nuw nsw <16 x i32> %5, %2
  %7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer
  %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
  %9 = add i32 %8, %vec.phi
  %index.next = add i32 %index, 16
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
  ret i32 %s.0.lcssa
}

; add8i16: i16 sum of i16 elements, returned signext. The masked <8 x i16>
; load is reduced directly in i16 with vector.reduce.add.v8i16.
; Expected assembly: a dlstp.16 / letp loop accumulating with vaddva.u16,
; followed by sxth on the result.
define signext i16 @add8i16(i16* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r1, .LBB23_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    dlstp.16 lr, r1
; CHECK-NEXT:  .LBB23_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vaddva.u16 r2, q0
; CHECK-NEXT:    letp lr, .LBB23_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB23_4:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp8.not = icmp eq i32 %n, 0
  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i16 [ 0, %vector.ph ], [ %4, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer
  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
  %4 = add i16 %3, %vec.phi
  %index.next = add i32 %index, 8
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i16 [ 0, %entry ], [ %4, %vector.body ]
  ret i16 %s.0.lcssa
}

; mla8i16: i16 sum of products of i16 elements, returned signext. The two
; masked <8 x i16> loads are multiplied in i16 (no extension) and reduced
; with vector.reduce.add.v8i16.
; Expected assembly: a dlstp.16 / letp loop accumulating with vmlava.u16,
; followed by sxth.w on the result.
define signext i16 @mla8i16(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r2, .LBB24_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.16 lr, r2
; CHECK-NEXT:  .LBB24_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmlava.u16 r12, q1, q0
; CHECK-NEXT:    letp lr, .LBB24_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    sxth.w r0, r12
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB24_4:
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    sxth.w r0, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11.not = icmp eq i32 %n, 0
  br i1 %cmp11.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i16 [ 0, %vector.ph ], [ %7, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %2 = getelementptr inbounds i16, i16* %y, i32 %index
  %3 = bitcast i16* %2 to <8 x i16>*
  %wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load
  %5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer
  %6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5)
  %7 = add i16 %6, %vec.phi
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i16 [ 0, %entry ], [ %7, %vector.body ]
  ret i16 %s.0.lcssa
}

; add16i16: i16 sum of i8 elements, returned signext. Each masked <16 x i8>
; load is zero-extended to <16 x i16> and reduced with
; vector.reduce.add.v16i16.
; Expected assembly: a dlstp.8 / letp loop accumulating with vaddva.u8,
; followed by sxth on the result.
define signext i16 @add16i16(i8* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r1, .LBB25_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    dlstp.8 lr, r1
; CHECK-NEXT:  .LBB25_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    letp lr, .LBB25_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB25_4:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    sxth r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp8.not = icmp eq i32 %n, 0
  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i16 [ 0, %vector.ph ], [ %5, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
  %3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer
  %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3)
  %5 = add i16 %4, %vec.phi
  %index.next = add i32 %index, 16
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i16 [ 0, %entry ], [ %5, %vector.body ]
  ret i16 %s.0.lcssa
}

; mla16i16: i16 sum of products of i8 elements, returned signext. Both masked
; <16 x i8> loads are zero-extended to <16 x i16>, multiplied (mul nuw),
; masked with a select and reduced with vector.reduce.add.v16i16.
; Expected assembly: a dlstp.8 / letp loop accumulating with vmlava.u8,
; followed by sxth.w on the result.
define signext i16 @mla16i16(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla16i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r2, .LBB26_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.8 lr, r2
; CHECK-NEXT:  .LBB26_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmlava.u8 r12, q1, q0
; CHECK-NEXT:    letp lr, .LBB26_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    sxth.w r0, r12
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB26_4:
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    sxth.w r0, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp13.not = icmp eq i32 %n, 0
  br i1 %cmp13.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i16 [ 0, %vector.ph ], [ %9, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load18 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %5 = zext <16 x i8> %wide.masked.load18 to <16 x i16>
  %6 = mul nuw <16 x i16> %5, %2
  %7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer
  %8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7)
  %9 = add i16 %8, %vec.phi
  %index.next = add i32 %index, 16
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i16 [ 0, %entry ], [ %9, %vector.body ]
  ret i16 %s.0.lcssa
}

; add16i8: i8 sum of i8 elements, returned zeroext. The masked <16 x i8> load
; is reduced directly in i8 with vector.reduce.add.v16i8.
; Expected assembly: a dlstp.8 / letp loop accumulating with vaddva.u8,
; followed by uxtb on the result.
define zeroext i8 @add16i8(i8* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r1, .LBB27_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    dlstp.8 lr, r1
; CHECK-NEXT:  .LBB27_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vaddva.u8 r2, q0
; CHECK-NEXT:    letp lr, .LBB27_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB27_4:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    uxtb r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp7.not = icmp eq i32 %n, 0
  br i1 %cmp7.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i8 [ 0, %vector.ph ], [ %4, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer
  %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
  %4 = add i8 %3, %vec.phi
  %index.next = add i32 %index, 16
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i8 [ 0, %entry ], [ %4, %vector.body ]
  ret i8 %s.0.lcssa
}

; mla16i8: i8 sum of products of i8 elements, returned zeroext. The two
; masked <16 x i8> loads are multiplied in i8 (no extension) and reduced with
; vector.reduce.add.v16i8.
; Expected assembly: a dlstp.8 / letp loop accumulating with vmlava.u8,
; followed by uxtb.w on the result.
define zeroext i8 @mla16i8(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r2, .LBB28_4
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.8 lr, r2
; CHECK-NEXT:  .LBB28_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmlava.u8 r12, q1, q0
; CHECK-NEXT:    letp lr, .LBB28_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    uxtb.w r0, r12
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:  .LBB28_4:
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    uxtb.w r0, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10.not = icmp eq i32 %n, 0
  br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i8 [ 0, %vector.ph ], [ %7, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %2 = getelementptr inbounds i8, i8* %y, i32 %index
  %3 = bitcast i8* %2 to <16 x i8>*
  %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
  %5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer
  %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
  %7 = add i8 %6, %vec.phi
  %index.next = add i32 %index, 16
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i8 [ 0, %entry ], [ %7, %vector.body ]
  ret i8 %s.0.lcssa
}

; add4i64: i64 sum of i32 elements. Each masked <4 x i32> load is
; sign-extended to <4 x i64> and reduced with vector.reduce.add.v4i64.
; Expected assembly: a dlstp.32 / letp loop accumulating into the r2/r3 pair
; with vaddlva.s32, then moved to r0/r1 for the return.
define i64 @add4i64(i32* noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add4i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r1, .LBB29_3
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB29_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vaddlva.s32 r2, r3, q0
; CHECK-NEXT:    letp lr, .LBB29_2
; CHECK-NEXT:    b .LBB29_4
; CHECK-NEXT:  .LBB29_3:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:  .LBB29_4: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i64 [ 0, %vector.ph ], [ %5, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
  %3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer
  %4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
  %5 = add i64 %4, %vec.phi
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %vector.body ]
  ret i64 %s.0.lcssa
}

; mla4i64: i64 sum of products of i32 elements. Both masked <4 x i32> loads
; are sign-extended to <4 x i64>, multiplied (mul nsw), masked with a select
; and reduced with vector.reduce.add.v4i64.
; Expected assembly: a dlstp.32 / letp loop accumulating into the r12/r3 pair
; with vmlalva.s32, then moved to r0/r1 for the return.
define i64 @mla4i64(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla4i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r2, .LBB30_3
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    mov r3, r12
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB30_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vmlalva.s32 r12, r3, q1, q0
; CHECK-NEXT:    letp lr, .LBB30_2
; CHECK-NEXT:    b .LBB30_4
; CHECK-NEXT:  .LBB30_3:
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    mov r3, r12
; CHECK-NEXT:  .LBB30_4: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp9.not = icmp eq i32 %n, 0
  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.masked.load14 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %5 = sext <4 x i32> %wide.masked.load14 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer
  %8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7)
  %9 = add i64 %8, %vec.phi
  %index.next = add i32 %index, 4
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
  ret i64 %s.0.lcssa
}

; mla8i64: i64 sum of products of i16 elements. Both masked <8 x i16> loads
; are sign-extended to <8 x i64>, multiplied (mul nsw), masked with a select
; and reduced with vector.reduce.add.v8i64.
; Expected assembly: a dlstp.16 / letp loop accumulating into the r12/r3 pair
; with vmlalva.s16, then moved to r0/r1 for the return.
define i64 @mla8i64(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla8i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cbz r2, .LBB31_3
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    mov r3, r12
; CHECK-NEXT:    dlstp.16 lr, r2
; CHECK-NEXT:  .LBB31_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmlalva.s16 r12, r3, q1, q0
; CHECK-NEXT:    letp lr, .LBB31_2
; CHECK-NEXT:    b .LBB31_4
; CHECK-NEXT:  .LBB31_3:
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    mov r3, r12
; CHECK-NEXT:  .LBB31_4: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    mov r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp9.not = icmp eq i32 %n, 0
  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %2 = sext <8 x i16> %wide.masked.load to <8 x i64>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %5 = sext <8 x i16> %wide.masked.load14 to <8 x i64>
  %6 = mul nsw <8 x i64> %5, %2
  %7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer
  %8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7)
  %9 = add i64 %8, %vec.phi
  %index.next = add i32 %index, 8
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
  ret i64 %s.0.lcssa
}

; Intrinsic declarations used by the masked (get.active.lane.mask) reduction
; loops above: lane-mask generation, masked loads, and the widened or narrow
; integer add reductions.
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #2
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #2
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3

; Four-lane i32 and float reduction intrinsics; of these, only
; reduce.add.v4i32 and reduce.fmax.v4f32 are called in this part of the file.
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)