; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s

; Loop-vectorizer reduction tests. Every loop below is vectorized (or expected
; not to be) with a forced vector width of 4 and an interleave count of 1, as
; requested on the opt command line above.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; Integer add reduction starting at 0: each iteration adds the truncated
; induction variable, A[i] and B[i] to the running sum. The directives expect
; a vector phi, vector loads, a vector add and a final horizontal
; llvm.vector.reduce.add call.
;CHECK-LABEL: @reduction_sum(
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: add <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  ; Guard: the loop is skipped entirely when %n <= 0.
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  ; Running sum; %9 is the value carried around the backedge.
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = trunc i64 %indvars.iv to i32
  %7 = add i32 %sum.02, %6
  %8 = add i32 %7, %3
  %9 = add i32 %8, %5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

; Integer multiply reduction starting at 1: the running product is multiplied
; by the truncated induction variable, A[i] and B[i]. The directives expect a
; vector mul and a final llvm.vector.reduce.mul call.
;CHECK-LABEL: @reduction_prod(
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: mul <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  ; Running product; starts at the multiplicative identity 1.
  %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = trunc i64 %indvars.iv to i32
  %7 = mul i32 %prod.02, %6
  %8 = mul i32 %7, %3
  %9 = mul i32 %8, %5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]
  ret i32 %prod.0.lcssa
}

; Add reduction whose per-iteration term contains a multiply: the running sum
; accumulates the truncated induction variable plus A[i] * B[i]. The multiply
; (%6) does not involve the accumulator; only the adds form the reduction
; chain, so the directives expect a vector "mul nsw" and a reduce.add.
;CHECK-LABEL: @reduction_mix(
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: mul nsw <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = mul nsw i32 %5, %3
  %7 = trunc i64 %indvars.iv to i32
  %8 = add i32 %sum.02, %7
  %9 = add i32 %8, %6
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

; Multiply reduction with a non-identity start value (19): each iteration
; multiplies the accumulator by (A[i] + i + B[i]). The adds feed the multiply
; but do not involve the accumulator themselves.
;CHECK-LABEL: @reduction_mul(
;CHECK: mul <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  ; Accumulator starts at 19 rather than the multiplicative identity.
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = trunc i64 %indvars.iv to i32
  %7 = add i32 %3, %6
  %8 = add i32 %7, %5
  %9 = mul i32 %8, %sum.02
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ; Note: the loop-skipped path yields 0 here, not the in-loop start value 19.
  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
  ret i32 %sum.0.lcssa
}

; Add reduction of in[i] * coeff[i] with a non-zero start value (120). The
; directives expect the start value in lane 0 of the initial vector and zeros
; in the remaining lanes.
;CHECK-LABEL: @start_at_non_zero(
;CHECK: phi <4 x i32>
;CHECK: <i32 120, i32 0, i32 0, i32 0>
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %in, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %coeff, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %mul = mul nsw i32 %1, %0
  %add = add nsw i32 %mul, %sum.09
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
  ret i32 %sum.0.lcssa
}

; Bitwise-and reduction of (A[i] + B[i]) starting at -1 (all ones). The
; directives expect an all-ones vector, a vector and, and a reduce.and call.
;CHECK-LABEL: @reduction_and(
;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
;CHECK: and <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %1, %0
  %and = and i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ]
  ret i32 %result.0.lcssa
}

; Bitwise-or reduction of (A[i] + B[i]) starting at 0.
;CHECK-LABEL: @reduction_or(
;CHECK: or <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.or.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %1, %0
  %or = or i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ]
  ret i32 %result.0.lcssa
}

; Bitwise-xor reduction of (A[i] + B[i]) starting at 0.
;CHECK-LABEL: @reduction_xor(
;CHECK: xor <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>
;CHECK: ret i32
define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %add = add nsw i32 %1, %0
  %xor = xor i32 %add, %result.08
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
  ret i32 %result.0.lcssa
}

; In this code the accumulated value is on the RHS of the subtraction, so the
; loop is not a reduction and is not expected to be vectorized.
;CHECK-LABEL: @reduction_sub_rhs(
;CHECK-NOT: phi <4 x i32>
;CHECK-NOT: sub nsw <4 x i32>
;CHECK: ret i32
; Negative test: the running value %x.05 is the second (right-hand) operand of
; the sub, i.e. x = A[i] - x. The directives above require that no vector phi
; and no vector "sub nsw" appear before the return.
define i32 @reduction_sub_rhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
entry:
  ; Guard: the loop is skipped entirely when %n <= 0.
  %cmp4 = icmp sgt i32 %n, 0
  br i1 %cmp4, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  ; Loaded value on the LHS, accumulator on the RHS.
  %sub = sub nsw i32 %0, %x.05
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
  ret i32 %x.0.lcssa
}


; In this test the reduction variable is on the LHS and we can vectorize it.
;CHECK-LABEL: @reduction_sub_lhs(
;CHECK: phi <4 x i32>
;CHECK: sub <4 x i32>
;CHECK: ret i32
; Positive counterpart of the RHS case: the running value %x.05 is the first
; (left-hand) operand of the sub, i.e. x = x - A[i]. The directives above
; expect a vector phi and a vector sub.
define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
entry:
  ; Guard: the loop is skipped entirely when %n <= 0.
  %cmp4 = icmp sgt i32 %n, 0
  br i1 %cmp4, label %for.body, label %for.end

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  ; Accumulator on the LHS, loaded value on the RHS.
  %sub = sub nsw i32 %x.05, %0
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
  ret i32 %x.0.lcssa
}

; We can vectorize conditional reductions with multi-input phis.
; CHECK: reduction_conditional
; CHECK: fadd fast <4 x float>

; Conditional fast-math add reduction over a fixed trip count of 128. The
; accumulator %sum.033 is either updated by one of two fadds or passed through
; unchanged; the merge phi %sum.1 in for.inc only sees values derived from the
; accumulator.
define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  ; Accumulator; starts at the argument %S.
  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %1 = load float, float* %arrayidx2, align 4
  %cmp3 = fcmp ogt float %0, %1
  br i1 %cmp3, label %if.then, label %for.inc

if.then:
  %cmp6 = fcmp ogt float %1, 1.000000e+00
  br i1 %cmp6, label %if.then8, label %if.else

if.then8:
  ; A[i] > B[i] and B[i] > 1.0: accumulate A[i].
  %add = fadd fast float %sum.033, %0
  br label %for.inc

if.else:
  %cmp14 = fcmp ogt float %0, 2.000000e+00
  br i1 %cmp14, label %if.then16, label %for.inc

if.then16:
  ; A[i] > B[i], B[i] <= 1.0 (or unordered) and A[i] > 2.0: accumulate B[i].
  %add19 = fadd fast float %sum.033, %1
  br label %for.inc

for.inc:
  ; Every incoming value is either an updated sum or the unchanged accumulator.
  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ]
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
  ret float %sum.1.lcssa
}

; We can't vectorize reductions with phi inputs from outside the reduction.
; CHECK-LABEL: @noreduction_phi(
; CHECK-NOT: fadd <4 x float>
; CHECK-NOT: fadd fast <4 x float>
; Negative test: same shape as a conditional fadd reduction, but the merge phi
; %sum.1 receives the constant 0.0 on the edge from if.else, a value that does
; not come from the accumulator. Both scalar adds carry the "fast" flag, so a
; vectorized add would print as "fadd fast <4 x float>"; the second negative
; directive above covers that spelling, which the flag-less pattern alone can
; never match.
define float @noreduction_phi(float* %A, float* %B, float* %C, float %S) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
  ; Accumulator; starts at the argument %S.
  %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
  %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %1 = load float, float* %arrayidx2, align 4
  %cmp3 = fcmp ogt float %0, %1
  br i1 %cmp3, label %if.then, label %for.inc

if.then:
  %cmp6 = fcmp ogt float %1, 1.000000e+00
  br i1 %cmp6, label %if.then8, label %if.else

if.then8:
  %add = fadd fast float %sum.033, %0
  br label %for.inc

if.else:
  %cmp14 = fcmp ogt float %0, 2.000000e+00
  br i1 %cmp14, label %if.then16, label %for.inc

if.then16:
  %add19 = fadd fast float %sum.033, %1
  br label %for.inc

for.inc:
  ; The 0.0 incoming from if.else resets the running value instead of
  ; forwarding it; this is the input "from outside the reduction".
  %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ 0.000000e+00, %if.else ], [ %sum.033, %for.body ]
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
  ret float %sum.1.lcssa
}

; We can't vectorize reductions that feed another header PHI.
; CHECK-LABEL: @noredux_header_phi(
; CHECK-NOT: fadd <4 x float>
; CHECK-NOT: fadd fast <4 x float>

; Negative test: %add is the update of the header phi %sum.08 and is also an
; operand of %add1, the update of the second header phi %sum2.09. Both scalar
; adds carry the "fast" flag, so the negative directives above cover the
; "fadd fast" vector spelling as well as the flag-less one.
define float @noredux_header_phi(float* %A, float* %B, float* %C, float %S) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %sum2.09 = phi float [ 0.000000e+00, %entry ], [ %add1, %for.body ]
  %sum.08 = phi float [ %S, %entry ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4
  %add = fadd fast float %sum.08, %0
  ; %add feeds the other accumulator's update here.
  %add1 = fadd fast float %sum2.09, %add
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 128
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  %add1.lcssa = phi float [ %add1, %for.body ]
  %add.lcssa = phi float [ %add, %for.body ]
  %add2 = fadd fast float %add.lcssa, %add1.lcssa
  ret float %add2
}


; When vectorizing a reduction whose loop header phi value is used outside the
; loop special care must be taken. Otherwise, the reduced value feeding into the
; outside user misses a few iterations (VF-1) of the loop.
; PR16522

; CHECK-LABEL: @phivalueredux(
; CHECK-NOT: x i32>

; The function returns the header phi %p.addr.02 itself rather than the
; updated value %xor; no vector-of-i32 type may appear in the output.
define i32 @phivalueredux(i32 %p) {
entry:
  br label %for.body

for.body:
  %t.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %p.addr.02 = phi i32 [ %p, %entry ], [ %xor, %for.body ]
  %xor = xor i32 %p.addr.02, -1
  %inc = add nsw i32 %t.03, 1
  %exitcond = icmp eq i32 %inc, 16
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret i32 %p.addr.02
}

; Don't vectorize a reduction value that is not the last in a reduction cycle. We
; would lose iterations (VF-1) on the operations after that use.
; PR17498

; CHECK-LABEL: not_last_operation
; CHECK-NOT: x i32>
; The value used after the loop (%inc511.1.inc4.1) is an intermediate step of
; the update chain; the value carried around the backedge is %inc5.1, computed
; from it afterwards. No vector-of-i32 type may appear in the output.
define i32 @not_last_operation(i32 %p, i32 %val) {
entry:
  %tobool = icmp eq i32 %p, 0
  br label %for.body

for.body:
  %inc613.1 = phi i32 [ 0, %entry ], [ %inc6.1, %for.body ]
  %inc511.1 = phi i32 [ %val, %entry ], [ %inc5.1, %for.body ]
  %0 = zext i1 %tobool to i32
  %inc4.1 = xor i32 %0, 1
  ; Intermediate value: also read by the exit block.
  %inc511.1.inc4.1 = add nsw i32 %inc511.1, %inc4.1
  ; Last operation in the cycle: this is what feeds the header phi.
  %inc5.1 = add nsw i32 %inc511.1.inc4.1, 1
  %inc6.1 = add nsw i32 %inc613.1, 1
  %exitcond.1 = icmp eq i32 %inc6.1, 22
  br i1 %exitcond.1, label %exit, label %for.body

exit:
  %inc.2 = add nsw i32 %inc511.1.inc4.1, 2
  ret i32 %inc.2
}

; Add reduction whose final value %9 is read by two separate exit phis
; (%sum.lcssa and %sum.copy). The directives expect the usual vector add
; reduction plus a %sum.copy phi that merges a scalar-loop value with a value
; coming from %middle.block.
;CHECK-LABEL: @reduction_sum_multiuse(
;CHECK: phi <4 x i32>
;CHECK: load <4 x i32>
;CHECK: add <4 x i32>
;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32>
;CHECK: %sum.copy = phi i32 [ %[[SCALAR:.*]], %.lr.ph ], [ %[[VECTOR:.*]], %middle.block ]
;CHECK: ret i32
define i32 @reduction_sum_multiuse(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) {
  ; Guard: the loop is skipped entirely when %n <= 0.
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph.preheader, label %end
.lr.ph.preheader:                                 ; preds = %0
  br label %.lr.ph

.lr.ph:                                           ; preds = %.lr.ph.preheader, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %.lr.ph.preheader ]
  %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  %6 = trunc i64 %indvars.iv to i32
  %7 = add i32 %sum.02, %6
  %8 = add i32 %7, %3
  %9 = add i32 %8, %5
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph
  ; Two exit phis of the same reduction result.
  %sum.lcssa = phi i32 [ %9, %.lr.ph ]
  %sum.copy = phi i32 [ %9, %.lr.ph ]
  br label %end

end:                                              ; preds = %._crit_edge, %0
  %f1 = phi i32 [ 0, %0 ], [ %sum.lcssa, %._crit_edge ]
  %f2 = phi i32 [ 0, %0 ], [ %sum.copy, %._crit_edge ]
  %final = add i32 %f1, %f2
  ret i32 %final
}

; This looks like a predicated reduction, but it is a reset of the reduction
; variable. We cannot vectorize this.
; CHECK-LABEL: reduction_reset(
; CHECK-NOT: <4 x i32>
define void @reduction_reset(i32 %N, i32* nocapture readonly %arrayA, i32* nocapture %arrayB) {
entry:
  %c4 = icmp sgt i32 %N, 0
  br i1 %c4, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %entry
  ; %c5 = N - 1; sign-extended after the loop and used as the store index.
  %c5 = add i32 %N, -1
  %wide.trip.count = zext i32 %N to i64
  br label %.lr.ph

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
  %indvars.iv = phi i64 [ 0, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ]
  ; Running value; starts at 100.
  %.017 = phi i32 [ 100, %.lr.ph.preheader ], [ %csel, %.lr.ph ]
  %c6 = getelementptr inbounds i32, i32* %arrayA, i64 %indvars.iv
  %c7 = load i32, i32* %c6, align 4
  %c8 = icmp sgt i32 %c7, 0
  %c9 = add nsw i32 %c7, %.017
  ; When arrayA[i] <= 0 the running value becomes the constant 0 instead of
  ; being carried over unchanged: this is the "reset".
  %csel = select i1 %c8, i32 %c9, i32 0
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph

._crit_edge.loopexit:                             ; preds = %.lr.ph
  %csel.lcssa = phi i32 [ %csel, %.lr.ph ]
  %phitmp19 = sext i32 %c5 to i64
  br label %._crit_edge

._crit_edge:                                      ; preds = %._crit_edge.loopexit, %entry
  ; Store index: -1 when the loop was skipped, otherwise N - 1.
  %.015.lcssa = phi i64 [ -1, %entry ], [ %phitmp19, %._crit_edge.loopexit ]
  %.0.lcssa = phi i32 [ 100, %entry ], [ %csel.lcssa, %._crit_edge.loopexit ]
  %c10 = getelementptr inbounds i32, i32* %arrayB, i64 %.015.lcssa
  store i32 %.0.lcssa, i32* %c10, align 4
  ret void
}

; Can vectorize reduction with redundant single-operand phi input.
define i64 @reduction_with_phi_with_one_incoming_on_backedge(i16 %n, i64* %A) {
; CHECK-LABEL: @reduction_with_phi_with_one_incoming_on_backedge
; CHECK: add <4 x i64>
;
; i64 add reduction over A[iv] with an i16 induction variable starting at 1.
; The updated sum reaches the header phi through %phi.sum.next, a phi with a
; single incoming value.
entry:
  br label %loop.header

loop.header:
  %iv = phi i16 [ 1, %entry ], [ %iv.next, %loop.latch ]
  %sum = phi i64 [ 0, %entry ], [ %phi.sum.next, %loop.latch ]
  %gep.A = getelementptr i64, i64* %A, i16 %iv
  %lv.A = load i64, i64* %gep.A
  %sum.next = add nsw i64 %sum, %lv.A
  br label %loop.bb

loop.bb:
  ; Single-operand phi: only forwards %sum.next.
  %phi.sum.next = phi i64 [ %sum.next, %loop.header ]
  br label %loop.latch

loop.latch:
  %iv.next = add nsw i16 %iv, 1
  %cond = icmp slt i16 %iv.next, %n
  br i1 %cond, label %loop.header, label %exit

exit:
  %lcssa.exit = phi i64 [ %phi.sum.next, %loop.latch ]
  ret i64 %lcssa.exit
}

; Can vectorize reduction with redundant two-operand phi input.
define i64 @reduction_with_phi_with_two_incoming_on_backedge(i16 %n, i64* %A) {
; CHECK-LABEL: @reduction_with_phi_with_two_incoming_on_backedge
; CHECK: add <4 x i64>
;
; Same reduction, but the loop body branches on A[iv] == 29 and the two paths
; merge in a phi whose incoming values are both %sum.next.
entry:
  br label %loop.header

loop.header:
  %iv = phi i16 [ 1, %entry ], [ %iv.next, %loop.latch ]
  %sum = phi i64 [ 0, %entry ], [ %phi.sum.next, %loop.latch ]
  %gep.A = getelementptr i64, i64* %A, i16 %iv
  %lv.A = load i64, i64* %gep.A
  %sum.next = add nsw i64 %sum, %lv.A
  %cmp.0 = icmp eq i64 %lv.A, 29
  br i1 %cmp.0, label %loop.bb, label %loop.latch

loop.bb:
  ; Empty block: exists only to give the latch phi a second predecessor.
  br label %loop.latch

loop.latch:
  ; Both incoming values are the same %sum.next.
  %phi.sum.next = phi i64 [ %sum.next, %loop.bb ], [ %sum.next, %loop.header ]
  %iv.next = add nsw i16 %iv, 1
  %cond = icmp slt i16 %iv.next, %n
  br i1 %cond, label %loop.header, label %exit

exit:
  %lcssa.exit = phi i64 [ %phi.sum.next, %loop.latch ]
  ret i64 %lcssa.exit
}

; Make sure any check-not directives are not triggered by function declarations.
; CHECK: declare