1 //===- LoopFlatten.cpp - Loop flattening pass------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass flattens pairs nested loops into a single loop.
10 //
11 // The intention is to optimise loop nests like this, which together access an
12 // array linearly:
13 // for (int i = 0; i < N; ++i)
14 // for (int j = 0; j < M; ++j)
15 // f(A[i*M+j]);
16 // into one loop:
17 // for (int i = 0; i < (N*M); ++i)
18 // f(A[i]);
19 //
20 // It can also flatten loops where the induction variables are not used in the
21 // loop. This is only worth doing if the induction variables are only used in an
22 // expression like i*M+j. If they had any other uses, we would have to insert a
23 // div/mod to reconstruct the original values, so this wouldn't be profitable.
24 //
25 // We also need to prove that N*M will not overflow.
26 //
27 //===----------------------------------------------------------------------===//
28
29 #include "llvm/Transforms/Scalar/LoopFlatten.h"
30 #include "llvm/Analysis/AssumptionCache.h"
31 #include "llvm/Analysis/LoopInfo.h"
32 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
33 #include "llvm/Analysis/ScalarEvolution.h"
34 #include "llvm/Analysis/TargetTransformInfo.h"
35 #include "llvm/Analysis/ValueTracking.h"
36 #include "llvm/IR/Dominators.h"
37 #include "llvm/IR/Function.h"
38 #include "llvm/IR/IRBuilder.h"
39 #include "llvm/IR/Module.h"
40 #include "llvm/IR/PatternMatch.h"
41 #include "llvm/IR/Verifier.h"
42 #include "llvm/InitializePasses.h"
43 #include "llvm/Pass.h"
44 #include "llvm/Support/Debug.h"
45 #include "llvm/Support/raw_ostream.h"
46 #include "llvm/Transforms/Scalar.h"
47 #include "llvm/Transforms/Utils/Local.h"
48 #include "llvm/Transforms/Utils/LoopUtils.h"
49 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
50 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
51
52 #define DEBUG_TYPE "loop-flatten"
53
54 using namespace llvm;
55 using namespace llvm::PatternMatch;
56
57 static cl::opt<unsigned> RepeatedInstructionThreshold(
58 "loop-flatten-cost-threshold", cl::Hidden, cl::init(2),
59 cl::desc("Limit on the cost of instructions that can be repeated due to "
60 "loop flattening"));
61
62 static cl::opt<bool>
63 AssumeNoOverflow("loop-flatten-assume-no-overflow", cl::Hidden,
64 cl::init(false),
65 cl::desc("Assume that the product of the two iteration "
66 "trip counts will never overflow"));
67
68 static cl::opt<bool>
69 WidenIV("loop-flatten-widen-iv", cl::Hidden,
70 cl::init(true),
71 cl::desc("Widen the loop induction variables, if possible, so "
72 "overflow checks won't reject flattening"));
73
74 struct FlattenInfo {
75 Loop *OuterLoop = nullptr;
76 Loop *InnerLoop = nullptr;
77 // These PHINodes correspond to loop induction variables, which are expected
78 // to start at zero and increment by one on each loop.
79 PHINode *InnerInductionPHI = nullptr;
80 PHINode *OuterInductionPHI = nullptr;
81 Value *InnerTripCount = nullptr;
82 Value *OuterTripCount = nullptr;
83 BinaryOperator *InnerIncrement = nullptr;
84 BinaryOperator *OuterIncrement = nullptr;
85 BranchInst *InnerBranch = nullptr;
86 BranchInst *OuterBranch = nullptr;
87 SmallPtrSet<Value *, 4> LinearIVUses;
88 SmallPtrSet<PHINode *, 4> InnerPHIsToTransform;
89
90 // Whether this holds the flatten info before or after widening.
91 bool Widened = false;
92
FlattenInfoFlattenInfo93 FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {};
94 };
95
96 // Finds the induction variable, increment and trip count for a simple loop that
97 // we can flatten.
findLoopComponents(Loop * L,SmallPtrSetImpl<Instruction * > & IterationInstructions,PHINode * & InductionPHI,Value * & TripCount,BinaryOperator * & Increment,BranchInst * & BackBranch,ScalarEvolution * SE,bool IsWidened)98 static bool findLoopComponents(
99 Loop *L, SmallPtrSetImpl<Instruction *> &IterationInstructions,
100 PHINode *&InductionPHI, Value *&TripCount, BinaryOperator *&Increment,
101 BranchInst *&BackBranch, ScalarEvolution *SE, bool IsWidened) {
102 LLVM_DEBUG(dbgs() << "Finding components of loop: " << L->getName() << "\n");
103
104 if (!L->isLoopSimplifyForm()) {
105 LLVM_DEBUG(dbgs() << "Loop is not in normal form\n");
106 return false;
107 }
108
109 // Currently, to simplify the implementation, the Loop induction variable must
110 // start at zero and increment with a step size of one.
111 if (!L->isCanonical(*SE)) {
112 LLVM_DEBUG(dbgs() << "Loop is not canonical\n");
113 return false;
114 }
115
116 // There must be exactly one exiting block, and it must be the same at the
117 // latch.
118 BasicBlock *Latch = L->getLoopLatch();
119 if (L->getExitingBlock() != Latch) {
120 LLVM_DEBUG(dbgs() << "Exiting and latch block are different\n");
121 return false;
122 }
123
124 // Find the induction PHI. If there is no induction PHI, we can't do the
125 // transformation. TODO: could other variables trigger this? Do we have to
126 // search for the best one?
127 InductionPHI = L->getInductionVariable(*SE);
128 if (!InductionPHI) {
129 LLVM_DEBUG(dbgs() << "Could not find induction PHI\n");
130 return false;
131 }
132 LLVM_DEBUG(dbgs() << "Found induction PHI: "; InductionPHI->dump());
133
134 bool ContinueOnTrue = L->contains(Latch->getTerminator()->getSuccessor(0));
135 auto IsValidPredicate = [&](ICmpInst::Predicate Pred) {
136 if (ContinueOnTrue)
137 return Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT;
138 else
139 return Pred == CmpInst::ICMP_EQ;
140 };
141
142 // Find Compare and make sure it is valid. getLatchCmpInst checks that the
143 // back branch of the latch is conditional.
144 ICmpInst *Compare = L->getLatchCmpInst();
145 if (!Compare || !IsValidPredicate(Compare->getUnsignedPredicate()) ||
146 Compare->hasNUsesOrMore(2)) {
147 LLVM_DEBUG(dbgs() << "Could not find valid comparison\n");
148 return false;
149 }
150 BackBranch = cast<BranchInst>(Latch->getTerminator());
151 IterationInstructions.insert(BackBranch);
152 LLVM_DEBUG(dbgs() << "Found back branch: "; BackBranch->dump());
153 IterationInstructions.insert(Compare);
154 LLVM_DEBUG(dbgs() << "Found comparison: "; Compare->dump());
155
156 // Find increment and trip count.
157 // There are exactly 2 incoming values to the induction phi; one from the
158 // pre-header and one from the latch. The incoming latch value is the
159 // increment variable.
160 Increment =
161 dyn_cast<BinaryOperator>(InductionPHI->getIncomingValueForBlock(Latch));
162 if (Increment->hasNUsesOrMore(3)) {
163 LLVM_DEBUG(dbgs() << "Could not find valid increment\n");
164 return false;
165 }
166 // The trip count is the RHS of the compare. If this doesn't match the trip
167 // count computed by SCEV then this is either because the trip count variable
168 // has been widened (then leave the trip count as it is), or because it is a
169 // constant and another transformation has changed the compare, e.g.
170 // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, then we don't flatten
171 // the loop (yet).
172 TripCount = Compare->getOperand(1);
173 const SCEV *SCEVTripCount =
174 SE->getTripCountFromExitCount(SE->getBackedgeTakenCount(L));
175 if (SE->getSCEV(TripCount) != SCEVTripCount) {
176 if (!IsWidened) {
177 LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
178 return false;
179 }
180 auto TripCountInst = dyn_cast<Instruction>(TripCount);
181 if (!TripCountInst) {
182 LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n");
183 return false;
184 }
185 if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) ||
186 SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) {
187 LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n");
188 return false;
189 }
190 }
191 IterationInstructions.insert(Increment);
192 LLVM_DEBUG(dbgs() << "Found increment: "; Increment->dump());
193 LLVM_DEBUG(dbgs() << "Found trip count: "; TripCount->dump());
194
195 LLVM_DEBUG(dbgs() << "Successfully found all loop components\n");
196 return true;
197 }
198
checkPHIs(FlattenInfo & FI,const TargetTransformInfo * TTI)199 static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) {
200 // All PHIs in the inner and outer headers must either be:
201 // - The induction PHI, which we are going to rewrite as one induction in
202 // the new loop. This is already checked by findLoopComponents.
203 // - An outer header PHI with all incoming values from outside the loop.
204 // LoopSimplify guarantees we have a pre-header, so we don't need to
205 // worry about that here.
206 // - Pairs of PHIs in the inner and outer headers, which implement a
207 // loop-carried dependency that will still be valid in the new loop. To
208 // be valid, this variable must be modified only in the inner loop.
209
210 // The set of PHI nodes in the outer loop header that we know will still be
211 // valid after the transformation. These will not need to be modified (with
212 // the exception of the induction variable), but we do need to check that
213 // there are no unsafe PHI nodes.
214 SmallPtrSet<PHINode *, 4> SafeOuterPHIs;
215 SafeOuterPHIs.insert(FI.OuterInductionPHI);
216
217 // Check that all PHI nodes in the inner loop header match one of the valid
218 // patterns.
219 for (PHINode &InnerPHI : FI.InnerLoop->getHeader()->phis()) {
220 // The induction PHIs break these rules, and that's OK because we treat
221 // them specially when doing the transformation.
222 if (&InnerPHI == FI.InnerInductionPHI)
223 continue;
224
225 // Each inner loop PHI node must have two incoming values/blocks - one
226 // from the pre-header, and one from the latch.
227 assert(InnerPHI.getNumIncomingValues() == 2);
228 Value *PreHeaderValue =
229 InnerPHI.getIncomingValueForBlock(FI.InnerLoop->getLoopPreheader());
230 Value *LatchValue =
231 InnerPHI.getIncomingValueForBlock(FI.InnerLoop->getLoopLatch());
232
233 // The incoming value from the outer loop must be the PHI node in the
234 // outer loop header, with no modifications made in the top of the outer
235 // loop.
236 PHINode *OuterPHI = dyn_cast<PHINode>(PreHeaderValue);
237 if (!OuterPHI || OuterPHI->getParent() != FI.OuterLoop->getHeader()) {
238 LLVM_DEBUG(dbgs() << "value modified in top of outer loop\n");
239 return false;
240 }
241
242 // The other incoming value must come from the inner loop, without any
243 // modifications in the tail end of the outer loop. We are in LCSSA form,
244 // so this will actually be a PHI in the inner loop's exit block, which
245 // only uses values from inside the inner loop.
246 PHINode *LCSSAPHI = dyn_cast<PHINode>(
247 OuterPHI->getIncomingValueForBlock(FI.OuterLoop->getLoopLatch()));
248 if (!LCSSAPHI) {
249 LLVM_DEBUG(dbgs() << "could not find LCSSA PHI\n");
250 return false;
251 }
252
253 // The value used by the LCSSA PHI must be the same one that the inner
254 // loop's PHI uses.
255 if (LCSSAPHI->hasConstantValue() != LatchValue) {
256 LLVM_DEBUG(
257 dbgs() << "LCSSA PHI incoming value does not match latch value\n");
258 return false;
259 }
260
261 LLVM_DEBUG(dbgs() << "PHI pair is safe:\n");
262 LLVM_DEBUG(dbgs() << " Inner: "; InnerPHI.dump());
263 LLVM_DEBUG(dbgs() << " Outer: "; OuterPHI->dump());
264 SafeOuterPHIs.insert(OuterPHI);
265 FI.InnerPHIsToTransform.insert(&InnerPHI);
266 }
267
268 for (PHINode &OuterPHI : FI.OuterLoop->getHeader()->phis()) {
269 if (!SafeOuterPHIs.count(&OuterPHI)) {
270 LLVM_DEBUG(dbgs() << "found unsafe PHI in outer loop: "; OuterPHI.dump());
271 return false;
272 }
273 }
274
275 LLVM_DEBUG(dbgs() << "checkPHIs: OK\n");
276 return true;
277 }
278
279 static bool
checkOuterLoopInsts(FlattenInfo & FI,SmallPtrSetImpl<Instruction * > & IterationInstructions,const TargetTransformInfo * TTI)280 checkOuterLoopInsts(FlattenInfo &FI,
281 SmallPtrSetImpl<Instruction *> &IterationInstructions,
282 const TargetTransformInfo *TTI) {
283 // Check for instructions in the outer but not inner loop. If any of these
284 // have side-effects then this transformation is not legal, and if there is
285 // a significant amount of code here which can't be optimised out that it's
286 // not profitable (as these instructions would get executed for each
287 // iteration of the inner loop).
288 InstructionCost RepeatedInstrCost = 0;
289 for (auto *B : FI.OuterLoop->getBlocks()) {
290 if (FI.InnerLoop->contains(B))
291 continue;
292
293 for (auto &I : *B) {
294 if (!isa<PHINode>(&I) && !I.isTerminator() &&
295 !isSafeToSpeculativelyExecute(&I)) {
296 LLVM_DEBUG(dbgs() << "Cannot flatten because instruction may have "
297 "side effects: ";
298 I.dump());
299 return false;
300 }
301 // The execution count of the outer loop's iteration instructions
302 // (increment, compare and branch) will be increased, but the
303 // equivalent instructions will be removed from the inner loop, so
304 // they make a net difference of zero.
305 if (IterationInstructions.count(&I))
306 continue;
307 // The uncoditional branch to the inner loop's header will turn into
308 // a fall-through, so adds no cost.
309 BranchInst *Br = dyn_cast<BranchInst>(&I);
310 if (Br && Br->isUnconditional() &&
311 Br->getSuccessor(0) == FI.InnerLoop->getHeader())
312 continue;
313 // Multiplies of the outer iteration variable and inner iteration
314 // count will be optimised out.
315 if (match(&I, m_c_Mul(m_Specific(FI.OuterInductionPHI),
316 m_Specific(FI.InnerTripCount))))
317 continue;
318 InstructionCost Cost =
319 TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
320 LLVM_DEBUG(dbgs() << "Cost " << Cost << ": "; I.dump());
321 RepeatedInstrCost += Cost;
322 }
323 }
324
325 LLVM_DEBUG(dbgs() << "Cost of instructions that will be repeated: "
326 << RepeatedInstrCost << "\n");
327 // Bail out if flattening the loops would cause instructions in the outer
328 // loop but not in the inner loop to be executed extra times.
329 if (RepeatedInstrCost > RepeatedInstructionThreshold) {
330 LLVM_DEBUG(dbgs() << "checkOuterLoopInsts: not profitable, bailing.\n");
331 return false;
332 }
333
334 LLVM_DEBUG(dbgs() << "checkOuterLoopInsts: OK\n");
335 return true;
336 }
337
checkIVUsers(FlattenInfo & FI)338 static bool checkIVUsers(FlattenInfo &FI) {
339 // We require all uses of both induction variables to match this pattern:
340 //
341 // (OuterPHI * InnerTripCount) + InnerPHI
342 //
343 // Any uses of the induction variables not matching that pattern would
344 // require a div/mod to reconstruct in the flattened loop, so the
345 // transformation wouldn't be profitable.
346
347 Value *InnerTripCount = FI.InnerTripCount;
348 if (FI.Widened &&
349 (isa<SExtInst>(InnerTripCount) || isa<ZExtInst>(InnerTripCount)))
350 InnerTripCount = cast<Instruction>(InnerTripCount)->getOperand(0);
351
352 // Check that all uses of the inner loop's induction variable match the
353 // expected pattern, recording the uses of the outer IV.
354 SmallPtrSet<Value *, 4> ValidOuterPHIUses;
355 for (User *U : FI.InnerInductionPHI->users()) {
356 if (U == FI.InnerIncrement)
357 continue;
358
359 // After widening the IVs, a trunc instruction might have been introduced, so
360 // look through truncs.
361 if (isa<TruncInst>(U)) {
362 if (!U->hasOneUse())
363 return false;
364 U = *U->user_begin();
365 }
366
367 LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
368
369 Value *MatchedMul;
370 Value *MatchedItCount;
371 bool IsAdd = match(U, m_c_Add(m_Specific(FI.InnerInductionPHI),
372 m_Value(MatchedMul))) &&
373 match(MatchedMul, m_c_Mul(m_Specific(FI.OuterInductionPHI),
374 m_Value(MatchedItCount)));
375
376 // Matches the same pattern as above, except it also looks for truncs
377 // on the phi, which can be the result of widening the induction variables.
378 bool IsAddTrunc = match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)),
379 m_Value(MatchedMul))) &&
380 match(MatchedMul,
381 m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)),
382 m_Value(MatchedItCount)));
383
384 if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) {
385 LLVM_DEBUG(dbgs() << "Use is optimisable\n");
386 ValidOuterPHIUses.insert(MatchedMul);
387 FI.LinearIVUses.insert(U);
388 } else {
389 LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
390 return false;
391 }
392 }
393
394 // Check that there are no uses of the outer IV other than the ones found
395 // as part of the pattern above.
396 for (User *U : FI.OuterInductionPHI->users()) {
397 if (U == FI.OuterIncrement)
398 continue;
399
400 auto IsValidOuterPHIUses = [&] (User *U) -> bool {
401 LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
402 if (!ValidOuterPHIUses.count(U)) {
403 LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
404 return false;
405 }
406 LLVM_DEBUG(dbgs() << "Use is optimisable\n");
407 return true;
408 };
409
410 if (auto *V = dyn_cast<TruncInst>(U)) {
411 for (auto *K : V->users()) {
412 if (!IsValidOuterPHIUses(K))
413 return false;
414 }
415 continue;
416 }
417
418 if (!IsValidOuterPHIUses(U))
419 return false;
420 }
421
422 LLVM_DEBUG(dbgs() << "checkIVUsers: OK\n";
423 dbgs() << "Found " << FI.LinearIVUses.size()
424 << " value(s) that can be replaced:\n";
425 for (Value *V : FI.LinearIVUses) {
426 dbgs() << " ";
427 V->dump();
428 });
429 return true;
430 }
431
432 // Return an OverflowResult dependant on if overflow of the multiplication of
433 // InnerTripCount and OuterTripCount can be assumed not to happen.
checkOverflow(FlattenInfo & FI,DominatorTree * DT,AssumptionCache * AC)434 static OverflowResult checkOverflow(FlattenInfo &FI, DominatorTree *DT,
435 AssumptionCache *AC) {
436 Function *F = FI.OuterLoop->getHeader()->getParent();
437 const DataLayout &DL = F->getParent()->getDataLayout();
438
439 // For debugging/testing.
440 if (AssumeNoOverflow)
441 return OverflowResult::NeverOverflows;
442
443 // Check if the multiply could not overflow due to known ranges of the
444 // input values.
445 OverflowResult OR = computeOverflowForUnsignedMul(
446 FI.InnerTripCount, FI.OuterTripCount, DL, AC,
447 FI.OuterLoop->getLoopPreheader()->getTerminator(), DT);
448 if (OR != OverflowResult::MayOverflow)
449 return OR;
450
451 for (Value *V : FI.LinearIVUses) {
452 for (Value *U : V->users()) {
453 if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
454 // The IV is used as the operand of a GEP, and the IV is at least as
455 // wide as the address space of the GEP. In this case, the GEP would
456 // wrap around the address space before the IV increment wraps, which
457 // would be UB.
458 if (GEP->isInBounds() &&
459 V->getType()->getIntegerBitWidth() >=
460 DL.getPointerTypeSizeInBits(GEP->getType())) {
461 LLVM_DEBUG(
462 dbgs() << "use of linear IV would be UB if overflow occurred: ";
463 GEP->dump());
464 return OverflowResult::NeverOverflows;
465 }
466 }
467 }
468 }
469
470 return OverflowResult::MayOverflow;
471 }
472
CanFlattenLoopPair(FlattenInfo & FI,DominatorTree * DT,LoopInfo * LI,ScalarEvolution * SE,AssumptionCache * AC,const TargetTransformInfo * TTI)473 static bool CanFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
474 ScalarEvolution *SE, AssumptionCache *AC,
475 const TargetTransformInfo *TTI) {
476 SmallPtrSet<Instruction *, 8> IterationInstructions;
477 if (!findLoopComponents(FI.InnerLoop, IterationInstructions,
478 FI.InnerInductionPHI, FI.InnerTripCount,
479 FI.InnerIncrement, FI.InnerBranch, SE, FI.Widened))
480 return false;
481 if (!findLoopComponents(FI.OuterLoop, IterationInstructions,
482 FI.OuterInductionPHI, FI.OuterTripCount,
483 FI.OuterIncrement, FI.OuterBranch, SE, FI.Widened))
484 return false;
485
486 // Both of the loop trip count values must be invariant in the outer loop
487 // (non-instructions are all inherently invariant).
488 if (!FI.OuterLoop->isLoopInvariant(FI.InnerTripCount)) {
489 LLVM_DEBUG(dbgs() << "inner loop trip count not invariant\n");
490 return false;
491 }
492 if (!FI.OuterLoop->isLoopInvariant(FI.OuterTripCount)) {
493 LLVM_DEBUG(dbgs() << "outer loop trip count not invariant\n");
494 return false;
495 }
496
497 if (!checkPHIs(FI, TTI))
498 return false;
499
500 // FIXME: it should be possible to handle different types correctly.
501 if (FI.InnerInductionPHI->getType() != FI.OuterInductionPHI->getType())
502 return false;
503
504 if (!checkOuterLoopInsts(FI, IterationInstructions, TTI))
505 return false;
506
507 // Find the values in the loop that can be replaced with the linearized
508 // induction variable, and check that there are no other uses of the inner
509 // or outer induction variable. If there were, we could still do this
510 // transformation, but we'd have to insert a div/mod to calculate the
511 // original IVs, so it wouldn't be profitable.
512 if (!checkIVUsers(FI))
513 return false;
514
515 LLVM_DEBUG(dbgs() << "CanFlattenLoopPair: OK\n");
516 return true;
517 }
518
DoFlattenLoopPair(FlattenInfo & FI,DominatorTree * DT,LoopInfo * LI,ScalarEvolution * SE,AssumptionCache * AC,const TargetTransformInfo * TTI)519 static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
520 ScalarEvolution *SE, AssumptionCache *AC,
521 const TargetTransformInfo *TTI) {
522 Function *F = FI.OuterLoop->getHeader()->getParent();
523 LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n");
524 {
525 using namespace ore;
526 OptimizationRemark Remark(DEBUG_TYPE, "Flattened", FI.InnerLoop->getStartLoc(),
527 FI.InnerLoop->getHeader());
528 OptimizationRemarkEmitter ORE(F);
529 Remark << "Flattened into outer loop";
530 ORE.emit(Remark);
531 }
532
533 Value *NewTripCount = BinaryOperator::CreateMul(
534 FI.InnerTripCount, FI.OuterTripCount, "flatten.tripcount",
535 FI.OuterLoop->getLoopPreheader()->getTerminator());
536 LLVM_DEBUG(dbgs() << "Created new trip count in preheader: ";
537 NewTripCount->dump());
538
539 // Fix up PHI nodes that take values from the inner loop back-edge, which
540 // we are about to remove.
541 FI.InnerInductionPHI->removeIncomingValue(FI.InnerLoop->getLoopLatch());
542
543 // The old Phi will be optimised away later, but for now we can't leave
544 // leave it in an invalid state, so are updating them too.
545 for (PHINode *PHI : FI.InnerPHIsToTransform)
546 PHI->removeIncomingValue(FI.InnerLoop->getLoopLatch());
547
548 // Modify the trip count of the outer loop to be the product of the two
549 // trip counts.
550 cast<User>(FI.OuterBranch->getCondition())->setOperand(1, NewTripCount);
551
552 // Replace the inner loop backedge with an unconditional branch to the exit.
553 BasicBlock *InnerExitBlock = FI.InnerLoop->getExitBlock();
554 BasicBlock *InnerExitingBlock = FI.InnerLoop->getExitingBlock();
555 InnerExitingBlock->getTerminator()->eraseFromParent();
556 BranchInst::Create(InnerExitBlock, InnerExitingBlock);
557 DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader());
558
559 // Replace all uses of the polynomial calculated from the two induction
560 // variables with the one new one.
561 IRBuilder<> Builder(FI.OuterInductionPHI->getParent()->getTerminator());
562 for (Value *V : FI.LinearIVUses) {
563 Value *OuterValue = FI.OuterInductionPHI;
564 if (FI.Widened)
565 OuterValue = Builder.CreateTrunc(FI.OuterInductionPHI, V->getType(),
566 "flatten.trunciv");
567
568 LLVM_DEBUG(dbgs() << "Replacing: "; V->dump();
569 dbgs() << "with: "; OuterValue->dump());
570 V->replaceAllUsesWith(OuterValue);
571 }
572
573 // Tell LoopInfo, SCEV and the pass manager that the inner loop has been
574 // deleted, and any information that have about the outer loop invalidated.
575 SE->forgetLoop(FI.OuterLoop);
576 SE->forgetLoop(FI.InnerLoop);
577 LI->erase(FI.InnerLoop);
578 return true;
579 }
580
CanWidenIV(FlattenInfo & FI,DominatorTree * DT,LoopInfo * LI,ScalarEvolution * SE,AssumptionCache * AC,const TargetTransformInfo * TTI)581 static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
582 ScalarEvolution *SE, AssumptionCache *AC,
583 const TargetTransformInfo *TTI) {
584 if (!WidenIV) {
585 LLVM_DEBUG(dbgs() << "Widening the IVs is disabled\n");
586 return false;
587 }
588
589 LLVM_DEBUG(dbgs() << "Try widening the IVs\n");
590 Module *M = FI.InnerLoop->getHeader()->getParent()->getParent();
591 auto &DL = M->getDataLayout();
592 auto *InnerType = FI.InnerInductionPHI->getType();
593 auto *OuterType = FI.OuterInductionPHI->getType();
594 unsigned MaxLegalSize = DL.getLargestLegalIntTypeSizeInBits();
595 auto *MaxLegalType = DL.getLargestLegalIntType(M->getContext());
596
597 // If both induction types are less than the maximum legal integer width,
598 // promote both to the widest type available so we know calculating
599 // (OuterTripCount * InnerTripCount) as the new trip count is safe.
600 if (InnerType != OuterType ||
601 InnerType->getScalarSizeInBits() >= MaxLegalSize ||
602 MaxLegalType->getScalarSizeInBits() < InnerType->getScalarSizeInBits() * 2) {
603 LLVM_DEBUG(dbgs() << "Can't widen the IV\n");
604 return false;
605 }
606
607 SCEVExpander Rewriter(*SE, DL, "loopflatten");
608 SmallVector<WideIVInfo, 2> WideIVs;
609 SmallVector<WeakTrackingVH, 4> DeadInsts;
610 WideIVs.push_back( {FI.InnerInductionPHI, MaxLegalType, false });
611 WideIVs.push_back( {FI.OuterInductionPHI, MaxLegalType, false });
612 unsigned ElimExt = 0;
613 unsigned Widened = 0;
614
615 for (const auto &WideIV : WideIVs) {
616 PHINode *WidePhi = createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts,
617 ElimExt, Widened, true /* HasGuards */,
618 true /* UsePostIncrementRanges */);
619 if (!WidePhi)
620 return false;
621 LLVM_DEBUG(dbgs() << "Created wide phi: "; WidePhi->dump());
622 LLVM_DEBUG(dbgs() << "Deleting old phi: "; WideIV.NarrowIV->dump());
623 RecursivelyDeleteDeadPHINode(WideIV.NarrowIV);
624 }
625 // After widening, rediscover all the loop components.
626 assert(Widened && "Widened IV expected");
627 FI.Widened = true;
628 return CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
629 }
630
FlattenLoopPair(FlattenInfo & FI,DominatorTree * DT,LoopInfo * LI,ScalarEvolution * SE,AssumptionCache * AC,const TargetTransformInfo * TTI)631 static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
632 ScalarEvolution *SE, AssumptionCache *AC,
633 const TargetTransformInfo *TTI) {
634 LLVM_DEBUG(
635 dbgs() << "Loop flattening running on outer loop "
636 << FI.OuterLoop->getHeader()->getName() << " and inner loop "
637 << FI.InnerLoop->getHeader()->getName() << " in "
638 << FI.OuterLoop->getHeader()->getParent()->getName() << "\n");
639
640 if (!CanFlattenLoopPair(FI, DT, LI, SE, AC, TTI))
641 return false;
642
643 // Check if we can widen the induction variables to avoid overflow checks.
644 if (CanWidenIV(FI, DT, LI, SE, AC, TTI))
645 return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
646
647 // Check if the new iteration variable might overflow. In this case, we
648 // need to version the loop, and select the original version at runtime if
649 // the iteration space is too large.
650 // TODO: We currently don't version the loop.
651 OverflowResult OR = checkOverflow(FI, DT, AC);
652 if (OR == OverflowResult::AlwaysOverflowsHigh ||
653 OR == OverflowResult::AlwaysOverflowsLow) {
654 LLVM_DEBUG(dbgs() << "Multiply would always overflow, so not profitable\n");
655 return false;
656 } else if (OR == OverflowResult::MayOverflow) {
657 LLVM_DEBUG(dbgs() << "Multiply might overflow, not flattening\n");
658 return false;
659 }
660
661 LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
662 return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
663 }
664
Flatten(LoopNest & LN,DominatorTree * DT,LoopInfo * LI,ScalarEvolution * SE,AssumptionCache * AC,TargetTransformInfo * TTI)665 bool Flatten(LoopNest &LN, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE,
666 AssumptionCache *AC, TargetTransformInfo *TTI) {
667 bool Changed = false;
668 for (Loop *InnerLoop : LN.getLoops()) {
669 auto *OuterLoop = InnerLoop->getParentLoop();
670 if (!OuterLoop)
671 continue;
672 FlattenInfo FI(OuterLoop, InnerLoop);
673 Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI);
674 }
675 return Changed;
676 }
677
run(LoopNest & LN,LoopAnalysisManager & LAM,LoopStandardAnalysisResults & AR,LPMUpdater & U)678 PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
679 LoopStandardAnalysisResults &AR,
680 LPMUpdater &U) {
681
682 bool Changed = false;
683
684 // The loop flattening pass requires loops to be
685 // in simplified form, and also needs LCSSA. Running
686 // this pass will simplify all loops that contain inner loops,
687 // regardless of whether anything ends up being flattened.
688 Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI);
689
690 if (!Changed)
691 return PreservedAnalyses::all();
692
693 return PreservedAnalyses::none();
694 }
695
696 namespace {
697 class LoopFlattenLegacyPass : public FunctionPass {
698 public:
699 static char ID; // Pass ID, replacement for typeid
LoopFlattenLegacyPass()700 LoopFlattenLegacyPass() : FunctionPass(ID) {
701 initializeLoopFlattenLegacyPassPass(*PassRegistry::getPassRegistry());
702 }
703
704 // Possibly flatten loop L into its child.
705 bool runOnFunction(Function &F) override;
706
getAnalysisUsage(AnalysisUsage & AU) const707 void getAnalysisUsage(AnalysisUsage &AU) const override {
708 getLoopAnalysisUsage(AU);
709 AU.addRequired<TargetTransformInfoWrapperPass>();
710 AU.addPreserved<TargetTransformInfoWrapperPass>();
711 AU.addRequired<AssumptionCacheTracker>();
712 AU.addPreserved<AssumptionCacheTracker>();
713 }
714 };
715 } // namespace
716
717 char LoopFlattenLegacyPass::ID = 0;
718 INITIALIZE_PASS_BEGIN(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
719 false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)720 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
721 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
722 INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
723 false, false)
724
725 FunctionPass *llvm::createLoopFlattenPass() { return new LoopFlattenLegacyPass(); }
726
runOnFunction(Function & F)727 bool LoopFlattenLegacyPass::runOnFunction(Function &F) {
728 ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
729 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
730 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
731 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
732 auto &TTIP = getAnalysis<TargetTransformInfoWrapperPass>();
733 auto *TTI = &TTIP.getTTI(F);
734 auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
735 bool Changed = false;
736 for (Loop *L : *LI) {
737 auto LN = LoopNest::getLoopNest(*L, *SE);
738 Changed |= Flatten(*LN, DT, LI, SE, AC, TTI);
739 }
740 return Changed;
741 }
742