//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements loop unroll and jam as a routine, much like
// LoopUnroll.cpp implements loop unroll.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
using namespace llvm;

#define DEBUG_TYPE "loop-unroll-and-jam"

STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
STATISTIC(NumCompletelyUnrolledAndJammed,
          "Number of loops fully unroll and jammed");

typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;

// Partition blocks in an outer/inner loop pair into blocks before and after
// the loop
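// (Illustrative sketch: for a nest like the one in the comment above
// UnrollAndJamLoop, Fore ends up holding the outer header through the inner
// preheader, SubLoop holds the inner loop blocks, and Aft holds the blocks
// from the inner exit block down to the outer latch.)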
static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
                                     BasicBlockSet &ForeBlocks,
                                     BasicBlockSet &SubLoopBlocks,
                                     BasicBlockSet &AftBlocks,
                                     DominatorTree *DT) {
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
  SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());

  for (BasicBlock *BB : L->blocks()) {
    if (!SubLoop->contains(BB)) {
      if (DT->dominates(SubLoopLatch, BB))
        AftBlocks.insert(BB);
      else
        ForeBlocks.insert(BB);
    }
  }

  // Check that all blocks in ForeBlocks together dominate the subloop
  // TODO: This might ideally be done better with a dominator/postdominators.
  BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
  for (BasicBlock *BB : ForeBlocks) {
    if (BB == SubLoopPreHeader)
      continue;
    TerminatorInst *TI = BB->getTerminator();
    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
      if (!ForeBlocks.count(TI->getSuccessor(i)))
        return false;
  }

  return true;
}

// Looks at the phi nodes in Header for values coming from Latch. For these
// instructions and all their operands calls Visit on them, keeping going for
// all the operands in AftBlocks. Returns false if Visit returns false,
// otherwise returns true. This is used to process the instructions in the
// Aft blocks that need to be moved before the subloop. It is used in two
// places. One to check that the required set of instructions can be moved
// before the loop. Then to collect the instructions to actually move in
// moveHeaderPhiOperandsToForeBlocks.
template <typename T>
static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
                                     BasicBlockSet &AftBlocks, T Visit) {
  SmallVector<Instruction *, 8> Worklist;
  for (auto &Phi : Header->phis()) {
    Value *V = Phi.getIncomingValueForBlock(Latch);
    if (Instruction *I = dyn_cast<Instruction>(V))
      Worklist.push_back(I);
  }

  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();
    if (!Visit(I))
      return false;

    if (AftBlocks.count(I->getParent()))
      for (auto &U : I->operands())
        if (Instruction *II = dyn_cast<Instruction>(U))
          Worklist.push_back(II);
  }

  return true;
}

// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
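// (Illustrative sketch with made-up value names: if the outer header has
//   %i = phi [ 0, %preheader ], [ %i.next, %latch ]
// and %i.next is computed in the Aft block, then %i.next and any of its
// operands that also live in Aft are moved to just before InsertLoc, so that
// the cloned Fore blocks can later be placed ahead of the subloop.)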
static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
                                              BasicBlock *Latch,
                                              Instruction *InsertLoc,
                                              BasicBlockSet &AftBlocks) {
  // We need to ensure we move the instructions in the correct order,
  // starting with the earliest required instruction and moving forward.
  std::vector<Instruction *> Visited;
  processHeaderPhiOperands(Header, Latch, AftBlocks,
                           [&Visited, &AftBlocks](Instruction *I) {
                             if (AftBlocks.count(I->getParent()))
                               Visited.push_back(I);
                             return true;
                           });

  // Move all instructions in program order to before the InsertLoc
  BasicBlock *InsertLocBB = InsertLoc->getParent();
  for (Instruction *I : reverse(Visited)) {
    if (I->getParent() != InsertLocBB)
      I->moveBefore(InsertLoc);
  }
}

/*
  This method performs Unroll and Jam. For a simple loop like:
  for (i = ..)
    Fore(i)
    for (j = ..)
      SubLoop(i, j)
    Aft(i)

  Instead of doing normal inner or outer unrolling, we do:
  for (i = .., i+=2)
    Fore(i)
    Fore(i+1)
    for (j = ..)
      SubLoop(i, j)
      SubLoop(i+1, j)
    Aft(i)
    Aft(i+1)

  So the outer loop is essentially unrolled and then the inner loops are fused
  ("jammed") together into a single loop. This can increase speed when there
  are loads in SubLoop that are invariant to i, as they become shared between
  the now jammed inner loops.
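
  For example (an illustrative sketch, not taken from any particular
  benchmark), in
  for (i = ..)
    for (j = ..)
      C[i] += A[i][j] * B[j];
  the load of B[j] is invariant to i, so after unroll and jam by 2 the jammed
  inner loop can load B[j] once and use it for both the C[i] and C[i+1]
  updates.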

  We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
  Fore blocks are those before the inner loop, Aft are those after. Normal
  Unroll code is used to copy each of these sets of blocks and the results are
  combined together into the final form above.

  isSafeToUnrollAndJam should be used prior to calling this to make sure the
  unrolling will be valid. Checking profitability is also advisable.
*/
LoopUnrollResult
llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
                       unsigned TripMultiple, bool UnrollRemainder,
                       LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {

  // When we enter here we should have already checked that it is safe
  BasicBlock *Header = L->getHeader();
  assert(L->getSubLoops().size() == 1);
  Loop *SubLoop = *L->begin();

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2) {
    LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = (Count == TripCount);

  // We use the runtime remainder in cases where we don't know trip multiple
  if (TripMultiple == 1 || TripMultiple % Count != 0) {
    if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
                                    /*UseEpilogRemainder*/ true,
                                    UnrollRemainder, LI, SE, DT, AC, true)) {
      LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }

  // Notify ScalarEvolution that the loop will be substantially changed,
  // if not outright eliminated.
  if (SE) {
    SE->forgetLoop(L);
    SE->forgetLoop(SubLoop);
  }

  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
                      << Header->getName() << " with trip count " << TripCount
                      << "!\n");
    ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                 L->getHeader())
              << "completely unroll and jammed loop with "
              << NV("UnrollCount", TripCount) << " iterations");
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unroll and jammed loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
                      << " by " << Count);
    if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      ORE->emit([&]() {
        return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                             << " trips per branch";
      });
    } else {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }

  BasicBlock *Preheader = L->getLoopPreheader();
  BasicBlock *LatchBlock = L->getLoopLatch();
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  assert(Preheader && LatchBlock && Header);
  assert(BI && !BI->isUnconditional());
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  bool SubLoopContinueOnTrue = SubLoop->contains(
      SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));

  // Partition blocks in an outer/inner loop pair into blocks before and after
  // the loop
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
                           DT);

  // We keep track of the entering/first and exiting/last block of each of
  // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
  // blocks easier.
  std::vector<BasicBlock *> ForeBlocksFirst;
  std::vector<BasicBlock *> ForeBlocksLast;
  std::vector<BasicBlock *> SubLoopBlocksFirst;
  std::vector<BasicBlock *> SubLoopBlocksLast;
  std::vector<BasicBlock *> AftBlocksFirst;
  std::vector<BasicBlock *> AftBlocksLast;
  ForeBlocksFirst.push_back(Header);
  ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
  SubLoopBlocksFirst.push_back(SubLoop->getHeader());
  SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
  AftBlocksFirst.push_back(SubLoop->getExitBlock());
  AftBlocksLast.push_back(L->getExitingBlock());
  // Maps Blocks[0] -> Blocks[It]
  ValueToValueMapTy LastValueMap;

  // Move any instructions from fore phi operands from AftBlocks into Fore.
  moveHeaderPhiOperandsToForeBlocks(
      Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
      AftBlocks);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);
  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

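  // If we are generating debug info for sample-based profiling, tag the cloned
  // locations with a duplication factor so that sample counts can still be
  // attributed reasonably once a single source location maps to Count copies.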
  if (Header->getParent()->isDebugInfoForProfiling())
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc())
            I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));

  // Copy all blocks
  for (unsigned It = 1; It != Count; ++It) {
    std::vector<BasicBlock *> NewBlocks;
    // Maps Blocks[It] -> Blocks[It-1]
    DenseMap<Value *, Value *> PrevItValueMap;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      if (ForeBlocks.count(*BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == ForeBlocksFirst[0])
          ForeBlocksFirst.push_back(New);
        if (*BB == ForeBlocksLast[0])
          ForeBlocksLast.push_back(New);
      } else if (SubLoopBlocks.count(*BB)) {
        SubLoop->addBasicBlockToLoop(New, *LI);

        if (*BB == SubLoopBlocksFirst[0])
          SubLoopBlocksFirst.push_back(New);
        if (*BB == SubLoopBlocksLast[0])
          SubLoopBlocksLast.push_back(New);
      } else if (AftBlocks.count(*BB)) {
        L->addBasicBlockToLoop(New, *LI);

        if (*BB == AftBlocksFirst[0])
          AftBlocksFirst.push_back(New);
        if (*BB == AftBlocksLast[0])
          AftBlocksLast.push_back(New);
      } else {
        llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
      }

      // Update our running maps of newest clones
      PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI) {
        PrevItValueMap[VI->second] =
            const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
        LastValueMap[VI->first] = VI->second;
      }

      NewBlocks.push_back(New);

      // Update DomTree:
      if (*BB == ForeBlocksFirst[0])
        DT->addNewBlock(New, ForeBlocksLast[It - 1]);
      else if (*BB == SubLoopBlocksFirst[0])
        DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
      else if (*BB == AftBlocksFirst[0])
        DT->addNewBlock(New, AftBlocksLast[It - 1]);
      else {
        // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
        // structure.
        auto BBDomNode = DT->getNode(*BB);
        auto BBIDom = BBDomNode->getIDom();
        BasicBlock *OriginalBBIDom = BBIDom->getBlock();
        assert(OriginalBBIDom);
        assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
        DT->addNewBlock(
            New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
      }
    }

    // Remap all instructions in the most recent iteration
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        ::remapInstruction(&I, LastValueMap);
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          if (II->getIntrinsicID() == Intrinsic::assume)
            AC->registerAssumption(II);
      }
    }

    // Alter the ForeBlocks phi's, pointing them at the latest version of the
    // value from the previous iteration's phis
    for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
      Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
      assert(OldValue && "should have incoming edge from Aft[It]");
      Value *NewValue = OldValue;
      if (Value *PrevValue = PrevItValueMap[OldValue])
        NewValue = PrevValue;

      assert(Phi.getNumOperands() == 2);
      Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
      Phi.setIncomingValue(0, NewValue);
      Phi.removeIncomingValue(1);
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // finish up connecting the blocks and phi nodes. At this point LastValueMap
  // is the last unrolled iteration's values.

  // Update Phis in BB from OldBB to point to NewBB
  auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
                            BasicBlock *NewBB) {
    for (PHINode &Phi : BB->phis()) {
      int I = Phi.getBasicBlockIndex(OldBB);
      Phi.setIncomingBlock(I, NewBB);
    }
  };
  // Update Phis in BB from OldBB to point to NewBB and use the latest value
  // from LastValueMap
  auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
                                     BasicBlock *NewBB,
                                     ValueToValueMapTy &LastValueMap) {
    for (PHINode &Phi : BB->phis()) {
      for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
        if (Phi.getIncomingBlock(b) == OldBB) {
          Value *OldValue = Phi.getIncomingValue(b);
          if (Value *LastValue = LastValueMap[OldValue])
            Phi.setIncomingValue(b, LastValue);
          Phi.setIncomingBlock(b, NewBB);
          break;
        }
      }
    }
  };
  // Move all the phis from Src into Dest
  auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
    Instruction *insertPoint = Dest->getFirstNonPHI();
    while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
      Phi->moveBefore(insertPoint);
  };

  // Update the PHI values outside the loop to point to the last block
  updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
                           LastValueMap);

  // Update ForeBlocks successors and phi nodes
  BranchInst *ForeTerm =
      cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
  BasicBlock *Dest = SubLoopBlocksFirst[0];
  ForeTerm->setSuccessor(0, Dest);

  if (CompletelyUnroll) {
    while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
      Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
      Phi->getParent()->getInstList().erase(Phi);
    }
  } else {
    // Update the PHI values to point to the last aft block
    updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
                             AftBlocksLast.back(), LastValueMap);
  }

  for (unsigned It = 1; It != Count; It++) {
    // Remap ForeBlock successors from previous iteration to this
    BranchInst *ForeTerm =
        cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
    BasicBlock *Dest = ForeBlocksFirst[It];
    ForeTerm->setSuccessor(0, Dest);
  }

  // Subloop successors and phis
  BranchInst *SubTerm =
      cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
  SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
  SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
  updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
                  ForeBlocksLast.back());
  updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration subloop with an
    // unconditional one to this one
    BranchInst *SubTerm =
        cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
    SubTerm->eraseFromParent();

    updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
                    ForeBlocksLast.back());
    updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
  }

  // Aft blocks successors and phis
  BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
  if (CompletelyUnroll) {
    BranchInst::Create(LoopExit, Term);
    Term->eraseFromParent();
  } else {
    Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
  }
  updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
                  SubLoopBlocksLast.back());

  for (unsigned It = 1; It != Count; It++) {
    // Replace the conditional branch of the previous iteration Aft block with
    // an unconditional one to this one
    BranchInst *AftTerm =
        cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
    BranchInst::Create(AftBlocksFirst[It], AftTerm);
    AftTerm->eraseFromParent();

    updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
                    SubLoopBlocksLast.back());
    movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
  }
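  // The cloned blocks are now stitched into the jammed shape described in the
  // comment above UnrollAndJamLoop: all Fore copies, then a single jammed
  // subloop, then all Aft copies.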

  // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
  // new ones required.
  if (Count != 1) {
    SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
                           SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
                           SubLoopBlocksLast[0], AftBlocksFirst[0]);

    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
    DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
                           SubLoopBlocksLast.back(), AftBlocksFirst[0]);
    DT->applyUpdates(DTUpdates);
  }

  // Merge adjacent basic blocks, if possible.
  SmallPtrSet<BasicBlock *, 16> MergeBlocks;
  MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
  MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
  MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
  while (!MergeBlocks.empty()) {
    BasicBlock *BB = *MergeBlocks.begin();
    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
    if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
      BasicBlock *Dest = Term->getSuccessor(0);
      if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
        // Don't remove BB and add Fold as they are the same BB
        assert(Fold == BB);
        (void)Fold;
        MergeBlocks.erase(Dest);
      } else
        MergeBlocks.erase(BB);
    } else
      MergeBlocks.erase(BB);
  }

  // At this point, the code is well formed.  We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);

  NumCompletelyUnrolledAndJammed += CompletelyUnroll;
  ++NumUnrolledAndJammed;

#ifndef NDEBUG
  // We shouldn't have done anything to break loop simplify form or LCSSA.
  Loop *OuterL = L->getParentLoop();
  Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
  assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
  if (!CompletelyUnroll)
    assert(L->isLoopSimplifyForm());
  assert(SubLoop->isLoopSimplifyForm());
  assert(DT->verify());
#endif

  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll)
    LI->erase(L);

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}

static bool getLoadsAndStores(BasicBlockSet &Blocks,
                              SmallVector<Value *, 4> &MemInstr) {
  // Scan the BBs and collect legal loads and stores.
  // Returns false if non-simple loads/stores are found.
  for (BasicBlock *BB : Blocks) {
    for (Instruction &I : *BB) {
      if (auto *Ld = dyn_cast<LoadInst>(&I)) {
        if (!Ld->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (auto *St = dyn_cast<StoreInst>(&I)) {
        if (!St->isSimple())
          return false;
        MemInstr.push_back(&I);
      } else if (I.mayReadOrWriteMemory()) {
        return false;
      }
    }
  }
  return true;
}

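// Illustrative sketch (hypothetical accesses, not from any test case): if a
// Fore block stores to A[i] and the SubLoop loads A[i+1], the dependence from
// the store to the load has a ">" direction at the outer loop's depth (the
// source iteration is later than the sink's). In the original order Fore(i+1)
// runs after SubLoop(i), but after unroll and jam it is moved before it, so
// the load would suddenly observe the store. The checks below conservatively
// reject such directions.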
static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
                              SmallVector<Value *, 4> &Later,
                              unsigned LoopDepth, bool InnerLoop,
                              DependenceInfo &DI) {
  // Use DA to check for dependencies between loads and stores that make unroll
  // and jam invalid
  for (Value *I : Earlier) {
    for (Value *J : Later) {
      Instruction *Src = cast<Instruction>(I);
      Instruction *Dst = cast<Instruction>(J);
      if (Src == Dst)
        continue;
      // Ignore Input dependencies.
      if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
        continue;

      // Track dependencies, and if we find them take a conservative approach
      // by allowing only = or < (not >), although some > would be safe
      // (depending upon unroll width).
      // For the inner loop, we need to disallow any (> <) dependencies
      // FIXME: Allow > so long as distance is less than unroll width
      if (auto D = DI.depends(Src, Dst, true)) {
        assert(D->isOrdered() && "Expected an output, flow or anti dep.");

        if (D->isConfused())
          return false;
        if (!InnerLoop) {
          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT)
            return false;
        } else {
          assert(LoopDepth + 1 <= D->getLevels());
          if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
              D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT)
            return false;
        }
      }
    }
  }
  return true;
}

static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
                              BasicBlockSet &SubLoopBlocks,
                              BasicBlockSet &AftBlocks, DependenceInfo &DI) {
  // Get all loads/store pairs for each set of blocks
  SmallVector<Value *, 4> ForeMemInstr;
  SmallVector<Value *, 4> SubLoopMemInstr;
  SmallVector<Value *, 4> AftMemInstr;
  if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
      !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
      !getLoadsAndStores(AftBlocks, AftMemInstr))
    return false;

  // Check for dependencies between any blocks that may change order
  unsigned LoopDepth = L->getLoopDepth();
  return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
                           DI) &&
         checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
         checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
                           DI) &&
         checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
                           DI);
}

bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
                                DependenceInfo &DI) {
  /* We currently handle outer loops like this:
        |
    ForeFirst    <----\    }
     Blocks           |    } ForeBlocks
    ForeLast          |    }
        |             |
    SubLoopFirst  <\  |    }
     Blocks        |  |    } SubLoopBlocks
    SubLoopLast   -/  |    }
        |             |
    AftFirst          |    }
     Blocks           |    } AftBlocks
    AftLast     ------/    }
        |

    There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
    and AftBlocks, provided that there is one edge from Fores to SubLoops,
    one edge from SubLoops to Afts and a single outer loop exit (from Afts).
    In practice we currently limit Aft blocks to a single block, and limit
    things further in the profitability checks of the unroll and jam pass.

    Because of the way we rearrange basic blocks, we also require that
    the Fore blocks on all unrolled iterations are safe to move before the
    SubLoop blocks of all iterations. So we require that the phi node looping
    operands of ForeHeader can be moved to at least the end of ForeEnd, so that
    we can arrange cloned Fore Blocks before the subloop and match up Phi's
    correctly.

    i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
    It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.

    There are then a number of checks along the lines of no calls, no
    exceptions, inner loop IV is consistent, etc. Note that for loops requiring
    runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
    UnrollAndJamLoop if the trip count cannot be easily calculated.
  */

  if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
    return false;
  Loop *SubLoop = L->getSubLoops()[0];
  if (!SubLoop->isLoopSimplifyForm())
    return false;

  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  BasicBlock *Exit = L->getExitingBlock();
  BasicBlock *SubLoopHeader = SubLoop->getHeader();
  BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
  BasicBlock *SubLoopExit = SubLoop->getExitingBlock();

  if (Latch != Exit)
    return false;
  if (SubLoopLatch != SubLoopExit)
    return false;

  if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken())
    return false;

  // Split blocks into Fore/SubLoop/Aft based on dominators
  BasicBlockSet SubLoopBlocks;
  BasicBlockSet ForeBlocks;
  BasicBlockSet AftBlocks;
  if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
                                AftBlocks, &DT))
    return false;

  // Aft blocks may need to move instructions to fore blocks, which becomes more
  // difficult if there are multiple (potentially conditionally executed)
  // blocks. For now we just exclude loops with multiple aft blocks.
  if (AftBlocks.size() != 1)
    return false;

  // Check inner loop IV is consistent between all iterations
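  // (For illustration: an inner loop bound computed before the outer loop is
  // acceptable, whereas a bound derived from the outer induction variable
  // would give each jammed copy a different trip count and is rejected by the
  // LoopInvariant check below.)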
  const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch);
  if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) ||
      !SubLoopBECountSC->getType()->isIntegerTy())
    return false;
  ScalarEvolution::LoopDisposition LD =
      SE.getLoopDisposition(SubLoopBECountSC, L);
  if (LD != ScalarEvolution::LoopInvariant)
    return false;

  // Check the loop safety info for exceptions.
  LoopSafetyInfo LSI;
  computeLoopSafetyInfo(&LSI, L);
  if (LSI.MayThrow)
    return false;

  // We've ruled out the easy stuff and now need to check that there are no
  // interdependencies which may prevent us from moving the:
  //  ForeBlocks before Subloop and AftBlocks.
  //  Subloop before AftBlocks.
  //  ForeBlock phi operands before the subloop

  // Make sure we can move all instructions we need to before the subloop
  if (!processHeaderPhiOperands(
          Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
            if (SubLoop->contains(I->getParent()))
              return false;
            if (AftBlocks.count(I->getParent())) {
              // If we hit a phi node in afts we know we are done (probably
              // LCSSA)
              if (isa<PHINode>(I))
                return false;
              // Can't move instructions with side effects or memory
              // reads/writes
              if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
                return false;
            }
            // Keep going
            return true;
          }))
    return false;

  // Check for memory dependencies which prohibit the unrolling we are doing.
  // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
  // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
  if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI))
    return false;

  return true;
}