1 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop unroll and jam as a routine, much like
10 // LoopUnroll.cpp implements loop unroll.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/ADT/SmallPtrSet.h"
15 #include "llvm/ADT/Statistic.h"
16 #include "llvm/Analysis/AssumptionCache.h"
17 #include "llvm/Analysis/DependenceAnalysis.h"
18 #include "llvm/Analysis/InstructionSimplify.h"
19 #include "llvm/Analysis/LoopAnalysisManager.h"
20 #include "llvm/Analysis/LoopIterator.h"
21 #include "llvm/Analysis/LoopPass.h"
22 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
23 #include "llvm/Analysis/ScalarEvolution.h"
24 #include "llvm/Analysis/Utils/Local.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/DataLayout.h"
27 #include "llvm/IR/DebugInfoMetadata.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/IntrinsicInst.h"
30 #include "llvm/IR/LLVMContext.h"
31 #include "llvm/Support/Debug.h"
32 #include "llvm/Support/raw_ostream.h"
33 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
34 #include "llvm/Transforms/Utils/Cloning.h"
35 #include "llvm/Transforms/Utils/LoopSimplify.h"
36 #include "llvm/Transforms/Utils/LoopUtils.h"
37 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
38 #include "llvm/Transforms/Utils/UnrollLoop.h"
39 using namespace llvm;
40 
41 #define DEBUG_TYPE "loop-unroll-and-jam"
42 
43 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
44 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
45 
46 typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
47 
48 // Partition blocks in an outer/inner loop pair into blocks before and after
49 // the loop
50 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
51                                      BasicBlockSet &ForeBlocks,
52                                      BasicBlockSet &SubLoopBlocks,
53                                      BasicBlockSet &AftBlocks,
54                                      DominatorTree *DT) {
55   BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
56   SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
57 
58   for (BasicBlock *BB : L->blocks()) {
59     if (!SubLoop->contains(BB)) {
60       if (DT->dominates(SubLoopLatch, BB))
61         AftBlocks.insert(BB);
62       else
63         ForeBlocks.insert(BB);
64     }
65   }
66 
67   // Check that all blocks in ForeBlocks together dominate the subloop
68   // TODO: This might ideally be done better with a dominator/postdominators.
69   BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
70   for (BasicBlock *BB : ForeBlocks) {
71     if (BB == SubLoopPreHeader)
72       continue;
73     Instruction *TI = BB->getTerminator();
74     for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
75       if (!ForeBlocks.count(TI->getSuccessor(i)))
76         return false;
77   }
78 
79   return true;
80 }
81 
82 // Looks at the phi nodes in Header for values coming from Latch. For these
83 // instructions and all their operands calls Visit on them, keeping going for
84 // all the operands in AftBlocks. Returns false if Visit returns false,
85 // otherwise returns true. This is used to process the instructions in the
86 // Aft blocks that need to be moved before the subloop. It is used in two
87 // places. One to check that the required set of instructions can be moved
88 // before the loop. Then to collect the instructions to actually move in
89 // moveHeaderPhiOperandsToForeBlocks.
90 template <typename T>
91 static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
92                                      BasicBlockSet &AftBlocks, T Visit) {
93   SmallVector<Instruction *, 8> Worklist;
94   for (auto &Phi : Header->phis()) {
95     Value *V = Phi.getIncomingValueForBlock(Latch);
96     if (Instruction *I = dyn_cast<Instruction>(V))
97       Worklist.push_back(I);
98   }
99 
100   while (!Worklist.empty()) {
101     Instruction *I = Worklist.back();
102     Worklist.pop_back();
103     if (!Visit(I))
104       return false;
105 
106     if (AftBlocks.count(I->getParent()))
107       for (auto &U : I->operands())
108         if (Instruction *II = dyn_cast<Instruction>(U))
109           Worklist.push_back(II);
110   }
111 
112   return true;
113 }
114 
115 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
116 static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
117                                               BasicBlock *Latch,
118                                               Instruction *InsertLoc,
119                                               BasicBlockSet &AftBlocks) {
120   // We need to ensure we move the instructions in the correct order,
121   // starting with the earliest required instruction and moving forward.
122   std::vector<Instruction *> Visited;
123   processHeaderPhiOperands(Header, Latch, AftBlocks,
124                            [&Visited, &AftBlocks](Instruction *I) {
125                              if (AftBlocks.count(I->getParent()))
126                                Visited.push_back(I);
127                              return true;
128                            });
129 
130   // Move all instructions in program order to before the InsertLoc
131   BasicBlock *InsertLocBB = InsertLoc->getParent();
132   for (Instruction *I : reverse(Visited)) {
133     if (I->getParent() != InsertLocBB)
134       I->moveBefore(InsertLoc);
135   }
136 }
137 
138 /*
139   This method performs Unroll and Jam. For a simple loop like:
140   for (i = ..)
141     Fore(i)
142     for (j = ..)
143       SubLoop(i, j)
144     Aft(i)
145 
146   Instead of doing normal inner or outer unrolling, we do:
147   for (i = .., i+=2)
148     Fore(i)
149     Fore(i+1)
150     for (j = ..)
151       SubLoop(i, j)
152       SubLoop(i+1, j)
153     Aft(i)
154     Aft(i+1)
155 
156   So the outer loop is essetially unrolled and then the inner loops are fused
157   ("jammed") together into a single loop. This can increase speed when there
158   are loads in SubLoop that are invariant to i, as they become shared between
159   the now jammed inner loops.
160 
161   We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
162   Fore blocks are those before the inner loop, Aft are those after. Normal
163   Unroll code is used to copy each of these sets of blocks and the results are
164   combined together into the final form above.
165 
166   isSafeToUnrollAndJam should be used prior to calling this to make sure the
167   unrolling will be valid. Checking profitablility is also advisable.
168 
169   If EpilogueLoop is non-null, it receives the epilogue loop (if it was
170   necessary to create one and not fully unrolled).
171 */
172 LoopUnrollResult llvm::UnrollAndJamLoop(
173     Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
174     bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
175     AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
176 
177   // When we enter here we should have already checked that it is safe
178   BasicBlock *Header = L->getHeader();
179   assert(Header && "No header.");
180   assert(L->getSubLoops().size() == 1);
181   Loop *SubLoop = *L->begin();
182 
183   // Don't enter the unroll code if there is nothing to do.
184   if (TripCount == 0 && Count < 2) {
185     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
186     return LoopUnrollResult::Unmodified;
187   }
188 
189   assert(Count > 0);
190   assert(TripMultiple > 0);
191   assert(TripCount == 0 || TripCount % TripMultiple == 0);
192 
193   // Are we eliminating the loop control altogether?
194   bool CompletelyUnroll = (Count == TripCount);
195 
196   // We use the runtime remainder in cases where we don't know trip multiple
197   if (TripMultiple == 1 || TripMultiple % Count != 0) {
198     if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
199                                     /*UseEpilogRemainder*/ true,
200                                     UnrollRemainder, /*ForgetAllSCEV*/ false,
201                                     LI, SE, DT, AC, true, EpilogueLoop)) {
202       LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
203                            "generated when assuming runtime trip count\n");
204       return LoopUnrollResult::Unmodified;
205     }
206   }
207 
208   // Notify ScalarEvolution that the loop will be substantially changed,
209   // if not outright eliminated.
210   if (SE) {
211     SE->forgetLoop(L);
212     SE->forgetLoop(SubLoop);
213   }
214 
215   using namespace ore;
216   // Report the unrolling decision.
217   if (CompletelyUnroll) {
218     LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
219                       << Header->getName() << " with trip count " << TripCount
220                       << "!\n");
221     ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
222                                  L->getHeader())
223               << "completely unroll and jammed loop with "
224               << NV("UnrollCount", TripCount) << " iterations");
225   } else {
226     auto DiagBuilder = [&]() {
227       OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
228                               L->getHeader());
229       return Diag << "unroll and jammed loop by a factor of "
230                   << NV("UnrollCount", Count);
231     };
232 
233     LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
234                       << " by " << Count);
235     if (TripMultiple != 1) {
236       LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
237       ORE->emit([&]() {
238         return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
239                              << " trips per branch";
240       });
241     } else {
242       LLVM_DEBUG(dbgs() << " with run-time trip count");
243       ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
244     }
245     LLVM_DEBUG(dbgs() << "!\n");
246   }
247 
248   BasicBlock *Preheader = L->getLoopPreheader();
249   BasicBlock *LatchBlock = L->getLoopLatch();
250   assert(Preheader && "No preheader");
251   assert(LatchBlock && "No latch block");
252   BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
253   assert(BI && !BI->isUnconditional());
254   bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
255   BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
256   bool SubLoopContinueOnTrue = SubLoop->contains(
257       SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
258 
259   // Partition blocks in an outer/inner loop pair into blocks before and after
260   // the loop
261   BasicBlockSet SubLoopBlocks;
262   BasicBlockSet ForeBlocks;
263   BasicBlockSet AftBlocks;
264   partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
265                            DT);
266 
267   // We keep track of the entering/first and exiting/last block of each of
268   // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
269   // blocks easier.
270   std::vector<BasicBlock *> ForeBlocksFirst;
271   std::vector<BasicBlock *> ForeBlocksLast;
272   std::vector<BasicBlock *> SubLoopBlocksFirst;
273   std::vector<BasicBlock *> SubLoopBlocksLast;
274   std::vector<BasicBlock *> AftBlocksFirst;
275   std::vector<BasicBlock *> AftBlocksLast;
276   ForeBlocksFirst.push_back(Header);
277   ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
278   SubLoopBlocksFirst.push_back(SubLoop->getHeader());
279   SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
280   AftBlocksFirst.push_back(SubLoop->getExitBlock());
281   AftBlocksLast.push_back(L->getExitingBlock());
282   // Maps Blocks[0] -> Blocks[It]
283   ValueToValueMapTy LastValueMap;
284 
285   // Move any instructions from fore phi operands from AftBlocks into Fore.
286   moveHeaderPhiOperandsToForeBlocks(
287       Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
288       AftBlocks);
289 
290   // The current on-the-fly SSA update requires blocks to be processed in
291   // reverse postorder so that LastValueMap contains the correct value at each
292   // exit.
293   LoopBlocksDFS DFS(L);
294   DFS.perform(LI);
295   // Stash the DFS iterators before adding blocks to the loop.
296   LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
297   LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
298 
299   if (Header->getParent()->isDebugInfoForProfiling())
300     for (BasicBlock *BB : L->getBlocks())
301       for (Instruction &I : *BB)
302         if (!isa<DbgInfoIntrinsic>(&I))
303           if (const DILocation *DIL = I.getDebugLoc()) {
304             auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
305             if (NewDIL)
306               I.setDebugLoc(NewDIL.getValue());
307             else
308               LLVM_DEBUG(dbgs()
309                          << "Failed to create new discriminator: "
310                          << DIL->getFilename() << " Line: " << DIL->getLine());
311           }
312 
313   // Copy all blocks
314   for (unsigned It = 1; It != Count; ++It) {
315     std::vector<BasicBlock *> NewBlocks;
316     // Maps Blocks[It] -> Blocks[It-1]
317     DenseMap<Value *, Value *> PrevItValueMap;
318 
319     for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
320       ValueToValueMapTy VMap;
321       BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
322       Header->getParent()->getBasicBlockList().push_back(New);
323 
324       if (ForeBlocks.count(*BB)) {
325         L->addBasicBlockToLoop(New, *LI);
326 
327         if (*BB == ForeBlocksFirst[0])
328           ForeBlocksFirst.push_back(New);
329         if (*BB == ForeBlocksLast[0])
330           ForeBlocksLast.push_back(New);
331       } else if (SubLoopBlocks.count(*BB)) {
332         SubLoop->addBasicBlockToLoop(New, *LI);
333 
334         if (*BB == SubLoopBlocksFirst[0])
335           SubLoopBlocksFirst.push_back(New);
336         if (*BB == SubLoopBlocksLast[0])
337           SubLoopBlocksLast.push_back(New);
338       } else if (AftBlocks.count(*BB)) {
339         L->addBasicBlockToLoop(New, *LI);
340 
341         if (*BB == AftBlocksFirst[0])
342           AftBlocksFirst.push_back(New);
343         if (*BB == AftBlocksLast[0])
344           AftBlocksLast.push_back(New);
345       } else {
346         llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
347       }
348 
349       // Update our running maps of newest clones
350       PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
351       LastValueMap[*BB] = New;
352       for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
353            VI != VE; ++VI) {
354         PrevItValueMap[VI->second] =
355             const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
356         LastValueMap[VI->first] = VI->second;
357       }
358 
359       NewBlocks.push_back(New);
360 
361       // Update DomTree:
362       if (*BB == ForeBlocksFirst[0])
363         DT->addNewBlock(New, ForeBlocksLast[It - 1]);
364       else if (*BB == SubLoopBlocksFirst[0])
365         DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
366       else if (*BB == AftBlocksFirst[0])
367         DT->addNewBlock(New, AftBlocksLast[It - 1]);
368       else {
369         // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
370         // structure.
371         auto BBDomNode = DT->getNode(*BB);
372         auto BBIDom = BBDomNode->getIDom();
373         BasicBlock *OriginalBBIDom = BBIDom->getBlock();
374         assert(OriginalBBIDom);
375         assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
376         DT->addNewBlock(
377             New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
378       }
379     }
380 
381     // Remap all instructions in the most recent iteration
382     for (BasicBlock *NewBlock : NewBlocks) {
383       for (Instruction &I : *NewBlock) {
384         ::remapInstruction(&I, LastValueMap);
385         if (auto *II = dyn_cast<IntrinsicInst>(&I))
386           if (II->getIntrinsicID() == Intrinsic::assume)
387             AC->registerAssumption(II);
388       }
389     }
390 
391     // Alter the ForeBlocks phi's, pointing them at the latest version of the
392     // value from the previous iteration's phis
393     for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
394       Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
395       assert(OldValue && "should have incoming edge from Aft[It]");
396       Value *NewValue = OldValue;
397       if (Value *PrevValue = PrevItValueMap[OldValue])
398         NewValue = PrevValue;
399 
400       assert(Phi.getNumOperands() == 2);
401       Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
402       Phi.setIncomingValue(0, NewValue);
403       Phi.removeIncomingValue(1);
404     }
405   }
406 
407   // Now that all the basic blocks for the unrolled iterations are in place,
408   // finish up connecting the blocks and phi nodes. At this point LastValueMap
409   // is the last unrolled iterations values.
410 
411   // Update Phis in BB from OldBB to point to NewBB
412   auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
413                             BasicBlock *NewBB) {
414     for (PHINode &Phi : BB->phis()) {
415       int I = Phi.getBasicBlockIndex(OldBB);
416       Phi.setIncomingBlock(I, NewBB);
417     }
418   };
419   // Update Phis in BB from OldBB to point to NewBB and use the latest value
420   // from LastValueMap
421   auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
422                                      BasicBlock *NewBB,
423                                      ValueToValueMapTy &LastValueMap) {
424     for (PHINode &Phi : BB->phis()) {
425       for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
426         if (Phi.getIncomingBlock(b) == OldBB) {
427           Value *OldValue = Phi.getIncomingValue(b);
428           if (Value *LastValue = LastValueMap[OldValue])
429             Phi.setIncomingValue(b, LastValue);
430           Phi.setIncomingBlock(b, NewBB);
431           break;
432         }
433       }
434     }
435   };
436   // Move all the phis from Src into Dest
437   auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
438     Instruction *insertPoint = Dest->getFirstNonPHI();
439     while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
440       Phi->moveBefore(insertPoint);
441   };
442 
443   // Update the PHI values outside the loop to point to the last block
444   updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
445                            LastValueMap);
446 
447   // Update ForeBlocks successors and phi nodes
448   BranchInst *ForeTerm =
449       cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
450   BasicBlock *Dest = SubLoopBlocksFirst[0];
451   ForeTerm->setSuccessor(0, Dest);
452 
453   if (CompletelyUnroll) {
454     while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
455       Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
456       Phi->getParent()->getInstList().erase(Phi);
457     }
458   } else {
459     // Update the PHI values to point to the last aft block
460     updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
461                              AftBlocksLast.back(), LastValueMap);
462   }
463 
464   for (unsigned It = 1; It != Count; It++) {
465     // Remap ForeBlock successors from previous iteration to this
466     BranchInst *ForeTerm =
467         cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
468     BasicBlock *Dest = ForeBlocksFirst[It];
469     ForeTerm->setSuccessor(0, Dest);
470   }
471 
472   // Subloop successors and phis
473   BranchInst *SubTerm =
474       cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
475   SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
476   SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
477   updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
478                   ForeBlocksLast.back());
479   updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
480                   SubLoopBlocksLast.back());
481 
482   for (unsigned It = 1; It != Count; It++) {
483     // Replace the conditional branch of the previous iteration subloop with an
484     // unconditional one to this one
485     BranchInst *SubTerm =
486         cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
487     BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
488     SubTerm->eraseFromParent();
489 
490     updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
491                     ForeBlocksLast.back());
492     updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
493                     SubLoopBlocksLast.back());
494     movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
495   }
496 
497   // Aft blocks successors and phis
498   BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
499   if (CompletelyUnroll) {
500     BranchInst::Create(LoopExit, Term);
501     Term->eraseFromParent();
502   } else {
503     Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
504   }
505   updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
506                   SubLoopBlocksLast.back());
507 
508   for (unsigned It = 1; It != Count; It++) {
509     // Replace the conditional branch of the previous iteration subloop with an
510     // unconditional one to this one
511     BranchInst *AftTerm =
512         cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
513     BranchInst::Create(AftBlocksFirst[It], AftTerm);
514     AftTerm->eraseFromParent();
515 
516     updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
517                     SubLoopBlocksLast.back());
518     movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
519   }
520 
521   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
522   // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
523   // new ones required.
524   if (Count != 1) {
525     SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
526     DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
527                            SubLoopBlocksFirst[0]);
528     DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
529                            SubLoopBlocksLast[0], AftBlocksFirst[0]);
530 
531     DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
532                            ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
533     DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
534                            SubLoopBlocksLast.back(), AftBlocksFirst[0]);
535     DTU.applyUpdatesPermissive(DTUpdates);
536   }
537 
538   // Merge adjacent basic blocks, if possible.
539   SmallPtrSet<BasicBlock *, 16> MergeBlocks;
540   MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
541   MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
542   MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
543   while (!MergeBlocks.empty()) {
544     BasicBlock *BB = *MergeBlocks.begin();
545     BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
546     if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
547       BasicBlock *Dest = Term->getSuccessor(0);
548       BasicBlock *Fold = Dest->getUniquePredecessor();
549       if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
550         // Don't remove BB and add Fold as they are the same BB
551         assert(Fold == BB);
552         (void)Fold;
553         MergeBlocks.erase(Dest);
554       } else
555         MergeBlocks.erase(BB);
556     } else
557       MergeBlocks.erase(BB);
558   }
559   // Apply updates to the DomTree.
560   DT = &DTU.getDomTree();
561 
562   // At this point, the code is well formed.  We now do a quick sweep over the
563   // inserted code, doing constant propagation and dead code elimination as we
564   // go.
565   simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
566   simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
567 
568   NumCompletelyUnrolledAndJammed += CompletelyUnroll;
569   ++NumUnrolledAndJammed;
570 
571 #ifndef NDEBUG
572   // We shouldn't have done anything to break loop simplify form or LCSSA.
573   Loop *OuterL = L->getParentLoop();
574   Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
575   assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
576   if (!CompletelyUnroll)
577     assert(L->isLoopSimplifyForm());
578   assert(SubLoop->isLoopSimplifyForm());
579   assert(DT->verify());
580 #endif
581 
582   // Update LoopInfo if the loop is completely removed.
583   if (CompletelyUnroll)
584     LI->erase(L);
585 
586   return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
587                           : LoopUnrollResult::PartiallyUnrolled;
588 }
589 
590 static bool getLoadsAndStores(BasicBlockSet &Blocks,
591                               SmallVector<Value *, 4> &MemInstr) {
592   // Scan the BBs and collect legal loads and stores.
593   // Returns false if non-simple loads/stores are found.
594   for (BasicBlock *BB : Blocks) {
595     for (Instruction &I : *BB) {
596       if (auto *Ld = dyn_cast<LoadInst>(&I)) {
597         if (!Ld->isSimple())
598           return false;
599         MemInstr.push_back(&I);
600       } else if (auto *St = dyn_cast<StoreInst>(&I)) {
601         if (!St->isSimple())
602           return false;
603         MemInstr.push_back(&I);
604       } else if (I.mayReadOrWriteMemory()) {
605         return false;
606       }
607     }
608   }
609   return true;
610 }
611 
612 static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
613                               SmallVector<Value *, 4> &Later,
614                               unsigned LoopDepth, bool InnerLoop,
615                               DependenceInfo &DI) {
616   // Use DA to check for dependencies between loads and stores that make unroll
617   // and jam invalid
618   for (Value *I : Earlier) {
619     for (Value *J : Later) {
620       Instruction *Src = cast<Instruction>(I);
621       Instruction *Dst = cast<Instruction>(J);
622       if (Src == Dst)
623         continue;
624       // Ignore Input dependencies.
625       if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
626         continue;
627 
628       // Track dependencies, and if we find them take a conservative approach
629       // by allowing only = or < (not >), altough some > would be safe
630       // (depending upon unroll width).
631       // For the inner loop, we need to disallow any (> <) dependencies
632       // FIXME: Allow > so long as distance is less than unroll width
633       if (auto D = DI.depends(Src, Dst, true)) {
634         assert(D->isOrdered() && "Expected an output, flow or anti dep.");
635 
636         if (D->isConfused()) {
637           LLVM_DEBUG(dbgs() << "  Confused dependency between:\n"
638                             << "  " << *Src << "\n"
639                             << "  " << *Dst << "\n");
640           return false;
641         }
642         if (!InnerLoop) {
643           if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
644             LLVM_DEBUG(dbgs() << "  > dependency between:\n"
645                               << "  " << *Src << "\n"
646                               << "  " << *Dst << "\n");
647             return false;
648           }
649         } else {
650           assert(LoopDepth + 1 <= D->getLevels());
651           if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
652               D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
653             LLVM_DEBUG(dbgs() << "  < > dependency between:\n"
654                               << "  " << *Src << "\n"
655                               << "  " << *Dst << "\n");
656             return false;
657           }
658         }
659       }
660     }
661   }
662   return true;
663 }
664 
665 static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
666                               BasicBlockSet &SubLoopBlocks,
667                               BasicBlockSet &AftBlocks, DependenceInfo &DI) {
668   // Get all loads/store pairs for each blocks
669   SmallVector<Value *, 4> ForeMemInstr;
670   SmallVector<Value *, 4> SubLoopMemInstr;
671   SmallVector<Value *, 4> AftMemInstr;
672   if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
673       !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
674       !getLoadsAndStores(AftBlocks, AftMemInstr))
675     return false;
676 
677   // Check for dependencies between any blocks that may change order
678   unsigned LoopDepth = L->getLoopDepth();
679   return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
680                            DI) &&
681          checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
682          checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
683                            DI) &&
684          checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
685                            DI);
686 }
687 
688 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
689                                 DependenceInfo &DI) {
690   /* We currently handle outer loops like this:
691         |
692     ForeFirst    <----\    }
693      Blocks           |    } ForeBlocks
694     ForeLast          |    }
695         |             |
696     SubLoopFirst  <\  |    }
697      Blocks        |  |    } SubLoopBlocks
698     SubLoopLast   -/  |    }
699         |             |
700     AftFirst          |    }
701      Blocks           |    } AftBlocks
702     AftLast     ------/    }
703         |
704 
705     There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
706     and AftBlocks, providing that there is one edge from Fores to SubLoops,
707     one edge from SubLoops to Afts and a single outer loop exit (from Afts).
708     In practice we currently limit Aft blocks to a single block, and limit
709     things further in the profitablility checks of the unroll and jam pass.
710 
711     Because of the way we rearrange basic blocks, we also require that
712     the Fore blocks on all unrolled iterations are safe to move before the
713     SubLoop blocks of all iterations. So we require that the phi node looping
714     operands of ForeHeader can be moved to at least the end of ForeEnd, so that
715     we can arrange cloned Fore Blocks before the subloop and match up Phi's
716     correctly.
717 
718     i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
719     It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
720 
721     There are then a number of checks along the lines of no calls, no
722     exceptions, inner loop IV is consistent, etc. Note that for loops requiring
723     runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
724     UnrollAndJamLoop if the trip count cannot be easily calculated.
725   */
726 
727   if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
728     return false;
729   Loop *SubLoop = L->getSubLoops()[0];
730   if (!SubLoop->isLoopSimplifyForm())
731     return false;
732 
733   BasicBlock *Header = L->getHeader();
734   BasicBlock *Latch = L->getLoopLatch();
735   BasicBlock *Exit = L->getExitingBlock();
736   BasicBlock *SubLoopHeader = SubLoop->getHeader();
737   BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
738   BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
739 
740   if (Latch != Exit)
741     return false;
742   if (SubLoopLatch != SubLoopExit)
743     return false;
744 
745   if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
746     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
747     return false;
748   }
749 
750   // Split blocks into Fore/SubLoop/Aft based on dominators
751   BasicBlockSet SubLoopBlocks;
752   BasicBlockSet ForeBlocks;
753   BasicBlockSet AftBlocks;
754   if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
755                                 AftBlocks, &DT)) {
756     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
757     return false;
758   }
759 
760   // Aft blocks may need to move instructions to fore blocks, which becomes more
761   // difficult if there are multiple (potentially conditionally executed)
762   // blocks. For now we just exclude loops with multiple aft blocks.
763   if (AftBlocks.size() != 1) {
764     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
765                          "multiple blocks after the loop\n");
766     return false;
767   }
768 
769   // Check inner loop backedge count is consistent on all iterations of the
770   // outer loop
771   if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
772     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
773                          "not consistent on each iteration\n");
774     return false;
775   }
776 
777   // Check the loop safety info for exceptions.
778   SimpleLoopSafetyInfo LSI;
779   LSI.computeLoopSafetyInfo(L);
780   if (LSI.anyBlockMayThrow()) {
781     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
782     return false;
783   }
784 
785   // We've ruled out the easy stuff and now need to check that there are no
786   // interdependencies which may prevent us from moving the:
787   //  ForeBlocks before Subloop and AftBlocks.
788   //  Subloop before AftBlocks.
789   //  ForeBlock phi operands before the subloop
790 
791   // Make sure we can move all instructions we need to before the subloop
792   if (!processHeaderPhiOperands(
793           Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
794             if (SubLoop->contains(I->getParent()))
795               return false;
796             if (AftBlocks.count(I->getParent())) {
797               // If we hit a phi node in afts we know we are done (probably
798               // LCSSA)
799               if (isa<PHINode>(I))
800                 return false;
801               // Can't move instructions with side effects or memory
802               // reads/writes
803               if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
804                 return false;
805             }
806             // Keep going
807             return true;
808           })) {
809     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
810                          "instructions after subloop to before it\n");
811     return false;
812   }
813 
814   // Check for memory dependencies which prohibit the unrolling we are doing.
815   // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
816   // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
817   if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
818     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
819     return false;
820   }
821 
822   return true;
823 }
824