1 //===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// The goal of hot/cold splitting is to improve the memory locality of code.
11 /// The splitting pass does this by identifying cold blocks and moving them into
12 /// separate functions.
13 ///
14 /// When the splitting pass finds a cold block (referred to as "the sink"), it
15 /// grows a maximal cold region around that block. The maximal region contains
16 /// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
17 /// cold as the sink. Once a region is found, it's split out of the original
18 /// function provided it's profitable to do so.
19 ///
20 /// [*] In practice, there is some added complexity because some blocks are not
21 /// safe to extract.
22 ///
23 /// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
24 /// TODO: Reorder outlined functions.
25 ///
26 //===----------------------------------------------------------------------===//
27 
28 #include "llvm/Transforms/IPO/HotColdSplitting.h"
29 #include "llvm/ADT/PostOrderIterator.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/Analysis/BlockFrequencyInfo.h"
33 #include "llvm/Analysis/BranchProbabilityInfo.h"
34 #include "llvm/Analysis/CFG.h"
35 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
36 #include "llvm/Analysis/PostDominators.h"
37 #include "llvm/Analysis/ProfileSummaryInfo.h"
38 #include "llvm/Analysis/TargetTransformInfo.h"
39 #include "llvm/IR/BasicBlock.h"
40 #include "llvm/IR/CFG.h"
41 #include "llvm/IR/DataLayout.h"
42 #include "llvm/IR/DiagnosticInfo.h"
43 #include "llvm/IR/Dominators.h"
44 #include "llvm/IR/Function.h"
45 #include "llvm/IR/Instruction.h"
46 #include "llvm/IR/Instructions.h"
47 #include "llvm/IR/IntrinsicInst.h"
48 #include "llvm/IR/Metadata.h"
49 #include "llvm/IR/Module.h"
50 #include "llvm/IR/PassManager.h"
51 #include "llvm/IR/Type.h"
52 #include "llvm/IR/Use.h"
53 #include "llvm/IR/User.h"
54 #include "llvm/IR/Value.h"
55 #include "llvm/InitializePasses.h"
56 #include "llvm/Pass.h"
57 #include "llvm/Support/BlockFrequency.h"
58 #include "llvm/Support/BranchProbability.h"
59 #include "llvm/Support/CommandLine.h"
60 #include "llvm/Support/Debug.h"
61 #include "llvm/Support/raw_ostream.h"
62 #include "llvm/Transforms/IPO.h"
63 #include "llvm/Transforms/Scalar.h"
64 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
65 #include "llvm/Transforms/Utils/Cloning.h"
66 #include "llvm/Transforms/Utils/CodeExtractor.h"
67 #include "llvm/Transforms/Utils/Local.h"
68 #include "llvm/Transforms/Utils/ValueMapper.h"
69 #include <algorithm>
70 #include <limits>
71 #include <cassert>
72 #include <string>
73 
74 #define DEBUG_TYPE "hotcoldsplit"
75 
76 STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
77 STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
78 
79 using namespace llvm;
80 
81 static cl::opt<bool> EnableStaticAnalysis("hot-cold-static-analysis",
82                                           cl::init(true), cl::Hidden);
83 
84 static cl::opt<int>
85     SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
86                        cl::desc("Base penalty for splitting cold code (as a "
87                                 "multiple of TCC_Basic)"));
88 
89 static cl::opt<bool> EnableColdSection(
90     "enable-cold-section", cl::init(false), cl::Hidden,
91     cl::desc("Enable placement of extracted cold functions"
92              " into a separate section after hot-cold splitting."));
93 
94 static cl::opt<std::string>
95     ColdSectionName("hotcoldsplit-cold-section-name", cl::init("__llvm_cold"),
96                     cl::Hidden,
97                     cl::desc("Name for the section containing cold functions "
98                              "extracted by hot-cold splitting."));
99 
100 static cl::opt<int> MaxParametersForSplit(
101     "hotcoldsplit-max-params", cl::init(4), cl::Hidden,
102     cl::desc("Maximum number of parameters for a split function"));
103 
104 namespace {
105 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
106 // this function unless you modify the MBB version as well.
107 //
108 /// A no successor, non-return block probably ends in unreachable and is cold.
109 /// Also consider a block that ends in an indirect branch to be a return block,
110 /// since many targets use plain indirect branches to return.
111 bool blockEndsInUnreachable(const BasicBlock &BB) {
112   if (!succ_empty(&BB))
113     return false;
114   if (BB.empty())
115     return true;
116   const Instruction *I = BB.getTerminator();
117   return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
118 }
119 
120 bool unlikelyExecuted(BasicBlock &BB) {
121   // Exception handling blocks are unlikely executed.
122   if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
123     return true;
124 
125   // The block is cold if it calls/invokes a cold function. However, do not
126   // mark sanitizer traps as cold.
127   for (Instruction &I : BB)
128     if (auto *CB = dyn_cast<CallBase>(&I))
129       if (CB->hasFnAttr(Attribute::Cold) && !CB->getMetadata("nosanitize"))
130         return true;
131 
132   // The block is cold if it has an unreachable terminator, unless it's
133   // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
134   if (blockEndsInUnreachable(BB)) {
135     if (auto *CI =
136             dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
137       if (CI->hasFnAttr(Attribute::NoReturn))
138         return false;
139     return true;
140   }
141 
142   return false;
143 }
144 
145 /// Check whether it's safe to outline \p BB.
146 static bool mayExtractBlock(const BasicBlock &BB) {
147   // EH pads are unsafe to outline because doing so breaks EH type tables. It
148   // follows that invoke instructions cannot be extracted, because CodeExtractor
149   // requires unwind destinations to be within the extraction region.
150   //
151   // Resumes that are not reachable from a cleanup landing pad are considered to
152   // be unreachable. It’s not safe to split them out either.
153   if (BB.hasAddressTaken() || BB.isEHPad())
154     return false;
155   auto Term = BB.getTerminator();
156   return !isa<InvokeInst>(Term) && !isa<ResumeInst>(Term);
157 }
158 
159 /// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
160 /// If \p UpdateEntryCount is true (set when this is a new split function and
161 /// module has profile data), set entry count to 0 to ensure treated as cold.
162 /// Return true if the function is changed.
163 static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
164   assert(!F.hasOptNone() && "Can't mark this cold");
165   bool Changed = false;
166   if (!F.hasFnAttribute(Attribute::Cold)) {
167     F.addFnAttr(Attribute::Cold);
168     Changed = true;
169   }
170   if (!F.hasFnAttribute(Attribute::MinSize)) {
171     F.addFnAttr(Attribute::MinSize);
172     Changed = true;
173   }
174   if (UpdateEntryCount) {
175     // Set the entry count to 0 to ensure it is placed in the unlikely text
176     // section when function sections are enabled.
177     F.setEntryCount(0);
178     Changed = true;
179   }
180 
181   return Changed;
182 }
183 
184 class HotColdSplittingLegacyPass : public ModulePass {
185 public:
186   static char ID;
187   HotColdSplittingLegacyPass() : ModulePass(ID) {
188     initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
189   }
190 
191   void getAnalysisUsage(AnalysisUsage &AU) const override {
192     AU.addRequired<BlockFrequencyInfoWrapperPass>();
193     AU.addRequired<ProfileSummaryInfoWrapperPass>();
194     AU.addRequired<TargetTransformInfoWrapperPass>();
195     AU.addUsedIfAvailable<AssumptionCacheTracker>();
196   }
197 
198   bool runOnModule(Module &M) override;
199 };
200 
201 } // end anonymous namespace
202 
203 /// Check whether \p F is inherently cold.
204 bool HotColdSplitting::isFunctionCold(const Function &F) const {
205   if (F.hasFnAttribute(Attribute::Cold))
206     return true;
207 
208   if (F.getCallingConv() == CallingConv::Cold)
209     return true;
210 
211   if (PSI->isFunctionEntryCold(&F))
212     return true;
213 
214   return false;
215 }
216 
217 // Returns false if the function should not be considered for hot-cold split
218 // optimization.
219 bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
220   if (F.hasFnAttribute(Attribute::AlwaysInline))
221     return false;
222 
223   if (F.hasFnAttribute(Attribute::NoInline))
224     return false;
225 
226   // A function marked `noreturn` may contain unreachable terminators: these
227   // should not be considered cold, as the function may be a trampoline.
228   if (F.hasFnAttribute(Attribute::NoReturn))
229     return false;
230 
231   if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
232       F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
233       F.hasFnAttribute(Attribute::SanitizeThread) ||
234       F.hasFnAttribute(Attribute::SanitizeMemory))
235     return false;
236 
237   return true;
238 }
239 
240 /// Get the benefit score of outlining \p Region.
241 static InstructionCost getOutliningBenefit(ArrayRef<BasicBlock *> Region,
242                                            TargetTransformInfo &TTI) {
243   // Sum up the code size costs of non-terminator instructions. Tight coupling
244   // with \ref getOutliningPenalty is needed to model the costs of terminators.
245   InstructionCost Benefit = 0;
246   for (BasicBlock *BB : Region)
247     for (Instruction &I : BB->instructionsWithoutDebug())
248       if (&I != BB->getTerminator())
249         Benefit +=
250             TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
251 
252   return Benefit;
253 }
254 
255 /// Get the penalty score for outlining \p Region.
256 static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
257                                unsigned NumInputs, unsigned NumOutputs) {
258   int Penalty = SplittingThreshold;
259   LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
260 
261   // If the splitting threshold is set at or below zero, skip the usual
262   // profitability check.
263   if (SplittingThreshold <= 0)
264     return Penalty;
265 
266   // Find the number of distinct exit blocks for the region. Use a conservative
267   // check to determine whether control returns from the region.
268   bool NoBlocksReturn = true;
269   SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
270   for (BasicBlock *BB : Region) {
271     // If a block has no successors, only assume it does not return if it's
272     // unreachable.
273     if (succ_empty(BB)) {
274       NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
275       continue;
276     }
277 
278     for (BasicBlock *SuccBB : successors(BB)) {
279       if (!is_contained(Region, SuccBB)) {
280         NoBlocksReturn = false;
281         SuccsOutsideRegion.insert(SuccBB);
282       }
283     }
284   }
285 
286   // Count the number of phis in exit blocks with >= 2 incoming values from the
287   // outlining region. These phis are split (\ref severSplitPHINodesOfExits),
288   // and new outputs are created to supply the split phis. CodeExtractor can't
289   // report these new outputs until extraction begins, but it's important to
290   // factor the cost of the outputs into the cost calculation.
291   unsigned NumSplitExitPhis = 0;
292   for (BasicBlock *ExitBB : SuccsOutsideRegion) {
293     for (PHINode &PN : ExitBB->phis()) {
294       // Find all incoming values from the outlining region.
295       int NumIncomingVals = 0;
296       for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
297         if (find(Region, PN.getIncomingBlock(i)) != Region.end()) {
298           ++NumIncomingVals;
299           if (NumIncomingVals > 1) {
300             ++NumSplitExitPhis;
301             break;
302           }
303         }
304     }
305   }
306 
307   // Apply a penalty for calling the split function. Factor in the cost of
308   // materializing all of the parameters.
309   int NumOutputsAndSplitPhis = NumOutputs + NumSplitExitPhis;
310   int NumParams = NumInputs + NumOutputsAndSplitPhis;
311   if (NumParams > MaxParametersForSplit) {
312     LLVM_DEBUG(dbgs() << NumInputs << " inputs and " << NumOutputsAndSplitPhis
313                       << " outputs exceeds parameter limit ("
314                       << MaxParametersForSplit << ")\n");
315     return std::numeric_limits<int>::max();
316   }
317   const int CostForArgMaterialization = 2 * TargetTransformInfo::TCC_Basic;
318   LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumParams << " params\n");
319   Penalty += CostForArgMaterialization * NumParams;
320 
321   // Apply the typical code size cost for an output alloca and its associated
322   // reload in the caller. Also penalize the associated store in the callee.
323   LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputsAndSplitPhis
324                     << " outputs/split phis\n");
325   const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
326   Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
327 
328   // Apply a `noreturn` bonus.
329   if (NoBlocksReturn) {
330     LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
331                       << " non-returning terminators\n");
332     Penalty -= Region.size();
333   }
334 
335   // Apply a penalty for having more than one successor outside of the region.
336   // This penalty accounts for the switch needed in the caller.
337   if (SuccsOutsideRegion.size() > 1) {
338     LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
339                       << " non-region successors\n");
340     Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
341   }
342 
343   return Penalty;
344 }
345 
346 Function *HotColdSplitting::extractColdRegion(
347     const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
348     DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
349     OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
350   assert(!Region.empty());
351 
352   // TODO: Pass BFI and BPI to update profile information.
353   CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
354                    /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
355                    /* AllowAlloca */ false,
356                    /* Suffix */ "cold." + std::to_string(Count));
357 
358   // Perform a simple cost/benefit analysis to decide whether or not to permit
359   // splitting.
360   SetVector<Value *> Inputs, Outputs, Sinks;
361   CE.findInputsOutputs(Inputs, Outputs, Sinks);
362   InstructionCost OutliningBenefit = getOutliningBenefit(Region, TTI);
363   int OutliningPenalty =
364       getOutliningPenalty(Region, Inputs.size(), Outputs.size());
365   LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
366                     << ", penalty = " << OutliningPenalty << "\n");
367   if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
368     return nullptr;
369 
370   Function *OrigF = Region[0]->getParent();
371   if (Function *OutF = CE.extractCodeRegion(CEAC)) {
372     User *U = *OutF->user_begin();
373     CallInst *CI = cast<CallInst>(U);
374     NumColdRegionsOutlined++;
375     if (TTI.useColdCCForColdCall(*OutF)) {
376       OutF->setCallingConv(CallingConv::Cold);
377       CI->setCallingConv(CallingConv::Cold);
378     }
379     CI->setIsNoInline();
380 
381     if (EnableColdSection)
382       OutF->setSection(ColdSectionName);
383     else {
384       if (OrigF->hasSection())
385         OutF->setSection(OrigF->getSection());
386     }
387 
388     markFunctionCold(*OutF, BFI != nullptr);
389 
390     LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
391     ORE.emit([&]() {
392       return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
393                                 &*Region[0]->begin())
394              << ore::NV("Original", OrigF) << " split cold code into "
395              << ore::NV("Split", OutF);
396     });
397     return OutF;
398   }
399 
400   ORE.emit([&]() {
401     return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
402                                     &*Region[0]->begin())
403            << "Failed to extract region at block "
404            << ore::NV("Block", Region.front());
405   });
406   return nullptr;
407 }
408 
409 /// A pair of (basic block, score).
410 using BlockTy = std::pair<BasicBlock *, unsigned>;
411 
412 namespace {
413 /// A maximal outlining region. This contains all blocks post-dominated by a
414 /// sink block, the sink block itself, and all blocks dominated by the sink.
415 /// If sink-predecessors and sink-successors cannot be extracted in one region,
416 /// the static constructor returns a list of suitable extraction regions.
417 class OutliningRegion {
418   /// A list of (block, score) pairs. A block's score is non-zero iff it's a
419   /// viable sub-region entry point. Blocks with higher scores are better entry
420   /// points (i.e. they are more distant ancestors of the sink block).
421   SmallVector<BlockTy, 0> Blocks = {};
422 
423   /// The suggested entry point into the region. If the region has multiple
424   /// entry points, all blocks within the region may not be reachable from this
425   /// entry point.
426   BasicBlock *SuggestedEntryPoint = nullptr;
427 
428   /// Whether the entire function is cold.
429   bool EntireFunctionCold = false;
430 
431   /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
432   static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
433     return mayExtractBlock(BB) ? Score : 0;
434   }
435 
436   /// These scores should be lower than the score for predecessor blocks,
437   /// because regions starting at predecessor blocks are typically larger.
438   static constexpr unsigned ScoreForSuccBlock = 1;
439   static constexpr unsigned ScoreForSinkBlock = 1;
440 
441   OutliningRegion(const OutliningRegion &) = delete;
442   OutliningRegion &operator=(const OutliningRegion &) = delete;
443 
444 public:
445   OutliningRegion() = default;
446   OutliningRegion(OutliningRegion &&) = default;
447   OutliningRegion &operator=(OutliningRegion &&) = default;
448 
449   static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
450                                              const DominatorTree &DT,
451                                              const PostDominatorTree &PDT) {
452     std::vector<OutliningRegion> Regions;
453     SmallPtrSet<BasicBlock *, 4> RegionBlocks;
454 
455     Regions.emplace_back();
456     OutliningRegion *ColdRegion = &Regions.back();
457 
458     auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
459       RegionBlocks.insert(BB);
460       ColdRegion->Blocks.emplace_back(BB, Score);
461     };
462 
463     // The ancestor farthest-away from SinkBB, and also post-dominated by it.
464     unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
465     ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
466     unsigned BestScore = SinkScore;
467 
468     // Visit SinkBB's ancestors using inverse DFS.
469     auto PredIt = ++idf_begin(&SinkBB);
470     auto PredEnd = idf_end(&SinkBB);
471     while (PredIt != PredEnd) {
472       BasicBlock &PredBB = **PredIt;
473       bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
474 
475       // If the predecessor is cold and has no predecessors, the entire
476       // function must be cold.
477       if (SinkPostDom && pred_empty(&PredBB)) {
478         ColdRegion->EntireFunctionCold = true;
479         return Regions;
480       }
481 
482       // If SinkBB does not post-dominate a predecessor, do not mark the
483       // predecessor (or any of its predecessors) cold.
484       if (!SinkPostDom || !mayExtractBlock(PredBB)) {
485         PredIt.skipChildren();
486         continue;
487       }
488 
489       // Keep track of the post-dominated ancestor farthest away from the sink.
490       // The path length is always >= 2, ensuring that predecessor blocks are
491       // considered as entry points before the sink block.
492       unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
493       if (PredScore > BestScore) {
494         ColdRegion->SuggestedEntryPoint = &PredBB;
495         BestScore = PredScore;
496       }
497 
498       addBlockToRegion(&PredBB, PredScore);
499       ++PredIt;
500     }
501 
502     // If the sink can be added to the cold region, do so. It's considered as
503     // an entry point before any sink-successor blocks.
504     //
505     // Otherwise, split cold sink-successor blocks using a separate region.
506     // This satisfies the requirement that all extraction blocks other than the
507     // first have predecessors within the extraction region.
508     if (mayExtractBlock(SinkBB)) {
509       addBlockToRegion(&SinkBB, SinkScore);
510       if (pred_empty(&SinkBB)) {
511         ColdRegion->EntireFunctionCold = true;
512         return Regions;
513       }
514     } else {
515       Regions.emplace_back();
516       ColdRegion = &Regions.back();
517       BestScore = 0;
518     }
519 
520     // Find all successors of SinkBB dominated by SinkBB using DFS.
521     auto SuccIt = ++df_begin(&SinkBB);
522     auto SuccEnd = df_end(&SinkBB);
523     while (SuccIt != SuccEnd) {
524       BasicBlock &SuccBB = **SuccIt;
525       bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
526 
527       // Don't allow the backwards & forwards DFSes to mark the same block.
528       bool DuplicateBlock = RegionBlocks.count(&SuccBB);
529 
530       // If SinkBB does not dominate a successor, do not mark the successor (or
531       // any of its successors) cold.
532       if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
533         SuccIt.skipChildren();
534         continue;
535       }
536 
537       unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
538       if (SuccScore > BestScore) {
539         ColdRegion->SuggestedEntryPoint = &SuccBB;
540         BestScore = SuccScore;
541       }
542 
543       addBlockToRegion(&SuccBB, SuccScore);
544       ++SuccIt;
545     }
546 
547     return Regions;
548   }
549 
550   /// Whether this region has nothing to extract.
551   bool empty() const { return !SuggestedEntryPoint; }
552 
553   /// The blocks in this region.
554   ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
555 
556   /// Whether the entire function containing this region is cold.
557   bool isEntireFunctionCold() const { return EntireFunctionCold; }
558 
559   /// Remove a sub-region from this region and return it as a block sequence.
560   BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
561     assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
562 
563     // Remove blocks dominated by the suggested entry point from this region.
564     // During the removal, identify the next best entry point into the region.
565     // Ensure that the first extracted block is the suggested entry point.
566     BlockSequence SubRegion = {SuggestedEntryPoint};
567     BasicBlock *NextEntryPoint = nullptr;
568     unsigned NextScore = 0;
569     auto RegionEndIt = Blocks.end();
570     auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
571       BasicBlock *BB = Block.first;
572       unsigned Score = Block.second;
573       bool InSubRegion =
574           BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
575       if (!InSubRegion && Score > NextScore) {
576         NextEntryPoint = BB;
577         NextScore = Score;
578       }
579       if (InSubRegion && BB != SuggestedEntryPoint)
580         SubRegion.push_back(BB);
581       return InSubRegion;
582     });
583     Blocks.erase(RegionStartIt, RegionEndIt);
584 
585     // Update the suggested entry point.
586     SuggestedEntryPoint = NextEntryPoint;
587 
588     return SubRegion;
589   }
590 };
591 } // namespace
592 
593 bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
594   bool Changed = false;
595 
596   // The set of cold blocks.
597   SmallPtrSet<BasicBlock *, 4> ColdBlocks;
598 
599   // The worklist of non-intersecting regions left to outline.
600   SmallVector<OutliningRegion, 2> OutliningWorklist;
601 
602   // Set up an RPO traversal. Experimentally, this performs better (outlines
603   // more) than a PO traversal, because we prevent region overlap by keeping
604   // the first region to contain a block.
605   ReversePostOrderTraversal<Function *> RPOT(&F);
606 
607   // Calculate domtrees lazily. This reduces compile-time significantly.
608   std::unique_ptr<DominatorTree> DT;
609   std::unique_ptr<PostDominatorTree> PDT;
610 
611   // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
612   // reduces compile-time significantly. TODO: When we *do* use BFI, we should
613   // be able to salvage its domtrees instead of recomputing them.
614   BlockFrequencyInfo *BFI = nullptr;
615   if (HasProfileSummary)
616     BFI = GetBFI(F);
617 
618   TargetTransformInfo &TTI = GetTTI(F);
619   OptimizationRemarkEmitter &ORE = (*GetORE)(F);
620   AssumptionCache *AC = LookupAC(F);
621 
622   // Find all cold regions.
623   for (BasicBlock *BB : RPOT) {
624     // This block is already part of some outlining region.
625     if (ColdBlocks.count(BB))
626       continue;
627 
628     bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
629                 (EnableStaticAnalysis && unlikelyExecuted(*BB));
630     if (!Cold)
631       continue;
632 
633     LLVM_DEBUG({
634       dbgs() << "Found a cold block:\n";
635       BB->dump();
636     });
637 
638     if (!DT)
639       DT = std::make_unique<DominatorTree>(F);
640     if (!PDT)
641       PDT = std::make_unique<PostDominatorTree>(F);
642 
643     auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
644     for (OutliningRegion &Region : Regions) {
645       if (Region.empty())
646         continue;
647 
648       if (Region.isEntireFunctionCold()) {
649         LLVM_DEBUG(dbgs() << "Entire function is cold\n");
650         return markFunctionCold(F);
651       }
652 
653       // If this outlining region intersects with another, drop the new region.
654       //
655       // TODO: It's theoretically possible to outline more by only keeping the
656       // largest region which contains a block, but the extra bookkeeping to do
657       // this is tricky/expensive.
658       bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
659         return !ColdBlocks.insert(Block.first).second;
660       });
661       if (RegionsOverlap)
662         continue;
663 
664       OutliningWorklist.emplace_back(std::move(Region));
665       ++NumColdRegionsFound;
666     }
667   }
668 
669   if (OutliningWorklist.empty())
670     return Changed;
671 
672   // Outline single-entry cold regions, splitting up larger regions as needed.
673   unsigned OutlinedFunctionID = 1;
674   // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
675   CodeExtractorAnalysisCache CEAC(F);
676   do {
677     OutliningRegion Region = OutliningWorklist.pop_back_val();
678     assert(!Region.empty() && "Empty outlining region in worklist");
679     do {
680       BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
681       LLVM_DEBUG({
682         dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
683         for (BasicBlock *BB : SubRegion)
684           BB->dump();
685       });
686 
687       Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
688                                              ORE, AC, OutlinedFunctionID);
689       if (Outlined) {
690         ++OutlinedFunctionID;
691         Changed = true;
692       }
693     } while (!Region.empty());
694   } while (!OutliningWorklist.empty());
695 
696   return Changed;
697 }
698 
699 bool HotColdSplitting::run(Module &M) {
700   bool Changed = false;
701   bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
702   for (Function &F : M) {
703     // Do not touch declarations.
704     if (F.isDeclaration())
705       continue;
706 
707     // Do not modify `optnone` functions.
708     if (F.hasOptNone())
709       continue;
710 
711     // Detect inherently cold functions and mark them as such.
712     if (isFunctionCold(F)) {
713       Changed |= markFunctionCold(F);
714       continue;
715     }
716 
717     if (!shouldOutlineFrom(F)) {
718       LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
719       continue;
720     }
721 
722     LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
723     Changed |= outlineColdRegions(F, HasProfileSummary);
724   }
725   return Changed;
726 }
727 
728 bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
729   if (skipModule(M))
730     return false;
731   ProfileSummaryInfo *PSI =
732       &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
733   auto GTTI = [this](Function &F) -> TargetTransformInfo & {
734     return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
735   };
736   auto GBFI = [this](Function &F) {
737     return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
738   };
739   std::unique_ptr<OptimizationRemarkEmitter> ORE;
740   std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
741       [&ORE](Function &F) -> OptimizationRemarkEmitter & {
742     ORE.reset(new OptimizationRemarkEmitter(&F));
743     return *ORE.get();
744   };
745   auto LookupAC = [this](Function &F) -> AssumptionCache * {
746     if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
747       return ACT->lookupAssumptionCache(F);
748     return nullptr;
749   };
750 
751   return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
752 }
753 
754 PreservedAnalyses
755 HotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) {
756   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
757 
758   auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
759     return FAM.getCachedResult<AssumptionAnalysis>(F);
760   };
761 
762   auto GBFI = [&FAM](Function &F) {
763     return &FAM.getResult<BlockFrequencyAnalysis>(F);
764   };
765 
766   std::function<TargetTransformInfo &(Function &)> GTTI =
767       [&FAM](Function &F) -> TargetTransformInfo & {
768     return FAM.getResult<TargetIRAnalysis>(F);
769   };
770 
771   std::unique_ptr<OptimizationRemarkEmitter> ORE;
772   std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
773       [&ORE](Function &F) -> OptimizationRemarkEmitter & {
774     ORE.reset(new OptimizationRemarkEmitter(&F));
775     return *ORE.get();
776   };
777 
778   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
779 
780   if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
781     return PreservedAnalyses::none();
782   return PreservedAnalyses::all();
783 }
784 
785 char HotColdSplittingLegacyPass::ID = 0;
786 INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
787                       "Hot Cold Splitting", false, false)
788 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
789 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
790 INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
791                     "Hot Cold Splitting", false, false)
792 
793 ModulePass *llvm::createHotColdSplittingPass() {
794   return new HotColdSplittingLegacyPass();
795 }
796