1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/Analysis/InlineCost.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetLibraryInfo.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/IntrinsicInst.h"
41 #include "llvm/IR/Intrinsics.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/InitializePasses.h"
45 #include "llvm/Pass.h"
46 #include "llvm/Support/BlockFrequency.h"
47 #include "llvm/Support/BranchProbability.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/Utils/Cloning.h"
53 #include "llvm/Transforms/Utils/CodeExtractor.h"
54 #include "llvm/Transforms/Utils/ValueMapper.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <functional>
59 #include <iterator>
60 #include <memory>
61 #include <tuple>
62 #include <vector>
63 
64 using namespace llvm;
65 
66 #define DEBUG_TYPE "partial-inlining"
67 
68 STATISTIC(NumPartialInlined,
69           "Number of callsites functions partially inlined into.");
70 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
71                                         "cold outlined regions were partially "
72                                         "inlined into its caller(s).");
73 STATISTIC(NumColdRegionsFound,
74            "Number of cold single entry/exit regions found.");
75 STATISTIC(NumColdRegionsOutlined,
76            "Number of cold single entry/exit regions outlined.");
77 
78 // Command line option to disable partial-inlining. The default is false:
79 static cl::opt<bool>
80     DisablePartialInlining("disable-partial-inlining", cl::init(false),
81                            cl::Hidden, cl::desc("Disable partial inlining"));
82 // Command line option to disable multi-region partial-inlining. The default is
83 // false:
84 static cl::opt<bool> DisableMultiRegionPartialInline(
85     "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
86     cl::desc("Disable multi-region partial inlining"));
87 
88 // Command line option to force outlining in regions with live exit variables.
89 // The default is false:
90 static cl::opt<bool>
91     ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
92                cl::desc("Force outline regions with live exits"));
93 
94 // Command line option to enable marking outline functions with Cold Calling
95 // Convention. The default is false:
96 static cl::opt<bool>
97     MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
98                        cl::desc("Mark outline function calls with ColdCC"));
99 
100 // This is an option used by testing:
101 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
102                                       cl::init(false), cl::ZeroOrMore,
103                                       cl::ReallyHidden,
104                                       cl::desc("Skip Cost Analysis"));
105 // Used to determine if a cold region is worth outlining based on
106 // its inlining cost compared to the original function.  Default is set at 10%.
107 // ie. if the cold region reduces the inlining cost of the original function by
108 // at least 10%.
109 static cl::opt<float> MinRegionSizeRatio(
110     "min-region-size-ratio", cl::init(0.1), cl::Hidden,
111     cl::desc("Minimum ratio comparing relative sizes of each "
112              "outline candidate and original function"));
113 // Used to tune the minimum number of execution counts needed in the predecessor
114 // block to the cold edge. ie. confidence interval.
115 static cl::opt<unsigned>
116     MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
117                              cl::desc("Minimum block executions to consider "
118                                       "its BranchProbabilityInfo valid"));
119 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
120 // if the branch probability is 10% or less, then it is deemed as 'cold'.
121 static cl::opt<float> ColdBranchRatio(
122     "cold-branch-ratio", cl::init(0.1), cl::Hidden,
123     cl::desc("Minimum BranchProbability to consider a region cold."));
124 
125 static cl::opt<unsigned> MaxNumInlineBlocks(
126     "max-num-inline-blocks", cl::init(5), cl::Hidden,
127     cl::desc("Max number of blocks to be partially inlined"));
128 
129 // Command line option to set the maximum number of partial inlining allowed
130 // for the module. The default value of -1 means no limit.
131 static cl::opt<int> MaxNumPartialInlining(
132     "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
133     cl::desc("Max number of partial inlining. The default is unlimited"));
134 
135 // Used only when PGO or user annotated branch data is absent. It is
136 // the least value that is used to weigh the outline region. If BFI
137 // produces larger value, the BFI value will be used.
138 static cl::opt<int>
139     OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
140                              cl::Hidden, cl::ZeroOrMore,
141                              cl::desc("Relative frequency of outline region to "
142                                       "the entry block"));
143 
144 static cl::opt<unsigned> ExtraOutliningPenalty(
145     "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
146     cl::desc("A debug option to add additional penalty to the computed one."));
147 
148 namespace {
149 
150 struct FunctionOutliningInfo {
151   FunctionOutliningInfo() = default;
152 
153   // Returns the number of blocks to be inlined including all blocks
154   // in Entries and one return block.
155   unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
156 
157   // A set of blocks including the function entry that guard
158   // the region to be outlined.
159   SmallVector<BasicBlock *, 4> Entries;
160 
161   // The return block that is not included in the outlined region.
162   BasicBlock *ReturnBlock = nullptr;
163 
164   // The dominating block of the region to be outlined.
165   BasicBlock *NonReturnBlock = nullptr;
166 
167   // The set of blocks in Entries that that are predecessors to ReturnBlock
168   SmallVector<BasicBlock *, 4> ReturnBlockPreds;
169 };
170 
171 struct FunctionOutliningMultiRegionInfo {
172   FunctionOutliningMultiRegionInfo() {}
173 
174   // Container for outline regions
175   struct OutlineRegionInfo {
176     OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
177                       BasicBlock *EntryBlock, BasicBlock *ExitBlock,
178                       BasicBlock *ReturnBlock)
179         : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
180           ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
181     SmallVector<BasicBlock *, 8> Region;
182     BasicBlock *EntryBlock;
183     BasicBlock *ExitBlock;
184     BasicBlock *ReturnBlock;
185   };
186 
187   SmallVector<OutlineRegionInfo, 4> ORI;
188 };
189 
190 struct PartialInlinerImpl {
191 
192   PartialInlinerImpl(
193       function_ref<AssumptionCache &(Function &)> GetAC,
194       function_ref<AssumptionCache *(Function &)> LookupAC,
195       function_ref<TargetTransformInfo &(Function &)> GTTI,
196       function_ref<const TargetLibraryInfo &(Function &)> GTLI,
197       ProfileSummaryInfo &ProfSI,
198       function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
199       : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
200         GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
201 
202   bool run(Module &M);
203   // Main part of the transformation that calls helper functions to find
204   // outlining candidates, clone & outline the function, and attempt to
205   // partially inline the resulting function. Returns true if
206   // inlining was successful, false otherwise.  Also returns the outline
207   // function (only if we partially inlined early returns) as there is a
208   // possibility to further "peel" early return statements that were left in the
209   // outline function due to code size.
210   std::pair<bool, Function *> unswitchFunction(Function &F);
211 
212   // This class speculatively clones the function to be partial inlined.
213   // At the end of partial inlining, the remaining callsites to the cloned
214   // function that are not partially inlined will be fixed up to reference
215   // the original function, and the cloned function will be erased.
216   struct FunctionCloner {
217     // Two constructors, one for single region outlining, the other for
218     // multi-region outlining.
219     FunctionCloner(Function *F, FunctionOutliningInfo *OI,
220                    OptimizationRemarkEmitter &ORE,
221                    function_ref<AssumptionCache *(Function &)> LookupAC,
222                    function_ref<TargetTransformInfo &(Function &)> GetTTI);
223     FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
224                    OptimizationRemarkEmitter &ORE,
225                    function_ref<AssumptionCache *(Function &)> LookupAC,
226                    function_ref<TargetTransformInfo &(Function &)> GetTTI);
227 
228     ~FunctionCloner();
229 
230     // Prepare for function outlining: making sure there is only
231     // one incoming edge from the extracted/outlined region to
232     // the return block.
233     void normalizeReturnBlock() const;
234 
235     // Do function outlining for cold regions.
236     bool doMultiRegionFunctionOutlining();
237     // Do function outlining for region after early return block(s).
238     // NOTE: For vararg functions that do the vararg handling in the outlined
239     //       function, we temporarily generate IR that does not properly
240     //       forward varargs to the outlined function. Calling InlineFunction
241     //       will update calls to the outlined functions to properly forward
242     //       the varargs.
243     Function *doSingleRegionFunctionOutlining();
244 
245     Function *OrigFunc = nullptr;
246     Function *ClonedFunc = nullptr;
247 
248     typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
249     // Keep track of Outlined Functions and the basic block they're called from.
250     SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
251 
252     // ClonedFunc is inlined in one of its callers after function
253     // outlining.
254     bool IsFunctionInlined = false;
255     // The cost of the region to be outlined.
256     InstructionCost OutlinedRegionCost = 0;
257     // ClonedOI is specific to outlining non-early return blocks.
258     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
259     // ClonedOMRI is specific to outlining cold regions.
260     std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
261     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
262     OptimizationRemarkEmitter &ORE;
263     function_ref<AssumptionCache *(Function &)> LookupAC;
264     function_ref<TargetTransformInfo &(Function &)> GetTTI;
265   };
266 
267 private:
268   int NumPartialInlining = 0;
269   function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
270   function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
271   function_ref<TargetTransformInfo &(Function &)> GetTTI;
272   function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
273   function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
274   ProfileSummaryInfo &PSI;
275 
276   // Return the frequency of the OutlininingBB relative to F's entry point.
277   // The result is no larger than 1 and is represented using BP.
278   // (Note that the outlined region's 'head' block can only have incoming
279   // edges from the guarding entry blocks).
280   BranchProbability
281   getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
282 
283   // Return true if the callee of CB should be partially inlined with
284   // profit.
285   bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
286                            BlockFrequency WeightedOutliningRcost,
287                            OptimizationRemarkEmitter &ORE) const;
288 
289   // Try to inline DuplicateFunction (cloned from F with call to
290   // the OutlinedFunction into its callers. Return true
291   // if there is any successful inlining.
292   bool tryPartialInline(FunctionCloner &Cloner);
293 
294   // Compute the mapping from use site of DuplicationFunction to the enclosing
295   // BB's profile count.
296   void
297   computeCallsiteToProfCountMap(Function *DuplicateFunction,
298                                 DenseMap<User *, uint64_t> &SiteCountMap) const;
299 
300   bool isLimitReached() const {
301     return (MaxNumPartialInlining != -1 &&
302             NumPartialInlining >= MaxNumPartialInlining);
303   }
304 
305   static CallBase *getSupportedCallBase(User *U) {
306     if (isa<CallInst>(U) || isa<InvokeInst>(U))
307       return cast<CallBase>(U);
308     llvm_unreachable("All uses must be calls");
309     return nullptr;
310   }
311 
312   static CallBase *getOneCallSiteTo(Function &F) {
313     User *User = *F.user_begin();
314     return getSupportedCallBase(User);
315   }
316 
317   std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
318     CallBase *CB = getOneCallSiteTo(F);
319     DebugLoc DLoc = CB->getDebugLoc();
320     BasicBlock *Block = CB->getParent();
321     return std::make_tuple(DLoc, Block);
322   }
323 
324   // Returns the costs associated with function outlining:
325   // - The first value is the non-weighted runtime cost for making the call
326   //   to the outlined function, including the addtional  setup cost in the
327   //    outlined function itself;
328   // - The second value is the estimated size of the new call sequence in
329   //   basic block Cloner.OutliningCallBB;
330   std::tuple<InstructionCost, InstructionCost>
331   computeOutliningCosts(FunctionCloner &Cloner) const;
332 
333   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
334   // approximate both the size and runtime cost (Note that in the current
335   // inline cost analysis, there is no clear distinction there either).
336   static InstructionCost computeBBInlineCost(BasicBlock *BB,
337                                              TargetTransformInfo *TTI);
338 
339   std::unique_ptr<FunctionOutliningInfo>
340   computeOutliningInfo(Function &F) const;
341 
342   std::unique_ptr<FunctionOutliningMultiRegionInfo>
343   computeOutliningColdRegionsInfo(Function &F,
344                                   OptimizationRemarkEmitter &ORE) const;
345 };
346 
347 struct PartialInlinerLegacyPass : public ModulePass {
348   static char ID; // Pass identification, replacement for typeid
349 
350   PartialInlinerLegacyPass() : ModulePass(ID) {
351     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
352   }
353 
354   void getAnalysisUsage(AnalysisUsage &AU) const override {
355     AU.addRequired<AssumptionCacheTracker>();
356     AU.addRequired<ProfileSummaryInfoWrapperPass>();
357     AU.addRequired<TargetTransformInfoWrapperPass>();
358     AU.addRequired<TargetLibraryInfoWrapperPass>();
359   }
360 
361   bool runOnModule(Module &M) override {
362     if (skipModule(M))
363       return false;
364 
365     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
366     TargetTransformInfoWrapperPass *TTIWP =
367         &getAnalysis<TargetTransformInfoWrapperPass>();
368     ProfileSummaryInfo &PSI =
369         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
370 
371     auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
372       return ACT->getAssumptionCache(F);
373     };
374 
375     auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
376       return ACT->lookupAssumptionCache(F);
377     };
378 
379     auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
380       return TTIWP->getTTI(F);
381     };
382 
383     auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
384       return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
385     };
386 
387     return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
388                               GetTLI, PSI)
389         .run(M);
390   }
391 };
392 
393 } // end anonymous namespace
394 
395 std::unique_ptr<FunctionOutliningMultiRegionInfo>
396 PartialInlinerImpl::computeOutliningColdRegionsInfo(
397     Function &F, OptimizationRemarkEmitter &ORE) const {
398   BasicBlock *EntryBlock = &F.front();
399 
400   DominatorTree DT(F);
401   LoopInfo LI(DT);
402   BranchProbabilityInfo BPI(F, LI);
403   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
404   BlockFrequencyInfo *BFI;
405   if (!GetBFI) {
406     ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
407     BFI = ScopedBFI.get();
408   } else
409     BFI = &(GetBFI(F));
410 
411   // Return if we don't have profiling information.
412   if (!PSI.hasInstrumentationProfile())
413     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
414 
415   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
416       std::make_unique<FunctionOutliningMultiRegionInfo>();
417 
418   auto IsSingleExit =
419       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
420     BasicBlock *ExitBlock = nullptr;
421     for (auto *Block : BlockList) {
422       for (BasicBlock *Succ : successors(Block)) {
423         if (!is_contained(BlockList, Succ)) {
424           if (ExitBlock) {
425             ORE.emit([&]() {
426               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
427                                               &Succ->front())
428                      << "Region dominated by "
429                      << ore::NV("Block", BlockList.front()->getName())
430                      << " has more than one region exit edge.";
431             });
432             return nullptr;
433           }
434 
435           ExitBlock = Block;
436         }
437       }
438     }
439     return ExitBlock;
440   };
441 
442   auto BBProfileCount = [BFI](BasicBlock *BB) {
443     return BFI->getBlockProfileCount(BB).getValueOr(0);
444   };
445 
446   // Use the same computeBBInlineCost function to compute the cost savings of
447   // the outlining the candidate region.
448   TargetTransformInfo *FTTI = &GetTTI(F);
449   InstructionCost OverallFunctionCost = 0;
450   for (auto &BB : F)
451     OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
452 
453   LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
454                     << "\n";);
455 
456   InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
457       [&](auto Cost) { return Cost * MinRegionSizeRatio; });
458 
459   BranchProbability MinBranchProbability(
460       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
461       MinBlockCounterExecution);
462   bool ColdCandidateFound = false;
463   BasicBlock *CurrEntry = EntryBlock;
464   std::vector<BasicBlock *> DFS;
465   DenseMap<BasicBlock *, bool> VisitedMap;
466   DFS.push_back(CurrEntry);
467   VisitedMap[CurrEntry] = true;
468 
469   // Use Depth First Search on the basic blocks to find CFG edges that are
470   // considered cold.
471   // Cold regions considered must also have its inline cost compared to the
472   // overall inline cost of the original function.  The region is outlined only
473   // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
474   // more.
475   while (!DFS.empty()) {
476     auto *ThisBB = DFS.back();
477     DFS.pop_back();
478     // Only consider regions with predecessor blocks that are considered
479     // not-cold (default: part of the top 99.99% of all block counters)
480     // AND greater than our minimum block execution count (default: 100).
481     if (PSI.isColdBlock(ThisBB, BFI) ||
482         BBProfileCount(ThisBB) < MinBlockCounterExecution)
483       continue;
484     for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
485       if (VisitedMap[*SI])
486         continue;
487       VisitedMap[*SI] = true;
488       DFS.push_back(*SI);
489       // If branch isn't cold, we skip to the next one.
490       BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
491       if (SuccProb > MinBranchProbability)
492         continue;
493 
494       LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
495                         << SI->getName()
496                         << "\nBranch Probability = " << SuccProb << "\n";);
497 
498       SmallVector<BasicBlock *, 8> DominateVector;
499       DT.getDescendants(*SI, DominateVector);
500       assert(!DominateVector.empty() &&
501              "SI should be reachable and have at least itself as descendant");
502 
503       // We can only outline single entry regions (for now).
504       if (!DominateVector.front()->hasNPredecessors(1)) {
505         LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
506                           << " doesn't have a single predecessor in the "
507                              "dominator tree\n";);
508         continue;
509       }
510 
511       BasicBlock *ExitBlock = nullptr;
512       // We can only outline single exit regions (for now).
513       if (!(ExitBlock = IsSingleExit(DominateVector))) {
514         LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
515                           << " doesn't have a unique successor\n";);
516         continue;
517       }
518 
519       InstructionCost OutlineRegionCost = 0;
520       for (auto *BB : DominateVector)
521         OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
522 
523       LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
524                         << "\n";);
525 
526       if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
527         ORE.emit([&]() {
528           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
529                                             &SI->front())
530                  << ore::NV("Callee", &F)
531                  << " inline cost-savings smaller than "
532                  << ore::NV("Cost", MinOutlineRegionCost);
533         });
534 
535         LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
536                           << MinOutlineRegionCost << "\n";);
537         continue;
538       }
539 
540       // For now, ignore blocks that belong to a SISE region that is a
541       // candidate for outlining.  In the future, we may want to look
542       // at inner regions because the outer region may have live-exit
543       // variables.
544       for (auto *BB : DominateVector)
545         VisitedMap[BB] = true;
546 
547       // ReturnBlock here means the block after the outline call
548       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
549       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
550           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
551       OutliningInfo->ORI.push_back(RegInfo);
552       LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
553                         << DominateVector.front()->getName() << "\n";);
554       ColdCandidateFound = true;
555       NumColdRegionsFound++;
556     }
557   }
558 
559   if (ColdCandidateFound)
560     return OutliningInfo;
561 
562   return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
563 }
564 
565 std::unique_ptr<FunctionOutliningInfo>
566 PartialInlinerImpl::computeOutliningInfo(Function &F) const {
567   BasicBlock *EntryBlock = &F.front();
568   BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
569   if (!BR || BR->isUnconditional())
570     return std::unique_ptr<FunctionOutliningInfo>();
571 
572   // Returns true if Succ is BB's successor
573   auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
574     return is_contained(successors(BB), Succ);
575   };
576 
577   auto IsReturnBlock = [](BasicBlock *BB) {
578     Instruction *TI = BB->getTerminator();
579     return isa<ReturnInst>(TI);
580   };
581 
582   auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
583     if (IsReturnBlock(Succ1))
584       return std::make_tuple(Succ1, Succ2);
585     if (IsReturnBlock(Succ2))
586       return std::make_tuple(Succ2, Succ1);
587 
588     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
589   };
590 
591   // Detect a triangular shape:
592   auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
593     if (IsSuccessor(Succ1, Succ2))
594       return std::make_tuple(Succ1, Succ2);
595     if (IsSuccessor(Succ2, Succ1))
596       return std::make_tuple(Succ2, Succ1);
597 
598     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
599   };
600 
601   std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
602       std::make_unique<FunctionOutliningInfo>();
603 
604   BasicBlock *CurrEntry = EntryBlock;
605   bool CandidateFound = false;
606   do {
607     // The number of blocks to be inlined has already reached
608     // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
609     // disables partial inlining for the function.
610     if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
611       break;
612 
613     if (succ_size(CurrEntry) != 2)
614       break;
615 
616     BasicBlock *Succ1 = *succ_begin(CurrEntry);
617     BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
618 
619     BasicBlock *ReturnBlock, *NonReturnBlock;
620     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
621 
622     if (ReturnBlock) {
623       OutliningInfo->Entries.push_back(CurrEntry);
624       OutliningInfo->ReturnBlock = ReturnBlock;
625       OutliningInfo->NonReturnBlock = NonReturnBlock;
626       CandidateFound = true;
627       break;
628     }
629 
630     BasicBlock *CommSucc, *OtherSucc;
631     std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
632 
633     if (!CommSucc)
634       break;
635 
636     OutliningInfo->Entries.push_back(CurrEntry);
637     CurrEntry = OtherSucc;
638   } while (true);
639 
640   if (!CandidateFound)
641     return std::unique_ptr<FunctionOutliningInfo>();
642 
643   // There should not be any successors (not in the entry set) other than
644   // {ReturnBlock, NonReturnBlock}
645   assert(OutliningInfo->Entries[0] == &F.front() &&
646          "Function Entry must be the first in Entries vector");
647   DenseSet<BasicBlock *> Entries;
648   for (BasicBlock *E : OutliningInfo->Entries)
649     Entries.insert(E);
650 
651   // Returns true of BB has Predecessor which is not
652   // in Entries set.
653   auto HasNonEntryPred = [Entries](BasicBlock *BB) {
654     for (auto *Pred : predecessors(BB)) {
655       if (!Entries.count(Pred))
656         return true;
657     }
658     return false;
659   };
660   auto CheckAndNormalizeCandidate =
661       [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
662         for (BasicBlock *E : OutliningInfo->Entries) {
663           for (auto *Succ : successors(E)) {
664             if (Entries.count(Succ))
665               continue;
666             if (Succ == OutliningInfo->ReturnBlock)
667               OutliningInfo->ReturnBlockPreds.push_back(E);
668             else if (Succ != OutliningInfo->NonReturnBlock)
669               return false;
670           }
671           // There should not be any outside incoming edges either:
672           if (HasNonEntryPred(E))
673             return false;
674         }
675         return true;
676       };
677 
678   if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
679     return std::unique_ptr<FunctionOutliningInfo>();
680 
681   // Now further growing the candidate's inlining region by
682   // peeling off dominating blocks from the outlining region:
683   while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
684     BasicBlock *Cand = OutliningInfo->NonReturnBlock;
685     if (succ_size(Cand) != 2)
686       break;
687 
688     if (HasNonEntryPred(Cand))
689       break;
690 
691     BasicBlock *Succ1 = *succ_begin(Cand);
692     BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
693 
694     BasicBlock *ReturnBlock, *NonReturnBlock;
695     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
696     if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
697       break;
698 
699     if (NonReturnBlock->getSinglePredecessor() != Cand)
700       break;
701 
702     // Now grow and update OutlininigInfo:
703     OutliningInfo->Entries.push_back(Cand);
704     OutliningInfo->NonReturnBlock = NonReturnBlock;
705     OutliningInfo->ReturnBlockPreds.push_back(Cand);
706     Entries.insert(Cand);
707   }
708 
709   return OutliningInfo;
710 }
711 
712 // Check if there is PGO data or user annotated branch data:
713 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
714   if (F.hasProfileData())
715     return true;
716   // Now check if any of the entry block has MD_prof data:
717   for (auto *E : OI.Entries) {
718     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
719     if (!BR || BR->isUnconditional())
720       continue;
721     uint64_t T, F;
722     if (BR->extractProfMetadata(T, F))
723       return true;
724   }
725   return false;
726 }
727 
728 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
729     FunctionCloner &Cloner) const {
730   BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
731   auto EntryFreq =
732       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
733   auto OutliningCallFreq =
734       Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
735   // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
736   // we outlined any regions, so we may encounter situations where the
737   // OutliningCallFreq is *slightly* bigger than the EntryFreq.
738   if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
739     OutliningCallFreq = EntryFreq;
740 
741   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
742       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
743 
744   if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get()))
745     return OutlineRegionRelFreq;
746 
747   // When profile data is not available, we need to be conservative in
748   // estimating the overall savings. Static branch prediction can usually
749   // guess the branch direction right (taken/non-taken), but the guessed
750   // branch probability is usually not biased enough. In case when the
751   // outlined region is predicted to be likely, its probability needs
752   // to be made higher (more biased) to not under-estimate the cost of
753   // function outlining. On the other hand, if the outlined region
754   // is predicted to be less likely, the predicted probablity is usually
755   // higher than the actual. For instance, the actual probability of the
756   // less likely target is only 5%, but the guessed probablity can be
757   // 40%. In the latter case, there is no need for further adjustement.
758   // FIXME: add an option for this.
759   if (OutlineRegionRelFreq < BranchProbability(45, 100))
760     return OutlineRegionRelFreq;
761 
762   OutlineRegionRelFreq = std::max(
763       OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
764 
765   return OutlineRegionRelFreq;
766 }
767 
768 bool PartialInlinerImpl::shouldPartialInline(
769     CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
770     OptimizationRemarkEmitter &ORE) const {
771   using namespace ore;
772 
773   Function *Callee = CB.getCalledFunction();
774   assert(Callee == Cloner.ClonedFunc);
775 
776   if (SkipCostAnalysis)
777     return isInlineViable(*Callee).isSuccess();
778 
779   Function *Caller = CB.getCaller();
780   auto &CalleeTTI = GetTTI(*Callee);
781   bool RemarksEnabled =
782       Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
783           DEBUG_TYPE);
784   InlineCost IC =
785       getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
786                     GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
787 
788   if (IC.isAlways()) {
789     ORE.emit([&]() {
790       return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
791              << NV("Callee", Cloner.OrigFunc)
792              << " should always be fully inlined, not partially";
793     });
794     return false;
795   }
796 
797   if (IC.isNever()) {
798     ORE.emit([&]() {
799       return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
800              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
801              << NV("Caller", Caller)
802              << " because it should never be inlined (cost=never)";
803     });
804     return false;
805   }
806 
807   if (!IC) {
808     ORE.emit([&]() {
809       return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
810              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
811              << NV("Caller", Caller) << " because too costly to inline (cost="
812              << NV("Cost", IC.getCost()) << ", threshold="
813              << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
814     });
815     return false;
816   }
817   const DataLayout &DL = Caller->getParent()->getDataLayout();
818 
819   // The savings of eliminating the call:
820   int NonWeightedSavings = getCallsiteCost(CB, DL);
821   BlockFrequency NormWeightedSavings(NonWeightedSavings);
822 
823   // Weighted saving is smaller than weighted cost, return false
824   if (NormWeightedSavings < WeightedOutliningRcost) {
825     ORE.emit([&]() {
826       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
827                                         &CB)
828              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
829              << NV("Caller", Caller) << " runtime overhead (overhead="
830              << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
831              << ", savings="
832              << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
833              << ")"
834              << " of making the outlined call is too high";
835     });
836 
837     return false;
838   }
839 
840   ORE.emit([&]() {
841     return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
842            << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
843            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
844            << " (threshold="
845            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
846   });
847   return true;
848 }
849 
850 // TODO: Ideally  we should share Inliner's InlineCost Analysis code.
851 // For now use a simplified version. The returned 'InlineCost' will be used
852 // to esimate the size cost as well as runtime cost of the BB.
853 InstructionCost
854 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
855                                         TargetTransformInfo *TTI) {
856   InstructionCost InlineCost = 0;
857   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
858   for (Instruction &I : BB->instructionsWithoutDebug()) {
859     // Skip free instructions.
860     switch (I.getOpcode()) {
861     case Instruction::BitCast:
862     case Instruction::PtrToInt:
863     case Instruction::IntToPtr:
864     case Instruction::Alloca:
865     case Instruction::PHI:
866       continue;
867     case Instruction::GetElementPtr:
868       if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
869         continue;
870       break;
871     default:
872       break;
873     }
874 
875     if (I.isLifetimeStartOrEnd())
876       continue;
877 
878     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
879       Intrinsic::ID IID = II->getIntrinsicID();
880       SmallVector<Type *, 4> Tys;
881       FastMathFlags FMF;
882       for (Value *Val : II->args())
883         Tys.push_back(Val->getType());
884 
885       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
886         FMF = FPMO->getFastMathFlags();
887 
888       IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
889       InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
890       continue;
891     }
892 
893     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
894       InlineCost += getCallsiteCost(*CI, DL);
895       continue;
896     }
897 
898     if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
899       InlineCost += getCallsiteCost(*II, DL);
900       continue;
901     }
902 
903     if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
904       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
905       continue;
906     }
907     InlineCost += InlineConstants::InstrCost;
908   }
909 
910   return InlineCost;
911 }
912 
913 std::tuple<InstructionCost, InstructionCost>
914 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
915   InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
916   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
917     Function *OutlinedFunc = FuncBBPair.first;
918     BasicBlock* OutliningCallBB = FuncBBPair.second;
919     // Now compute the cost of the call sequence to the outlined function
920     // 'OutlinedFunction' in BB 'OutliningCallBB':
921     auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
922     OutliningFuncCallCost +=
923         computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
924 
925     // Now compute the cost of the extracted/outlined function itself:
926     for (BasicBlock &BB : *OutlinedFunc)
927       OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
928   }
929   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
930          "Outlined function cost should be no less than the outlined region");
931 
932   // The code extractor introduces a new root and exit stub blocks with
933   // additional unconditional branches. Those branches will be eliminated
934   // later with bb layout. The cost should be adjusted accordingly:
935   OutlinedFunctionCost -=
936       2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
937 
938   InstructionCost OutliningRuntimeOverhead =
939       OutliningFuncCallCost +
940       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
941       ExtraOutliningPenalty.getValue();
942 
943   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
944 }
945 
946 // Create the callsite to profile count map which is
947 // used to update the original function's entry count,
948 // after the function is partially inlined into the callsite.
949 void PartialInlinerImpl::computeCallsiteToProfCountMap(
950     Function *DuplicateFunction,
951     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
952   std::vector<User *> Users(DuplicateFunction->user_begin(),
953                             DuplicateFunction->user_end());
954   Function *CurrentCaller = nullptr;
955   std::unique_ptr<BlockFrequencyInfo> TempBFI;
956   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
957 
958   auto ComputeCurrBFI = [&,this](Function *Caller) {
959       // For the old pass manager:
960       if (!GetBFI) {
961         DominatorTree DT(*Caller);
962         LoopInfo LI(DT);
963         BranchProbabilityInfo BPI(*Caller, LI);
964         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
965         CurrentCallerBFI = TempBFI.get();
966       } else {
967         // New pass manager:
968         CurrentCallerBFI = &(GetBFI(*Caller));
969       }
970   };
971 
972   for (User *User : Users) {
973     // Don't bother with BlockAddress used by CallBr for asm goto.
974     if (isa<BlockAddress>(User))
975       continue;
976     CallBase *CB = getSupportedCallBase(User);
977     Function *Caller = CB->getCaller();
978     if (CurrentCaller != Caller) {
979       CurrentCaller = Caller;
980       ComputeCurrBFI(Caller);
981     } else {
982       assert(CurrentCallerBFI && "CallerBFI is not set");
983     }
984     BasicBlock *CallBB = CB->getParent();
985     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
986     if (Count)
987       CallSiteToProfCountMap[User] = *Count;
988     else
989       CallSiteToProfCountMap[User] = 0;
990   }
991 }
992 
993 PartialInlinerImpl::FunctionCloner::FunctionCloner(
994     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
995     function_ref<AssumptionCache *(Function &)> LookupAC,
996     function_ref<TargetTransformInfo &(Function &)> GetTTI)
997     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
998   ClonedOI = std::make_unique<FunctionOutliningInfo>();
999 
1000   // Clone the function, so that we can hack away on it.
1001   ValueToValueMapTy VMap;
1002   ClonedFunc = CloneFunction(F, VMap);
1003 
1004   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
1005   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
1006   for (BasicBlock *BB : OI->Entries)
1007     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
1008 
1009   for (BasicBlock *E : OI->ReturnBlockPreds) {
1010     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
1011     ClonedOI->ReturnBlockPreds.push_back(NewE);
1012   }
1013   // Go ahead and update all uses to the duplicate, so that we can just
1014   // use the inliner functionality when we're done hacking.
1015   F->replaceAllUsesWith(ClonedFunc);
1016 }
1017 
1018 PartialInlinerImpl::FunctionCloner::FunctionCloner(
1019     Function *F, FunctionOutliningMultiRegionInfo *OI,
1020     OptimizationRemarkEmitter &ORE,
1021     function_ref<AssumptionCache *(Function &)> LookupAC,
1022     function_ref<TargetTransformInfo &(Function &)> GetTTI)
1023     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
1024   ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
1025 
1026   // Clone the function, so that we can hack away on it.
1027   ValueToValueMapTy VMap;
1028   ClonedFunc = CloneFunction(F, VMap);
1029 
1030   // Go through all Outline Candidate Regions and update all BasicBlock
1031   // information.
1032   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1033        OI->ORI) {
1034     SmallVector<BasicBlock *, 8> Region;
1035     for (BasicBlock *BB : RegionInfo.Region)
1036       Region.push_back(cast<BasicBlock>(VMap[BB]));
1037 
1038     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1039     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1040     BasicBlock *NewReturnBlock = nullptr;
1041     if (RegionInfo.ReturnBlock)
1042       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1043     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1044         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1045     ClonedOMRI->ORI.push_back(MappedRegionInfo);
1046   }
1047   // Go ahead and update all uses to the duplicate, so that we can just
1048   // use the inliner functionality when we're done hacking.
1049   F->replaceAllUsesWith(ClonedFunc);
1050 }
1051 
1052 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
1053   auto GetFirstPHI = [](BasicBlock *BB) {
1054     BasicBlock::iterator I = BB->begin();
1055     PHINode *FirstPhi = nullptr;
1056     while (I != BB->end()) {
1057       PHINode *Phi = dyn_cast<PHINode>(I);
1058       if (!Phi)
1059         break;
1060       if (!FirstPhi) {
1061         FirstPhi = Phi;
1062         break;
1063       }
1064     }
1065     return FirstPhi;
1066   };
1067 
1068   // Shouldn't need to normalize PHIs if we're not outlining non-early return
1069   // blocks.
1070   if (!ClonedOI)
1071     return;
1072 
1073   // Special hackery is needed with PHI nodes that have inputs from more than
1074   // one extracted block.  For simplicity, just split the PHIs into a two-level
1075   // sequence of PHIs, some of which will go in the extracted region, and some
1076   // of which will go outside.
1077   BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1078   // only split block when necessary:
1079   PHINode *FirstPhi = GetFirstPHI(PreReturn);
1080   unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1081 
1082   if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1083     return;
1084 
1085   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1086     Value *CommonValue = PN->getIncomingValue(0);
1087     if (all_of(PN->incoming_values(),
1088                [&](Value *V) { return V == CommonValue; }))
1089       return CommonValue;
1090     return nullptr;
1091   };
1092 
1093   ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1094       ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1095   BasicBlock::iterator I = PreReturn->begin();
1096   Instruction *Ins = &ClonedOI->ReturnBlock->front();
1097   SmallVector<Instruction *, 4> DeadPhis;
1098   while (I != PreReturn->end()) {
1099     PHINode *OldPhi = dyn_cast<PHINode>(I);
1100     if (!OldPhi)
1101       break;
1102 
1103     PHINode *RetPhi =
1104         PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1105     OldPhi->replaceAllUsesWith(RetPhi);
1106     Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1107 
1108     RetPhi->addIncoming(&*I, PreReturn);
1109     for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1110       RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1111       OldPhi->removeIncomingValue(E);
1112     }
1113 
1114     // After incoming values splitting, the old phi may become trivial.
1115     // Keeping the trivial phi can introduce definition inside the outline
1116     // region which is live-out, causing necessary overhead (load, store
1117     // arg passing etc).
1118     if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1119       OldPhi->replaceAllUsesWith(OldPhiVal);
1120       DeadPhis.push_back(OldPhi);
1121     }
1122     ++I;
1123   }
1124   for (auto *DP : DeadPhis)
1125     DP->eraseFromParent();
1126 
1127   for (auto *E : ClonedOI->ReturnBlockPreds)
1128     E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1129 }
1130 
1131 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1132 
1133   auto ComputeRegionCost =
1134       [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
1135     InstructionCost Cost = 0;
1136     for (BasicBlock* BB : Region)
1137       Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1138     return Cost;
1139   };
1140 
1141   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1142 
1143   if (ClonedOMRI->ORI.empty())
1144     return false;
1145 
1146   // The CodeExtractor needs a dominator tree.
1147   DominatorTree DT;
1148   DT.recalculate(*ClonedFunc);
1149 
1150   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1151   LoopInfo LI(DT);
1152   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1153   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1154 
1155   // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1156   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1157 
1158   SetVector<Value *> Inputs, Outputs, Sinks;
1159   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1160        ClonedOMRI->ORI) {
1161     InstructionCost CurrentOutlinedRegionCost =
1162         ComputeRegionCost(RegionInfo.Region);
1163 
1164     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1165                      ClonedFuncBFI.get(), &BPI,
1166                      LookupAC(*RegionInfo.EntryBlock->getParent()),
1167                      /* AllowVarargs */ false);
1168 
1169     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1170 
1171     LLVM_DEBUG({
1172       dbgs() << "inputs: " << Inputs.size() << "\n";
1173       dbgs() << "outputs: " << Outputs.size() << "\n";
1174       for (Value *value : Inputs)
1175         dbgs() << "value used in func: " << *value << "\n";
1176       for (Value *output : Outputs)
1177         dbgs() << "instr used in func: " << *output << "\n";
1178     });
1179 
1180     // Do not extract regions that have live exit variables.
1181     if (Outputs.size() > 0 && !ForceLiveExit)
1182       continue;
1183 
1184     if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1185       CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1186       BasicBlock *OutliningCallBB = OCS->getParent();
1187       assert(OutliningCallBB->getParent() == ClonedFunc);
1188       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1189       NumColdRegionsOutlined++;
1190       OutlinedRegionCost += CurrentOutlinedRegionCost;
1191 
1192       if (MarkOutlinedColdCC) {
1193         OutlinedFunc->setCallingConv(CallingConv::Cold);
1194         OCS->setCallingConv(CallingConv::Cold);
1195       }
1196     } else
1197       ORE.emit([&]() {
1198         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1199                                         &RegionInfo.Region.front()->front())
1200                << "Failed to extract region at block "
1201                << ore::NV("Block", RegionInfo.Region.front());
1202       });
1203   }
1204 
1205   return !OutlinedFunctions.empty();
1206 }
1207 
1208 Function *
1209 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1210   // Returns true if the block is to be partial inlined into the caller
1211   // (i.e. not to be extracted to the out of line function)
1212   auto ToBeInlined = [&, this](BasicBlock *BB) {
1213     return BB == ClonedOI->ReturnBlock ||
1214            llvm::is_contained(ClonedOI->Entries, BB);
1215   };
1216 
1217   assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1218   // The CodeExtractor needs a dominator tree.
1219   DominatorTree DT;
1220   DT.recalculate(*ClonedFunc);
1221 
1222   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1223   LoopInfo LI(DT);
1224   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1225   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1226 
1227   // Gather up the blocks that we're going to extract.
1228   std::vector<BasicBlock *> ToExtract;
1229   auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1230   ToExtract.push_back(ClonedOI->NonReturnBlock);
1231   OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1232       ClonedOI->NonReturnBlock, ClonedFuncTTI);
1233   for (BasicBlock &BB : *ClonedFunc)
1234     if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1235       ToExtract.push_back(&BB);
1236       // FIXME: the code extractor may hoist/sink more code
1237       // into the outlined function which may make the outlining
1238       // overhead (the difference of the outlined function cost
1239       // and OutliningRegionCost) look larger.
1240       OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
1241     }
1242 
1243   // Extract the body of the if.
1244   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1245   Function *OutlinedFunc =
1246       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1247                     ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1248                     /* AllowVarargs */ true)
1249           .extractCodeRegion(CEAC);
1250 
1251   if (OutlinedFunc) {
1252     BasicBlock *OutliningCallBB =
1253         PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1254     assert(OutliningCallBB->getParent() == ClonedFunc);
1255     OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1256   } else
1257     ORE.emit([&]() {
1258       return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1259                                       &ToExtract.front()->front())
1260              << "Failed to extract region at block "
1261              << ore::NV("Block", ToExtract.front());
1262     });
1263 
1264   return OutlinedFunc;
1265 }
1266 
1267 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1268   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1269   // users (function pointers, etc.) back to the original function.
1270   ClonedFunc->replaceAllUsesWith(OrigFunc);
1271   ClonedFunc->eraseFromParent();
1272   if (!IsFunctionInlined) {
1273     // Remove each function that was speculatively created if there is no
1274     // reference.
1275     for (auto FuncBBPair : OutlinedFunctions) {
1276       Function *Func = FuncBBPair.first;
1277       Func->eraseFromParent();
1278     }
1279   }
1280 }
1281 
1282 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1283   if (F.hasAddressTaken())
1284     return {false, nullptr};
1285 
1286   // Let inliner handle it
1287   if (F.hasFnAttribute(Attribute::AlwaysInline))
1288     return {false, nullptr};
1289 
1290   if (F.hasFnAttribute(Attribute::NoInline))
1291     return {false, nullptr};
1292 
1293   if (PSI.isFunctionEntryCold(&F))
1294     return {false, nullptr};
1295 
1296   if (F.users().empty())
1297     return {false, nullptr};
1298 
1299   OptimizationRemarkEmitter ORE(&F);
1300 
1301   // Only try to outline cold regions if we have a profile summary, which
1302   // implies we have profiling information.
1303   if (PSI.hasProfileSummary() && F.hasProfileData() &&
1304       !DisableMultiRegionPartialInline) {
1305     std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1306         computeOutliningColdRegionsInfo(F, ORE);
1307     if (OMRI) {
1308       FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1309 
1310       LLVM_DEBUG({
1311         dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1312         dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1313                << "\n";
1314       });
1315 
1316       bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1317 
1318       if (DidOutline) {
1319         LLVM_DEBUG({
1320           dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1321           Cloner.ClonedFunc->print(dbgs());
1322           dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1323         });
1324 
1325         if (tryPartialInline(Cloner))
1326           return {true, nullptr};
1327       }
1328     }
1329   }
1330 
1331   // Fall-thru to regular partial inlining if we:
1332   //    i) can't find any cold regions to outline, or
1333   //   ii) can't inline the outlined function anywhere.
1334   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1335   if (!OI)
1336     return {false, nullptr};
1337 
1338   FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1339   Cloner.normalizeReturnBlock();
1340 
1341   Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1342 
1343   if (!OutlinedFunction)
1344     return {false, nullptr};
1345 
1346   if (tryPartialInline(Cloner))
1347     return {true, OutlinedFunction};
1348 
1349   return {false, nullptr};
1350 }
1351 
1352 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1353   if (Cloner.OutlinedFunctions.empty())
1354     return false;
1355 
1356   int SizeCost = 0;
1357   BlockFrequency WeightedRcost;
1358   int NonWeightedRcost;
1359 
1360   auto OutliningCosts = computeOutliningCosts(Cloner);
1361   assert(std::get<0>(OutliningCosts).isValid() &&
1362          std::get<1>(OutliningCosts).isValid() && "Expected valid costs");
1363 
1364   SizeCost = *std::get<0>(OutliningCosts).getValue();
1365   NonWeightedRcost = *std::get<1>(OutliningCosts).getValue();
1366 
1367   // Only calculate RelativeToEntryFreq when we are doing single region
1368   // outlining.
1369   BranchProbability RelativeToEntryFreq;
1370   if (Cloner.ClonedOI)
1371     RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1372   else
1373     // RelativeToEntryFreq doesn't make sense when we have more than one
1374     // outlined call because each call will have a different relative frequency
1375     // to the entry block.  We can consider using the average, but the
1376     // usefulness of that information is questionable. For now, assume we never
1377     // execute the calls to outlined functions.
1378     RelativeToEntryFreq = BranchProbability(0, 1);
1379 
1380   WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1381 
1382   // The call sequence(s) to the outlined function(s) are larger than the sum of
1383   // the original outlined region size(s), it does not increase the chances of
1384   // inlining the function with outlining (The inliner uses the size increase to
1385   // model the cost of inlining a callee).
1386   if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1387     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1388     DebugLoc DLoc;
1389     BasicBlock *Block;
1390     std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
1391     OrigFuncORE.emit([&]() {
1392       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1393                                         DLoc, Block)
1394              << ore::NV("Function", Cloner.OrigFunc)
1395              << " not partially inlined into callers (Original Size = "
1396              << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1397              << ", Size of call sequence to outlined function = "
1398              << ore::NV("NewSize", SizeCost) << ")";
1399     });
1400     return false;
1401   }
1402 
1403   assert(Cloner.OrigFunc->users().empty() &&
1404          "F's users should all be replaced!");
1405 
1406   std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1407                             Cloner.ClonedFunc->user_end());
1408 
1409   DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1410   auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1411   if (CalleeEntryCount)
1412     computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1413 
1414   uint64_t CalleeEntryCountV =
1415       (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1416 
1417   bool AnyInline = false;
1418   for (User *User : Users) {
1419     // Don't bother with BlockAddress used by CallBr for asm goto.
1420     if (isa<BlockAddress>(User))
1421       continue;
1422 
1423     CallBase *CB = getSupportedCallBase(User);
1424 
1425     if (isLimitReached())
1426       continue;
1427 
1428     OptimizationRemarkEmitter CallerORE(CB->getCaller());
1429     if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1430       continue;
1431 
1432     // Construct remark before doing the inlining, as after successful inlining
1433     // the callsite is removed.
1434     OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1435     OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1436        << ore::NV("Caller", CB->getCaller());
1437 
1438     InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
1439     // We can only forward varargs when we outlined a single region, else we
1440     // bail on vararg functions.
1441     if (!InlineFunction(*CB, IFI, nullptr, true,
1442                         (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1443                                          : nullptr))
1444              .isSuccess())
1445       continue;
1446 
1447     CallerORE.emit(OR);
1448 
1449     // Now update the entry count:
1450     if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1451       uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1452       CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1453     }
1454 
1455     AnyInline = true;
1456     NumPartialInlining++;
1457     // Update the stats
1458     if (Cloner.ClonedOI)
1459       NumPartialInlined++;
1460     else
1461       NumColdOutlinePartialInlined++;
1462   }
1463 
1464   if (AnyInline) {
1465     Cloner.IsFunctionInlined = true;
1466     if (CalleeEntryCount)
1467       Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1468           CalleeEntryCountV, CalleeEntryCount->getType()));
1469     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1470     OrigFuncORE.emit([&]() {
1471       return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1472              << "Partially inlined into at least one caller";
1473     });
1474   }
1475 
1476   return AnyInline;
1477 }
1478 
1479 bool PartialInlinerImpl::run(Module &M) {
1480   if (DisablePartialInlining)
1481     return false;
1482 
1483   std::vector<Function *> Worklist;
1484   Worklist.reserve(M.size());
1485   for (Function &F : M)
1486     if (!F.use_empty() && !F.isDeclaration())
1487       Worklist.push_back(&F);
1488 
1489   bool Changed = false;
1490   while (!Worklist.empty()) {
1491     Function *CurrFunc = Worklist.back();
1492     Worklist.pop_back();
1493 
1494     if (CurrFunc->use_empty())
1495       continue;
1496 
1497     bool Recursive = false;
1498     for (User *U : CurrFunc->users())
1499       if (Instruction *I = dyn_cast<Instruction>(U))
1500         if (I->getParent()->getParent() == CurrFunc) {
1501           Recursive = true;
1502           break;
1503         }
1504     if (Recursive)
1505       continue;
1506 
1507     std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1508     if (Result.second)
1509       Worklist.push_back(Result.second);
1510     Changed |= Result.first;
1511   }
1512 
1513   return Changed;
1514 }
1515 
1516 char PartialInlinerLegacyPass::ID = 0;
1517 
1518 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1519                       "Partial Inliner", false, false)
1520 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1521 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
1522 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1523 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
1524 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1525                     "Partial Inliner", false, false)
1526 
1527 ModulePass *llvm::createPartialInliningPass() {
1528   return new PartialInlinerLegacyPass();
1529 }
1530 
1531 PreservedAnalyses PartialInlinerPass::run(Module &M,
1532                                           ModuleAnalysisManager &AM) {
1533   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1534 
1535   auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1536     return FAM.getResult<AssumptionAnalysis>(F);
1537   };
1538 
1539   auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1540     return FAM.getCachedResult<AssumptionAnalysis>(F);
1541   };
1542 
1543   auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1544     return FAM.getResult<BlockFrequencyAnalysis>(F);
1545   };
1546 
1547   auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1548     return FAM.getResult<TargetIRAnalysis>(F);
1549   };
1550 
1551   auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1552     return FAM.getResult<TargetLibraryAnalysis>(F);
1553   };
1554 
1555   ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1556 
1557   if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1558                          GetTLI, PSI, GetBFI)
1559           .run(M))
1560     return PreservedAnalyses::none();
1561   return PreservedAnalyses::all();
1562 }
1563