1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/Analysis/InlineCost.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetLibraryInfo.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/IntrinsicInst.h"
41 #include "llvm/IR/Intrinsics.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/InitializePasses.h"
45 #include "llvm/Pass.h"
46 #include "llvm/Support/BlockFrequency.h"
47 #include "llvm/Support/BranchProbability.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/Utils/Cloning.h"
53 #include "llvm/Transforms/Utils/CodeExtractor.h"
54 #include "llvm/Transforms/Utils/ValueMapper.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <functional>
59 #include <iterator>
60 #include <memory>
61 #include <tuple>
62 #include <vector>
63 
64 using namespace llvm;
65 
66 #define DEBUG_TYPE "partial-inlining"
67 
68 STATISTIC(NumPartialInlined,
69           "Number of callsites functions partially inlined into.");
70 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
71                                         "cold outlined regions were partially "
72                                         "inlined into its caller(s).");
73 STATISTIC(NumColdRegionsFound,
74            "Number of cold single entry/exit regions found.");
75 STATISTIC(NumColdRegionsOutlined,
76            "Number of cold single entry/exit regions outlined.");
77 
78 // Command line option to disable partial-inlining. The default is false:
79 static cl::opt<bool>
80     DisablePartialInlining("disable-partial-inlining", cl::init(false),
81                            cl::Hidden, cl::desc("Disable partial inlining"));
82 // Command line option to disable multi-region partial-inlining. The default is
83 // false:
84 static cl::opt<bool> DisableMultiRegionPartialInline(
85     "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
86     cl::desc("Disable multi-region partial inlining"));
87 
88 // Command line option to force outlining in regions with live exit variables.
89 // The default is false:
90 static cl::opt<bool>
91     ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
92                cl::desc("Force outline regions with live exits"));
93 
94 // Command line option to enable marking outline functions with Cold Calling
95 // Convention. The default is false:
96 static cl::opt<bool>
97     MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
98                        cl::desc("Mark outline function calls with ColdCC"));
99 
100 #ifndef NDEBUG
101 // Command line option to debug partial-inlining. The default is none:
102 static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
103                                           cl::init(false), cl::Hidden,
104                                           cl::desc("Trace partial inlining."));
105 #endif
106 
107 // This is an option used by testing:
108 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
109                                       cl::init(false), cl::ZeroOrMore,
110                                       cl::ReallyHidden,
111                                       cl::desc("Skip Cost Analysis"));
112 // Used to determine if a cold region is worth outlining based on
113 // its inlining cost compared to the original function.  Default is set at 10%.
114 // ie. if the cold region reduces the inlining cost of the original function by
115 // at least 10%.
116 static cl::opt<float> MinRegionSizeRatio(
117     "min-region-size-ratio", cl::init(0.1), cl::Hidden,
118     cl::desc("Minimum ratio comparing relative sizes of each "
119              "outline candidate and original function"));
120 // Used to tune the minimum number of execution counts needed in the predecessor
121 // block to the cold edge. ie. confidence interval.
122 static cl::opt<unsigned>
123     MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
124                              cl::desc("Minimum block executions to consider "
125                                       "its BranchProbabilityInfo valid"));
126 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
127 // if the branch probability is 10% or less, then it is deemed as 'cold'.
128 static cl::opt<float> ColdBranchRatio(
129     "cold-branch-ratio", cl::init(0.1), cl::Hidden,
130     cl::desc("Minimum BranchProbability to consider a region cold."));
131 
132 static cl::opt<unsigned> MaxNumInlineBlocks(
133     "max-num-inline-blocks", cl::init(5), cl::Hidden,
134     cl::desc("Max number of blocks to be partially inlined"));
135 
136 // Command line option to set the maximum number of partial inlining allowed
137 // for the module. The default value of -1 means no limit.
138 static cl::opt<int> MaxNumPartialInlining(
139     "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
140     cl::desc("Max number of partial inlining. The default is unlimited"));
141 
142 // Used only when PGO or user annotated branch data is absent. It is
143 // the least value that is used to weigh the outline region. If BFI
144 // produces larger value, the BFI value will be used.
145 static cl::opt<int>
146     OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
147                              cl::Hidden, cl::ZeroOrMore,
148                              cl::desc("Relative frequency of outline region to "
149                                       "the entry block"));
150 
151 static cl::opt<unsigned> ExtraOutliningPenalty(
152     "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
153     cl::desc("A debug option to add additional penalty to the computed one."));
154 
155 namespace {
156 
157 struct FunctionOutliningInfo {
158   FunctionOutliningInfo() = default;
159 
160   // Returns the number of blocks to be inlined including all blocks
161   // in Entries and one return block.
GetNumInlinedBlocks__anon5c3931f20111::FunctionOutliningInfo162   unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
163 
164   // A set of blocks including the function entry that guard
165   // the region to be outlined.
166   SmallVector<BasicBlock *, 4> Entries;
167 
168   // The return block that is not included in the outlined region.
169   BasicBlock *ReturnBlock = nullptr;
170 
171   // The dominating block of the region to be outlined.
172   BasicBlock *NonReturnBlock = nullptr;
173 
174   // The set of blocks in Entries that that are predecessors to ReturnBlock
175   SmallVector<BasicBlock *, 4> ReturnBlockPreds;
176 };
177 
178 struct FunctionOutliningMultiRegionInfo {
FunctionOutliningMultiRegionInfo__anon5c3931f20111::FunctionOutliningMultiRegionInfo179   FunctionOutliningMultiRegionInfo()
180       : ORI() {}
181 
182   // Container for outline regions
183   struct OutlineRegionInfo {
OutlineRegionInfo__anon5c3931f20111::FunctionOutliningMultiRegionInfo::OutlineRegionInfo184     OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
185                       BasicBlock *EntryBlock, BasicBlock *ExitBlock,
186                       BasicBlock *ReturnBlock)
187         : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
188           ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
189     SmallVector<BasicBlock *, 8> Region;
190     BasicBlock *EntryBlock;
191     BasicBlock *ExitBlock;
192     BasicBlock *ReturnBlock;
193   };
194 
195   SmallVector<OutlineRegionInfo, 4> ORI;
196 };
197 
198 struct PartialInlinerImpl {
199 
PartialInlinerImpl__anon5c3931f20111::PartialInlinerImpl200   PartialInlinerImpl(
201       function_ref<AssumptionCache &(Function &)> GetAC,
202       function_ref<AssumptionCache *(Function &)> LookupAC,
203       function_ref<TargetTransformInfo &(Function &)> GTTI,
204       function_ref<const TargetLibraryInfo &(Function &)> GTLI,
205       ProfileSummaryInfo &ProfSI,
206       function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
207       : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
208         GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
209 
210   bool run(Module &M);
211   // Main part of the transformation that calls helper functions to find
212   // outlining candidates, clone & outline the function, and attempt to
213   // partially inline the resulting function. Returns true if
214   // inlining was successful, false otherwise.  Also returns the outline
215   // function (only if we partially inlined early returns) as there is a
216   // possibility to further "peel" early return statements that were left in the
217   // outline function due to code size.
218   std::pair<bool, Function *> unswitchFunction(Function *F);
219 
220   // This class speculatively clones the function to be partial inlined.
221   // At the end of partial inlining, the remaining callsites to the cloned
222   // function that are not partially inlined will be fixed up to reference
223   // the original function, and the cloned function will be erased.
224   struct FunctionCloner {
225     // Two constructors, one for single region outlining, the other for
226     // multi-region outlining.
227     FunctionCloner(Function *F, FunctionOutliningInfo *OI,
228                    OptimizationRemarkEmitter &ORE,
229                    function_ref<AssumptionCache *(Function &)> LookupAC);
230     FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
231                    OptimizationRemarkEmitter &ORE,
232                    function_ref<AssumptionCache *(Function &)> LookupAC);
233     ~FunctionCloner();
234 
235     // Prepare for function outlining: making sure there is only
236     // one incoming edge from the extracted/outlined region to
237     // the return block.
238     void NormalizeReturnBlock();
239 
240     // Do function outlining for cold regions.
241     bool doMultiRegionFunctionOutlining();
242     // Do function outlining for region after early return block(s).
243     // NOTE: For vararg functions that do the vararg handling in the outlined
244     //       function, we temporarily generate IR that does not properly
245     //       forward varargs to the outlined function. Calling InlineFunction
246     //       will update calls to the outlined functions to properly forward
247     //       the varargs.
248     Function *doSingleRegionFunctionOutlining();
249 
250     Function *OrigFunc = nullptr;
251     Function *ClonedFunc = nullptr;
252 
253     typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
254     // Keep track of Outlined Functions and the basic block they're called from.
255     SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
256 
257     // ClonedFunc is inlined in one of its callers after function
258     // outlining.
259     bool IsFunctionInlined = false;
260     // The cost of the region to be outlined.
261     int OutlinedRegionCost = 0;
262     // ClonedOI is specific to outlining non-early return blocks.
263     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
264     // ClonedOMRI is specific to outlining cold regions.
265     std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
266     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
267     OptimizationRemarkEmitter &ORE;
268     function_ref<AssumptionCache *(Function &)> LookupAC;
269   };
270 
271 private:
272   int NumPartialInlining = 0;
273   function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
274   function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
275   function_ref<TargetTransformInfo &(Function &)> GetTTI;
276   function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
277   function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
278   ProfileSummaryInfo &PSI;
279 
280   // Return the frequency of the OutlininingBB relative to F's entry point.
281   // The result is no larger than 1 and is represented using BP.
282   // (Note that the outlined region's 'head' block can only have incoming
283   // edges from the guarding entry blocks).
284   BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
285 
286   // Return true if the callee of CB should be partially inlined with
287   // profit.
288   bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
289                            BlockFrequency WeightedOutliningRcost,
290                            OptimizationRemarkEmitter &ORE);
291 
292   // Try to inline DuplicateFunction (cloned from F with call to
293   // the OutlinedFunction into its callers. Return true
294   // if there is any successful inlining.
295   bool tryPartialInline(FunctionCloner &Cloner);
296 
297   // Compute the mapping from use site of DuplicationFunction to the enclosing
298   // BB's profile count.
299   void computeCallsiteToProfCountMap(Function *DuplicateFunction,
300                                      DenseMap<User *, uint64_t> &SiteCountMap);
301 
IsLimitReached__anon5c3931f20111::PartialInlinerImpl302   bool IsLimitReached() {
303     return (MaxNumPartialInlining != -1 &&
304             NumPartialInlining >= MaxNumPartialInlining);
305   }
306 
getSupportedCallBase__anon5c3931f20111::PartialInlinerImpl307   static CallBase *getSupportedCallBase(User *U) {
308     if (isa<CallInst>(U) || isa<InvokeInst>(U))
309       return cast<CallBase>(U);
310     llvm_unreachable("All uses must be calls");
311     return nullptr;
312   }
313 
getOneCallSiteTo__anon5c3931f20111::PartialInlinerImpl314   static CallBase *getOneCallSiteTo(Function *F) {
315     User *User = *F->user_begin();
316     return getSupportedCallBase(User);
317   }
318 
getOneDebugLoc__anon5c3931f20111::PartialInlinerImpl319   std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
320     CallBase *CB = getOneCallSiteTo(F);
321     DebugLoc DLoc = CB->getDebugLoc();
322     BasicBlock *Block = CB->getParent();
323     return std::make_tuple(DLoc, Block);
324   }
325 
326   // Returns the costs associated with function outlining:
327   // - The first value is the non-weighted runtime cost for making the call
328   //   to the outlined function, including the addtional  setup cost in the
329   //    outlined function itself;
330   // - The second value is the estimated size of the new call sequence in
331   //   basic block Cloner.OutliningCallBB;
332   std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
333 
334   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
335   // approximate both the size and runtime cost (Note that in the current
336   // inline cost analysis, there is no clear distinction there either).
337   static int computeBBInlineCost(BasicBlock *BB);
338 
339   std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
340   std::unique_ptr<FunctionOutliningMultiRegionInfo>
341   computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
342 };
343 
344 struct PartialInlinerLegacyPass : public ModulePass {
345   static char ID; // Pass identification, replacement for typeid
346 
PartialInlinerLegacyPass__anon5c3931f20111::PartialInlinerLegacyPass347   PartialInlinerLegacyPass() : ModulePass(ID) {
348     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
349   }
350 
getAnalysisUsage__anon5c3931f20111::PartialInlinerLegacyPass351   void getAnalysisUsage(AnalysisUsage &AU) const override {
352     AU.addRequired<AssumptionCacheTracker>();
353     AU.addRequired<ProfileSummaryInfoWrapperPass>();
354     AU.addRequired<TargetTransformInfoWrapperPass>();
355     AU.addRequired<TargetLibraryInfoWrapperPass>();
356   }
357 
runOnModule__anon5c3931f20111::PartialInlinerLegacyPass358   bool runOnModule(Module &M) override {
359     if (skipModule(M))
360       return false;
361 
362     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
363     TargetTransformInfoWrapperPass *TTIWP =
364         &getAnalysis<TargetTransformInfoWrapperPass>();
365     ProfileSummaryInfo &PSI =
366         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
367 
368     auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
369       return ACT->getAssumptionCache(F);
370     };
371 
372     auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
373       return ACT->lookupAssumptionCache(F);
374     };
375 
376     auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
377       return TTIWP->getTTI(F);
378     };
379 
380     auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
381       return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
382     };
383 
384     return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
385                               GetTLI, PSI)
386         .run(M);
387   }
388 };
389 
390 } // end anonymous namespace
391 
392 std::unique_ptr<FunctionOutliningMultiRegionInfo>
computeOutliningColdRegionsInfo(Function * F,OptimizationRemarkEmitter & ORE)393 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
394                                                     OptimizationRemarkEmitter &ORE) {
395   BasicBlock *EntryBlock = &F->front();
396 
397   DominatorTree DT(*F);
398   LoopInfo LI(DT);
399   BranchProbabilityInfo BPI(*F, LI);
400   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
401   BlockFrequencyInfo *BFI;
402   if (!GetBFI) {
403     ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
404     BFI = ScopedBFI.get();
405   } else
406     BFI = &(GetBFI(*F));
407 
408   // Return if we don't have profiling information.
409   if (!PSI.hasInstrumentationProfile())
410     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
411 
412   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
413       std::make_unique<FunctionOutliningMultiRegionInfo>();
414 
415   auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
416     BasicBlock *Dom = BlockList.front();
417     return BlockList.size() > 1 && Dom->hasNPredecessors(1);
418   };
419 
420   auto IsSingleExit =
421       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
422     BasicBlock *ExitBlock = nullptr;
423     for (auto *Block : BlockList) {
424       for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
425         if (!is_contained(BlockList, *SI)) {
426           if (ExitBlock) {
427             ORE.emit([&]() {
428               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
429                                               &SI->front())
430                      << "Region dominated by "
431                      << ore::NV("Block", BlockList.front()->getName())
432                      << " has more than one region exit edge.";
433             });
434             return nullptr;
435           } else
436             ExitBlock = Block;
437         }
438       }
439     }
440     return ExitBlock;
441   };
442 
443   auto BBProfileCount = [BFI](BasicBlock *BB) {
444     return BFI->getBlockProfileCount(BB)
445                ? BFI->getBlockProfileCount(BB).getValue()
446                : 0;
447   };
448 
449   // Use the same computeBBInlineCost function to compute the cost savings of
450   // the outlining the candidate region.
451   int OverallFunctionCost = 0;
452   for (auto &BB : *F)
453     OverallFunctionCost += computeBBInlineCost(&BB);
454 
455 #ifndef NDEBUG
456   if (TracePartialInlining)
457     dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
458 #endif
459   int MinOutlineRegionCost =
460       static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
461   BranchProbability MinBranchProbability(
462       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
463       MinBlockCounterExecution);
464   bool ColdCandidateFound = false;
465   BasicBlock *CurrEntry = EntryBlock;
466   std::vector<BasicBlock *> DFS;
467   DenseMap<BasicBlock *, bool> VisitedMap;
468   DFS.push_back(CurrEntry);
469   VisitedMap[CurrEntry] = true;
470   // Use Depth First Search on the basic blocks to find CFG edges that are
471   // considered cold.
472   // Cold regions considered must also have its inline cost compared to the
473   // overall inline cost of the original function.  The region is outlined only
474   // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
475   // more.
476   while (!DFS.empty()) {
477     auto *thisBB = DFS.back();
478     DFS.pop_back();
479     // Only consider regions with predecessor blocks that are considered
480     // not-cold (default: part of the top 99.99% of all block counters)
481     // AND greater than our minimum block execution count (default: 100).
482     if (PSI.isColdBlock(thisBB, BFI) ||
483         BBProfileCount(thisBB) < MinBlockCounterExecution)
484       continue;
485     for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
486       if (VisitedMap[*SI])
487         continue;
488       VisitedMap[*SI] = true;
489       DFS.push_back(*SI);
490       // If branch isn't cold, we skip to the next one.
491       BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
492       if (SuccProb > MinBranchProbability)
493         continue;
494 #ifndef NDEBUG
495       if (TracePartialInlining) {
496         dbgs() << "Found cold edge: " << thisBB->getName() << "->"
497                << (*SI)->getName() << "\nBranch Probability = " << SuccProb
498                << "\n";
499       }
500 #endif
501       SmallVector<BasicBlock *, 8> DominateVector;
502       DT.getDescendants(*SI, DominateVector);
503       // We can only outline single entry regions (for now).
504       if (!IsSingleEntry(DominateVector))
505         continue;
506       BasicBlock *ExitBlock = nullptr;
507       // We can only outline single exit regions (for now).
508       if (!(ExitBlock = IsSingleExit(DominateVector)))
509         continue;
510       int OutlineRegionCost = 0;
511       for (auto *BB : DominateVector)
512         OutlineRegionCost += computeBBInlineCost(BB);
513 
514 #ifndef NDEBUG
515       if (TracePartialInlining)
516         dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
517 #endif
518 
519       if (OutlineRegionCost < MinOutlineRegionCost) {
520         ORE.emit([&]() {
521           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
522                                             &SI->front())
523                  << ore::NV("Callee", F) << " inline cost-savings smaller than "
524                  << ore::NV("Cost", MinOutlineRegionCost);
525         });
526         continue;
527       }
528       // For now, ignore blocks that belong to a SISE region that is a
529       // candidate for outlining.  In the future, we may want to look
530       // at inner regions because the outer region may have live-exit
531       // variables.
532       for (auto *BB : DominateVector)
533         VisitedMap[BB] = true;
534       // ReturnBlock here means the block after the outline call
535       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
536       // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
537       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
538           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
539       OutliningInfo->ORI.push_back(RegInfo);
540 #ifndef NDEBUG
541       if (TracePartialInlining) {
542         dbgs() << "Found Cold Candidate starting at block: "
543                << DominateVector.front()->getName() << "\n";
544       }
545 #endif
546       ColdCandidateFound = true;
547       NumColdRegionsFound++;
548     }
549   }
550   if (ColdCandidateFound)
551     return OutliningInfo;
552   else
553     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
554 }
555 
556 std::unique_ptr<FunctionOutliningInfo>
computeOutliningInfo(Function * F)557 PartialInlinerImpl::computeOutliningInfo(Function *F) {
558   BasicBlock *EntryBlock = &F->front();
559   BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
560   if (!BR || BR->isUnconditional())
561     return std::unique_ptr<FunctionOutliningInfo>();
562 
563   // Returns true if Succ is BB's successor
564   auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
565     return is_contained(successors(BB), Succ);
566   };
567 
568   auto IsReturnBlock = [](BasicBlock *BB) {
569     Instruction *TI = BB->getTerminator();
570     return isa<ReturnInst>(TI);
571   };
572 
573   auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
574     if (IsReturnBlock(Succ1))
575       return std::make_tuple(Succ1, Succ2);
576     if (IsReturnBlock(Succ2))
577       return std::make_tuple(Succ2, Succ1);
578 
579     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
580   };
581 
582   // Detect a triangular shape:
583   auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
584     if (IsSuccessor(Succ1, Succ2))
585       return std::make_tuple(Succ1, Succ2);
586     if (IsSuccessor(Succ2, Succ1))
587       return std::make_tuple(Succ2, Succ1);
588 
589     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
590   };
591 
592   std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
593       std::make_unique<FunctionOutliningInfo>();
594 
595   BasicBlock *CurrEntry = EntryBlock;
596   bool CandidateFound = false;
597   do {
598     // The number of blocks to be inlined has already reached
599     // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
600     // disables partial inlining for the function.
601     if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
602       break;
603 
604     if (succ_size(CurrEntry) != 2)
605       break;
606 
607     BasicBlock *Succ1 = *succ_begin(CurrEntry);
608     BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
609 
610     BasicBlock *ReturnBlock, *NonReturnBlock;
611     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
612 
613     if (ReturnBlock) {
614       OutliningInfo->Entries.push_back(CurrEntry);
615       OutliningInfo->ReturnBlock = ReturnBlock;
616       OutliningInfo->NonReturnBlock = NonReturnBlock;
617       CandidateFound = true;
618       break;
619     }
620 
621     BasicBlock *CommSucc;
622     BasicBlock *OtherSucc;
623     std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
624 
625     if (!CommSucc)
626       break;
627 
628     OutliningInfo->Entries.push_back(CurrEntry);
629     CurrEntry = OtherSucc;
630   } while (true);
631 
632   if (!CandidateFound)
633     return std::unique_ptr<FunctionOutliningInfo>();
634 
635   // Do sanity check of the entries: threre should not
636   // be any successors (not in the entry set) other than
637   // {ReturnBlock, NonReturnBlock}
638   assert(OutliningInfo->Entries[0] == &F->front() &&
639          "Function Entry must be the first in Entries vector");
640   DenseSet<BasicBlock *> Entries;
641   for (BasicBlock *E : OutliningInfo->Entries)
642     Entries.insert(E);
643 
644   // Returns true of BB has Predecessor which is not
645   // in Entries set.
646   auto HasNonEntryPred = [Entries](BasicBlock *BB) {
647     for (auto Pred : predecessors(BB)) {
648       if (!Entries.count(Pred))
649         return true;
650     }
651     return false;
652   };
653   auto CheckAndNormalizeCandidate =
654       [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
655         for (BasicBlock *E : OutliningInfo->Entries) {
656           for (auto Succ : successors(E)) {
657             if (Entries.count(Succ))
658               continue;
659             if (Succ == OutliningInfo->ReturnBlock)
660               OutliningInfo->ReturnBlockPreds.push_back(E);
661             else if (Succ != OutliningInfo->NonReturnBlock)
662               return false;
663           }
664           // There should not be any outside incoming edges either:
665           if (HasNonEntryPred(E))
666             return false;
667         }
668         return true;
669       };
670 
671   if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
672     return std::unique_ptr<FunctionOutliningInfo>();
673 
674   // Now further growing the candidate's inlining region by
675   // peeling off dominating blocks from the outlining region:
676   while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
677     BasicBlock *Cand = OutliningInfo->NonReturnBlock;
678     if (succ_size(Cand) != 2)
679       break;
680 
681     if (HasNonEntryPred(Cand))
682       break;
683 
684     BasicBlock *Succ1 = *succ_begin(Cand);
685     BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
686 
687     BasicBlock *ReturnBlock, *NonReturnBlock;
688     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
689     if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
690       break;
691 
692     if (NonReturnBlock->getSinglePredecessor() != Cand)
693       break;
694 
695     // Now grow and update OutlininigInfo:
696     OutliningInfo->Entries.push_back(Cand);
697     OutliningInfo->NonReturnBlock = NonReturnBlock;
698     OutliningInfo->ReturnBlockPreds.push_back(Cand);
699     Entries.insert(Cand);
700   }
701 
702   return OutliningInfo;
703 }
704 
705 // Check if there is PGO data or user annotated branch data:
hasProfileData(Function * F,FunctionOutliningInfo * OI)706 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
707   if (F->hasProfileData())
708     return true;
709   // Now check if any of the entry block has MD_prof data:
710   for (auto *E : OI->Entries) {
711     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
712     if (!BR || BR->isUnconditional())
713       continue;
714     uint64_t T, F;
715     if (BR->extractProfMetadata(T, F))
716       return true;
717   }
718   return false;
719 }
720 
721 BranchProbability
getOutliningCallBBRelativeFreq(FunctionCloner & Cloner)722 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
723   BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
724   auto EntryFreq =
725       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
726   auto OutliningCallFreq =
727       Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
728   // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
729   // we outlined any regions, so we may encounter situations where the
730   // OutliningCallFreq is *slightly* bigger than the EntryFreq.
731   if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
732     OutliningCallFreq = EntryFreq;
733   }
734   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
735       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
736 
737   if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
738     return OutlineRegionRelFreq;
739 
740   // When profile data is not available, we need to be conservative in
741   // estimating the overall savings. Static branch prediction can usually
742   // guess the branch direction right (taken/non-taken), but the guessed
743   // branch probability is usually not biased enough. In case when the
744   // outlined region is predicted to be likely, its probability needs
745   // to be made higher (more biased) to not under-estimate the cost of
746   // function outlining. On the other hand, if the outlined region
747   // is predicted to be less likely, the predicted probablity is usually
748   // higher than the actual. For instance, the actual probability of the
749   // less likely target is only 5%, but the guessed probablity can be
750   // 40%. In the latter case, there is no need for further adjustement.
751   // FIXME: add an option for this.
752   if (OutlineRegionRelFreq < BranchProbability(45, 100))
753     return OutlineRegionRelFreq;
754 
755   OutlineRegionRelFreq = std::max(
756       OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
757 
758   return OutlineRegionRelFreq;
759 }
760 
shouldPartialInline(CallBase & CB,FunctionCloner & Cloner,BlockFrequency WeightedOutliningRcost,OptimizationRemarkEmitter & ORE)761 bool PartialInlinerImpl::shouldPartialInline(
762     CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
763     OptimizationRemarkEmitter &ORE) {
764   using namespace ore;
765 
766   Function *Callee = CB.getCalledFunction();
767   assert(Callee == Cloner.ClonedFunc);
768 
769   if (SkipCostAnalysis)
770     return isInlineViable(*Callee).isSuccess();
771 
772   Function *Caller = CB.getCaller();
773   auto &CalleeTTI = GetTTI(*Callee);
774   bool RemarksEnabled =
775       Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
776           DEBUG_TYPE);
777   InlineCost IC =
778       getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
779                     GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
780 
781   if (IC.isAlways()) {
782     ORE.emit([&]() {
783       return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
784              << NV("Callee", Cloner.OrigFunc)
785              << " should always be fully inlined, not partially";
786     });
787     return false;
788   }
789 
790   if (IC.isNever()) {
791     ORE.emit([&]() {
792       return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
793              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
794              << NV("Caller", Caller)
795              << " because it should never be inlined (cost=never)";
796     });
797     return false;
798   }
799 
800   if (!IC) {
801     ORE.emit([&]() {
802       return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
803              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
804              << NV("Caller", Caller) << " because too costly to inline (cost="
805              << NV("Cost", IC.getCost()) << ", threshold="
806              << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
807     });
808     return false;
809   }
810   const DataLayout &DL = Caller->getParent()->getDataLayout();
811 
812   // The savings of eliminating the call:
813   int NonWeightedSavings = getCallsiteCost(CB, DL);
814   BlockFrequency NormWeightedSavings(NonWeightedSavings);
815 
816   // Weighted saving is smaller than weighted cost, return false
817   if (NormWeightedSavings < WeightedOutliningRcost) {
818     ORE.emit([&]() {
819       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
820                                         &CB)
821              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
822              << NV("Caller", Caller) << " runtime overhead (overhead="
823              << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
824              << ", savings="
825              << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
826              << ")"
827              << " of making the outlined call is too high";
828     });
829 
830     return false;
831   }
832 
833   ORE.emit([&]() {
834     return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
835            << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
836            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
837            << " (threshold="
838            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
839   });
840   return true;
841 }
842 
843 // TODO: Ideally  we should share Inliner's InlineCost Analysis code.
844 // For now use a simplified version. The returned 'InlineCost' will be used
845 // to esimate the size cost as well as runtime cost of the BB.
computeBBInlineCost(BasicBlock * BB)846 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
847   int InlineCost = 0;
848   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
849   for (Instruction &I : BB->instructionsWithoutDebug()) {
850     // Skip free instructions.
851     switch (I.getOpcode()) {
852     case Instruction::BitCast:
853     case Instruction::PtrToInt:
854     case Instruction::IntToPtr:
855     case Instruction::Alloca:
856     case Instruction::PHI:
857       continue;
858     case Instruction::GetElementPtr:
859       if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
860         continue;
861       break;
862     default:
863       break;
864     }
865 
866     if (I.isLifetimeStartOrEnd())
867       continue;
868 
869     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
870       InlineCost += getCallsiteCost(*CI, DL);
871       continue;
872     }
873 
874     if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
875       InlineCost += getCallsiteCost(*II, DL);
876       continue;
877     }
878 
879     if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
880       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
881       continue;
882     }
883     InlineCost += InlineConstants::InstrCost;
884   }
885   return InlineCost;
886 }
887 
888 std::tuple<int, int>
computeOutliningCosts(FunctionCloner & Cloner)889 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
890   int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
891   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
892     Function *OutlinedFunc = FuncBBPair.first;
893     BasicBlock* OutliningCallBB = FuncBBPair.second;
894     // Now compute the cost of the call sequence to the outlined function
895     // 'OutlinedFunction' in BB 'OutliningCallBB':
896     OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
897 
898     // Now compute the cost of the extracted/outlined function itself:
899     for (BasicBlock &BB : *OutlinedFunc)
900       OutlinedFunctionCost += computeBBInlineCost(&BB);
901   }
902   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
903          "Outlined function cost should be no less than the outlined region");
904 
905   // The code extractor introduces a new root and exit stub blocks with
906   // additional unconditional branches. Those branches will be eliminated
907   // later with bb layout. The cost should be adjusted accordingly:
908   OutlinedFunctionCost -=
909       2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
910 
911   int OutliningRuntimeOverhead =
912       OutliningFuncCallCost +
913       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
914       ExtraOutliningPenalty;
915 
916   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
917 }
918 
919 // Create the callsite to profile count map which is
920 // used to update the original function's entry count,
921 // after the function is partially inlined into the callsite.
computeCallsiteToProfCountMap(Function * DuplicateFunction,DenseMap<User *,uint64_t> & CallSiteToProfCountMap)922 void PartialInlinerImpl::computeCallsiteToProfCountMap(
923     Function *DuplicateFunction,
924     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
925   std::vector<User *> Users(DuplicateFunction->user_begin(),
926                             DuplicateFunction->user_end());
927   Function *CurrentCaller = nullptr;
928   std::unique_ptr<BlockFrequencyInfo> TempBFI;
929   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
930 
931   auto ComputeCurrBFI = [&,this](Function *Caller) {
932       // For the old pass manager:
933       if (!GetBFI) {
934         DominatorTree DT(*Caller);
935         LoopInfo LI(DT);
936         BranchProbabilityInfo BPI(*Caller, LI);
937         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
938         CurrentCallerBFI = TempBFI.get();
939       } else {
940         // New pass manager:
941         CurrentCallerBFI = &(GetBFI(*Caller));
942       }
943   };
944 
945   for (User *User : Users) {
946     CallBase *CB = getSupportedCallBase(User);
947     Function *Caller = CB->getCaller();
948     if (CurrentCaller != Caller) {
949       CurrentCaller = Caller;
950       ComputeCurrBFI(Caller);
951     } else {
952       assert(CurrentCallerBFI && "CallerBFI is not set");
953     }
954     BasicBlock *CallBB = CB->getParent();
955     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
956     if (Count)
957       CallSiteToProfCountMap[User] = *Count;
958     else
959       CallSiteToProfCountMap[User] = 0;
960   }
961 }
962 
FunctionCloner(Function * F,FunctionOutliningInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC)963 PartialInlinerImpl::FunctionCloner::FunctionCloner(
964     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
965     function_ref<AssumptionCache *(Function &)> LookupAC)
966     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
967   ClonedOI = std::make_unique<FunctionOutliningInfo>();
968 
969   // Clone the function, so that we can hack away on it.
970   ValueToValueMapTy VMap;
971   ClonedFunc = CloneFunction(F, VMap);
972 
973   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
974   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
975   for (BasicBlock *BB : OI->Entries) {
976     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
977   }
978   for (BasicBlock *E : OI->ReturnBlockPreds) {
979     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
980     ClonedOI->ReturnBlockPreds.push_back(NewE);
981   }
982   // Go ahead and update all uses to the duplicate, so that we can just
983   // use the inliner functionality when we're done hacking.
984   F->replaceAllUsesWith(ClonedFunc);
985 }
986 
FunctionCloner(Function * F,FunctionOutliningMultiRegionInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC)987 PartialInlinerImpl::FunctionCloner::FunctionCloner(
988     Function *F, FunctionOutliningMultiRegionInfo *OI,
989     OptimizationRemarkEmitter &ORE,
990     function_ref<AssumptionCache *(Function &)> LookupAC)
991     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
992   ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
993 
994   // Clone the function, so that we can hack away on it.
995   ValueToValueMapTy VMap;
996   ClonedFunc = CloneFunction(F, VMap);
997 
998   // Go through all Outline Candidate Regions and update all BasicBlock
999   // information.
1000   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1001        OI->ORI) {
1002     SmallVector<BasicBlock *, 8> Region;
1003     for (BasicBlock *BB : RegionInfo.Region) {
1004       Region.push_back(cast<BasicBlock>(VMap[BB]));
1005     }
1006     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1007     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1008     BasicBlock *NewReturnBlock = nullptr;
1009     if (RegionInfo.ReturnBlock)
1010       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1011     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1012         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1013     ClonedOMRI->ORI.push_back(MappedRegionInfo);
1014   }
1015   // Go ahead and update all uses to the duplicate, so that we can just
1016   // use the inliner functionality when we're done hacking.
1017   F->replaceAllUsesWith(ClonedFunc);
1018 }
1019 
/// Splits the PHI nodes of the cloned return block into a two-level sequence
/// when they merge values from more than one block besides the recorded
/// ReturnBlockPreds. After the split, ClonedOI->ReturnBlock refers to the new
/// block holding the outer PHIs, and the terminators of ReturnBlockPreds are
/// redirected to it. Does nothing for multi-region outlining (no ClonedOI).
void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  // Returns the first instruction of BB if it is a PHI node, nullptr
  // otherwise. The loop leaves on its first iteration in either case.
  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block.  For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // only split block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  // No PHIs, or at most one incoming edge besides those from the recorded
  // return-block predecessors: nothing to split.
  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  // Returns the common incoming value if every incoming value of PN is the
  // same, nullptr otherwise.
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  // Split after the PHIs: PreReturn keeps the PHIs, the new block (now
  // ClonedOI->ReturnBlock) receives the remaining instructions.
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  // Insertion point for the new outer PHIs in the new return block.
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    // Redirect existing users first; OldPhi is added as an operand of RetPhi
    // only afterwards, so that operand is not rewritten to RetPhi itself.
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    // Move the incoming values coming from the return-block predecessors
    // from the inner (old) PHI to the outer (new) PHI.
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce definition inside the outline
    // region which is live-out, causing unnecessary overhead (load, store
    // arg passing etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      // Erasure is deferred so the iterator I stays valid.
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  // The recorded predecessors now branch to the new return block directly.
  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}
1099 
doMultiRegionFunctionOutlining()1100 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1101 
1102   auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
1103     int Cost = 0;
1104     for (BasicBlock* BB : Region)
1105       Cost += computeBBInlineCost(BB);
1106     return Cost;
1107   };
1108 
1109   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1110 
1111   if (ClonedOMRI->ORI.empty())
1112     return false;
1113 
1114   // The CodeExtractor needs a dominator tree.
1115   DominatorTree DT;
1116   DT.recalculate(*ClonedFunc);
1117 
1118   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1119   LoopInfo LI(DT);
1120   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1121   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1122 
1123   // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1124   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1125 
1126   SetVector<Value *> Inputs, Outputs, Sinks;
1127   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1128        ClonedOMRI->ORI) {
1129     int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1130 
1131     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1132                      ClonedFuncBFI.get(), &BPI,
1133                      LookupAC(*RegionInfo.EntryBlock->getParent()),
1134                      /* AllowVarargs */ false);
1135 
1136     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1137 
1138 #ifndef NDEBUG
1139     if (TracePartialInlining) {
1140       dbgs() << "inputs: " << Inputs.size() << "\n";
1141       dbgs() << "outputs: " << Outputs.size() << "\n";
1142       for (Value *value : Inputs)
1143         dbgs() << "value used in func: " << *value << "\n";
1144       for (Value *output : Outputs)
1145         dbgs() << "instr used in func: " << *output << "\n";
1146     }
1147 #endif
1148     // Do not extract regions that have live exit variables.
1149     if (Outputs.size() > 0 && !ForceLiveExit)
1150       continue;
1151 
1152     Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
1153 
1154     if (OutlinedFunc) {
1155       CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
1156       BasicBlock *OutliningCallBB = OCS->getParent();
1157       assert(OutliningCallBB->getParent() == ClonedFunc);
1158       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1159       NumColdRegionsOutlined++;
1160       OutlinedRegionCost += CurrentOutlinedRegionCost;
1161 
1162       if (MarkOutlinedColdCC) {
1163         OutlinedFunc->setCallingConv(CallingConv::Cold);
1164         OCS->setCallingConv(CallingConv::Cold);
1165       }
1166     } else
1167       ORE.emit([&]() {
1168         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1169                                         &RegionInfo.Region.front()->front())
1170                << "Failed to extract region at block "
1171                << ore::NV("Block", RegionInfo.Region.front());
1172       });
1173   }
1174 
1175   return !OutlinedFunctions.empty();
1176 }
1177 
1178 Function *
doSingleRegionFunctionOutlining()1179 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1180   // Returns true if the block is to be partial inlined into the caller
1181   // (i.e. not to be extracted to the out of line function)
1182   auto ToBeInlined = [&, this](BasicBlock *BB) {
1183     return BB == ClonedOI->ReturnBlock ||
1184            (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
1185             ClonedOI->Entries.end());
1186   };
1187 
1188   assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1189   // The CodeExtractor needs a dominator tree.
1190   DominatorTree DT;
1191   DT.recalculate(*ClonedFunc);
1192 
1193   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1194   LoopInfo LI(DT);
1195   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1196   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1197 
1198   // Gather up the blocks that we're going to extract.
1199   std::vector<BasicBlock *> ToExtract;
1200   ToExtract.push_back(ClonedOI->NonReturnBlock);
1201   OutlinedRegionCost +=
1202       PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
1203   for (BasicBlock &BB : *ClonedFunc)
1204     if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1205       ToExtract.push_back(&BB);
1206       // FIXME: the code extractor may hoist/sink more code
1207       // into the outlined function which may make the outlining
1208       // overhead (the difference of the outlined function cost
1209       // and OutliningRegionCost) look larger.
1210       OutlinedRegionCost += computeBBInlineCost(&BB);
1211     }
1212 
1213   // Extract the body of the if.
1214   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1215   Function *OutlinedFunc =
1216       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1217                     ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1218                     /* AllowVarargs */ true)
1219           .extractCodeRegion(CEAC);
1220 
1221   if (OutlinedFunc) {
1222     BasicBlock *OutliningCallBB =
1223         PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
1224             ->getParent();
1225     assert(OutliningCallBB->getParent() == ClonedFunc);
1226     OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1227   } else
1228     ORE.emit([&]() {
1229       return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1230                                       &ToExtract.front()->front())
1231              << "Failed to extract region at block "
1232              << ore::NV("Block", ToExtract.front());
1233     });
1234 
1235   return OutlinedFunc;
1236 }
1237 
~FunctionCloner()1238 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1239   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1240   // users (function pointers, etc.) back to the original function.
1241   ClonedFunc->replaceAllUsesWith(OrigFunc);
1242   ClonedFunc->eraseFromParent();
1243   if (!IsFunctionInlined) {
1244     // Remove each function that was speculatively created if there is no
1245     // reference.
1246     for (auto FuncBBPair : OutlinedFunctions) {
1247       Function *Func = FuncBBPair.first;
1248       Func->eraseFromParent();
1249     }
1250   }
1251 }
1252 
unswitchFunction(Function * F)1253 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
1254 
1255   if (F->hasAddressTaken())
1256     return {false, nullptr};
1257 
1258   // Let inliner handle it
1259   if (F->hasFnAttribute(Attribute::AlwaysInline))
1260     return {false, nullptr};
1261 
1262   if (F->hasFnAttribute(Attribute::NoInline))
1263     return {false, nullptr};
1264 
1265   if (PSI.isFunctionEntryCold(F))
1266     return {false, nullptr};
1267 
1268   if (F->users().empty())
1269     return {false, nullptr};
1270 
1271   OptimizationRemarkEmitter ORE(F);
1272 
1273   // Only try to outline cold regions if we have a profile summary, which
1274   // implies we have profiling information.
1275   if (PSI.hasProfileSummary() && F->hasProfileData() &&
1276       !DisableMultiRegionPartialInline) {
1277     std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1278         computeOutliningColdRegionsInfo(F, ORE);
1279     if (OMRI) {
1280       FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);
1281 
1282 #ifndef NDEBUG
1283       if (TracePartialInlining) {
1284         dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1285         dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1286                << "\n";
1287       }
1288 #endif
1289       bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1290 
1291       if (DidOutline) {
1292 #ifndef NDEBUG
1293         if (TracePartialInlining) {
1294           dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1295           Cloner.ClonedFunc->print(dbgs());
1296           dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1297         }
1298 #endif
1299 
1300         if (tryPartialInline(Cloner))
1301           return {true, nullptr};
1302       }
1303     }
1304   }
1305 
1306   // Fall-thru to regular partial inlining if we:
1307   //    i) can't find any cold regions to outline, or
1308   //   ii) can't inline the outlined function anywhere.
1309   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1310   if (!OI)
1311     return {false, nullptr};
1312 
1313   FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
1314   Cloner.NormalizeReturnBlock();
1315 
1316   Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1317 
1318   if (!OutlinedFunction)
1319     return {false, nullptr};
1320 
1321   bool AnyInline = tryPartialInline(Cloner);
1322 
1323   if (AnyInline)
1324     return {true, OutlinedFunction};
1325 
1326   return {false, nullptr};
1327 }
1328 
tryPartialInline(FunctionCloner & Cloner)1329 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1330   if (Cloner.OutlinedFunctions.empty())
1331     return false;
1332 
1333   int SizeCost = 0;
1334   BlockFrequency WeightedRcost;
1335   int NonWeightedRcost;
1336   std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
1337 
1338   // Only calculate RelativeToEntryFreq when we are doing single region
1339   // outlining.
1340   BranchProbability RelativeToEntryFreq;
1341   if (Cloner.ClonedOI) {
1342     RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1343   } else
1344     // RelativeToEntryFreq doesn't make sense when we have more than one
1345     // outlined call because each call will have a different relative frequency
1346     // to the entry block.  We can consider using the average, but the
1347     // usefulness of that information is questionable. For now, assume we never
1348     // execute the calls to outlined functions.
1349     RelativeToEntryFreq = BranchProbability(0, 1);
1350 
1351   WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1352 
1353   // The call sequence(s) to the outlined function(s) are larger than the sum of
1354   // the original outlined region size(s), it does not increase the chances of
1355   // inlining the function with outlining (The inliner uses the size increase to
1356   // model the cost of inlining a callee).
1357   if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1358     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1359     DebugLoc DLoc;
1360     BasicBlock *Block;
1361     std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
1362     OrigFuncORE.emit([&]() {
1363       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1364                                         DLoc, Block)
1365              << ore::NV("Function", Cloner.OrigFunc)
1366              << " not partially inlined into callers (Original Size = "
1367              << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1368              << ", Size of call sequence to outlined function = "
1369              << ore::NV("NewSize", SizeCost) << ")";
1370     });
1371     return false;
1372   }
1373 
1374   assert(Cloner.OrigFunc->users().empty() &&
1375          "F's users should all be replaced!");
1376 
1377   std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1378                             Cloner.ClonedFunc->user_end());
1379 
1380   DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1381   auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1382   if (CalleeEntryCount)
1383     computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1384 
1385   uint64_t CalleeEntryCountV =
1386       (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
1387 
1388   bool AnyInline = false;
1389   for (User *User : Users) {
1390     CallBase *CB = getSupportedCallBase(User);
1391 
1392     if (IsLimitReached())
1393       continue;
1394 
1395     OptimizationRemarkEmitter CallerORE(CB->getCaller());
1396     if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1397       continue;
1398 
1399     // Construct remark before doing the inlining, as after successful inlining
1400     // the callsite is removed.
1401     OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1402     OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1403        << ore::NV("Caller", CB->getCaller());
1404 
1405     InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
1406     // We can only forward varargs when we outlined a single region, else we
1407     // bail on vararg functions.
1408     if (!InlineFunction(*CB, IFI, nullptr, true,
1409                         (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1410                                          : nullptr))
1411              .isSuccess())
1412       continue;
1413 
1414     CallerORE.emit(OR);
1415 
1416     // Now update the entry count:
1417     if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1418       uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1419       CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1420     }
1421 
1422     AnyInline = true;
1423     NumPartialInlining++;
1424     // Update the stats
1425     if (Cloner.ClonedOI)
1426       NumPartialInlined++;
1427     else
1428       NumColdOutlinePartialInlined++;
1429 
1430   }
1431 
1432   if (AnyInline) {
1433     Cloner.IsFunctionInlined = true;
1434     if (CalleeEntryCount)
1435       Cloner.OrigFunc->setEntryCount(
1436           CalleeEntryCount.setCount(CalleeEntryCountV));
1437     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1438     OrigFuncORE.emit([&]() {
1439       return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1440              << "Partially inlined into at least one caller";
1441     });
1442 
1443   }
1444 
1445   return AnyInline;
1446 }
1447 
run(Module & M)1448 bool PartialInlinerImpl::run(Module &M) {
1449   if (DisablePartialInlining)
1450     return false;
1451 
1452   std::vector<Function *> Worklist;
1453   Worklist.reserve(M.size());
1454   for (Function &F : M)
1455     if (!F.use_empty() && !F.isDeclaration())
1456       Worklist.push_back(&F);
1457 
1458   bool Changed = false;
1459   while (!Worklist.empty()) {
1460     Function *CurrFunc = Worklist.back();
1461     Worklist.pop_back();
1462 
1463     if (CurrFunc->use_empty())
1464       continue;
1465 
1466     bool Recursive = false;
1467     for (User *U : CurrFunc->users())
1468       if (Instruction *I = dyn_cast<Instruction>(U))
1469         if (I->getParent()->getParent() == CurrFunc) {
1470           Recursive = true;
1471           break;
1472         }
1473     if (Recursive)
1474       continue;
1475 
1476     std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
1477     if (Result.second)
1478       Worklist.push_back(Result.second);
1479     Changed |= Result.first;
1480   }
1481 
1482   return Changed;
1483 }
1484 
// Definition of the legacy pass's static ID member.
char PartialInlinerLegacyPass::ID = 0;

// Legacy pass manager registration: the pass is registered under the
// argument "partial-inliner" with the description "Partial Inliner", and the
// four wrapper/tracker passes below are declared as its dependencies.
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)
1495 
/// Factory for the legacy partial inlining pass. Returns a newly allocated
/// PartialInlinerLegacyPass.
// NOTE(review): presumably the caller (pass manager) takes ownership of the
// returned pointer, as is customary for legacy pass factories — confirm.
ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}
1499 
run(Module & M,ModuleAnalysisManager & AM)1500 PreservedAnalyses PartialInlinerPass::run(Module &M,
1501                                           ModuleAnalysisManager &AM) {
1502   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1503 
1504   auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1505     return FAM.getResult<AssumptionAnalysis>(F);
1506   };
1507 
1508   auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1509     return FAM.getCachedResult<AssumptionAnalysis>(F);
1510   };
1511 
1512   auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1513     return FAM.getResult<BlockFrequencyAnalysis>(F);
1514   };
1515 
1516   auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1517     return FAM.getResult<TargetIRAnalysis>(F);
1518   };
1519 
1520   auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1521     return FAM.getResult<TargetLibraryAnalysis>(F);
1522   };
1523 
1524   ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1525 
1526   if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1527                          GetTLI, PSI, GetBFI)
1528           .run(M))
1529     return PreservedAnalyses::none();
1530   return PreservedAnalyses::all();
1531 }
1532