1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/BlockFrequencyInfo.h"
21 #include "llvm/Analysis/BranchProbabilityInfo.h"
22 #include "llvm/Analysis/InlineCost.h"
23 #include "llvm/Analysis/LoopInfo.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/Analysis/ProfileSummaryInfo.h"
26 #include "llvm/Analysis/TargetLibraryInfo.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/IR/Attributes.h"
29 #include "llvm/IR/BasicBlock.h"
30 #include "llvm/IR/CFG.h"
31 #include "llvm/IR/DebugLoc.h"
32 #include "llvm/IR/DiagnosticInfo.h"
33 #include "llvm/IR/Dominators.h"
34 #include "llvm/IR/Function.h"
35 #include "llvm/IR/InstrTypes.h"
36 #include "llvm/IR/Instruction.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Intrinsics.h"
40 #include "llvm/IR/Module.h"
41 #include "llvm/IR/Operator.h"
42 #include "llvm/IR/ProfDataUtils.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/InitializePasses.h"
45 #include "llvm/Pass.h"
46 #include "llvm/Support/BlockFrequency.h"
47 #include "llvm/Support/BranchProbability.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/Utils/Cloning.h"
53 #include "llvm/Transforms/Utils/CodeExtractor.h"
54 #include "llvm/Transforms/Utils/ValueMapper.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <memory>
59 #include <tuple>
60 #include <vector>
61 
62 using namespace llvm;
63 
64 #define DEBUG_TYPE "partial-inlining"
65 
66 STATISTIC(NumPartialInlined,
67           "Number of callsites functions partially inlined into.");
68 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
69                                         "cold outlined regions were partially "
70                                         "inlined into its caller(s).");
71 STATISTIC(NumColdRegionsFound,
72            "Number of cold single entry/exit regions found.");
73 STATISTIC(NumColdRegionsOutlined,
74            "Number of cold single entry/exit regions outlined.");
75 
76 // Command line option to disable partial-inlining. The default is false:
77 static cl::opt<bool>
78     DisablePartialInlining("disable-partial-inlining", cl::init(false),
79                            cl::Hidden, cl::desc("Disable partial inlining"));
80 // Command line option to disable multi-region partial-inlining. The default is
81 // false:
82 static cl::opt<bool> DisableMultiRegionPartialInline(
83     "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
84     cl::desc("Disable multi-region partial inlining"));
85 
86 // Command line option to force outlining in regions with live exit variables.
87 // The default is false:
88 static cl::opt<bool>
89     ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
90                cl::desc("Force outline regions with live exits"));
91 
92 // Command line option to enable marking outline functions with Cold Calling
93 // Convention. The default is false:
94 static cl::opt<bool>
95     MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
96                        cl::desc("Mark outline function calls with ColdCC"));
97 
98 // This is an option used by testing:
99 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
100 
101                                       cl::ReallyHidden,
102                                       cl::desc("Skip Cost Analysis"));
103 // Used to determine if a cold region is worth outlining based on
104 // its inlining cost compared to the original function.  Default is set at 10%.
105 // ie. if the cold region reduces the inlining cost of the original function by
106 // at least 10%.
107 static cl::opt<float> MinRegionSizeRatio(
108     "min-region-size-ratio", cl::init(0.1), cl::Hidden,
109     cl::desc("Minimum ratio comparing relative sizes of each "
110              "outline candidate and original function"));
111 // Used to tune the minimum number of execution counts needed in the predecessor
112 // block to the cold edge. ie. confidence interval.
113 static cl::opt<unsigned>
114     MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
115                              cl::desc("Minimum block executions to consider "
116                                       "its BranchProbabilityInfo valid"));
117 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
118 // if the branch probability is 10% or less, then it is deemed as 'cold'.
119 static cl::opt<float> ColdBranchRatio(
120     "cold-branch-ratio", cl::init(0.1), cl::Hidden,
121     cl::desc("Minimum BranchProbability to consider a region cold."));
122 
123 static cl::opt<unsigned> MaxNumInlineBlocks(
124     "max-num-inline-blocks", cl::init(5), cl::Hidden,
125     cl::desc("Max number of blocks to be partially inlined"));
126 
127 // Command line option to set the maximum number of partial inlining allowed
128 // for the module. The default value of -1 means no limit.
129 static cl::opt<int> MaxNumPartialInlining(
130     "max-partial-inlining", cl::init(-1), cl::Hidden,
131     cl::desc("Max number of partial inlining. The default is unlimited"));
132 
133 // Used only when PGO or user annotated branch data is absent. It is
134 // the least value that is used to weigh the outline region. If BFI
135 // produces larger value, the BFI value will be used.
136 static cl::opt<int>
137     OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
138                              cl::Hidden,
139                              cl::desc("Relative frequency of outline region to "
140                                       "the entry block"));
141 
142 static cl::opt<unsigned> ExtraOutliningPenalty(
143     "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
144     cl::desc("A debug option to add additional penalty to the computed one."));
145 
146 namespace {
147 
148 struct FunctionOutliningInfo {
149   FunctionOutliningInfo() = default;
150 
151   // Returns the number of blocks to be inlined including all blocks
152   // in Entries and one return block.
getNumInlinedBlocks__anon4298a4970111::FunctionOutliningInfo153   unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
154 
155   // A set of blocks including the function entry that guard
156   // the region to be outlined.
157   SmallVector<BasicBlock *, 4> Entries;
158 
159   // The return block that is not included in the outlined region.
160   BasicBlock *ReturnBlock = nullptr;
161 
162   // The dominating block of the region to be outlined.
163   BasicBlock *NonReturnBlock = nullptr;
164 
165   // The set of blocks in Entries that that are predecessors to ReturnBlock
166   SmallVector<BasicBlock *, 4> ReturnBlockPreds;
167 };
168 
169 struct FunctionOutliningMultiRegionInfo {
170   FunctionOutliningMultiRegionInfo() = default;
171 
172   // Container for outline regions
173   struct OutlineRegionInfo {
OutlineRegionInfo__anon4298a4970111::FunctionOutliningMultiRegionInfo::OutlineRegionInfo174     OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
175                       BasicBlock *EntryBlock, BasicBlock *ExitBlock,
176                       BasicBlock *ReturnBlock)
177         : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
178           ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
179     SmallVector<BasicBlock *, 8> Region;
180     BasicBlock *EntryBlock;
181     BasicBlock *ExitBlock;
182     BasicBlock *ReturnBlock;
183   };
184 
185   SmallVector<OutlineRegionInfo, 4> ORI;
186 };
187 
188 struct PartialInlinerImpl {
189 
PartialInlinerImpl__anon4298a4970111::PartialInlinerImpl190   PartialInlinerImpl(
191       function_ref<AssumptionCache &(Function &)> GetAC,
192       function_ref<AssumptionCache *(Function &)> LookupAC,
193       function_ref<TargetTransformInfo &(Function &)> GTTI,
194       function_ref<const TargetLibraryInfo &(Function &)> GTLI,
195       ProfileSummaryInfo &ProfSI,
196       function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
197       : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
198         GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
199 
200   bool run(Module &M);
201   // Main part of the transformation that calls helper functions to find
202   // outlining candidates, clone & outline the function, and attempt to
203   // partially inline the resulting function. Returns true if
204   // inlining was successful, false otherwise.  Also returns the outline
205   // function (only if we partially inlined early returns) as there is a
206   // possibility to further "peel" early return statements that were left in the
207   // outline function due to code size.
208   std::pair<bool, Function *> unswitchFunction(Function &F);
209 
210   // This class speculatively clones the function to be partial inlined.
211   // At the end of partial inlining, the remaining callsites to the cloned
212   // function that are not partially inlined will be fixed up to reference
213   // the original function, and the cloned function will be erased.
214   struct FunctionCloner {
215     // Two constructors, one for single region outlining, the other for
216     // multi-region outlining.
217     FunctionCloner(Function *F, FunctionOutliningInfo *OI,
218                    OptimizationRemarkEmitter &ORE,
219                    function_ref<AssumptionCache *(Function &)> LookupAC,
220                    function_ref<TargetTransformInfo &(Function &)> GetTTI);
221     FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
222                    OptimizationRemarkEmitter &ORE,
223                    function_ref<AssumptionCache *(Function &)> LookupAC,
224                    function_ref<TargetTransformInfo &(Function &)> GetTTI);
225 
226     ~FunctionCloner();
227 
228     // Prepare for function outlining: making sure there is only
229     // one incoming edge from the extracted/outlined region to
230     // the return block.
231     void normalizeReturnBlock() const;
232 
233     // Do function outlining for cold regions.
234     bool doMultiRegionFunctionOutlining();
235     // Do function outlining for region after early return block(s).
236     // NOTE: For vararg functions that do the vararg handling in the outlined
237     //       function, we temporarily generate IR that does not properly
238     //       forward varargs to the outlined function. Calling InlineFunction
239     //       will update calls to the outlined functions to properly forward
240     //       the varargs.
241     Function *doSingleRegionFunctionOutlining();
242 
243     Function *OrigFunc = nullptr;
244     Function *ClonedFunc = nullptr;
245 
246     typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
247     // Keep track of Outlined Functions and the basic block they're called from.
248     SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
249 
250     // ClonedFunc is inlined in one of its callers after function
251     // outlining.
252     bool IsFunctionInlined = false;
253     // The cost of the region to be outlined.
254     InstructionCost OutlinedRegionCost = 0;
255     // ClonedOI is specific to outlining non-early return blocks.
256     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
257     // ClonedOMRI is specific to outlining cold regions.
258     std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
259     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
260     OptimizationRemarkEmitter &ORE;
261     function_ref<AssumptionCache *(Function &)> LookupAC;
262     function_ref<TargetTransformInfo &(Function &)> GetTTI;
263   };
264 
265 private:
266   int NumPartialInlining = 0;
267   function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
268   function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
269   function_ref<TargetTransformInfo &(Function &)> GetTTI;
270   function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
271   function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
272   ProfileSummaryInfo &PSI;
273 
274   // Return the frequency of the OutlininingBB relative to F's entry point.
275   // The result is no larger than 1 and is represented using BP.
276   // (Note that the outlined region's 'head' block can only have incoming
277   // edges from the guarding entry blocks).
278   BranchProbability
279   getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
280 
281   // Return true if the callee of CB should be partially inlined with
282   // profit.
283   bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
284                            BlockFrequency WeightedOutliningRcost,
285                            OptimizationRemarkEmitter &ORE) const;
286 
287   // Try to inline DuplicateFunction (cloned from F with call to
288   // the OutlinedFunction into its callers. Return true
289   // if there is any successful inlining.
290   bool tryPartialInline(FunctionCloner &Cloner);
291 
292   // Compute the mapping from use site of DuplicationFunction to the enclosing
293   // BB's profile count.
294   void
295   computeCallsiteToProfCountMap(Function *DuplicateFunction,
296                                 DenseMap<User *, uint64_t> &SiteCountMap) const;
297 
isLimitReached__anon4298a4970111::PartialInlinerImpl298   bool isLimitReached() const {
299     return (MaxNumPartialInlining != -1 &&
300             NumPartialInlining >= MaxNumPartialInlining);
301   }
302 
getSupportedCallBase__anon4298a4970111::PartialInlinerImpl303   static CallBase *getSupportedCallBase(User *U) {
304     if (isa<CallInst>(U) || isa<InvokeInst>(U))
305       return cast<CallBase>(U);
306     llvm_unreachable("All uses must be calls");
307     return nullptr;
308   }
309 
getOneCallSiteTo__anon4298a4970111::PartialInlinerImpl310   static CallBase *getOneCallSiteTo(Function &F) {
311     User *User = *F.user_begin();
312     return getSupportedCallBase(User);
313   }
314 
getOneDebugLoc__anon4298a4970111::PartialInlinerImpl315   std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
316     CallBase *CB = getOneCallSiteTo(F);
317     DebugLoc DLoc = CB->getDebugLoc();
318     BasicBlock *Block = CB->getParent();
319     return std::make_tuple(DLoc, Block);
320   }
321 
322   // Returns the costs associated with function outlining:
323   // - The first value is the non-weighted runtime cost for making the call
324   //   to the outlined function, including the addtional  setup cost in the
325   //    outlined function itself;
326   // - The second value is the estimated size of the new call sequence in
327   //   basic block Cloner.OutliningCallBB;
328   std::tuple<InstructionCost, InstructionCost>
329   computeOutliningCosts(FunctionCloner &Cloner) const;
330 
331   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
332   // approximate both the size and runtime cost (Note that in the current
333   // inline cost analysis, there is no clear distinction there either).
334   static InstructionCost computeBBInlineCost(BasicBlock *BB,
335                                              TargetTransformInfo *TTI);
336 
337   std::unique_ptr<FunctionOutliningInfo>
338   computeOutliningInfo(Function &F) const;
339 
340   std::unique_ptr<FunctionOutliningMultiRegionInfo>
341   computeOutliningColdRegionsInfo(Function &F,
342                                   OptimizationRemarkEmitter &ORE) const;
343 };
344 
345 struct PartialInlinerLegacyPass : public ModulePass {
346   static char ID; // Pass identification, replacement for typeid
347 
PartialInlinerLegacyPass__anon4298a4970111::PartialInlinerLegacyPass348   PartialInlinerLegacyPass() : ModulePass(ID) {
349     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
350   }
351 
getAnalysisUsage__anon4298a4970111::PartialInlinerLegacyPass352   void getAnalysisUsage(AnalysisUsage &AU) const override {
353     AU.addRequired<AssumptionCacheTracker>();
354     AU.addRequired<ProfileSummaryInfoWrapperPass>();
355     AU.addRequired<TargetTransformInfoWrapperPass>();
356     AU.addRequired<TargetLibraryInfoWrapperPass>();
357   }
358 
runOnModule__anon4298a4970111::PartialInlinerLegacyPass359   bool runOnModule(Module &M) override {
360     if (skipModule(M))
361       return false;
362 
363     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
364     TargetTransformInfoWrapperPass *TTIWP =
365         &getAnalysis<TargetTransformInfoWrapperPass>();
366     ProfileSummaryInfo &PSI =
367         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
368 
369     auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
370       return ACT->getAssumptionCache(F);
371     };
372 
373     auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
374       return ACT->lookupAssumptionCache(F);
375     };
376 
377     auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
378       return TTIWP->getTTI(F);
379     };
380 
381     auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
382       return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
383     };
384 
385     return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
386                               GetTLI, PSI)
387         .run(M);
388   }
389 };
390 
391 } // end anonymous namespace
392 
393 std::unique_ptr<FunctionOutliningMultiRegionInfo>
computeOutliningColdRegionsInfo(Function & F,OptimizationRemarkEmitter & ORE) const394 PartialInlinerImpl::computeOutliningColdRegionsInfo(
395     Function &F, OptimizationRemarkEmitter &ORE) const {
396   BasicBlock *EntryBlock = &F.front();
397 
398   DominatorTree DT(F);
399   LoopInfo LI(DT);
400   BranchProbabilityInfo BPI(F, LI);
401   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
402   BlockFrequencyInfo *BFI;
403   if (!GetBFI) {
404     ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
405     BFI = ScopedBFI.get();
406   } else
407     BFI = &(GetBFI(F));
408 
409   // Return if we don't have profiling information.
410   if (!PSI.hasInstrumentationProfile())
411     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
412 
413   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
414       std::make_unique<FunctionOutliningMultiRegionInfo>();
415 
416   auto IsSingleExit =
417       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
418     BasicBlock *ExitBlock = nullptr;
419     for (auto *Block : BlockList) {
420       for (BasicBlock *Succ : successors(Block)) {
421         if (!is_contained(BlockList, Succ)) {
422           if (ExitBlock) {
423             ORE.emit([&]() {
424               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
425                                               &Succ->front())
426                      << "Region dominated by "
427                      << ore::NV("Block", BlockList.front()->getName())
428                      << " has more than one region exit edge.";
429             });
430             return nullptr;
431           }
432 
433           ExitBlock = Block;
434         }
435       }
436     }
437     return ExitBlock;
438   };
439 
440   auto BBProfileCount = [BFI](BasicBlock *BB) {
441     return BFI->getBlockProfileCount(BB).value_or(0);
442   };
443 
444   // Use the same computeBBInlineCost function to compute the cost savings of
445   // the outlining the candidate region.
446   TargetTransformInfo *FTTI = &GetTTI(F);
447   InstructionCost OverallFunctionCost = 0;
448   for (auto &BB : F)
449     OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
450 
451   LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
452                     << "\n";);
453 
454   InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
455       [&](auto Cost) { return Cost * MinRegionSizeRatio; });
456 
457   BranchProbability MinBranchProbability(
458       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
459       MinBlockCounterExecution);
460   bool ColdCandidateFound = false;
461   BasicBlock *CurrEntry = EntryBlock;
462   std::vector<BasicBlock *> DFS;
463   DenseMap<BasicBlock *, bool> VisitedMap;
464   DFS.push_back(CurrEntry);
465   VisitedMap[CurrEntry] = true;
466 
467   // Use Depth First Search on the basic blocks to find CFG edges that are
468   // considered cold.
469   // Cold regions considered must also have its inline cost compared to the
470   // overall inline cost of the original function.  The region is outlined only
471   // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
472   // more.
473   while (!DFS.empty()) {
474     auto *ThisBB = DFS.back();
475     DFS.pop_back();
476     // Only consider regions with predecessor blocks that are considered
477     // not-cold (default: part of the top 99.99% of all block counters)
478     // AND greater than our minimum block execution count (default: 100).
479     if (PSI.isColdBlock(ThisBB, BFI) ||
480         BBProfileCount(ThisBB) < MinBlockCounterExecution)
481       continue;
482     for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
483       if (VisitedMap[*SI])
484         continue;
485       VisitedMap[*SI] = true;
486       DFS.push_back(*SI);
487       // If branch isn't cold, we skip to the next one.
488       BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
489       if (SuccProb > MinBranchProbability)
490         continue;
491 
492       LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
493                         << SI->getName()
494                         << "\nBranch Probability = " << SuccProb << "\n";);
495 
496       SmallVector<BasicBlock *, 8> DominateVector;
497       DT.getDescendants(*SI, DominateVector);
498       assert(!DominateVector.empty() &&
499              "SI should be reachable and have at least itself as descendant");
500 
501       // We can only outline single entry regions (for now).
502       if (!DominateVector.front()->hasNPredecessors(1)) {
503         LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
504                           << " doesn't have a single predecessor in the "
505                              "dominator tree\n";);
506         continue;
507       }
508 
509       BasicBlock *ExitBlock = nullptr;
510       // We can only outline single exit regions (for now).
511       if (!(ExitBlock = IsSingleExit(DominateVector))) {
512         LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
513                           << " doesn't have a unique successor\n";);
514         continue;
515       }
516 
517       InstructionCost OutlineRegionCost = 0;
518       for (auto *BB : DominateVector)
519         OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
520 
521       LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
522                         << "\n";);
523 
524       if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
525         ORE.emit([&]() {
526           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
527                                             &SI->front())
528                  << ore::NV("Callee", &F)
529                  << " inline cost-savings smaller than "
530                  << ore::NV("Cost", MinOutlineRegionCost);
531         });
532 
533         LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
534                           << MinOutlineRegionCost << "\n";);
535         continue;
536       }
537 
538       // For now, ignore blocks that belong to a SISE region that is a
539       // candidate for outlining.  In the future, we may want to look
540       // at inner regions because the outer region may have live-exit
541       // variables.
542       for (auto *BB : DominateVector)
543         VisitedMap[BB] = true;
544 
545       // ReturnBlock here means the block after the outline call
546       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
547       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
548           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
549       OutliningInfo->ORI.push_back(RegInfo);
550       LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
551                         << DominateVector.front()->getName() << "\n";);
552       ColdCandidateFound = true;
553       NumColdRegionsFound++;
554     }
555   }
556 
557   if (ColdCandidateFound)
558     return OutliningInfo;
559 
560   return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
561 }
562 
563 std::unique_ptr<FunctionOutliningInfo>
computeOutliningInfo(Function & F) const564 PartialInlinerImpl::computeOutliningInfo(Function &F) const {
565   BasicBlock *EntryBlock = &F.front();
566   BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
567   if (!BR || BR->isUnconditional())
568     return std::unique_ptr<FunctionOutliningInfo>();
569 
570   // Returns true if Succ is BB's successor
571   auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
572     return is_contained(successors(BB), Succ);
573   };
574 
575   auto IsReturnBlock = [](BasicBlock *BB) {
576     Instruction *TI = BB->getTerminator();
577     return isa<ReturnInst>(TI);
578   };
579 
580   auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
581     if (IsReturnBlock(Succ1))
582       return std::make_tuple(Succ1, Succ2);
583     if (IsReturnBlock(Succ2))
584       return std::make_tuple(Succ2, Succ1);
585 
586     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
587   };
588 
589   // Detect a triangular shape:
590   auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
591     if (IsSuccessor(Succ1, Succ2))
592       return std::make_tuple(Succ1, Succ2);
593     if (IsSuccessor(Succ2, Succ1))
594       return std::make_tuple(Succ2, Succ1);
595 
596     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
597   };
598 
599   std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
600       std::make_unique<FunctionOutliningInfo>();
601 
602   BasicBlock *CurrEntry = EntryBlock;
603   bool CandidateFound = false;
604   do {
605     // The number of blocks to be inlined has already reached
606     // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
607     // disables partial inlining for the function.
608     if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
609       break;
610 
611     if (succ_size(CurrEntry) != 2)
612       break;
613 
614     BasicBlock *Succ1 = *succ_begin(CurrEntry);
615     BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
616 
617     BasicBlock *ReturnBlock, *NonReturnBlock;
618     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
619 
620     if (ReturnBlock) {
621       OutliningInfo->Entries.push_back(CurrEntry);
622       OutliningInfo->ReturnBlock = ReturnBlock;
623       OutliningInfo->NonReturnBlock = NonReturnBlock;
624       CandidateFound = true;
625       break;
626     }
627 
628     BasicBlock *CommSucc, *OtherSucc;
629     std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
630 
631     if (!CommSucc)
632       break;
633 
634     OutliningInfo->Entries.push_back(CurrEntry);
635     CurrEntry = OtherSucc;
636   } while (true);
637 
638   if (!CandidateFound)
639     return std::unique_ptr<FunctionOutliningInfo>();
640 
641   // There should not be any successors (not in the entry set) other than
642   // {ReturnBlock, NonReturnBlock}
643   assert(OutliningInfo->Entries[0] == &F.front() &&
644          "Function Entry must be the first in Entries vector");
645   DenseSet<BasicBlock *> Entries;
646   for (BasicBlock *E : OutliningInfo->Entries)
647     Entries.insert(E);
648 
649   // Returns true of BB has Predecessor which is not
650   // in Entries set.
651   auto HasNonEntryPred = [Entries](BasicBlock *BB) {
652     for (auto *Pred : predecessors(BB)) {
653       if (!Entries.count(Pred))
654         return true;
655     }
656     return false;
657   };
658   auto CheckAndNormalizeCandidate =
659       [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
660         for (BasicBlock *E : OutliningInfo->Entries) {
661           for (auto *Succ : successors(E)) {
662             if (Entries.count(Succ))
663               continue;
664             if (Succ == OutliningInfo->ReturnBlock)
665               OutliningInfo->ReturnBlockPreds.push_back(E);
666             else if (Succ != OutliningInfo->NonReturnBlock)
667               return false;
668           }
669           // There should not be any outside incoming edges either:
670           if (HasNonEntryPred(E))
671             return false;
672         }
673         return true;
674       };
675 
676   if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
677     return std::unique_ptr<FunctionOutliningInfo>();
678 
679   // Now further growing the candidate's inlining region by
680   // peeling off dominating blocks from the outlining region:
681   while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
682     BasicBlock *Cand = OutliningInfo->NonReturnBlock;
683     if (succ_size(Cand) != 2)
684       break;
685 
686     if (HasNonEntryPred(Cand))
687       break;
688 
689     BasicBlock *Succ1 = *succ_begin(Cand);
690     BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
691 
692     BasicBlock *ReturnBlock, *NonReturnBlock;
693     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
694     if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
695       break;
696 
697     if (NonReturnBlock->getSinglePredecessor() != Cand)
698       break;
699 
700     // Now grow and update OutlininigInfo:
701     OutliningInfo->Entries.push_back(Cand);
702     OutliningInfo->NonReturnBlock = NonReturnBlock;
703     OutliningInfo->ReturnBlockPreds.push_back(Cand);
704     Entries.insert(Cand);
705   }
706 
707   return OutliningInfo;
708 }
709 
710 // Check if there is PGO data or user annotated branch data:
hasProfileData(const Function & F,const FunctionOutliningInfo & OI)711 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
712   if (F.hasProfileData())
713     return true;
714   // Now check if any of the entry block has MD_prof data:
715   for (auto *E : OI.Entries) {
716     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
717     if (!BR || BR->isUnconditional())
718       continue;
719     if (hasBranchWeightMD(*BR))
720       return true;
721   }
722   return false;
723 }
724 
getOutliningCallBBRelativeFreq(FunctionCloner & Cloner) const725 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
726     FunctionCloner &Cloner) const {
727   BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
728   auto EntryFreq =
729       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
730   auto OutliningCallFreq =
731       Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
732   // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
733   // we outlined any regions, so we may encounter situations where the
734   // OutliningCallFreq is *slightly* bigger than the EntryFreq.
735   if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
736     OutliningCallFreq = EntryFreq;
737 
738   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
739       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
740 
741   if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
742     return OutlineRegionRelFreq;
743 
744   // When profile data is not available, we need to be conservative in
745   // estimating the overall savings. Static branch prediction can usually
746   // guess the branch direction right (taken/non-taken), but the guessed
747   // branch probability is usually not biased enough. In case when the
748   // outlined region is predicted to be likely, its probability needs
749   // to be made higher (more biased) to not under-estimate the cost of
750   // function outlining. On the other hand, if the outlined region
751   // is predicted to be less likely, the predicted probablity is usually
752   // higher than the actual. For instance, the actual probability of the
753   // less likely target is only 5%, but the guessed probablity can be
754   // 40%. In the latter case, there is no need for further adjustment.
755   // FIXME: add an option for this.
756   if (OutlineRegionRelFreq < BranchProbability(45, 100))
757     return OutlineRegionRelFreq;
758 
759   OutlineRegionRelFreq = std::max(
760       OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
761 
762   return OutlineRegionRelFreq;
763 }
764 
shouldPartialInline(CallBase & CB,FunctionCloner & Cloner,BlockFrequency WeightedOutliningRcost,OptimizationRemarkEmitter & ORE) const765 bool PartialInlinerImpl::shouldPartialInline(
766     CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
767     OptimizationRemarkEmitter &ORE) const {
768   using namespace ore;
769 
770   Function *Callee = CB.getCalledFunction();
771   assert(Callee == Cloner.ClonedFunc);
772 
773   if (SkipCostAnalysis)
774     return isInlineViable(*Callee).isSuccess();
775 
776   Function *Caller = CB.getCaller();
777   auto &CalleeTTI = GetTTI(*Callee);
778   bool RemarksEnabled =
779       Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
780           DEBUG_TYPE);
781   InlineCost IC =
782       getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
783                     GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
784 
785   if (IC.isAlways()) {
786     ORE.emit([&]() {
787       return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
788              << NV("Callee", Cloner.OrigFunc)
789              << " should always be fully inlined, not partially";
790     });
791     return false;
792   }
793 
794   if (IC.isNever()) {
795     ORE.emit([&]() {
796       return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
797              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
798              << NV("Caller", Caller)
799              << " because it should never be inlined (cost=never)";
800     });
801     return false;
802   }
803 
804   if (!IC) {
805     ORE.emit([&]() {
806       return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
807              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
808              << NV("Caller", Caller) << " because too costly to inline (cost="
809              << NV("Cost", IC.getCost()) << ", threshold="
810              << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
811     });
812     return false;
813   }
814   const DataLayout &DL = Caller->getParent()->getDataLayout();
815 
816   // The savings of eliminating the call:
817   int NonWeightedSavings = getCallsiteCost(CB, DL);
818   BlockFrequency NormWeightedSavings(NonWeightedSavings);
819 
820   // Weighted saving is smaller than weighted cost, return false
821   if (NormWeightedSavings < WeightedOutliningRcost) {
822     ORE.emit([&]() {
823       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
824                                         &CB)
825              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
826              << NV("Caller", Caller) << " runtime overhead (overhead="
827              << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
828              << ", savings="
829              << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
830              << ")"
831              << " of making the outlined call is too high";
832     });
833 
834     return false;
835   }
836 
837   ORE.emit([&]() {
838     return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
839            << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
840            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
841            << " (threshold="
842            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
843   });
844   return true;
845 }
846 
847 // TODO: Ideally  we should share Inliner's InlineCost Analysis code.
848 // For now use a simplified version. The returned 'InlineCost' will be used
849 // to esimate the size cost as well as runtime cost of the BB.
850 InstructionCost
computeBBInlineCost(BasicBlock * BB,TargetTransformInfo * TTI)851 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
852                                         TargetTransformInfo *TTI) {
853   InstructionCost InlineCost = 0;
854   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
855   int InstrCost = InlineConstants::getInstrCost();
856   for (Instruction &I : BB->instructionsWithoutDebug()) {
857     // Skip free instructions.
858     switch (I.getOpcode()) {
859     case Instruction::BitCast:
860     case Instruction::PtrToInt:
861     case Instruction::IntToPtr:
862     case Instruction::Alloca:
863     case Instruction::PHI:
864       continue;
865     case Instruction::GetElementPtr:
866       if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
867         continue;
868       break;
869     default:
870       break;
871     }
872 
873     if (I.isLifetimeStartOrEnd())
874       continue;
875 
876     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
877       Intrinsic::ID IID = II->getIntrinsicID();
878       SmallVector<Type *, 4> Tys;
879       FastMathFlags FMF;
880       for (Value *Val : II->args())
881         Tys.push_back(Val->getType());
882 
883       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
884         FMF = FPMO->getFastMathFlags();
885 
886       IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
887       InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
888       continue;
889     }
890 
891     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
892       InlineCost += getCallsiteCost(*CI, DL);
893       continue;
894     }
895 
896     if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
897       InlineCost += getCallsiteCost(*II, DL);
898       continue;
899     }
900 
901     if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
902       InlineCost += (SI->getNumCases() + 1) * InstrCost;
903       continue;
904     }
905     InlineCost += InstrCost;
906   }
907 
908   return InlineCost;
909 }
910 
911 std::tuple<InstructionCost, InstructionCost>
computeOutliningCosts(FunctionCloner & Cloner) const912 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
913   InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
914   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
915     Function *OutlinedFunc = FuncBBPair.first;
916     BasicBlock* OutliningCallBB = FuncBBPair.second;
917     // Now compute the cost of the call sequence to the outlined function
918     // 'OutlinedFunction' in BB 'OutliningCallBB':
919     auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
920     OutliningFuncCallCost +=
921         computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
922 
923     // Now compute the cost of the extracted/outlined function itself:
924     for (BasicBlock &BB : *OutlinedFunc)
925       OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
926   }
927   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
928          "Outlined function cost should be no less than the outlined region");
929 
930   // The code extractor introduces a new root and exit stub blocks with
931   // additional unconditional branches. Those branches will be eliminated
932   // later with bb layout. The cost should be adjusted accordingly:
933   OutlinedFunctionCost -=
934       2 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size();
935 
936   InstructionCost OutliningRuntimeOverhead =
937       OutliningFuncCallCost +
938       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
939       ExtraOutliningPenalty.getValue();
940 
941   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
942 }
943 
944 // Create the callsite to profile count map which is
945 // used to update the original function's entry count,
946 // after the function is partially inlined into the callsite.
computeCallsiteToProfCountMap(Function * DuplicateFunction,DenseMap<User *,uint64_t> & CallSiteToProfCountMap) const947 void PartialInlinerImpl::computeCallsiteToProfCountMap(
948     Function *DuplicateFunction,
949     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
950   std::vector<User *> Users(DuplicateFunction->user_begin(),
951                             DuplicateFunction->user_end());
952   Function *CurrentCaller = nullptr;
953   std::unique_ptr<BlockFrequencyInfo> TempBFI;
954   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
955 
956   auto ComputeCurrBFI = [&,this](Function *Caller) {
957       // For the old pass manager:
958       if (!GetBFI) {
959         DominatorTree DT(*Caller);
960         LoopInfo LI(DT);
961         BranchProbabilityInfo BPI(*Caller, LI);
962         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
963         CurrentCallerBFI = TempBFI.get();
964       } else {
965         // New pass manager:
966         CurrentCallerBFI = &(GetBFI(*Caller));
967       }
968   };
969 
970   for (User *User : Users) {
971     // Don't bother with BlockAddress used by CallBr for asm goto.
972     if (isa<BlockAddress>(User))
973       continue;
974     CallBase *CB = getSupportedCallBase(User);
975     Function *Caller = CB->getCaller();
976     if (CurrentCaller != Caller) {
977       CurrentCaller = Caller;
978       ComputeCurrBFI(Caller);
979     } else {
980       assert(CurrentCallerBFI && "CallerBFI is not set");
981     }
982     BasicBlock *CallBB = CB->getParent();
983     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
984     if (Count)
985       CallSiteToProfCountMap[User] = *Count;
986     else
987       CallSiteToProfCountMap[User] = 0;
988   }
989 }
990 
FunctionCloner(Function * F,FunctionOutliningInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC,function_ref<TargetTransformInfo & (Function &)> GetTTI)991 PartialInlinerImpl::FunctionCloner::FunctionCloner(
992     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
993     function_ref<AssumptionCache *(Function &)> LookupAC,
994     function_ref<TargetTransformInfo &(Function &)> GetTTI)
995     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
996   ClonedOI = std::make_unique<FunctionOutliningInfo>();
997 
998   // Clone the function, so that we can hack away on it.
999   ValueToValueMapTy VMap;
1000   ClonedFunc = CloneFunction(F, VMap);
1001 
1002   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
1003   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
1004   for (BasicBlock *BB : OI->Entries)
1005     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
1006 
1007   for (BasicBlock *E : OI->ReturnBlockPreds) {
1008     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
1009     ClonedOI->ReturnBlockPreds.push_back(NewE);
1010   }
1011   // Go ahead and update all uses to the duplicate, so that we can just
1012   // use the inliner functionality when we're done hacking.
1013   F->replaceAllUsesWith(ClonedFunc);
1014 }
1015 
FunctionCloner(Function * F,FunctionOutliningMultiRegionInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC,function_ref<TargetTransformInfo & (Function &)> GetTTI)1016 PartialInlinerImpl::FunctionCloner::FunctionCloner(
1017     Function *F, FunctionOutliningMultiRegionInfo *OI,
1018     OptimizationRemarkEmitter &ORE,
1019     function_ref<AssumptionCache *(Function &)> LookupAC,
1020     function_ref<TargetTransformInfo &(Function &)> GetTTI)
1021     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
1022   ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
1023 
1024   // Clone the function, so that we can hack away on it.
1025   ValueToValueMapTy VMap;
1026   ClonedFunc = CloneFunction(F, VMap);
1027 
1028   // Go through all Outline Candidate Regions and update all BasicBlock
1029   // information.
1030   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1031        OI->ORI) {
1032     SmallVector<BasicBlock *, 8> Region;
1033     for (BasicBlock *BB : RegionInfo.Region)
1034       Region.push_back(cast<BasicBlock>(VMap[BB]));
1035 
1036     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1037     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1038     BasicBlock *NewReturnBlock = nullptr;
1039     if (RegionInfo.ReturnBlock)
1040       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1041     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1042         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1043     ClonedOMRI->ORI.push_back(MappedRegionInfo);
1044   }
1045   // Go ahead and update all uses to the duplicate, so that we can just
1046   // use the inliner functionality when we're done hacking.
1047   F->replaceAllUsesWith(ClonedFunc);
1048 }
1049 
normalizeReturnBlock() const1050 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
1051   auto GetFirstPHI = [](BasicBlock *BB) {
1052     BasicBlock::iterator I = BB->begin();
1053     PHINode *FirstPhi = nullptr;
1054     while (I != BB->end()) {
1055       PHINode *Phi = dyn_cast<PHINode>(I);
1056       if (!Phi)
1057         break;
1058       if (!FirstPhi) {
1059         FirstPhi = Phi;
1060         break;
1061       }
1062     }
1063     return FirstPhi;
1064   };
1065 
1066   // Shouldn't need to normalize PHIs if we're not outlining non-early return
1067   // blocks.
1068   if (!ClonedOI)
1069     return;
1070 
1071   // Special hackery is needed with PHI nodes that have inputs from more than
1072   // one extracted block.  For simplicity, just split the PHIs into a two-level
1073   // sequence of PHIs, some of which will go in the extracted region, and some
1074   // of which will go outside.
1075   BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1076   // only split block when necessary:
1077   PHINode *FirstPhi = GetFirstPHI(PreReturn);
1078   unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1079 
1080   if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1081     return;
1082 
1083   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1084     if (llvm::all_equal(PN->incoming_values()))
1085       return PN->getIncomingValue(0);
1086     return nullptr;
1087   };
1088 
1089   ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1090       ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1091   BasicBlock::iterator I = PreReturn->begin();
1092   Instruction *Ins = &ClonedOI->ReturnBlock->front();
1093   SmallVector<Instruction *, 4> DeadPhis;
1094   while (I != PreReturn->end()) {
1095     PHINode *OldPhi = dyn_cast<PHINode>(I);
1096     if (!OldPhi)
1097       break;
1098 
1099     PHINode *RetPhi =
1100         PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1101     OldPhi->replaceAllUsesWith(RetPhi);
1102     Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1103 
1104     RetPhi->addIncoming(&*I, PreReturn);
1105     for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1106       RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1107       OldPhi->removeIncomingValue(E);
1108     }
1109 
1110     // After incoming values splitting, the old phi may become trivial.
1111     // Keeping the trivial phi can introduce definition inside the outline
1112     // region which is live-out, causing necessary overhead (load, store
1113     // arg passing etc).
1114     if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1115       OldPhi->replaceAllUsesWith(OldPhiVal);
1116       DeadPhis.push_back(OldPhi);
1117     }
1118     ++I;
1119   }
1120   for (auto *DP : DeadPhis)
1121     DP->eraseFromParent();
1122 
1123   for (auto *E : ClonedOI->ReturnBlockPreds)
1124     E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1125 }
1126 
doMultiRegionFunctionOutlining()1127 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1128 
1129   auto ComputeRegionCost =
1130       [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
1131     InstructionCost Cost = 0;
1132     for (BasicBlock* BB : Region)
1133       Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1134     return Cost;
1135   };
1136 
1137   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1138 
1139   if (ClonedOMRI->ORI.empty())
1140     return false;
1141 
1142   // The CodeExtractor needs a dominator tree.
1143   DominatorTree DT;
1144   DT.recalculate(*ClonedFunc);
1145 
1146   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1147   LoopInfo LI(DT);
1148   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1149   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1150 
1151   // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1152   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1153 
1154   SetVector<Value *> Inputs, Outputs, Sinks;
1155   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1156        ClonedOMRI->ORI) {
1157     InstructionCost CurrentOutlinedRegionCost =
1158         ComputeRegionCost(RegionInfo.Region);
1159 
1160     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1161                      ClonedFuncBFI.get(), &BPI,
1162                      LookupAC(*RegionInfo.EntryBlock->getParent()),
1163                      /* AllowVarargs */ false);
1164 
1165     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1166 
1167     LLVM_DEBUG({
1168       dbgs() << "inputs: " << Inputs.size() << "\n";
1169       dbgs() << "outputs: " << Outputs.size() << "\n";
1170       for (Value *value : Inputs)
1171         dbgs() << "value used in func: " << *value << "\n";
1172       for (Value *output : Outputs)
1173         dbgs() << "instr used in func: " << *output << "\n";
1174     });
1175 
1176     // Do not extract regions that have live exit variables.
1177     if (Outputs.size() > 0 && !ForceLiveExit)
1178       continue;
1179 
1180     if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1181       CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1182       BasicBlock *OutliningCallBB = OCS->getParent();
1183       assert(OutliningCallBB->getParent() == ClonedFunc);
1184       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1185       NumColdRegionsOutlined++;
1186       OutlinedRegionCost += CurrentOutlinedRegionCost;
1187 
1188       if (MarkOutlinedColdCC) {
1189         OutlinedFunc->setCallingConv(CallingConv::Cold);
1190         OCS->setCallingConv(CallingConv::Cold);
1191       }
1192     } else
1193       ORE.emit([&]() {
1194         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1195                                         &RegionInfo.Region.front()->front())
1196                << "Failed to extract region at block "
1197                << ore::NV("Block", RegionInfo.Region.front());
1198       });
1199   }
1200 
1201   return !OutlinedFunctions.empty();
1202 }
1203 
1204 Function *
doSingleRegionFunctionOutlining()1205 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1206   // Returns true if the block is to be partial inlined into the caller
1207   // (i.e. not to be extracted to the out of line function)
1208   auto ToBeInlined = [&, this](BasicBlock *BB) {
1209     return BB == ClonedOI->ReturnBlock ||
1210            llvm::is_contained(ClonedOI->Entries, BB);
1211   };
1212 
1213   assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1214   // The CodeExtractor needs a dominator tree.
1215   DominatorTree DT;
1216   DT.recalculate(*ClonedFunc);
1217 
1218   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1219   LoopInfo LI(DT);
1220   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1221   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1222 
1223   // Gather up the blocks that we're going to extract.
1224   std::vector<BasicBlock *> ToExtract;
1225   auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1226   ToExtract.push_back(ClonedOI->NonReturnBlock);
1227   OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1228       ClonedOI->NonReturnBlock, ClonedFuncTTI);
1229   for (BasicBlock &BB : *ClonedFunc)
1230     if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1231       ToExtract.push_back(&BB);
1232       // FIXME: the code extractor may hoist/sink more code
1233       // into the outlined function which may make the outlining
1234       // overhead (the difference of the outlined function cost
1235       // and OutliningRegionCost) look larger.
1236       OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
1237     }
1238 
1239   // Extract the body of the if.
1240   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1241   Function *OutlinedFunc =
1242       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1243                     ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1244                     /* AllowVarargs */ true)
1245           .extractCodeRegion(CEAC);
1246 
1247   if (OutlinedFunc) {
1248     BasicBlock *OutliningCallBB =
1249         PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1250     assert(OutliningCallBB->getParent() == ClonedFunc);
1251     OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1252   } else
1253     ORE.emit([&]() {
1254       return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1255                                       &ToExtract.front()->front())
1256              << "Failed to extract region at block "
1257              << ore::NV("Block", ToExtract.front());
1258     });
1259 
1260   return OutlinedFunc;
1261 }
1262 
~FunctionCloner()1263 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1264   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1265   // users (function pointers, etc.) back to the original function.
1266   ClonedFunc->replaceAllUsesWith(OrigFunc);
1267   ClonedFunc->eraseFromParent();
1268   if (!IsFunctionInlined) {
1269     // Remove each function that was speculatively created if there is no
1270     // reference.
1271     for (auto FuncBBPair : OutlinedFunctions) {
1272       Function *Func = FuncBBPair.first;
1273       Func->eraseFromParent();
1274     }
1275   }
1276 }
1277 
unswitchFunction(Function & F)1278 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1279   if (F.hasAddressTaken())
1280     return {false, nullptr};
1281 
1282   // Let inliner handle it
1283   if (F.hasFnAttribute(Attribute::AlwaysInline))
1284     return {false, nullptr};
1285 
1286   if (F.hasFnAttribute(Attribute::NoInline))
1287     return {false, nullptr};
1288 
1289   if (PSI.isFunctionEntryCold(&F))
1290     return {false, nullptr};
1291 
1292   if (F.users().empty())
1293     return {false, nullptr};
1294 
1295   OptimizationRemarkEmitter ORE(&F);
1296 
1297   // Only try to outline cold regions if we have a profile summary, which
1298   // implies we have profiling information.
1299   if (PSI.hasProfileSummary() && F.hasProfileData() &&
1300       !DisableMultiRegionPartialInline) {
1301     std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1302         computeOutliningColdRegionsInfo(F, ORE);
1303     if (OMRI) {
1304       FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1305 
1306       LLVM_DEBUG({
1307         dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1308         dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1309                << "\n";
1310       });
1311 
1312       bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1313 
1314       if (DidOutline) {
1315         LLVM_DEBUG({
1316           dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1317           Cloner.ClonedFunc->print(dbgs());
1318           dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1319         });
1320 
1321         if (tryPartialInline(Cloner))
1322           return {true, nullptr};
1323       }
1324     }
1325   }
1326 
1327   // Fall-thru to regular partial inlining if we:
1328   //    i) can't find any cold regions to outline, or
1329   //   ii) can't inline the outlined function anywhere.
1330   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1331   if (!OI)
1332     return {false, nullptr};
1333 
1334   FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1335   Cloner.normalizeReturnBlock();
1336 
1337   Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1338 
1339   if (!OutlinedFunction)
1340     return {false, nullptr};
1341 
1342   if (tryPartialInline(Cloner))
1343     return {true, OutlinedFunction};
1344 
1345   return {false, nullptr};
1346 }
1347 
tryPartialInline(FunctionCloner & Cloner)1348 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1349   if (Cloner.OutlinedFunctions.empty())
1350     return false;
1351 
1352   auto OutliningCosts = computeOutliningCosts(Cloner);
1353 
1354   InstructionCost SizeCost = std::get<0>(OutliningCosts);
1355   InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts);
1356 
1357   assert(SizeCost.isValid() && NonWeightedRcost.isValid() &&
1358          "Expected valid costs");
1359 
1360   // Only calculate RelativeToEntryFreq when we are doing single region
1361   // outlining.
1362   BranchProbability RelativeToEntryFreq;
1363   if (Cloner.ClonedOI)
1364     RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1365   else
1366     // RelativeToEntryFreq doesn't make sense when we have more than one
1367     // outlined call because each call will have a different relative frequency
1368     // to the entry block.  We can consider using the average, but the
1369     // usefulness of that information is questionable. For now, assume we never
1370     // execute the calls to outlined functions.
1371     RelativeToEntryFreq = BranchProbability(0, 1);
1372 
1373   BlockFrequency WeightedRcost =
1374       BlockFrequency(*NonWeightedRcost.getValue()) * RelativeToEntryFreq;
1375 
1376   // The call sequence(s) to the outlined function(s) are larger than the sum of
1377   // the original outlined region size(s), it does not increase the chances of
1378   // inlining the function with outlining (The inliner uses the size increase to
1379   // model the cost of inlining a callee).
1380   if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1381     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1382     DebugLoc DLoc;
1383     BasicBlock *Block;
1384     std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
1385     OrigFuncORE.emit([&]() {
1386       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1387                                         DLoc, Block)
1388              << ore::NV("Function", Cloner.OrigFunc)
1389              << " not partially inlined into callers (Original Size = "
1390              << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1391              << ", Size of call sequence to outlined function = "
1392              << ore::NV("NewSize", SizeCost) << ")";
1393     });
1394     return false;
1395   }
1396 
1397   assert(Cloner.OrigFunc->users().empty() &&
1398          "F's users should all be replaced!");
1399 
1400   std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1401                             Cloner.ClonedFunc->user_end());
1402 
1403   DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1404   auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1405   if (CalleeEntryCount)
1406     computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1407 
1408   uint64_t CalleeEntryCountV =
1409       (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1410 
1411   bool AnyInline = false;
1412   for (User *User : Users) {
1413     // Don't bother with BlockAddress used by CallBr for asm goto.
1414     if (isa<BlockAddress>(User))
1415       continue;
1416 
1417     CallBase *CB = getSupportedCallBase(User);
1418 
1419     if (isLimitReached())
1420       continue;
1421 
1422     OptimizationRemarkEmitter CallerORE(CB->getCaller());
1423     if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1424       continue;
1425 
1426     // Construct remark before doing the inlining, as after successful inlining
1427     // the callsite is removed.
1428     OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1429     OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1430        << ore::NV("Caller", CB->getCaller());
1431 
1432     InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
1433     // We can only forward varargs when we outlined a single region, else we
1434     // bail on vararg functions.
1435     if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,
1436                         (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1437                                          : nullptr))
1438              .isSuccess())
1439       continue;
1440 
1441     CallerORE.emit(OR);
1442 
1443     // Now update the entry count:
1444     if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1445       uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1446       CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1447     }
1448 
1449     AnyInline = true;
1450     NumPartialInlining++;
1451     // Update the stats
1452     if (Cloner.ClonedOI)
1453       NumPartialInlined++;
1454     else
1455       NumColdOutlinePartialInlined++;
1456   }
1457 
1458   if (AnyInline) {
1459     Cloner.IsFunctionInlined = true;
1460     if (CalleeEntryCount)
1461       Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1462           CalleeEntryCountV, CalleeEntryCount->getType()));
1463     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1464     OrigFuncORE.emit([&]() {
1465       return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1466              << "Partially inlined into at least one caller";
1467     });
1468   }
1469 
1470   return AnyInline;
1471 }
1472 
run(Module & M)1473 bool PartialInlinerImpl::run(Module &M) {
1474   if (DisablePartialInlining)
1475     return false;
1476 
1477   std::vector<Function *> Worklist;
1478   Worklist.reserve(M.size());
1479   for (Function &F : M)
1480     if (!F.use_empty() && !F.isDeclaration())
1481       Worklist.push_back(&F);
1482 
1483   bool Changed = false;
1484   while (!Worklist.empty()) {
1485     Function *CurrFunc = Worklist.back();
1486     Worklist.pop_back();
1487 
1488     if (CurrFunc->use_empty())
1489       continue;
1490 
1491     std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1492     if (Result.second)
1493       Worklist.push_back(Result.second);
1494     Changed |= Result.first;
1495   }
1496 
1497   return Changed;
1498 }
1499 
1500 char PartialInlinerLegacyPass::ID = 0;
1501 
1502 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1503                       "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)1504 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1505 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
1506 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1507 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
1508 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1509                     "Partial Inliner", false, false)
1510 
1511 ModulePass *llvm::createPartialInliningPass() {
1512   return new PartialInlinerLegacyPass();
1513 }
1514 
run(Module & M,ModuleAnalysisManager & AM)1515 PreservedAnalyses PartialInlinerPass::run(Module &M,
1516                                           ModuleAnalysisManager &AM) {
1517   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1518 
1519   auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1520     return FAM.getResult<AssumptionAnalysis>(F);
1521   };
1522 
1523   auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1524     return FAM.getCachedResult<AssumptionAnalysis>(F);
1525   };
1526 
1527   auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1528     return FAM.getResult<BlockFrequencyAnalysis>(F);
1529   };
1530 
1531   auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1532     return FAM.getResult<TargetIRAnalysis>(F);
1533   };
1534 
1535   auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1536     return FAM.getResult<TargetLibraryAnalysis>(F);
1537   };
1538 
1539   ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1540 
1541   if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1542                          GetTLI, PSI, GetBFI)
1543           .run(M))
1544     return PreservedAnalyses::none();
1545   return PreservedAnalyses::all();
1546 }
1547