1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/DepthFirstIterator.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/BranchProbabilityInfo.h"
23 #include "llvm/Analysis/InlineCost.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetLibraryInfo.h"
28 #include "llvm/Analysis/TargetTransformInfo.h"
29 #include "llvm/IR/Attributes.h"
30 #include "llvm/IR/BasicBlock.h"
31 #include "llvm/IR/CFG.h"
32 #include "llvm/IR/DebugLoc.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/Function.h"
36 #include "llvm/IR/InstrTypes.h"
37 #include "llvm/IR/Instruction.h"
38 #include "llvm/IR/Instructions.h"
39 #include "llvm/IR/IntrinsicInst.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/Module.h"
42 #include "llvm/IR/Operator.h"
43 #include "llvm/IR/ProfDataUtils.h"
44 #include "llvm/IR/User.h"
45 #include "llvm/Support/BlockFrequency.h"
46 #include "llvm/Support/BranchProbability.h"
47 #include "llvm/Support/Casting.h"
48 #include "llvm/Support/CommandLine.h"
49 #include "llvm/Support/ErrorHandling.h"
50 #include "llvm/Transforms/IPO.h"
51 #include "llvm/Transforms/Utils/Cloning.h"
52 #include "llvm/Transforms/Utils/CodeExtractor.h"
53 #include "llvm/Transforms/Utils/ValueMapper.h"
54 #include <algorithm>
55 #include <cassert>
56 #include <cstdint>
57 #include <memory>
58 #include <tuple>
59 #include <vector>
60 
61 using namespace llvm;
62 
63 #define DEBUG_TYPE "partial-inlining"
64 
65 STATISTIC(NumPartialInlined,
66           "Number of callsites functions partially inlined into.");
67 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
68                                         "cold outlined regions were partially "
69                                         "inlined into its caller(s).");
70 STATISTIC(NumColdRegionsFound,
71            "Number of cold single entry/exit regions found.");
72 STATISTIC(NumColdRegionsOutlined,
73            "Number of cold single entry/exit regions outlined.");
74 
75 // Command line option to disable partial-inlining. The default is false:
76 static cl::opt<bool>
77     DisablePartialInlining("disable-partial-inlining", cl::init(false),
78                            cl::Hidden, cl::desc("Disable partial inlining"));
79 // Command line option to disable multi-region partial-inlining. The default is
80 // false:
81 static cl::opt<bool> DisableMultiRegionPartialInline(
82     "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
83     cl::desc("Disable multi-region partial inlining"));
84 
85 // Command line option to force outlining in regions with live exit variables.
86 // The default is false:
87 static cl::opt<bool>
88     ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
89                cl::desc("Force outline regions with live exits"));
90 
91 // Command line option to enable marking outline functions with Cold Calling
92 // Convention. The default is false:
93 static cl::opt<bool>
94     MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
95                        cl::desc("Mark outline function calls with ColdCC"));
96 
97 // This is an option used by testing:
98 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
99 
100                                       cl::ReallyHidden,
101                                       cl::desc("Skip Cost Analysis"));
102 // Used to determine if a cold region is worth outlining based on
103 // its inlining cost compared to the original function.  Default is set at 10%.
104 // ie. if the cold region reduces the inlining cost of the original function by
105 // at least 10%.
106 static cl::opt<float> MinRegionSizeRatio(
107     "min-region-size-ratio", cl::init(0.1), cl::Hidden,
108     cl::desc("Minimum ratio comparing relative sizes of each "
109              "outline candidate and original function"));
110 // Used to tune the minimum number of execution counts needed in the predecessor
111 // block to the cold edge. ie. confidence interval.
112 static cl::opt<unsigned>
113     MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
114                              cl::desc("Minimum block executions to consider "
115                                       "its BranchProbabilityInfo valid"));
116 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
117 // if the branch probability is 10% or less, then it is deemed as 'cold'.
118 static cl::opt<float> ColdBranchRatio(
119     "cold-branch-ratio", cl::init(0.1), cl::Hidden,
120     cl::desc("Minimum BranchProbability to consider a region cold."));
121 
122 static cl::opt<unsigned> MaxNumInlineBlocks(
123     "max-num-inline-blocks", cl::init(5), cl::Hidden,
124     cl::desc("Max number of blocks to be partially inlined"));
125 
126 // Command line option to set the maximum number of partial inlining allowed
127 // for the module. The default value of -1 means no limit.
128 static cl::opt<int> MaxNumPartialInlining(
129     "max-partial-inlining", cl::init(-1), cl::Hidden,
130     cl::desc("Max number of partial inlining. The default is unlimited"));
131 
132 // Used only when PGO or user annotated branch data is absent. It is
133 // the least value that is used to weigh the outline region. If BFI
134 // produces larger value, the BFI value will be used.
135 static cl::opt<int>
136     OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
137                              cl::Hidden,
138                              cl::desc("Relative frequency of outline region to "
139                                       "the entry block"));
140 
141 static cl::opt<unsigned> ExtraOutliningPenalty(
142     "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
143     cl::desc("A debug option to add additional penalty to the computed one."));
144 
145 namespace {
146 
147 struct FunctionOutliningInfo {
148   FunctionOutliningInfo() = default;
149 
150   // Returns the number of blocks to be inlined including all blocks
151   // in Entries and one return block.
getNumInlinedBlocks__anon4aedeb920111::FunctionOutliningInfo152   unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
153 
154   // A set of blocks including the function entry that guard
155   // the region to be outlined.
156   SmallVector<BasicBlock *, 4> Entries;
157 
158   // The return block that is not included in the outlined region.
159   BasicBlock *ReturnBlock = nullptr;
160 
161   // The dominating block of the region to be outlined.
162   BasicBlock *NonReturnBlock = nullptr;
163 
164   // The set of blocks in Entries that are predecessors to ReturnBlock
165   SmallVector<BasicBlock *, 4> ReturnBlockPreds;
166 };
167 
168 struct FunctionOutliningMultiRegionInfo {
169   FunctionOutliningMultiRegionInfo() = default;
170 
171   // Container for outline regions
172   struct OutlineRegionInfo {
OutlineRegionInfo__anon4aedeb920111::FunctionOutliningMultiRegionInfo::OutlineRegionInfo173     OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
174                       BasicBlock *EntryBlock, BasicBlock *ExitBlock,
175                       BasicBlock *ReturnBlock)
176         : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
177           ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
178     SmallVector<BasicBlock *, 8> Region;
179     BasicBlock *EntryBlock;
180     BasicBlock *ExitBlock;
181     BasicBlock *ReturnBlock;
182   };
183 
184   SmallVector<OutlineRegionInfo, 4> ORI;
185 };
186 
187 struct PartialInlinerImpl {
188 
PartialInlinerImpl__anon4aedeb920111::PartialInlinerImpl189   PartialInlinerImpl(
190       function_ref<AssumptionCache &(Function &)> GetAC,
191       function_ref<AssumptionCache *(Function &)> LookupAC,
192       function_ref<TargetTransformInfo &(Function &)> GTTI,
193       function_ref<const TargetLibraryInfo &(Function &)> GTLI,
194       ProfileSummaryInfo &ProfSI,
195       function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
196       : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
197         GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
198 
199   bool run(Module &M);
200   // Main part of the transformation that calls helper functions to find
201   // outlining candidates, clone & outline the function, and attempt to
202   // partially inline the resulting function. Returns true if
203   // inlining was successful, false otherwise.  Also returns the outline
204   // function (only if we partially inlined early returns) as there is a
205   // possibility to further "peel" early return statements that were left in the
206   // outline function due to code size.
207   std::pair<bool, Function *> unswitchFunction(Function &F);
208 
209   // This class speculatively clones the function to be partial inlined.
210   // At the end of partial inlining, the remaining callsites to the cloned
211   // function that are not partially inlined will be fixed up to reference
212   // the original function, and the cloned function will be erased.
213   struct FunctionCloner {
214     // Two constructors, one for single region outlining, the other for
215     // multi-region outlining.
216     FunctionCloner(Function *F, FunctionOutliningInfo *OI,
217                    OptimizationRemarkEmitter &ORE,
218                    function_ref<AssumptionCache *(Function &)> LookupAC,
219                    function_ref<TargetTransformInfo &(Function &)> GetTTI);
220     FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
221                    OptimizationRemarkEmitter &ORE,
222                    function_ref<AssumptionCache *(Function &)> LookupAC,
223                    function_ref<TargetTransformInfo &(Function &)> GetTTI);
224 
225     ~FunctionCloner();
226 
227     // Prepare for function outlining: making sure there is only
228     // one incoming edge from the extracted/outlined region to
229     // the return block.
230     void normalizeReturnBlock() const;
231 
232     // Do function outlining for cold regions.
233     bool doMultiRegionFunctionOutlining();
234     // Do function outlining for region after early return block(s).
235     // NOTE: For vararg functions that do the vararg handling in the outlined
236     //       function, we temporarily generate IR that does not properly
237     //       forward varargs to the outlined function. Calling InlineFunction
238     //       will update calls to the outlined functions to properly forward
239     //       the varargs.
240     Function *doSingleRegionFunctionOutlining();
241 
242     Function *OrigFunc = nullptr;
243     Function *ClonedFunc = nullptr;
244 
245     typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
246     // Keep track of Outlined Functions and the basic block they're called from.
247     SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
248 
249     // ClonedFunc is inlined in one of its callers after function
250     // outlining.
251     bool IsFunctionInlined = false;
252     // The cost of the region to be outlined.
253     InstructionCost OutlinedRegionCost = 0;
254     // ClonedOI is specific to outlining non-early return blocks.
255     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
256     // ClonedOMRI is specific to outlining cold regions.
257     std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
258     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
259     OptimizationRemarkEmitter &ORE;
260     function_ref<AssumptionCache *(Function &)> LookupAC;
261     function_ref<TargetTransformInfo &(Function &)> GetTTI;
262   };
263 
264 private:
265   int NumPartialInlining = 0;
266   function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
267   function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
268   function_ref<TargetTransformInfo &(Function &)> GetTTI;
269   function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
270   function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
271   ProfileSummaryInfo &PSI;
272 
273   // Return the frequency of the OutlininingBB relative to F's entry point.
274   // The result is no larger than 1 and is represented using BP.
275   // (Note that the outlined region's 'head' block can only have incoming
276   // edges from the guarding entry blocks).
277   BranchProbability
278   getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
279 
280   // Return true if the callee of CB should be partially inlined with
281   // profit.
282   bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
283                            BlockFrequency WeightedOutliningRcost,
284                            OptimizationRemarkEmitter &ORE) const;
285 
286   // Try to inline DuplicateFunction (cloned from F with call to
287   // the OutlinedFunction into its callers. Return true
288   // if there is any successful inlining.
289   bool tryPartialInline(FunctionCloner &Cloner);
290 
291   // Compute the mapping from use site of DuplicationFunction to the enclosing
292   // BB's profile count.
293   void
294   computeCallsiteToProfCountMap(Function *DuplicateFunction,
295                                 DenseMap<User *, uint64_t> &SiteCountMap) const;
296 
isLimitReached__anon4aedeb920111::PartialInlinerImpl297   bool isLimitReached() const {
298     return (MaxNumPartialInlining != -1 &&
299             NumPartialInlining >= MaxNumPartialInlining);
300   }
301 
getSupportedCallBase__anon4aedeb920111::PartialInlinerImpl302   static CallBase *getSupportedCallBase(User *U) {
303     if (isa<CallInst>(U) || isa<InvokeInst>(U))
304       return cast<CallBase>(U);
305     llvm_unreachable("All uses must be calls");
306     return nullptr;
307   }
308 
getOneCallSiteTo__anon4aedeb920111::PartialInlinerImpl309   static CallBase *getOneCallSiteTo(Function &F) {
310     User *User = *F.user_begin();
311     return getSupportedCallBase(User);
312   }
313 
getOneDebugLoc__anon4aedeb920111::PartialInlinerImpl314   std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
315     CallBase *CB = getOneCallSiteTo(F);
316     DebugLoc DLoc = CB->getDebugLoc();
317     BasicBlock *Block = CB->getParent();
318     return std::make_tuple(DLoc, Block);
319   }
320 
321   // Returns the costs associated with function outlining:
322   // - The first value is the non-weighted runtime cost for making the call
323   //   to the outlined function, including the addtional  setup cost in the
324   //    outlined function itself;
325   // - The second value is the estimated size of the new call sequence in
326   //   basic block Cloner.OutliningCallBB;
327   std::tuple<InstructionCost, InstructionCost>
328   computeOutliningCosts(FunctionCloner &Cloner) const;
329 
330   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
331   // approximate both the size and runtime cost (Note that in the current
332   // inline cost analysis, there is no clear distinction there either).
333   static InstructionCost computeBBInlineCost(BasicBlock *BB,
334                                              TargetTransformInfo *TTI);
335 
336   std::unique_ptr<FunctionOutliningInfo>
337   computeOutliningInfo(Function &F) const;
338 
339   std::unique_ptr<FunctionOutliningMultiRegionInfo>
340   computeOutliningColdRegionsInfo(Function &F,
341                                   OptimizationRemarkEmitter &ORE) const;
342 };
343 
344 } // end anonymous namespace
345 
346 std::unique_ptr<FunctionOutliningMultiRegionInfo>
computeOutliningColdRegionsInfo(Function & F,OptimizationRemarkEmitter & ORE) const347 PartialInlinerImpl::computeOutliningColdRegionsInfo(
348     Function &F, OptimizationRemarkEmitter &ORE) const {
349   BasicBlock *EntryBlock = &F.front();
350 
351   DominatorTree DT(F);
352   LoopInfo LI(DT);
353   BranchProbabilityInfo BPI(F, LI);
354   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
355   BlockFrequencyInfo *BFI;
356   if (!GetBFI) {
357     ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
358     BFI = ScopedBFI.get();
359   } else
360     BFI = &(GetBFI(F));
361 
362   // Return if we don't have profiling information.
363   if (!PSI.hasInstrumentationProfile())
364     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
365 
366   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
367       std::make_unique<FunctionOutliningMultiRegionInfo>();
368 
369   auto IsSingleExit =
370       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
371     BasicBlock *ExitBlock = nullptr;
372     for (auto *Block : BlockList) {
373       for (BasicBlock *Succ : successors(Block)) {
374         if (!is_contained(BlockList, Succ)) {
375           if (ExitBlock) {
376             ORE.emit([&]() {
377               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
378                                               &Succ->front())
379                      << "Region dominated by "
380                      << ore::NV("Block", BlockList.front()->getName())
381                      << " has more than one region exit edge.";
382             });
383             return nullptr;
384           }
385 
386           ExitBlock = Block;
387         }
388       }
389     }
390     return ExitBlock;
391   };
392 
393   auto BBProfileCount = [BFI](BasicBlock *BB) {
394     return BFI->getBlockProfileCount(BB).value_or(0);
395   };
396 
397   // Use the same computeBBInlineCost function to compute the cost savings of
398   // the outlining the candidate region.
399   TargetTransformInfo *FTTI = &GetTTI(F);
400   InstructionCost OverallFunctionCost = 0;
401   for (auto &BB : F)
402     OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
403 
404   LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
405                     << "\n";);
406 
407   InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
408       [&](auto Cost) { return Cost * MinRegionSizeRatio; });
409 
410   BranchProbability MinBranchProbability(
411       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
412       MinBlockCounterExecution);
413   bool ColdCandidateFound = false;
414   BasicBlock *CurrEntry = EntryBlock;
415   std::vector<BasicBlock *> DFS;
416   DenseMap<BasicBlock *, bool> VisitedMap;
417   DFS.push_back(CurrEntry);
418   VisitedMap[CurrEntry] = true;
419 
420   // Use Depth First Search on the basic blocks to find CFG edges that are
421   // considered cold.
422   // Cold regions considered must also have its inline cost compared to the
423   // overall inline cost of the original function.  The region is outlined only
424   // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
425   // more.
426   while (!DFS.empty()) {
427     auto *ThisBB = DFS.back();
428     DFS.pop_back();
429     // Only consider regions with predecessor blocks that are considered
430     // not-cold (default: part of the top 99.99% of all block counters)
431     // AND greater than our minimum block execution count (default: 100).
432     if (PSI.isColdBlock(ThisBB, BFI) ||
433         BBProfileCount(ThisBB) < MinBlockCounterExecution)
434       continue;
435     for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
436       if (VisitedMap[*SI])
437         continue;
438       VisitedMap[*SI] = true;
439       DFS.push_back(*SI);
440       // If branch isn't cold, we skip to the next one.
441       BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
442       if (SuccProb > MinBranchProbability)
443         continue;
444 
445       LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
446                         << SI->getName()
447                         << "\nBranch Probability = " << SuccProb << "\n";);
448 
449       SmallVector<BasicBlock *, 8> DominateVector;
450       DT.getDescendants(*SI, DominateVector);
451       assert(!DominateVector.empty() &&
452              "SI should be reachable and have at least itself as descendant");
453 
454       // We can only outline single entry regions (for now).
455       if (!DominateVector.front()->hasNPredecessors(1)) {
456         LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
457                           << " doesn't have a single predecessor in the "
458                              "dominator tree\n";);
459         continue;
460       }
461 
462       BasicBlock *ExitBlock = nullptr;
463       // We can only outline single exit regions (for now).
464       if (!(ExitBlock = IsSingleExit(DominateVector))) {
465         LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
466                           << " doesn't have a unique successor\n";);
467         continue;
468       }
469 
470       InstructionCost OutlineRegionCost = 0;
471       for (auto *BB : DominateVector)
472         OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
473 
474       LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
475                         << "\n";);
476 
477       if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
478         ORE.emit([&]() {
479           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
480                                             &SI->front())
481                  << ore::NV("Callee", &F)
482                  << " inline cost-savings smaller than "
483                  << ore::NV("Cost", MinOutlineRegionCost);
484         });
485 
486         LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
487                           << MinOutlineRegionCost << "\n";);
488         continue;
489       }
490 
491       // For now, ignore blocks that belong to a SISE region that is a
492       // candidate for outlining.  In the future, we may want to look
493       // at inner regions because the outer region may have live-exit
494       // variables.
495       for (auto *BB : DominateVector)
496         VisitedMap[BB] = true;
497 
498       // ReturnBlock here means the block after the outline call
499       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
500       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
501           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
502       OutliningInfo->ORI.push_back(RegInfo);
503       LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
504                         << DominateVector.front()->getName() << "\n";);
505       ColdCandidateFound = true;
506       NumColdRegionsFound++;
507     }
508   }
509 
510   if (ColdCandidateFound)
511     return OutliningInfo;
512 
513   return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
514 }
515 
516 std::unique_ptr<FunctionOutliningInfo>
computeOutliningInfo(Function & F) const517 PartialInlinerImpl::computeOutliningInfo(Function &F) const {
518   BasicBlock *EntryBlock = &F.front();
519   BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
520   if (!BR || BR->isUnconditional())
521     return std::unique_ptr<FunctionOutliningInfo>();
522 
523   // Returns true if Succ is BB's successor
524   auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
525     return is_contained(successors(BB), Succ);
526   };
527 
528   auto IsReturnBlock = [](BasicBlock *BB) {
529     Instruction *TI = BB->getTerminator();
530     return isa<ReturnInst>(TI);
531   };
532 
533   auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
534     if (IsReturnBlock(Succ1))
535       return std::make_tuple(Succ1, Succ2);
536     if (IsReturnBlock(Succ2))
537       return std::make_tuple(Succ2, Succ1);
538 
539     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
540   };
541 
542   // Detect a triangular shape:
543   auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
544     if (IsSuccessor(Succ1, Succ2))
545       return std::make_tuple(Succ1, Succ2);
546     if (IsSuccessor(Succ2, Succ1))
547       return std::make_tuple(Succ2, Succ1);
548 
549     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
550   };
551 
552   std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
553       std::make_unique<FunctionOutliningInfo>();
554 
555   BasicBlock *CurrEntry = EntryBlock;
556   bool CandidateFound = false;
557   do {
558     // The number of blocks to be inlined has already reached
559     // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
560     // disables partial inlining for the function.
561     if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
562       break;
563 
564     if (succ_size(CurrEntry) != 2)
565       break;
566 
567     BasicBlock *Succ1 = *succ_begin(CurrEntry);
568     BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
569 
570     BasicBlock *ReturnBlock, *NonReturnBlock;
571     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
572 
573     if (ReturnBlock) {
574       OutliningInfo->Entries.push_back(CurrEntry);
575       OutliningInfo->ReturnBlock = ReturnBlock;
576       OutliningInfo->NonReturnBlock = NonReturnBlock;
577       CandidateFound = true;
578       break;
579     }
580 
581     BasicBlock *CommSucc, *OtherSucc;
582     std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
583 
584     if (!CommSucc)
585       break;
586 
587     OutliningInfo->Entries.push_back(CurrEntry);
588     CurrEntry = OtherSucc;
589   } while (true);
590 
591   if (!CandidateFound)
592     return std::unique_ptr<FunctionOutliningInfo>();
593 
594   // There should not be any successors (not in the entry set) other than
595   // {ReturnBlock, NonReturnBlock}
596   assert(OutliningInfo->Entries[0] == &F.front() &&
597          "Function Entry must be the first in Entries vector");
598   DenseSet<BasicBlock *> Entries;
599   for (BasicBlock *E : OutliningInfo->Entries)
600     Entries.insert(E);
601 
602   // Returns true of BB has Predecessor which is not
603   // in Entries set.
604   auto HasNonEntryPred = [Entries](BasicBlock *BB) {
605     for (auto *Pred : predecessors(BB)) {
606       if (!Entries.count(Pred))
607         return true;
608     }
609     return false;
610   };
611   auto CheckAndNormalizeCandidate =
612       [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
613         for (BasicBlock *E : OutliningInfo->Entries) {
614           for (auto *Succ : successors(E)) {
615             if (Entries.count(Succ))
616               continue;
617             if (Succ == OutliningInfo->ReturnBlock)
618               OutliningInfo->ReturnBlockPreds.push_back(E);
619             else if (Succ != OutliningInfo->NonReturnBlock)
620               return false;
621           }
622           // There should not be any outside incoming edges either:
623           if (HasNonEntryPred(E))
624             return false;
625         }
626         return true;
627       };
628 
629   if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
630     return std::unique_ptr<FunctionOutliningInfo>();
631 
632   // Now further growing the candidate's inlining region by
633   // peeling off dominating blocks from the outlining region:
634   while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
635     BasicBlock *Cand = OutliningInfo->NonReturnBlock;
636     if (succ_size(Cand) != 2)
637       break;
638 
639     if (HasNonEntryPred(Cand))
640       break;
641 
642     BasicBlock *Succ1 = *succ_begin(Cand);
643     BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
644 
645     BasicBlock *ReturnBlock, *NonReturnBlock;
646     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
647     if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
648       break;
649 
650     if (NonReturnBlock->getSinglePredecessor() != Cand)
651       break;
652 
653     // Now grow and update OutlininigInfo:
654     OutliningInfo->Entries.push_back(Cand);
655     OutliningInfo->NonReturnBlock = NonReturnBlock;
656     OutliningInfo->ReturnBlockPreds.push_back(Cand);
657     Entries.insert(Cand);
658   }
659 
660   return OutliningInfo;
661 }
662 
663 // Check if there is PGO data or user annotated branch data:
hasProfileData(const Function & F,const FunctionOutliningInfo & OI)664 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
665   if (F.hasProfileData())
666     return true;
667   // Now check if any of the entry block has MD_prof data:
668   for (auto *E : OI.Entries) {
669     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
670     if (!BR || BR->isUnconditional())
671       continue;
672     if (hasBranchWeightMD(*BR))
673       return true;
674   }
675   return false;
676 }
677 
getOutliningCallBBRelativeFreq(FunctionCloner & Cloner) const678 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
679     FunctionCloner &Cloner) const {
680   BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
681   auto EntryFreq =
682       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
683   auto OutliningCallFreq =
684       Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
685   // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
686   // we outlined any regions, so we may encounter situations where the
687   // OutliningCallFreq is *slightly* bigger than the EntryFreq.
688   if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
689     OutliningCallFreq = EntryFreq;
690 
691   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
692       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
693 
694   if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
695     return OutlineRegionRelFreq;
696 
697   // When profile data is not available, we need to be conservative in
698   // estimating the overall savings. Static branch prediction can usually
699   // guess the branch direction right (taken/non-taken), but the guessed
700   // branch probability is usually not biased enough. In case when the
701   // outlined region is predicted to be likely, its probability needs
702   // to be made higher (more biased) to not under-estimate the cost of
703   // function outlining. On the other hand, if the outlined region
704   // is predicted to be less likely, the predicted probablity is usually
705   // higher than the actual. For instance, the actual probability of the
706   // less likely target is only 5%, but the guessed probablity can be
707   // 40%. In the latter case, there is no need for further adjustment.
708   // FIXME: add an option for this.
709   if (OutlineRegionRelFreq < BranchProbability(45, 100))
710     return OutlineRegionRelFreq;
711 
712   OutlineRegionRelFreq = std::max(
713       OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
714 
715   return OutlineRegionRelFreq;
716 }
717 
shouldPartialInline(CallBase & CB,FunctionCloner & Cloner,BlockFrequency WeightedOutliningRcost,OptimizationRemarkEmitter & ORE) const718 bool PartialInlinerImpl::shouldPartialInline(
719     CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
720     OptimizationRemarkEmitter &ORE) const {
721   using namespace ore;
722 
723   Function *Callee = CB.getCalledFunction();
724   assert(Callee == Cloner.ClonedFunc);
725 
726   if (SkipCostAnalysis)
727     return isInlineViable(*Callee).isSuccess();
728 
729   Function *Caller = CB.getCaller();
730   auto &CalleeTTI = GetTTI(*Callee);
731   bool RemarksEnabled =
732       Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
733           DEBUG_TYPE);
734   InlineCost IC =
735       getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
736                     GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
737 
738   if (IC.isAlways()) {
739     ORE.emit([&]() {
740       return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
741              << NV("Callee", Cloner.OrigFunc)
742              << " should always be fully inlined, not partially";
743     });
744     return false;
745   }
746 
747   if (IC.isNever()) {
748     ORE.emit([&]() {
749       return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
750              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
751              << NV("Caller", Caller)
752              << " because it should never be inlined (cost=never)";
753     });
754     return false;
755   }
756 
757   if (!IC) {
758     ORE.emit([&]() {
759       return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
760              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
761              << NV("Caller", Caller) << " because too costly to inline (cost="
762              << NV("Cost", IC.getCost()) << ", threshold="
763              << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
764     });
765     return false;
766   }
767   const DataLayout &DL = Caller->getParent()->getDataLayout();
768 
769   // The savings of eliminating the call:
770   int NonWeightedSavings = getCallsiteCost(CalleeTTI, CB, DL);
771   BlockFrequency NormWeightedSavings(NonWeightedSavings);
772 
773   // Weighted saving is smaller than weighted cost, return false
774   if (NormWeightedSavings < WeightedOutliningRcost) {
775     ORE.emit([&]() {
776       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
777                                         &CB)
778              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
779              << NV("Caller", Caller) << " runtime overhead (overhead="
780              << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
781              << ", savings="
782              << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
783              << ")"
784              << " of making the outlined call is too high";
785     });
786 
787     return false;
788   }
789 
790   ORE.emit([&]() {
791     return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
792            << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
793            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
794            << " (threshold="
795            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
796   });
797   return true;
798 }
799 
800 // TODO: Ideally  we should share Inliner's InlineCost Analysis code.
801 // For now use a simplified version. The returned 'InlineCost' will be used
802 // to esimate the size cost as well as runtime cost of the BB.
803 InstructionCost
computeBBInlineCost(BasicBlock * BB,TargetTransformInfo * TTI)804 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
805                                         TargetTransformInfo *TTI) {
806   InstructionCost InlineCost = 0;
807   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
808   int InstrCost = InlineConstants::getInstrCost();
809   for (Instruction &I : BB->instructionsWithoutDebug()) {
810     // Skip free instructions.
811     switch (I.getOpcode()) {
812     case Instruction::BitCast:
813     case Instruction::PtrToInt:
814     case Instruction::IntToPtr:
815     case Instruction::Alloca:
816     case Instruction::PHI:
817       continue;
818     case Instruction::GetElementPtr:
819       if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
820         continue;
821       break;
822     default:
823       break;
824     }
825 
826     if (I.isLifetimeStartOrEnd())
827       continue;
828 
829     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
830       Intrinsic::ID IID = II->getIntrinsicID();
831       SmallVector<Type *, 4> Tys;
832       FastMathFlags FMF;
833       for (Value *Val : II->args())
834         Tys.push_back(Val->getType());
835 
836       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
837         FMF = FPMO->getFastMathFlags();
838 
839       IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
840       InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
841       continue;
842     }
843 
844     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
845       InlineCost += getCallsiteCost(*TTI, *CI, DL);
846       continue;
847     }
848 
849     if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
850       InlineCost += getCallsiteCost(*TTI, *II, DL);
851       continue;
852     }
853 
854     if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
855       InlineCost += (SI->getNumCases() + 1) * InstrCost;
856       continue;
857     }
858     InlineCost += InstrCost;
859   }
860 
861   return InlineCost;
862 }
863 
864 std::tuple<InstructionCost, InstructionCost>
computeOutliningCosts(FunctionCloner & Cloner) const865 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
866   InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
867   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
868     Function *OutlinedFunc = FuncBBPair.first;
869     BasicBlock* OutliningCallBB = FuncBBPair.second;
870     // Now compute the cost of the call sequence to the outlined function
871     // 'OutlinedFunction' in BB 'OutliningCallBB':
872     auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
873     OutliningFuncCallCost +=
874         computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
875 
876     // Now compute the cost of the extracted/outlined function itself:
877     for (BasicBlock &BB : *OutlinedFunc)
878       OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
879   }
880   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
881          "Outlined function cost should be no less than the outlined region");
882 
883   // The code extractor introduces a new root and exit stub blocks with
884   // additional unconditional branches. Those branches will be eliminated
885   // later with bb layout. The cost should be adjusted accordingly:
886   OutlinedFunctionCost -=
887       2 * InlineConstants::getInstrCost() * Cloner.OutlinedFunctions.size();
888 
889   InstructionCost OutliningRuntimeOverhead =
890       OutliningFuncCallCost +
891       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
892       ExtraOutliningPenalty.getValue();
893 
894   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
895 }
896 
897 // Create the callsite to profile count map which is
898 // used to update the original function's entry count,
899 // after the function is partially inlined into the callsite.
computeCallsiteToProfCountMap(Function * DuplicateFunction,DenseMap<User *,uint64_t> & CallSiteToProfCountMap) const900 void PartialInlinerImpl::computeCallsiteToProfCountMap(
901     Function *DuplicateFunction,
902     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
903   std::vector<User *> Users(DuplicateFunction->user_begin(),
904                             DuplicateFunction->user_end());
905   Function *CurrentCaller = nullptr;
906   std::unique_ptr<BlockFrequencyInfo> TempBFI;
907   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
908 
909   auto ComputeCurrBFI = [&,this](Function *Caller) {
910       // For the old pass manager:
911       if (!GetBFI) {
912         DominatorTree DT(*Caller);
913         LoopInfo LI(DT);
914         BranchProbabilityInfo BPI(*Caller, LI);
915         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
916         CurrentCallerBFI = TempBFI.get();
917       } else {
918         // New pass manager:
919         CurrentCallerBFI = &(GetBFI(*Caller));
920       }
921   };
922 
923   for (User *User : Users) {
924     // Don't bother with BlockAddress used by CallBr for asm goto.
925     if (isa<BlockAddress>(User))
926       continue;
927     CallBase *CB = getSupportedCallBase(User);
928     Function *Caller = CB->getCaller();
929     if (CurrentCaller != Caller) {
930       CurrentCaller = Caller;
931       ComputeCurrBFI(Caller);
932     } else {
933       assert(CurrentCallerBFI && "CallerBFI is not set");
934     }
935     BasicBlock *CallBB = CB->getParent();
936     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
937     if (Count)
938       CallSiteToProfCountMap[User] = *Count;
939     else
940       CallSiteToProfCountMap[User] = 0;
941   }
942 }
943 
FunctionCloner(Function * F,FunctionOutliningInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC,function_ref<TargetTransformInfo & (Function &)> GetTTI)944 PartialInlinerImpl::FunctionCloner::FunctionCloner(
945     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
946     function_ref<AssumptionCache *(Function &)> LookupAC,
947     function_ref<TargetTransformInfo &(Function &)> GetTTI)
948     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
949   ClonedOI = std::make_unique<FunctionOutliningInfo>();
950 
951   // Clone the function, so that we can hack away on it.
952   ValueToValueMapTy VMap;
953   ClonedFunc = CloneFunction(F, VMap);
954 
955   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
956   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
957   for (BasicBlock *BB : OI->Entries)
958     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
959 
960   for (BasicBlock *E : OI->ReturnBlockPreds) {
961     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
962     ClonedOI->ReturnBlockPreds.push_back(NewE);
963   }
964   // Go ahead and update all uses to the duplicate, so that we can just
965   // use the inliner functionality when we're done hacking.
966   F->replaceAllUsesWith(ClonedFunc);
967 }
968 
FunctionCloner(Function * F,FunctionOutliningMultiRegionInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC,function_ref<TargetTransformInfo & (Function &)> GetTTI)969 PartialInlinerImpl::FunctionCloner::FunctionCloner(
970     Function *F, FunctionOutliningMultiRegionInfo *OI,
971     OptimizationRemarkEmitter &ORE,
972     function_ref<AssumptionCache *(Function &)> LookupAC,
973     function_ref<TargetTransformInfo &(Function &)> GetTTI)
974     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
975   ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
976 
977   // Clone the function, so that we can hack away on it.
978   ValueToValueMapTy VMap;
979   ClonedFunc = CloneFunction(F, VMap);
980 
981   // Go through all Outline Candidate Regions and update all BasicBlock
982   // information.
983   for (const FunctionOutliningMultiRegionInfo::OutlineRegionInfo &RegionInfo :
984        OI->ORI) {
985     SmallVector<BasicBlock *, 8> Region;
986     for (BasicBlock *BB : RegionInfo.Region)
987       Region.push_back(cast<BasicBlock>(VMap[BB]));
988 
989     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
990     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
991     BasicBlock *NewReturnBlock = nullptr;
992     if (RegionInfo.ReturnBlock)
993       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
994     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
995         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
996     ClonedOMRI->ORI.push_back(MappedRegionInfo);
997   }
998   // Go ahead and update all uses to the duplicate, so that we can just
999   // use the inliner functionality when we're done hacking.
1000   F->replaceAllUsesWith(ClonedFunc);
1001 }
1002 
normalizeReturnBlock() const1003 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
1004   auto GetFirstPHI = [](BasicBlock *BB) {
1005     BasicBlock::iterator I = BB->begin();
1006     PHINode *FirstPhi = nullptr;
1007     while (I != BB->end()) {
1008       PHINode *Phi = dyn_cast<PHINode>(I);
1009       if (!Phi)
1010         break;
1011       if (!FirstPhi) {
1012         FirstPhi = Phi;
1013         break;
1014       }
1015     }
1016     return FirstPhi;
1017   };
1018 
1019   // Shouldn't need to normalize PHIs if we're not outlining non-early return
1020   // blocks.
1021   if (!ClonedOI)
1022     return;
1023 
1024   // Special hackery is needed with PHI nodes that have inputs from more than
1025   // one extracted block.  For simplicity, just split the PHIs into a two-level
1026   // sequence of PHIs, some of which will go in the extracted region, and some
1027   // of which will go outside.
1028   BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1029   // only split block when necessary:
1030   PHINode *FirstPhi = GetFirstPHI(PreReturn);
1031   unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1032 
1033   if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1034     return;
1035 
1036   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1037     if (llvm::all_equal(PN->incoming_values()))
1038       return PN->getIncomingValue(0);
1039     return nullptr;
1040   };
1041 
1042   ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1043       ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1044   BasicBlock::iterator I = PreReturn->begin();
1045   BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin();
1046   SmallVector<Instruction *, 4> DeadPhis;
1047   while (I != PreReturn->end()) {
1048     PHINode *OldPhi = dyn_cast<PHINode>(I);
1049     if (!OldPhi)
1050       break;
1051 
1052     PHINode *RetPhi =
1053         PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "");
1054     RetPhi->insertBefore(Ins);
1055     OldPhi->replaceAllUsesWith(RetPhi);
1056     Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt();
1057 
1058     RetPhi->addIncoming(&*I, PreReturn);
1059     for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1060       RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1061       OldPhi->removeIncomingValue(E);
1062     }
1063 
1064     // After incoming values splitting, the old phi may become trivial.
1065     // Keeping the trivial phi can introduce definition inside the outline
1066     // region which is live-out, causing necessary overhead (load, store
1067     // arg passing etc).
1068     if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1069       OldPhi->replaceAllUsesWith(OldPhiVal);
1070       DeadPhis.push_back(OldPhi);
1071     }
1072     ++I;
1073   }
1074   for (auto *DP : DeadPhis)
1075     DP->eraseFromParent();
1076 
1077   for (auto *E : ClonedOI->ReturnBlockPreds)
1078     E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1079 }
1080 
doMultiRegionFunctionOutlining()1081 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1082 
1083   auto ComputeRegionCost =
1084       [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
1085     InstructionCost Cost = 0;
1086     for (BasicBlock* BB : Region)
1087       Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1088     return Cost;
1089   };
1090 
1091   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1092 
1093   if (ClonedOMRI->ORI.empty())
1094     return false;
1095 
1096   // The CodeExtractor needs a dominator tree.
1097   DominatorTree DT;
1098   DT.recalculate(*ClonedFunc);
1099 
1100   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1101   LoopInfo LI(DT);
1102   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1103   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1104 
1105   // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1106   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1107 
1108   SetVector<Value *> Inputs, Outputs, Sinks;
1109   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1110        ClonedOMRI->ORI) {
1111     InstructionCost CurrentOutlinedRegionCost =
1112         ComputeRegionCost(RegionInfo.Region);
1113 
1114     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1115                      ClonedFuncBFI.get(), &BPI,
1116                      LookupAC(*RegionInfo.EntryBlock->getParent()),
1117                      /* AllowVarargs */ false);
1118 
1119     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1120 
1121     LLVM_DEBUG({
1122       dbgs() << "inputs: " << Inputs.size() << "\n";
1123       dbgs() << "outputs: " << Outputs.size() << "\n";
1124       for (Value *value : Inputs)
1125         dbgs() << "value used in func: " << *value << "\n";
1126       for (Value *output : Outputs)
1127         dbgs() << "instr used in func: " << *output << "\n";
1128     });
1129 
1130     // Do not extract regions that have live exit variables.
1131     if (Outputs.size() > 0 && !ForceLiveExit)
1132       continue;
1133 
1134     if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1135       CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1136       BasicBlock *OutliningCallBB = OCS->getParent();
1137       assert(OutliningCallBB->getParent() == ClonedFunc);
1138       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1139       NumColdRegionsOutlined++;
1140       OutlinedRegionCost += CurrentOutlinedRegionCost;
1141 
1142       if (MarkOutlinedColdCC) {
1143         OutlinedFunc->setCallingConv(CallingConv::Cold);
1144         OCS->setCallingConv(CallingConv::Cold);
1145       }
1146     } else
1147       ORE.emit([&]() {
1148         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1149                                         &RegionInfo.Region.front()->front())
1150                << "Failed to extract region at block "
1151                << ore::NV("Block", RegionInfo.Region.front());
1152       });
1153   }
1154 
1155   return !OutlinedFunctions.empty();
1156 }
1157 
1158 Function *
doSingleRegionFunctionOutlining()1159 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1160   // Returns true if the block is to be partial inlined into the caller
1161   // (i.e. not to be extracted to the out of line function)
1162   auto ToBeInlined = [&, this](BasicBlock *BB) {
1163     return BB == ClonedOI->ReturnBlock ||
1164            llvm::is_contained(ClonedOI->Entries, BB);
1165   };
1166 
1167   assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1168   // The CodeExtractor needs a dominator tree.
1169   DominatorTree DT;
1170   DT.recalculate(*ClonedFunc);
1171 
1172   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1173   LoopInfo LI(DT);
1174   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1175   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1176 
1177   // Gather up the blocks that we're going to extract.
1178   std::vector<BasicBlock *> ToExtract;
1179   auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1180   ToExtract.push_back(ClonedOI->NonReturnBlock);
1181   OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1182       ClonedOI->NonReturnBlock, ClonedFuncTTI);
1183   for (BasicBlock *BB : depth_first(&ClonedFunc->getEntryBlock()))
1184     if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) {
1185       ToExtract.push_back(BB);
1186       // FIXME: the code extractor may hoist/sink more code
1187       // into the outlined function which may make the outlining
1188       // overhead (the difference of the outlined function cost
1189       // and OutliningRegionCost) look larger.
1190       OutlinedRegionCost += computeBBInlineCost(BB, ClonedFuncTTI);
1191     }
1192 
1193   // Extract the body of the if.
1194   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1195   Function *OutlinedFunc =
1196       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1197                     ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1198                     /* AllowVarargs */ true)
1199           .extractCodeRegion(CEAC);
1200 
1201   if (OutlinedFunc) {
1202     BasicBlock *OutliningCallBB =
1203         PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1204     assert(OutliningCallBB->getParent() == ClonedFunc);
1205     OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1206   } else
1207     ORE.emit([&]() {
1208       return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1209                                       &ToExtract.front()->front())
1210              << "Failed to extract region at block "
1211              << ore::NV("Block", ToExtract.front());
1212     });
1213 
1214   return OutlinedFunc;
1215 }
1216 
~FunctionCloner()1217 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1218   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1219   // users (function pointers, etc.) back to the original function.
1220   ClonedFunc->replaceAllUsesWith(OrigFunc);
1221   ClonedFunc->eraseFromParent();
1222   if (!IsFunctionInlined) {
1223     // Remove each function that was speculatively created if there is no
1224     // reference.
1225     for (auto FuncBBPair : OutlinedFunctions) {
1226       Function *Func = FuncBBPair.first;
1227       Func->eraseFromParent();
1228     }
1229   }
1230 }
1231 
unswitchFunction(Function & F)1232 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1233   if (F.hasAddressTaken())
1234     return {false, nullptr};
1235 
1236   // Let inliner handle it
1237   if (F.hasFnAttribute(Attribute::AlwaysInline))
1238     return {false, nullptr};
1239 
1240   if (F.hasFnAttribute(Attribute::NoInline))
1241     return {false, nullptr};
1242 
1243   if (PSI.isFunctionEntryCold(&F))
1244     return {false, nullptr};
1245 
1246   if (F.users().empty())
1247     return {false, nullptr};
1248 
1249   OptimizationRemarkEmitter ORE(&F);
1250 
1251   // Only try to outline cold regions if we have a profile summary, which
1252   // implies we have profiling information.
1253   if (PSI.hasProfileSummary() && F.hasProfileData() &&
1254       !DisableMultiRegionPartialInline) {
1255     std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1256         computeOutliningColdRegionsInfo(F, ORE);
1257     if (OMRI) {
1258       FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1259 
1260       LLVM_DEBUG({
1261         dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1262         dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1263                << "\n";
1264       });
1265 
1266       bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1267 
1268       if (DidOutline) {
1269         LLVM_DEBUG({
1270           dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1271           Cloner.ClonedFunc->print(dbgs());
1272           dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1273         });
1274 
1275         if (tryPartialInline(Cloner))
1276           return {true, nullptr};
1277       }
1278     }
1279   }
1280 
1281   // Fall-thru to regular partial inlining if we:
1282   //    i) can't find any cold regions to outline, or
1283   //   ii) can't inline the outlined function anywhere.
1284   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1285   if (!OI)
1286     return {false, nullptr};
1287 
1288   FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1289   Cloner.normalizeReturnBlock();
1290 
1291   Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1292 
1293   if (!OutlinedFunction)
1294     return {false, nullptr};
1295 
1296   if (tryPartialInline(Cloner))
1297     return {true, OutlinedFunction};
1298 
1299   return {false, nullptr};
1300 }
1301 
tryPartialInline(FunctionCloner & Cloner)1302 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1303   if (Cloner.OutlinedFunctions.empty())
1304     return false;
1305 
1306   auto OutliningCosts = computeOutliningCosts(Cloner);
1307 
1308   InstructionCost SizeCost = std::get<0>(OutliningCosts);
1309   InstructionCost NonWeightedRcost = std::get<1>(OutliningCosts);
1310 
1311   assert(SizeCost.isValid() && NonWeightedRcost.isValid() &&
1312          "Expected valid costs");
1313 
1314   // Only calculate RelativeToEntryFreq when we are doing single region
1315   // outlining.
1316   BranchProbability RelativeToEntryFreq;
1317   if (Cloner.ClonedOI)
1318     RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1319   else
1320     // RelativeToEntryFreq doesn't make sense when we have more than one
1321     // outlined call because each call will have a different relative frequency
1322     // to the entry block.  We can consider using the average, but the
1323     // usefulness of that information is questionable. For now, assume we never
1324     // execute the calls to outlined functions.
1325     RelativeToEntryFreq = BranchProbability(0, 1);
1326 
1327   BlockFrequency WeightedRcost =
1328       BlockFrequency(*NonWeightedRcost.getValue()) * RelativeToEntryFreq;
1329 
1330   // The call sequence(s) to the outlined function(s) are larger than the sum of
1331   // the original outlined region size(s), it does not increase the chances of
1332   // inlining the function with outlining (The inliner uses the size increase to
1333   // model the cost of inlining a callee).
1334   if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1335     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1336     DebugLoc DLoc;
1337     BasicBlock *Block;
1338     std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
1339     OrigFuncORE.emit([&]() {
1340       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1341                                         DLoc, Block)
1342              << ore::NV("Function", Cloner.OrigFunc)
1343              << " not partially inlined into callers (Original Size = "
1344              << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1345              << ", Size of call sequence to outlined function = "
1346              << ore::NV("NewSize", SizeCost) << ")";
1347     });
1348     return false;
1349   }
1350 
1351   assert(Cloner.OrigFunc->users().empty() &&
1352          "F's users should all be replaced!");
1353 
1354   std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1355                             Cloner.ClonedFunc->user_end());
1356 
1357   DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1358   auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1359   if (CalleeEntryCount)
1360     computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1361 
1362   uint64_t CalleeEntryCountV =
1363       (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1364 
1365   bool AnyInline = false;
1366   for (User *User : Users) {
1367     // Don't bother with BlockAddress used by CallBr for asm goto.
1368     if (isa<BlockAddress>(User))
1369       continue;
1370 
1371     CallBase *CB = getSupportedCallBase(User);
1372 
1373     if (isLimitReached())
1374       continue;
1375 
1376     OptimizationRemarkEmitter CallerORE(CB->getCaller());
1377     if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1378       continue;
1379 
1380     // Construct remark before doing the inlining, as after successful inlining
1381     // the callsite is removed.
1382     OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1383     OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1384        << ore::NV("Caller", CB->getCaller());
1385 
1386     InlineFunctionInfo IFI(GetAssumptionCache, &PSI);
1387     // We can only forward varargs when we outlined a single region, else we
1388     // bail on vararg functions.
1389     if (!InlineFunction(*CB, IFI, /*MergeAttributes=*/false, nullptr, true,
1390                         (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1391                                          : nullptr))
1392              .isSuccess())
1393       continue;
1394 
1395     CallerORE.emit(OR);
1396 
1397     // Now update the entry count:
1398     if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1399       uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1400       CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1401     }
1402 
1403     AnyInline = true;
1404     NumPartialInlining++;
1405     // Update the stats
1406     if (Cloner.ClonedOI)
1407       NumPartialInlined++;
1408     else
1409       NumColdOutlinePartialInlined++;
1410   }
1411 
1412   if (AnyInline) {
1413     Cloner.IsFunctionInlined = true;
1414     if (CalleeEntryCount)
1415       Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1416           CalleeEntryCountV, CalleeEntryCount->getType()));
1417     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1418     OrigFuncORE.emit([&]() {
1419       return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1420              << "Partially inlined into at least one caller";
1421     });
1422   }
1423 
1424   return AnyInline;
1425 }
1426 
run(Module & M)1427 bool PartialInlinerImpl::run(Module &M) {
1428   if (DisablePartialInlining)
1429     return false;
1430 
1431   std::vector<Function *> Worklist;
1432   Worklist.reserve(M.size());
1433   for (Function &F : M)
1434     if (!F.use_empty() && !F.isDeclaration())
1435       Worklist.push_back(&F);
1436 
1437   bool Changed = false;
1438   while (!Worklist.empty()) {
1439     Function *CurrFunc = Worklist.back();
1440     Worklist.pop_back();
1441 
1442     if (CurrFunc->use_empty())
1443       continue;
1444 
1445     std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1446     if (Result.second)
1447       Worklist.push_back(Result.second);
1448     Changed |= Result.first;
1449   }
1450 
1451   return Changed;
1452 }
1453 
run(Module & M,ModuleAnalysisManager & AM)1454 PreservedAnalyses PartialInlinerPass::run(Module &M,
1455                                           ModuleAnalysisManager &AM) {
1456   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1457 
1458   auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1459     return FAM.getResult<AssumptionAnalysis>(F);
1460   };
1461 
1462   auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1463     return FAM.getCachedResult<AssumptionAnalysis>(F);
1464   };
1465 
1466   auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1467     return FAM.getResult<BlockFrequencyAnalysis>(F);
1468   };
1469 
1470   auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1471     return FAM.getResult<TargetIRAnalysis>(F);
1472   };
1473 
1474   auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1475     return FAM.getResult<TargetLibraryAnalysis>(F);
1476   };
1477 
1478   ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1479 
1480   if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1481                          GetTLI, PSI, GetBFI)
1482           .run(M))
1483     return PreservedAnalyses::none();
1484   return PreservedAnalyses::all();
1485 }
1486