1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/Analysis/InlineCost.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetLibraryInfo.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/IntrinsicInst.h"
41 #include "llvm/IR/Intrinsics.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/InitializePasses.h"
45 #include "llvm/Pass.h"
46 #include "llvm/Support/BlockFrequency.h"
47 #include "llvm/Support/BranchProbability.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/Utils/Cloning.h"
53 #include "llvm/Transforms/Utils/CodeExtractor.h"
54 #include "llvm/Transforms/Utils/ValueMapper.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <functional>
59 #include <iterator>
60 #include <memory>
61 #include <tuple>
62 #include <vector>
63
64 using namespace llvm;
65
66 #define DEBUG_TYPE "partial-inlining"
67
STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to debug partial-inlining. The default is none:
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. Default is set at 10%.
// ie. if the cold region reduces the inlining cost of the original function by
// at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. Default is set to 10%. ie.
// if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

// Upper bound on the number of guard blocks (plus the return block) that may
// stay in the caller after partial inlining.
static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));
154
155 namespace {
156
// Describes a single-region (early-return) outlining candidate: a chain of
// guarding entry blocks that stay inlined in the caller, plus the return
// block, while the rest of the function body is outlined.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock.
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};
177
178 struct FunctionOutliningMultiRegionInfo {
FunctionOutliningMultiRegionInfo__anon5c3931f20111::FunctionOutliningMultiRegionInfo179 FunctionOutliningMultiRegionInfo()
180 : ORI() {}
181
182 // Container for outline regions
183 struct OutlineRegionInfo {
OutlineRegionInfo__anon5c3931f20111::FunctionOutliningMultiRegionInfo::OutlineRegionInfo184 OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
185 BasicBlock *EntryBlock, BasicBlock *ExitBlock,
186 BasicBlock *ReturnBlock)
187 : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
188 ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
189 SmallVector<BasicBlock *, 8> Region;
190 BasicBlock *EntryBlock;
191 BasicBlock *ExitBlock;
192 BasicBlock *ReturnBlock;
193 };
194
195 SmallVector<OutlineRegionInfo, 4> ORI;
196 };
197
// Implements the partial-inlining transformation for one module run. All
// analysis access is injected via function_refs so the same implementation
// backs both the legacy and the new pass manager drivers.
struct PartialInlinerImpl {

  PartialInlinerImpl(
      function_ref<AssumptionCache &(Function &)> GetAC,
      function_ref<AssumptionCache *(Function &)> LookupAC,
      function_ref<TargetTransformInfo &(Function &)> GTTI,
      function_ref<const TargetLibraryInfo &(Function &)> GTLI,
      ProfileSummaryInfo &ProfSI,
      function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
        GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}

  // Runs partial inlining over every function in M; returns true if the
  // module was changed.
  bool run(Module &M);
  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise. Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in the
  // outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function *F);

  // This class speculatively clones the function to be partial inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC);
    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();
    // Do function outlining for region after early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    // function, we temporarily generate IR that does not properly
    // forward varargs to the outlined function. Calling InlineFunction
    // will update calls to the outlined functions to properly forward
    // the varargs.
    Function *doSingleRegionFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    // BFI computed for ClonedFunc (before any outlining takes place).
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
    function_ref<AssumptionCache *(Function &)> LookupAC;
  };

private:
  // Number of callsites partially inlined so far; compared against
  // -max-partial-inlining in IsLimitReached().
  int NumPartialInlining = 0;
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
  function_ref<TargetTransformInfo &(Function &)> GetTTI;
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
  ProfileSummaryInfo &PSI;

  // Return the frequency of the OutlininingBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CB should be partially inlined with
  // profit.
  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  // True once the -max-partial-inlining cap (if any) has been hit.
  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  // Casts a use of the cloned function to a call/invoke; any other user kind
  // is a pass invariant violation.
  static CallBase *getSupportedCallBase(User *U) {
    if (isa<CallInst>(U) || isa<InvokeInst>(U))
      return cast<CallBase>(U);
    llvm_unreachable("All uses must be calls");
    return nullptr;
  }

  // Returns an arbitrary (the first) call site of F; F must have users.
  static CallBase *getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getSupportedCallBase(User);
  }

  // Debug location and enclosing block of one call site of F, used for
  // remark emission.
  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallBase *CB = getOneCallSiteTo(F);
    DebugLoc DLoc = CB->getDebugLoc();
    BasicBlock *Block = CB->getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static int computeBBInlineCost(BasicBlock *BB);

  // Candidate discovery for the single-region and cold-multi-region schemes
  // respectively; either may return null when no candidate exists.
  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
};
343
344 struct PartialInlinerLegacyPass : public ModulePass {
345 static char ID; // Pass identification, replacement for typeid
346
PartialInlinerLegacyPass__anon5c3931f20111::PartialInlinerLegacyPass347 PartialInlinerLegacyPass() : ModulePass(ID) {
348 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
349 }
350
getAnalysisUsage__anon5c3931f20111::PartialInlinerLegacyPass351 void getAnalysisUsage(AnalysisUsage &AU) const override {
352 AU.addRequired<AssumptionCacheTracker>();
353 AU.addRequired<ProfileSummaryInfoWrapperPass>();
354 AU.addRequired<TargetTransformInfoWrapperPass>();
355 AU.addRequired<TargetLibraryInfoWrapperPass>();
356 }
357
runOnModule__anon5c3931f20111::PartialInlinerLegacyPass358 bool runOnModule(Module &M) override {
359 if (skipModule(M))
360 return false;
361
362 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
363 TargetTransformInfoWrapperPass *TTIWP =
364 &getAnalysis<TargetTransformInfoWrapperPass>();
365 ProfileSummaryInfo &PSI =
366 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
367
368 auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
369 return ACT->getAssumptionCache(F);
370 };
371
372 auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
373 return ACT->lookupAssumptionCache(F);
374 };
375
376 auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
377 return TTIWP->getTTI(F);
378 };
379
380 auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
381 return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
382 };
383
384 return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
385 GetTLI, PSI)
386 .run(M);
387 }
388 };
389
390 } // end anonymous namespace
391
392 std::unique_ptr<FunctionOutliningMultiRegionInfo>
computeOutliningColdRegionsInfo(Function * F,OptimizationRemarkEmitter & ORE)393 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
394 OptimizationRemarkEmitter &ORE) {
395 BasicBlock *EntryBlock = &F->front();
396
397 DominatorTree DT(*F);
398 LoopInfo LI(DT);
399 BranchProbabilityInfo BPI(*F, LI);
400 std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
401 BlockFrequencyInfo *BFI;
402 if (!GetBFI) {
403 ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
404 BFI = ScopedBFI.get();
405 } else
406 BFI = &(GetBFI(*F));
407
408 // Return if we don't have profiling information.
409 if (!PSI.hasInstrumentationProfile())
410 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
411
412 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
413 std::make_unique<FunctionOutliningMultiRegionInfo>();
414
415 auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
416 BasicBlock *Dom = BlockList.front();
417 return BlockList.size() > 1 && Dom->hasNPredecessors(1);
418 };
419
420 auto IsSingleExit =
421 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
422 BasicBlock *ExitBlock = nullptr;
423 for (auto *Block : BlockList) {
424 for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
425 if (!is_contained(BlockList, *SI)) {
426 if (ExitBlock) {
427 ORE.emit([&]() {
428 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
429 &SI->front())
430 << "Region dominated by "
431 << ore::NV("Block", BlockList.front()->getName())
432 << " has more than one region exit edge.";
433 });
434 return nullptr;
435 } else
436 ExitBlock = Block;
437 }
438 }
439 }
440 return ExitBlock;
441 };
442
443 auto BBProfileCount = [BFI](BasicBlock *BB) {
444 return BFI->getBlockProfileCount(BB)
445 ? BFI->getBlockProfileCount(BB).getValue()
446 : 0;
447 };
448
449 // Use the same computeBBInlineCost function to compute the cost savings of
450 // the outlining the candidate region.
451 int OverallFunctionCost = 0;
452 for (auto &BB : *F)
453 OverallFunctionCost += computeBBInlineCost(&BB);
454
455 #ifndef NDEBUG
456 if (TracePartialInlining)
457 dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
458 #endif
459 int MinOutlineRegionCost =
460 static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
461 BranchProbability MinBranchProbability(
462 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
463 MinBlockCounterExecution);
464 bool ColdCandidateFound = false;
465 BasicBlock *CurrEntry = EntryBlock;
466 std::vector<BasicBlock *> DFS;
467 DenseMap<BasicBlock *, bool> VisitedMap;
468 DFS.push_back(CurrEntry);
469 VisitedMap[CurrEntry] = true;
470 // Use Depth First Search on the basic blocks to find CFG edges that are
471 // considered cold.
472 // Cold regions considered must also have its inline cost compared to the
473 // overall inline cost of the original function. The region is outlined only
474 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
475 // more.
476 while (!DFS.empty()) {
477 auto *thisBB = DFS.back();
478 DFS.pop_back();
479 // Only consider regions with predecessor blocks that are considered
480 // not-cold (default: part of the top 99.99% of all block counters)
481 // AND greater than our minimum block execution count (default: 100).
482 if (PSI.isColdBlock(thisBB, BFI) ||
483 BBProfileCount(thisBB) < MinBlockCounterExecution)
484 continue;
485 for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
486 if (VisitedMap[*SI])
487 continue;
488 VisitedMap[*SI] = true;
489 DFS.push_back(*SI);
490 // If branch isn't cold, we skip to the next one.
491 BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
492 if (SuccProb > MinBranchProbability)
493 continue;
494 #ifndef NDEBUG
495 if (TracePartialInlining) {
496 dbgs() << "Found cold edge: " << thisBB->getName() << "->"
497 << (*SI)->getName() << "\nBranch Probability = " << SuccProb
498 << "\n";
499 }
500 #endif
501 SmallVector<BasicBlock *, 8> DominateVector;
502 DT.getDescendants(*SI, DominateVector);
503 // We can only outline single entry regions (for now).
504 if (!IsSingleEntry(DominateVector))
505 continue;
506 BasicBlock *ExitBlock = nullptr;
507 // We can only outline single exit regions (for now).
508 if (!(ExitBlock = IsSingleExit(DominateVector)))
509 continue;
510 int OutlineRegionCost = 0;
511 for (auto *BB : DominateVector)
512 OutlineRegionCost += computeBBInlineCost(BB);
513
514 #ifndef NDEBUG
515 if (TracePartialInlining)
516 dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
517 #endif
518
519 if (OutlineRegionCost < MinOutlineRegionCost) {
520 ORE.emit([&]() {
521 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
522 &SI->front())
523 << ore::NV("Callee", F) << " inline cost-savings smaller than "
524 << ore::NV("Cost", MinOutlineRegionCost);
525 });
526 continue;
527 }
528 // For now, ignore blocks that belong to a SISE region that is a
529 // candidate for outlining. In the future, we may want to look
530 // at inner regions because the outer region may have live-exit
531 // variables.
532 for (auto *BB : DominateVector)
533 VisitedMap[BB] = true;
534 // ReturnBlock here means the block after the outline call
535 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
536 // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
537 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
538 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
539 OutliningInfo->ORI.push_back(RegInfo);
540 #ifndef NDEBUG
541 if (TracePartialInlining) {
542 dbgs() << "Found Cold Candidate starting at block: "
543 << DominateVector.front()->getName() << "\n";
544 }
545 #endif
546 ColdCandidateFound = true;
547 NumColdRegionsFound++;
548 }
549 }
550 if (ColdCandidateFound)
551 return OutliningInfo;
552 else
553 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
554 }
555
// Detects the single-region (early-return) outlining shape: a chain of
// two-successor guard blocks starting at the entry, ending at a block with a
// returning successor. Returns null if the function does not match.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  // Only entry blocks ending in a conditional branch are candidates.
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders the successor pair so a return block (if any) comes first;
  // (nullptr, nullptr) when neither returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      std::make_unique<FunctionOutliningInfo>();

  // Walk down the chain of guard blocks, collecting each into Entries until
  // a block with a returning successor is found (the candidate shape).
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor which is not
  // in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verifies the CFG shape above, and records which entries branch directly
  // to the return block (used later to normalize/split it).
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    // Only grow if the candidate still targets the same return block and the
    // remaining outline region keeps a single dominating entry.
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}
704
705 // Check if there is PGO data or user annotated branch data:
hasProfileData(Function * F,FunctionOutliningInfo * OI)706 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
707 if (F->hasProfileData())
708 return true;
709 // Now check if any of the entry block has MD_prof data:
710 for (auto *E : OI->Entries) {
711 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
712 if (!BR || BR->isUnconditional())
713 continue;
714 uint64_t T, F;
715 if (BR->extractProfMetadata(T, F))
716 return true;
717 }
718 return false;
719 }
720
721 BranchProbability
getOutliningCallBBRelativeFreq(FunctionCloner & Cloner)722 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
723 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
724 auto EntryFreq =
725 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
726 auto OutliningCallFreq =
727 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
728 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
729 // we outlined any regions, so we may encounter situations where the
730 // OutliningCallFreq is *slightly* bigger than the EntryFreq.
731 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
732 OutliningCallFreq = EntryFreq;
733 }
734 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
735 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
736
737 if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
738 return OutlineRegionRelFreq;
739
740 // When profile data is not available, we need to be conservative in
741 // estimating the overall savings. Static branch prediction can usually
742 // guess the branch direction right (taken/non-taken), but the guessed
743 // branch probability is usually not biased enough. In case when the
744 // outlined region is predicted to be likely, its probability needs
745 // to be made higher (more biased) to not under-estimate the cost of
746 // function outlining. On the other hand, if the outlined region
747 // is predicted to be less likely, the predicted probablity is usually
748 // higher than the actual. For instance, the actual probability of the
749 // less likely target is only 5%, but the guessed probablity can be
750 // 40%. In the latter case, there is no need for further adjustement.
751 // FIXME: add an option for this.
752 if (OutlineRegionRelFreq < BranchProbability(45, 100))
753 return OutlineRegionRelFreq;
754
755 OutlineRegionRelFreq = std::max(
756 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
757
758 return OutlineRegionRelFreq;
759 }
760
// Decides whether the callsite CB (calling the cloned function) is profitable
// to partially inline, comparing the saved call overhead against the weighted
// cost of calling the outlined region. Emits remarks explaining each verdict.
bool PartialInlinerImpl::shouldPartialInline(
    CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  Function *Callee = CB.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // Testing hook: bypass the cost model and only require viability.
  if (SkipCostAnalysis)
    return isInlineViable(*Callee).isSuccess();

  Function *Caller = CB.getCaller();
  auto &CalleeTTI = GetTTI(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  InlineCost IC =
      getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
                    GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);

  // An always-inline callee should be inlined whole, not split.
  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  // Over threshold: the inline cost model rejects even the reduced body.
  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CB, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  // Profitable: record the positive decision for remark consumers.
  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}
842
843 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
844 // For now use a simplified version. The returned 'InlineCost' will be used
845 // to esimate the size cost as well as runtime cost of the BB.
computeBBInlineCost(BasicBlock * BB)846 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
847 int InlineCost = 0;
848 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
849 for (Instruction &I : BB->instructionsWithoutDebug()) {
850 // Skip free instructions.
851 switch (I.getOpcode()) {
852 case Instruction::BitCast:
853 case Instruction::PtrToInt:
854 case Instruction::IntToPtr:
855 case Instruction::Alloca:
856 case Instruction::PHI:
857 continue;
858 case Instruction::GetElementPtr:
859 if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
860 continue;
861 break;
862 default:
863 break;
864 }
865
866 if (I.isLifetimeStartOrEnd())
867 continue;
868
869 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
870 InlineCost += getCallsiteCost(*CI, DL);
871 continue;
872 }
873
874 if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
875 InlineCost += getCallsiteCost(*II, DL);
876 continue;
877 }
878
879 if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
880 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
881 continue;
882 }
883 InlineCost += InlineConstants::InstrCost;
884 }
885 return InlineCost;
886 }
887
888 std::tuple<int, int>
computeOutliningCosts(FunctionCloner & Cloner)889 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
890 int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
891 for (auto FuncBBPair : Cloner.OutlinedFunctions) {
892 Function *OutlinedFunc = FuncBBPair.first;
893 BasicBlock* OutliningCallBB = FuncBBPair.second;
894 // Now compute the cost of the call sequence to the outlined function
895 // 'OutlinedFunction' in BB 'OutliningCallBB':
896 OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
897
898 // Now compute the cost of the extracted/outlined function itself:
899 for (BasicBlock &BB : *OutlinedFunc)
900 OutlinedFunctionCost += computeBBInlineCost(&BB);
901 }
902 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
903 "Outlined function cost should be no less than the outlined region");
904
905 // The code extractor introduces a new root and exit stub blocks with
906 // additional unconditional branches. Those branches will be eliminated
907 // later with bb layout. The cost should be adjusted accordingly:
908 OutlinedFunctionCost -=
909 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
910
911 int OutliningRuntimeOverhead =
912 OutliningFuncCallCost +
913 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
914 ExtraOutliningPenalty;
915
916 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
917 }
918
919 // Create the callsite to profile count map which is
920 // used to update the original function's entry count,
921 // after the function is partially inlined into the callsite.
computeCallsiteToProfCountMap(Function * DuplicateFunction,DenseMap<User *,uint64_t> & CallSiteToProfCountMap)922 void PartialInlinerImpl::computeCallsiteToProfCountMap(
923 Function *DuplicateFunction,
924 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
925 std::vector<User *> Users(DuplicateFunction->user_begin(),
926 DuplicateFunction->user_end());
927 Function *CurrentCaller = nullptr;
928 std::unique_ptr<BlockFrequencyInfo> TempBFI;
929 BlockFrequencyInfo *CurrentCallerBFI = nullptr;
930
931 auto ComputeCurrBFI = [&,this](Function *Caller) {
932 // For the old pass manager:
933 if (!GetBFI) {
934 DominatorTree DT(*Caller);
935 LoopInfo LI(DT);
936 BranchProbabilityInfo BPI(*Caller, LI);
937 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
938 CurrentCallerBFI = TempBFI.get();
939 } else {
940 // New pass manager:
941 CurrentCallerBFI = &(GetBFI(*Caller));
942 }
943 };
944
945 for (User *User : Users) {
946 CallBase *CB = getSupportedCallBase(User);
947 Function *Caller = CB->getCaller();
948 if (CurrentCaller != Caller) {
949 CurrentCaller = Caller;
950 ComputeCurrBFI(Caller);
951 } else {
952 assert(CurrentCallerBFI && "CallerBFI is not set");
953 }
954 BasicBlock *CallBB = CB->getParent();
955 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
956 if (Count)
957 CallSiteToProfCountMap[User] = *Count;
958 else
959 CallSiteToProfCountMap[User] = 0;
960 }
961 }
962
FunctionCloner(Function * F,FunctionOutliningInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC)963 PartialInlinerImpl::FunctionCloner::FunctionCloner(
964 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
965 function_ref<AssumptionCache *(Function &)> LookupAC)
966 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
967 ClonedOI = std::make_unique<FunctionOutliningInfo>();
968
969 // Clone the function, so that we can hack away on it.
970 ValueToValueMapTy VMap;
971 ClonedFunc = CloneFunction(F, VMap);
972
973 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
974 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
975 for (BasicBlock *BB : OI->Entries) {
976 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
977 }
978 for (BasicBlock *E : OI->ReturnBlockPreds) {
979 BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
980 ClonedOI->ReturnBlockPreds.push_back(NewE);
981 }
982 // Go ahead and update all uses to the duplicate, so that we can just
983 // use the inliner functionality when we're done hacking.
984 F->replaceAllUsesWith(ClonedFunc);
985 }
986
FunctionCloner(Function * F,FunctionOutliningMultiRegionInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC)987 PartialInlinerImpl::FunctionCloner::FunctionCloner(
988 Function *F, FunctionOutliningMultiRegionInfo *OI,
989 OptimizationRemarkEmitter &ORE,
990 function_ref<AssumptionCache *(Function &)> LookupAC)
991 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
992 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
993
994 // Clone the function, so that we can hack away on it.
995 ValueToValueMapTy VMap;
996 ClonedFunc = CloneFunction(F, VMap);
997
998 // Go through all Outline Candidate Regions and update all BasicBlock
999 // information.
1000 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1001 OI->ORI) {
1002 SmallVector<BasicBlock *, 8> Region;
1003 for (BasicBlock *BB : RegionInfo.Region) {
1004 Region.push_back(cast<BasicBlock>(VMap[BB]));
1005 }
1006 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1007 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1008 BasicBlock *NewReturnBlock = nullptr;
1009 if (RegionInfo.ReturnBlock)
1010 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1011 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1012 Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1013 ClonedOMRI->ORI.push_back(MappedRegionInfo);
1014 }
1015 // Go ahead and update all uses to the duplicate, so that we can just
1016 // use the inliner functionality when we're done hacking.
1017 F->replaceAllUsesWith(ClonedFunc);
1018 }
1019
NormalizeReturnBlock()1020 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
1021 auto getFirstPHI = [](BasicBlock *BB) {
1022 BasicBlock::iterator I = BB->begin();
1023 PHINode *FirstPhi = nullptr;
1024 while (I != BB->end()) {
1025 PHINode *Phi = dyn_cast<PHINode>(I);
1026 if (!Phi)
1027 break;
1028 if (!FirstPhi) {
1029 FirstPhi = Phi;
1030 break;
1031 }
1032 }
1033 return FirstPhi;
1034 };
1035
1036 // Shouldn't need to normalize PHIs if we're not outlining non-early return
1037 // blocks.
1038 if (!ClonedOI)
1039 return;
1040
1041 // Special hackery is needed with PHI nodes that have inputs from more than
1042 // one extracted block. For simplicity, just split the PHIs into a two-level
1043 // sequence of PHIs, some of which will go in the extracted region, and some
1044 // of which will go outside.
1045 BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1046 // only split block when necessary:
1047 PHINode *FirstPhi = getFirstPHI(PreReturn);
1048 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1049
1050 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1051 return;
1052
1053 auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1054 Value *CommonValue = PN->getIncomingValue(0);
1055 if (all_of(PN->incoming_values(),
1056 [&](Value *V) { return V == CommonValue; }))
1057 return CommonValue;
1058 return nullptr;
1059 };
1060
1061 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1062 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1063 BasicBlock::iterator I = PreReturn->begin();
1064 Instruction *Ins = &ClonedOI->ReturnBlock->front();
1065 SmallVector<Instruction *, 4> DeadPhis;
1066 while (I != PreReturn->end()) {
1067 PHINode *OldPhi = dyn_cast<PHINode>(I);
1068 if (!OldPhi)
1069 break;
1070
1071 PHINode *RetPhi =
1072 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1073 OldPhi->replaceAllUsesWith(RetPhi);
1074 Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1075
1076 RetPhi->addIncoming(&*I, PreReturn);
1077 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1078 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1079 OldPhi->removeIncomingValue(E);
1080 }
1081
1082 // After incoming values splitting, the old phi may become trivial.
1083 // Keeping the trivial phi can introduce definition inside the outline
1084 // region which is live-out, causing necessary overhead (load, store
1085 // arg passing etc).
1086 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1087 OldPhi->replaceAllUsesWith(OldPhiVal);
1088 DeadPhis.push_back(OldPhi);
1089 }
1090 ++I;
1091 }
1092 for (auto *DP : DeadPhis)
1093 DP->eraseFromParent();
1094
1095 for (auto E : ClonedOI->ReturnBlockPreds) {
1096 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1097 }
1098 }
1099
doMultiRegionFunctionOutlining()1100 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1101
1102 auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
1103 int Cost = 0;
1104 for (BasicBlock* BB : Region)
1105 Cost += computeBBInlineCost(BB);
1106 return Cost;
1107 };
1108
1109 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1110
1111 if (ClonedOMRI->ORI.empty())
1112 return false;
1113
1114 // The CodeExtractor needs a dominator tree.
1115 DominatorTree DT;
1116 DT.recalculate(*ClonedFunc);
1117
1118 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1119 LoopInfo LI(DT);
1120 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1121 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1122
1123 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1124 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1125
1126 SetVector<Value *> Inputs, Outputs, Sinks;
1127 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1128 ClonedOMRI->ORI) {
1129 int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1130
1131 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1132 ClonedFuncBFI.get(), &BPI,
1133 LookupAC(*RegionInfo.EntryBlock->getParent()),
1134 /* AllowVarargs */ false);
1135
1136 CE.findInputsOutputs(Inputs, Outputs, Sinks);
1137
1138 #ifndef NDEBUG
1139 if (TracePartialInlining) {
1140 dbgs() << "inputs: " << Inputs.size() << "\n";
1141 dbgs() << "outputs: " << Outputs.size() << "\n";
1142 for (Value *value : Inputs)
1143 dbgs() << "value used in func: " << *value << "\n";
1144 for (Value *output : Outputs)
1145 dbgs() << "instr used in func: " << *output << "\n";
1146 }
1147 #endif
1148 // Do not extract regions that have live exit variables.
1149 if (Outputs.size() > 0 && !ForceLiveExit)
1150 continue;
1151
1152 Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
1153
1154 if (OutlinedFunc) {
1155 CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
1156 BasicBlock *OutliningCallBB = OCS->getParent();
1157 assert(OutliningCallBB->getParent() == ClonedFunc);
1158 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1159 NumColdRegionsOutlined++;
1160 OutlinedRegionCost += CurrentOutlinedRegionCost;
1161
1162 if (MarkOutlinedColdCC) {
1163 OutlinedFunc->setCallingConv(CallingConv::Cold);
1164 OCS->setCallingConv(CallingConv::Cold);
1165 }
1166 } else
1167 ORE.emit([&]() {
1168 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1169 &RegionInfo.Region.front()->front())
1170 << "Failed to extract region at block "
1171 << ore::NV("Block", RegionInfo.Region.front());
1172 });
1173 }
1174
1175 return !OutlinedFunctions.empty();
1176 }
1177
1178 Function *
doSingleRegionFunctionOutlining()1179 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1180 // Returns true if the block is to be partial inlined into the caller
1181 // (i.e. not to be extracted to the out of line function)
1182 auto ToBeInlined = [&, this](BasicBlock *BB) {
1183 return BB == ClonedOI->ReturnBlock ||
1184 (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
1185 ClonedOI->Entries.end());
1186 };
1187
1188 assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1189 // The CodeExtractor needs a dominator tree.
1190 DominatorTree DT;
1191 DT.recalculate(*ClonedFunc);
1192
1193 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1194 LoopInfo LI(DT);
1195 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1196 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1197
1198 // Gather up the blocks that we're going to extract.
1199 std::vector<BasicBlock *> ToExtract;
1200 ToExtract.push_back(ClonedOI->NonReturnBlock);
1201 OutlinedRegionCost +=
1202 PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
1203 for (BasicBlock &BB : *ClonedFunc)
1204 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1205 ToExtract.push_back(&BB);
1206 // FIXME: the code extractor may hoist/sink more code
1207 // into the outlined function which may make the outlining
1208 // overhead (the difference of the outlined function cost
1209 // and OutliningRegionCost) look larger.
1210 OutlinedRegionCost += computeBBInlineCost(&BB);
1211 }
1212
1213 // Extract the body of the if.
1214 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1215 Function *OutlinedFunc =
1216 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1217 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1218 /* AllowVarargs */ true)
1219 .extractCodeRegion(CEAC);
1220
1221 if (OutlinedFunc) {
1222 BasicBlock *OutliningCallBB =
1223 PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
1224 ->getParent();
1225 assert(OutliningCallBB->getParent() == ClonedFunc);
1226 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1227 } else
1228 ORE.emit([&]() {
1229 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1230 &ToExtract.front()->front())
1231 << "Failed to extract region at block "
1232 << ore::NV("Block", ToExtract.front());
1233 });
1234
1235 return OutlinedFunc;
1236 }
1237
~FunctionCloner()1238 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1239 // Ditch the duplicate, since we're done with it, and rewrite all remaining
1240 // users (function pointers, etc.) back to the original function.
1241 ClonedFunc->replaceAllUsesWith(OrigFunc);
1242 ClonedFunc->eraseFromParent();
1243 if (!IsFunctionInlined) {
1244 // Remove each function that was speculatively created if there is no
1245 // reference.
1246 for (auto FuncBBPair : OutlinedFunctions) {
1247 Function *Func = FuncBBPair.first;
1248 Func->eraseFromParent();
1249 }
1250 }
1251 }
1252
// Try to partially inline \p F. Returns {Changed, NewOutlinedFunction}; the
// outlined function (if any) is returned so the caller can add it back onto
// its worklist for further processing.
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {

  // Address-taken functions may have indirect callers we cannot redirect to
  // the clone.
  if (F->hasAddressTaken())
    return {false, nullptr};

  // Let inliner handle it
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F->hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  // A cold entry means the function rarely runs at all; outlining part of it
  // buys nothing.
  if (PSI.isFunctionEntryCold(F))
    return {false, nullptr};

  // No call sites to partially inline into.
  if (F->users().empty())
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI.hasProfileSummary() && F->hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);

#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
               << "\n";
      }
#endif
      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
#ifndef NDEBUG
        if (TracePartialInlining) {
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        }
#endif

        // Multi-region outlining produces no single reusable outlined
        // function for the worklist, hence {true, nullptr}.
        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall-thru to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
  Cloner.NormalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  bool AnyInline = tryPartialInline(Cloner);

  if (AnyInline)
    return {true, OutlinedFunction};

  return {false, nullptr};
}
1328
// Inline the (partially outlined) cloned function into every supported call
// site of the original function, when profitable. Returns true if at least
// one call site was inlined.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;
  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI) {
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  } else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block. We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  // FunctionCloner's constructor redirected every use of OrigFunc to the
  // clone, so the original must have no users left at this point.
  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  // Snapshot the users first: successful inlining removes call sites from the
  // clone's use list while we iterate.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  // Running tally of the entry count still attributable to the (not yet
  // inlined) original function; decremented per inlined call site below.
  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);

    // Respect the global partial-inlining count limit.
    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CB->getCaller());
    if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CB->getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(*CB, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr))
             .isSuccess())
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;

  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(
          CalleeEntryCount.setCount(CalleeEntryCountV));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });

  }

  return AnyInline;
}
1447
run(Module & M)1448 bool PartialInlinerImpl::run(Module &M) {
1449 if (DisablePartialInlining)
1450 return false;
1451
1452 std::vector<Function *> Worklist;
1453 Worklist.reserve(M.size());
1454 for (Function &F : M)
1455 if (!F.use_empty() && !F.isDeclaration())
1456 Worklist.push_back(&F);
1457
1458 bool Changed = false;
1459 while (!Worklist.empty()) {
1460 Function *CurrFunc = Worklist.back();
1461 Worklist.pop_back();
1462
1463 if (CurrFunc->use_empty())
1464 continue;
1465
1466 bool Recursive = false;
1467 for (User *U : CurrFunc->users())
1468 if (Instruction *I = dyn_cast<Instruction>(U))
1469 if (I->getParent()->getParent() == CurrFunc) {
1470 Recursive = true;
1471 break;
1472 }
1473 if (Recursive)
1474 continue;
1475
1476 std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
1477 if (Result.second)
1478 Worklist.push_back(Result.second);
1479 Changed |= Result.first;
1480 }
1481
1482 return Changed;
1483 }
1484
char PartialInlinerLegacyPass::ID = 0;

// Legacy pass manager registration, listing the analyses this pass depends on.
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)
1495
// Factory entry point for clients of the legacy pass manager.
ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}
1499
run(Module & M,ModuleAnalysisManager & AM)1500 PreservedAnalyses PartialInlinerPass::run(Module &M,
1501 ModuleAnalysisManager &AM) {
1502 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1503
1504 auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1505 return FAM.getResult<AssumptionAnalysis>(F);
1506 };
1507
1508 auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1509 return FAM.getCachedResult<AssumptionAnalysis>(F);
1510 };
1511
1512 auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1513 return FAM.getResult<BlockFrequencyAnalysis>(F);
1514 };
1515
1516 auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1517 return FAM.getResult<TargetIRAnalysis>(F);
1518 };
1519
1520 auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1521 return FAM.getResult<TargetLibraryAnalysis>(F);
1522 };
1523
1524 ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1525
1526 if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1527 GetTLI, PSI, GetBFI)
1528 .run(M))
1529 return PreservedAnalyses::none();
1530 return PreservedAnalyses::all();
1531 }
1532