1 //===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the SampleProfileLoader transformation. This pass
10 // reads a profile file generated by a sampling profiler (e.g. Linux Perf -
11 // http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
12 // profile information in the given profile.
13 //
14 // This pass generates branch weight annotations on the IR:
15 //
16 // - prof: Represents branch weights. This annotation is added to branches
17 // to indicate the weights of each edge coming out of the branch.
18 // The weight of each edge is the weight of the target block for
19 // that edge. The weight of a block B is computed as the maximum
20 // number of samples found in B.
21 //
22 //===----------------------------------------------------------------------===//
23
24 #include "llvm/Transforms/IPO/SampleProfile.h"
25 #include "llvm/ADT/ArrayRef.h"
26 #include "llvm/ADT/DenseMap.h"
27 #include "llvm/ADT/DenseSet.h"
28 #include "llvm/ADT/MapVector.h"
29 #include "llvm/ADT/PriorityQueue.h"
30 #include "llvm/ADT/SCCIterator.h"
31 #include "llvm/ADT/SmallVector.h"
32 #include "llvm/ADT/Statistic.h"
33 #include "llvm/ADT/StringMap.h"
34 #include "llvm/ADT/StringRef.h"
35 #include "llvm/ADT/Twine.h"
36 #include "llvm/Analysis/AssumptionCache.h"
37 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
38 #include "llvm/Analysis/CallGraph.h"
39 #include "llvm/Analysis/InlineAdvisor.h"
40 #include "llvm/Analysis/InlineCost.h"
41 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
42 #include "llvm/Analysis/ProfileSummaryInfo.h"
43 #include "llvm/Analysis/ReplayInlineAdvisor.h"
44 #include "llvm/Analysis/TargetLibraryInfo.h"
45 #include "llvm/Analysis/TargetTransformInfo.h"
46 #include "llvm/IR/BasicBlock.h"
47 #include "llvm/IR/DebugLoc.h"
48 #include "llvm/IR/DiagnosticInfo.h"
49 #include "llvm/IR/Function.h"
50 #include "llvm/IR/GlobalValue.h"
51 #include "llvm/IR/InstrTypes.h"
52 #include "llvm/IR/Instruction.h"
53 #include "llvm/IR/Instructions.h"
54 #include "llvm/IR/IntrinsicInst.h"
55 #include "llvm/IR/LLVMContext.h"
56 #include "llvm/IR/MDBuilder.h"
57 #include "llvm/IR/Module.h"
58 #include "llvm/IR/PassManager.h"
59 #include "llvm/IR/PseudoProbe.h"
60 #include "llvm/IR/ValueSymbolTable.h"
61 #include "llvm/InitializePasses.h"
62 #include "llvm/Pass.h"
63 #include "llvm/ProfileData/InstrProf.h"
64 #include "llvm/ProfileData/SampleProf.h"
65 #include "llvm/ProfileData/SampleProfReader.h"
66 #include "llvm/Support/Casting.h"
67 #include "llvm/Support/CommandLine.h"
68 #include "llvm/Support/Debug.h"
69 #include "llvm/Support/ErrorOr.h"
70 #include "llvm/Support/raw_ostream.h"
71 #include "llvm/Transforms/IPO.h"
72 #include "llvm/Transforms/IPO/ProfiledCallGraph.h"
73 #include "llvm/Transforms/IPO/SampleContextTracker.h"
74 #include "llvm/Transforms/IPO/SampleProfileProbe.h"
75 #include "llvm/Transforms/Instrumentation.h"
76 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
77 #include "llvm/Transforms/Utils/Cloning.h"
78 #include "llvm/Transforms/Utils/MisExpect.h"
79 #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h"
80 #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
81 #include <algorithm>
82 #include <cassert>
83 #include <cstdint>
84 #include <functional>
85 #include <limits>
86 #include <map>
87 #include <memory>
88 #include <queue>
89 #include <string>
90 #include <system_error>
91 #include <utility>
92 #include <vector>
93
94 using namespace llvm;
95 using namespace sampleprof;
96 using namespace llvm::sampleprofutil;
97 using ProfileCount = Function::ProfileCount;
98 #define DEBUG_TYPE "sample-profile"
99 #define CSINLINE_DEBUG DEBUG_TYPE "-inline"
100
101 STATISTIC(NumCSInlined,
102 "Number of functions inlined with context sensitive profile");
103 STATISTIC(NumCSNotInlined,
104 "Number of functions not inlined with context sensitive profile");
105 STATISTIC(NumMismatchedProfile,
106 "Number of functions with CFG mismatched profile");
107 STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
108 STATISTIC(NumDuplicatedInlinesite,
109 "Number of inlined callsites with a partial distribution factor");
110
111 STATISTIC(NumCSInlinedHitMinLimit,
112 "Number of functions with FDO inline stopped due to min size limit");
113 STATISTIC(NumCSInlinedHitMaxLimit,
114 "Number of functions with FDO inline stopped due to max size limit");
115 STATISTIC(
116 NumCSInlinedHitGrowthLimit,
117 "Number of functions with FDO inline stopped due to growth size limit");
118
119 // Command line option to specify the file to read samples from. This is
120 // mainly used for debugging.
121 static cl::opt<std::string> SampleProfileFile(
122 "sample-profile-file", cl::init(""), cl::value_desc("filename"),
123 cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
124
125 // The named file contains a set of transformations that may have been applied
126 // to the symbol names between the program from which the sample data was
127 // collected and the current program's symbols.
128 static cl::opt<std::string> SampleProfileRemappingFile(
129 "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
130 cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
131
132 static cl::opt<bool> ReportProfileStaleness(
133 "report-profile-staleness", cl::Hidden, cl::init(false),
134 cl::desc("Compute and report stale profile statistical metrics."));
135
136 static cl::opt<bool> PersistProfileStaleness(
137 "persist-profile-staleness", cl::Hidden, cl::init(false),
138 cl::desc("Compute stale profile statistical metrics and write it into the "
139 "native object file(.llvm_stats section)."));
140
141 static cl::opt<bool> ProfileSampleAccurate(
142 "profile-sample-accurate", cl::Hidden, cl::init(false),
143 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
144 "callsite and function as having 0 samples. Otherwise, treat "
145 "un-sampled callsites and functions conservatively as unknown. "));
146
147 static cl::opt<bool> ProfileSampleBlockAccurate(
148 "profile-sample-block-accurate", cl::Hidden, cl::init(false),
149 cl::desc("If the sample profile is accurate, we will mark all un-sampled "
150 "branches and calls as having 0 samples. Otherwise, treat "
151 "them conservatively as unknown. "));
152
153 static cl::opt<bool> ProfileAccurateForSymsInList(
154 "profile-accurate-for-symsinlist", cl::Hidden, cl::init(true),
155 cl::desc("For symbols in profile symbol list, regard their profiles to "
156 "be accurate. It may be overriden by profile-sample-accurate. "));
157
158 static cl::opt<bool> ProfileMergeInlinee(
159 "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
160 cl::desc("Merge past inlinee's profile to outline version if sample "
161 "profile loader decided not to inline a call site. It will "
162 "only be enabled when top-down order of profile loading is "
163 "enabled. "));
164
165 static cl::opt<bool> ProfileTopDownLoad(
166 "sample-profile-top-down-load", cl::Hidden, cl::init(true),
167 cl::desc("Do profile annotation and inlining for functions in top-down "
168 "order of call graph during sample profile loading. It only "
169 "works for new pass manager. "));
170
171 static cl::opt<bool>
172 UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
173 cl::desc("Process functions in a top-down order "
174 "defined by the profiled call graph when "
175 "-sample-profile-top-down-load is on."));
176 cl::opt<bool>
177 SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
178 cl::desc("Sort profiled recursion by edge weights."));
179
180 static cl::opt<bool> ProfileSizeInline(
181 "sample-profile-inline-size", cl::Hidden, cl::init(false),
182 cl::desc("Inline cold call sites in profile loader if it's beneficial "
183 "for code size."));
184
185 // Since profiles are consumed by many passes, turning on this option has
186 // side effects. For instance, pre-link SCC inliner would see merged profiles
187 // and inline the hot functions (that are skipped in this pass).
188 static cl::opt<bool> DisableSampleLoaderInlining(
189 "disable-sample-loader-inlining", cl::Hidden, cl::init(false),
190 cl::desc("If true, artifically skip inline transformation in sample-loader "
191 "pass, and merge (or scale) profiles (as configured by "
192 "--sample-profile-merge-inlinee)."));
193
194 cl::opt<int> ProfileInlineGrowthLimit(
195 "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
196 cl::desc("The size growth ratio limit for proirity-based sample profile "
197 "loader inlining."));
198
199 cl::opt<int> ProfileInlineLimitMin(
200 "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
201 cl::desc("The lower bound of size growth limit for "
202 "proirity-based sample profile loader inlining."));
203
204 cl::opt<int> ProfileInlineLimitMax(
205 "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
206 cl::desc("The upper bound of size growth limit for "
207 "proirity-based sample profile loader inlining."));
208
209 cl::opt<int> SampleHotCallSiteThreshold(
210 "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
211 cl::desc("Hot callsite threshold for proirity-based sample profile loader "
212 "inlining."));
213
214 cl::opt<int> SampleColdCallSiteThreshold(
215 "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
216 cl::desc("Threshold for inlining cold callsites"));
217
218 static cl::opt<unsigned> ProfileICPRelativeHotness(
219 "sample-profile-icp-relative-hotness", cl::Hidden, cl::init(25),
220 cl::desc(
221 "Relative hotness percentage threshold for indirect "
222 "call promotion in proirity-based sample profile loader inlining."));
223
224 static cl::opt<unsigned> ProfileICPRelativeHotnessSkip(
225 "sample-profile-icp-relative-hotness-skip", cl::Hidden, cl::init(1),
226 cl::desc(
227 "Skip relative hotness check for ICP up to given number of targets."));
228
229 static cl::opt<bool> CallsitePrioritizedInline(
230 "sample-profile-prioritized-inline", cl::Hidden,
231
232 cl::desc("Use call site prioritized inlining for sample profile loader."
233 "Currently only CSSPGO is supported."));
234
235 static cl::opt<bool> UsePreInlinerDecision(
236 "sample-profile-use-preinliner", cl::Hidden,
237
238 cl::desc("Use the preinliner decisions stored in profile context."));
239
240 static cl::opt<bool> AllowRecursiveInline(
241 "sample-profile-recursive-inline", cl::Hidden,
242
243 cl::desc("Allow sample loader inliner to inline recursive calls."));
244
245 static cl::opt<std::string> ProfileInlineReplayFile(
246 "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
247 cl::desc(
248 "Optimization remarks file containing inline remarks to be replayed "
249 "by inlining from sample profile loader."),
250 cl::Hidden);
251
252 static cl::opt<ReplayInlinerSettings::Scope> ProfileInlineReplayScope(
253 "sample-profile-inline-replay-scope",
254 cl::init(ReplayInlinerSettings::Scope::Function),
255 cl::values(clEnumValN(ReplayInlinerSettings::Scope::Function, "Function",
256 "Replay on functions that have remarks associated "
257 "with them (default)"),
258 clEnumValN(ReplayInlinerSettings::Scope::Module, "Module",
259 "Replay on the entire module")),
260 cl::desc("Whether inline replay should be applied to the entire "
261 "Module or just the Functions (default) that are present as "
262 "callers in remarks during sample profile inlining."),
263 cl::Hidden);
264
265 static cl::opt<ReplayInlinerSettings::Fallback> ProfileInlineReplayFallback(
266 "sample-profile-inline-replay-fallback",
267 cl::init(ReplayInlinerSettings::Fallback::Original),
268 cl::values(
269 clEnumValN(
270 ReplayInlinerSettings::Fallback::Original, "Original",
271 "All decisions not in replay send to original advisor (default)"),
272 clEnumValN(ReplayInlinerSettings::Fallback::AlwaysInline,
273 "AlwaysInline", "All decisions not in replay are inlined"),
274 clEnumValN(ReplayInlinerSettings::Fallback::NeverInline, "NeverInline",
275 "All decisions not in replay are not inlined")),
276 cl::desc("How sample profile inline replay treats sites that don't come "
277 "from the replay. Original: defers to original advisor, "
278 "AlwaysInline: inline all sites not in replay, NeverInline: "
279 "inline no sites not in replay"),
280 cl::Hidden);
281
282 static cl::opt<CallSiteFormat::Format> ProfileInlineReplayFormat(
283 "sample-profile-inline-replay-format",
284 cl::init(CallSiteFormat::Format::LineColumnDiscriminator),
285 cl::values(
286 clEnumValN(CallSiteFormat::Format::Line, "Line", "<Line Number>"),
287 clEnumValN(CallSiteFormat::Format::LineColumn, "LineColumn",
288 "<Line Number>:<Column Number>"),
289 clEnumValN(CallSiteFormat::Format::LineDiscriminator,
290 "LineDiscriminator", "<Line Number>.<Discriminator>"),
291 clEnumValN(CallSiteFormat::Format::LineColumnDiscriminator,
292 "LineColumnDiscriminator",
293 "<Line Number>:<Column Number>.<Discriminator> (default)")),
294 cl::desc("How sample profile inline replay file is formatted"), cl::Hidden);
295
296 static cl::opt<unsigned>
297 MaxNumPromotions("sample-profile-icp-max-prom", cl::init(3), cl::Hidden,
298 cl::desc("Max number of promotions for a single indirect "
299 "call callsite in sample profile loader"));
300
301 static cl::opt<bool> OverwriteExistingWeights(
302 "overwrite-existing-weights", cl::Hidden, cl::init(false),
303 cl::desc("Ignore existing branch weights on IR and always overwrite."));
304
305 static cl::opt<bool> AnnotateSampleProfileInlinePhase(
306 "annotate-sample-profile-inline-phase", cl::Hidden, cl::init(false),
307 cl::desc("Annotate LTO phase (prelink / postlink), or main (no LTO) for "
308 "sample-profile inline pass name."));
309
310 extern cl::opt<bool> EnableExtTspBlockPlacement;
311
312 namespace {
313
314 using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
315 using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
316 using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
317 using EdgeWeightMap = DenseMap<Edge, uint64_t>;
318 using BlockEdgeMap =
319 DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
320
321 class GUIDToFuncNameMapper {
322 public:
GUIDToFuncNameMapper(Module & M,SampleProfileReader & Reader,DenseMap<uint64_t,StringRef> & GUIDToFuncNameMap)323 GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
324 DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
325 : CurrentReader(Reader), CurrentModule(M),
326 CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
327 if (!CurrentReader.useMD5())
328 return;
329
330 for (const auto &F : CurrentModule) {
331 StringRef OrigName = F.getName();
332 CurrentGUIDToFuncNameMap.insert(
333 {Function::getGUID(OrigName), OrigName});
334
335 // Local to global var promotion used by optimization like thinlto
336 // will rename the var and add suffix like ".llvm.xxx" to the
337 // original local name. In sample profile, the suffixes of function
338 // names are all stripped. Since it is possible that the mapper is
339 // built in post-thin-link phase and var promotion has been done,
340 // we need to add the substring of function name without the suffix
341 // into the GUIDToFuncNameMap.
342 StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
343 if (CanonName != OrigName)
344 CurrentGUIDToFuncNameMap.insert(
345 {Function::getGUID(CanonName), CanonName});
346 }
347
348 // Update GUIDToFuncNameMap for each function including inlinees.
349 SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
350 }
351
~GUIDToFuncNameMapper()352 ~GUIDToFuncNameMapper() {
353 if (!CurrentReader.useMD5())
354 return;
355
356 CurrentGUIDToFuncNameMap.clear();
357
358 // Reset GUIDToFuncNameMap for of each function as they're no
359 // longer valid at this point.
360 SetGUIDToFuncNameMapForAll(nullptr);
361 }
362
363 private:
SetGUIDToFuncNameMapForAll(DenseMap<uint64_t,StringRef> * Map)364 void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
365 std::queue<FunctionSamples *> FSToUpdate;
366 for (auto &IFS : CurrentReader.getProfiles()) {
367 FSToUpdate.push(&IFS.second);
368 }
369
370 while (!FSToUpdate.empty()) {
371 FunctionSamples *FS = FSToUpdate.front();
372 FSToUpdate.pop();
373 FS->GUIDToFuncNameMap = Map;
374 for (const auto &ICS : FS->getCallsiteSamples()) {
375 const FunctionSamplesMap &FSMap = ICS.second;
376 for (const auto &IFS : FSMap) {
377 FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
378 FSToUpdate.push(&FS);
379 }
380 }
381 }
382 }
383
384 SampleProfileReader &CurrentReader;
385 Module &CurrentModule;
386 DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
387 };
388
389 // Inline candidate used by iterative callsite prioritized inliner
390 struct InlineCandidate {
391 CallBase *CallInstr;
392 const FunctionSamples *CalleeSamples;
393 // Prorated callsite count, which will be used to guide inlining. For example,
394 // if a callsite is duplicated in LTO prelink, then in LTO postlink the two
395 // copies will get their own distribution factors and their prorated counts
396 // will be used to decide if they should be inlined independently.
397 uint64_t CallsiteCount;
398 // Call site distribution factor to prorate the profile samples for a
399 // duplicated callsite. Default value is 1.0.
400 float CallsiteDistribution;
401 };
402
403 // Inline candidate comparer using call site weight
404 struct CandidateComparer {
operator ()__anon4fb4f3a50111::CandidateComparer405 bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) {
406 if (LHS.CallsiteCount != RHS.CallsiteCount)
407 return LHS.CallsiteCount < RHS.CallsiteCount;
408
409 const FunctionSamples *LCS = LHS.CalleeSamples;
410 const FunctionSamples *RCS = RHS.CalleeSamples;
411 assert(LCS && RCS && "Expect non-null FunctionSamples");
412
413 // Tie breaker using number of samples try to favor smaller functions first
414 if (LCS->getBodySamples().size() != RCS->getBodySamples().size())
415 return LCS->getBodySamples().size() > RCS->getBodySamples().size();
416
417 // Tie breaker using GUID so we have stable/deterministic inlining order
418 return LCS->getGUID(LCS->getName()) < RCS->getGUID(RCS->getName());
419 }
420 };
421
422 using CandidateQueue =
423 PriorityQueue<InlineCandidate, std::vector<InlineCandidate>,
424 CandidateComparer>;
425
426 // Sample profile matching - fuzzy match.
427 class SampleProfileMatcher {
428 Module &M;
429 SampleProfileReader &Reader;
430 const PseudoProbeManager *ProbeManager;
431
432 // Profile mismatching statstics.
433 uint64_t TotalProfiledCallsites = 0;
434 uint64_t NumMismatchedCallsites = 0;
435 uint64_t MismatchedCallsiteSamples = 0;
436 uint64_t TotalCallsiteSamples = 0;
437 uint64_t TotalProfiledFunc = 0;
438 uint64_t NumMismatchedFuncHash = 0;
439 uint64_t MismatchedFuncHashSamples = 0;
440 uint64_t TotalFuncHashSamples = 0;
441
442 public:
SampleProfileMatcher(Module & M,SampleProfileReader & Reader,const PseudoProbeManager * ProbeManager)443 SampleProfileMatcher(Module &M, SampleProfileReader &Reader,
444 const PseudoProbeManager *ProbeManager)
445 : M(M), Reader(Reader), ProbeManager(ProbeManager) {}
446 void detectProfileMismatch();
447 void detectProfileMismatch(const Function &F, const FunctionSamples &FS);
448 };
449
450 /// Sample profile pass.
451 ///
452 /// This pass reads profile data from the file specified by
453 /// -sample-profile-file and annotates every affected function with the
454 /// profile information found in that file.
455 class SampleProfileLoader final
456 : public SampleProfileLoaderBaseImpl<BasicBlock> {
457 public:
SampleProfileLoader(StringRef Name,StringRef RemapName,ThinOrFullLTOPhase LTOPhase,std::function<AssumptionCache & (Function &)> GetAssumptionCache,std::function<TargetTransformInfo & (Function &)> GetTargetTransformInfo,std::function<const TargetLibraryInfo & (Function &)> GetTLI)458 SampleProfileLoader(
459 StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
460 std::function<AssumptionCache &(Function &)> GetAssumptionCache,
461 std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
462 std::function<const TargetLibraryInfo &(Function &)> GetTLI)
463 : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName)),
464 GetAC(std::move(GetAssumptionCache)),
465 GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
466 LTOPhase(LTOPhase),
467 AnnotatedPassName(AnnotateSampleProfileInlinePhase
468 ? llvm::AnnotateInlinePassName(InlineContext{
469 LTOPhase, InlinePass::SampleProfileInliner})
470 : CSINLINE_DEBUG) {}
471
472 bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
473 bool runOnModule(Module &M, ModuleAnalysisManager *AM,
474 ProfileSummaryInfo *_PSI, CallGraph *CG);
475
476 protected:
477 bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
478 bool emitAnnotations(Function &F);
479 ErrorOr<uint64_t> getInstWeight(const Instruction &I) override;
480 ErrorOr<uint64_t> getProbeWeight(const Instruction &I);
481 const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
482 const FunctionSamples *
483 findFunctionSamples(const Instruction &I) const override;
484 std::vector<const FunctionSamples *>
485 findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
486 void findExternalInlineCandidate(CallBase *CB, const FunctionSamples *Samples,
487 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
488 const StringMap<Function *> &SymbolMap,
489 uint64_t Threshold);
490 // Attempt to promote indirect call and also inline the promoted call
491 bool tryPromoteAndInlineCandidate(
492 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
493 uint64_t &Sum, SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
494
495 bool inlineHotFunctions(Function &F,
496 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
497 std::optional<InlineCost> getExternalInlineAdvisorCost(CallBase &CB);
498 bool getExternalInlineAdvisorShouldInline(CallBase &CB);
499 InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
500 bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
501 bool
502 tryInlineCandidate(InlineCandidate &Candidate,
503 SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
504 bool
505 inlineHotFunctionsWithPriority(Function &F,
506 DenseSet<GlobalValue::GUID> &InlinedGUIDs);
507 // Inline cold/small functions in addition to hot ones
508 bool shouldInlineColdCallee(CallBase &CallInst);
509 void emitOptimizationRemarksForInlineCandidates(
510 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
511 bool Hot);
512 void promoteMergeNotInlinedContextSamples(
513 MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites,
514 const Function &F);
515 std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
516 std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(CallGraph &CG);
517 void generateMDProfMetadata(Function &F);
518
519 /// Map from function name to Function *. Used to find the function from
520 /// the function name. If the function name contains suffix, additional
521 /// entry is added to map from the stripped name to the function if there
522 /// is one-to-one mapping.
523 StringMap<Function *> SymbolMap;
524
525 std::function<AssumptionCache &(Function &)> GetAC;
526 std::function<TargetTransformInfo &(Function &)> GetTTI;
527 std::function<const TargetLibraryInfo &(Function &)> GetTLI;
528
529 /// Profile tracker for different context.
530 std::unique_ptr<SampleContextTracker> ContextTracker;
531
532 /// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
533 ///
534 /// We need to know the LTO phase because for example in ThinLTOPrelink
535 /// phase, in annotation, we should not promote indirect calls. Instead,
536 /// we will mark GUIDs that needs to be annotated to the function.
537 const ThinOrFullLTOPhase LTOPhase;
538 const std::string AnnotatedPassName;
539
540 /// Profle Symbol list tells whether a function name appears in the binary
541 /// used to generate the current profile.
542 std::unique_ptr<ProfileSymbolList> PSL;
543
544 /// Total number of samples collected in this profile.
545 ///
546 /// This is the sum of all the samples collected in all the functions executed
547 /// at runtime.
548 uint64_t TotalCollectedSamples = 0;
549
550 // Information recorded when we declined to inline a call site
551 // because we have determined it is too cold is accumulated for
552 // each callee function. Initially this is just the entry count.
553 struct NotInlinedProfileInfo {
554 uint64_t entryCount;
555 };
556 DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo;
557
558 // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
559 // all the function symbols defined or declared in current module.
560 DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
561
562 // All the Names used in FunctionSamples including outline function
563 // names, inline instance names and call target names.
564 StringSet<> NamesInProfile;
565
566 // For symbol in profile symbol list, whether to regard their profiles
567 // to be accurate. It is mainly decided by existance of profile symbol
568 // list and -profile-accurate-for-symsinlist flag, but it can be
569 // overriden by -profile-sample-accurate or profile-sample-accurate
570 // attribute.
571 bool ProfAccForSymsInList;
572
573 // External inline advisor used to replay inline decision from remarks.
574 std::unique_ptr<InlineAdvisor> ExternalInlineAdvisor;
575
576 // A pseudo probe helper to correlate the imported sample counts.
577 std::unique_ptr<PseudoProbeManager> ProbeManager;
578
579 // A helper to implement the sample profile matching algorithm.
580 std::unique_ptr<SampleProfileMatcher> MatchingManager;
581
582 private:
getAnnotatedRemarkPassName() const583 const char *getAnnotatedRemarkPassName() const {
584 return AnnotatedPassName.c_str();
585 }
586 };
587 } // end anonymous namespace
588
getInstWeight(const Instruction & Inst)589 ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
590 if (FunctionSamples::ProfileIsProbeBased)
591 return getProbeWeight(Inst);
592
593 const DebugLoc &DLoc = Inst.getDebugLoc();
594 if (!DLoc)
595 return std::error_code();
596
597 // Ignore all intrinsics, phinodes and branch instructions.
598 // Branch and phinodes instruction usually contains debug info from sources
599 // outside of the residing basic block, thus we ignore them during annotation.
600 if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
601 return std::error_code();
602
603 // For non-CS profile, if a direct call/invoke instruction is inlined in
604 // profile (findCalleeFunctionSamples returns non-empty result), but not
605 // inlined here, it means that the inlined callsite has no sample, thus the
606 // call instruction should have 0 count.
607 // For CS profile, the callsite count of previously inlined callees is
608 // populated with the entry count of the callees.
609 if (!FunctionSamples::ProfileIsCS)
610 if (const auto *CB = dyn_cast<CallBase>(&Inst))
611 if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
612 return 0;
613
614 return getInstWeightImpl(Inst);
615 }
616
617 // Here use error_code to represent: 1) The dangling probe. 2) Ignore the weight
618 // of non-probe instruction. So if all instructions of the BB give error_code,
619 // tell the inference algorithm to infer the BB weight.
getProbeWeight(const Instruction & Inst)620 ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
621 assert(FunctionSamples::ProfileIsProbeBased &&
622 "Profile is not pseudo probe based");
623 std::optional<PseudoProbe> Probe = extractProbe(Inst);
624 // Ignore the non-probe instruction. If none of the instruction in the BB is
625 // probe, we choose to infer the BB's weight.
626 if (!Probe)
627 return std::error_code();
628
629 const FunctionSamples *FS = findFunctionSamples(Inst);
630 // If none of the instruction has FunctionSample, we choose to return zero
631 // value sample to indicate the BB is cold. This could happen when the
632 // instruction is from inlinee and no profile data is found.
633 // FIXME: This should not be affected by the source drift issue as 1) if the
634 // newly added function is top-level inliner, it won't match the CFG checksum
635 // in the function profile or 2) if it's the inlinee, the inlinee should have
636 // a profile, otherwise it wouldn't be inlined. For non-probe based profile,
637 // we can improve it by adding a switch for profile-sample-block-accurate for
638 // block level counts in the future.
639 if (!FS)
640 return 0;
641
642 // For non-CS profile, If a direct call/invoke instruction is inlined in
643 // profile (findCalleeFunctionSamples returns non-empty result), but not
644 // inlined here, it means that the inlined callsite has no sample, thus the
645 // call instruction should have 0 count.
646 // For CS profile, the callsite count of previously inlined callees is
647 // populated with the entry count of the callees.
648 if (!FunctionSamples::ProfileIsCS)
649 if (const auto *CB = dyn_cast<CallBase>(&Inst))
650 if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
651 return 0;
652
653 const ErrorOr<uint64_t> &R = FS->findSamplesAt(Probe->Id, 0);
654 if (R) {
655 uint64_t Samples = R.get() * Probe->Factor;
656 bool FirstMark = CoverageTracker.markSamplesUsed(FS, Probe->Id, 0, Samples);
657 if (FirstMark) {
658 ORE->emit([&]() {
659 OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
660 Remark << "Applied " << ore::NV("NumSamples", Samples);
661 Remark << " samples from profile (ProbeId=";
662 Remark << ore::NV("ProbeId", Probe->Id);
663 Remark << ", Factor=";
664 Remark << ore::NV("Factor", Probe->Factor);
665 Remark << ", OriginalSamples=";
666 Remark << ore::NV("OriginalSamples", R.get());
667 Remark << ")";
668 return Remark;
669 });
670 }
671 LLVM_DEBUG(dbgs() << " " << Probe->Id << ":" << Inst
672 << " - weight: " << R.get() << " - factor: "
673 << format("%0.2f", Probe->Factor) << ")\n");
674 return Samples;
675 }
676 return R;
677 }
678
679 /// Get the FunctionSamples for a call instruction.
680 ///
681 /// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
682 /// instance in which that call instruction is calling to. It contains
683 /// all samples that resides in the inlined instance. We first find the
684 /// inlined instance in which the call instruction is from, then we
685 /// traverse its children to find the callsite with the matching
686 /// location.
687 ///
688 /// \param Inst Call/Invoke instruction to query.
689 ///
690 /// \returns The FunctionSamples pointer to the inlined instance.
691 const FunctionSamples *
findCalleeFunctionSamples(const CallBase & Inst) const692 SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
693 const DILocation *DIL = Inst.getDebugLoc();
694 if (!DIL) {
695 return nullptr;
696 }
697
698 StringRef CalleeName;
699 if (Function *Callee = Inst.getCalledFunction())
700 CalleeName = Callee->getName();
701
702 if (FunctionSamples::ProfileIsCS)
703 return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
704
705 const FunctionSamples *FS = findFunctionSamples(Inst);
706 if (FS == nullptr)
707 return nullptr;
708
709 return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
710 CalleeName, Reader->getRemapper());
711 }
712
713 /// Returns a vector of FunctionSamples that are the indirect call targets
714 /// of \p Inst. The vector is sorted by the total number of samples. Stores
715 /// the total call count of the indirect call in \p Sum.
716 std::vector<const FunctionSamples *>
findIndirectCallFunctionSamples(const Instruction & Inst,uint64_t & Sum) const717 SampleProfileLoader::findIndirectCallFunctionSamples(
718 const Instruction &Inst, uint64_t &Sum) const {
719 const DILocation *DIL = Inst.getDebugLoc();
720 std::vector<const FunctionSamples *> R;
721
722 if (!DIL) {
723 return R;
724 }
725
726 auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
727 assert(L && R && "Expect non-null FunctionSamples");
728 if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate())
729 return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate();
730 return FunctionSamples::getGUID(L->getName()) <
731 FunctionSamples::getGUID(R->getName());
732 };
733
734 if (FunctionSamples::ProfileIsCS) {
735 auto CalleeSamples =
736 ContextTracker->getIndirectCalleeContextSamplesFor(DIL);
737 if (CalleeSamples.empty())
738 return R;
739
740 // For CSSPGO, we only use target context profile's entry count
741 // as that already includes both inlined callee and non-inlined ones..
742 Sum = 0;
743 for (const auto *const FS : CalleeSamples) {
744 Sum += FS->getHeadSamplesEstimate();
745 R.push_back(FS);
746 }
747 llvm::sort(R, FSCompare);
748 return R;
749 }
750
751 const FunctionSamples *FS = findFunctionSamples(Inst);
752 if (FS == nullptr)
753 return R;
754
755 auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
756 auto T = FS->findCallTargetMapAt(CallSite);
757 Sum = 0;
758 if (T)
759 for (const auto &T_C : T.get())
760 Sum += T_C.second;
761 if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
762 if (M->empty())
763 return R;
764 for (const auto &NameFS : *M) {
765 Sum += NameFS.second.getHeadSamplesEstimate();
766 R.push_back(&NameFS.second);
767 }
768 llvm::sort(R, FSCompare);
769 }
770 return R;
771 }
772
773 const FunctionSamples *
findFunctionSamples(const Instruction & Inst) const774 SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
775 if (FunctionSamples::ProfileIsProbeBased) {
776 std::optional<PseudoProbe> Probe = extractProbe(Inst);
777 if (!Probe)
778 return nullptr;
779 }
780
781 const DILocation *DIL = Inst.getDebugLoc();
782 if (!DIL)
783 return Samples;
784
785 auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
786 if (it.second) {
787 if (FunctionSamples::ProfileIsCS)
788 it.first->second = ContextTracker->getContextSamplesFor(DIL);
789 else
790 it.first->second =
791 Samples->findFunctionSamples(DIL, Reader->getRemapper());
792 }
793 return it.first->second;
794 }
795
796 /// Check whether the indirect call promotion history of \p Inst allows
797 /// the promotion for \p Candidate.
798 /// If the profile count for the promotion candidate \p Candidate is
799 /// NOMORE_ICP_MAGICNUM, it means \p Candidate has already been promoted
800 /// for \p Inst. If we already have at least MaxNumPromotions
801 /// NOMORE_ICP_MAGICNUM count values in the value profile of \p Inst, we
802 /// cannot promote for \p Inst anymore.
doesHistoryAllowICP(const Instruction & Inst,StringRef Candidate)803 static bool doesHistoryAllowICP(const Instruction &Inst, StringRef Candidate) {
804 uint32_t NumVals = 0;
805 uint64_t TotalCount = 0;
806 std::unique_ptr<InstrProfValueData[]> ValueData =
807 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
808 bool Valid =
809 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
810 ValueData.get(), NumVals, TotalCount, true);
811 // No valid value profile so no promoted targets have been recorded
812 // before. Ok to do ICP.
813 if (!Valid)
814 return true;
815
816 unsigned NumPromoted = 0;
817 for (uint32_t I = 0; I < NumVals; I++) {
818 if (ValueData[I].Count != NOMORE_ICP_MAGICNUM)
819 continue;
820
821 // If the promotion candidate has NOMORE_ICP_MAGICNUM count in the
822 // metadata, it means the candidate has been promoted for this
823 // indirect call.
824 if (ValueData[I].Value == Function::getGUID(Candidate))
825 return false;
826 NumPromoted++;
827 // If already have MaxNumPromotions promotion, don't do it anymore.
828 if (NumPromoted == MaxNumPromotions)
829 return false;
830 }
831 return true;
832 }
833
834 /// Update indirect call target profile metadata for \p Inst.
835 /// Usually \p Sum is the sum of counts of all the targets for \p Inst.
836 /// If it is 0, it means updateIDTMetaData is used to mark a
837 /// certain target to be promoted already. If it is not zero,
838 /// we expect to use it to update the total count in the value profile.
839 static void
updateIDTMetaData(Instruction & Inst,const SmallVectorImpl<InstrProfValueData> & CallTargets,uint64_t Sum)840 updateIDTMetaData(Instruction &Inst,
841 const SmallVectorImpl<InstrProfValueData> &CallTargets,
842 uint64_t Sum) {
843 // Bail out early if MaxNumPromotions is zero.
844 // This prevents allocating an array of zero length below.
845 //
846 // Note `updateIDTMetaData` is called in two places so check
847 // `MaxNumPromotions` inside it.
848 if (MaxNumPromotions == 0)
849 return;
850 uint32_t NumVals = 0;
851 // OldSum is the existing total count in the value profile data.
852 uint64_t OldSum = 0;
853 std::unique_ptr<InstrProfValueData[]> ValueData =
854 std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
855 bool Valid =
856 getValueProfDataFromInst(Inst, IPVK_IndirectCallTarget, MaxNumPromotions,
857 ValueData.get(), NumVals, OldSum, true);
858
859 DenseMap<uint64_t, uint64_t> ValueCountMap;
860 if (Sum == 0) {
861 assert((CallTargets.size() == 1 &&
862 CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
863 "If sum is 0, assume only one element in CallTargets "
864 "with count being NOMORE_ICP_MAGICNUM");
865 // Initialize ValueCountMap with existing value profile data.
866 if (Valid) {
867 for (uint32_t I = 0; I < NumVals; I++)
868 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
869 }
870 auto Pair =
871 ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);
872 // If the target already exists in value profile, decrease the total
873 // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
874 if (!Pair.second) {
875 OldSum -= Pair.first->second;
876 Pair.first->second = NOMORE_ICP_MAGICNUM;
877 }
878 Sum = OldSum;
879 } else {
880 // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
881 // counts in the value profile.
882 if (Valid) {
883 for (uint32_t I = 0; I < NumVals; I++) {
884 if (ValueData[I].Count == NOMORE_ICP_MAGICNUM)
885 ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
886 }
887 }
888
889 for (const auto &Data : CallTargets) {
890 auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
891 if (Pair.second)
892 continue;
893 // The target represented by Data.Value has already been promoted.
894 // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
895 // Sum by Data.Count.
896 assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
897 Sum -= Data.Count;
898 }
899 }
900
901 SmallVector<InstrProfValueData, 8> NewCallTargets;
902 for (const auto &ValueCount : ValueCountMap) {
903 NewCallTargets.emplace_back(
904 InstrProfValueData{ValueCount.first, ValueCount.second});
905 }
906
907 llvm::sort(NewCallTargets,
908 [](const InstrProfValueData &L, const InstrProfValueData &R) {
909 if (L.Count != R.Count)
910 return L.Count > R.Count;
911 return L.Value > R.Value;
912 });
913
914 uint32_t MaxMDCount =
915 std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
916 annotateValueSite(*Inst.getParent()->getParent()->getParent(), Inst,
917 NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);
918 }
919
920 /// Attempt to promote indirect call and also inline the promoted call.
921 ///
922 /// \param F Caller function.
923 /// \param Candidate ICP and inline candidate.
924 /// \param SumOrigin Original sum of target counts for indirect call before
925 /// promoting given candidate.
926 /// \param Sum Prorated sum of remaining target counts for indirect call
927 /// after promoting given candidate.
928 /// \param InlinedCallSite Output vector for new call sites exposed after
929 /// inlining.
tryPromoteAndInlineCandidate(Function & F,InlineCandidate & Candidate,uint64_t SumOrigin,uint64_t & Sum,SmallVector<CallBase *,8> * InlinedCallSite)930 bool SampleProfileLoader::tryPromoteAndInlineCandidate(
931 Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum,
932 SmallVector<CallBase *, 8> *InlinedCallSite) {
933 // Bail out early if sample-loader inliner is disabled.
934 if (DisableSampleLoaderInlining)
935 return false;
936
937 // Bail out early if MaxNumPromotions is zero.
938 // This prevents allocating an array of zero length in callees below.
939 if (MaxNumPromotions == 0)
940 return false;
941 auto CalleeFunctionName = Candidate.CalleeSamples->getFuncName();
942 auto R = SymbolMap.find(CalleeFunctionName);
943 if (R == SymbolMap.end() || !R->getValue())
944 return false;
945
946 auto &CI = *Candidate.CallInstr;
947 if (!doesHistoryAllowICP(CI, R->getValue()->getName()))
948 return false;
949
950 const char *Reason = "Callee function not available";
951 // R->getValue() != &F is to prevent promoting a recursive call.
952 // If it is a recursive call, we do not inline it as it could bloat
953 // the code exponentially. There is way to better handle this, e.g.
954 // clone the caller first, and inline the cloned caller if it is
955 // recursive. As llvm does not inline recursive calls, we will
956 // simply ignore it instead of handling it explicitly.
957 if (!R->getValue()->isDeclaration() && R->getValue()->getSubprogram() &&
958 R->getValue()->hasFnAttribute("use-sample-profile") &&
959 R->getValue() != &F && isLegalToPromote(CI, R->getValue(), &Reason)) {
960 // For promoted target, set its value with NOMORE_ICP_MAGICNUM count
961 // in the value profile metadata so the target won't be promoted again.
962 SmallVector<InstrProfValueData, 1> SortedCallTargets = {InstrProfValueData{
963 Function::getGUID(R->getValue()->getName()), NOMORE_ICP_MAGICNUM}};
964 updateIDTMetaData(CI, SortedCallTargets, 0);
965
966 auto *DI = &pgo::promoteIndirectCall(
967 CI, R->getValue(), Candidate.CallsiteCount, Sum, false, ORE);
968 if (DI) {
969 Sum -= Candidate.CallsiteCount;
970 // Do not prorate the indirect callsite distribution since the original
971 // distribution will be used to scale down non-promoted profile target
972 // counts later. By doing this we lose track of the real callsite count
973 // for the leftover indirect callsite as a trade off for accurate call
974 // target counts.
975 // TODO: Ideally we would have two separate factors, one for call site
976 // counts and one is used to prorate call target counts.
977 // Do not update the promoted direct callsite distribution at this
978 // point since the original distribution combined with the callee profile
979 // will be used to prorate callsites from the callee if inlined. Once not
980 // inlined, the direct callsite distribution should be prorated so that
981 // the it will reflect the real callsite counts.
982 Candidate.CallInstr = DI;
983 if (isa<CallInst>(DI) || isa<InvokeInst>(DI)) {
984 bool Inlined = tryInlineCandidate(Candidate, InlinedCallSite);
985 if (!Inlined) {
986 // Prorate the direct callsite distribution so that it reflects real
987 // callsite counts.
988 setProbeDistributionFactor(
989 *DI, static_cast<float>(Candidate.CallsiteCount) / SumOrigin);
990 }
991 return Inlined;
992 }
993 }
994 } else {
995 LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
996 << Candidate.CalleeSamples->getFuncName() << " because "
997 << Reason << "\n");
998 }
999 return false;
1000 }
1001
shouldInlineColdCallee(CallBase & CallInst)1002 bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
1003 if (!ProfileSizeInline)
1004 return false;
1005
1006 Function *Callee = CallInst.getCalledFunction();
1007 if (Callee == nullptr)
1008 return false;
1009
1010 InlineCost Cost = getInlineCost(CallInst, getInlineParams(), GetTTI(*Callee),
1011 GetAC, GetTLI);
1012
1013 if (Cost.isNever())
1014 return false;
1015
1016 if (Cost.isAlways())
1017 return true;
1018
1019 return Cost.getCost() <= SampleColdCallSiteThreshold;
1020 }
1021
emitOptimizationRemarksForInlineCandidates(const SmallVectorImpl<CallBase * > & Candidates,const Function & F,bool Hot)1022 void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
1023 const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
1024 bool Hot) {
1025 for (auto *I : Candidates) {
1026 Function *CalledFunction = I->getCalledFunction();
1027 if (CalledFunction) {
1028 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1029 "InlineAttempt", I->getDebugLoc(),
1030 I->getParent())
1031 << "previous inlining reattempted for "
1032 << (Hot ? "hotness: '" : "size: '")
1033 << ore::NV("Callee", CalledFunction) << "' into '"
1034 << ore::NV("Caller", &F) << "'");
1035 }
1036 }
1037 }
1038
findExternalInlineCandidate(CallBase * CB,const FunctionSamples * Samples,DenseSet<GlobalValue::GUID> & InlinedGUIDs,const StringMap<Function * > & SymbolMap,uint64_t Threshold)1039 void SampleProfileLoader::findExternalInlineCandidate(
1040 CallBase *CB, const FunctionSamples *Samples,
1041 DenseSet<GlobalValue::GUID> &InlinedGUIDs,
1042 const StringMap<Function *> &SymbolMap, uint64_t Threshold) {
1043
1044 // If ExternalInlineAdvisor wants to inline an external function
1045 // make sure it's imported
1046 if (CB && getExternalInlineAdvisorShouldInline(*CB)) {
1047 // Samples may not exist for replayed function, if so
1048 // just add the direct GUID and move on
1049 if (!Samples) {
1050 InlinedGUIDs.insert(
1051 FunctionSamples::getGUID(CB->getCalledFunction()->getName()));
1052 return;
1053 }
1054 // Otherwise, drop the threshold to import everything that we can
1055 Threshold = 0;
1056 }
1057
1058 assert(Samples && "expect non-null caller profile");
1059
1060 // For AutoFDO profile, retrieve candidate profiles by walking over
1061 // the nested inlinee profiles.
1062 if (!FunctionSamples::ProfileIsCS) {
1063 Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
1064 return;
1065 }
1066
1067 ContextTrieNode *Caller = ContextTracker->getContextNodeForProfile(Samples);
1068 std::queue<ContextTrieNode *> CalleeList;
1069 CalleeList.push(Caller);
1070 while (!CalleeList.empty()) {
1071 ContextTrieNode *Node = CalleeList.front();
1072 CalleeList.pop();
1073 FunctionSamples *CalleeSample = Node->getFunctionSamples();
1074 // For CSSPGO profile, retrieve candidate profile by walking over the
1075 // trie built for context profile. Note that also take call targets
1076 // even if callee doesn't have a corresponding context profile.
1077 if (!CalleeSample)
1078 continue;
1079
1080 // If pre-inliner decision is used, honor that for importing as well.
1081 bool PreInline =
1082 UsePreInlinerDecision &&
1083 CalleeSample->getContext().hasAttribute(ContextShouldBeInlined);
1084 if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)
1085 continue;
1086
1087 StringRef Name = CalleeSample->getFuncName();
1088 Function *Func = SymbolMap.lookup(Name);
1089 // Add to the import list only when it's defined out of module.
1090 if (!Func || Func->isDeclaration())
1091 InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeSample->getName()));
1092
1093 // Import hot CallTargets, which may not be available in IR because full
1094 // profile annotation cannot be done until backend compilation in ThinLTO.
1095 for (const auto &BS : CalleeSample->getBodySamples())
1096 for (const auto &TS : BS.second.getCallTargets())
1097 if (TS.getValue() > Threshold) {
1098 StringRef CalleeName = CalleeSample->getFuncName(TS.getKey());
1099 const Function *Callee = SymbolMap.lookup(CalleeName);
1100 if (!Callee || Callee->isDeclaration())
1101 InlinedGUIDs.insert(FunctionSamples::getGUID(TS.getKey()));
1102 }
1103
1104 // Import hot child context profile associted with callees. Note that this
1105 // may have some overlap with the call target loop above, but doing this
1106 // based child context profile again effectively allow us to use the max of
1107 // entry count and call target count to determine importing.
1108 for (auto &Child : Node->getAllChildContext()) {
1109 ContextTrieNode *CalleeNode = &Child.second;
1110 CalleeList.push(CalleeNode);
1111 }
1112 }
1113 }
1114
1115 /// Iteratively inline hot callsites of a function.
1116 ///
1117 /// Iteratively traverse all callsites of the function \p F, so as to
1118 /// find out callsites with corresponding inline instances.
1119 ///
1120 /// For such callsites,
1121 /// - If it is hot enough, inline the callsites and adds callsites of the callee
1122 /// into the caller. If the call is an indirect call, first promote
1123 /// it to direct call. Each indirect call is limited with a single target.
1124 ///
1125 /// - If a callsite is not inlined, merge the its profile to the outline
1126 /// version (if --sample-profile-merge-inlinee is true), or scale the
1127 /// counters of standalone function based on the profile of inlined
1128 /// instances (if --sample-profile-merge-inlinee is false).
1129 ///
1130 /// Later passes may consume the updated profiles.
1131 ///
1132 /// \param F function to perform iterative inlining.
1133 /// \param InlinedGUIDs a set to be updated to include all GUIDs that are
1134 /// inlined in the profiled binary.
1135 ///
1136 /// \returns True if there is any inline happened.
inlineHotFunctions(Function & F,DenseSet<GlobalValue::GUID> & InlinedGUIDs)1137 bool SampleProfileLoader::inlineHotFunctions(
1138 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1139 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1140 // Profile symbol list is ignored when profile-sample-accurate is on.
1141 assert((!ProfAccForSymsInList ||
1142 (!ProfileSampleAccurate &&
1143 !F.hasFnAttribute("profile-sample-accurate"))) &&
1144 "ProfAccForSymsInList should be false when profile-sample-accurate "
1145 "is enabled");
1146
1147 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1148 bool Changed = false;
1149 bool LocalChanged = true;
1150 while (LocalChanged) {
1151 LocalChanged = false;
1152 SmallVector<CallBase *, 10> CIS;
1153 for (auto &BB : F) {
1154 bool Hot = false;
1155 SmallVector<CallBase *, 10> AllCandidates;
1156 SmallVector<CallBase *, 10> ColdCandidates;
1157 for (auto &I : BB) {
1158 const FunctionSamples *FS = nullptr;
1159 if (auto *CB = dyn_cast<CallBase>(&I)) {
1160 if (!isa<IntrinsicInst>(I)) {
1161 if ((FS = findCalleeFunctionSamples(*CB))) {
1162 assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
1163 "GUIDToFuncNameMap has to be populated");
1164 AllCandidates.push_back(CB);
1165 if (FS->getHeadSamplesEstimate() > 0 ||
1166 FunctionSamples::ProfileIsCS)
1167 LocalNotInlinedCallSites.insert({CB, FS});
1168 if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1169 Hot = true;
1170 else if (shouldInlineColdCallee(*CB))
1171 ColdCandidates.push_back(CB);
1172 } else if (getExternalInlineAdvisorShouldInline(*CB)) {
1173 AllCandidates.push_back(CB);
1174 }
1175 }
1176 }
1177 }
1178 if (Hot || ExternalInlineAdvisor) {
1179 CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
1180 emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
1181 } else {
1182 CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
1183 emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
1184 }
1185 }
1186 for (CallBase *I : CIS) {
1187 Function *CalledFunction = I->getCalledFunction();
1188 InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I),
1189 0 /* dummy count */,
1190 1.0 /* dummy distribution factor */};
1191 // Do not inline recursive calls.
1192 if (CalledFunction == &F)
1193 continue;
1194 if (I->isIndirectCall()) {
1195 uint64_t Sum;
1196 for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
1197 uint64_t SumOrigin = Sum;
1198 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1199 findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap,
1200 PSI->getOrCompHotCountThreshold());
1201 continue;
1202 }
1203 if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
1204 continue;
1205
1206 Candidate = {I, FS, FS->getHeadSamplesEstimate(), 1.0};
1207 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
1208 LocalNotInlinedCallSites.erase(I);
1209 LocalChanged = true;
1210 }
1211 }
1212 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1213 !CalledFunction->isDeclaration()) {
1214 if (tryInlineCandidate(Candidate)) {
1215 LocalNotInlinedCallSites.erase(I);
1216 LocalChanged = true;
1217 }
1218 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1219 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1220 InlinedGUIDs, SymbolMap,
1221 PSI->getOrCompHotCountThreshold());
1222 }
1223 }
1224 Changed |= LocalChanged;
1225 }
1226
1227 // For CS profile, profile for not inlined context will be merged when
1228 // base profile is being retrieved.
1229 if (!FunctionSamples::ProfileIsCS)
1230 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1231 return Changed;
1232 }
1233
tryInlineCandidate(InlineCandidate & Candidate,SmallVector<CallBase *,8> * InlinedCallSites)1234 bool SampleProfileLoader::tryInlineCandidate(
1235 InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
1236 // Do not attempt to inline a candidate if
1237 // --disable-sample-loader-inlining is true.
1238 if (DisableSampleLoaderInlining)
1239 return false;
1240
1241 CallBase &CB = *Candidate.CallInstr;
1242 Function *CalledFunction = CB.getCalledFunction();
1243 assert(CalledFunction && "Expect a callee with definition");
1244 DebugLoc DLoc = CB.getDebugLoc();
1245 BasicBlock *BB = CB.getParent();
1246
1247 InlineCost Cost = shouldInlineCandidate(Candidate);
1248 if (Cost.isNever()) {
1249 ORE->emit(OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(),
1250 "InlineFail", DLoc, BB)
1251 << "incompatible inlining");
1252 return false;
1253 }
1254
1255 if (!Cost)
1256 return false;
1257
1258 InlineFunctionInfo IFI(nullptr, GetAC);
1259 IFI.UpdateProfile = false;
1260 InlineResult IR = InlineFunction(CB, IFI,
1261 /*MergeAttributes=*/true);
1262 if (!IR.isSuccess())
1263 return false;
1264
1265 // The call to InlineFunction erases I, so we can't pass it here.
1266 emitInlinedIntoBasedOnCost(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(),
1267 Cost, true, getAnnotatedRemarkPassName());
1268
1269 // Now populate the list of newly exposed call sites.
1270 if (InlinedCallSites) {
1271 InlinedCallSites->clear();
1272 for (auto &I : IFI.InlinedCallSites)
1273 InlinedCallSites->push_back(I);
1274 }
1275
1276 if (FunctionSamples::ProfileIsCS)
1277 ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples);
1278 ++NumCSInlined;
1279
1280 // Prorate inlined probes for a duplicated inlining callsite which probably
1281 // has a distribution less than 100%. Samples for an inlinee should be
1282 // distributed among the copies of the original callsite based on each
1283 // callsite's distribution factor for counts accuracy. Note that an inlined
1284 // probe may come with its own distribution factor if it has been duplicated
1285 // in the inlinee body. The two factor are multiplied to reflect the
1286 // aggregation of duplication.
1287 if (Candidate.CallsiteDistribution < 1) {
1288 for (auto &I : IFI.InlinedCallSites) {
1289 if (std::optional<PseudoProbe> Probe = extractProbe(*I))
1290 setProbeDistributionFactor(*I, Probe->Factor *
1291 Candidate.CallsiteDistribution);
1292 }
1293 NumDuplicatedInlinesite++;
1294 }
1295
1296 return true;
1297 }
1298
getInlineCandidate(InlineCandidate * NewCandidate,CallBase * CB)1299 bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
1300 CallBase *CB) {
1301 assert(CB && "Expect non-null call instruction");
1302
1303 if (isa<IntrinsicInst>(CB))
1304 return false;
1305
1306 // Find the callee's profile. For indirect call, find hottest target profile.
1307 const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB);
1308 // If ExternalInlineAdvisor wants to inline this site, do so even
1309 // if Samples are not present.
1310 if (!CalleeSamples && !getExternalInlineAdvisorShouldInline(*CB))
1311 return false;
1312
1313 float Factor = 1.0;
1314 if (std::optional<PseudoProbe> Probe = extractProbe(*CB))
1315 Factor = Probe->Factor;
1316
1317 uint64_t CallsiteCount =
1318 CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0;
1319 *NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
1320 return true;
1321 }
1322
1323 std::optional<InlineCost>
getExternalInlineAdvisorCost(CallBase & CB)1324 SampleProfileLoader::getExternalInlineAdvisorCost(CallBase &CB) {
1325 std::unique_ptr<InlineAdvice> Advice = nullptr;
1326 if (ExternalInlineAdvisor) {
1327 Advice = ExternalInlineAdvisor->getAdvice(CB);
1328 if (Advice) {
1329 if (!Advice->isInliningRecommended()) {
1330 Advice->recordUnattemptedInlining();
1331 return InlineCost::getNever("not previously inlined");
1332 }
1333 Advice->recordInlining();
1334 return InlineCost::getAlways("previously inlined");
1335 }
1336 }
1337
1338 return {};
1339 }
1340
getExternalInlineAdvisorShouldInline(CallBase & CB)1341 bool SampleProfileLoader::getExternalInlineAdvisorShouldInline(CallBase &CB) {
1342 std::optional<InlineCost> Cost = getExternalInlineAdvisorCost(CB);
1343 return Cost ? !!*Cost : false;
1344 }
1345
1346 InlineCost
shouldInlineCandidate(InlineCandidate & Candidate)1347 SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
1348 if (std::optional<InlineCost> ReplayCost =
1349 getExternalInlineAdvisorCost(*Candidate.CallInstr))
1350 return *ReplayCost;
1351 // Adjust threshold based on call site hotness, only do this for callsite
1352 // prioritized inliner because otherwise cost-benefit check is done earlier.
1353 int SampleThreshold = SampleColdCallSiteThreshold;
1354 if (CallsitePrioritizedInline) {
1355 if (Candidate.CallsiteCount > PSI->getHotCountThreshold())
1356 SampleThreshold = SampleHotCallSiteThreshold;
1357 else if (!ProfileSizeInline)
1358 return InlineCost::getNever("cold callsite");
1359 }
1360
1361 Function *Callee = Candidate.CallInstr->getCalledFunction();
1362 assert(Callee && "Expect a definition for inline candidate of direct call");
1363
1364 InlineParams Params = getInlineParams();
1365 // We will ignore the threshold from inline cost, so always get full cost.
1366 Params.ComputeFullInlineCost = true;
1367 Params.AllowRecursiveCall = AllowRecursiveInline;
1368 // Checks if there is anything in the reachable portion of the callee at
1369 // this callsite that makes this inlining potentially illegal. Need to
1370 // set ComputeFullInlineCost, otherwise getInlineCost may return early
1371 // when cost exceeds threshold without checking all IRs in the callee.
1372 // The acutal cost does not matter because we only checks isNever() to
1373 // see if it is legal to inline the callsite.
1374 InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params,
1375 GetTTI(*Callee), GetAC, GetTLI);
1376
1377 // Honor always inline and never inline from call analyzer
1378 if (Cost.isNever() || Cost.isAlways())
1379 return Cost;
1380
1381 // With CSSPGO, the preinliner in llvm-profgen can estimate global inline
1382 // decisions based on hotness as well as accurate function byte sizes for
1383 // given context using function/inlinee sizes from previous build. It
1384 // stores the decision in profile, and also adjust/merge context profile
1385 // aiming at better context-sensitive post-inline profile quality, assuming
1386 // all inline decision estimates are going to be honored by compiler. Here
1387 // we replay that inline decision under `sample-profile-use-preinliner`.
1388 // Note that we don't need to handle negative decision from preinliner as
1389 // context profile for not inlined calls are merged by preinliner already.
1390 if (UsePreInlinerDecision && Candidate.CalleeSamples) {
1391 // Once two node are merged due to promotion, we're losing some context
1392 // so the original context-sensitive preinliner decision should be ignored
1393 // for SyntheticContext.
1394 SampleContext &Context = Candidate.CalleeSamples->getContext();
1395 if (!Context.hasState(SyntheticContext) &&
1396 Context.hasAttribute(ContextShouldBeInlined))
1397 return InlineCost::getAlways("preinliner");
1398 }
1399
1400 // For old FDO inliner, we inline the call site as long as cost is not
1401 // "Never". The cost-benefit check is done earlier.
1402 if (!CallsitePrioritizedInline) {
1403 return InlineCost::get(Cost.getCost(), INT_MAX);
1404 }
1405
1406 // Otherwise only use the cost from call analyzer, but overwite threshold with
1407 // Sample PGO threshold.
1408 return InlineCost::get(Cost.getCost(), SampleThreshold);
1409 }
1410
inlineHotFunctionsWithPriority(Function & F,DenseSet<GlobalValue::GUID> & InlinedGUIDs)1411 bool SampleProfileLoader::inlineHotFunctionsWithPriority(
1412 Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
1413 // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
1414 // Profile symbol list is ignored when profile-sample-accurate is on.
1415 assert((!ProfAccForSymsInList ||
1416 (!ProfileSampleAccurate &&
1417 !F.hasFnAttribute("profile-sample-accurate"))) &&
1418 "ProfAccForSymsInList should be false when profile-sample-accurate "
1419 "is enabled");
1420
1421 // Populating worklist with initial call sites from root inliner, along
1422 // with call site weights.
1423 CandidateQueue CQueue;
1424 InlineCandidate NewCandidate;
1425 for (auto &BB : F) {
1426 for (auto &I : BB) {
1427 auto *CB = dyn_cast<CallBase>(&I);
1428 if (!CB)
1429 continue;
1430 if (getInlineCandidate(&NewCandidate, CB))
1431 CQueue.push(NewCandidate);
1432 }
1433 }
1434
1435 // Cap the size growth from profile guided inlining. This is needed even
1436 // though cost of each inline candidate already accounts for callee size,
1437 // because with top-down inlining, we can grow inliner size significantly
1438 // with large number of smaller inlinees each pass the cost check.
1439 assert(ProfileInlineLimitMax >= ProfileInlineLimitMin &&
1440 "Max inline size limit should not be smaller than min inline size "
1441 "limit.");
1442 unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit;
1443 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
1444 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
1445 if (ExternalInlineAdvisor)
1446 SizeLimit = std::numeric_limits<unsigned>::max();
1447
1448 MapVector<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
1449
1450 // Perform iterative BFS call site prioritized inlining
1451 bool Changed = false;
1452 while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) {
1453 InlineCandidate Candidate = CQueue.top();
1454 CQueue.pop();
1455 CallBase *I = Candidate.CallInstr;
1456 Function *CalledFunction = I->getCalledFunction();
1457
1458 if (CalledFunction == &F)
1459 continue;
1460 if (I->isIndirectCall()) {
1461 uint64_t Sum = 0;
1462 auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum);
1463 uint64_t SumOrigin = Sum;
1464 Sum *= Candidate.CallsiteDistribution;
1465 unsigned ICPCount = 0;
1466 for (const auto *FS : CalleeSamples) {
1467 // TODO: Consider disable pre-lTO ICP for MonoLTO as well
1468 if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1469 findExternalInlineCandidate(I, FS, InlinedGUIDs, SymbolMap,
1470 PSI->getOrCompHotCountThreshold());
1471 continue;
1472 }
1473 uint64_t EntryCountDistributed =
1474 FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;
1475 // In addition to regular inline cost check, we also need to make sure
1476 // ICP isn't introducing excessive speculative checks even if individual
1477 // target looks beneficial to promote and inline. That means we should
1478 // only do ICP when there's a small number dominant targets.
1479 if (ICPCount >= ProfileICPRelativeHotnessSkip &&
1480 EntryCountDistributed * 100 < SumOrigin * ProfileICPRelativeHotness)
1481 break;
1482 // TODO: Fix CallAnalyzer to handle all indirect calls.
1483 // For indirect call, we don't run CallAnalyzer to get InlineCost
1484 // before actual inlining. This is because we could see two different
1485 // types from the same definition, which makes CallAnalyzer choke as
1486 // it's expecting matching parameter type on both caller and callee
1487 // side. See example from PR18962 for the triggering cases (the bug was
1488 // fixed, but we generate different types).
1489 if (!PSI->isHotCount(EntryCountDistributed))
1490 break;
1491 SmallVector<CallBase *, 8> InlinedCallSites;
1492 // Attach function profile for promoted indirect callee, and update
1493 // call site count for the promoted inline candidate too.
1494 Candidate = {I, FS, EntryCountDistributed,
1495 Candidate.CallsiteDistribution};
1496 if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
1497 &InlinedCallSites)) {
1498 for (auto *CB : InlinedCallSites) {
1499 if (getInlineCandidate(&NewCandidate, CB))
1500 CQueue.emplace(NewCandidate);
1501 }
1502 ICPCount++;
1503 Changed = true;
1504 } else if (!ContextTracker) {
1505 LocalNotInlinedCallSites.insert({I, FS});
1506 }
1507 }
1508 } else if (CalledFunction && CalledFunction->getSubprogram() &&
1509 !CalledFunction->isDeclaration()) {
1510 SmallVector<CallBase *, 8> InlinedCallSites;
1511 if (tryInlineCandidate(Candidate, &InlinedCallSites)) {
1512 for (auto *CB : InlinedCallSites) {
1513 if (getInlineCandidate(&NewCandidate, CB))
1514 CQueue.emplace(NewCandidate);
1515 }
1516 Changed = true;
1517 } else if (!ContextTracker) {
1518 LocalNotInlinedCallSites.insert({I, Candidate.CalleeSamples});
1519 }
1520 } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
1521 findExternalInlineCandidate(I, findCalleeFunctionSamples(*I),
1522 InlinedGUIDs, SymbolMap,
1523 PSI->getOrCompHotCountThreshold());
1524 }
1525 }
1526
1527 if (!CQueue.empty()) {
1528 if (SizeLimit == (unsigned)ProfileInlineLimitMax)
1529 ++NumCSInlinedHitMaxLimit;
1530 else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
1531 ++NumCSInlinedHitMinLimit;
1532 else
1533 ++NumCSInlinedHitGrowthLimit;
1534 }
1535
1536 // For CS profile, profile for not inlined context will be merged when
1537 // base profile is being retrieved.
1538 if (!FunctionSamples::ProfileIsCS)
1539 promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F);
1540 return Changed;
1541 }
1542
promoteMergeNotInlinedContextSamples(MapVector<CallBase *,const FunctionSamples * > NonInlinedCallSites,const Function & F)1543 void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
1544 MapVector<CallBase *, const FunctionSamples *> NonInlinedCallSites,
1545 const Function &F) {
1546 // Accumulate not inlined callsite information into notInlinedSamples
1547 for (const auto &Pair : NonInlinedCallSites) {
1548 CallBase *I = Pair.first;
1549 Function *Callee = I->getCalledFunction();
1550 if (!Callee || Callee->isDeclaration())
1551 continue;
1552
1553 ORE->emit(
1554 OptimizationRemarkAnalysis(getAnnotatedRemarkPassName(), "NotInline",
1555 I->getDebugLoc(), I->getParent())
1556 << "previous inlining not repeated: '" << ore::NV("Callee", Callee)
1557 << "' into '" << ore::NV("Caller", &F) << "'");
1558
1559 ++NumCSNotInlined;
1560 const FunctionSamples *FS = Pair.second;
1561 if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {
1562 continue;
1563 }
1564
1565 // Do not merge a context that is already duplicated into the base profile.
1566 if (FS->getContext().hasAttribute(sampleprof::ContextDuplicatedIntoBase))
1567 continue;
1568
1569 if (ProfileMergeInlinee) {
1570 // A function call can be replicated by optimizations like callsite
1571 // splitting or jump threading and the replicates end up sharing the
1572 // sample nested callee profile instead of slicing the original
1573 // inlinee's profile. We want to do merge exactly once by filtering out
1574 // callee profiles with a non-zero head sample count.
1575 if (FS->getHeadSamples() == 0) {
1576 // Use entry samples as head samples during the merge, as inlinees
1577 // don't have head samples.
1578 const_cast<FunctionSamples *>(FS)->addHeadSamples(
1579 FS->getHeadSamplesEstimate());
1580
1581 // Note that we have to do the merge right after processing function.
1582 // This allows OutlineFS's profile to be used for annotation during
1583 // top-down processing of functions' annotation.
1584 FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee);
1585 OutlineFS->merge(*FS, 1);
1586 // Set outlined profile to be synthetic to not bias the inliner.
1587 OutlineFS->SetContextSynthetic();
1588 }
1589 } else {
1590 auto pair =
1591 notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
1592 pair.first->second.entryCount += FS->getHeadSamplesEstimate();
1593 }
1594 }
1595 }
1596
1597 /// Returns the sorted CallTargetMap \p M by count in descending order.
1598 static SmallVector<InstrProfValueData, 2>
GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap & M)1599 GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) {
1600 SmallVector<InstrProfValueData, 2> R;
1601 for (const auto &I : SampleRecord::SortCallTargets(M)) {
1602 R.emplace_back(
1603 InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
1604 }
1605 return R;
1606 }
1607
1608 // Generate MD_prof metadata for every branch instruction using the
1609 // edge weights computed during propagation.
generateMDProfMetadata(Function & F)1610 void SampleProfileLoader::generateMDProfMetadata(Function &F) {
1611 // Generate MD_prof metadata for every branch instruction using the
1612 // edge weights computed during propagation.
1613 LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
1614 LLVMContext &Ctx = F.getContext();
1615 MDBuilder MDB(Ctx);
1616 for (auto &BI : F) {
1617 BasicBlock *BB = &BI;
1618
1619 if (BlockWeights[BB]) {
1620 for (auto &I : *BB) {
1621 if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
1622 continue;
1623 if (!cast<CallBase>(I).getCalledFunction()) {
1624 const DebugLoc &DLoc = I.getDebugLoc();
1625 if (!DLoc)
1626 continue;
1627 const DILocation *DIL = DLoc;
1628 const FunctionSamples *FS = findFunctionSamples(I);
1629 if (!FS)
1630 continue;
1631 auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
1632 auto T = FS->findCallTargetMapAt(CallSite);
1633 if (!T || T.get().empty())
1634 continue;
1635 if (FunctionSamples::ProfileIsProbeBased) {
1636 // Prorate the callsite counts based on the pre-ICP distribution
1637 // factor to reflect what is already done to the callsite before
1638 // ICP, such as calliste cloning.
1639 if (std::optional<PseudoProbe> Probe = extractProbe(I)) {
1640 if (Probe->Factor < 1)
1641 T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
1642 }
1643 }
1644 SmallVector<InstrProfValueData, 2> SortedCallTargets =
1645 GetSortedValueDataFromCallTargets(T.get());
1646 uint64_t Sum = 0;
1647 for (const auto &C : T.get())
1648 Sum += C.second;
1649 // With CSSPGO all indirect call targets are counted torwards the
1650 // original indirect call site in the profile, including both
1651 // inlined and non-inlined targets.
1652 if (!FunctionSamples::ProfileIsCS) {
1653 if (const FunctionSamplesMap *M =
1654 FS->findFunctionSamplesMapAt(CallSite)) {
1655 for (const auto &NameFS : *M)
1656 Sum += NameFS.second.getHeadSamplesEstimate();
1657 }
1658 }
1659 if (Sum)
1660 updateIDTMetaData(I, SortedCallTargets, Sum);
1661 else if (OverwriteExistingWeights)
1662 I.setMetadata(LLVMContext::MD_prof, nullptr);
1663 } else if (!isa<IntrinsicInst>(&I)) {
1664 I.setMetadata(LLVMContext::MD_prof,
1665 MDB.createBranchWeights(
1666 {static_cast<uint32_t>(BlockWeights[BB])}));
1667 }
1668 }
1669 } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) {
1670 // Set profile metadata (possibly annotated by LTO prelink) to zero or
1671 // clear it for cold code.
1672 for (auto &I : *BB) {
1673 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
1674 if (cast<CallBase>(I).isIndirectCall())
1675 I.setMetadata(LLVMContext::MD_prof, nullptr);
1676 else
1677 I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(0));
1678 }
1679 }
1680 }
1681
1682 Instruction *TI = BB->getTerminator();
1683 if (TI->getNumSuccessors() == 1)
1684 continue;
1685 if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&
1686 !isa<IndirectBrInst>(TI))
1687 continue;
1688
1689 DebugLoc BranchLoc = TI->getDebugLoc();
1690 LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
1691 << ((BranchLoc) ? Twine(BranchLoc.getLine())
1692 : Twine("<UNKNOWN LOCATION>"))
1693 << ".\n");
1694 SmallVector<uint32_t, 4> Weights;
1695 uint32_t MaxWeight = 0;
1696 Instruction *MaxDestInst;
1697 // Since profi treats multiple edges (multiway branches) as a single edge,
1698 // we need to distribute the computed weight among the branches. We do
1699 // this by evenly splitting the edge weight among destinations.
1700 DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity;
1701 std::vector<uint64_t> EdgeIndex;
1702 if (SampleProfileUseProfi) {
1703 EdgeIndex.resize(TI->getNumSuccessors());
1704 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1705 const BasicBlock *Succ = TI->getSuccessor(I);
1706 EdgeIndex[I] = EdgeMultiplicity[Succ];
1707 EdgeMultiplicity[Succ]++;
1708 }
1709 }
1710 for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
1711 BasicBlock *Succ = TI->getSuccessor(I);
1712 Edge E = std::make_pair(BB, Succ);
1713 uint64_t Weight = EdgeWeights[E];
1714 LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
1715 // Use uint32_t saturated arithmetic to adjust the incoming weights,
1716 // if needed. Sample counts in profiles are 64-bit unsigned values,
1717 // but internally branch weights are expressed as 32-bit values.
1718 if (Weight > std::numeric_limits<uint32_t>::max()) {
1719 LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
1720 Weight = std::numeric_limits<uint32_t>::max();
1721 }
1722 if (!SampleProfileUseProfi) {
1723 // Weight is added by one to avoid propagation errors introduced by
1724 // 0 weights.
1725 Weights.push_back(static_cast<uint32_t>(Weight + 1));
1726 } else {
1727 // Profi creates proper weights that do not require "+1" adjustments but
1728 // we evenly split the weight among branches with the same destination.
1729 uint64_t W = Weight / EdgeMultiplicity[Succ];
1730 // Rounding up, if needed, so that first branches are hotter.
1731 if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
1732 W++;
1733 Weights.push_back(static_cast<uint32_t>(W));
1734 }
1735 if (Weight != 0) {
1736 if (Weight > MaxWeight) {
1737 MaxWeight = Weight;
1738 MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
1739 }
1740 }
1741 }
1742
1743 misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
1744
1745 uint64_t TempWeight;
1746 // Only set weights if there is at least one non-zero weight.
1747 // In any other case, let the analyzer set weights.
1748 // Do not set weights if the weights are present unless under
1749 // OverwriteExistingWeights. In ThinLTO, the profile annotation is done
1750 // twice. If the first annotation already set the weights, the second pass
1751 // does not need to set it. With OverwriteExistingWeights, Blocks with zero
1752 // weight should have their existing metadata (possibly annotated by LTO
1753 // prelink) cleared.
1754 if (MaxWeight > 0 &&
1755 (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
1756 LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
1757 TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
1758 ORE->emit([&]() {
1759 return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
1760 << "most popular destination for conditional branches at "
1761 << ore::NV("CondBranchesLoc", BranchLoc);
1762 });
1763 } else {
1764 if (OverwriteExistingWeights) {
1765 TI->setMetadata(LLVMContext::MD_prof, nullptr);
1766 LLVM_DEBUG(dbgs() << "CLEARED. All branch weights are zero.\n");
1767 } else {
1768 LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
1769 }
1770 }
1771 }
1772 }
1773
1774 /// Once all the branch weights are computed, we emit the MD_prof
1775 /// metadata on BB using the computed values for each of its branches.
1776 ///
1777 /// \param F The function to query.
1778 ///
1779 /// \returns true if \p F was modified. Returns false, otherwise.
emitAnnotations(Function & F)1780 bool SampleProfileLoader::emitAnnotations(Function &F) {
1781 bool Changed = false;
1782
1783 if (FunctionSamples::ProfileIsProbeBased) {
1784 if (!ProbeManager->profileIsValid(F, *Samples)) {
1785 LLVM_DEBUG(
1786 dbgs() << "Profile is invalid due to CFG mismatch for Function "
1787 << F.getName());
1788 ++NumMismatchedProfile;
1789 return false;
1790 }
1791 ++NumMatchedProfile;
1792 } else {
1793 if (getFunctionLoc(F) == 0)
1794 return false;
1795
1796 LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
1797 << F.getName() << ": " << getFunctionLoc(F) << "\n");
1798 }
1799
1800 DenseSet<GlobalValue::GUID> InlinedGUIDs;
1801 if (CallsitePrioritizedInline)
1802 Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
1803 else
1804 Changed |= inlineHotFunctions(F, InlinedGUIDs);
1805
1806 Changed |= computeAndPropagateWeights(F, InlinedGUIDs);
1807
1808 if (Changed)
1809 generateMDProfMetadata(F);
1810
1811 emitCoverageRemarks(F);
1812 return Changed;
1813 }
1814
1815 std::unique_ptr<ProfiledCallGraph>
buildProfiledCallGraph(CallGraph & CG)1816 SampleProfileLoader::buildProfiledCallGraph(CallGraph &CG) {
1817 std::unique_ptr<ProfiledCallGraph> ProfiledCG;
1818 if (FunctionSamples::ProfileIsCS)
1819 ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker);
1820 else
1821 ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles());
1822
1823 // Add all functions into the profiled call graph even if they are not in
1824 // the profile. This makes sure functions missing from the profile still
1825 // gets a chance to be processed.
1826 for (auto &Node : CG) {
1827 const auto *F = Node.first;
1828 if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile"))
1829 continue;
1830 ProfiledCG->addProfiledFunction(FunctionSamples::getCanonicalFnName(*F));
1831 }
1832
1833 return ProfiledCG;
1834 }
1835
1836 std::vector<Function *>
buildFunctionOrder(Module & M,CallGraph * CG)1837 SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
1838 std::vector<Function *> FunctionOrderList;
1839 FunctionOrderList.reserve(M.size());
1840
1841 if (!ProfileTopDownLoad && UseProfiledCallGraph)
1842 errs() << "WARNING: -use-profiled-call-graph ignored, should be used "
1843 "together with -sample-profile-top-down-load.\n";
1844
1845 if (!ProfileTopDownLoad || CG == nullptr) {
1846 if (ProfileMergeInlinee) {
1847 // Disable ProfileMergeInlinee if profile is not loaded in top down order,
1848 // because the profile for a function may be used for the profile
1849 // annotation of its outline copy before the profile merging of its
1850 // non-inlined inline instances, and that is not the way how
1851 // ProfileMergeInlinee is supposed to work.
1852 ProfileMergeInlinee = false;
1853 }
1854
1855 for (Function &F : M)
1856 if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
1857 FunctionOrderList.push_back(&F);
1858 return FunctionOrderList;
1859 }
1860
1861 assert(&CG->getModule() == &M);
1862
1863 if (UseProfiledCallGraph || (FunctionSamples::ProfileIsCS &&
1864 !UseProfiledCallGraph.getNumOccurrences())) {
1865 // Use profiled call edges to augment the top-down order. There are cases
1866 // that the top-down order computed based on the static call graph doesn't
1867 // reflect real execution order. For example
1868 //
1869 // 1. Incomplete static call graph due to unknown indirect call targets.
1870 // Adjusting the order by considering indirect call edges from the
1871 // profile can enable the inlining of indirect call targets by allowing
1872 // the caller processed before them.
1873 // 2. Mutual call edges in an SCC. The static processing order computed for
1874 // an SCC may not reflect the call contexts in the context-sensitive
1875 // profile, thus may cause potential inlining to be overlooked. The
1876 // function order in one SCC is being adjusted to a top-down order based
1877 // on the profile to favor more inlining. This is only a problem with CS
1878 // profile.
1879 // 3. Transitive indirect call edges due to inlining. When a callee function
1880 // (say B) is inlined into into a caller function (say A) in LTO prelink,
1881 // every call edge originated from the callee B will be transferred to
1882 // the caller A. If any transferred edge (say A->C) is indirect, the
1883 // original profiled indirect edge B->C, even if considered, would not
1884 // enforce a top-down order from the caller A to the potential indirect
1885 // call target C in LTO postlink since the inlined callee B is gone from
1886 // the static call graph.
1887 // 4. #3 can happen even for direct call targets, due to functions defined
1888 // in header files. A header function (say A), when included into source
1889 // files, is defined multiple times but only one definition survives due
1890 // to ODR. Therefore, the LTO prelink inlining done on those dropped
1891 // definitions can be useless based on a local file scope. More
1892 // importantly, the inlinee (say B), once fully inlined to a
1893 // to-be-dropped A, will have no profile to consume when its outlined
1894 // version is compiled. This can lead to a profile-less prelink
1895 // compilation for the outlined version of B which may be called from
1896 // external modules. while this isn't easy to fix, we rely on the
1897 // postlink AutoFDO pipeline to optimize B. Since the survived copy of
1898 // the A can be inlined in its local scope in prelink, it may not exist
1899 // in the merged IR in postlink, and we'll need the profiled call edges
1900 // to enforce a top-down order for the rest of the functions.
1901 //
1902 // Considering those cases, a profiled call graph completely independent of
1903 // the static call graph is constructed based on profile data, where
1904 // function objects are not even needed to handle case #3 and case 4.
1905 //
1906 // Note that static callgraph edges are completely ignored since they
1907 // can be conflicting with profiled edges for cyclic SCCs and may result in
1908 // an SCC order incompatible with profile-defined one. Using strictly
1909 // profile order ensures a maximum inlining experience. On the other hand,
1910 // static call edges are not so important when they don't correspond to a
1911 // context in the profile.
1912
1913 std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(*CG);
1914 scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
1915 while (!CGI.isAtEnd()) {
1916 auto Range = *CGI;
1917 if (SortProfiledSCC) {
1918 // Sort nodes in one SCC based on callsite hotness.
1919 scc_member_iterator<ProfiledCallGraph *> SI(*CGI);
1920 Range = *SI;
1921 }
1922 for (auto *Node : Range) {
1923 Function *F = SymbolMap.lookup(Node->Name);
1924 if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
1925 FunctionOrderList.push_back(F);
1926 }
1927 ++CGI;
1928 }
1929 } else {
1930 scc_iterator<CallGraph *> CGI = scc_begin(CG);
1931 while (!CGI.isAtEnd()) {
1932 for (CallGraphNode *Node : *CGI) {
1933 auto *F = Node->getFunction();
1934 if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
1935 FunctionOrderList.push_back(F);
1936 }
1937 ++CGI;
1938 }
1939 }
1940
1941 LLVM_DEBUG({
1942 dbgs() << "Function processing order:\n";
1943 for (auto F : reverse(FunctionOrderList)) {
1944 dbgs() << F->getName() << "\n";
1945 }
1946 });
1947
1948 std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
1949 return FunctionOrderList;
1950 }
1951
doInitialization(Module & M,FunctionAnalysisManager * FAM)1952 bool SampleProfileLoader::doInitialization(Module &M,
1953 FunctionAnalysisManager *FAM) {
1954 auto &Ctx = M.getContext();
1955
1956 auto ReaderOrErr = SampleProfileReader::create(
1957 Filename, Ctx, FSDiscriminatorPass::Base, RemappingFilename);
1958 if (std::error_code EC = ReaderOrErr.getError()) {
1959 std::string Msg = "Could not open profile: " + EC.message();
1960 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1961 return false;
1962 }
1963 Reader = std::move(ReaderOrErr.get());
1964 Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
1965 // set module before reading the profile so reader may be able to only
1966 // read the function profiles which are used by the current module.
1967 Reader->setModule(&M);
1968 if (std::error_code EC = Reader->read()) {
1969 std::string Msg = "profile reading failed: " + EC.message();
1970 Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
1971 return false;
1972 }
1973
1974 PSL = Reader->getProfileSymbolList();
1975
1976 // While profile-sample-accurate is on, ignore symbol list.
1977 ProfAccForSymsInList =
1978 ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate;
1979 if (ProfAccForSymsInList) {
1980 NamesInProfile.clear();
1981 if (auto NameTable = Reader->getNameTable())
1982 NamesInProfile.insert(NameTable->begin(), NameTable->end());
1983 CoverageTracker.setProfAccForSymsInList(true);
1984 }
1985
1986 if (FAM && !ProfileInlineReplayFile.empty()) {
1987 ExternalInlineAdvisor = getReplayInlineAdvisor(
1988 M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr,
1989 ReplayInlinerSettings{ProfileInlineReplayFile,
1990 ProfileInlineReplayScope,
1991 ProfileInlineReplayFallback,
1992 {ProfileInlineReplayFormat}},
1993 /*EmitRemarks=*/false, InlineContext{LTOPhase, InlinePass::ReplaySampleProfileInliner});
1994 }
1995
1996 // Apply tweaks if context-sensitive or probe-based profile is available.
1997 if (Reader->profileIsCS() || Reader->profileIsPreInlined() ||
1998 Reader->profileIsProbeBased()) {
1999 if (!UseIterativeBFIInference.getNumOccurrences())
2000 UseIterativeBFIInference = true;
2001 if (!SampleProfileUseProfi.getNumOccurrences())
2002 SampleProfileUseProfi = true;
2003 if (!EnableExtTspBlockPlacement.getNumOccurrences())
2004 EnableExtTspBlockPlacement = true;
2005 // Enable priority-base inliner and size inline by default for CSSPGO.
2006 if (!ProfileSizeInline.getNumOccurrences())
2007 ProfileSizeInline = true;
2008 if (!CallsitePrioritizedInline.getNumOccurrences())
2009 CallsitePrioritizedInline = true;
2010 // For CSSPGO, we also allow recursive inline to best use context profile.
2011 if (!AllowRecursiveInline.getNumOccurrences())
2012 AllowRecursiveInline = true;
2013
2014 if (Reader->profileIsPreInlined()) {
2015 if (!UsePreInlinerDecision.getNumOccurrences())
2016 UsePreInlinerDecision = true;
2017 }
2018
2019 if (!Reader->profileIsCS()) {
2020 // Non-CS profile should be fine without a function size budget for the
2021 // inliner since the contexts in the profile are either all from inlining
2022 // in the prevoius build or pre-computed by the preinliner with a size
2023 // cap, thus they are bounded.
2024 if (!ProfileInlineLimitMin.getNumOccurrences())
2025 ProfileInlineLimitMin = std::numeric_limits<unsigned>::max();
2026 if (!ProfileInlineLimitMax.getNumOccurrences())
2027 ProfileInlineLimitMax = std::numeric_limits<unsigned>::max();
2028 }
2029 }
2030
2031 if (Reader->profileIsCS()) {
2032 // Tracker for profiles under different context
2033 ContextTracker = std::make_unique<SampleContextTracker>(
2034 Reader->getProfiles(), &GUIDToFuncNameMap);
2035 }
2036
2037 // Load pseudo probe descriptors for probe-based function samples.
2038 if (Reader->profileIsProbeBased()) {
2039 ProbeManager = std::make_unique<PseudoProbeManager>(M);
2040 if (!ProbeManager->moduleIsProbed(M)) {
2041 const char *Msg =
2042 "Pseudo-probe-based profile requires SampleProfileProbePass";
2043 Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg,
2044 DS_Warning));
2045 return false;
2046 }
2047 }
2048
2049 if (ReportProfileStaleness || PersistProfileStaleness) {
2050 MatchingManager =
2051 std::make_unique<SampleProfileMatcher>(M, *Reader, ProbeManager.get());
2052 }
2053
2054 return true;
2055 }
2056
detectProfileMismatch(const Function & F,const FunctionSamples & FS)2057 void SampleProfileMatcher::detectProfileMismatch(const Function &F,
2058 const FunctionSamples &FS) {
2059 if (FunctionSamples::ProfileIsProbeBased) {
2060 uint64_t Count = FS.getTotalSamples();
2061 TotalFuncHashSamples += Count;
2062 TotalProfiledFunc++;
2063 if (!ProbeManager->profileIsValid(F, FS)) {
2064 MismatchedFuncHashSamples += Count;
2065 NumMismatchedFuncHash++;
2066 return;
2067 }
2068 }
2069
2070 std::unordered_set<LineLocation, LineLocationHash> MatchedCallsiteLocs;
2071
2072 // Go through all the callsites on the IR and flag the callsite if the target
2073 // name is the same as the one in the profile.
2074 for (auto &BB : F) {
2075 for (auto &I : BB) {
2076 if (!isa<CallBase>(&I) || isa<IntrinsicInst>(&I))
2077 continue;
2078
2079 const auto *CB = dyn_cast<CallBase>(&I);
2080 if (auto &DLoc = I.getDebugLoc()) {
2081 LineLocation IRCallsite = FunctionSamples::getCallSiteIdentifier(DLoc);
2082
2083 StringRef CalleeName;
2084 if (Function *Callee = CB->getCalledFunction())
2085 CalleeName = FunctionSamples::getCanonicalFnName(Callee->getName());
2086
2087 const auto CTM = FS.findCallTargetMapAt(IRCallsite);
2088 const auto CallsiteFS = FS.findFunctionSamplesMapAt(IRCallsite);
2089
2090 // Indirect call case.
2091 if (CalleeName.empty()) {
2092 // Since indirect call does not have the CalleeName, check
2093 // conservatively if callsite in the profile is a callsite location.
2094 // This is to avoid nums of false positive since otherwise all the
2095 // indirect call samples will be reported as mismatching.
2096 if ((CTM && !CTM->empty()) || (CallsiteFS && !CallsiteFS->empty()))
2097 MatchedCallsiteLocs.insert(IRCallsite);
2098 } else {
2099 // Check if the call target name is matched for direct call case.
2100 if ((CTM && CTM->count(CalleeName)) ||
2101 (CallsiteFS && CallsiteFS->count(CalleeName)))
2102 MatchedCallsiteLocs.insert(IRCallsite);
2103 }
2104 }
2105 }
2106 }
2107
2108 auto isInvalidLineOffset = [](uint32_t LineOffset) {
2109 return LineOffset & 0x8000;
2110 };
2111
2112 // Check if there are any callsites in the profile that does not match to any
2113 // IR callsites, those callsite samples will be discarded.
2114 for (auto &I : FS.getBodySamples()) {
2115 const LineLocation &Loc = I.first;
2116 if (isInvalidLineOffset(Loc.LineOffset))
2117 continue;
2118
2119 uint64_t Count = I.second.getSamples();
2120 if (!I.second.getCallTargets().empty()) {
2121 TotalCallsiteSamples += Count;
2122 TotalProfiledCallsites++;
2123 if (!MatchedCallsiteLocs.count(Loc)) {
2124 MismatchedCallsiteSamples += Count;
2125 NumMismatchedCallsites++;
2126 }
2127 }
2128 }
2129
2130 for (auto &I : FS.getCallsiteSamples()) {
2131 const LineLocation &Loc = I.first;
2132 if (isInvalidLineOffset(Loc.LineOffset))
2133 continue;
2134
2135 uint64_t Count = 0;
2136 for (auto &FM : I.second) {
2137 Count += FM.second.getHeadSamplesEstimate();
2138 }
2139 TotalCallsiteSamples += Count;
2140 TotalProfiledCallsites++;
2141 if (!MatchedCallsiteLocs.count(Loc)) {
2142 MismatchedCallsiteSamples += Count;
2143 NumMismatchedCallsites++;
2144 }
2145 }
2146 }
2147
detectProfileMismatch()2148 void SampleProfileMatcher::detectProfileMismatch() {
2149 for (auto &F : M) {
2150 if (F.isDeclaration() || !F.hasFnAttribute("use-sample-profile"))
2151 continue;
2152 FunctionSamples *FS = Reader.getSamplesFor(F);
2153 if (!FS)
2154 continue;
2155 detectProfileMismatch(F, *FS);
2156 }
2157
2158 if (ReportProfileStaleness) {
2159 if (FunctionSamples::ProfileIsProbeBased) {
2160 errs() << "(" << NumMismatchedFuncHash << "/" << TotalProfiledFunc << ")"
2161 << " of functions' profile are invalid and "
2162 << " (" << MismatchedFuncHashSamples << "/" << TotalFuncHashSamples
2163 << ")"
2164 << " of samples are discarded due to function hash mismatch.\n";
2165 }
2166 errs() << "(" << NumMismatchedCallsites << "/" << TotalProfiledCallsites
2167 << ")"
2168 << " of callsites' profile are invalid and "
2169 << "(" << MismatchedCallsiteSamples << "/" << TotalCallsiteSamples
2170 << ")"
2171 << " of samples are discarded due to callsite location mismatch.\n";
2172 }
2173
2174 if (PersistProfileStaleness) {
2175 LLVMContext &Ctx = M.getContext();
2176 MDBuilder MDB(Ctx);
2177
2178 SmallVector<std::pair<StringRef, uint64_t>> ProfStatsVec;
2179 if (FunctionSamples::ProfileIsProbeBased) {
2180 ProfStatsVec.emplace_back("NumMismatchedFuncHash", NumMismatchedFuncHash);
2181 ProfStatsVec.emplace_back("TotalProfiledFunc", TotalProfiledFunc);
2182 ProfStatsVec.emplace_back("MismatchedFuncHashSamples",
2183 MismatchedFuncHashSamples);
2184 ProfStatsVec.emplace_back("TotalFuncHashSamples", TotalFuncHashSamples);
2185 }
2186
2187 ProfStatsVec.emplace_back("NumMismatchedCallsites", NumMismatchedCallsites);
2188 ProfStatsVec.emplace_back("TotalProfiledCallsites", TotalProfiledCallsites);
2189 ProfStatsVec.emplace_back("MismatchedCallsiteSamples",
2190 MismatchedCallsiteSamples);
2191 ProfStatsVec.emplace_back("TotalCallsiteSamples", TotalCallsiteSamples);
2192
2193 auto *MD = MDB.createLLVMStats(ProfStatsVec);
2194 auto *NMD = M.getOrInsertNamedMetadata("llvm.stats");
2195 NMD->addOperand(MD);
2196 }
2197 }
2198
runOnModule(Module & M,ModuleAnalysisManager * AM,ProfileSummaryInfo * _PSI,CallGraph * CG)2199 bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
2200 ProfileSummaryInfo *_PSI, CallGraph *CG) {
2201 GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
2202
2203 PSI = _PSI;
2204 if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
2205 M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
2206 ProfileSummary::PSK_Sample);
2207 PSI->refresh();
2208 }
2209 // Compute the total number of samples collected in this profile.
2210 for (const auto &I : Reader->getProfiles())
2211 TotalCollectedSamples += I.second.getTotalSamples();
2212
2213 auto Remapper = Reader->getRemapper();
2214 // Populate the symbol map.
2215 for (const auto &N_F : M.getValueSymbolTable()) {
2216 StringRef OrigName = N_F.getKey();
2217 Function *F = dyn_cast<Function>(N_F.getValue());
2218 if (F == nullptr || OrigName.empty())
2219 continue;
2220 SymbolMap[OrigName] = F;
2221 StringRef NewName = FunctionSamples::getCanonicalFnName(*F);
2222 if (OrigName != NewName && !NewName.empty()) {
2223 auto r = SymbolMap.insert(std::make_pair(NewName, F));
2224 // Failiing to insert means there is already an entry in SymbolMap,
2225 // thus there are multiple functions that are mapped to the same
2226 // stripped name. In this case of name conflicting, set the value
2227 // to nullptr to avoid confusion.
2228 if (!r.second)
2229 r.first->second = nullptr;
2230 OrigName = NewName;
2231 }
2232 // Insert the remapped names into SymbolMap.
2233 if (Remapper) {
2234 if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
2235 if (*MapName != OrigName && !MapName->empty())
2236 SymbolMap.insert(std::make_pair(*MapName, F));
2237 }
2238 }
2239 }
2240 assert(SymbolMap.count(StringRef()) == 0 &&
2241 "No empty StringRef should be added in SymbolMap");
2242
2243 if (ReportProfileStaleness || PersistProfileStaleness)
2244 MatchingManager->detectProfileMismatch();
2245
2246 bool retval = false;
2247 for (auto *F : buildFunctionOrder(M, CG)) {
2248 assert(!F->isDeclaration());
2249 clearFunctionData();
2250 retval |= runOnFunction(*F, AM);
2251 }
2252
2253 // Account for cold calls not inlined....
2254 if (!FunctionSamples::ProfileIsCS)
2255 for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
2256 notInlinedCallInfo)
2257 updateProfileCallee(pair.first, pair.second.entryCount);
2258
2259 return retval;
2260 }
2261
runOnFunction(Function & F,ModuleAnalysisManager * AM)2262 bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
2263 LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
2264 DILocation2SampleMap.clear();
2265 // By default the entry count is initialized to -1, which will be treated
2266 // conservatively by getEntryCount as the same as unknown (None). This is
2267 // to avoid newly added code to be treated as cold. If we have samples
2268 // this will be overwritten in emitAnnotations.
2269 uint64_t initialEntryCount = -1;
2270
2271 ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
2272 if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
2273 // initialize all the function entry counts to 0. It means all the
2274 // functions without profile will be regarded as cold.
2275 initialEntryCount = 0;
2276 // profile-sample-accurate is a user assertion which has a higher precedence
2277 // than symbol list. When profile-sample-accurate is on, ignore symbol list.
2278 ProfAccForSymsInList = false;
2279 }
2280 CoverageTracker.setProfAccForSymsInList(ProfAccForSymsInList);
2281
2282 // PSL -- profile symbol list include all the symbols in sampled binary.
2283 // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
2284 // old functions without samples being cold, without having to worry
2285 // about new and hot functions being mistakenly treated as cold.
2286 if (ProfAccForSymsInList) {
2287 // Initialize the entry count to 0 for functions in the list.
2288 if (PSL->contains(F.getName()))
2289 initialEntryCount = 0;
2290
2291 // Function in the symbol list but without sample will be regarded as
2292 // cold. To minimize the potential negative performance impact it could
2293 // have, we want to be a little conservative here saying if a function
2294 // shows up in the profile, no matter as outline function, inline instance
2295 // or call targets, treat the function as not being cold. This will handle
2296 // the cases such as most callsites of a function are inlined in sampled
2297 // binary but not inlined in current build (because of source code drift,
2298 // imprecise debug information, or the callsites are all cold individually
2299 // but not cold accumulatively...), so the outline function showing up as
2300 // cold in sampled binary will actually not be cold after current build.
2301 StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
2302 if (NamesInProfile.count(CanonName))
2303 initialEntryCount = -1;
2304 }
2305
2306 // Initialize entry count when the function has no existing entry
2307 // count value.
2308 if (!F.getEntryCount())
2309 F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
2310 std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
2311 if (AM) {
2312 auto &FAM =
2313 AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
2314 .getManager();
2315 ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
2316 } else {
2317 OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
2318 ORE = OwnedORE.get();
2319 }
2320
2321 if (FunctionSamples::ProfileIsCS)
2322 Samples = ContextTracker->getBaseSamplesFor(F);
2323 else
2324 Samples = Reader->getSamplesFor(F);
2325
2326 if (Samples && !Samples->empty())
2327 return emitAnnotations(F);
2328 return false;
2329 }
2330
run(Module & M,ModuleAnalysisManager & AM)2331 PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
2332 ModuleAnalysisManager &AM) {
2333 FunctionAnalysisManager &FAM =
2334 AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
2335
2336 auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
2337 return FAM.getResult<AssumptionAnalysis>(F);
2338 };
2339 auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
2340 return FAM.getResult<TargetIRAnalysis>(F);
2341 };
2342 auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
2343 return FAM.getResult<TargetLibraryAnalysis>(F);
2344 };
2345
2346 SampleProfileLoader SampleLoader(
2347 ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
2348 ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
2349 : ProfileRemappingFileName,
2350 LTOPhase, GetAssumptionCache, GetTTI, GetTLI);
2351
2352 if (!SampleLoader.doInitialization(M, &FAM))
2353 return PreservedAnalyses::all();
2354
2355 ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
2356 CallGraph &CG = AM.getResult<CallGraphAnalysis>(M);
2357 if (!SampleLoader.runOnModule(M, &AM, PSI, &CG))
2358 return PreservedAnalyses::all();
2359
2360 return PreservedAnalyses::none();
2361 }
2362