1 //===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "ProfileGenerator.h"
9 #include "ErrorHandling.h"
10 #include "MissingFrameInferrer.h"
11 #include "PerfReader.h"
12 #include "ProfiledBinary.h"
13 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
14 #include "llvm/ProfileData/ProfileCommon.h"
15 #include <algorithm>
16 #include <float.h>
17 #include <unordered_set>
18 #include <utility>
19
20 cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
21 cl::Required,
22 cl::desc("Output profile file"));
23 static cl::alias OutputA("o", cl::desc("Alias for --output"),
24 cl::aliasopt(OutputFilename));
25
26 static cl::opt<SampleProfileFormat> OutputFormat(
27 "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary),
28 cl::values(
29 clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"),
30 clEnumValN(SPF_Compact_Binary, "compbinary", "Compact binary encoding"),
31 clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"),
32 clEnumValN(SPF_Text, "text", "Text encoding"),
33 clEnumValN(SPF_GCC, "gcc",
34 "GCC encoding (only meaningful for -sample)")));
35
36 static cl::opt<bool> UseMD5(
37 "use-md5", cl::Hidden,
38 cl::desc("Use md5 to represent function names in the output profile (only "
39 "meaningful for -extbinary)"));
40
41 static cl::opt<bool> PopulateProfileSymbolList(
42 "populate-profile-symbol-list", cl::init(false), cl::Hidden,
43 cl::desc("Populate profile symbol list (only meaningful for -extbinary)"));
44
45 static cl::opt<bool> FillZeroForAllFuncs(
46 "fill-zero-for-all-funcs", cl::init(false), cl::Hidden,
47 cl::desc("Attribute all functions' range with zero count "
48 "even it's not hit by any samples."));
49
50 static cl::opt<int32_t, true> RecursionCompression(
51 "compress-recursion",
52 cl::desc("Compressing recursion by deduplicating adjacent frame "
53 "sequences up to the specified size. -1 means no size limit."),
54 cl::Hidden,
55 cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
56
57 static cl::opt<bool>
58 TrimColdProfile("trim-cold-profile",
59 cl::desc("If the total count of the profile is smaller "
60 "than threshold, it will be trimmed."));
61
62 static cl::opt<bool> CSProfMergeColdContext(
63 "csprof-merge-cold-context", cl::init(true),
64 cl::desc("If the total count of context profile is smaller than "
65 "the threshold, it will be merged into context-less base "
66 "profile."));
67
68 static cl::opt<uint32_t> CSProfMaxColdContextDepth(
69 "csprof-max-cold-context-depth", cl::init(1),
70 cl::desc("Keep the last K contexts while merging cold profile. 1 means the "
71 "context-less base profile"));
72
73 static cl::opt<int, true> CSProfMaxContextDepth(
74 "csprof-max-context-depth",
75 cl::desc("Keep the last K contexts while merging profile. -1 means no "
76 "depth limit."),
77 cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
78
79 static cl::opt<double> HotFunctionDensityThreshold(
80 "hot-function-density-threshold", llvm::cl::init(1000),
81 llvm::cl::desc(
82 "specify density threshold for hot functions (default: 1000)"),
83 llvm::cl::Optional);
84 static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
85 llvm::cl::desc("show profile density details"),
86 llvm::cl::Optional);
87
88 static cl::opt<bool> UpdateTotalSamples(
89 "update-total-samples", llvm::cl::init(false),
90 llvm::cl::desc(
91 "Update total samples by accumulating all its body samples."),
92 llvm::cl::Optional);
93
94 static cl::opt<bool> GenCSNestedProfile(
95 "gen-cs-nested-profile", cl::Hidden, cl::init(true),
96 cl::desc("Generate nested function profiles for CSSPGO"));
97
98 cl::opt<bool> InferMissingFrames(
99 "infer-missing-frames", llvm::cl::init(true),
100 llvm::cl::desc(
101 "Infer missing call frames due to compiler tail call elimination."),
102 llvm::cl::Optional);
103
104 using namespace llvm;
105 using namespace sampleprof;
106
107 namespace llvm {
108 extern cl::opt<int> ProfileSummaryCutoffHot;
109 extern cl::opt<bool> UseContextLessSummary;
110
111 namespace sampleprof {
112
113 // Initialize the MaxCompressionSize to -1 which means no size limit
114 int32_t CSProfileGenerator::MaxCompressionSize = -1;
115
116 int CSProfileGenerator::MaxContextDepth = -1;
117
118 bool ProfileGeneratorBase::UseFSDiscriminator = false;
119
120 std::unique_ptr<ProfileGeneratorBase>
create(ProfiledBinary * Binary,const ContextSampleCounterMap * SampleCounters,bool ProfileIsCS)121 ProfileGeneratorBase::create(ProfiledBinary *Binary,
122 const ContextSampleCounterMap *SampleCounters,
123 bool ProfileIsCS) {
124 std::unique_ptr<ProfileGeneratorBase> Generator;
125 if (ProfileIsCS) {
126 if (Binary->useFSDiscriminator())
127 exitWithError("FS discriminator is not supported in CS profile.");
128 Generator.reset(new CSProfileGenerator(Binary, SampleCounters));
129 } else {
130 Generator.reset(new ProfileGenerator(Binary, SampleCounters));
131 }
132 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
133 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
134
135 return Generator;
136 }
137
138 std::unique_ptr<ProfileGeneratorBase>
create(ProfiledBinary * Binary,SampleProfileMap & Profiles,bool ProfileIsCS)139 ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles,
140 bool ProfileIsCS) {
141 std::unique_ptr<ProfileGeneratorBase> Generator;
142 if (ProfileIsCS) {
143 if (Binary->useFSDiscriminator())
144 exitWithError("FS discriminator is not supported in CS profile.");
145 Generator.reset(new CSProfileGenerator(Binary, Profiles));
146 } else {
147 Generator.reset(new ProfileGenerator(Binary, std::move(Profiles)));
148 }
149 ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
150 FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
151
152 return Generator;
153 }
154
write(std::unique_ptr<SampleProfileWriter> Writer,SampleProfileMap & ProfileMap)155 void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer,
156 SampleProfileMap &ProfileMap) {
157 // Populate profile symbol list if extended binary format is used.
158 ProfileSymbolList SymbolList;
159
160 if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) {
161 Binary->populateSymbolListFromDWARF(SymbolList);
162 Writer->setProfileSymbolList(&SymbolList);
163 }
164
165 if (std::error_code EC = Writer->write(ProfileMap))
166 exitWithError(std::move(EC));
167 }
168
write()169 void ProfileGeneratorBase::write() {
170 auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat);
171 if (std::error_code EC = WriterOrErr.getError())
172 exitWithError(EC, OutputFilename);
173
174 if (UseMD5) {
175 if (OutputFormat != SPF_Ext_Binary)
176 WithColor::warning() << "-use-md5 is ignored. Specify "
177 "--format=extbinary to enable it\n";
178 else
179 WriterOrErr.get()->setUseMD5();
180 }
181
182 write(std::move(WriterOrErr.get()), ProfileMap);
183 }
184
showDensitySuggestion(double Density)185 void ProfileGeneratorBase::showDensitySuggestion(double Density) {
186 if (Density == 0.0)
187 WithColor::warning() << "The --profile-summary-cutoff-hot option may be "
188 "set too low. Please check your command.\n";
189 else if (Density < HotFunctionDensityThreshold)
190 WithColor::warning()
191 << "AutoFDO is estimated to optimize better with "
192 << format("%.1f", HotFunctionDensityThreshold / Density)
193 << "x more samples. Please consider increasing sampling rate or "
194 "profiling for longer duration to get more samples.\n";
195
196 if (ShowDensity)
197 outs() << "Minimum profile density for hot functions with top "
198 << format("%.2f",
199 static_cast<double>(ProfileSummaryCutoffHot.getValue()) /
200 10000)
201 << "% total samples: " << format("%.1f", Density) << "\n";
202 }
203
calculateDensity(const SampleProfileMap & Profiles,uint64_t HotCntThreshold)204 double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles,
205 uint64_t HotCntThreshold) {
206 double Density = DBL_MAX;
207 std::vector<const FunctionSamples *> HotFuncs;
208 for (auto &I : Profiles) {
209 auto &FuncSamples = I.second;
210 if (FuncSamples.getTotalSamples() < HotCntThreshold)
211 continue;
212 HotFuncs.emplace_back(&FuncSamples);
213 }
214
215 for (auto *FuncSamples : HotFuncs) {
216 auto *Func = Binary->getBinaryFunction(FuncSamples->getName());
217 if (!Func)
218 continue;
219 uint64_t FuncSize = Func->getFuncSize();
220 if (FuncSize == 0)
221 continue;
222 Density =
223 std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) /
224 FuncSize);
225 }
226
227 return Density == DBL_MAX ? 0.0 : Density;
228 }
229
findDisjointRanges(RangeSample & DisjointRanges,const RangeSample & Ranges)230 void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
231 const RangeSample &Ranges) {
232
233 /*
234 Regions may overlap with each other. Using the boundary info, find all
235 disjoint ranges and their sample count. BoundaryPoint contains the count
236 multiple samples begin/end at this points.
237
238 |<--100-->| Sample1
239 |<------200------>| Sample2
240 A B C
241
242 In the example above,
243 Sample1 begins at A, ends at B, its value is 100.
244 Sample2 beings at A, ends at C, its value is 200.
245 For A, BeginCount is the sum of sample begins at A, which is 300 and no
246 samples ends at A, so EndCount is 0.
247 Then boundary points A, B, and C with begin/end counts are:
248 A: (300, 0)
249 B: (0, 100)
250 C: (0, 200)
251 */
252 struct BoundaryPoint {
253 // Sum of sample counts beginning at this point
254 uint64_t BeginCount = UINT64_MAX;
255 // Sum of sample counts ending at this point
256 uint64_t EndCount = UINT64_MAX;
257 // Is the begin point of a zero range.
258 bool IsZeroRangeBegin = false;
259 // Is the end point of a zero range.
260 bool IsZeroRangeEnd = false;
261
262 void addBeginCount(uint64_t Count) {
263 if (BeginCount == UINT64_MAX)
264 BeginCount = 0;
265 BeginCount += Count;
266 }
267
268 void addEndCount(uint64_t Count) {
269 if (EndCount == UINT64_MAX)
270 EndCount = 0;
271 EndCount += Count;
272 }
273 };
274
275 /*
276 For the above example. With boundary points, follwing logic finds two
277 disjoint region of
278
279 [A,B]: 300
280 [B+1,C]: 200
281
282 If there is a boundary point that both begin and end, the point itself
283 becomes a separate disjoint region. For example, if we have original
284 ranges of
285
286 |<--- 100 --->|
287 |<--- 200 --->|
288 A B C
289
290 there are three boundary points with their begin/end counts of
291
292 A: (100, 0)
293 B: (200, 100)
294 C: (0, 200)
295
296 the disjoint ranges would be
297
298 [A, B-1]: 100
299 [B, B]: 300
300 [B+1, C]: 200.
301
302 Example for zero value range:
303
304 |<--- 100 --->|
305 |<--- 200 --->|
306 |<--------------- 0 ----------------->|
307 A B C D E F
308
309 [A, B-1] : 0
310 [B, C] : 100
311 [C+1, D-1]: 0
312 [D, E] : 200
313 [E+1, F] : 0
314 */
315 std::map<uint64_t, BoundaryPoint> Boundaries;
316
317 for (const auto &Item : Ranges) {
318 assert(Item.first.first <= Item.first.second &&
319 "Invalid instruction range");
320 auto &BeginPoint = Boundaries[Item.first.first];
321 auto &EndPoint = Boundaries[Item.first.second];
322 uint64_t Count = Item.second;
323
324 BeginPoint.addBeginCount(Count);
325 EndPoint.addEndCount(Count);
326 if (Count == 0) {
327 BeginPoint.IsZeroRangeBegin = true;
328 EndPoint.IsZeroRangeEnd = true;
329 }
330 }
331
332 // Use UINT64_MAX to indicate there is no existing range between BeginAddress
333 // and the next valid address
334 uint64_t BeginAddress = UINT64_MAX;
335 int ZeroRangeDepth = 0;
336 uint64_t Count = 0;
337 for (const auto &Item : Boundaries) {
338 uint64_t Address = Item.first;
339 const BoundaryPoint &Point = Item.second;
340 if (Point.BeginCount != UINT64_MAX) {
341 if (BeginAddress != UINT64_MAX)
342 DisjointRanges[{BeginAddress, Address - 1}] = Count;
343 Count += Point.BeginCount;
344 BeginAddress = Address;
345 ZeroRangeDepth += Point.IsZeroRangeBegin;
346 }
347 if (Point.EndCount != UINT64_MAX) {
348 assert((BeginAddress != UINT64_MAX) &&
349 "First boundary point cannot be 'end' point");
350 DisjointRanges[{BeginAddress, Address}] = Count;
351 assert(Count >= Point.EndCount && "Mismatched live ranges");
352 Count -= Point.EndCount;
353 BeginAddress = Address + 1;
354 ZeroRangeDepth -= Point.IsZeroRangeEnd;
355 // If the remaining count is zero and it's no longer in a zero range, this
356 // means we consume all the ranges before, thus mark BeginAddress as
357 // UINT64_MAX. e.g. supposing we have two non-overlapping ranges:
358 // [<---- 10 ---->]
359 // [<---- 20 ---->]
360 // A B C D
361 // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't
362 // have the [B+1, C-1] zero range.
363 if (Count == 0 && ZeroRangeDepth == 0)
364 BeginAddress = UINT64_MAX;
365 }
366 }
367 }
368
updateBodySamplesforFunctionProfile(FunctionSamples & FunctionProfile,const SampleContextFrame & LeafLoc,uint64_t Count)369 void ProfileGeneratorBase::updateBodySamplesforFunctionProfile(
370 FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc,
371 uint64_t Count) {
372 // Use the maximum count of samples with same line location
373 uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator);
374
375 // Use duplication factor to compensated for loop unroll/vectorization.
376 // Note that this is only needed when we're taking MAX of the counts at
377 // the location instead of SUM.
378 Count *= getDuplicationFactor(LeafLoc.Location.Discriminator);
379
380 ErrorOr<uint64_t> R =
381 FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator);
382
383 uint64_t PreviousCount = R ? R.get() : 0;
384 if (PreviousCount <= Count) {
385 FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator,
386 Count - PreviousCount);
387 }
388 }
389
updateTotalSamples()390 void ProfileGeneratorBase::updateTotalSamples() {
391 for (auto &Item : ProfileMap) {
392 FunctionSamples &FunctionProfile = Item.second;
393 FunctionProfile.updateTotalSamples();
394 }
395 }
396
updateCallsiteSamples()397 void ProfileGeneratorBase::updateCallsiteSamples() {
398 for (auto &Item : ProfileMap) {
399 FunctionSamples &FunctionProfile = Item.second;
400 FunctionProfile.updateCallsiteSamples();
401 }
402 }
403
updateFunctionSamples()404 void ProfileGeneratorBase::updateFunctionSamples() {
405 updateCallsiteSamples();
406
407 if (UpdateTotalSamples)
408 updateTotalSamples();
409 }
410
collectProfiledFunctions()411 void ProfileGeneratorBase::collectProfiledFunctions() {
412 std::unordered_set<const BinaryFunction *> ProfiledFunctions;
413 if (collectFunctionsFromRawProfile(ProfiledFunctions))
414 Binary->setProfiledFunctions(ProfiledFunctions);
415 else if (collectFunctionsFromLLVMProfile(ProfiledFunctions))
416 Binary->setProfiledFunctions(ProfiledFunctions);
417 else
418 llvm_unreachable("Unsupported input profile");
419 }
420
collectFunctionsFromRawProfile(std::unordered_set<const BinaryFunction * > & ProfiledFunctions)421 bool ProfileGeneratorBase::collectFunctionsFromRawProfile(
422 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
423 if (!SampleCounters)
424 return false;
425 // Go through all the stacks, ranges and branches in sample counters, use
426 // the start of the range to look up the function it belongs and record the
427 // function.
428 for (const auto &CI : *SampleCounters) {
429 if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(CI.first.getPtr())) {
430 for (auto StackAddr : CtxKey->Context) {
431 if (FuncRange *FRange = Binary->findFuncRange(StackAddr))
432 ProfiledFunctions.insert(FRange->Func);
433 }
434 }
435
436 for (auto Item : CI.second.RangeCounter) {
437 uint64_t StartAddress = Item.first.first;
438 if (FuncRange *FRange = Binary->findFuncRange(StartAddress))
439 ProfiledFunctions.insert(FRange->Func);
440 }
441
442 for (auto Item : CI.second.BranchCounter) {
443 uint64_t SourceAddress = Item.first.first;
444 uint64_t TargetAddress = Item.first.second;
445 if (FuncRange *FRange = Binary->findFuncRange(SourceAddress))
446 ProfiledFunctions.insert(FRange->Func);
447 if (FuncRange *FRange = Binary->findFuncRange(TargetAddress))
448 ProfiledFunctions.insert(FRange->Func);
449 }
450 }
451 return true;
452 }
453
collectFunctionsFromLLVMProfile(std::unordered_set<const BinaryFunction * > & ProfiledFunctions)454 bool ProfileGenerator::collectFunctionsFromLLVMProfile(
455 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
456 for (const auto &FS : ProfileMap) {
457 if (auto *Func = Binary->getBinaryFunction(FS.first.getName()))
458 ProfiledFunctions.insert(Func);
459 }
460 return true;
461 }
462
collectFunctionsFromLLVMProfile(std::unordered_set<const BinaryFunction * > & ProfiledFunctions)463 bool CSProfileGenerator::collectFunctionsFromLLVMProfile(
464 std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
465 for (auto *Node : ContextTracker) {
466 if (!Node->getFuncName().empty())
467 if (auto *Func = Binary->getBinaryFunction(Node->getFuncName()))
468 ProfiledFunctions.insert(Func);
469 }
470 return true;
471 }
472
473 FunctionSamples &
getTopLevelFunctionProfile(StringRef FuncName)474 ProfileGenerator::getTopLevelFunctionProfile(StringRef FuncName) {
475 SampleContext Context(FuncName);
476 auto Ret = ProfileMap.emplace(Context, FunctionSamples());
477 if (Ret.second) {
478 FunctionSamples &FProfile = Ret.first->second;
479 FProfile.setContext(Context);
480 }
481 return Ret.first->second;
482 }
483
generateProfile()484 void ProfileGenerator::generateProfile() {
485 collectProfiledFunctions();
486
487 if (Binary->usePseudoProbes())
488 Binary->decodePseudoProbe();
489
490 if (SampleCounters) {
491 if (Binary->usePseudoProbes()) {
492 generateProbeBasedProfile();
493 } else {
494 generateLineNumBasedProfile();
495 }
496 }
497
498 postProcessProfiles();
499 }
500
postProcessProfiles()501 void ProfileGenerator::postProcessProfiles() {
502 computeSummaryAndThreshold(ProfileMap);
503 trimColdProfiles(ProfileMap, ColdCountThreshold);
504 calculateAndShowDensity(ProfileMap);
505 }
506
trimColdProfiles(const SampleProfileMap & Profiles,uint64_t ColdCntThreshold)507 void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles,
508 uint64_t ColdCntThreshold) {
509 if (!TrimColdProfile)
510 return;
511
512 // Move cold profiles into a tmp container.
513 std::vector<SampleContext> ColdProfiles;
514 for (const auto &I : ProfileMap) {
515 if (I.second.getTotalSamples() < ColdCntThreshold)
516 ColdProfiles.emplace_back(I.first);
517 }
518
519 // Remove the cold profile from ProfileMap.
520 for (const auto &I : ColdProfiles)
521 ProfileMap.erase(I);
522 }
523
generateLineNumBasedProfile()524 void ProfileGenerator::generateLineNumBasedProfile() {
525 assert(SampleCounters->size() == 1 &&
526 "Must have one entry for profile generation.");
527 const SampleCounter &SC = SampleCounters->begin()->second;
528 // Fill in function body samples
529 populateBodySamplesForAllFunctions(SC.RangeCounter);
530 // Fill in boundary sample counts as well as call site samples for calls
531 populateBoundarySamplesForAllFunctions(SC.BranchCounter);
532
533 updateFunctionSamples();
534 }
535
generateProbeBasedProfile()536 void ProfileGenerator::generateProbeBasedProfile() {
537 assert(SampleCounters->size() == 1 &&
538 "Must have one entry for profile generation.");
539 // Enable pseudo probe functionalities in SampleProf
540 FunctionSamples::ProfileIsProbeBased = true;
541 const SampleCounter &SC = SampleCounters->begin()->second;
542 // Fill in function body samples
543 populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter);
544 // Fill in boundary sample counts as well as call site samples for calls
545 populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter);
546
547 updateFunctionSamples();
548 }
549
populateBodySamplesWithProbesForAllFunctions(const RangeSample & RangeCounter)550 void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions(
551 const RangeSample &RangeCounter) {
552 ProbeCounterMap ProbeCounter;
553 // preprocessRangeCounter returns disjoint ranges, so no longer to redo it
554 // inside extractProbesFromRange.
555 extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter,
556 false);
557
558 for (const auto &PI : ProbeCounter) {
559 const MCDecodedPseudoProbe *Probe = PI.first;
560 uint64_t Count = PI.second;
561 SampleContextFrameVector FrameVec;
562 Binary->getInlineContextForProbe(Probe, FrameVec, true);
563 FunctionSamples &FunctionProfile =
564 getLeafProfileAndAddTotalSamples(FrameVec, Count);
565 FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count);
566 if (Probe->isEntry())
567 FunctionProfile.addHeadSamples(Count);
568 }
569 }
570
populateBoundarySamplesWithProbesForAllFunctions(const BranchSample & BranchCounters)571 void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions(
572 const BranchSample &BranchCounters) {
573 for (const auto &Entry : BranchCounters) {
574 uint64_t SourceAddress = Entry.first.first;
575 uint64_t TargetAddress = Entry.first.second;
576 uint64_t Count = Entry.second;
577 assert(Count != 0 && "Unexpected zero weight branch");
578
579 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
580 if (CalleeName.size() == 0)
581 continue;
582
583 const MCDecodedPseudoProbe *CallProbe =
584 Binary->getCallProbeForAddr(SourceAddress);
585 if (CallProbe == nullptr)
586 continue;
587
588 // Record called target sample and its count.
589 SampleContextFrameVector FrameVec;
590 Binary->getInlineContextForProbe(CallProbe, FrameVec, true);
591
592 if (!FrameVec.empty()) {
593 FunctionSamples &FunctionProfile =
594 getLeafProfileAndAddTotalSamples(FrameVec, 0);
595 FunctionProfile.addCalledTargetSamples(
596 FrameVec.back().Location.LineOffset, 0, CalleeName, Count);
597 }
598 }
599 }
600
getLeafProfileAndAddTotalSamples(const SampleContextFrameVector & FrameVec,uint64_t Count)601 FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples(
602 const SampleContextFrameVector &FrameVec, uint64_t Count) {
603 // Get top level profile
604 FunctionSamples *FunctionProfile =
605 &getTopLevelFunctionProfile(FrameVec[0].FuncName);
606 FunctionProfile->addTotalSamples(Count);
607 if (Binary->usePseudoProbes()) {
608 const auto *FuncDesc = Binary->getFuncDescForGUID(
609 Function::getGUID(FunctionProfile->getName()));
610 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
611 }
612
613 for (size_t I = 1; I < FrameVec.size(); I++) {
614 LineLocation Callsite(
615 FrameVec[I - 1].Location.LineOffset,
616 getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator));
617 FunctionSamplesMap &SamplesMap =
618 FunctionProfile->functionSamplesAt(Callsite);
619 auto Ret =
620 SamplesMap.emplace(FrameVec[I].FuncName.str(), FunctionSamples());
621 if (Ret.second) {
622 SampleContext Context(FrameVec[I].FuncName);
623 Ret.first->second.setContext(Context);
624 }
625 FunctionProfile = &Ret.first->second;
626 FunctionProfile->addTotalSamples(Count);
627 if (Binary->usePseudoProbes()) {
628 const auto *FuncDesc = Binary->getFuncDescForGUID(
629 Function::getGUID(FunctionProfile->getName()));
630 FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
631 }
632 }
633
634 return *FunctionProfile;
635 }
636
637 RangeSample
preprocessRangeCounter(const RangeSample & RangeCounter)638 ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) {
639 RangeSample Ranges(RangeCounter.begin(), RangeCounter.end());
640 if (FillZeroForAllFuncs) {
641 for (auto &FuncI : Binary->getAllBinaryFunctions()) {
642 for (auto &R : FuncI.second.Ranges) {
643 Ranges[{R.first, R.second - 1}] += 0;
644 }
645 }
646 } else {
647 // For each range, we search for all ranges of the function it belongs to
648 // and initialize it with zero count, so it remains zero if doesn't hit any
649 // samples. This is to be consistent with compiler that interpret zero count
650 // as unexecuted(cold).
651 for (const auto &I : RangeCounter) {
652 uint64_t StartAddress = I.first.first;
653 for (const auto &Range : Binary->getRanges(StartAddress))
654 Ranges[{Range.first, Range.second - 1}] += 0;
655 }
656 }
657 RangeSample DisjointRanges;
658 findDisjointRanges(DisjointRanges, Ranges);
659 return DisjointRanges;
660 }
661
populateBodySamplesForAllFunctions(const RangeSample & RangeCounter)662 void ProfileGenerator::populateBodySamplesForAllFunctions(
663 const RangeSample &RangeCounter) {
664 for (const auto &Range : preprocessRangeCounter(RangeCounter)) {
665 uint64_t RangeBegin = Range.first.first;
666 uint64_t RangeEnd = Range.first.second;
667 uint64_t Count = Range.second;
668
669 InstructionPointer IP(Binary, RangeBegin, true);
670 // Disjoint ranges may have range in the middle of two instr,
671 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
672 // can be Addr1+1 to Addr2-1. We should ignore such range.
673 if (IP.Address > RangeEnd)
674 continue;
675
676 do {
677 const SampleContextFrameVector FrameVec =
678 Binary->getFrameLocationStack(IP.Address);
679 if (!FrameVec.empty()) {
680 // FIXME: As accumulating total count per instruction caused some
681 // regression, we changed to accumulate total count per byte as a
682 // workaround. Tuning hotness threshold on the compiler side might be
683 // necessary in the future.
684 FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples(
685 FrameVec, Count * Binary->getInstSize(IP.Address));
686 updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(),
687 Count);
688 }
689 } while (IP.advance() && IP.Address <= RangeEnd);
690 }
691 }
692
693 StringRef
getCalleeNameForAddress(uint64_t TargetAddress)694 ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) {
695 // Get the function range by branch target if it's a call branch.
696 auto *FRange = Binary->findFuncRangeForStartAddr(TargetAddress);
697
698 // We won't accumulate sample count for a range whose start is not the real
699 // function entry such as outlined function or inner labels.
700 if (!FRange || !FRange->IsFuncEntry)
701 return StringRef();
702
703 return FunctionSamples::getCanonicalFnName(FRange->getFuncName());
704 }
705
populateBoundarySamplesForAllFunctions(const BranchSample & BranchCounters)706 void ProfileGenerator::populateBoundarySamplesForAllFunctions(
707 const BranchSample &BranchCounters) {
708 for (const auto &Entry : BranchCounters) {
709 uint64_t SourceAddress = Entry.first.first;
710 uint64_t TargetAddress = Entry.first.second;
711 uint64_t Count = Entry.second;
712 assert(Count != 0 && "Unexpected zero weight branch");
713
714 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
715 if (CalleeName.size() == 0)
716 continue;
717 // Record called target sample and its count.
718 const SampleContextFrameVector &FrameVec =
719 Binary->getCachedFrameLocationStack(SourceAddress);
720 if (!FrameVec.empty()) {
721 FunctionSamples &FunctionProfile =
722 getLeafProfileAndAddTotalSamples(FrameVec, 0);
723 FunctionProfile.addCalledTargetSamples(
724 FrameVec.back().Location.LineOffset,
725 getBaseDiscriminator(FrameVec.back().Location.Discriminator),
726 CalleeName, Count);
727 }
728 // Add head samples for callee.
729 FunctionSamples &CalleeProfile = getTopLevelFunctionProfile(CalleeName);
730 CalleeProfile.addHeadSamples(Count);
731 }
732 }
733
calculateAndShowDensity(const SampleProfileMap & Profiles)734 void ProfileGeneratorBase::calculateAndShowDensity(
735 const SampleProfileMap &Profiles) {
736 double Density = calculateDensity(Profiles, HotCountThreshold);
737 showDensitySuggestion(Density);
738 }
739
740 FunctionSamples *
getOrCreateFunctionSamples(ContextTrieNode * ContextNode,bool WasLeafInlined)741 CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode,
742 bool WasLeafInlined) {
743 FunctionSamples *FProfile = ContextNode->getFunctionSamples();
744 if (!FProfile) {
745 FSamplesList.emplace_back();
746 FProfile = &FSamplesList.back();
747 FProfile->setName(ContextNode->getFuncName());
748 ContextNode->setFunctionSamples(FProfile);
749 }
750 // Update ContextWasInlined attribute for existing contexts.
751 // The current function can be called in two ways:
752 // - when processing a probe of the current frame
753 // - when processing the entry probe of an inlinee's frame, which
754 // is then used to update the callsite count of the current frame.
755 // The two can happen in any order, hence here we are making sure
756 // `ContextWasInlined` is always set as expected.
757 // TODO: Note that the former does not always happen if no probes of the
758 // current frame has samples, and if the latter happens, we could lose the
759 // attribute. This should be fixed.
760 if (WasLeafInlined)
761 FProfile->getContext().setAttribute(ContextWasInlined);
762 return FProfile;
763 }
764
765 ContextTrieNode *
getOrCreateContextNode(const SampleContextFrames Context,bool WasLeafInlined)766 CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context,
767 bool WasLeafInlined) {
768 ContextTrieNode *ContextNode =
769 ContextTracker.getOrCreateContextPath(Context, true);
770 getOrCreateFunctionSamples(ContextNode, WasLeafInlined);
771 return ContextNode;
772 }
773
generateProfile()774 void CSProfileGenerator::generateProfile() {
775 FunctionSamples::ProfileIsCS = true;
776
777 collectProfiledFunctions();
778
779 if (Binary->usePseudoProbes()) {
780 Binary->decodePseudoProbe();
781 if (InferMissingFrames)
782 initializeMissingFrameInferrer();
783 }
784
785 if (SampleCounters) {
786 if (Binary->usePseudoProbes()) {
787 generateProbeBasedProfile();
788 } else {
789 generateLineNumBasedProfile();
790 }
791 }
792
793 if (Binary->getTrackFuncContextSize())
794 computeSizeForProfiledFunctions();
795
796 postProcessProfiles();
797 }
798
initializeMissingFrameInferrer()799 void CSProfileGenerator::initializeMissingFrameInferrer() {
800 Binary->getMissingContextInferrer()->initialize(SampleCounters);
801 }
802
inferMissingFrames(const SmallVectorImpl<uint64_t> & Context,SmallVectorImpl<uint64_t> & NewContext)803 void CSProfileGenerator::inferMissingFrames(
804 const SmallVectorImpl<uint64_t> &Context,
805 SmallVectorImpl<uint64_t> &NewContext) {
806 Binary->inferMissingFrames(Context, NewContext);
807 }
808
computeSizeForProfiledFunctions()809 void CSProfileGenerator::computeSizeForProfiledFunctions() {
810 for (auto *Func : Binary->getProfiledFunctions())
811 Binary->computeInlinedContextSizeForFunc(Func);
812
813 // Flush the symbolizer to save memory.
814 Binary->flushSymbolizer();
815 }
816
updateFunctionSamples()817 void CSProfileGenerator::updateFunctionSamples() {
818 for (auto *Node : ContextTracker) {
819 FunctionSamples *FSamples = Node->getFunctionSamples();
820 if (FSamples) {
821 if (UpdateTotalSamples)
822 FSamples->updateTotalSamples();
823 FSamples->updateCallsiteSamples();
824 }
825 }
826 }
827
generateLineNumBasedProfile()828 void CSProfileGenerator::generateLineNumBasedProfile() {
829 for (const auto &CI : *SampleCounters) {
830 const auto *CtxKey = cast<StringBasedCtxKey>(CI.first.getPtr());
831
832 ContextTrieNode *ContextNode = &getRootContext();
833 // Sample context will be empty if the jump is an external-to-internal call
834 // pattern, the head samples should be added for the internal function.
835 if (!CtxKey->Context.empty()) {
836 // Get or create function profile for the range
837 ContextNode =
838 getOrCreateContextNode(CtxKey->Context, CtxKey->WasLeafInlined);
839 // Fill in function body samples
840 populateBodySamplesForFunction(*ContextNode->getFunctionSamples(),
841 CI.second.RangeCounter);
842 }
843 // Fill in boundary sample counts as well as call site samples for calls
844 populateBoundarySamplesForFunction(ContextNode, CI.second.BranchCounter);
845 }
846 // Fill in call site value sample for inlined calls and also use context to
847 // infer missing samples. Since we don't have call count for inlined
848 // functions, we estimate it from inlinee's profile using the entry of the
849 // body sample.
850 populateInferredFunctionSamples(getRootContext());
851
852 updateFunctionSamples();
853 }
854
populateBodySamplesForFunction(FunctionSamples & FunctionProfile,const RangeSample & RangeCounter)855 void CSProfileGenerator::populateBodySamplesForFunction(
856 FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) {
857 // Compute disjoint ranges first, so we can use MAX
858 // for calculating count for each location.
859 RangeSample Ranges;
860 findDisjointRanges(Ranges, RangeCounter);
861 for (const auto &Range : Ranges) {
862 uint64_t RangeBegin = Range.first.first;
863 uint64_t RangeEnd = Range.first.second;
864 uint64_t Count = Range.second;
865 // Disjoint ranges have introduce zero-filled gap that
866 // doesn't belong to current context, filter them out.
867 if (Count == 0)
868 continue;
869
870 InstructionPointer IP(Binary, RangeBegin, true);
871 // Disjoint ranges may have range in the middle of two instr,
872 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
873 // can be Addr1+1 to Addr2-1. We should ignore such range.
874 if (IP.Address > RangeEnd)
875 continue;
876
877 do {
878 auto LeafLoc = Binary->getInlineLeafFrameLoc(IP.Address);
879 if (LeafLoc) {
880 // Recording body sample for this specific context
881 updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count);
882 FunctionProfile.addTotalSamples(Count);
883 }
884 } while (IP.advance() && IP.Address <= RangeEnd);
885 }
886 }
887
populateBoundarySamplesForFunction(ContextTrieNode * Node,const BranchSample & BranchCounters)888 void CSProfileGenerator::populateBoundarySamplesForFunction(
889 ContextTrieNode *Node, const BranchSample &BranchCounters) {
890
891 for (const auto &Entry : BranchCounters) {
892 uint64_t SourceAddress = Entry.first.first;
893 uint64_t TargetAddress = Entry.first.second;
894 uint64_t Count = Entry.second;
895 assert(Count != 0 && "Unexpected zero weight branch");
896
897 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
898 if (CalleeName.size() == 0)
899 continue;
900
901 ContextTrieNode *CallerNode = Node;
902 LineLocation CalleeCallSite(0, 0);
903 if (CallerNode != &getRootContext()) {
904 // Record called target sample and its count
905 auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceAddress);
906 if (LeafLoc) {
907 CallerNode->getFunctionSamples()->addCalledTargetSamples(
908 LeafLoc->Location.LineOffset,
909 getBaseDiscriminator(LeafLoc->Location.Discriminator), CalleeName,
910 Count);
911 // Record head sample for called target(callee)
912 CalleeCallSite = LeafLoc->Location;
913 }
914 }
915
916 ContextTrieNode *CalleeNode =
917 CallerNode->getOrCreateChildContext(CalleeCallSite, CalleeName);
918 FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(CalleeNode);
919 CalleeProfile->addHeadSamples(Count);
920 }
921 }
922
populateInferredFunctionSamples(ContextTrieNode & Node)923 void CSProfileGenerator::populateInferredFunctionSamples(
924 ContextTrieNode &Node) {
925 // There is no call jmp sample between the inliner and inlinee, we need to use
926 // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s
927 // sample depends on child(inlinee)'s sample, so traverse the tree in
928 // post-order.
929 for (auto &It : Node.getAllChildContext())
930 populateInferredFunctionSamples(It.second);
931
932 FunctionSamples *CalleeProfile = Node.getFunctionSamples();
933 if (!CalleeProfile)
934 return;
935 // If we already have head sample counts, we must have value profile
936 // for call sites added already. Skip to avoid double counting.
937 if (CalleeProfile->getHeadSamples())
938 return;
939 ContextTrieNode *CallerNode = Node.getParentContext();
940 // If we don't have context, nothing to do for caller's call site.
941 // This could happen for entry point function.
942 if (CallerNode == &getRootContext())
943 return;
944
945 LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc();
946 FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(CallerNode);
947 // Since we don't have call count for inlined functions, we
948 // estimate it from inlinee's profile using entry body sample.
949 uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate();
950 // If we don't have samples with location, use 1 to indicate live.
951 if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size())
952 EstimatedCallCount = 1;
953 CallerProfile.addCalledTargetSamples(CallerLeafFrameLoc.LineOffset,
954 CallerLeafFrameLoc.Discriminator,
955 Node.getFuncName(), EstimatedCallCount);
956 CallerProfile.addBodySamples(CallerLeafFrameLoc.LineOffset,
957 CallerLeafFrameLoc.Discriminator,
958 EstimatedCallCount);
959 CallerProfile.addTotalSamples(EstimatedCallCount);
960 }
961
convertToProfileMap(ContextTrieNode & Node,SampleContextFrameVector & Context)962 void CSProfileGenerator::convertToProfileMap(
963 ContextTrieNode &Node, SampleContextFrameVector &Context) {
964 FunctionSamples *FProfile = Node.getFunctionSamples();
965 if (FProfile) {
966 Context.emplace_back(Node.getFuncName(), LineLocation(0, 0));
967 // Save the new context for future references.
968 SampleContextFrames NewContext = *Contexts.insert(Context).first;
969 auto Ret = ProfileMap.emplace(NewContext, std::move(*FProfile));
970 FunctionSamples &NewProfile = Ret.first->second;
971 NewProfile.getContext().setContext(NewContext);
972 Context.pop_back();
973 }
974
975 for (auto &It : Node.getAllChildContext()) {
976 ContextTrieNode &ChildNode = It.second;
977 Context.emplace_back(Node.getFuncName(), ChildNode.getCallSiteLoc());
978 convertToProfileMap(ChildNode, Context);
979 Context.pop_back();
980 }
981 }
982
convertToProfileMap()983 void CSProfileGenerator::convertToProfileMap() {
984 assert(ProfileMap.empty() &&
985 "ProfileMap should be empty before converting from the trie");
986 assert(IsProfileValidOnTrie &&
987 "Do not convert the trie twice, it's already destroyed");
988
989 SampleContextFrameVector Context;
990 for (auto &It : getRootContext().getAllChildContext())
991 convertToProfileMap(It.second, Context);
992
993 IsProfileValidOnTrie = false;
994 }
995
postProcessProfiles()996 void CSProfileGenerator::postProcessProfiles() {
997 // Compute hot/cold threshold based on profile. This will be used for cold
998 // context profile merging/trimming.
999 computeSummaryAndThreshold();
1000
1001 // Run global pre-inliner to adjust/merge context profile based on estimated
1002 // inline decisions.
1003 if (EnableCSPreInliner) {
1004 ContextTracker.populateFuncToCtxtMap();
1005 CSPreInliner(ContextTracker, *Binary, Summary.get()).run();
1006 // Turn off the profile merger by default unless it is explicitly enabled.
1007 if (!CSProfMergeColdContext.getNumOccurrences())
1008 CSProfMergeColdContext = false;
1009 }
1010
1011 convertToProfileMap();
1012
1013 // Trim and merge cold context profile using cold threshold above.
1014 if (TrimColdProfile || CSProfMergeColdContext) {
1015 SampleContextTrimmer(ProfileMap)
1016 .trimAndMergeColdContextProfiles(
1017 HotCountThreshold, TrimColdProfile, CSProfMergeColdContext,
1018 CSProfMaxColdContextDepth, EnableCSPreInliner);
1019 }
1020
1021 // Merge function samples of CS profile to calculate profile density.
1022 sampleprof::SampleProfileMap ContextLessProfiles;
1023 for (const auto &I : ProfileMap) {
1024 ContextLessProfiles[I.second.getName()].merge(I.second);
1025 }
1026
1027 calculateAndShowDensity(ContextLessProfiles);
1028 if (GenCSNestedProfile) {
1029 CSProfileConverter CSConverter(ProfileMap);
1030 CSConverter.convertProfiles();
1031 FunctionSamples::ProfileIsCS = false;
1032 }
1033 }
1034
computeSummaryAndThreshold(SampleProfileMap & Profiles)1035 void ProfileGeneratorBase::computeSummaryAndThreshold(
1036 SampleProfileMap &Profiles) {
1037 SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
1038 Summary = Builder.computeSummaryForProfiles(Profiles);
1039 HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold(
1040 (Summary->getDetailedSummary()));
1041 ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
1042 (Summary->getDetailedSummary()));
1043 }
1044
computeSummaryAndThreshold()1045 void CSProfileGenerator::computeSummaryAndThreshold() {
1046 // Always merge and use context-less profile map to compute summary.
1047 SampleProfileMap ContextLessProfiles;
1048 ContextTracker.createContextLessProfileMap(ContextLessProfiles);
1049
1050 // Set the flag below to avoid merging the profile again in
1051 // computeSummaryAndThreshold
1052 FunctionSamples::ProfileIsCS = false;
1053 assert(
1054 (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) &&
1055 "Don't set --profile-summary-contextless to false for profile "
1056 "generation");
1057 ProfileGeneratorBase::computeSummaryAndThreshold(ContextLessProfiles);
1058 // Recover the old value.
1059 FunctionSamples::ProfileIsCS = true;
1060 }
1061
extractProbesFromRange(const RangeSample & RangeCounter,ProbeCounterMap & ProbeCounter,bool FindDisjointRanges)1062 void ProfileGeneratorBase::extractProbesFromRange(
1063 const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter,
1064 bool FindDisjointRanges) {
1065 const RangeSample *PRanges = &RangeCounter;
1066 RangeSample Ranges;
1067 if (FindDisjointRanges) {
1068 findDisjointRanges(Ranges, RangeCounter);
1069 PRanges = &Ranges;
1070 }
1071
1072 for (const auto &Range : *PRanges) {
1073 uint64_t RangeBegin = Range.first.first;
1074 uint64_t RangeEnd = Range.first.second;
1075 uint64_t Count = Range.second;
1076
1077 InstructionPointer IP(Binary, RangeBegin, true);
1078 // Disjoint ranges may have range in the middle of two instr,
1079 // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
1080 // can be Addr1+1 to Addr2-1. We should ignore such range.
1081 if (IP.Address > RangeEnd)
1082 continue;
1083
1084 do {
1085 const AddressProbesMap &Address2ProbesMap =
1086 Binary->getAddress2ProbesMap();
1087 auto It = Address2ProbesMap.find(IP.Address);
1088 if (It != Address2ProbesMap.end()) {
1089 for (const auto &Probe : It->second) {
1090 ProbeCounter[&Probe] += Count;
1091 }
1092 }
1093 } while (IP.advance() && IP.Address <= RangeEnd);
1094 }
1095 }
1096
extractPrefixContextStack(SampleContextFrameVector & ContextStack,const SmallVectorImpl<uint64_t> & AddrVec,ProfiledBinary * Binary)1097 static void extractPrefixContextStack(SampleContextFrameVector &ContextStack,
1098 const SmallVectorImpl<uint64_t> &AddrVec,
1099 ProfiledBinary *Binary) {
1100 SmallVector<const MCDecodedPseudoProbe *, 16> Probes;
1101 for (auto Address : reverse(AddrVec)) {
1102 const MCDecodedPseudoProbe *CallProbe =
1103 Binary->getCallProbeForAddr(Address);
1104 // These could be the cases when a probe is not found at a calliste. Cutting
1105 // off the context from here since the inliner will not know how to consume
1106 // a context with unknown callsites.
1107 // 1. for functions that are not sampled when
1108 // --decode-probe-for-profiled-functions-only is on.
1109 // 2. for a merged callsite. Callsite merging may cause the loss of original
1110 // probe IDs.
1111 // 3. for an external callsite.
1112 if (!CallProbe)
1113 break;
1114 Probes.push_back(CallProbe);
1115 }
1116
1117 std::reverse(Probes.begin(), Probes.end());
1118
1119 // Extract context stack for reusing, leaf context stack will be added
1120 // compressed while looking up function profile.
1121 for (const auto *P : Probes) {
1122 Binary->getInlineContextForProbe(P, ContextStack, true);
1123 }
1124 }
1125
generateProbeBasedProfile()1126 void CSProfileGenerator::generateProbeBasedProfile() {
1127 // Enable pseudo probe functionalities in SampleProf
1128 FunctionSamples::ProfileIsProbeBased = true;
1129 for (const auto &CI : *SampleCounters) {
1130 const AddrBasedCtxKey *CtxKey =
1131 dyn_cast<AddrBasedCtxKey>(CI.first.getPtr());
1132 // Fill in function body samples from probes, also infer caller's samples
1133 // from callee's probe
1134 populateBodySamplesWithProbes(CI.second.RangeCounter, CtxKey);
1135 // Fill in boundary samples for a call probe
1136 populateBoundarySamplesWithProbes(CI.second.BranchCounter, CtxKey);
1137 }
1138 }
1139
populateBodySamplesWithProbes(const RangeSample & RangeCounter,const AddrBasedCtxKey * CtxKey)1140 void CSProfileGenerator::populateBodySamplesWithProbes(
1141 const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) {
1142 ProbeCounterMap ProbeCounter;
1143 // Extract the top frame probes by looking up each address among the range in
1144 // the Address2ProbeMap
1145 extractProbesFromRange(RangeCounter, ProbeCounter);
1146 std::unordered_map<MCDecodedPseudoProbeInlineTree *,
1147 std::unordered_set<FunctionSamples *>>
1148 FrameSamples;
1149 for (const auto &PI : ProbeCounter) {
1150 const MCDecodedPseudoProbe *Probe = PI.first;
1151 uint64_t Count = PI.second;
1152 // Disjoint ranges have introduce zero-filled gap that
1153 // doesn't belong to current context, filter them out.
1154 if (!Probe->isBlock() || Count == 0)
1155 continue;
1156
1157 ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, Probe);
1158 FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples();
1159 // Record the current frame and FunctionProfile whenever samples are
1160 // collected for non-danglie probes. This is for reporting all of the
1161 // zero count probes of the frame later.
1162 FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile);
1163 FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count);
1164 FunctionProfile.addTotalSamples(Count);
1165 if (Probe->isEntry()) {
1166 FunctionProfile.addHeadSamples(Count);
1167 // Look up for the caller's function profile
1168 const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
1169 ContextTrieNode *CallerNode = ContextNode->getParentContext();
1170 if (InlinerDesc != nullptr && CallerNode != &getRootContext()) {
1171 // Since the context id will be compressed, we have to use callee's
1172 // context id to infer caller's context id to ensure they share the
1173 // same context prefix.
1174 uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset;
1175 assert(CallerIndex &&
1176 "Inferred caller's location index shouldn't be zero!");
1177 FunctionSamples &CallerProfile =
1178 *getOrCreateFunctionSamples(CallerNode);
1179 CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
1180 CallerProfile.addBodySamples(CallerIndex, 0, Count);
1181 CallerProfile.addTotalSamples(Count);
1182 CallerProfile.addCalledTargetSamples(CallerIndex, 0,
1183 ContextNode->getFuncName(), Count);
1184 }
1185 }
1186 }
1187
1188 // Assign zero count for remaining probes without sample hits to
1189 // differentiate from probes optimized away, of which the counts are unknown
1190 // and will be inferred by the compiler.
1191 for (auto &I : FrameSamples) {
1192 for (auto *FunctionProfile : I.second) {
1193 for (auto *Probe : I.first->getProbes()) {
1194 FunctionProfile->addBodySamplesForProbe(Probe->getIndex(), 0);
1195 }
1196 }
1197 }
1198 }
1199
populateBoundarySamplesWithProbes(const BranchSample & BranchCounter,const AddrBasedCtxKey * CtxKey)1200 void CSProfileGenerator::populateBoundarySamplesWithProbes(
1201 const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) {
1202 for (const auto &BI : BranchCounter) {
1203 uint64_t SourceAddress = BI.first.first;
1204 uint64_t TargetAddress = BI.first.second;
1205 uint64_t Count = BI.second;
1206 const MCDecodedPseudoProbe *CallProbe =
1207 Binary->getCallProbeForAddr(SourceAddress);
1208 if (CallProbe == nullptr)
1209 continue;
1210 FunctionSamples &FunctionProfile =
1211 getFunctionProfileForLeafProbe(CtxKey, CallProbe);
1212 FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count);
1213 FunctionProfile.addTotalSamples(Count);
1214 StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1215 if (CalleeName.size() == 0)
1216 continue;
1217 FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), 0, CalleeName,
1218 Count);
1219 }
1220 }
1221
getContextNodeForLeafProbe(const AddrBasedCtxKey * CtxKey,const MCDecodedPseudoProbe * LeafProbe)1222 ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe(
1223 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1224
1225 const SmallVectorImpl<uint64_t> *PContext = &CtxKey->Context;
1226 SmallVector<uint64_t, 16> NewContext;
1227
1228 if (InferMissingFrames) {
1229 SmallVector<uint64_t, 16> Context = CtxKey->Context;
1230 // Append leaf frame for a complete inference.
1231 Context.push_back(LeafProbe->getAddress());
1232 inferMissingFrames(Context, NewContext);
1233 // Pop out the leaf probe that was pushed in above.
1234 NewContext.pop_back();
1235 PContext = &NewContext;
1236 }
1237
1238 SampleContextFrameVector ContextStack;
1239 extractPrefixContextStack(ContextStack, *PContext, Binary);
1240
1241 // Explicitly copy the context for appending the leaf context
1242 SampleContextFrameVector NewContextStack(ContextStack.begin(),
1243 ContextStack.end());
1244 Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true);
1245 // For leaf inlined context with the top frame, we should strip off the top
1246 // frame's probe id, like:
1247 // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar"
1248 auto LeafFrame = NewContextStack.back();
1249 LeafFrame.Location = LineLocation(0, 0);
1250 NewContextStack.pop_back();
1251 // Compress the context string except for the leaf frame
1252 CSProfileGenerator::compressRecursionContext(NewContextStack);
1253 CSProfileGenerator::trimContext(NewContextStack);
1254 NewContextStack.push_back(LeafFrame);
1255
1256 const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid());
1257 bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite();
1258 ContextTrieNode *ContextNode =
1259 getOrCreateContextNode(NewContextStack, WasLeafInlined);
1260 ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash);
1261 return ContextNode;
1262 }
1263
getFunctionProfileForLeafProbe(const AddrBasedCtxKey * CtxKey,const MCDecodedPseudoProbe * LeafProbe)1264 FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe(
1265 const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1266 return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples();
1267 }
1268
1269 } // end namespace sampleprof
1270 } // end namespace llvm
1271