//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts ----------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
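// For illustration (a simplified sketch; the exact metadata format is
// produced by the memprof profile matcher, not defined here), an allocation
// call may carry !memprof and !callsite metadata describing its profiled
// contexts:
//
//   %call = call ptr @_Znam(i64 10), !memprof !0, !callsite !5
//   !0 = !{!1, !3}                ; one MIB node per profiled context
//   !1 = !{!2, !"notcold"}
//   !2 = !{i64 123, i64 456}      ; call stack ids for this context
//   !3 = !{!4, !"cold"}
//   !4 = !{i64 123, i64 789}
//   !5 = !{i64 123}
//
// Once cloning exposes the cold context, the allocation call in the cold
// clone is annotated (e.g. with a "memprof"="cold" attribute) for later
// lowering to a hot/cold operator new interface.
//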
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof-context-disambiguation"

STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
          "Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
          "Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
          "Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
          "Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
          "Number of profiled callees found via multiple tail call chains");

static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));
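
// For example (a hypothetical invocation), running opt with
// -memprof-export-to-dot -memprof-dot-file-path-prefix=/tmp/ writes dot
// files such as /tmp/ccg.<stage>.dot for each exported stage.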

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

namespace llvm {
// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));
} // namespace llvm

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
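///
/// For example (an illustrative sketch): two contexts reaching the same
/// allocation, say main -> foo -> new (cold) and main -> bar -> foo -> new
/// (not cold), get context ids 1 and 2. The nodes for foo and the allocation
/// then carry {1, 2}, while the edge from bar's node carries only {2}; these
/// ids are what allow cloning foo to separate out the cold context.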
template <typename DerivedCCG, typename FuncTy, typename CallTy>
class CallsiteContextGraph {
public:
  CallsiteContextGraph() = default;
  CallsiteContextGraph(const CallsiteContextGraph &) = default;
  CallsiteContextGraph(CallsiteContextGraph &&) = default;

  /// Main entry point to perform analysis and transformations on graph.
  bool process();

  /// Perform cloning on the graph necessary to uniquely identify the allocation
  /// behavior of an allocation based on its context.
  void identifyClones();

  /// Assign callsite clones to functions, cloning functions as needed to
  /// accommodate the combinations of their callsite clones reached by callers.
  /// For regular LTO this clones functions and callsites in the IR, but for
  /// ThinLTO the cloning decisions are noted in the summaries and later applied
  /// in applyImport.
  bool assignFunctions();

  void dump() const;
  void print(raw_ostream &OS) const;

  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const CallsiteContextGraph &CCG) {
    CCG.print(OS);
    return OS;
  }

  friend struct GraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
  friend struct DOTGraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;

  void exportToDot(std::string Label) const;

  /// Represents a function clone via FuncTy pointer and clone number pair.
  struct FuncInfo final
      : public std::pair<FuncTy *, unsigned /*Clone number*/> {
    using Base = std::pair<FuncTy *, unsigned>;
    FuncInfo(const Base &B) : Base(B) {}
    FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
    explicit operator bool() const { return this->first != nullptr; }
    FuncTy *func() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
  };

  /// Represents a callsite clone via CallTy and clone number pair.
  struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
    using Base = std::pair<CallTy, unsigned>;
    CallInfo(const Base &B) : Base(B) {}
    CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
        : Base(Call, CloneNo) {}
    explicit operator bool() const { return (bool)this->first; }
    CallTy call() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
    void setCloneNo(unsigned N) { this->second = N; }
    void print(raw_ostream &OS) const {
      if (!operator bool()) {
        assert(!cloneNo());
        OS << "null Call";
        return;
      }
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
    }
    void dump() const {
      print(dbgs());
      dbgs() << "\n";
    }
    friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
      Call.print(OS);
      return OS;
    }
  };

  struct ContextEdge;

  /// Node in the Callsite Context Graph
  struct ContextNode {
    // Keep this for now since in the IR case where we have an Instruction* it
    // is not as immediately discoverable. Used for printing richer information
    // when dumping graph.
    bool IsAllocation;

    // Keeps track of when the Call was reset to null because there was
    // recursion.
    bool Recursive = false;

    // The corresponding allocation or interior call.
    CallInfo Call;

    // For alloc nodes this is a unique id assigned when constructed, and for
    // callsite stack nodes it is the original stack id when the node is
    // constructed from the memprof MIB metadata on the alloc nodes. Note that
    // this is only used when matching callsite metadata onto the stack nodes
    // created when processing the allocation memprof MIBs, and for labeling
    // nodes in the dot graph. Therefore we don't bother to assign a value for
    // clones.
    uint64_t OrigStackOrAllocId = 0;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this node.
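    // For example (illustrative), a node included in both a cold and a
    // not-cold context ends up with
    // AllocTypes == ((uint8_t)AllocationType::NotCold |
    //                (uint8_t)AllocationType::Cold).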
    uint8_t AllocTypes = 0;

    // Edges to all callees in the profiled call stacks.
    // TODO: Should this be a map (from Callee node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;

    // Edges to all callers in the profiled call stacks.
    // TODO: Should this be a map (from Caller node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CallerEdges;

    // The set of IDs for contexts including this node.
    DenseSet<uint32_t> ContextIds;

    // List of clones of this ContextNode, initially empty.
    std::vector<ContextNode *> Clones;

    // If a clone, points to the original uncloned node.
    ContextNode *CloneOf = nullptr;

    ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}

    ContextNode(bool IsAllocation, CallInfo C)
        : IsAllocation(IsAllocation), Call(C) {}

    void addClone(ContextNode *Clone) {
      if (CloneOf) {
        CloneOf->Clones.push_back(Clone);
        Clone->CloneOf = CloneOf;
      } else {
        Clones.push_back(Clone);
        assert(!Clone->CloneOf);
        Clone->CloneOf = this;
      }
    }

    ContextNode *getOrigNode() {
      if (!CloneOf)
        return this;
      return CloneOf;
    }

    void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                               unsigned int ContextId);

    ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
    ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
    void eraseCalleeEdge(const ContextEdge *Edge);
    void eraseCallerEdge(const ContextEdge *Edge);

    void setCall(CallInfo C) { Call = C; }

    bool hasCall() const { return (bool)Call.call(); }

    void printCall(raw_ostream &OS) const { Call.print(OS); }

    // True if this node was effectively removed from the graph, in which case
    // its context id set, caller edges, and callee edges should all be empty.
    bool isRemoved() const {
      assert(ContextIds.empty() ==
             (CalleeEdges.empty() && CallerEdges.empty()));
      return ContextIds.empty();
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
      Node.print(OS);
      return OS;
    }
  };

  /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
  /// callee.
  struct ContextEdge {
    ContextNode *Callee;
    ContextNode *Caller;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this edge.
    uint8_t AllocTypes = 0;

    // The set of IDs for contexts including this edge.
    DenseSet<uint32_t> ContextIds;

    ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
                DenseSet<uint32_t> ContextIds)
        : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
          ContextIds(ContextIds) {}

    DenseSet<uint32_t> &getContextIds() { return ContextIds; }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
      Edge.print(OS);
      return OS;
    }
  };

  /// Helper to remove callee edges that have allocation type None (due to not
  /// carrying any context ids) after transformations.
  void removeNoneTypeCalleeEdges(ContextNode *Node);

protected:
  /// Get a list of nodes corresponding to the stack ids in the given callsite
  /// context.
  template <class NodeT, class IteratorT>
  std::vector<uint64_t>
  getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);

  /// Adds nodes for the given allocation and any stack ids on its memprof MIB
  /// metadata (or summary).
  ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);

  /// Adds nodes for the given MIB stack ids.
  template <class NodeT, class IteratorT>
  void addStackNodesForMIB(ContextNode *AllocNode,
                           CallStack<NodeT, IteratorT> &StackContext,
                           CallStack<NodeT, IteratorT> &CallsiteContext,
                           AllocationType AllocType);

  /// Matches all callsite metadata (or summary) to the nodes created for
  /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
  /// inlining performed on those callsite instructions.
  void updateStackNodes();

  /// Update graph to conservatively handle any callsite stack nodes that target
  /// multiple different callee target functions.
  void handleCallsitesWithMultipleTargets();

  /// Save lists of calls with MemProf metadata in each function, for faster
  /// iteration.
  MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;

  /// Map from callsite node to the enclosing caller function.
  std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;

private:
  using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;

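  // The tuple elements below are, in order: the call, the call's stack ids
  // that have context nodes, the enclosing function, and the context ids
  // eventually saved for any new node created for the call (initially empty;
  // see the structured bindings over these tuples further down).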
  using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
                                     const FuncTy *, DenseSet<uint32_t>>;

  /// Assigns the given Node to calls at or inlined into the location with
  /// the Node's stack id, after post order traversing and processing its
  /// caller nodes. Uses the call information recorded in the given
  /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
  /// as needed. Called by updateStackNodes which sets up the given
  /// StackIdToMatchingCalls map.
  void assignStackNodesPostOrder(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);

  /// Duplicates the given set of context ids, updating the provided
  /// map from each original id with the newly generated context ids,
  /// and returning the new duplicated id set.
  DenseSet<uint32_t> duplicateContextIds(
      const DenseSet<uint32_t> &StackSequenceContextIds,
      DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Propagates all duplicated context ids across the graph.
  void propagateDuplicateContextIds(
      const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
  /// else to its callers. Also updates OrigNode's edges to remove any context
  /// ids moved to the newly created edge.
  void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
                      bool TowardsCallee);

  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  uint64_t getStackId(uint64_t IdOrIndex) const {
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }

  /// Returns true if the given call targets the callee of the given edge, or if
  /// we were able to identify the call chain through intermediate tail calls.
  /// In the latter case new context nodes are added to the graph for the
  /// identified tail calls, and their synthesized nodes are added to
  /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the
  /// next element after the input position (either incremented or updated after
  /// removing the old edge).
  bool
  calleesMatch(CallTy Call, EdgeIter &EI,
               MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);

  /// Returns true if the given call targets the given function, or if we were
  /// able to identify the call chain through intermediate tail calls (in which
  /// case FoundCalleeChain will be populated).
  bool calleeMatchesFunc(
      CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
      std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
        Call, Func, CallerFunc, FoundCalleeChain);
  }

  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }

  /// Get the last stack id in the context for callsite.
  uint64_t getLastStackId(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }

  /// Update the allocation call to record type of allocated memory.
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
  }

  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }

  /// Clone the given function for the given callsite, recording the mapping of
  /// all of the function's tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }

  /// Gets a label to use in the dot graph for the given call clone in the given
  /// function.
  std::string getLabel(const FuncTy *Func, const CallTy Call,
                       unsigned CloneNo) const {
    return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
  }

  /// Helpers to find the node corresponding to the given call or stackid.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Removes the node information recorded for the given call.
  void unsetNodeForInst(const CallInfo &C);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids);

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           EdgeIter *CallerEdgeI = nullptr);

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     EdgeIter *CallerEdgeI = nullptr,
                                     bool NewClone = false);

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context.
  void identifyClones(ContextNode *Node,
                      DenseSet<const ContextNode *> &Visited);

  /// Map from each context ID to the AllocationType assigned to that context.
  std::map<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
};

template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;

/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  const Module &Mod;
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};

/// Represents a call in the summary index graph, which can either be an
/// allocation or an interior callsite node in an allocation's context.
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
  IndexCall() : PointerUnion() {}
  IndexCall(std::nullptr_t) : IndexCall() {}
  IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
  IndexCall(PointerUnion PT) : PointerUnion(PT) {}

  IndexCall *operator->() { return this; }

  PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }

  void print(raw_ostream &OS) const {
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
      assert(CI);
      OS << *CI;
    }
  }
};

/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                                  IndexCall> {
public:
  IndexCallsiteContextGraph(
      ModuleSummaryIndex &Index,
      llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
          isPrevailing);

  ~IndexCallsiteContextGraph() {
    // Now that we are done with the graph it is safe to add the new
    // CallsiteInfo structs to the function summary vectors. The graph nodes
    // point into locations within these vectors, so we don't want to add them
    // any earlier.
    for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
      auto *FS = I.first;
      for (auto &Callsite : I.second)
        FS->addCallsite(*Callsite.second);
    }
  }

private:
  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                              IndexCall>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  bool calleeMatchesFunc(
      IndexCall &Call, const FunctionSummary *Func,
      const FunctionSummary *CallerFunc,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
  bool findProfiledCalleeThroughTailCalls(
      ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(IndexCall &Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                       IndexCall>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                       unsigned CloneNo) const;

  // Saves the mapping from each function summary containing memprof records
  // back to its VI, for use in checking and debugging.
  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;

  const ModuleSummaryIndex &Index;
  llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
      isPrevailing;

  // Saves/owns the callsite info structures synthesized for missing tail call
  // frames that we discover while building the graph.
  // It maps from the summary of the function making the tail call, to a map
  // of callee ValueInfo to corresponding synthesized callsite info.
  std::unordered_map<FunctionSummary *,
                     std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
      FunctionCalleesToSynthesizedCallsiteInfos;
};
} // namespace

namespace llvm {
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm

namespace {

struct FieldSeparator {
  bool Skip = true;
  const char *Sep;

  FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
};

raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
  if (FS.Skip) {
    FS.Skip = false;
    return OS;
  }
  return OS << FS.Sep;
}
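
// For example (illustrative usage), FieldSeparator prints a separator before
// every field except the first:
//   FieldSeparator FS;
//   for (auto Id : Ids)
//     OS << FS << Id; // produces "1, 2, 3" rather than ", 1, 2, 3"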

// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
// type we should actually use on the corresponding allocation.
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
// from NotCold.
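// For example (a sketch of the resulting mapping):
//   allocTypeToUse(Cold)           == Cold
//   allocTypeToUse(NotCold)        == NotCold
//   allocTypeToUse(NotCold | Cold) == NotCold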
AllocationType allocTypeToUse(uint8_t AllocTypes) {
  assert(AllocTypes != (uint8_t)AllocationType::None);
  if (AllocTypes ==
      ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
    return AllocationType::NotCold;
  else
    return (AllocationType)AllocTypes;
}

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for all edges in the Edges
// vector.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatch(
    const std::vector<uint8_t> &InAllocTypes,
    const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
        &Edges) {
  return std::equal(
      InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
      [](const uint8_t &l,
         const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
        // Can share if one of the edges is None type - don't
        // care about the type along that edge as it doesn't
        // exist for those context ids.
        if (l == (uint8_t)AllocationType::None ||
            r->AllocTypes == (uint8_t)AllocationType::None)
          return true;
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
      });
}

} // end anonymous namespace

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
    const CallInfo &C) {
  ContextNode *Node = getNodeForAlloc(C);
  if (Node)
    return Node;

  return NonAllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
    const CallInfo &C) {
  return AllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
    uint64_t StackId) {
  auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
  if (StackEntryNode != StackEntryIdToContextNodeMap.end())
    return StackEntryNode->second;
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
    const CallInfo &C) {
  AllocationCallToContextNodeMap.erase(C) ||
      NonAllocationCallToContextNodeMap.erase(C);
  assert(!AllocationCallToContextNodeMap.count(C) &&
         !NonAllocationCallToContextNodeMap.count(C));
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                          unsigned int ContextId) {
  for (auto &Edge : CallerEdges) {
    if (Edge->Caller == Caller) {
      Edge->AllocTypes |= (uint8_t)AllocType;
      Edge->getContextIds().insert(ContextId);
      return;
    }
  }
  std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
      this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
  CallerEdges.push_back(Edge);
  Caller->CalleeEdges.push_back(Edge);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
  for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
    auto Edge = *EI;
    if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
      assert(Edge->ContextIds.empty());
      Edge->Callee->eraseCallerEdge(Edge.get());
      EI = Node->CalleeEdges.erase(EI);
    } else
      ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCallee(const ContextNode *Callee) {
  for (const auto &Edge : CalleeEdges)
    if (Edge->Callee == Callee)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCaller(const ContextNode *Caller) {
  for (const auto &Edge : CallerEdges)
    if (Edge->Caller == Caller)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCalleeEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
        return CalleeEdge.get() == Edge;
      });
  assert(EI != CalleeEdges.end());
  CalleeEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCallerEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
        return CallerEdge.get() == Edge;
      });
  assert(EI != CallerEdges.end());
  CallerEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
    DenseSet<uint32_t> &ContextIds) {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : ContextIds) {
    AllocType |= (uint8_t)ContextIdToAllocationType[Id];
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : Node1Ids) {
    if (!Node2Ids.count(Id))
      continue;
    AllocType |= (uint8_t)ContextIdToAllocationType[Id];
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  if (Node1Ids.size() < Node2Ids.size())
    return intersectAllocTypesImpl(Node1Ids, Node2Ids);
  else
    return intersectAllocTypesImpl(Node2Ids, Node1Ids);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
    CallInfo Call, const FuncTy *F) {
  assert(!getNodeForAlloc(Call));
  NodeOwner.push_back(
      std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
  ContextNode *AllocNode = NodeOwner.back().get();
  AllocationCallToContextNodeMap[Call] = AllocNode;
  NodeToCallingFunc[AllocNode] = F;
  // Use LastContextId as a unique id for MIB allocation nodes.
  AllocNode->OrigStackOrAllocId = LastContextId;
  // Alloc type should be updated as we add in the MIBs. We should assert
  // afterwards that it is not still None.
  AllocNode->AllocTypes = (uint8_t)AllocationType::None;

  return AllocNode;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
    ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
    CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
  // Treat the hot alloc type as NotCold until the disambiguation for "hot"
  // is done.
  if (AllocType == AllocationType::Hot)
    AllocType = AllocationType::NotCold;

  ContextIdToAllocationType[++LastContextId] = AllocType;

  // Update alloc type and context ids for this MIB.
  AllocNode->AllocTypes |= (uint8_t)AllocType;
  AllocNode->ContextIds.insert(LastContextId);

  // Now add or update nodes for each stack id in alloc's context.
  // Later when processing the stack ids on non-alloc callsites we will adjust
  // for any inlining in the context.
  ContextNode *PrevNode = AllocNode;
  // Look for recursion (direct recursion should have been collapsed by
  // module summary analysis, here we should just be detecting mutual
  // recursion). Mark these nodes so we don't try to clone.
  SmallSet<uint64_t, 8> StackIdSet;
  // Skip any on the allocation call (inlining).
  for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
       ContextIter != StackContext.end(); ++ContextIter) {
    auto StackId = getStackId(*ContextIter);
    ContextNode *StackNode = getNodeForStackId(StackId);
    if (!StackNode) {
      NodeOwner.push_back(
          std::make_unique<ContextNode>(/*IsAllocation=*/false));
      StackNode = NodeOwner.back().get();
      StackEntryIdToContextNodeMap[StackId] = StackNode;
      StackNode->OrigStackOrAllocId = StackId;
    }
    auto Ins = StackIdSet.insert(StackId);
    if (!Ins.second)
      StackNode->Recursive = true;
    StackNode->ContextIds.insert(LastContextId);
    StackNode->AllocTypes |= (uint8_t)AllocType;
    PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
    PrevNode = StackNode;
  }
}
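
// For example (an illustrative sketch of the above): an MIB with stack ids
// [A, B] (A innermost) for a cold context yields the caller chain
// Alloc <- A <- B, with the new context id inserted into each node and edge
// along that chain, and Cold ORed into each of their AllocTypes.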

template <typename DerivedCCG, typename FuncTy, typename CallTy>
DenseSet<uint32_t>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
    const DenseSet<uint32_t> &StackSequenceContextIds,
    DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  DenseSet<uint32_t> NewContextIds;
  for (auto OldId : StackSequenceContextIds) {
    NewContextIds.insert(++LastContextId);
    OldToNewContextIds[OldId].insert(LastContextId);
    assert(ContextIdToAllocationType.count(OldId));
    // The new context has the same allocation type as original.
    ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
  }
  return NewContextIds;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    propagateDuplicateContextIds(
        const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  // Build a set of duplicated context ids corresponding to the input id set.
  auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
    DenseSet<uint32_t> NewIds;
    for (auto Id : ContextIds)
      if (auto NewId = OldToNewContextIds.find(Id);
          NewId != OldToNewContextIds.end())
        NewIds.insert(NewId->second.begin(), NewId->second.end());
    return NewIds;
  };

  // Recursively update context ids sets along caller edges.
  auto UpdateCallers = [&](ContextNode *Node,
                           DenseSet<const ContextEdge *> &Visited,
                           auto &&UpdateCallers) -> void {
    for (const auto &Edge : Node->CallerEdges) {
      auto Inserted = Visited.insert(Edge.get());
      if (!Inserted.second)
        continue;
      ContextNode *NextNode = Edge->Caller;
      DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
      // Only need to recursively iterate to NextNode via this caller edge if
      // it resulted in any added ids to NextNode.
      if (!NewIdsToAdd.empty()) {
        Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
        NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
        UpdateCallers(NextNode, Visited, UpdateCallers);
      }
    }
  };

  DenseSet<const ContextEdge *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    // Update ids on the allocation nodes before calling the recursive
    // update along caller edges, since this simplifies the logic during
    // that traversal.
    DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
    Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
    UpdateCallers(Node, Visited, UpdateCallers);
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
    ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
  // Make a copy of the context ids, since this will be adjusted below as they
  // are moved.
  DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
  auto &OrigEdges =
      TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
  // Increment iterator in loop so that we can remove edges as needed.
  for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
    auto Edge = *EI;
    // Remove any matching context ids from Edge and collect the set that was
    // found and removed; these become the new edge's context ids. Also update
    // the remaining (not found) ids.
    DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
    set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
                 NotFoundContextIds);
    RemainingContextIds.swap(NotFoundContextIds);
    // If no matching context ids for this edge, skip it.
    if (NewEdgeContextIds.empty()) {
      ++EI;
      continue;
    }
    if (TowardsCallee) {
      auto NewEdge = std::make_shared<ContextEdge>(
          Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
          NewEdgeContextIds);
      NewNode->CalleeEdges.push_back(NewEdge);
      NewEdge->Callee->CallerEdges.push_back(NewEdge);
    } else {
      auto NewEdge = std::make_shared<ContextEdge>(
          NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
          NewEdgeContextIds);
      NewNode->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
    // Remove old edge if context ids empty.
    if (Edge->getContextIds().empty()) {
      if (TowardsCallee) {
        Edge->Callee->eraseCallerEdge(Edge.get());
        EI = OrigNode->CalleeEdges.erase(EI);
      } else {
        Edge->Caller->eraseCalleeEdge(Edge.get());
        EI = OrigNode->CallerEdges.erase(EI);
      }
      continue;
    }
    ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    assignStackNodesPostOrder(ContextNode *Node,
                              DenseSet<const ContextNode *> &Visited,
                              DenseMap<uint64_t, std::vector<CallContextInfo>>
                                  &StackIdToMatchingCalls) {
  auto Inserted = Visited.insert(Node);
  if (!Inserted.second)
    return;
  // Post order traversal. Iterate over a copy since we may add nodes and
  // therefore new callers during the recursive call, invalidating any
  // iterator over the original edge vector. We don't need to process these
  // new nodes as they were already processed on creation.
  auto CallerEdges = Node->CallerEdges;
  for (auto &Edge : CallerEdges) {
    // Skip any that have been removed during the recursion.
    if (!Edge)
      continue;
    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
  }

  // If this node's stack id is in the map, update the graph to contain new
  // nodes representing any inlining at interior callsites. Note we move the
  // associated context ids over to the new nodes.

  // Ignore this node if it is for an allocation or we didn't record any
  // stack id lists ending at it.
  if (Node->IsAllocation ||
      !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
    return;

  auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
  // Handle the simple case first. A single call with a single stack id.
  // In this case there is no need to create any new context nodes, simply
  // assign the context node for stack id to this Call.
  if (Calls.size() == 1) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
    if (Ids.size() == 1) {
      assert(SavedContextIds.empty());
      // It should be this Node
      assert(Node == getNodeForStackId(Ids[0]));
      if (Node->Recursive)
        return;
      Node->setCall(Call);
      NonAllocationCallToContextNodeMap[Call] = Node;
      NodeToCallingFunc[Node] = Func;
      return;
    }
  }

  // Find the node for the last stack id, which should be the same
  // across all calls recorded for this id, and is this node's id.
  uint64_t LastId = Node->OrigStackOrAllocId;
  ContextNode *LastNode = getNodeForStackId(LastId);
  // We should only have kept stack ids that had nodes.
  assert(LastNode);

  for (unsigned I = 0; I < Calls.size(); I++) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
    // Skip any for which we didn't assign any ids, these don't get a node in
    // the graph.
    if (SavedContextIds.empty())
      continue;

    assert(LastId == Ids.back());

    ContextNode *FirstNode = getNodeForStackId(Ids[0]);
    assert(FirstNode);

    // Recompute the context ids for this stack id sequence (the
    // intersection of the context ids of the corresponding nodes).
    // Start with the ids we saved in the map for this call, which could be
    // duplicated context ids. We have to recompute as we might have overlap
    // between the saved context ids for different last nodes, and removed
    // them already during the post order traversal.
    set_intersect(SavedContextIds, FirstNode->ContextIds);
    ContextNode *PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes and weren't
      // recursive.
      assert(CurNode);
      assert(!CurNode->Recursive);
      if (!PrevNode) {
        PrevNode = CurNode;
        continue;
      }
      auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
      if (!Edge) {
        SavedContextIds.clear();
        break;
      }
      PrevNode = CurNode;
      set_intersect(SavedContextIds, Edge->getContextIds());

      // If we now have no context ids for clone, skip this call.
      if (SavedContextIds.empty())
        break;
    }
    if (SavedContextIds.empty())
      continue;

    // Create new context node.
    NodeOwner.push_back(
        std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
    ContextNode *NewNode = NodeOwner.back().get();
    NodeToCallingFunc[NewNode] = Func;
    NonAllocationCallToContextNodeMap[Call] = NewNode;
    NewNode->ContextIds = SavedContextIds;
    NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);

    // Connect to callees of innermost stack frame in inlined call chain.
    // This updates context ids for FirstNode's callees to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);

    // Connect to callers of outermost stack frame in inlined call chain.
    // This updates context ids for LastNode's callers to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);

    // Now we need to remove context ids from edges/nodes between First and
    // Last Node.
    PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes.
      assert(CurNode);

      // Remove the context ids moved to NewNode from CurNode, and the
      // edge from the prior node.
      set_subtract(CurNode->ContextIds, NewNode->ContextIds);
      if (PrevNode) {
        auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
        assert(PrevEdge);
        set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
        if (PrevEdge->getContextIds().empty()) {
          PrevNode->eraseCallerEdge(PrevEdge);
          CurNode->eraseCalleeEdge(PrevEdge);
        }
      }
      PrevNode = CurNode;
    }
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
  // Map of stack id to all calls with that as the last (outermost caller)
  // callsite id that has a context node (some might not due to pruning
  // performed during matching of the allocation profile contexts).
  // The CallContextInfo contains the Call and a list of its stack ids with
  // ContextNodes, the function containing Call, and the set of context ids
  // the analysis will eventually identify for use in any new node created
  // for that callsite.
  DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
  for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
    for (auto &Call : CallsWithMetadata) {
      // Ignore allocations, already handled.
      if (AllocationCallToContextNodeMap.count(Call))
        continue;
      auto StackIdsWithContextNodes =
          getStackIdsWithContextNodesForCall(Call.call());
      // If there were no nodes created for MIBs on allocs (maybe this was in
      // the unambiguous part of the MIB stack that was pruned), ignore.
      if (StackIdsWithContextNodes.empty())
        continue;
      // Otherwise, record this Call along with the list of ids for the last
      // (outermost caller) stack id with a node.
      StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
          {Call.call(), StackIdsWithContextNodes, Func, {}});
    }
  }
1275
1276 // First make a pass through all stack ids that correspond to a call,
1277 // as identified in the above loop. Compute the context ids corresponding to
1278 // each of these calls when they correspond to multiple stack ids due to
1279 // due to inlining. Perform any duplication of context ids required when
1280 // there is more than one call with the same stack ids. Their (possibly newly
1281 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1282 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1283 for (auto &It : StackIdToMatchingCalls) {
1284 auto &Calls = It.getSecond();
1285 // Skip single calls with a single stack id. These don't need a new node.
1286 if (Calls.size() == 1) {
1287 auto &Ids = std::get<1>(Calls[0]);
1288 if (Ids.size() == 1)
1289 continue;
1290 }
1291 // In order to do the best and maximal matching of inlined calls to context
1292 // node sequences we will sort the vectors of stack ids in descending order
1293 // of length, and within each length, lexicographically by stack id. The
1294 // latter is so that we can specially handle calls that have identical stack
1295 // id sequences (either due to cloning or artificially because of the MIB
1296 // context pruning).
1297 std::stable_sort(Calls.begin(), Calls.end(),
1298 [](const CallContextInfo &A, const CallContextInfo &B) {
1299 auto &IdsA = std::get<1>(A);
1300 auto &IdsB = std::get<1>(B);
1301 return IdsA.size() > IdsB.size() ||
1302 (IdsA.size() == IdsB.size() && IdsA < IdsB);
1303 });
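
    // Illustrative example (hypothetical stack ids): three calls whose
    // sequences are {1, 2, 3}, {1, 2} and {1, 2, 3} sort to
    //   {1, 2, 3}, {1, 2, 3}, {1, 2}
    // so longer inlined sequences are matched first, and identical sequences
    // end up adjacent, letting the duplicate handling below detect them by
    // comparing neighboring entries.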

    // Find the node for the last stack id, which should be the same
    // across all calls recorded for this id, and is the id for this
    // entry in the StackIdToMatchingCalls map.
    uint64_t LastId = It.getFirst();
    ContextNode *LastNode = getNodeForStackId(LastId);
    // We should only have kept stack ids that had nodes.
    assert(LastNode);

    if (LastNode->Recursive)
      continue;

    // Initialize the context ids with the last node's. We will subsequently
    // refine the context ids by computing the intersection along all edges.
    DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
    assert(!LastNodeContextIds.empty());

    for (unsigned I = 0; I < Calls.size(); I++) {
      auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
      assert(SavedContextIds.empty());
      assert(LastId == Ids.back());

      // First compute the context ids for this stack id sequence (the
      // intersection of the context ids of the corresponding nodes).
      // Start with the remaining saved ids for the last node.
      assert(!LastNodeContextIds.empty());
      DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;

      ContextNode *PrevNode = LastNode;
      ContextNode *CurNode = LastNode;
      bool Skip = false;

      // Iterate backwards through the stack Ids, starting after the last Id
      // in the list, which was handled once outside for all Calls.
      for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
        auto Id = *IdIter;
        CurNode = getNodeForStackId(Id);
        // We should only have kept stack ids that had nodes.
        assert(CurNode);

        if (CurNode->Recursive) {
          Skip = true;
          break;
        }

        auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
        // If there is no edge then the nodes belong to different MIB contexts,
        // and we should skip this inlined context sequence. For example, this
        // particular inlined context may include stack ids A->B, and we may
        // indeed have nodes for both A and B, but it is possible that they
        // were never profiled in sequence in a single MIB for any allocation
        // (i.e. we might have profiled an allocation that involves the
        // callsite A, but through a different one of its callee callsites,
        // and we might have profiled an allocation that involves callsite B,
        // but reached from a different caller callsite).
        if (!Edge) {
          Skip = true;
          break;
        }
        PrevNode = CurNode;

        // Update the context ids, which is the intersection of the ids along
        // all edges in the sequence.
        set_intersect(StackSequenceContextIds, Edge->getContextIds());

        // If we now have no context ids for the clone, skip this call.
        if (StackSequenceContextIds.empty()) {
          Skip = true;
          break;
        }
      }
      if (Skip)
        continue;

      // If some of this call's stack ids did not have corresponding nodes (due
      // to pruning), don't include any context ids for contexts that extend
      // beyond these nodes. Otherwise we would be matching part of unrelated /
      // not fully matching stack contexts. To do this, subtract any context
      // ids found in caller nodes of the last node found above.
      if (Ids.back() != getLastStackId(Call)) {
        for (const auto &PE : CurNode->CallerEdges) {
          set_subtract(StackSequenceContextIds, PE->getContextIds());
          if (StackSequenceContextIds.empty())
            break;
        }
        // If we now have no context ids for the clone, skip this call.
        if (StackSequenceContextIds.empty())
          continue;
      }

      // Check if the next set of stack ids is the same (since the Calls vector
      // of tuples is sorted by the stack ids we can just look at the next one).
      bool DuplicateContextIds = false;
      if (I + 1 < Calls.size()) {
        auto NextIds = std::get<1>(Calls[I + 1]);
        DuplicateContextIds = Ids == NextIds;
      }
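
      // For example, two distinct callsites can end up with identical pruned
      // stack id sequences (see the sorting comment above); both land here
      // with equal Ids, and the later one must get its own duplicated set of
      // context ids so the resulting new nodes do not share contexts.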

      // If we don't have duplicate context ids, then we can assign all the
      // context ids computed for the original node sequence to this call.
      // If there are duplicate calls with the same stack ids then we synthesize
      // new context ids that are duplicates of the originals. These are
      // assigned to SavedContextIds, which is a reference into the map entry
      // for this call, allowing us to access these ids later on.
      OldToNewContextIds.reserve(OldToNewContextIds.size() +
                                 StackSequenceContextIds.size());
      SavedContextIds =
          DuplicateContextIds
              ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
              : StackSequenceContextIds;
      assert(!SavedContextIds.empty());

      if (!DuplicateContextIds) {
        // Update saved last node's context ids to remove those that are
        // assigned to other calls, so that it is ready for the next call at
        // this stack id.
        set_subtract(LastNodeContextIds, StackSequenceContextIds);
        if (LastNodeContextIds.empty())
          break;
      }
    }
  }

  // Propagate the duplicate context ids over the graph.
  propagateDuplicateContextIds(OldToNewContextIds);

  if (VerifyCCG)
    check();

  // Now perform a post-order traversal over the graph, starting with the
  // allocation nodes, essentially processing nodes from callers to callees.
  // For any that contains an id in the map, update the graph to contain new
  // nodes representing any inlining at interior callsites. Note we move the
  // associated context ids over to the new nodes.
  DenseSet<const ContextNode *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap)
    assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
}

uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
  CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
      Call->getMetadata(LLVMContext::MD_callsite));
  return CallsiteContext.back();
}

uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
  assert(isa<CallsiteInfo *>(Call.getBase()));
  CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
  // Need to convert index into stack id.
  return Index.getStackIdAtIndex(CallsiteContext.back());
}

static const std::string MemProfCloneSuffix = ".memprof.";

static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
  // We use CloneNo == 0 to refer to the original version, which doesn't get
  // renamed with a suffix.
  if (!CloneNo)
    return Base.str();
  return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
}
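
// For example, getMemProfFuncName("foo", 0) returns "foo", while
// getMemProfFuncName("foo", 2) returns "foo.memprof.2".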

std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
                                                 const Instruction *Call,
                                                 unsigned CloneNo) const {
  return (Twine(Call->getFunction()->getName()) + " -> " +
          cast<CallBase>(Call)->getCalledFunction()->getName())
      .str();
}

std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
                                                const IndexCall &Call,
                                                unsigned CloneNo) const {
  auto VI = FSToVIMap.find(Func);
  assert(VI != FSToVIMap.end());
  if (isa<AllocInfo *>(Call.getBase()))
    return (VI->second.name() + " -> alloc").str();
  else {
    auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
    return (VI->second.name() + " -> " +
            getMemProfFuncName(Callsite->Callee.name(),
                               Callsite->Clones[CloneNo]))
        .str();
  }
}

std::vector<uint64_t>
ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
    Instruction *Call) {
  CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
      Call->getMetadata(LLVMContext::MD_callsite));
  return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
      CallsiteContext);
}

std::vector<uint64_t>
IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
  assert(isa<CallsiteInfo *>(Call.getBase()));
  CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
  return getStackIdsWithContextNodes<CallsiteInfo,
                                     SmallVector<unsigned>::const_iterator>(
      CallsiteContext);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
std::vector<uint64_t>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
    CallStack<NodeT, IteratorT> &CallsiteContext) {
  std::vector<uint64_t> StackIds;
  for (auto IdOrIndex : CallsiteContext) {
    auto StackId = getStackId(IdOrIndex);
    ContextNode *Node = getNodeForStackId(StackId);
    if (!Node)
      break;
    StackIds.push_back(StackId);
  }
  return StackIds;
}
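
// For example (hypothetical ids), if a callsite context is [5, 6, 7] from
// leaf to outermost caller and only ids 5 and 6 have context nodes, the
// returned vector is [5, 6]: the walk stops at the first id without a node,
// keeping only the contiguous leaf-anchored prefix that can be matched.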

ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
    Module &M,
    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
    : Mod(M), OREGetter(OREGetter) {
  for (auto &F : M) {
    std::vector<CallInfo> CallsWithMetadata;
    for (auto &BB : F) {
      for (auto &I : BB) {
        if (!isa<CallBase>(I))
          continue;
        if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
          CallsWithMetadata.push_back(&I);
          auto *AllocNode = addAllocNode(&I, &F);
          auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
          assert(CallsiteMD);
          CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
          // Add all of the MIBs and their stack nodes.
          for (auto &MDOp : MemProfMD->operands()) {
            auto *MIBMD = cast<const MDNode>(MDOp);
            MDNode *StackNode = getMIBStackNode(MIBMD);
            assert(StackNode);
            CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
            addStackNodesForMIB<MDNode, MDNode::op_iterator>(
                AllocNode, StackContext, CallsiteContext,
                getMIBAllocType(MIBMD));
          }
          assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
          // Memprof and callsite metadata on memory allocations no longer
          // needed.
          I.setMetadata(LLVMContext::MD_memprof, nullptr);
          I.setMetadata(LLVMContext::MD_callsite, nullptr);
        }
        // For callsite metadata, add to list for this function for later use.
        else if (I.getMetadata(LLVMContext::MD_callsite))
          CallsWithMetadata.push_back(&I);
      }
    }
    if (!CallsWithMetadata.empty())
      FuncToCallsWithMetadata[&F] = CallsWithMetadata;
  }

  if (DumpCCG) {
    dbgs() << "CCG before updating call stack chains:\n";
    dbgs() << *this;
  }

  if (ExportToDot)
    exportToDot("prestackupdate");

  updateStackNodes();

  handleCallsitesWithMultipleTargets();

  // Strip off remaining callsite metadata, no longer needed.
  for (auto &FuncEntry : FuncToCallsWithMetadata)
    for (auto &Call : FuncEntry.second)
      Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
}

IndexCallsiteContextGraph::IndexCallsiteContextGraph(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing)
    : Index(Index), isPrevailing(isPrevailing) {
  for (auto &I : Index) {
    auto VI = Index.getValueInfo(I);
    for (auto &S : VI.getSummaryList()) {
      // We should only add the prevailing nodes. Otherwise we may try to clone
      // in a weak copy that won't be linked (and may be different from the
      // prevailing version).
      // We only keep the memprof summary on the prevailing copy when building
      // the combined index, as a space optimization; however, don't rely on
      // this optimization. The linker doesn't resolve local linkage values, so
      // don't check whether those are prevailing.
      if (!GlobalValue::isLocalLinkage(S->linkage()) &&
          !isPrevailing(VI.getGUID(), S.get()))
        continue;
      auto *FS = dyn_cast<FunctionSummary>(S.get());
      if (!FS)
        continue;
      std::vector<CallInfo> CallsWithMetadata;
      if (!FS->allocs().empty()) {
        for (auto &AN : FS->mutableAllocs()) {
          // This can happen because of recursion elimination handling that
          // currently exists in ModuleSummaryAnalysis. Skip these for now.
          // We still added them to the summary because we need to be able to
          // correlate properly in applyImport in the backends.
          if (AN.MIBs.empty())
            continue;
          CallsWithMetadata.push_back({&AN});
          auto *AllocNode = addAllocNode({&AN}, FS);
          // Pass an empty CallStack to the CallsiteContext (second)
          // parameter, since for ThinLTO we already collapsed out the inlined
          // stack ids on the allocation call during ModuleSummaryAnalysis.
          CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
              EmptyContext;
          // Now add all of the MIBs and their stack nodes.
          for (auto &MIB : AN.MIBs) {
            CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
                StackContext(&MIB);
            addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
                AllocNode, StackContext, EmptyContext, MIB.AllocType);
          }
          assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
          // Initialize version 0 on the summary alloc node to the current
          // alloc type, unless it has both types, in which case make it
          // default, so that if we aren't able to clone, the original version
          // always ends up with the default allocation behavior.
          AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
        }
      }
      // For callsite metadata, add to list for this function for later use.
      if (!FS->callsites().empty())
        for (auto &SN : FS->mutableCallsites())
          CallsWithMetadata.push_back({&SN});

      if (!CallsWithMetadata.empty())
        FuncToCallsWithMetadata[FS] = CallsWithMetadata;

      if (!FS->allocs().empty() || !FS->callsites().empty())
        FSToVIMap[FS] = VI;
    }
  }

  if (DumpCCG) {
    dbgs() << "CCG before updating call stack chains:\n";
    dbgs() << *this;
  }

  if (ExportToDot)
    exportToDot("prestackupdate");

  updateStackNodes();

  handleCallsitesWithMultipleTargets();
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy,
                          CallTy>::handleCallsitesWithMultipleTargets() {
  // Look for and work around callsites that call multiple functions.
  // This can happen for indirect calls, which need better handling, and in
  // rarer cases (e.g. macro expansion).
  // TODO: To fix this for indirect calls we will want to perform speculative
  // devirtualization using either the normal PGO info with ICP, or using the
  // information in the profiled MemProf contexts. We can do this prior to
  // this transformation for regular LTO, and for ThinLTO we can simulate that
  // effect in the summary and perform the actual speculative devirtualization
  // while cloning in the ThinLTO backend.

  // Keep track of the new nodes synthesized for discovered tail calls missing
  // from the profiled contexts.
  MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;

  for (auto Entry = NonAllocationCallToContextNodeMap.begin();
       Entry != NonAllocationCallToContextNodeMap.end();) {
    auto *Node = Entry->second;
    assert(Node->Clones.empty());
    // Check all node callees and see if they are in the same function.
    bool Removed = false;
    auto Call = Node->Call.call();
    for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
      auto Edge = *EI;
      if (!Edge->Callee->hasCall()) {
        ++EI;
        continue;
      }
      assert(NodeToCallingFunc.count(Edge->Callee));
      // Check if the called function matches that of the callee node.
      if (calleesMatch(Call, EI, TailCallToContextNodeMap))
        continue;
      RemovedEdgesWithMismatchedCallees++;
      // Work around by setting Node to have a null call, so it gets
      // skipped during cloning. Otherwise assignFunctions will assert
      // because its data structures are not designed to handle this case.
      Entry = NonAllocationCallToContextNodeMap.erase(Entry);
      Node->setCall(CallInfo());
      Removed = true;
      break;
    }
    if (!Removed)
      Entry++;
  }

  // Add the new nodes after the above loop so that the iteration is not
  // invalidated.
  for (auto &[Call, Node] : TailCallToContextNodeMap)
    NonAllocationCallToContextNodeMap[Call] = Node;
}

uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Module (IR) case this is already the Id.
  return IdOrIndex;
}

uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  // In the Index case this is an index into the stack id list in the summary
  // index; convert it to an Id.
  return Index.getStackIdAtIndex(IdOrIndex);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
    CallTy Call, EdgeIter &EI,
    MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
  auto Edge = *EI;
  const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
  const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
  // Will be populated in order of callee to caller if we find a chain of tail
  // calls between the profiled caller and callee.
  std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
  if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
                         FoundCalleeChain)) {
    ++EI;
    return false;
  }

  // The usual case where the profiled callee matches that of the IR/summary.
  if (FoundCalleeChain.empty()) {
    ++EI;
    return true;
  }

  auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
    auto *CurEdge = Callee->findEdgeFromCaller(Caller);
    // If there is already an edge between these nodes, simply update it and
    // return.
    if (CurEdge) {
      CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
                                 Edge->ContextIds.end());
      CurEdge->AllocTypes |= Edge->AllocTypes;
      return;
    }
    // Otherwise, create a new edge and insert it into the caller and callee
    // lists.
    auto NewEdge = std::make_shared<ContextEdge>(
        Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
    Callee->CallerEdges.push_back(NewEdge);
    if (Caller == Edge->Caller) {
      // If we are inserting the new edge into the current edge's caller,
      // insert the new edge before the current iterator position, and then
      // increment back to the current edge.
      EI = Caller->CalleeEdges.insert(EI, NewEdge);
      ++EI;
      assert(*EI == Edge &&
             "Iterator position not restored after insert and increment");
    } else
      Caller->CalleeEdges.push_back(NewEdge);
  };

  // Create new nodes for each found callee and connect in between the profiled
  // caller and callee.
  auto *CurCalleeNode = Edge->Callee;
  for (auto &[NewCall, Func] : FoundCalleeChain) {
    ContextNode *NewNode = nullptr;
    // First check if we have already synthesized a node for this tail call.
    if (TailCallToContextNodeMap.count(NewCall)) {
      NewNode = TailCallToContextNodeMap[NewCall];
      NewNode->ContextIds.insert(Edge->ContextIds.begin(),
                                 Edge->ContextIds.end());
      NewNode->AllocTypes |= Edge->AllocTypes;
    } else {
      FuncToCallsWithMetadata[Func].push_back({NewCall});
      // Create Node and record node info.
      NodeOwner.push_back(
          std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
      NewNode = NodeOwner.back().get();
      NodeToCallingFunc[NewNode] = Func;
      TailCallToContextNodeMap[NewCall] = NewNode;
      NewNode->ContextIds = Edge->ContextIds;
      NewNode->AllocTypes = Edge->AllocTypes;
    }

    // Hook up node to its callee node.
    AddEdge(NewNode, CurCalleeNode);

    CurCalleeNode = NewNode;
  }

  // Hook up edge's original caller to new callee node.
  AddEdge(Edge->Caller, CurCalleeNode);

  // Remove the old edge.
  Edge->Callee->eraseCallerEdge(Edge.get());
  EI = Edge->Caller->CalleeEdges.erase(EI);

  return true;
}

bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
    std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
    FoundCalleeChain.push_back({Callsite, F});
  };

  auto *CalleeFunc = dyn_cast<Function>(CurCallee);
  if (!CalleeFunc) {
    auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
    assert(Alias);
    CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
    assert(CalleeFunc);
  }

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
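  // Illustrative example (hypothetical functions): suppose the profiled
  // context says the caller reached C, but in the IR the caller actually
  // invokes B, and B tail-calls C. Starting from CurCallee == B, the scan
  // below finds B's tail call to C and records that callsite in
  // FoundCalleeChain, giving the unique chain caller -> B -> C.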
  bool FoundSingleCalleeChain = false;
  for (auto &BB : *CalleeFunc) {
    for (auto &I : BB) {
      auto *CB = dyn_cast<CallBase>(&I);
      if (!CB || !CB->isTailCall())
        continue;
      auto *CalledValue = CB->getCalledOperand();
      auto *CalledFunction = CB->getCalledFunction();
      if (CalledValue && !CalledFunction) {
        CalledValue = CalledValue->stripPointerCasts();
        // Stripping pointer casts can reveal a called function.
        CalledFunction = dyn_cast<Function>(CalledValue);
      }
      // Check if this is an alias to a function. If so, get the
      // called aliasee for the checks below.
      if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
        assert(!CalledFunction &&
               "Expected null called function in callsite for alias");
        CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
      }
      if (!CalledFunction)
        continue;
      if (CalledFunction == ProfiledCallee) {
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        SaveCallsiteInfo(&I, CalleeFunc);
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CalledFunction, Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        if (FoundMultipleCalleeChains)
          return false;
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        SaveCallsiteInfo(&I, CalleeFunc);
      }
    }
  }

  return FoundSingleCalleeChain;
}

bool ModuleCallsiteContextGraph::calleeMatchesFunc(
    Instruction *Call, const Function *Func, const Function *CallerFunc,
    std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
  auto *CB = dyn_cast<CallBase>(Call);
  if (!CB->getCalledOperand())
    return false;
  auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
  auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
  if (CalleeFunc == Func)
    return true;
  auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
  if (Alias && Alias->getAliasee() == Func)
    return true;

  // Recursively search for the profiled callee through tail calls starting
  // with the actual Callee. The discovered tail call chain is saved in
  // FoundCalleeChain, and we will fix up the graph to include these callsites
  // after returning.
  // FIXME: We will currently redo the same recursive walk if we find the same
  // mismatched callee from another callsite. We can improve this with more
  // bookkeeping of the created chain of new nodes for each mismatch.
  unsigned Depth = 1;
  bool FoundMultipleCalleeChains = false;
  if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
                                          FoundCalleeChain,
                                          FoundMultipleCalleeChains)) {
    LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
                      << Func->getName() << " from " << CallerFunc->getName()
                      << " that actually called " << CalleeVal->getName()
                      << (FoundMultipleCalleeChains
                              ? " (found multiple possible chains)"
                              : "")
                      << "\n");
    if (FoundMultipleCalleeChains)
      FoundProfiledCalleeNonUniquelyCount++;
    return false;
  }

  return true;
}

bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
    // Make a CallsiteInfo for each discovered callee, if one hasn't already
    // been synthesized.
    if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
        !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
      // StackIds is empty (we don't have debug info available in the index
      // for these callsites).
      FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
          std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
    CallsiteInfo *NewCallsiteInfo =
        FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
    FoundCalleeChain.push_back({NewCallsiteInfo, FS});
  };

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &S : CurCallee.getSummaryList()) {
    if (!GlobalValue::isLocalLinkage(S->linkage()) &&
        !isPrevailing(CurCallee.getGUID(), S.get()))
      continue;
    auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
    if (!FS)
      continue;
    auto FSVI = CurCallee;
    auto *AS = dyn_cast<AliasSummary>(S.get());
    if (AS)
      FSVI = AS->getAliaseeVI();
    for (auto &CallEdge : FS->calls()) {
      if (!CallEdge.second.hasTailCall())
        continue;
      if (CallEdge.first == ProfiledCallee) {
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CallEdge.first, Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        if (FoundMultipleCalleeChains)
          return false;
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      }
    }
  }

  return FoundSingleCalleeChain;
}

bool IndexCallsiteContextGraph::calleeMatchesFunc(
    IndexCall &Call, const FunctionSummary *Func,
    const FunctionSummary *CallerFunc,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
  ValueInfo Callee =
      dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
  // If there is no summary list then this is a call to an externally defined
  // symbol.
  AliasSummary *Alias =
      Callee.getSummaryList().empty()
          ? nullptr
          : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
  assert(FSToVIMap.count(Func));
  auto FuncVI = FSToVIMap[Func];
  if (Callee == FuncVI ||
      // If callee is an alias, check the aliasee, since only function
      // summary base objects will contain the stack node summaries and thus
      // get a context node.
      (Alias && Alias->getAliaseeVI() == FuncVI))
    return true;

  // Recursively search for the profiled callee through tail calls starting
  // with the actual Callee. The discovered tail call chain is saved in
  // FoundCalleeChain, and we will fix up the graph to include these callsites
  // after returning.
  // FIXME: We will currently redo the same recursive walk if we find the same
  // mismatched callee from another callsite. We can improve this with more
  // bookkeeping of the created chain of new nodes for each mismatch.
  unsigned Depth = 1;
  bool FoundMultipleCalleeChains = false;
  if (!findProfiledCalleeThroughTailCalls(
          FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
    LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
                      << " from " << FSToVIMap[CallerFunc]
                      << " that actually called " << Callee
                      << (FoundMultipleCalleeChains
                              ? " (found multiple possible chains)"
                              : "")
                      << "\n");
    if (FoundMultipleCalleeChains)
      FoundProfiledCalleeNonUniquelyCount++;
    return false;
  }

  return true;
}

static std::string getAllocTypeString(uint8_t AllocTypes) {
  if (!AllocTypes)
    return "None";
  std::string Str;
  if (AllocTypes & (uint8_t)AllocationType::NotCold)
    Str += "NotCold";
  if (AllocTypes & (uint8_t)AllocationType::Cold)
    Str += "Cold";
  return Str;
}
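
// For example, a value with both the NotCold and Cold bits set prints as
// "NotColdCold" in the node and edge dumps below.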

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
    const {
  print(dbgs());
  dbgs() << "\n";
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
    raw_ostream &OS) const {
  OS << "Node " << this << "\n";
  OS << "\t";
  printCall(OS);
  if (Recursive)
    OS << " (recursive)";
  OS << "\n";
  OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
  OS << "\tContextIds:";
  std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
  std::sort(SortedIds.begin(), SortedIds.end());
  for (auto Id : SortedIds)
    OS << " " << Id;
  OS << "\n";
  OS << "\tCalleeEdges:\n";
  for (auto &Edge : CalleeEdges)
    OS << "\t\t" << *Edge << "\n";
  OS << "\tCallerEdges:\n";
  for (auto &Edge : CallerEdges)
    OS << "\t\t" << *Edge << "\n";
  if (!Clones.empty()) {
    OS << "\tClones: ";
    FieldSeparator FS;
    for (auto *Clone : Clones)
      OS << FS << Clone;
    OS << "\n";
  } else if (CloneOf) {
    OS << "\tClone of " << CloneOf << "\n";
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
    const {
  print(dbgs());
  dbgs() << "\n";
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
    raw_ostream &OS) const {
  OS << "Edge from Callee " << Callee << " to Caller: " << Caller
     << " AllocTypes: " << getAllocTypeString(AllocTypes);
  OS << " ContextIds:";
  std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
  std::sort(SortedIds.begin(), SortedIds.end());
  for (auto Id : SortedIds)
    OS << " " << Id;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
  print(dbgs());
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
    raw_ostream &OS) const {
  OS << "Callsite Context Graph:\n";
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  for (const auto Node : nodes<GraphType>(this)) {
    if (Node->isRemoved())
      continue;
    Node->print(OS);
    OS << "\n";
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkEdge(
    const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
  // Confirm that alloc type is not None and that we have at least one context
  // id.
  assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
  assert(!Edge->ContextIds.empty());
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
                      bool CheckEdges = true) {
  if (Node->isRemoved())
    return;
  // Node's context ids should be the union of both its callee and caller edge
  // context ids.
  if (Node->CallerEdges.size()) {
    auto EI = Node->CallerEdges.begin();
    auto &FirstEdge = *EI;
    EI++;
    DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
    for (; EI != Node->CallerEdges.end(); EI++) {
      const auto &Edge = *EI;
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CallerEdgeContextIds, Edge->ContextIds);
    }
    // Node can have more context ids than callers if some contexts terminate
    // at node and some are longer.
    assert(Node->ContextIds == CallerEdgeContextIds ||
           set_is_subset(CallerEdgeContextIds, Node->ContextIds));
  }
  if (Node->CalleeEdges.size()) {
    auto EI = Node->CalleeEdges.begin();
    auto &FirstEdge = *EI;
    EI++;
    DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
    for (; EI != Node->CalleeEdges.end(); EI++) {
      const auto &Edge = *EI;
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CalleeEdgeContextIds, Edge->ContextIds);
    }
    assert(Node->ContextIds == CalleeEdgeContextIds);
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  for (const auto Node : nodes<GraphType>(this)) {
    checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
    for (auto &Edge : Node->CallerEdges)
      checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;

  using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
  static NodeRef getNode(const NodePtrTy &P) { return P.get(); }

  using nodes_iterator =
      mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
                      decltype(&getNode)>;

  static nodes_iterator nodes_begin(GraphType G) {
    return nodes_iterator(G->NodeOwner.begin(), &getNode);
  }

  static nodes_iterator nodes_end(GraphType G) {
    return nodes_iterator(G->NodeOwner.end(), &getNode);
  }

  static NodeRef getEntryNode(GraphType G) {
    return G->NodeOwner.begin()->get();
  }

  using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
  static const ContextNode<DerivedCCG, FuncTy, CallTy> *
  GetCallee(const EdgePtrTy &P) {
    return P->Callee;
  }

  using ChildIteratorType =
      mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
                          DerivedCCG, FuncTy, CallTy>>>::const_iterator,
                      decltype(&GetCallee)>;

  static ChildIteratorType child_begin(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
  }

  static ChildIteratorType child_end(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
  }
};

template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
    : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using GTraits = GraphTraits<GraphType>;
  using NodeRef = typename GTraits::NodeRef;
  using ChildIteratorType = typename GTraits::ChildIteratorType;

  static std::string getNodeLabel(NodeRef Node, GraphType G) {
    std::string LabelString =
        (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
         Twine(Node->OrigStackOrAllocId))
            .str();
    LabelString += "\n";
    if (Node->hasCall()) {
      auto Func = G->NodeToCallingFunc.find(Node);
      assert(Func != G->NodeToCallingFunc.end());
      LabelString +=
          G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
    } else {
      LabelString += "null call";
      if (Node->Recursive)
        LabelString += " (recursive)";
      else
        LabelString += " (external)";
    }
    return LabelString;
  }

  static std::string getNodeAttributes(NodeRef Node, GraphType) {
    std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
                                   getContextIds(Node->ContextIds) + "\"")
                                      .str();
    AttributeString +=
        (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
    AttributeString += ",style=\"filled\"";
    if (Node->CloneOf) {
      AttributeString += ",color=\"blue\"";
      AttributeString += ",style=\"filled,bold,dashed\"";
    } else
      AttributeString += ",style=\"filled\"";
    return AttributeString;
  }

  static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
                                       GraphType) {
    auto &Edge = *(ChildIter.getCurrent());
    return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
            Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
        .str();
  }

  // Since the NodeOwners list includes nodes that are no longer connected to
  // the graph, skip them here.
  static bool isNodeHidden(NodeRef Node, GraphType) {
    return Node->isRemoved();
  }

private:
  static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
    std::string IdString = "ContextIds:";
    if (ContextIds.size() < 100) {
      std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
      std::sort(SortedIds.begin(), SortedIds.end());
      for (auto Id : SortedIds)
        IdString += (" " + Twine(Id)).str();
    } else {
      IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
    }
    return IdString;
  }

  static std::string getColor(uint8_t AllocTypes) {
    if (AllocTypes == (uint8_t)AllocationType::NotCold)
      // Color "brown1" actually looks like a lighter red.
      return "brown1";
    if (AllocTypes == (uint8_t)AllocationType::Cold)
      return "cyan";
    if (AllocTypes ==
        ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
      // Lighter purple.
      return "mediumorchid1";
    return "gray";
  }

  static std::string getNodeId(NodeRef Node) {
    std::stringstream SStream;
    SStream << std::hex << "N0x" << (unsigned long long)Node;
    std::string Result = SStream.str();
    return Result;
  }
};

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
    std::string Label) const {
  WriteGraph(this, "", false, Label,
             DotFilePathPrefix + "ccg." + Label + ".dot");
}
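
// For example, if DotFilePathPrefix is "/tmp/" and Label is "prestackupdate",
// the graph is written to "/tmp/ccg.prestackupdate.dot".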

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
    const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) {
  ContextNode *Node = Edge->Callee;
  NodeOwner.push_back(
      std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
  ContextNode *Clone = NodeOwner.back().get();
  Node->addClone(Clone);
  assert(NodeToCallingFunc.count(Node));
  NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
  moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true);
  return Clone;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                  ContextNode *NewCallee, EdgeIter *CallerEdgeI,
                                  bool NewClone) {
  // NewCallee and Edge's current callee must be clones of the same original
  // node (Edge's current callee may be the original node too).
  assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
  auto &EdgeContextIds = Edge->getContextIds();
  ContextNode *OldCallee = Edge->Callee;
  if (CallerEdgeI)
    *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
  else
    OldCallee->eraseCallerEdge(Edge.get());
  Edge->Callee = NewCallee;
  NewCallee->CallerEdges.push_back(Edge);
  // Don't need to update Edge's context ids since we are simply reconnecting
  // it.
  set_subtract(OldCallee->ContextIds, EdgeContextIds);
  NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end());
  NewCallee->AllocTypes |= Edge->AllocTypes;
  OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
  // OldCallee alloc type should be None iff its context id set is now empty.
  assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
         OldCallee->ContextIds.empty());
  // Now walk the old callee node's callee edges and move Edge's context ids
  // over to the corresponding edge into the clone (which is created here if
  // this is a newly created clone).
  for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
    // The context ids moving to the new callee are the subset of this edge's
    // context ids and the context ids on the caller edge being moved.
    DenseSet<uint32_t> EdgeContextIdsToMove =
        set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds);
    set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
    OldCalleeEdge->AllocTypes =
        computeAllocType(OldCalleeEdge->getContextIds());
    if (!NewClone) {
      // Update context ids / alloc type on corresponding edge to NewCallee.
      // There is a chance this may not exist if we are reusing an existing
      // clone, specifically during function assignment, where we would have
      // removed none type edges after creating the clone. If we can't find
      // a corresponding edge there, fall through to the cloning below.
      if (auto *NewCalleeEdge =
              NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
        NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
                                              EdgeContextIdsToMove.end());
        NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
        continue;
      }
    }
    auto NewEdge = std::make_shared<ContextEdge>(
        OldCalleeEdge->Callee, NewCallee,
        computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
    NewCallee->CalleeEdges.push_back(NewEdge);
    NewEdge->Callee->CallerEdges.push_back(NewEdge);
  }
  if (VerifyCCG) {
    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
    for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
    for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
  DenseSet<const ContextNode *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap)
    identifyClones(Entry.second, Visited);
}

// Helper function to check whether an AllocType is Cold, NotCold, or both.
bool checkColdOrNotCold(uint8_t AllocType) {
  return (AllocType == (uint8_t)AllocationType::Cold) ||
         (AllocType == (uint8_t)AllocationType::NotCold) ||
         (AllocType ==
          ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
}
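
// For example, it accepts the pure Cold and NotCold types as well as their
// union, but rejects None (and any other unexpected bit pattern).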

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
    ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
  if (VerifyNodes)
    checkNode<DerivedCCG, FuncTy, CallTy>(Node);
  assert(!Node->CloneOf);

  // If Node has a null call, then either it wasn't found in the module
  // (regular LTO) or summary index (ThinLTO), or there were other conditions
  // blocking cloning (e.g. recursion, calls multiple targets, etc.).
  // Do this here so that we don't try to recursively clone callers below,
  // which isn't useful at least for this node.
  if (!Node->hasCall())
    return;

#ifndef NDEBUG
  auto Insert =
#endif
      Visited.insert(Node);
  // We should not have visited this node yet.
  assert(Insert.second);
  // The recursive call to identifyClones may delete the current edge from the
  // CallerEdges vector. Make a copy and iterate on that, simpler than passing
  // in an iterator and having recursive call erase from it. Other edges may
  // also get removed during the recursion, which will have null Callee and
  // Caller pointers (and are deleted later), so we skip those below.
  {
    auto CallerEdges = Node->CallerEdges;
    for (auto &Edge : CallerEdges) {
      // Skip any that have been removed by an earlier recursive call.
      if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
        assert(!llvm::count(Node->CallerEdges, Edge));
        continue;
      }
      // Ignore any caller we previously visited via another edge.
      if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
        identifyClones(Edge->Caller, Visited);
      }
    }
  }

  // Check if we reached an unambiguous call or have only a single caller.
  if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
    return;

  // We need to clone.

  // Try to keep the original version as alloc type NotCold. This will make
  // cases with indirect calls or any other situation with an unknown call to
  // the original function get the default behavior. We do this by sorting the
  // CallerEdges of the Node we will clone by alloc type.
  //
  // Give NotCold edges the lowest sort priority so those edges are at the end
  // of the caller edges vector, and stay on the original version (since the
  // below code clones greedily until it finds all remaining edges have the
  // same type and leaves the remaining ones on the original Node).
  //
  // We shouldn't actually have any None type edges, so the sorting priority
  // for that is arbitrary, and we assert in that case below.
  const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
                                               /*Cold*/ 1,
                                               /*NotColdCold*/ 2};
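
  // For example, caller edges with alloc types Cold, NotCold and NotColdCold
  // sort (per the priorities above) to Cold, NotColdCold, NotCold, so the
  // greedy cloning below peels off the Cold contexts first and tends to leave
  // the NotCold edges on the original node.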
2507 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2508 [&](const std::shared_ptr<ContextEdge> &A,
2509 const std::shared_ptr<ContextEdge> &B) {
2510 assert(checkColdOrNotCold(A->AllocTypes) &&
2511 checkColdOrNotCold(B->AllocTypes));
2512
2513 if (A->AllocTypes == B->AllocTypes)
2514 // Use the first context id for each edge as a
2515 // tie-breaker.
2516 return *A->ContextIds.begin() < *B->ContextIds.begin();
2517 return AllocTypeCloningPriority[A->AllocTypes] <
2518 AllocTypeCloningPriority[B->AllocTypes];
2519 });
2520
2521 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2522
2523 // Iterate until we find no more opportunities for disambiguating the alloc
2524 // types via cloning. In most cases this loop will terminate once the Node
2525 // has a single allocation type, in which case no more cloning is needed.
2526 // We need to be able to remove Edge from CallerEdges, so need to adjust
2527 // iterator inside the loop.
2528 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2529 auto CallerEdge = *EI;
2530
2531 // See if cloning the prior caller edge left this node with a single alloc
2532 // type or a single caller. In that case no more cloning of Node is needed.
2533 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2534 break;
2535
2536 // Compute the node callee edge alloc types corresponding to the context ids
2537 // for this caller edge.
2538 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2539 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2540 for (auto &CalleeEdge : Node->CalleeEdges)
2541 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2542 CalleeEdge->getContextIds(), CallerEdge->getContextIds()));
2543
2544 // Don't clone if doing so will not disambiguate any alloc types amongst
2545 // caller edges (including the callee edges that would be cloned).
2546 // Otherwise we will simply move all edges to the clone.
2547 //
2548 // First check if by cloning we will disambiguate the caller allocation
2549 // type from node's allocation type. Query allocTypeToUse so that we don't
2550 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2551 // neither of these should be None type.
2552 //
2553 // Then check if by cloning node at least one of the callee edges will be
2554 // disambiguated by splitting out different context ids.
2555 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2556 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2557 if (allocTypeToUse(CallerEdge->AllocTypes) ==
2558 allocTypeToUse(Node->AllocTypes) &&
2559 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2560 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2561 ++EI;
2562 continue;
2563 }

    // First see if we can use an existing clone. Check each clone and its
    // callee edges for matching alloc types.
    ContextNode *Clone = nullptr;
    for (auto *CurClone : Node->Clones) {
      if (allocTypeToUse(CurClone->AllocTypes) !=
          allocTypeToUse(CallerEdge->AllocTypes))
        continue;

      if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
              CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
        continue;
      Clone = CurClone;
      break;
    }

    // The edge iterator is adjusted when we move the CallerEdge to the clone.
    if (Clone)
      moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI);
    else
      Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI);

    assert(EI == Node->CallerEdges.end() ||
           Node->AllocTypes != (uint8_t)AllocationType::None);
    // Sanity check that no alloc types on clone or its edges are None.
    assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
    assert(llvm::none_of(
        Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
          return E->AllocTypes == (uint8_t)AllocationType::None;
        }));
  }

  // Cloning may have resulted in some cloned callee edges with type None,
  // because they aren't carrying any contexts. Remove those edges.
  for (auto *Clone : Node->Clones) {
    removeNoneTypeCalleeEdges(Clone);
    if (VerifyNodes)
      checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
  }
  // We should still have some context ids on the original Node.
  assert(!Node->ContextIds.empty());

  // Remove any callee edges that ended up with alloc type None after creating
  // clones and updating callee edges.
  removeNoneTypeCalleeEdges(Node);

  // Sanity check that no alloc types on node or edges are None.
  assert(Node->AllocTypes != (uint8_t)AllocationType::None);
  assert(llvm::none_of(Node->CalleeEdges,
                       [&](const std::shared_ptr<ContextEdge> &E) {
                         return E->AllocTypes == (uint8_t)AllocationType::None;
                       }));
  assert(llvm::none_of(Node->CallerEdges,
                       [&](const std::shared_ptr<ContextEdge> &E) {
                         return E->AllocTypes == (uint8_t)AllocationType::None;
                       }));

  if (VerifyNodes)
    checkNode<DerivedCCG, FuncTy, CallTy>(Node);
}

void ModuleCallsiteContextGraph::updateAllocationCall(
    CallInfo &Call, AllocationType AllocType) {
  std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
  auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
                                "memprof", AllocTypeString);
  cast<CallBase>(Call.call())->addFnAttr(A);
  OREGetter(Call.call()->getFunction())
      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
            << ore::NV("AllocationCall", Call.call()) << " in clone "
            << ore::NV("Caller", Call.call()->getFunction())
            << " marked with memprof allocation attribute "
            << ore::NV("Attribute", AllocTypeString));
}
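
// For illustration, on a cold context the update above leaves the IR call
// annotated roughly as follows (a sketch; the callee name and attribute group
// number are hypothetical):
//   %call = call ptr @_Znwm(i64 8) #2
//   ...
//   attributes #2 = { "memprof"="cold" }
// A later transformation can use the attribute to rewrite the allocation,
// e.g. to pass a cold hint to the allocator.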

void IndexCallsiteContextGraph::updateAllocationCall(
    CallInfo &Call, AllocationType AllocType) {
  auto *AI = Call.call().dyn_cast<AllocInfo *>();
  assert(AI);
  assert(AI->Versions.size() > Call.cloneNo());
  AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
}
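
// E.g. for an allocation with two function clones, where clone 0 was found to
// be not cold and clone 1 cold, the summary entry ends up with
// Versions == [NotCold, Cold] (stored as uint8_t AllocationType values),
// which the ThinLTO backend later applies in applyImport.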

void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
                                            FuncInfo CalleeFunc) {
  if (CalleeFunc.cloneNo() > 0)
    cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
  OREGetter(CallerCall.call()->getFunction())
      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
            << ore::NV("Call", CallerCall.call()) << " in clone "
            << ore::NV("Caller", CallerCall.call()->getFunction())
            << " assigned to call function clone "
            << ore::NV("Callee", CalleeFunc.func()));
}

void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
                                           FuncInfo CalleeFunc) {
  auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
  assert(CI &&
         "Caller cannot be an allocation which should not have profiled calls");
  assert(CI->Clones.size() > CallerCall.cloneNo());
  CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
}
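
// E.g. CI->Clones[J] == K records that the J'th clone of the caller function
// should call the K'th clone of this callsite's callee (K == 0 meaning the
// original callee). The actual call rewriting happens later in the ThinLTO
// backend, in applyImport via CloneCallsite.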

CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                     Instruction *>::FuncInfo
ModuleCallsiteContextGraph::cloneFunctionForCallsite(
    FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
    std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
  // Use existing LLVM facilities for cloning and obtaining Call in clone.
  ValueToValueMapTy VMap;
  auto *NewFunc = CloneFunction(Func.func(), VMap);
  std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
  assert(!Func.func()->getParent()->getFunction(Name));
  NewFunc->setName(Name);
  for (auto &Inst : CallsWithMetadataInFunc) {
    // This map always has the initial version in it.
    assert(Inst.cloneNo() == 0);
    CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
  }
  OREGetter(Func.func())
      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
            << "created clone " << ore::NV("NewFunction", NewFunc));
  return {NewFunc, CloneNo};
}
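
// As a sketch of the resulting naming: cloning function foo for clone number
// 1 yields a clone named via getMemProfFuncName, e.g. "foo.memprof.1" (the
// suffix comes from MemProfCloneSuffix). The populated CallMap lets callers
// of this helper locate the copy of each memprof-related call inside the new
// function clone.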

CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                     IndexCall>::FuncInfo
IndexCallsiteContextGraph::cloneFunctionForCallsite(
    FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
    std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
  // Check how many clones we have of Call (and therefore function).
  // The next clone number is the current size of versions array.
  // Confirm this matches the CloneNo provided by the caller, which is based on
  // the number of function clones we have.
  assert(CloneNo ==
         (Call.call().is<AllocInfo *>()
              ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
              : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
  // Walk all the instructions in this function. Create a new version for
  // each (by adding an entry to the Versions/Clones summary array), and copy
  // over the version being called for the function clone being cloned here.
  // Additionally, add an entry to the CallMap for the new function clone,
  // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
  // to the new call clone.
  for (auto &Inst : CallsWithMetadataInFunc) {
    // This map always has the initial version in it.
    assert(Inst.cloneNo() == 0);
    if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
      assert(AI->Versions.size() == CloneNo);
      // We assign the allocation type later (in updateAllocationCall), just
      // add an entry for it here.
      AI->Versions.push_back(0);
    } else {
      auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
      assert(CI && CI->Clones.size() == CloneNo);
      // We assign the clone number later (in updateCall), just add an entry
      // for it here.
      CI->Clones.push_back(0);
    }
    CallMap[Inst] = {Inst.call(), CloneNo};
  }
  return {Func.func(), CloneNo};
}
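
// Worked example (assuming the AllocationType encoding NotCold == 1 and
// Cold == 2): an alloc in Func with Versions == [1] grows a placeholder on
// creation of function clone 1, giving Versions == [1, 0]; if the clone's
// contexts are cold, updateAllocationCall later overwrites the placeholder,
// yielding Versions == [1, 2]. Callsite Clones arrays grow the same way via
// the CI->Clones.push_back(0) above.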

// This method assigns cloned callsites to functions, cloning the functions as
// needed. The assignment is greedy and proceeds roughly as follows:
//
// For each function Func:
//   For each call with graph Node having clones:
//     Initialize ClonesWorklist to Node and its clones
//     Initialize NodeCloneCount to 0
//     While ClonesWorklist is not empty:
//        Clone = pop front ClonesWorklist
//        NodeCloneCount++
//        If Func has been cloned less than NodeCloneCount times:
//           If NodeCloneCount is 1:
//             Assign Clone to original Func
//             Continue
//           Create a new function clone
//           If other callers not assigned to call a function clone yet:
//              Assign them to call new function clone
//              Continue
//           Assign any other caller calling the cloned version to new clone
//
//        For each caller of Clone:
//           If caller is assigned to call a specific function clone:
//             If we cannot assign Clone to that function clone:
//               Create new callsite Clone NewClone
//               Add NewClone to ClonesWorklist
//               Continue
//             Assign Clone to existing caller's called function clone
//           Else:
//             If Clone not already assigned to a function clone:
//                Assign to first function clone without assignment
//             Assign caller to selected function clone
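//
// As a small worked example: suppose function A contains callsite C whose
// graph node was cloned once (Node plus Node's clone1), where Node's callers
// need the NotCold version and clone1's callers the Cold version. The first
// worklist entry (Node) is assigned to the original A; clone1 then forces
// creation of a function clone of A (e.g. "A.memprof.1"), clone1's callers
// are recorded as calling that clone, and updateCall/updateAllocationCall
// later rewrite the calls accordingly.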
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
  bool Changed = false;

  // Keep track of the assignment of nodes (callsites) to function clones they
  // call.
  DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;

  // Update caller node to call function version CalleeFunc, by recording the
  // assignment in CallsiteToCalleeFuncCloneMap.
  auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
                                        const FuncInfo &CalleeFunc) {
    assert(Caller->hasCall());
    CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
  };

  // Walk all functions for which we saw calls with memprof metadata, and
  // handle cloning for each such call.
  for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
    FuncInfo OrigFunc(Func);
    // Map from each clone of OrigFunc to a map of remappings of each call of
    // interest (from original uncloned call to the corresponding cloned call
    // in that function clone).
    std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
    for (auto &Call : CallsWithMetadata) {
      ContextNode *Node = getNodeForInst(Call);
      // Skip call if we do not have a node for it (all uses of its stack ids
      // were either on inlined chains or pruned from the MIBs), or if we did
      // not create any clones for it.
      if (!Node || Node->Clones.empty())
        continue;
      assert(Node->hasCall() &&
             "Not having a call should have prevented cloning");

      // Track the assignment of function clones to clones of the current
      // callsite Node being handled.
      std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;

      // Assign callsite version CallsiteClone to function version FuncClone,
      // and also assign (possibly cloned) Call to CallsiteClone.
      auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
                                                CallInfo &Call,
                                                ContextNode *CallsiteClone,
                                                bool IsAlloc) {
        // Record the clone of callsite node assigned to this function clone.
        FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;

        assert(FuncClonesToCallMap.count(FuncClone));
        std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
        CallInfo CallClone(Call);
        if (CallMap.count(Call))
          CallClone = CallMap[Call];
        CallsiteClone->setCall(CallClone);
      };

      // Keep track of the clones of callsite Node that need to be assigned to
      // function clones. This list may be expanded in the loop body below if
      // we find additional cloning is required.
      std::deque<ContextNode *> ClonesWorklist;
      // Ignore original Node if we moved all of its contexts to clones.
      if (!Node->ContextIds.empty())
        ClonesWorklist.push_back(Node);
      ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
                            Node->Clones.end());

      // Now walk through all of the clones of this callsite Node that we
      // need, and determine the assignment to a corresponding clone of the
      // current function (creating new function clones as needed).
      unsigned NodeCloneCount = 0;
      while (!ClonesWorklist.empty()) {
        ContextNode *Clone = ClonesWorklist.front();
        ClonesWorklist.pop_front();
        NodeCloneCount++;
        if (VerifyNodes)
          checkNode<DerivedCCG, FuncTy, CallTy>(Clone);

        // Need to create a new function clone if we have more callsite clones
        // than existing function clones, which would have been assigned to an
        // earlier clone in the list (we assign callsite clones to function
        // clones greedily).
        if (FuncClonesToCallMap.size() < NodeCloneCount) {
          // If this is the first callsite copy, assign to original function.
          if (NodeCloneCount == 1) {
            // Since FuncClonesToCallMap is empty in this case, no clones have
            // been created for this function yet, and no callers should have
            // been assigned a function clone for this callee node yet.
            assert(llvm::none_of(
                Clone->CallerEdges,
                [&](const std::shared_ptr<ContextEdge> &E) {
                  return CallsiteToCalleeFuncCloneMap.count(E->Caller);
                }));
            // Initialize with empty call map, assign Clone to original
            // function and its callers, and skip to the next clone.
            FuncClonesToCallMap[OrigFunc] = {};
            AssignCallsiteCloneToFuncClone(
                OrigFunc, Call, Clone,
                AllocationCallToContextNodeMap.count(Call));
            for (auto &CE : Clone->CallerEdges) {
              // Ignore any caller that does not have a recorded callsite
              // Call.
              if (!CE->Caller->hasCall())
                continue;
              RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
            }
            continue;
          }

          // First locate which copy of OrigFunc to clone again. If a caller
          // of this callsite clone was already assigned to call a particular
          // function clone, we need to redirect all of those callers to the
          // new function clone, and update their other callees within this
          // function.
          FuncInfo PreviousAssignedFuncClone;
          auto EI = llvm::find_if(
              Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
                return CallsiteToCalleeFuncCloneMap.count(E->Caller);
              });
          bool CallerAssignedToCloneOfFunc = false;
          if (EI != Clone->CallerEdges.end()) {
            const std::shared_ptr<ContextEdge> &Edge = *EI;
            PreviousAssignedFuncClone =
                CallsiteToCalleeFuncCloneMap[Edge->Caller];
            CallerAssignedToCloneOfFunc = true;
          }

          // Clone function and save it along with the CallInfo map created
          // during cloning in the FuncClonesToCallMap.
          std::map<CallInfo, CallInfo> NewCallMap;
          unsigned CloneNo = FuncClonesToCallMap.size();
          assert(CloneNo > 0 && "Clone 0 is the original function, which "
                                "should already exist in the map");
          FuncInfo NewFuncClone = cloneFunctionForCallsite(
              OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
          FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
          FunctionClonesAnalysis++;
          Changed = true;

          // If no caller callsites were already assigned to a clone of this
          // function, we can simply assign this clone to the new func clone
          // and update all callers to it, then skip to the next clone.
          if (!CallerAssignedToCloneOfFunc) {
            AssignCallsiteCloneToFuncClone(
                NewFuncClone, Call, Clone,
                AllocationCallToContextNodeMap.count(Call));
            for (auto &CE : Clone->CallerEdges) {
              // Ignore any caller that does not have a recorded callsite
              // Call.
              if (!CE->Caller->hasCall())
                continue;
              RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
            }
            continue;
          }

          // We may need to do additional node cloning in this case.
          // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
          // that were previously assigned to call PreviousAssignedFuncClone,
          // to record that they now call NewFuncClone.
          for (auto CE : Clone->CallerEdges) {
            // Skip any that have been removed on an earlier iteration.
            if (!CE)
              continue;
            // Ignore any caller that does not have a recorded callsite Call.
            if (!CE->Caller->hasCall())
              continue;

            if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
                // We subsequently fall through to later handling that
                // will perform any additional cloning required for
                // callers that were calling other function clones.
                CallsiteToCalleeFuncCloneMap[CE->Caller] !=
                    PreviousAssignedFuncClone)
              continue;

            RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);

            // If we are cloning a function that was already assigned to some
            // callers, then essentially we are creating new callsite clones
            // of the other callsites in that function that are reached by
            // those callers. Clone the other callees of the current
            // callsite's caller that were already assigned to
            // PreviousAssignedFuncClone accordingly. This is important since
            // we subsequently update the calls from the nodes in the graph
            // and their assignments to callee functions recorded in
            // CallsiteToCalleeFuncCloneMap.
            for (auto CalleeEdge : CE->Caller->CalleeEdges) {
              // Skip any that have been removed on an earlier iteration when
              // cleaning up newly None type callee edges.
              if (!CalleeEdge)
                continue;
              ContextNode *Callee = CalleeEdge->Callee;
              // Skip the current callsite, since we are looking for other
              // callsites Caller calls, and also skip any callee that does
              // not have a recorded callsite Call.
              if (Callee == Clone || !Callee->hasCall())
                continue;
              ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
              removeNoneTypeCalleeEdges(NewClone);
              // Moving the edge may have resulted in some none type
              // callee edges on the original Callee.
              removeNoneTypeCalleeEdges(Callee);
              assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
              // If the Callee node was already assigned to call a specific
              // function version, make sure its new clone is assigned to call
              // that same function clone.
              if (CallsiteToCalleeFuncCloneMap.count(Callee))
                RecordCalleeFuncOfCallsite(
                    NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
              // Update NewClone with the new Call clone of this callsite's
              // Call created for the new function clone created earlier.
              // Recall that we have already ensured when building the graph
              // that each caller can only call callsites within the same
              // function, so we are guaranteed that Callee Call is in the
              // current OrigFunc.
              // CallMap is set up as indexed by original Call at clone 0.
              CallInfo OrigCall(Callee->getOrigNode()->Call);
              OrigCall.setCloneNo(0);
              std::map<CallInfo, CallInfo> &CallMap =
                  FuncClonesToCallMap[NewFuncClone];
              assert(CallMap.count(OrigCall));
              CallInfo NewCall(CallMap[OrigCall]);
              assert(NewCall);
              NewClone->setCall(NewCall);
            }
          }
          // Fall through to handling below to perform the recording of the
          // function for this callsite clone. This enables handling of cases
          // where the callers were assigned to different clones of a
          // function.
        }

        // See if we can use an existing function clone. Walk through
        // all caller edges to see if any have already been assigned to
        // a clone of this callsite's function. If we can use it, do so. If
        // not, because that function clone is already assigned to a different
        // clone of this callsite, then we need to clone again.
        // Basically, this checking is needed to handle the case where
        // different caller functions/callsites may need versions of this
        // function containing different mixes of callsite clones across the
        // different callsites within the function. If that happens, we need
        // to create additional function clones to handle the various
        // combinations.
        //
        // Keep track of any new clones of this callsite created by the
        // following loop, as well as any existing clone that we decided to
        // assign this clone to.
        std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
        FuncInfo FuncCloneAssignedToCurCallsiteClone;
        // We need to be able to remove Edge from CallerEdges, so we adjust
        // the iterator in the loop.
        for (auto EI = Clone->CallerEdges.begin();
             EI != Clone->CallerEdges.end();) {
          auto Edge = *EI;
          // Ignore any caller that does not have a recorded callsite Call.
          if (!Edge->Caller->hasCall()) {
            EI++;
            continue;
          }
          // If this caller is already assigned to call a version of OrigFunc,
          // we need to ensure we can assign this callsite clone to that
          // function clone.
          if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
            FuncInfo FuncCloneCalledByCaller =
                CallsiteToCalleeFuncCloneMap[Edge->Caller];
            // First we need to confirm that this function clone is available
            // for use by this callsite node clone.
            //
            // While FuncCloneToCurNodeCloneMap is built only for this Node
            // and its callsite clones, one of those callsite clones X could
            // have been assigned to the same function clone called by Edge's
            // caller - if Edge's caller calls another callsite within Node's
            // original function, and that callsite has another caller
            // reaching clone X. We need to clone Node again in this case.
            if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
                 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
                     Clone) ||
                // Detect when we have multiple callers of this callsite that
                // have already been assigned to specific, and different,
                // clones of OrigFunc (due to other unrelated callsites in
                // Func they reach via call contexts). Is this Clone of
                // callsite Node assigned to a different clone of OrigFunc?
                // If so, clone Node again.
                (FuncCloneAssignedToCurCallsiteClone &&
                 FuncCloneAssignedToCurCallsiteClone !=
                     FuncCloneCalledByCaller)) {
              // We need to use a different newly created callsite clone, in
              // order to assign it to another new function clone on a
              // subsequent iteration over the Clones array (adjusted below).
              // Note we specifically do not reset the
              // CallsiteToCalleeFuncCloneMap entry for this caller, so that
              // when this new clone is processed later we know which version
              // of the function to copy (so that other callsite clones we
              // have assigned to that function clone are properly cloned
              // over). See comments in the function cloning handling earlier.

              // Check if we already have cloned this callsite again while
              // walking through caller edges, for a caller calling the same
              // function clone. If so, we can move this edge to that new
              // clone rather than creating yet another new clone.
              if (FuncCloneToNewCallsiteCloneMap.count(
                      FuncCloneCalledByCaller)) {
                ContextNode *NewClone =
                    FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
                moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
                // Cleanup any none type edges cloned over.
                removeNoneTypeCalleeEdges(NewClone);
              } else {
                // Create a new callsite clone.
                ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
                removeNoneTypeCalleeEdges(NewClone);
                FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
                    NewClone;
                // Add to list of clones and process later.
                ClonesWorklist.push_back(NewClone);
                assert(EI == Clone->CallerEdges.end() ||
                       Clone->AllocTypes != (uint8_t)AllocationType::None);
                assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
              }
              // Moving the caller edge may have resulted in some none type
              // callee edges.
              removeNoneTypeCalleeEdges(Clone);
              // We will handle the newly created callsite clone in a
              // subsequent iteration over this Node's Clones. Continue here
              // since we already adjusted iterator EI while moving the edge.
              continue;
            }

            // Otherwise, we can use the function clone already assigned to
            // this caller.
            if (!FuncCloneAssignedToCurCallsiteClone) {
              FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
              // Assign Clone to FuncCloneCalledByCaller.
              AssignCallsiteCloneToFuncClone(
                  FuncCloneCalledByCaller, Call, Clone,
                  AllocationCallToContextNodeMap.count(Call));
            } else
              // Don't need to do anything - callsite is already calling this
              // function clone.
              assert(FuncCloneAssignedToCurCallsiteClone ==
                     FuncCloneCalledByCaller);

          } else {
            // We have not already assigned this caller to a version of
            // OrigFunc. Do the assignment now.

            // First check if we have already assigned this callsite clone to
            // a clone of OrigFunc for another caller during this iteration
            // over its caller edges.
            if (!FuncCloneAssignedToCurCallsiteClone) {
              // Find first function in FuncClonesToCallMap without an
              // assigned clone of this callsite Node. We should always have
              // one available at this point due to the earlier cloning when
              // the FuncClonesToCallMap size was smaller than the clone
              // number.
              for (auto &CF : FuncClonesToCallMap) {
                if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
                  FuncCloneAssignedToCurCallsiteClone = CF.first;
                  break;
                }
              }
              assert(FuncCloneAssignedToCurCallsiteClone);
              // Assign Clone to FuncCloneAssignedToCurCallsiteClone.
              AssignCallsiteCloneToFuncClone(
                  FuncCloneAssignedToCurCallsiteClone, Call, Clone,
                  AllocationCallToContextNodeMap.count(Call));
            } else
              assert(FuncCloneToCurNodeCloneMap
                         [FuncCloneAssignedToCurCallsiteClone] == Clone);
            // Update callers to record function version called.
            RecordCalleeFuncOfCallsite(Edge->Caller,
                                       FuncCloneAssignedToCurCallsiteClone);
          }

          EI++;
        }
      }
      if (VerifyCCG) {
        checkNode<DerivedCCG, FuncTy, CallTy>(Node);
        for (const auto &PE : Node->CalleeEdges)
          checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
        for (const auto &CE : Node->CallerEdges)
          checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
        for (auto *Clone : Node->Clones) {
          checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
          for (const auto &PE : Clone->CalleeEdges)
            checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
          for (const auto &CE : Clone->CallerEdges)
            checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
        }
      }
    }
  }

  auto UpdateCalls = [&](ContextNode *Node,
                         DenseSet<const ContextNode *> &Visited,
                         auto &&UpdateCalls) {
    auto Inserted = Visited.insert(Node);
    if (!Inserted.second)
      return;

    for (auto *Clone : Node->Clones)
      UpdateCalls(Clone, Visited, UpdateCalls);

    for (auto &Edge : Node->CallerEdges)
      UpdateCalls(Edge->Caller, Visited, UpdateCalls);

    // Skip if either no call to update, or if we ended up with no context ids
    // (we moved all edges onto other clones).
    if (!Node->hasCall() || Node->ContextIds.empty())
      return;

    if (Node->IsAllocation) {
      updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
      return;
    }

    if (!CallsiteToCalleeFuncCloneMap.count(Node))
      return;

    auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
    updateCall(Node->Call, CalleeFunc);
  };

  // Performs DFS traversal starting from allocation nodes to update calls to
  // reflect cloning decisions recorded earlier. For regular LTO this will
  // update the actual calls in the IR to call the appropriate function clone
  // (and add attributes to allocation calls), whereas for ThinLTO the
  // decisions are recorded in the summary entries.
  DenseSet<const ContextNode *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap)
    UpdateCalls(Entry.second, Visited, UpdateCalls);

  return Changed;
}

static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
    Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
    std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
        &FuncToAliasMap) {
  // The first "clone" is the original copy, so we should only call this if we
  // needed to create new clones.
  assert(NumClones > 1);
  SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
  VMaps.reserve(NumClones - 1);
  FunctionsClonedThinBackend++;
  for (unsigned I = 1; I < NumClones; I++) {
    VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
    auto *NewF = CloneFunction(&F, *VMaps.back());
    FunctionClonesThinBackend++;
    // Strip memprof and callsite metadata from clone as they are no longer
    // needed.
    for (auto &BB : *NewF) {
      for (auto &Inst : BB) {
        Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
        Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
      }
    }
    std::string Name = getMemProfFuncName(F.getName(), I);
    auto *PrevF = M.getFunction(Name);
    if (PrevF) {
      // We might have created this when adjusting a callsite in another
      // function. It should be a declaration.
      assert(PrevF->isDeclaration());
      NewF->takeName(PrevF);
      PrevF->replaceAllUsesWith(NewF);
      PrevF->eraseFromParent();
    } else
      NewF->setName(Name);
    ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
             << "created clone " << ore::NV("NewFunction", NewF));

    // Now handle aliases to this function, and clone those as well.
    if (!FuncToAliasMap.count(&F))
      continue;
    for (auto *A : FuncToAliasMap[&F]) {
      std::string Name = getMemProfFuncName(A->getName(), I);
      auto *PrevA = M.getNamedAlias(Name);
      auto *NewA = GlobalAlias::create(A->getValueType(),
                                       A->getType()->getPointerAddressSpace(),
                                       A->getLinkage(), Name, NewF);
      NewA->copyAttributesFrom(A);
      if (PrevA) {
        // We might have created this when adjusting a callsite in another
        // function. It should be a declaration.
        assert(PrevA->isDeclaration());
        NewA->takeName(PrevA);
        PrevA->replaceAllUsesWith(NewA);
        PrevA->eraseFromParent();
      }
    }
  }
  return VMaps;
}
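
// Note on the PrevF handling above: a clone's definition may be created after
// some call in another function of this module was already redirected to it,
// in which case M.getFunction(Name) finds the declaration that
// getOrInsertFunction created (see CloneCallsite in applyImport below), and
// the new definition simply takes over that declaration's name and uses.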

// Locate the summary for F. This is complicated by the fact that it might
// have been internalized or promoted.
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
                                      const ModuleSummaryIndex *ImportSummary) {
  // FIXME: Ideally we would retain the original GUID in some fashion on the
  // function (e.g. as metadata), but for now do our best to locate the
  // summary without that information.
  ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
  if (!TheFnVI)
    // See if the function was internalized, by checking the index directly
    // with the original name (this avoids the name adjustment done by
    // getGUID() for internal symbols).
    TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
  if (TheFnVI)
    return TheFnVI;
  // Now query with the original name before any promotion was performed.
  StringRef OrigName =
      ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
  std::string OrigId = GlobalValue::getGlobalIdentifier(
      OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
  TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
  if (TheFnVI)
    return TheFnVI;
  // Could be a promoted local imported from another module. We need to pass
  // down more info here to find the original module id. For now, try with
  // the OrigName which might have been stored in the OidGuidMap in the
  // index. This would not work if there were same-named locals in multiple
  // modules, however.
  auto OrigGUID =
      ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
  if (OrigGUID)
    TheFnVI = ImportSummary->getValueInfo(OrigGUID);
  return TheFnVI;
}
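
// For example (names hypothetical): an internal function "foo" promoted
// during ThinLTO is renamed along the lines of "foo.llvm.<hash>";
// getOriginalNameBeforePromote recovers "foo", and the GUID is recomputed
// from the source-file-qualified identifier for the original local symbol so
// that its summary can still be located.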

bool MemProfContextDisambiguation::applyImport(Module &M) {
  assert(ImportSummary);
  bool Changed = false;

  auto IsMemProfClone = [](const Function &F) {
    return F.getName().contains(MemProfCloneSuffix);
  };

  // We also need to clone any aliases that reference cloned functions,
  // because the modified callsites may invoke via the alias. Keep track of
  // the aliases for each function.
  std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
      FuncToAliasMap;
  for (auto &A : M.aliases()) {
    auto *Aliasee = A.getAliaseeObject();
    if (auto *F = dyn_cast<Function>(Aliasee))
      FuncToAliasMap[F].insert(&A);
  }

  for (auto &F : M) {
    if (F.isDeclaration() || IsMemProfClone(F))
      continue;

    OptimizationRemarkEmitter ORE(&F);

    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
    bool ClonesCreated = false;
    unsigned NumClonesCreated = 0;
    auto CloneFuncIfNeeded = [&](unsigned NumClones) {
      // We should at least have version 0 which is the original copy.
      assert(NumClones > 0);
      // If only one copy is needed, use the original.
      if (NumClones == 1)
        return;
      // If we already performed cloning of this function, confirm that the
      // requested number of clones matches (the thin link should ensure the
      // number of clones for each constituent callsite is consistent within
      // each function), before returning.
      if (ClonesCreated) {
        assert(NumClonesCreated == NumClones);
        return;
      }
      VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
      // The first "clone" is the original copy, which doesn't have a VMap.
      assert(VMaps.size() == NumClones - 1);
      Changed = true;
      ClonesCreated = true;
      NumClonesCreated = NumClones;
    };

    auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
                             Function *CalledFunction) {
      // Perform cloning if not yet done.
      CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());

      // Should have skipped indirect calls via mayHaveMemprofSummary.
      assert(CalledFunction);
      assert(!IsMemProfClone(*CalledFunction));

      // Update the calls per the summary info.
      // Save orig name since it gets updated in the first iteration
      // below.
      auto CalleeOrigName = CalledFunction->getName();
      for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
        // Do nothing if this version calls the original version of its
        // callee.
        if (!StackNode.Clones[J])
          continue;
        auto NewF = M.getOrInsertFunction(
            getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
            CalledFunction->getFunctionType());
        CallBase *CBClone;
        // Copy 0 is the original function.
        if (!J)
          CBClone = CB;
        else
          CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
        CBClone->setCalledFunction(NewF);
        ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                 << ore::NV("Call", CBClone) << " in clone "
                 << ore::NV("Caller", CBClone->getFunction())
                 << " assigned to call function clone "
                 << ore::NV("Callee", NewF.getCallee()));
      }
    };
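
    // E.g. with StackNode.Clones == [0, 2], version 0 of this function keeps
    // calling the original callee, while function clone 1 is redirected to
    // the callee's clone 2 (named via getMemProfFuncName); a declaration for
    // that clone is created if its definition has not been materialized in
    // this module yet.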

    // Locate the summary for F.
    ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
    // If not found, this could be an imported local (see comment in
    // findValueInfoForFunc). Skip for now as it will be cloned in its
    // original module (where it would have been promoted to global scope so
    // should satisfy any reference in this module).
    if (!TheFnVI)
      continue;

    auto *GVSummary =
        ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
    if (!GVSummary)
      // Must have been imported, use the first summary (might be multiple if
      // this was a linkonce_odr).
      GVSummary = TheFnVI.getSummaryList().front().get();

    // If this was an imported alias skip it as we won't have the function
    // summary, and it should be cloned in the original module.
    if (isa<AliasSummary>(GVSummary))
      continue;

    auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());

    if (FS->allocs().empty() && FS->callsites().empty())
      continue;

    auto SI = FS->callsites().begin();
    auto AI = FS->allocs().begin();

    // To handle callsite infos synthesized for tail calls which have missing
    // frames in the profiled context, map callee VI to the synthesized
    // callsite info.
    DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
    // Iterate the callsites for this function in reverse, since we place all
    // those synthesized for tail calls at the end.
    for (auto CallsiteIt = FS->callsites().rbegin();
         CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
      auto &Callsite = *CallsiteIt;
      // Stop as soon as we see a non-synthesized callsite info (see comment
      // above loop). All the entries added for discovered tail calls have
      // empty stack ids.
      if (!Callsite.StackIdIndices.empty())
        break;
      MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
    }
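
    // E.g. if the profiled context contained a frame for a call to bar()
    // that was tail called and is therefore missing from the IR call chain,
    // the summary holds a synthesized CallsiteInfo with empty StackIdIndices
    // and Callee == bar's ValueInfo; the map above lets the tail call
    // handling further below find it by callee.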

    // Assume for now that the instructions are in the exact same order
    // as when the summary was created, but confirm this is correct by
    // matching the stack ids.
    for (auto &BB : F) {
      for (auto &I : BB) {
        auto *CB = dyn_cast<CallBase>(&I);
        // Same handling as when creating module summary.
        if (!mayHaveMemprofSummary(CB))
          continue;

        auto *CalledValue = CB->getCalledOperand();
        auto *CalledFunction = CB->getCalledFunction();
        if (CalledValue && !CalledFunction) {
          CalledValue = CalledValue->stripPointerCasts();
          // Stripping pointer casts can reveal a called function.
          CalledFunction = dyn_cast<Function>(CalledValue);
        }
        // Check if this is an alias to a function. If so, get the
        // called aliasee for the checks below.
        if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
          assert(!CalledFunction &&
                 "Expected null called function in callsite for alias");
          CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
        }

        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
            I.getMetadata(LLVMContext::MD_callsite));
        auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);

        // Include allocs that were already assigned a memprof function
        // attribute in the statistics.
        if (CB->getAttributes().hasFnAttr("memprof")) {
          assert(!MemProfMD);
          CB->getAttributes().getFnAttr("memprof").getValueAsString() ==
                  "cold"
              ? AllocTypeColdThinBackend++
              : AllocTypeNotColdThinBackend++;
          OrigAllocsThinBackend++;
          AllocVersionsThinBackend++;
          if (!MaxAllocVersionsThinBackend)
            MaxAllocVersionsThinBackend = 1;
          // Remove any remaining callsite metadata; we can skip the rest of
          // the handling for this instruction, since no cloning is needed.
          I.setMetadata(LLVMContext::MD_callsite, nullptr);
          continue;
        }

        if (MemProfMD) {
          // Consult the next alloc node.
          assert(AI != FS->allocs().end());
          auto &AllocNode = *(AI++);

          // Sanity check that the MIB stack ids match between the summary
          // and instruction metadata.
          auto MIBIter = AllocNode.MIBs.begin();
          for (auto &MDOp : MemProfMD->operands()) {
            assert(MIBIter != AllocNode.MIBs.end());
            LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
                MIBIter->StackIdIndices.begin();
            auto *MIBMD = cast<const MDNode>(MDOp);
            MDNode *StackMDNode = getMIBStackNode(MIBMD);
            assert(StackMDNode);
            SmallVector<unsigned> StackIdsFromMetadata;
            CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
            for (auto ContextIter =
                     StackContext.beginAfterSharedPrefix(CallsiteContext);
                 ContextIter != StackContext.end(); ++ContextIter) {
              // If this is a direct recursion, simply skip the duplicate
              // entries, to be consistent with how the summary ids were
              // generated during ModuleSummaryAnalysis.
              if (!StackIdsFromMetadata.empty() &&
                  StackIdsFromMetadata.back() == *ContextIter)
                continue;
              assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
              assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                     *ContextIter);
              StackIdIndexIter++;
            }
            MIBIter++;
          }

          // Perform cloning if not yet done.
          CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());

          OrigAllocsThinBackend++;
          AllocVersionsThinBackend += AllocNode.Versions.size();
          if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
            MaxAllocVersionsThinBackend = AllocNode.Versions.size();
          // If there is only one version, that means we didn't end up
          // considering this function for cloning, in which case the alloc
          // will still be None type or should have gotten the default
          // NotCold. Skip it here, but only after invoking the clone helper
          // above, since that performs sanity checks confirming we haven't
          // already decided that cloning is needed.
          if (AllocNode.Versions.size() == 1) {
            assert((AllocationType)AllocNode.Versions[0] ==
                       AllocationType::NotCold ||
                   (AllocationType)AllocNode.Versions[0] ==
                       AllocationType::None);
            UnclonableAllocsThinBackend++;
            continue;
          }

          // All versions should have a singular allocation type.
          assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
            return Type == ((uint8_t)AllocationType::NotCold |
                            (uint8_t)AllocationType::Cold);
          }));

          // Update the allocation types per the summary info.
          for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
            // Ignore any that didn't get an assigned allocation type.
            if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
              continue;
            AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
            AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
                                            : AllocTypeNotColdThinBackend++;
            std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
            auto A = llvm::Attribute::get(F.getContext(), "memprof",
                                          AllocTypeString);
            CallBase *CBClone;
            // Copy 0 is the original function.
            if (!J)
              CBClone = CB;
            else
              // Since VMaps are only created for new clones, we index with
              // clone J-1 (J==0 is the original clone and does not have a
              // VMaps entry).
              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
            CBClone->addFnAttr(A);
            ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute",
                                        CBClone)
                     << ore::NV("AllocationCall", CBClone) << " in clone "
                     << ore::NV("Caller", CBClone->getFunction())
                     << " marked with memprof allocation attribute "
                     << ore::NV("Attribute", AllocTypeString));
          }
        } else if (!CallsiteContext.empty()) {
          // Consult the next callsite node.
          assert(SI != FS->callsites().end());
          auto &StackNode = *(SI++);

#ifndef NDEBUG
          // Sanity check that the stack ids match between the summary and
          // instruction metadata.
          auto StackIdIndexIter = StackNode.StackIdIndices.begin();
          for (auto StackId : CallsiteContext) {
            assert(StackIdIndexIter != StackNode.StackIdIndices.end());
            assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                   StackId);
            StackIdIndexIter++;
          }
#endif

          CloneCallsite(StackNode, CB, CalledFunction);
        } else if (CB->isTailCall()) {
          // Locate the synthesized callsite info for the callee VI, if any
          // was created, and use that for cloning.
          ValueInfo CalleeVI =
              findValueInfoForFunc(*CalledFunction, M, ImportSummary);
          if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
            auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
            assert(Callsite != MapTailCallCalleeVIToCallsite.end());
            CloneCallsite(Callsite->second, CB, CalledFunction);
          }
        }
        // Memprof and callsite metadata on memory allocations are no longer
        // needed.
        I.setMetadata(LLVMContext::MD_memprof, nullptr);
        I.setMetadata(LLVMContext::MD_callsite, nullptr);
      }
    }
  }

  return Changed;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
  if (DumpCCG) {
    dbgs() << "CCG before cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("postbuild");

  if (VerifyCCG) {
    check();
  }

  identifyClones();

  if (VerifyCCG) {
    check();
  }

  if (DumpCCG) {
    dbgs() << "CCG after cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("cloned");

  bool Changed = assignFunctions();

  if (DumpCCG) {
    dbgs() << "CCG after assigning function clones:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("clonefuncassign");

  return Changed;
}

bool MemProfContextDisambiguation::processModule(
    Module &M,
    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {

  // If we have an import summary, then the cloning decisions were made during
  // the thin link on the index. Apply them and return.
  if (ImportSummary)
    return applyImport(M);

  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // Note that we specifically check this after applying imports above, so
  // that the option does not need to be passed to distributed ThinLTO backend
  // clang processes, which won't necessarily have visibility into the linker
  // dependences. Instead the information is communicated from the LTO link to
  // the backends via the combined summary index.
  if (!SupportsHotColdNew)
    return false;

  ModuleCallsiteContextGraph CCG(M, OREGetter);
  return CCG.process();
}

MemProfContextDisambiguation::MemProfContextDisambiguation(
    const ModuleSummaryIndex *Summary)
    : ImportSummary(Summary) {
  if (ImportSummary) {
    // The MemProfImportSummary should only be used for testing ThinLTO
    // distributed backend handling via opt, in which case we don't have a
    // summary from the pass pipeline.
    assert(MemProfImportSummary.empty());
    return;
  }
  if (MemProfImportSummary.empty())
    return;

  auto ReadSummaryFile =
      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
  if (!ReadSummaryFile) {
    logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
                          "Error loading file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
  if (!ImportSummaryForTestingOrErr) {
    logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
                          "Error parsing file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
  ImportSummary = ImportSummaryForTesting.get();
}

PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
                                                    ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };
  if (!processModule(M, OREGetter))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}

void MemProfContextDisambiguation::run(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing) {
  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // The index was set from the option, so these should be in sync.
  assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
  if (!SupportsHotColdNew)
    return;

  IndexCallsiteContextGraph CCG(Index, isPrevailing);
  CCG.process();
}