1 //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements support for context disambiguation of allocation
10 // calls for profile-guided heap optimization. Specifically, it uses MemProf
11 // profiles which indicate context-specific allocation behavior (currently
12 // distinguishing cold vs hot memory allocations). Cloning is performed to
13 // expose the cold allocation call contexts, and the allocation calls are
14 // subsequently annotated with an attribute for later transformation.
15 //
16 // The transformations can be performed either directly on IR (regular LTO), or
17 // on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18 // Both types of LTO operate on the same base graph representation, which
19 // uses CRTP to support either IR or Index formats.
20 //
21 //===----------------------------------------------------------------------===//
22 
23 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/ADT/DenseSet.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/ADT/SetOperations.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallSet.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/Analysis/MemoryProfileInfo.h"
33 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
34 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
35 #include "llvm/Bitcode/BitcodeReader.h"
36 #include "llvm/IR/Constants.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/Module.h"
39 #include "llvm/IR/ModuleSummaryIndex.h"
40 #include "llvm/Pass.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/FileSystem.h"
43 #include "llvm/Support/GraphWriter.h"
44 #include "llvm/Support/raw_ostream.h"
45 #include "llvm/Transforms/IPO.h"
46 #include "llvm/Transforms/Utils/Cloning.h"
47 #include <sstream>
48 #include <unordered_map>
49 #include <vector>
50 using namespace llvm;
51 using namespace llvm::memprof;
52 
53 #define DEBUG_TYPE "memprof-context-disambiguation"
54 
55 STATISTIC(FunctionClonesAnalysis,
56           "Number of function clones created during whole program analysis");
57 STATISTIC(FunctionClonesThinBackend,
58           "Number of function clones created during ThinLTO backend");
59 STATISTIC(FunctionsClonedThinBackend,
60           "Number of functions that had clones created during ThinLTO backend");
61 STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
62                             "cloned) during whole program analysis");
63 STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
64                          "during whole program analysis");
65 STATISTIC(AllocTypeNotColdThinBackend,
66           "Number of not cold static allocations (possibly cloned) during "
67           "ThinLTO backend");
68 STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
69                                     "(possibly cloned) during ThinLTO backend");
70 STATISTIC(OrigAllocsThinBackend,
71           "Number of original (not cloned) allocations with memprof profiles "
72           "during ThinLTO backend");
73 STATISTIC(
74     AllocVersionsThinBackend,
75     "Number of allocation versions (including clones) during ThinLTO backend");
76 STATISTIC(MaxAllocVersionsThinBackend,
77           "Maximum number of allocation versions created for an original "
78           "allocation during ThinLTO backend");
79 STATISTIC(UnclonableAllocsThinBackend,
80           "Number of unclonable ambigous allocations during ThinLTO backend");
81 STATISTIC(RemovedEdgesWithMismatchedCallees,
82           "Number of edges removed due to mismatched callees (profiled vs IR)");
83 STATISTIC(FoundProfiledCalleeCount,
84           "Number of profiled callees found via tail calls");
85 STATISTIC(FoundProfiledCalleeDepth,
86           "Aggregate depth of profiled callees found via tail calls");
87 STATISTIC(FoundProfiledCalleeMaxDepth,
88           "Maximum depth of profiled callees found via tail calls");
89 STATISTIC(FoundProfiledCalleeNonUniquelyCount,
90           "Number of profiled callees found via multiple tail call chains");
91 
92 static cl::opt<std::string> DotFilePathPrefix(
93     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
94     cl::value_desc("filename"),
95     cl::desc("Specify the path prefix of the MemProf dot files."));
96 
97 static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
98                                  cl::Hidden,
99                                  cl::desc("Export graph to dot files."));
100 
101 static cl::opt<bool>
102     DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
103             cl::desc("Dump CallingContextGraph to stdout after each stage."));
104 
105 static cl::opt<bool>
106     VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
107               cl::desc("Perform verification checks on CallingContextGraph."));
108 
109 static cl::opt<bool>
110     VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
111                 cl::desc("Perform frequent verification checks on nodes."));
112 
113 static cl::opt<std::string> MemProfImportSummary(
114     "memprof-import-summary",
115     cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
116     cl::Hidden);
117 
118 static cl::opt<unsigned>
119     TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
120                         cl::Hidden,
121                         cl::desc("Max depth to recursively search for missing "
122                                  "frames through tail calls."));
123 
124 namespace llvm {
125 // Indicate we are linking with an allocator that supports hot/cold operator
126 // new interfaces.
127 cl::opt<bool> SupportsHotColdNew(
128     "supports-hot-cold-new", cl::init(false), cl::Hidden,
129     cl::desc("Linking with hot/cold operator new interfaces"));
130 } // namespace llvm
131 
132 namespace {
133 /// CRTP base for graphs built from either IR or ThinLTO summary index.
134 ///
135 /// The graph represents the call contexts in all memprof metadata on allocation
136 /// calls, with nodes for the allocations themselves, as well as for the calls
137 /// in each context. The graph is initially built from the allocation memprof
138 /// metadata (or summary) MIBs. Calls with callsite metadata are then matched
139 /// onto the graph's nodes, and the graph is updated to reflect any inlining
140 /// performed on those calls.
141 ///
142 /// Each MIB (representing an allocation's call context with allocation
143 /// behavior) is assigned a unique context id during the graph build. The edges
144 /// and nodes in the graph are decorated with the context ids they carry. This
145 /// is used to correctly update the graph when cloning is performed so that we
146 /// can uniquify the context for a single (possibly cloned) allocation.
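///
/// As a simplified illustration of the IR input (stack ids are hypothetical),
/// an allocation call may carry metadata along these lines:
///
///   %call = call ptr @_Znwm(i64 8), !memprof !0, !callsite !5
///   !0 = !{!1, !3}                     ; list of MIBs
///   !1 = !{!2, !"cold"}                ; MIB: call stack ids + alloc type
///   !2 = !{i64 123, i64 456, i64 789}
///   !3 = !{!4, !"notcold"}
///   !4 = !{i64 123, i64 456, i64 999}
///   !5 = !{i64 123}                    ; inlined stack ids of the alloc call
///
/// Each MIB above becomes one context id, and the nodes and edges created for
/// its stack ids are tagged with that id and the MIB's allocation type.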
147 template <typename DerivedCCG, typename FuncTy, typename CallTy>
148 class CallsiteContextGraph {
149 public:
150   CallsiteContextGraph() = default;
151   CallsiteContextGraph(const CallsiteContextGraph &) = default;
152   CallsiteContextGraph(CallsiteContextGraph &&) = default;
153 
154   /// Main entry point to perform analysis and transformations on graph.
155   bool process();
156 
157   /// Perform cloning on the graph necessary to uniquely identify the allocation
158   /// behavior of an allocation based on its context.
159   void identifyClones();
160 
161   /// Assign callsite clones to functions, cloning functions as needed to
162   /// accommodate the combinations of their callsite clones reached by callers.
163   /// For regular LTO this clones functions and callsites in the IR, but for
164   /// ThinLTO the cloning decisions are noted in the summaries and later applied
165   /// in applyImport.
166   bool assignFunctions();
167 
168   void dump() const;
169   void print(raw_ostream &OS) const;
170 
171   friend raw_ostream &operator<<(raw_ostream &OS,
172                                  const CallsiteContextGraph &CCG) {
173     CCG.print(OS);
174     return OS;
175   }
176 
177   friend struct GraphTraits<
178       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
179   friend struct DOTGraphTraits<
180       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
181 
182   void exportToDot(std::string Label) const;
183 
184   /// Represents a function clone via FuncTy pointer and clone number pair.
185   struct FuncInfo final
186       : public std::pair<FuncTy *, unsigned /*Clone number*/> {
187     using Base = std::pair<FuncTy *, unsigned>;
188     FuncInfo(const Base &B) : Base(B) {}
189     FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
190     explicit operator bool() const { return this->first != nullptr; }
191     FuncTy *func() const { return this->first; }
192     unsigned cloneNo() const { return this->second; }
193   };
194 
195   /// Represents a callsite clone via CallTy and clone number pair.
196   struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
197     using Base = std::pair<CallTy, unsigned>;
198     CallInfo(const Base &B) : Base(B) {}
199     CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
200         : Base(Call, CloneNo) {}
201     explicit operator bool() const { return (bool)this->first; }
202     CallTy call() const { return this->first; }
203     unsigned cloneNo() const { return this->second; }
204     void setCloneNo(unsigned N) { this->second = N; }
205     void print(raw_ostream &OS) const {
206       if (!operator bool()) {
207         assert(!cloneNo());
208         OS << "null Call";
209         return;
210       }
211       call()->print(OS);
212       OS << "\t(clone " << cloneNo() << ")";
213     }
214     void dump() const {
215       print(dbgs());
216       dbgs() << "\n";
217     }
218     friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
219       Call.print(OS);
220       return OS;
221     }
222   };
223 
224   struct ContextEdge;
225 
226   /// Node in the Callsite Context Graph
227   struct ContextNode {
228     // Keep this for now since, in the IR case where Call holds an
229     // Instruction *, it is not immediately discoverable from the call itself.
230     // Used for printing richer information when dumping the graph.
231     bool IsAllocation;
232 
233     // Keeps track of when the Call was reset to null because there was
234     // recursion.
235     bool Recursive = false;
236 
237     // The corresponding allocation or interior call.
238     CallInfo Call;
239 
240     // For alloc nodes this is a unique id assigned when constructed, and for
241     // callsite stack nodes it is the original stack id when the node is
242     // constructed from the memprof MIB metadata on the alloc nodes. Note that
243     // this is only used when matching callsite metadata onto the stack nodes
244     // created when processing the allocation memprof MIBs, and for labeling
245     // nodes in the dot graph. Therefore we don't bother to assign a value for
246     // clones.
247     uint64_t OrigStackOrAllocId = 0;
248 
249     // This will be formed by ORing together the AllocationType enum values
250     // for contexts including this node.
251     uint8_t AllocTypes = 0;
252 
253     // Edges to all callees in the profiled call stacks.
254     // TODO: Should this be a map (from Callee node) for more efficient lookup?
255     std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
256 
257     // Edges to all callers in the profiled call stacks.
258     // TODO: Should this be a map (from Caller node) for more efficient lookup?
259     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
260 
261     // The set of IDs for contexts including this node.
262     DenseSet<uint32_t> ContextIds;
263 
264     // List of clones of this ContextNode, initially empty.
265     std::vector<ContextNode *> Clones;
266 
267     // If a clone, points to the original uncloned node.
268     ContextNode *CloneOf = nullptr;
269 
270     ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
271 
272     ContextNode(bool IsAllocation, CallInfo C)
273         : IsAllocation(IsAllocation), Call(C) {}
274 
275     void addClone(ContextNode *Clone) {
276       if (CloneOf) {
277         CloneOf->Clones.push_back(Clone);
278         Clone->CloneOf = CloneOf;
279       } else {
280         Clones.push_back(Clone);
281         assert(!Clone->CloneOf);
282         Clone->CloneOf = this;
283       }
284     }
285 
286     ContextNode *getOrigNode() {
287       if (!CloneOf)
288         return this;
289       return CloneOf;
290     }
291 
292     void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
293                                unsigned int ContextId);
294 
295     ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
296     ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
297     void eraseCalleeEdge(const ContextEdge *Edge);
298     void eraseCallerEdge(const ContextEdge *Edge);
299 
300     void setCall(CallInfo C) { Call = C; }
301 
302     bool hasCall() const { return (bool)Call.call(); }
303 
304     void printCall(raw_ostream &OS) const { Call.print(OS); }
305 
306     // True if this node was effectively removed from the graph, in which case
307     // its context id set, caller edges, and callee edges should all be empty.
308     bool isRemoved() const {
309       assert(ContextIds.empty() ==
310              (CalleeEdges.empty() && CallerEdges.empty()));
311       return ContextIds.empty();
312     }
313 
314     void dump() const;
315     void print(raw_ostream &OS) const;
316 
317     friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
318       Node.print(OS);
319       return OS;
320     }
321   };
322 
323   /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
324   /// callee.
325   struct ContextEdge {
326     ContextNode *Callee;
327     ContextNode *Caller;
328 
329     // This will be formed by ORing together the AllocationType enum values
330     // for contexts including this edge.
331     uint8_t AllocTypes = 0;
332 
333     // The set of IDs for contexts including this edge.
334     DenseSet<uint32_t> ContextIds;
335 
336     ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
337                 DenseSet<uint32_t> ContextIds)
338         : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
339           ContextIds(ContextIds) {}
340 
341     DenseSet<uint32_t> &getContextIds() { return ContextIds; }
342 
343     void dump() const;
344     void print(raw_ostream &OS) const;
345 
346     friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
347       Edge.print(OS);
348       return OS;
349     }
350   };
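
  // As an illustrative (made-up) example of the decorations above: suppose two
  // contexts reach the same allocation, context 1 (cold) via main -> foo ->
  // alloc and context 2 (notcold) via bar -> foo -> alloc. The allocation node
  // and the node for foo then both carry ContextIds {1, 2} and AllocTypes
  // Cold|NotCold, while the edge between foo (callee) and main (caller)
  // carries only {1}/Cold and the edge between foo and bar carries only
  // {2}/NotCold. Cloning foo per caller lets each allocation clone end up
  // with a single allocation type.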
351 
352   /// Helper to remove callee edges that have allocation type None (due to not
353   /// carrying any context ids) after transformations.
354   void removeNoneTypeCalleeEdges(ContextNode *Node);
355 
356 protected:
357   /// Get a list of nodes corresponding to the stack ids in the given callsite
358   /// context.
359   template <class NodeT, class IteratorT>
360   std::vector<uint64_t>
361   getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
362 
363   /// Adds nodes for the given allocation and any stack ids on its memprof MIB
364   /// metadata (or summary).
365   ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
366 
367   /// Adds nodes for the given MIB stack ids.
368   template <class NodeT, class IteratorT>
369   void addStackNodesForMIB(ContextNode *AllocNode,
370                            CallStack<NodeT, IteratorT> &StackContext,
371                            CallStack<NodeT, IteratorT> &CallsiteContext,
372                            AllocationType AllocType);
373 
374   /// Matches all callsite metadata (or summary) to the nodes created for
375   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
376   /// inlining performed on those callsite instructions.
377   void updateStackNodes();
378 
379   /// Update graph to conservatively handle any callsite stack nodes that target
380   /// multiple different callee target functions.
381   void handleCallsitesWithMultipleTargets();
382 
383   /// Save lists of calls with MemProf metadata in each function, for faster
384   /// iteration.
385   MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
386 
387   /// Map from callsite node to the enclosing caller function.
388   std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
389 
390 private:
391   using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
392 
393   using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
394                                      const FuncTy *, DenseSet<uint32_t>>;
395 
396   /// Assigns the given Node to calls at or inlined into the location with
397   /// the Node's stack id, after post order traversing and processing its
398   /// caller nodes. Uses the call information recorded in the given
399   /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
400   /// as needed. Called by updateStackNodes which sets up the given
401   /// StackIdToMatchingCalls map.
402   void assignStackNodesPostOrder(
403       ContextNode *Node, DenseSet<const ContextNode *> &Visited,
404       DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
405 
406   /// Duplicates the given set of context ids, updating the provided
407   /// map from each original id with the newly generated context ids,
408   /// and returning the new duplicated id set.
409   DenseSet<uint32_t> duplicateContextIds(
410       const DenseSet<uint32_t> &StackSequenceContextIds,
411       DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
412 
413   /// Propagates all duplicated context ids across the graph.
414   void propagateDuplicateContextIds(
415       const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
416 
417   /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
418   /// else to its callers. Also updates OrigNode's edges to remove any context
419   /// ids moved to the newly created edge.
420   void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
421                       bool TowardsCallee);
422 
423   /// Get the stack id corresponding to the given Id or Index (for IR this will
424   /// return itself, for a summary index this will return the id recorded in the
425   /// index for that stack id index value).
426   uint64_t getStackId(uint64_t IdOrIndex) const {
427     return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
428   }
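
  // The call above is the usual CRTP static dispatch. A minimal, standalone
  // sketch of the pattern (Base/Derived are illustrative names, not part of
  // this file):
  //
  //   template <typename DerivedT> struct Base {
  //     uint64_t getStackId(uint64_t Id) const {
  //       return static_cast<const DerivedT *>(this)->getStackId(Id);
  //     }
  //   };
  //   struct Derived : Base<Derived> {
  //     uint64_t getStackId(uint64_t Id) const { return Id; }
  //   };
  //
  // ModuleCallsiteContextGraph and IndexCallsiteContextGraph (defined later)
  // play the role of Derived, supplying the IR- and index-specific behavior.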
429 
430   /// Returns true if the given call targets the callee of the given edge, or if
431   /// we were able to identify the call chain through intermediate tail calls.
432   /// In the latter case new context nodes are added to the graph for the
433   /// identified tail calls, and their synthesized nodes are added to
434   /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the
435   /// next element after the input position (either incremented or updated after
436   /// removing the old edge).
437   bool
438   calleesMatch(CallTy Call, EdgeIter &EI,
439                MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
440 
441   /// Returns true if the given call targets the given function, or if we were
442   /// able to identify the call chain through intermediate tail calls (in which
443   /// case FoundCalleeChain will be populated).
444   bool calleeMatchesFunc(
445       CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
446       std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
447     return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
448         Call, Func, CallerFunc, FoundCalleeChain);
449   }
450 
451   /// Get a list of nodes corresponding to the stack ids in the given
452   /// callsite's context.
453   std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
454     return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
455         Call);
456   }
457 
458   /// Get the last stack id in the context for callsite.
459   uint64_t getLastStackId(CallTy Call) {
460     return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
461   }
462 
463   /// Update the allocation call to record type of allocated memory.
464   void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
465     AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
466     static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
467   }
468 
469   /// Update non-allocation call to invoke (possibly cloned) function
470   /// CalleeFunc.
471   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
472     static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
473   }
474 
475   /// Clone the given function for the given callsite, recording mapping of all
476   /// of the functions tracked calls to their new versions in the CallMap.
477   /// Assigns new clones to clone number CloneNo.
478   FuncInfo cloneFunctionForCallsite(
479       FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
480       std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
481     return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
482         Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
483   }
484 
485   /// Gets a label to use in the dot graph for the given call clone in the given
486   /// function.
487   std::string getLabel(const FuncTy *Func, const CallTy Call,
488                        unsigned CloneNo) const {
489     return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
490   }
491 
492   /// Helpers to find the node corresponding to the given call or stackid.
493   ContextNode *getNodeForInst(const CallInfo &C);
494   ContextNode *getNodeForAlloc(const CallInfo &C);
495   ContextNode *getNodeForStackId(uint64_t StackId);
496 
497   /// Removes the node information recorded for the given call.
498   void unsetNodeForInst(const CallInfo &C);
499 
500   /// Computes the alloc type corresponding to the given context ids, by
501   /// unioning their recorded alloc types.
502   uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
503 
504   /// Returns the allocation type of the intersection of the contexts of two
505   /// nodes (based on their provided context id sets), optimized for the case
506   /// when Node1Ids is smaller than Node2Ids.
507   uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
508                                   const DenseSet<uint32_t> &Node2Ids);
509 
510   /// Returns the allocation type of the intersection of the contexts of two
511   /// nodes (based on their provided context id sets).
512   uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
513                               const DenseSet<uint32_t> &Node2Ids);
514 
515   /// Create a clone of Edge's callee and move Edge to that new callee node,
516   /// performing the necessary context id and allocation type updates.
517   /// If callee's caller edge iterator is supplied, it is updated when removing
518   /// the edge from that list.
519   ContextNode *
520   moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
521                            EdgeIter *CallerEdgeI = nullptr);
522 
523   /// Change the callee of Edge to existing callee clone NewCallee, performing
524   /// the necessary context id and allocation type updates.
525   /// If callee's caller edge iterator is supplied, it is updated when removing
526   /// the edge from that list.
527   void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
528                                      ContextNode *NewCallee,
529                                      EdgeIter *CallerEdgeI = nullptr,
530                                      bool NewClone = false);
531 
532   /// Recursively perform cloning on the graph for the given Node and its
533   /// callers, in order to uniquely identify the allocation behavior of an
534   /// allocation given its context.
535   void identifyClones(ContextNode *Node,
536                       DenseSet<const ContextNode *> &Visited);
537 
538   /// Map from each context ID to the AllocationType assigned to that context.
539   std::map<uint32_t, AllocationType> ContextIdToAllocationType;
540 
541   /// Identifies the context node created for a stack id when adding the MIB
542   /// contexts to the graph. This is used to locate the context nodes when
543   /// trying to assign the corresponding callsites with those stack ids to these
544   /// nodes.
545   std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
546 
547   /// Maps to track the calls to their corresponding nodes in the graph.
548   MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
549   MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
550 
551   /// Owner of all ContextNode unique_ptrs.
552   std::vector<std::unique_ptr<ContextNode>> NodeOwner;
553 
554   /// Perform sanity checks on graph when requested.
555   void check() const;
556 
557   /// Keeps track of the last unique context id assigned.
558   unsigned int LastContextId = 0;
559 };
560 
561 template <typename DerivedCCG, typename FuncTy, typename CallTy>
562 using ContextNode =
563     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
564 template <typename DerivedCCG, typename FuncTy, typename CallTy>
565 using ContextEdge =
566     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
567 template <typename DerivedCCG, typename FuncTy, typename CallTy>
568 using FuncInfo =
569     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
570 template <typename DerivedCCG, typename FuncTy, typename CallTy>
571 using CallInfo =
572     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
573 
574 /// CRTP derived class for graphs built from IR (regular LTO).
575 class ModuleCallsiteContextGraph
576     : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
577                                   Instruction *> {
578 public:
579   ModuleCallsiteContextGraph(
580       Module &M,
581       llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
582 
583 private:
584   friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
585                               Instruction *>;
586 
587   uint64_t getStackId(uint64_t IdOrIndex) const;
588   bool calleeMatchesFunc(
589       Instruction *Call, const Function *Func, const Function *CallerFunc,
590       std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
591   bool findProfiledCalleeThroughTailCalls(
592       const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
593       std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
594       bool &FoundMultipleCalleeChains);
595   uint64_t getLastStackId(Instruction *Call);
596   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
597   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
598   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
599   CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
600                        Instruction *>::FuncInfo
601   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
602                            std::map<CallInfo, CallInfo> &CallMap,
603                            std::vector<CallInfo> &CallsWithMetadataInFunc,
604                            unsigned CloneNo);
605   std::string getLabel(const Function *Func, const Instruction *Call,
606                        unsigned CloneNo) const;
607 
608   const Module &Mod;
609   llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
610 };
611 
612 /// Represents a call in the summary index graph, which can either be an
613 /// allocation or an interior callsite node in an allocation's context.
614 /// Holds a pointer to the corresponding data structure in the index.
615 struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
616   IndexCall() : PointerUnion() {}
617   IndexCall(std::nullptr_t) : IndexCall() {}
618   IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
619   IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
620   IndexCall(PointerUnion PT) : PointerUnion(PT) {}
621 
622   IndexCall *operator->() { return this; }
623 
624   PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
625 
626   void print(raw_ostream &OS) const {
627     if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
628       OS << *AI;
629     } else {
630       auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
631       assert(CI);
632       OS << *CI;
633     }
634   }
635 };
636 
637 /// CRTP derived class for graphs built from summary index (ThinLTO).
638 class IndexCallsiteContextGraph
639     : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
640                                   IndexCall> {
641 public:
642   IndexCallsiteContextGraph(
643       ModuleSummaryIndex &Index,
644       llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
645           isPrevailing);
646 
647   ~IndexCallsiteContextGraph() {
648     // Now that we are done with the graph it is safe to add the new
649     // CallsiteInfo structs to the function summary vectors. The graph nodes
650     // point into locations within these vectors, so we don't want to add them
651     // any earlier.
652     for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
653       auto *FS = I.first;
654       for (auto &Callsite : I.second)
655         FS->addCallsite(*Callsite.second);
656     }
657   }
658 
659 private:
660   friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
661                               IndexCall>;
662 
663   uint64_t getStackId(uint64_t IdOrIndex) const;
664   bool calleeMatchesFunc(
665       IndexCall &Call, const FunctionSummary *Func,
666       const FunctionSummary *CallerFunc,
667       std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
668   bool findProfiledCalleeThroughTailCalls(
669       ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
670       std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
671       bool &FoundMultipleCalleeChains);
672   uint64_t getLastStackId(IndexCall &Call);
673   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
674   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
675   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
676   CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
677                        IndexCall>::FuncInfo
678   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
679                            std::map<CallInfo, CallInfo> &CallMap,
680                            std::vector<CallInfo> &CallsWithMetadataInFunc,
681                            unsigned CloneNo);
682   std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
683                        unsigned CloneNo) const;
684 
685   // Saves the mapping from each function summary containing memprof records
686   // back to its ValueInfo, for use in checking and debugging.
687   std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
688 
689   const ModuleSummaryIndex &Index;
690   llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
691       isPrevailing;
692 
693   // Saves/owns the callsite info structures synthesized for missing tail call
694   // frames that we discover while building the graph.
695   // It maps from the summary of the function making the tail call, to a map
696   // of callee ValueInfo to corresponding synthesized callsite info.
697   std::unordered_map<FunctionSummary *,
698                      std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
699       FunctionCalleesToSynthesizedCallsiteInfos;
700 };
701 } // namespace
702 
703 namespace llvm {
704 template <>
705 struct DenseMapInfo<typename CallsiteContextGraph<
706     ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
707     : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
708 template <>
709 struct DenseMapInfo<typename CallsiteContextGraph<
710     IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
711     : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
712 template <>
713 struct DenseMapInfo<IndexCall>
714     : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
715 } // end namespace llvm
716 
717 namespace {
718 
719 struct FieldSeparator {
720   bool Skip = true;
721   const char *Sep;
722 
723   FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
724 };
725 
726 raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
727   if (FS.Skip) {
728     FS.Skip = false;
729     return OS;
730   }
731   return OS << FS.Sep;
732 }
733 
734 // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
735 // type we should actually use on the corresponding allocation.
736 // If we can't clone a node that has NotCold+Cold alloc type, we will fall
737 // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
738 // from NotCold.
739 AllocationType allocTypeToUse(uint8_t AllocTypes) {
740   assert(AllocTypes != (uint8_t)AllocationType::None);
741   if (AllocTypes ==
742       ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
743     return AllocationType::NotCold;
744   else
745     return (AllocationType)AllocTypes;
746 }
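
// For example, a node whose contexts are all Cold keeps AllocationType::Cold,
// while a node carrying both NotCold and Cold contexts falls back to NotCold
// here, since an unsplit allocation cannot safely be treated as cold.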
747 
748 // Helper to check if the alloc types for all edges recorded in the
749 // InAllocTypes vector match the alloc types for all edges in the Edges
750 // vector.
751 template <typename DerivedCCG, typename FuncTy, typename CallTy>
752 bool allocTypesMatch(
753     const std::vector<uint8_t> &InAllocTypes,
754     const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
755         &Edges) {
756   return std::equal(
757       InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
758       [](const uint8_t &l,
759          const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
760         // Can share if one of the edges is None type - don't
761         // care about the type along that edge as it doesn't
762         // exist for those context ids.
763         if (l == (uint8_t)AllocationType::None ||
764             r->AllocTypes == (uint8_t)AllocationType::None)
765           return true;
766         return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
767       });
768 }
769 
770 } // end anonymous namespace
771 
772 template <typename DerivedCCG, typename FuncTy, typename CallTy>
773 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
774 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
775     const CallInfo &C) {
776   ContextNode *Node = getNodeForAlloc(C);
777   if (Node)
778     return Node;
779 
780   return NonAllocationCallToContextNodeMap.lookup(C);
781 }
782 
783 template <typename DerivedCCG, typename FuncTy, typename CallTy>
784 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
785 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
786     const CallInfo &C) {
787   return AllocationCallToContextNodeMap.lookup(C);
788 }
789 
790 template <typename DerivedCCG, typename FuncTy, typename CallTy>
791 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
792 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
793     uint64_t StackId) {
794   auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
795   if (StackEntryNode != StackEntryIdToContextNodeMap.end())
796     return StackEntryNode->second;
797   return nullptr;
798 }
799 
800 template <typename DerivedCCG, typename FuncTy, typename CallTy>
801 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
802     const CallInfo &C) {
803   AllocationCallToContextNodeMap.erase(C) ||
804       NonAllocationCallToContextNodeMap.erase(C);
805   assert(!AllocationCallToContextNodeMap.count(C) &&
806          !NonAllocationCallToContextNodeMap.count(C));
807 }
808 
809 template <typename DerivedCCG, typename FuncTy, typename CallTy>
810 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
811     addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
812                           unsigned int ContextId) {
813   for (auto &Edge : CallerEdges) {
814     if (Edge->Caller == Caller) {
815       Edge->AllocTypes |= (uint8_t)AllocType;
816       Edge->getContextIds().insert(ContextId);
817       return;
818     }
819   }
820   std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
821       this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
822   CallerEdges.push_back(Edge);
823   Caller->CalleeEdges.push_back(Edge);
824 }
825 
826 template <typename DerivedCCG, typename FuncTy, typename CallTy>
827 void CallsiteContextGraph<
828     DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
829   for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
830     auto Edge = *EI;
831     if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
832       assert(Edge->ContextIds.empty());
833       Edge->Callee->eraseCallerEdge(Edge.get());
834       EI = Node->CalleeEdges.erase(EI);
835     } else
836       ++EI;
837   }
838 }
839 
840 template <typename DerivedCCG, typename FuncTy, typename CallTy>
841 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
842 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
843     findEdgeFromCallee(const ContextNode *Callee) {
844   for (const auto &Edge : CalleeEdges)
845     if (Edge->Callee == Callee)
846       return Edge.get();
847   return nullptr;
848 }
849 
850 template <typename DerivedCCG, typename FuncTy, typename CallTy>
851 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
852 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
853     findEdgeFromCaller(const ContextNode *Caller) {
854   for (const auto &Edge : CallerEdges)
855     if (Edge->Caller == Caller)
856       return Edge.get();
857   return nullptr;
858 }
859 
860 template <typename DerivedCCG, typename FuncTy, typename CallTy>
861 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
862     eraseCalleeEdge(const ContextEdge *Edge) {
863   auto EI = llvm::find_if(
864       CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
865         return CalleeEdge.get() == Edge;
866       });
867   assert(EI != CalleeEdges.end());
868   CalleeEdges.erase(EI);
869 }
870 
871 template <typename DerivedCCG, typename FuncTy, typename CallTy>
872 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
873     eraseCallerEdge(const ContextEdge *Edge) {
874   auto EI = llvm::find_if(
875       CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
876         return CallerEdge.get() == Edge;
877       });
878   assert(EI != CallerEdges.end());
879   CallerEdges.erase(EI);
880 }
881 
882 template <typename DerivedCCG, typename FuncTy, typename CallTy>
883 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
884     DenseSet<uint32_t> &ContextIds) {
885   uint8_t BothTypes =
886       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
887   uint8_t AllocType = (uint8_t)AllocationType::None;
888   for (auto Id : ContextIds) {
889     AllocType |= (uint8_t)ContextIdToAllocationType[Id];
890     // Bail early if the alloc type has reached both; no further refinement
891     // is possible.
891     if (AllocType == BothTypes)
892       return AllocType;
893   }
894   return AllocType;
895 }
896 
897 template <typename DerivedCCG, typename FuncTy, typename CallTy>
898 uint8_t
899 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
900     const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
901   uint8_t BothTypes =
902       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
903   uint8_t AllocType = (uint8_t)AllocationType::None;
904   for (auto Id : Node1Ids) {
905     if (!Node2Ids.count(Id))
906       continue;
907     AllocType |= (uint8_t)ContextIdToAllocationType[Id];
908     // Bail early if the alloc type has reached both; no further refinement
909     // is possible.
909     if (AllocType == BothTypes)
910       return AllocType;
911   }
912   return AllocType;
913 }
914 
915 template <typename DerivedCCG, typename FuncTy, typename CallTy>
916 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
917     const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
918   if (Node1Ids.size() < Node2Ids.size())
919     return intersectAllocTypesImpl(Node1Ids, Node2Ids);
920   else
921     return intersectAllocTypesImpl(Node2Ids, Node1Ids);
922 }
923 
924 template <typename DerivedCCG, typename FuncTy, typename CallTy>
925 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
926 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
927     CallInfo Call, const FuncTy *F) {
928   assert(!getNodeForAlloc(Call));
929   NodeOwner.push_back(
930       std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
931   ContextNode *AllocNode = NodeOwner.back().get();
932   AllocationCallToContextNodeMap[Call] = AllocNode;
933   NodeToCallingFunc[AllocNode] = F;
934   // Use LastContextId as a unique id for MIB allocation nodes.
935   AllocNode->OrigStackOrAllocId = LastContextId;
936   // Alloc type should be updated as we add in the MIBs. We should assert
937   // afterwards that it is not still None.
938   AllocNode->AllocTypes = (uint8_t)AllocationType::None;
939 
940   return AllocNode;
941 }
942 
943 template <typename DerivedCCG, typename FuncTy, typename CallTy>
944 template <class NodeT, class IteratorT>
945 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
946     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
947     CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
948   // Treat the hot alloc type as NotCold until the disambiguation for "hot"
949   // is implemented.
950   if (AllocType == AllocationType::Hot)
951     AllocType = AllocationType::NotCold;
952 
953   ContextIdToAllocationType[++LastContextId] = AllocType;
954 
955   // Update alloc type and context ids for this MIB.
956   AllocNode->AllocTypes |= (uint8_t)AllocType;
957   AllocNode->ContextIds.insert(LastContextId);
958 
959   // Now add or update nodes for each stack id in alloc's context.
960   // Later when processing the stack ids on non-alloc callsites we will adjust
961   // for any inlining in the context.
962   ContextNode *PrevNode = AllocNode;
963   // Look for recursion (direct recursion should have been collapsed by
964   // module summary analysis, here we should just be detecting mutual
965   // recursion). Mark these nodes so we don't try to clone.
966   SmallSet<uint64_t, 8> StackIdSet;
967   // Skip any on the allocation call (inlining).
968   for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
969        ContextIter != StackContext.end(); ++ContextIter) {
970     auto StackId = getStackId(*ContextIter);
971     ContextNode *StackNode = getNodeForStackId(StackId);
972     if (!StackNode) {
973       NodeOwner.push_back(
974           std::make_unique<ContextNode>(/*IsAllocation=*/false));
975       StackNode = NodeOwner.back().get();
976       StackEntryIdToContextNodeMap[StackId] = StackNode;
977       StackNode->OrigStackOrAllocId = StackId;
978     }
979     auto Ins = StackIdSet.insert(StackId);
980     if (!Ins.second)
981       StackNode->Recursive = true;
982     StackNode->ContextIds.insert(LastContextId);
983     StackNode->AllocTypes |= (uint8_t)AllocType;
984     PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
985     PrevNode = StackNode;
986   }
987 }
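
// To illustrate with a hypothetical MIB whose stack ids (closest to the
// allocation first) are [S1, S2, S3] and whose type is Cold: a fresh context
// id N is assigned, and the chain built above is Alloc <- S1 <- S2 <- S3,
// where each "<-" is a caller edge carrying {N} with type Cold; each of the
// four nodes also gets N added to its ContextIds and Cold OR'ed into its
// AllocTypes.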
988 
989 template <typename DerivedCCG, typename FuncTy, typename CallTy>
990 DenseSet<uint32_t>
991 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
992     const DenseSet<uint32_t> &StackSequenceContextIds,
993     DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
994   DenseSet<uint32_t> NewContextIds;
995   for (auto OldId : StackSequenceContextIds) {
996     NewContextIds.insert(++LastContextId);
997     OldToNewContextIds[OldId].insert(LastContextId);
998     assert(ContextIdToAllocationType.count(OldId));
999     // The new context has the same allocation type as original.
1000     ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1001   }
1002   return NewContextIds;
1003 }
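
// For instance (hypothetical ids), duplicating {1, 2} when LastContextId is
// 10 returns {11, 12} and records a mapping from each old id to its new copy
// in OldToNewContextIds (e.g. 1 -> {11}, 2 -> {12}, subject to set iteration
// order), with each new id inheriting the allocation type of its original.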
1004 
1005 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1006 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1007     propagateDuplicateContextIds(
1008         const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1009   // Build a set of duplicated context ids corresponding to the input id set.
1010   auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1011     DenseSet<uint32_t> NewIds;
1012     for (auto Id : ContextIds)
1013       if (auto NewId = OldToNewContextIds.find(Id);
1014           NewId != OldToNewContextIds.end())
1015         NewIds.insert(NewId->second.begin(), NewId->second.end());
1016     return NewIds;
1017   };
1018 
1019   // Recursively update context ids sets along caller edges.
1020   auto UpdateCallers = [&](ContextNode *Node,
1021                            DenseSet<const ContextEdge *> &Visited,
1022                            auto &&UpdateCallers) -> void {
1023     for (const auto &Edge : Node->CallerEdges) {
1024       auto Inserted = Visited.insert(Edge.get());
1025       if (!Inserted.second)
1026         continue;
1027       ContextNode *NextNode = Edge->Caller;
1028       DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1029       // Only need to recursively iterate to NextNode via this caller edge if
1030       // it resulted in any added ids to NextNode.
1031       if (!NewIdsToAdd.empty()) {
1032         Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1033         NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1034         UpdateCallers(NextNode, Visited, UpdateCallers);
1035       }
1036     }
1037   };
1038 
1039   DenseSet<const ContextEdge *> Visited;
1040   for (auto &Entry : AllocationCallToContextNodeMap) {
1041     auto *Node = Entry.second;
1042     // Update ids on the allocation nodes before calling the recursive
1043     // update along caller edges, since this simplifies the logic during
1044     // that traversal.
1045     DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
1046     Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1047     UpdateCallers(Node, Visited, UpdateCallers);
1048   }
1049 }
1050 
1051 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1052 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1053     ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
1054   // Make a copy of the context ids, since this will be adjusted below as they
1055   // are moved.
1056   DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
1057   auto &OrigEdges =
1058       TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1059   // Increment iterator in loop so that we can remove edges as needed.
1060   for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1061     auto Edge = *EI;
1062     // Remove any matching context ids from Edge and capture the set that was
1063     // found and removed; these become the new edge's context ids. Also update
1064     // the remaining (not found) ids.
1065     DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1066     set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1067                  NotFoundContextIds);
1068     RemainingContextIds.swap(NotFoundContextIds);
1069     // If no matching context ids for this edge, skip it.
1070     if (NewEdgeContextIds.empty()) {
1071       ++EI;
1072       continue;
1073     }
1074     if (TowardsCallee) {
1075       auto NewEdge = std::make_shared<ContextEdge>(
1076           Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
1077           NewEdgeContextIds);
1078       NewNode->CalleeEdges.push_back(NewEdge);
1079       NewEdge->Callee->CallerEdges.push_back(NewEdge);
1080     } else {
1081       auto NewEdge = std::make_shared<ContextEdge>(
1082           NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
1083           NewEdgeContextIds);
1084       NewNode->CallerEdges.push_back(NewEdge);
1085       NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1086     }
1087     // Remove old edge if context ids empty.
1088     if (Edge->getContextIds().empty()) {
1089       if (TowardsCallee) {
1090         Edge->Callee->eraseCallerEdge(Edge.get());
1091         EI = OrigNode->CalleeEdges.erase(EI);
1092       } else {
1093         Edge->Caller->eraseCalleeEdge(Edge.get());
1094         EI = OrigNode->CallerEdges.erase(EI);
1095       }
1096       continue;
1097     }
1098     ++EI;
1099   }
1100 }
1101 
1102 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1103 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1104     assignStackNodesPostOrder(ContextNode *Node,
1105                               DenseSet<const ContextNode *> &Visited,
1106                               DenseMap<uint64_t, std::vector<CallContextInfo>>
1107                                   &StackIdToMatchingCalls) {
1108   auto Inserted = Visited.insert(Node);
1109   if (!Inserted.second)
1110     return;
1111   // Post order traversal. Iterate over a copy since we may add nodes and
1112   // therefore new callers during the recursive call, invalidating any
1113   // iterator over the original edge vector. We don't need to process these
1114   // new nodes as they were already processed on creation.
1115   auto CallerEdges = Node->CallerEdges;
1116   for (auto &Edge : CallerEdges) {
1117     // Skip any that have been removed during the recursion.
1118     if (!Edge)
1119       continue;
1120     assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1121   }
1122 
1123   // If this node's stack id is in the map, update the graph to contain new
1124   // nodes representing any inlining at interior callsites. Note we move the
1125   // associated context ids over to the new nodes.
1126 
1127   // Ignore this node if it is for an allocation or we didn't record any
1128   // stack id lists ending at it.
1129   if (Node->IsAllocation ||
1130       !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1131     return;
1132 
1133   auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1134   // Handle the simple case first. A single call with a single stack id.
1135   // In this case there is no need to create any new context nodes, simply
1136   // assign the context node for stack id to this Call.
1137   if (Calls.size() == 1) {
1138     auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1139     if (Ids.size() == 1) {
1140       assert(SavedContextIds.empty());
1141       // It should be this Node
1142       assert(Node == getNodeForStackId(Ids[0]));
1143       if (Node->Recursive)
1144         return;
1145       Node->setCall(Call);
1146       NonAllocationCallToContextNodeMap[Call] = Node;
1147       NodeToCallingFunc[Node] = Func;
1148       return;
1149     }
1150   }
1151 
1152   // Find the node for the last stack id, which should be the same
1153   // across all calls recorded for this id, and is this node's id.
1154   uint64_t LastId = Node->OrigStackOrAllocId;
1155   ContextNode *LastNode = getNodeForStackId(LastId);
1156   // We should only have kept stack ids that had nodes.
1157   assert(LastNode);
1158 
1159   for (unsigned I = 0; I < Calls.size(); I++) {
1160     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1161     // Skip any for which we didn't assign any ids; these don't get a node in
1162     // the graph.
1163     if (SavedContextIds.empty())
1164       continue;
1165 
1166     assert(LastId == Ids.back());
1167 
1168     ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1169     assert(FirstNode);
1170 
1171     // Recompute the context ids for this stack id sequence (the
1172     // intersection of the context ids of the corresponding nodes).
1173     // Start with the ids we saved in the map for this call, which could be
1174     // duplicated context ids. We have to recompute as we might have had
1175     // overlap between the saved context ids for different last nodes, and
1176     // already removed some of them during the post order traversal.
1177     set_intersect(SavedContextIds, FirstNode->ContextIds);
1178     ContextNode *PrevNode = nullptr;
1179     for (auto Id : Ids) {
1180       ContextNode *CurNode = getNodeForStackId(Id);
1181       // We should only have kept stack ids that had nodes and weren't
1182       // recursive.
1183       assert(CurNode);
1184       assert(!CurNode->Recursive);
1185       if (!PrevNode) {
1186         PrevNode = CurNode;
1187         continue;
1188       }
1189       auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1190       if (!Edge) {
1191         SavedContextIds.clear();
1192         break;
1193       }
1194       PrevNode = CurNode;
1195       set_intersect(SavedContextIds, Edge->getContextIds());
1196 
1197       // If we now have no context ids for the clone, skip this call.
1198       if (SavedContextIds.empty())
1199         break;
1200     }
1201     if (SavedContextIds.empty())
1202       continue;
1203 
1204     // Create new context node.
1205     NodeOwner.push_back(
1206         std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1207     ContextNode *NewNode = NodeOwner.back().get();
1208     NodeToCallingFunc[NewNode] = Func;
1209     NonAllocationCallToContextNodeMap[Call] = NewNode;
1210     NewNode->ContextIds = SavedContextIds;
1211     NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
1212 
1213     // Connect to callees of innermost stack frame in inlined call chain.
1214     // This updates context ids for FirstNode's callees to reflect those
1215     // moved to NewNode.
1216     connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
1217 
1218     // Connect to callers of outermost stack frame in inlined call chain.
1219     // This updates context ids for LastNode's callers to reflect those
1220     // moved to NewNode.
1221     connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1222 
1223     // Now we need to remove context ids from edges/nodes between First and
1224     // Last Node.
1225     PrevNode = nullptr;
1226     for (auto Id : Ids) {
1227       ContextNode *CurNode = getNodeForStackId(Id);
1228       // We should only have kept stack ids that had nodes.
1229       assert(CurNode);
1230 
1231       // Remove the context ids moved to NewNode from CurNode, and the
1232       // edge from the prior node.
1233       set_subtract(CurNode->ContextIds, NewNode->ContextIds);
1234       if (PrevNode) {
1235         auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1236         assert(PrevEdge);
1237         set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
1238         if (PrevEdge->getContextIds().empty()) {
1239           PrevNode->eraseCallerEdge(PrevEdge);
1240           CurNode->eraseCalleeEdge(PrevEdge);
1241         }
1242       }
1243       PrevNode = CurNode;
1244     }
1245   }
1246 }
1247 
1248 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1249 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1250   // Map of stack id to all calls with that as the last (outermost caller)
1251   // callsite id that has a context node (some might not due to pruning
1252   // performed during matching of the allocation profile contexts).
1253   // The CallContextInfo contains the Call and a list of its stack ids with
1254   // ContextNodes, the function containing Call, and the set of context ids
1255   // the analysis will eventually identify for use in any new node created
1256   // for that callsite.
1257   DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1258   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1259     for (auto &Call : CallsWithMetadata) {
1260       // Ignore allocations, already handled.
1261       if (AllocationCallToContextNodeMap.count(Call))
1262         continue;
1263       auto StackIdsWithContextNodes =
1264           getStackIdsWithContextNodesForCall(Call.call());
1265       // If there were no nodes created for MIBs on allocs (maybe this was in
1266       // the unambiguous part of the MIB stack that was pruned), ignore.
1267       if (StackIdsWithContextNodes.empty())
1268         continue;
1269       // Otherwise, record this Call along with the list of ids for the last
1270       // (outermost caller) stack id with a node.
1271       StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1272           {Call.call(), StackIdsWithContextNodes, Func, {}});
1273     }
1274   }
1275 
1276   // First make a pass through all stack ids that correspond to a call,
1277   // as identified in the above loop. Compute the context ids corresponding to
1278   // each of these calls when they correspond to multiple stack ids due to
1279   // inlining. Perform any duplication of context ids required when
1280   // there is more than one call with the same stack ids. Their (possibly newly
1281   // duplicated) context ids are saved in the StackIdToMatchingCalls map.
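  // OldToNewContextIds records, for each original context id, the set of
  // duplicate ids synthesized below. When multiple calls share an identical
  // stack id sequence, all but one are given freshly duplicated context ids so
  // that each call's new node gets its own distinct set; these duplications
  // are then applied to the rest of the graph by propagateDuplicateContextIds.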
1282   DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1283   for (auto &It : StackIdToMatchingCalls) {
1284     auto &Calls = It.getSecond();
1285     // Skip single calls with a single stack id. These don't need a new node.
1286     if (Calls.size() == 1) {
1287       auto &Ids = std::get<1>(Calls[0]);
1288       if (Ids.size() == 1)
1289         continue;
1290     }
1291     // In order to do the best and maximal matching of inlined calls to context
1292     // node sequences we will sort the vectors of stack ids in descending order
1293     // of length, and within each length, lexicographically by stack id. The
1294     // latter is so that we can specially handle calls that have identical stack
1295     // id sequences (either due to cloning or artificially because of the MIB
1296     // context pruning).
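    // For example, given the sequences {1, 2, 3}, {1, 2} and {1, 3}, the
    // resulting order is {1, 2, 3} (longest first), then {1, 2}, then {1, 3}
    // (lexicographic within the same length), so identical sequences end up
    // adjacent in the vector.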
1297     std::stable_sort(Calls.begin(), Calls.end(),
1298                      [](const CallContextInfo &A, const CallContextInfo &B) {
1299                        auto &IdsA = std::get<1>(A);
1300                        auto &IdsB = std::get<1>(B);
1301                        return IdsA.size() > IdsB.size() ||
1302                               (IdsA.size() == IdsB.size() && IdsA < IdsB);
1303                      });
1304 
1305     // Find the node for the last stack id, which should be the same
1306     // across all calls recorded for this id, and is the id for this
1307     // entry in the StackIdToMatchingCalls map.
1308     uint64_t LastId = It.getFirst();
1309     ContextNode *LastNode = getNodeForStackId(LastId);
1310     // We should only have kept stack ids that had nodes.
1311     assert(LastNode);
1312 
1313     if (LastNode->Recursive)
1314       continue;
1315 
1316     // Initialize the context ids with the last node's. We will subsequently
1317     // refine the context ids by computing the intersection along all edges.
1318     DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
1319     assert(!LastNodeContextIds.empty());
1320 
1321     for (unsigned I = 0; I < Calls.size(); I++) {
1322       auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1323       assert(SavedContextIds.empty());
1324       assert(LastId == Ids.back());
1325 
1326       // First compute the context ids for this stack id sequence (the
1327       // intersection of the context ids of the corresponding nodes).
1328       // Start with the remaining saved ids for the last node.
1329       assert(!LastNodeContextIds.empty());
1330       DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1331 
1332       ContextNode *PrevNode = LastNode;
1333       ContextNode *CurNode = LastNode;
1334       bool Skip = false;
1335 
1336       // Iterate backwards through the stack Ids, starting after the last Id
1337       // in the list, which was already handled above once for all Calls.
1338       for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1339         auto Id = *IdIter;
1340         CurNode = getNodeForStackId(Id);
1341         // We should only have kept stack ids that had nodes.
1342         assert(CurNode);
1343 
1344         if (CurNode->Recursive) {
1345           Skip = true;
1346           break;
1347         }
1348 
1349         auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1350         // If there is no edge then the nodes belong to different MIB contexts,
1351         // and we should skip this inlined context sequence. For example, this
1352         // particular inlined context may include stack ids A->B, and we may
1353         // indeed have nodes for both A and B, but it is possible that they were
1354         // never profiled in sequence in a single MIB for any allocation (i.e.
1355         // we might have profiled an allocation that involves the callsite A,
1356         // but through a different one of its callee callsites, and we might
1357         // have profiled an allocation that involves callsite B, but reached
1358         // from a different caller callsite).
1359         if (!Edge) {
1360           Skip = true;
1361           break;
1362         }
1363         PrevNode = CurNode;
1364 
1365         // Update the context ids, which is the intersection of the ids along
1366         // all edges in the sequence.
1367         set_intersect(StackSequenceContextIds, Edge->getContextIds());
1368 
1369         // If we now have no context ids for the clone, skip this call.
1370         if (StackSequenceContextIds.empty()) {
1371           Skip = true;
1372           break;
1373         }
1374       }
1375       if (Skip)
1376         continue;
1377 
1378       // If some of this call's stack ids did not have corresponding nodes (due
1379       // to pruning), don't include any context ids for contexts that extend
1380       // beyond these nodes. Otherwise we would match parts of unrelated or only
1381       // partially matching stack contexts. To do this, subtract any context ids
1382       // found in caller nodes of the last node found above.
1383       if (Ids.back() != getLastStackId(Call)) {
1384         for (const auto &PE : CurNode->CallerEdges) {
1385           set_subtract(StackSequenceContextIds, PE->getContextIds());
1386           if (StackSequenceContextIds.empty())
1387             break;
1388         }
1389         // If we now have no context ids for the clone, skip this call.
1390         if (StackSequenceContextIds.empty())
1391           continue;
1392       }
1393 
1394       // Check if the next set of stack ids is the same (since the Calls vector
1395       // of tuples is sorted by the stack ids we can just look at the next one).
1396       bool DuplicateContextIds = false;
1397       if (I + 1 < Calls.size()) {
1398         auto NextIds = std::get<1>(Calls[I + 1]);
1399         DuplicateContextIds = Ids == NextIds;
1400       }
1401 
1402       // If we don't have duplicate context ids, then we can assign all the
1403       // context ids computed for the original node sequence to this call.
1404       // If there are duplicate calls with the same stack ids then we synthesize
1405       // new context ids that are duplicates of the originals. These are
1406       // assigned to SavedContextIds, which is a reference into the map entry
1407       // for this call, allowing us to access these ids later on.
1408       OldToNewContextIds.reserve(OldToNewContextIds.size() +
1409                                  StackSequenceContextIds.size());
1410       SavedContextIds =
1411           DuplicateContextIds
1412               ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1413               : StackSequenceContextIds;
1414       assert(!SavedContextIds.empty());
1415 
1416       if (!DuplicateContextIds) {
1417         // Update the saved last node's context ids to remove those just
1418         // assigned to this call, so that the remainder is ready for the next
1419         // call at this stack id.
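        // Once nothing remains in LastNodeContextIds, no later call with this
        // last stack id can match any contexts, so we can stop early.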
1420         set_subtract(LastNodeContextIds, StackSequenceContextIds);
1421         if (LastNodeContextIds.empty())
1422           break;
1423       }
1424     }
1425   }
1426 
1427   // Propagate the duplicate context ids over the graph.
1428   propagateDuplicateContextIds(OldToNewContextIds);
1429 
1430   if (VerifyCCG)
1431     check();
1432 
1433   // Now perform a post-order traversal over the graph, starting with the
1434   // allocation nodes, essentially processing nodes from callers to callees.
1435   // For any that contains an id in the map, update the graph to contain new
1436   // nodes representing any inlining at interior callsites. Note we move the
1437   // associated context ids over to the new nodes.
1438   DenseSet<const ContextNode *> Visited;
1439   for (auto &Entry : AllocationCallToContextNodeMap)
1440     assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1441 }
1442 
1443 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1444   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1445       Call->getMetadata(LLVMContext::MD_callsite));
1446   return CallsiteContext.back();
1447 }
1448 
1449 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1450   assert(isa<CallsiteInfo *>(Call.getBase()));
1451   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1452       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1453   // Need to convert index into stack id.
1454   return Index.getStackIdAtIndex(CallsiteContext.back());
1455 }
1456 
1457 static const std::string MemProfCloneSuffix = ".memprof.";
1458 
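// Returns the name to use for clone number CloneNo of Base. Clone 0 is the
// original function and keeps its name; e.g. clone 2 of "foo" is named
// "foo.memprof.2".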
1459 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1460   // We use CloneNo == 0 to refer to the original version, which doesn't get
1461   // renamed with a suffix.
1462   if (!CloneNo)
1463     return Base.str();
1464   return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1465 }
1466 
1467 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1468                                                  const Instruction *Call,
1469                                                  unsigned CloneNo) const {
1470   return (Twine(Call->getFunction()->getName()) + " -> " +
1471           cast<CallBase>(Call)->getCalledFunction()->getName())
1472       .str();
1473 }
1474 
1475 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1476                                                 const IndexCall &Call,
1477                                                 unsigned CloneNo) const {
1478   auto VI = FSToVIMap.find(Func);
1479   assert(VI != FSToVIMap.end());
1480   if (isa<AllocInfo *>(Call.getBase()))
1481     return (VI->second.name() + " -> alloc").str();
1482   else {
1483     auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1484     return (VI->second.name() + " -> " +
1485             getMemProfFuncName(Callsite->Callee.name(),
1486                                Callsite->Clones[CloneNo]))
1487         .str();
1488   }
1489 }
1490 
1491 std::vector<uint64_t>
1492 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1493     Instruction *Call) {
1494   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1495       Call->getMetadata(LLVMContext::MD_callsite));
1496   return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1497       CallsiteContext);
1498 }
1499 
1500 std::vector<uint64_t>
1501 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1502   assert(isa<CallsiteInfo *>(Call.getBase()));
1503   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1504       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1505   return getStackIdsWithContextNodes<CallsiteInfo,
1506                                      SmallVector<unsigned>::const_iterator>(
1507       CallsiteContext);
1508 }
1509 
1510 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1511 template <class NodeT, class IteratorT>
1512 std::vector<uint64_t>
1513 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1514     CallStack<NodeT, IteratorT> &CallsiteContext) {
1515   std::vector<uint64_t> StackIds;
1516   for (auto IdOrIndex : CallsiteContext) {
1517     auto StackId = getStackId(IdOrIndex);
1518     ContextNode *Node = getNodeForStackId(StackId);
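    // Stop at the first stack id that has no context node; the rest of this
    // context was pruned when matching the allocation MIB contexts, so it
    // cannot contribute nodes for this call.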
1519     if (!Node)
1520       break;
1521     StackIds.push_back(StackId);
1522   }
1523   return StackIds;
1524 }
1525 
1526 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1527     Module &M,
1528     llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1529     : Mod(M), OREGetter(OREGetter) {
1530   for (auto &F : M) {
1531     std::vector<CallInfo> CallsWithMetadata;
1532     for (auto &BB : F) {
1533       for (auto &I : BB) {
1534         if (!isa<CallBase>(I))
1535           continue;
1536         if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1537           CallsWithMetadata.push_back(&I);
1538           auto *AllocNode = addAllocNode(&I, &F);
1539           auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1540           assert(CallsiteMD);
1541           CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1542           // Add all of the MIBs and their stack nodes.
1543           for (auto &MDOp : MemProfMD->operands()) {
1544             auto *MIBMD = cast<const MDNode>(MDOp);
1545             MDNode *StackNode = getMIBStackNode(MIBMD);
1546             assert(StackNode);
1547             CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1548             addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1549                 AllocNode, StackContext, CallsiteContext,
1550                 getMIBAllocType(MIBMD));
1551           }
1552           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1553           // Memprof and callsite metadata on memory allocations are no longer
1554           // needed.
1555           I.setMetadata(LLVMContext::MD_memprof, nullptr);
1556           I.setMetadata(LLVMContext::MD_callsite, nullptr);
1557         }
1558         // For callsite metadata, add to list for this function for later use.
1559         else if (I.getMetadata(LLVMContext::MD_callsite))
1560           CallsWithMetadata.push_back(&I);
1561       }
1562     }
1563     if (!CallsWithMetadata.empty())
1564       FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1565   }
1566 
1567   if (DumpCCG) {
1568     dbgs() << "CCG before updating call stack chains:\n";
1569     dbgs() << *this;
1570   }
1571 
1572   if (ExportToDot)
1573     exportToDot("prestackupdate");
1574 
1575   updateStackNodes();
1576 
1577   handleCallsitesWithMultipleTargets();
1578 
1579   // Strip off remaining callsite metadata, no longer needed.
1580   for (auto &FuncEntry : FuncToCallsWithMetadata)
1581     for (auto &Call : FuncEntry.second)
1582       Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1583 }
1584 
1585 IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1586     ModuleSummaryIndex &Index,
1587     llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1588         isPrevailing)
1589     : Index(Index), isPrevailing(isPrevailing) {
1590   for (auto &I : Index) {
1591     auto VI = Index.getValueInfo(I);
1592     for (auto &S : VI.getSummaryList()) {
1593       // We should only add the prevailing nodes. Otherwise we may try to clone
1594       // in a weak copy that won't be linked (and may be different from the
1595       // prevailing version).
1596       // We only keep the memprof summary on the prevailing copy now when
1597       // building the combined index, as a space optimization; however, we don't
1598       // rely on that optimization here. The linker doesn't resolve local linkage
1599       // values, so don't check whether those are prevailing.
1600       if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1601           !isPrevailing(VI.getGUID(), S.get()))
1602         continue;
1603       auto *FS = dyn_cast<FunctionSummary>(S.get());
1604       if (!FS)
1605         continue;
1606       std::vector<CallInfo> CallsWithMetadata;
1607       if (!FS->allocs().empty()) {
1608         for (auto &AN : FS->mutableAllocs()) {
1609           // This can happen because of recursion elimination handling that
1610           // currently exists in ModuleSummaryAnalysis. Skip these for now.
1611           // We still added them to the summary because we need to be able to
1612           // correlate properly in applyImport in the backends.
1613           if (AN.MIBs.empty())
1614             continue;
1615           CallsWithMetadata.push_back({&AN});
1616           auto *AllocNode = addAllocNode({&AN}, FS);
1617           // Pass an empty CallStack to the CallsiteContext (second)
1618           // parameter, since for ThinLTO we already collapsed out the inlined
1619           // stack ids on the allocation call during ModuleSummaryAnalysis.
1620           CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1621               EmptyContext;
1622           // Now add all of the MIBs and their stack nodes.
1623           for (auto &MIB : AN.MIBs) {
1624             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1625                 StackContext(&MIB);
1626             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1627                 AllocNode, StackContext, EmptyContext, MIB.AllocType);
1628           }
1629           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1630           // Initialize version 0 on the summary alloc node to the current alloc
1631           // type, unless it has both types, in which case make it the default,
1632           // so that if we aren't able to clone, the original version always
1633           // ends up with the default allocation behavior.
1634           AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1635         }
1636       }
1637       // For callsite metadata, add to list for this function for later use.
1638       if (!FS->callsites().empty())
1639         for (auto &SN : FS->mutableCallsites())
1640           CallsWithMetadata.push_back({&SN});
1641 
1642       if (!CallsWithMetadata.empty())
1643         FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1644 
1645       if (!FS->allocs().empty() || !FS->callsites().empty())
1646         FSToVIMap[FS] = VI;
1647     }
1648   }
1649 
1650   if (DumpCCG) {
1651     dbgs() << "CCG before updating call stack chains:\n";
1652     dbgs() << *this;
1653   }
1654 
1655   if (ExportToDot)
1656     exportToDot("prestackupdate");
1657 
1658   updateStackNodes();
1659 
1660   handleCallsitesWithMultipleTargets();
1661 }
1662 
1663 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1664 void CallsiteContextGraph<DerivedCCG, FuncTy,
1665                           CallTy>::handleCallsitesWithMultipleTargets() {
1666   // Look for and work around callsites that call multiple functions.
1667   // This can happen for indirect calls, which need better handling, and in
1668   // rarer cases (e.g. macro expansion).
1669   // TODO: To fix this for indirect calls we will want to perform speculative
1670   // devirtualization using either the normal PGO info with ICP, or using the
1671   // information in the profiled MemProf contexts. We can do this prior to
1672   // this transformation for regular LTO, and for ThinLTO we can simulate that
1673   // effect in the summary and perform the actual speculative devirtualization
1674   // while cloning in the ThinLTO backend.
1675 
1676   // Keep track of the new nodes synthesized for discovered tail calls missing
1677   // from the profiled contexts.
1678   MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1679 
1680   for (auto Entry = NonAllocationCallToContextNodeMap.begin();
1681        Entry != NonAllocationCallToContextNodeMap.end();) {
1682     auto *Node = Entry->second;
1683     assert(Node->Clones.empty());
1684     // Check all node callees and see if in the same function.
1685     bool Removed = false;
1686     auto Call = Node->Call.call();
1687     for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1688       auto Edge = *EI;
1689       if (!Edge->Callee->hasCall()) {
1690         ++EI;
1691         continue;
1692       }
1693       assert(NodeToCallingFunc.count(Edge->Callee));
1694       // Check if the called function matches that of the callee node.
1695       if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1696         continue;
1697       RemovedEdgesWithMismatchedCallees++;
1698       // Work around by setting Node to have a null call, so it gets
1699       // skipped during cloning. Otherwise assignFunctions will assert
1700       // because its data structures are not designed to handle this case.
1701       Entry = NonAllocationCallToContextNodeMap.erase(Entry);
1702       Node->setCall(CallInfo());
1703       Removed = true;
1704       break;
1705     }
1706     if (!Removed)
1707       Entry++;
1708   }
1709 
1710   // Add the new nodes after the above loop so that the iteration is not
1711   // invalidated.
1712   for (auto &[Call, Node] : TailCallToContextNodeMap)
1713     NonAllocationCallToContextNodeMap[Call] = Node;
1714 }
1715 
1716 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1717   // In the Module (IR) case this is already the Id.
1718   return IdOrIndex;
1719 }
1720 
1721 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1722   // In the Index case this is an index into the stack id list in the summary
1723   // index, convert it to an Id.
1724   return Index.getStackIdAtIndex(IdOrIndex);
1725 }
1726 
1727 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1728 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
1729     CallTy Call, EdgeIter &EI,
1730     MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
1731   auto Edge = *EI;
1732   const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
1733   const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
1734   // Will be populated in order of callee to caller if we find a chain of tail
1735   // calls between the profiled caller and callee.
1736   std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
1737   if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1738                          FoundCalleeChain)) {
1739     ++EI;
1740     return false;
1741   }
1742 
1743   // The usual case where the profiled callee matches that of the IR/summary.
1744   if (FoundCalleeChain.empty()) {
1745     ++EI;
1746     return true;
1747   }
1748 
1749   auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
1750     auto *CurEdge = Callee->findEdgeFromCaller(Caller);
1751     // If there is already an edge between these nodes, simply update it and
1752     // return.
1753     if (CurEdge) {
1754       CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
1755                                  Edge->ContextIds.end());
1756       CurEdge->AllocTypes |= Edge->AllocTypes;
1757       return;
1758     }
1759     // Otherwise, create a new edge and insert it into the caller and callee
1760     // lists.
1761     auto NewEdge = std::make_shared<ContextEdge>(
1762         Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
1763     Callee->CallerEdges.push_back(NewEdge);
1764     if (Caller == Edge->Caller) {
1765       // If we are inserting the new edge into the current edge's caller, insert
1766       // the new edge before the current iterator position, and then increment
1767       // back to the current edge.
1768       EI = Caller->CalleeEdges.insert(EI, NewEdge);
1769       ++EI;
1770       assert(*EI == Edge &&
1771              "Iterator position not restored after insert and increment");
1772     } else
1773       Caller->CalleeEdges.push_back(NewEdge);
1774   };
1775 
1776   // Create new nodes for each found callee and connect in between the profiled
1777   // caller and callee.
1778   auto *CurCalleeNode = Edge->Callee;
1779   for (auto &[NewCall, Func] : FoundCalleeChain) {
1780     ContextNode *NewNode = nullptr;
1781     // First check if we have already synthesized a node for this tail call.
1782     if (TailCallToContextNodeMap.count(NewCall)) {
1783       NewNode = TailCallToContextNodeMap[NewCall];
1784       NewNode->ContextIds.insert(Edge->ContextIds.begin(),
1785                                  Edge->ContextIds.end());
1786       NewNode->AllocTypes |= Edge->AllocTypes;
1787     } else {
1788       FuncToCallsWithMetadata[Func].push_back({NewCall});
1789       // Create Node and record node info.
1790       NodeOwner.push_back(
1791           std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
1792       NewNode = NodeOwner.back().get();
1793       NodeToCallingFunc[NewNode] = Func;
1794       TailCallToContextNodeMap[NewCall] = NewNode;
1795       NewNode->ContextIds = Edge->ContextIds;
1796       NewNode->AllocTypes = Edge->AllocTypes;
1797     }
1798 
1799     // Hook up node to its callee node
1800     AddEdge(NewNode, CurCalleeNode);
1801 
1802     CurCalleeNode = NewNode;
1803   }
1804 
1805   // Hook up edge's original caller to new callee node.
1806   AddEdge(Edge->Caller, CurCalleeNode);
1807 
1808   // Remove old edge
1809   Edge->Callee->eraseCallerEdge(Edge.get());
1810   EI = Edge->Caller->CalleeEdges.erase(EI);
1811 
1812   return true;
1813 }
1814 
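// Looks for a chain of tail calls from CurCallee that reaches ProfiledCallee,
// recording each discovered callsite in FoundCalleeChain. Returns true only if
// exactly one such chain exists within TailCallSearchDepth; finding more than
// one sets FoundMultipleCalleeChains and fails, since cloning along an
// ambiguous chain could be incorrect.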
1815 bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1816     const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
1817     std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
1818     bool &FoundMultipleCalleeChains) {
1819   // Stop recursive search if we have already explored the maximum specified
1820   // depth.
1821   if (Depth > TailCallSearchDepth)
1822     return false;
1823 
1824   auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
1825     FoundCalleeChain.push_back({Callsite, F});
1826   };
1827 
1828   auto *CalleeFunc = dyn_cast<Function>(CurCallee);
1829   if (!CalleeFunc) {
1830     auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
1831     assert(Alias);
1832     CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1833     assert(CalleeFunc);
1834   }
1835 
1836   // Look for tail calls in this function, and check if they either call the
1837   // profiled callee directly, or indirectly (via a recursive search).
1838   // Only succeed if there is a single unique tail call chain found between the
1839   // profiled caller and callee, otherwise we could perform incorrect cloning.
1840   bool FoundSingleCalleeChain = false;
1841   for (auto &BB : *CalleeFunc) {
1842     for (auto &I : BB) {
1843       auto *CB = dyn_cast<CallBase>(&I);
1844       if (!CB || !CB->isTailCall())
1845         continue;
1846       auto *CalledValue = CB->getCalledOperand();
1847       auto *CalledFunction = CB->getCalledFunction();
1848       if (CalledValue && !CalledFunction) {
1849         CalledValue = CalledValue->stripPointerCasts();
1850         // Stripping pointer casts can reveal a called function.
1851         CalledFunction = dyn_cast<Function>(CalledValue);
1852       }
1853       // Check if this is an alias to a function. If so, get the
1854       // called aliasee for the checks below.
1855       if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
1856         assert(!CalledFunction &&
1857                "Expected null called function in callsite for alias");
1858         CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
1859       }
1860       if (!CalledFunction)
1861         continue;
1862       if (CalledFunction == ProfiledCallee) {
1863         if (FoundSingleCalleeChain) {
1864           FoundMultipleCalleeChains = true;
1865           return false;
1866         }
1867         FoundSingleCalleeChain = true;
1868         FoundProfiledCalleeCount++;
1869         FoundProfiledCalleeDepth += Depth;
1870         if (Depth > FoundProfiledCalleeMaxDepth)
1871           FoundProfiledCalleeMaxDepth = Depth;
1872         SaveCallsiteInfo(&I, CalleeFunc);
1873       } else if (findProfiledCalleeThroughTailCalls(
1874                      ProfiledCallee, CalledFunction, Depth + 1,
1875                      FoundCalleeChain, FoundMultipleCalleeChains)) {
1876         if (FoundMultipleCalleeChains)
1877           return false;
1878         if (FoundSingleCalleeChain) {
1879           FoundMultipleCalleeChains = true;
1880           return false;
1881         }
1882         FoundSingleCalleeChain = true;
1883         SaveCallsiteInfo(&I, CalleeFunc);
1884       }
1885     }
1886   }
1887 
1888   return FoundSingleCalleeChain;
1889 }
1890 
1891 bool ModuleCallsiteContextGraph::calleeMatchesFunc(
1892     Instruction *Call, const Function *Func, const Function *CallerFunc,
1893     std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
1894   auto *CB = dyn_cast<CallBase>(Call);
1895   if (!CB->getCalledOperand())
1896     return false;
1897   auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
1898   auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
1899   if (CalleeFunc == Func)
1900     return true;
1901   auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
1902   if (Alias && Alias->getAliasee() == Func)
1903     return true;
1904 
1905   // Recursively search for the profiled callee through tail calls starting with
1906   // the actual Callee. The discovered tail call chain is saved in
1907   // FoundCalleeChain, and we will fix up the graph to include these callsites
1908   // after returning.
1909   // FIXME: We will currently redo the same recursive walk if we find the same
1910   // mismatched callee from another callsite. We can improve this with more
1911   // bookkeeping of the created chain of new nodes for each mismatch.
1912   unsigned Depth = 1;
1913   bool FoundMultipleCalleeChains = false;
1914   if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
1915                                           FoundCalleeChain,
1916                                           FoundMultipleCalleeChains)) {
1917     LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
1918                       << Func->getName() << " from " << CallerFunc->getName()
1919                       << " that actually called " << CalleeVal->getName()
1920                       << (FoundMultipleCalleeChains
1921                               ? " (found multiple possible chains)"
1922                               : "")
1923                       << "\n");
1924     if (FoundMultipleCalleeChains)
1925       FoundProfiledCalleeNonUniquelyCount++;
1926     return false;
1927   }
1928 
1929   return true;
1930 }
1931 
1932 bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1933     ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1934     std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1935     bool &FoundMultipleCalleeChains) {
1936   // Stop recursive search if we have already explored the maximum specified
1937   // depth.
1938   if (Depth > TailCallSearchDepth)
1939     return false;
1940 
1941   auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
1942     // Make a CallsiteInfo for each discovered callee, if one hasn't already
1943     // been synthesized.
1944     if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
1945         !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
1946       // StackIds is empty (we don't have debug info available in the index for
1947       // these callsites)
1948       FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
1949           std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
1950     CallsiteInfo *NewCallsiteInfo =
1951         FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
1952     FoundCalleeChain.push_back({NewCallsiteInfo, FS});
1953   };
1954 
1955   // Look for tail calls in this function, and check if they either call the
1956   // profiled callee directly, or indirectly (via a recursive search).
1957   // Only succeed if there is a single unique tail call chain found between the
1958   // profiled caller and callee, otherwise we could perform incorrect cloning.
1959   bool FoundSingleCalleeChain = false;
1960   for (auto &S : CurCallee.getSummaryList()) {
1961     if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1962         !isPrevailing(CurCallee.getGUID(), S.get()))
1963       continue;
1964     auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
1965     if (!FS)
1966       continue;
1967     auto FSVI = CurCallee;
1968     auto *AS = dyn_cast<AliasSummary>(S.get());
1969     if (AS)
1970       FSVI = AS->getAliaseeVI();
1971     for (auto &CallEdge : FS->calls()) {
1972       if (!CallEdge.second.hasTailCall())
1973         continue;
1974       if (CallEdge.first == ProfiledCallee) {
1975         if (FoundSingleCalleeChain) {
1976           FoundMultipleCalleeChains = true;
1977           return false;
1978         }
1979         FoundSingleCalleeChain = true;
1980         FoundProfiledCalleeCount++;
1981         FoundProfiledCalleeDepth += Depth;
1982         if (Depth > FoundProfiledCalleeMaxDepth)
1983           FoundProfiledCalleeMaxDepth = Depth;
1984         CreateAndSaveCallsiteInfo(CallEdge.first, FS);
1985         // Add FS to FSToVIMap in case it isn't already there.
1986         assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
1987         FSToVIMap[FS] = FSVI;
1988       } else if (findProfiledCalleeThroughTailCalls(
1989                      ProfiledCallee, CallEdge.first, Depth + 1,
1990                      FoundCalleeChain, FoundMultipleCalleeChains)) {
1991         if (FoundMultipleCalleeChains)
1992           return false;
1993         if (FoundSingleCalleeChain) {
1994           FoundMultipleCalleeChains = true;
1995           return false;
1996         }
1997         FoundSingleCalleeChain = true;
1998         CreateAndSaveCallsiteInfo(CallEdge.first, FS);
1999         // Add FS to FSToVIMap in case it isn't already there.
2000         assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2001         FSToVIMap[FS] = FSVI;
2002       }
2003     }
2004   }
2005 
2006   return FoundSingleCalleeChain;
2007 }
2008 
2009 bool IndexCallsiteContextGraph::calleeMatchesFunc(
2010     IndexCall &Call, const FunctionSummary *Func,
2011     const FunctionSummary *CallerFunc,
2012     std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2013   ValueInfo Callee =
2014       dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2015   // If there is no summary list then this is a call to an externally defined
2016   // symbol.
2017   AliasSummary *Alias =
2018       Callee.getSummaryList().empty()
2019           ? nullptr
2020           : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2021   assert(FSToVIMap.count(Func));
2022   auto FuncVI = FSToVIMap[Func];
2023   if (Callee == FuncVI ||
2024       // If callee is an alias, check the aliasee, since only function
2025       // summary base objects will contain the stack node summaries and thus
2026       // get a context node.
2027       (Alias && Alias->getAliaseeVI() == FuncVI))
2028     return true;
2029 
2030   // Recursively search for the profiled callee through tail calls starting with
2031   // the actual Callee. The discovered tail call chain is saved in
2032   // FoundCalleeChain, and we will fix up the graph to include these callsites
2033   // after returning.
2034   // FIXME: We will currently redo the same recursive walk if we find the same
2035   // mismatched callee from another callsite. We can improve this with more
2036   // bookkeeping of the created chain of new nodes for each mismatch.
2037   unsigned Depth = 1;
2038   bool FoundMultipleCalleeChains = false;
2039   if (!findProfiledCalleeThroughTailCalls(
2040           FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2041     LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2042                       << " from " << FSToVIMap[CallerFunc]
2043                       << " that actually called " << Callee
2044                       << (FoundMultipleCalleeChains
2045                               ? " (found multiple possible chains)"
2046                               : "")
2047                       << "\n");
2048     if (FoundMultipleCalleeChains)
2049       FoundProfiledCalleeNonUniquelyCount++;
2050     return false;
2051   }
2052 
2053   return true;
2054 }
2055 
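// Returns a human readable string for the given alloc type bit mask, e.g.
// "NotColdCold" when both the NotCold and Cold bits are set.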
2056 static std::string getAllocTypeString(uint8_t AllocTypes) {
2057   if (!AllocTypes)
2058     return "None";
2059   std::string Str;
2060   if (AllocTypes & (uint8_t)AllocationType::NotCold)
2061     Str += "NotCold";
2062   if (AllocTypes & (uint8_t)AllocationType::Cold)
2063     Str += "Cold";
2064   return Str;
2065 }
2066 
2067 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2068 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2069     const {
2070   print(dbgs());
2071   dbgs() << "\n";
2072 }
2073 
2074 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2075 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2076     raw_ostream &OS) const {
2077   OS << "Node " << this << "\n";
2078   OS << "\t";
2079   printCall(OS);
2080   if (Recursive)
2081     OS << " (recursive)";
2082   OS << "\n";
2083   OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2084   OS << "\tContextIds:";
2085   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2086   std::sort(SortedIds.begin(), SortedIds.end());
2087   for (auto Id : SortedIds)
2088     OS << " " << Id;
2089   OS << "\n";
2090   OS << "\tCalleeEdges:\n";
2091   for (auto &Edge : CalleeEdges)
2092     OS << "\t\t" << *Edge << "\n";
2093   OS << "\tCallerEdges:\n";
2094   for (auto &Edge : CallerEdges)
2095     OS << "\t\t" << *Edge << "\n";
2096   if (!Clones.empty()) {
2097     OS << "\tClones: ";
2098     FieldSeparator FS;
2099     for (auto *Clone : Clones)
2100       OS << FS << Clone;
2101     OS << "\n";
2102   } else if (CloneOf) {
2103     OS << "\tClone of " << CloneOf << "\n";
2104   }
2105 }
2106 
2107 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2108 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2109     const {
2110   print(dbgs());
2111   dbgs() << "\n";
2112 }
2113 
2114 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2115 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2116     raw_ostream &OS) const {
2117   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2118      << " AllocTypes: " << getAllocTypeString(AllocTypes);
2119   OS << " ContextIds:";
2120   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2121   std::sort(SortedIds.begin(), SortedIds.end());
2122   for (auto Id : SortedIds)
2123     OS << " " << Id;
2124 }
2125 
2126 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2127 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2128   print(dbgs());
2129 }
2130 
2131 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2132 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2133     raw_ostream &OS) const {
2134   OS << "Callsite Context Graph:\n";
2135   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2136   for (const auto Node : nodes<GraphType>(this)) {
2137     if (Node->isRemoved())
2138       continue;
2139     Node->print(OS);
2140     OS << "\n";
2141   }
2142 }
2143 
2144 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2145 static void checkEdge(
2146     const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
2147   // Confirm that alloc type is not None and that we have at least one context
2148   // id.
2149   assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
2150   assert(!Edge->ContextIds.empty());
2151 }
2152 
2153 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2154 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
2155                       bool CheckEdges = true) {
2156   if (Node->isRemoved())
2157     return;
2158   // Node's context ids should be the union of both its callee and caller edge
2159   // context ids.
2160   if (Node->CallerEdges.size()) {
2161     auto EI = Node->CallerEdges.begin();
2162     auto &FirstEdge = *EI;
2163     EI++;
2164     DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
2165     for (; EI != Node->CallerEdges.end(); EI++) {
2166       const auto &Edge = *EI;
2167       if (CheckEdges)
2168         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2169       set_union(CallerEdgeContextIds, Edge->ContextIds);
2170     }
2171     // Node can have more context ids than callers if some contexts terminate at
2172     // node and some are longer.
2173     assert(Node->ContextIds == CallerEdgeContextIds ||
2174            set_is_subset(CallerEdgeContextIds, Node->ContextIds));
2175   }
2176   if (Node->CalleeEdges.size()) {
2177     auto EI = Node->CalleeEdges.begin();
2178     auto &FirstEdge = *EI;
2179     EI++;
2180     DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
2181     for (; EI != Node->CalleeEdges.end(); EI++) {
2182       const auto &Edge = *EI;
2183       if (CheckEdges)
2184         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2185       set_union(CalleeEdgeContextIds, Edge->ContextIds);
2186     }
2187     assert(Node->ContextIds == CalleeEdgeContextIds);
2188   }
2189 }
2190 
2191 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2192 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2193   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2194   for (const auto Node : nodes<GraphType>(this)) {
2195     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2196     for (auto &Edge : Node->CallerEdges)
2197       checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2198   }
2199 }
2200 
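// GraphTraits specialization so the context graph can be traversed with LLVM's
// generic graph iterators (used by print() and check() above) and emitted via
// WriteGraph in exportToDot below. Nodes are enumerated from the NodeOwner
// list, and a node's children are the callees of its CalleeEdges.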
2201 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2202 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2203   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2204   using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2205 
2206   using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2207   static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2208 
2209   using nodes_iterator =
2210       mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2211                       decltype(&getNode)>;
2212 
2213   static nodes_iterator nodes_begin(GraphType G) {
2214     return nodes_iterator(G->NodeOwner.begin(), &getNode);
2215   }
2216 
2217   static nodes_iterator nodes_end(GraphType G) {
2218     return nodes_iterator(G->NodeOwner.end(), &getNode);
2219   }
2220 
2221   static NodeRef getEntryNode(GraphType G) {
2222     return G->NodeOwner.begin()->get();
2223   }
2224 
2225   using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2226   static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2227   GetCallee(const EdgePtrTy &P) {
2228     return P->Callee;
2229   }
2230 
2231   using ChildIteratorType =
2232       mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2233                           DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2234                       decltype(&GetCallee)>;
2235 
2236   static ChildIteratorType child_begin(NodeRef N) {
2237     return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2238   }
2239 
2240   static ChildIteratorType child_end(NodeRef N) {
2241     return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2242   }
2243 };
2244 
2245 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2246 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2247     : public DefaultDOTGraphTraits {
2248   DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2249 
2250   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2251   using GTraits = GraphTraits<GraphType>;
2252   using NodeRef = typename GTraits::NodeRef;
2253   using ChildIteratorType = typename GTraits::ChildIteratorType;
2254 
2255   static std::string getNodeLabel(NodeRef Node, GraphType G) {
2256     std::string LabelString =
2257         (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2258          Twine(Node->OrigStackOrAllocId))
2259             .str();
2260     LabelString += "\n";
2261     if (Node->hasCall()) {
2262       auto Func = G->NodeToCallingFunc.find(Node);
2263       assert(Func != G->NodeToCallingFunc.end());
2264       LabelString +=
2265           G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2266     } else {
2267       LabelString += "null call";
2268       if (Node->Recursive)
2269         LabelString += " (recursive)";
2270       else
2271         LabelString += " (external)";
2272     }
2273     return LabelString;
2274   }
2275 
2276   static std::string getNodeAttributes(NodeRef Node, GraphType) {
2277     std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2278                                    getContextIds(Node->ContextIds) + "\"")
2279                                       .str();
2280     AttributeString +=
2281         (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2282     AttributeString += ",style=\"filled\"";
2283     if (Node->CloneOf) {
2284       AttributeString += ",color=\"blue\"";
2285       AttributeString += ",style=\"filled,bold,dashed\"";
2286     } else
2287       AttributeString += ",style=\"filled\"";
2288     return AttributeString;
2289   }
2290 
2291   static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2292                                        GraphType) {
2293     auto &Edge = *(ChildIter.getCurrent());
2294     return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2295             Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2296         .str();
2297   }
2298 
2299   // Since the NodeOwner list includes nodes that are no longer connected to
2300   // the graph, skip them here.
2301   static bool isNodeHidden(NodeRef Node, GraphType) {
2302     return Node->isRemoved();
2303   }
2304 
2305 private:
2306   static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2307     std::string IdString = "ContextIds:";
2308     if (ContextIds.size() < 100) {
2309       std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2310       std::sort(SortedIds.begin(), SortedIds.end());
2311       for (auto Id : SortedIds)
2312         IdString += (" " + Twine(Id)).str();
2313     } else {
2314       IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2315     }
2316     return IdString;
2317   }
2318 
2319   static std::string getColor(uint8_t AllocTypes) {
2320     if (AllocTypes == (uint8_t)AllocationType::NotCold)
2321       // Color "brown1" actually looks like a lighter red.
2322       return "brown1";
2323     if (AllocTypes == (uint8_t)AllocationType::Cold)
2324       return "cyan";
2325     if (AllocTypes ==
2326         ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2327       // Lighter purple.
2328       return "mediumorchid1";
2329     return "gray";
2330   }
2331 
2332   static std::string getNodeId(NodeRef Node) {
2333     std::stringstream SStream;
2334     SStream << std::hex << "N0x" << (unsigned long long)Node;
2335     std::string Result = SStream.str();
2336     return Result;
2337   }
2338 };
2339 
2340 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2341 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2342     std::string Label) const {
2343   WriteGraph(this, "", false, Label,
2344              DotFilePathPrefix + "ccg." + Label + ".dot");
2345 }
2346 
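// Creates a new clone of Edge's current callee node and moves Edge onto it,
// returning the new clone. If CallerEdgeI is non-null, it is advanced past the
// erased entry in the old callee's caller edge list so the caller's iteration
// can continue safely.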
2347 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2348 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
2349 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
2350     const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) {
2351   ContextNode *Node = Edge->Callee;
2352   NodeOwner.push_back(
2353       std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
2354   ContextNode *Clone = NodeOwner.back().get();
2355   Node->addClone(Clone);
2356   assert(NodeToCallingFunc.count(Node));
2357   NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
2358   moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true);
2359   return Clone;
2360 }
2361 
2362 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2363 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2364     moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
2365                                   ContextNode *NewCallee, EdgeIter *CallerEdgeI,
2366                                   bool NewClone) {
2367   // NewCallee and Edge's current callee must be clones of the same original
2368   // node (Edge's current callee may be the original node too).
2369   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
2370   auto &EdgeContextIds = Edge->getContextIds();
2371   ContextNode *OldCallee = Edge->Callee;
2372   if (CallerEdgeI)
2373     *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2374   else
2375     OldCallee->eraseCallerEdge(Edge.get());
2376   Edge->Callee = NewCallee;
2377   NewCallee->CallerEdges.push_back(Edge);
2378   // Don't need to update Edge's context ids since we are simply reconnecting
2379   // it.
2380   set_subtract(OldCallee->ContextIds, EdgeContextIds);
2381   NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end());
2382   NewCallee->AllocTypes |= Edge->AllocTypes;
2383   OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
2384   // OldCallee alloc type should be None iff its context id set is now empty.
2385   assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2386          OldCallee->ContextIds.empty());
2387   // Now walk the old callee node's callee edges and move Edge's context ids
2388   // over to the corresponding edge into the clone (which is created here if
2389   // this is a newly created clone).
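  // Illustrative example (hypothetical context ids): if Edge carries {1, 2} and
  // OldCallee's ContextIds were {1, 2, 3}, then after the move OldCallee keeps
  // {3}, NewCallee gains {1, 2}, and each of OldCallee's callee edges hands the
  // intersection of its ids with {1, 2} to the corresponding edge out of
  // NewCallee (creating that edge if it does not exist yet).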
2390   for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2391     // The context ids moving to the new callee are the subset of this edge's
2392     // context ids and the context ids on the caller edge being moved.
2393     DenseSet<uint32_t> EdgeContextIdsToMove =
2394         set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds);
2395     set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2396     OldCalleeEdge->AllocTypes =
2397         computeAllocType(OldCalleeEdge->getContextIds());
2398     if (!NewClone) {
2399       // Update context ids / alloc type on corresponding edge to NewCallee.
2400       // There is a chance this may not exist if we are reusing an existing
2401       // clone, specifically during function assignment, where we would have
2402       // removed none type edges after creating the clone. If we can't find
2403       // a corresponding edge there, fall through to the cloning below.
2404       if (auto *NewCalleeEdge =
2405               NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2406         NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2407                                               EdgeContextIdsToMove.end());
2408         NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2409         continue;
2410       }
2411     }
2412     auto NewEdge = std::make_shared<ContextEdge>(
2413         OldCalleeEdge->Callee, NewCallee,
2414         computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2415     NewCallee->CalleeEdges.push_back(NewEdge);
2416     NewEdge->Callee->CallerEdges.push_back(NewEdge);
2417   }
2418   if (VerifyCCG) {
2419     checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2420     checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2421     for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2422       checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2423                                             /*CheckEdges=*/false);
2424     for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2425       checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2426                                             /*CheckEdges=*/false);
2427   }
2428 }
2429 
2430 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2431 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2432   DenseSet<const ContextNode *> Visited;
2433   for (auto &Entry : AllocationCallToContextNodeMap)
2434     identifyClones(Entry.second, Visited);
2435 }
2436 
2437 // Helper function to check whether an AllocType is cold, notcold, or both.
2438 bool checkColdOrNotCold(uint8_t AllocType) {
2439   return (AllocType == (uint8_t)AllocationType::Cold) ||
2440          (AllocType == (uint8_t)AllocationType::NotCold) ||
2441          (AllocType ==
2442           ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2443 }
2444 
2445 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2446 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2447     ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2448   if (VerifyNodes)
2449     checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2450   assert(!Node->CloneOf);
2451 
2452   // If Node has a null call, then either it wasn't found in the module (regular
2453   // LTO) or summary index (ThinLTO), or there were other conditions blocking
2454   // cloning (e.g. recursion, calls multiple targets, etc).
2455   // Do this here so that we don't try to recursively clone callers below, which
2456   // isn't useful at least for this node.
2457   if (!Node->hasCall())
2458     return;
2459 
2460 #ifndef NDEBUG
2461   auto Insert =
2462 #endif
2463       Visited.insert(Node);
2464   // We should not have visited this node yet.
2465   assert(Insert.second);
2466   // The recursive call to identifyClones may delete the current edge from the
2467   // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2468   // in an iterator and having the recursive call erase from it. Other edges may
2469   // also get removed during the recursion, which will have null Callee and
2470   // Caller pointers (and are deleted later), so we skip those below.
2471   {
2472     auto CallerEdges = Node->CallerEdges;
2473     for (auto &Edge : CallerEdges) {
2474       // Skip any that have been removed by an earlier recursive call.
2475       if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2476         assert(!llvm::count(Node->CallerEdges, Edge));
2477         continue;
2478       }
2479       // Ignore any caller we previously visited via another edge.
2480       if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2481         identifyClones(Edge->Caller, Visited);
2482       }
2483     }
2484   }
2485 
2486   // Check if we reached an unambiguous call or have only a single caller.
2487   if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2488     return;
2489 
2490   // We need to clone.
2491 
2492   // Try to keep the original version as alloc type NotCold. This will make
2493   // cases with indirect calls or any other situation with an unknown call to
2494   // the original function get the default behavior. We do this by sorting the
2495   // CallerEdges of the Node we will clone by alloc type.
2496   //
2497   // Give NotCold edge the lowest sort priority so those edges are at the end of
2498   // the caller edges vector, and stay on the original version (since the below
2499   // code clones greedily until it finds all remaining edges have the same type
2500   // and leaves the remaining ones on the original Node).
2501   //
2502   // We shouldn't actually have any None type edges, so the sorting priority for
2503   // that is arbitrary, and we assert in that case below.
2504   const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2505                                                /*Cold*/ 1,
2506                                                /*NotColdCold*/ 2};
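  // For example, caller edges with alloc types {NotCold, Cold, NotColdCold}
  // sort to the order Cold, NotColdCold, NotCold, leaving the NotCold edge(s)
  // at the end so they stay on the original node.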
2507   std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2508                    [&](const std::shared_ptr<ContextEdge> &A,
2509                        const std::shared_ptr<ContextEdge> &B) {
2510                      assert(checkColdOrNotCold(A->AllocTypes) &&
2511                             checkColdOrNotCold(B->AllocTypes));
2512 
2513                      if (A->AllocTypes == B->AllocTypes)
2514                        // Use the first context id for each edge as a
2515                        // tie-breaker.
2516                        return *A->ContextIds.begin() < *B->ContextIds.begin();
2517                      return AllocTypeCloningPriority[A->AllocTypes] <
2518                             AllocTypeCloningPriority[B->AllocTypes];
2519                    });
2520 
2521   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2522 
2523   // Iterate until we find no more opportunities for disambiguating the alloc
2524   // types via cloning. In most cases this loop will terminate once the Node
2525   // has a single allocation type, in which case no more cloning is needed.
2526   // We need to be able to remove Edge from CallerEdges, so need to adjust
2527   // iterator inside the loop.
2528   for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2529     auto CallerEdge = *EI;
2530 
2531     // See if cloning the prior caller edge left this node with a single alloc
2532     // type or a single caller. In that case no more cloning of Node is needed.
2533     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2534       break;
2535 
2536     // Compute the node callee edge alloc types corresponding to the context ids
2537     // for this caller edge.
2538     std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2539     CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2540     for (auto &CalleeEdge : Node->CalleeEdges)
2541       CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2542           CalleeEdge->getContextIds(), CallerEdge->getContextIds()));
2543 
2544     // Don't clone if doing so will not disambiguate any alloc types amongst
2545     // caller edges (including the callee edges that would be cloned).
2546     // Otherwise we will simply move all edges to the clone.
2547     //
2548     // First check if by cloning we will disambiguate the caller allocation
2549     // type from node's allocation type. Query allocTypeToUse so that we don't
2550     // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2551     // neither of these should be None type.
2552     //
2553     // Then check if by cloning node at least one of the callee edges will be
2554     // disambiguated by splitting out different context ids.
2555     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2556     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2557     if (allocTypeToUse(CallerEdge->AllocTypes) ==
2558             allocTypeToUse(Node->AllocTypes) &&
2559         allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2560             CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2561       ++EI;
2562       continue;
2563     }
2564 
2565     // First see if we can use an existing clone. Check each clone and its
2566     // callee edges for matching alloc types.
2567     ContextNode *Clone = nullptr;
2568     for (auto *CurClone : Node->Clones) {
2569       if (allocTypeToUse(CurClone->AllocTypes) !=
2570           allocTypeToUse(CallerEdge->AllocTypes))
2571         continue;
2572 
2573       if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2574               CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2575         continue;
2576       Clone = CurClone;
2577       break;
2578     }
2579 
2580     // The edge iterator is adjusted when we move the CallerEdge to the clone.
2581     if (Clone)
2582       moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI);
2583     else
2584       Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI);
2585 
2586     assert(EI == Node->CallerEdges.end() ||
2587            Node->AllocTypes != (uint8_t)AllocationType::None);
2588     // Sanity check that no alloc types on clone or its edges are None.
2589     assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2590     assert(llvm::none_of(
2591         Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2592           return E->AllocTypes == (uint8_t)AllocationType::None;
2593         }));
2594   }
2595 
2596   // Cloning may have resulted in some cloned callee edges with type None,
2597   // because they aren't carrying any contexts. Remove those edges.
2598   for (auto *Clone : Node->Clones) {
2599     removeNoneTypeCalleeEdges(Clone);
2600     if (VerifyNodes)
2601       checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2602   }
2603   // We should still have some context ids on the original Node.
2604   assert(!Node->ContextIds.empty());
2605 
2606   // Remove any callee edges that ended up with alloc type None after creating
2607   // clones and updating callee edges.
2608   removeNoneTypeCalleeEdges(Node);
2609 
2610   // Sanity check that no alloc types on node or edges are None.
2611   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2612   assert(llvm::none_of(Node->CalleeEdges,
2613                        [&](const std::shared_ptr<ContextEdge> &E) {
2614                          return E->AllocTypes == (uint8_t)AllocationType::None;
2615                        }));
2616   assert(llvm::none_of(Node->CallerEdges,
2617                        [&](const std::shared_ptr<ContextEdge> &E) {
2618                          return E->AllocTypes == (uint8_t)AllocationType::None;
2619                        }));
2620 
2621   if (VerifyNodes)
2622     checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2623 }
2624 
2625 void ModuleCallsiteContextGraph::updateAllocationCall(
2626     CallInfo &Call, AllocationType AllocType) {
2627   std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2628   auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2629                                 "memprof", AllocTypeString);
2630   cast<CallBase>(Call.call())->addFnAttr(A);
2631   OREGetter(Call.call()->getFunction())
2632       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2633             << ore::NV("AllocationCall", Call.call()) << " in clone "
2634             << ore::NV("Caller", Call.call()->getFunction())
2635             << " marked with memprof allocation attribute "
2636             << ore::NV("Attribute", AllocTypeString));
2637 }
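// Illustrative IR result (sketch): after this runs, the allocation call carries
// a string function attribute, e.g.
//   %call = call ptr @_Znwm(i64 8) #2
//   attributes #2 = { "memprof"="cold" }
// where the value is whatever getAllocTypeAttributeString returns for the
// chosen type (e.g. "cold" or "notcold").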
2638 
2639 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2640                                                      AllocationType AllocType) {
2641   auto *AI = Call.call().dyn_cast<AllocInfo *>();
2642   assert(AI);
2643   assert(AI->Versions.size() > Call.cloneNo());
2644   AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2645 }
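// In the index case nothing in the IR changes here; the chosen allocation type
// is simply recorded in the summary, e.g. Versions[0] for the original copy and
// Versions[1] for function clone 1, and is applied later by applyImport in the
// ThinLTO backend.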
2646 
2647 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2648                                             FuncInfo CalleeFunc) {
2649   if (CalleeFunc.cloneNo() > 0)
2650     cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2651   OREGetter(CallerCall.call()->getFunction())
2652       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2653             << ore::NV("Call", CallerCall.call()) << " in clone "
2654             << ore::NV("Caller", CallerCall.call()->getFunction())
2655             << " assigned to call function clone "
2656             << ore::NV("Callee", CalleeFunc.func()));
2657 }
2658 
2659 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2660                                            FuncInfo CalleeFunc) {
2661   auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2662   assert(CI &&
2663          "Caller cannot be an allocation which should not have profiled calls");
2664   assert(CI->Clones.size() > CallerCall.cloneNo());
2665   CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2666 }
2667 
2668 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2669                      Instruction *>::FuncInfo
2670 ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2671     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2672     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2673   // Use existing LLVM facilities for cloning and obtaining Call in clone
2674   ValueToValueMapTy VMap;
2675   auto *NewFunc = CloneFunction(Func.func(), VMap);
2676   std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2677   assert(!Func.func()->getParent()->getFunction(Name));
2678   NewFunc->setName(Name);
2679   for (auto &Inst : CallsWithMetadataInFunc) {
2680     // This map always has the initial version in it.
2681     assert(Inst.cloneNo() == 0);
2682     CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2683   }
2684   OREGetter(Func.func())
2685       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2686             << "created clone " << ore::NV("NewFunction", NewFunc));
2687   return {NewFunc, CloneNo};
2688 }
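// Naming note (assumed convention): getMemProfFuncName appends the memprof
// clone suffix and the clone number, so clone 1 of "foo" would be named along
// the lines of "foo.memprof.1"; IsMemProfClone in applyImport below matches on
// the same suffix.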
2689 
2690 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2691                      IndexCall>::FuncInfo
2692 IndexCallsiteContextGraph::cloneFunctionForCallsite(
2693     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2694     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2695   // Check how many clones we have of Call (and therefore function).
2696   // The next clone number is the current size of versions array.
2697   // Confirm this matches the CloneNo provided by the caller, which is based on
2698   // the number of function clones we have.
2699   assert(CloneNo ==
2700          (Call.call().is<AllocInfo *>()
2701               ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2702               : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2703   // Walk the calls with memprof metadata in this function. Create a new version
2704   // for each (by adding an entry to the Versions/Clones summary array), and copy
2705   // over the version being called for the function clone being cloned here.
2706   // Additionally, add an entry to the CallMap for the new function clone,
2707   // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2708   // to the new call clone.
2709   for (auto &Inst : CallsWithMetadataInFunc) {
2710     // This map always has the initial version in it.
2711     assert(Inst.cloneNo() == 0);
2712     if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2713       assert(AI->Versions.size() == CloneNo);
2714       // We assign the allocation type later (in updateAllocationCall), just add
2715       // an entry for it here.
2716       AI->Versions.push_back(0);
2717     } else {
2718       auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2719       assert(CI && CI->Clones.size() == CloneNo);
2720       // We assign the clone number later (in updateCall), just add an entry for
2721       // it here.
2722       CI->Clones.push_back(0);
2723     }
2724     CallMap[Inst] = {Inst.call(), CloneNo};
2725   }
2726   return {Func.func(), CloneNo};
2727 }
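// For example (sketch): when creating function clone 2, an AllocInfo whose
// Versions array was {v0, v1} becomes {v0, v1, 0} and a CallsiteInfo whose
// Clones array was {c0, c1} becomes {c0, c1, 0}; the placeholder 0 entries are
// filled in later by updateAllocationCall / updateCall.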
2728 
2729 // This method assigns cloned callsites to functions, cloning the functions as
2730 // needed. The assignment is greedy and proceeds roughly as follows:
2731 //
2732 // For each function Func:
2733 //   For each call with graph Node having clones:
2734 //     Initialize ClonesWorklist to Node and its clones
2735 //     Initialize NodeCloneCount to 0
2736 //     While ClonesWorklist is not empty:
2737 //        Clone = pop front ClonesWorklist
2738 //        NodeCloneCount++
2739 //        If Func has been cloned less than NodeCloneCount times:
2740 //           If NodeCloneCount is 1:
2741 //             Assign Clone to original Func
2742 //             Continue
2743 //           Create a new function clone
2744 //           If other callers not assigned to call a function clone yet:
2745 //              Assign them to call new function clone
2746 //              Continue
2747 //           Assign any other caller calling the cloned version to new clone
2748 //
2749 //        For each caller of Clone:
2750 //           If caller is assigned to call a specific function clone:
2751 //             If we cannot assign Clone to that function clone:
2752 //               Create new callsite Clone NewClone
2753 //               Add NewClone to ClonesWorklist
2754 //               Continue
2755 //             Assign Clone to existing caller's called function clone
2756 //           Else:
2757 //             If Clone not already assigned to a function clone:
2758 //                Assign to first function clone without assignment
2759 //             Assign caller to selected function clone
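// For example (illustrative): suppose function Foo has one callsite node N that
// identifyClones split into N (NotCold contexts) and a clone N' (Cold
// contexts). The worklist is {N, N'}. N is assigned to the original Foo. When
// N' is processed, FuncClonesToCallMap.size() (1) is less than NodeCloneCount
// (2), so Foo itself is cloned (e.g. to Foo.memprof.1), N' is assigned to that
// new function clone and its callers are recorded as calling it, while the
// callers of N keep calling the original Foo.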
2760 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2761 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2762   bool Changed = false;
2763 
2764   // Keep track of the assignment of nodes (callsites) to function clones they
2765   // call.
2766   DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2767 
2768   // Update caller node to call function version CalleeFunc, by recording the
2769   // assignment in CallsiteToCalleeFuncCloneMap.
2770   auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2771                                         const FuncInfo &CalleeFunc) {
2772     assert(Caller->hasCall());
2773     CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2774   };
2775 
2776   // Walk all functions for which we saw calls with memprof metadata, and handle
2777   // cloning for each of their calls.
2778   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2779     FuncInfo OrigFunc(Func);
2780     // Map from each clone of OrigFunc to a map of remappings of each call of
2781     // interest (from original uncloned call to the corresponding cloned call in
2782     // that function clone).
2783     std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2784     for (auto &Call : CallsWithMetadata) {
2785       ContextNode *Node = getNodeForInst(Call);
2786       // Skip call if we do not have a node for it (all uses of its stack ids
2787       // were either on inlined chains or pruned from the MIBs), or if we did
2788       // not create any clones for it.
2789       if (!Node || Node->Clones.empty())
2790         continue;
2791       assert(Node->hasCall() &&
2792              "Not having a call should have prevented cloning");
2793 
2794       // Track the assignment of function clones to clones of the current
2795       // callsite Node being handled.
2796       std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
2797 
2798       // Assign callsite version CallsiteClone to function version FuncClone,
2799       // and also assign (possibly cloned) Call to CallsiteClone.
2800       auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
2801                                                 CallInfo &Call,
2802                                                 ContextNode *CallsiteClone,
2803                                                 bool IsAlloc) {
2804         // Record the clone of callsite node assigned to this function clone.
2805         FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
2806 
2807         assert(FuncClonesToCallMap.count(FuncClone));
2808         std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
2809         CallInfo CallClone(Call);
2810         if (CallMap.count(Call))
2811           CallClone = CallMap[Call];
2812         CallsiteClone->setCall(CallClone);
2813       };
2814 
2815       // Keep track of the clones of callsite Node that need to be assigned to
2816       // function clones. This list may be expanded in the loop body below if we
2817       // find additional cloning is required.
2818       std::deque<ContextNode *> ClonesWorklist;
2819       // Ignore original Node if we moved all of its contexts to clones.
2820       if (!Node->ContextIds.empty())
2821         ClonesWorklist.push_back(Node);
2822       ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
2823                             Node->Clones.end());
2824 
2825       // Now walk through all of the clones of this callsite Node that we need,
2826       // and determine the assignment to a corresponding clone of the current
2827       // function (creating new function clones as needed).
2828       unsigned NodeCloneCount = 0;
2829       while (!ClonesWorklist.empty()) {
2830         ContextNode *Clone = ClonesWorklist.front();
2831         ClonesWorklist.pop_front();
2832         NodeCloneCount++;
2833         if (VerifyNodes)
2834           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2835 
2836         // Need to create a new function clone if we have more callsite clones
2837         // than existing function clones, which would have been assigned to an
2838         // earlier clone in the list (we assign callsite clones to function
2839         // clones greedily).
2840         if (FuncClonesToCallMap.size() < NodeCloneCount) {
2841           // If this is the first callsite copy, assign to original function.
2842           if (NodeCloneCount == 1) {
2843             // Since FuncClonesToCallMap is empty in this case, no clones have
2844             // been created for this function yet, and no callers should have
2845             // been assigned a function clone for this callee node yet.
2846             assert(llvm::none_of(
2847                 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2848                   return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2849                 }));
2850             // Initialize with empty call map, assign Clone to original function
2851             // and its callers, and skip to the next clone.
2852             FuncClonesToCallMap[OrigFunc] = {};
2853             AssignCallsiteCloneToFuncClone(
2854                 OrigFunc, Call, Clone,
2855                 AllocationCallToContextNodeMap.count(Call));
2856             for (auto &CE : Clone->CallerEdges) {
2857               // Ignore any caller that does not have a recorded callsite Call.
2858               if (!CE->Caller->hasCall())
2859                 continue;
2860               RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
2861             }
2862             continue;
2863           }
2864 
2865           // First locate which copy of OrigFunc to clone again. If a caller
2866           // of this callsite clone was already assigned to call a particular
2867           // function clone, we need to redirect all of those callers to the
2868           // new function clone, and update their other callees within this
2869           // function.
2870           FuncInfo PreviousAssignedFuncClone;
2871           auto EI = llvm::find_if(
2872               Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2873                 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2874               });
2875           bool CallerAssignedToCloneOfFunc = false;
2876           if (EI != Clone->CallerEdges.end()) {
2877             const std::shared_ptr<ContextEdge> &Edge = *EI;
2878             PreviousAssignedFuncClone =
2879                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2880             CallerAssignedToCloneOfFunc = true;
2881           }
2882 
2883           // Clone function and save it along with the CallInfo map created
2884           // during cloning in the FuncClonesToCallMap.
2885           std::map<CallInfo, CallInfo> NewCallMap;
2886           unsigned CloneNo = FuncClonesToCallMap.size();
2887           assert(CloneNo > 0 && "Clone 0 is the original function, which "
2888                                 "should already exist in the map");
2889           FuncInfo NewFuncClone = cloneFunctionForCallsite(
2890               OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
2891           FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
2892           FunctionClonesAnalysis++;
2893           Changed = true;
2894 
2895           // If no caller callsites were already assigned to a clone of this
2896           // function, we can simply assign this clone to the new func clone
2897           // and update all callers to it, then skip to the next clone.
2898           if (!CallerAssignedToCloneOfFunc) {
2899             AssignCallsiteCloneToFuncClone(
2900                 NewFuncClone, Call, Clone,
2901                 AllocationCallToContextNodeMap.count(Call));
2902             for (auto &CE : Clone->CallerEdges) {
2903               // Ignore any caller that does not have a recorded callsite Call.
2904               if (!CE->Caller->hasCall())
2905                 continue;
2906               RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2907             }
2908             continue;
2909           }
2910 
2911           // We may need to do additional node cloning in this case.
2912           // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
2913           // that were previously assigned to call PreviousAssignedFuncClone,
2914           // to record that they now call NewFuncClone.
2915           for (auto CE : Clone->CallerEdges) {
2916             // Skip any that have been removed on an earlier iteration.
2917             if (!CE)
2918               continue;
2919             // Ignore any caller that does not have a recorded callsite Call.
2920             if (!CE->Caller->hasCall())
2921               continue;
2922 
2923             if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
2924                 // We subsequently fall through to later handling that
2925                 // will perform any additional cloning required for
2926                 // callers that were calling other function clones.
2927                 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
2928                     PreviousAssignedFuncClone)
2929               continue;
2930 
2931             RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2932 
2933             // If we are cloning a function that was already assigned to some
2934             // callers, then essentially we are creating new callsite clones
2935             // of the other callsites in that function that are reached by those
2936             // callers. Clone the other callees of the current callsite's caller
2937             // that were already assigned to PreviousAssignedFuncClone
2938             // accordingly. This is important since we subsequently update the
2939             // calls from the nodes in the graph and their assignments to callee
2940             // functions recorded in CallsiteToCalleeFuncCloneMap.
2941             for (auto CalleeEdge : CE->Caller->CalleeEdges) {
2942               // Skip any that have been removed on an earlier iteration when
2943               // cleaning up newly None type callee edges.
2944               if (!CalleeEdge)
2945                 continue;
2946               ContextNode *Callee = CalleeEdge->Callee;
2947               // Skip the current callsite, we are looking for other
2948               // callsites Caller calls, as well as any that does not have a
2949               // recorded callsite Call.
2950               if (Callee == Clone || !Callee->hasCall())
2951                 continue;
2952               ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
2953               removeNoneTypeCalleeEdges(NewClone);
2954               // Moving the edge may have resulted in some none type
2955               // callee edges on the original Callee.
2956               removeNoneTypeCalleeEdges(Callee);
2957               assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
2958               // If the Callee node was already assigned to call a specific
2959               // function version, make sure its new clone is assigned to call
2960               // that same function clone.
2961               if (CallsiteToCalleeFuncCloneMap.count(Callee))
2962                 RecordCalleeFuncOfCallsite(
2963                     NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
2964               // Update NewClone with the new Call clone of this callsite's Call
2965               // created for the new function clone created earlier.
2966               // Recall that we have already ensured when building the graph
2967               // that each caller can only call callsites within the same
2968               // function, so we are guaranteed that Callee Call is in the
2969               // current OrigFunc.
2970               // CallMap is set up as indexed by original Call at clone 0.
2971               CallInfo OrigCall(Callee->getOrigNode()->Call);
2972               OrigCall.setCloneNo(0);
2973               std::map<CallInfo, CallInfo> &CallMap =
2974                   FuncClonesToCallMap[NewFuncClone];
2975               assert(CallMap.count(OrigCall));
2976               CallInfo NewCall(CallMap[OrigCall]);
2977               assert(NewCall);
2978               NewClone->setCall(NewCall);
2979             }
2980           }
2981           // Fall through to handling below to perform the recording of the
2982           // function for this callsite clone. This enables handling of cases
2983           // where the callers were assigned to different clones of a function.
2984         }
2985 
2986         // See if we can use existing function clone. Walk through
2987         // all caller edges to see if any have already been assigned to
2988         // a clone of this callsite's function. If we can use it, do so. If not,
2989         // because that function clone is already assigned to a different clone
2990         // of this callsite, then we need to clone again.
2991         // Basically, this checking is needed to handle the case where different
2992         // caller functions/callsites may need versions of this function
2993         // containing different mixes of callsite clones across the different
2994         // callsites within the function. If that happens, we need to create
2995         // additional function clones to handle the various combinations.
2996         //
2997         // Keep track of any new clones of this callsite created by the
2998         // following loop, as well as any existing clone that we decided to
2999         // assign this clone to.
3000         std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3001         FuncInfo FuncCloneAssignedToCurCallsiteClone;
3002         // We need to be able to remove Edge from CallerEdges, so need to adjust
3003         // iterator in the loop.
3004         for (auto EI = Clone->CallerEdges.begin();
3005              EI != Clone->CallerEdges.end();) {
3006           auto Edge = *EI;
3007           // Ignore any caller that does not have a recorded callsite Call.
3008           if (!Edge->Caller->hasCall()) {
3009             EI++;
3010             continue;
3011           }
3012           // If this caller already assigned to call a version of OrigFunc, need
3013           // to ensure we can assign this callsite clone to that function clone.
3014           if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3015             FuncInfo FuncCloneCalledByCaller =
3016                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3017             // First we need to confirm that this function clone is available
3018             // for use by this callsite node clone.
3019             //
3020             // While FuncCloneToCurNodeCloneMap is built only for this Node and
3021             // its callsite clones, one of those callsite clones X could have
3022             // been assigned to the same function clone called by Edge's caller
3023             // - if Edge's caller calls another callsite within Node's original
3024             // function, and that callsite has another caller reaching clone X.
3025             // We need to clone Node again in this case.
3026             if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3027                  FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3028                      Clone) ||
3029                 // Detect when we have multiple callers of this callsite that
3030                 // have already been assigned to specific, and different, clones
3031                 // of OrigFunc (due to other unrelated callsites in Func they
3032                 // reach via call contexts). Is this Clone of callsite Node
3033                 // assigned to a different clone of OrigFunc? If so, clone Node
3034                 // again.
3035                 (FuncCloneAssignedToCurCallsiteClone &&
3036                  FuncCloneAssignedToCurCallsiteClone !=
3037                      FuncCloneCalledByCaller)) {
3038               // We need to use a different newly created callsite clone, in
3039               // order to assign it to another new function clone on a
3040               // subsequent iteration over the Clones array (adjusted below).
3041               // Note we specifically do not reset the
3042               // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3043               // when this new clone is processed later we know which version of
3044               // the function to copy (so that other callsite clones we have
3045               // assigned to that function clone are properly cloned over). See
3046               // comments in the function cloning handling earlier.
3047 
3048               // Check if we already have cloned this callsite again while
3049               // walking through caller edges, for a caller calling the same
3050               // function clone. If so, we can move this edge to that new clone
3051               // rather than creating yet another new clone.
3052               if (FuncCloneToNewCallsiteCloneMap.count(
3053                       FuncCloneCalledByCaller)) {
3054                 ContextNode *NewClone =
3055                     FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3056                 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3057                 // Cleanup any none type edges cloned over.
3058                 removeNoneTypeCalleeEdges(NewClone);
3059               } else {
3060                 // Create a new callsite clone.
3061                 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3062                 removeNoneTypeCalleeEdges(NewClone);
3063                 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3064                     NewClone;
3065                 // Add to list of clones and process later.
3066                 ClonesWorklist.push_back(NewClone);
3067                 assert(EI == Clone->CallerEdges.end() ||
3068                        Clone->AllocTypes != (uint8_t)AllocationType::None);
3069                 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3070               }
3071               // Moving the caller edge may have resulted in some none type
3072               // callee edges.
3073               removeNoneTypeCalleeEdges(Clone);
3074               // We will handle the newly created callsite clone in a subsequent
3075               // iteration over this Node's Clones. Continue here since we
3076               // already adjusted iterator EI while moving the edge.
3077               continue;
3078             }
3079 
3080             // Otherwise, we can use the function clone already assigned to this
3081             // caller.
3082             if (!FuncCloneAssignedToCurCallsiteClone) {
3083               FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3084               // Assign Clone to FuncCloneCalledByCaller
3085               AssignCallsiteCloneToFuncClone(
3086                   FuncCloneCalledByCaller, Call, Clone,
3087                   AllocationCallToContextNodeMap.count(Call));
3088             } else
3089               // Don't need to do anything - callsite is already calling this
3090               // function clone.
3091               assert(FuncCloneAssignedToCurCallsiteClone ==
3092                      FuncCloneCalledByCaller);
3093 
3094           } else {
3095             // We have not already assigned this caller to a version of
3096             // OrigFunc. Do the assignment now.
3097 
3098             // First check if we have already assigned this callsite clone to a
3099             // clone of OrigFunc for another caller during this iteration over
3100             // its caller edges.
3101             if (!FuncCloneAssignedToCurCallsiteClone) {
3102               // Find first function in FuncClonesToCallMap without an assigned
3103               // clone of this callsite Node. We should always have one
3104               // available at this point due to the earlier cloning when the
3105               // FuncClonesToCallMap size was smaller than the clone number.
3106               for (auto &CF : FuncClonesToCallMap) {
3107                 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3108                   FuncCloneAssignedToCurCallsiteClone = CF.first;
3109                   break;
3110                 }
3111               }
3112               assert(FuncCloneAssignedToCurCallsiteClone);
3113               // Assign Clone to FuncCloneAssignedToCurCallsiteClone
3114               AssignCallsiteCloneToFuncClone(
3115                   FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3116                   AllocationCallToContextNodeMap.count(Call));
3117             } else
3118               assert(FuncCloneToCurNodeCloneMap
3119                          [FuncCloneAssignedToCurCallsiteClone] == Clone);
3120             // Update callers to record function version called.
3121             RecordCalleeFuncOfCallsite(Edge->Caller,
3122                                        FuncCloneAssignedToCurCallsiteClone);
3123           }
3124 
3125           EI++;
3126         }
3127       }
3128       if (VerifyCCG) {
3129         checkNode<DerivedCCG, FuncTy, CallTy>(Node);
3130         for (const auto &PE : Node->CalleeEdges)
3131           checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3132         for (const auto &CE : Node->CallerEdges)
3133           checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3134         for (auto *Clone : Node->Clones) {
3135           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3136           for (const auto &PE : Clone->CalleeEdges)
3137             checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3138           for (const auto &CE : Clone->CallerEdges)
3139             checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3140         }
3141       }
3142     }
3143   }
3144 
3145   auto UpdateCalls = [&](ContextNode *Node,
3146                          DenseSet<const ContextNode *> &Visited,
3147                          auto &&UpdateCalls) {
3148     auto Inserted = Visited.insert(Node);
3149     if (!Inserted.second)
3150       return;
3151 
3152     for (auto *Clone : Node->Clones)
3153       UpdateCalls(Clone, Visited, UpdateCalls);
3154 
3155     for (auto &Edge : Node->CallerEdges)
3156       UpdateCalls(Edge->Caller, Visited, UpdateCalls);
3157 
3158     // Skip if either no call to update, or if we ended up with no context ids
3159     // (we moved all edges onto other clones).
3160     if (!Node->hasCall() || Node->ContextIds.empty())
3161       return;
3162 
3163     if (Node->IsAllocation) {
3164       updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3165       return;
3166     }
3167 
3168     if (!CallsiteToCalleeFuncCloneMap.count(Node))
3169       return;
3170 
3171     auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
3172     updateCall(Node->Call, CalleeFunc);
3173   };
3174 
3175   // Performs DFS traversal starting from allocation nodes to update calls to
3176   // reflect cloning decisions recorded earlier. For regular LTO this will
3177   // update the actual calls in the IR to call the appropriate function clone
3178   // (and add attributes to allocation calls), whereas for ThinLTO the decisions
3179   // are recorded in the summary entries.
3180   DenseSet<const ContextNode *> Visited;
3181   for (auto &Entry : AllocationCallToContextNodeMap)
3182     UpdateCalls(Entry.second, Visited, UpdateCalls);
3183 
3184   return Changed;
3185 }
3186 
3187 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3188     Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
3189     std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3190         &FuncToAliasMap) {
3191 // The first "clone" is the original copy; we should only call this if we
3192   // needed to create new clones.
3193   assert(NumClones > 1);
3194   SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3195   VMaps.reserve(NumClones - 1);
3196   FunctionsClonedThinBackend++;
3197   for (unsigned I = 1; I < NumClones; I++) {
3198     VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
3199     auto *NewF = CloneFunction(&F, *VMaps.back());
3200     FunctionClonesThinBackend++;
3201     // Strip memprof and callsite metadata from clone as they are no longer
3202     // needed.
3203     for (auto &BB : *NewF) {
3204       for (auto &Inst : BB) {
3205         Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
3206         Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3207       }
3208     }
3209     std::string Name = getMemProfFuncName(F.getName(), I);
3210     auto *PrevF = M.getFunction(Name);
3211     if (PrevF) {
3212       // We might have created this when adjusting a callsite in another
3213       // function. It should be a declaration.
3214       assert(PrevF->isDeclaration());
3215       NewF->takeName(PrevF);
3216       PrevF->replaceAllUsesWith(NewF);
3217       PrevF->eraseFromParent();
3218     } else
3219       NewF->setName(Name);
3220     ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
3221              << "created clone " << ore::NV("NewFunction", NewF));
3222 
3223     // Now handle aliases to this function, and clone those as well.
3224     if (!FuncToAliasMap.count(&F))
3225       continue;
3226     for (auto *A : FuncToAliasMap[&F]) {
3227       std::string Name = getMemProfFuncName(A->getName(), I);
3228       auto *PrevA = M.getNamedAlias(Name);
3229       auto *NewA = GlobalAlias::create(A->getValueType(),
3230                                        A->getType()->getPointerAddressSpace(),
3231                                        A->getLinkage(), Name, NewF);
3232       NewA->copyAttributesFrom(A);
3233       if (PrevA) {
3234         // We might have created this when adjusting a callsite in another
3235         // function. It should be a declaration.
3236         assert(PrevA->isDeclaration());
3237         NewA->takeName(PrevA);
3238         PrevA->replaceAllUsesWith(NewA);
3239         PrevA->eraseFromParent();
3240       }
3241     }
3242   }
3243   return VMaps;
3244 }
3245 
3246 // Locate the summary for F. This is complicated by the fact that it might
3247 // have been internalized or promoted.
3248 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
3249                                       const ModuleSummaryIndex *ImportSummary) {
3250   // FIXME: Ideally we would retain the original GUID in some fashion on the
3251   // function (e.g. as metadata), but for now do our best to locate the
3252   // summary without that information.
3253   ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3254   if (!TheFnVI)
3255     // See if the function was internalized, by checking the index directly with
3256     // original name (this avoids the name adjustment done by getGUID() for
3257     // internal symbols).
3258     TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3259   if (TheFnVI)
3260     return TheFnVI;
3261   // Now query with the original name before any promotion was performed.
3262   StringRef OrigName =
3263       ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
3264   std::string OrigId = GlobalValue::getGlobalIdentifier(
3265       OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
3266   TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3267   if (TheFnVI)
3268     return TheFnVI;
3269   // Could be a promoted local imported from another module. We need to pass
3270   // down more info here to find the original module id. For now, try with
3271   // the OrigName which might have been stored in the OidGuidMap in the
3272   // index. This would not work if there were same-named locals in multiple
3273   // modules, however.
3274   auto OrigGUID =
3275       ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3276   if (OrigGUID)
3277     TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3278   return TheFnVI;
3279 }
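// For example (sketch): for a function that was originally an internal "foo" in
// a.cpp, the lookups above try, in order, the GUID of its current (possibly
// promoted) name, the GUID of the plain name "foo", the GUID of its original
// source-file-qualified local identifier, and finally the original-GUID map in
// the index.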
3280 
3281 bool MemProfContextDisambiguation::applyImport(Module &M) {
3282   assert(ImportSummary);
3283   bool Changed = false;
3284 
3285   auto IsMemProfClone = [](const Function &F) {
3286     return F.getName().contains(MemProfCloneSuffix);
3287   };
3288 
3289   // We also need to clone any aliases that reference cloned functions, because
3290   // the modified callsites may invoke via the alias. Keep track of the aliases
3291   // for each function.
3292   std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3293       FuncToAliasMap;
3294   for (auto &A : M.aliases()) {
3295     auto *Aliasee = A.getAliaseeObject();
3296     if (auto *F = dyn_cast<Function>(Aliasee))
3297       FuncToAliasMap[F].insert(&A);
3298   }
3299 
3300   for (auto &F : M) {
3301     if (F.isDeclaration() || IsMemProfClone(F))
3302       continue;
3303 
3304     OptimizationRemarkEmitter ORE(&F);
3305 
3306     SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3307     bool ClonesCreated = false;
3308     unsigned NumClonesCreated = 0;
3309     auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3310       // We should at least have version 0 which is the original copy.
3311       assert(NumClones > 0);
3312       // If only one copy is needed, use the original.
3313       if (NumClones == 1)
3314         return;
3315       // If we already performed cloning of this function, confirm that the
3316       // requested number of clones matches (the thin link should ensure the
3317       // number of clones for each constituent callsite is consistent within
3318       // each function), before returning.
3319       if (ClonesCreated) {
3320         assert(NumClonesCreated == NumClones);
3321         return;
3322       }
3323       VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3324       // The first "clone" is the original copy, which doesn't have a VMap.
3325       assert(VMaps.size() == NumClones - 1);
3326       Changed = true;
3327       ClonesCreated = true;
3328       NumClonesCreated = NumClones;
3329     };
3330 
3331     auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3332                              Function *CalledFunction) {
3333       // Perform cloning if not yet done.
3334       CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3335 
3336       // Should have skipped indirect calls via mayHaveMemprofSummary.
3337       assert(CalledFunction);
3338       assert(!IsMemProfClone(*CalledFunction));
3339 
3340       // Update the calls per the summary info.
3341       // Save orig name since it gets updated in the first iteration
3342       // below.
3343       auto CalleeOrigName = CalledFunction->getName();
3344       for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3345         // Do nothing if this version calls the original version of its
3346         // callee.
3347         if (!StackNode.Clones[J])
3348           continue;
3349         auto NewF = M.getOrInsertFunction(
3350             getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3351             CalledFunction->getFunctionType());
3352         CallBase *CBClone;
3353         // Copy 0 is the original function.
3354         if (!J)
3355           CBClone = CB;
3356         else
3357           CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3358         CBClone->setCalledFunction(NewF);
3359         ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3360                  << ore::NV("Call", CBClone) << " in clone "
3361                  << ore::NV("Caller", CBClone->getFunction())
3362                  << " assigned to call function clone "
3363                  << ore::NV("Callee", NewF.getCallee()));
3364       }
3365     };
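    // For example (illustrative): if StackNode.Clones is {0, 2}, copy 0 of this
    // function keeps calling the original callee, while copy 1 (located via
    // VMaps[0]) is redirected to the callee's clone 2, whose declaration is
    // created on demand by getOrInsertFunction.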
3366 
3367     // Locate the summary for F.
3368     ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3369     // If not found, this could be an imported local (see comment in
3370     // findValueInfoForFunc). Skip for now as it will be cloned in its original
3371     // module (where it would have been promoted to global scope so should
3372     // satisfy any reference in this module).
3373     if (!TheFnVI)
3374       continue;
3375 
3376     auto *GVSummary =
3377         ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3378     if (!GVSummary)
3379       // Must have been imported, use the first summary (might be multiple if
3380       // this was a linkonce_odr).
3381       GVSummary = TheFnVI.getSummaryList().front().get();
3382 
3383     // If this was an imported alias skip it as we won't have the function
3384     // summary, and it should be cloned in the original module.
3385     if (isa<AliasSummary>(GVSummary))
3386       continue;
3387 
3388     auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3389 
3390     if (FS->allocs().empty() && FS->callsites().empty())
3391       continue;
3392 
3393     auto SI = FS->callsites().begin();
3394     auto AI = FS->allocs().begin();
3395 
3396     // To handle callsite infos synthesized for tail calls which have missing
3397     // frames in the profiled context, map callee VI to the synthesized callsite
3398     // info.
3399     DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
3400     // Iterate the callsites for this function in reverse, since we place all
3401     // those synthesized for tail calls at the end.
3402     for (auto CallsiteIt = FS->callsites().rbegin();
3403          CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
3404       auto &Callsite = *CallsiteIt;
3405       // Stop as soon as we see a non-synthesized callsite info (see comment
3406       // above loop). All the entries added for discovered tail calls have empty
3407       // stack ids.
3408       if (!Callsite.StackIdIndices.empty())
3409         break;
3410       MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3411     }
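    // Sketch of the intent (assumed scenario): if the profiled frames show this
    // function calling C, but the IR only contains a direct call to B that
    // reaches C via tail calls, the summary holds a synthesized callsite entry
    // for the missing frame with empty StackIdIndices; the map built above lets
    // the matching below find that entry by callee ValueInfo.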
3412 
3413     // Assume for now that the instructions are in the exact same order
3414     // as when the summary was created, but confirm this is correct by
3415     // matching the stack ids.
3416     for (auto &BB : F) {
3417       for (auto &I : BB) {
3418         auto *CB = dyn_cast<CallBase>(&I);
3419         // Same handling as when creating module summary.
3420         if (!mayHaveMemprofSummary(CB))
3421           continue;
3422 
3423         auto *CalledValue = CB->getCalledOperand();
3424         auto *CalledFunction = CB->getCalledFunction();
3425         if (CalledValue && !CalledFunction) {
3426           CalledValue = CalledValue->stripPointerCasts();
3427           // Stripping pointer casts can reveal a called function.
3428           CalledFunction = dyn_cast<Function>(CalledValue);
3429         }
3430         // Check if this is an alias to a function. If so, get the
3431         // called aliasee for the checks below.
3432         if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3433           assert(!CalledFunction &&
3434                  "Expected null called function in callsite for alias");
3435           CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3436         }
3437 
        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
            I.getMetadata(LLVMContext::MD_callsite));
        auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);

        // Include allocs that were already assigned a memprof function
        // attribute in the statistics.
        if (CB->getAttributes().hasFnAttr("memprof")) {
          assert(!MemProfMD);
          CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
              ? AllocTypeColdThinBackend++
              : AllocTypeNotColdThinBackend++;
          OrigAllocsThinBackend++;
          AllocVersionsThinBackend++;
          if (!MaxAllocVersionsThinBackend)
            MaxAllocVersionsThinBackend = 1;
          // Remove any remaining callsite metadata; we can then skip the rest
          // of the handling for this instruction, since no cloning is needed.
          I.setMetadata(LLVMContext::MD_callsite, nullptr);
          continue;
        }

        if (MemProfMD) {
          // Consult the next alloc node.
          assert(AI != FS->allocs().end());
          auto &AllocNode = *(AI++);

          // Sanity check that the MIB stack ids match between the summary and
          // instruction metadata.
          auto MIBIter = AllocNode.MIBs.begin();
          for (auto &MDOp : MemProfMD->operands()) {
            assert(MIBIter != AllocNode.MIBs.end());
            LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
                MIBIter->StackIdIndices.begin();
            auto *MIBMD = cast<const MDNode>(MDOp);
            MDNode *StackMDNode = getMIBStackNode(MIBMD);
            assert(StackMDNode);
            SmallVector<unsigned> StackIdsFromMetadata;
            CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
            for (auto ContextIter =
                     StackContext.beginAfterSharedPrefix(CallsiteContext);
                 ContextIter != StackContext.end(); ++ContextIter) {
              // If this is a direct recursion, simply skip the duplicate
              // entries, to be consistent with how the summary ids were
              // generated during ModuleSummaryAnalysis.
              if (!StackIdsFromMetadata.empty() &&
                  StackIdsFromMetadata.back() == *ContextIter)
                continue;
              assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
              assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                     *ContextIter);
              StackIdIndexIter++;
            }
            MIBIter++;
          }

          // Perform cloning if not yet done.
          CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());

          OrigAllocsThinBackend++;
          AllocVersionsThinBackend += AllocNode.Versions.size();
          if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
            MaxAllocVersionsThinBackend = AllocNode.Versions.size();

          // If there is only one version, we did not end up considering this
          // function for cloning; in that case the alloc will still have None
          // type or should have gotten the default NotCold. Handle this after
          // calling the clone helper, since it performs sanity checks that
          // confirm we have not yet decided that cloning is needed.
          if (AllocNode.Versions.size() == 1) {
            assert((AllocationType)AllocNode.Versions[0] ==
                       AllocationType::NotCold ||
                   (AllocationType)AllocNode.Versions[0] ==
                       AllocationType::None);
            UnclonableAllocsThinBackend++;
            continue;
          }

          // All versions should have a singular allocation type.
          assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
            return Type == ((uint8_t)AllocationType::NotCold |
                            (uint8_t)AllocationType::Cold);
          }));

          // Update the allocation types per the summary info.
          for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
            // Ignore any that didn't get an assigned allocation type.
            if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
              continue;
            AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
            AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
                                            : AllocTypeNotColdThinBackend++;
            std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
            auto A = llvm::Attribute::get(F.getContext(), "memprof",
                                          AllocTypeString);
            CallBase *CBClone;
            // Copy 0 is the original function.
            if (!J)
              CBClone = CB;
            else
              // Since VMaps are only created for new clones, we index with
              // clone J-1 (J==0 is the original clone and does not have a VMaps
              // entry).
              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
            CBClone->addFnAttr(A);
            ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
                     << ore::NV("AllocationCall", CBClone) << " in clone "
                     << ore::NV("Caller", CBClone->getFunction())
                     << " marked with memprof allocation attribute "
                     << ore::NV("Attribute", AllocTypeString));
          }
        } else if (!CallsiteContext.empty()) {
          // Consult the next callsite node.
          assert(SI != FS->callsites().end());
          auto &StackNode = *(SI++);

#ifndef NDEBUG
          // Sanity check that the stack ids match between the summary and
          // instruction metadata.
          auto StackIdIndexIter = StackNode.StackIdIndices.begin();
          for (auto StackId : CallsiteContext) {
            assert(StackIdIndexIter != StackNode.StackIdIndices.end());
            assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
                   StackId);
            StackIdIndexIter++;
          }
#endif

          CloneCallsite(StackNode, CB, CalledFunction);
        } else if (CB->isTailCall()) {
          // Locate the synthesized callsite info for the callee VI, if any was
          // created, and use that for cloning.
          ValueInfo CalleeVI =
              findValueInfoForFunc(*CalledFunction, M, ImportSummary);
          if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
            auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
            assert(Callsite != MapTailCallCalleeVIToCallsite.end());
            CloneCallsite(Callsite->second, CB, CalledFunction);
          }
        }
        // The memprof and callsite metadata are no longer needed once the
        // summary has been applied; remove them.
        I.setMetadata(LLVMContext::MD_memprof, nullptr);
        I.setMetadata(LLVMContext::MD_callsite, nullptr);
      }
    }
  }

  return Changed;
}

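// Top-level driver over the constructed callsite context graph: optionally
// dump, export, and verify the graph (controlled by DumpCCG, ExportToDot and
// VerifyCCG), identify the node clones needed to disambiguate contexts, then
// assign those clones to function clones, returning whether any changes were
// made.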
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
  if (DumpCCG) {
    dbgs() << "CCG before cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("postbuild");

  if (VerifyCCG) {
    check();
  }

  identifyClones();

  if (VerifyCCG) {
    check();
  }

  if (DumpCCG) {
    dbgs() << "CCG after cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("cloned");

  bool Changed = assignFunctions();

  if (DumpCCG) {
    dbgs() << "CCG after assigning function clones:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("clonefuncassign");

  return Changed;
}

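// Per-module entry point used for regular LTO and the ThinLTO backends: when
// an import summary is present, the cloning decisions were already made
// during the thin link and are simply applied here; otherwise the module's
// callsite context graph is built and processed directly.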
bool MemProfContextDisambiguation::processModule(
    Module &M,
    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {

  // If we have an import summary, then the cloning decisions were made during
  // the thin link on the index. Apply them and return.
  if (ImportSummary)
    return applyImport(M);

  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // Note that we specifically check this after applying imports above, so that
  // the option does not need to be passed to distributed ThinLTO backend
  // clang processes, which won't necessarily have visibility into the linker
  // dependences. Instead the information is communicated from the LTO link to
  // the backends via the combined summary index.
  if (!SupportsHotColdNew)
    return false;

  ModuleCallsiteContextGraph CCG(M, OREGetter);
  return CCG.process();
}

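// Pass constructor: when no summary is supplied by the pass pipeline,
// optionally load one from the file named by the MemProfImportSummary option
// (intended only for testing the distributed ThinLTO backend handling via
// opt, as described below).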
MemProfContextDisambiguation::MemProfContextDisambiguation(
    const ModuleSummaryIndex *Summary)
    : ImportSummary(Summary) {
  if (ImportSummary) {
    // The MemProfImportSummary should only be used for testing ThinLTO
    // distributed backend handling via opt, in which case we don't have a
    // summary from the pass pipeline.
    assert(MemProfImportSummary.empty());
    return;
  }
  if (MemProfImportSummary.empty())
    return;

  auto ReadSummaryFile =
      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
  if (!ReadSummaryFile) {
    logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
                          "Error loading file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
  if (!ImportSummaryForTestingOrErr) {
    logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
                          "Error parsing file '" + MemProfImportSummary +
                              "': ");
    return;
  }
  ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
  ImportSummary = ImportSummaryForTesting.get();
}

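// New pass manager entry point for the IR-level pass; all analyses are
// reported preserved when processModule makes no changes.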
PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
                                                    ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };
  if (!processModule(M, OREGetter))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}

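// Entry point used on the combined summary index during the thin link, where
// the whole program cloning decisions are made on the index and later applied
// to IR in the ThinLTO backends (via applyImport above).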
void MemProfContextDisambiguation::run(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing) {
  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // The index was set from the option, so these should be in sync.
  assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
  if (!SupportsHotColdNew)
    return;

  IndexCallsiteContextGraph CCG(Index, isPrevailing);
  CCG.process();
}
