//
// Copyright 2016 Pixar
//
// Licensed under the Apache License, Version 2.0 (the "Apache License")
// with the following modification; you may not use this file except in
// compliance with the Apache License and the following modification to it:
// Section 6. Trademarks. is deleted and replaced with:
//
// 6. Trademarks. This License does not grant permission to use the trade
//    names, trademarks, service marks, or product names of the Licensor
//    and its affiliates, except as required to comply with Section 4(c) of
//    the License and to reproduce the content of the NOTICE file.
//
// You may obtain a copy of the Apache License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Apache License with the above modification is
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the Apache License for the specific
// language governing permissions and limitations under the Apache License.
//

#include "pxr/pxr.h"
#include "pxr/base/tf/mallocTag.h"

#include "pxr/base/tf/diagnostic.h"
#include "pxr/base/tf/getenv.h"
#include "pxr/base/tf/hash.h"
#include "pxr/base/tf/hashmap.h"
#include "pxr/base/tf/iterator.h"
#include "pxr/base/tf/stl.h"
#include "pxr/base/tf/stringUtils.h"
#include "pxr/base/tf/tf.h"

#include "pxr/base/arch/attributes.h"
#include "pxr/base/arch/debugger.h"
#include "pxr/base/arch/hash.h"
#include "pxr/base/arch/inttypes.h"
#include "pxr/base/arch/mallocHook.h"
#include "pxr/base/arch/stackTrace.h"

#include <tbb/spin_mutex.h>

#include <algorithm>
#include <string>
#include <stdlib.h>
#include <thread>
#include <type_traits>
#include <vector>
#include <ostream>

using std::map;
using std::make_pair;
using std::pair;
using std::string;
using std::vector;

PXR_NAMESPACE_OPEN_SCOPE

// Change the following line and recompile this file to disable decrementing
// the allocation counts when freeing memory.
#define _DECREMENT_ALLOCATION_COUNTS true

// The max number of captured unique malloc stacks printed out in the report.
static const size_t _MaxReportedMallocStacks = 100;

// The max number of call stack frames stored when malloc stack capturing
// is enabled.  Note that two malloc stacks are considered identical if all
// their frames up to this depth are matching (the uncaptured parts of the
// stacks can still differ).
static const size_t _MaxMallocStackDepth = 64;

// The number of top stack frames to ignore when saving frames for a
// malloc stack.  Currently these frames are:
// #0   ArchGetStackFrames(unsigned long, vector<unsigned long, allocator<unsigned long> >*)
// #1   Tf_MallocGlobalData::_CaptureMallocStack(Tf_MallocPathNode const*, void const*, unsigned long)
// #2   TfMallocTag::_MallocWrapper(unsigned long, void const*)
static const size_t _IgnoreStackFramesCount = 3;

struct Tf_MallocPathNode;
struct Tf_MallocGlobalData;

static ArchMallocHook _mallocHook;      // zero-initialized POD
static Tf_MallocGlobalData* _mallocGlobalData = NULL;
bool TfMallocTag::_doTagging = false;

static bool
_UsePtmalloc()
{
    string impl = TfGetenv("TF_MALLOC_TAG_IMPL", "auto");
    vector<string> legalImpl = {"auto",     "agnostic",
                                "jemalloc", "jemalloc force",
                                "ptmalloc", "ptmalloc force",
                                "pxmalloc", "pxmalloc force"};

    if (std::find(legalImpl.begin(), legalImpl.end(), impl) == legalImpl.end()) {
        string values = TfStringJoin(legalImpl, "', '");
        TF_WARN("Invalid value '%s' for TF_MALLOC_TAG_IMPL: "
                "(not one of '%s')", impl.c_str(), values.c_str());
    }

    if (impl != "auto") {
        fprintf(stderr, "########################################################################\n"
                        "#  TF_MALLOC_TAG_IMPL is overridden to '%s'.  Default is 'auto'  #\n"
                        "########################################################################\n",
                impl.c_str());
    }

    if (impl == "agnostic")
        return false;

    if (ArchIsPtmallocActive()) {
        return true;
    }
    else if (TfStringStartsWith(impl, "ptmalloc")) {
        TF_WARN("TfMallocTag can only use ptmalloc-specific implementation "
                "when ptmalloc is active. Falling back to agnostic "
                "implementation.");
    }

    return false;
}
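
// For reference, the implementation is normally selected via the environment
// before the process starts; an illustrative invocation (the program name is
// made up) would be:
//
//     TF_MALLOC_TAG_IMPL=agnostic ./myProgram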

/*
 * We let malloc use BITS_FOR_MALLOC_SIZE bits of the usual 64 in the control
 * word for the allocation size.  That leaves us 64 - BITS_FOR_MALLOC_SIZE
 * bits for storing our own index, which effectively gives us a pointer to a
 * Tf_MallocPathNode (but only for MAX_PATH_NODES different nodes).
 */
static const unsigned BITS_FOR_MALLOC_SIZE = 40;
static const unsigned BITS_FOR_INDEX = 64 - BITS_FOR_MALLOC_SIZE;
static const size_t MAX_PATH_NODES = 1 << BITS_FOR_INDEX;
static const unsigned HIWORD_INDEX_BIT_OFFSET = BITS_FOR_MALLOC_SIZE - 32;
static const unsigned HIWORD_INDEX_MASK = ~(~0U << HIWORD_INDEX_BIT_OFFSET);  // (HIWORD_INDEX_BIT_OFFSET number of 1 bits.)
static const unsigned long long MALLOC_SIZE_MASK = ~(~0ULL << BITS_FOR_MALLOC_SIZE) & ~0x7ULL;
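
// To illustrate the layout implied by the constants above: with
// BITS_FOR_MALLOC_SIZE = 40 and BITS_FOR_INDEX = 24 there can be at most
// MAX_PATH_NODES = 2^24 path nodes, and a 64-bit control word is split as
//
//     bits [63..40]  24-bit path node index
//     bits [39..3]   allocation size (MALLOC_SIZE_MASK clears the low three
//                    bits, which the allocator uses for flags)
//
// so, for example, a 1024-byte block billed to path node 5 carries the word
// (5ULL << BITS_FOR_MALLOC_SIZE) | 1024.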

static bool Tf_MatchesMallocTagDebugName(const string& name);
static bool Tf_MatchesMallocTagTraceName(const string& name);
static void Tf_MallocTagDebugHook(void* ptr, size_t size) ARCH_NOINLINE;

static void Tf_MallocTagDebugHook(void* ptr, size_t size)
{
    // Clients don't call this directly so the debugger can conveniently
    // see the pointer and size in the stack trace.
    ARCH_DEBUGGER_TRAP;
}

static size_t Tf_GetMallocBlockSize(void* ptr, size_t requestedSize)
{
    // The allocator-agnostic implementation keeps track of the exact memory
    // block sizes requested by consumers. This ignores allocator-specific
    // overhead, such as alignment, associated metadata, etc. We believe this
    // is the right thing to be measuring, as malloc tags are intended to
    // allow consumers to bill memory requests to their originating subsystem.
    //
    // Uncomment the following line to enable tracking of 'actual' block sizes.
    // Be sure that the allocator in use provides this function! If not, this
    // will call the default glibc implementation, which will likely return
    // the wrong value (unless you're using the glibc allocator).
    // return malloc_usable_size(ptr);

    return requestedSize;
}

struct Tf_MallocBlockInfo {
    Tf_MallocBlockInfo()
        : blockSize(0), pathNodeIndex(0)
    { }

    Tf_MallocBlockInfo(size_t size, uint32_t index)
        : blockSize(size), pathNodeIndex(index)
    { }

    size_t       blockSize:BITS_FOR_MALLOC_SIZE;
    uint32_t pathNodeIndex:BITS_FOR_INDEX;
};

#if !defined(ARCH_OS_WINDOWS)
static_assert(sizeof(Tf_MallocBlockInfo) == 8,
              "Unexpected size for Tf_MallocBlockInfo");
#endif

/*
 * Utility for checking a const char* against a table of match strings.
 * Each string is tested against each item in the table in order.  Each
 * item can either allow or deny the string, with later entries overriding
 * earlier results.  Match strings can end in '*' to wildcard the suffix
 * and can start with '-' to deny or '+' or nothing to allow.
 *
 * Match strings are concatenated into lists using commas, newlines or tabs.
 * Spaces are not delimiters but they are trimmed from each end.
 */
class Tf_MallocTagStringMatchTable {
public:
    Tf_MallocTagStringMatchTable();
    explicit Tf_MallocTagStringMatchTable(const std::string& matchList);

    // Replace the list of matches.
    void SetMatchList(const std::string& matchList);

    // Return \c true iff \p s matches the most recently set match list.
    bool Match(const char* s) const;

private:
    struct _MatchString {
        _MatchString(const std::string&);

        std::string str;    // String to match.
        bool allow:1;       // New result if str matches.
        bool wildcard:1;    // str has a suffix wildcard.
    };
    std::vector<_MatchString> _matchStrings;
};

Tf_MallocTagStringMatchTable::_MatchString::_MatchString(const std::string& s) :
    str(s),
    allow(true),
    wildcard(false)
{
    if (!str.empty()) {
        if (str[str.size() - 1] == '*') {
            wildcard = true;
            str.resize(str.size() - 1);
        }
        if (!str.empty()) {
            if (str[0] == '-') {
                allow = false;
                str.erase(0, 1);
            }
            else if (str[0] == '+') {
                str.erase(0, 1);
            }
        }
    }
}

Tf_MallocTagStringMatchTable::Tf_MallocTagStringMatchTable()
{
    // Do nothing
}

Tf_MallocTagStringMatchTable::Tf_MallocTagStringMatchTable(
    const std::string& matchList)
{
    SetMatchList(matchList);
}

void
Tf_MallocTagStringMatchTable::SetMatchList(const std::string& matchList)
{
    _matchStrings.clear();
    std::vector<std::string> items = TfStringTokenize(matchList, ",\t\n");
    TF_FOR_ALL(i, items) {
        _matchStrings.push_back(_MatchString(TfStringTrim(*i, " ")));
    }
}

bool
Tf_MallocTagStringMatchTable::Match(const char* s) const
{
    // The last match defines the overall result.  If the last match had
    // a '-' prefix then we don't match, otherwise we do.
    TF_REVERSE_FOR_ALL(i, _matchStrings) {
        if (i->wildcard) {
            // Check prefix match.
            const char* m = i->str.c_str();
            while (*m && *m == *s) {
                ++m, ++s;
            }
            if (*m != '\0') {
                continue;
            }
        }
        else {
            // Check exact match.
            if (i->str != s) {
                continue;
            }
        }

        // Matched.
        return i->allow;
    }

    // No match.
    return false;
}
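
// To make the match semantics concrete, a hypothetical match list and its
// results (these tag names are illustrative, not from any real client):
//
//     Tf_MallocTagStringMatchTable t("Csd*, -CsdScene*, +CsdSceneCache");
//     t.Match("CsdAttribute");  // true:  allowed by "Csd*"
//     t.Match("CsdScene");      // false: denied by "-CsdScene*"
//     t.Match("CsdSceneCache"); // true:  re-allowed by "+CsdSceneCache"
//     t.Match("Other");         // false: no entry matches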

/*
 * There is a different call-site object associated with each different
 * tag string used to construct a TfAutoMallocTag.
 */
struct Tf_MallocCallSite
{
    Tf_MallocCallSite(const string& name, uint32_t index)
        : _name(name), _totalBytes(0), _nPaths(0), _index(index)
    {
        _debug = Tf_MatchesMallocTagDebugName(_name);
        _trace = Tf_MatchesMallocTagTraceName(_name);
    }

    // Note: _name needs to be const since we call c_str() on it.
    const string _name;
    int64_t _totalBytes;
    size_t _nPaths;
    uint32_t _index;

    // If true then invoke the debugger trap when allocating or freeing
    // at this site.
    bool _debug:1;

    // If true then capture a stack trace when allocating at this site.
    bool _trace:1;
};

namespace {

typedef TfHashMap<const char*, struct Tf_MallocCallSite*,
                             TfHashCString,
                             TfEqualCString> Tf_MallocCallSiteTable;

Tf_MallocCallSite* Tf_GetOrCreateCallSite(Tf_MallocCallSiteTable* table,
                                          const char* name,
                                          size_t* traceSiteCount) {
    TF_AXIOM(table);
    Tf_MallocCallSiteTable::iterator it = table->find(name);

    if (it == table->end()) {
        Tf_MallocCallSite* site =
            new Tf_MallocCallSite(name, static_cast<uint32_t>(table->size()));
        // site->_name is const so it is ok to use c_str() as the key.
        (*table)[site->_name.c_str()] = site;
        if (site->_trace) {
            ++*traceSiteCount;
        }
        return site;
    } else {
        return it->second;
    }
}
}

/*
 * This is a singleton.  Because access to this structure is gated via checks
 * to TfMallocTag::_doTagging, we forego the usual TfSingleton pattern and just
 * use a single static-scoped pointer (_mallocGlobalData) to point to the
 * singleton instance.
 */
struct Tf_MallocGlobalData
{
    Tf_MallocGlobalData() {
        _allPathNodes.reserve(1024);
        _totalBytes = 0;
        _maxTotalBytes = 0;
        _warned = false;
        _captureCallSiteCount = 0;
        _captureStack.reserve(_MaxMallocStackDepth);
    }

    Tf_MallocCallSite* _GetOrCreateCallSite(const char* name) {
        return Tf_GetOrCreateCallSite(&_callSiteTable, name,
                                      &_captureCallSiteCount);
    }

    inline bool _RegisterPathNode(Tf_MallocPathNode*);
    inline bool _RegisterPathNodeForBlock(
        Tf_MallocPathNode* pathNode, void* block, size_t blockSize);
    inline bool _UnregisterPathNodeForBlock(
        void* block, Tf_MallocBlockInfo* blockInfo);

    bool _IsMallocStackCapturingEnabled() const {
        return _captureCallSiteCount != 0;
    }

    void _RunDebugHookForNode(const Tf_MallocPathNode* node, void*, size_t);

    void _GetStackTrace(size_t skipFrames, std::vector<uintptr_t>* stack);

    void _SetTraceNames(const std::string& matchList);
    bool _MatchesTraceName(const std::string& name);
    void _CaptureMallocStack(
        const Tf_MallocPathNode* node, const void *ptr, size_t size);
    void _ReleaseMallocStack(
        const Tf_MallocPathNode* node, const void *ptr);

    void _BuildUniqueMallocStacks(TfMallocTag::CallTree* tree);

    void _SetDebugNames(const std::string& matchList);
    bool _MatchesDebugName(const std::string& name);

    typedef TfHashMap<const void *, TfMallocTag::CallStackInfo, TfHash>
    _CallStackTableType;

    tbb::spin_mutex _mutex;
    Tf_MallocPathNode* _rootNode;
    Tf_MallocCallSiteTable _callSiteTable;

    // Vector of path nodes indicating location of an allocated block.
    // Implementations associate indices into this vector with a block.
    vector<struct Tf_MallocPathNode*> _allPathNodes;

    // Mapping from memory block to information about that block.
    // Used by allocator-agnostic implementation.
    typedef TfHashMap<const void *, Tf_MallocBlockInfo, TfHash>
    _PathNodeTableType;
    _PathNodeTableType _pathNodeTable;

    size_t _captureCallSiteCount;
    _CallStackTableType _callStackTable;
    Tf_MallocTagStringMatchTable _traceMatchTable;

    int64_t _totalBytes;
    int64_t _maxTotalBytes;
    bool _warned;

    Tf_MallocTagStringMatchTable _debugMatchTable;

    // Pre-allocated space for getting stack traces.
    vector<uintptr_t> _captureStack;
};

/*
 * Each node describes a sequence (i.e. path) of call sites.
 * However, a given call-site can occur only once in a given path -- recursive
 * call loops are excised.
 */
struct Tf_MallocPathNode
{
    Tf_MallocPathNode(Tf_MallocCallSite* callSite)
        : _callSite(callSite),
          _totalBytes(0),
          _numAllocations(0),
          _index(0),
          _repeated(false)
    {
    }

    Tf_MallocPathNode* _GetOrCreateChild(Tf_MallocCallSite* site)
    {
        // Note: As long as the number of children is quite small, using a
        // vector is a good option here.  If this assumption changes we
        // should change this back to using a map (or TfHashMap).
        TF_FOR_ALL(it, _children) {
            if (it->first == site) {
                return it->second;
            }
        }
        Tf_MallocPathNode* pathNode = new Tf_MallocPathNode(site);
        if (!_mallocGlobalData->_RegisterPathNode(pathNode)) {
            delete pathNode;
            return NULL;
        }

        _children.push_back(make_pair(site, pathNode));
        site->_nPaths++;
        return pathNode;
    }

    void _BuildTree(TfMallocTag::CallTree::PathNode* node,
                    bool skipRepeated);

    Tf_MallocCallSite* _callSite;
    int64_t _totalBytes;
    int64_t _numAllocations;
    vector<pair<Tf_MallocCallSite*, Tf_MallocPathNode*> > _children;
    uint32_t _index;    // only 24 bits
    bool _repeated;    // repeated node
};

inline bool
Tf_MallocGlobalData::_RegisterPathNode(Tf_MallocPathNode* pathNode)
{
    if (_allPathNodes.size() == MAX_PATH_NODES) {
        if (!_warned) {
            TF_WARN("maximum no. of TfMallocTag nodes has been reached!");
            _warned = true;
        }
        return false;
    }
    pathNode->_index = static_cast<uint32_t>(_allPathNodes.size());
    _allPathNodes.push_back(pathNode);
    return true;
}

inline bool
Tf_MallocGlobalData::_RegisterPathNodeForBlock(
    Tf_MallocPathNode* pathNode, void* block, size_t blockSize)
{
    // Disable tagging for this thread so any allocations caused
    // here do not get intercepted and cause recursion.
    TfMallocTag::_TemporaryTaggingState tmpState(TfMallocTag::_TaggingDisabled);

    const Tf_MallocBlockInfo blockInfo(blockSize, pathNode->_index);
    return _pathNodeTable.insert(std::make_pair(block, blockInfo)).second;
}

inline bool
Tf_MallocGlobalData::_UnregisterPathNodeForBlock(
    void* block, Tf_MallocBlockInfo* blockInfo)
{
    // Disable tagging for this thread so any allocations caused
    // here do not get intercepted and cause recursion.
    TfMallocTag::_TemporaryTaggingState tmpState(TfMallocTag::_TaggingDisabled);

    _PathNodeTableType::iterator it = _pathNodeTable.find(block);
    if (it != _pathNodeTable.end()) {
        *blockInfo = it->second;
        _pathNodeTable.erase(it);
        return true;
    }

    return false;
}

void
Tf_MallocGlobalData::_GetStackTrace(
    size_t skipFrames,
    std::vector<uintptr_t>* stack)
{
    // Get the stack trace.
    ArchGetStackFrames(_MaxMallocStackDepth, skipFrames, &_captureStack);

    // Copy into stack, reserving exactly enough space.
    stack->reserve(_captureStack.size());
    stack->insert(stack->end(), _captureStack.begin(), _captureStack.end());

    // Done with stack trace.
    _captureStack.clear();
}

void
Tf_MallocGlobalData::_SetTraceNames(const std::string& matchList)
{
    TfMallocTag::_TemporaryTaggingState tmpState(TfMallocTag::_TaggingDisabled);

    _traceMatchTable.SetMatchList(matchList);

    // Update trace flag on every existing call site.
    _captureCallSiteCount = 0;
    TF_FOR_ALL(i, _callSiteTable) {
        i->second->_trace = _traceMatchTable.Match(i->second->_name.c_str());
        if (i->second->_trace) {
            ++_captureCallSiteCount;
        }
    }
}

bool
Tf_MallocGlobalData::_MatchesTraceName(const std::string& name)
{
    return _traceMatchTable.Match(name.c_str());
}

static bool Tf_MatchesMallocTagTraceName(const string& name)
{
    return _mallocGlobalData->_MatchesTraceName(name);
}

void
Tf_MallocGlobalData::_CaptureMallocStack(
    const Tf_MallocPathNode* node, const void *ptr, size_t size)
{
    if (node->_callSite->_trace) {
        // Disable tagging for this thread so any allocations caused
        // here do not get intercepted and cause recursion.
        TfMallocTag::_TemporaryTaggingState
            tmpState(TfMallocTag::_TaggingDisabled);

        TfMallocTag::CallStackInfo &stackInfo = _callStackTable[ptr];
        _GetStackTrace(_IgnoreStackFramesCount, &stackInfo.stack);
        stackInfo.size = size;
        stackInfo.numAllocations = 1;
    }
}

void
Tf_MallocGlobalData::_ReleaseMallocStack(
    const Tf_MallocPathNode* node, const void *ptr)
{
    if (node->_callSite->_trace) {
        _CallStackTableType::iterator i = _callStackTable.find(ptr);
        if (i != _callStackTable.end()) {
            // Disable tagging for this thread so any allocations caused
            // here do not get intercepted and cause recursion.
            TfMallocTag::_TemporaryTaggingState
                tmpState(TfMallocTag::_TaggingDisabled);
            _callStackTable.erase(i);
        }
    }
}

void
Tf_MallocGlobalData::_RunDebugHookForNode(
    const Tf_MallocPathNode* node, void* ptr, size_t size)
{
    if (node->_callSite->_debug)
        Tf_MallocTagDebugHook(ptr, size);
}

void
Tf_MallocGlobalData::_SetDebugNames(const std::string& matchList)
{
    TfMallocTag::_TemporaryTaggingState tmpState(TfMallocTag::_TaggingDisabled);

    _debugMatchTable.SetMatchList(matchList);

    // Update debug flag on every existing call site.
    TF_FOR_ALL(i, _callSiteTable) {
        i->second->_debug = _debugMatchTable.Match(i->second->_name.c_str());
    }
}

bool
Tf_MallocGlobalData::_MatchesDebugName(const std::string& name)
{
    return _debugMatchTable.Match(name.c_str());
}

static bool Tf_MatchesMallocTagDebugName(const string& name)
{
    return _mallocGlobalData->_MatchesDebugName(name);
}

namespace {
// Hash functor for a malloc stack.
//
struct _HashMallocStack
{
    size_t operator()(const vector<uintptr_t> &stack) const {
        return ArchHash(
            (const char *)&stack[0], sizeof(uintptr_t) * stack.size());
    }
};

// The data associated with a malloc stack (a pointer to the malloc stack
// itself, and the allocation size and number of allocations).
//
struct _MallocStackData
{
    const vector<uintptr_t> *stack;
    size_t size;
    size_t numAllocations;
};
}

// Comparison functor for sorting _MallocStackData entries by their
// allocation size.
//
static bool
_MallocStackDataLessThan(
    const _MallocStackData *lhs,
    const _MallocStackData *rhs)
{
    return lhs->size < rhs->size;
}

// Builds a vector of unique captured malloc stacks and stores the result
// in tree->capturedCallStacks.  The malloc stacks are sorted with the
// stacks that allocated the most memory at the front of the vector.
//
void
Tf_MallocGlobalData::_BuildUniqueMallocStacks(TfMallocTag::CallTree* tree)
{
    if (!_callStackTable.empty()) {
        // Create a map from malloc stacks to the malloc stack data.
        typedef TfHashMap<
            vector<uintptr_t>, _MallocStackData, _HashMallocStack> _Map;
        _Map map;

        TF_FOR_ALL(it, _callStackTable) {
            // Since _callStackTable does not change at this point it is
            // ok to store the address of the malloc stack in the data.
            const TfMallocTag::CallStackInfo &stackInfo = it->second;
            _MallocStackData data = { &stackInfo.stack, 0, 0 };

            pair<_Map::iterator, bool> insertResult = map.insert(
                make_pair(stackInfo.stack, data));

            _MallocStackData &updateData = insertResult.first->second;
            updateData.size += stackInfo.size;
            updateData.numAllocations += stackInfo.numAllocations;
        }

        // Sort the malloc stack data by allocation size.
        std::vector<const _MallocStackData *> sortedStackData;
        sortedStackData.reserve(map.size());
        TF_FOR_ALL(it, map) {
            sortedStackData.push_back(&it->second);
        }

        std::sort(
            sortedStackData.begin(),
            sortedStackData.end(),
            _MallocStackDataLessThan);

        tree->capturedCallStacks.reserve(sortedStackData.size());
        TF_REVERSE_FOR_ALL(it, sortedStackData) {
            const _MallocStackData &data = **it;

            tree->capturedCallStacks.push_back(TfMallocTag::CallStackInfo());
            TfMallocTag::CallStackInfo &stackInfo =
                tree->capturedCallStacks.back();

            // Take a copy of the malloc stack.
            stackInfo.stack = *data.stack;
            stackInfo.size = data.size;
            stackInfo.numAllocations = data.numAllocations;
        }
    }
}


void
Tf_MallocPathNode::_BuildTree(TfMallocTag::CallTree::PathNode* node,
                              bool skipRepeated)
{
    node->children.reserve(_children.size());
    node->nBytes = node->nBytesDirect = _totalBytes;
    node->nAllocations = _numAllocations;
    node->siteName = _callSite->_name;

    TF_FOR_ALL(pi, _children) {
        // The tree is built in a special way if the repeated allocations
        // should be skipped. First, the full tree is built using temporary
        // nodes for all allocations that should be skipped. Then the tree is
        // collapsed by copying the children of temporary nodes to their
        // parents in bottom-up fashion.
        if (skipRepeated && pi->second->_repeated) {
            // Create a temporary node
            TfMallocTag::CallTree::PathNode childNode;
            pi->second->_BuildTree(&childNode, skipRepeated);
            // Add the direct contribution of this node to the parent.
            node->nBytesDirect += childNode.nBytesDirect;
            // Copy the children, if there are any
            if (!childNode.children.empty()) {
                node->children.insert(node->children.end(),
                                      childNode.children.begin(),
                                      childNode.children.end());
            }
            node->nBytes += childNode.nBytes;
        } else {
            node->children.push_back(TfMallocTag::CallTree::PathNode());
            TfMallocTag::CallTree::PathNode& childNode = node->children.back();
            pi->second->_BuildTree(&childNode, skipRepeated);
            node->nBytes += childNode.nBytes;
        }
    }
}

namespace {
void Tf_GetCallSites(TfMallocTag::CallTree::PathNode* node,
                     Tf_MallocCallSiteTable* table) {
    TF_AXIOM(node);
    TF_AXIOM(table);

    size_t dummy;
    Tf_MallocCallSite* site =
        Tf_GetOrCreateCallSite(table, node->siteName.c_str(), &dummy);
    site->_totalBytes += node->nBytesDirect;

    TF_FOR_ALL(pi, node->children) {
        Tf_GetCallSites(&(*pi), table);
    }
}
}

/*
 * None of this is implemented for a 32-bit build.
 */

#define _HI_WORD(sptr) *(((int *)sptr) + 1)
#define _LO_WORD(sptr) *((int *)sptr)

#if defined(ARCH_BITS_64)

// This modifies the control word associated with \a ptr, removing the stored
// index, and returning the index and allocation size.
static inline void
_ExtractIndexAndGetSize(void *ptr, size_t *size, uint32_t *index)
{
    // Get the control word.
    size_t *sptr = static_cast<size_t *>(ptr) - 1;

    // Read the stored index.
    *index = _HI_WORD(sptr) >> HIWORD_INDEX_BIT_OFFSET;

    // Read the size.
    *size = *sptr & MALLOC_SIZE_MASK;

    // Remove the stored index from the word.
    _HI_WORD(sptr) &= HIWORD_INDEX_MASK;
}

// This modifies the control word associated with \a ptr, storing \a index, and
// returning the allocation size.
static inline void
_StoreIndexAndGetSize(void *ptr, size_t *size, uint32_t index)
{
    // Get the control word.
    size_t const *sptr = static_cast<size_t const *>(ptr) - 1;

    // Read the size.
    *size = *sptr & MALLOC_SIZE_MASK;

    // Write the index.
    _HI_WORD(sptr) |= (index << HIWORD_INDEX_BIT_OFFSET);
}
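
// An illustrative round trip through the two helpers above (the pointer and
// index values are made up):
//
//     size_t size; uint32_t index;
//     _StoreIndexAndGetSize(ptr, &size, 5);        // tag block with node 5
//     _ExtractIndexAndGetSize(ptr, &size, &index); // index == 5; word restored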

#else

// Allow compilation, but just fatal error.  This code shouldn't ever be active

static inline void
_ExtractIndexAndGetSize(void *, size_t *, uint32_t *)
{
    TF_FATAL_ERROR("Attempting to use Malloc Tags on unsupported platform");
}

static inline void
_StoreIndexAndGetSize(void *, size_t *, uint32_t)
{
    TF_FATAL_ERROR("Attempting to use Malloc Tags on unsupported platform");
}

#endif

// Per-thread data for TfMallocTag.
struct TfMallocTag::_ThreadData {
    _ThreadData() : _tagState(_TaggingDormant) { }
    _ThreadData(const _ThreadData &) = delete;
    _ThreadData(_ThreadData&&) = delete;
    _ThreadData& operator=(const _ThreadData &rhs) = delete;
    _ThreadData& operator=(_ThreadData&&) = delete;

    _Tagging _tagState;
    std::vector<Tf_MallocPathNode*> _tagStack;
    std::vector<unsigned int> _callSiteOnStack;
};

class TfMallocTag::Tls {
public:
    static
    TfMallocTag::_ThreadData*
    Find()
    {
#if defined(ARCH_HAS_THREAD_LOCAL)
        // This weirdness is so we don't use the heap and we don't call
        // the destructor of _ThreadData when the thread is exiting.
        // We can't do the latter because we don't know in what order
        // objects will be destroyed and objects destroyed after the
        // _ThreadData may do heap (de)allocation, which requires the
        // _ThreadData object.  We leak the heap allocated blocks in
        // the _ThreadData.
        static thread_local
            std::aligned_storage<sizeof(_ThreadData),
                                 alignof(_ThreadData)>::type dataBuffer;
        static thread_local _ThreadData* data = new (&dataBuffer) _ThreadData;
        return data;
#else
        TF_FATAL_ERROR("TfMallocTag not supported on platforms "
                       "without thread_local");
        return nullptr;
#endif
    }
};

/*
 * If this returns false, it sets *tptr.  Otherwise,
 * we don't need *tptr, so it may not be set.
 */
inline bool
TfMallocTag::_ShouldNotTag(TfMallocTag::_ThreadData** tptr, _Tagging* statePtr)
{
    if (!TfMallocTag::_doTagging) {
        if (statePtr) {
            *statePtr = _TaggingDormant;
        }
        return true;
    }
    else {
        *tptr = TfMallocTag::Tls::Find();
        if (statePtr) {
            *statePtr = (*tptr)->_tagState;
        }
        return (*tptr)->_tagState != _TaggingEnabled;
    }
}

// Helper function to retrieve the current path node from a _ThreadData
// object. Note that _mallocGlobalData->_mutex must be locked before calling
// this function.
inline Tf_MallocPathNode*
TfMallocTag::_GetCurrentPathNodeNoLock(const TfMallocTag::_ThreadData* tptr)
{
    if (!tptr->_tagStack.empty()) {
        return tptr->_tagStack.back();
    }

    // If the _ThreadData does not have any entries in its tag stack, return
    // the global root so that any memory allocations are assigned to that
    // node.
    return _mallocGlobalData->_rootNode;
}

void
TfMallocTag::SetDebugMatchList(const std::string& matchList)
{
    if (TfMallocTag::IsInitialized()) {
        tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
        _mallocGlobalData->_SetDebugNames(matchList);
    }
}

void
TfMallocTag::SetCapturedMallocStacksMatchList(const std::string& matchList)
{
    if (TfMallocTag::IsInitialized()) {
        tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
        _mallocGlobalData->_SetTraceNames(matchList);
    }
}

vector<vector<uintptr_t> >
TfMallocTag::GetCapturedMallocStacks()
{
    vector<vector<uintptr_t> > result;

    if (!TfMallocTag::IsInitialized())
        return result;

    // Push some malloc tags, so what we do here doesn't pollute the root
    // stacks.
    TfAutoMallocTag2 tag("Tf", "TfGetRootMallocStacks");

    // Copy off the stack traces, making sure any allocation happens outside
    // the lock.
    Tf_MallocGlobalData::_CallStackTableType traces;

    // Swap them out while holding the lock.
    {
        tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
        traces.swap(_mallocGlobalData->_callStackTable);
    }

    TF_FOR_ALL(i, traces)
        result.push_back(i->second.stack);

    return result;
}
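
// Illustrative pairing of the capture APIs above (the match list is made up):
//
//     TfMallocTag::SetCapturedMallocStacksMatchList("MySubsystem*");
//     // ... run the workload to be profiled ...
//     std::vector<std::vector<uintptr_t> > stacks =
//         TfMallocTag::GetCapturedMallocStacks();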

void*
TfMallocTag::_MallocWrapper(size_t nBytes, const void*)
{
    void* ptr = _mallocHook.Malloc(nBytes);

    _ThreadData* td;
    if (_ShouldNotTag(&td) || ARCH_UNLIKELY(!ptr))
        return ptr;

    {
        tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

        Tf_MallocPathNode* node = _GetCurrentPathNodeNoLock(td);
        size_t blockSize = Tf_GetMallocBlockSize(ptr, nBytes);

        // Update malloc global data with bookkeeping information. This has to
        // happen while the mutex is held.
        if (_mallocGlobalData->_RegisterPathNodeForBlock(node, ptr, blockSize)) {
            _mallocGlobalData->_CaptureMallocStack(node, ptr, blockSize);

            node->_totalBytes += blockSize;
            node->_numAllocations++;
            node->_callSite->_totalBytes += blockSize;
            _mallocGlobalData->_totalBytes += blockSize;

            _mallocGlobalData->_maxTotalBytes =
                std::max(_mallocGlobalData->_totalBytes,
                         _mallocGlobalData->_maxTotalBytes);

            _mallocGlobalData->_RunDebugHookForNode(node, ptr, blockSize);

            return ptr;
        }
    }

    // Make sure we issue this error while the mutex is unlocked, as issuing
    // the error could cause more allocations, leading to a reentrant call.
    //
    // This should only happen if there's a bug with removing previously
    // allocated blocks from the path node table. This likely would cause us to
    // miscount memory usage, but the allocated pointer is still valid and the
    // system should continue to work. So, we issue a warning but continue on
    // instead of using an axiom.
    TF_VERIFY(!"Failed to register path for allocated block. "
               "Memory usage may be miscounted");

    return ptr;
}

void*
TfMallocTag::_ReallocWrapper(void* oldPtr, size_t nBytes, const void*)
{
    /*
     * If oldPtr is NULL, we want to make sure we don't double count,
     * because a call to _mallocHook.Realloc(oldPtr, nBytes) could call
     * through to our malloc.  To avoid this, we'll explicitly short-circuit
     * ourselves rather than trust that the malloc library will do it.
     */
    if (!oldPtr)
        return _MallocWrapper(nBytes, NULL);

    _ThreadData* td = NULL;
    _Tagging tagState;
    const bool shouldNotTag = _ShouldNotTag(&td, &tagState);

    // If tagging is explicitly disabled, just do the realloc and skip
    // everything else. This avoids a deadlock if we get here while updating
    // Tf_MallocGlobalData::_pathNodeTable.
    //
    // If tagState is _TaggingDormant, we still need to unregister the oldPtr.
    // However, we won't need to register the newly realloc'd ptr later on.
    if (tagState == _TaggingDisabled) {
        return _mallocHook.Realloc(oldPtr, nBytes);
    }

    void* newPtr = NULL;
    {
        tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

        Tf_MallocBlockInfo info;
        if (_mallocGlobalData->_UnregisterPathNodeForBlock(oldPtr, &info)) {

            size_t bytesFreed = info.blockSize;
            Tf_MallocPathNode* oldNode =
                _mallocGlobalData->_allPathNodes[info.pathNodeIndex];

            _mallocGlobalData->_RunDebugHookForNode(oldNode, oldPtr, bytesFreed);

            // Check if we should release a malloc stack.  This has to happen
            // while the mutex is held.
            _mallocGlobalData->_ReleaseMallocStack(oldNode, oldPtr);

            oldNode->_totalBytes -= bytesFreed;
            oldNode->_numAllocations -= (_DECREMENT_ALLOCATION_COUNTS) ? 1 : 0;
            oldNode->_callSite->_totalBytes -= bytesFreed;
            _mallocGlobalData->_totalBytes -= bytesFreed;
        }

        newPtr = _mallocHook.Realloc(oldPtr, nBytes);

        if (shouldNotTag || ARCH_UNLIKELY(!newPtr))
            return newPtr;

        Tf_MallocPathNode* newNode = _GetCurrentPathNodeNoLock(td);
        size_t blockSize = Tf_GetMallocBlockSize(newPtr, nBytes);

        // Update malloc global data with bookkeeping information. This has to
        // happen while the mutex is held.
        if (_mallocGlobalData->_RegisterPathNodeForBlock(
                newNode, newPtr, blockSize)) {

            _mallocGlobalData->_CaptureMallocStack(
                newNode, newPtr, blockSize);

            newNode->_totalBytes += blockSize;
            newNode->_numAllocations++;
            newNode->_callSite->_totalBytes += blockSize;
            _mallocGlobalData->_totalBytes += blockSize;

            _mallocGlobalData->_maxTotalBytes =
                std::max(_mallocGlobalData->_totalBytes,
                         _mallocGlobalData->_maxTotalBytes);

            _mallocGlobalData->_RunDebugHookForNode(
                newNode, newPtr, blockSize);

            return newPtr;
        }
    }

    // See comment in _MallocWrapper.
    TF_VERIFY(!"Failed to register path for allocated block. "
               "Memory usage may be miscounted");
    return newPtr;
}

void*
TfMallocTag::_MemalignWrapper(size_t alignment, size_t nBytes, const void*)
{
    void* ptr = _mallocHook.Memalign(alignment, nBytes);

    _ThreadData* td;
    if (_ShouldNotTag(&td) || ARCH_UNLIKELY(!ptr))
        return ptr;

    tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

    Tf_MallocPathNode* node = _GetCurrentPathNodeNoLock(td);
    size_t blockSize = Tf_GetMallocBlockSize(ptr, nBytes);

    // Update malloc global data with bookkeeping information. This has to
    // happen while the mutex is held.
    _mallocGlobalData->_RegisterPathNodeForBlock(node, ptr, blockSize);
    _mallocGlobalData->_CaptureMallocStack(node, ptr, blockSize);

    node->_totalBytes += blockSize;
    node->_numAllocations++;
    node->_callSite->_totalBytes += blockSize;
    _mallocGlobalData->_totalBytes += blockSize;

    _mallocGlobalData->_maxTotalBytes = std::max(_mallocGlobalData->_totalBytes,
        _mallocGlobalData->_maxTotalBytes);

    _mallocGlobalData->_RunDebugHookForNode(node, ptr, blockSize);

    return ptr;
}

void
TfMallocTag::_FreeWrapper(void* ptr, const void*)
{
    if (!ptr)
        return;

    // If tagging is explicitly disabled, just do the free and skip
    // everything else. This avoids a deadlock if we get here while updating
    // Tf_MallocGlobalData::_pathNodeTable.
    _ThreadData* td;
    _Tagging tagState;
    if (_ShouldNotTag(&td, &tagState) && tagState == _TaggingDisabled) {
        _mallocHook.Free(ptr);
        return;
    }

    tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

    Tf_MallocBlockInfo info;
    if (_mallocGlobalData->_UnregisterPathNodeForBlock(ptr, &info)) {
        size_t bytesFreed = info.blockSize;
        Tf_MallocPathNode* node =
            _mallocGlobalData->_allPathNodes[info.pathNodeIndex];

        _mallocGlobalData->_RunDebugHookForNode(node, ptr, bytesFreed);

        // Check if we should release a malloc stack.  This has to happen
        // while the mutex is held.
        _mallocGlobalData->_ReleaseMallocStack(node, ptr);

        node->_totalBytes -= bytesFreed;
        node->_numAllocations -= (_DECREMENT_ALLOCATION_COUNTS) ? 1 : 0;
        node->_callSite->_totalBytes -= bytesFreed;
        _mallocGlobalData->_totalBytes -= bytesFreed;
    }

    _mallocHook.Free(ptr);
}

void*
TfMallocTag::_MallocWrapper_ptmalloc(size_t nBytes, const void*)
{
    void* ptr = _mallocHook.Malloc(nBytes);

    _ThreadData* td;
    if (_ShouldNotTag(&td))
        return ptr;

    tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

    Tf_MallocPathNode* node = _GetCurrentPathNodeNoLock(td);
    size_t actualBytes;
    _StoreIndexAndGetSize(ptr, &actualBytes, node->_index);

    // Check if we should capture a malloc stack.  This has to happen while
    // the mutex is held.
    _mallocGlobalData->_CaptureMallocStack(node, ptr, actualBytes);

    node->_totalBytes += actualBytes;
    node->_numAllocations++;
    node->_callSite->_totalBytes += actualBytes;
    _mallocGlobalData->_totalBytes += actualBytes;

    _mallocGlobalData->_maxTotalBytes = std::max(_mallocGlobalData->_totalBytes,
        _mallocGlobalData->_maxTotalBytes);

    _mallocGlobalData->_RunDebugHookForNode(node, ptr, actualBytes);

    return ptr;
}

void*
TfMallocTag::_ReallocWrapper_ptmalloc(void* oldPtr, size_t nBytes, const void*)
{
    /*
     * If oldPtr is NULL, we want to make sure we don't double count,
     * because a call to _mallocHook.Realloc(oldPtr, nBytes) could call
     * through to our malloc.  To avoid this, we'll explicitly short-circuit
     * ourselves rather than trust that the malloc library will do it.
     */
    if (!oldPtr)
        return _MallocWrapper_ptmalloc(nBytes, NULL);

    /*
     * Account for the implicit free, and fix-up oldPtr
     * regardless of whether we're currently tagging or not:
     */
    uint32_t index;
    size_t bytesFreed;
    _ExtractIndexAndGetSize(oldPtr, &bytesFreed, &index);

    void* newPtr = _mallocHook.Realloc(oldPtr, nBytes);

    _ThreadData* td;
    if (_ShouldNotTag(&td))
        return newPtr;

    tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

    Tf_MallocPathNode* newNode = _GetCurrentPathNodeNoLock(td);
    size_t actualBytes;
    _StoreIndexAndGetSize(newPtr, &actualBytes, newNode->_index);

    if (index) {
        Tf_MallocPathNode* oldNode = _mallocGlobalData->_allPathNodes[index];

        _mallocGlobalData->_RunDebugHookForNode(oldNode, oldPtr, bytesFreed);

        // Check if we should release a malloc stack.  This has to happen while
        // the mutex is held.
        _mallocGlobalData->_ReleaseMallocStack(oldNode, oldPtr);

        oldNode->_totalBytes -= bytesFreed;
        oldNode->_numAllocations -= (_DECREMENT_ALLOCATION_COUNTS) ? 1 : 0;
        oldNode->_callSite->_totalBytes -= bytesFreed;
        _mallocGlobalData->_totalBytes -= bytesFreed;
    }

    // Check if we should capture a malloc stack.  This has to happen while
    // the mutex is held.
    _mallocGlobalData->_CaptureMallocStack(newNode, newPtr, actualBytes);

    newNode->_totalBytes += actualBytes;
    newNode->_numAllocations++;
    newNode->_callSite->_totalBytes += actualBytes;
    _mallocGlobalData->_totalBytes += actualBytes;

    _mallocGlobalData->_maxTotalBytes = std::max(_mallocGlobalData->_totalBytes,
        _mallocGlobalData->_maxTotalBytes);

    _mallocGlobalData->_RunDebugHookForNode(newNode, newPtr, actualBytes);

    return newPtr;
}

void*
TfMallocTag::_MemalignWrapper_ptmalloc(size_t alignment, size_t nBytes, const void*)
{
    void* ptr = _mallocHook.Memalign(alignment, nBytes);

    _ThreadData* td;
    if (_ShouldNotTag(&td))
        return ptr;

    tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

    Tf_MallocPathNode* node = _GetCurrentPathNodeNoLock(td);
    size_t actualBytes;
    _StoreIndexAndGetSize(ptr, &actualBytes, node->_index);

    // Check if we should capture a malloc stack.  This has to happen while
    // the mutex is held.
    _mallocGlobalData->_CaptureMallocStack(node, ptr, actualBytes);

    node->_totalBytes += actualBytes;
    node->_numAllocations++;
    node->_callSite->_totalBytes += actualBytes;
    _mallocGlobalData->_totalBytes += actualBytes;

    _mallocGlobalData->_maxTotalBytes = std::max(_mallocGlobalData->_totalBytes,
        _mallocGlobalData->_maxTotalBytes);

    _mallocGlobalData->_RunDebugHookForNode(node, ptr, actualBytes);

    return ptr;
}

void
TfMallocTag::_FreeWrapper_ptmalloc(void* ptr, const void*)
{
    if (!ptr)
        return;

    /*
     * Make ptr safe in case it has index bits set:
     */
    uint32_t index;
    size_t bytesFreed;
    _ExtractIndexAndGetSize(ptr, &bytesFreed, &index);

    if (index && TfMallocTag::_doTagging) {
        tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
        Tf_MallocPathNode* node = _mallocGlobalData->_allPathNodes[index];

        _mallocGlobalData->_RunDebugHookForNode(node, ptr, bytesFreed);

        // Check if we should release a malloc stack.  This has to happen
        // while the mutex is held.
        _mallocGlobalData->_ReleaseMallocStack(node, ptr);

        node->_totalBytes -= bytesFreed;
        node->_numAllocations -= (_DECREMENT_ALLOCATION_COUNTS) ? 1 : 0;
        node->_callSite->_totalBytes -= bytesFreed;
        _mallocGlobalData->_totalBytes -= bytesFreed;
    }

    _mallocHook.Free(ptr);
}

bool
TfMallocTag::Initialize(string* errMsg)
{
    static bool status = _Initialize(errMsg);
    return status;
}
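
// A typical client call sequence, as an illustrative sketch (how the failure
// is reported is up to the caller):
//
//     std::string errMsg;
//     if (!TfMallocTag::Initialize(&errMsg)) {
//         TF_WARN("Could not initialize malloc tags: %s", errMsg.c_str());
//     }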


bool
TfMallocTag::GetCallTree(CallTree* tree, bool skipRepeated)
{
    tree->callSites.clear();
    tree->root.nBytes = tree->root.nBytesDirect = 0;
    tree->root.nAllocations = 0;
    tree->root.siteName.clear();
    tree->root.children.clear();

    if (Tf_MallocGlobalData* gd = _mallocGlobalData) {
        TfMallocTag::_TemporaryTaggingState tmpState(_TaggingDisabled);

        gd->_mutex.lock();

        // Build the snapshot call tree
        gd->_rootNode->_BuildTree(&tree->root, skipRepeated);

        // Build the snapshot callsites map based on the tree
        Tf_MallocCallSiteTable callSiteTable;
        Tf_GetCallSites(&tree->root, &callSiteTable);

        // Copy the callsites into the calltree
        tree->callSites.reserve(callSiteTable.size());
        TF_FOR_ALL(csi, callSiteTable) {
            CallTree::CallSite cs = {
                csi->second->_name,
                static_cast<size_t>(csi->second->_totalBytes)
            };
            tree->callSites.push_back(cs);
            delete csi->second;
        }

        gd->_BuildUniqueMallocStacks(tree);

        gd->_mutex.unlock();
        return true;
    }
    else
        return false;
}
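
// Illustrative sketch of taking a snapshot and reading the totals (variable
// names are made up):
//
//     TfMallocTag::CallTree tree;
//     if (TfMallocTag::GetCallTree(&tree, /* skipRepeated = */ true)) {
//         size_t inclusiveBytes = tree.root.nBytes;  // bytes under the root
//         // tree.callSites holds per-tag byte totals.
//     }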

size_t
TfMallocTag::GetTotalBytes()
{
    if (!_mallocGlobalData)
        return 0;

    tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
    return _mallocGlobalData->_totalBytes;
}

size_t
TfMallocTag::GetMaxTotalBytes()
{
    if (!_mallocGlobalData)
        return 0;

    tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
    return _mallocGlobalData->_maxTotalBytes;
}

void
TfMallocTag::_SetTagging(_Tagging status)
{
    TfMallocTag::Tls::Find()->_tagState = status;
}

TfMallocTag::_Tagging
TfMallocTag::_GetTagging()
{
    return TfMallocTag::Tls::Find()->_tagState;
}

bool
TfMallocTag::_Initialize(std::string* errMsg)
{
    /*
     * This is called from an EXECUTE_ONCE block, so no
     * need to lock anything.
     */
    TF_AXIOM(!_mallocGlobalData);
    _mallocGlobalData = new Tf_MallocGlobalData();

    // Note that we are *not* using the _TemporaryTaggingState object
    // here. We explicitly want tagging set to enabled at the end
    // of this function so that all subsequent memory allocations are captured.
1415     _SetTagging(_TaggingDisabled);
1416 
1417     bool usePtmalloc = _UsePtmalloc();
1418 
1419     if (usePtmalloc) {
1420         // index 0 is reserved for untracked malloc/free's:
1421         _mallocGlobalData->_allPathNodes.push_back(NULL);
1422     }
1423 
1424     Tf_MallocCallSite* site = _mallocGlobalData->_GetOrCreateCallSite("__root");
1425     Tf_MallocPathNode* rootNode = new Tf_MallocPathNode(site);
1426     _mallocGlobalData->_rootNode = rootNode;
1427     (void) _mallocGlobalData->_RegisterPathNode(rootNode);
1428     TfMallocTag::Tls::Find()->_tagStack.reserve(64);
1429     TfMallocTag::Tls::Find()->_tagStack.push_back(rootNode);
1430 
1431     _SetTagging(_TaggingEnabled);
1432 
1433     TfMallocTag::_doTagging = true;
1434 
1435     if (usePtmalloc) {
1436         return _mallocHook.Initialize(_MallocWrapper_ptmalloc,
1437                                         _ReallocWrapper_ptmalloc,
1438                                         _MemalignWrapper_ptmalloc,
1439                                         _FreeWrapper_ptmalloc,
1440                                         errMsg);
1441     }
1442     else {
1443         return _mallocHook.Initialize(_MallocWrapper,
1444                                         _ReallocWrapper,
1445                                         _MemalignWrapper,
1446                                         _FreeWrapper,
1447                                         errMsg);
1448     }
1449 }
1450 
1451 void
_Begin(const string & name)1452 TfMallocTag::Auto::_Begin(const string& name)
1453 {
1454     _Begin(name.c_str());
1455 }
1456 
void
TfMallocTag::Auto::_Begin(const char* name)
{
    if (!name || !name[0])
        return;

    _threadData = TfMallocTag::Tls::Find();

    _threadData->_tagState = _TaggingDisabled;
    Tf_MallocPathNode* thisNode;
    Tf_MallocCallSite* site;

    {
        tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
        site = _mallocGlobalData->_GetOrCreateCallSite(name);

        if (_threadData->_callSiteOnStack.size() <= site->_index) {
            if (_threadData->_callSiteOnStack.capacity() == 0)
                _threadData->_callSiteOnStack.reserve(128);
            _threadData->_callSiteOnStack.resize(site->_index + 1, 0);
        }

        if (_threadData->_tagStack.empty())
            thisNode = _mallocGlobalData->_rootNode->_GetOrCreateChild(site);
        else
            thisNode = _threadData->_tagStack.back()->_GetOrCreateChild(site);

        // Mark the node as repeated if this call site already appears on
        // the tag stack.  (Check thisNode first: the code below tolerates
        // _GetOrCreateChild() returning NULL, so this must too.)
        if (thisNode && _threadData->_callSiteOnStack[site->_index]) {
            thisNode->_repeated = true;
        }
    }

    if (thisNode) {
        _threadData->_tagStack.push_back(thisNode);
        _threadData->_callSiteOnStack[site->_index] += 1;
        _threadData->_tagState = _TaggingEnabled;
    }
    else {
        _threadData->_tagState = _TaggingEnabled;
        _threadData = NULL;
    }
}

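// A usage sketch for the RAII interface above (illustrative only; assumes
// the TfAutoMallocTag convenience typedef from mallocTag.h): allocations
// made while the guard is alive are attributed to the named tag.
//
//     {
//         TfAutoMallocTag tag("MyModule");
//         // ... mallocs here are charged to "MyModule" ...
//     }   // _End() pops the tag when the guard is destroyed
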
void
TfMallocTag::Auto::_End()
{
    Tf_MallocPathNode* node = _threadData->_tagStack.back();
    TF_AXIOM(_threadData->_callSiteOnStack[node->_callSite->_index] > 0);
    _threadData->_callSiteOnStack[node->_callSite->_index] -= 1;
    _threadData->_tagStack.pop_back();
}

void
TfMallocTag::Pop(const char* name)
{
    if (!TfMallocTag::_doTagging)
        return;

    _ThreadData* threadData = TfMallocTag::Tls::Find();
    Tf_MallocPathNode* node = threadData->_tagStack.back();

    if (name && node->_callSite->_name != name) {
        TF_CODING_ERROR("mismatched call Pop(\"%s\"); top of stack is \"%s\"",
                        name, node->_callSite->_name.c_str());
    }

    TF_AXIOM(threadData->_callSiteOnStack[node->_callSite->_index] > 0);
    threadData->_callSiteOnStack[node->_callSite->_index] -= 1;
    threadData->_tagStack.pop_back();
}

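// Pop() is the explicit, non-RAII counterpart to the Auto guard: each call
// must balance a matching push of the same tag, and the optional name
// argument is used only to diagnose mismatches (see the coding error above).
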
// Returns the given number as a string with commas used as thousands
// separators.
//
static string
_GetAsCommaSeparatedString(size_t number)
{
    string result;

    string str = TfStringPrintf("%zu", number);
    size_t n = str.size();

    TF_FOR_ALL(it, str) {
        if (n < str.size() && n % 3 == 0) {
            result.push_back(',');
        }
        result.push_back(*it);
        n--;
    }
    return result;
}

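// For example, _GetAsCommaSeparatedString(1234567) yields "1,234,567".
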
static void
_PrintHeader(string *rpt)
{
    *rpt += "\n" + string(80, '-') + "\n";
    *rpt += TfStringPrintf("\nMalloc Tag Report\n\n\n");
    *rpt += TfStringPrintf("Total bytes = %s\n\n\n",
        _GetAsCommaSeparatedString(TfMallocTag::GetTotalBytes()).c_str());
}

static size_t
_PrintMallocNode(
    string* rpt,
    const TfMallocTag::CallTree::PathNode &node,
    size_t rootTotal,
    size_t parentTotal,
    size_t level,
    size_t &printedNodes,
    size_t maxPrintedNodes)
{
    if (!level) {
        // XXX:cleanup  We should pass in maxNameWidth and generate format
        //              strings like in _PrintMallocCallSites().
        *rpt += TfStringPrintf("%-72s %15s%15s %5s %5s %5s\n", "TAGNAME",
                               "BytesIncl", "BytesExcl", "%Prnt", "% Exc",
                               "%Totl");
        *rpt += TfStringPrintf("%-72s %15s%15s %5s %5s %5s\n\n",
                               string(72, '-').c_str(),
                               " --------------", " --------------",
                               "-----", "-----", "-----");

        rootTotal = node.nBytes;
    }

    size_t maxNameWidth = 72;
    size_t indent = level;

    if (printedNodes >= maxPrintedNodes) {
        return 0;
    }
    printedNodes++;

    string name = string(indent, ' ') +
        node.siteName.substr(0, maxNameWidth - indent);
    int postLen = static_cast<int>(maxNameWidth - name.length());
    if (postLen > 0) {
        name += string(postLen, ' ');
    }

    *rpt += TfStringPrintf(
        "%s %15s%15s ",
        name.c_str(),
        _GetAsCommaSeparatedString(node.nBytes).c_str(),
        _GetAsCommaSeparatedString(node.nBytesDirect).c_str());

    string curPercent;
    string curPercentDirect;
    string percentDirectOfRoot;

    if (parentTotal) {
        float percent = node.nBytes / (float)parentTotal * 100;
        if (percent > 0.5) {
            curPercent = TfStringPrintf(" %.0f%%", percent);
        }
        percent = node.nBytesDirect / (float)node.nBytes * 100;
        if (percent > 0.5) {
            curPercentDirect = TfStringPrintf(" %.0f%%", percent);
        }
        percent = node.nBytesDirect / (float)rootTotal * 100;
        if (percent > 0.5) {
            percentDirectOfRoot = TfStringPrintf(" %.0f%%", percent);
        }
    }

    if (!level) {
        // For the root, report bytesDirect as the root percentage.
        float percent = 100 * node.nBytesDirect / (float)rootTotal;
        if (percent > 0.5) {
            percentDirectOfRoot = TfStringPrintf(" %.0f%%", percent);
        }
    }
    *rpt += TfStringPrintf("%5s %5s %5s\n", curPercent.c_str(),
                           curPercentDirect.c_str(),
                           percentDirectOfRoot.c_str());

    vector<TfMallocTag::CallTree::PathNode>::const_iterator it;
    for (it = node.children.begin(); it != node.children.end(); ++it) {
        _PrintMallocNode(rpt, *it, rootTotal, node.nBytes, level + 1,
                         printedNodes, maxPrintedNodes);
    }

    return rootTotal;
}

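// Column key for the tree printed above, as derived from the format strings:
// BytesIncl is a tag's inclusive byte count (itself plus all children),
// BytesExcl is its direct (exclusive) count, %Prnt is inclusive bytes as a
// share of the parent's total, % Exc is exclusive as a share of inclusive,
// and %Totl is exclusive bytes as a share of the root total.  Percentages at
// or below 0.5% are left blank.
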
static void
_PrintMallocCallSites(
    string* rpt,
    const vector<TfMallocTag::CallTree::CallSite>& callSites,
    size_t rootTotal)
{
    *rpt += TfStringPrintf("\n\nCall Sites\n\n");

    // Use a multimap to sort by allocation size.  (A plain map would
    // silently drop call sites whose byte counts happen to collide.)
    std::multimap<size_t, const string *> sortedSites;
    TF_FOR_ALL(csi, callSites) {
        sortedSites.insert(make_pair(csi->nBytes, &csi->name));
    }

    // XXX:cleanup We should pass in maxNameWidth.
    const size_t maxNameWidth = 72;
    const size_t maxBytesWidth = 15;
    const size_t maxPercentageWidth = 15;

    string fmt = TfStringPrintf(
        "%%-%zus %%%zus %%%zus\n",
        maxNameWidth, maxBytesWidth, maxPercentageWidth);

    *rpt += TfStringPrintf(fmt.c_str(), "NAME", "BYTES", "%ROOT");
    *rpt += string(maxNameWidth, '-') + ' ' +
            string(maxBytesWidth, '-') + ' ' +
            string(maxPercentageWidth, '-') + "\n\n";

    TF_REVERSE_FOR_ALL(it, sortedSites) {
        size_t nBytes = it->first;
        const string &name = *it->second;

        string curPercent;
        if (rootTotal) {
            double percent = 100.0 * nBytes / rootTotal;
            // Don't print anything less than 0.1%.
            if (percent < 0.1) {
                break;
            }
            curPercent = TfStringPrintf("%.1f%%", percent);
        }

        *rpt += TfStringPrintf(
            fmt.c_str(),
            name.substr(0, maxNameWidth).c_str(),
            _GetAsCommaSeparatedString(nBytes).c_str(),
            curPercent.c_str());
    }
}

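// The resulting table lists call sites from largest to smallest allocation,
// roughly like this (illustrative values; widths come from the constants
// above):
//
//     NAME ...                                       BYTES           %ROOT
//     ------------------------------ --------------- ---------------
//
//     MyModule                                     1,234,567         45.6%
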
// Comparison for sorting CallTree::PathNodes.
//
static bool
_MallocPathNodeLessThan(
    const TfMallocTag::CallTree::PathNode *lhs,
    const TfMallocTag::CallTree::PathNode *rhs)
{
    return lhs->siteName < rhs->siteName;
}

#if !(_DECREMENT_ALLOCATION_COUNTS)
// Returns the total number of allocations in the given sub-tree.
//
static int64_t
_GetNumAllocationInSubTree(
    const TfMallocTag::CallTree::PathNode &node)
{
    int64_t nAllocations = node.nAllocations;
    TF_FOR_ALL(it, node.children) {
        nAllocations += _GetNumAllocationInSubTree(*it);
    }
    return nAllocations;
}
#endif

static void
_ReportMallocNode(
    std::ostream &out,
    const TfMallocTag::CallTree::PathNode &node,
    size_t level,
    const std::string *rootName = nullptr)
{
    // Prune empty branches.
    if (node.nBytes == 0) {
        #if _DECREMENT_ALLOCATION_COUNTS
            return;
        #else
            if (_GetNumAllocationInSubTree(node) == 0) {
                return;
            }
        #endif
    }

    string indent(2 * level, ' ');

    // Insert '|' characters every 4 spaces.
    for (size_t i = 0; i < (level + 1) / 2; i++) {
        indent[4 * i] = '|';
    }

    out << TfStringPrintf(
        "%13s B %13s B %7ld samples    ",
        _GetAsCommaSeparatedString(node.nBytes).c_str(),
        _GetAsCommaSeparatedString(node.nBytesDirect).c_str(),
        node.nAllocations);

    out << indent
        << (rootName && !rootName->empty() ? *rootName : node.siteName)
        << std::endl;

    // Sort the children by name.  This is the easiest way to provide stable
    // results for diffing.  (One could argue that letting the diff program
    // do the sorting is more correct, i.e. that sorting is a view into the
    // unaltered source data.)
    std::vector<const TfMallocTag::CallTree::PathNode *> sortedChildren;
    sortedChildren.reserve(node.children.size());
    TF_FOR_ALL(it, node.children) {
        sortedChildren.push_back(&(*it));
    }

    std::sort(
        sortedChildren.begin(), sortedChildren.end(), _MallocPathNodeLessThan);

    TF_FOR_ALL(it, sortedChildren) {
        _ReportMallocNode(out, **it, level + 1);
    }
}

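// Each line of the tree view therefore looks roughly like this (illustrative
// values):
//
//         1,234,567 B        12,345 B     678 samples    | | SomeTag
//
// with inclusive bytes first, exclusive bytes second, and '|' markers
// showing the tag's nesting depth.
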
static void
_ReportCapturedMallocStacks(
    std::ostream &out,
    const std::vector<TfMallocTag::CallStackInfo> &stackInfos)
{
    size_t numReportedStacks =
        TfMin(stackInfos.size(), _MaxReportedMallocStacks);

    size_t totalSize = 0;
    size_t totalNumAllocations = 0;
    size_t reportSize = 0;
    size_t reportNumAllocations = 0;

    for (size_t n = 0; n < stackInfos.size(); n++) {
        const TfMallocTag::CallStackInfo &stackInfo = stackInfos[n];
        totalSize += stackInfo.size;
        totalNumAllocations += stackInfo.numAllocations;
        if (n < numReportedStacks) {
            reportSize += stackInfo.size;
            reportNumAllocations += stackInfo.numAllocations;
        }
    }

    // Guard against division by zero when no captured bytes remain.
    const double reportCoverage =
        totalSize ? 100.0 * reportSize / totalSize : 0.0;

    out << "\n\n\n"
        << "Captured Malloc Stacks\n"
        << "\n"
        << "Number of unique captured malloc stacks:          "
            << _GetAsCommaSeparatedString(stackInfos.size()) << "\n"
        << "Total allocated memory by captured mallocs:       "
            << _GetAsCommaSeparatedString(totalSize) << "\n"
        << "Total number of allocations by captured mallocs:  "
            << _GetAsCommaSeparatedString(totalNumAllocations) << "\n"
        << "\n"
        << "Number of captured malloc stacks in report:       "
            << _GetAsCommaSeparatedString(numReportedStacks) << "\n"
        << "Allocated memory by mallocs in report:            "
            << _GetAsCommaSeparatedString(reportSize) << "\n"
        << "Number of allocations by mallocs in report:       "
            << _GetAsCommaSeparatedString(reportNumAllocations) << "\n"
        << "Percentage of allocated memory covered by report: "
            << TfStringPrintf("%.1f%%", reportCoverage) << "\n\n";

    for (size_t n = 0; n < numReportedStacks; n++) {
        const TfMallocTag::CallStackInfo &stackInfo = stackInfos[n];

        out << string(100, '-') << "\n"
            << "Captured malloc stack #" << n << "\n"
            << "Size:            " <<
                _GetAsCommaSeparatedString(stackInfo.size) << "\n"
            << "Num allocations: " <<
                _GetAsCommaSeparatedString(stackInfo.numAllocations) << "\n";

        ArchPrintStackFrames(out, stackInfo.stack);
    }
}

string
TfMallocTag::CallTree::GetPrettyPrintString(PrintSetting setting,
                                            size_t maxPrintedNodes) const
{
    string rpt;

    _PrintHeader(&rpt);

    if (setting == TREE || setting == BOTH) {
        size_t printedNodes = 0;
        size_t reportedMem =
            _PrintMallocNode(&rpt, this->root, 0, 0, 0, printedNodes,
                             maxPrintedNodes);
        if (printedNodes >= maxPrintedNodes
            && reportedMem != GetTotalBytes()) {
            rpt += TfStringPrintf("\nWARNING: limit of %zu nodes visited, but "
                                  "only %zu bytes of %zu accounted for.  "
                                  "Running with a larger maxPrintedNodes will "
                                  "produce more accurate results.\n",
                                  maxPrintedNodes,
                                  reportedMem,
                                  GetTotalBytes());
        }
    }

    if (setting == CALLSITES || setting == BOTH) {
        _PrintMallocCallSites(&rpt, this->callSites, this->root.nBytes);
    }

    return rpt;
}

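// A reporting sketch (illustrative only; assumes the public
// TfMallocTag::GetCallTree() entry point and prior initialization):
//
//     TfMallocTag::CallTree tree;
//     if (TfMallocTag::GetCallTree(&tree)) {
//         printf("%s", tree.GetPrettyPrintString(
//                          TfMallocTag::CallTree::BOTH).c_str());
//     }
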
void
TfMallocTag::CallTree::Report(
    std::ostream &out) const
{
    const std::string emptyRootName;
    Report(out, emptyRootName);
}

void
TfMallocTag::CallTree::Report(
    std::ostream &out,
    const std::string &rootName) const
{
    out << "\nTree view  ==============\n";
    out << "      inclusive       exclusive\n";

    _ReportMallocNode(out, this->root, 0, &rootName);

    // Also add the dominant call sites to the report.
    out << GetPrettyPrintString(CALLSITES);

    // And the captured malloc stacks, if there are any.
    if (!this->capturedCallStacks.empty()) {
        _ReportCapturedMallocStacks(out, this->capturedCallStacks);
    }
}

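// Report() is the diff-friendly alternative to GetPrettyPrintString(): it
// writes the name-sorted tree view, then the dominant call sites, to the
// given stream.  For example (illustrative): tree.Report(std::cout);  The
// optional rootName, when non-empty, is displayed in place of the root
// node's own site name.
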
TfMallocTag::
_TemporaryTaggingState::_TemporaryTaggingState(_Tagging tempStatus)
    : _oldState(TfMallocTag::_GetTagging())
{
    TfMallocTag::_SetTagging(tempStatus);
}

TfMallocTag::_TemporaryTaggingState::~_TemporaryTaggingState()
{
    TfMallocTag::_SetTagging(_oldState);
}

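// _TemporaryTaggingState is a small RAII guard: the constructor records the
// current per-thread tagging state and switches to tempStatus, and the
// destructor restores the saved state.  A hypothetical internal use:
//
//     {
//         _TemporaryTaggingState noTags(_TaggingDisabled);
//         // ... allocations here are not attributed to any tag ...
//     }   // previous tagging state restored
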
PXR_NAMESPACE_CLOSE_SCOPE