1 //
2 // Copyright 2016 Pixar
3 //
4 // Licensed under the Apache License, Version 2.0 (the "Apache License")
5 // with the following modification; you may not use this file except in
6 // compliance with the Apache License and the following modification to it:
7 // Section 6. Trademarks. is deleted and replaced with:
8 //
9 // 6. Trademarks. This License does not grant permission to use the trade
10 // names, trademarks, service marks, or product names of the Licensor
11 // and its affiliates, except as required to comply with Section 4(c) of
12 // the License and to reproduce the content of the NOTICE file.
13 //
14 // You may obtain a copy of the Apache License at
15 //
16 // http://www.apache.org/licenses/LICENSE-2.0
17 //
18 // Unless required by applicable law or agreed to in writing, software
19 // distributed under the Apache License with the above modification is
20 // distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
21 // KIND, either express or implied. See the Apache License for the specific
22 // language governing permissions and limitations under the Apache License.
23 //
24
25 #include "pxr/pxr.h"
26 #include "pxr/base/tf/mallocTag.h"
27
28 #include "pxr/base/tf/diagnostic.h"
29 #include "pxr/base/tf/getenv.h"
30 #include "pxr/base/tf/hash.h"
31 #include "pxr/base/tf/hashmap.h"
32 #include "pxr/base/tf/iterator.h"
33 #include "pxr/base/tf/stl.h"
34 #include "pxr/base/tf/stringUtils.h"
35 #include "pxr/base/tf/tf.h"
36
37 #include "pxr/base/arch/attributes.h"
38 #include "pxr/base/arch/debugger.h"
39 #include "pxr/base/arch/hash.h"
40 #include "pxr/base/arch/inttypes.h"
41 #include "pxr/base/arch/mallocHook.h"
42 #include "pxr/base/arch/stackTrace.h"
43
44 #include <tbb/spin_mutex.h>
45
46 #include <algorithm>
47 #include <string>
48 #include <stdlib.h>
49 #include <thread>
50 #include <type_traits>
51 #include <vector>
52 #include <ostream>
53
54 using std::map;
55 using std::make_pair;
56 using std::pair;
57 using std::string;
58 using std::vector;
59
60 PXR_NAMESPACE_OPEN_SCOPE
61
62 // Change the following line and recompile this file to disable decrementing
63 // the allocation counts when freeing memory.
64 #define _DECREMENT_ALLOCATION_COUNTS true
65
66 // The max number of captured unique malloc stacks printed out in the report.
67 static const size_t _MaxReportedMallocStacks = 100;
68
69 // The max number of call stack frames stored when malloc stack capturing
70 // is enabled. Note that two malloc stacks are considered identical if all
71 // their frames up to this depth are matching (the uncaptured parts of the
72 // stacks can still differ).
73 static const size_t _MaxMallocStackDepth = 64;
74
75 // The number of top stack frames to ignore when saving frames for a
76 // malloc stack. Currently these frames are:
77 // #0 ArchGetStackFrames(unsigned long, vector<unsigned long, allocator<unsigned long> >*)
78 // #1 Tf_MallocGlobalData::_CaptureMallocStack(Tf_MallocPathNode const*, void const*, unsigned long)
79 // #2 TfMallocTag::_MallocWrapper(unsigned long, void const*)
80 static const size_t _IgnoreStackFramesCount = 3;
81
82 struct Tf_MallocPathNode;
83 struct Tf_MallocGlobalData;
84
85 static ArchMallocHook _mallocHook; // zero-initialized POD
86 static Tf_MallocGlobalData* _mallocGlobalData = NULL;
87 bool TfMallocTag::_doTagging = false;
88
89 static bool
_UsePtmalloc()90 _UsePtmalloc()
91 {
92 string impl = TfGetenv("TF_MALLOC_TAG_IMPL", "auto");
93 vector<string> legalImpl = {"auto", "agnostic",
94 "jemalloc", "jemalloc force",
95 "ptmalloc", "ptmalloc force",
96 "pxmalloc", "pxmalloc force"};
97
98 if (std::find(legalImpl.begin(), legalImpl.end(), impl) == legalImpl.end()) {
99 string values = TfStringJoin(legalImpl, "', '");
100 TF_WARN("Invalid value '%s' for TF_MALLOC_TAG_IMPL: "
101 "(not one of '%s')", impl.c_str(), values.c_str());
102 }
103
104 if (impl != "auto") {
105 fprintf(stderr, "########################################################################\n"
106 "# TF_MALLOC_TAG_IMPL is overridden to '%s'. Default is 'auto' #\n"
107 "########################################################################\n",
108 impl.c_str());
109 }
110
111 if (impl == "agnostic")
112 return false;
113
114 if (ArchIsPtmallocActive()) {
115 return true;
116 }
117 else if (TfStringStartsWith(impl, "ptmalloc")) {
118 TF_WARN("TfMallocTag can only use ptmalloc-specific implementation "
119 "when ptmalloc is active. Falling back to agnostic "
120 "implementation.");
121 }
122
123 return false;
124 }
125
126 /*
127 * We let malloc have BITS_FOR_MALLOC_SIZE instead of the usual 64.
128 * That leaves us 64 - BITS_FOR_MALLOC_SIZE for storing our own index,
129 * which effectively gives us a pointer to a Tf_MallocPathNode (but
130 * only for MAX_PATH_NODES different nodes).
131 */
// Low-order control-word bits reserved for the allocation size.
static const unsigned BITS_FOR_MALLOC_SIZE = 40;
// Remaining bits hold an index into Tf_MallocGlobalData::_allPathNodes.
static const unsigned BITS_FOR_INDEX = 64 - BITS_FOR_MALLOC_SIZE;
// Largest number of distinct path nodes the index can address.
static const size_t MAX_PATH_NODES = 1 << BITS_FOR_INDEX;
// The index lives in the upper 32-bit half of the control word,
// shifted up by this many bits.
static const unsigned HIWORD_INDEX_BIT_OFFSET = BITS_FOR_MALLOC_SIZE - 32;
static const unsigned HIWORD_INDEX_MASK = ~(~0U << HIWORD_INDEX_BIT_OFFSET); // (HIWORD_INDEX_BIT_OFFSET no. of 1 bits.)
// Selects the size bits of the control word; the low 3 bits are masked
// off as well (presumably allocator chunk-flag bits -- confirm against
// the ptmalloc chunk layout).
static const unsigned long long MALLOC_SIZE_MASK = ~(~0ULL << BITS_FOR_MALLOC_SIZE) & ~0x7ULL;
138
139 static bool Tf_MatchesMallocTagDebugName(const string& name);
140 static bool Tf_MatchesMallocTagTraceName(const string& name);
141 static void Tf_MallocTagDebugHook(void* ptr, size_t size) ARCH_NOINLINE;
142
// Debugger trap invoked (via _RunDebugHookForNode) for call sites that
// match the debug match list.  Deliberately ARCH_NOINLINE (see the
// declaration above) so it shows up as its own stack frame.
static void Tf_MallocTagDebugHook(void* ptr, size_t size)
{
    // Clients don't call this directly so the debugger can conveniently
    // see the pointer and size in the stack trace.
    ARCH_DEBUGGER_TRAP;
}
149
// Return the size that should be billed for an allocated block.
static size_t Tf_GetMallocBlockSize(void* ptr, size_t requestedSize)
{
    // The allocator-agnostic implementation reports exactly the size
    // the consumer asked for, deliberately ignoring allocator-specific
    // overhead (alignment padding, metadata, etc.).  Malloc tags exist
    // to bill memory requests to the subsystem that made them, so the
    // requested size is the right quantity to track.
    //
    // To track 'actual' block sizes instead, uncomment the line below.
    // The allocator in use must implement this function!  If it does
    // not, the default glibc implementation gets called and will likely
    // return the wrong value (unless the glibc allocator is active).
    // return malloc_usable_size(ptr);

    return requestedSize;
}
166
// Per-block bookkeeping record used by the allocator-agnostic
// implementation, packed into 64 bits via the bitfield split declared
// by BITS_FOR_MALLOC_SIZE / BITS_FOR_INDEX above.
struct Tf_MallocBlockInfo {
    Tf_MallocBlockInfo()
        : blockSize(0), pathNodeIndex(0)
    { }

    Tf_MallocBlockInfo(size_t size, uint32_t index)
        : blockSize(size), pathNodeIndex(index)
    { }

    // Size of the block as reported by Tf_GetMallocBlockSize().
    size_t blockSize:BITS_FOR_MALLOC_SIZE;
    // Index into Tf_MallocGlobalData::_allPathNodes.
    uint32_t pathNodeIndex:BITS_FOR_INDEX;
};
179
180 #if !defined(ARCH_OS_WINDOWS)
181 static_assert(sizeof(Tf_MallocBlockInfo) == 8,
182 "Unexpected size for Tf_MallocBlockInfo");
183 #endif
184
185 /*
186 * Utility for checking a const char* against a table of match strings.
187 * Each string is tested against each item in the table in order. Each
188 * item can either allow or deny the string, with later entries overriding
189 * earlier results. Match strings can end in '*' to wildcard the suffix
190 * and can start with '-' to deny or '+' or nothing to allow.
191 *
192 * Match strings are concatenated into lists using commas, newlines or tabs.
193 * Spaces are not delimiters but they are trimmed from each end.
194 */
class Tf_MallocTagStringMatchTable {
public:
    // Construct an empty table; Match() returns false until
    // SetMatchList() is called.
    Tf_MallocTagStringMatchTable();
    // Construct and immediately install \p matchList.
    explicit Tf_MallocTagStringMatchTable(const std::string& matchList);

    // Replace the list of matches.
    void SetMatchList(const std::string& matchList);

    // Return \c true iff \p s matches the most recently set match list.
    bool Match(const char* s) const;

private:
    // One parsed entry of the match list (see the class comment above
    // for the '-', '+' and '*' syntax).
    struct _MatchString {
        _MatchString(const std::string&);

        std::string str;     // String to match.
        bool allow:1;        // New result if str matches.
        bool wildcard:1;     // str has a suffix wildcard.
    };
    std::vector<_MatchString> _matchStrings;
};
216
_MatchString(const std::string & s)217 Tf_MallocTagStringMatchTable::_MatchString::_MatchString(const std::string& s) :
218 str(s),
219 allow(true),
220 wildcard(false)
221 {
222 if (!str.empty()) {
223 if (str[str.size() - 1] == '*') {
224 wildcard = true;
225 str.resize(str.size() - 1);
226 }
227 if (!str.empty()) {
228 if (str[0] == '-') {
229 allow = false;
230 str.erase(0, 1);
231 }
232 else if (str[0] == '+') {
233 str.erase(0, 1);
234 }
235 }
236 }
237 }
238
// Default-construct with an empty match list: Match() reports false
// for every string until SetMatchList() is called.
Tf_MallocTagStringMatchTable::Tf_MallocTagStringMatchTable()
{
    // Do nothing
}
243
// Convenience constructor: install \p matchList immediately.
Tf_MallocTagStringMatchTable::Tf_MallocTagStringMatchTable(
    const std::string& matchList)
{
    SetMatchList(matchList);
}
249
250 void
SetMatchList(const std::string & matchList)251 Tf_MallocTagStringMatchTable::SetMatchList(const std::string& matchList)
252 {
253 _matchStrings.clear();
254 std::vector<std::string> items = TfStringTokenize(matchList, ",\t\n");
255 TF_FOR_ALL(i, items) {
256 _matchStrings.push_back(_MatchString(TfStringTrim(*i, " ")));
257 }
258 }
259
260 bool
Match(const char * s) const261 Tf_MallocTagStringMatchTable::Match(const char* s) const
262 {
263 // The last match defines the overall result. If the last match had
264 // a '-' prefix then we don't match, otherwise we do.
265 TF_REVERSE_FOR_ALL(i, _matchStrings) {
266 if (i->wildcard) {
267 // Check prefix match.
268 const char* m = i->str.c_str();
269 while (*m && *m == *s) {
270 ++m, ++s;
271 }
272 if (*m != '\0') {
273 continue;
274 }
275 }
276 else {
277 // Check exact match.
278 if (i->str != s) {
279 continue;
280 }
281 }
282
283 // Matched.
284 return i->allow;
285 }
286
287 // No match.
288 return false;
289 }
290
291 /*
292 * There is a different call-site object associated with each different
293 * tag string used to construct a TfAutoMallocTag.
294 */
struct Tf_MallocCallSite
{
    // Note: the constructor consults the global debug/trace match
    // tables, so _mallocGlobalData must already be constructed when a
    // call site is created.
    Tf_MallocCallSite(const string& name, uint32_t index)
        : _name(name), _totalBytes(0), _nPaths(0), _index(index)
    {
        _debug = Tf_MatchesMallocTagDebugName(_name);
        _trace = Tf_MatchesMallocTagTraceName(_name);
    }

    // Note: _name needs to be const since we call c_str() on it.
    const string _name;
    int64_t _totalBytes;    // Bytes currently billed to this site.
    size_t _nPaths;         // Number of path nodes referencing this site.
    uint32_t _index;        // Creation order index in the call-site table.

    // If true then invoke the debugger trap when allocating or freeing
    // at this site.
    bool _debug:1;

    // If true then capture a stack trace when allocating at this site.
    bool _trace:1;
};
317
318 namespace {
319
320 typedef TfHashMap<const char*, struct Tf_MallocCallSite*,
321 TfHashCString,
322 TfEqualCString> Tf_MallocCallSiteTable;
323
Tf_GetOrCreateCallSite(Tf_MallocCallSiteTable * table,const char * name,size_t * traceSiteCount)324 Tf_MallocCallSite* Tf_GetOrCreateCallSite(Tf_MallocCallSiteTable* table,
325 const char* name,
326 size_t* traceSiteCount) {
327 TF_AXIOM(table);
328 Tf_MallocCallSiteTable::iterator it = table->find(name);
329
330 if (it == table->end()) {
331 Tf_MallocCallSite* site =
332 new Tf_MallocCallSite(name, static_cast<uint32_t>(table->size()));
333 // site->_name is const so it is ok to use c_str() as the key.
334 (*table)[site->_name.c_str()] = site;
335 if (site->_trace) {
336 ++*traceSiteCount;
337 }
338 return site;
339 } else {
340 return it->second;
341 }
342 }
343 }
344
345 /*
346 * This is a singleton. Because access to this structure is gated via checks
347 * to TfMallocTag::_doTagging, we forego the usual TfSingleton pattern and just
348 * use a single static-scoped pointer (_mallocGlobalData) to point to the
349 * singleton instance.
350 */
351 struct Tf_MallocGlobalData
352 {
Tf_MallocGlobalDataTf_MallocGlobalData353 Tf_MallocGlobalData() {
354 _allPathNodes.reserve(1024);
355 _totalBytes = 0;
356 _maxTotalBytes = 0;
357 _warned = false;
358 _captureCallSiteCount = 0;
359 _captureStack.reserve(_MaxMallocStackDepth);
360 }
361
_GetOrCreateCallSiteTf_MallocGlobalData362 Tf_MallocCallSite* _GetOrCreateCallSite(const char* name) {
363 return Tf_GetOrCreateCallSite(&_callSiteTable, name,
364 &_captureCallSiteCount);
365 }
366
367 inline bool _RegisterPathNode(Tf_MallocPathNode*);
368 inline bool _RegisterPathNodeForBlock(
369 Tf_MallocPathNode* pathNode, void* block, size_t blockSize);
370 inline bool _UnregisterPathNodeForBlock(
371 void* block, Tf_MallocBlockInfo* blockInfo);
372
_IsMallocStackCapturingEnabledTf_MallocGlobalData373 bool _IsMallocStackCapturingEnabled() const {
374 return _captureCallSiteCount != 0;
375 }
376
377 void _RunDebugHookForNode(const Tf_MallocPathNode* node, void*, size_t);
378
379 void _GetStackTrace(size_t skipFrames, std::vector<uintptr_t>* stack);
380
381 void _SetTraceNames(const std::string& matchList);
382 bool _MatchesTraceName(const std::string& name);
383 void _CaptureMallocStack(
384 const Tf_MallocPathNode* node, const void *ptr, size_t size);
385 void _ReleaseMallocStack(
386 const Tf_MallocPathNode* node, const void *ptr);
387
388 void _BuildUniqueMallocStacks(TfMallocTag::CallTree* tree);
389
390 void _SetDebugNames(const std::string& matchList);
391 bool _MatchesDebugName(const std::string& name);
392
393 typedef TfHashMap<const void *, TfMallocTag::CallStackInfo, TfHash>
394 _CallStackTableType;
395
396 tbb::spin_mutex _mutex;
397 Tf_MallocPathNode* _rootNode;
398 Tf_MallocCallSiteTable _callSiteTable;
399
400 // Vector of path nodes indicating location of an allocated block.
401 // Implementations associate indices into this vector with a block.
402 vector<struct Tf_MallocPathNode*> _allPathNodes;
403
404 // Mapping from memory block to information about that block.
405 // Used by allocator-agnostic implementation.
406 typedef TfHashMap<const void *, Tf_MallocBlockInfo, TfHash>
407 _PathNodeTableType;
408 _PathNodeTableType _pathNodeTable;
409
410 size_t _captureCallSiteCount;
411 _CallStackTableType _callStackTable;
412 Tf_MallocTagStringMatchTable _traceMatchTable;
413
414 int64_t _totalBytes;
415 int64_t _maxTotalBytes;
416 bool _warned;
417
418 Tf_MallocTagStringMatchTable _debugMatchTable;
419
420 // Pre-allocated space for getting stack traces.
421 vector<uintptr_t> _captureStack;
422 };
423
424 /*
425 * Each node describes a sequence (i.e. path) of call sites.
426 * However, a given call-site can occur only once in a given path -- recursive
427 * call loops are excised.
428 */
struct Tf_MallocPathNode
{
    // Nodes start with zero counts; _index is assigned later by
    // Tf_MallocGlobalData::_RegisterPathNode().
    Tf_MallocPathNode(Tf_MallocCallSite* callSite)
        : _callSite(callSite),
          _totalBytes(0),
          _numAllocations(0),
          _index(0),
          _repeated(false)
    {
    }

    // Return the child node for \p site, creating and globally
    // registering it on first use.  Returns NULL when the maximum
    // number of path nodes has been reached.  NOTE(review): callers
    // appear to hold _mallocGlobalData->_mutex around this -- confirm.
    Tf_MallocPathNode* _GetOrCreateChild(Tf_MallocCallSite* site)
    {
        // Note: As long as the number of children is quite small, using a
        // vector is a good option here. If this assumption changes we
        // should change this back to using a map (or TfHashMap).
        TF_FOR_ALL(it, _children) {
            if (it->first == site) {
                return it->second;
            }
        }
        Tf_MallocPathNode* pathNode = new Tf_MallocPathNode(site);
        if (!_mallocGlobalData->_RegisterPathNode(pathNode)) {
            delete pathNode;
            return NULL;
        }

        _children.push_back(make_pair(site, pathNode));
        site->_nPaths++;
        return pathNode;
    }

    // Convert this subtree into the public CallTree::PathNode form.
    void _BuildTree(TfMallocTag::CallTree::PathNode* node,
                    bool skipRepeated);

    Tf_MallocCallSite* _callSite;   // Call site this path ends at.
    int64_t _totalBytes;            // Bytes billed directly to this node.
    int64_t _numAllocations;        // Allocations recorded at this node.
    vector<pair<Tf_MallocCallSite*, Tf_MallocPathNode*> > _children;
    uint32_t _index; // only 24 bits
    bool _repeated; // repeated node
};
471
472 inline bool
_RegisterPathNode(Tf_MallocPathNode * pathNode)473 Tf_MallocGlobalData::_RegisterPathNode(Tf_MallocPathNode* pathNode)
474 {
475 if (_allPathNodes.size() == MAX_PATH_NODES) {
476 if (!_warned) {
477 TF_WARN("maximum no. of TfMallocTag nodes has been reached!");
478 _warned = true;
479 }
480 return false;
481
482 }
483 pathNode->_index = static_cast<uint32_t>(_allPathNodes.size());
484 _allPathNodes.push_back(pathNode);
485 return true;
486 }
487
488 inline bool
_RegisterPathNodeForBlock(Tf_MallocPathNode * pathNode,void * block,size_t blockSize)489 Tf_MallocGlobalData::_RegisterPathNodeForBlock(
490 Tf_MallocPathNode* pathNode, void* block, size_t blockSize)
491 {
492 // Disable tagging for this thread so any allocations caused
493 // here do not get intercepted and cause recursion.
494 TfMallocTag::_TemporaryTaggingState tmpState(TfMallocTag::_TaggingDisabled);
495
496 const Tf_MallocBlockInfo blockInfo(blockSize, pathNode->_index);
497 return _pathNodeTable.insert(std::make_pair(block, blockInfo)).second;
498 }
499
500 inline bool
_UnregisterPathNodeForBlock(void * block,Tf_MallocBlockInfo * blockInfo)501 Tf_MallocGlobalData::_UnregisterPathNodeForBlock(
502 void* block, Tf_MallocBlockInfo* blockInfo)
503 {
504 // Disable tagging for this thread so any allocations caused
505 // here do not get intercepted and cause recursion.
506 TfMallocTag::_TemporaryTaggingState tmpState(TfMallocTag::_TaggingDisabled);
507
508 _PathNodeTableType::iterator it = _pathNodeTable.find(block);
509 if (it != _pathNodeTable.end()) {
510 *blockInfo = it->second;
511 _pathNodeTable.erase(it);
512 return true;
513 }
514
515 return false;
516 }
517
// Capture the current call stack (up to _MaxMallocStackDepth frames,
// skipping the top \p skipFrames frames) and append it to *stack.
// Uses the pre-reserved _captureStack buffer to avoid allocating while
// walking the stack.
void
Tf_MallocGlobalData::_GetStackTrace(
    size_t skipFrames,
    std::vector<uintptr_t>* stack)
{
    // Get the stack trace.
    ArchGetStackFrames(_MaxMallocStackDepth, skipFrames, &_captureStack);

    // Copy into stack, reserving exactly enough space.
    stack->reserve(_captureStack.size());
    stack->insert(stack->end(), _captureStack.begin(), _captureStack.end());

    // Done with stack trace; clear() keeps the reserved capacity.
    _captureStack.clear();
}
533
534 void
_SetTraceNames(const std::string & matchList)535 Tf_MallocGlobalData::_SetTraceNames(const std::string& matchList)
536 {
537 TfMallocTag::_TemporaryTaggingState tmpState(TfMallocTag::_TaggingDisabled);
538
539 _traceMatchTable.SetMatchList(matchList);
540
541 // Update trace flag on every existing call site.
542 _captureCallSiteCount = 0;
543 TF_FOR_ALL(i, _callSiteTable) {
544 i->second->_trace = _traceMatchTable.Match(i->second->_name.c_str());
545 if (i->second->_trace) {
546 ++_captureCallSiteCount;
547 }
548 }
549 }
550
// Return true iff \p name matches the current stack-capture match list.
bool
Tf_MallocGlobalData::_MatchesTraceName(const std::string& name)
{
    return _traceMatchTable.Match(name.c_str());
}
556
// Free-function shim used by the Tf_MallocCallSite constructor.
// Requires _mallocGlobalData to be non-NULL.
static bool Tf_MatchesMallocTagTraceName(const string& name)
{
    return _mallocGlobalData->_MatchesTraceName(name);
}
561
562 void
_CaptureMallocStack(const Tf_MallocPathNode * node,const void * ptr,size_t size)563 Tf_MallocGlobalData::_CaptureMallocStack(
564 const Tf_MallocPathNode* node, const void *ptr, size_t size)
565 {
566 if (node->_callSite->_trace) {
567 // Disable tagging for this thread so any allocations caused
568 // here do not get intercepted and cause recursion.
569 TfMallocTag::_TemporaryTaggingState
570 tmpState(TfMallocTag::_TaggingDisabled);
571
572 TfMallocTag::CallStackInfo &stackInfo = _callStackTable[ptr];
573 _GetStackTrace(_IgnoreStackFramesCount, &stackInfo.stack);
574 stackInfo.size = size;
575 stackInfo.numAllocations = 1;
576 }
577 }
578
579 void
_ReleaseMallocStack(const Tf_MallocPathNode * node,const void * ptr)580 Tf_MallocGlobalData::_ReleaseMallocStack(
581 const Tf_MallocPathNode* node, const void *ptr)
582 {
583 if (node->_callSite->_trace) {
584 _CallStackTableType::iterator i = _callStackTable.find(ptr);
585 if (i != _callStackTable.end()) {
586 // Disable tagging for this thread so any allocations caused
587 // here do not get intercepted and cause recursion.
588 TfMallocTag::_TemporaryTaggingState
589 tmpState(TfMallocTag::_TaggingDisabled);
590 _callStackTable.erase(i);
591 }
592 }
593 }
594
// Trap into the debugger (via Tf_MallocTagDebugHook) for allocations
// or frees at call sites selected by SetDebugMatchList.
void
Tf_MallocGlobalData::_RunDebugHookForNode(
    const Tf_MallocPathNode* node, void* ptr, size_t size)
{
    if (node->_callSite->_debug)
        Tf_MallocTagDebugHook(ptr, size);
}
602
603 void
_SetDebugNames(const std::string & matchList)604 Tf_MallocGlobalData::_SetDebugNames(const std::string& matchList)
605 {
606 TfMallocTag::_TemporaryTaggingState tmpState(TfMallocTag::_TaggingDisabled);
607
608 _debugMatchTable.SetMatchList(matchList);
609
610 // Update debug flag on every existing call site.
611 TF_FOR_ALL(i, _callSiteTable) {
612 i->second->_debug = _debugMatchTable.Match(i->second->_name.c_str());
613 }
614 }
615
// Return true iff \p name matches the current debug match list.
bool
Tf_MallocGlobalData::_MatchesDebugName(const std::string& name)
{
    return _debugMatchTable.Match(name.c_str());
}
621
// Free-function shim used by the Tf_MallocCallSite constructor.
// Requires _mallocGlobalData to be non-NULL.
static bool Tf_MatchesMallocTagDebugName(const string& name)
{
    return _mallocGlobalData->_MatchesDebugName(name);
}
626
627 namespace {
628 // Hash functor for a malloc stack.
629 //
630 struct _HashMallocStack
631 {
operator ()__anoneab1f55c0211::_HashMallocStack632 size_t operator()(const vector<uintptr_t> &stack) const {
633 return ArchHash(
634 (const char *)&stack[0], sizeof(uintptr_t) * stack.size());
635 }
636 };
637
638 // The data associated with a malloc stack (a pointer to the malloc stack
639 // itself, and the allocation size and number of allocations).
640 //
struct _MallocStackData
{
    // Non-owning pointer to the stack stored in the call-stack table;
    // valid only while that table is unchanged.
    const vector<uintptr_t> *stack;
    size_t size;            // Total bytes attributed to this stack.
    size_t numAllocations;  // Total allocations attributed to this stack.
};
647 }
648
649 // Comparison for sorting TfMallocTag::CallStackInfo based on their
650 // allocation size.
651 //
// Order two stack-data records by total allocated bytes (ascending);
// used with std::sort before the reverse traversal below.
static bool
_MallocStackDataLessThan(
    const _MallocStackData *lhs,
    const _MallocStackData *rhs)
{
    return lhs->size < rhs->size;
}
659
660 // Builds a vector of unique captured malloc stacks and stores the result
661 // in tree->capturedCallStacks. The malloc stacks are sorted with the
662 // stacks that allocated the most memory at the front of the vector.
663 //
// Build tree->capturedCallStacks: one entry per unique captured stack,
// with sizes and allocation counts summed across duplicates, ordered
// largest allocation size first.
void
Tf_MallocGlobalData::_BuildUniqueMallocStacks(TfMallocTag::CallTree* tree)
{
    if (!_callStackTable.empty()) {
        // Create a map from malloc stacks to the malloc stack data.
        typedef TfHashMap<
            vector<uintptr_t>, _MallocStackData, _HashMallocStack> _Map;
        _Map map;

        TF_FOR_ALL(it, _callStackTable) {
            // Since _callStackTable does not change at this point it is
            // ok to store the address of the malloc stack in the data.
            const TfMallocTag::CallStackInfo &stackInfo = it->second;
            _MallocStackData data = { &stackInfo.stack, 0, 0 };

            // insert() is a no-op for an already-seen stack; either way
            // the totals are accumulated on the stored record.
            pair<_Map::iterator, bool> insertResult = map.insert(
                make_pair(stackInfo.stack, data));

            _MallocStackData &updateData = insertResult.first->second;
            updateData.size += stackInfo.size;
            updateData.numAllocations += stackInfo.numAllocations;
        }

        // Sort the malloc stack data by allocation size (ascending;
        // the reverse traversal below emits largest first).
        std::vector<const _MallocStackData *> sortedStackData;
        sortedStackData.reserve(map.size());
        TF_FOR_ALL(it, map) {
            sortedStackData.push_back(&it->second);
        }

        std::sort(
            sortedStackData.begin(),
            sortedStackData.end(),
            _MallocStackDataLessThan);

        tree->capturedCallStacks.reserve(sortedStackData.size());
        TF_REVERSE_FOR_ALL(it, sortedStackData) {
            const _MallocStackData &data = **it;

            tree->capturedCallStacks.push_back(TfMallocTag::CallStackInfo());
            TfMallocTag::CallStackInfo &stackInfo =
                tree->capturedCallStacks.back();

            // Take a copy of the malloc stack.
            stackInfo.stack = *data.stack;
            stackInfo.size = data.size;
            stackInfo.numAllocations = data.numAllocations;
        }
    }
}
714
715
// Recursively convert this internal path-node subtree into the public
// CallTree::PathNode representation rooted at \p node.  When
// \p skipRepeated is true, nodes flagged _repeated are collapsed into
// their parents (their direct bytes and children are folded upward).
void
Tf_MallocPathNode::_BuildTree(TfMallocTag::CallTree::PathNode* node,
                              bool skipRepeated)
{
    node->children.reserve(_children.size());
    node->nBytes = node->nBytesDirect = _totalBytes;
    node->nAllocations = _numAllocations;
    node->siteName = _callSite->_name;

    TF_FOR_ALL(pi, _children) {
        // The tree is built in a special way, if the repeated allocations
        // should be skipped. First, the full tree is built using temporary
        // nodes for all allocations that should be skipped. Then tree is
        // collapsed by copying the children of temporary nodes to their parents
        // in bottom-up fashion.
        if (skipRepeated && pi->second->_repeated) {
            // Create a temporary node
            TfMallocTag::CallTree::PathNode childNode;
            pi->second->_BuildTree(&childNode, skipRepeated);
            // Add the direct contribution of this node to the parent.
            node->nBytesDirect += childNode.nBytesDirect;
            // Copy the children, if there are any
            if (!childNode.children.empty()) {
                node->children.insert(node->children.end(),
                                      childNode.children.begin(),
                                      childNode.children.end());
            }
            node->nBytes += childNode.nBytes;
        } else {
            node->children.push_back(TfMallocTag::CallTree::PathNode());
            TfMallocTag::CallTree::PathNode& childNode = node->children.back();
            pi->second->_BuildTree(&childNode, skipRepeated);
            node->nBytes += childNode.nBytes;
        }
    }
}
752
753 namespace {
Tf_GetCallSites(TfMallocTag::CallTree::PathNode * node,Tf_MallocCallSiteTable * table)754 void Tf_GetCallSites(TfMallocTag::CallTree::PathNode* node,
755 Tf_MallocCallSiteTable* table) {
756 TF_AXIOM(node);
757 TF_AXIOM(table);
758
759 size_t dummy;
760 Tf_MallocCallSite* site =
761 Tf_GetOrCreateCallSite(table, node->siteName.c_str(), &dummy);
762 site->_totalBytes += node->nBytesDirect;
763
764 TF_FOR_ALL(pi, node->children) {
765 Tf_GetCallSites(&(*pi), table);
766 }
767 }
768 }
769
770 /*
771 * None of this is implemented for a 32-bit build.
772 */
773
774 #define _HI_WORD(sptr) *(((int *)sptr) + 1)
775 #define _LO_WORD(sptr) *((int *)sptr)
776
777 #if defined(ARCH_BITS_64)
778
779 // This modifies the control word associated with \a ptr, removing the stored
780 // index, and returning the index and allocation size.
// This modifies the control word associated with \a ptr, removing the stored
// index, and returning the index and allocation size.
// NOTE(review): the "control word" is the size_t immediately preceding
// the user pointer -- this assumes the ptmalloc chunk layout; only used
// on the ptmalloc-specific code path.
static inline void
_ExtractIndexAndGetSize(void *ptr, size_t *size, uint32_t *index)
{
    // Get the control word.
    size_t *sptr = static_cast<size_t *>(ptr) - 1;

    // Read the stored index from the high 32 bits.
    *index = _HI_WORD(sptr) >> HIWORD_INDEX_BIT_OFFSET;

    // Read the size (low BITS_FOR_MALLOC_SIZE bits, flag bits masked).
    *size = *sptr & MALLOC_SIZE_MASK;

    // Remove the stored index from the word.
    _HI_WORD(sptr) &= HIWORD_INDEX_MASK;

}
797
798 // This modifies the control word associated with \a ptr, storing \a index, and
799 // returning the allocation size.
// This modifies the control word associated with \a ptr, storing \a index, and
// returning the allocation size.
// NOTE(review): sptr is declared const, but the _HI_WORD macro's
// C-style cast drops the qualifier for the write below.
static inline void
_StoreIndexAndGetSize(void *ptr, size_t *size, uint32_t index)
{
    // Get the control word.
    size_t const *sptr = static_cast<size_t const *>(ptr) - 1;

    // Read the size (low BITS_FOR_MALLOC_SIZE bits, flag bits masked).
    *size = *sptr & MALLOC_SIZE_MASK;

    // Write the index into the high 32 bits.
    _HI_WORD(sptr) |= (index << HIWORD_INDEX_BIT_OFFSET);
}
812
813 #else
814
815 // Allow compilation, but just fatal error. This code shouldn't ever be active
816
// 32-bit stub: compiles, but must never be reached (see comment above).
static inline void
_ExtractIndexAndGetSize(void *, size_t *, uint32_t *)
{
    TF_FATAL_ERROR("Attempting to use Malloc Tags on unsupported platform");
}
822
// 32-bit stub: compiles, but must never be reached (see comment above).
static inline void
_StoreIndexAndGetSize(void *, size_t *, uint32_t)
{
    TF_FATAL_ERROR("Attempting to use Malloc Tags on unsupported platform");
}
828
829 #endif
830
831 // Per-thread data for TfMallocTag.
struct TfMallocTag::_ThreadData {
    // Threads start out dormant (not tagging).
    _ThreadData() : _tagState(_TaggingDormant) { }
    // Non-copyable / non-movable: instances live in thread-local
    // storage (see TfMallocTag::Tls::Find).
    _ThreadData(const _ThreadData &) = delete;
    _ThreadData(_ThreadData&&) = delete;
    _ThreadData& operator=(const _ThreadData &rhs) = delete;
    _ThreadData& operator=(_ThreadData&&) = delete;

    // Current tagging mode for this thread.
    _Tagging _tagState;
    // Stack of active tags; the back entry receives new allocations
    // (see _GetCurrentPathNodeNoLock).
    std::vector<Tf_MallocPathNode*> _tagStack;
    // NOTE(review): usage not visible in this chunk -- presumably
    // per-call-site occurrence counts for the tag stack; confirm.
    std::vector<unsigned int> _callSiteOnStack;
};
843
class TfMallocTag::Tls {
public:
    // Return this thread's _ThreadData, constructing it in thread-local
    // storage on first use.  Never returns NULL on supported platforms.
    static
    TfMallocTag::_ThreadData*
    Find()
    {
#if defined(ARCH_HAS_THREAD_LOCAL)
        // This weirdness is so we don't use the heap and we don't call
        // the destructor of _ThreadData when the thread is exiting.
        // We can't do the latter because we don't know in what order
        // objects will be destroyed and objects destroyed after the
        // _ThreadData may do heap (de)allocation, which requires the
        // _ThreadData object. We leak the heap allocated blocks in
        // the _ThreadData.
        static thread_local
            std::aligned_storage<sizeof(_ThreadData),
                                 alignof(_ThreadData)>::type dataBuffer;
        // Placement-new into the buffer exactly once per thread; the
        // object is deliberately never destroyed (see above).
        static thread_local _ThreadData* data = new (&dataBuffer) _ThreadData;
        return data;
#else
        TF_FATAL_ERROR("TfMallocTag not supported on platforms "
                       "without thread_local");
        return nullptr;
#endif
    }
};
870
871 /*
872 * If this returns false, it sets *tptr. Otherwise,
873 * we don't need *tptr, so it may not be set.
874 */
875 inline bool
_ShouldNotTag(TfMallocTag::_ThreadData ** tptr,_Tagging * statePtr)876 TfMallocTag::_ShouldNotTag(TfMallocTag::_ThreadData** tptr, _Tagging* statePtr)
877 {
878 if (!TfMallocTag::_doTagging) {
879 if (statePtr) {
880 *statePtr = _TaggingDormant;
881 }
882 return true;
883 }
884 else {
885 *tptr = TfMallocTag::Tls::Find();
886 if (statePtr) {
887 *statePtr = (*tptr)->_tagState;
888 }
889 return (*tptr)->_tagState != _TaggingEnabled;
890 }
891 }
892
893 // Helper function to retrieve the current path node from a _ThreadData
894 // object. Note that _mallocGlobalData->_mutex must be locked before calling
895 // this function.
896 inline Tf_MallocPathNode*
_GetCurrentPathNodeNoLock(const TfMallocTag::_ThreadData * tptr)897 TfMallocTag::_GetCurrentPathNodeNoLock(const TfMallocTag::_ThreadData* tptr)
898 {
899 if (!tptr->_tagStack.empty()) {
900 return tptr->_tagStack.back();
901 }
902
903 // If the _ThreadData does not have any entries in its tag stack, return
904 // the global root so that any memory allocations are assigned to that
905 // node.
906 return _mallocGlobalData->_rootNode;
907 }
908
909 void
SetDebugMatchList(const std::string & matchList)910 TfMallocTag::SetDebugMatchList(const std::string& matchList)
911 {
912 if (TfMallocTag::IsInitialized()) {
913 tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
914 _mallocGlobalData->_SetDebugNames(matchList);
915 }
916 }
917
918 void
SetCapturedMallocStacksMatchList(const std::string & matchList)919 TfMallocTag::SetCapturedMallocStacksMatchList(const std::string& matchList)
920 {
921 if (TfMallocTag::IsInitialized()) {
922 tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
923 _mallocGlobalData->_SetTraceNames(matchList);
924 }
925 }
926
927 vector<vector<uintptr_t> >
GetCapturedMallocStacks()928 TfMallocTag::GetCapturedMallocStacks()
929 {
930 vector<vector<uintptr_t> > result;
931
932 if (!TfMallocTag::IsInitialized())
933 return result;
934
935 // Push some malloc tags, so what we do here doesn't pollute the root
936 // stacks.
937 TfAutoMallocTag2 tag("Tf", "TfGetRootMallocStacks");
938
939 // Copy off the stack traces, make sure to malloc outside.
940 Tf_MallocGlobalData::_CallStackTableType traces;
941
942 // Swap them out while holding the lock.
943 {
944 tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
945 traces.swap(_mallocGlobalData->_callStackTable);
946 }
947
948 TF_FOR_ALL(i, traces)
949 result.push_back(i->second.stack);
950
951 return result;
952 }
953
// Malloc hook.  Performs the real allocation first, then, if tagging is
// active on this thread, charges the new block to the thread's current tag
// path node under the global mutex.
void*
TfMallocTag::_MallocWrapper(size_t nBytes, const void*)
{
    void* ptr = _mallocHook.Malloc(nBytes);

    // Skip all bookkeeping if tagging is off for this thread or the
    // underlying allocation failed.
    _ThreadData* td;
    if (_ShouldNotTag(&td) || ARCH_UNLIKELY(!ptr))
        return ptr;

    {
        tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

        Tf_MallocPathNode* node = _GetCurrentPathNodeNoLock(td);
        size_t blockSize = Tf_GetMallocBlockSize(ptr, nBytes);

        // Update malloc global data with bookkeeping information. This has to
        // happen while the mutex is held.
        if (_mallocGlobalData->_RegisterPathNodeForBlock(node, ptr, blockSize)) {
            _mallocGlobalData->_CaptureMallocStack(node, ptr, blockSize);

            node->_totalBytes += blockSize;
            node->_numAllocations++;
            node->_callSite->_totalBytes += blockSize;
            _mallocGlobalData->_totalBytes += blockSize;

            // Track the high-water mark of total tagged bytes.
            _mallocGlobalData->_maxTotalBytes =
                std::max(_mallocGlobalData->_totalBytes,
                         _mallocGlobalData->_maxTotalBytes);

            _mallocGlobalData->_RunDebugHookForNode(node, ptr, blockSize);

            return ptr;
        }
    }

    // Make sure we issue this error while the mutex is unlocked, as issuing
    // the error could cause more allocations, leading to a reentrant call.
    //
    // This should only happen if there's a bug with removing previously
    // allocated blocks from the path node table. This likely would cause us to
    // miscount memory usage, but the allocated pointer is still valid and the
    // system should continue to work. So, we issue a warning but continue on
    // instead of using an axiom.
    TF_VERIFY(!"Failed to register path for allocated block. "
               "Memory usage may be miscounted");

    return ptr;
}
1002
1003 void*
_ReallocWrapper(void * oldPtr,size_t nBytes,const void *)1004 TfMallocTag::_ReallocWrapper(void* oldPtr, size_t nBytes, const void*)
1005 {
1006 /*
1007 * If ptr is NULL, we want to make sure we don't double count,
1008 * because a call to _mallocHook.Realloc(ptr, nBytes) could call
1009 * through to our malloc. To avoid this, we'll explicitly short-circuit
1010 * ourselves rather than trust that the malloc library will do it.
1011 */
1012 if (!oldPtr)
1013 return _MallocWrapper(nBytes, NULL);
1014
1015 _ThreadData* td = NULL;
1016 _Tagging tagState;
1017 const bool shouldNotTag = _ShouldNotTag(&td, &tagState);
1018
1019 // If tagging is explicitly disabled, just do the realloc and skip
1020 // everything else. This avoids a deadlock if we get here while updating
1021 // Tf_MallocGlobalData::_pathNodeTable.
1022 //
1023 // If tagState is _TaggingDormant, we still need to unregister the oldPtr.
1024 // However, we won't need to register the newly realloc'd ptr later on.
1025 if (tagState == _TaggingDisabled) {
1026 return _mallocHook.Realloc(oldPtr, nBytes);
1027 }
1028
1029 void* newPtr = NULL;
1030 {
1031 tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
1032
1033 Tf_MallocBlockInfo info;
1034 if (_mallocGlobalData->_UnregisterPathNodeForBlock(oldPtr, &info)) {
1035
1036 size_t bytesFreed = info.blockSize;
1037 Tf_MallocPathNode* oldNode =
1038 _mallocGlobalData->_allPathNodes[info.pathNodeIndex];
1039
1040 _mallocGlobalData->_RunDebugHookForNode(oldNode, oldPtr, bytesFreed);
1041
1042 // Check if we should release a malloc stack. This has to happen
1043 // while the mutex is held.
1044 _mallocGlobalData->_ReleaseMallocStack(oldNode, oldPtr);
1045
1046 oldNode->_totalBytes -= bytesFreed;
1047 oldNode->_numAllocations -= (_DECREMENT_ALLOCATION_COUNTS) ? 1 : 0;
1048 oldNode->_callSite->_totalBytes -= bytesFreed;
1049 _mallocGlobalData->_totalBytes -= bytesFreed;
1050 }
1051
1052 newPtr = _mallocHook.Realloc(oldPtr, nBytes);
1053
1054 if (shouldNotTag || ARCH_UNLIKELY(!newPtr))
1055 return newPtr;
1056
1057 Tf_MallocPathNode* newNode = _GetCurrentPathNodeNoLock(td);
1058 size_t blockSize = Tf_GetMallocBlockSize(newPtr, nBytes);
1059
1060 // Update malloc global data with bookkeeping information. This has to
1061 // happen while the mutex is held.
1062 if (_mallocGlobalData->_RegisterPathNodeForBlock(
1063 newNode, newPtr, blockSize)) {
1064
1065 _mallocGlobalData->_CaptureMallocStack(
1066 newNode, newPtr, blockSize);
1067
1068 newNode->_totalBytes += blockSize;
1069 newNode->_numAllocations++;
1070 newNode->_callSite->_totalBytes += blockSize;
1071 _mallocGlobalData->_totalBytes += blockSize;
1072
1073 _mallocGlobalData->_maxTotalBytes =
1074 std::max(_mallocGlobalData->_totalBytes,
1075 _mallocGlobalData->_maxTotalBytes);
1076
1077 _mallocGlobalData->_RunDebugHookForNode(
1078 newNode, newPtr, blockSize);
1079 }
1080
1081 return newPtr;
1082 }
1083
1084 // See comment in _MallocWrapper.
1085 TF_VERIFY(!"Failed to register path for allocated block. "
1086 "Memory usage may be miscounted");
1087 return newPtr;
1088 }
1089
1090 void*
_MemalignWrapper(size_t alignment,size_t nBytes,const void *)1091 TfMallocTag::_MemalignWrapper(size_t alignment, size_t nBytes, const void*)
1092 {
1093 void* ptr = _mallocHook.Memalign(alignment, nBytes);
1094
1095 _ThreadData* td;
1096 if (_ShouldNotTag(&td) || ARCH_UNLIKELY(!ptr))
1097 return ptr;
1098
1099 tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
1100
1101 Tf_MallocPathNode* node = _GetCurrentPathNodeNoLock(td);
1102 size_t blockSize = Tf_GetMallocBlockSize(ptr, nBytes);
1103
1104 // Update malloc global data with bookkeeping information. This has to
1105 // happen while the mutex is held.
1106 _mallocGlobalData->_RegisterPathNodeForBlock(node, ptr, blockSize);
1107 _mallocGlobalData->_CaptureMallocStack(node, ptr, blockSize);
1108
1109 node->_totalBytes += blockSize;
1110 node->_numAllocations++;
1111 node->_callSite->_totalBytes += blockSize;
1112 _mallocGlobalData->_totalBytes += blockSize;
1113
1114 _mallocGlobalData->_maxTotalBytes = std::max(_mallocGlobalData->_totalBytes,
1115 _mallocGlobalData->_maxTotalBytes);
1116
1117 _mallocGlobalData->_RunDebugHookForNode(node, ptr, blockSize);
1118
1119 return ptr;
1120 }
1121
// Free hook.  Removes the block's bookkeeping entry (if it has one) before
// handing the pointer to the underlying free.
void
TfMallocTag::_FreeWrapper(void* ptr, const void*)
{
    if (!ptr)
        return;

    // If tagging is explicitly disabled, just do the free and skip
    // everything else. This avoids a deadlock if we get here while updating
    // Tf_MallocGlobalData::_pathNodeTable.
    _ThreadData* td;
    _Tagging tagState;
    if (_ShouldNotTag(&td, &tagState) && tagState == _TaggingDisabled) {
        _mallocHook.Free(ptr);
        return;
    }

    // Even when tagging is merely dormant we still try to unregister the
    // block, since it may have been registered while tagging was enabled.
    tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

    Tf_MallocBlockInfo info;
    if (_mallocGlobalData->_UnregisterPathNodeForBlock(ptr, &info)) {
        size_t bytesFreed = info.blockSize;
        Tf_MallocPathNode* node =
            _mallocGlobalData->_allPathNodes[info.pathNodeIndex];

        _mallocGlobalData->_RunDebugHookForNode(node, ptr, bytesFreed);

        // Check if we should release a malloc stack. This has to happen
        // while the mutex is held.
        _mallocGlobalData->_ReleaseMallocStack(node, ptr);

        // Subtract the freed bytes from the node, its call site, and the
        // global total.
        node->_totalBytes -= bytesFreed;
        node->_numAllocations -= (_DECREMENT_ALLOCATION_COUNTS) ? 1 : 0;
        node->_callSite->_totalBytes -= bytesFreed;
        _mallocGlobalData->_totalBytes -= bytesFreed;
    }

    _mallocHook.Free(ptr);
}
1160
// ptmalloc variant of the malloc hook.  Rather than a global block table,
// the tag path node's index is stashed with the block itself via
// _StoreIndexAndGetSize, which also reports the block's actual size.
void*
TfMallocTag::_MallocWrapper_ptmalloc(size_t nBytes, const void*)
{
    void* ptr = _mallocHook.Malloc(nBytes);

    _ThreadData* td;
    if (_ShouldNotTag(&td))
        return ptr;

    tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

    Tf_MallocPathNode* node = _GetCurrentPathNodeNoLock(td);
    size_t actualBytes;
    // NOTE(review): unlike _MallocWrapper, ptr is not checked for NULL
    // before being handed to _StoreIndexAndGetSize — confirm that helper
    // tolerates a failed allocation.
    _StoreIndexAndGetSize(ptr, &actualBytes, node->_index);

    // Check if we should capture a malloc stack. This has to happen while
    // the mutex is held.
    _mallocGlobalData->_CaptureMallocStack(node, ptr, actualBytes);

    node->_totalBytes += actualBytes;
    node->_numAllocations++;
    node->_callSite->_totalBytes += actualBytes;
    _mallocGlobalData->_totalBytes += actualBytes;

    // Track the high-water mark of total tagged bytes.
    _mallocGlobalData->_maxTotalBytes = std::max(_mallocGlobalData->_totalBytes,
                                                 _mallocGlobalData->_maxTotalBytes);

    _mallocGlobalData->_RunDebugHookForNode(node, ptr, actualBytes);

    return ptr;
}
1192
// ptmalloc variant of the realloc hook.  The old block's tag index and size
// are recovered from the block itself before the realloc; the new block has
// the current tag index stored into it afterwards.
void*
TfMallocTag::_ReallocWrapper_ptmalloc(void* oldPtr, size_t nBytes, const void*)
{
    /*
     * If ptr is NULL, we want to make sure we don't double count,
     * because a call to _mallocHook.Realloc(ptr, nBytes) could call
     * through to our malloc. To avoid this, we'll explicitly short-circuit
     * ourselves rather than trust that the malloc library will do it.
     */
    if (!oldPtr)
        return _MallocWrapper_ptmalloc(nBytes, NULL);

    /*
     * Account for the implicit free, and fix-up oldPtr
     * regardless of whether we're currently tagging or not:
     */
    uint32_t index;
    size_t bytesFreed;
    _ExtractIndexAndGetSize(oldPtr, &bytesFreed, &index);

    void* newPtr = _mallocHook.Realloc(oldPtr, nBytes);

    _ThreadData* td;
    if (_ShouldNotTag(&td))
        return newPtr;

    tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

    Tf_MallocPathNode* newNode = _GetCurrentPathNodeNoLock(td);
    size_t actualBytes;
    // NOTE(review): newPtr is not checked for NULL before
    // _StoreIndexAndGetSize — confirm that helper tolerates a failed
    // reallocation.
    _StoreIndexAndGetSize(newPtr, &actualBytes, newNode->_index);

    // Index 0 is reserved for blocks allocated while untracked (see
    // _Initialize), so only deduct the old block if it carried a real index.
    if (index) {
        Tf_MallocPathNode* oldNode = _mallocGlobalData->_allPathNodes[index];

        _mallocGlobalData->_RunDebugHookForNode(oldNode, oldPtr, bytesFreed);

        // Check if we should release a malloc stack. This has to happen while
        // the mutex is held.
        _mallocGlobalData->_ReleaseMallocStack(oldNode, oldPtr);

        oldNode->_totalBytes -= bytesFreed;
        oldNode->_numAllocations -= (_DECREMENT_ALLOCATION_COUNTS) ? 1 : 0;
        oldNode->_callSite->_totalBytes -= bytesFreed;
        _mallocGlobalData->_totalBytes -= bytesFreed;
    }

    // Check if we should capture a malloc stack. This has to happen while
    // the mutex is held.
    _mallocGlobalData->_CaptureMallocStack(newNode, newPtr, actualBytes);

    newNode->_totalBytes += actualBytes;
    newNode->_numAllocations++;
    newNode->_callSite->_totalBytes += actualBytes;
    _mallocGlobalData->_totalBytes += actualBytes;

    // Track the high-water mark of total tagged bytes.
    _mallocGlobalData->_maxTotalBytes = std::max(_mallocGlobalData->_totalBytes,
                                                 _mallocGlobalData->_maxTotalBytes);

    _mallocGlobalData->_RunDebugHookForNode(newNode, newPtr, actualBytes);

    return newPtr;
}
1256
// ptmalloc variant of the memalign hook; see _MallocWrapper_ptmalloc for
// the index-in-block bookkeeping scheme.
void*
TfMallocTag::_MemalignWrapper_ptmalloc(size_t alignment, size_t nBytes, const void*)
{
    void* ptr = _mallocHook.Memalign(alignment, nBytes);

    _ThreadData* td;
    if (_ShouldNotTag(&td))
        return ptr;

    tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);

    Tf_MallocPathNode* node = _GetCurrentPathNodeNoLock(td);
    size_t actualBytes;
    // NOTE(review): ptr is not checked for NULL before
    // _StoreIndexAndGetSize — confirm that helper tolerates a failed
    // allocation.
    _StoreIndexAndGetSize(ptr, &actualBytes, node->_index);

    // Check if we should capture a malloc stack. This has to happen while
    // the mutex is held.
    _mallocGlobalData->_CaptureMallocStack(node, ptr, actualBytes);

    node->_totalBytes += actualBytes;
    node->_numAllocations++;
    node->_callSite->_totalBytes += actualBytes;
    _mallocGlobalData->_totalBytes += actualBytes;

    // Track the high-water mark of total tagged bytes.
    _mallocGlobalData->_maxTotalBytes = std::max(_mallocGlobalData->_totalBytes,
                                                 _mallocGlobalData->_maxTotalBytes);

    _mallocGlobalData->_RunDebugHookForNode(node, ptr, actualBytes);

    return ptr;
}
1288
// ptmalloc variant of the free hook.  Recovers the tag index stored in the
// block, deducts the freed bytes, then frees the (fixed-up) pointer.
void
TfMallocTag::_FreeWrapper_ptmalloc(void* ptr, const void*)
{
    if (!ptr)
        return;

    /*
     * Make ptr safe in case it has index bits set:
     */
    uint32_t index;
    size_t bytesFreed;
    _ExtractIndexAndGetSize(ptr, &bytesFreed, &index);

    // Index 0 marks a block allocated while untracked (see _Initialize);
    // such blocks carry no bookkeeping to undo.
    if (index && TfMallocTag::_doTagging) {
        tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
        Tf_MallocPathNode* node = _mallocGlobalData->_allPathNodes[index];

        _mallocGlobalData->_RunDebugHookForNode(node, ptr, bytesFreed);

        // Check if we should release a malloc stack. This has to happen
        // while the mutex is held.
        _mallocGlobalData->_ReleaseMallocStack(node, ptr);

        node->_totalBytes -= bytesFreed;
        node->_numAllocations -= (_DECREMENT_ALLOCATION_COUNTS) ? 1 : 0;
        node->_callSite->_totalBytes -= bytesFreed;
        _mallocGlobalData->_totalBytes -= bytesFreed;
    }

    _mallocHook.Free(ptr);
}
1320
1321 bool
Initialize(string * errMsg)1322 TfMallocTag::Initialize(string* errMsg)
1323 {
1324 static bool status = _Initialize(errMsg);
1325 return status;
1326 }
1327
1328
1329 bool
GetCallTree(CallTree * tree,bool skipRepeated)1330 TfMallocTag::GetCallTree(CallTree* tree, bool skipRepeated)
1331 {
1332 tree->callSites.clear();
1333 tree->root.nBytes = tree->root.nBytesDirect = 0;
1334 tree->root.nAllocations = 0;
1335 tree->root.siteName.clear();
1336 tree->root.children.clear();
1337
1338 if (Tf_MallocGlobalData* gd = _mallocGlobalData) {
1339 TfMallocTag::_TemporaryTaggingState tmpState(_TaggingDisabled);
1340
1341 gd->_mutex.lock();
1342
1343 // Build the snapshot call tree
1344 gd->_rootNode->_BuildTree(&tree->root, skipRepeated);
1345
1346 // Build the snapshot callsites map based on the tree
1347 Tf_MallocCallSiteTable callSiteTable;
1348 Tf_GetCallSites(&tree->root, &callSiteTable);
1349
1350 // Copy the callsites into the calltree
1351 tree->callSites.reserve(callSiteTable.size());
1352 TF_FOR_ALL(csi, callSiteTable) {
1353 CallTree::CallSite cs = {
1354 csi->second->_name,
1355 static_cast<size_t>(csi->second->_totalBytes)
1356 };
1357 tree->callSites.push_back(cs);
1358 delete csi->second;
1359 }
1360
1361 gd->_BuildUniqueMallocStacks(tree);
1362
1363 gd->_mutex.unlock();
1364 return true;
1365 }
1366 else
1367 return false;
1368 }
1369
1370 size_t
GetTotalBytes()1371 TfMallocTag::GetTotalBytes()
1372 {
1373 if (!_mallocGlobalData)
1374 return 0;
1375
1376 tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
1377 return _mallocGlobalData->_totalBytes;
1378 }
1379
1380 size_t
GetMaxTotalBytes()1381 TfMallocTag::GetMaxTotalBytes()
1382 {
1383 if (!_mallocGlobalData)
1384 return 0;
1385
1386 tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
1387 return _mallocGlobalData->_maxTotalBytes;
1388 }
1389
1390 void
_SetTagging(_Tagging status)1391 TfMallocTag::_SetTagging(_Tagging status)
1392 {
1393 TfMallocTag::Tls::Find()->_tagState = status;
1394 }
1395
1396 TfMallocTag::_Tagging
_GetTagging()1397 TfMallocTag::_GetTagging()
1398 {
1399 return TfMallocTag::Tls::Find()->_tagState;
1400 }
1401
// One-time setup of the tagging system: creates the global data, the root
// path node, and installs the malloc hooks.  Returns false (with *errMsg
// set by the hook layer) if the hooks could not be installed.
bool
TfMallocTag::_Initialize(std::string* errMsg)
{
    /*
     * This is called from an EXECUTE_ONCE block, so no
     * need to lock anything.
     */
    TF_AXIOM(!_mallocGlobalData);
    _mallocGlobalData = new Tf_MallocGlobalData();

    // Note that we are *not* using the _TemporaryTaggingState object
    // here. We explicitly want the tagging set to enabled as the end
    // of this function so that all subsequent memory allocations are captured.
    _SetTagging(_TaggingDisabled);

    bool usePtmalloc = _UsePtmalloc();

    if (usePtmalloc) {
        // index 0 is reserved for untracked malloc/free's:
        _mallocGlobalData->_allPathNodes.push_back(NULL);
    }

    // Create the root path node that every thread's tag stack hangs off of.
    Tf_MallocCallSite* site = _mallocGlobalData->_GetOrCreateCallSite("__root");
    Tf_MallocPathNode* rootNode = new Tf_MallocPathNode(site);
    _mallocGlobalData->_rootNode = rootNode;
    (void) _mallocGlobalData->_RegisterPathNode(rootNode);
    TfMallocTag::Tls::Find()->_tagStack.reserve(64);
    TfMallocTag::Tls::Find()->_tagStack.push_back(rootNode);

    _SetTagging(_TaggingEnabled);

    TfMallocTag::_doTagging = true;

    // Install the hook set matching the allocator in use; the ptmalloc
    // variants store tag indices in the blocks themselves rather than in a
    // global block table.
    if (usePtmalloc) {
        return _mallocHook.Initialize(_MallocWrapper_ptmalloc,
                                      _ReallocWrapper_ptmalloc,
                                      _MemalignWrapper_ptmalloc,
                                      _FreeWrapper_ptmalloc,
                                      errMsg);
    }
    else {
        return _mallocHook.Initialize(_MallocWrapper,
                                      _ReallocWrapper,
                                      _MemalignWrapper,
                                      _FreeWrapper,
                                      errMsg);
    }
}
1450
// Convenience overload: forwards to the c-string version.
void
TfMallocTag::Auto::_Begin(const string& name)
{
    _Begin(name.c_str());
}
1456
// Pushes a new tag onto the calling thread's tag stack.  On success,
// _threadData is left pointing at the thread's record so _End can pop the
// tag; on failure it is reset to NULL, making the matching _End a no-op.
void
TfMallocTag::Auto::_Begin(const char* name)
{
    // Empty or missing names are silently ignored.
    if (!name || !name[0])
        return;

    _threadData = TfMallocTag::Tls::Find();

    // Disable tagging while manipulating the tag structures below, so the
    // allocations they make are not themselves tagged.
    _threadData->_tagState = _TaggingDisabled;
    Tf_MallocPathNode* thisNode;
    Tf_MallocCallSite* site;

    {
        tbb::spin_mutex::scoped_lock lock(_mallocGlobalData->_mutex);
        site = _mallocGlobalData->_GetOrCreateCallSite(name);

        // Grow the per-thread callsite counters to cover this site's index.
        if (_threadData->_callSiteOnStack.size() <= site->_index) {
            if (_threadData->_callSiteOnStack.capacity() == 0)
                _threadData->_callSiteOnStack.reserve(128);
            _threadData->_callSiteOnStack.resize(site->_index + 1, 0);
        }


        // Find (or create) this tag's node under the current top of stack,
        // falling back to the global root when the stack is empty.
        if (_threadData->_tagStack.empty())
            thisNode = _mallocGlobalData->_rootNode->_GetOrCreateChild(site);
        else
            thisNode = _threadData->_tagStack.back()->_GetOrCreateChild(site);

        // A nonzero counter means this callsite already appears somewhere on
        // this thread's stack, i.e. the tag is repeated.
        if (_threadData->_callSiteOnStack[site->_index]) {
            thisNode->_repeated = true;
        }
    }

    if (thisNode) {
        _threadData->_tagStack.push_back(thisNode);
        _threadData->_callSiteOnStack[site->_index] += 1;
        _threadData->_tagState = _TaggingEnabled;
    }
    else {
        // No node could be produced; re-enable tagging but mark this Auto
        // inactive so _End does nothing.
        _threadData->_tagState = _TaggingEnabled;
        _threadData = NULL;
    }
}
1500
// Pops the tag pushed by _Begin.  Dereferences _threadData unconditionally,
// so this must only be called when _Begin succeeded (_threadData non-NULL)
// — presumably guaranteed by the caller; TODO confirm.
void
TfMallocTag::Auto::_End()
{
    Tf_MallocPathNode* node = _threadData->_tagStack.back();
    TF_AXIOM(_threadData->_callSiteOnStack[node->_callSite->_index] > 0);
    _threadData->_callSiteOnStack[node->_callSite->_index] -= 1;
    _threadData->_tagStack.pop_back();
}
1509
// Pops the top tag from the calling thread's tag stack, optionally
// verifying that it matches the given name.
void
TfMallocTag::Pop(const char* name)
{
    // No-op unless the tagging system is active.
    if (!TfMallocTag::_doTagging)
        return;

    _ThreadData* threadData = TfMallocTag::Tls::Find();
    // NOTE(review): assumes the tag stack is non-empty, i.e. every Pop is
    // matched by a prior push — confirm callers maintain this invariant.
    Tf_MallocPathNode* node = threadData->_tagStack.back();

    if (name && node->_callSite->_name != name) {
        TF_CODING_ERROR("mismatched call Pop(\"%s\"); top of stack is \"%s\"",
                        name, node->_callSite->_name.c_str());
    }

    TF_AXIOM(threadData->_callSiteOnStack[node->_callSite->_index] > 0);
    threadData->_callSiteOnStack[node->_callSite->_index] -= 1;
    threadData->_tagStack.pop_back();
}
1528
// Returns the given number as a string with commas used as thousands
// separators, e.g. 1234567 -> "1,234,567".
//
static std::string
_GetAsCommaSeparatedString(size_t number)
{
    // std::to_string avoids the "%ld" printf specifier used previously,
    // which has the wrong width for size_t on LLP64 platforms (e.g. 64-bit
    // Windows, where long is 32 bits).
    const std::string digits = std::to_string(number);

    std::string result;
    result.reserve(digits.size() + digits.size() / 3);

    // Insert a comma before every group of three trailing digits.
    size_t remaining = digits.size();
    for (char c : digits) {
        if (remaining < digits.size() && remaining % 3 == 0) {
            result.push_back(',');
        }
        result.push_back(c);
        --remaining;
    }
    return result;
}
1549
1550 static void
_PrintHeader(string * rpt)1551 _PrintHeader(string *rpt)
1552 {
1553 *rpt += "\n" + string(80, '-') + "\n";
1554 *rpt += TfStringPrintf("\nMalloc Tag Report\n\n\n");
1555 *rpt += TfStringPrintf("Total bytes = %s\n\n\n",
1556 _GetAsCommaSeparatedString(TfMallocTag::GetTotalBytes()).c_str());
1557 }
1558
// Recursively prints one call-tree node (and its children) into *rpt,
// stopping once maxPrintedNodes nodes have been emitted.  Returns the
// tree-wide total bytes (captured from the root on the level-0 call) so the
// caller can compare it against the grand total.
static size_t
_PrintMallocNode(
    string* rpt,
    const TfMallocTag::CallTree::PathNode &node,
    size_t rootTotal,
    size_t parentTotal,
    size_t level,
    size_t &printedNodes,
    size_t maxPrintedNodes)
{
    if (!level) {
        // Root call: emit the column headers and establish the tree total
        // used for the %Totl column.
        // XXX:cleanup We should pass in maxNameWidth and generate format
        // strings like in _PrintMallocCallSites().
        *rpt += TfStringPrintf("%-72s %15s%15s %5s %5s %5s\n", "TAGNAME",
                               "BytesIncl", "BytesExcl", "%Prnt", "% Exc",
                               "%Totl");
        *rpt += TfStringPrintf("%-72s %12s%12s %5s %5s %5s\n\n",
                               string(72, '-').c_str(),
                               " --------------", " --------------",
                               "-----", "-----", "-----");

        rootTotal = node.nBytes;
    }

    size_t maxNameWidth = 72;
    size_t indent = level;

    // Stop recursing once the node budget is exhausted.
    if (printedNodes >= maxPrintedNodes) {
        return 0;
    }
    printedNodes++;

    // Indent by tree depth and pad the (possibly truncated) name to a fixed
    // column width.
    string name = string(indent, ' ') +
        node.siteName.substr(0, maxNameWidth-indent);
    int postLen = static_cast<int>(maxNameWidth - name.length());
    if (postLen > 0) {
        name += string(postLen, ' ');
    }

    *rpt += TfStringPrintf(
        "%s %15s%15s ",
        name.c_str(),
        _GetAsCommaSeparatedString(node.nBytes).c_str(),
        _GetAsCommaSeparatedString(node.nBytesDirect).c_str());

    // Percent columns are left blank when they round to 0%.
    string curPercent;
    string curPercentDirect;
    string percentDirectOfRoot;

    if (parentTotal) {

        float percent = node.nBytes/(float)parentTotal*100;
        if (percent > 0.5) {
            curPercent = TfStringPrintf(" %.0f%%", percent);
        }
        // NOTE(review): if node.nBytes is 0 here this divides by zero
        // (yielding inf/nan in the float) — confirm nodes reached with a
        // nonzero parentTotal always have nBytes > 0.
        percent = node.nBytesDirect/(float)node.nBytes*100;
        if (percent > 0.5) {
            curPercentDirect = TfStringPrintf(" %.0f%%", percent);
        }

        percent = node.nBytesDirect/(float)rootTotal*100;
        if (percent > 0.5) {
            percentDirectOfRoot = TfStringPrintf(" %.0f%%", percent);
        }
    }

    if (!level) {
        // For Root, take the bytesDirect as the rootPercentage

        float percent = 100*node.nBytesDirect/(float)rootTotal;
        if (percent > 0.5) {
            percentDirectOfRoot = TfStringPrintf(" %.0f%%", percent);
        }
    }
    *rpt += TfStringPrintf("%5s %5s %5s\n", curPercent.c_str(),
                           curPercentDirect.c_str(),
                           percentDirectOfRoot.c_str());

    // Recurse into the children; they share this node's rootTotal.
    vector<TfMallocTag::CallTree::PathNode>::const_iterator it;
    for (it = node.children.begin(); it != node.children.end(); ++it) {
        _PrintMallocNode(rpt, *it, rootTotal, node.nBytes, level+1,
                         printedNodes,
                         maxPrintedNodes);
    }

    return rootTotal;
}
1646
1647 static void
_PrintMallocCallSites(string * rpt,const vector<TfMallocTag::CallTree::CallSite> & callSites,size_t rootTotal)1648 _PrintMallocCallSites(
1649 string* rpt,
1650 const vector<TfMallocTag::CallTree::CallSite>& callSites,
1651 size_t rootTotal)
1652 {
1653 *rpt += TfStringPrintf("\n\nCall Sites\n\n");
1654
1655 // Use a map to sort by allocation size.
1656 map<size_t, const string *> map;
1657 TF_FOR_ALL(csi, callSites) {
1658 map.insert(make_pair(csi->nBytes, &csi->name));
1659 }
1660
1661 // XXX:cleanup We should pass in maxNameWidth.
1662 const size_t maxNameWidth = 72;
1663 const size_t maxBytesWidth = 15;
1664 const size_t maxPercentageWidth = 15;
1665
1666 string fmt = TfStringPrintf(
1667 "%%-%lds %%%lds %%%lds\n",
1668 maxNameWidth, maxBytesWidth, maxPercentageWidth);
1669
1670 *rpt += TfStringPrintf(fmt.c_str(), "NAME", "BYTES", "%ROOT");
1671 *rpt += string(maxNameWidth, '-') + ' ' +
1672 string(maxBytesWidth, '-') + ' ' +
1673 string(maxPercentageWidth, '-') + "\n\n";
1674
1675 TF_REVERSE_FOR_ALL(it, map) {
1676 size_t nBytes = it->first;
1677 const string &name = *it->second;
1678
1679 string curPercent;
1680 if (rootTotal) {
1681 double percent = 100.0*nBytes/rootTotal;
1682 // Don't print anything less than 0.1%.
1683 if (percent < 0.1) {
1684 break;
1685 }
1686 curPercent = TfStringPrintf("%.1f%%", percent);
1687 }
1688
1689 *rpt += TfStringPrintf(
1690 fmt.c_str(),
1691 name.substr(0, maxNameWidth).c_str(),
1692 _GetAsCommaSeparatedString(nBytes).c_str(),
1693 curPercent.c_str());
1694 }
1695 }
1696
1697 // Comparison for sorting CallTree::PathNodes.
1698 //
1699 static bool
_MallocPathNodeLessThan(const TfMallocTag::CallTree::PathNode * lhs,const TfMallocTag::CallTree::PathNode * rhs)1700 _MallocPathNodeLessThan(
1701 const TfMallocTag::CallTree::PathNode *lhs,
1702 const TfMallocTag::CallTree::PathNode *rhs)
1703 {
1704 return lhs->siteName < rhs->siteName;
1705 }
1706
1707 #if !(_DECREMENT_ALLOCATION_COUNTS)
1708 // Returns the total number of allocations in the given sub-tree.
1709 //
1710 static int64_t
_GetNumAllocationInSubTree(const TfMallocTag::CallTree::PathNode & node)1711 _GetNumAllocationInSubTree(
1712 const TfMallocTag::CallTree::PathNode &node)
1713 {
1714 int64_t nAllocations = node.nAllocations;
1715 TF_FOR_ALL(it, node.children) {
1716 nAllocations += _GetNumAllocationInSubTree(*it);
1717 }
1718 return nAllocations;
1719 }
1720 #endif
1721
// Recursively writes one call-tree node (and its children, sorted by site
// name) to the given stream.  A non-empty rootName, if supplied, is printed
// in place of the node's own site name.
static void
_ReportMallocNode(
    std::ostream &out,
    const TfMallocTag::CallTree::PathNode &node,
    size_t level,
    const std::string *rootName = nullptr)
{
    // Prune empty branches.
    if (node.nBytes == 0) {
#if _DECREMENT_ALLOCATION_COUNTS
        return;
#else
        // Allocation counts are never decremented in this configuration, so
        // a zero-byte subtree may still carry nonzero sample counts worth
        // reporting.
        if (_GetNumAllocationInSubTree(node) == 0) {
            return;
        }
#endif
    }

    string indent(2*level, ' ');

    // Insert '|' characters every 4 spaces.
    for (size_t i=0; i<(level + 1)/2; i++) {
        indent[4*i] = '|';
    }

    out << TfStringPrintf(
        "%13s B %13s B %7ld samples ",
        _GetAsCommaSeparatedString(node.nBytes).c_str(),
        _GetAsCommaSeparatedString(node.nBytesDirect).c_str(),
        node.nAllocations);

    out << indent
        << (rootName && !rootName->empty() ? *rootName : node.siteName)
        << std::endl;

    // Sort the children by name. The reason for doing this is that it is
    // the easiest way to provide stable results for diffing. You could
    // argue that letting the diff program do the sorting is more correct
    // (i.e. that sorting is a view into the unaltered source data).
    std::vector<const TfMallocTag::CallTree::PathNode *> sortedChildren;
    sortedChildren.reserve(node.children.size());
    TF_FOR_ALL(it, node.children) {
        sortedChildren.push_back(&(*it));
    }

    std::sort(
        sortedChildren.begin(), sortedChildren.end(), _MallocPathNodeLessThan);

    TF_FOR_ALL(it, sortedChildren) {
        _ReportMallocNode(out, **it, level+1);
    }
}
1774
1775 static void
_ReportCapturedMallocStacks(std::ostream & out,const std::vector<TfMallocTag::CallStackInfo> & stackInfos)1776 _ReportCapturedMallocStacks(
1777 std::ostream &out,
1778 const std::vector<TfMallocTag::CallStackInfo> &stackInfos)
1779 {
1780 size_t numReportedStacks =
1781 TfMin(stackInfos.size(), _MaxReportedMallocStacks);
1782
1783 size_t totalSize = 0;
1784 size_t totalNumAllocations = 0;
1785 size_t reportSize = 0;
1786 size_t reportNumAllocations = 0;
1787
1788 for(size_t n=0; n<stackInfos.size(); n++) {
1789 const TfMallocTag::CallStackInfo &stackInfo = stackInfos[n];
1790 totalSize += stackInfo.size;
1791 totalNumAllocations += stackInfo.numAllocations;
1792 if (n < numReportedStacks) {
1793 reportSize += stackInfo.size;
1794 reportNumAllocations += stackInfo.numAllocations;
1795 }
1796 }
1797
1798 out << "\n\n\n"
1799 << "Captured Malloc Stacks\n"
1800 << "\n"
1801 << "Number of unique captured malloc stacks: "
1802 << _GetAsCommaSeparatedString(stackInfos.size()) << "\n"
1803 << "Total allocated memory by captured mallocs: "
1804 << _GetAsCommaSeparatedString(totalSize) << "\n"
1805 << "Total number of allocations by captured mallocs: "
1806 << _GetAsCommaSeparatedString(totalNumAllocations) << "\n"
1807 << "\n"
1808 << "Number of captured malloc stacks in report: "
1809 << _GetAsCommaSeparatedString(numReportedStacks) << "\n"
1810 << "Allocated memory by mallocs in report: "
1811 << _GetAsCommaSeparatedString(reportSize) << "\n"
1812 << "Number of allocations by mallocs in report: "
1813 << _GetAsCommaSeparatedString(reportNumAllocations) << "\n"
1814 << "Percentage of allocated memory covered by report: "
1815 << TfStringPrintf("%.1f%%", 100.0*reportSize/totalSize) << "\n\n";
1816
1817 for(size_t n=0; n<numReportedStacks; n++) {
1818 const TfMallocTag::CallStackInfo &stackInfo = stackInfos[n];
1819
1820 out << string(100, '-') << "\n"
1821 << "Captured malloc stack #" << n << "\n"
1822 << "Size: " <<
1823 _GetAsCommaSeparatedString(stackInfo.size) << "\n"
1824 << "Num allocations: " <<
1825 _GetAsCommaSeparatedString(stackInfo.numAllocations) << "\n";
1826
1827 ArchPrintStackFrames(out, stackInfo.stack);
1828 }
1829 }
1830
1831
1832 string
GetPrettyPrintString(PrintSetting setting,size_t maxPrintedNodes) const1833 TfMallocTag::CallTree::GetPrettyPrintString(PrintSetting setting,
1834 size_t maxPrintedNodes) const
1835 {
1836 string rpt;
1837
1838 _PrintHeader(&rpt);
1839
1840 if (setting == TREE || setting == BOTH) {
1841 size_t printedNodes = 0;
1842 size_t reportedMem =
1843 _PrintMallocNode(&rpt, this->root, 0, 0, 0, printedNodes,
1844 maxPrintedNodes);
1845 if (printedNodes >= maxPrintedNodes
1846 && reportedMem != GetTotalBytes()) {
1847 rpt += TfStringPrintf("\nWARNING: limit of %zu nodes visted, but "
1848 "only %zu bytes of %zu accounted for. "
1849 "Running with a larger maxPrintedNodes will "
1850 "produce more accurate results.\n",
1851 maxPrintedNodes,
1852 reportedMem,
1853 GetTotalBytes());
1854
1855 }
1856 }
1857
1858 if (setting == CALLSITES || setting == BOTH) {
1859 _PrintMallocCallSites(&rpt, this->callSites, this->root.nBytes);
1860 }
1861
1862 return rpt;
1863 }
1864
1865 void
Report(std::ostream & out) const1866 TfMallocTag::CallTree::Report(
1867 std::ostream &out) const
1868 {
1869 const std::string emptyRootName;
1870 Report(out, emptyRootName);
1871 }
1872
// Writes the full report to out: the tree view (with the root relabeled to
// rootName when non-empty), the dominant call sites, and any captured
// malloc stacks.
void
TfMallocTag::CallTree::Report(
    std::ostream &out,
    const std::string &rootName) const
{
    out << "\nTree view  ==============\n";
    out << "      inclusive       exclusive\n";

    _ReportMallocNode(out, this->root, 0, &rootName);

    // Also add the dominant call sites to the report.
    out << GetPrettyPrintString(CALLSITES);

    // And the captured malloc stacks if there are any.
    if (!this->capturedCallStacks.empty()) {
        _ReportCapturedMallocStacks(out, this->capturedCallStacks);
    }
}
1891
// Saves the calling thread's current tagging state and installs tempStatus
// in its place; the destructor restores the saved state.
TfMallocTag::
_TemporaryTaggingState::_TemporaryTaggingState(_Tagging tempStatus)
    : _oldState(TfMallocTag::_GetTagging())
{
    TfMallocTag::_SetTagging(tempStatus);
}
1898
// Restores the tagging state saved by the constructor.
TfMallocTag::_TemporaryTaggingState::~_TemporaryTaggingState()
{
    TfMallocTag::_SetTagging(_oldState);
}
1903
1904 PXR_NAMESPACE_CLOSE_SCOPE
1905