// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Ken Ashcraft <opensource@google.com>

#include <config.h>
#include "thread_cache.h"
#include <errno.h>
#include <string.h>                     // for memcpy
#include <algorithm>                    // for max, min
#include "base/commandlineflags.h"      // for EnvToInt64
#include "base/spinlock.h"              // for SpinLockHolder
#include "getenv_safe.h"                // for TCMallocGetenvSafe
#include "central_freelist.h"           // for CentralFreeListPadded
#include "maybe_threads.h"

using std::min;
using std::max;

// Note: this is initialized manually in InitModule to ensure that
// it's configured at the right time.
//
// DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
//              EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
//                         kDefaultOverallThreadCacheSize),
//              "Bound on the total amount of bytes allocated to "
//              "thread caches. This bound is not strict, so it is possible "
//              "for the cache to go over this bound in certain circumstances. "
//              "Maximum value of this flag is capped to 1 GB.");
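//
// A minimal usage sketch: the same limit can be supplied through the
// environment variable read in InitModule() below (the binary name here is
// just a placeholder):
//
//   TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=268435456 ./your_program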


namespace tcmalloc {

static bool phinited = false;

volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
PageHeapAllocator<ThreadCache> threadcache_allocator;
ThreadCache* ThreadCache::thread_heaps_ = NULL;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = NULL;
#ifdef HAVE_TLS
__thread ThreadCache::ThreadLocalData ThreadCache::threadlocal_data_
    ATTR_INITIAL_EXEC CACHELINE_ALIGNED;
#endif
bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;

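// Initialize this thread's cache: claim an initial max_size_ budget from the
// shared pool, reset every per-class free list, and seed the sampler from
// the thread id.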
void ThreadCache::Init(pthread_t tid) {
  size_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    SetMaxSize(kMinThreadCacheSize);

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = NULL;
  prev_ = NULL;
  tid_  = tid;
  in_setspecific_ = false;
  for (uint32 cl = 0; cl < Static::num_size_classes(); ++cl) {
    list_[cl].Init(Static::sizemap()->class_to_size(cl));
  }

  uint32_t sampler_seed;
  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
  sampler_.Init(sampler_seed);
}

void ThreadCache::Cleanup() {
  // Put unused memory back into central cache
  for (uint32 cl = 0; cl < Static::num_size_classes(); ++cl) {
    if (list_[cl].length() > 0) {
      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
    }
  }
}

// Remove some objects of class "cl" from central cache and add to thread heap.
// On success, return the first object for immediate use; otherwise return NULL.
void* ThreadCache::FetchFromCentralCache(uint32 cl, int32_t byte_size,
                                         void *(*oom_handler)(size_t size)) {
  FreeList* list = &list_[cl];
  ASSERT(list->empty());
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);

  const int num_to_move = min<int>(list->max_length(), batch_size);
  void *start, *end;
  int fetch_count = Static::central_cache()[cl].RemoveRange(
      &start, &end, num_to_move);

  if (fetch_count == 0) {
    ASSERT(start == NULL);
    return oom_handler(byte_size);
  }
  ASSERT(start != NULL);

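  // Hand the first fetched object straight back to the caller; the remaining
  // objects (everything after 'start') go onto the free list and are charged
  // to size_.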
  if (--fetch_count >= 0) {
    size_ += byte_size * fetch_count;
    list->PushRange(fetch_count, SLL_Next(start), end);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
    int new_length = min<int>(list->max_length() + batch_size,
                              kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return start;
}

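// Called from the deallocation slow path when a free list has grown past its
// max_length: account for the object the caller just pushed, return a batch
// of objects to the central cache, and nudge max_length toward the central
// cache's batch size.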
void ThreadCache::ListTooLong(FreeList* list, uint32 cl) {
  size_ += list->object_size();

  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
  ReleaseToCentralCache(list, cl, batch_size);

  // If the list is too long, we need to transfer some number of
  // objects to the central cache.  Ideally, we would transfer
  // num_objects_to_move, so the code below tries to make max_length
  // converge on num_objects_to_move.

  if (list->max_length() < batch_size) {
    // Slow start the max_length so we don't overreserve.
    list->set_max_length(list->max_length() + 1);
  } else if (list->max_length() > batch_size) {
    // If we consistently go over max_length, shrink max_length.  If we don't
    // shrink it, some amount of memory will always stay in this freelist.
    list->set_length_overages(list->length_overages() + 1);
    if (list->length_overages() > kMaxOverages) {
      ASSERT(list->max_length() > batch_size);
      list->set_max_length(list->max_length() - batch_size);
      list->set_length_overages(0);
    }
  }

  if (PREDICT_FALSE(size_ > max_size_)) {
    Scavenge();
  }
}

// Remove some objects of class "cl" from thread heap and add to central cache
void ThreadCache::ReleaseToCentralCache(FreeList* src, uint32 cl, int N) {
  ASSERT(src == &list_[cl]);
  if (N > src->length()) N = src->length();
  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);

  // We return prepackaged chains of the correct size to the central cache.
  // TODO: Use the same format internally in the thread caches?
  int batch_size = Static::sizemap()->num_objects_to_move(cl);
  while (N > batch_size) {
    void *tail, *head;
    src->PopRange(batch_size, &head, &tail);
    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
    N -= batch_size;
  }
  void *tail, *head;
  src->PopRange(N, &head, &tail);
  Static::central_cache()[cl].InsertRange(head, tail, N);
  size_ -= delta_bytes;
}

// Release idle memory to the central cache
void ThreadCache::Scavenge() {
  // If the low-water mark for the free list is L, it means we would
  // not have had to allocate anything from the central cache even if
  // we had reduced the free list size by L.  We aim to get closer to
  // that situation by dropping L/2 nodes from the free list.  This
  // may not release much memory, but if so we will call scavenge again
  // pretty soon and the low-water marks will be high on that call.
  for (int cl = 0; cl < Static::num_size_classes(); cl++) {
    FreeList* list = &list_[cl];
    const int lowmark = list->lowwatermark();
    if (lowmark > 0) {
      const int drop = (lowmark > 1) ? lowmark/2 : 1;
      ReleaseToCentralCache(list, cl, drop);

      // Shrink the max length if it isn't used.  Only shrink down to
      // batch_size -- if the thread was active enough to get the max_length
      // above batch_size, it will likely be that active again.  If
      // max_length shrinks below batch_size, the thread will have to
      // go through the slow-start behavior again.  The slow-start is useful
      // mainly for threads that stay relatively idle for their entire
      // lifetime.
      const int batch_size = Static::sizemap()->num_objects_to_move(cl);
      if (list->max_length() > batch_size) {
        list->set_max_length(
            max<int>(list->max_length() - batch_size, batch_size));
      }
    }
    list->clear_lowwatermark();
  }

  IncreaseCacheLimit();
}

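// Grow this thread's max_size_ by kStealAmount, taken either from the shared
// unclaimed pool or from another thread cache's budget.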
void ThreadCache::IncreaseCacheLimit() {
  SpinLockHolder h(Static::pageheap_lock());
  IncreaseCacheLimitLocked();
}

void ThreadCache::IncreaseCacheLimitLocked() {
  if (unclaimed_cache_space_ > 0) {
    // Possibly make unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kStealAmount;
    SetMaxSize(max_size_ + kStealAmount);
    return;
  }
  // Don't hold pageheap_lock too long.  Try to steal from 10 other
  // threads before giving up.  The i < 10 condition also prevents an
  // infinite loop in case none of the existing thread heaps are
  // suitable places to steal from.
  for (int i = 0; i < 10;
       ++i, next_memory_steal_ = next_memory_steal_->next_) {
    // Reached the end of the linked list.  Start at the beginning.
    if (next_memory_steal_ == NULL) {
      ASSERT(thread_heaps_ != NULL);
      next_memory_steal_ = thread_heaps_;
    }
    if (next_memory_steal_ == this ||
        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
      continue;
    }
    next_memory_steal_->SetMaxSize(next_memory_steal_->max_size_ - kStealAmount);
    SetMaxSize(max_size_ + kStealAmount);

    next_memory_steal_ = next_memory_steal_->next_;
    return;
  }
}

int ThreadCache::GetSamplePeriod() {
  return Sampler::GetSamplePeriod();
}

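// One-time initialization of tcmalloc's global state.  Guarded by phinited,
// so repeated calls are cheap no-ops.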
void ThreadCache::InitModule() {
  {
    SpinLockHolder h(Static::pageheap_lock());
    if (phinited) {
      return;
    }
    const char *tcb = TCMallocGetenvSafe("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES");
    if (tcb) {
      set_overall_thread_cache_size(strtoll(tcb, NULL, 10));
    }
    Static::InitStaticVars();
    threadcache_allocator.Init();
    phinited = true;
  }

  // We do the "late" part of initialization without holding the lock
  // since there is a chance it'll recurse into malloc.
  Static::InitLateMaybeRecursive();
}

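// Create the pthread TSD key so that DestroyThreadCache() runs when a thread
// exits, returning its cached memory to the central free lists.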
void ThreadCache::InitTSD() {
  ASSERT(!tsd_inited_);
  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
  tsd_inited_ = true;

#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
  // We may have used a fake pthread_t for the main thread.  Fix it.
  pthread_t zero;
  memset(&zero, 0, sizeof(zero));
  SpinLockHolder h(Static::pageheap_lock());
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    if (h->tid_ == zero) {
      h->tid_ = pthread_self();
    }
  }
#endif
}

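// Return the calling thread's cache, creating and registering one if it does
// not exist yet.  Takes care of recursive malloc calls that can be triggered
// by pthread_setspecific() during creation.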
ThreadCache* ThreadCache::CreateCacheIfNecessary() {
  if (!tsd_inited_) {
#ifndef NDEBUG
    // Test that freeing NULL very early works.
    free(NULL);
#endif

    InitModule();
  }

  // Initialize per-thread data if necessary
  ThreadCache* heap = NULL;

  bool search_condition = true;
#ifdef HAVE_TLS
  static __thread ThreadCache** current_heap_ptr ATTR_INITIAL_EXEC;
  if (tsd_inited_) {
    // In the most common case we avoid the expensive linear search
    // through all heaps (see below).  Working TLS also enables faster
    // protection from malloc recursion in pthread_setspecific.
    search_condition = false;

    if (current_heap_ptr != NULL) {
      // we're being recursively called by pthread_setspecific below.
      return *current_heap_ptr;
    }
    current_heap_ptr = &heap;
  }
#endif

  {
    SpinLockHolder h(Static::pageheap_lock());
    // On some old glibc's, and on freebsd's libc (as of freebsd 8.1),
    // calling pthread routines (even pthread_self) too early could
    // cause a segfault.  Since we can call pthreads quite early, we
    // have to protect against that in such situations by making a
    // 'fake' pthread.  This is not ideal since it doesn't work well
    // when linking tcmalloc statically with apps that create threads
    // before main, so we only do it if we have to.
#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
    pthread_t me;
    if (!tsd_inited_) {
      memset(&me, 0, sizeof(me));
    } else {
      me = pthread_self();
    }
#else
    const pthread_t me = pthread_self();
#endif

    // This may be a recursive malloc call from pthread_setspecific().
    // In that case, the heap for this thread has already been created
    // and added to the linked list, so we search for that first.
    if (search_condition) {
      for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
        if (h->tid_ == me) {
          heap = h;
          break;
        }
      }
    }

    if (heap == NULL) heap = NewHeap(me);
  }

  // We call pthread_setspecific() outside the lock because it may
  // call malloc() recursively.  We check for the recursive call using
  // the "in_setspecific_" flag so that we can avoid calling
  // pthread_setspecific() if we are already inside pthread_setspecific().
  if (!heap->in_setspecific_ && tsd_inited_) {
    heap->in_setspecific_ = true;
    perftools_pthread_setspecific(heap_key_, heap);
#ifdef HAVE_TLS
    // Also keep a copy in __thread for faster retrieval
    threadlocal_data_.heap = heap;
    threadlocal_data_.fast_path_heap = heap;
#endif
    heap->in_setspecific_ = false;
  }
#ifdef HAVE_TLS
  current_heap_ptr = NULL;
#endif
  return heap;
}

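// Allocate and register a new ThreadCache for the given thread id.  The
// caller must hold Static::pageheap_lock().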
ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache *heap = threadcache_allocator.New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = NULL;
  if (thread_heaps_ != NULL) {
    thread_heaps_->prev_ = heap;
  } else {
    // This is the only thread heap at the moment.
    ASSERT(next_memory_steal_ == NULL);
    next_memory_steal_ = heap;
  }
  thread_heaps_ = heap;
  thread_heap_count_++;
  return heap;
}

void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;              // No caches yet
  ThreadCache* heap = GetThreadHeap();
  if (heap == NULL) return;             // No thread cache to remove
  if (heap->in_setspecific_) return;    // Do not disturb the active caller

  heap->in_setspecific_ = true;
  perftools_pthread_setspecific(heap_key_, NULL);
#ifdef HAVE_TLS
  // Also update the copy in __thread
  threadlocal_data_.heap = NULL;
  threadlocal_data_.fast_path_heap = NULL;
#endif
  heap->in_setspecific_ = false;
  if (GetThreadHeap() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}

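// Return this thread's cached objects to the central free lists but keep the
// cache registered, so the thread can continue allocating cheaply later.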
void ThreadCache::BecomeTemporarilyIdle() {
  ThreadCache* heap = GetCacheIfPresent();
  if (heap)
    heap->Cleanup();
}

void ThreadCache::DestroyThreadCache(void* ptr) {
  // Note that "ptr" cannot be NULL since pthread promises not
  // to invoke the destructor on NULL values, but for safety,
  // we check anyway.
  if (ptr == NULL) return;
#ifdef HAVE_TLS
  // Prevent fast path of GetThreadHeap() from returning heap.
  threadlocal_data_.heap = NULL;
  threadlocal_data_.fast_path_heap = NULL;
#endif
  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
}

void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  SpinLockHolder h(Static::pageheap_lock());
  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
  unclaimed_cache_space_ += heap->max_size_;

  threadcache_allocator.Delete(heap);
}

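// Recompute per_thread_cache_size_ after overall_thread_cache_size_ changes,
// scaling existing heaps down if the total budget shrank.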
void ThreadCache::RecomputePerThreadCacheSize() {
  // Divide available space across threads
  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
  size_t space = overall_thread_cache_size_ / n;

  // Limit to allowed range
  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

  double ratio = space / max<double>(1, per_thread_cache_size_);
  size_t claimed = 0;
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    // Increasing the total cache size should not circumvent the
    // slow-start growth of max_size_.
    if (ratio < 1.0) {
      h->SetMaxSize(h->max_size_ * ratio);
    }
    claimed += h->max_size_;
  }
  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
  per_thread_cache_size_ = space;
}

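// Accumulate the bytes cached by every thread heap into *total_bytes; if
// class_count is non-NULL, also accumulate per-size-class free list lengths.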
void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    *total_bytes += h->Size();
    if (class_count) {
      for (int cl = 0; cl < Static::num_size_classes(); ++cl) {
        class_count[cl] += h->freelist_length(cl);
      }
    }
  }
}

void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
  // Clip the value to a reasonable range
  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
  overall_thread_cache_size_ = new_size;

  RecomputePerThreadCacheSize();
}

}  // namespace tcmalloc