1 // -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*-
2 // Copyright (c) 2008, Google Inc.
3 // All rights reserved.
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 // ---
32 // Author: Ken Ashcraft <opensource@google.com>
33
34 #include <config.h>
35 #include "thread_cache.h"
36 #include <errno.h>
37 #include <string.h> // for memcpy
38 #include <algorithm> // for max, min
39 #include "base/commandlineflags.h" // for SpinLockHolder
40 #include "base/spinlock.h" // for SpinLockHolder
41 #include "getenv_safe.h" // for TCMallocGetenvSafe
42 #include "central_freelist.h" // for CentralFreeListPadded
43 #include "maybe_threads.h"
44
45 using std::min;
46 using std::max;
47
48 // Note: this is initialized manually in InitModule to ensure that
49 // it's configured at right time
50 //
51 // DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
52 // EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
53 // kDefaultOverallThreadCacheSize),
54 // "Bound on the total amount of bytes allocated to "
55 // "thread caches. This bound is not strict, so it is possible "
56 // "for the cache to go over this bound in certain circumstances. "
57 // "Maximum value of this flag is capped to 1 GB.");
58
59
60 namespace tcmalloc {
61
62 static bool phinited = false;
63
64 volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
65 size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
66 ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
67 PageHeapAllocator<ThreadCache> threadcache_allocator;
68 ThreadCache* ThreadCache::thread_heaps_ = NULL;
69 int ThreadCache::thread_heap_count_ = 0;
70 ThreadCache* ThreadCache::next_memory_steal_ = NULL;
71 #ifdef HAVE_TLS
72 __thread ThreadCache::ThreadLocalData ThreadCache::threadlocal_data_
73 ATTR_INITIAL_EXEC CACHELINE_ALIGNED;
74 #endif
75 bool ThreadCache::tsd_inited_ = false;
76 pthread_key_t ThreadCache::heap_key_;
77
// Initialize a freshly allocated per-thread cache for thread "tid".
// Claims an initial byte budget (max_size_) from the shared pool via
// IncreaseCacheLimitLocked(); the "Locked" suffix suggests callers hold
// Static::pageheap_lock() here — NOTE(review): confirm at call sites.
void ThreadCache::Init(pthread_t tid) {
  size_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    SetMaxSize(kMinThreadCacheSize);

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = NULL;
  prev_ = NULL;
  tid_ = tid;
  in_setspecific_ = false;
  // One free list per size class, each tagged with its object size.
  for (uint32 cl = 0; cl < Static::num_size_classes(); ++cl) {
    list_[cl].Init(Static::sizemap()->class_to_size(cl));
  }

  // Seed the allocation sampler from the thread id so distinct threads
  // sample at different points in their allocation streams.
  uint32_t sampler_seed;
  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
  sampler_.Init(sampler_seed);
}
105
Cleanup()106 void ThreadCache::Cleanup() {
107 // Put unused memory back into central cache
108 for (uint32 cl = 0; cl < Static::num_size_classes(); ++cl) {
109 if (list_[cl].length() > 0) {
110 ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
111 }
112 }
113 }
114
115 // Remove some objects of class "cl" from central cache and add to thread heap.
116 // On success, return the first object for immediate use; otherwise return NULL.
// Remove some objects of class "cl" from central cache and add to thread heap.
// On success, return the first object for immediate use; otherwise return NULL.
void* ThreadCache::FetchFromCentralCache(uint32 cl, int32_t byte_size,
                                         void *(*oom_handler)(size_t size)) {
  FreeList* list = &list_[cl];
  ASSERT(list->empty());
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);

  // Never fetch more than the list's current (slow-start) max_length allows.
  const int num_to_move = min<int>(list->max_length(), batch_size);
  void *start, *end;
  int fetch_count = Static::central_cache()[cl].RemoveRange(
      &start, &end, num_to_move);

  if (fetch_count == 0) {
    // The central cache could not supply anything either; defer to the
    // caller-supplied out-of-memory handler.
    ASSERT(start == NULL);
    return oom_handler(byte_size);
  }
  ASSERT(start != NULL);

  // The first object ("start") goes straight to the caller; only the
  // remaining fetch_count - 1 objects are pushed onto the local list
  // and charged against size_.
  if (--fetch_count >= 0) {
    size_ += byte_size * fetch_count;
    list->PushRange(fetch_count, SLL_Next(start), end);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
    int new_length = min<int>(list->max_length() + batch_size,
                              kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return start;
}
159
// Deallocation slow path: "list" for class "cl" has grown past its
// max_length.  Charges size_ for the just-freed object (callers appear
// to defer that accounting to here, hence the += below), sheds one batch
// back to the central cache, and adapts max_length toward batch_size.
void ThreadCache::ListTooLong(FreeList* list, uint32 cl) {
  // Account for the object that was just pushed onto this list.
  size_ += list->object_size();

  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
  ReleaseToCentralCache(list, cl, batch_size);

  // If the list is too long, we need to transfer some number of
  // objects to the central cache.  Ideally, we would transfer
  // num_objects_to_move, so the code below tries to make max_length
  // converge on num_objects_to_move.

  if (list->max_length() < batch_size) {
    // Slow start the max_length so we don't overreserve.
    list->set_max_length(list->max_length() + 1);
  } else if (list->max_length() > batch_size) {
    // If we consistently go over max_length, shrink max_length.  If we don't
    // shrink it, some amount of memory will always stay in this freelist.
    list->set_length_overages(list->length_overages() + 1);
    if (list->length_overages() > kMaxOverages) {
      ASSERT(list->max_length() > batch_size);
      list->set_max_length(list->max_length() - batch_size);
      list->set_length_overages(0);
    }
  }

  // If the whole cache is now over its byte budget, release idle memory.
  if (PREDICT_FALSE(size_ > max_size_)) {
    Scavenge();
  }
}
189
190 // Remove some objects of class "cl" from thread heap and add to central cache
// Remove some objects of class "cl" from thread heap and add to central cache.
// Releases up to N objects (clamped to the list's actual length) in
// batch_size-sized chains, then one final chain of the remainder, and
// uncharges this cache's size_ for the released bytes.
void ThreadCache::ReleaseToCentralCache(FreeList* src, uint32 cl, int N) {
  ASSERT(src == &list_[cl]);
  // Never release more objects than the list actually holds.
  if (N > src->length()) N = src->length();
  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);

  // We return prepackaged chains of the correct size to the central cache.
  // TODO: Use the same format internally in the thread caches?
  int batch_size = Static::sizemap()->num_objects_to_move(cl);
  while (N > batch_size) {
    void *tail, *head;
    src->PopRange(batch_size, &head, &tail);
    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
    N -= batch_size;
  }
  // Final (possibly short) chain of the remaining N <= batch_size objects.
  void *tail, *head;
  src->PopRange(N, &head, &tail);
  Static::central_cache()[cl].InsertRange(head, tail, N);
  size_ -= delta_bytes;
}
210
211 // Release idle memory to the central cache
Scavenge()212 void ThreadCache::Scavenge() {
213 // If the low-water mark for the free list is L, it means we would
214 // not have had to allocate anything from the central cache even if
215 // we had reduced the free list size by L. We aim to get closer to
216 // that situation by dropping L/2 nodes from the free list. This
217 // may not release much memory, but if so we will call scavenge again
218 // pretty soon and the low-water marks will be high on that call.
219 for (int cl = 0; cl < Static::num_size_classes(); cl++) {
220 FreeList* list = &list_[cl];
221 const int lowmark = list->lowwatermark();
222 if (lowmark > 0) {
223 const int drop = (lowmark > 1) ? lowmark/2 : 1;
224 ReleaseToCentralCache(list, cl, drop);
225
226 // Shrink the max length if it isn't used. Only shrink down to
227 // batch_size -- if the thread was active enough to get the max_length
228 // above batch_size, it will likely be that active again. If
229 // max_length shinks below batch_size, the thread will have to
230 // go through the slow-start behavior again. The slow-start is useful
231 // mainly for threads that stay relatively idle for their entire
232 // lifetime.
233 const int batch_size = Static::sizemap()->num_objects_to_move(cl);
234 if (list->max_length() > batch_size) {
235 list->set_max_length(
236 max<int>(list->max_length() - batch_size, batch_size));
237 }
238 }
239 list->clear_lowwatermark();
240 }
241
242 IncreaseCacheLimit();
243 }
244
IncreaseCacheLimit()245 void ThreadCache::IncreaseCacheLimit() {
246 SpinLockHolder h(Static::pageheap_lock());
247 IncreaseCacheLimitLocked();
248 }
249
// Grow this thread's max_size_ by kStealAmount: first from the shared
// unclaimed pool, otherwise by stealing budget from another thread's
// cache.  Must be called with Static::pageheap_lock() held (see
// IncreaseCacheLimit()); it mutates the global next_memory_steal_ cursor.
void ThreadCache::IncreaseCacheLimitLocked() {
  if (unclaimed_cache_space_ > 0) {
    // Possibly make unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kStealAmount;
    SetMaxSize(max_size_ + kStealAmount);
    return;
  }
  // Don't hold pageheap_lock too long.  Try to steal from 10 other
  // threads before giving up.  The i < 10 condition also prevents an
  // infinite loop in case none of the existing thread heaps are
  // suitable places to steal from.
  for (int i = 0; i < 10;
       ++i, next_memory_steal_ = next_memory_steal_->next_) {
    // Reached the end of the linked list.  Start at the beginning.
    if (next_memory_steal_ == NULL) {
      ASSERT(thread_heaps_ != NULL);
      next_memory_steal_ = thread_heaps_;
    }
    // Never steal from ourselves or from a cache already at the minimum.
    if (next_memory_steal_ == this ||
        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
      continue;
    }
    next_memory_steal_->SetMaxSize(next_memory_steal_->max_size_ - kStealAmount);
    SetMaxSize(max_size_ + kStealAmount);

    // Advance the cursor so the next caller starts at the next victim.
    next_memory_steal_ = next_memory_steal_->next_;
    return;
  }
}
279
GetSamplePeriod()280 int ThreadCache::GetSamplePeriod() {
281 return Sampler::GetSamplePeriod();
282 }
283
// One-time, idempotent initialization of tcmalloc's global state.
// Reads TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES from the environment here
// rather than via a flag so it is configured at the right time (see the
// commented-out DEFINE_int64 near the top of this file).
void ThreadCache::InitModule() {
  {
    SpinLockHolder h(Static::pageheap_lock());
    if (phinited) {
      // Someone else already completed initialization; nothing to do.
      return;
    }
    const char *tcb = TCMallocGetenvSafe("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES");
    if (tcb) {
      set_overall_thread_cache_size(strtoll(tcb, NULL, 10));
    }
    Static::InitStaticVars();
    threadcache_allocator.Init();
    phinited = 1;
  }

  // We do "late" part of initialization without holding lock since
  // there is chance it'll recurse into malloc
  Static::InitLateMaybeRecursive();
}
303
InitTSD()304 void ThreadCache::InitTSD() {
305 ASSERT(!tsd_inited_);
306 perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
307 tsd_inited_ = true;
308
309 #ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
310 // We may have used a fake pthread_t for the main thread. Fix it.
311 pthread_t zero;
312 memset(&zero, 0, sizeof(zero));
313 SpinLockHolder h(Static::pageheap_lock());
314 for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
315 if (h->tid_ == zero) {
316 h->tid_ = pthread_self();
317 }
318 }
319 #endif
320 }
321
CreateCacheIfNecessary()322 ThreadCache* ThreadCache::CreateCacheIfNecessary() {
323 if (!tsd_inited_) {
324 #ifndef NDEBUG
325 // tests that freeing nullptr very early is working
326 free(NULL);
327 #endif
328
329 InitModule();
330 }
331
332 // Initialize per-thread data if necessary
333 ThreadCache* heap = NULL;
334
335 bool seach_condition = true;
336 #ifdef HAVE_TLS
337 static __thread ThreadCache** current_heap_ptr ATTR_INITIAL_EXEC;
338 if (tsd_inited_) {
339 // In most common case we're avoiding expensive linear search
340 // through all heaps (see below). Working TLS enables faster
341 // protection from malloc recursion in pthread_setspecific
342 seach_condition = false;
343
344 if (current_heap_ptr != NULL) {
345 // we're being recursively called by pthread_setspecific below.
346 return *current_heap_ptr;
347 }
348 current_heap_ptr = &heap;
349 }
350 #endif
351
352 {
353 SpinLockHolder h(Static::pageheap_lock());
354 // On some old glibc's, and on freebsd's libc (as of freebsd 8.1),
355 // calling pthread routines (even pthread_self) too early could
356 // cause a segfault. Since we can call pthreads quite early, we
357 // have to protect against that in such situations by making a
358 // 'fake' pthread. This is not ideal since it doesn't work well
359 // when linking tcmalloc statically with apps that create threads
360 // before main, so we only do it if we have to.
361 #ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
362 pthread_t me;
363 if (!tsd_inited_) {
364 memset(&me, 0, sizeof(me));
365 } else {
366 me = pthread_self();
367 }
368 #else
369 const pthread_t me = pthread_self();
370 #endif
371
372 // This may be a recursive malloc call from pthread_setspecific()
373 // In that case, the heap for this thread has already been created
374 // and added to the linked list. So we search for that first.
375 if (seach_condition) {
376 for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
377 if (h->tid_ == me) {
378 heap = h;
379 break;
380 }
381 }
382 }
383
384 if (heap == NULL) heap = NewHeap(me);
385 }
386
387 // We call pthread_setspecific() outside the lock because it may
388 // call malloc() recursively. We check for the recursive call using
389 // the "in_setspecific_" flag so that we can avoid calling
390 // pthread_setspecific() if we are already inside pthread_setspecific().
391 if (!heap->in_setspecific_ && tsd_inited_) {
392 heap->in_setspecific_ = true;
393 perftools_pthread_setspecific(heap_key_, heap);
394 #ifdef HAVE_TLS
395 // Also keep a copy in __thread for faster retrieval
396 threadlocal_data_.heap = heap;
397 threadlocal_data_.fast_path_heap = heap;
398 #endif
399 heap->in_setspecific_ = false;
400 }
401 #ifdef HAVE_TLS
402 current_heap_ptr = NULL;
403 #endif
404 return heap;
405 }
406
NewHeap(pthread_t tid)407 ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
408 // Create the heap and add it to the linked list
409 ThreadCache *heap = threadcache_allocator.New();
410 heap->Init(tid);
411 heap->next_ = thread_heaps_;
412 heap->prev_ = NULL;
413 if (thread_heaps_ != NULL) {
414 thread_heaps_->prev_ = heap;
415 } else {
416 // This is the only thread heap at the momment.
417 ASSERT(next_memory_steal_ == NULL);
418 next_memory_steal_ = heap;
419 }
420 thread_heaps_ = heap;
421 thread_heap_count_++;
422 return heap;
423 }
424
// Permanently drop the calling thread's cache (e.g. before a long idle
// period): clear the TSD slot and TLS copies, then delete the heap —
// unless a recursive malloc from pthread_setspecific reinstated it.
void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;          // No caches yet
  ThreadCache* heap = GetThreadHeap();
  if (heap == NULL) return;             // No thread cache to remove
  if (heap->in_setspecific_) return;    // Do not disturb the active caller

  heap->in_setspecific_ = true;
  perftools_pthread_setspecific(heap_key_, NULL);
#ifdef HAVE_TLS
  // Also update the copy in __thread
  threadlocal_data_.heap = NULL;
  threadlocal_data_.fast_path_heap = NULL;
#endif
  heap->in_setspecific_ = false;
  if (GetThreadHeap() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}
448
BecomeTemporarilyIdle()449 void ThreadCache::BecomeTemporarilyIdle() {
450 ThreadCache* heap = GetCacheIfPresent();
451 if (heap)
452 heap->Cleanup();
453 }
454
DestroyThreadCache(void * ptr)455 void ThreadCache::DestroyThreadCache(void* ptr) {
456 // Note that "ptr" cannot be NULL since pthread promises not
457 // to invoke the destructor on NULL values, but for safety,
458 // we check anyway.
459 if (ptr == NULL) return;
460 #ifdef HAVE_TLS
461 // Prevent fast path of GetThreadHeap() from returning heap.
462 threadlocal_data_.heap = NULL;
463 threadlocal_data_.fast_path_heap = NULL;
464 #endif
465 DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
466 }
467
// Destroy "heap": flush its objects to the central cache, unlink it from
// the global heap list, return its byte budget to the unclaimed pool,
// and free the ThreadCache object itself.
void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  SpinLockHolder h(Static::pageheap_lock());
  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  // Fix up the memory-steal cursor if it pointed at the dying heap.
  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
  // Give this thread's cache budget back to the shared pool.
  unclaimed_cache_space_ += heap->max_size_;

  threadcache_allocator.Delete(heap);
}
485
// Recompute the per-thread byte budget after overall_thread_cache_size_
// or the thread count changes, scaling existing caches down (never up)
// and refreshing unclaimed_cache_space_.  Walks thread_heaps_, so it is
// presumably called with pageheap_lock held — NOTE(review): confirm.
void ThreadCache::RecomputePerThreadCacheSize() {
  // Divide available space across threads
  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
  size_t space = overall_thread_cache_size_ / n;

  // Limit to allowed range
  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

  // How much the per-thread budget shrank (or grew) relative to before.
  double ratio = space / max<double>(1, per_thread_cache_size_);
  size_t claimed = 0;
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    // Increasing the total cache size should not circumvent the
    // slow-start growth of max_size_.
    if (ratio < 1.0) {
      h->SetMaxSize(h->max_size_ * ratio);
    }
    claimed += h->max_size_;
  }
  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
  per_thread_cache_size_ = space;
}
508
GetThreadStats(uint64_t * total_bytes,uint64_t * class_count)509 void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
510 for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
511 *total_bytes += h->Size();
512 if (class_count) {
513 for (int cl = 0; cl < Static::num_size_classes(); ++cl) {
514 class_count[cl] += h->freelist_length(cl);
515 }
516 }
517 }
518 }
519
set_overall_thread_cache_size(size_t new_size)520 void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
521 // Clip the value to a reasonable range
522 if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
523 if (new_size > (1<<30)) new_size = (1<<30); // Limit to 1GB
524 overall_thread_cache_size_ = new_size;
525
526 RecomputePerThreadCacheSize();
527 }
528
529 } // namespace tcmalloc
530