/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "tbbmalloc_internal.h"
#include <errno.h>
#include <new>        /* for placement new */
#include <string.h>   /* for memset */

#include "oneapi/tbb/version.h"
#include "../tbb/environment.h"
#include "../tbb/itt_notify.h" // for __TBB_load_ittnotify()

#if USE_PTHREAD
    #define TlsSetValue_func pthread_setspecific
    #define TlsGetValue_func pthread_getspecific
    #define GetMyTID() pthread_self()
    #include <sched.h>
    inline void do_yield() {sched_yield();}
    extern "C" { static void mallocThreadShutdownNotification(void*); }
    #if __sun || __SUNPRO_CC
    #define __asm__ asm
    #endif
    #include <unistd.h> // sysconf(_SC_PAGESIZE)
#elif USE_WINTHREAD
    #define GetMyTID() GetCurrentThreadId()
#if __TBB_WIN8UI_SUPPORT
    #include <thread>
    #define TlsSetValue_func FlsSetValue
    #define TlsGetValue_func FlsGetValue
    #define TlsAlloc() FlsAlloc(NULL)
    #define TLS_ALLOC_FAILURE FLS_OUT_OF_INDEXES
    #define TlsFree FlsFree
    inline void do_yield() {std::this_thread::yield();}
#else
    #define TlsSetValue_func TlsSetValue
    #define TlsGetValue_func TlsGetValue
    #define TLS_ALLOC_FAILURE TLS_OUT_OF_INDEXES
    inline void do_yield() {SwitchToThread();}
#endif
#else
    #error Must define USE_PTHREAD or USE_WINTHREAD
#endif

#define FREELIST_NONBLOCKING 1

namespace rml {
class MemoryPool;
namespace internal {

class Block;
class MemoryPool;

#if MALLOC_CHECK_RECURSION

inline bool isMallocInitialized();

#endif // MALLOC_CHECK_RECURSION

/** Support for handling the special UNUSABLE pointer state **/
const intptr_t UNUSABLE = 0x1;
inline bool isSolidPtr( void* ptr ) {
    return (UNUSABLE|(intptr_t)ptr)!=UNUSABLE;
}
inline bool isNotForUse( void* ptr ) {
    return (intptr_t)ptr==UNUSABLE;
}
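// With UNUSABLE == 0x1, a pointer is "solid" (a real object address) only if it
// has bits set above the lowest one; nullptr and (void*)UNUSABLE are not solid.
// This lets a field such as publicFreeList encode three states: empty (NULL),
// blocked (UNUSABLE), or a valid list of publicly freed objects.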

/*
 * Block::objectSize value used to mark blocks allocated by startupAlloc
 */
const uint16_t startupAllocObjSizeMark = ~(uint16_t)0;

/*
 * The following constant is used to define the size of struct Block, the block header.
 * The intent is to have the size of a Block be a multiple of the cache line size, which allows us to
 * get good alignment at the cost of some overhead equal to the amount of padding included in the Block.
 */
const int blockHeaderAlignment = estimatedCacheLineSize;

/********* The data structures and global objects **************/

/*
 * The malloc routines themselves need to be able to occasionally malloc some space,
 * in order to set up the thread-local structures. This
 * routine performs that function.
 */
class BootStrapBlocks {
    MallocMutex bootStrapLock;
    Block      *bootStrapBlock;
    Block      *bootStrapBlockUsed;
    FreeObject *bootStrapObjectList;
public:
    void *allocate(MemoryPool *memPool, size_t size);
    void free(void* ptr);
    void reset();
};

#if USE_INTERNAL_TID
class ThreadId {
    static tls_key_t Tid_key;
    static std::atomic<intptr_t> ThreadCount; // global thread counter, not a thread ID

    unsigned int id;

    static unsigned int tlsNumber() {
        unsigned int result = reinterpret_cast<intptr_t>(TlsGetValue_func(Tid_key));
        if( !result ) {
            RecursiveMallocCallProtector scoped;
            // Thread-local value is zero -> first call from this thread,
            // need to initialize with next ID value (IDs start from 1)
            result = ++ThreadCount; // returned new value!
            TlsSetValue_func( Tid_key, reinterpret_cast<void*>(result) );
        }
        return result;
    }
public:
    static bool init() {
#if USE_WINTHREAD
        Tid_key = TlsAlloc();
        if (Tid_key == TLS_ALLOC_FAILURE)
            return false;
#else
        int status = pthread_key_create( &Tid_key, NULL );
        if ( status ) {
            fprintf (stderr, "The memory manager cannot create tls key during initialization\n");
            return false;
        }
#endif /* USE_WINTHREAD */
        return true;
    }
#if __TBB_SOURCE_DIRECTLY_INCLUDED
    static void destroy() {
        if( Tid_key ) {
#if USE_WINTHREAD
            BOOL status = !(TlsFree( Tid_key )); // fail is zero
#else
            int status = pthread_key_delete( Tid_key );
#endif /* USE_WINTHREAD */
            if ( status )
                fprintf (stderr, "The memory manager cannot delete tls key\n");
            Tid_key = 0;
        }
    }
#endif

    ThreadId() : id(ThreadId::tlsNumber()) {}
    bool isCurrentThreadId() const { return id == ThreadId::tlsNumber(); }

#if COLLECT_STATISTICS || MALLOC_TRACE
    friend unsigned int getThreadId() { return ThreadId::tlsNumber(); }
#endif
#if COLLECT_STATISTICS
    static unsigned getMaxThreadId() { return ThreadCount.load(std::memory_order_relaxed); }

    friend int STAT_increment(ThreadId tid, int bin, int ctr);
#endif
};

tls_key_t ThreadId::Tid_key;
std::atomic<intptr_t> ThreadId::ThreadCount;

#if COLLECT_STATISTICS
int STAT_increment(ThreadId tid, int bin, int ctr)
{
    return ::STAT_increment(tid.id, bin, ctr);
}
#endif

#else // USE_INTERNAL_TID

class ThreadId {
#if USE_PTHREAD
    std::atomic<pthread_t> tid;
#else
    std::atomic<DWORD>     tid;
#endif
public:
    ThreadId() : tid(GetMyTID()) {}
#if USE_PTHREAD
    bool isCurrentThreadId() const { return pthread_equal(pthread_self(), tid.load(std::memory_order_relaxed)); }
#else
    bool isCurrentThreadId() const { return GetCurrentThreadId() == tid.load(std::memory_order_relaxed); }
#endif
    ThreadId& operator=(const ThreadId& other) {
        tid.store(other.tid.load(std::memory_order_relaxed), std::memory_order_relaxed);
        return *this;
    }
    static bool init() { return true; }
#if __TBB_SOURCE_DIRECTLY_INCLUDED
    static void destroy() {}
#endif
};

#endif // USE_INTERNAL_TID

/*********** Code to provide thread ID and a thread-local void pointer **********/

bool TLSKey::init()
{
#if USE_WINTHREAD
    TLS_pointer_key = TlsAlloc();
    if (TLS_pointer_key == TLS_ALLOC_FAILURE)
        return false;
#else
    int status = pthread_key_create( &TLS_pointer_key, mallocThreadShutdownNotification );
    if ( status )
        return false;
#endif /* USE_WINTHREAD */
    return true;
}

bool TLSKey::destroy()
{
#if USE_WINTHREAD
    BOOL status1 = !(TlsFree(TLS_pointer_key)); // fail is zero
#else
    int status1 = pthread_key_delete(TLS_pointer_key);
#endif /* USE_WINTHREAD */
    MALLOC_ASSERT(!status1, "The memory manager cannot delete tls key.");
    return status1==0;
}

inline TLSData* TLSKey::getThreadMallocTLS() const
{
    return (TLSData *)TlsGetValue_func( TLS_pointer_key );
}

inline void TLSKey::setThreadMallocTLS( TLSData * newvalue ) {
    RecursiveMallocCallProtector scoped;
    TlsSetValue_func( TLS_pointer_key, newvalue );
}

/* The 'next' field in the block header has to maintain some invariants:
 *   it needs to be on a 16K boundary and the first field in the block.
 *   Any value stored there needs to have the lower 14 bits set to 0
 *   so that various asserts work. This means that if you want to smash this memory
 *   for debugging purposes you will need to obey this invariant.
 * The total size of the header needs to be a power of 2 to simplify
 * the alignment requirements. For now it is a 128 byte structure.
 * To avoid false sharing, the fields changed only locally are separated
 * from the fields changed by foreign threads.
 * Changing the size of the block header would require changing
 * some bin allocation sizes, in particular "fitting" sizes (see above).
 */
class Bin;
class StartupBlock;

class MemoryPool {
    // if no explicit grainsize, expect to see malloc in user's pAlloc
    // and set a reasonably low granularity
    static const size_t defaultGranularity = estimatedCacheLineSize;

    MemoryPool(); // deny
public:
    static MallocMutex memPoolListLock;

    // list of all active pools is used to release
    // all TLS data on thread termination or library unload
    MemoryPool *next,
               *prev;
    ExtMemoryPool extMemPool;
    BootStrapBlocks bootStrapBlocks;

    static void initDefaultPool();

    bool init(intptr_t poolId, const MemPoolPolicy* memPoolPolicy);
    bool reset();
    bool destroy();
    void onThreadShutdown(TLSData *tlsData);

    inline TLSData *getTLS(bool create);
    void clearTLS() { extMemPool.tlsPointerKey.setThreadMallocTLS(NULL); }

    Block *getEmptyBlock(size_t size);
    void returnEmptyBlock(Block *block, bool poolTheBlock);

    // get/put large object to/from local large object cache
    void *getFromLLOCache(TLSData *tls, size_t size, size_t alignment);
    void putToLLOCache(TLSData *tls, void *object);
};

static intptr_t defaultMemPool_space[sizeof(MemoryPool)/sizeof(intptr_t) +
                                     (sizeof(MemoryPool)%sizeof(intptr_t)? 1 : 0)];
static MemoryPool *defaultMemPool = (MemoryPool*)defaultMemPool_space;
const size_t MemoryPool::defaultGranularity;
// zero-initialized
MallocMutex MemoryPool::memPoolListLock;
// TODO: move huge page status to default pool, because that's its state
HugePagesStatus hugePages;
static bool usedBySrcIncluded = false;

// Padding helpers
template<size_t padd>
struct PaddingImpl {
    size_t __padding[padd];
};

template<>
struct PaddingImpl<0> {};

template<int N>
struct Padding : PaddingImpl<N/sizeof(size_t)> {};
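// The zero-size specialization is needed because a zero-length array is not
// valid C++; Padding<0> derives from the empty PaddingImpl<0>, which adds no
// bytes thanks to the empty base optimization.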

// Slab block is 16KB-aligned. To prevent false sharing, separate the locally-accessed
// fields from the fields commonly accessed by non-owner threads.
class GlobalBlockFields : public BlockI {
protected:
    std::atomic<FreeObject*> publicFreeList;
    std::atomic<Block*> nextPrivatizable;
    MemoryPool  *poolPtr;
};

class LocalBlockFields : public GlobalBlockFields, Padding<blockHeaderAlignment - sizeof(GlobalBlockFields)> {
protected:
    Block       *next;
    Block       *previous;       /* Use double linked list to speed up removal */
    FreeObject  *bumpPtr;        /* Bump pointer moves from the end to the beginning of a block */
    FreeObject  *freeList;
    /* Pointer to local data for the owner thread. Used for fast finding tls
       when releasing object from a block that current thread owned.
       NULL for orphaned blocks. */
    std::atomic<TLSData*> tlsPtr;
    ThreadId     ownerTid;       /* the ID of the thread that owns or last owned the block */
    BackRefIdx   backRefIdx;
    uint16_t     allocatedCount; /* Number of objects allocated (obviously by the owning thread) */
    uint16_t     objectSize;
    bool         isFull;

    friend class FreeBlockPool;
    friend class StartupBlock;
    friend class LifoList;
    friend void *BootStrapBlocks::allocate(MemoryPool *, size_t);
    friend bool OrphanedBlocks::cleanup(Backend*);
    friend Block *MemoryPool::getEmptyBlock(size_t);
};

// Use inheritance to guarantee that user data starts on the next cache line.
// Can't use a member for it because, when LocalBlockFields already ends exactly on
// a cache line, the padding must add no extra memory with any compiler
// (an empty base class is guaranteed to be free, a zero-size member is not).
class Block : public LocalBlockFields,
              Padding<2*blockHeaderAlignment - sizeof(LocalBlockFields)> {
public:
    bool empty() const {
        if (allocatedCount > 0) return false;
        MALLOC_ASSERT(!isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT);
        return true;
    }
    inline FreeObject* allocate();
    inline FreeObject *allocateFromFreeList();

    inline bool adjustFullness();
    void adjustPositionInBin(Bin* bin = NULL);
#if MALLOC_DEBUG
    bool freeListNonNull() { return freeList; }
#endif
    void freePublicObject(FreeObject *objectToFree);
    inline void freeOwnObject(void *object);
    void reset();
    void privatizePublicFreeList( bool reset = true );
    void restoreBumpPtr();
    void privatizeOrphaned(TLSData *tls, unsigned index);
    bool readyToShare();
    void shareOrphaned(intptr_t binTag, unsigned index);
    unsigned int getSize() const {
        MALLOC_ASSERT(isStartupAllocObject() || objectSize<minLargeObjectSize,
                      "Invalid object size");
        return isStartupAllocObject()? 0 : objectSize;
    }
    const BackRefIdx *getBackRefIdx() const { return &backRefIdx; }
    inline bool isOwnedByCurrentThread() const;
    bool isStartupAllocObject() const { return objectSize == startupAllocObjSizeMark; }
    inline FreeObject *findObjectToFree(const void *object) const;
    void checkFreePrecond(const void *object) const {
#if MALLOC_DEBUG
        const char *msg = "Possible double free or heap corruption.";
        // small objects are always at least sizeof(size_t)-byte aligned;
        // check this before dereferencing, as for invalid objects
        // the memory may be unreadable
        MALLOC_ASSERT(isAligned(object, sizeof(size_t)), "Try to free invalid small object");
#if !__TBB_USE_THREAD_SANITIZER
        // releasing to free slab
        MALLOC_ASSERT(allocatedCount>0, msg);
#endif
        // must not point to slab's header
        MALLOC_ASSERT((uintptr_t)object - (uintptr_t)this >= sizeof(Block), msg);
        if (startupAllocObjSizeMark == objectSize) // startup block
            MALLOC_ASSERT(object<=bumpPtr, msg);
        else {
            // non-startup objects are 8-byte aligned
            MALLOC_ASSERT(isAligned(object, 8), "Try to free invalid small object");
            FreeObject *toFree = findObjectToFree(object);
#if !__TBB_USE_THREAD_SANITIZER
            MALLOC_ASSERT(allocatedCount <= (slabSize-sizeof(Block))/objectSize
                          && (!bumpPtr || object>bumpPtr), msg);
            // check against the head of freeList, as this is where a double free
            // is mostly expected to show up
            MALLOC_ASSERT(toFree != freeList, msg);
#endif
            // check against the head of publicFreeList, to detect a double free
            // involving a foreign thread
            MALLOC_ASSERT(toFree != publicFreeList.load(std::memory_order_relaxed), msg);
        }
#else
        suppress_unused_warning(object);
#endif
    }
    void initEmptyBlock(TLSData *tls, size_t size);
    size_t findObjectSize(void *object) const;
    MemoryPool *getMemPool() const { return poolPtr; } // do not use on the hot path!

protected:
    void cleanBlockHeader();

private:
    static const float emptyEnoughRatio; /* Threshold on free space needed to "reactivate" a block */

    inline FreeObject *allocateFromBumpPtr();
    inline FreeObject *findAllocatedObject(const void *address) const;
#if MALLOC_DEBUG
    inline bool isProperlyPlaced(const void *object) const;
#endif
    inline void markOwned(TLSData *tls) {
        MALLOC_ASSERT(!tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
        ownerTid = ThreadId(); /* save the ID of the current thread */
        tlsPtr.store(tls, std::memory_order_relaxed);
    }
    inline void markOrphaned() {
        MALLOC_ASSERT(tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
        tlsPtr.store(nullptr, std::memory_order_relaxed);
    }

    friend class Bin;
    friend class TLSData;
    friend bool MemoryPool::destroy();
};

const float Block::emptyEnoughRatio = 1.0 / 4.0;

static_assert(sizeof(Block) <= 2*estimatedCacheLineSize,
    "The class Block does not fit into 2 cache lines on this platform. "
    "Defining USE_INTERNAL_TID may help to fix it.");

class Bin {
private:
public:
    Block      *activeBlk;
    std::atomic<Block*> mailbox;
    MallocMutex mailLock;

public:
    inline Block* getActiveBlock() const { return activeBlk; }
    void resetActiveBlock() { activeBlk = NULL; }
    inline void setActiveBlock(Block *block);
    inline Block* setPreviousBlockActive();
    Block* getPrivatizedFreeListBlock();
    void moveBlockToFront(Block *block);
    bool cleanPublicFreeLists();
    void processEmptyBlock(Block *block, bool poolTheBlock);
    void addPublicFreeListBlock(Block* block);

    void outofTLSBin(Block* block);
    void verifyTLSBin(size_t size) const;
    void pushTLSBin(Block* block);

#if MALLOC_DEBUG
    void verifyInitState() const {
        MALLOC_ASSERT( !activeBlk, ASSERT_TEXT );
        MALLOC_ASSERT( !mailbox.load(std::memory_order_relaxed), ASSERT_TEXT );
    }
#endif

    friend void Block::freePublicObject (FreeObject *objectToFree);
};

/********* End of the data structures **************/

/*
 * There are bins for all 8 byte aligned objects less than this segregated size; 8 bins in total
 */
const uint32_t minSmallObjectIndex = 0;
const uint32_t numSmallObjectBins = 8;
const uint32_t maxSmallObjectSize = 64;

/*
 * There are 4 bins between each couple of powers of 2 [64-128-256-...]
 * from maxSmallObjectSize till this size; 16 bins in total
 */
const uint32_t minSegregatedObjectIndex = minSmallObjectIndex+numSmallObjectBins;
const uint32_t numSegregatedObjectBins = 16;
const uint32_t maxSegregatedObjectSize = 1024;

/*
 * And there are 5 bins with allocation sizes that are multiples of estimatedCacheLineSize
 * and selected to fit 9, 6, 4, 3, and 2 allocations in a block.
 */
const uint32_t minFittingIndex = minSegregatedObjectIndex+numSegregatedObjectBins;
const uint32_t numFittingBins = 5;

const uint32_t fittingAlignment = estimatedCacheLineSize;

#define SET_FITTING_SIZE(N) ( (slabSize-sizeof(Block))/N ) & ~(fittingAlignment-1)
// For blockSize=16*1024, sizeof(Block)=2*estimatedCacheLineSize and fittingAlignment=estimatedCacheLineSize,
// the comments show the fitting sizes and the amounts left unused for estimatedCacheLineSize=64/128:
const uint32_t fittingSize1 = SET_FITTING_SIZE(9); // 1792/1792 128/000
const uint32_t fittingSize2 = SET_FITTING_SIZE(6); // 2688/2688 128/000
const uint32_t fittingSize3 = SET_FITTING_SIZE(4); // 4032/3968 128/256
const uint32_t fittingSize4 = SET_FITTING_SIZE(3); // 5376/5376 128/000
const uint32_t fittingSize5 = SET_FITTING_SIZE(2); // 8128/8064 000/000
#undef SET_FITTING_SIZE
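// Worked example for SET_FITTING_SIZE(9) with estimatedCacheLineSize=64:
// slabSize-sizeof(Block) = 16384-128 = 16256; 16256/9 = 1806, which rounds down
// to 1792 at the 64-byte fittingAlignment; 16256 - 9*1792 = 128 bytes remain
// unused, matching the first column of the comments above.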

/*
 * The total number of thread-specific Block-based bins
 */
const uint32_t numBlockBins = minFittingIndex+numFittingBins;

/*
 * Objects of this size and larger are considered large objects.
 */
const uint32_t minLargeObjectSize = fittingSize5 + 1;

/*
 * Per-thread pool of slab blocks. The idea behind it is to not share with other
 * threads memory that is likely in the local cache(s) of our CPU.
 */
class FreeBlockPool {
private:
    std::atomic<Block*> head;
    int         size;
    Backend    *backend;
    bool        lastAccessMiss;
public:
    static const int POOL_HIGH_MARK = 32;
    static const int POOL_LOW_MARK  = 8;

    class ResOfGet {
        ResOfGet();
    public:
        Block* block;
        bool   lastAccMiss;
        ResOfGet(Block *b, bool lastMiss) : block(b), lastAccMiss(lastMiss) {}
    };

    // allocated in zero-initialized memory
    FreeBlockPool(Backend *bknd) : backend(bknd) {}
    ResOfGet getBlock();
    void returnBlock(Block *block);
    bool externalCleanup(); // can be called by another thread
};

template<int LOW_MARK, int HIGH_MARK>
class LocalLOCImpl {
private:
    static const size_t MAX_TOTAL_SIZE = 4*1024*1024;
    // TODO: can single-linked list be faster here?
    LargeMemoryBlock *tail; // needed when releasing on overflow
    std::atomic<LargeMemoryBlock*> head;
    size_t            totalSize;
    int               numOfBlocks;
public:
    bool put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool);
    LargeMemoryBlock *get(size_t size);
    bool externalCleanup(ExtMemoryPool *extMemPool);
#if __TBB_MALLOC_WHITEBOX_TEST
    LocalLOCImpl() : tail(NULL), head(NULL), totalSize(0), numOfBlocks(0) {}
    static size_t getMaxSize() { return MAX_TOTAL_SIZE; }
    static const int LOC_HIGH_MARK = HIGH_MARK;
#else
    // no ctor, object must be created in zero-initialized memory
#endif
};

typedef LocalLOCImpl<8,32> LocalLOC; // set production code parameters
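// LOW_MARK/HIGH_MARK act as watermarks on the number of locally cached large
// blocks (the production values 8/32 coincide with FreeBlockPool's
// POOL_LOW_MARK/POOL_HIGH_MARK); MAX_TOTAL_SIZE additionally caps the cached bytes.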

class TLSData : public TLSRemote {
    MemoryPool   *memPool;
public:
    Bin           bin[numBlockBinLimit];
    FreeBlockPool freeSlabBlocks;
    LocalLOC      lloc;
    unsigned      currCacheIdx;
private:
    std::atomic<bool> unused;
public:
    TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd) {}
    MemoryPool *getMemPool() const { return memPool; }
    Bin* getAllocationBin(size_t size);
    void release();
    bool externalCleanup(bool cleanOnlyUnused, bool cleanBins) {
        if (!unused.load(std::memory_order_relaxed) && cleanOnlyUnused) return false;
        // Heavy operation in terms of synchronization complexity,
        // should be called only for the current thread
        bool released = cleanBins ? cleanupBlockBins() : false;
        // both cleanups are to be called, and the order is not important
        return released | lloc.externalCleanup(&memPool->extMemPool) | freeSlabBlocks.externalCleanup();
    }
    bool cleanupBlockBins();
    void markUsed() { unused.store(false, std::memory_order_relaxed); }  // called by owner when TLS touched
    void markUnused() { unused.store(true, std::memory_order_relaxed); } // can be called by a non-owner thread
};

TLSData *TLSKey::createTLS(MemoryPool *memPool, Backend *backend)
{
    MALLOC_ASSERT( sizeof(TLSData) >= sizeof(Bin) * numBlockBins + sizeof(FreeBlockPool), ASSERT_TEXT );
    TLSData* tls = (TLSData*) memPool->bootStrapBlocks.allocate(memPool, sizeof(TLSData));
    if ( !tls )
        return NULL;
    new(tls) TLSData(memPool, backend);
    /* the block contains zeroes after bootStrapMalloc, so bins are initialized */
#if MALLOC_DEBUG
    for (uint32_t i = 0; i < numBlockBinLimit; i++)
        tls->bin[i].verifyInitState();
#endif
    setThreadMallocTLS(tls);
    memPool->extMemPool.allLocalCaches.registerThread(tls);
    return tls;
}

bool TLSData::cleanupBlockBins()
{
    bool released = false;
    for (uint32_t i = 0; i < numBlockBinLimit; i++) {
        released |= bin[i].cleanPublicFreeLists();
        // After cleaning public free lists, only the active block might be empty.
        // Do not use processEmptyBlock because it will just restore bumpPtr.
        Block *block = bin[i].getActiveBlock();
        if (block && block->empty()) {
            bin[i].outofTLSBin(block);
            memPool->returnEmptyBlock(block, /*poolTheBlock=*/false);
            released = true;
        }
    }
    return released;
}

bool ExtMemoryPool::releaseAllLocalCaches()
{
    // Iterate all registered TLS data and clean LLOC and Slab pools
    bool released = allLocalCaches.cleanup(/*cleanOnlyUnused=*/false);

    // Bins privatization is done only for the current thread
    if (TLSData *tlsData = tlsPointerKey.getThreadMallocTLS())
        released |= tlsData->cleanupBlockBins();

    return released;
}

void AllLocalCaches::registerThread(TLSRemote *tls)
{
    tls->prev = NULL;
    MallocMutex::scoped_lock lock(listLock);
    MALLOC_ASSERT(head!=tls, ASSERT_TEXT);
    tls->next = head;
    if (head)
        head->prev = tls;
    head = tls;
    MALLOC_ASSERT(head->next!=head, ASSERT_TEXT);
}

void AllLocalCaches::unregisterThread(TLSRemote *tls)
{
    MallocMutex::scoped_lock lock(listLock);
    MALLOC_ASSERT(head, "Can't unregister thread: no threads are registered.");
    if (head == tls)
        head = tls->next;
    if (tls->next)
        tls->next->prev = tls->prev;
    if (tls->prev)
        tls->prev->next = tls->next;
    MALLOC_ASSERT(!tls->next || tls->next->next!=tls->next, ASSERT_TEXT);
}

bool AllLocalCaches::cleanup(bool cleanOnlyUnused)
{
    bool released = false;
    {
        MallocMutex::scoped_lock lock(listLock);
        for (TLSRemote *curr=head; curr; curr=curr->next)
            released |= static_cast<TLSData*>(curr)->externalCleanup(cleanOnlyUnused, /*cleanBins=*/false);
    }
    return released;
}

void AllLocalCaches::markUnused()
{
    bool locked;
    MallocMutex::scoped_lock lock(listLock, /*block=*/false, &locked);
    if (!locked) // do not wait for the lock; if someone is working with the list, skip marking
        return;

    for (TLSRemote *curr=head; curr; curr=curr->next)
        static_cast<TLSData*>(curr)->markUnused();
}

#if MALLOC_CHECK_RECURSION
MallocMutex RecursiveMallocCallProtector::rmc_mutex;
std::atomic<pthread_t> RecursiveMallocCallProtector::owner_thread;
std::atomic<void*> RecursiveMallocCallProtector::autoObjPtr;
bool RecursiveMallocCallProtector::mallocRecursionDetected;
#if __FreeBSD__ || __DragonFly__
bool RecursiveMallocCallProtector::canUsePthread;
#endif

#endif

/*********** End code to provide thread ID and a TLS pointer **********/

// Parameter for isLargeObject, keeps our expectations on memory origin.
// Assertions must use unknownMem to reliably report object invalidity.
enum MemoryOrigin {
    ourMem,    // allocated by TBB allocator
    unknownMem // can be allocated by system allocator or TBB allocator
};

template<MemoryOrigin>
#if __TBB_USE_THREAD_SANITIZER
// We have a real race when accessing the large object header for
// non large objects (e.g. small or foreign objects).
// Therefore, we need to hide this access from the thread sanitizer
__attribute__((no_sanitize("thread")))
#endif
bool isLargeObject(void *object);
static void *internalMalloc(size_t size);
static void internalFree(void *object);
static void *internalPoolMalloc(MemoryPool* mPool, size_t size);
static bool internalPoolFree(MemoryPool *mPool, void *object, size_t size);

#if !MALLOC_DEBUG
#if __INTEL_COMPILER || _MSC_VER
#define NOINLINE(decl) __declspec(noinline) decl
#define ALWAYSINLINE(decl) __forceinline decl
#elif __GNUC__
#define NOINLINE(decl) decl __attribute__ ((noinline))
#define ALWAYSINLINE(decl) decl __attribute__ ((always_inline))
#else
#define NOINLINE(decl) decl
#define ALWAYSINLINE(decl) decl
#endif

static NOINLINE( bool doInitialization() );
ALWAYSINLINE( bool isMallocInitialized() );

#undef ALWAYSINLINE
#undef NOINLINE
#endif /* !MALLOC_DEBUG */


/********* Now some rough utility code to deal with indexing the size bins. **************/

/*
 * Given a number, return the position of the highest non-zero bit in it. It is intended to work with 32-bit values only.
 * Moreover, on some platforms, for the sake of simplicity and performance, it is narrowed to only serve for 64 to 1023.
 * This is enough for the current algorithm of distribution of sizes among bins.
 * __TBB_Log2 is not used here to minimize dependencies on TBB specific sources.
 */
#if _WIN64 && _MSC_VER>=1400 && !__INTEL_COMPILER
extern "C" unsigned char _BitScanReverse( unsigned long* i, unsigned long w );
#pragma intrinsic(_BitScanReverse)
#endif
static inline unsigned int highestBitPos(unsigned int n)
{
    MALLOC_ASSERT( n>=64 && n<1024, ASSERT_TEXT ); // only needed for bsr array lookup, but always true
    unsigned int pos;
#if __ARCH_x86_32||__ARCH_x86_64

# if __unix__||__APPLE__||__MINGW32__
    __asm__ ("bsr %1,%0" : "=r"(pos) : "r"(n));
# elif (_WIN32 && (!_WIN64 || __INTEL_COMPILER))
    __asm
    {
        bsr eax, n
        mov pos, eax
    }
# elif _WIN64 && _MSC_VER>=1400
    _BitScanReverse((unsigned long*)&pos, (unsigned long)n);
# else
#   error highestBitPos() not implemented for this platform
# endif
#elif __arm__
    __asm__ __volatile__
    (
       "clz %0, %1\n"
       "rsb %0, %0, %2\n"
       :"=r" (pos) :"r" (n), "I" (31)
    );
#else
    static unsigned int bsr[16] = {0/*N/A*/,6,7,7,8,8,8,8,9,9,9,9,9,9,9,9};
    pos = bsr[ n>>6 ];
#endif /* __ARCH_* */
    return pos;
}

unsigned int getSmallObjectIndex(unsigned int size)
{
    unsigned int result = (size-1)>>3;
    if (sizeof(void*)==8) {
        // For 64-bit malloc, 16 byte alignment is needed except for bin 0.
        if (result) result |= 1; // 0,1,3,5,7; bins 2,4,6 are not aligned to 16 bytes
    }
    return result;
}
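// Example on 64-bit builds: a 24-byte request gives (24-1)>>3 = 2, which is
// bumped to index 3 (the 32-byte bin); so apart from the 8-byte bin 0, every
// bin actually used (1, 3, 5, 7) hands out sizes that are multiples of 16 bytes.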

/*
 * Depending on indexRequest, for a given size return either the index into the bin
 * for objects of this size, or the actual size of objects in this bin.
 */
template<bool indexRequest>
static unsigned int getIndexOrObjectSize (unsigned int size)
{
    if (size <= maxSmallObjectSize) { // selection from 8/16/24/32/40/48/56/64
        unsigned int index = getSmallObjectIndex( size );
        /* Bin 0 is for 8 bytes, bin 1 is for 16, and so forth */
        return indexRequest ? index : (index+1)<<3;
    }
    else if (size <= maxSegregatedObjectSize ) { // 80/96/112/128 / 160/192/224/256 / 320/384/448/512 / 640/768/896/1024
        unsigned int order = highestBitPos(size-1); // which group of bin sizes?
        MALLOC_ASSERT( 6<=order && order<=9, ASSERT_TEXT );
        if (indexRequest)
            return minSegregatedObjectIndex - (4*6) - 4 + (4*order) + ((size-1)>>(order-2));
        else {
            unsigned int alignment = 128 >> (9-order); // alignment in the group
            MALLOC_ASSERT( alignment==16 || alignment==32 || alignment==64 || alignment==128, ASSERT_TEXT );
            return alignUp(size,alignment);
        }
    }
    else {
        if( size <= fittingSize3 ) {
            if( size <= fittingSize2 ) {
                if( size <= fittingSize1 )
                    return indexRequest ? minFittingIndex : fittingSize1;
                else
                    return indexRequest ? minFittingIndex+1 : fittingSize2;
            } else
                return indexRequest ? minFittingIndex+2 : fittingSize3;
        } else {
            if( size <= fittingSize5 ) {
                if( size <= fittingSize4 )
                    return indexRequest ? minFittingIndex+3 : fittingSize4;
                else
                    return indexRequest ? minFittingIndex+4 : fittingSize5;
            } else {
                MALLOC_ASSERT( 0,ASSERT_TEXT ); // this should not happen
                return ~0U;
            }
        }
    }
}
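// Worked example for the segregated range: size=200 gives order=highestBitPos(199)=7,
// so the index is 8 - 24 - 4 + 4*7 + (199>>5) = 14, and the object size is
// alignUp(200, 128>>(9-7)) = alignUp(200, 32) = 224, i.e. the 160/192/224/256 group.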

static unsigned int getIndex (unsigned int size)
{
    return getIndexOrObjectSize</*indexRequest=*/true>(size);
}

static unsigned int getObjectSize (unsigned int size)
{
    return getIndexOrObjectSize</*indexRequest=*/false>(size);
}


void *BootStrapBlocks::allocate(MemoryPool *memPool, size_t size)
{
    FreeObject *result;

    MALLOC_ASSERT( size == sizeof(TLSData), ASSERT_TEXT );

    { // Lock with acquire
        MallocMutex::scoped_lock scoped_cs(bootStrapLock);

        if( bootStrapObjectList) {
            result = bootStrapObjectList;
            bootStrapObjectList = bootStrapObjectList->next;
        } else {
            if (!bootStrapBlock) {
                bootStrapBlock = memPool->getEmptyBlock(size);
                if (!bootStrapBlock) return NULL;
            }
            result = bootStrapBlock->bumpPtr;
            bootStrapBlock->bumpPtr = (FreeObject *)((uintptr_t)bootStrapBlock->bumpPtr - bootStrapBlock->objectSize);
            if ((uintptr_t)bootStrapBlock->bumpPtr < (uintptr_t)bootStrapBlock+sizeof(Block)) {
                bootStrapBlock->bumpPtr = NULL;
                bootStrapBlock->next = bootStrapBlockUsed;
                bootStrapBlockUsed = bootStrapBlock;
                bootStrapBlock = NULL;
            }
        }
    } // Unlock with release
    memset (result, 0, size);
    return (void*)result;
}

void BootStrapBlocks::free(void* ptr)
{
    MALLOC_ASSERT( ptr, ASSERT_TEXT );
    { // Lock with acquire
        MallocMutex::scoped_lock scoped_cs(bootStrapLock);
        ((FreeObject*)ptr)->next = bootStrapObjectList;
        bootStrapObjectList = (FreeObject*)ptr;
    } // Unlock with release
}

void BootStrapBlocks::reset()
{
    bootStrapBlock = bootStrapBlockUsed = NULL;
    bootStrapObjectList = NULL;
}

#if !(FREELIST_NONBLOCKING)
static MallocMutex publicFreeListLock; // lock for changes of publicFreeList
#endif

/********* End rough utility code **************/

/* LifoList assumes zero initialization so a vector of it can be created
 * by just allocating some space with no call to constructor.
 * On Linux, it seems to be necessary to avoid linking with C++ libraries.
 *
 * By usage convention there is no race on the initialization. */
LifoList::LifoList( ) : top(nullptr)
{
    // MallocMutex assumes zero initialization
    memset(&lock, 0, sizeof(MallocMutex));
}

void LifoList::push(Block *block)
{
    MallocMutex::scoped_lock scoped_cs(lock);
    block->next = top.load(std::memory_order_relaxed);
    top.store(block, std::memory_order_relaxed);
}

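// In pop() and grab() below, 'top' is first checked with a relaxed load so that
// an empty list is detected without taking the mutex; the value is re-read
// under the lock before being used, so the lock-free peek is only an
// optimization, not a correctness requirement.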
Block *LifoList::pop()
{
    Block* block = nullptr;
    if (top.load(std::memory_order_relaxed)) {
        MallocMutex::scoped_lock scoped_cs(lock);
        block = top.load(std::memory_order_relaxed);
        if (block) {
            top.store(block->next, std::memory_order_relaxed);
        }
    }
    return block;
}

Block *LifoList::grab()
{
    Block *block = nullptr;
    if (top.load(std::memory_order_relaxed)) {
        MallocMutex::scoped_lock scoped_cs(lock);
        block = top.load(std::memory_order_relaxed);
        top.store(nullptr, std::memory_order_relaxed);
    }
    return block;
}

/********* Thread and block related code *************/

template<bool poolDestroy> void AllLargeBlocksList::releaseAll(Backend *backend) {
    LargeMemoryBlock *next, *lmb = loHead;
    loHead = NULL;

    for (; lmb; lmb = next) {
        next = lmb->gNext;
        if (poolDestroy) {
            // as it's pool destruction, no need to return object to backend,
            // only remove backrefs, as they are global
            removeBackRef(lmb->backRefIdx);
        } else {
            // clean g(Next|Prev) to prevent removing lmb
            // from AllLargeBlocksList inside returnLargeObject
            lmb->gNext = lmb->gPrev = NULL;
            backend->returnLargeObject(lmb);
        }
    }
}

TLSData* MemoryPool::getTLS(bool create)
{
    TLSData* tls = extMemPool.tlsPointerKey.getThreadMallocTLS();
    if (create && !tls)
        tls = extMemPool.tlsPointerKey.createTLS(this, &extMemPool.backend);
    return tls;
}

/*
 * Return the bin for the given size.
 */
inline Bin* TLSData::getAllocationBin(size_t size)
{
    return bin + getIndex(size);
}

/* Return an empty uninitialized block in a non-blocking fashion. */
Block *MemoryPool::getEmptyBlock(size_t size)
{
    TLSData* tls = getTLS(/*create=*/false);
    // try to use the per-thread cache, if TLS is available
    FreeBlockPool::ResOfGet resOfGet = tls?
        tls->freeSlabBlocks.getBlock() : FreeBlockPool::ResOfGet(NULL, false);
    Block *result = resOfGet.block;

    if (!result) { // not found in the local cache, ask the backend for slabs
        int num = resOfGet.lastAccMiss? Backend::numOfSlabAllocOnMiss : 1;
        BackRefIdx backRefIdx[Backend::numOfSlabAllocOnMiss];

        result = static_cast<Block*>(extMemPool.backend.getSlabBlock(num));
        if (!result) return NULL;

        if (!extMemPool.userPool())
            for (int i=0; i<num; i++) {
                backRefIdx[i] = BackRefIdx::newBackRef(/*largeObj=*/false);
                if (backRefIdx[i].isInvalid()) {
                    // roll back resource allocation
                    for (int j=0; j<i; j++)
                        removeBackRef(backRefIdx[j]);
                    Block *b = result;
                    for (int j=0; j<num; b=(Block*)((uintptr_t)b+slabSize), j++)
                        extMemPool.backend.putSlabBlock(b);
                    return NULL;
                }
            }
        // resources were allocated, register blocks
        Block *b = result;
        for (int i=0; i<num; b=(Block*)((uintptr_t)b+slabSize), i++) {
            // slab block in user's pool must have invalid backRefIdx
            if (extMemPool.userPool()) {
                new (&b->backRefIdx) BackRefIdx();
            } else {
                setBackRef(backRefIdx[i], b);
                b->backRefIdx = backRefIdx[i];
            }
            b->tlsPtr.store(tls, std::memory_order_relaxed);
            b->poolPtr = this;
            // all but the first one go to the per-thread pool
            if (i > 0) {
                MALLOC_ASSERT(tls, ASSERT_TEXT);
                tls->freeSlabBlocks.returnBlock(b);
            }
        }
    }
    MALLOC_ASSERT(result, ASSERT_TEXT);
    result->initEmptyBlock(tls, size);
    STAT_increment(getThreadId(), getIndex(result->objectSize), allocBlockNew);
    return result;
}

void MemoryPool::returnEmptyBlock(Block *block, bool poolTheBlock)
{
    block->reset();
    if (poolTheBlock) {
        getTLS(/*create=*/false)->freeSlabBlocks.returnBlock(block);
    } else {
        // slab blocks in user's pools do not have valid backRefIdx
        if (!extMemPool.userPool())
            removeBackRef(*(block->getBackRefIdx()));
        extMemPool.backend.putSlabBlock(block);
    }
}

bool ExtMemoryPool::init(intptr_t poolId, rawAllocType rawAlloc,
                         rawFreeType rawFree, size_t granularity,
                         bool keepAllMemory, bool fixedPool)
{
    this->poolId = poolId;
    this->rawAlloc = rawAlloc;
    this->rawFree = rawFree;
    this->granularity = granularity;
    this->keepAllMemory = keepAllMemory;
    this->fixedPool = fixedPool;
    this->delayRegsReleasing = false;
    if (!initTLS())
        return false;
    loc.init(this);
    backend.init(this);
    MALLOC_ASSERT(isPoolValid(), NULL);
    return true;
}

bool ExtMemoryPool::initTLS() { return tlsPointerKey.init(); }

bool MemoryPool::init(intptr_t poolId, const MemPoolPolicy *policy)
{
    if (!extMemPool.init(poolId, policy->pAlloc, policy->pFree,
               policy->granularity? policy->granularity : defaultGranularity,
               policy->keepAllMemory, policy->fixedPool))
        return false;
    {
        MallocMutex::scoped_lock lock(memPoolListLock);
        next = defaultMemPool->next;
        defaultMemPool->next = this;
        prev = defaultMemPool;
        if (next)
            next->prev = this;
    }
    return true;
}

bool MemoryPool::reset()
{
    MALLOC_ASSERT(extMemPool.userPool(), "No reset for the system pool.");
    // memory is not released during pool reset
    // TODO: mark regions to release unused on next reset()
    extMemPool.delayRegionsReleasing(true);

    bootStrapBlocks.reset();
    extMemPool.lmbList.releaseAll</*poolDestroy=*/false>(&extMemPool.backend);
    if (!extMemPool.reset())
        return false;

    if (!extMemPool.initTLS())
        return false;
    extMemPool.delayRegionsReleasing(false);
    return true;
}

bool MemoryPool::destroy()
{
#if __TBB_MALLOC_LOCACHE_STAT
    extMemPool.loc.reportStat(stdout);
#endif
#if __TBB_MALLOC_BACKEND_STAT
    extMemPool.backend.reportStat(stdout);
#endif
    {
        MallocMutex::scoped_lock lock(memPoolListLock);
        // remove itself from the global pool list
        if (prev)
            prev->next = next;
        if (next)
            next->prev = prev;
    }
    // slab blocks in non-default pool do not have backreferences,
    // only large objects do
    if (extMemPool.userPool())
        extMemPool.lmbList.releaseAll</*poolDestroy=*/true>(&extMemPool.backend);
    else {
        // only one non-userPool() is supported now
        MALLOC_ASSERT(this==defaultMemPool, NULL);
        // Here and below in extMemPool.destroy(), the initial state is not restored
        // for a user pool, because it is just about to be released. But the system
        // pool is restored, as we do not want to zero it on a subsequent reload.
        bootStrapBlocks.reset();
        extMemPool.orphanedBlocks.reset();
    }
    return extMemPool.destroy();
}

void MemoryPool::onThreadShutdown(TLSData *tlsData)
{
    if (tlsData) { // might be called for "empty" TLS
        tlsData->release();
        bootStrapBlocks.free(tlsData);
        clearTLS();
    }
}

#if MALLOC_DEBUG
void Bin::verifyTLSBin (size_t size) const
{
    /* The debug version verifies the TLSBin as needed */
    uint32_t objSize = getObjectSize(size);

    if (activeBlk) {
        MALLOC_ASSERT( activeBlk->isOwnedByCurrentThread(), ASSERT_TEXT );
        MALLOC_ASSERT( activeBlk->objectSize == objSize, ASSERT_TEXT );
#if MALLOC_DEBUG>1
        for (Block* temp = activeBlk->next; temp; temp=temp->next) {
            MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
            MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
            MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
            MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
            if (temp->next) {
                MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
            }
        }
        for (Block* temp = activeBlk->previous; temp; temp=temp->previous) {
            MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
            MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
            MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
            MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
            if (temp->previous) {
                MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
            }
        }
#endif /* MALLOC_DEBUG>1 */
    }
}
#else /* MALLOC_DEBUG */
inline void Bin::verifyTLSBin (size_t) const { }
#endif /* MALLOC_DEBUG */

/*
 * Add a block to the start of this tls bin list.
 */
void Bin::pushTLSBin(Block* block)
{
    /* The objectSize is read from the block rather than passed as a parameter
       because the function is applied to partially filled blocks as well */
    unsigned int size = block->objectSize;

    MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
    MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
    MALLOC_ASSERT( block->next == NULL, ASSERT_TEXT );
    MALLOC_ASSERT( block->previous == NULL, ASSERT_TEXT );

    MALLOC_ASSERT( this, ASSERT_TEXT );
    verifyTLSBin(size);

    block->next = activeBlk;
    if( activeBlk ) {
        block->previous = activeBlk->previous;
        activeBlk->previous = block;
        if( block->previous )
            block->previous->next = block;
    } else {
        activeBlk = block;
    }

    verifyTLSBin(size);
}

/*
 * Take a block out of its tls bin (e.g. before removal).
 */
void Bin::outofTLSBin(Block* block)
{
    unsigned int size = block->objectSize;

    MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
    MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );

    MALLOC_ASSERT( this, ASSERT_TEXT );
    verifyTLSBin(size);

    if (block == activeBlk) {
        activeBlk = block->previous? block->previous : block->next;
    }
    /* Unlink the block */
    if (block->previous) {
        MALLOC_ASSERT( block->previous->next == block, ASSERT_TEXT );
        block->previous->next = block->next;
    }
    if (block->next) {
        MALLOC_ASSERT( block->next->previous == block, ASSERT_TEXT );
        block->next->previous = block->previous;
    }
    block->next = NULL;
    block->previous = NULL;

    verifyTLSBin(size);
}

Block* Bin::getPrivatizedFreeListBlock()
{
    Block* block;
    MALLOC_ASSERT( this, ASSERT_TEXT );
    // if this method is called, active block usage must be unsuccessful
    MALLOC_ASSERT( !activeBlk && !mailbox.load(std::memory_order_relaxed) || activeBlk && activeBlk->isFull, ASSERT_TEXT );

    // the counter should be changed    STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
    if (!mailbox.load(std::memory_order_acquire)) // hotpath is empty mailbox
        return NULL;
    else { // mailbox is not empty, take lock and inspect it
        MallocMutex::scoped_lock scoped_cs(mailLock);
        block = mailbox.load(std::memory_order_relaxed);
        if( block ) {
            MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
            MALLOC_ASSERT( !isNotForUse(block->nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
            mailbox.store(block->nextPrivatizable.load(std::memory_order_relaxed), std::memory_order_relaxed);
            block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
        }
    }
    if( block ) {
        MALLOC_ASSERT( isSolidPtr(block->publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
        block->privatizePublicFreeList();
        block->adjustPositionInBin(this);
    }
    return block;
}

void Bin::addPublicFreeListBlock(Block* block)
{
    MallocMutex::scoped_lock scoped_cs(mailLock);
    block->nextPrivatizable.store(mailbox.load(std::memory_order_relaxed), std::memory_order_relaxed);
    mailbox.store(block, std::memory_order_relaxed);
}
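// The mailbox is a LIFO of blocks that have received frees from other threads:
// a foreign thread that turns a block's publicFreeList from empty to non-empty
// links the block into the owner's bin mailbox (see freePublicObject below),
// and the owner later privatizes those objects via getPrivatizedFreeListBlock
// or cleanPublicFreeLists; nextPrivatizable doubles as the mailbox link.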

// Process publicly freed objects in all blocks and return empty blocks
// to the backend in order to reduce overall footprint.
bool Bin::cleanPublicFreeLists()
{
    Block* block;
    if (!mailbox.load(std::memory_order_acquire))
        return false;
    else {
        // Grab all the blocks in the mailbox
        MallocMutex::scoped_lock scoped_cs(mailLock);
        block = mailbox.load(std::memory_order_relaxed);
        mailbox.store(NULL, std::memory_order_relaxed);
    }
    bool released = false;
    while (block) {
        MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
        Block* tmp = block->nextPrivatizable.load(std::memory_order_relaxed);
        block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
        block->privatizePublicFreeList();
        if (block->empty()) {
            processEmptyBlock(block, /*poolTheBlock=*/false);
            released = true;
        } else
            block->adjustPositionInBin(this);
        block = tmp;
    }
    return released;
}

bool Block::adjustFullness()
{
    if (bumpPtr) {
        /* If we are still using a bump ptr for this block it is empty enough to use. */
        STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
        isFull = false;
    } else {
        const float threshold = (slabSize - sizeof(Block)) * (1 - emptyEnoughRatio);
        /* allocatedCount shows how many objects in the block are in use; however it still
         * counts objects freed by other threads, so a prior call to privatizePublicFreeList()
         * is recommended */
        isFull = (allocatedCount*objectSize > threshold) ? true : false;
#if COLLECT_STATISTICS
        if (isFull)
            STAT_increment(getThreadId(), getIndex(objectSize), examineNotEmpty);
        else
            STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
#endif
    }
    return isFull;
}
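// Worked example: with a 16KB slab, a 128-byte Block header, and
// emptyEnoughRatio = 1/4, threshold = (16384-128)*0.75 = 12192 bytes; the block
// stays marked full while more than 75% of its payload is in use, and becomes
// "empty enough" again once at least a quarter of the space is free.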

// This method resides in class Block, and not in class Bin, in order to avoid
// calling getAllocationBin on a reasonably hot path in Block::freeOwnObject
void Block::adjustPositionInBin(Bin* bin/*=NULL*/)
{
    // If the block was full, but became empty enough to use,
    // move it to the front of the list
    if (isFull && !adjustFullness()) {
        if (!bin)
            bin = tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize);
        bin->moveBlockToFront(this);
    }
}

/* Restore the bump pointer for an empty block that is planned to be used */
void Block::restoreBumpPtr()
{
    MALLOC_ASSERT( allocatedCount == 0, ASSERT_TEXT );
    MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
    STAT_increment(getThreadId(), getIndex(objectSize), freeRestoreBumpPtr);
    bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
    freeList = NULL;
    isFull = false;
}

void Block::freeOwnObject(void *object)
{
    tlsPtr.load(std::memory_order_relaxed)->markUsed();
    allocatedCount--;
    MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
#if COLLECT_STATISTICS
    // Note that getAllocationBin is not called on the hottest path with statistics off.
    if (tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->getActiveBlock() != this)
        STAT_increment(getThreadId(), getIndex(objectSize), freeToInactiveBlock);
    else
        STAT_increment(getThreadId(), getIndex(objectSize), freeToActiveBlock);
#endif
    if (empty()) {
        // If the last object of a slab is freed, the slab cannot be marked full
        MALLOC_ASSERT(!isFull, ASSERT_TEXT);
        tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->processEmptyBlock(this, /*poolTheBlock=*/true);
    } else { // hot path
        FreeObject *objectToFree = findObjectToFree(object);
        objectToFree->next = freeList;
        freeList = objectToFree;
        adjustPositionInBin();
    }
}

void Block::freePublicObject (FreeObject *objectToFree)
{
    FreeObject* localPublicFreeList{};

    MALLOC_ITT_SYNC_RELEASING(&publicFreeList);
#if FREELIST_NONBLOCKING
    // TBB_REVAMP_TODO: make it non atomic in non-blocking scenario
    localPublicFreeList = publicFreeList.load(std::memory_order_relaxed);
    do {
        objectToFree->next = localPublicFreeList;
        // no backoff necessary because trying to make change, not waiting for a change
    } while( !publicFreeList.compare_exchange_strong(localPublicFreeList, objectToFree) );
#else
    STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
    {
        MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
        localPublicFreeList = objectToFree->next = publicFreeList;
        publicFreeList = objectToFree;
    }
#endif

    if( localPublicFreeList==NULL ) {
        // if the block is abandoned, its nextPrivatizable pointer should be UNUSABLE
        // otherwise, it should point to the bin the block belongs to.
        // reading nextPrivatizable is thread-safe below, because:
        // 1) the executing thread atomically got publicFreeList==NULL and changed it to non-NULL;
        // 2) only owning thread can change it back to NULL,
        // 3) but it can not be done until the block is put to the mailbox
        // So the executing thread is now the only one that can change nextPrivatizable
        Block* next = nextPrivatizable.load(std::memory_order_acquire);
        if( !isNotForUse(next) ) {
            MALLOC_ASSERT( next!=nullptr, ASSERT_TEXT );
            Bin* theBin = (Bin*) next;
#if MALLOC_DEBUG && TBB_REVAMP_TODO
            // FIXME: The thread that returns the block is not the block's owner.
            // The below assertion compares 'theBin' against the caller's local bin, thus, it always fails.
            // Need to find a way to get the correct remote bin for comparison.
            { // check that nextPrivatizable points to the bin the block belongs to
                uint32_t index = getIndex( objectSize );
                TLSData* tls = getThreadMallocTLS();
                MALLOC_ASSERT( theBin==tls->bin+index, ASSERT_TEXT );
            }
#endif // MALLOC_DEBUG
            theBin->addPublicFreeListBlock(this);
        }
    }
    STAT_increment(getThreadId(), ThreadCommonCounters, freeToOtherThread);
    STAT_increment(ownerTid.load(std::memory_order_relaxed), getIndex(objectSize), freeByOtherThread);
}

// Make objects freed by other threads available for use again
void Block::privatizePublicFreeList( bool reset )
{
    FreeObject *localPublicFreeList;
    // If reset is false, publicFreeList should not be zeroed but set to UNUSABLE
    // to properly synchronize with other threads freeing objects to this slab.
    const intptr_t endMarker = reset ? 0 : UNUSABLE;

    // Only the owner thread may reset the pointer to NULL
    MALLOC_ASSERT( isOwnedByCurrentThread() || !reset, ASSERT_TEXT );
#if FREELIST_NONBLOCKING
    localPublicFreeList = publicFreeList.exchange((FreeObject*)endMarker);
#else
    STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
    {
        MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
        localPublicFreeList = publicFreeList;
        publicFreeList = endMarker;
    }
#endif
    MALLOC_ITT_SYNC_ACQUIRED(&publicFreeList);
    MALLOC_ASSERT( !(reset && isNotForUse(publicFreeList)), ASSERT_TEXT );

    // publicFreeList must have been UNUSABLE or valid, but not NULL
    MALLOC_ASSERT( localPublicFreeList!=NULL, ASSERT_TEXT );
    if( isSolidPtr(localPublicFreeList) ) {
        MALLOC_ASSERT( allocatedCount <= (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
        /* other threads did not change the counter freeing our blocks */
        allocatedCount--;
        FreeObject *temp = localPublicFreeList;
        while( isSolidPtr(temp->next) ){ // the list will end with either NULL or UNUSABLE
            temp = temp->next;
            allocatedCount--;
            MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
        }
        /* merge with local freeList */
        temp->next = freeList;
        freeList = localPublicFreeList;
        STAT_increment(getThreadId(), getIndex(objectSize), allocPrivatized);
    }
}

void Block::privatizeOrphaned(TLSData *tls, unsigned index)
{
    Bin* bin = tls->bin + index;
    STAT_increment(getThreadId(), index, allocBlockPublic);
    next = NULL;
    previous = NULL;
    MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) != NULL, ASSERT_TEXT );
    /* There is not a race here since no other thread owns this block */
    markOwned(tls);
    // It is safe to change nextPrivatizable, as publicFreeList is not null
    MALLOC_ASSERT( isNotForUse(nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
    nextPrivatizable.store((Block*)bin, std::memory_order_relaxed);
    // the next call is required to change publicFreeList to 0
    privatizePublicFreeList();
    if( empty() ) {
        restoreBumpPtr();
    } else {
        adjustFullness(); // check the block fullness and set isFull
    }
    MALLOC_ASSERT( !isNotForUse(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
}


bool Block::readyToShare()
{
    FreeObject* oldVal = NULL;
#if FREELIST_NONBLOCKING
    publicFreeList.compare_exchange_strong(oldVal, (FreeObject*)UNUSABLE);
#else
    STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
    {
        MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
        if ( (oldVal=publicFreeList)==NULL )
            (intptr_t&)(publicFreeList) = UNUSABLE;
    }
#endif
    return oldVal==NULL;
}

void Block::shareOrphaned(intptr_t binTag, unsigned index)
{
    MALLOC_ASSERT( binTag, ASSERT_TEXT );
    // unreferenced formal parameter warning
    tbb::detail::suppress_unused_warning(index);
    STAT_increment(getThreadId(), index, freeBlockPublic);
    markOrphaned();
    if ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
        // First check passed: the block is not in the mailbox yet.
        // Need to set publicFreeList to non-zero, so other threads
        // will not change nextPrivatizable and it can be zeroed.
        if ( !readyToShare() ) {
            // another thread freed an object; we need to wait until it finishes.
            // There is no need for exponential backoff, as the wait here is not for a lock;
            // but we need to yield, so the thread we are waiting for has a chance to run.
            // TODO: add a pause to also be friendly to hyperthreads
            int count = 256;
            while ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
                if (--count==0) {
                    do_yield();
                    count = 256;
                }
            }
        }
    }
    MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) !=NULL, ASSERT_TEXT );
    // now it is safe to change our data
    previous = NULL;
    // it is the caller's responsibility to ensure that the list of blocks
    // formed by nextPrivatizable pointers is kept consistent if required.
    // if only called from thread shutdown code, it does not matter.
    nextPrivatizable.store((Block*)UNUSABLE, std::memory_order_relaxed);
}
1561
void Block::cleanBlockHeader()
{
    next = nullptr;
    previous = nullptr;
    freeList = nullptr;
    allocatedCount = 0;
    isFull = false;
    tlsPtr.store(nullptr, std::memory_order_relaxed);

    publicFreeList.store(nullptr, std::memory_order_relaxed);
}
1573
void Block::initEmptyBlock(TLSData *tls, size_t size)
{
    // Having getIndex and getObjectSize called next to each other
    // allows better compiler optimization as they basically share the code.
    unsigned int index = getIndex(size);
    unsigned int objSz = getObjectSize(size);

    cleanBlockHeader();
    objectSize = objSz;
    markOwned(tls);
    // the bump pointer should be prepared for the first allocation - thus move it down by objectSize
    bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);

    // each block should have the address where the head of the list of "privatizable" blocks is kept
    // the only exception is the bootstrap block, which is initialized while TLS is still NULL
    nextPrivatizable.store( tls? (Block*)(tls->bin + index) : nullptr, std::memory_order_relaxed);
    TRACEF(( "[ScalableMalloc trace] Empty block %p is initialized, owner is %ld, objectSize is %d, bumpPtr is %p\n",
             this, tlsPtr.load(std::memory_order_relaxed) ? getThreadId() : -1, objectSize, bumpPtr ));
}
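/*
 * Illustrative slab layout after initEmptyBlock (numbers are an example only,
 * assuming a 16 KB slab and objectSize == 64):
 *
 *   this .. this+sizeof(Block)     header (a multiple of the cache line size)
 *   ...                            space handed out as the bump pointer moves down
 *   this+16K-64 .. this+16K        first object, where bumpPtr starts
 *
 * The bump pointer grows downward in allocateFromBumpPtr() until it would
 * cross this+sizeof(Block), at which point it becomes NULL.
 */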
1593
Block *OrphanedBlocks::get(TLSData *tls, unsigned int size)
{
    // TODO: try to use index from getAllocationBin
    unsigned int index = getIndex(size);
    Block *block = bins[index].pop();
    if (block) {
        MALLOC_ITT_SYNC_ACQUIRED(bins+index);
        block->privatizeOrphaned(tls, index);
    }
    return block;
}

void OrphanedBlocks::put(intptr_t binTag, Block *block)
{
    unsigned int index = getIndex(block->getSize());
    block->shareOrphaned(binTag, index);
    MALLOC_ITT_SYNC_RELEASING(bins+index);
    bins[index].push(block);
}

void OrphanedBlocks::reset()
{
    for (uint32_t i=0; i<numBlockBinLimit; i++)
        new (bins+i) LifoList();
}
1619
bool OrphanedBlocks::cleanup(Backend* backend)
{
    bool released = false;
    for (uint32_t i=0; i<numBlockBinLimit; i++) {
        Block* block = bins[i].grab();
        MALLOC_ITT_SYNC_ACQUIRED(bins+i);
        while (block) {
            Block* next = block->next;
            block->privatizePublicFreeList( /*reset=*/false ); // do not set publicFreeList to NULL
            if (block->empty()) {
                block->reset();
                // slab blocks in user's pools do not have a valid backRefIdx
                if (!backend->inUserPool())
                    removeBackRef(*(block->getBackRefIdx()));
                backend->putSlabBlock(block);
                released = true;
            } else {
                MALLOC_ITT_SYNC_RELEASING(bins+i);
                bins[i].push(block);
            }
            block = next;
        }
    }
    return released;
}
1645
FreeBlockPool::ResOfGet FreeBlockPool::getBlock()
{
    Block *b = head.exchange(NULL);

    if (b) {
        size--;
        Block *newHead = b->next;
        lastAccessMiss = false;
        head.store(newHead, std::memory_order_release);
    } else {
        lastAccessMiss = true;
    }
    return ResOfGet(b, lastAccessMiss);
}

void FreeBlockPool::returnBlock(Block *block)
{
    MALLOC_ASSERT( size <= POOL_HIGH_MARK, ASSERT_TEXT );
    Block *localHead = head.exchange(NULL);

    if (!localHead) {
        size = 0; // head was stolen by externalCleanup; correct size accordingly
    } else if (size == POOL_HIGH_MARK) {
        // release cold blocks and add the hot one:
        // keep POOL_LOW_MARK-1 blocks and add the new block to the head
        Block *headToFree = localHead, *helper;
        for (int i=0; i<POOL_LOW_MARK-2; i++)
            headToFree = headToFree->next;
        Block *last = headToFree;
        headToFree = headToFree->next;
        last->next = NULL;
        size = POOL_LOW_MARK-1;
        for (Block *currBl = headToFree; currBl; currBl = helper) {
            helper = currBl->next;
            // slab blocks in user's pools do not have a valid backRefIdx
            if (!backend->inUserPool())
                removeBackRef(currBl->backRefIdx);
            backend->putSlabBlock(currBl);
        }
    }
    size++;
    block->next = localHead;
    head.store(block, std::memory_order_release);
}
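/*
 * Hysteresis sketch for returnBlock (the mark values here are illustrative,
 * not the actual POOL_LOW_MARK/POOL_HIGH_MARK constants): with HIGH=16 and
 * LOW=8, returning a block to a full pool keeps the 7 (LOW-1) most recently
 * used blocks, frees the 9 coldest ones to the backend, then pushes the
 * returned (hot) block, so the pool holds 8 blocks and can absorb 8 more
 * returns before trimming again.
 */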
1690
bool FreeBlockPool::externalCleanup()
{
    Block *helper;
    bool released = false;

    for (Block *currBl=head.exchange(NULL); currBl; currBl=helper) {
        helper = currBl->next;
        // slab blocks in user's pools do not have a valid backRefIdx
        if (!backend->inUserPool())
            removeBackRef(currBl->backRefIdx);
        backend->putSlabBlock(currBl);
        released = true;
    }
    return released;
}
1706
/* Prepare the block for returning to FreeBlockPool */
void Block::reset()
{
    // it is the caller's responsibility to ensure no data is lost before calling this
    MALLOC_ASSERT( allocatedCount==0, ASSERT_TEXT );
    MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
    if (!isStartupAllocObject())
        STAT_increment(getThreadId(), getIndex(objectSize), freeBlockBack);

    cleanBlockHeader();

    nextPrivatizable.store(nullptr, std::memory_order_relaxed);

    objectSize = 0;
    // for an empty block, the bump pointer should point right after the end of the block
    bumpPtr = (FreeObject *)((uintptr_t)this + slabSize);
}
1724
inline void Bin::setActiveBlock (Block *block)
{
//    MALLOC_ASSERT( bin, ASSERT_TEXT );
    MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
    // it is the caller's responsibility to keep the bin consistent (i.e. to ensure this block is in the bin's list)
    activeBlk = block;
}

inline Block* Bin::setPreviousBlockActive()
{
    MALLOC_ASSERT( activeBlk, ASSERT_TEXT );
    Block* temp = activeBlk->previous;
    if( temp ) {
        MALLOC_ASSERT( !(temp->isFull), ASSERT_TEXT );
        activeBlk = temp;
    }
    return temp;
}

inline bool Block::isOwnedByCurrentThread() const {
    return tlsPtr.load(std::memory_order_relaxed) && ownerTid.isCurrentThreadId();
}
1747
FreeObject *Block::findObjectToFree(const void *object) const
{
    FreeObject *objectToFree;
    // Due to aligned allocations, a pointer passed to scalable_free
    // might differ from the address of the internally allocated object.
    // Small objects however should always be fine.
    if (objectSize <= maxSegregatedObjectSize)
        objectToFree = (FreeObject*)object;
    // "Fitting size" allocations are suspicious if aligned higher than naturally
    else {
        if ( ! isAligned(object,2*fittingAlignment) )
            // TODO: the above check is questionable - it gives false negatives in ~50% of cases,
            //       so it might even be slower on average than unconditional use of findAllocatedObject.
            // here it should be a "real" object
            objectToFree = (FreeObject*)object;
        else
            // here object can be an aligned address, so apply additional checks
            objectToFree = findAllocatedObject(object);
        MALLOC_ASSERT( isAligned(objectToFree,fittingAlignment), ASSERT_TEXT );
    }
    MALLOC_ASSERT( isProperlyPlaced(objectToFree), ASSERT_TEXT );

    return objectToFree;
}
1772
void TLSData::release()
{
    memPool->extMemPool.allLocalCaches.unregisterThread(this);
    externalCleanup(/*cleanOnlyUnused=*/false, /*cleanBins=*/false);

    for (unsigned index = 0; index < numBlockBins; index++) {
        Block *activeBlk = bin[index].getActiveBlock();
        if (!activeBlk)
            continue;
        Block *threadlessBlock = activeBlk->previous;
        bool syncOnMailbox = false;
        while (threadlessBlock) {
            Block *threadBlock = threadlessBlock->previous;
            if (threadlessBlock->empty()) {
                /* the thread is being destroyed, so do not use its block pool */
                memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
            } else {
                memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
                syncOnMailbox = true;
            }
            threadlessBlock = threadBlock;
        }
        threadlessBlock = activeBlk;
        while (threadlessBlock) {
            Block *threadBlock = threadlessBlock->next;
            if (threadlessBlock->empty()) {
                /* the thread is being destroyed, so do not use its block pool */
                memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
            } else {
                memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
                syncOnMailbox = true;
            }
            threadlessBlock = threadBlock;
        }
        bin[index].resetActiveBlock();

        if (syncOnMailbox) {
            // Although we synchronized on nextPrivatizable inside a block, we still need to
            // synchronize on the bin lifetime, because the thread releasing an object into the public
            // free list is touching the bin (mailbox and mailLock)
            MallocMutex::scoped_lock scoped_cs(bin[index].mailLock);
        }
    }
}
1817
1818
#if MALLOC_CHECK_RECURSION
// TODO: Use a dedicated heap for this

/*
 * This is a special kind of allocation that can be used when malloc is
 * not available (either during startup or when malloc was already called and
 * we are, say, inside pthread_setspecific's call).
 * A block can contain objects of different sizes.
 * Allocation is performed by moving the bump pointer and incrementing the object counter;
 * release is done via the counter of objects allocated in the block,
 * or by moving the bump pointer if the released object is at the boundary.
 * TODO: make the bump pointer grow backward, in the same direction as all the others.
 */

class StartupBlock : public Block {
    size_t availableSize() const {
        return slabSize - ((uintptr_t)bumpPtr - (uintptr_t)this);
    }
    static StartupBlock *getBlock();
public:
    static FreeObject *allocate(size_t size);
    static size_t msize(void *ptr) { return *((size_t*)ptr - 1); }
    void free(void *ptr);
};

static MallocMutex startupMallocLock;
static StartupBlock *firstStartupBlock;
1846
StartupBlock *StartupBlock::getBlock()
{
    BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/false);
    if (backRefIdx.isInvalid()) return NULL;

    StartupBlock *block = static_cast<StartupBlock*>(
        defaultMemPool->extMemPool.backend.getSlabBlock(1));
    if (!block) return NULL;

    block->cleanBlockHeader();
    setBackRef(backRefIdx, block);
    block->backRefIdx = backRefIdx;
    // use startupAllocObjSizeMark to mark objects allocated from a startup block
    block->objectSize = startupAllocObjSizeMark;
    block->bumpPtr = (FreeObject *)((uintptr_t)block + sizeof(StartupBlock));
    return block;
}
1864
FreeObject *StartupBlock::allocate(size_t size)
{
    FreeObject *result;
    StartupBlock *newBlock = NULL;

    /* Objects must be aligned on their natural bounds,
       and objects bigger than a word on a word boundary. */
    size = alignUp(size, sizeof(size_t));
    // We need the size of an object to implement msize.
    size_t reqSize = size + sizeof(size_t);
    {
        MallocMutex::scoped_lock scoped_cs(startupMallocLock);
        // Re-check whether we need a new block (conditions might have changed)
        if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) {
            if (!newBlock) {
                newBlock = StartupBlock::getBlock();
                if (!newBlock) return NULL;
            }
            newBlock->next = (Block*)firstStartupBlock;
            if (firstStartupBlock)
                firstStartupBlock->previous = (Block*)newBlock;
            firstStartupBlock = newBlock;
        }
        result = firstStartupBlock->bumpPtr;
        firstStartupBlock->allocatedCount++;
        firstStartupBlock->bumpPtr =
            (FreeObject *)((uintptr_t)firstStartupBlock->bumpPtr + reqSize);
    }

    // keep the object size at the negative offset
    *((size_t*)result) = size;
    return (FreeObject*)((size_t*)result+1);
}
1899
void StartupBlock::free(void *ptr)
{
    Block* blockToRelease = NULL;
    {
        MallocMutex::scoped_lock scoped_cs(startupMallocLock);

        MALLOC_ASSERT(firstStartupBlock, ASSERT_TEXT);
        MALLOC_ASSERT(startupAllocObjSizeMark==objectSize
                      && allocatedCount>0, ASSERT_TEXT);
        MALLOC_ASSERT((uintptr_t)ptr>=(uintptr_t)this+sizeof(StartupBlock)
                      && (uintptr_t)ptr+StartupBlock::msize(ptr)<=(uintptr_t)this+slabSize,
                      ASSERT_TEXT);
        if (0 == --allocatedCount) {
            if (this == firstStartupBlock)
                firstStartupBlock = (StartupBlock*)firstStartupBlock->next;
            if (previous)
                previous->next = next;
            if (next)
                next->previous = previous;
            blockToRelease = this;
        } else if ((uintptr_t)ptr + StartupBlock::msize(ptr) == (uintptr_t)bumpPtr) {
            // the last object in the block was released
            FreeObject *newBump = (FreeObject*)((size_t*)ptr - 1);
            MALLOC_ASSERT((uintptr_t)newBump>(uintptr_t)this+sizeof(StartupBlock),
                          ASSERT_TEXT);
            bumpPtr = newBump;
        }
    }
    if (blockToRelease) {
        blockToRelease->previous = blockToRelease->next = NULL;
        defaultMemPool->returnEmptyBlock(blockToRelease, /*poolTheBlock=*/false);
    }
}
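/*
 * Startup-block object layout implied by allocate()/msize() above:
 *
 *   [ size_t size ][ object of 'size' bytes ] ...
 *                  ^-- pointer returned to the caller
 *
 * For example (illustrative), a request for 10 bytes on a 64-bit system is
 * aligned up to 16, 8 bytes are prepended to hold the size, the bump pointer
 * advances by 24, and msize() reads the 16 back from ptr-8.
 */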
1933
#endif /* MALLOC_CHECK_RECURSION */

/********* End thread related code *************/
1937
/********* Library initialization *************/

//! Value indicating the state of initialization.
/* 0 = initialization not started.
 * 1 = initialization started but not finished.
 * 2 = initialization finished.
 * In theory, we only need values 0 and 2. But value 1 is nonetheless
 * useful for detecting errors in the double-check pattern.
 */
static std::atomic<intptr_t> mallocInitialized{0}; // implicitly initialized to 0
static MallocMutex initMutex;

/** The leading "\0" is here so that applying "strings" to the binary
    delivers a clean result. */
static char VersionString[] = "\0" TBBMALLOC_VERSION_STRINGS;
1953
#if USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED

/* Decrease the race interval between dynamic library unloading and the pthread key
   destructor. Protect only Pthreads with supported unloading. */
class ShutdownSync {
/* flag is the number of threads in the pthread key dtor body
   (i.e., between threadDtorStart() and threadDtorDone()),
   or the signal to skip the dtor, if flag < 0 */
    std::atomic<intptr_t> flag;
    static const intptr_t skipDtor = INTPTR_MIN/2;
public:
    void init() { flag.store(0, std::memory_order_release); }
/* Assume that no more than 2*abs(skipDtor) threads call threadDtorStart()
   simultaneously, so flag never becomes negative merely because of that. */
    bool threadDtorStart() {
        if (flag.load(std::memory_order_acquire) < 0)
            return false;
        if (++flag <= 0) { // note that the new value is returned
            flag.fetch_sub(1); // flag was spoiled by us, restore it
            return false;
        }
        return true;
    }
    void threadDtorDone() {
        flag.fetch_sub(1);
    }
    void processExit() {
        if (flag.fetch_add(skipDtor) != 0) {
            SpinWaitUntilEq(flag, skipDtor);
        }
    }
};
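/*
 * Flag arithmetic sketch (informal): skipDtor is a huge negative constant.
 * processExit() adds it, so any later threadDtorStart() observes flag < 0 and
 * skips the dtor body. If N threads were inside the dtor at that moment,
 * fetch_add returns N != 0 and processExit() spins until each of them calls
 * threadDtorDone(), bringing flag back to exactly skipDtor.
 */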
1986
#else

class ShutdownSync {
public:
    void init() { }
    bool threadDtorStart() { return true; }
    void threadDtorDone() { }
    void processExit() { }
};

#endif // USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED

static ShutdownSync shutdownSync;
2000
inline bool isMallocInitialized() {
    // The load must have an acquire fence; otherwise a thread taking the "initialized" path
    // might perform textually later loads *before* mallocInitialized becomes 2.
    return 2 == mallocInitialized.load(std::memory_order_acquire);
}

/* Caller is responsible for ensuring this routine is called exactly once. */
extern "C" void MallocInitializeITT() {
#if __TBB_USE_ITT_NOTIFY
    if (!usedBySrcIncluded)
        tbb::detail::r1::__TBB_load_ittnotify();
#endif
}

void MemoryPool::initDefaultPool() {
    hugePages.init();
}
2018
/*
 * Allocator initialization routine;
 * it is called lazily on the very first scalable_malloc call.
 */
static bool initMemoryManager()
{
    TRACEF(( "[ScalableMalloc trace] sizeof(Block) is %d (expected 128); sizeof(uintptr_t) is %d\n",
             sizeof(Block), sizeof(uintptr_t) ));
    MALLOC_ASSERT( 2*blockHeaderAlignment == sizeof(Block), ASSERT_TEXT );
    MALLOC_ASSERT( sizeof(FreeObject) == sizeof(void*), ASSERT_TEXT );
    MALLOC_ASSERT( isAligned(defaultMemPool, sizeof(intptr_t)),
                   "Memory pool must be void*-aligned for atomics to work over aligned arguments.");

#if USE_WINTHREAD
    const size_t granularity = 64*1024; // granularity of VirtualAlloc
#else
    // POSIX.1-2001-compliant way to get the page size
    const size_t granularity = sysconf(_SC_PAGESIZE);
#endif
    if (!defaultMemPool) {
        // Do not rely on static constructors; do the assignment in case
        // the library's static section is not yet initialized at this call.
        defaultMemPool = (MemoryPool*)defaultMemPool_space;
    }
    bool initOk = defaultMemPool->
        extMemPool.init(0, NULL, NULL, granularity,
                        /*keepAllMemory=*/false, /*fixedPool=*/false);
    // TODO: make extMemPool.init() not allocate memory
    if (!initOk || !initBackRefMaster(&defaultMemPool->extMemPool.backend) || !ThreadId::init())
        return false;
    MemoryPool::initDefaultPool();
    // init() is required if initMemoryManager() is called
    // after mallocProcessShutdownNotification()
    shutdownSync.init();
#if COLLECT_STATISTICS
    initStatisticsCollection();
#endif
    return true;
}
2058
static bool GetBoolEnvironmentVariable(const char* name) {
    return tbb::detail::r1::GetBoolEnvironmentVariable(name);
}

//! Ensures that initMemoryManager() is called once and only once.
/** Does not return until initMemoryManager() has been completed by a thread.
    There is no need to call this routine if mallocInitialized==2 . */
static bool doInitialization()
{
    MallocMutex::scoped_lock lock( initMutex );
    if (mallocInitialized.load(std::memory_order_relaxed)!=2) {
        MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==0, ASSERT_TEXT );
        mallocInitialized.store(1, std::memory_order_relaxed);
        RecursiveMallocCallProtector scoped;
        if (!initMemoryManager()) {
            mallocInitialized.store(0, std::memory_order_relaxed); // restore and bail out
            return false;
        }
#ifdef  MALLOC_EXTRA_INITIALIZATION
        MALLOC_EXTRA_INITIALIZATION;
#endif
#if MALLOC_CHECK_RECURSION
        RecursiveMallocCallProtector::detectNaiveOverload();
#endif
        MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==1, ASSERT_TEXT );
        // The store must have a release fence; otherwise mallocInitialized==2
        // might become remotely visible before the side effects of
        // initMemoryManager() become remotely visible.
        mallocInitialized.store(2, std::memory_order_release);
        if( GetBoolEnvironmentVariable("TBB_VERSION") ) {
            fputs(VersionString+1,stderr);
            hugePages.printStatus();
        }
    }
    /* It can't be 0, or we would have initialized it */
    MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==2, ASSERT_TEXT );
    return true;
}
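/*
 * Canonical double-checked usage, as seen at the allocation entry points below:
 *
 *   if (!isMallocInitialized())     // fast path: lock-free acquire load
 *       if (!doInitialization())    // slow path: serialized by initMutex
 *           return NULL;
 *
 * The acquire load in isMallocInitialized() pairs with the release store of 2
 * above, so a thread seeing "initialized" also sees all initialization effects.
 */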
2097
/********* End library initialization *************/

/********* The malloc show begins *************/


FreeObject *Block::allocateFromFreeList()
{
    FreeObject *result;

    if (!freeList) return NULL;

    result = freeList;
    MALLOC_ASSERT( result, ASSERT_TEXT );

    freeList = result->next;
    MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
    allocatedCount++;
    STAT_increment(getThreadId(), getIndex(objectSize), allocFreeListUsed);

    return result;
}

FreeObject *Block::allocateFromBumpPtr()
{
    FreeObject *result = bumpPtr;
    if (result) {
        bumpPtr = (FreeObject *) ((uintptr_t) bumpPtr - objectSize);
        if ( (uintptr_t)bumpPtr < (uintptr_t)this+sizeof(Block) ) {
            bumpPtr = NULL;
        }
        MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
        allocatedCount++;
        STAT_increment(getThreadId(), getIndex(objectSize), allocBumpPtrUsed);
    }
    return result;
}
2134
inline FreeObject* Block::allocate()
{
    MALLOC_ASSERT( isOwnedByCurrentThread(), ASSERT_TEXT );

    /* for better cache locality, first look in the free list. */
    if ( FreeObject *result = allocateFromFreeList() ) {
        return result;
    }
    MALLOC_ASSERT( !freeList, ASSERT_TEXT );

    /* if the free list is empty, try thread-local bump pointer allocation. */
    if ( FreeObject *result = allocateFromBumpPtr() ) {
        return result;
    }
    MALLOC_ASSERT( !bumpPtr, ASSERT_TEXT );

    /* the block is considered full. */
    isFull = true;
    return NULL;
}
2155
size_t Block::findObjectSize(void *object) const
{
    size_t blSize = getSize();
#if MALLOC_CHECK_RECURSION
    // Currently, there are no aligned allocations from startup blocks,
    // so we can return just StartupBlock::msize().
    // TODO: This must be extended if we add aligned allocation from startup blocks.
    if (!blSize)
        return StartupBlock::msize(object);
#endif
    // the object can be aligned, so the real size can be less than the block's
    size_t size =
        blSize - ((uintptr_t)object - (uintptr_t)findObjectToFree(object));
    MALLOC_ASSERT(size>0 && size<minLargeObjectSize, ASSERT_TEXT);
    return size;
}

void Bin::moveBlockToFront(Block *block)
{
    /* move the block to the front of the bin */
    if (block == activeBlk) return;
    outofTLSBin(block);
    pushTLSBin(block);
}
2180
void Bin::processEmptyBlock(Block *block, bool poolTheBlock)
{
    if (block != activeBlk) {
        /* We are not using this block; return it to the pool */
        outofTLSBin(block);
        block->getMemPool()->returnEmptyBlock(block, poolTheBlock);
    } else {
        /* all objects are free - let's restore the bump pointer */
        block->restoreBumpPtr();
    }
}
2192
template<int LOW_MARK, int HIGH_MARK>
bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool)
{
    const size_t size = object->unalignedSize;
    // do not spoil the cache with too large an object, as that can cause a total cache cleanup
    if (size > MAX_TOTAL_SIZE)
        return false;
    LargeMemoryBlock *localHead = head.exchange(NULL);

    object->prev = NULL;
    object->next = localHead;
    if (localHead)
        localHead->prev = object;
    else {
        // these might not have been cleaned during local cache stealing; correct them
        totalSize = 0;
        numOfBlocks = 0;
        tail = object;
    }
    localHead = object;
    totalSize += size;
    numOfBlocks++;
    // must meet both the size and the number-of-cached-objects constraints
    if (totalSize > MAX_TOTAL_SIZE || numOfBlocks >= HIGH_MARK) {
        // scan from the tail until the constraints are met
        while (totalSize > MAX_TOTAL_SIZE || numOfBlocks > LOW_MARK) {
            totalSize -= tail->unalignedSize;
            numOfBlocks--;
            tail = tail->prev;
        }
        LargeMemoryBlock *headToRelease = tail->next;
        tail->next = NULL;

        extMemPool->freeLargeObjectList(headToRelease);
    }

    head.store(localHead, std::memory_order_release);
    return true;
}
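/*
 * Illustrative trim in put() (LOW_MARK/HIGH_MARK are template parameters;
 * the numbers here are an example only): with LOW_MARK=4 and HIGH_MARK=8,
 * caching the eighth block triggers the trim loop, which walks from the tail
 * (coldest) toward the head until only 4 blocks remain, and the detached tail
 * segment is released in a single freeLargeObjectList() call.
 */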
2232
template<int LOW_MARK, int HIGH_MARK>
LargeMemoryBlock *LocalLOCImpl<LOW_MARK, HIGH_MARK>::get(size_t size)
{
    LargeMemoryBlock *localHead, *res = NULL;

    if (size > MAX_TOTAL_SIZE)
        return NULL;

    // TBB_REVAMP_TODO: review this line
    if (!head.load(std::memory_order_acquire) || (localHead = head.exchange(NULL)) == NULL) {
        // do not restore totalSize, numOfBlocks and tail at this point,
        // as they are used only in put(), where they must be restored
        return NULL;
    }

    for (LargeMemoryBlock *curr = localHead; curr; curr=curr->next) {
        if (curr->unalignedSize == size) {
            res = curr;
            if (curr->next)
                curr->next->prev = curr->prev;
            else
                tail = curr->prev;
            if (curr != localHead)
                curr->prev->next = curr->next;
            else
                localHead = curr->next;
            totalSize -= size;
            numOfBlocks--;
            break;
        }
    }

    head.store(localHead, std::memory_order_release);
    return res;
}

template<int LOW_MARK, int HIGH_MARK>
bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::externalCleanup(ExtMemoryPool *extMemPool)
{
    if (LargeMemoryBlock *localHead = head.exchange(NULL)) {
        extMemPool->freeLargeObjectList(localHead);
        return true;
    }
    return false;
}
2278
void *MemoryPool::getFromLLOCache(TLSData* tls, size_t size, size_t alignment)
{
    LargeMemoryBlock *lmb = NULL;

    size_t headersSize = sizeof(LargeMemoryBlock)+sizeof(LargeObjectHdr);
    size_t allocationSize = LargeObjectCache::alignToBin(size+headersSize+alignment);
    if (allocationSize < size) // allocationSize wrapped around in alignToBin
        return NULL;
    MALLOC_ASSERT(allocationSize >= alignment, "Overflow must be checked before.");

    if (tls) {
        tls->markUsed();
        lmb = tls->lloc.get(allocationSize);
    }
    if (!lmb)
        lmb = extMemPool.mallocLargeObject(this, allocationSize);

    if (lmb) {
        // when shuffling, we assume that the alignment offset guarantees
        // that different cache lines are in use
        MALLOC_ASSERT(alignment >= estimatedCacheLineSize, ASSERT_TEXT);

        void *alignedArea = (void*)alignUp((uintptr_t)lmb+headersSize, alignment);
        uintptr_t alignedRight =
            alignDown((uintptr_t)lmb+lmb->unalignedSize - size, alignment);
        // Is there room to shuffle the object between cache lines?
        // Note that alignedRight and alignedArea are aligned at alignment.
        unsigned ptrDelta = alignedRight - (uintptr_t)alignedArea;
        if (ptrDelta && tls) { // !tls is the cold path
            // for the hot path of alignment==estimatedCacheLineSize,
            // allow compilers to use a shift for division
            // (since estimatedCacheLineSize is a power-of-2 constant)
            unsigned numOfPossibleOffsets = alignment == estimatedCacheLineSize?
                  ptrDelta / estimatedCacheLineSize :
                  ptrDelta / alignment;
            unsigned myCacheIdx = ++tls->currCacheIdx;
            unsigned offset = myCacheIdx % numOfPossibleOffsets;

            // Move the object to a cache line with an offset that is different from
            // the previous allocation. This supposedly allows us to use cache
            // associativity more efficiently.
            alignedArea = (void*)((uintptr_t)alignedArea + offset*alignment);
        }
        MALLOC_ASSERT((uintptr_t)lmb+lmb->unalignedSize >=
                      (uintptr_t)alignedArea+size, "Object doesn't fit the block.");
        LargeObjectHdr *header = (LargeObjectHdr*)alignedArea-1;
        header->memoryBlock = lmb;
        header->backRefIdx = lmb->backRefIdx;
        setBackRef(header->backRefIdx, header);

        lmb->objectSize = size;

        MALLOC_ASSERT( isLargeObject<unknownMem>(alignedArea), ASSERT_TEXT );
        MALLOC_ASSERT( isAligned(alignedArea, alignment), ASSERT_TEXT );

        return alignedArea;
    }
    return NULL;
}
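/*
 * Resulting large-object layout (informal):
 *
 *   lmb                                    LargeMemoryBlock bookkeeping
 *   ...                                    optional shuffle padding
 *   alignedArea - sizeof(LargeObjectHdr)   LargeObjectHdr { memoryBlock, backRefIdx }
 *   alignedArea                            user pointer, aligned at 'alignment'
 *
 * scalable_free() and friends recover lmb via ((LargeObjectHdr*)ptr - 1)->memoryBlock,
 * which is why the header is placed immediately below the returned pointer.
 */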
2338
void MemoryPool::putToLLOCache(TLSData *tls, void *object)
{
    LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
    // overwrite backRefIdx to simplify double free detection
    header->backRefIdx = BackRefIdx();

    if (tls) {
        tls->markUsed();
        if (tls->lloc.put(header->memoryBlock, &extMemPool))
            return;
    }
    extMemPool.freeLargeObject(header->memoryBlock);
}
2352
/*
 * All aligned allocations fall into one of the following categories:
 *  1. if both the request size and alignment are <= maxSegregatedObjectSize,
 *     we just align the size up and request this amount, because for every size
 *     aligned to some power of 2, the allocated object is at least that aligned.
 *  2. for size<minLargeObjectSize, check if the already guaranteed fittingAlignment is enough.
 *  3. if size+alignment<minLargeObjectSize, we take an object of fittingSizeN and align
 *     its address up; given such a pointer, scalable_free can find the real object.
 *     Wrapping of size+alignment is impossible because the maximal allowed
 *     alignment plus minLargeObjectSize can't lead to wrapping.
 *  4. otherwise, an aligned large object is allocated.
 */
static void *allocateAligned(MemoryPool *memPool, size_t size, size_t alignment)
{
    MALLOC_ASSERT( isPowerOfTwo(alignment), ASSERT_TEXT );

    if (!isMallocInitialized())
        if (!doInitialization())
            return NULL;

    void *result;
    if (size<=maxSegregatedObjectSize && alignment<=maxSegregatedObjectSize)
        result = internalPoolMalloc(memPool, alignUp(size? size: sizeof(size_t), alignment));
    else if (size<minLargeObjectSize) {
        if (alignment<=fittingAlignment)
            result = internalPoolMalloc(memPool, size);
        else if (size+alignment < minLargeObjectSize) {
            void *unaligned = internalPoolMalloc(memPool, size+alignment);
            if (!unaligned) return NULL;
            result = alignUp(unaligned, alignment);
        } else
            goto LargeObjAlloc;
    } else {
    LargeObjAlloc:
        TLSData *tls = memPool->getTLS(/*create=*/true);
        // take into account only alignments that are higher than the natural one
        result =
            memPool->getFromLLOCache(tls, size, largeObjectAlignment>alignment?
                                               largeObjectAlignment: alignment);
    }

    MALLOC_ASSERT( isAligned(result, alignment), ASSERT_TEXT );
    return result;
}
2397
static void *reallocAligned(MemoryPool *memPool, void *ptr,
                            size_t newSize, size_t alignment = 0)
{
    void *result;
    size_t copySize;

    if (isLargeObject<ourMem>(ptr)) {
        LargeMemoryBlock* lmb = ((LargeObjectHdr *)ptr - 1)->memoryBlock;
        copySize = lmb->unalignedSize-((uintptr_t)ptr-(uintptr_t)lmb);

        // Apply different strategies if the size decreases
        if (newSize <= copySize && (0 == alignment || isAligned(ptr, alignment))) {

            // For huge objects (that do not fit in the backend cache), keep the same space unless
            // the new size is less than half of the old one
            bool isMemoryBlockHuge = copySize > memPool->extMemPool.backend.getMaxBinnedSize();
            size_t threshold = isMemoryBlockHuge ? copySize / 2 : 0;
            if (newSize > threshold) {
                lmb->objectSize = newSize;
                return ptr;
            }
            // TODO: For large objects suitable for the backend cache,
            // split out the excessive part and put it to the backend.
        }
        // Reallocate for real
        copySize = lmb->objectSize;
#if BACKEND_HAS_MREMAP
        if (void *r = memPool->extMemPool.remap(ptr, copySize, newSize,
                          alignment < largeObjectAlignment ? largeObjectAlignment : alignment))
            return r;
#endif
        result = alignment ? allocateAligned(memPool, newSize, alignment) :
            internalPoolMalloc(memPool, newSize);

    } else {
        Block* block = (Block *)alignDown(ptr, slabSize);
        copySize = block->findObjectSize(ptr);

        // TODO: Move the object to another bin if the size decreases and the current bin is "empty enough".
        // Currently, if the size decreases, the old pointer is returned
        if (newSize <= copySize && (0==alignment || isAligned(ptr, alignment))) {
            return ptr;
        } else {
            result = alignment ? allocateAligned(memPool, newSize, alignment) :
                internalPoolMalloc(memPool, newSize);
        }
    }
    if (result) {
        memcpy(result, ptr, copySize < newSize ? copySize : newSize);
        internalPoolFree(memPool, ptr, 0);
    }
    return result;
}
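/*
 * Shrink-in-place example for the huge-object branch above (sizes illustrative):
 * if a huge block offers copySize == 100 MB, a realloc down to 60 MB keeps the
 * block (60 MB is above the 50 MB threshold) and merely updates objectSize,
 * while a realloc down to 40 MB falls through to a real reallocation plus memcpy.
 */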
2451
#if MALLOC_DEBUG
/* A predicate that checks whether an object is properly placed inside its block */
inline bool Block::isProperlyPlaced(const void *object) const
{
    return 0 == ((uintptr_t)this + slabSize - (uintptr_t)object) % objectSize;
}
#endif

/* Finds the real object inside the block */
FreeObject *Block::findAllocatedObject(const void *address) const
{
    // calculate the offset from the end of the block space
    uint16_t offset = (uintptr_t)this + slabSize - (uintptr_t)address;
    MALLOC_ASSERT( offset<=slabSize-sizeof(Block), ASSERT_TEXT );
    // find the offset difference from a multiple of the allocation size
    offset %= objectSize;
    // and move the address down to where the real object starts.
    return (FreeObject*)((uintptr_t)address - (offset? objectSize-offset: 0));
}
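/*
 * Worked example (numbers illustrative): with objectSize == 80 and an address
 * 200 bytes below the block end, offset == 200 and 200 % 80 == 40, so the real
 * object starts 80-40 == 40 bytes below the given address; its own distance
 * from the block end (240) is then a multiple of objectSize, satisfying
 * isProperlyPlaced().
 */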
2471
/*
 * A bad dereference caused by a foreign pointer is possible only here, not earlier in the call chain.
 * A separate function isolates the SEH code, as it has a bad influence on compiler optimization.
 */
static inline BackRefIdx safer_dereference (const BackRefIdx *ptr)
{
    BackRefIdx id;
#if _MSC_VER
    __try {
#endif
        id = dereference(ptr);
#if _MSC_VER
    } __except( GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION?
                EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH ) {
        id = BackRefIdx();
    }
#endif
    return id;
}
2491
template<MemoryOrigin memOrigin>
bool isLargeObject(void *object)
{
    if (!isAligned(object, largeObjectAlignment))
        return false;
    LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
    BackRefIdx idx = (memOrigin == unknownMem) ?
        safer_dereference(&header->backRefIdx) : dereference(&header->backRefIdx);

    return idx.isLargeObject()
        // in a valid LargeObjectHdr, memoryBlock is not NULL
        && header->memoryBlock
        // in a valid LargeObjectHdr, memoryBlock points somewhere before the header
        // TODO: stricter check
        && (uintptr_t)header->memoryBlock < (uintptr_t)header
        && getBackRef(idx) == header;
}

static inline bool isSmallObject (void *ptr)
{
    Block* expectedBlock = (Block*)alignDown(ptr, slabSize);
    const BackRefIdx* idx = expectedBlock->getBackRefIdx();

    bool isSmall = expectedBlock == getBackRef(safer_dereference(idx));
    if (isSmall)
        expectedBlock->checkFreePrecond(ptr);
    return isSmall;
}
2520
/**** Check if an object was allocated by scalable_malloc ****/
static inline bool isRecognized (void* ptr)
{
    return defaultMemPool->extMemPool.backend.ptrCanBeValid(ptr) &&
           (isLargeObject<unknownMem>(ptr) || isSmallObject(ptr));
}

static inline void freeSmallObject(void *object)
{
    /* mask low bits to get the block */
    Block *block = (Block *)alignDown(object, slabSize);
    block->checkFreePrecond(object);

#if MALLOC_CHECK_RECURSION
    if (block->isStartupAllocObject()) {
        ((StartupBlock *)block)->free(object);
        return;
    }
#endif
    if (block->isOwnedByCurrentThread()) {
        block->freeOwnObject(object);
    } else { /* Slower path: add to the shared list; allocatedCount is updated by the owner thread in malloc. */
        FreeObject *objectToFree = block->findObjectToFree(object);
        block->freePublicObject(objectToFree);
    }
}
2547
static void *internalPoolMalloc(MemoryPool* memPool, size_t size)
{
    Bin* bin;
    Block * mallocBlock;

    if (!memPool) return NULL;

    if (!size) size = sizeof(size_t);

    TLSData *tls = memPool->getTLS(/*create=*/true);

    /* Allocate a large object */
    if (size >= minLargeObjectSize)
        return memPool->getFromLLOCache(tls, size, largeObjectAlignment);

    if (!tls) return NULL;

    tls->markUsed();
    /*
     * Get the element in the thread-local array corresponding to the given size;
     * it keeps a ptr to the active block for allocations of this size
     */
    bin = tls->getAllocationBin(size);
    if ( !bin ) return NULL;

    /* Get a block to try to allocate in. */
    for( mallocBlock = bin->getActiveBlock(); mallocBlock;
         mallocBlock = bin->setPreviousBlockActive() ) // the previous block should be empty enough
    {
        if( FreeObject *result = mallocBlock->allocate() )
            return result;
    }

    /*
     * else privatize publicly freed objects in some block and allocate from it
     */
    mallocBlock = bin->getPrivatizedFreeListBlock();
    if (mallocBlock) {
        MALLOC_ASSERT( mallocBlock->freeListNonNull(), ASSERT_TEXT );
        if ( FreeObject *result = mallocBlock->allocateFromFreeList() )
            return result;
        /* Else something strange happened; retry from the beginning. */
        TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in public free list; reentering.\n" ));
        return internalPoolMalloc(memPool, size);
    }

    /*
     * no suitable own blocks, so try to get a partial block that some other thread has discarded.
     */
    mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
    while (mallocBlock) {
        bin->pushTLSBin(mallocBlock);
        bin->setActiveBlock(mallocBlock); // TODO: move under the below condition?
        if( FreeObject *result = mallocBlock->allocate() )
            return result;
        mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
    }

    /*
     * else try to get a new empty block
     */
    mallocBlock = memPool->getEmptyBlock(size);
    if (mallocBlock) {
        bin->pushTLSBin(mallocBlock);
        bin->setActiveBlock(mallocBlock);
        if( FreeObject *result = mallocBlock->allocate() )
            return result;
        /* Else something strange happened; retry from the beginning. */
        TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in empty block; reentering.\n" ));
        return internalPoolMalloc(memPool, size);
    }
    /*
     * else nothing works, so return NULL
     */
    TRACEF(( "[ScalableMalloc trace] No memory found, returning NULL.\n" ));
    return NULL;
}
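/*
 * Small-object allocation order implied above, cheapest source first:
 *   1. the bin's active block (free list, then bump pointer),
 *   2. a block whose public free list can be privatized,
 *   3. a partially used block orphaned by another thread,
 *   4. a fresh empty block from the backend.
 * Requests of minLargeObjectSize or more bypass all of this and go straight
 * to the large-object cache.
 */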
2625
// When size==0 (i.e. unknown), detect here whether the object is large.
// When the size is known and < minLargeObjectSize, we still need to check
// whether the actual object is large, because large objects might be used
// for aligned small allocations.
static bool internalPoolFree(MemoryPool *memPool, void *object, size_t size)
{
    if (!memPool || !object) return false;

    // The library is initialized at the allocation call, so releasing while
    // not initialized means a foreign object is being released.
    MALLOC_ASSERT(isMallocInitialized(), ASSERT_TEXT);
    MALLOC_ASSERT(memPool->extMemPool.userPool() || isRecognized(object),
                  "Invalid pointer detected during object release.");

    if (size >= minLargeObjectSize || isLargeObject<ourMem>(object))
        memPool->putToLLOCache(memPool->getTLS(/*create=*/false), object);
    else
        freeSmallObject(object);
    return true;
}
2646
static void *internalMalloc(size_t size)
{
    if (!size) size = sizeof(size_t);

#if MALLOC_CHECK_RECURSION
    if (RecursiveMallocCallProtector::sameThreadActive())
        return size<minLargeObjectSize? StartupBlock::allocate(size) :
            // nested allocation, so skip tls
            (FreeObject*)defaultMemPool->getFromLLOCache(NULL, size, slabSize);
#endif

    if (!isMallocInitialized())
        if (!doInitialization())
            return NULL;
    return internalPoolMalloc(defaultMemPool, size);
}

static void internalFree(void *object)
{
    internalPoolFree(defaultMemPool, object, 0);
}

static size_t internalMsize(void* ptr)
{
    MALLOC_ASSERT(ptr, "Invalid pointer passed to internalMsize");
    if (isLargeObject<ourMem>(ptr)) {
        // TODO: return the maximum memory size that can be written to this object
        LargeMemoryBlock* lmb = ((LargeObjectHdr*)ptr - 1)->memoryBlock;
        return lmb->objectSize;
    } else {
        Block *block = (Block*)alignDown(ptr, slabSize);
        return block->findObjectSize(ptr);
    }
}
2681
} // namespace internal

using namespace rml::internal;

// legacy entry point kept for compatibility with binaries compiled
// with pre-6003 versions of TBB
TBBMALLOC_EXPORT rml::MemoryPool *pool_create(intptr_t pool_id, const MemPoolPolicy *policy)
{
    rml::MemoryPool *pool;
    MemPoolPolicy pol(policy->pAlloc, policy->pFree, policy->granularity);

    pool_create_v1(pool_id, &pol, &pool);
    return pool;
}
2696
rml::MemPoolError pool_create_v1(intptr_t pool_id, const MemPoolPolicy *policy,
                                 rml::MemoryPool **pool)
{
    if ( !policy->pAlloc || policy->version<MemPoolPolicy::TBBMALLOC_POOL_VERSION
         // an empty pFree is allowed only for fixed pools
         || !(policy->fixedPool || policy->pFree)) {
        *pool = NULL;
        return INVALID_POLICY;
    }
    if ( policy->version>MemPoolPolicy::TBBMALLOC_POOL_VERSION // future versions are not supported
         // new flags can be added in place of reserved, but the default
         // behaviour must be supported by this version
         || policy->reserved ) {
        *pool = NULL;
        return UNSUPPORTED_POLICY;
    }
    if (!isMallocInitialized())
        if (!doInitialization()) {
            *pool = NULL;
            return NO_MEMORY;
        }
    rml::internal::MemoryPool *memPool =
        (rml::internal::MemoryPool*)internalMalloc((sizeof(rml::internal::MemoryPool)));
    if (!memPool) {
        *pool = NULL;
        return NO_MEMORY;
    }
    memset(memPool, 0, sizeof(rml::internal::MemoryPool));
    if (!memPool->init(pool_id, policy)) {
        internalFree(memPool);
        *pool = NULL;
        return NO_MEMORY;
    }

    *pool = (rml::MemoryPool*)memPool;
    return POOL_OK;
}
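/*
 * Usage sketch (illustrative only; rawAlloc/rawFree are hypothetical
 * user-supplied callbacks matching the pAlloc/pFree types expected by
 * MemPoolPolicy, whose exact signatures are declared elsewhere):
 *
 *   MemPoolPolicy policy(rawAlloc, rawFree, 0); // granularity 0 = default
 *   rml::MemoryPool *pool;
 *   if (pool_create_v1(42, &policy, &pool) == POOL_OK) {
 *       void *p = pool_malloc(pool, 64);
 *       pool_free(pool, p);
 *       pool_destroy(pool);
 *   }
 */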
2734
bool pool_destroy(rml::MemoryPool* memPool)
{
    if (!memPool) return false;
    bool ret = ((rml::internal::MemoryPool*)memPool)->destroy();
    internalFree(memPool);

    return ret;
}

bool pool_reset(rml::MemoryPool* memPool)
{
    if (!memPool) return false;

    return ((rml::internal::MemoryPool*)memPool)->reset();
}

void *pool_malloc(rml::MemoryPool* mPool, size_t size)
{
    return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
}

void *pool_realloc(rml::MemoryPool* mPool, void *object, size_t size)
{
    if (!object)
        return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
    if (!size) {
        internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
        return NULL;
    }
    return reallocAligned((rml::internal::MemoryPool*)mPool, object, size, 0);
}

void *pool_aligned_malloc(rml::MemoryPool* mPool, size_t size, size_t alignment)
{
    if (!isPowerOfTwo(alignment) || 0==size)
        return NULL;

    return allocateAligned((rml::internal::MemoryPool*)mPool, size, alignment);
}

void *pool_aligned_realloc(rml::MemoryPool* memPool, void *ptr, size_t size, size_t alignment)
{
    if (!isPowerOfTwo(alignment))
        return NULL;
    rml::internal::MemoryPool *mPool = (rml::internal::MemoryPool*)memPool;
    void *tmp;

    if (!ptr)
        tmp = allocateAligned(mPool, size, alignment);
    else if (!size) {
        internalPoolFree(mPool, ptr, 0);
        return NULL;
    } else
        tmp = reallocAligned(mPool, ptr, size, alignment);

    return tmp;
}

bool pool_free(rml::MemoryPool *mPool, void *object)
{
    return internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
}

rml::MemoryPool *pool_identify(void *object)
{
    rml::internal::MemoryPool *pool;
    if (isLargeObject<ourMem>(object)) {
        LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
        pool = header->memoryBlock->pool;
    } else {
        Block *block = (Block*)alignDown(object, slabSize);
        pool = block->getMemPool();
    }
    // do not return defaultMemPool, as it can't be used in pool_free() etc.
    __TBB_ASSERT_RELEASE(pool!=defaultMemPool,
        "rml::pool_identify() can't be used for scalable_malloc() etc. results.");
    return (rml::MemoryPool*)pool;
}

size_t pool_msize(rml::MemoryPool *mPool, void* object)
{
    if (object) {
        // No assert for object recognition, because objects allocated from a non-default
        // memory pool do not participate in range checking and do not have valid backreferences
        // for small objects. Instead, check that the object belongs to the specified memory pool.
        MALLOC_ASSERT_EX(mPool == pool_identify(object), "Object does not belong to the specified pool");
        return internalMsize(object);
    }
    errno = EINVAL;
    // Unlike _msize, return 0 in case of a parameter error.
    // Returning size_t(-1) looks like a recipe for trouble.
    return 0;
}
2828
} // namespace rml

using namespace rml::internal;

#if MALLOC_TRACE
static unsigned int threadGoingDownCount = 0;
#endif

/*
 * When a thread is shutting down, this routine should be called to remove all the thread ids
 * from the malloc blocks and replace them with a NULL thread id.
 *
 * For pthreads, the function is set as a callback in pthread_key_create for the TLS bin.
 * It will be automatically called at thread exit with the key value as the argument,
 * unless that value is NULL.
 * For Windows, it is called from DllMain( DLL_THREAD_DETACH ).
 *
 * However, neither of the above is called for the main process thread, so the routine
 * also needs to be called during process shutdown.
 *
 */
// TODO: Consider making this function part of class MemoryPool.
void doThreadShutdownNotification(TLSData* tls, bool main_thread)
{
    TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return start %d\n",
             getThreadId(),  threadGoingDownCount++ ));

#if USE_PTHREAD
    if (tls) {
        if (!shutdownSync.threadDtorStart()) return;
        tls->getMemPool()->onThreadShutdown(tls);
        shutdownSync.threadDtorDone();
    } else
#endif
    {
        suppress_unused_warning(tls); // not used on Windows
        // The default pool is safe to use at this point:
        // on Linux, only the main thread can go here before destroying defaultMemPool;
        // on Windows, shutdown is synchronized via the loader lock and isMallocInitialized().
        // See also __TBB_mallocProcessShutdownNotification()
        defaultMemPool->onThreadShutdown(defaultMemPool->getTLS(/*create=*/false));
        // Take the lock to walk through the other pools; but waiting might be dangerous at this point
        // (e.g. on Windows the main thread might deadlock)
        bool locked;
        MallocMutex::scoped_lock lock(MemoryPool::memPoolListLock, /*wait=*/!main_thread, &locked);
        if (locked) { // the list is safe to process
            for (MemoryPool *memPool = defaultMemPool->next; memPool; memPool = memPool->next)
                memPool->onThreadShutdown(memPool->getTLS(/*create=*/false));
        }
    }

    TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return end\n", getThreadId() ));
}
2882
#if USE_PTHREAD
void mallocThreadShutdownNotification(void* arg)
{
    // The routine is called for each pool (as the TLS dtor) on each thread, except for the main thread
    if (!isMallocInitialized()) return;
    doThreadShutdownNotification((TLSData*)arg, false);
}
#else
extern "C" void __TBB_mallocThreadShutdownNotification()
{
    // The routine is called once per thread on Windows
    if (!isMallocInitialized()) return;
    doThreadShutdownNotification(NULL, false);
}
#endif
2898
extern "C" void __TBB_mallocProcessShutdownNotification(bool windows_process_dying)
{
    if (!isMallocInitialized()) return;

    // Don't clean allocator internals if the entire process is exiting
    if (!windows_process_dying) {
        doThreadShutdownNotification(NULL, /*main_thread=*/true);
    }
#if  __TBB_MALLOC_LOCACHE_STAT
    printf("cache hit ratio %f, size hit %f\n",
           1.*cacheHits/mallocCalls, 1.*memHitKB/memAllocKB);
    defaultMemPool->extMemPool.loc.reportStat(stdout);
#endif

    shutdownSync.processExit();
#if __TBB_SOURCE_DIRECTLY_INCLUDED
    /* Pthread keys must be deleted as soon as possible, so that the key dtor is not called
       on thread termination when the tbbmalloc code may already be unloaded.
     */
    defaultMemPool->destroy();
    destroyBackRefMaster(&defaultMemPool->extMemPool.backend);
    ThreadId::destroy();      // Delete the key for the thread id
    hugePages.reset();
    // a completely new malloc initialization is possible after this point
    mallocInitialized.store(0, std::memory_order_release);
#endif // __TBB_SOURCE_DIRECTLY_INCLUDED

#if COLLECT_STATISTICS
    unsigned nThreads = ThreadId::getMaxThreadId();
    for( int i=1; i<=nThreads && i<MAX_THREADS; ++i )
        STAT_print(i);
#endif
    if (!usedBySrcIncluded)
        MALLOC_ITT_FINI_ITTLIB();
}
2934
extern "C" void * scalable_malloc(size_t size)
{
    void *ptr = internalMalloc(size);
    if (!ptr) errno = ENOMEM;
    return ptr;
}

extern "C" void scalable_free(void *object)
{
    internalFree(object);
}

#if MALLOC_ZONE_OVERLOAD_ENABLED
extern "C" void __TBB_malloc_free_definite_size(void *object, size_t size)
{
    internalPoolFree(defaultMemPool, object, size);
}
#endif
2953
/*
 * A variant that provides additional memory safety, by checking whether the given address
 * was obtained with this allocator, and if not, redirecting to the provided alternative call.
 */
extern "C" TBBMALLOC_EXPORT void __TBB_malloc_safer_free(void *object, void (*original_free)(void*))
{
    if (!object)
        return;

    // tbbmalloc can have allocated the object only if tbbmalloc has been initialized
    if (mallocInitialized.load(std::memory_order_acquire) && defaultMemPool->extMemPool.backend.ptrCanBeValid(object)) {
        if (isLargeObject<unknownMem>(object)) {
            // must check for a large object first, because the small-object check
            // touches 4 pages to the left, which may be inaccessible
            TLSData *tls = defaultMemPool->getTLS(/*create=*/false);

            defaultMemPool->putToLLOCache(tls, object);
            return;
        } else if (isSmallObject(object)) {
            freeSmallObject(object);
            return;
        }
    }
    if (original_free)
        original_free(object);
}
2980
/********* End the free code *************/

/********* Code for scalable_realloc ***********/

/*
 * From K&R
 * "realloc changes the size of the object pointed to by p to size. The contents will
 * be unchanged up to the minimum of the old and the new sizes. If the new size is larger,
 * the new space is uninitialized. realloc returns a pointer to the new space, or
 * NULL if the request cannot be satisfied, in which case *p is unchanged."
 *
 */
extern "C" void* scalable_realloc(void* ptr, size_t size)
{
    void *tmp;

    if (!ptr)
        tmp = internalMalloc(size);
    else if (!size) {
        internalFree(ptr);
        return NULL;
    } else
        tmp = reallocAligned(defaultMemPool, ptr, size, 0);

    if (!tmp) errno = ENOMEM;
    return tmp;
}
3008
/*
 * A variant that provides additional memory safety, by checking whether the given address
 * was obtained with this allocator, and if not, redirecting to the provided alternative call.
 */
extern "C" TBBMALLOC_EXPORT void* __TBB_malloc_safer_realloc(void* ptr, size_t sz, void* original_realloc)
{
    void *tmp; // TODO: fix warnings about uninitialized use of tmp

    if (!ptr) {
        tmp = internalMalloc(sz);
    } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
        if (!sz) {
            internalFree(ptr);
            return NULL;
        } else {
            tmp = reallocAligned(defaultMemPool, ptr, sz, 0);
        }
    }
#if USE_WINTHREAD
    else if (original_realloc && sz) {
        orig_ptrs *original_ptrs = static_cast<orig_ptrs*>(original_realloc);
        if ( original_ptrs->msize ){
            size_t oldSize = original_ptrs->msize(ptr);
            tmp = internalMalloc(sz);
            if (tmp) {
                memcpy(tmp, ptr, sz<oldSize? sz : oldSize);
                if ( original_ptrs->free ){
                    original_ptrs->free( ptr );
                }
            }
        } else
            tmp = NULL;
    }
#else
    else if (original_realloc) {
        typedef void* (*realloc_ptr_t)(void*,size_t);
        realloc_ptr_t original_realloc_ptr;
        (void *&)original_realloc_ptr = original_realloc;
        tmp = original_realloc_ptr(ptr,sz);
    }
#endif
    else tmp = NULL;

    if (!tmp) errno = ENOMEM;
    return tmp;
}
3055
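/*
 * Design note: the two foreign-pointer paths above differ. On Windows the code
 * migrates the data into a tbbmalloc block: it asks the original _msize for the
 * old size, allocates with internalMalloc, copies min(sz, oldSize) bytes, and
 * frees the foreign block through the original free. Elsewhere the pointer is
 * simply handed back to the original realloc, which resizes and copies in one call.
 */
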
/********* End code for scalable_realloc ***********/

/********* Code for scalable_calloc ***********/

/*
 * From K&R
 * "calloc returns a pointer to space for an array of nobj objects,
 * each of size size, or NULL if the request cannot be satisfied.
 * The space is initialized to zero bytes."
 */

extern "C" void * scalable_calloc(size_t nobj, size_t size)
{
    // the square root of the maximal size_t value
    const size_t mult_not_overflow = size_t(1) << (sizeof(size_t)*CHAR_BIT/2);
    const size_t arraySize = nobj * size;

    // check for overflow during multiplication:
    if (nobj>=mult_not_overflow || size>=mult_not_overflow) // 1) heuristic check
        if (nobj && arraySize / nobj != size) {             // 2) exact check
            errno = ENOMEM;
            return NULL;
        }
    void* result = internalMalloc(arraySize);
    if (result)
        memset(result, 0, arraySize);
    else
        errno = ENOMEM;
    return result;
}

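/*
 * Worked example of the overflow check (assuming 64-bit size_t, so
 * mult_not_overflow == 2^32):
 *
 *   nobj = 2^33, size = 2^31: the heuristic fires (nobj >= 2^32), and the exact
 *     check sees nobj*size wrap to 0; since 0 / 2^33 == 0 != 2^31, the request
 *     is rejected with ENOMEM/NULL.
 *   nobj = 2^33, size = 2: the heuristic fires, but 2^34 / 2^33 == 2 == size,
 *     so the (huge but non-wrapping) 16 GiB request is passed to internalMalloc.
 *   nobj < 2^32 and size < 2^32: the product cannot reach 2^64, so the cheap
 *     heuristic alone proves no wrap occurred and the exact division is skipped.
 */
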
/********* End code for scalable_calloc ***********/

/********* Code for aligned allocation API **********/

extern "C" int scalable_posix_memalign(void **memptr, size_t alignment, size_t size)
{
    if ( !isPowerOfTwoAtLeast(alignment, sizeof(void*)) )
        return EINVAL;
    void *result = allocateAligned(defaultMemPool, size, alignment);
    if (!result)
        return ENOMEM;
    *memptr = result;
    return 0;
}

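/*
 * Usage sketch (illustrative):
 *
 *   void *p = NULL;
 *   int err = scalable_posix_memalign(&p, 64, 1024); // power-of-two alignment >= sizeof(void*)
 *   if (err == 0) {
 *       // p is 64-byte aligned; release it with scalable_free(p)
 *   }
 *   // err == EINVAL for a non-power-of-two alignment such as 48;
 *   // err == ENOMEM if allocation fails, in which case *memptr is left untouched.
 */
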
extern "C" void * scalable_aligned_malloc(size_t size, size_t alignment)
{
    if (!isPowerOfTwo(alignment) || 0==size) {
        errno = EINVAL;
        return NULL;
    }
    void *tmp = allocateAligned(defaultMemPool, size, alignment);
    if (!tmp) errno = ENOMEM;
    return tmp;
}

extern "C" void * scalable_aligned_realloc(void *ptr, size_t size, size_t alignment)
{
    if (!isPowerOfTwo(alignment)) {
        errno = EINVAL;
        return NULL;
    }
    void *tmp;

    if (!ptr)
        tmp = allocateAligned(defaultMemPool, size, alignment);
    else if (!size) {
        scalable_free(ptr);
        return NULL;
    } else
        tmp = reallocAligned(defaultMemPool, ptr, size, alignment);

    if (!tmp) errno = ENOMEM;
    return tmp;
}

extern "C" TBBMALLOC_EXPORT void * __TBB_malloc_safer_aligned_realloc(void *ptr, size_t size, size_t alignment, void* orig_function)
{
    /* corner cases are handled here rather than in reallocAligned, to avoid dealing with errno there */
    if (!isPowerOfTwo(alignment)) {
        errno = EINVAL;
        return NULL;
    }
    void *tmp = NULL;

    if (!ptr) {
        tmp = allocateAligned(defaultMemPool, size, alignment);
    } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
        if (!size) {
            internalFree(ptr);
            return NULL;
        } else {
            tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
        }
    }
#if USE_WINTHREAD
    else {
        orig_aligned_ptrs *original_ptrs = static_cast<orig_aligned_ptrs*>(orig_function);
        if (size) {
            // Without aligned_msize we cannot learn the old size,
            // so just keep the old pointer.
            if (original_ptrs->aligned_msize) {
                // pass sizeof(void*) alignment and zero offset to get a possibly correct oldSize
                size_t oldSize = original_ptrs->aligned_msize(ptr, sizeof(void*), 0);
                tmp = allocateAligned(defaultMemPool, size, alignment);
                if (tmp) {
                    memcpy(tmp, ptr, size<oldSize? size : oldSize);
                    if (original_ptrs->aligned_free) {
                        original_ptrs->aligned_free(ptr);
                    }
                }
            }
        } else {
            if (original_ptrs->aligned_free) {
                original_ptrs->aligned_free(ptr);
            }
            return NULL;
        }
    }
#else
    // Since the original realloc cannot align the result, and there is no way
    // to learn the size of the object being reallocated, give up.
    suppress_unused_warning(orig_function);
#endif
    if (!tmp) errno = ENOMEM;
    return tmp;
}

extern "C" void scalable_aligned_free(void *ptr)
{
    internalFree(ptr);
}

/********* end code for aligned allocation API **********/

/********* Code for scalable_msize ***********/

/*
 * Returns the size of a memory block allocated in the heap.
 */
extern "C" size_t scalable_msize(void* ptr)
{
    if (ptr) {
        MALLOC_ASSERT(isRecognized(ptr), "Invalid pointer in scalable_msize detected.");
        return internalMsize(ptr);
    }
    errno = EINVAL;
    // Unlike _msize, return 0 in case of a parameter error;
    // returning size_t(-1) would be more likely to lead to trouble.
    return 0;
}

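/*
 * Usage sketch (illustrative): the reported size is the usable size of the
 * block, which may exceed the requested size due to size-class rounding:
 *
 *   void  *p = scalable_malloc(100);
 *   size_t n = scalable_msize(p);    // n >= 100 for a successful allocation
 *   scalable_free(p);
 *   size_t z = scalable_msize(NULL); // z == 0, errno set to EINVAL
 */
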
/*
 * A variant that provides additional memory safety, by checking whether the given address
 * was obtained with this allocator, and if not redirecting to the provided alternative call.
 */
extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_msize(void *object, size_t (*original_msize)(void*))
{
    if (object) {
        // Check if the memory was allocated by scalable_malloc
        if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
            return internalMsize(object);
        else if (original_msize)
            return original_msize(object);
    }
    // object is NULL or unknown, or foreign and no original_msize is available
#if USE_WINTHREAD
    errno = EINVAL; // errno is expected to be set only on this platform
#endif
    return 0;
}

/*
 * The same as above, but for the _aligned_msize case
 */
extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_aligned_msize(void *object, size_t alignment, size_t offset, size_t (*orig_aligned_msize)(void*,size_t,size_t))
{
    if (object) {
        // Check if the memory was allocated by scalable_malloc
        if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
            return internalMsize(object);
        else if (orig_aligned_msize)
            return orig_aligned_msize(object, alignment, offset);
    }
    // object is NULL or unknown
    errno = EINVAL;
    return 0;
}

/********* End code for scalable_msize ***********/

extern "C" int scalable_allocation_mode(int param, intptr_t value)
{
    if (param == TBBMALLOC_SET_SOFT_HEAP_LIMIT) {
        defaultMemPool->extMemPool.backend.setRecommendedMaxSize((size_t)value);
        return TBBMALLOC_OK;
    } else if (param == USE_HUGE_PAGES) {
#if __unix__
        switch (value) {
        case 0:
        case 1:
            hugePages.setMode(value);
            return TBBMALLOC_OK;
        default:
            return TBBMALLOC_INVALID_PARAM;
        }
#else
        return TBBMALLOC_NO_EFFECT;
#endif
#if __TBB_SOURCE_DIRECTLY_INCLUDED
    } else if (param == TBBMALLOC_INTERNAL_SOURCE_INCLUDED) {
        switch (value) {
        case 0: // used by dynamic library
        case 1: // used by static library or directly included sources
            usedBySrcIncluded = value;
            return TBBMALLOC_OK;
        default:
            return TBBMALLOC_INVALID_PARAM;
        }
#endif
    } else if (param == TBBMALLOC_SET_HUGE_SIZE_THRESHOLD) {
        defaultMemPool->extMemPool.loc.setHugeSizeThreshold((size_t)value);
        return TBBMALLOC_OK;
    }
    return TBBMALLOC_INVALID_PARAM;
}

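/*
 * Usage sketch (illustrative; the exact release policy behind the soft limit
 * is backend-defined):
 *
 *   // advise the backend to start returning memory to the OS beyond ~1 GB
 *   int rc = scalable_allocation_mode(TBBMALLOC_SET_SOFT_HEAP_LIMIT, 1024*1024*1024);
 *   // rc == TBBMALLOC_OK; an unknown param yields TBBMALLOC_INVALID_PARAM
 */
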
extern "C" int scalable_allocation_command(int cmd, void *param)
{
    if (param)
        return TBBMALLOC_INVALID_PARAM;

    bool released = false;
    switch(cmd) {
    case TBBMALLOC_CLEAN_THREAD_BUFFERS:
        if (TLSData *tls = defaultMemPool->getTLS(/*create=*/false))
            released = tls->externalCleanup(/*cleanOnlyUnused=*/false, /*cleanBins=*/true);
        break;
    case TBBMALLOC_CLEAN_ALL_BUFFERS:
        released = defaultMemPool->extMemPool.hardCachesCleanup();
        break;
    default:
        return TBBMALLOC_INVALID_PARAM;
    }
    return released ? TBBMALLOC_OK : TBBMALLOC_NO_EFFECT;
}

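/*
 * Usage sketch (illustrative):
 *
 *   int rc = scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, NULL);
 *   // rc == TBBMALLOC_OK if any memory was released, TBBMALLOC_NO_EFFECT if not;
 *   // the param argument must be NULL, otherwise TBBMALLOC_INVALID_PARAM is returned.
 */
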