1 /*
2     Copyright (c) 2005-2020 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 
18 #include "tbbmalloc_internal.h"
19 #include <errno.h>
20 #include <new>        /* for placement new */
21 #include <string.h>   /* for memset */
22 
23 #include "../tbb/tbb_version.h"
24 #include "../tbb/tbb_environment.h"
25 #include "../tbb/itt_notify.h" // for __TBB_load_ittnotify()
26 
27 #if USE_PTHREAD
28     #define TlsSetValue_func pthread_setspecific
29     #define TlsGetValue_func pthread_getspecific
30     #define GetMyTID() pthread_self()
31     #include <sched.h>
32     inline void do_yield() {sched_yield();}
33     extern "C" { static void mallocThreadShutdownNotification(void*); }
34     #if __sun || __SUNPRO_CC
35     #define __asm__ asm
36     #endif
37     #include <unistd.h> // sysconf(_SC_PAGESIZE)
38 #elif USE_WINTHREAD
39     #define GetMyTID() GetCurrentThreadId()
40 #if __TBB_WIN8UI_SUPPORT
41     #include<thread>
42     #define TlsSetValue_func FlsSetValue
43     #define TlsGetValue_func FlsGetValue
44     #define TlsAlloc() FlsAlloc(NULL)
45     #define TLS_ALLOC_FAILURE FLS_OUT_OF_INDEXES
46     #define TlsFree FlsFree
47     inline void do_yield() {std::this_thread::yield();}
48 #else
49     #define TlsSetValue_func TlsSetValue
50     #define TlsGetValue_func TlsGetValue
51     #define TLS_ALLOC_FAILURE TLS_OUT_OF_INDEXES
52     inline void do_yield() {SwitchToThread();}
53 #endif
54 #else
55     #error Must define USE_PTHREAD or USE_WINTHREAD
56 
57 #endif
58 
59 
60 #define FREELIST_NONBLOCKING 1
61 
62 namespace rml {
63 class MemoryPool;
64 namespace internal {
65 
66 class Block;
67 class MemoryPool;
68 
69 #if MALLOC_CHECK_RECURSION
70 
71 inline bool isMallocInitialized();
72 
73 bool RecursiveMallocCallProtector::noRecursion() {
74     MALLOC_ASSERT(isMallocInitialized(),
75                   "Recursion status can be checked only when initialization was done.");
76     return !mallocRecursionDetected;
77 }
78 
79 #endif // MALLOC_CHECK_RECURSION
80 
81 /** Support for handling the special UNUSABLE pointer state **/
82 const intptr_t UNUSABLE = 0x1;
83 inline bool isSolidPtr( void* ptr ) {
84     return (UNUSABLE|(intptr_t)ptr)!=UNUSABLE;
85 }
86 inline bool isNotForUse( void* ptr ) {
87     return (intptr_t)ptr==UNUSABLE;
88 }
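/* Added clarification (not part of the original sources): a pointer is "solid" iff it
 * is neither NULL nor the UNUSABLE marker, e.g.
 *   isSolidPtr(NULL)            == false   // 0x1 | 0x0 == UNUSABLE
 *   isSolidPtr((void*)UNUSABLE) == false   // 0x1 | 0x1 == UNUSABLE
 *   isSolidPtr(anyOtherAddress) == true    // extra address bits survive the OR
 * while isNotForUse() is true only for the UNUSABLE marker itself. */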
89 
90 /*
91  * Block::objectSize value used to mark blocks allocated by startupAlloc
92  */
93 const uint16_t startupAllocObjSizeMark = ~(uint16_t)0;
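/* Added note: ~(uint16_t)0 == 0xFFFF, which is larger than any regular small-object
 * size (objectSize is otherwise below minLargeObjectSize), so this mark cannot collide
 * with a genuine size value. */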
94 
95 /*
96  * The following constant is used to define the size of struct Block, the block header.
97  * The intent is to have the size of a Block be a multiple of the cache line size; this allows us to
98  * get good alignment at the cost of some overhead equal to the amount of padding included in the Block.
99  */
100 const int blockHeaderAlignment = estimatedCacheLineSize;
101 
102 /********* The data structures and global objects        **************/
103 
104 /*
105  * The malloc routines themselves need to be able to occasionally malloc some space,
106  * in order to set up the structures used by the thread local structures. This
107  * class provides that function.
108  */
109 class BootStrapBlocks {
110     MallocMutex bootStrapLock;
111     Block      *bootStrapBlock;
112     Block      *bootStrapBlockUsed;
113     FreeObject *bootStrapObjectList;
114 public:
115     void *allocate(MemoryPool *memPool, size_t size);
116     void free(void* ptr);
117     void reset();
118 };
119 
120 #if USE_INTERNAL_TID
121 class ThreadId {
122     static tls_key_t Tid_key;
123     static intptr_t ThreadCount;
124 
125     unsigned int id;
126 
127     static unsigned int tlsNumber() {
128         unsigned int result = reinterpret_cast<intptr_t>(TlsGetValue_func(Tid_key));
129         if( !result ) {
130             RecursiveMallocCallProtector scoped;
131             // Thread-local value is zero -> first call from this thread,
132             // need to initialize with next ID value (IDs start from 1)
133             result = AtomicIncrement(ThreadCount); // returned new value!
134             TlsSetValue_func( Tid_key, reinterpret_cast<void*>(result) );
135         }
136         return result;
137     }
138 public:
139     static bool init() {
140 #if USE_WINTHREAD
141         Tid_key = TlsAlloc();
142         if (Tid_key == TLS_ALLOC_FAILURE)
143             return false;
144 #else
145         int status = pthread_key_create( &Tid_key, NULL );
146         if ( status ) {
147             fprintf (stderr, "The memory manager cannot create tls key during initialization\n");
148             return false;
149         }
150 #endif /* USE_WINTHREAD */
151         return true;
152     }
153     static void destroy() {
154         if( Tid_key ) {
155 #if USE_WINTHREAD
156             BOOL status = !(TlsFree( Tid_key ));  // fail is zero
157 #else
158             int status = pthread_key_delete( Tid_key );
159 #endif /* USE_WINTHREAD */
160             if ( status )
161                 fprintf (stderr, "The memory manager cannot delete tls key\n");
162             Tid_key = 0;
163         }
164     }
165 
166     ThreadId() : id(ThreadId::tlsNumber()) {}
167     bool isCurrentThreadId() const { return id == ThreadId::tlsNumber(); }
168 
169 #if COLLECT_STATISTICS || MALLOC_TRACE
170     friend unsigned int getThreadId() { return ThreadId::tlsNumber(); }
171 #endif
172 #if COLLECT_STATISTICS
173     static unsigned getMaxThreadId() { return ThreadCount; }
174 
175     friend int STAT_increment(ThreadId tid, int bin, int ctr);
176 #endif
177 };
178 
179 tls_key_t ThreadId::Tid_key;
180 intptr_t ThreadId::ThreadCount;
181 
182 #if COLLECT_STATISTICS
183 int STAT_increment(ThreadId tid, int bin, int ctr)
184 {
185     return ::STAT_increment(tid.id, bin, ctr);
186 }
187 #endif
188 
189 #else // USE_INTERNAL_TID
190 
191 class ThreadId {
192 #if USE_PTHREAD
193     pthread_t tid;
194 #else
195     DWORD     tid;
196 #endif
197 public:
198     ThreadId() : tid(GetMyTID()) {}
199 #if USE_PTHREAD
200     bool isCurrentThreadId() const { return pthread_equal(pthread_self(), tid); }
201 #else
202     bool isCurrentThreadId() const { return GetCurrentThreadId() == tid; }
203 #endif
204     static bool init() { return true; }
205     static void destroy() {}
206 };
207 
208 #endif // USE_INTERNAL_TID
209 
210 /*********** Code to provide thread ID and a thread-local void pointer **********/
211 
212 bool TLSKey::init()
213 {
214 #if USE_WINTHREAD
215     TLS_pointer_key = TlsAlloc();
216     if (TLS_pointer_key == TLS_ALLOC_FAILURE)
217         return false;
218 #else
219     int status = pthread_key_create( &TLS_pointer_key, mallocThreadShutdownNotification );
220     if ( status )
221         return false;
222 #endif /* USE_WINTHREAD */
223     return true;
224 }
225 
226 bool TLSKey::destroy()
227 {
228 #if USE_WINTHREAD
229     BOOL status1 = !(TlsFree(TLS_pointer_key)); // fail is zero
230 #else
231     int status1 = pthread_key_delete(TLS_pointer_key);
232 #endif /* USE_WINTHREAD */
233     MALLOC_ASSERT(!status1, "The memory manager cannot delete tls key.");
234     return status1==0;
235 }
236 
237 inline TLSData* TLSKey::getThreadMallocTLS() const
238 {
239     return (TLSData *)TlsGetValue_func( TLS_pointer_key );
240 }
241 
242 inline void TLSKey::setThreadMallocTLS( TLSData * newvalue ) {
243     RecursiveMallocCallProtector scoped;
244     TlsSetValue_func( TLS_pointer_key, newvalue );
245 }
246 
247 /* The 'next' field in the block header has to maintain some invariants:
248  *   it needs to be on a 16K boundary and the first field in the block.
249  *   Any value stored there needs to have the lower 14 bits set to 0
250  *   so that various asserts work. This means that if you want to smash this memory
251  *   for debugging purposes you will need to obey this invariant.
252  * The total size of the header needs to be a power of 2 to simplify
253  * the alignment requirements. For now it is a 128 byte structure.
254  * To avoid false sharing, the fields changed only locally are separated
255  * from the fields changed by foreign threads.
256  * Changing the size of the block header would require changing
257  * some bin allocation sizes, in particular the "fitting" sizes (see below).
258  */
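/* Added illustration (assumes the 16KB slab size used throughout this file): since a
 * Block always starts its slab, ((uintptr_t)block & (slabSize - 1)) == 0, i.e. the
 * lower 14 bits of its address are zero -- the invariant the 'next' field relies on. */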
259 class Bin;
260 class StartupBlock;
261 
262 class MemoryPool {
263     // if no explicit grainsize, expect to see malloc in user's pAlloc
264     // and set reasonable low granularity
265     static const size_t defaultGranularity = estimatedCacheLineSize;
266 
267     MemoryPool();                  // deny
268 public:
269     static MallocMutex  memPoolListLock;
270 
271     // list of all active pools is used to release
272     // all TLS data on thread termination or library unload
273     MemoryPool    *next,
274                   *prev;
275     ExtMemoryPool  extMemPool;
276     BootStrapBlocks bootStrapBlocks;
277 
278     static void initDefaultPool();
279 
280     bool init(intptr_t poolId, const MemPoolPolicy* memPoolPolicy);
281     bool reset();
282     bool destroy();
283     void onThreadShutdown(TLSData *tlsData);
284 
285     inline TLSData *getTLS(bool create);
286     void clearTLS() { extMemPool.tlsPointerKey.setThreadMallocTLS(NULL); }
287 
288     Block *getEmptyBlock(size_t size);
289     void returnEmptyBlock(Block *block, bool poolTheBlock);
290 
291     // get/put large object to/from local large object cache
292     void *getFromLLOCache(TLSData *tls, size_t size, size_t alignment);
293     void putToLLOCache(TLSData *tls, void *object);
294 };
295 
296 static intptr_t defaultMemPool_space[sizeof(MemoryPool)/sizeof(intptr_t) +
297                                      (sizeof(MemoryPool)%sizeof(intptr_t)? 1 : 0)];
298 static MemoryPool *defaultMemPool = (MemoryPool*)defaultMemPool_space;
299 const size_t MemoryPool::defaultGranularity;
300 // zero-initialized
301 MallocMutex  MemoryPool::memPoolListLock;
302 // TODO: move huge page status to the default pool, because that's its state
303 HugePagesStatus hugePages;
304 static bool usedBySrcIncluded = false;
305 
306 // Padding helpers
307 template<size_t padd>
308 struct PaddingImpl {
309     size_t       __padding[padd];
310 };
311 
312 template<>
313 struct PaddingImpl<0> {};
314 
315 template<int N>
316 struct Padding : PaddingImpl<N/sizeof(size_t)> {};
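// Added note: Padding<N> contributes N/sizeof(size_t) words, and Padding<0> resolves to
// the empty PaddingImpl<0> specialization, so no extra bytes are added (empty-base
// optimization) when the padded class already ends on the alignment boundary.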
317 
318 // Slab block is 16KB-aligned. To prevent false sharing, separate locally-accessed
319 // fields and fields commonly accessed by non-owner threads.
320 class GlobalBlockFields : public BlockI {
321 protected:
322     FreeObject  *publicFreeList;
323     Block       *nextPrivatizable;
324     MemoryPool  *poolPtr;
325 };
326 
327 class LocalBlockFields : public GlobalBlockFields, Padding<blockHeaderAlignment - sizeof(GlobalBlockFields)>  {
328 protected:
329     Block       *next;
330     Block       *previous;        /* Use double linked list to speed up removal */
331     FreeObject  *bumpPtr;         /* Bump pointer moves from the end to the beginning of a block */
332     FreeObject  *freeList;
333     /* Pointer to local data for the owner thread. Used for quickly finding the TLS
334        when releasing an object from a block that the current thread owned.
335        NULL for orphaned blocks. */
336     TLSData     *tlsPtr;
337     ThreadId     ownerTid;        /* the ID of the thread that owns or last owned the block */
338     BackRefIdx   backRefIdx;
339     uint16_t     allocatedCount;  /* Number of objects allocated (obviously by the owning thread) */
340     uint16_t     objectSize;
341     bool         isFull;
342 
343     friend class FreeBlockPool;
344     friend class StartupBlock;
345     friend class LifoList;
346     friend void *BootStrapBlocks::allocate(MemoryPool *, size_t);
347     friend bool OrphanedBlocks::cleanup(Backend*);
348     friend Block *MemoryPool::getEmptyBlock(size_t);
349 };
350 
351 // Use inheritance to guarantee that user data starts on the next cache line.
352 // Can't use a member for it, because when LocalBlockFields already ends on a cache line,
353 // we must have no additional memory consumption with any compiler.
354 class Block : public LocalBlockFields,
355               Padding<2*blockHeaderAlignment - sizeof(LocalBlockFields)> {
356 public:
357     bool empty() const {
358         if (allocatedCount > 0) return false;
359         MALLOC_ASSERT(!isSolidPtr(publicFreeList), ASSERT_TEXT);
360         return true;
361     }
362     inline FreeObject* allocate();
363     inline FreeObject *allocateFromFreeList();
364 
365     inline bool adjustFullness();
366     void adjustPositionInBin(Bin* bin = NULL);
367 
368     bool freeListNonNull() { return freeList; }
369     void freePublicObject(FreeObject *objectToFree);
370     inline void freeOwnObject(void *object);
371     void reset();
372     void privatizePublicFreeList( bool reset = true );
373     void restoreBumpPtr();
374     void privatizeOrphaned(TLSData *tls, unsigned index);
375     bool readyToShare();
376     void shareOrphaned(intptr_t binTag, unsigned index);
377     unsigned int getSize() const {
378         MALLOC_ASSERT(isStartupAllocObject() || objectSize<minLargeObjectSize,
379                       "Invalid object size");
380         return isStartupAllocObject()? 0 : objectSize;
381     }
382     const BackRefIdx *getBackRefIdx() const { return &backRefIdx; }
383     inline bool isOwnedByCurrentThread() const;
384     bool isStartupAllocObject() const { return objectSize == startupAllocObjSizeMark; }
385     inline FreeObject *findObjectToFree(const void *object) const;
386     void checkFreePrecond(const void *object) const {
387 #if MALLOC_DEBUG
388         const char *msg = "Possible double free or heap corruption.";
389         // small objects are always at least sizeof(size_t) Byte aligned,
390         // try to check this before this dereference as for invalid objects
391         // this may be unreadable
392         MALLOC_ASSERT(isAligned(object, sizeof(size_t)), "Try to free invalid small object");
393         // releasing to free slab
394         MALLOC_ASSERT(allocatedCount>0, msg);
395         // must not point to slab's header
396         MALLOC_ASSERT((uintptr_t)object - (uintptr_t)this >= sizeof(Block), msg);
397         if (startupAllocObjSizeMark == objectSize) // startup block
398             MALLOC_ASSERT(object<=bumpPtr, msg);
399         else {
400             // non-startup objects are 8 Byte aligned
401             MALLOC_ASSERT(isAligned(object, 8), "Try to free invalid small object");
402             MALLOC_ASSERT(allocatedCount <= (slabSize-sizeof(Block))/objectSize
403                           && (!bumpPtr || object>bumpPtr), msg);
404             FreeObject *toFree = findObjectToFree(object);
405             // check against head of freeList, as this is mostly
406             // expected after double free
407             MALLOC_ASSERT(toFree != freeList, msg);
408             // check against head of publicFreeList, to detect double free
409             // involving foreign thread
410             MALLOC_ASSERT(toFree != publicFreeList, msg);
411         }
412 #else
413         suppress_unused_warning(object);
414 #endif
415     }
416     void initEmptyBlock(TLSData *tls, size_t size);
417     size_t findObjectSize(void *object) const;
418     MemoryPool *getMemPool() const { return poolPtr; } // do not use on the hot path!
419 
420 protected:
421     void cleanBlockHeader();
422 
423 private:
424     static const float emptyEnoughRatio; /* Threshold on free space needed to "reactivate" a block */
425 
426     inline FreeObject *allocateFromBumpPtr();
427     inline FreeObject *findAllocatedObject(const void *address) const;
428     inline bool isProperlyPlaced(const void *object) const;
429     inline void markOwned(TLSData *tls) {
430         MALLOC_ASSERT(!tlsPtr, ASSERT_TEXT);
431         ownerTid = ThreadId(); /* save the ID of the current thread */
432         tlsPtr = tls;
433     }
434     inline void markOrphaned() {
435         MALLOC_ASSERT(tlsPtr, ASSERT_TEXT);
436         tlsPtr = NULL;
437     }
438 
439     friend class Bin;
440     friend class TLSData;
441     friend bool MemoryPool::destroy();
442 };
443 
444 const float Block::emptyEnoughRatio = 1.0 / 4.0;
445 
446 MALLOC_STATIC_ASSERT(sizeof(Block) <= 2*estimatedCacheLineSize,
447     "The class Block does not fit into 2 cache lines on this platform. "
448     "Defining USE_INTERNAL_TID may help to fix it.");
449 
450 class Bin {
451 private:
452     Block      *activeBlk;
453     Block      *mailbox;
454     MallocMutex mailLock;
455 
456 public:
457     inline Block* getActiveBlock() const { return activeBlk; }
458     void resetActiveBlock() { activeBlk = NULL; }
459     bool activeBlockUnused() const { return activeBlk && !activeBlk->allocatedCount; }
460     inline void setActiveBlock(Block *block);
461     inline Block* setPreviousBlockActive();
462     Block* getPrivatizedFreeListBlock();
463     void moveBlockToFront(Block *block);
464     bool cleanPublicFreeLists();
465     void processEmptyBlock(Block *block, bool poolTheBlock);
466     void addPublicFreeListBlock(Block* block);
467 
468     void outofTLSBin(Block* block);
469     void verifyTLSBin(size_t size) const;
470     void pushTLSBin(Block* block);
471 
472     void verifyInitState() const {
473         MALLOC_ASSERT( !activeBlk, ASSERT_TEXT );
474         MALLOC_ASSERT( !mailbox, ASSERT_TEXT );
475     }
476 
477     friend void Block::freePublicObject (FreeObject *objectToFree);
478 };
479 
480 /********* End of the data structures                    **************/
481 
482 /*
483  * There are bins for all 8 byte aligned objects less than this segregated size; 8 bins in total
484  */
485 const uint32_t minSmallObjectIndex = 0;
486 const uint32_t numSmallObjectBins = 8;
487 const uint32_t maxSmallObjectSize = 64;
488 
489 /*
490  * There are 4 bins between each couple of powers of 2 [64-128-256-...]
491  * from maxSmallObjectSize till this size; 16 bins in total
492  */
493 const uint32_t minSegregatedObjectIndex = minSmallObjectIndex+numSmallObjectBins;
494 const uint32_t numSegregatedObjectBins = 16;
495 const uint32_t maxSegregatedObjectSize = 1024;
496 
497 /*
498  * And there are 5 bins with allocation sizes that are multiples of estimatedCacheLineSize
499  * and selected to fit 9, 6, 4, 3, and 2 allocations in a block.
500  */
501 const uint32_t minFittingIndex = minSegregatedObjectIndex+numSegregatedObjectBins;
502 const uint32_t numFittingBins = 5;
503 
504 const uint32_t fittingAlignment = estimatedCacheLineSize;
505 
506 #define SET_FITTING_SIZE(N) ( (slabSize-sizeof(Block))/N ) & ~(fittingAlignment-1)
507 // For blockSize=16*1024, sizeof(Block)=2*estimatedCacheLineSize and fittingAlignment=estimatedCacheLineSize,
508 // the comments show the fitting sizes and the amounts left unused for estimatedCacheLineSize=64/128:
509 const uint32_t fittingSize1 = SET_FITTING_SIZE(9); // 1792/1792 128/000
510 const uint32_t fittingSize2 = SET_FITTING_SIZE(6); // 2688/2688 128/000
511 const uint32_t fittingSize3 = SET_FITTING_SIZE(4); // 4032/3968 128/256
512 const uint32_t fittingSize4 = SET_FITTING_SIZE(3); // 5376/5376 128/000
513 const uint32_t fittingSize5 = SET_FITTING_SIZE(2); // 8128/8064 000/000
514 #undef SET_FITTING_SIZE
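/* Added worked example (assumes estimatedCacheLineSize == 64, hence sizeof(Block) == 128,
 * fittingAlignment == 64 and slabSize == 16*1024):
 *   SET_FITTING_SIZE(9) = ((16384-128)/9) & ~63 = 1806 & ~63 = 1792
 *   SET_FITTING_SIZE(2) = ((16384-128)/2) & ~63 = 8128 & ~63 = 8128
 * matching the first column of the comments above; the second column corresponds to
 * estimatedCacheLineSize == 128. */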
515 
516 /*
517  * The total number of thread-specific Block-based bins
518  */
519 const uint32_t numBlockBins = minFittingIndex+numFittingBins;
520 
521 /*
522  * Objects of this size and larger are considered large objects.
523  */
524 const uint32_t minLargeObjectSize = fittingSize5 + 1;
525 
526 /*
527  * Per-thread pool of slab blocks. The idea behind it is to not share with other
528  * threads memory that is likely in the local cache(s) of our CPU.
529  */
530 class FreeBlockPool {
531 private:
532     Block      *head;
533     int         size;
534     Backend    *backend;
535     bool        lastAccessMiss;
536 public:
537     static const int POOL_HIGH_MARK = 32;
538     static const int POOL_LOW_MARK  = 8;
539 
540     class ResOfGet {
541         ResOfGet();
542     public:
543         Block* block;
544         bool   lastAccMiss;
545         ResOfGet(Block *b, bool lastMiss) : block(b), lastAccMiss(lastMiss) {}
546     };
547 
548     // allocated in zero-initialized memory
549     FreeBlockPool(Backend *bknd) : backend(bknd) {}
550     ResOfGet getBlock();
551     void returnBlock(Block *block);
552     bool externalCleanup(); // can be called by another thread
553 };
554 
555 template<int LOW_MARK, int HIGH_MARK>
556 class LocalLOCImpl {
557 private:
558     static const size_t MAX_TOTAL_SIZE = 4*1024*1024;
559     // TODO: can single-linked list be faster here?
560     LargeMemoryBlock *head,
561                      *tail; // needed when releasing on overflow
562     size_t            totalSize;
563     int               numOfBlocks;
564 public:
565     bool put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool);
566     LargeMemoryBlock *get(size_t size);
567     bool externalCleanup(ExtMemoryPool *extMemPool);
568 #if __TBB_MALLOC_WHITEBOX_TEST
569     LocalLOCImpl() : head(NULL), tail(NULL), totalSize(0), numOfBlocks(0) {}
570     static size_t getMaxSize() { return MAX_TOTAL_SIZE; }
571     static const int LOC_HIGH_MARK = HIGH_MARK;
572 #else
573     // no ctor, object must be created in zero-initialized memory
574 #endif
575 };
576 
577 typedef LocalLOCImpl<8,32> LocalLOC; // set production code parameters
578 
579 class TLSData : public TLSRemote {
580     MemoryPool   *memPool;
581 public:
582     Bin           bin[numBlockBinLimit];
583     FreeBlockPool freeSlabBlocks;
584     LocalLOC      lloc;
585     unsigned      currCacheIdx;
586 private:
587     bool unused;
588 public:
589     TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd) {}
590     MemoryPool *getMemPool() const { return memPool; }
591     Bin* getAllocationBin(size_t size);
592     void release();
593     bool externalCleanup(bool cleanOnlyUnused, bool cleanBins) {
594         if (!unused && cleanOnlyUnused) return false;
595         // Heavy operation in terms of synchronization complexity,
596         // should be called only for the current thread
597         bool released = cleanBins ? cleanupBlockBins() : false;
598         // both cleanups to be called, and the order is not important
599         return released | lloc.externalCleanup(&memPool->extMemPool) | freeSlabBlocks.externalCleanup();
600     }
601     bool cleanupBlockBins();
602     void markUsed() { unused = false; } // called by owner when TLS touched
603     void markUnused() { unused =  true; } // can be called by a non-owner thread
604 };
605 
606 TLSData *TLSKey::createTLS(MemoryPool *memPool, Backend *backend)
607 {
608     MALLOC_ASSERT( sizeof(TLSData) >= sizeof(Bin) * numBlockBins + sizeof(FreeBlockPool), ASSERT_TEXT );
609     TLSData* tls = (TLSData*) memPool->bootStrapBlocks.allocate(memPool, sizeof(TLSData));
610     if ( !tls )
611         return NULL;
612     new(tls) TLSData(memPool, backend);
613     /* the block contains zeroes after bootStrapMalloc, so bins are initialized */
614 #if MALLOC_DEBUG
615     for (uint32_t i = 0; i < numBlockBinLimit; i++)
616         tls->bin[i].verifyInitState();
617 #endif
618     setThreadMallocTLS(tls);
619     memPool->extMemPool.allLocalCaches.registerThread(tls);
620     return tls;
621 }
622 
623 bool TLSData::cleanupBlockBins()
624 {
625     bool released = false;
626     for (uint32_t i = 0; i < numBlockBinLimit; i++) {
627         released |= bin[i].cleanPublicFreeLists();
628         // After cleaning public free lists, only the active block might be empty.
629         // Do not use processEmptyBlock because it will just restore bumpPtr.
630         Block *block = bin[i].getActiveBlock();
631         if (block && block->empty()) {
632             bin[i].outofTLSBin(block);
633             memPool->returnEmptyBlock(block, /*poolTheBlock=*/false);
634             released = true;
635         }
636     }
637     return released;
638 }
639 
640 bool ExtMemoryPool::releaseAllLocalCaches()
641 {
642     // Iterate all registered TLS data and clean LLOC and Slab pools
643     bool released = allLocalCaches.cleanup(/*cleanOnlyUnused=*/false);
644 
645     // Bins privatization is done only for the current thread
646     if (TLSData *tlsData = tlsPointerKey.getThreadMallocTLS())
647         released |= tlsData->cleanupBlockBins();
648 
649     return released;
650 }
651 
652 void AllLocalCaches::registerThread(TLSRemote *tls)
653 {
654     tls->prev = NULL;
655     MallocMutex::scoped_lock lock(listLock);
656     MALLOC_ASSERT(head!=tls, ASSERT_TEXT);
657     tls->next = head;
658     if (head)
659         head->prev = tls;
660     head = tls;
661     MALLOC_ASSERT(head->next!=head, ASSERT_TEXT);
662 }
663 
664 void AllLocalCaches::unregisterThread(TLSRemote *tls)
665 {
666     MallocMutex::scoped_lock lock(listLock);
667     MALLOC_ASSERT(head, "Can't unregister thread: no threads are registered.");
668     if (head == tls)
669         head = tls->next;
670     if (tls->next)
671         tls->next->prev = tls->prev;
672     if (tls->prev)
673         tls->prev->next = tls->next;
674     MALLOC_ASSERT(!tls->next || tls->next->next!=tls->next, ASSERT_TEXT);
675 }
676 
677 bool AllLocalCaches::cleanup(bool cleanOnlyUnused)
678 {
679     bool released = false;
680     {
681         MallocMutex::scoped_lock lock(listLock);
682         for (TLSRemote *curr=head; curr; curr=curr->next)
683             released |= static_cast<TLSData*>(curr)->externalCleanup(cleanOnlyUnused, /*cleanBins=*/false);
684     }
685     return released;
686 }
687 
688 void AllLocalCaches::markUnused()
689 {
690     bool locked;
691     MallocMutex::scoped_lock lock(listLock, /*block=*/false, &locked);
692     if (!locked) // do not wait for marking if someone is doing something with it
693         return;
694 
695     for (TLSRemote *curr=head; curr; curr=curr->next)
696         static_cast<TLSData*>(curr)->markUnused();
697 }
698 
699 #if MALLOC_CHECK_RECURSION
700 MallocMutex RecursiveMallocCallProtector::rmc_mutex;
701 pthread_t   RecursiveMallocCallProtector::owner_thread;
702 void       *RecursiveMallocCallProtector::autoObjPtr;
703 bool        RecursiveMallocCallProtector::mallocRecursionDetected;
704 #if (__FreeBSD__||__DragonFly__)
705 bool        RecursiveMallocCallProtector::canUsePthread;
706 #endif
707 
708 #endif
709 
710 /*********** End code to provide thread ID and a TLS pointer **********/
711 
712 // Parameter for isLargeObject, keeps our expectations on memory origin.
713 // Assertions must use unknownMem to reliably report object invalidity.
714 enum MemoryOrigin {
715     ourMem,    // allocated by TBB allocator
716     unknownMem // can be allocated by system allocator or TBB allocator
717 };
718 
719 template<MemoryOrigin> bool isLargeObject(void *object);
720 static void *internalMalloc(size_t size);
721 static void internalFree(void *object);
722 static void *internalPoolMalloc(MemoryPool* mPool, size_t size);
723 static bool internalPoolFree(MemoryPool *mPool, void *object, size_t size);
724 
725 #if !MALLOC_DEBUG
726 #if __INTEL_COMPILER || _MSC_VER
727 #define NOINLINE(decl) __declspec(noinline) decl
728 #define ALWAYSINLINE(decl) __forceinline decl
729 #elif __GNUC__
730 #define NOINLINE(decl) decl __attribute__ ((noinline))
731 #define ALWAYSINLINE(decl) decl __attribute__ ((always_inline))
732 #else
733 #define NOINLINE(decl) decl
734 #define ALWAYSINLINE(decl) decl
735 #endif
736 
737 static NOINLINE( bool doInitialization() );
738 ALWAYSINLINE( bool isMallocInitialized() );
739 
740 #undef ALWAYSINLINE
741 #undef NOINLINE
742 #endif /* !MALLOC_DEBUG */
743 
744 
745 /********* Now some rough utility code to deal with indexing the size bins. **************/
746 
747 /*
748  * Given a number return the highest non-zero bit in it. It is intended to work with 32-bit values only.
749  * Moreover, on IPF, for sake of simplicity and performance, it is narrowed to only serve for 64 to 1023.
750  * This is enough for current algorithm of distribution of sizes among bins.
751  * __TBB_Log2 is not used here to minimize dependencies on TBB specific sources.
752  */
753 #if _WIN64 && _MSC_VER>=1400 && !__INTEL_COMPILER
754 extern "C" unsigned char _BitScanReverse( unsigned long* i, unsigned long w );
755 #pragma intrinsic(_BitScanReverse)
756 #endif
757 static inline unsigned int highestBitPos(unsigned int n)
758 {
759     MALLOC_ASSERT( n>=64 && n<1024, ASSERT_TEXT ); // only needed for bsr array lookup, but always true
760     unsigned int pos;
761 #if __ARCH_x86_32||__ARCH_x86_64
762 
763 # if __linux__||__APPLE__||(__FreeBSD__||__DragonFly__)||__NetBSD__||__OpenBSD__||__sun||__MINGW32__
764     __asm__ ("bsr %1,%0" : "=r"(pos) : "r"(n));
765 # elif (_WIN32 && (!_WIN64 || __INTEL_COMPILER))
766     __asm
767     {
768         bsr eax, n
769         mov pos, eax
770     }
771 # elif _WIN64 && _MSC_VER>=1400
772     _BitScanReverse((unsigned long*)&pos, (unsigned long)n);
773 # else
774 #   error highestBitPos() not implemented for this platform
775 # endif
776 #elif __arm__
777     __asm__ __volatile__
778     (
779        "clz %0, %1\n"
780        "rsb %0, %0, %2\n"
781        :"=r" (pos) :"r" (n), "I" (31)
782     );
783 #else
784     static unsigned int bsr[16] = {0/*N/A*/,6,7,7,8,8,8,8,9,9,9,9,9,9,9,9};
785     pos = bsr[ n>>6 ];
786 #endif /* __ARCH_* */
787     return pos;
788 }
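// Added examples: highestBitPos(64) == 6, highestBitPos(100) == 6, highestBitPos(512) == 9,
// highestBitPos(1023) == 9, i.e. the zero-based position of the highest set bit, used
// below to select the power-of-two group of a segregated size.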
789 
790 template<bool Is32Bit>
791 unsigned int getSmallObjectIndex(unsigned int size)
792 {
793     return (size-1)>>3;
794 }
795 template<>
796 unsigned int getSmallObjectIndex</*Is32Bit=*/false>(unsigned int size)
797 {
798     // For 64-bit malloc, 16 byte alignment is needed except for bin 0.
799     unsigned int result = (size-1)>>3;
800     if (result) result |= 1; // 0,1,3,5,7; bins 2,4,6 are not aligned to 16 bytes
801     return result;
802 }
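// Added worked example: on 64-bit platforms a request of 24 bytes gives (24-1)>>3 == 2,
// then |1 promotes it to index 3 (the 32-byte bin), preserving 16-byte alignment;
// on 32-bit platforms the same request stays at index 2 (the 24-byte bin), and a
// request of 8 bytes maps to index 0 in both cases.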
803 /*
804  * Depending on indexRequest, for a given size return either the index into the bin
805  * for objects of this size, or the actual size of objects in this bin.
806  */
807 template<bool indexRequest>
808 static unsigned int getIndexOrObjectSize (unsigned int size)
809 {
810     if (size <= maxSmallObjectSize) { // selection from 8/16/24/32/40/48/56/64
811         unsigned int index = getSmallObjectIndex</*Is32Bit=*/(sizeof(size_t)<=4)>( size );
812          /* Bin 0 is for 8 bytes, bin 1 is for 16, and so forth */
813         return indexRequest ? index : (index+1)<<3;
814     }
815     else if (size <= maxSegregatedObjectSize ) { // 80/96/112/128 / 160/192/224/256 / 320/384/448/512 / 640/768/896/1024
816         unsigned int order = highestBitPos(size-1); // which group of bin sizes?
817         MALLOC_ASSERT( 6<=order && order<=9, ASSERT_TEXT );
818         if (indexRequest)
819             return minSegregatedObjectIndex - (4*6) - 4 + (4*order) + ((size-1)>>(order-2));
820         else {
821             unsigned int alignment = 128 >> (9-order); // alignment in the group
822             MALLOC_ASSERT( alignment==16 || alignment==32 || alignment==64 || alignment==128, ASSERT_TEXT );
823             return alignUp(size,alignment);
824         }
825     }
826     else {
827         if( size <= fittingSize3 ) {
828             if( size <= fittingSize2 ) {
829                 if( size <= fittingSize1 )
830                     return indexRequest ? minFittingIndex : fittingSize1;
831                 else
832                     return indexRequest ? minFittingIndex+1 : fittingSize2;
833             } else
834                 return indexRequest ? minFittingIndex+2 : fittingSize3;
835         } else {
836             if( size <= fittingSize5 ) {
837                 if( size <= fittingSize4 )
838                     return indexRequest ? minFittingIndex+3 : fittingSize4;
839                 else
840                     return indexRequest ? minFittingIndex+4 : fittingSize5;
841             } else {
842                 MALLOC_ASSERT( 0,ASSERT_TEXT ); // this should not happen
843                 return ~0U;
844             }
845         }
846     }
847 }
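/* Added worked example for the segregated range (using minSegregatedObjectIndex == 8
 * from the constants above): for size == 100, highestBitPos(99) == 6, so the index is
 *   8 - 4*6 - 4 + 4*6 + (99>>4) == 4 + 6 == 10,
 * and the returned object size is alignUp(100, 128>>(9-6)) == alignUp(100, 16) == 112. */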
848 
849 static unsigned int getIndex (unsigned int size)
850 {
851     return getIndexOrObjectSize</*indexRequest=*/true>(size);
852 }
853 
854 static unsigned int getObjectSize (unsigned int size)
855 {
856     return getIndexOrObjectSize</*indexRequest=*/false>(size);
857 }
858 
859 
860 void *BootStrapBlocks::allocate(MemoryPool *memPool, size_t size)
861 {
862     FreeObject *result;
863 
864     MALLOC_ASSERT( size == sizeof(TLSData), ASSERT_TEXT );
865 
866     { // Lock with acquire
867         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
868 
869         if( bootStrapObjectList) {
870             result = bootStrapObjectList;
871             bootStrapObjectList = bootStrapObjectList->next;
872         } else {
873             if (!bootStrapBlock) {
874                 bootStrapBlock = memPool->getEmptyBlock(size);
875                 if (!bootStrapBlock) return NULL;
876             }
877             result = bootStrapBlock->bumpPtr;
878             bootStrapBlock->bumpPtr = (FreeObject *)((uintptr_t)bootStrapBlock->bumpPtr - bootStrapBlock->objectSize);
879             if ((uintptr_t)bootStrapBlock->bumpPtr < (uintptr_t)bootStrapBlock+sizeof(Block)) {
880                 bootStrapBlock->bumpPtr = NULL;
881                 bootStrapBlock->next = bootStrapBlockUsed;
882                 bootStrapBlockUsed = bootStrapBlock;
883                 bootStrapBlock = NULL;
884             }
885         }
886     } // Unlock with release
887 
888     memset (result, 0, size);
889     return (void*)result;
890 }
891 
892 void BootStrapBlocks::free(void* ptr)
893 {
894     MALLOC_ASSERT( ptr, ASSERT_TEXT );
895     { // Lock with acquire
896         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
897         ((FreeObject*)ptr)->next = bootStrapObjectList;
898         bootStrapObjectList = (FreeObject*)ptr;
899     } // Unlock with release
900 }
901 
902 void BootStrapBlocks::reset()
903 {
904     bootStrapBlock = bootStrapBlockUsed = NULL;
905     bootStrapObjectList = NULL;
906 }
907 
908 #if !(FREELIST_NONBLOCKING)
909 static MallocMutex publicFreeListLock; // lock for changes of publicFreeList
910 #endif
911 
912 /********* End rough utility code  **************/
913 
914 /* LifoList assumes zero initialization so a vector of it can be created
915  * by just allocating some space with no call to constructor.
916  * On Linux, it seems to be necessary to avoid linking with C++ libraries.
917  *
918  * By usage convention there is no race on the initialization. */
919 LifoList::LifoList( ) : top(NULL)
920 {
921     // MallocMutex assumes zero initialization
922     memset(&lock, 0, sizeof(MallocMutex));
923 }
924 
925 void LifoList::push(Block *block)
926 {
927     MallocMutex::scoped_lock scoped_cs(lock);
928     block->next = top;
929     top = block;
930 }
931 
932 Block *LifoList::pop()
933 {
934     Block *block=NULL;
935     if (top) {
936         MallocMutex::scoped_lock scoped_cs(lock);
937         if (top) {
938             block = top;
939             top = block->next;
940         }
941     }
942     return block;
943 }
944 
945 Block *LifoList::grab()
946 {
947     Block *block = NULL;
948     if (top) {
949         MallocMutex::scoped_lock scoped_cs(lock);
950         block = top;
951         top = NULL;
952     }
953     return block;
954 }
955 
956 /********* Thread and block related code      *************/
957 
958 template<bool poolDestroy> void AllLargeBlocksList::releaseAll(Backend *backend) {
959      LargeMemoryBlock *next, *lmb = loHead;
960      loHead = NULL;
961 
962      for (; lmb; lmb = next) {
963          next = lmb->gNext;
964          if (poolDestroy) {
965              // as it's pool destruction, no need to return object to backend,
966              // only remove backrefs, as they are global
967              removeBackRef(lmb->backRefIdx);
968          } else {
969              // clean g(Next|Prev) to prevent removing lmb
970              // from AllLargeBlocksList inside returnLargeObject
971              lmb->gNext = lmb->gPrev = NULL;
972              backend->returnLargeObject(lmb);
973          }
974      }
975 }
976 
977 TLSData* MemoryPool::getTLS(bool create)
978 {
979     TLSData* tls = extMemPool.tlsPointerKey.getThreadMallocTLS();
980     if (create && !tls)
981         tls = extMemPool.tlsPointerKey.createTLS(this, &extMemPool.backend);
982     return tls;
983 }
984 
985 /*
986  * Return the bin for the given size.
987  */
988 inline Bin* TLSData::getAllocationBin(size_t size)
989 {
990     return bin + getIndex(size);
991 }
992 
993 /* Return an empty uninitialized block in a non-blocking fashion. */
994 Block *MemoryPool::getEmptyBlock(size_t size)
995 {
996     TLSData* tls = getTLS(/*create=*/false);
997     // try to use per-thread cache, if TLS available
998     FreeBlockPool::ResOfGet resOfGet = tls?
999         tls->freeSlabBlocks.getBlock() : FreeBlockPool::ResOfGet(NULL, false);
1000     Block *result = resOfGet.block;
1001 
1002     if (!result) { // not found in the local cache, ask the backend for slabs
1003         int num = resOfGet.lastAccMiss? Backend::numOfSlabAllocOnMiss : 1;
1004         BackRefIdx backRefIdx[Backend::numOfSlabAllocOnMiss];
1005 
1006         result = static_cast<Block*>(extMemPool.backend.getSlabBlock(num));
1007         if (!result) return NULL;
1008 
1009         if (!extMemPool.userPool())
1010             for (int i=0; i<num; i++) {
1011                 backRefIdx[i] = BackRefIdx::newBackRef(/*largeObj=*/false);
1012                 if (backRefIdx[i].isInvalid()) {
1013                     // roll back resource allocation
1014                     for (int j=0; j<i; j++)
1015                         removeBackRef(backRefIdx[j]);
1016                     Block *b = result;
1017                     for (int j=0; j<num; b=(Block*)((uintptr_t)b+slabSize), j++)
1018                         extMemPool.backend.putSlabBlock(b);
1019                     return NULL;
1020                 }
1021             }
1022         // resources were allocated, register blocks
1023         Block *b = result;
1024         for (int i=0; i<num; b=(Block*)((uintptr_t)b+slabSize), i++) {
1025             // slab block in user's pool must have invalid backRefIdx
1026             if (extMemPool.userPool()) {
1027                 new (&b->backRefIdx) BackRefIdx();
1028             } else {
1029                 setBackRef(backRefIdx[i], b);
1030                 b->backRefIdx = backRefIdx[i];
1031             }
1032             b->tlsPtr = tls;
1033             b->poolPtr = this;
1034             // all but first one go to per-thread pool
1035             if (i > 0) {
1036                 MALLOC_ASSERT(tls, ASSERT_TEXT);
1037                 tls->freeSlabBlocks.returnBlock(b);
1038             }
1039         }
1040     }
1041     MALLOC_ASSERT(result, ASSERT_TEXT);
1042     result->initEmptyBlock(tls, size);
1043     STAT_increment(getThreadId(), getIndex(result->objectSize), allocBlockNew);
1044     return result;
1045 }
1046 
1047 void MemoryPool::returnEmptyBlock(Block *block, bool poolTheBlock)
1048 {
1049     block->reset();
1050     if (poolTheBlock) {
1051         getTLS(/*create=*/false)->freeSlabBlocks.returnBlock(block);
1052     } else {
1053         // slab blocks in user's pools do not have valid backRefIdx
1054         if (!extMemPool.userPool())
1055             removeBackRef(*(block->getBackRefIdx()));
1056         extMemPool.backend.putSlabBlock(block);
1057     }
1058 }
1059 
1060 bool ExtMemoryPool::init(intptr_t poolId, rawAllocType rawAlloc,
1061                          rawFreeType rawFree, size_t granularity,
1062                          bool keepAllMemory, bool fixedPool)
1063 {
1064     this->poolId = poolId;
1065     this->rawAlloc = rawAlloc;
1066     this->rawFree = rawFree;
1067     this->granularity = granularity;
1068     this->keepAllMemory = keepAllMemory;
1069     this->fixedPool = fixedPool;
1070     this->delayRegsReleasing = false;
1071     if (!initTLS())
1072         return false;
1073     loc.init(this);
1074     backend.init(this);
1075     MALLOC_ASSERT(isPoolValid(), NULL);
1076     return true;
1077 }
1078 
1079 bool ExtMemoryPool::initTLS() { return tlsPointerKey.init(); }
1080 
1081 bool MemoryPool::init(intptr_t poolId, const MemPoolPolicy *policy)
1082 {
1083     if (!extMemPool.init(poolId, policy->pAlloc, policy->pFree,
1084                policy->granularity? policy->granularity : defaultGranularity,
1085                policy->keepAllMemory, policy->fixedPool))
1086         return false;
1087     {
1088         MallocMutex::scoped_lock lock(memPoolListLock);
1089         next = defaultMemPool->next;
1090         defaultMemPool->next = this;
1091         prev = defaultMemPool;
1092         if (next)
1093             next->prev = this;
1094     }
1095     return true;
1096 }
1097 
1098 bool MemoryPool::reset()
1099 {
1100     MALLOC_ASSERT(extMemPool.userPool(), "No reset for the system pool.");
1101     // memory is not released during pool reset
1102     // TODO: mark regions to release unused on next reset()
1103     extMemPool.delayRegionsReleasing(true);
1104 
1105     bootStrapBlocks.reset();
1106     extMemPool.lmbList.releaseAll</*poolDestroy=*/false>(&extMemPool.backend);
1107     if (!extMemPool.reset())
1108         return false;
1109 
1110     if (!extMemPool.initTLS())
1111         return false;
1112     extMemPool.delayRegionsReleasing(false);
1113     return true;
1114 }
1115 
1116 bool MemoryPool::destroy()
1117 {
1118 #if __TBB_MALLOC_LOCACHE_STAT
1119     extMemPool.loc.reportStat(stdout);
1120 #endif
1121 #if __TBB_MALLOC_BACKEND_STAT
1122     extMemPool.backend.reportStat(stdout);
1123 #endif
1124     {
1125         MallocMutex::scoped_lock lock(memPoolListLock);
1126         // remove itself from global pool list
1127         if (prev)
1128             prev->next = next;
1129         if (next)
1130             next->prev = prev;
1131     }
1132     // slab blocks in non-default pool do not have backreferences,
1133     // only large objects do
1134     if (extMemPool.userPool())
1135         extMemPool.lmbList.releaseAll</*poolDestroy=*/true>(&extMemPool.backend);
1136     else {
1137         // only one non-userPool() is supported now
1138         MALLOC_ASSERT(this==defaultMemPool, NULL);
1139         // Here and below in extMemPool.destroy(), do not restore the initial state
1140         // for a user pool, because it's just about to be released. But the system
1141         // pool is restored, so that it does not have to be zeroed on a subsequent reload.
1142         bootStrapBlocks.reset();
1143         extMemPool.orphanedBlocks.reset();
1144     }
1145     return extMemPool.destroy();
1146 }
1147 
1148 void MemoryPool::onThreadShutdown(TLSData *tlsData)
1149 {
1150     if (tlsData) { // might be called for "empty" TLS
1151         tlsData->release();
1152         bootStrapBlocks.free(tlsData);
1153         clearTLS();
1154     }
1155 }
1156 
1157 #if MALLOC_DEBUG
1158 void Bin::verifyTLSBin (size_t size) const
1159 {
1160 /* The debug version verifies the TLSBin as needed */
1161     uint32_t objSize = getObjectSize(size);
1162 
1163     if (activeBlk) {
1164         MALLOC_ASSERT( activeBlk->isOwnedByCurrentThread(), ASSERT_TEXT );
1165         MALLOC_ASSERT( activeBlk->objectSize == objSize, ASSERT_TEXT );
1166 #if MALLOC_DEBUG>1
1167         for (Block* temp = activeBlk->next; temp; temp=temp->next) {
1168             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1169             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1170             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1171             MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1172             if (temp->next) {
1173                 MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1174             }
1175         }
1176         for (Block* temp = activeBlk->previous; temp; temp=temp->previous) {
1177             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1178             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1179             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1180             MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1181             if (temp->previous) {
1182                 MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1183             }
1184         }
1185 #endif /* MALLOC_DEBUG>1 */
1186     }
1187 }
1188 #else /* MALLOC_DEBUG */
1189 inline void Bin::verifyTLSBin (size_t) const { }
1190 #endif /* MALLOC_DEBUG */
1191 
1192 /*
1193  * Add a block to the start of this tls bin list.
1194  */
1195 void Bin::pushTLSBin(Block* block)
1196 {
1197     /* The objectSize should be defined and not a parameter
1198        because the function is applied to partially filled blocks as well */
1199     unsigned int size = block->objectSize;
1200 
1201     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1202     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1203     MALLOC_ASSERT( block->next == NULL, ASSERT_TEXT );
1204     MALLOC_ASSERT( block->previous == NULL, ASSERT_TEXT );
1205 
1206     MALLOC_ASSERT( this, ASSERT_TEXT );
1207     verifyTLSBin(size);
1208 
1209     block->next = activeBlk;
1210     if( activeBlk ) {
1211         block->previous = activeBlk->previous;
1212         activeBlk->previous = block;
1213         if( block->previous )
1214             block->previous->next = block;
1215     } else {
1216         activeBlk = block;
1217     }
1218 
1219     verifyTLSBin(size);
1220 }
1221 
1222 /*
1223  * Take a block out of its tls bin (e.g. before removal).
1224  */
1225 void Bin::outofTLSBin(Block* block)
1226 {
1227     unsigned int size = block->objectSize;
1228 
1229     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1230     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1231 
1232     MALLOC_ASSERT( this, ASSERT_TEXT );
1233     verifyTLSBin(size);
1234 
1235     if (block == activeBlk) {
1236         activeBlk = block->previous? block->previous : block->next;
1237     }
1238     /* Unlink the block */
1239     if (block->previous) {
1240         MALLOC_ASSERT( block->previous->next == block, ASSERT_TEXT );
1241         block->previous->next = block->next;
1242     }
1243     if (block->next) {
1244         MALLOC_ASSERT( block->next->previous == block, ASSERT_TEXT );
1245         block->next->previous = block->previous;
1246     }
1247     block->next = NULL;
1248     block->previous = NULL;
1249 
1250     verifyTLSBin(size);
1251 }
1252 
1253 Block* Bin::getPrivatizedFreeListBlock()
1254 {
1255     Block* block;
1256     MALLOC_ASSERT( this, ASSERT_TEXT );
1257     // if this method is called, active block usage must be unsuccessful
1258     MALLOC_ASSERT( !activeBlk && !mailbox || activeBlk && activeBlk->isFull, ASSERT_TEXT );
1259 
1260 // the counter should be changed    STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1261     if (!FencedLoad((intptr_t&)mailbox)) // hotpath is empty mailbox
1262         return NULL;
1263     else { // mailbox is not empty, take lock and inspect it
1264         MallocMutex::scoped_lock scoped_cs(mailLock);
1265         block = mailbox;
1266         if( block ) {
1267             MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1268             MALLOC_ASSERT( !isNotForUse(block->nextPrivatizable), ASSERT_TEXT );
1269             mailbox = block->nextPrivatizable;
1270             block->nextPrivatizable = (Block*) this;
1271         }
1272     }
1273     if( block ) {
1274         MALLOC_ASSERT( isSolidPtr(block->publicFreeList), ASSERT_TEXT );
1275         block->privatizePublicFreeList();
1276         block->adjustPositionInBin(this);
1277     }
1278     return block;
1279 }
1280 
1281 void Bin::addPublicFreeListBlock(Block* block)
1282 {
1283     MallocMutex::scoped_lock scoped_cs(mailLock);
1284     block->nextPrivatizable = mailbox;
1285     mailbox = block;
1286 }
1287 
1288 // Process publicly freed objects in all blocks and return empty blocks
1289 // to the backend in order to reduce overall footprint.
1290 bool Bin::cleanPublicFreeLists()
1291 {
1292     Block* block;
1293     if (!FencedLoad((intptr_t&)mailbox))
1294         return false;
1295     else {
1296         // Grab all the blocks in the mailbox
1297         MallocMutex::scoped_lock scoped_cs(mailLock);
1298         block = mailbox;
1299         mailbox = NULL;
1300     }
1301     bool released = false;
1302     while (block) {
1303         MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1304         Block* tmp = block->nextPrivatizable;
1305         block->nextPrivatizable = (Block*) this;
1306         block->privatizePublicFreeList();
1307         if (block->empty()) {
1308             processEmptyBlock(block, /*poolTheBlock=*/false);
1309             released = true;
1310         } else
1311             block->adjustPositionInBin(this);
1312         block = tmp;
1313     }
1314     return released;
1315 }
1316 
1317 bool Block::adjustFullness()
1318 {
1319     if (bumpPtr) {
1320         /* If we are still using a bump ptr for this block it is empty enough to use. */
1321         STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1322         isFull = false;
1323     } else {
1324         const float threshold = (slabSize - sizeof(Block)) * (1 - emptyEnoughRatio);
1325         /* allocatedCount shows how many objects in the block are in use; however it still counts
1326          * objects freed by other threads; so a prior call to privatizePublicFreeList() is recommended */
1327         isFull = (allocatedCount*objectSize > threshold) ? true : false;
1328 #if COLLECT_STATISTICS
1329         if (isFull)
1330             STAT_increment(getThreadId(), getIndex(objectSize), examineNotEmpty);
1331         else
1332             STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1333 #endif
1334     }
1335     return isFull;
1336 }
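// Added arithmetic note (assumes estimatedCacheLineSize == 64, so sizeof(Block) == 128
// and slabSize == 16*1024): threshold == (16384-128) * (1 - 1/4) == 12192, so a block
// without a bump pointer is marked full once more than ~75% of its payload bytes are
// still allocated.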
1337 
1338 // This method resides in class Block, and not in class Bin, in order to avoid
1339 // calling getAllocationBin on a reasonably hot path in Block::freeOwnObject
1340 void Block::adjustPositionInBin(Bin* bin/*=NULL*/)
1341 {
1342     // If the block were full, but became empty enough to use,
1343     // move it to the front of the list
1344     if (isFull && !adjustFullness()) {
1345         if (!bin)
1346             bin = tlsPtr->getAllocationBin(objectSize);
1347         bin->moveBlockToFront(this);
1348     }
1349 }
1350 
1351 /* Restore the bump pointer for an empty block that is planned to be used */
1352 void Block::restoreBumpPtr()
1353 {
1354     MALLOC_ASSERT( allocatedCount == 0, ASSERT_TEXT );
1355     MALLOC_ASSERT( !isSolidPtr(publicFreeList), ASSERT_TEXT );
1356     STAT_increment(getThreadId(), getIndex(objectSize), freeRestoreBumpPtr);
1357     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1358     freeList = NULL;
1359     isFull = false;
1360 }
1361 
1362 void Block::freeOwnObject(void *object)
1363 {
1364     tlsPtr->markUsed();
1365     allocatedCount--;
1366     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1367 #if COLLECT_STATISTICS
1368     // Note that getAllocationBin is not called on the hottest path with statistics off.
1369     if (tlsPtr->getAllocationBin(objectSize)->getActiveBlock() != this)
1370         STAT_increment(getThreadId(), getIndex(objectSize), freeToInactiveBlock);
1371     else
1372         STAT_increment(getThreadId(), getIndex(objectSize), freeToActiveBlock);
1373 #endif
1374     if (empty()) {
1375         // If the last object of a slab is freed, the slab cannot be marked full
1376         MALLOC_ASSERT(!isFull, ASSERT_TEXT);
1377         tlsPtr->getAllocationBin(objectSize)->processEmptyBlock(this, /*poolTheBlock=*/true);
1378     } else { // hot path
1379         FreeObject *objectToFree = findObjectToFree(object);
1380         objectToFree->next = freeList;
1381         freeList = objectToFree;
1382         adjustPositionInBin();
1383     }
1384 }
1385 
1386 void Block::freePublicObject (FreeObject *objectToFree)
1387 {
1388     FreeObject *localPublicFreeList;
1389 
1390     MALLOC_ITT_SYNC_RELEASING(&publicFreeList);
1391 #if FREELIST_NONBLOCKING
1392     FreeObject *temp = publicFreeList;
1393     do {
1394         localPublicFreeList = objectToFree->next = temp;
1395         temp = (FreeObject*)AtomicCompareExchange(
1396                                 (intptr_t&)publicFreeList,
1397                                 (intptr_t)objectToFree, (intptr_t)localPublicFreeList );
1398         // no backoff necessary because trying to make change, not waiting for a change
1399     } while( temp != localPublicFreeList );
1400 #else
1401     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1402     {
1403         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1404         localPublicFreeList = objectToFree->next = publicFreeList;
1405         publicFreeList = objectToFree;
1406     }
1407 #endif
1408 
1409     if( localPublicFreeList==NULL ) {
1410         // if the block is abandoned, its nextPrivatizable pointer should be UNUSABLE
1411         // otherwise, it should point to the bin the block belongs to.
1412         // reading nextPrivatizable is thread-safe below, because:
1413         // 1) the executing thread atomically got publicFreeList==NULL and changed it to non-NULL;
1414         // 2) only owning thread can change it back to NULL,
1415         // 3) but it can not be done until the block is put to the mailbox
1416         // So the executing thread is now the only one that can change nextPrivatizable
1417         if( !isNotForUse(nextPrivatizable) ) {
1418             MALLOC_ASSERT( nextPrivatizable!=NULL, ASSERT_TEXT );
1419             Bin* theBin = (Bin*) nextPrivatizable;
1420             theBin->addPublicFreeListBlock(this);
1421         }
1422     }
1423     STAT_increment(getThreadId(), ThreadCommonCounters, freeToOtherThread);
1424     STAT_increment(ownerTid, getIndex(objectSize), freeByOtherThread);
1425 }
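
/* Illustrative sketch (not part of the library): the compare-and-swap loop above is a
   Treiber-stack push onto publicFreeList. The same pattern, written with standard C++11
   atomics and a hypothetical Node type, looks like this:

   #include <atomic>

   struct Node { Node *next; };

   void push(std::atomic<Node*> &head, Node *n) {
       Node *old = head.load(std::memory_order_relaxed);
       do {
           n->next = old;                          // link the new node in front of the current head
       } while (!head.compare_exchange_weak(       // retry if another thread pushed concurrently
                    old, n,
                    std::memory_order_release,
                    std::memory_order_relaxed));
   }

   As in freePublicObject(), no backoff is needed: every retry means another thread made
   progress, so the operation is lock-free. */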
1426 
1427 // Make objects freed by other threads available for use again
1428 void Block::privatizePublicFreeList( bool reset )
1429 {
1430     FreeObject *localPublicFreeList;
1431     // If reset is false, publicFreeList should not be zeroed but set to UNUSABLE
1432     // to properly synchronize with other threads freeing objects to this slab.
1433     const intptr_t endMarker = reset ? 0 : UNUSABLE;
1434 
1435     // Only the owner thread may reset the pointer to NULL
1436     MALLOC_ASSERT( isOwnedByCurrentThread() || !reset, ASSERT_TEXT );
1437 #if FREELIST_NONBLOCKING
1438     localPublicFreeList = (FreeObject*)AtomicFetchStore( &publicFreeList, endMarker );
1439 #else
1440     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1441     {
1442         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1443         localPublicFreeList = publicFreeList;
1444         publicFreeList = endMarker;
1445     }
1446 #endif
1447     MALLOC_ITT_SYNC_ACQUIRED(&publicFreeList);
1448     MALLOC_ASSERT( !(reset && isNotForUse(publicFreeList)), ASSERT_TEXT );
1449 
1450     // publicFreeList must have been UNUSABLE or valid, but not NULL
1451     MALLOC_ASSERT( localPublicFreeList!=NULL, ASSERT_TEXT );
1452     if( isSolidPtr(localPublicFreeList) ) {
1453         MALLOC_ASSERT( allocatedCount <= (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1454         /* other threads do not decrement the counter when freeing our objects, so do it here */
1455         allocatedCount--;
1456         FreeObject *temp = localPublicFreeList;
1457         while( isSolidPtr(temp->next) ){ // the list will end with either NULL or UNUSABLE
1458             temp = temp->next;
1459             allocatedCount--;
1460             MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1461         }
1462         /* merge with local freeList */
1463         temp->next = freeList;
1464         freeList = localPublicFreeList;
1465         STAT_increment(getThreadId(), getIndex(objectSize), allocPrivatized);
1466     }
1467 }
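
/* Illustrative sketch (not part of the library): privatization grabs the whole public list
   with a single atomic exchange and then walks it without further synchronization, because
   no other thread can reach those nodes any more. With standard atomics and a hypothetical
   Node type:

   #include <atomic>
   #include <cstddef>

   struct Node { Node *next; };

   std::size_t drainAll(std::atomic<Node*> &publicHead) {
       Node *list = publicHead.exchange(nullptr, std::memory_order_acquire);
       std::size_t n = 0;
       for (Node *p = list; p; p = p->next)   // safe: the list is private now
           ++n;
       return n;
   }

   privatizePublicFreeList() differs only in that the end marker may be UNUSABLE instead of
   NULL, and the drained objects are spliced into the private freeList. */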
1468 
1469 void Block::privatizeOrphaned(TLSData *tls, unsigned index)
1470 {
1471     Bin* bin = tls->bin + index;
1472     STAT_increment(getThreadId(), index, allocBlockPublic);
1473     next = NULL;
1474     previous = NULL;
1475     MALLOC_ASSERT( publicFreeList!=NULL, ASSERT_TEXT );
1476     /* There is no race here, since no other thread owns this block */
1477     markOwned(tls);
1478     // It is safe to change nextPrivatizable, as publicFreeList is not null
1479     MALLOC_ASSERT( isNotForUse(nextPrivatizable), ASSERT_TEXT );
1480     nextPrivatizable = (Block*)bin;
1481     // the next call is required to change publicFreeList to 0
1482     privatizePublicFreeList();
1483     if( empty() ) {
1484         restoreBumpPtr();
1485     } else {
1486         adjustFullness(); // check the block fullness and set isFull
1487     }
1488     MALLOC_ASSERT( !isNotForUse(publicFreeList), ASSERT_TEXT );
1489 }
1490 
1491 
1492 bool Block::readyToShare()
1493 {
1494     void* oldval;
1495 #if FREELIST_NONBLOCKING
1496     oldval = (void*)AtomicCompareExchange((intptr_t&)publicFreeList, UNUSABLE, 0);
1497 #else
1498     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1499     {
1500         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1501         if ( (oldval=publicFreeList)==NULL )
1502             (intptr_t&)(publicFreeList) = UNUSABLE;
1503     }
1504 #endif
1505     return oldval==NULL;
1506 }
1507 
1508 void Block::shareOrphaned(intptr_t binTag, unsigned index)
1509 {
1510     MALLOC_ASSERT( binTag, ASSERT_TEXT );
1511     STAT_increment(getThreadId(), index, freeBlockPublic);
1512     markOrphaned();
1513     if ((intptr_t)nextPrivatizable==binTag) {
1514         // First check passed: the block is not in mailbox yet.
1515         // Need to set publicFreeList to non-zero, so other threads
1516         // will not change nextPrivatizable and it can be zeroed.
1517         if ( !readyToShare() ) {
1518             // another thread freed an object; we need to wait until it finishes.
1519             // There is no need for exponential backoff, as the wait here is not for a lock;
1520             // but we need to yield, so the thread we are waiting for has a chance to run.
1521             // TODO: add a pause to also be friendly to hyperthreads
1522             int count = 256;
1523             while( (intptr_t)const_cast<Block* volatile &>(nextPrivatizable)==binTag ) {
1524                 if (--count==0) {
1525                     do_yield();
1526                     count = 256;
1527                 }
1528             }
1529         }
1530     }
1531     MALLOC_ASSERT( publicFreeList!=NULL, ASSERT_TEXT );
1532     // now it is safe to change our data
1533     previous = NULL;
1534     // it is the caller's responsibility to ensure that the list of blocks
1535     // formed by nextPrivatizable pointers is kept consistent if required;
1536     // if this is only called from thread shutdown code, it does not matter.
1537     (intptr_t&)(nextPrivatizable) = UNUSABLE;
1538 }
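
/* Illustrative sketch (not part of the library): the wait above spins for a bounded number
   of iterations and then yields, because the other thread only needs CPU time to finish a
   short update; it is not holding a lock. A generic version of the pattern, with a
   hypothetical predicate:

   #include <thread>

   template<typename Pred>
   void spinThenYield(Pred done) {
       int count = 256;
       while (!done()) {
           if (--count == 0) {              // give the other thread a chance to run
               std::this_thread::yield();
               count = 256;
           }
       }
   }

   Exponential backoff is unnecessary here for the same reason as in shareOrphaned(): the
   waiter is not contending for a lock, only waiting for a store to become visible. */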
1539 
1540 void Block::cleanBlockHeader()
1541 {
1542     next = NULL;
1543     previous = NULL;
1544     freeList = NULL;
1545     allocatedCount = 0;
1546     isFull = false;
1547     tlsPtr = NULL;
1548 
1549     publicFreeList = NULL;
1550 }
1551 
1552 void Block::initEmptyBlock(TLSData *tls, size_t size)
1553 {
1554     // Having getIndex and getObjectSize called next to each other
1555     // allows better compiler optimization as they basically share the code.
1556     unsigned int index = getIndex(size);
1557     unsigned int objSz = getObjectSize(size);
1558 
1559     cleanBlockHeader();
1560     objectSize = objSz;
1561     markOwned(tls);
1562     // bump pointer should be prepared for the first allocation - thus move it down by objectSize
1563     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1564 
1565     // each block should have the address where the head of the list of "privatizable" blocks is kept
1566     // the only exception is a bootstrap block, which is initialized while TLS is still NULL
1567     nextPrivatizable = tls? (Block*)(tls->bin + index) : NULL;
1568     TRACEF(( "[ScalableMalloc trace] Empty block %p is initialized, owner is %ld, objectSize is %d, bumpPtr is %p\n",
1569              this, tlsPtr ? getThreadId() : -1, objectSize, bumpPtr ));
1570 }
1571 
1572 Block *OrphanedBlocks::get(TLSData *tls, unsigned int size)
1573 {
1574     // TODO: try to use index from getAllocationBin
1575     unsigned int index = getIndex(size);
1576     Block *block = bins[index].pop();
1577     if (block) {
1578         MALLOC_ITT_SYNC_ACQUIRED(bins+index);
1579         block->privatizeOrphaned(tls, index);
1580     }
1581     return block;
1582 }
1583 
1584 void OrphanedBlocks::put(intptr_t binTag, Block *block)
1585 {
1586     unsigned int index = getIndex(block->getSize());
1587     block->shareOrphaned(binTag, index);
1588     MALLOC_ITT_SYNC_RELEASING(bins+index);
1589     bins[index].push(block);
1590 }
1591 
1592 void OrphanedBlocks::reset()
1593 {
1594     for (uint32_t i=0; i<numBlockBinLimit; i++)
1595         new (bins+i) LifoList();
1596 }
1597 
1598 bool OrphanedBlocks::cleanup(Backend* backend)
1599 {
1600     bool released = false;
1601     for (uint32_t i=0; i<numBlockBinLimit; i++) {
1602         Block* block = bins[i].grab();
1603         MALLOC_ITT_SYNC_ACQUIRED(bins+i);
1604         while (block) {
1605             Block* next = block->next;
1606             block->privatizePublicFreeList( /*reset=*/false ); // do not set publicFreeList to NULL
1607             if (block->empty()) {
1608                 block->reset();
1609                 // slab blocks in user's pools do not have valid backRefIdx
1610                 if (!backend->inUserPool())
1611                     removeBackRef(*(block->getBackRefIdx()));
1612                 backend->putSlabBlock(block);
1613                 released = true;
1614             } else {
1615                 MALLOC_ITT_SYNC_RELEASING(bins+i);
1616                 bins[i].push(block);
1617             }
1618             block = next;
1619         }
1620     }
1621     return released;
1622 }
1623 
1624 FreeBlockPool::ResOfGet FreeBlockPool::getBlock()
1625 {
1626     Block *b = (Block*)AtomicFetchStore(&head, 0);
1627 
1628     if (b) {
1629         size--;
1630         Block *newHead = b->next;
1631         lastAccessMiss = false;
1632         FencedStore((intptr_t&)head, (intptr_t)newHead);
1633     } else
1634         lastAccessMiss = true;
1635 
1636     return ResOfGet(b, lastAccessMiss);
1637 }
1638 
1639 void FreeBlockPool::returnBlock(Block *block)
1640 {
1641     MALLOC_ASSERT( size <= POOL_HIGH_MARK, ASSERT_TEXT );
1642     Block *localHead = (Block*)AtomicFetchStore(&head, 0);
1643 
1644     if (!localHead)
1645         size = 0; // head was stolen by externalClean, correct size accordingly
1646     else if (size == POOL_HIGH_MARK) {
1647         // release cold blocks and add hot one,
1648         // so keep POOL_LOW_MARK-1 blocks and add new block to head
1649         Block *headToFree = localHead, *helper;
1650         for (int i=0; i<POOL_LOW_MARK-2; i++)
1651             headToFree = headToFree->next;
1652         Block *last = headToFree;
1653         headToFree = headToFree->next;
1654         last->next = NULL;
1655         size = POOL_LOW_MARK-1;
1656         for (Block *currBl = headToFree; currBl; currBl = helper) {
1657             helper = currBl->next;
1658             // slab blocks in user's pools do not have valid backRefIdx
1659             if (!backend->inUserPool())
1660                 removeBackRef(currBl->backRefIdx);
1661             backend->putSlabBlock(currBl);
1662         }
1663     }
1664     size++;
1665     block->next = localHead;
1666     FencedStore((intptr_t&)head, (intptr_t)block);
1667 }
1668 
1669 bool FreeBlockPool::externalCleanup()
1670 {
1671     Block *helper;
1672     bool released = false;
1673 
1674     for (Block *currBl=(Block*)AtomicFetchStore(&head, 0); currBl; currBl=helper) {
1675         helper = currBl->next;
1676         // slab blocks in user's pools do not have valid backRefIdx
1677         if (!backend->inUserPool())
1678             removeBackRef(currBl->backRefIdx);
1679         backend->putSlabBlock(currBl);
1680         released = true;
1681     }
1682     return released;
1683 }
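
/* Illustrative sketch (not part of the library): getBlock(), returnBlock() and
   externalCleanup() all synchronize by atomically swapping the list head with 0, so at any
   moment exactly one party owns the chain. A minimal model of that hand-off, with
   hypothetical names:

   #include <atomic>

   struct Blk { Blk *next; };

   struct Pool {
       std::atomic<Blk*> head{nullptr};

       Blk *takeAll() {                            // used by both the owner and the cleaner
           return head.exchange(nullptr, std::memory_order_acquire);
       }
       void publish(Blk *chain) {                  // owner puts its (possibly edited) chain back
           head.store(chain, std::memory_order_release);
       }
   };

   If the owner's takeAll() returns nullptr, the cleaner stole the chain in the meantime;
   that is why returnBlock() resets size to 0 in that case. */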
1684 
1685 /* Prepare the block for returning to FreeBlockPool */
1686 void Block::reset()
1687 {
1688     // it is caller's responsibility to ensure no data is lost before calling this
1689     MALLOC_ASSERT( allocatedCount==0, ASSERT_TEXT );
1690     MALLOC_ASSERT( !isSolidPtr(publicFreeList), ASSERT_TEXT );
1691     if (!isStartupAllocObject())
1692         STAT_increment(getThreadId(), getIndex(objectSize), freeBlockBack);
1693 
1694     cleanBlockHeader();
1695 
1696     nextPrivatizable = NULL;
1697 
1698     objectSize = 0;
1699     // for an empty block, bump pointer should point right after the end of the block
1700     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize);
1701 }
1702 
1703 inline void Bin::setActiveBlock (Block *block)
1704 {
1705 //    MALLOC_ASSERT( bin, ASSERT_TEXT );
1706     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1707     // it is the caller's responsibility to keep the bin consistent (i.e. ensure this block is in the bin list)
1708     activeBlk = block;
1709 }
1710 
1711 inline Block* Bin::setPreviousBlockActive()
1712 {
1713     MALLOC_ASSERT( activeBlk, ASSERT_TEXT );
1714     Block* temp = activeBlk->previous;
1715     if( temp ) {
1716         MALLOC_ASSERT( !(temp->isFull), ASSERT_TEXT );
1717         activeBlk = temp;
1718     }
1719     return temp;
1720 }
1721 
1722 inline bool Block::isOwnedByCurrentThread() const {
1723     return tlsPtr && ownerTid.isCurrentThreadId();
1724 }
1725 
1726 FreeObject *Block::findObjectToFree(const void *object) const
1727 {
1728     FreeObject *objectToFree;
1729     // Due to aligned allocations, a pointer passed to scalable_free
1730     // might differ from the address of internally allocated object.
1731     // Small objects however should always be fine.
1732     if (objectSize <= maxSegregatedObjectSize)
1733         objectToFree = (FreeObject*)object;
1734     // "Fitting size" allocations are suspicious if aligned higher than naturally
1735     else {
1736         if ( ! isAligned(object,2*fittingAlignment) )
1737             // TODO: the above check is questionable - it gives false negatives in ~50% cases,
1738             //       so might even be slower in average than unconditional use of findAllocatedObject.
1739             // here it should be a "real" object
1740             objectToFree = (FreeObject*)object;
1741         else
1742             // here object can be an aligned address, so applying additional checks
1743             objectToFree = findAllocatedObject(object);
1744         MALLOC_ASSERT( isAligned(objectToFree,fittingAlignment), ASSERT_TEXT );
1745     }
1746     MALLOC_ASSERT( isProperlyPlaced(objectToFree), ASSERT_TEXT );
1747 
1748     return objectToFree;
1749 }
1750 
1751 void TLSData::release()
1752 {
1753     memPool->extMemPool.allLocalCaches.unregisterThread(this);
1754     externalCleanup(/*cleanOnlyUnused=*/false, /*cleanBins=*/false);
1755 
1756     for (unsigned index = 0; index < numBlockBins; index++) {
1757         Block *activeBlk = bin[index].getActiveBlock();
1758         if (!activeBlk)
1759             continue;
1760         Block *threadlessBlock = activeBlk->previous;
1761         while (threadlessBlock) {
1762             Block *threadBlock = threadlessBlock->previous;
1763             if (threadlessBlock->empty()) {
1764                 /* the thread is being destroyed, so do not use its block pool */
1765                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1766             } else {
1767                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1768             }
1769             threadlessBlock = threadBlock;
1770         }
1771         threadlessBlock = activeBlk;
1772         while (threadlessBlock) {
1773             Block *threadBlock = threadlessBlock->next;
1774             if (threadlessBlock->empty()) {
1775                 /* the thread is being destroyed, so do not use its block pool */
1776                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1777             } else {
1778                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1779             }
1780             threadlessBlock = threadBlock;
1781         }
1782         bin[index].resetActiveBlock();
1783     }
1784 }
1785 
1786 
1787 #if MALLOC_CHECK_RECURSION
1788 // TODO: Use dedicated heap for this
1789 
1790 /*
1791  * It's a special kind of allocation that can be used when malloc is
1792  * not available (either during startup or when malloc was already called and
1793  * we are, say, inside pthread_setspecific's call).
1794  * A block can contain objects of different sizes;
1795  * allocation is performed by moving the bump pointer and incrementing an object counter;
1796  * release is done via the counter of objects allocated in the block,
1797  * or by moving the bump pointer back when the released object borders it.
1798  * TODO: make the bump pointer grow backward, in the same direction as all the others.
1799  */
1800 
1801 class StartupBlock : public Block {
1802     size_t availableSize() const {
1803         return slabSize - ((uintptr_t)bumpPtr - (uintptr_t)this);
1804     }
1805     static StartupBlock *getBlock();
1806 public:
1807     static FreeObject *allocate(size_t size);
1808     static size_t msize(void *ptr) { return *((size_t*)ptr - 1); }
1809     void free(void *ptr);
1810 };
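
/* Illustrative sketch (not part of the library): startup allocations store the requested
   size in a word immediately before the returned pointer, which is what msize() reads back.
   The layout can be modeled as:

   #include <cstddef>

   //   | size_t size | user data (size bytes, word-aligned) |
   //   ^             ^
   //   header        pointer returned to the caller

   inline void *writeHeader(void *raw, std::size_t size) {
       *(std::size_t*)raw = size;            // keep the size at a negative offset
       return (std::size_t*)raw + 1;         // user pointer starts right after it
   }
   inline std::size_t readHeader(void *user) {
       return *((std::size_t*)user - 1);     // same arithmetic as StartupBlock::msize()
   }
*/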
1811 
1812 static MallocMutex startupMallocLock;
1813 static StartupBlock *firstStartupBlock;
1814 
1815 StartupBlock *StartupBlock::getBlock()
1816 {
1817     BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/false);
1818     if (backRefIdx.isInvalid()) return NULL;
1819 
1820     StartupBlock *block = static_cast<StartupBlock*>(
1821         defaultMemPool->extMemPool.backend.getSlabBlock(1));
1822     if (!block) return NULL;
1823 
1824     block->cleanBlockHeader();
1825     setBackRef(backRefIdx, block);
1826     block->backRefIdx = backRefIdx;
1827     // use startupAllocObjSizeMark to mark objects allocated from a startup block
1828     block->objectSize = startupAllocObjSizeMark;
1829     block->bumpPtr = (FreeObject *)((uintptr_t)block + sizeof(StartupBlock));
1830     return block;
1831 }
1832 
1833 FreeObject *StartupBlock::allocate(size_t size)
1834 {
1835     FreeObject *result;
1836     StartupBlock *newBlock = NULL;
1837     bool newBlockUnused = false;
1838 
1839     /* Objects must be aligned on their natural bounds,
1840        and objects bigger than word on word's bound. */
1841     size = alignUp(size, sizeof(size_t));
1842     // We need size of an object to implement msize.
1843     size_t reqSize = size + sizeof(size_t);
1844     // speculatively allocate newBlock to try to avoid allocating while holding the lock
1845     /* TODO: The function is called when malloc nested call is detected,
1846              so simultaneous usage from different threads seems unlikely.
1847              If pre-allocation is found useless, the code might be simplified. */
1848     if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) {
1849         newBlock = StartupBlock::getBlock();
1850         if (!newBlock) return NULL;
1851     }
1852     {
1853         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1854         // Re-check whether we need a new block (conditions might have changed)
1855         if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) {
1856             if (!newBlock) {
1857                 newBlock = StartupBlock::getBlock();
1858                 if (!newBlock) return NULL;
1859             }
1860             newBlock->next = (Block*)firstStartupBlock;
1861             if (firstStartupBlock)
1862                 firstStartupBlock->previous = (Block*)newBlock;
1863             firstStartupBlock = newBlock;
1864         } else
1865             newBlockUnused = true;
1866         result = firstStartupBlock->bumpPtr;
1867         firstStartupBlock->allocatedCount++;
1868         firstStartupBlock->bumpPtr =
1869             (FreeObject *)((uintptr_t)firstStartupBlock->bumpPtr + reqSize);
1870     }
1871     if (newBlock && newBlockUnused)
1872         defaultMemPool->returnEmptyBlock(newBlock, /*poolTheBlock=*/false);
1873 
1874     // keep the object size at a negative offset
1875     *((size_t*)result) = size;
1876     return (FreeObject*)((size_t*)result+1);
1877 }
1878 
1879 void StartupBlock::free(void *ptr)
1880 {
1881     Block* blockToRelease = NULL;
1882     {
1883         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1884 
1885         MALLOC_ASSERT(firstStartupBlock, ASSERT_TEXT);
1886         MALLOC_ASSERT(startupAllocObjSizeMark==objectSize
1887                       && allocatedCount>0, ASSERT_TEXT);
1888         MALLOC_ASSERT((uintptr_t)ptr>=(uintptr_t)this+sizeof(StartupBlock)
1889                       && (uintptr_t)ptr+StartupBlock::msize(ptr)<=(uintptr_t)this+slabSize,
1890                       ASSERT_TEXT);
1891         if (0 == --allocatedCount) {
1892             if (this == firstStartupBlock)
1893                 firstStartupBlock = (StartupBlock*)firstStartupBlock->next;
1894             if (previous)
1895                 previous->next = next;
1896             if (next)
1897                 next->previous = previous;
1898             blockToRelease = this;
1899         } else if ((uintptr_t)ptr + StartupBlock::msize(ptr) == (uintptr_t)bumpPtr) {
1900             // last object in the block released
1901             FreeObject *newBump = (FreeObject*)((size_t*)ptr - 1);
1902             MALLOC_ASSERT((uintptr_t)newBump>(uintptr_t)this+sizeof(StartupBlock),
1903                           ASSERT_TEXT);
1904             bumpPtr = newBump;
1905         }
1906     }
1907     if (blockToRelease) {
1908         blockToRelease->previous = blockToRelease->next = NULL;
1909         defaultMemPool->returnEmptyBlock(blockToRelease, /*poolTheBlock=*/false);
1910     }
1911 }
1912 
1913 #endif /* MALLOC_CHECK_RECURSION */
1914 
1915 /********* End thread related code  *************/
1916 
1917 /********* Library initialization *************/
1918 
1919 //! Value indicating the state of initialization.
1920 /* 0 = initialization not started.
1921  * 1 = initialization started but not finished.
1922  * 2 = initialization finished.
1923  * In theory, we only need values 0 and 2. But value 1 is nonetheless
1924  * useful for detecting errors in the double-check pattern.
1925  */
1926 static intptr_t mallocInitialized;   // implicitly initialized to 0
1927 static MallocMutex initMutex;
1928 
1929 /** The leading "\0" is here so that applying "strings" to the binary
1930     delivers a clean result. */
1931 static char VersionString[] = "\0" TBBMALLOC_VERSION_STRINGS;
1932 
1933 #if USE_PTHREAD && (__TBB_SOURCE_DIRECTLY_INCLUDED || __TBB_USE_DLOPEN_REENTRANCY_WORKAROUND)
1934 
1935 /* Decrease race interval between dynamic library unloading and pthread key
1936    destructor. Protect only Pthreads with supported unloading. */
1937 class ShutdownSync {
1938 /* flag is the number of threads in pthread key dtor body
1939    (i.e., between threadDtorStart() and threadDtorDone())
1940    or the signal to skip dtor, if flag < 0 */
1941     intptr_t flag;
1942     static const intptr_t skipDtor = INTPTR_MIN/2;
1943 public:
1944     void init() { flag = 0; }
1945 /* We assume that fewer than 2*abs(skipDtor) threads ever call threadDtorStart()
1946    simultaneously, so flag never becomes negative because of that. */
1947     bool threadDtorStart() {
1948         if (flag < 0)
1949             return false;
1950         if (AtomicIncrement(flag) <= 0) { // note that the new value is returned
1951             AtomicAdd(flag, -1);  // flag is spoiled by us, restore it
1952             return false;
1953         }
1954         return true;
1955     }
1956     void threadDtorDone() {
1957         AtomicAdd(flag, -1);
1958     }
1959     void processExit() {
1960         if (AtomicAdd(flag, skipDtor) != 0)
1961             SpinWaitUntilEq(flag, skipDtor);
1962     }
1963 };
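
/* Illustrative sketch (not part of the library): a typical use of ShutdownSync from a
   thread destructor and from process shutdown, assuming a hypothetical cleanup body:

   void threadKeyDtor(ShutdownSync &sync) {
       if (!sync.threadDtorStart())   // process shutdown already started; skip the dtor body
           return;
       // ... per-thread cleanup that must not race with library unloading ...
       sync.threadDtorDone();
   }

   void onProcessShutdown(ShutdownSync &sync) {
       sync.processExit();            // wait until all in-flight dtor bodies have finished
       // ... now it is safe to tear down global state ...
   }

   The large negative skipDtor offset makes flag negative once processExit() has run, so any
   later threadDtorStart() returns false immediately. */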
1964 
1965 #else
1966 
1967 class ShutdownSync {
1968 public:
1969     void init() { }
1970     bool threadDtorStart() { return true; }
1971     void threadDtorDone() { }
1972     void processExit() { }
1973 };
1974 
1975 #endif // USE_PTHREAD && (__TBB_SOURCE_DIRECTLY_INCLUDED || __TBB_USE_DLOPEN_REENTRANCY_WORKAROUND)
1976 
1977 static ShutdownSync shutdownSync;
1978 
1979 inline bool isMallocInitialized() {
1980     // Load must have acquire fence; otherwise thread taking "initialized" path
1981     // might perform textually later loads *before* mallocInitialized becomes 2.
1982     return 2 == FencedLoad(mallocInitialized);
1983 }
1984 
1985 bool isMallocInitializedExt() {
1986     return isMallocInitialized();
1987 }
1988 
1989 /* Caller is responsible for ensuring this routine is called exactly once. */
1990 extern "C" void MallocInitializeITT() {
1991 #if DO_ITT_NOTIFY
1992     if (!usedBySrcIncluded)
1993         tbb::internal::__TBB_load_ittnotify();
1994 #endif
1995 }
1996 
1997 void MemoryPool::initDefaultPool() {
1998     hugePages.init();
1999 }
2000 
2001 /*
2002  * Allocator initialization routine;
2003  * it is called lazily on the very first scalable_malloc call.
2004  */
2005 static bool initMemoryManager()
2006 {
2007     TRACEF(( "[ScalableMalloc trace] sizeof(Block) is %d (expected 128); sizeof(uintptr_t) is %d\n",
2008              sizeof(Block), sizeof(uintptr_t) ));
2009     MALLOC_ASSERT( 2*blockHeaderAlignment == sizeof(Block), ASSERT_TEXT );
2010     MALLOC_ASSERT( sizeof(FreeObject) == sizeof(void*), ASSERT_TEXT );
2011     MALLOC_ASSERT( isAligned(defaultMemPool, sizeof(intptr_t)),
2012                    "Memory pool must be void*-aligned for atomic to work over aligned arguments.");
2013 
2014 #if USE_WINTHREAD
2015     const size_t granularity = 64*1024; // granularity of VirtualAlloc
2016 #else
2017     // POSIX.1-2001-compliant way to get page size
2018     const size_t granularity = sysconf(_SC_PAGESIZE);
2019 #endif
2020     if (!defaultMemPool) {
2021         // Do not rely on static constructors; do the assignment explicitly in case
2022         // the library's static data section has not been initialized yet at this call.
2023         defaultMemPool = (MemoryPool*)defaultMemPool_space;
2024     }
2025     bool initOk = defaultMemPool->
2026         extMemPool.init(0, NULL, NULL, granularity,
2027                         /*keepAllMemory=*/false, /*fixedPool=*/false);
2028 // TODO: extMemPool.init() to not allocate memory
2029     if (!initOk || !initBackRefMaster(&defaultMemPool->extMemPool.backend) || !ThreadId::init())
2030         return false;
2031     MemoryPool::initDefaultPool();
2032     // init() is required iff initMemoryManager() is called
2033     // after mallocProcessShutdownNotification()
2034     shutdownSync.init();
2035 #if COLLECT_STATISTICS
2036     initStatisticsCollection();
2037 #endif
2038     return true;
2039 }
2040 
2041 static bool GetBoolEnvironmentVariable(const char* name) {
2042     return tbb::internal::GetBoolEnvironmentVariable(name);
2043 }
2044 
2045 //! Ensures that initMemoryManager() is called once and only once.
2046 /** Does not return until initMemoryManager() has been completed by a thread.
2047     There is no need to call this routine if mallocInitialized==2 . */
2048 static bool doInitialization()
2049 {
2050     MallocMutex::scoped_lock lock( initMutex );
2051     if (mallocInitialized!=2) {
2052         MALLOC_ASSERT( mallocInitialized==0, ASSERT_TEXT );
2053         mallocInitialized = 1;
2054         RecursiveMallocCallProtector scoped;
2055         if (!initMemoryManager()) {
2056             mallocInitialized = 0; // restore the flag and bail out
2057             return false;
2058         }
2059 #ifdef  MALLOC_EXTRA_INITIALIZATION
2060         MALLOC_EXTRA_INITIALIZATION;
2061 #endif
2062 #if MALLOC_CHECK_RECURSION
2063         RecursiveMallocCallProtector::detectNaiveOverload();
2064 #endif
2065         MALLOC_ASSERT( mallocInitialized==1, ASSERT_TEXT );
2066         // Store must have release fence, otherwise mallocInitialized==2
2067         // might become remotely visible before side effects of
2068         // initMemoryManager() become remotely visible.
2069         FencedStore( mallocInitialized, 2 );
2070         if( GetBoolEnvironmentVariable("TBB_VERSION") ) {
2071             fputs(VersionString+1,stderr);
2072             hugePages.printStatus();
2073         }
2074     }
2075     /* It can't be 0, or we would have initialized it above */
2076     MALLOC_ASSERT( mallocInitialized==2, ASSERT_TEXT );
2077     return true;
2078 }
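
/* Illustrative sketch (not part of the library): isMallocInitialized()/doInitialization()
   form a double-checked initialization. The same shape with standard atomics and a
   hypothetical init() callback:

   #include <atomic>
   #include <mutex>

   std::atomic<int> state{0};      // 0 = not started, 1 = in progress, 2 = done
   std::mutex initLock;

   bool ensureInit(bool (*init)()) {
       if (state.load(std::memory_order_acquire) == 2)   // fast path; acquire pairs with release
           return true;
       std::lock_guard<std::mutex> lock(initLock);
       if (state.load(std::memory_order_relaxed) != 2) {
           state.store(1, std::memory_order_relaxed);    // value 1 helps catch re-entry bugs
           if (!init()) { state.store(0, std::memory_order_relaxed); return false; }
           state.store(2, std::memory_order_release);    // publish init()'s side effects
       }
       return true;
   }
*/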
2079 
2080 /********* End library initialization *************/
2081 
2082 /********* The malloc show begins     *************/
2083 
2084 
2085 FreeObject *Block::allocateFromFreeList()
2086 {
2087     FreeObject *result;
2088 
2089     if (!freeList) return NULL;
2090 
2091     result = freeList;
2092     MALLOC_ASSERT( result, ASSERT_TEXT );
2093 
2094     freeList = result->next;
2095     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2096     allocatedCount++;
2097     STAT_increment(getThreadId(), getIndex(objectSize), allocFreeListUsed);
2098 
2099     return result;
2100 }
2101 
2102 FreeObject *Block::allocateFromBumpPtr()
2103 {
2104     FreeObject *result = bumpPtr;
2105     if (result) {
2106         bumpPtr = (FreeObject *) ((uintptr_t) bumpPtr - objectSize);
2107         if ( (uintptr_t)bumpPtr < (uintptr_t)this+sizeof(Block) ) {
2108             bumpPtr = NULL;
2109         }
2110         MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2111         allocatedCount++;
2112         STAT_increment(getThreadId(), getIndex(objectSize), allocBumpPtrUsed);
2113     }
2114     return result;
2115 }
2116 
2117 inline FreeObject* Block::allocate()
2118 {
2119     MALLOC_ASSERT( isOwnedByCurrentThread(), ASSERT_TEXT );
2120 
2121     /* for better cache locality, first look in the free list. */
2122     if ( FreeObject *result = allocateFromFreeList() ) {
2123         return result;
2124     }
2125     MALLOC_ASSERT( !freeList, ASSERT_TEXT );
2126 
2127     /* if free list is empty, try thread local bump pointer allocation. */
2128     if ( FreeObject *result = allocateFromBumpPtr() ) {
2129         return result;
2130     }
2131     MALLOC_ASSERT( !bumpPtr, ASSERT_TEXT );
2132 
2133     /* the block is considered full. */
2134     isFull = true;
2135     return NULL;
2136 }
2137 
2138 size_t Block::findObjectSize(void *object) const
2139 {
2140     size_t blSize = getSize();
2141 #if MALLOC_CHECK_RECURSION
2142     // Currently, there are no aligned allocations from startup blocks,
2143     // so we can simply return StartupBlock::msize().
2144     // TODO: This must be extended if we add aligned allocation from startup blocks.
2145     if (!blSize)
2146         return StartupBlock::msize(object);
2147 #endif
2148     // object can be aligned, so real size can be less than block's
2149     size_t size =
2150         blSize - ((uintptr_t)object - (uintptr_t)findObjectToFree(object));
2151     MALLOC_ASSERT(size>0 && size<minLargeObjectSize, ASSERT_TEXT);
2152     return size;
2153 }
2154 
2155 void Bin::moveBlockToFront(Block *block)
2156 {
2157     /* move the block to the front of the bin */
2158     if (block == activeBlk) return;
2159     outofTLSBin(block);
2160     pushTLSBin(block);
2161 }
2162 
2163 void Bin::processEmptyBlock(Block *block, bool poolTheBlock)
2164 {
2165     if (block != activeBlk) {
2166         /* We are not using this block; return it to the pool */
2167         outofTLSBin(block);
2168         block->getMemPool()->returnEmptyBlock(block, poolTheBlock);
2169     } else {
2170         /* all objects are free - let's restore the bump pointer */
2171         block->restoreBumpPtr();
2172     }
2173 }
2174 
2175 template<int LOW_MARK, int HIGH_MARK>
2176 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool)
2177 {
2178     const size_t size = object->unalignedSize;
2179     // do not pollute the cache with too large an object, as that could force a complete cleanup of the cache
2180     if (size > MAX_TOTAL_SIZE)
2181         return false;
2182     LargeMemoryBlock *localHead = (LargeMemoryBlock*)AtomicFetchStore(&head, 0);
2183 
2184     object->prev = NULL;
2185     object->next = localHead;
2186     if (localHead)
2187         localHead->prev = object;
2188     else {
2189         // these might not have been reset when the local cache was stolen, so correct them here
2190         totalSize = 0;
2191         numOfBlocks = 0;
2192         tail = object;
2193     }
2194     localHead = object;
2195     totalSize += size;
2196     numOfBlocks++;
2197     // must meet both the total size and the number-of-cached-objects constraints
2198     if (totalSize > MAX_TOTAL_SIZE || numOfBlocks >= HIGH_MARK) {
2199         // scan from the tail until both constraints are met
2200         while (totalSize > MAX_TOTAL_SIZE || numOfBlocks > LOW_MARK) {
2201             totalSize -= tail->unalignedSize;
2202             numOfBlocks--;
2203             tail = tail->prev;
2204         }
2205         LargeMemoryBlock *headToRelease = tail->next;
2206         tail->next = NULL;
2207 
2208         extMemPool->freeLargeObjectList(headToRelease);
2209     }
2210 
2211     FencedStore((intptr_t&)head, (intptr_t)localHead);
2212     return true;
2213 }
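
/* Illustrative sketch (not part of the library): the trimming in put() keeps the most
   recently freed blocks and drops the oldest ones from the tail. A simplified model over
   std::list, with hypothetical limits:

   #include <list>
   #include <cstddef>

   struct Cached { std::size_t bytes; };

   void trim(std::list<Cached> &cache, std::size_t &total,
             std::size_t maxTotal, std::size_t lowMark) {
       // evict from the tail (least recently used end) until both constraints are met
       while (!cache.empty() && (total > maxTotal || cache.size() > lowMark)) {
           total -= cache.back().bytes;
           cache.pop_back();
       }
   }

   The real code walks the intrusive doubly-linked list from the tail and releases the whole
   evicted chain with a single freeLargeObjectList() call. */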
2214 
2215 template<int LOW_MARK, int HIGH_MARK>
2216 LargeMemoryBlock *LocalLOCImpl<LOW_MARK, HIGH_MARK>::get(size_t size)
2217 {
2218     LargeMemoryBlock *localHead, *res=NULL;
2219 
2220     if (size > MAX_TOTAL_SIZE)
2221         return NULL;
2222 
2223     if (!head || (localHead = (LargeMemoryBlock*)AtomicFetchStore(&head, 0)) == NULL) {
2224         // do not restore totalSize, numOfBlocks and tail at this point,
2225         // as they are used only in put(), where they must be restored
2226         return NULL;
2227     }
2228 
2229     for (LargeMemoryBlock *curr = localHead; curr; curr=curr->next) {
2230         if (curr->unalignedSize == size) {
2231             res = curr;
2232             if (curr->next)
2233                 curr->next->prev = curr->prev;
2234             else
2235                 tail = curr->prev;
2236             if (curr != localHead)
2237                 curr->prev->next = curr->next;
2238             else
2239                 localHead = curr->next;
2240             totalSize -= size;
2241             numOfBlocks--;
2242             break;
2243         }
2244     }
2245     FencedStore((intptr_t&)head, (intptr_t)localHead);
2246     return res;
2247 }
2248 
2249 template<int LOW_MARK, int HIGH_MARK>
2250 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::externalCleanup(ExtMemoryPool *extMemPool)
2251 {
2252     if (LargeMemoryBlock *localHead = (LargeMemoryBlock*)AtomicFetchStore(&head, 0)) {
2253         extMemPool->freeLargeObjectList(localHead);
2254         return true;
2255     }
2256     return false;
2257 }
2258 
2259 void *MemoryPool::getFromLLOCache(TLSData* tls, size_t size, size_t alignment)
2260 {
2261     LargeMemoryBlock *lmb = NULL;
2262 
2263     size_t headersSize = sizeof(LargeMemoryBlock)+sizeof(LargeObjectHdr);
2264     size_t allocationSize = LargeObjectCache::alignToBin(size+headersSize+alignment);
2265     if (allocationSize < size) // allocationSize is wrapped around after alignToBin
2266         return NULL;
2267     MALLOC_ASSERT(allocationSize >= alignment, "Overflow must be checked before.");
2268 
2269     if (tls) {
2270         tls->markUsed();
2271         lmb = tls->lloc.get(allocationSize);
2272     }
2273     if (!lmb)
2274         lmb = extMemPool.mallocLargeObject(this, allocationSize);
2275 
2276     if (lmb) {
2277         // when shuffling, we assume that the alignment offset guarantees
2278         // that different cache lines are used
2279         MALLOC_ASSERT(alignment >= estimatedCacheLineSize, ASSERT_TEXT);
2280 
2281         void *alignedArea = (void*)alignUp((uintptr_t)lmb+headersSize, alignment);
2282         uintptr_t alignedRight =
2283             alignDown((uintptr_t)lmb+lmb->unalignedSize - size, alignment);
2284         // Is there some room to shuffle the object between cache lines?
2285         // Note that alignedRight and alignedArea are aligned at alignment.
2286         unsigned ptrDelta = alignedRight - (uintptr_t)alignedArea;
2287         if (ptrDelta && tls) { // !tls is cold path
2288             // for the hot path of alignment==estimatedCacheLineSize,
2289             // allow compilers to use shift for division
2290             // (since estimatedCacheLineSize is a power-of-2 constant)
2291             unsigned numOfPossibleOffsets = alignment == estimatedCacheLineSize?
2292                   ptrDelta / estimatedCacheLineSize :
2293                   ptrDelta / alignment;
2294             unsigned myCacheIdx = ++tls->currCacheIdx;
2295             unsigned offset = myCacheIdx % numOfPossibleOffsets;
2296 
2297             // Move object to a cache line with an offset that is different from
2298             // previous allocation. This supposedly allows us to use cache
2299             // associativity more efficiently.
2300             alignedArea = (void*)((uintptr_t)alignedArea + offset*alignment);
2301         }
2302         MALLOC_ASSERT((uintptr_t)lmb+lmb->unalignedSize >=
2303                       (uintptr_t)alignedArea+size, "Object doesn't fit the block.");
2304         LargeObjectHdr *header = (LargeObjectHdr*)alignedArea-1;
2305         header->memoryBlock = lmb;
2306         header->backRefIdx = lmb->backRefIdx;
2307         setBackRef(header->backRefIdx, header);
2308 
2309         lmb->objectSize = size;
2310 
2311         MALLOC_ASSERT( isLargeObject<unknownMem>(alignedArea), ASSERT_TEXT );
2312         MALLOC_ASSERT( isAligned(alignedArea, alignment), ASSERT_TEXT );
2313 
2314         return alignedArea;
2315     }
2316     return NULL;
2317 }
2318 
2319 void MemoryPool::putToLLOCache(TLSData *tls, void *object)
2320 {
2321     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2322     // overwrite backRefIdx to simplify double free detection
2323     header->backRefIdx = BackRefIdx();
2324 
2325     if (tls) {
2326         tls->markUsed();
2327         if (tls->lloc.put(header->memoryBlock, &extMemPool))
2328             return;
2329     }
2330     extMemPool.freeLargeObject(header->memoryBlock);
2331 }
2332 
2333 /*
2334  * All aligned allocations fall into one of the following categories:
2335  *  1. if both request size and alignment are <= maxSegregatedObjectSize,
2336  *       we just align the size up, and request this amount, because for every size
2337  *       aligned to some power of 2, the allocated object is at least that aligned.
2338  * 2. for size<minLargeObjectSize, check if already guaranteed fittingAlignment is enough.
2339  * 3. if size+alignment<minLargeObjectSize, we take an object of fittingSizeN and align
2340  *       its address up; given such pointer, scalable_free could find the real object.
2341  *       Wrapping of size+alignment is impossible because maximal allowed
2342  *       alignment plus minLargeObjectSize can't lead to wrapping.
2343  * 4. otherwise, aligned large object is allocated.
2344  */
2345 static void *allocateAligned(MemoryPool *memPool, size_t size, size_t alignment)
2346 {
2347     MALLOC_ASSERT( isPowerOfTwo(alignment), ASSERT_TEXT );
2348 
2349     if (!isMallocInitialized())
2350         if (!doInitialization())
2351             return NULL;
2352 
2353     void *result;
2354     if (size<=maxSegregatedObjectSize && alignment<=maxSegregatedObjectSize)
2355         result = internalPoolMalloc(memPool, alignUp(size? size: sizeof(size_t), alignment));
2356     else if (size<minLargeObjectSize) {
2357         if (alignment<=fittingAlignment)
2358             result = internalPoolMalloc(memPool, size);
2359         else if (size+alignment < minLargeObjectSize) {
2360             void *unaligned = internalPoolMalloc(memPool, size+alignment);
2361             if (!unaligned) return NULL;
2362             result = alignUp(unaligned, alignment);
2363         } else
2364             goto LargeObjAlloc;
2365     } else {
2366     LargeObjAlloc:
2367         TLSData *tls = memPool->getTLS(/*create=*/true);
2368         // take into account only alignments that are higher than the natural one
2369         result =
2370             memPool->getFromLLOCache(tls, size, largeObjectAlignment>alignment?
2371                                                largeObjectAlignment: alignment);
2372     }
2373 
2374     MALLOC_ASSERT( isAligned(result, alignment), ASSERT_TEXT );
2375     return result;
2376 }
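
/* Illustrative worked example (not from the library; the numbers are hypothetical): for a
   request of size == 100 with alignment == 256 (case 3 above, assuming
   100 + 256 < minLargeObjectSize), the allocator asks the fitting bins for 356 bytes and
   aligns the result up:

       void *unaligned = internalPoolMalloc(memPool, 100 + 256);
       void *result    = alignUp(unaligned, 256);   // at most 255 bytes of waste in front

   scalable_free() can still locate the real object, because findAllocatedObject() recovers
   the start of the slot from any interior pointer. */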
2377 
2378 static void *reallocAligned(MemoryPool *memPool, void *ptr,
2379                             size_t newSize, size_t alignment = 0)
2380 {
2381     void *result;
2382     size_t copySize;
2383 
2384     if (isLargeObject<ourMem>(ptr)) {
2385         LargeMemoryBlock* lmb = ((LargeObjectHdr *)ptr - 1)->memoryBlock;
2386         copySize = lmb->unalignedSize-((uintptr_t)ptr-(uintptr_t)lmb);
2387 
2388         // Apply different strategies if size decreases
2389         if (newSize <= copySize && (0 == alignment || isAligned(ptr, alignment))) {
2390 
2391             // For huge objects (that do not fit in backend cache), keep the same space unless
2392             // the new size is at least twice smaller
2393             bool isMemoryBlockHuge = copySize > memPool->extMemPool.backend.getMaxBinnedSize();
2394             size_t threshold = isMemoryBlockHuge ? copySize / 2 : 0;
2395             if (newSize > threshold) {
2396                 lmb->objectSize = newSize;
2397                 return ptr;
2398             }
2399             // TODO: For large objects suitable for the backend cache,
2400             // split out the excessive part and put it to the backend.
2401         }
2402         // Reallocate for real
2403         copySize = lmb->objectSize;
2404 #if BACKEND_HAS_MREMAP
2405         if (void *r = memPool->extMemPool.remap(ptr, copySize, newSize,
2406                           alignment < largeObjectAlignment ? largeObjectAlignment : alignment))
2407             return r;
2408 #endif
2409         result = alignment ? allocateAligned(memPool, newSize, alignment) :
2410             internalPoolMalloc(memPool, newSize);
2411 
2412     } else {
2413         Block* block = (Block *)alignDown(ptr, slabSize);
2414         copySize = block->findObjectSize(ptr);
2415 
2416         // TODO: Move object to another bin if size decreases and the current bin is "empty enough".
2417         // Currently, if the size decreases, the old pointer is returned
2418         if (newSize <= copySize && (0==alignment || isAligned(ptr, alignment))) {
2419             return ptr;
2420         } else {
2421             result = alignment ? allocateAligned(memPool, newSize, alignment) :
2422                 internalPoolMalloc(memPool, newSize);
2423         }
2424     }
2425     if (result) {
2426         memcpy(result, ptr, copySize < newSize ? copySize : newSize);
2427         internalPoolFree(memPool, ptr, 0);
2428     }
2429     return result;
2430 }
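
/* Illustrative worked example (not from the library; the numbers are hypothetical): suppose
   a huge object currently occupies copySize == 100 MB, above the backend's maximal binned
   size, so threshold == 50 MB. Then, for an aligned shrink:

       pool_realloc(pool, p, 60*1024*1024);   // 60 MB >  threshold: keep the space and only
                                              //          update lmb->objectSize
       pool_realloc(pool, p, 40*1024*1024);   // 40 MB <= threshold: allocate a new object,
                                              //          copy, and free the old one

   For large objects that do fit the backend cache the threshold is 0, so any in-range shrink
   keeps the existing space; the TODO above notes that splitting off the excess part is not
   implemented yet. */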
2431 
2432 /* A predicate checks if an object is properly placed inside its block */
2433 inline bool Block::isProperlyPlaced(const void *object) const
2434 {
2435     return 0 == ((uintptr_t)this + slabSize - (uintptr_t)object) % objectSize;
2436 }
2437 
2438 /* Finds the real object inside the block */
2439 FreeObject *Block::findAllocatedObject(const void *address) const
2440 {
2441     // calculate offset from the end of the block space
2442     uint16_t offset = (uintptr_t)this + slabSize - (uintptr_t)address;
2443     MALLOC_ASSERT( offset<=slabSize-sizeof(Block), ASSERT_TEXT );
2444     // find offset difference from a multiple of allocation size
2445     offset %= objectSize;
2446     // and move the address down to where the real object starts.
2447     return (FreeObject*)((uintptr_t)address - (offset? objectSize-offset: 0));
2448 }
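
/* Illustrative worked example (not from the library; the numbers are hypothetical): with
   objectSize == 80, object boundaries sit at multiples of 80 counted back from the end of
   the slab. For an aligned pointer that is 60 bytes past the start of its slot:

       offset  = ((uintptr_t)this + slabSize) - address;   // equals k*80 - 60 for some k
       offset %= 80;                                       // == 20
       return address - (80 - 20);                         // == address - 60, the slot start

   When the pointer already points at the slot start, offset %= objectSize yields 0 and the
   address is returned unchanged. */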
2449 
2450 /*
2451  * A bad dereference caused by a foreign pointer is possible only here, not earlier in the call chain.
2452  * A separate function isolates the SEH code, as it hampers compiler optimization.
2453  */
2454 static inline BackRefIdx safer_dereference (const BackRefIdx *ptr)
2455 {
2456     BackRefIdx id;
2457 #if _MSC_VER
2458     __try {
2459 #endif
2460         id = *ptr;
2461 #if _MSC_VER
2462     } __except( GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION?
2463                 EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH ) {
2464         id = BackRefIdx();
2465     }
2466 #endif
2467     return id;
2468 }
2469 
2470 template<MemoryOrigin memOrigin>
2471 bool isLargeObject(void *object)
2472 {
2473     if (!isAligned(object, largeObjectAlignment))
2474         return false;
2475     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2476     BackRefIdx idx = (memOrigin == unknownMem) ?
2477         safer_dereference(&header->backRefIdx) : header->backRefIdx;
2478 
2479     return idx.isLargeObject()
2480         // in a valid LargeObjectHdr, memoryBlock is not NULL
2481         && header->memoryBlock
2482         // in a valid LargeObjectHdr, memoryBlock points somewhere before the header
2483         // TODO: more strict check
2484         && (uintptr_t)header->memoryBlock < (uintptr_t)header
2485         && getBackRef(idx) == header;
2486 }
2487 
2488 static inline bool isSmallObject (void *ptr)
2489 {
2490     Block* expectedBlock = (Block*)alignDown(ptr, slabSize);
2491     const BackRefIdx* idx = expectedBlock->getBackRefIdx();
2492 
2493     bool isSmall = expectedBlock == getBackRef(safer_dereference(idx));
2494     if (isSmall)
2495         expectedBlock->checkFreePrecond(ptr);
2496     return isSmall;
2497 }
2498 
2499 /**** Check if an object was allocated by scalable_malloc ****/
2500 static inline bool isRecognized (void* ptr)
2501 {
2502     return defaultMemPool->extMemPool.backend.ptrCanBeValid(ptr) &&
2503         (isLargeObject<unknownMem>(ptr) || isSmallObject(ptr));
2504 }
2505 
2506 static inline void freeSmallObject(void *object)
2507 {
2508     /* mask low bits to get the block */
2509     Block *block = (Block *)alignDown(object, slabSize);
2510     block->checkFreePrecond(object);
2511 
2512 #if MALLOC_CHECK_RECURSION
2513     if (block->isStartupAllocObject()) {
2514         ((StartupBlock *)block)->free(object);
2515         return;
2516     }
2517 #endif
2518     if (block->isOwnedByCurrentThread()) {
2519         block->freeOwnObject(object);
2520     } else { /* Slower path: add to the shared list; allocatedCount is updated by the owner thread in malloc. */
2521         FreeObject *objectToFree = block->findObjectToFree(object);
2522         block->freePublicObject(objectToFree);
2523     }
2524 }
2525 
2526 static void *internalPoolMalloc(MemoryPool* memPool, size_t size)
2527 {
2528     Bin* bin;
2529     Block * mallocBlock;
2530 
2531     if (!memPool) return NULL;
2532 
2533     if (!size) size = sizeof(size_t);
2534 
2535     TLSData *tls = memPool->getTLS(/*create=*/true);
2536 
2537     /* Allocate a large object */
2538     if (size >= minLargeObjectSize)
2539         return memPool->getFromLLOCache(tls, size, largeObjectAlignment);
2540 
2541     if (!tls) return NULL;
2542 
2543     tls->markUsed();
2544     /*
2545      * Get an element in thread-local array corresponding to the given size;
2546      * It keeps ptr to the active block for allocations of this size
2547      */
2548     bin = tls->getAllocationBin(size);
2549     if ( !bin ) return NULL;
2550 
2551     /* Get a block to try to allocate in. */
2552     for( mallocBlock = bin->getActiveBlock(); mallocBlock;
2553          mallocBlock = bin->setPreviousBlockActive() ) // the previous block should be empty enough
2554     {
2555         if( FreeObject *result = mallocBlock->allocate() )
2556             return result;
2557     }
2558 
2559     /*
2560      * else privatize publicly freed objects in some block and allocate from it
2561      */
2562     mallocBlock = bin->getPrivatizedFreeListBlock();
2563     if (mallocBlock) {
2564         MALLOC_ASSERT( mallocBlock->freeListNonNull(), ASSERT_TEXT );
2565         if ( FreeObject *result = mallocBlock->allocateFromFreeList() )
2566             return result;
2567         /* Else something strange happened, need to retry from the beginning; */
2568         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in public free list; reentering.\n" ));
2569         return internalPoolMalloc(memPool, size);
2570     }
2571 
2572     /*
2573      * no suitable own blocks, try to get a partial block that some other thread has discarded.
2574      */
2575     mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2576     while (mallocBlock) {
2577         bin->pushTLSBin(mallocBlock);
2578         bin->setActiveBlock(mallocBlock); // TODO: move under the below condition?
2579         if( FreeObject *result = mallocBlock->allocate() )
2580             return result;
2581         mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2582     }
2583 
2584     /*
2585      * else try to get a new empty block
2586      */
2587     mallocBlock = memPool->getEmptyBlock(size);
2588     if (mallocBlock) {
2589         bin->pushTLSBin(mallocBlock);
2590         bin->setActiveBlock(mallocBlock);
2591         if( FreeObject *result = mallocBlock->allocate() )
2592             return result;
2593         /* Else something strange happened, need to retry from the beginning; */
2594         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in empty block; reentering.\n" ));
2595         return internalPoolMalloc(memPool, size);
2596     }
2597     /*
2598      * else nothing works so return NULL
2599      */
2600     TRACEF(( "[ScalableMalloc trace] No memory found, returning NULL.\n" ));
2601     return NULL;
2602 }
2603 
2604 // When size==0 (i.e. unknown), detect here whether the object is large.
2605 // Even when the size is known and < minLargeObjectSize, we still need to check
2606 // if the actual object is large, because large objects might be used
2607 // for aligned small allocations.
2608 static bool internalPoolFree(MemoryPool *memPool, void *object, size_t size)
2609 {
2610     if (!memPool || !object) return false;
2611 
2612     // The library is initialized at the first allocation call, so a release while
2613     // it is not initialized means a foreign object is being released.
2614     MALLOC_ASSERT(isMallocInitialized(), ASSERT_TEXT);
2615     MALLOC_ASSERT(memPool->extMemPool.userPool() || isRecognized(object),
2616                   "Invalid pointer during object releasing is detected.");
2617 
2618     if (size >= minLargeObjectSize || isLargeObject<ourMem>(object))
2619         memPool->putToLLOCache(memPool->getTLS(/*create=*/false), object);
2620     else
2621         freeSmallObject(object);
2622     return true;
2623 }
2624 
2625 static void *internalMalloc(size_t size)
2626 {
2627     if (!size) size = sizeof(size_t);
2628 
2629 #if MALLOC_CHECK_RECURSION
2630     if (RecursiveMallocCallProtector::sameThreadActive())
2631         return size<minLargeObjectSize? StartupBlock::allocate(size) :
2632             // nested allocation, so skip tls
2633             (FreeObject*)defaultMemPool->getFromLLOCache(NULL, size, slabSize);
2634 #endif
2635 
2636     if (!isMallocInitialized())
2637         if (!doInitialization())
2638             return NULL;
2639     return internalPoolMalloc(defaultMemPool, size);
2640 }
2641 
2642 static void internalFree(void *object)
2643 {
2644     internalPoolFree(defaultMemPool, object, 0);
2645 }
2646 
2647 static size_t internalMsize(void* ptr)
2648 {
2649     MALLOC_ASSERT(ptr, "Invalid pointer passed to internalMsize");
2650     if (isLargeObject<ourMem>(ptr)) {
2651         // TODO: return the maximum memory size, that can be written to this object
2652         LargeMemoryBlock* lmb = ((LargeObjectHdr*)ptr - 1)->memoryBlock;
2653         return lmb->objectSize;
2654     } else {
2655         Block *block = (Block*)alignDown(ptr, slabSize);
2656         return block->findObjectSize(ptr);
2657     }
2658 }
2659 
2660 } // namespace internal
2661 
2662 using namespace rml::internal;
2663 
2664 // legacy entry point saved for compatibility with binaries compiled
2665 // with pre-6003 versions of TBB
2666 rml::MemoryPool *pool_create(intptr_t pool_id, const MemPoolPolicy *policy)
2667 {
2668     rml::MemoryPool *pool;
2669     MemPoolPolicy pol(policy->pAlloc, policy->pFree, policy->granularity);
2670 
2671     pool_create_v1(pool_id, &pol, &pool);
2672     return pool;
2673 }
2674 
2675 rml::MemPoolError pool_create_v1(intptr_t pool_id, const MemPoolPolicy *policy,
2676                                  rml::MemoryPool **pool)
2677 {
2678     if ( !policy->pAlloc || policy->version<MemPoolPolicy::TBBMALLOC_POOL_VERSION
2679          // empty pFree allowed only for fixed pools
2680          || !(policy->fixedPool || policy->pFree)) {
2681         *pool = NULL;
2682         return INVALID_POLICY;
2683     }
2684     if ( policy->version>MemPoolPolicy::TBBMALLOC_POOL_VERSION // future versions are not supported
2685          // new flags can be added in place of reserved, but default
2686          // behaviour must be supported by this version
2687          || policy->reserved ) {
2688         *pool = NULL;
2689         return UNSUPPORTED_POLICY;
2690     }
2691     if (!isMallocInitialized())
2692         if (!doInitialization()) {
2693             *pool = NULL;
2694             return NO_MEMORY;
2695         }
2696     rml::internal::MemoryPool *memPool =
2697         (rml::internal::MemoryPool*)internalMalloc((sizeof(rml::internal::MemoryPool)));
2698     if (!memPool) {
2699         *pool = NULL;
2700         return NO_MEMORY;
2701     }
2702     memset(memPool, 0, sizeof(rml::internal::MemoryPool));
2703     if (!memPool->init(pool_id, policy)) {
2704         internalFree(memPool);
2705         *pool = NULL;
2706         return NO_MEMORY;
2707     }
2708 
2709     *pool = (rml::MemoryPool*)memPool;
2710     return POOL_OK;
2711 }
2712 
2713 bool pool_destroy(rml::MemoryPool* memPool)
2714 {
2715     if (!memPool) return false;
2716     bool ret = ((rml::internal::MemoryPool*)memPool)->destroy();
2717     internalFree(memPool);
2718 
2719     return ret;
2720 }
2721 
2722 bool pool_reset(rml::MemoryPool* memPool)
2723 {
2724     if (!memPool) return false;
2725 
2726     return ((rml::internal::MemoryPool*)memPool)->reset();
2727 }
2728 
2729 void *pool_malloc(rml::MemoryPool* mPool, size_t size)
2730 {
2731     return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2732 }
2733 
2734 void *pool_realloc(rml::MemoryPool* mPool, void *object, size_t size)
2735 {
2736     if (!object)
2737         return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
2738     if (!size) {
2739         internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2740         return NULL;
2741     }
2742     return reallocAligned((rml::internal::MemoryPool*)mPool, object, size, 0);
2743 }
2744 
2745 void *pool_aligned_malloc(rml::MemoryPool* mPool, size_t size, size_t alignment)
2746 {
2747     if (!isPowerOfTwo(alignment) || 0==size)
2748         return NULL;
2749 
2750     return allocateAligned((rml::internal::MemoryPool*)mPool, size, alignment);
2751 }
2752 
2753 void *pool_aligned_realloc(rml::MemoryPool* memPool, void *ptr, size_t size, size_t alignment)
2754 {
2755     if (!isPowerOfTwo(alignment))
2756         return NULL;
2757     rml::internal::MemoryPool *mPool = (rml::internal::MemoryPool*)memPool;
2758     void *tmp;
2759 
2760     if (!ptr)
2761         tmp = allocateAligned(mPool, size, alignment);
2762     else if (!size) {
2763         internalPoolFree(mPool, ptr, 0);
2764         return NULL;
2765     } else
2766         tmp = reallocAligned(mPool, ptr, size, alignment);
2767 
2768     return tmp;
2769 }
2770 
2771 bool pool_free(rml::MemoryPool *mPool, void *object)
2772 {
2773     return internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
2774 }
2775 
2776 rml::MemoryPool *pool_identify(void *object)
2777 {
2778     rml::internal::MemoryPool *pool;
2779     if (isLargeObject<ourMem>(object)) {
2780         LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2781         pool = header->memoryBlock->pool;
2782     } else {
2783         Block *block = (Block*)alignDown(object, slabSize);
2784         pool = block->getMemPool();
2785     }
2786     // do not return defaultMemPool, as it can't be used in pool_free() etc
2787     __TBB_ASSERT_RELEASE(pool!=defaultMemPool,
2788         "rml::pool_identify() can't be used for scalable_malloc() etc results.");
2789     return (rml::MemoryPool*)pool;
2790 }
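// Usage sketch (illustrative only, excluded from the build): pool_identify() lets code that receives
// a pooled object without its pool handle recover the owning pool, e.g. to free the object there.
// Note that it is valid only for objects from user pools, not for results of scalable_malloc();
// freeWherever is a hypothetical helper name.
#if 0
static void freeWherever(void *obj) {
    rml::MemoryPool *owner = rml::pool_identify(obj); // recover the pool the object came from
    rml::pool_free(owner, obj);                       // and return the object to it
}
#endif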
2791 
2792 size_t pool_msize(rml::MemoryPool *mPool, void* object)
2793 {
2794     if (object) {
2795         // No assert for object recognition, because objects allocated from a non-default
2796         // memory pool do not participate in range checking and do not have valid backreferences
2797         // for small objects. Instead, check that the object belongs to the given memory pool.
2798         MALLOC_ASSERT_EX(mPool == pool_identify(object), "Object does not belong to the specified pool");
2799         return internalMsize(object);
2800     }
2801     errno = EINVAL;
2802     // Unlike _msize, return 0 in case of parameter error.
2803     // Returning size_t(-1) would be more likely to lead to trouble.
2804     return 0;
2805 }
2806 
2807 } // namespace rml
2808 
2809 using namespace rml::internal;
2810 
2811 #if MALLOC_TRACE
2812 static unsigned int threadGoingDownCount = 0;
2813 #endif
2814 
2815 /*
2816  * When a thread is shutting down this routine should be called to remove all the thread ids
2817  * from the malloc blocks and replace them with a NULL thread id.
2818  *
2819  * For pthreads, the function is set as a callback in pthread_key_create for TLS bin.
2820  * It will be automatically called at thread exit with the key value as the argument,
2821  * unless that value is NULL.
2822  * For Windows, it is called from DllMain( DLL_THREAD_DETACH ).
2823  *
2824  * However, neither of the above is called for the main process thread, so the routine
2825  * also needs to be called during process shutdown.
2826  *
2827 */
2828 // TODO: Consider making this function part of class MemoryPool.
2829 void doThreadShutdownNotification(TLSData* tls, bool main_thread)
2830 {
2831     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return start %d\n",
2832              getThreadId(),  threadGoingDownCount++ ));
2833 
2834 #if USE_PTHREAD
2835     if (tls) {
2836         if (!shutdownSync.threadDtorStart()) return;
2837         tls->getMemPool()->onThreadShutdown(tls);
2838         shutdownSync.threadDtorDone();
2839     } else
2840 #endif
2841     {
2842         suppress_unused_warning(tls); // not used on Windows
2843         // The default pool is safe to use at this point:
2844         //   on Linux, only the main thread can go here before destroying defaultMemPool;
2845         //   on Windows, shutdown is synchronized via loader lock and isMallocInitialized().
2846         // See also __TBB_mallocProcessShutdownNotification()
2847         defaultMemPool->onThreadShutdown(defaultMemPool->getTLS(/*create=*/false));
2848         // Take lock to walk through other pools; but waiting might be dangerous at this point
2849         // (e.g. on Windows the main thread might deadlock)
2850         bool locked;
2851         MallocMutex::scoped_lock lock(MemoryPool::memPoolListLock, /*wait=*/!main_thread, &locked);
2852         if (locked) { // the list is safe to process
2853             for (MemoryPool *memPool = defaultMemPool->next; memPool; memPool = memPool->next)
2854                 memPool->onThreadShutdown(memPool->getTLS(/*create=*/false));
2855         }
2856     }
2857 
2858     TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return end\n", getThreadId() ));
2859 }
2860 
2861 #if USE_PTHREAD
2862 void mallocThreadShutdownNotification(void* arg)
2863 {
2864     // The routine is called for each pool (as TLS dtor) on each thread, except for the main thread
2865     if (!isMallocInitialized()) return;
2866     doThreadShutdownNotification((TLSData*)arg, false);
2867 }
2868 #else
2869 extern "C" void __TBB_mallocThreadShutdownNotification()
2870 {
2871     // The routine is called once per thread on Windows
2872     if (!isMallocInitialized()) return;
2873     doThreadShutdownNotification(NULL, false);
2874 }
2875 #endif
2876 
2877 extern "C" void __TBB_mallocProcessShutdownNotification(bool windows_process_dying)
2878 {
2879     if (!isMallocInitialized()) return;
2880 
2881     // Don't clean allocator internals if the entire process is exiting
2882     if (!windows_process_dying) {
2883         doThreadShutdownNotification(NULL, /*main_thread=*/true);
2884     }
2885 #if  __TBB_MALLOC_LOCACHE_STAT
2886     printf("cache hit ratio %f, size hit %f\n",
2887            1.*cacheHits/mallocCalls, 1.*memHitKB/memAllocKB);
2888     defaultMemPool->extMemPool.loc.reportStat(stdout);
2889 #endif
2890 
2891     shutdownSync.processExit();
2892 #if __TBB_SOURCE_DIRECTLY_INCLUDED
2893 /* Pthread keys must be deleted as soon as possible, so that the key destructor is not
2894    called at thread termination when the tbbmalloc code may already have been unloaded.
2895 */
2896     defaultMemPool->destroy();
2897     destroyBackRefMaster(&defaultMemPool->extMemPool.backend);
2898     ThreadId::destroy();      // Delete key for thread id
2899     hugePages.reset();
2900     // new total malloc initialization is possible after this point
2901     FencedStore(mallocInitialized, 0);
2902 #elif __TBB_USE_DLOPEN_REENTRANCY_WORKAROUND
2903 /* In most cases we prevent tbbmalloc from being unloaded and do not clean up memory
2904    on process shutdown. When unloading cannot be prevented, the library unload results
2905    in this shutdown notification, and it makes sense to release unused memory
2906    at that point (we cannot release all memory because some of it may still be
2907    accessed afterwards).
2908    TODO: better support systems where unloading cannot be prevented, by removing
2909    pthread destructors and releasing caches.
2910  */
2911     defaultMemPool->extMemPool.hardCachesCleanup();
2912 #endif // __TBB_SOURCE_DIRECTLY_INCLUDED
2913 
2914 #if COLLECT_STATISTICS
2915     unsigned nThreads = ThreadId::getMaxThreadId();
2916     for( int i=1; i<=nThreads && i<MAX_THREADS; ++i )
2917         STAT_print(i);
2918 #endif
2919     if (!usedBySrcIncluded)
2920         MALLOC_ITT_FINI_ITTLIB();
2921 }
2922 
2923 extern "C" void * scalable_malloc(size_t size)
2924 {
2925     void *ptr = internalMalloc(size);
2926     if (!ptr) errno = ENOMEM;
2927     return ptr;
2928 }
2929 
2930 extern "C" void scalable_free(void *object)
2931 {
2932     internalFree(object);
2933 }
2934 
2935 #if MALLOC_ZONE_OVERLOAD_ENABLED
2936 extern "C" void __TBB_malloc_free_definite_size(void *object, size_t size)
2937 {
2938     internalPoolFree(defaultMemPool, object, size);
2939 }
2940 #endif
2941 
2942 /*
2943  * A variant that provides additional memory safety, by checking whether the given address
2944  * was obtained with this allocator, and if not redirecting to the provided alternative call.
2945  */
2946 extern "C" void __TBB_malloc_safer_free(void *object, void (*original_free)(void*))
2947 {
2948     if (!object)
2949         return;
2950 
2951     // tbbmalloc could have allocated this object only if tbbmalloc has already been initialized
2952     if (FencedLoad(mallocInitialized) && defaultMemPool->extMemPool.backend.ptrCanBeValid(object)) {
2953         if (isLargeObject<unknownMem>(object)) {
2954             // must check for a large object first, because the small-object check touches
2955             // 4 pages to the left, which may be inaccessible
2956             TLSData *tls = defaultMemPool->getTLS(/*create=*/false);
2957 
2958             defaultMemPool->putToLLOCache(tls, object);
2959             return;
2960         } else if (isSmallObject(object)) {
2961             freeSmallObject(object);
2962             return;
2963         }
2964     }
2965     if (original_free)
2966         original_free(object);
2967 }
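// Usage sketch (illustrative only, excluded from the build): how an interposing free() replacement,
// such as the tbbmalloc proxy layer, might forward to __TBB_malloc_safer_free(). Objects owned by
// tbbmalloc are freed here; anything else falls back to the original routine passed as the second
// argument. replaced_free and orig_crt_free are hypothetical names; resolving the original free is
// assumed to be done elsewhere by the interposition machinery.
#if 0
static void (*orig_crt_free)(void*); // original CRT/libc free, resolved by the interposition layer

extern "C" void replaced_free(void *ptr) {
    __TBB_malloc_safer_free(ptr, orig_crt_free);
}
#endif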
2968 
2969 /********* End the free code        *************/
2970 
2971 /********* Code for scalable_realloc       ***********/
2972 
2973 /*
2974  * From K&R
2975  * "realloc changes the size of the object pointed to by p to size. The contents will
2976  * be unchanged up to the minimum of the old and the new sizes. If the new size is larger,
2977  * the new space is uninitialized. realloc returns a pointer to the new space, or
2978  * NULL if the request cannot be satisfied, in which case *p is unchanged."
2979  *
2980  */
2981 extern "C" void* scalable_realloc(void* ptr, size_t size)
2982 {
2983     void *tmp;
2984 
2985     if (!ptr)
2986         tmp = internalMalloc(size);
2987     else if (!size) {
2988         internalFree(ptr);
2989         return NULL;
2990     } else
2991         tmp = reallocAligned(defaultMemPool, ptr, size, 0);
2992 
2993     if (!tmp) errno = ENOMEM;
2994     return tmp;
2995 }
2996 
2997 /*
2998  * A variant that provides additional memory safety, by checking whether the given address
2999  * was obtained with this allocator, and if not redirecting to the provided alternative call.
3000  */
3001 extern "C" void* __TBB_malloc_safer_realloc(void* ptr, size_t sz, void* original_realloc)
3002 {
3003     void *tmp = NULL; // initialized to avoid compiler warnings about uninitialized use
3004 
3005     if (!ptr) {
3006         tmp = internalMalloc(sz);
3007     } else if (FencedLoad(mallocInitialized) && isRecognized(ptr)) {
3008         if (!sz) {
3009             internalFree(ptr);
3010             return NULL;
3011         } else {
3012             tmp = reallocAligned(defaultMemPool, ptr, sz, 0);
3013         }
3014     }
3015 #if USE_WINTHREAD
3016     else if (original_realloc && sz) {
3017         orig_ptrs *original_ptrs = static_cast<orig_ptrs*>(original_realloc);
3018         if ( original_ptrs->msize ){
3019             size_t oldSize = original_ptrs->msize(ptr);
3020             tmp = internalMalloc(sz);
3021             if (tmp) {
3022                 memcpy(tmp, ptr, sz<oldSize? sz : oldSize);
3023                 if ( original_ptrs->free ){
3024                     original_ptrs->free( ptr );
3025                 }
3026             }
3027         } else
3028             tmp = NULL;
3029     }
3030 #else
3031     else if (original_realloc) {
3032         typedef void* (*realloc_ptr_t)(void*,size_t);
3033         realloc_ptr_t original_realloc_ptr;
3034         (void *&)original_realloc_ptr = original_realloc;
3035         tmp = original_realloc_ptr(ptr,sz);
3036     }
3037 #endif
3038     else tmp = NULL;
3039 
3040     if (!tmp) errno = ENOMEM;
3041     return tmp;
3042 }
3043 
3044 /********* End code for scalable_realloc   ***********/
3045 
3046 /********* Code for scalable_calloc   ***********/
3047 
3048 /*
3049  * From K&R
3050  * calloc returns a pointer to space for an array of nobj objects,
3051  * each of size size, or NULL if the request cannot be satisfied.
3052  * The space is initialized to zero bytes.
3053  *
3054  */
3055 
3056 extern "C" void * scalable_calloc(size_t nobj, size_t size)
3057 {
3058     // square root of 2^(number of bits in size_t): if both factors are below it, nobj*size cannot overflow
3059     const size_t mult_not_overflow = size_t(1) << (sizeof(size_t)*CHAR_BIT/2);
3060     const size_t arraySize = nobj * size;
3061 
3062     // check for overflow during multiplication:
3063     if (nobj>=mult_not_overflow || size>=mult_not_overflow) // 1) heuristic check
3064         if (nobj && arraySize / nobj != size) {             // 2) exact check
3065             errno = ENOMEM;
3066             return NULL;
3067         }
3068     void* result = internalMalloc(arraySize);
3069     if (result)
3070         memset(result, 0, arraySize);
3071     else
3072         errno = ENOMEM;
3073     return result;
3074 }
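// Worked sketch of the overflow check above (illustrative only, excluded from the build): with a
// 64-bit size_t, mult_not_overflow is 2^32, so if both factors are below 2^32 their product fits in
// 64 bits and no further check is needed; the division-based exact check runs only when the cheap
// heuristic fails. mulWouldOverflow is a hypothetical standalone restatement of the same logic.
#if 0
static bool mulWouldOverflow(size_t nobj, size_t size) {
    const size_t mult_not_overflow = size_t(1) << (sizeof(size_t)*CHAR_BIT/2);
    if (nobj < mult_not_overflow && size < mult_not_overflow)
        return false;                                // both factors below 2^(bits/2): cannot overflow
    return nobj && (nobj * size) / nobj != size;     // exact check: division undoes a non-overflowing multiply
}
#endif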
3075 
3076 /********* End code for scalable_calloc   ***********/
3077 
3078 /********* Code for aligned allocation API **********/
3079 
3080 extern "C" int scalable_posix_memalign(void **memptr, size_t alignment, size_t size)
3081 {
3082     if ( !isPowerOfTwoAtLeast(alignment, sizeof(void*)) )
3083         return EINVAL;
3084     void *result = allocateAligned(defaultMemPool, size, alignment);
3085     if (!result)
3086         return ENOMEM;
3087     *memptr = result;
3088     return 0;
3089 }
3090 
3091 extern "C" void * scalable_aligned_malloc(size_t size, size_t alignment)
3092 {
3093     if (!isPowerOfTwo(alignment) || 0==size) {
3094         errno = EINVAL;
3095         return NULL;
3096     }
3097     void *tmp = allocateAligned(defaultMemPool, size, alignment);
3098     if (!tmp) errno = ENOMEM;
3099     return tmp;
3100 }
3101 
3102 extern "C" void * scalable_aligned_realloc(void *ptr, size_t size, size_t alignment)
3103 {
3104     if (!isPowerOfTwo(alignment)) {
3105         errno = EINVAL;
3106         return NULL;
3107     }
3108     void *tmp;
3109 
3110     if (!ptr)
3111         tmp = allocateAligned(defaultMemPool, size, alignment);
3112     else if (!size) {
3113         scalable_free(ptr);
3114         return NULL;
3115     } else
3116         tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3117 
3118     if (!tmp) errno = ENOMEM;
3119     return tmp;
3120 }
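// Usage sketch (illustrative only, excluded from the build): the aligned API mirrors the plain
// malloc/realloc/free trio but guarantees the requested power-of-two alignment. Memory from
// scalable_aligned_malloc() is released with scalable_aligned_free() (or scalable_free(), since
// both end up in internalFree() in this file). alignedUsageExample is a hypothetical name.
#if 0
static void alignedUsageExample() {
    void *buf = scalable_aligned_malloc(1024, 64);            // 1 KB, 64-byte aligned
    if (!buf) return;                                         // errno set to ENOMEM on failure
    void *bigger = scalable_aligned_realloc(buf, 4096, 64);   // grow, keeping the alignment
    if (bigger) buf = bigger;                                 // on failure the old block stays valid
    scalable_aligned_free(buf);
}
#endif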
3121 
3122 extern "C" void * __TBB_malloc_safer_aligned_realloc(void *ptr, size_t size, size_t alignment, void* orig_function)
3123 {
3124     /* corner cases left out of reallocAligned to not deal with errno there */
3125     if (!isPowerOfTwo(alignment)) {
3126         errno = EINVAL;
3127         return NULL;
3128     }
3129     void *tmp = NULL;
3130 
3131     if (!ptr) {
3132         tmp = allocateAligned(defaultMemPool, size, alignment);
3133     } else if (FencedLoad(mallocInitialized) && isRecognized(ptr)) {
3134         if (!size) {
3135             internalFree(ptr);
3136             return NULL;
3137         } else {
3138             tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
3139         }
3140     }
3141 #if USE_WINTHREAD
3142     else {
3143         orig_aligned_ptrs *original_ptrs = static_cast<orig_aligned_ptrs*>(orig_function);
3144         if (size) {
3145             // Without aligned_msize we cannot determine the old size,
3146             // so just keep the old pointer unchanged.
3147             if ( original_ptrs->aligned_msize ){
3148                 // pass minimal alignment and zero offset so that aligned_msize can return a plausible oldSize
3149                 size_t oldSize = original_ptrs->aligned_msize(ptr, sizeof(void*), 0);
3150                 tmp = allocateAligned(defaultMemPool, size, alignment);
3151                 if (tmp) {
3152                     memcpy(tmp, ptr, size<oldSize? size : oldSize);
3153                     if ( original_ptrs->aligned_free ){
3154                         original_ptrs->aligned_free( ptr );
3155                     }
3156                 }
3157             }
3158         } else {
3159             if ( original_ptrs->aligned_free ){
3160                 original_ptrs->aligned_free( ptr );
3161             }
3162             return NULL;
3163         }
3164     }
3165 #else
3166     // Since original_realloc cannot align the result, and there is no way to find
3167     // the size of the object being reallocated, we give up.
3168     suppress_unused_warning(orig_function);
3169 #endif
3170     if (!tmp) errno = ENOMEM;
3171     return tmp;
3172 }
3173 
3174 extern "C" void scalable_aligned_free(void *ptr)
3175 {
3176     internalFree(ptr);
3177 }
3178 
3179 /********* end code for aligned allocation API **********/
3180 
3181 /********* Code for scalable_msize       ***********/
3182 
3183 /*
3184  * Returns the size of a memory block allocated in the heap.
3185  */
3186 extern "C" size_t scalable_msize(void* ptr)
3187 {
3188     if (ptr) {
3189         MALLOC_ASSERT(isRecognized(ptr), "Invalid pointer in scalable_msize detected.");
3190         return internalMsize(ptr);
3191     }
3192     errno = EINVAL;
3193     // Unlike _msize, return 0 in case of parameter error.
3194     // Returning size_t(-1) would be more likely to lead to trouble.
3195     return 0;
3196 }
3197 
3198 /*
3199  * A variant that provides additional memory safety, by checking whether the given address
3200  * was obtained with this allocator, and if not redirecting to the provided alternative call.
3201  */
3202 extern "C" size_t __TBB_malloc_safer_msize(void *object, size_t (*original_msize)(void*))
3203 {
3204     if (object) {
3205         // Check if the memory was allocated by scalable_malloc
3206         if (FencedLoad(mallocInitialized) && isRecognized(object))
3207             return internalMsize(object);
3208         else if (original_msize)
3209             return original_msize(object);
3210     }
3211     // object is NULL or unknown, or foreign and no original_msize
3212 #if USE_WINTHREAD
3213     errno = EINVAL; // errno expected to be set only on this platform
3214 #endif
3215     return 0;
3216 }
3217 
3218 /*
3219  * The same as above but for _aligned_msize case
3220  */
3221 extern "C" size_t __TBB_malloc_safer_aligned_msize(void *object, size_t alignment, size_t offset, size_t (*orig_aligned_msize)(void*,size_t,size_t))
3222 {
3223     if (object) {
3224         // Check if the memory was allocated by scalable_malloc
3225         if (FencedLoad(mallocInitialized) && isRecognized(object))
3226             return internalMsize(object);
3227         else if (orig_aligned_msize)
3228             return orig_aligned_msize(object,alignment,offset);
3229     }
3230     // object is NULL or unknown
3231     errno = EINVAL;
3232     return 0;
3233 }
3234 
3235 /********* End code for scalable_msize   ***********/
3236 
3237 extern "C" int scalable_allocation_mode(int param, intptr_t value)
3238 {
3239     if (param == TBBMALLOC_SET_SOFT_HEAP_LIMIT) {
3240         defaultMemPool->extMemPool.backend.setRecommendedMaxSize((size_t)value);
3241         return TBBMALLOC_OK;
3242     } else if (param == USE_HUGE_PAGES) {
3243 #if __linux__
3244         switch (value) {
3245         case 0:
3246         case 1:
3247             hugePages.setMode(value);
3248             return TBBMALLOC_OK;
3249         default:
3250             return TBBMALLOC_INVALID_PARAM;
3251         }
3252 #else
3253         return TBBMALLOC_NO_EFFECT;
3254 #endif
3255 #if __TBB_SOURCE_DIRECTLY_INCLUDED
3256     } else if (param == TBBMALLOC_INTERNAL_SOURCE_INCLUDED) {
3257         switch (value) {
3258         case 0: // used by dynamic library
3259         case 1: // used by static library or directly included sources
3260             usedBySrcIncluded = value;
3261             return TBBMALLOC_OK;
3262         default:
3263             return TBBMALLOC_INVALID_PARAM;
3264         }
3265 #endif
3266     } else if (param == TBBMALLOC_SET_HUGE_SIZE_THRESHOLD) {
3267         defaultMemPool->extMemPool.loc.setHugeSizeThreshold((size_t)value);
3268         return TBBMALLOC_OK;
3269     }
3270     return TBBMALLOC_INVALID_PARAM;
3271 }
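// Usage sketch (illustrative only, excluded from the build): tuning the allocator through
// scalable_allocation_mode() with the parameters handled above. The numeric values are arbitrary
// examples and tuningExample is a hypothetical name.
#if 0
static void tuningExample() {
    // Recommend a soft cap (in bytes) on memory the backend keeps from the OS.
    scalable_allocation_mode(TBBMALLOC_SET_SOFT_HEAP_LIMIT, 1024*1024*1024);
    // Request huge pages where supported; returns TBBMALLOC_NO_EFFECT on non-Linux systems.
    scalable_allocation_mode(USE_HUGE_PAGES, 1);
}
#endif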
3272 
3273 extern "C" int scalable_allocation_command(int cmd, void *param)
3274 {
3275     if (param)
3276         return TBBMALLOC_INVALID_PARAM;
3277 
3278     bool released = false;
3279     switch(cmd) {
3280     case TBBMALLOC_CLEAN_THREAD_BUFFERS:
3281         if (TLSData *tls = defaultMemPool->getTLS(/*create=*/false))
3282             released = tls->externalCleanup(/*cleanOnlyUnused*/false, /*cleanBins=*/true);
3283         break;
3284     case TBBMALLOC_CLEAN_ALL_BUFFERS:
3285         released = defaultMemPool->extMemPool.hardCachesCleanup();
3286         break;
3287     default:
3288         return TBBMALLOC_INVALID_PARAM;
3289     }
3290     return released ? TBBMALLOC_OK : TBBMALLOC_NO_EFFECT;
3291 }
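// Usage sketch (illustrative only, excluded from the build): explicitly releasing cached memory via
// scalable_allocation_command(); the second argument is currently reserved and must be NULL, as
// enforced above. cleanupExample is a hypothetical name.
#if 0
static void cleanupExample() {
    // Clean the calling thread's per-thread buffers.
    scalable_allocation_command(TBBMALLOC_CLEAN_THREAD_BUFFERS, NULL);
    // Flush all internal caches of the default pool.
    scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, NULL);
}
#endif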
3292