1 /*
2     Copyright (c) 2005-2021 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include "tbbmalloc_internal.h"
18 #include <errno.h>
19 #include <new>        /* for placement new */
20 #include <string.h>   /* for memset */
21 
22 #include "oneapi/tbb/version.h"
23 #include "../tbb/environment.h"
24 #include "../tbb/itt_notify.h" // for __TBB_load_ittnotify()
25 
26 #if USE_PTHREAD
27     #define TlsSetValue_func pthread_setspecific
28     #define TlsGetValue_func pthread_getspecific
29     #define GetMyTID() pthread_self()
30     #include <sched.h>
    inline void do_yield() {sched_yield();}
32     extern "C" { static void mallocThreadShutdownNotification(void*); }
33     #if __sun || __SUNPRO_CC
34     #define __asm__ asm
35     #endif
36     #include <unistd.h> // sysconf(_SC_PAGESIZE)
37 #elif USE_WINTHREAD
38     #define GetMyTID() GetCurrentThreadId()
39 #if __TBB_WIN8UI_SUPPORT
40     #include<thread>
41     #define TlsSetValue_func FlsSetValue
42     #define TlsGetValue_func FlsGetValue
43     #define TlsAlloc() FlsAlloc(NULL)
44     #define TLS_ALLOC_FAILURE FLS_OUT_OF_INDEXES
45     #define TlsFree FlsFree
    inline void do_yield() {std::this_thread::yield();}
47 #else
48     #define TlsSetValue_func TlsSetValue
49     #define TlsGetValue_func TlsGetValue
50     #define TLS_ALLOC_FAILURE TLS_OUT_OF_INDEXES
    inline void do_yield() {SwitchToThread();}
52 #endif
53 #else
54     #error Must define USE_PTHREAD or USE_WINTHREAD
55 #endif
56 
57 #define FREELIST_NONBLOCKING 1
58 
59 namespace rml {
60 class MemoryPool;
61 namespace internal {
62 
63 class Block;
64 class MemoryPool;
65 
66 #if MALLOC_CHECK_RECURSION
67 
68 inline bool isMallocInitialized();
69 
70 #endif // MALLOC_CHECK_RECURSION
71 
72 /** Support for handling the special UNUSABLE pointer state **/
73 const intptr_t UNUSABLE = 0x1;
inline bool isSolidPtr( void* ptr ) {
    return (UNUSABLE|(intptr_t)ptr)!=UNUSABLE;
}
inline bool isNotForUse( void* ptr ) {
    return (intptr_t)ptr==UNUSABLE;
}
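// A Block's publicFreeList can be in one of three states: NULL (no objects have been
// freed into it by other threads since the last privatization), the special UNUSABLE
// marker (see privatizePublicFreeList below), or a "solid" pointer to the head of the
// list of publicly freed objects.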
80 
81 /*
82  * Block::objectSize value used to mark blocks allocated by startupAlloc
83  */
84 const uint16_t startupAllocObjSizeMark = ~(uint16_t)0;
85 
/*
 * The following constant is used to define the size of struct Block, the block header.
 * The intent is to have the size of a Block be a multiple of the cache line size, which allows us to
 * get good alignment at the cost of some overhead equal to the amount of padding included in the Block.
 */
91 const int blockHeaderAlignment = estimatedCacheLineSize;
92 
93 /********* The data structures and global objects        **************/
94 
/*
 * The malloc routines themselves occasionally need to allocate some space
 * in order to set up the thread-local structures. This class provides that
 * bootstrap allocation.
 */
100 class BootStrapBlocks {
101     MallocMutex bootStrapLock;
102     Block      *bootStrapBlock;
103     Block      *bootStrapBlockUsed;
104     FreeObject *bootStrapObjectList;
105 public:
106     void *allocate(MemoryPool *memPool, size_t size);
107     void free(void* ptr);
108     void reset();
109 };
110 
111 #if USE_INTERNAL_TID
112 class ThreadId {
113     static tls_key_t Tid_key;
114     std::atomic<intptr_t> ThreadCount;
115 
116     unsigned int id;
117 
    static unsigned int tlsNumber() {
119         unsigned int result = reinterpret_cast<intptr_t>(TlsGetValue_func(Tid_key));
120         if( !result ) {
121             RecursiveMallocCallProtector scoped;
122             // Thread-local value is zero -> first call from this thread,
123             // need to initialize with next ID value (IDs start from 1)
124             result = ++ThreadCount; // returned new value!
125             TlsSetValue_func( Tid_key, reinterpret_cast<void*>(result) );
126         }
127         return result;
128     }
129 public:
    static bool init() {
131 #if USE_WINTHREAD
132         Tid_key = TlsAlloc();
133         if (Tid_key == TLS_ALLOC_FAILURE)
134             return false;
135 #else
136         int status = pthread_key_create( &Tid_key, NULL );
137         if ( status ) {
138             fprintf (stderr, "The memory manager cannot create tls key during initialization\n");
139             return false;
140         }
141 #endif /* USE_WINTHREAD */
142         return true;
143     }
144 #if __TBB_SOURCE_DIRECTLY_INCLUDED
    static void destroy() {
146         if( Tid_key ) {
147 #if USE_WINTHREAD
148             BOOL status = !(TlsFree( Tid_key ));  // fail is zero
149 #else
150             int status = pthread_key_delete( Tid_key );
151 #endif /* USE_WINTHREAD */
152             if ( status )
153                 fprintf (stderr, "The memory manager cannot delete tls key\n");
154             Tid_key = 0;
155         }
156     }
157 #endif
158 
    ThreadId() : id(ThreadId::tlsNumber()) {}
    bool isCurrentThreadId() const { return id == ThreadId::tlsNumber(); }

#if COLLECT_STATISTICS || MALLOC_TRACE
    friend unsigned int getThreadId() { return ThreadId::tlsNumber(); }
#endif
#if COLLECT_STATISTICS
    static unsigned getMaxThreadId() { return ThreadCount.load(std::memory_order_relaxed); }
167 
168     friend int STAT_increment(ThreadId tid, int bin, int ctr);
169 #endif
170 };
171 
172 tls_key_t ThreadId::Tid_key;
173 intptr_t ThreadId::ThreadCount;
174 
175 #if COLLECT_STATISTICS
int STAT_increment(ThreadId tid, int bin, int ctr)
177 {
178     return ::STAT_increment(tid.id, bin, ctr);
179 }
180 #endif
181 
182 #else // USE_INTERNAL_TID
183 
184 class ThreadId {
185 #if USE_PTHREAD
186     std::atomic<pthread_t> tid;
187 #else
188     std::atomic<DWORD>     tid;
189 #endif
190 public:
    ThreadId() : tid(GetMyTID()) {}
#if USE_PTHREAD
    bool isCurrentThreadId() const { return pthread_equal(pthread_self(), tid.load(std::memory_order_relaxed)); }
#else
    bool isCurrentThreadId() const { return GetCurrentThreadId() == tid.load(std::memory_order_relaxed); }
#endif
    ThreadId& operator=(const ThreadId& other) {
        tid.store(other.tid.load(std::memory_order_relaxed), std::memory_order_relaxed);
        return *this;
    }
    static bool init() { return true; }
#if __TBB_SOURCE_DIRECTLY_INCLUDED
    static void destroy() {}
204 #endif
205 };
206 
207 #endif // USE_INTERNAL_TID
208 
209 /*********** Code to provide thread ID and a thread-local void pointer **********/
210 
bool TLSKey::init()
212 {
213 #if USE_WINTHREAD
214     TLS_pointer_key = TlsAlloc();
215     if (TLS_pointer_key == TLS_ALLOC_FAILURE)
216         return false;
217 #else
218     int status = pthread_key_create( &TLS_pointer_key, mallocThreadShutdownNotification );
219     if ( status )
220         return false;
221 #endif /* USE_WINTHREAD */
222     return true;
223 }
224 
bool TLSKey::destroy()
226 {
227 #if USE_WINTHREAD
228     BOOL status1 = !(TlsFree(TLS_pointer_key)); // fail is zero
229 #else
230     int status1 = pthread_key_delete(TLS_pointer_key);
231 #endif /* USE_WINTHREAD */
232     MALLOC_ASSERT(!status1, "The memory manager cannot delete tls key.");
233     return status1==0;
234 }
235 
inline TLSData* TLSKey::getThreadMallocTLS() const
{
    return (TLSData *)TlsGetValue_func( TLS_pointer_key );
}

inline void TLSKey::setThreadMallocTLS( TLSData * newvalue ) {
    RecursiveMallocCallProtector scoped;
    TlsSetValue_func( TLS_pointer_key, newvalue );
}
245 
/* The 'next' field in the block header has to maintain some invariants:
 *   it needs to be on a 16K boundary and the first field in the block.
 *   Any value stored there needs to have the lower 14 bits set to 0
 *   so that various asserts work. This means that if you want to smash this memory
 *   for debugging purposes you will need to obey this invariant.
 * The total size of the header needs to be a power of 2 to simplify
 * the alignment requirements. For now it is a 128 byte structure.
 * To avoid false sharing, the fields changed only locally are separated
 * from the fields changed by foreign threads.
 * Changing the size of the block header would require changing
 * some bin allocation sizes, in particular the "fitting" sizes (see below).
 */
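// Illustration of the invariant above (assuming slabSize == 16*1024): a slab Block
// starts on a 16K boundary, so the header of the block containing a small object can
// be recovered by masking, e.g. (Block*)((uintptr_t)object & ~(uintptr_t)(slabSize - 1)),
// and any pointer stored in 'next' has its lower 14 bits equal to zero.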
258 class Bin;
259 class StartupBlock;
260 
261 class MemoryPool {
    // if no explicit granularity is given, expect to see malloc in the user's pAlloc
    // and set a reasonably low granularity
264     static const size_t defaultGranularity = estimatedCacheLineSize;
265 
266     MemoryPool();                  // deny
267 public:
268     static MallocMutex  memPoolListLock;
269 
270     // list of all active pools is used to release
271     // all TLS data on thread termination or library unload
272     MemoryPool    *next,
273                   *prev;
274     ExtMemoryPool  extMemPool;
275     BootStrapBlocks bootStrapBlocks;
276 
277     static void initDefaultPool();
278 
279     bool init(intptr_t poolId, const MemPoolPolicy* memPoolPolicy);
280     bool reset();
281     bool destroy();
282     void onThreadShutdown(TLSData *tlsData);
283 
284     inline TLSData *getTLS(bool create);
    void clearTLS() { extMemPool.tlsPointerKey.setThreadMallocTLS(NULL); }
286 
287     Block *getEmptyBlock(size_t size);
288     void returnEmptyBlock(Block *block, bool poolTheBlock);
289 
290     // get/put large object to/from local large object cache
291     void *getFromLLOCache(TLSData *tls, size_t size, size_t alignment);
292     void putToLLOCache(TLSData *tls, void *object);
293 };
294 
295 static intptr_t defaultMemPool_space[sizeof(MemoryPool)/sizeof(intptr_t) +
296                                      (sizeof(MemoryPool)%sizeof(intptr_t)? 1 : 0)];
297 static MemoryPool *defaultMemPool = (MemoryPool*)defaultMemPool_space;
298 const size_t MemoryPool::defaultGranularity;
299 // zero-initialized
300 MallocMutex  MemoryPool::memPoolListLock;
// TODO: move huge page status to the default pool, because it is that pool's state
302 HugePagesStatus hugePages;
303 static bool usedBySrcIncluded = false;
304 
305 // Padding helpers
306 template<size_t padd>
307 struct PaddingImpl {
308     size_t       __padding[padd];
309 };
310 
311 template<>
312 struct PaddingImpl<0> {};
313 
314 template<int N>
315 struct Padding : PaddingImpl<N/sizeof(size_t)> {};
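// PaddingImpl stores 'padd' whole size_t words, so Padding<N> adds roughly N bytes of
// padding (rounded down to a whole number of size_t words) and degenerates to an empty
// base class when N is smaller than sizeof(size_t), e.g. when the padded fields already
// fill an exact number of cache lines.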
316 
// A slab block is 16KB-aligned. To prevent false sharing, locally-accessed fields
// are separated from fields commonly accessed by non-owner threads.
319 class GlobalBlockFields : public BlockI {
320 protected:
321     std::atomic<FreeObject*> publicFreeList;
322     std::atomic<Block*> nextPrivatizable;
323     MemoryPool  *poolPtr;
324 };
325 
326 class LocalBlockFields : public GlobalBlockFields, Padding<blockHeaderAlignment - sizeof(GlobalBlockFields)>  {
327 protected:
328     Block       *next;
329     Block       *previous;        /* Use double linked list to speed up removal */
330     FreeObject  *bumpPtr;         /* Bump pointer moves from the end to the beginning of a block */
331     FreeObject  *freeList;
332     /* Pointer to local data for the owner thread. Used for fast finding tls
333        when releasing object from a block that current thread owned.
334        NULL for orphaned blocks. */
335     std::atomic<TLSData*> tlsPtr;
336     ThreadId     ownerTid;        /* the ID of the thread that owns or last owned the block */
337     BackRefIdx   backRefIdx;
338     uint16_t     allocatedCount;  /* Number of objects allocated (obviously by the owning thread) */
339     uint16_t     objectSize;
340     bool         isFull;
341 
342     friend class FreeBlockPool;
343     friend class StartupBlock;
344     friend class LifoList;
345     friend void *BootStrapBlocks::allocate(MemoryPool *, size_t);
346     friend bool OrphanedBlocks::cleanup(Backend*);
347     friend Block *MemoryPool::getEmptyBlock(size_t);
348 };
349 
// Use inheritance to guarantee that user data starts on the next cache line.
// A padding member cannot be used for this, because when LocalBlockFields already
// ends on a cache line boundary, there must be no extra memory consumption with any compiler.
353 class Block : public LocalBlockFields,
354               Padding<2*blockHeaderAlignment - sizeof(LocalBlockFields)> {
355 public:
    bool empty() const {
357         if (allocatedCount > 0) return false;
358         MALLOC_ASSERT(!isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT);
359         return true;
360     }
361     inline FreeObject* allocate();
362     inline FreeObject *allocateFromFreeList();
363 
364     inline bool adjustFullness();
365     void adjustPositionInBin(Bin* bin = NULL);
366 #if MALLOC_DEBUG
    bool freeListNonNull() { return freeList; }
368 #endif
369     void freePublicObject(FreeObject *objectToFree);
370     inline void freeOwnObject(void *object);
371     void reset();
372     void privatizePublicFreeList( bool reset = true );
373     void restoreBumpPtr();
374     void privatizeOrphaned(TLSData *tls, unsigned index);
375     bool readyToShare();
376     void shareOrphaned(intptr_t binTag, unsigned index);
    unsigned int getSize() const {
378         MALLOC_ASSERT(isStartupAllocObject() || objectSize<minLargeObjectSize,
379                       "Invalid object size");
380         return isStartupAllocObject()? 0 : objectSize;
381     }
    const BackRefIdx *getBackRefIdx() const { return &backRefIdx; }
    inline bool isOwnedByCurrentThread() const;
    bool isStartupAllocObject() const { return objectSize == startupAllocObjSizeMark; }
    inline FreeObject *findObjectToFree(const void *object) const;
    void checkFreePrecond(const void *object) const {
387 #if MALLOC_DEBUG
388         const char *msg = "Possible double free or heap corruption.";
389         // small objects are always at least sizeof(size_t) Byte aligned,
390         // try to check this before this dereference as for invalid objects
391         // this may be unreadable
392         MALLOC_ASSERT(isAligned(object, sizeof(size_t)), "Try to free invalid small object");
393 #if !__TBB_USE_THREAD_SANITIZER
394         // releasing to free slab
395         MALLOC_ASSERT(allocatedCount>0, msg);
396 #endif
397         // must not point to slab's header
398         MALLOC_ASSERT((uintptr_t)object - (uintptr_t)this >= sizeof(Block), msg);
399         if (startupAllocObjSizeMark == objectSize) // startup block
400             MALLOC_ASSERT(object<=bumpPtr, msg);
401         else {
402             // non-startup objects are 8 Byte aligned
403             MALLOC_ASSERT(isAligned(object, 8), "Try to free invalid small object");
404             FreeObject *toFree = findObjectToFree(object);
405 #if !__TBB_USE_THREAD_SANITIZER
406             MALLOC_ASSERT(allocatedCount <= (slabSize-sizeof(Block))/objectSize
407                           && (!bumpPtr || object>bumpPtr), msg);
408             // check against head of freeList, as this is mostly
409             // expected after double free
410             MALLOC_ASSERT(toFree != freeList, msg);
411 #endif
412             // check against head of publicFreeList, to detect double free
413             // involving foreign thread
414             MALLOC_ASSERT(toFree != publicFreeList.load(std::memory_order_relaxed), msg);
415         }
416 #else
417         suppress_unused_warning(object);
418 #endif
419     }
420     void initEmptyBlock(TLSData *tls, size_t size);
421     size_t findObjectSize(void *object) const;
    MemoryPool *getMemPool() const { return poolPtr; } // do not use on the hot path!
423 
424 protected:
425     void cleanBlockHeader();
426 
427 private:
428     static const float emptyEnoughRatio; /* Threshold on free space needed to "reactivate" a block */
429 
430     inline FreeObject *allocateFromBumpPtr();
431     inline FreeObject *findAllocatedObject(const void *address) const;
432 #if MALLOC_DEBUG
433     inline bool isProperlyPlaced(const void *object) const;
434 #endif
    inline void markOwned(TLSData *tls) {
        MALLOC_ASSERT(!tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
        ownerTid = ThreadId(); /* save the ID of the current thread */
        tlsPtr.store(tls, std::memory_order_relaxed);
    }
    inline void markOrphaned() {
        MALLOC_ASSERT(tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT);
        tlsPtr.store(nullptr, std::memory_order_relaxed);
    }
444 
445     friend class Bin;
446     friend class TLSData;
447     friend bool MemoryPool::destroy();
448 };
449 
450 const float Block::emptyEnoughRatio = 1.0 / 4.0;
451 
452 static_assert(sizeof(Block) <= 2*estimatedCacheLineSize,
453     "The class Block does not fit into 2 cache lines on this platform. "
454     "Defining USE_INTERNAL_TID may help to fix it.");
455 
456 class Bin {
457 private:
458 public:
459     Block *activeBlk;
460     std::atomic<Block*> mailbox;
461     MallocMutex mailLock;
462 
463 public:
    inline Block* getActiveBlock() const { return activeBlk; }
    void resetActiveBlock() { activeBlk = NULL; }
466     inline void setActiveBlock(Block *block);
467     inline Block* setPreviousBlockActive();
468     Block* getPrivatizedFreeListBlock();
469     void moveBlockToFront(Block *block);
470     bool cleanPublicFreeLists();
471     void processEmptyBlock(Block *block, bool poolTheBlock);
472     void addPublicFreeListBlock(Block* block);
473 
474     void outofTLSBin(Block* block);
475     void verifyTLSBin(size_t size) const;
476     void pushTLSBin(Block* block);
477 
478 #if MALLOC_DEBUG
    void verifyInitState() const {
480         MALLOC_ASSERT( !activeBlk, ASSERT_TEXT );
481         MALLOC_ASSERT( !mailbox.load(std::memory_order_relaxed), ASSERT_TEXT );
482     }
483 #endif
484 
485     friend void Block::freePublicObject (FreeObject *objectToFree);
486 };
487 
488 /********* End of the data structures                    **************/
489 
490 /*
491  * There are bins for all 8 byte aligned objects less than this segregated size; 8 bins in total
492  */
493 const uint32_t minSmallObjectIndex = 0;
494 const uint32_t numSmallObjectBins = 8;
495 const uint32_t maxSmallObjectSize = 64;
496 
497 /*
498  * There are 4 bins between each couple of powers of 2 [64-128-256-...]
499  * from maxSmallObjectSize till this size; 16 bins in total
500  */
501 const uint32_t minSegregatedObjectIndex = minSmallObjectIndex+numSmallObjectBins;
502 const uint32_t numSegregatedObjectBins = 16;
503 const uint32_t maxSegregatedObjectSize = 1024;
504 
505 /*
506  * And there are 5 bins with allocation sizes that are multiples of estimatedCacheLineSize
507  * and selected to fit 9, 6, 4, 3, and 2 allocations in a block.
508  */
509 const uint32_t minFittingIndex = minSegregatedObjectIndex+numSegregatedObjectBins;
510 const uint32_t numFittingBins = 5;
511 
512 const uint32_t fittingAlignment = estimatedCacheLineSize;
513 
514 #define SET_FITTING_SIZE(N) ( (slabSize-sizeof(Block))/N ) & ~(fittingAlignment-1)
515 // For blockSize=16*1024, sizeof(Block)=2*estimatedCacheLineSize and fittingAlignment=estimatedCacheLineSize,
516 // the comments show the fitting sizes and the amounts left unused for estimatedCacheLineSize=64/128:
517 const uint32_t fittingSize1 = SET_FITTING_SIZE(9); // 1792/1792 128/000
518 const uint32_t fittingSize2 = SET_FITTING_SIZE(6); // 2688/2688 128/000
519 const uint32_t fittingSize3 = SET_FITTING_SIZE(4); // 4032/3968 128/256
520 const uint32_t fittingSize4 = SET_FITTING_SIZE(3); // 5376/5376 128/000
521 const uint32_t fittingSize5 = SET_FITTING_SIZE(2); // 8128/8064 000/000
522 #undef SET_FITTING_SIZE
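// Worked example for estimatedCacheLineSize == 64 (slabSize == 16*1024, sizeof(Block) == 128,
// fittingAlignment == 64): SET_FITTING_SIZE(9) == ((16384 - 128) / 9) & ~63 == 1792,
// i.e. nine 1792-byte objects fit in a slab with 128 bytes left unused, matching the
// first column of the comments above.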
523 
524 /*
525  * The total number of thread-specific Block-based bins
526  */
527 const uint32_t numBlockBins = minFittingIndex+numFittingBins;
528 
529 /*
530  * Objects of this size and larger are considered large objects.
531  */
532 const uint32_t minLargeObjectSize = fittingSize5 + 1;
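// For estimatedCacheLineSize == 64 this evaluates to 8128 + 1 == 8129 (see the fittingSize5
// comment above); requests of this size and larger bypass the slab bins and are served via
// the large object path (getFromLLOCache/putToLLOCache above).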
533 
/*
 * Per-thread pool of slab blocks. The idea is to avoid sharing with other
 * threads memory that is likely still in the local cache(s) of our CPU.
 */
538 class FreeBlockPool {
539 private:
540     std::atomic<Block*> head;
541     int         size;
542     Backend    *backend;
543     bool        lastAccessMiss;
544 public:
545     static const int POOL_HIGH_MARK = 32;
546     static const int POOL_LOW_MARK  = 8;
547 
548     class ResOfGet {
549         ResOfGet();
550     public:
551         Block* block;
552         bool   lastAccMiss;
        ResOfGet(Block *b, bool lastMiss) : block(b), lastAccMiss(lastMiss) {}
    };

    // allocated in zero-initialized memory
    FreeBlockPool(Backend *bknd) : backend(bknd) {}
558     ResOfGet getBlock();
559     void returnBlock(Block *block);
560     bool externalCleanup(); // can be called by another thread
561 };
562 
563 template<int LOW_MARK, int HIGH_MARK>
564 class LocalLOCImpl {
565 private:
566     static const size_t MAX_TOTAL_SIZE = 4*1024*1024;
567     // TODO: can single-linked list be faster here?
568     LargeMemoryBlock *tail; // need it when do releasing on overflow
569     std::atomic<LargeMemoryBlock*> head;
570     size_t            totalSize;
571     int               numOfBlocks;
572 public:
573     bool put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool);
574     LargeMemoryBlock *get(size_t size);
575     bool externalCleanup(ExtMemoryPool *extMemPool);
576 #if __TBB_MALLOC_WHITEBOX_TEST
    LocalLOCImpl() : tail(NULL), head(NULL), totalSize(0), numOfBlocks(0) {}
    static size_t getMaxSize() { return MAX_TOTAL_SIZE; }
579     static const int LOC_HIGH_MARK = HIGH_MARK;
580 #else
581     // no ctor, object must be created in zero-initialized memory
582 #endif
583 };
584 
585 typedef LocalLOCImpl<8,32> LocalLOC; // set production code parameters
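// Production parameters: LOW_MARK = 8, HIGH_MARK = 32, so each thread locally caches at
// most 32 recently freed large objects, bounded also by MAX_TOTAL_SIZE = 4MB; 'tail'
// above is needed when releasing on overflow.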
586 
587 class TLSData : public TLSRemote {
588     MemoryPool   *memPool;
589 public:
590     Bin           bin[numBlockBinLimit];
591     FreeBlockPool freeSlabBlocks;
592     LocalLOC      lloc;
593     unsigned      currCacheIdx;
594 private:
595     std::atomic<bool> unused;
596 public:
    TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd) {}
    MemoryPool *getMemPool() const { return memPool; }
    Bin* getAllocationBin(size_t size);
    void release();
    bool externalCleanup(bool cleanOnlyUnused, bool cleanBins) {
602         if (!unused.load(std::memory_order_relaxed) && cleanOnlyUnused) return false;
603         // Heavy operation in terms of synchronization complexity,
604         // should be called only for the current thread
605         bool released = cleanBins ? cleanupBlockBins() : false;
606         // both cleanups to be called, and the order is not important
607         return released | lloc.externalCleanup(&memPool->extMemPool) | freeSlabBlocks.externalCleanup();
608     }
609     bool cleanupBlockBins();
    void markUsed() { unused.store(false, std::memory_order_relaxed); } // called by the owner when the TLS is touched
    void markUnused() { unused.store(true, std::memory_order_relaxed); } // can be called by a non-owner thread
612 };
613 
TLSData *TLSKey::createTLS(MemoryPool *memPool, Backend *backend)
615 {
616     MALLOC_ASSERT( sizeof(TLSData) >= sizeof(Bin) * numBlockBins + sizeof(FreeBlockPool), ASSERT_TEXT );
617     TLSData* tls = (TLSData*) memPool->bootStrapBlocks.allocate(memPool, sizeof(TLSData));
618     if ( !tls )
619         return NULL;
620     new(tls) TLSData(memPool, backend);
621     /* the block contains zeroes after bootStrapMalloc, so bins are initialized */
622 #if MALLOC_DEBUG
623     for (uint32_t i = 0; i < numBlockBinLimit; i++)
624         tls->bin[i].verifyInitState();
625 #endif
626     setThreadMallocTLS(tls);
627     memPool->extMemPool.allLocalCaches.registerThread(tls);
628     return tls;
629 }
630 
bool TLSData::cleanupBlockBins()
632 {
633     bool released = false;
634     for (uint32_t i = 0; i < numBlockBinLimit; i++) {
635         released |= bin[i].cleanPublicFreeLists();
636         // After cleaning public free lists, only the active block might be empty.
637         // Do not use processEmptyBlock because it will just restore bumpPtr.
638         Block *block = bin[i].getActiveBlock();
639         if (block && block->empty()) {
640             bin[i].outofTLSBin(block);
641             memPool->returnEmptyBlock(block, /*poolTheBlock=*/false);
642             released = true;
643         }
644     }
645     return released;
646 }
647 
bool ExtMemoryPool::releaseAllLocalCaches()
649 {
650     // Iterate all registered TLS data and clean LLOC and Slab pools
651     bool released = allLocalCaches.cleanup(/*cleanOnlyUnused=*/false);
652 
653     // Bins privatization is done only for the current thread
654     if (TLSData *tlsData = tlsPointerKey.getThreadMallocTLS())
655         released |= tlsData->cleanupBlockBins();
656 
657     return released;
658 }
659 
void AllLocalCaches::registerThread(TLSRemote *tls)
661 {
662     tls->prev = NULL;
663     MallocMutex::scoped_lock lock(listLock);
664     MALLOC_ASSERT(head!=tls, ASSERT_TEXT);
665     tls->next = head;
666     if (head)
667         head->prev = tls;
668     head = tls;
669     MALLOC_ASSERT(head->next!=head, ASSERT_TEXT);
670 }
671 
void AllLocalCaches::unregisterThread(TLSRemote *tls)
673 {
674     MallocMutex::scoped_lock lock(listLock);
675     MALLOC_ASSERT(head, "Can't unregister thread: no threads are registered.");
676     if (head == tls)
677         head = tls->next;
678     if (tls->next)
679         tls->next->prev = tls->prev;
680     if (tls->prev)
681         tls->prev->next = tls->next;
682     MALLOC_ASSERT(!tls->next || tls->next->next!=tls->next, ASSERT_TEXT);
683 }
684 
bool AllLocalCaches::cleanup(bool cleanOnlyUnused)
686 {
687     bool released = false;
688     {
689         MallocMutex::scoped_lock lock(listLock);
690         for (TLSRemote *curr=head; curr; curr=curr->next)
691             released |= static_cast<TLSData*>(curr)->externalCleanup(cleanOnlyUnused, /*cleanBins=*/false);
692     }
693     return released;
694 }
695 
void AllLocalCaches::markUnused()
697 {
698     bool locked;
699     MallocMutex::scoped_lock lock(listLock, /*block=*/false, &locked);
    if (!locked) // do not wait for marking if another thread is working with the list
701         return;
702 
703     for (TLSRemote *curr=head; curr; curr=curr->next)
704         static_cast<TLSData*>(curr)->markUnused();
705 }
706 
707 #if MALLOC_CHECK_RECURSION
708 MallocMutex RecursiveMallocCallProtector::rmc_mutex;
709 std::atomic<pthread_t> RecursiveMallocCallProtector::owner_thread;
710 std::atomic<void*> RecursiveMallocCallProtector::autoObjPtr;
711 bool        RecursiveMallocCallProtector::mallocRecursionDetected;
712 #if __FreeBSD__ || __DragonFly__
713 bool        RecursiveMallocCallProtector::canUsePthread;
714 #endif
715 
716 #endif
717 
718 /*********** End code to provide thread ID and a TLS pointer **********/
719 
720 // Parameter for isLargeObject, keeps our expectations on memory origin.
721 // Assertions must use unknownMem to reliably report object invalidity.
722 enum MemoryOrigin {
723     ourMem,    // allocated by TBB allocator
724     unknownMem // can be allocated by system allocator or TBB allocator
725 };
726 
727 template<MemoryOrigin>
728 #if __TBB_USE_THREAD_SANITIZER
729 // We have a real race when accessing the large object header for
730 // non large objects (e.g. small or foreign objects).
731 // Therefore, we need to hide this access from the thread sanitizer
732 __attribute__((no_sanitize("thread")))
733 #endif
734 bool isLargeObject(void *object);
735 static void *internalMalloc(size_t size);
736 static void internalFree(void *object);
737 static void *internalPoolMalloc(MemoryPool* mPool, size_t size);
738 static bool internalPoolFree(MemoryPool *mPool, void *object, size_t size);
739 
740 #if !MALLOC_DEBUG
741 #if __INTEL_COMPILER || _MSC_VER
742 #define NOINLINE(decl) __declspec(noinline) decl
743 #define ALWAYSINLINE(decl) __forceinline decl
744 #elif __GNUC__
745 #define NOINLINE(decl) decl __attribute__ ((noinline))
746 #define ALWAYSINLINE(decl) decl __attribute__ ((always_inline))
747 #else
748 #define NOINLINE(decl) decl
749 #define ALWAYSINLINE(decl) decl
750 #endif
751 
752 static NOINLINE( bool doInitialization() );
753 ALWAYSINLINE( bool isMallocInitialized() );
754 
755 #undef ALWAYSINLINE
756 #undef NOINLINE
757 #endif /* !MALLOC_DEBUG */
758 
759 
760 /********* Now some rough utility code to deal with indexing the size bins. **************/
761 
/*
 * Given a number, return the position of its highest set bit. It is intended to work with 32-bit values only.
 * Moreover, on some platforms, for the sake of simplicity and performance, it is narrowed to serve only values from 64 to 1023.
 * This is enough for the current algorithm of distributing sizes among bins.
 * __TBB_Log2 is not used here to minimize dependencies on TBB-specific sources.
 */
768 #if _WIN64 && _MSC_VER>=1400 && !__INTEL_COMPILER
769 extern "C" unsigned char _BitScanReverse( unsigned long* i, unsigned long w );
770 #pragma intrinsic(_BitScanReverse)
771 #endif
static inline unsigned int highestBitPos(unsigned int n)
773 {
774     MALLOC_ASSERT( n>=64 && n<1024, ASSERT_TEXT ); // only needed for bsr array lookup, but always true
775     unsigned int pos;
776 #if __ARCH_x86_32||__ARCH_x86_64
777 
778 # if __unix__||__APPLE__||__MINGW32__
779     __asm__ ("bsr %1,%0" : "=r"(pos) : "r"(n));
780 # elif (_WIN32 && (!_WIN64 || __INTEL_COMPILER))
781     __asm
782     {
783         bsr eax, n
784         mov pos, eax
785     }
786 # elif _WIN64 && _MSC_VER>=1400
787     _BitScanReverse((unsigned long*)&pos, (unsigned long)n);
788 # else
789 #   error highestBitPos() not implemented for this platform
790 # endif
791 #elif __arm__
792     __asm__ __volatile__
793     (
794        "clz %0, %1\n"
795        "rsb %0, %0, %2\n"
796        :"=r" (pos) :"r" (n), "I" (31)
797     );
798 #else
799     static unsigned int bsr[16] = {0/*N/A*/,6,7,7,8,8,8,8,9,9,9,9,9,9,9,9};
800     pos = bsr[ n>>6 ];
801 #endif /* __ARCH_* */
802     return pos;
803 }
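// Examples (all within the supported 64..1023 range): highestBitPos(64) == 6,
// highestBitPos(100) == 6, highestBitPos(1023) == 9; the result selects one of the
// four power-of-two groups used by getIndexOrObjectSize below.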
804 
unsigned int getSmallObjectIndex(unsigned int size)
806 {
807     unsigned int result = (size-1)>>3;
808     if (sizeof(void*)==8) {
809         // For 64-bit malloc, 16 byte alignment is needed except for bin 0.
810         if (result) result |= 1; // 0,1,3,5,7; bins 2,4,6 are not aligned to 16 bytes
811     }
812     return result;
813 }
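// Example: a 24-byte request gives (24-1)>>3 == 2, which is bumped to index 3 on
// 64-bit builds, so such objects are served from the 32-byte bin and stay 16-byte aligned.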
814 
815 /*
816  * Depending on indexRequest, for a given size return either the index into the bin
817  * for objects of this size, or the actual size of objects in this bin.
818  */
819 template<bool indexRequest>
static unsigned int getIndexOrObjectSize (unsigned int size)
821 {
822     if (size <= maxSmallObjectSize) { // selection from 8/16/24/32/40/48/56/64
823         unsigned int index = getSmallObjectIndex( size );
824          /* Bin 0 is for 8 bytes, bin 1 is for 16, and so forth */
825         return indexRequest ? index : (index+1)<<3;
826     }
827     else if (size <= maxSegregatedObjectSize ) { // 80/96/112/128 / 160/192/224/256 / 320/384/448/512 / 640/768/896/1024
828         unsigned int order = highestBitPos(size-1); // which group of bin sizes?
829         MALLOC_ASSERT( 6<=order && order<=9, ASSERT_TEXT );
830         if (indexRequest)
831             return minSegregatedObjectIndex - (4*6) - 4 + (4*order) + ((size-1)>>(order-2));
832         else {
833             unsigned int alignment = 128 >> (9-order); // alignment in the group
834             MALLOC_ASSERT( alignment==16 || alignment==32 || alignment==64 || alignment==128, ASSERT_TEXT );
835             return alignUp(size,alignment);
836         }
837     }
838     else {
839         if( size <= fittingSize3 ) {
840             if( size <= fittingSize2 ) {
841                 if( size <= fittingSize1 )
842                     return indexRequest ? minFittingIndex : fittingSize1;
843                 else
844                     return indexRequest ? minFittingIndex+1 : fittingSize2;
845             } else
846                 return indexRequest ? minFittingIndex+2 : fittingSize3;
847         } else {
848             if( size <= fittingSize5 ) {
849                 if( size <= fittingSize4 )
850                     return indexRequest ? minFittingIndex+3 : fittingSize4;
851                 else
852                     return indexRequest ? minFittingIndex+4 : fittingSize5;
853             } else {
854                 MALLOC_ASSERT( 0,ASSERT_TEXT ); // this should not happen
855                 return ~0U;
856             }
857         }
858     }
859 }
860 
static unsigned int getIndex (unsigned int size)
{
    return getIndexOrObjectSize</*indexRequest=*/true>(size);
}

static unsigned int getObjectSize (unsigned int size)
867 {
868     return getIndexOrObjectSize</*indexRequest=*/false>(size);
869 }
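// Worked example for the segregated range: for size == 100, highestBitPos(99) == 6, so
// getIndex(100) == 8 - 24 - 4 + 24 + (99 >> 4) == 10, and getObjectSize(100) ==
// alignUp(100, 16) == 112, i.e. the request lands in the 112-byte bin listed above.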
870 
871 
void *BootStrapBlocks::allocate(MemoryPool *memPool, size_t size)
873 {
874     FreeObject *result;
875 
876     MALLOC_ASSERT( size == sizeof(TLSData), ASSERT_TEXT );
877 
878     { // Lock with acquire
879         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
880 
881         if( bootStrapObjectList) {
882             result = bootStrapObjectList;
883             bootStrapObjectList = bootStrapObjectList->next;
884         } else {
885             if (!bootStrapBlock) {
886                 bootStrapBlock = memPool->getEmptyBlock(size);
887                 if (!bootStrapBlock) return NULL;
888             }
889             result = bootStrapBlock->bumpPtr;
890             bootStrapBlock->bumpPtr = (FreeObject *)((uintptr_t)bootStrapBlock->bumpPtr - bootStrapBlock->objectSize);
891             if ((uintptr_t)bootStrapBlock->bumpPtr < (uintptr_t)bootStrapBlock+sizeof(Block)) {
892                 bootStrapBlock->bumpPtr = NULL;
893                 bootStrapBlock->next = bootStrapBlockUsed;
894                 bootStrapBlockUsed = bootStrapBlock;
895                 bootStrapBlock = NULL;
896             }
897         }
898     } // Unlock with release
899     memset (result, 0, size);
900     return (void*)result;
901 }
902 
void BootStrapBlocks::free(void* ptr)
904 {
905     MALLOC_ASSERT( ptr, ASSERT_TEXT );
906     { // Lock with acquire
907         MallocMutex::scoped_lock scoped_cs(bootStrapLock);
908         ((FreeObject*)ptr)->next = bootStrapObjectList;
909         bootStrapObjectList = (FreeObject*)ptr;
910     } // Unlock with release
911 }
912 
void BootStrapBlocks::reset()
914 {
915     bootStrapBlock = bootStrapBlockUsed = NULL;
916     bootStrapObjectList = NULL;
917 }
918 
919 #if !(FREELIST_NONBLOCKING)
920 static MallocMutex publicFreeListLock; // lock for changes of publicFreeList
921 #endif
922 
923 /********* End rough utility code  **************/
924 
925 /* LifoList assumes zero initialization so a vector of it can be created
926  * by just allocating some space with no call to constructor.
927  * On Linux, it seems to be necessary to avoid linking with C++ libraries.
928  *
929  * By usage convention there is no race on the initialization. */
LifoList::LifoList( ) : top(nullptr)
931 {
932     // MallocMutex assumes zero initialization
933     memset(&lock, 0, sizeof(MallocMutex));
934 }
935 
void LifoList::push(Block *block)
937 {
938     MallocMutex::scoped_lock scoped_cs(lock);
939     block->next = top.load(std::memory_order_relaxed);
940     top.store(block, std::memory_order_relaxed);
941 }
942 
Block *LifoList::pop()
944 {
945     Block* block = nullptr;
946     if (top.load(std::memory_order_relaxed)) {
947         MallocMutex::scoped_lock scoped_cs(lock);
948         block = top.load(std::memory_order_relaxed);
949         if (block) {
950             top.store(block->next, std::memory_order_relaxed);
951         }
952     }
953     return block;
954 }
955 
Block *LifoList::grab()
957 {
958     Block *block = nullptr;
959     if (top.load(std::memory_order_relaxed)) {
960         MallocMutex::scoped_lock scoped_cs(lock);
961         block = top.load(std::memory_order_relaxed);
962         top.store(nullptr, std::memory_order_relaxed);
963     }
964     return block;
965 }
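// Note: pop() and grab() read 'top' outside the lock only as a fast path to detect an
// empty list; the value is re-read under the lock before it is actually used.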
966 
967 /********* Thread and block related code      *************/
968 
template<bool poolDestroy> void AllLargeBlocksList::releaseAll(Backend *backend) {
970      LargeMemoryBlock *next, *lmb = loHead;
971      loHead = NULL;
972 
973      for (; lmb; lmb = next) {
974          next = lmb->gNext;
975          if (poolDestroy) {
976              // as it's pool destruction, no need to return object to backend,
977              // only remove backrefs, as they are global
978              removeBackRef(lmb->backRefIdx);
979          } else {
980              // clean g(Next|Prev) to prevent removing lmb
981              // from AllLargeBlocksList inside returnLargeObject
982              lmb->gNext = lmb->gPrev = NULL;
983              backend->returnLargeObject(lmb);
984          }
985      }
986 }
987 
TLSData* MemoryPool::getTLS(bool create)
989 {
990     TLSData* tls = extMemPool.tlsPointerKey.getThreadMallocTLS();
991     if (create && !tls)
992         tls = extMemPool.tlsPointerKey.createTLS(this, &extMemPool.backend);
993     return tls;
994 }
995 
996 /*
997  * Return the bin for the given size.
998  */
inline Bin* TLSData::getAllocationBin(size_t size)
1000 {
1001     return bin + getIndex(size);
1002 }
1003 
1004 /* Return an empty uninitialized block in a non-blocking fashion. */
Block *MemoryPool::getEmptyBlock(size_t size)
1006 {
1007     TLSData* tls = getTLS(/*create=*/false);
1008     // try to use per-thread cache, if TLS available
1009     FreeBlockPool::ResOfGet resOfGet = tls?
1010         tls->freeSlabBlocks.getBlock() : FreeBlockPool::ResOfGet(NULL, false);
1011     Block *result = resOfGet.block;
1012 
    if (!result) { // not found in the local cache, ask the backend for slabs
1014         int num = resOfGet.lastAccMiss? Backend::numOfSlabAllocOnMiss : 1;
1015         BackRefIdx backRefIdx[Backend::numOfSlabAllocOnMiss];
1016 
1017         result = static_cast<Block*>(extMemPool.backend.getSlabBlock(num));
1018         if (!result) return NULL;
1019 
1020         if (!extMemPool.userPool())
1021             for (int i=0; i<num; i++) {
1022                 backRefIdx[i] = BackRefIdx::newBackRef(/*largeObj=*/false);
1023                 if (backRefIdx[i].isInvalid()) {
1024                     // roll back resource allocation
1025                     for (int j=0; j<i; j++)
1026                         removeBackRef(backRefIdx[j]);
1027                     Block *b = result;
1028                     for (int j=0; j<num; b=(Block*)((uintptr_t)b+slabSize), j++)
1029                         extMemPool.backend.putSlabBlock(b);
1030                     return NULL;
1031                 }
1032             }
1033         // resources were allocated, register blocks
1034         Block *b = result;
1035         for (int i=0; i<num; b=(Block*)((uintptr_t)b+slabSize), i++) {
1036             // slab block in user's pool must have invalid backRefIdx
1037             if (extMemPool.userPool()) {
1038                 new (&b->backRefIdx) BackRefIdx();
1039             } else {
1040                 setBackRef(backRefIdx[i], b);
1041                 b->backRefIdx = backRefIdx[i];
1042             }
1043             b->tlsPtr.store(tls, std::memory_order_relaxed);
1044             b->poolPtr = this;
1045             // all but first one go to per-thread pool
1046             if (i > 0) {
1047                 MALLOC_ASSERT(tls, ASSERT_TEXT);
1048                 tls->freeSlabBlocks.returnBlock(b);
1049             }
1050         }
1051     }
1052     MALLOC_ASSERT(result, ASSERT_TEXT);
1053     result->initEmptyBlock(tls, size);
1054     STAT_increment(getThreadId(), getIndex(result->objectSize), allocBlockNew);
1055     return result;
1056 }
1057 
void MemoryPool::returnEmptyBlock(Block *block, bool poolTheBlock)
1059 {
1060     block->reset();
1061     if (poolTheBlock) {
1062         getTLS(/*create=*/false)->freeSlabBlocks.returnBlock(block);
1063     } else {
1064         // slab blocks in user's pools do not have valid backRefIdx
1065         if (!extMemPool.userPool())
1066             removeBackRef(*(block->getBackRefIdx()));
1067         extMemPool.backend.putSlabBlock(block);
1068     }
1069 }
1070 
bool ExtMemoryPool::init(intptr_t poolId, rawAllocType rawAlloc,
1072                          rawFreeType rawFree, size_t granularity,
1073                          bool keepAllMemory, bool fixedPool)
1074 {
1075     this->poolId = poolId;
1076     this->rawAlloc = rawAlloc;
1077     this->rawFree = rawFree;
1078     this->granularity = granularity;
1079     this->keepAllMemory = keepAllMemory;
1080     this->fixedPool = fixedPool;
1081     this->delayRegsReleasing = false;
1082     if (!initTLS())
1083         return false;
1084     loc.init(this);
1085     backend.init(this);
1086     MALLOC_ASSERT(isPoolValid(), NULL);
1087     return true;
1088 }
1089 
bool ExtMemoryPool::initTLS() { return tlsPointerKey.init(); }

bool MemoryPool::init(intptr_t poolId, const MemPoolPolicy *policy)
1093 {
1094     if (!extMemPool.init(poolId, policy->pAlloc, policy->pFree,
1095                policy->granularity? policy->granularity : defaultGranularity,
1096                policy->keepAllMemory, policy->fixedPool))
1097         return false;
1098     {
1099         MallocMutex::scoped_lock lock(memPoolListLock);
1100         next = defaultMemPool->next;
1101         defaultMemPool->next = this;
1102         prev = defaultMemPool;
1103         if (next)
1104             next->prev = this;
1105     }
1106     return true;
1107 }
1108 
bool MemoryPool::reset()
1110 {
1111     MALLOC_ASSERT(extMemPool.userPool(), "No reset for the system pool.");
    // memory is not released during pool reset
    // TODO: mark unused regions to be released on the next reset()
1114     extMemPool.delayRegionsReleasing(true);
1115 
1116     bootStrapBlocks.reset();
1117     extMemPool.lmbList.releaseAll</*poolDestroy=*/false>(&extMemPool.backend);
1118     if (!extMemPool.reset())
1119         return false;
1120 
1121     if (!extMemPool.initTLS())
1122         return false;
1123     extMemPool.delayRegionsReleasing(false);
1124     return true;
1125 }
1126 
bool MemoryPool::destroy()
1128 {
1129 #if __TBB_MALLOC_LOCACHE_STAT
1130     extMemPool.loc.reportStat(stdout);
1131 #endif
1132 #if __TBB_MALLOC_BACKEND_STAT
1133     extMemPool.backend.reportStat(stdout);
1134 #endif
1135     {
1136         MallocMutex::scoped_lock lock(memPoolListLock);
1137         // remove itself from global pool list
1138         if (prev)
1139             prev->next = next;
1140         if (next)
1141             next->prev = prev;
1142     }
1143     // slab blocks in non-default pool do not have backreferences,
1144     // only large objects do
1145     if (extMemPool.userPool())
1146         extMemPool.lmbList.releaseAll</*poolDestroy=*/true>(&extMemPool.backend);
1147     else {
1148         // only one non-userPool() is supported now
1149         MALLOC_ASSERT(this==defaultMemPool, NULL);
        // Here and below in extMemPool.destroy(), the initial state is not restored
        // for a user pool, because it is just about to be released. But for the system
        // pool the state is restored, since we do not want to have to zero it again on
        // a subsequent reload.
1153         bootStrapBlocks.reset();
1154         extMemPool.orphanedBlocks.reset();
1155     }
1156     return extMemPool.destroy();
1157 }
1158 
void MemoryPool::onThreadShutdown(TLSData *tlsData)
1160 {
1161     if (tlsData) { // might be called for "empty" TLS
1162         tlsData->release();
1163         bootStrapBlocks.free(tlsData);
1164         clearTLS();
1165     }
1166 }
1167 
1168 #if MALLOC_DEBUG
void Bin::verifyTLSBin (size_t size) const
1170 {
1171 /* The debug version verifies the TLSBin as needed */
1172     uint32_t objSize = getObjectSize(size);
1173 
1174     if (activeBlk) {
1175         MALLOC_ASSERT( activeBlk->isOwnedByCurrentThread(), ASSERT_TEXT );
1176         MALLOC_ASSERT( activeBlk->objectSize == objSize, ASSERT_TEXT );
1177 #if MALLOC_DEBUG>1
1178         for (Block* temp = activeBlk->next; temp; temp=temp->next) {
1179             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1180             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1181             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1182             MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1183             if (temp->next) {
1184                 MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1185             }
1186         }
1187         for (Block* temp = activeBlk->previous; temp; temp=temp->previous) {
1188             MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT );
1189             MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT );
1190             MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT );
1191             MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT );
1192             if (temp->previous) {
1193                 MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT );
1194             }
1195         }
1196 #endif /* MALLOC_DEBUG>1 */
1197     }
1198 }
1199 #else /* MALLOC_DEBUG */
inline void Bin::verifyTLSBin (size_t) const { }
1201 #endif /* MALLOC_DEBUG */
1202 
1203 /*
1204  * Add a block to the start of this tls bin list.
1205  */
void Bin::pushTLSBin(Block* block)
1207 {
1208     /* The objectSize should be defined and not a parameter
1209        because the function is applied to partially filled blocks as well */
1210     unsigned int size = block->objectSize;
1211 
1212     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1213     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1214     MALLOC_ASSERT( block->next == NULL, ASSERT_TEXT );
1215     MALLOC_ASSERT( block->previous == NULL, ASSERT_TEXT );
1216 
1217     MALLOC_ASSERT( this, ASSERT_TEXT );
1218     verifyTLSBin(size);
1219 
1220     block->next = activeBlk;
1221     if( activeBlk ) {
1222         block->previous = activeBlk->previous;
1223         activeBlk->previous = block;
1224         if( block->previous )
1225             block->previous->next = block;
1226     } else {
1227         activeBlk = block;
1228     }
1229 
1230     verifyTLSBin(size);
1231 }
1232 
1233 /*
1234  * Take a block out of its tls bin (e.g. before removal).
1235  */
void Bin::outofTLSBin(Block* block)
1237 {
1238     unsigned int size = block->objectSize;
1239 
1240     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1241     MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT );
1242 
1243     MALLOC_ASSERT( this, ASSERT_TEXT );
1244     verifyTLSBin(size);
1245 
1246     if (block == activeBlk) {
1247         activeBlk = block->previous? block->previous : block->next;
1248     }
1249     /* Unlink the block */
1250     if (block->previous) {
1251         MALLOC_ASSERT( block->previous->next == block, ASSERT_TEXT );
1252         block->previous->next = block->next;
1253     }
1254     if (block->next) {
1255         MALLOC_ASSERT( block->next->previous == block, ASSERT_TEXT );
1256         block->next->previous = block->previous;
1257     }
1258     block->next = NULL;
1259     block->previous = NULL;
1260 
1261     verifyTLSBin(size);
1262 }
1263 
Block* Bin::getPrivatizedFreeListBlock()
1265 {
1266     Block* block;
1267     MALLOC_ASSERT( this, ASSERT_TEXT );
1268     // if this method is called, active block usage must be unsuccessful
1269     MALLOC_ASSERT( !activeBlk && !mailbox.load(std::memory_order_relaxed) || activeBlk && activeBlk->isFull, ASSERT_TEXT );
1270 
1271 // the counter should be changed    STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1272     if (!mailbox.load(std::memory_order_acquire)) // hotpath is empty mailbox
1273         return NULL;
1274     else { // mailbox is not empty, take lock and inspect it
1275         MallocMutex::scoped_lock scoped_cs(mailLock);
1276         block = mailbox.load(std::memory_order_relaxed);
1277         if( block ) {
1278             MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1279             MALLOC_ASSERT( !isNotForUse(block->nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1280             mailbox.store(block->nextPrivatizable.load(std::memory_order_relaxed), std::memory_order_relaxed);
1281             block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1282         }
1283     }
1284     if( block ) {
1285         MALLOC_ASSERT( isSolidPtr(block->publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1286         block->privatizePublicFreeList();
1287         block->adjustPositionInBin(this);
1288     }
1289     return block;
1290 }
1291 
void Bin::addPublicFreeListBlock(Block* block)
1293 {
1294     MallocMutex::scoped_lock scoped_cs(mailLock);
1295     block->nextPrivatizable.store(mailbox.load(std::memory_order_relaxed), std::memory_order_relaxed);
1296     mailbox.store(block, std::memory_order_relaxed);
1297 }
1298 
1299 // Process publicly freed objects in all blocks and return empty blocks
1300 // to the backend in order to reduce overall footprint.
bool Bin::cleanPublicFreeLists()
1302 {
1303     Block* block;
1304     if (!mailbox.load(std::memory_order_acquire))
1305         return false;
1306     else {
1307         // Grab all the blocks in the mailbox
1308         MallocMutex::scoped_lock scoped_cs(mailLock);
1309         block = mailbox.load(std::memory_order_relaxed);
1310         mailbox.store(NULL, std::memory_order_relaxed);
1311     }
1312     bool released = false;
1313     while (block) {
1314         MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1315         Block* tmp = block->nextPrivatizable.load(std::memory_order_relaxed);
1316         block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed);
1317         block->privatizePublicFreeList();
1318         if (block->empty()) {
1319             processEmptyBlock(block, /*poolTheBlock=*/false);
1320             released = true;
1321         } else
1322             block->adjustPositionInBin(this);
1323         block = tmp;
1324     }
1325     return released;
1326 }
1327 
bool Block::adjustFullness()
1329 {
1330     if (bumpPtr) {
1331         /* If we are still using a bump ptr for this block it is empty enough to use. */
1332         STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1333         isFull = false;
1334     } else {
1335         const float threshold = (slabSize - sizeof(Block)) * (1 - emptyEnoughRatio);
        /* allocatedCount shows how many objects in the block are in use; however, it still counts
         * objects freed by other threads; so a prior call to privatizePublicFreeList() is recommended */
1338         isFull = (allocatedCount*objectSize > threshold) ? true : false;
1339 #if COLLECT_STATISTICS
1340         if (isFull)
1341             STAT_increment(getThreadId(), getIndex(objectSize), examineNotEmpty);
1342         else
1343             STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough);
1344 #endif
1345     }
1346     return isFull;
1347 }
1348 
1349 // This method resides in class Block, and not in class Bin, in order to avoid
1350 // calling getAllocationBin on a reasonably hot path in Block::freeOwnObject
void Block::adjustPositionInBin(Bin* bin/*=NULL*/)
1352 {
1353     // If the block were full, but became empty enough to use,
1354     // move it to the front of the list
1355     if (isFull && !adjustFullness()) {
1356         if (!bin)
1357             bin = tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize);
1358         bin->moveBlockToFront(this);
1359     }
1360 }
1361 
/* Restore the bump pointer for an empty block that is planned to be used */
void Block::restoreBumpPtr()
1364 {
1365     MALLOC_ASSERT( allocatedCount == 0, ASSERT_TEXT );
1366     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1367     STAT_increment(getThreadId(), getIndex(objectSize), freeRestoreBumpPtr);
1368     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1369     freeList = NULL;
1370     isFull = false;
1371 }
1372 
void Block::freeOwnObject(void *object)
1374 {
1375     tlsPtr.load(std::memory_order_relaxed)->markUsed();
1376     allocatedCount--;
1377     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1378 #if COLLECT_STATISTICS
1379     // Note that getAllocationBin is not called on the hottest path with statistics off.
1380     if (tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->getActiveBlock() != this)
1381         STAT_increment(getThreadId(), getIndex(objectSize), freeToInactiveBlock);
1382     else
1383         STAT_increment(getThreadId(), getIndex(objectSize), freeToActiveBlock);
1384 #endif
1385     if (empty()) {
1386         // If the last object of a slab is freed, the slab cannot be marked full
1387         MALLOC_ASSERT(!isFull, ASSERT_TEXT);
1388         tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->processEmptyBlock(this, /*poolTheBlock=*/true);
1389     } else { // hot path
1390         FreeObject *objectToFree = findObjectToFree(object);
1391         objectToFree->next = freeList;
1392         freeList = objectToFree;
1393         adjustPositionInBin();
1394     }
1395 }
1396 
void Block::freePublicObject (FreeObject *objectToFree)
1398 {
1399     FreeObject* localPublicFreeList{};
1400 
1401     MALLOC_ITT_SYNC_RELEASING(&publicFreeList);
1402 #if FREELIST_NONBLOCKING
1403     // TBB_REVAMP_TODO: make it non atomic in non-blocking scenario
1404     localPublicFreeList = publicFreeList.load(std::memory_order_relaxed);
1405     do {
1406         objectToFree->next = localPublicFreeList;
1407         // no backoff necessary because we are trying to make a change, not waiting for one
1408     } while( !publicFreeList.compare_exchange_strong(localPublicFreeList, objectToFree) );
1409 #else
1410     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1411     {
1412         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1413         localPublicFreeList = objectToFree->next = publicFreeList;
1414         publicFreeList = objectToFree;
1415     }
1416 #endif
1417 
1418     if( localPublicFreeList==NULL ) {
1419         // if the block is abandoned, its nextPrivatizable pointer should be UNUSABLE;
1420         // otherwise, it should point to the bin the block belongs to.
1421         // Reading nextPrivatizable is thread-safe below, because:
1422         // 1) the executing thread atomically got publicFreeList==NULL and changed it to non-NULL;
1423         // 2) only the owning thread can change it back to NULL,
1424         // 3) but that cannot be done until the block is put into the mailbox.
1425         // So the executing thread is now the only one that can change nextPrivatizable
1426         Block* next = nextPrivatizable.load(std::memory_order_acquire);
1427         if( !isNotForUse(next) ) {
1428             MALLOC_ASSERT( next!=nullptr, ASSERT_TEXT );
1429             Bin* theBin = (Bin*) next;
1430 #if MALLOC_DEBUG && TBB_REVAMP_TODO
1431             // FIXME: The thread that returns the block is not the block's owner.
1432             // The below assertion compares 'theBin' against the caller's local bin, thus, it always fails.
1433             // Need to find a way to get the correct remote bin for comparison.
1434             { // check that nextPrivatizable points to the bin the block belongs to
1435                 uint32_t index = getIndex( objectSize );
1436                 TLSData* tls = getThreadMallocTLS();
1437                 MALLOC_ASSERT( theBin==tls->bin+index, ASSERT_TEXT );
1438             }
1439 #endif // MALLOC_DEBUG
1440             theBin->addPublicFreeListBlock(this);
1441         }
1442     }
1443     STAT_increment(getThreadId(), ThreadCommonCounters, freeToOtherThread);
1444     STAT_increment(ownerTid.load(std::memory_order_relaxed), getIndex(objectSize), freeByOtherThread);
1445 }
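
// Illustrative sketch (not part of the allocator): the publicFreeList update above is a classic
// lock-free LIFO push. The standalone example below shows the same CAS loop on std::atomic;
// the Node type and function name are hypothetical, for illustration only.
#if 0 // standalone example, excluded from the build
#include <atomic>

struct Node { Node *next; };

// Push a node onto a Treiber-style LIFO stack shared between threads.
void lifoPush(std::atomic<Node*> &head, Node *n) {
    Node *oldHead = head.load(std::memory_order_relaxed);
    do {
        n->next = oldHead;
        // compare_exchange_weak reloads oldHead on failure, so no backoff is needed:
        // we are trying to make a change, not waiting for one.
    } while (!head.compare_exchange_weak(oldHead, n,
                 std::memory_order_release, std::memory_order_relaxed));
}
#endif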
1446 
1447 // Make objects freed by other threads available for use again
privatizePublicFreeList(bool reset)1448 void Block::privatizePublicFreeList( bool reset )
1449 {
1450     FreeObject *localPublicFreeList;
1451     // If reset is false, publicFreeList should not be zeroed but set to UNUSABLE
1452     // to properly synchronize with other threads freeing objects to this slab.
1453     const intptr_t endMarker = reset ? 0 : UNUSABLE;
1454 
1455     // Only the owner thread may reset the pointer to NULL
1456     MALLOC_ASSERT( isOwnedByCurrentThread() || !reset, ASSERT_TEXT );
1457 #if FREELIST_NONBLOCKING
1458     localPublicFreeList = publicFreeList.exchange((FreeObject*)endMarker);
1459 #else
1460     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1461     {
1462         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1463         localPublicFreeList = publicFreeList;
1464         publicFreeList = endMarker;
1465     }
1466 #endif
1467     MALLOC_ITT_SYNC_ACQUIRED(&publicFreeList);
1468     MALLOC_ASSERT( !(reset && isNotForUse(publicFreeList)), ASSERT_TEXT );
1469 
1470     // publicFreeList must have been UNUSABLE or valid, but not NULL
1471     MALLOC_ASSERT( localPublicFreeList!=NULL, ASSERT_TEXT );
1472     if( isSolidPtr(localPublicFreeList) ) {
1473         MALLOC_ASSERT( allocatedCount <= (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1474         /* other threads did not decrement the counter when freeing our objects, so do it now */
1475         allocatedCount--;
1476         FreeObject *temp = localPublicFreeList;
1477         while( isSolidPtr(temp->next) ){ // the list will end with either NULL or UNUSABLE
1478             temp = temp->next;
1479             allocatedCount--;
1480             MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
1481         }
1482         /* merge with local freeList */
1483         temp->next = freeList;
1484         freeList = localPublicFreeList;
1485         STAT_increment(getThreadId(), getIndex(objectSize), allocPrivatized);
1486     }
1487 }
1488 
privatizeOrphaned(TLSData * tls,unsigned index)1489 void Block::privatizeOrphaned(TLSData *tls, unsigned index)
1490 {
1491     Bin* bin = tls->bin + index;
1492     STAT_increment(getThreadId(), index, allocBlockPublic);
1493     next = NULL;
1494     previous = NULL;
1495     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) != NULL, ASSERT_TEXT );
1496     /* There is no race here since no other thread owns this block */
1497     markOwned(tls);
1498     // It is safe to change nextPrivatizable, as publicFreeList is not null
1499     MALLOC_ASSERT( isNotForUse(nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT );
1500     nextPrivatizable.store((Block*)bin, std::memory_order_relaxed);
1501     // the next call is required to change publicFreeList to 0
1502     privatizePublicFreeList();
1503     if( empty() ) {
1504         restoreBumpPtr();
1505     } else {
1506         adjustFullness(); // check the block fullness and set isFull
1507     }
1508     MALLOC_ASSERT( !isNotForUse(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1509 }
1510 
1511 
readyToShare()1512 bool Block::readyToShare()
1513 {
1514     FreeObject* oldVal = NULL;
1515 #if FREELIST_NONBLOCKING
1516     publicFreeList.compare_exchange_strong(oldVal, (FreeObject*)UNUSABLE);
1517 #else
1518     STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList);
1519     {
1520         MallocMutex::scoped_lock scoped_cs(publicFreeListLock);
1521         if ( (oldVal=publicFreeList)==NULL )
1522             (intptr_t&)(publicFreeList) = UNUSABLE;
1523     }
1524 #endif
1525     return oldVal==NULL;
1526 }
1527 
shareOrphaned(intptr_t binTag,unsigned index)1528 void Block::shareOrphaned(intptr_t binTag, unsigned index)
1529 {
1530     MALLOC_ASSERT( binTag, ASSERT_TEXT );
1531     // unreferenced formal parameter warning
1532     tbb::detail::suppress_unused_warning(index);
1533     STAT_increment(getThreadId(), index, freeBlockPublic);
1534     markOrphaned();
1535     if ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1536         // First check passed: the block is not in mailbox yet.
1537         // Need to set publicFreeList to non-zero, so other threads
1538         // will not change nextPrivatizable and it can be zeroed.
1539         if ( !readyToShare() ) {
1540             // another thread freed an object; we need to wait until it finishes.
1541             // There is no need for exponential backoff, as the wait here is not for a lock;
1542             // but we need to yield, so the thread we are waiting for has a chance to run.
1543             // TODO: add a pause to also be friendly to hyperthreads
1544             int count = 256;
1545             while ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) {
1546                 if (--count==0) {
1547                     do_yield();
1548                     count = 256;
1549                 }
1550             }
1551         }
1552     }
1553     MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) !=NULL, ASSERT_TEXT );
1554     // now it is safe to change our data
1555     previous = NULL;
1556     // it is the caller's responsibility to ensure that the list of blocks
1557     // formed by nextPrivatizable pointers is kept consistent if required;
1558     // if this is only called from thread shutdown code, it does not matter.
1559     nextPrivatizable.store((Block*)UNUSABLE, std::memory_order_relaxed);
1560 }
1561 
cleanBlockHeader()1562 void Block::cleanBlockHeader()
1563 {
1564     next = nullptr;
1565     previous = nullptr;
1566     freeList = nullptr;
1567     allocatedCount = 0;
1568     isFull = false;
1569     tlsPtr.store(nullptr, std::memory_order_relaxed);
1570 
1571     publicFreeList.store(nullptr, std::memory_order_relaxed);
1572 }
1573 
initEmptyBlock(TLSData * tls,size_t size)1574 void Block::initEmptyBlock(TLSData *tls, size_t size)
1575 {
1576     // Having getIndex and getObjectSize called next to each other
1577     // allows better compiler optimization as they basically share the code.
1578     unsigned int index = getIndex(size);
1579     unsigned int objSz = getObjectSize(size);
1580 
1581     cleanBlockHeader();
1582     objectSize = objSz;
1583     markOwned(tls);
1584     // bump pointer should be prepared for the first allocation - thus move it down by objectSize
1585     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize);
1586 
1587     // each block should have the address where the head of the list of "privatizable" blocks is kept;
1588     // the only exception is a bootstrap block, which is initialized while TLS is still NULL
1589     nextPrivatizable.store( tls? (Block*)(tls->bin + index) : nullptr, std::memory_order_relaxed);
1590     TRACEF(( "[ScalableMalloc trace] Empty block %p is initialized, owner is %ld, objectSize is %d, bumpPtr is %p\n",
1591              this, tlsPtr.load(std::memory_order_relaxed) ? getThreadId() : -1, objectSize, bumpPtr ));
1592 }
1593 
get(TLSData * tls,unsigned int size)1594 Block *OrphanedBlocks::get(TLSData *tls, unsigned int size)
1595 {
1596     // TODO: try to use index from getAllocationBin
1597     unsigned int index = getIndex(size);
1598     Block *block = bins[index].pop();
1599     if (block) {
1600         MALLOC_ITT_SYNC_ACQUIRED(bins+index);
1601         block->privatizeOrphaned(tls, index);
1602     }
1603     return block;
1604 }
1605 
put(intptr_t binTag,Block * block)1606 void OrphanedBlocks::put(intptr_t binTag, Block *block)
1607 {
1608     unsigned int index = getIndex(block->getSize());
1609     block->shareOrphaned(binTag, index);
1610     MALLOC_ITT_SYNC_RELEASING(bins+index);
1611     bins[index].push(block);
1612 }
1613 
reset()1614 void OrphanedBlocks::reset()
1615 {
1616     for (uint32_t i=0; i<numBlockBinLimit; i++)
1617         new (bins+i) LifoList();
1618 }
1619 
cleanup(Backend * backend)1620 bool OrphanedBlocks::cleanup(Backend* backend)
1621 {
1622     bool released = false;
1623     for (uint32_t i=0; i<numBlockBinLimit; i++) {
1624         Block* block = bins[i].grab();
1625         MALLOC_ITT_SYNC_ACQUIRED(bins+i);
1626         while (block) {
1627             Block* next = block->next;
1628             block->privatizePublicFreeList( /*reset=*/false ); // do not set publicFreeList to NULL
1629             if (block->empty()) {
1630                 block->reset();
1631                 // slab blocks in user's pools do not have valid backRefIdx
1632                 if (!backend->inUserPool())
1633                     removeBackRef(*(block->getBackRefIdx()));
1634                 backend->putSlabBlock(block);
1635                 released = true;
1636             } else {
1637                 MALLOC_ITT_SYNC_RELEASING(bins+i);
1638                 bins[i].push(block);
1639             }
1640             block = next;
1641         }
1642     }
1643     return released;
1644 }
1645 
getBlock()1646 FreeBlockPool::ResOfGet FreeBlockPool::getBlock()
1647 {
1648     Block *b = head.exchange(NULL);
1649 
1650     if (b) {
1651         size--;
1652         Block *newHead = b->next;
1653         lastAccessMiss = false;
1654         head.store(newHead, std::memory_order_release);
1655     } else {
1656         lastAccessMiss = true;
1657     }
1658     return ResOfGet(b, lastAccessMiss);
1659 }
1660 
returnBlock(Block * block)1661 void FreeBlockPool::returnBlock(Block *block)
1662 {
1663     MALLOC_ASSERT( size <= POOL_HIGH_MARK, ASSERT_TEXT );
1664     Block *localHead = head.exchange(NULL);
1665 
1666     if (!localHead) {
1667         size = 0; // head was stolen by externalCleanup, correct size accordingly
1668     } else if (size == POOL_HIGH_MARK) {
1669         // release cold blocks and add the hot one:
1670         // keep POOL_LOW_MARK-1 blocks and add the new block to the head
1671         Block *headToFree = localHead, *helper;
1672         for (int i=0; i<POOL_LOW_MARK-2; i++)
1673             headToFree = headToFree->next;
1674         Block *last = headToFree;
1675         headToFree = headToFree->next;
1676         last->next = NULL;
1677         size = POOL_LOW_MARK-1;
1678         for (Block *currBl = headToFree; currBl; currBl = helper) {
1679             helper = currBl->next;
1680             // slab blocks in user's pools do not have valid backRefIdx
1681             if (!backend->inUserPool())
1682                 removeBackRef(currBl->backRefIdx);
1683             backend->putSlabBlock(currBl);
1684         }
1685     }
1686     size++;
1687     block->next = localHead;
1688     head.store(block, std::memory_order_release);
1689 }
1690 
externalCleanup()1691 bool FreeBlockPool::externalCleanup()
1692 {
1693     Block *helper;
1694     bool released = false;
1695 
1696     for (Block *currBl=head.exchange(NULL); currBl; currBl=helper) {
1697         helper = currBl->next;
1698         // slab blocks in user's pools do not have valid backRefIdx
1699         if (!backend->inUserPool())
1700             removeBackRef(currBl->backRefIdx);
1701         backend->putSlabBlock(currBl);
1702         released = true;
1703     }
1704     return released;
1705 }
1706 
1707 /* Prepare the block for returning to FreeBlockPool */
reset()1708 void Block::reset()
1709 {
1710     // it is caller's responsibility to ensure no data is lost before calling this
1711     MALLOC_ASSERT( allocatedCount==0, ASSERT_TEXT );
1712     MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT );
1713     if (!isStartupAllocObject())
1714         STAT_increment(getThreadId(), getIndex(objectSize), freeBlockBack);
1715 
1716     cleanBlockHeader();
1717 
1718     nextPrivatizable.store(nullptr, std::memory_order_relaxed);
1719 
1720     objectSize = 0;
1721     // for an empty block, bump pointer should point right after the end of the block
1722     bumpPtr = (FreeObject *)((uintptr_t)this + slabSize);
1723 }
1724 
setActiveBlock(Block * block)1725 inline void Bin::setActiveBlock (Block *block)
1726 {
1727 //    MALLOC_ASSERT( bin, ASSERT_TEXT );
1728     MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT );
1729     // it is the caller's responsibility to keep the bin consistent (i.e. ensure this block is in the bin's list)
1730     activeBlk = block;
1731 }
1732 
setPreviousBlockActive()1733 inline Block* Bin::setPreviousBlockActive()
1734 {
1735     MALLOC_ASSERT( activeBlk, ASSERT_TEXT );
1736     Block* temp = activeBlk->previous;
1737     if( temp ) {
1738         MALLOC_ASSERT( !(temp->isFull), ASSERT_TEXT );
1739         activeBlk = temp;
1740     }
1741     return temp;
1742 }
1743 
isOwnedByCurrentThread() const1744 inline bool Block::isOwnedByCurrentThread() const {
1745     return tlsPtr.load(std::memory_order_relaxed) && ownerTid.isCurrentThreadId();
1746 }
1747 
findObjectToFree(const void * object) const1748 FreeObject *Block::findObjectToFree(const void *object) const
1749 {
1750     FreeObject *objectToFree;
1751     // Due to aligned allocations, a pointer passed to scalable_free
1752     // might differ from the address of internally allocated object.
1753     // Small objects however should always be fine.
1754     if (objectSize <= maxSegregatedObjectSize)
1755         objectToFree = (FreeObject*)object;
1756     // "Fitting size" allocations are suspicious if aligned more strictly than their natural alignment
1757     else {
1758         if ( ! isAligned(object,2*fittingAlignment) )
1759             // TODO: the above check is questionable - it gives false negatives in ~50% of cases,
1760             //       so it might even be slower on average than unconditional use of findAllocatedObject.
1761             // here it should be a "real" object
1762             objectToFree = (FreeObject*)object;
1763         else
1764             // here object can be an aligned address, so applying additional checks
1765             objectToFree = findAllocatedObject(object);
1766         MALLOC_ASSERT( isAligned(objectToFree,fittingAlignment), ASSERT_TEXT );
1767     }
1768     MALLOC_ASSERT( isProperlyPlaced(objectToFree), ASSERT_TEXT );
1769 
1770     return objectToFree;
1771 }
1772 
release()1773 void TLSData::release()
1774 {
1775     memPool->extMemPool.allLocalCaches.unregisterThread(this);
1776     externalCleanup(/*cleanOnlyUnused=*/false, /*cleanBins=*/false);
1777 
1778     for (unsigned index = 0; index < numBlockBins; index++) {
1779         Block *activeBlk = bin[index].getActiveBlock();
1780         if (!activeBlk)
1781             continue;
1782         Block *threadlessBlock = activeBlk->previous;
1783         bool syncOnMailbox = false;
1784         while (threadlessBlock) {
1785             Block *threadBlock = threadlessBlock->previous;
1786             if (threadlessBlock->empty()) {
1787                 /* the thread is being destroyed, so do not use its block pool */
1788                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1789             } else {
1790                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1791                 syncOnMailbox = true;
1792             }
1793             threadlessBlock = threadBlock;
1794         }
1795         threadlessBlock = activeBlk;
1796         while (threadlessBlock) {
1797             Block *threadBlock = threadlessBlock->next;
1798             if (threadlessBlock->empty()) {
1799                 /* the thread is being destroyed, so do not use its block pool */
1800                 memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false);
1801             } else {
1802                 memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock);
1803                 syncOnMailbox = true;
1804             }
1805             threadlessBlock = threadBlock;
1806         }
1807         bin[index].resetActiveBlock();
1808 
1809         if (syncOnMailbox) {
1810             // Although we synchronized on nextPrivatizable inside the block, we still need to
1811             // synchronize on the bin's lifetime because a thread releasing an object onto the public
1812             // free list touches the bin (mailbox and mailLock)
1813             MallocMutex::scoped_lock scoped_cs(bin[index].mailLock);
1814         }
1815     }
1816 }
1817 
1818 
1819 #if MALLOC_CHECK_RECURSION
1820 // TODO: Use dedicated heap for this
1821 
1822 /*
1823  * This is a special kind of allocation that can be used when malloc is
1824  * not available (either during startup, or when malloc has already been called and
1825  * we are, say, inside a pthread_setspecific call).
1826  * A block can contain objects of different sizes;
1827  * allocations are performed by moving the bump pointer and increasing an object counter,
1828  * and releasing is done via the counter of objects allocated in the block,
1829  * or by moving the bump pointer back when the released object is on the boundary.
1830  * TODO: make the bump pointer grow backward, in the same direction as all the others.
1831  */
1832 
1833 class StartupBlock : public Block {
availableSize() const1834     size_t availableSize() const {
1835         return slabSize - ((uintptr_t)bumpPtr - (uintptr_t)this);
1836     }
1837     static StartupBlock *getBlock();
1838 public:
1839     static FreeObject *allocate(size_t size);
msize(void * ptr)1840     static size_t msize(void *ptr) { return *((size_t*)ptr - 1); }
1841     void free(void *ptr);
1842 };
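
// Illustrative sketch (not part of the allocator): a minimal bump-pointer arena in the spirit of
// StartupBlock described above: allocation moves a pointer forward and counts live objects; the
// whole arena can be recycled only when the counter drops back to zero. All names are hypothetical.
#if 0 // standalone example, excluded from the build
#include <cstddef>

class BumpArena {
    char    *base;      // start of the usable space
    size_t   capacity;  // total usable bytes
    size_t   used;      // bump offset
    size_t   liveCount; // number of objects not yet freed
public:
    BumpArena(void *space, size_t bytes)
        : base((char*)space), capacity(bytes), used(0), liveCount(0) {}
    void *allocate(size_t size) {
        size = (size + sizeof(size_t) - 1) & ~(sizeof(size_t) - 1); // align to word size
        if (used + size > capacity) return nullptr;                 // arena exhausted
        void *result = base + used;
        used += size;
        ++liveCount;
        return result;
    }
    // Returns true when the last live object is gone and the arena can be reused.
    bool release() { return --liveCount == 0; }
};
#endif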
1843 
1844 static MallocMutex startupMallocLock;
1845 static StartupBlock *firstStartupBlock;
1846 
getBlock()1847 StartupBlock *StartupBlock::getBlock()
1848 {
1849     BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/false);
1850     if (backRefIdx.isInvalid()) return NULL;
1851 
1852     StartupBlock *block = static_cast<StartupBlock*>(
1853         defaultMemPool->extMemPool.backend.getSlabBlock(1));
1854     if (!block) return NULL;
1855 
1856     block->cleanBlockHeader();
1857     setBackRef(backRefIdx, block);
1858     block->backRefIdx = backRefIdx;
1859     // use startupAllocObjSizeMark to mark objects coming from a startup block
1860     block->objectSize = startupAllocObjSizeMark;
1861     block->bumpPtr = (FreeObject *)((uintptr_t)block + sizeof(StartupBlock));
1862     return block;
1863 }
1864 
allocate(size_t size)1865 FreeObject *StartupBlock::allocate(size_t size)
1866 {
1867     FreeObject *result;
1868     StartupBlock *newBlock = NULL;
1869     bool newBlockUnused = false;
1870 
1871     /* Objects must be aligned on their natural boundaries,
1872        and objects bigger than a word on a word boundary. */
1873     size = alignUp(size, sizeof(size_t));
1874     // We need size of an object to implement msize.
1875     size_t reqSize = size + sizeof(size_t);
1876     {
1877         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1878         // Re-check whether we need a new block (conditions might have changed)
1879         if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) {
1880             if (!newBlock) {
1881                 newBlock = StartupBlock::getBlock();
1882                 if (!newBlock) return NULL;
1883             }
1884             newBlock->next = (Block*)firstStartupBlock;
1885             if (firstStartupBlock)
1886                 firstStartupBlock->previous = (Block*)newBlock;
1887             firstStartupBlock = newBlock;
1888         }
1889         result = firstStartupBlock->bumpPtr;
1890         firstStartupBlock->allocatedCount++;
1891         firstStartupBlock->bumpPtr =
1892             (FreeObject *)((uintptr_t)firstStartupBlock->bumpPtr + reqSize);
1893     }
1894 
1895     // keep object size at the negative offset
1896     *((size_t*)result) = size;
1897     return (FreeObject*)((size_t*)result+1);
1898 }
1899 
free(void * ptr)1900 void StartupBlock::free(void *ptr)
1901 {
1902     Block* blockToRelease = NULL;
1903     {
1904         MallocMutex::scoped_lock scoped_cs(startupMallocLock);
1905 
1906         MALLOC_ASSERT(firstStartupBlock, ASSERT_TEXT);
1907         MALLOC_ASSERT(startupAllocObjSizeMark==objectSize
1908                       && allocatedCount>0, ASSERT_TEXT);
1909         MALLOC_ASSERT((uintptr_t)ptr>=(uintptr_t)this+sizeof(StartupBlock)
1910                       && (uintptr_t)ptr+StartupBlock::msize(ptr)<=(uintptr_t)this+slabSize,
1911                       ASSERT_TEXT);
1912         if (0 == --allocatedCount) {
1913             if (this == firstStartupBlock)
1914                 firstStartupBlock = (StartupBlock*)firstStartupBlock->next;
1915             if (previous)
1916                 previous->next = next;
1917             if (next)
1918                 next->previous = previous;
1919             blockToRelease = this;
1920         } else if ((uintptr_t)ptr + StartupBlock::msize(ptr) == (uintptr_t)bumpPtr) {
1921             // last object in the block released
1922             FreeObject *newBump = (FreeObject*)((size_t*)ptr - 1);
1923             MALLOC_ASSERT((uintptr_t)newBump>(uintptr_t)this+sizeof(StartupBlock),
1924                           ASSERT_TEXT);
1925             bumpPtr = newBump;
1926         }
1927     }
1928     if (blockToRelease) {
1929         blockToRelease->previous = blockToRelease->next = NULL;
1930         defaultMemPool->returnEmptyBlock(blockToRelease, /*poolTheBlock=*/false);
1931     }
1932 }
1933 
1934 #endif /* MALLOC_CHECK_RECURSION */
1935 
1936 /********* End thread related code  *************/
1937 
1938 /********* Library initialization *************/
1939 
1940 //! Value indicating the state of initialization.
1941 /* 0 = initialization not started.
1942  * 1 = initialization started but not finished.
1943  * 2 = initialization finished.
1944  * In theory, we only need values 0 and 2. But value 1 is nonetheless
1945  * useful for detecting errors in the double-check pattern.
1946  */
1947 static std::atomic<intptr_t> mallocInitialized{0};   // implicitly initialized to 0
1948 static MallocMutex initMutex;
1949 
1950 /** The leading "\0" is here so that applying "strings" to the binary
1951     delivers a clean result. */
1952 static char VersionString[] = "\0" TBBMALLOC_VERSION_STRINGS;
1953 
1954 #if USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED
1955 
1956 /* Decrease the race interval between dynamic library unloading and the pthread key
1957    destructor. Protect only pthread implementations that support unloading. */
1958 class ShutdownSync {
1959 /* flag is the number of threads in pthread key dtor body
1960    (i.e., between threadDtorStart() and threadDtorDone())
1961    or the signal to skip dtor, if flag < 0 */
1962     std::atomic<intptr_t> flag;
1963     static const intptr_t skipDtor = INTPTR_MIN/2;
1964 public:
init()1965     void init() { flag.store(0, std::memory_order_release); }
1966 /* We assume that fewer than 2*abs(skipDtor) threads ever call threadDtorStart()
1967    simultaneously, so flag never becomes negative because of that. */
threadDtorStart()1968     bool threadDtorStart() {
1969         if (flag.load(std::memory_order_acquire) < 0)
1970             return false;
1971         if (++flag <= 0) { // note that new value returned
1972             flag.fetch_sub(1); // flag is spoiled by us, restore it
1973             return false;
1974         }
1975         return true;
1976     }
threadDtorDone()1977     void threadDtorDone() {
1978         flag.fetch_sub(1);
1979     }
processExit()1980     void processExit() {
1981         if (flag.fetch_add(skipDtor) != 0) {
1982             SpinWaitUntilEq(flag, skipDtor);
1983         }
1984     }
1985 };
1986 
1987 #else
1988 
1989 class ShutdownSync {
1990 public:
init()1991     void init() { }
threadDtorStart()1992     bool threadDtorStart() { return true; }
threadDtorDone()1993     void threadDtorDone() { }
processExit()1994     void processExit() { }
1995 };
1996 
1997 #endif // USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED
1998 
1999 static ShutdownSync shutdownSync;
2000 
isMallocInitialized()2001 inline bool isMallocInitialized() {
2002     // Load must have acquire fence; otherwise thread taking "initialized" path
2003     // might perform textually later loads *before* mallocInitialized becomes 2.
2004     return 2 == mallocInitialized.load(std::memory_order_acquire);
2005 }
2006 
2007 /* Caller is responsible for ensuring this routine is called exactly once. */
MallocInitializeITT()2008 extern "C" void MallocInitializeITT() {
2009 #if __TBB_USE_ITT_NOTIFY
2010     if (!usedBySrcIncluded)
2011         tbb::detail::r1::__TBB_load_ittnotify();
2012 #endif
2013 }
2014 
initDefaultPool()2015 void MemoryPool::initDefaultPool() {
2016     hugePages.init();
2017 }
2018 
2019 /*
2020  * Allocator initialization routine;
2021  * it is called lazily on the very first scalable_malloc call.
2022  */
initMemoryManager()2023 static bool initMemoryManager()
2024 {
2025     TRACEF(( "[ScalableMalloc trace] sizeof(Block) is %d (expected 128); sizeof(uintptr_t) is %d\n",
2026              sizeof(Block), sizeof(uintptr_t) ));
2027     MALLOC_ASSERT( 2*blockHeaderAlignment == sizeof(Block), ASSERT_TEXT );
2028     MALLOC_ASSERT( sizeof(FreeObject) == sizeof(void*), ASSERT_TEXT );
2029     MALLOC_ASSERT( isAligned(defaultMemPool, sizeof(intptr_t)),
2030                    "Memory pool must be void*-aligned for atomic to work over aligned arguments.");
2031 
2032 #if USE_WINTHREAD
2033     const size_t granularity = 64*1024; // granularity of VirtualAlloc
2034 #else
2035     // POSIX.1-2001-compliant way to get page size
2036     const size_t granularity = sysconf(_SC_PAGESIZE);
2037 #endif
2038     if (!defaultMemPool) {
2039         // Do not rely on static constructors; do the assignment here in case
2040         // the library's static data section has not been initialized yet at this call.
2041         defaultMemPool = (MemoryPool*)defaultMemPool_space;
2042     }
2043     bool initOk = defaultMemPool->
2044         extMemPool.init(0, NULL, NULL, granularity,
2045                         /*keepAllMemory=*/false, /*fixedPool=*/false);
2046 // TODO: extMemPool.init() to not allocate memory
2047     if (!initOk || !initBackRefMaster(&defaultMemPool->extMemPool.backend) || !ThreadId::init())
2048         return false;
2049     MemoryPool::initDefaultPool();
2050     // init() is required iff initMemoryManager() is called
2051     // after mallocProcessShutdownNotification()
2052     shutdownSync.init();
2053 #if COLLECT_STATISTICS
2054     initStatisticsCollection();
2055 #endif
2056     return true;
2057 }
2058 
GetBoolEnvironmentVariable(const char * name)2059 static bool GetBoolEnvironmentVariable(const char* name) {
2060     return tbb::detail::r1::GetBoolEnvironmentVariable(name);
2061 }
2062 
2063 //! Ensures that initMemoryManager() is called once and only once.
2064 /** Does not return until initMemoryManager() has been completed by a thread.
2065     There is no need to call this routine if mallocInitialized==2. */
doInitialization()2066 static bool doInitialization()
2067 {
2068     MallocMutex::scoped_lock lock( initMutex );
2069     if (mallocInitialized.load(std::memory_order_relaxed)!=2) {
2070         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==0, ASSERT_TEXT );
2071         mallocInitialized.store(1, std::memory_order_relaxed);
2072         RecursiveMallocCallProtector scoped;
2073         if (!initMemoryManager()) {
2074             mallocInitialized.store(0, std::memory_order_relaxed); // restore and out
2075             return false;
2076         }
2077 #ifdef  MALLOC_EXTRA_INITIALIZATION
2078         MALLOC_EXTRA_INITIALIZATION;
2079 #endif
2080 #if MALLOC_CHECK_RECURSION
2081         RecursiveMallocCallProtector::detectNaiveOverload();
2082 #endif
2083         MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==1, ASSERT_TEXT );
2084         // Store must have release fence, otherwise mallocInitialized==2
2085         // might become remotely visible before side effects of
2086         // initMemoryManager() become remotely visible.
2087         mallocInitialized.store(2, std::memory_order_release);
2088         if( GetBoolEnvironmentVariable("TBB_VERSION") ) {
2089             fputs(VersionString+1,stderr);
2090             hugePages.printStatus();
2091         }
2092     }
2093     /* It can't be 0 or I would have initialized it */
2094     MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==2, ASSERT_TEXT );
2095     return true;
2096 }
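
// Illustrative sketch (not part of the allocator): isMallocInitialized()/doInitialization() above
// implement double-checked initialization. The standalone example below shows the same
// acquire/release protocol with a plain std::mutex instead of MallocMutex and without the
// recursion protection; all names are hypothetical.
#if 0 // standalone example, excluded from the build
#include <atomic>
#include <mutex>

static std::atomic<int> initState{0}; // 0 = not started, 1 = in progress, 2 = done
static std::mutex       initLock;

static void doExpensiveInit() { /* one-time setup goes here */ }

void ensureInitialized() {
    // Fast path: acquire load so textually later loads cannot move before observing state 2.
    if (initState.load(std::memory_order_acquire) == 2)
        return;
    std::lock_guard<std::mutex> guard(initLock);
    if (initState.load(std::memory_order_relaxed) != 2) {
        initState.store(1, std::memory_order_relaxed); // helps catch re-entry bugs
        doExpensiveInit();
        // Release store: side effects of doExpensiveInit() become visible before state 2 does.
        initState.store(2, std::memory_order_release);
    }
}
#endif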
2097 
2098 /********* End library initialization *************/
2099 
2100 /********* The malloc show begins     *************/
2101 
2102 
allocateFromFreeList()2103 FreeObject *Block::allocateFromFreeList()
2104 {
2105     FreeObject *result;
2106 
2107     if (!freeList) return NULL;
2108 
2109     result = freeList;
2110     MALLOC_ASSERT( result, ASSERT_TEXT );
2111 
2112     freeList = result->next;
2113     MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2114     allocatedCount++;
2115     STAT_increment(getThreadId(), getIndex(objectSize), allocFreeListUsed);
2116 
2117     return result;
2118 }
2119 
allocateFromBumpPtr()2120 FreeObject *Block::allocateFromBumpPtr()
2121 {
2122     FreeObject *result = bumpPtr;
2123     if (result) {
2124         bumpPtr = (FreeObject *) ((uintptr_t) bumpPtr - objectSize);
2125         if ( (uintptr_t)bumpPtr < (uintptr_t)this+sizeof(Block) ) {
2126             bumpPtr = NULL;
2127         }
2128         MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT );
2129         allocatedCount++;
2130         STAT_increment(getThreadId(), getIndex(objectSize), allocBumpPtrUsed);
2131     }
2132     return result;
2133 }
2134 
allocate()2135 inline FreeObject* Block::allocate()
2136 {
2137     MALLOC_ASSERT( isOwnedByCurrentThread(), ASSERT_TEXT );
2138 
2139     /* for better cache locality, look in the free list first. */
2140     if ( FreeObject *result = allocateFromFreeList() ) {
2141         return result;
2142     }
2143     MALLOC_ASSERT( !freeList, ASSERT_TEXT );
2144 
2145     /* if free list is empty, try thread local bump pointer allocation. */
2146     if ( FreeObject *result = allocateFromBumpPtr() ) {
2147         return result;
2148     }
2149     MALLOC_ASSERT( !bumpPtr, ASSERT_TEXT );
2150 
2151     /* the block is considered full. */
2152     isFull = true;
2153     return NULL;
2154 }
2155 
findObjectSize(void * object) const2156 size_t Block::findObjectSize(void *object) const
2157 {
2158     size_t blSize = getSize();
2159 #if MALLOC_CHECK_RECURSION
2160     // Currently, there are no aligned allocations from startup blocks,
2161     // so we can return just StartupBlock::msize().
2162     // TODO: This must be extended if we add aligned allocation from startup blocks.
2163     if (!blSize)
2164         return StartupBlock::msize(object);
2165 #endif
2166     // object can be aligned, so real size can be less than block's
2167     size_t size =
2168         blSize - ((uintptr_t)object - (uintptr_t)findObjectToFree(object));
2169     MALLOC_ASSERT(size>0 && size<minLargeObjectSize, ASSERT_TEXT);
2170     return size;
2171 }
2172 
moveBlockToFront(Block * block)2173 void Bin::moveBlockToFront(Block *block)
2174 {
2175     /* move the block to the front of the bin */
2176     if (block == activeBlk) return;
2177     outofTLSBin(block);
2178     pushTLSBin(block);
2179 }
2180 
processEmptyBlock(Block * block,bool poolTheBlock)2181 void Bin::processEmptyBlock(Block *block, bool poolTheBlock)
2182 {
2183     if (block != activeBlk) {
2184         /* We are not using this block; return it to the pool */
2185         outofTLSBin(block);
2186         block->getMemPool()->returnEmptyBlock(block, poolTheBlock);
2187     } else {
2188         /* all objects are free - let's restore the bump pointer */
2189         block->restoreBumpPtr();
2190     }
2191 }
2192 
2193 template<int LOW_MARK, int HIGH_MARK>
put(LargeMemoryBlock * object,ExtMemoryPool * extMemPool)2194 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool)
2195 {
2196     const size_t size = object->unalignedSize;
2197     // do not spoil the cache with a too-large object, as that can cause its total cleanup
2198     if (size > MAX_TOTAL_SIZE)
2199         return false;
2200     LargeMemoryBlock *localHead = head.exchange(NULL);
2201 
2202     object->prev = NULL;
2203     object->next = localHead;
2204     if (localHead)
2205         localHead->prev = object;
2206     else {
2207         // these might not have been cleaned when the local cache was stolen, so correct them
2208         totalSize = 0;
2209         numOfBlocks = 0;
2210         tail = object;
2211     }
2212     localHead = object;
2213     totalSize += size;
2214     numOfBlocks++;
2215     // must meet both the size and the number-of-cached-objects constraints
2216     if (totalSize > MAX_TOTAL_SIZE || numOfBlocks >= HIGH_MARK) {
2217         // scan from the tail until the constraints are met
2218         while (totalSize > MAX_TOTAL_SIZE || numOfBlocks > LOW_MARK) {
2219             totalSize -= tail->unalignedSize;
2220             numOfBlocks--;
2221             tail = tail->prev;
2222         }
2223         LargeMemoryBlock *headToRelease = tail->next;
2224         tail->next = NULL;
2225 
2226         extMemPool->freeLargeObjectList(headToRelease);
2227     }
2228 
2229     head.store(localHead, std::memory_order_release);
2230     return true;
2231 }
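
// Illustrative sketch (not part of the allocator): put() above keeps the per-thread large-object
// cache bounded by both a byte budget and a block count, trimming cold entries from the tail once
// the high watermark is reached. The standalone example models the same watermark policy with a
// std::list; the class name and constants are hypothetical stand-ins.
#if 0 // standalone example, excluded from the build
#include <cstddef>
#include <iterator>
#include <list>

struct CachedBlock { size_t bytes; };

class WatermarkCache {
    static const size_t MAX_BYTES = 4 * 1024 * 1024; // byte budget
    static const size_t HIGH_MARK = 32, LOW_MARK = 8; // block-count watermarks
    std::list<CachedBlock> blocks;   // front = hottest, back = coldest
    size_t totalBytes = 0;
public:
    // Returns the evicted blocks so the caller can return them to the backend.
    std::list<CachedBlock> put(CachedBlock b) {
        blocks.push_front(b);
        totalBytes += b.bytes;
        std::list<CachedBlock> evicted;
        if (totalBytes > MAX_BYTES || blocks.size() >= HIGH_MARK) {
            // Trim cold entries from the tail until both constraints are satisfied again.
            while (totalBytes > MAX_BYTES || blocks.size() > LOW_MARK) {
                totalBytes -= blocks.back().bytes;
                evicted.splice(evicted.end(), blocks, std::prev(blocks.end()));
            }
        }
        return evicted;
    }
};
#endif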
2232 
2233 template<int LOW_MARK, int HIGH_MARK>
get(size_t size)2234 LargeMemoryBlock *LocalLOCImpl<LOW_MARK, HIGH_MARK>::get(size_t size)
2235 {
2236     LargeMemoryBlock *localHead, *res = NULL;
2237 
2238     if (size > MAX_TOTAL_SIZE)
2239         return NULL;
2240 
2241     // TBB_REVAMP_TODO: review this line
2242     if (!head.load(std::memory_order_acquire) || (localHead = head.exchange(NULL)) == NULL) {
2243         // do not restore totalSize, numOfBlocks and tail at this point,
2244         // as they are used only in put(), where they must be restored
2245         return NULL;
2246     }
2247 
2248     for (LargeMemoryBlock *curr = localHead; curr; curr=curr->next) {
2249         if (curr->unalignedSize == size) {
2250             res = curr;
2251             if (curr->next)
2252                 curr->next->prev = curr->prev;
2253             else
2254                 tail = curr->prev;
2255             if (curr != localHead)
2256                 curr->prev->next = curr->next;
2257             else
2258                 localHead = curr->next;
2259             totalSize -= size;
2260             numOfBlocks--;
2261             break;
2262         }
2263     }
2264 
2265     head.store(localHead, std::memory_order_release);
2266     return res;
2267 }
2268 
2269 template<int LOW_MARK, int HIGH_MARK>
externalCleanup(ExtMemoryPool * extMemPool)2270 bool LocalLOCImpl<LOW_MARK, HIGH_MARK>::externalCleanup(ExtMemoryPool *extMemPool)
2271 {
2272     if (LargeMemoryBlock *localHead = head.exchange(NULL)) {
2273         extMemPool->freeLargeObjectList(localHead);
2274         return true;
2275     }
2276     return false;
2277 }
2278 
getFromLLOCache(TLSData * tls,size_t size,size_t alignment)2279 void *MemoryPool::getFromLLOCache(TLSData* tls, size_t size, size_t alignment)
2280 {
2281     LargeMemoryBlock *lmb = NULL;
2282 
2283     size_t headersSize = sizeof(LargeMemoryBlock)+sizeof(LargeObjectHdr);
2284     size_t allocationSize = LargeObjectCache::alignToBin(size+headersSize+alignment);
2285     if (allocationSize < size) // allocationSize is wrapped around after alignToBin
2286         return NULL;
2287     MALLOC_ASSERT(allocationSize >= alignment, "Overflow must be checked before.");
2288 
2289     if (tls) {
2290         tls->markUsed();
2291         lmb = tls->lloc.get(allocationSize);
2292     }
2293     if (!lmb)
2294         lmb = extMemPool.mallocLargeObject(this, allocationSize);
2295 
2296     if (lmb) {
2297         // when shuffling, we assume that the alignment offset guarantees
2298         // that different cache lines are in use
2299         MALLOC_ASSERT(alignment >= estimatedCacheLineSize, ASSERT_TEXT);
2300 
2301         void *alignedArea = (void*)alignUp((uintptr_t)lmb+headersSize, alignment);
2302         uintptr_t alignedRight =
2303             alignDown((uintptr_t)lmb+lmb->unalignedSize - size, alignment);
2304         // Has some room to shuffle object between cache lines?
2305         // Note that alignedRight and alignedArea are aligned at alignment.
2306         unsigned ptrDelta = alignedRight - (uintptr_t)alignedArea;
2307         if (ptrDelta && tls) { // !tls is cold path
2308             // for the hot path of alignment==estimatedCacheLineSize,
2309             // allow compilers to use shift for division
2310             // (since estimatedCacheLineSize is a power-of-2 constant)
2311             unsigned numOfPossibleOffsets = alignment == estimatedCacheLineSize?
2312                   ptrDelta / estimatedCacheLineSize :
2313                   ptrDelta / alignment;
2314             unsigned myCacheIdx = ++tls->currCacheIdx;
2315             unsigned offset = myCacheIdx % numOfPossibleOffsets;
2316 
2317             // Move object to a cache line with an offset that is different from
2318             // previous allocation. This supposedly allows us to use cache
2319             // associativity more efficiently.
2320             alignedArea = (void*)((uintptr_t)alignedArea + offset*alignment);
2321         }
2322         MALLOC_ASSERT((uintptr_t)lmb+lmb->unalignedSize >=
2323                       (uintptr_t)alignedArea+size, "Object doesn't fit the block.");
2324         LargeObjectHdr *header = (LargeObjectHdr*)alignedArea-1;
2325         header->memoryBlock = lmb;
2326         header->backRefIdx = lmb->backRefIdx;
2327         setBackRef(header->backRefIdx, header);
2328 
2329         lmb->objectSize = size;
2330 
2331         MALLOC_ASSERT( isLargeObject<unknownMem>(alignedArea), ASSERT_TEXT );
2332         MALLOC_ASSERT( isAligned(alignedArea, alignment), ASSERT_TEXT );
2333 
2334         return alignedArea;
2335     }
2336     return NULL;
2337 }
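
// Illustrative sketch (not part of the allocator): the "shuffle" above spreads consecutive
// same-sized large allocations across different cache-line offsets inside the slack between the
// headers and the end of the block. The standalone example below redoes the same arithmetic on
// plain numbers; all names are hypothetical.
#if 0 // standalone example, excluded from the build
#include <cassert>
#include <cstddef>
#include <cstdint>

static uintptr_t exAlignUp(uintptr_t p, uintptr_t a)   { return (p + a - 1) & ~(a - 1); }
static uintptr_t exAlignDown(uintptr_t p, uintptr_t a) { return p & ~(a - 1); }

// Pick the start address for the allocIdx-th allocation inside [blockStart, blockEnd).
uintptr_t pickOffset(uintptr_t blockStart, uintptr_t blockEnd, size_t headersSize,
                     size_t size, size_t alignment, unsigned allocIdx) {
    uintptr_t lowest  = exAlignUp(blockStart + headersSize, alignment); // leftmost usable start
    uintptr_t highest = exAlignDown(blockEnd - size, alignment);        // rightmost usable start
    assert(highest >= lowest);                       // the object must fit in the block
    uintptr_t room = highest - lowest;               // slack available for shuffling
    if (room == 0) return lowest;                    // no slack: nothing to shuffle
    unsigned slots  = (unsigned)(room / alignment);  // number of distinct aligned offsets
    unsigned offset = allocIdx % slots;              // rotate the offset between allocations
    return lowest + offset * (uintptr_t)alignment;
}
#endif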
2338 
putToLLOCache(TLSData * tls,void * object)2339 void MemoryPool::putToLLOCache(TLSData *tls, void *object)
2340 {
2341     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2342     // overwrite backRefIdx to simplify double free detection
2343     header->backRefIdx = BackRefIdx();
2344 
2345     if (tls) {
2346         tls->markUsed();
2347         if (tls->lloc.put(header->memoryBlock, &extMemPool))
2348             return;
2349     }
2350     extMemPool.freeLargeObject(header->memoryBlock);
2351 }
2352 
2353 /*
2354  * All aligned allocations fall into one of the following categories:
2355  * 1. if both the requested size and the alignment are <= maxSegregatedObjectSize,
2356  *      we just align the size up and request that amount, because an object allocated
2357  *      for a size aligned to some power of 2 is at least that aligned.
2358  * 2. for size < minLargeObjectSize, check whether the already guaranteed fittingAlignment is enough.
2359  * 3. if size+alignment < minLargeObjectSize, we take an object of fittingSizeN and align
2360  *      its address up; given such a pointer, scalable_free can find the real object.
2361  *      Wrap-around of size+alignment is impossible because the maximal allowed
2362  *      alignment plus minLargeObjectSize cannot wrap around.
2363  * 4. otherwise, an aligned large object is allocated (a standalone classifier sketch of these cases follows allocateAligned() below).
2364  */
allocateAligned(MemoryPool * memPool,size_t size,size_t alignment)2365 static void *allocateAligned(MemoryPool *memPool, size_t size, size_t alignment)
2366 {
2367     MALLOC_ASSERT( isPowerOfTwo(alignment), ASSERT_TEXT );
2368 
2369     if (!isMallocInitialized())
2370         if (!doInitialization())
2371             return NULL;
2372 
2373     void *result;
2374     if (size<=maxSegregatedObjectSize && alignment<=maxSegregatedObjectSize)
2375         result = internalPoolMalloc(memPool, alignUp(size? size: sizeof(size_t), alignment));
2376     else if (size<minLargeObjectSize) {
2377         if (alignment<=fittingAlignment)
2378             result = internalPoolMalloc(memPool, size);
2379         else if (size+alignment < minLargeObjectSize) {
2380             void *unaligned = internalPoolMalloc(memPool, size+alignment);
2381             if (!unaligned) return NULL;
2382             result = alignUp(unaligned, alignment);
2383         } else
2384             goto LargeObjAlloc;
2385     } else {
2386     LargeObjAlloc:
2387         TLSData *tls = memPool->getTLS(/*create=*/true);
2388         // take into account only alignments that are higher than the natural one
2389         result =
2390             memPool->getFromLLOCache(tls, size, largeObjectAlignment>alignment?
2391                                                largeObjectAlignment: alignment);
2392     }
2393 
2394     MALLOC_ASSERT( isAligned(result, alignment), ASSERT_TEXT );
2395     return result;
2396 }
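
// Illustrative sketch (not part of the allocator): a standalone classifier mirroring the four
// cases documented above allocateAligned(), using hypothetical threshold constants in place of
// the real maxSegregatedObjectSize / fittingAlignment / minLargeObjectSize values.
#if 0 // standalone example, excluded from the build
#include <cstddef>

enum AlignedAllocKind {
    SegregatedSizeAlignedUp,  // case 1: align the size up; natural alignment is enough
    FittingSizeAsIs,          // case 2: fittingAlignment already guarantees the request
    FittingSizeOverAllocated, // case 3: allocate size+alignment and align the pointer up
    AlignedLargeObject        // case 4: aligned large-object path
};

AlignedAllocKind classify(size_t size, size_t alignment) {
    const size_t kMaxSegregated = 64;       // hypothetical stand-in for maxSegregatedObjectSize
    const size_t kFittingAlign  = 128;      // hypothetical stand-in for fittingAlignment
    const size_t kMinLargeObj   = 8 * 1024; // hypothetical stand-in for minLargeObjectSize

    if (size <= kMaxSegregated && alignment <= kMaxSegregated)
        return SegregatedSizeAlignedUp;
    if (size < kMinLargeObj) {
        if (alignment <= kFittingAlign)
            return FittingSizeAsIs;
        if (size + alignment < kMinLargeObj)
            return FittingSizeOverAllocated;
    }
    return AlignedLargeObject;
}
#endif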
2397 
reallocAligned(MemoryPool * memPool,void * ptr,size_t newSize,size_t alignment=0)2398 static void *reallocAligned(MemoryPool *memPool, void *ptr,
2399                             size_t newSize, size_t alignment = 0)
2400 {
2401     void *result;
2402     size_t copySize;
2403 
2404     if (isLargeObject<ourMem>(ptr)) {
2405         LargeMemoryBlock* lmb = ((LargeObjectHdr *)ptr - 1)->memoryBlock;
2406         copySize = lmb->unalignedSize-((uintptr_t)ptr-(uintptr_t)lmb);
2407 
2408         // Apply different strategies if size decreases
2409         if (newSize <= copySize && (0 == alignment || isAligned(ptr, alignment))) {
2410 
2411             // For huge objects (that do not fit in the backend cache), keep the same space unless
2412             // the new size is at most half of the old one
2413             bool isMemoryBlockHuge = copySize > memPool->extMemPool.backend.getMaxBinnedSize();
2414             size_t threshold = isMemoryBlockHuge ? copySize / 2 : 0;
2415             if (newSize > threshold) {
2416                 lmb->objectSize = newSize;
2417                 return ptr;
2418             }
2419             // TODO: For large objects suitable for the backend cache,
2420             // split out the excessive part and put it to the backend.
2421         }
2422         // Reallocate for real
2423         copySize = lmb->objectSize;
2424 #if BACKEND_HAS_MREMAP
2425         if (void *r = memPool->extMemPool.remap(ptr, copySize, newSize,
2426                           alignment < largeObjectAlignment ? largeObjectAlignment : alignment))
2427             return r;
2428 #endif
2429         result = alignment ? allocateAligned(memPool, newSize, alignment) :
2430             internalPoolMalloc(memPool, newSize);
2431 
2432     } else {
2433         Block* block = (Block *)alignDown(ptr, slabSize);
2434         copySize = block->findObjectSize(ptr);
2435 
2436         // TODO: Move the object to another bin if the size decreases and the current bin is "empty enough".
2437         // Currently, if the size decreases, the old pointer is returned.
2438         if (newSize <= copySize && (0==alignment || isAligned(ptr, alignment))) {
2439             return ptr;
2440         } else {
2441             result = alignment ? allocateAligned(memPool, newSize, alignment) :
2442                 internalPoolMalloc(memPool, newSize);
2443         }
2444     }
2445     if (result) {
2446         memcpy(result, ptr, copySize < newSize ? copySize : newSize);
2447         internalPoolFree(memPool, ptr, 0);
2448     }
2449     return result;
2450 }
2451 
2452 #if MALLOC_DEBUG
2453 /* A predicate checks if an object is properly placed inside its block */
isProperlyPlaced(const void * object) const2454 inline bool Block::isProperlyPlaced(const void *object) const
2455 {
2456     return 0 == ((uintptr_t)this + slabSize - (uintptr_t)object) % objectSize;
2457 }
2458 #endif
2459 
2460 /* Finds the real object inside the block */
findAllocatedObject(const void * address) const2461 FreeObject *Block::findAllocatedObject(const void *address) const
2462 {
2463     // calculate offset from the end of the block space
2464     uint16_t offset = (uintptr_t)this + slabSize - (uintptr_t)address;
2465     MALLOC_ASSERT( offset<=slabSize-sizeof(Block), ASSERT_TEXT );
2466     // find offset difference from a multiple of allocation size
2467     offset %= objectSize;
2468     // and move the address down to where the real object starts.
2469     return (FreeObject*)((uintptr_t)address - (offset? objectSize-offset: 0));
2470 }
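
// Illustrative sketch (not part of the allocator): a worked example of the offset arithmetic in
// findAllocatedObject() above. The numbers are made up: with a 16 KB slab and objectSize == 96,
// the modulo of the distance from the slab end tells how far an aligned pointer is from the next
// object boundary, and the object start is recovered from that.
#if 0 // standalone example, excluded from the build
#include <cassert>
#include <cstddef>
#include <cstdint>

// Same computation as Block::findAllocatedObject, on raw numbers.
uintptr_t findObjectStart(uintptr_t blockStart, size_t slab, size_t objSize, uintptr_t address) {
    uintptr_t offsetFromEnd = blockStart + slab - address; // distance to the end of the slab
    uintptr_t rem = offsetFromEnd % objSize;               // 0 means 'address' is an object start
    return address - (rem ? objSize - rem : 0);
}

void demo() {
    const uintptr_t block = 0x100000;  // hypothetical slab start
    const size_t slab = 16 * 1024, objSize = 96;
    // Objects are laid out back from the end: the k-th object starts at block + slab - k*objSize.
    uintptr_t objectStart = block + slab - 3 * objSize;
    uintptr_t aligned     = objectStart + 40;            // an aligned pointer inside the object
    assert(findObjectStart(block, slab, objSize, aligned) == objectStart);
}
#endif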
2471 
2472 /*
2473  * A bad dereference caused by a foreign pointer is possible only here, not earlier in the call chain.
2474  * A separate function isolates the SEH code, as it has a bad influence on compiler optimization.
2475  */
safer_dereference(const BackRefIdx * ptr)2476 static inline BackRefIdx safer_dereference (const BackRefIdx *ptr)
2477 {
2478     BackRefIdx id;
2479 #if _MSC_VER
2480     __try {
2481 #endif
2482         id = dereference(ptr);
2483 #if _MSC_VER
2484     } __except( GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION?
2485                 EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH ) {
2486         id = BackRefIdx();
2487     }
2488 #endif
2489     return id;
2490 }
2491 
2492 template<MemoryOrigin memOrigin>
isLargeObject(void * object)2493 bool isLargeObject(void *object)
2494 {
2495     if (!isAligned(object, largeObjectAlignment))
2496         return false;
2497     LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
2498     BackRefIdx idx = (memOrigin == unknownMem) ?
2499         safer_dereference(&header->backRefIdx) : dereference(&header->backRefIdx);
2500 
2501     return idx.isLargeObject()
2502         // in valid LargeObjectHdr memoryBlock is not NULL
2503         && header->memoryBlock
2504         // in valid LargeObjectHdr memoryBlock points somewhere before header
2505         // TODO: more strict check
2506         && (uintptr_t)header->memoryBlock < (uintptr_t)header
2507         && getBackRef(idx) == header;
2508 }
2509 
isSmallObject(void * ptr)2510 static inline bool isSmallObject (void *ptr)
2511 {
2512     Block* expectedBlock = (Block*)alignDown(ptr, slabSize);
2513     const BackRefIdx* idx = expectedBlock->getBackRefIdx();
2514 
2515     bool isSmall = expectedBlock == getBackRef(safer_dereference(idx));
2516     if (isSmall)
2517         expectedBlock->checkFreePrecond(ptr);
2518     return isSmall;
2519 }
2520 
2521 /**** Check if an object was allocated by scalable_malloc ****/
isRecognized(void * ptr)2522 static inline bool isRecognized (void* ptr)
2523 {
2524     return defaultMemPool->extMemPool.backend.ptrCanBeValid(ptr) &&
2525         (isLargeObject<unknownMem>(ptr) || isSmallObject(ptr));
2526 }
2527 
freeSmallObject(void * object)2528 static inline void freeSmallObject(void *object)
2529 {
2530     /* mask low bits to get the block */
2531     Block *block = (Block *)alignDown(object, slabSize);
2532     block->checkFreePrecond(object);
2533 
2534 #if MALLOC_CHECK_RECURSION
2535     if (block->isStartupAllocObject()) {
2536         ((StartupBlock *)block)->free(object);
2537         return;
2538     }
2539 #endif
2540     if (block->isOwnedByCurrentThread()) {
2541         block->freeOwnObject(object);
2542     } else { /* Slower path: add to the shared list; allocatedCount is updated by the owner thread in malloc. */
2543         FreeObject *objectToFree = block->findObjectToFree(object);
2544         block->freePublicObject(objectToFree);
2545     }
2546 }
2547 
internalPoolMalloc(MemoryPool * memPool,size_t size)2548 static void *internalPoolMalloc(MemoryPool* memPool, size_t size)
2549 {
2550     Bin* bin;
2551     Block * mallocBlock;
2552 
2553     if (!memPool) return NULL;
2554 
2555     if (!size) size = sizeof(size_t);
2556 
2557     TLSData *tls = memPool->getTLS(/*create=*/true);
2558 
2559     /* Allocate a large object */
2560     if (size >= minLargeObjectSize)
2561         return memPool->getFromLLOCache(tls, size, largeObjectAlignment);
2562 
2563     if (!tls) return NULL;
2564 
2565     tls->markUsed();
2566     /*
2567      * Get the element of the thread-local array corresponding to the given size;
2568      * it keeps a pointer to the active block for allocations of this size
2569      */
2570     bin = tls->getAllocationBin(size);
2571     if ( !bin ) return NULL;
2572 
2573     /* Get a block to try to allocate in. */
2574     for( mallocBlock = bin->getActiveBlock(); mallocBlock;
2575          mallocBlock = bin->setPreviousBlockActive() ) // the previous block should be empty enough
2576     {
2577         if( FreeObject *result = mallocBlock->allocate() )
2578             return result;
2579     }
2580 
2581     /*
2582      * else privatize publicly freed objects in some block and allocate from it
2583      */
2584     mallocBlock = bin->getPrivatizedFreeListBlock();
2585     if (mallocBlock) {
2586         MALLOC_ASSERT( mallocBlock->freeListNonNull(), ASSERT_TEXT );
2587         if ( FreeObject *result = mallocBlock->allocateFromFreeList() )
2588             return result;
2589         /* Else something strange happened, need to retry from the beginning; */
2590         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in public free list; reentering.\n" ));
2591         return internalPoolMalloc(memPool, size);
2592     }
2593 
2594     /*
2595      * no suitable own blocks, try to get a partial block that some other thread has discarded.
2596      */
2597     mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2598     while (mallocBlock) {
2599         bin->pushTLSBin(mallocBlock);
2600         bin->setActiveBlock(mallocBlock); // TODO: move under the below condition?
2601         if( FreeObject *result = mallocBlock->allocate() )
2602             return result;
2603         mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size);
2604     }
2605 
2606     /*
2607      * else try to get a new empty block
2608      */
2609     mallocBlock = memPool->getEmptyBlock(size);
2610     if (mallocBlock) {
2611         bin->pushTLSBin(mallocBlock);
2612         bin->setActiveBlock(mallocBlock);
2613         if( FreeObject *result = mallocBlock->allocate() )
2614             return result;
2615         /* Else something strange happened, need to retry from the beginning; */
2616         TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in empty block; reentering.\n" ));
2617         return internalPoolMalloc(memPool, size);
2618     }
2619     /*
2620      * else nothing works so return NULL
2621      */
2622     TRACEF(( "[ScalableMalloc trace] No memory found, returning NULL.\n" ));
2623     return NULL;
2624 }
2625 
2626 // When size==0 (i.e. unknown), detect here whether the object is large.
2627 // When the size is known and < minLargeObjectSize, we still need to check
2628 // whether the actual object is large, because large objects might be used
2629 // for aligned small allocations.
internalPoolFree(MemoryPool * memPool,void * object,size_t size)2630 static bool internalPoolFree(MemoryPool *memPool, void *object, size_t size)
2631 {
2632     if (!memPool || !object) return false;
2633 
2634     // The library is initialized at the first allocation call, so a release while
2635     // not initialized means a foreign object is being released.
2636     MALLOC_ASSERT(isMallocInitialized(), ASSERT_TEXT);
2637     MALLOC_ASSERT(memPool->extMemPool.userPool() || isRecognized(object),
2638                   "Invalid pointer during object releasing is detected.");
2639 
2640     if (size >= minLargeObjectSize || isLargeObject<ourMem>(object))
2641         memPool->putToLLOCache(memPool->getTLS(/*create=*/false), object);
2642     else
2643         freeSmallObject(object);
2644     return true;
2645 }
2646 
internalMalloc(size_t size)2647 static void *internalMalloc(size_t size)
2648 {
2649     if (!size) size = sizeof(size_t);
2650 
2651 #if MALLOC_CHECK_RECURSION
2652     if (RecursiveMallocCallProtector::sameThreadActive())
2653         return size<minLargeObjectSize? StartupBlock::allocate(size) :
2654             // nested allocation, so skip tls
2655             (FreeObject*)defaultMemPool->getFromLLOCache(NULL, size, slabSize);
2656 #endif
2657 
2658     if (!isMallocInitialized())
2659         if (!doInitialization())
2660             return NULL;
2661     return internalPoolMalloc(defaultMemPool, size);
2662 }
2663 
internalFree(void * object)2664 static void internalFree(void *object)
2665 {
2666     internalPoolFree(defaultMemPool, object, 0);
2667 }
2668 
internalMsize(void * ptr)2669 static size_t internalMsize(void* ptr)
2670 {
2671     MALLOC_ASSERT(ptr, "Invalid pointer passed to internalMsize");
2672     if (isLargeObject<ourMem>(ptr)) {
2673         // TODO: return the maximum memory size that can be written to this object
2674         LargeMemoryBlock* lmb = ((LargeObjectHdr*)ptr - 1)->memoryBlock;
2675         return lmb->objectSize;
2676     } else {
2677         Block *block = (Block*)alignDown(ptr, slabSize);
2678         return block->findObjectSize(ptr);
2679     }
2680 }
2681 
2682 } // namespace internal
2683 
2684 using namespace rml::internal;
2685 
2686 // legacy entry point kept for compatibility with binaries compiled
2687 // with pre-6003 versions of TBB
pool_create(intptr_t pool_id,const MemPoolPolicy * policy)2688 TBBMALLOC_EXPORT rml::MemoryPool *pool_create(intptr_t pool_id, const MemPoolPolicy *policy)
2689 {
2690     rml::MemoryPool *pool;
2691     MemPoolPolicy pol(policy->pAlloc, policy->pFree, policy->granularity);
2692 
2693     pool_create_v1(pool_id, &pol, &pool);
2694     return pool;
2695 }

rml::MemPoolError pool_create_v1(intptr_t pool_id, const MemPoolPolicy *policy,
                                 rml::MemoryPool **pool)
{
    if ( !policy->pAlloc || policy->version<MemPoolPolicy::TBBMALLOC_POOL_VERSION
         // empty pFree allowed only for fixed pools
         || !(policy->fixedPool || policy->pFree)) {
        *pool = NULL;
        return INVALID_POLICY;
    }
    if ( policy->version>MemPoolPolicy::TBBMALLOC_POOL_VERSION // future versions are not supported
         // new flags can be added in place of reserved, but default
         // behaviour must be supported by this version
         || policy->reserved ) {
        *pool = NULL;
        return UNSUPPORTED_POLICY;
    }
    if (!isMallocInitialized())
        if (!doInitialization()) {
            *pool = NULL;
            return NO_MEMORY;
        }
    rml::internal::MemoryPool *memPool =
        (rml::internal::MemoryPool*)internalMalloc((sizeof(rml::internal::MemoryPool)));
    if (!memPool) {
        *pool = NULL;
        return NO_MEMORY;
    }
    memset(memPool, 0, sizeof(rml::internal::MemoryPool));
    if (!memPool->init(pool_id, policy)) {
        internalFree(memPool);
        *pool = NULL;
        return NO_MEMORY;
    }

    *pool = (rml::MemoryPool*)memPool;
    return POOL_OK;
}
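
/* Usage sketch (illustrative only, excluded from the build): creating a memory
   pool on top of user-provided backing storage via pool_create_v1. The
   exampleRawAlloc/exampleRawFree callbacks, the mmap-based backing store and
   the 2 MB granularity are assumptions made for this sketch, following the raw
   callback signatures expected by MemPoolPolicy; they are not part of this file. */
#if 0
#include <sys/mman.h>   // backing store for the sketch only

static void *exampleRawAlloc(intptr_t /*pool_id*/, size_t &bytes) {
    // bytes is passed by reference so the callback could report a larger region.
    void *p = mmap(NULL, bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    return p==MAP_FAILED ? NULL : p;
}
static int exampleRawFree(intptr_t /*pool_id*/, void *raw_ptr, size_t raw_bytes) {
    return munmap(raw_ptr, raw_bytes);
}

static void examplePoolUsage() {
    rml::MemPoolPolicy policy(exampleRawAlloc, exampleRawFree, /*granularity=*/2*1024*1024);
    rml::MemoryPool *pool = NULL;
    if (rml::pool_create_v1(/*pool_id=*/0, &policy, &pool) != rml::POOL_OK)
        return;                               // INVALID_POLICY, UNSUPPORTED_POLICY or NO_MEMORY
    void *obj = rml::pool_malloc(pool, 1024); // served from the pool's backing memory
    rml::pool_free(pool, obj);
    rml::pool_destroy(pool);                  // returns backing memory via exampleRawFree
}
#endif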

bool pool_destroy(rml::MemoryPool* memPool)
{
    if (!memPool) return false;
    bool ret = ((rml::internal::MemoryPool*)memPool)->destroy();
    internalFree(memPool);

    return ret;
}

bool pool_reset(rml::MemoryPool* memPool)
{
    if (!memPool) return false;

    return ((rml::internal::MemoryPool*)memPool)->reset();
}

void *pool_malloc(rml::MemoryPool* mPool, size_t size)
{
    return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
}

void *pool_realloc(rml::MemoryPool* mPool, void *object, size_t size)
{
    if (!object)
        return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size);
    if (!size) {
        internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
        return NULL;
    }
    return reallocAligned((rml::internal::MemoryPool*)mPool, object, size, 0);
}

void *pool_aligned_malloc(rml::MemoryPool* mPool, size_t size, size_t alignment)
{
    if (!isPowerOfTwo(alignment) || 0==size)
        return NULL;

    return allocateAligned((rml::internal::MemoryPool*)mPool, size, alignment);
}

void *pool_aligned_realloc(rml::MemoryPool* memPool, void *ptr, size_t size, size_t alignment)
{
    if (!isPowerOfTwo(alignment))
        return NULL;
    rml::internal::MemoryPool *mPool = (rml::internal::MemoryPool*)memPool;
    void *tmp;

    if (!ptr)
        tmp = allocateAligned(mPool, size, alignment);
    else if (!size) {
        internalPoolFree(mPool, ptr, 0);
        return NULL;
    } else
        tmp = reallocAligned(mPool, ptr, size, alignment);

    return tmp;
}

bool pool_free(rml::MemoryPool *mPool, void *object)
{
    return internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0);
}

rml::MemoryPool *pool_identify(void *object)
{
    rml::internal::MemoryPool *pool;
    if (isLargeObject<ourMem>(object)) {
        LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
        pool = header->memoryBlock->pool;
    } else {
        Block *block = (Block*)alignDown(object, slabSize);
        pool = block->getMemPool();
    }
    // do not return defaultMemPool, as it can't be used in pool_free() etc
    __TBB_ASSERT_RELEASE(pool!=defaultMemPool,
        "rml::pool_identify() can't be used for scalable_malloc() etc results.");
    return (rml::MemoryPool*)pool;
}
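
/* Usage sketch (illustrative only, excluded from the build): pool_identify lets
   code that receives an object without its owning pool recover that pool, e.g.
   to query or free the object. It is valid only for objects allocated from a
   pool created via pool_create/pool_create_v1, never for scalable_malloc() results. */
#if 0
static void exampleFreeViaIdentify(void *object) {
    rml::MemoryPool *owner = rml::pool_identify(object); // asserts if object came from defaultMemPool
    size_t usable = rml::pool_msize(owner, object);      // usable size, at least the requested size
    (void)usable;
    rml::pool_free(owner, object);
}
#endif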

size_t pool_msize(rml::MemoryPool *mPool, void* object)
{
    if (object) {
        // No assert for object recognition, because objects allocated from a non-default
        // memory pool do not participate in range checking and, for small objects, do not
        // have valid backreferences. Instead, check that the object belongs to the given pool.
        MALLOC_ASSERT_EX(mPool == pool_identify(object), "Object does not belong to the specified pool");
        return internalMsize(object);
    }
    errno = EINVAL;
    // Unlike _msize, return 0 in case of parameter error.
    // Returning size_t(-1) would be more likely to lead to trouble.
    return 0;
}

} // namespace rml

using namespace rml::internal;

#if MALLOC_TRACE
static unsigned int threadGoingDownCount = 0;
#endif

/*
 * When a thread is shutting down, this routine should be called to remove all the thread ids
 * from the malloc blocks and replace them with a NULL thread id.
 *
 * For pthreads, the function is set as a callback in pthread_key_create for the TLS bin.
 * It will be automatically called at thread exit with the key value as the argument,
 * unless that value is NULL.
 * For Windows, it is called from DllMain( DLL_THREAD_DETACH ).
 *
 * However, neither of the above is called for the main process thread, so the routine
 * also needs to be called during process shutdown.
 *
*/
// TODO: Consider making this function part of class MemoryPool.
void doThreadShutdownNotification(TLSData* tls, bool main_thread)
{
    TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return start %d\n",
             getThreadId(),  threadGoingDownCount++ ));

#if USE_PTHREAD
    if (tls) {
        if (!shutdownSync.threadDtorStart()) return;
        tls->getMemPool()->onThreadShutdown(tls);
        shutdownSync.threadDtorDone();
    } else
#endif
    {
        suppress_unused_warning(tls); // not used on Windows
        // The default pool is safe to use at this point:
        //   on Linux, only the main thread can go here before destroying defaultMemPool;
        //   on Windows, shutdown is synchronized via loader lock and isMallocInitialized().
        // See also __TBB_mallocProcessShutdownNotification()
        defaultMemPool->onThreadShutdown(defaultMemPool->getTLS(/*create=*/false));
        // Take lock to walk through other pools; but waiting might be dangerous at this point
        // (e.g. on Windows the main thread might deadlock)
        bool locked;
        MallocMutex::scoped_lock lock(MemoryPool::memPoolListLock, /*wait=*/!main_thread, &locked);
        if (locked) { // the list is safe to process
            for (MemoryPool *memPool = defaultMemPool->next; memPool; memPool = memPool->next)
                memPool->onThreadShutdown(memPool->getTLS(/*create=*/false));
        }
    }

    TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return end\n", getThreadId() ));
}

#if USE_PTHREAD
void mallocThreadShutdownNotification(void* arg)
{
    // The routine is called for each pool (as TLS dtor) on each thread, except for the main thread
    if (!isMallocInitialized()) return;
    doThreadShutdownNotification((TLSData*)arg, false);
}
#else
extern "C" void __TBB_mallocThreadShutdownNotification()
{
    // The routine is called once per thread on Windows
    if (!isMallocInitialized()) return;
    doThreadShutdownNotification(NULL, false);
}
#endif

extern "C" void __TBB_mallocProcessShutdownNotification(bool windows_process_dying)
{
    if (!isMallocInitialized()) return;

    // Don't clean allocator internals if the entire process is exiting
    if (!windows_process_dying) {
        doThreadShutdownNotification(NULL, /*main_thread=*/true);
    }
#if  __TBB_MALLOC_LOCACHE_STAT
    printf("cache hit ratio %f, size hit %f\n",
           1.*cacheHits/mallocCalls, 1.*memHitKB/memAllocKB);
    defaultMemPool->extMemPool.loc.reportStat(stdout);
#endif

    shutdownSync.processExit();
#if __TBB_SOURCE_DIRECTLY_INCLUDED
/* Pthread keys must be deleted as soon as possible, so that the key dtor is not
   called at thread termination when the tbbmalloc code may already be unloaded.
*/
    defaultMemPool->destroy();
    destroyBackRefMaster(&defaultMemPool->extMemPool.backend);
    ThreadId::destroy();      // Delete key for thread id
    hugePages.reset();
    // a new, full malloc initialization is possible after this point
    mallocInitialized.store(0, std::memory_order_release);
#endif // __TBB_SOURCE_DIRECTLY_INCLUDED

#if COLLECT_STATISTICS
    unsigned nThreads = ThreadId::getMaxThreadId();
    for( int i=1; i<=nThreads && i<MAX_THREADS; ++i )
        STAT_print(i);
#endif
    if (!usedBySrcIncluded)
        MALLOC_ITT_FINI_ITTLIB();
}

extern "C" void * scalable_malloc(size_t size)
{
    void *ptr = internalMalloc(size);
    if (!ptr) errno = ENOMEM;
    return ptr;
}

extern "C" void scalable_free(void *object)
{
    internalFree(object);
}

#if MALLOC_ZONE_OVERLOAD_ENABLED
extern "C" void __TBB_malloc_free_definite_size(void *object, size_t size)
{
    internalPoolFree(defaultMemPool, object, size);
}
#endif

/*
 * A variant that provides additional memory safety, by checking whether the given address
 * was obtained with this allocator, and if not redirecting to the provided alternative call.
 */
extern "C" TBBMALLOC_EXPORT void __TBB_malloc_safer_free(void *object, void (*original_free)(void*))
{
    if (!object)
        return;

    // tbbmalloc can have allocated the object only if tbbmalloc has been initialized
    if (mallocInitialized.load(std::memory_order_acquire) && defaultMemPool->extMemPool.backend.ptrCanBeValid(object)) {
        if (isLargeObject<unknownMem>(object)) {
            // must check for a large object first, because the small-object check touches
            // 4 pages to the left, which can be inaccessible
            TLSData *tls = defaultMemPool->getTLS(/*create=*/false);

            defaultMemPool->putToLLOCache(tls, object);
            return;
        } else if (isSmallObject(object)) {
            freeSmallObject(object);
            return;
        }
    }
    if (original_free)
        original_free(object);
}
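
/* Usage sketch (illustrative only, excluded from the build): the "safer" entry
   points are intended for malloc-replacement scenarios where a pointer may have
   been allocated either by tbbmalloc or by the original allocator. The
   originalCrtFree symbol below is a hypothetical stand-in for whatever free()
   an interposition layer captured before redirecting allocations. */
#if 0
extern void (*originalCrtFree)(void*);  // captured by the interposition layer (hypothetical)

static void exampleInterposedFree(void *ptr) {
    // Frees tbbmalloc-owned memory directly; anything else falls through to originalCrtFree.
    __TBB_malloc_safer_free(ptr, originalCrtFree);
}
#endif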

/********* End the free code        *************/

/********* Code for scalable_realloc       ***********/

/*
 * From K&R
 * "realloc changes the size of the object pointed to by p to size. The contents will
 * be unchanged up to the minimum of the old and the new sizes. If the new size is larger,
 * the new space is uninitialized. realloc returns a pointer to the new space, or
 * NULL if the request cannot be satisfied, in which case *p is unchanged."
 *
 */
extern "C" void* scalable_realloc(void* ptr, size_t size)
{
    void *tmp;

    if (!ptr)
        tmp = internalMalloc(size);
    else if (!size) {
        internalFree(ptr);
        return NULL;
    } else
        tmp = reallocAligned(defaultMemPool, ptr, size, 0);

    if (!tmp) errno = ENOMEM;
    return tmp;
}
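
/* Usage sketch (illustrative only, excluded from the build): the K&R contract
   quoted above, expressed in terms of this API; on failure the original block
   is left untouched and remains the caller's responsibility. */
#if 0
static int *exampleGrowArray(int *data, size_t oldCount, size_t newCount) {
    int *grown = (int*)scalable_realloc(data, newCount * sizeof(int));
    if (!grown)              // request could not be satisfied: errno==ENOMEM,
        return data;         // data is unchanged and still must be freed by the caller
    // contents are preserved up to min(oldCount, newCount); the tail is uninitialized
    for (size_t i = oldCount; i < newCount; ++i)
        grown[i] = 0;
    return grown;
}
#endif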

/*
 * A variant that provides additional memory safety, by checking whether the given address
 * was obtained with this allocator, and if not redirecting to the provided alternative call.
 */
extern "C" TBBMALLOC_EXPORT void* __TBB_malloc_safer_realloc(void* ptr, size_t sz, void* original_realloc)
{
    void *tmp; // TODO: fix warnings about uninitialized use of tmp

    if (!ptr) {
        tmp = internalMalloc(sz);
    } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
        if (!sz) {
            internalFree(ptr);
            return NULL;
        } else {
            tmp = reallocAligned(defaultMemPool, ptr, sz, 0);
        }
    }
#if USE_WINTHREAD
    else if (original_realloc && sz) {
        orig_ptrs *original_ptrs = static_cast<orig_ptrs*>(original_realloc);
        if ( original_ptrs->msize ){
            size_t oldSize = original_ptrs->msize(ptr);
            tmp = internalMalloc(sz);
            if (tmp) {
                memcpy(tmp, ptr, sz<oldSize? sz : oldSize);
                if ( original_ptrs->free ){
                    original_ptrs->free( ptr );
                }
            }
        } else
            tmp = NULL;
    }
#else
    else if (original_realloc) {
        typedef void* (*realloc_ptr_t)(void*,size_t);
        realloc_ptr_t original_realloc_ptr;
        (void *&)original_realloc_ptr = original_realloc;
        tmp = original_realloc_ptr(ptr,sz);
    }
#endif
    else tmp = NULL;

    if (!tmp) errno = ENOMEM;
    return tmp;
}

/********* End code for scalable_realloc   ***********/

/********* Code for scalable_calloc   ***********/

/*
 * From K&R
 * calloc returns a pointer to space for an array of nobj objects,
 * each of size size, or NULL if the request cannot be satisfied.
 * The space is initialized to zero bytes.
 *
 */

extern "C" void * scalable_calloc(size_t nobj, size_t size)
{
    // it is the square root of the maximal size_t value
    const size_t mult_not_overflow = size_t(1) << (sizeof(size_t)*CHAR_BIT/2);
    const size_t arraySize = nobj * size;

    // check for overflow during multiplication:
    if (nobj>=mult_not_overflow || size>=mult_not_overflow) // 1) heuristic check
        if (nobj && arraySize / nobj != size) {             // 2) exact check
            errno = ENOMEM;
            return NULL;
        }
    void* result = internalMalloc(arraySize);
    if (result)
        memset(result, 0, arraySize);
    else
        errno = ENOMEM;
    return result;
}
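
/* Worked example (illustrative only, excluded from the build) of the two-step
   overflow check above: the cheap heuristic rules out the common case where
   both factors are below 2^(bits/2), so the product cannot wrap; only then is
   the exact division-based check paid. */
#if 0
static bool exampleWouldOverflow(size_t nobj, size_t size) {
    const size_t mult_not_overflow = size_t(1) << (sizeof(size_t)*CHAR_BIT/2); // 2^32 on 64-bit size_t
    if (nobj < mult_not_overflow && size < mult_not_overflow)
        return false;                                 // both factors small: product fits in size_t
    return nobj && (nobj * size) / nobj != size;      // wrapped product fails the division test
}
// e.g. with a 64-bit size_t, exampleWouldOverflow(size_t(1)<<33, size_t(1)<<33) is true,
// while exampleWouldOverflow(size_t(1)<<33, 2) is false.
#endif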

/********* End code for scalable_calloc   ***********/

/********* Code for aligned allocation API **********/

extern "C" int scalable_posix_memalign(void **memptr, size_t alignment, size_t size)
{
    if ( !isPowerOfTwoAtLeast(alignment, sizeof(void*)) )
        return EINVAL;
    void *result = allocateAligned(defaultMemPool, size, alignment);
    if (!result)
        return ENOMEM;
    *memptr = result;
    return 0;
}
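
/* Usage sketch (illustrative only, excluded from the build): the aligned API
   mirrors posix_memalign()/_aligned_malloc(); the alignment must be a power of
   two, and for scalable_posix_memalign at least sizeof(void*). The 64-byte and
   4096-byte alignments below are arbitrary example values. */
#if 0
static void exampleAlignedUsage() {
    void *p = NULL;
    if (scalable_posix_memalign(&p, 64, 1024) == 0)   // 64-byte (cache-line) aligned block
        scalable_free(p);                             // released like any other block

    void *q = scalable_aligned_malloc(1024, 4096);    // page-aligned block
    if (q)
        scalable_aligned_free(q);
}
#endif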

extern "C" void * scalable_aligned_malloc(size_t size, size_t alignment)
{
    if (!isPowerOfTwo(alignment) || 0==size) {
        errno = EINVAL;
        return NULL;
    }
    void *tmp = allocateAligned(defaultMemPool, size, alignment);
    if (!tmp) errno = ENOMEM;
    return tmp;
}

extern "C" void * scalable_aligned_realloc(void *ptr, size_t size, size_t alignment)
{
    if (!isPowerOfTwo(alignment)) {
        errno = EINVAL;
        return NULL;
    }
    void *tmp;

    if (!ptr)
        tmp = allocateAligned(defaultMemPool, size, alignment);
    else if (!size) {
        scalable_free(ptr);
        return NULL;
    } else
        tmp = reallocAligned(defaultMemPool, ptr, size, alignment);

    if (!tmp) errno = ENOMEM;
    return tmp;
}

extern "C" TBBMALLOC_EXPORT void * __TBB_malloc_safer_aligned_realloc(void *ptr, size_t size, size_t alignment, void* orig_function)
{
    /* corner cases are kept out of reallocAligned so it does not have to deal with errno */
    if (!isPowerOfTwo(alignment)) {
        errno = EINVAL;
        return NULL;
    }
    void *tmp = NULL;

    if (!ptr) {
        tmp = allocateAligned(defaultMemPool, size, alignment);
    } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) {
        if (!size) {
            internalFree(ptr);
            return NULL;
        } else {
            tmp = reallocAligned(defaultMemPool, ptr, size, alignment);
        }
    }
#if USE_WINTHREAD
    else {
        orig_aligned_ptrs *original_ptrs = static_cast<orig_aligned_ptrs*>(orig_function);
        if (size) {
            // Without orig_msize, we cannot do anything with this pointer,
            // so just keep the old one.
            if ( original_ptrs->aligned_msize ){
                // set alignment and offset to have possibly correct oldSize
                size_t oldSize = original_ptrs->aligned_msize(ptr, sizeof(void*), 0);
                tmp = allocateAligned(defaultMemPool, size, alignment);
                if (tmp) {
                    memcpy(tmp, ptr, size<oldSize? size : oldSize);
                    if ( original_ptrs->aligned_free ){
                        original_ptrs->aligned_free( ptr );
                    }
                }
            }
        } else {
            if ( original_ptrs->aligned_free ){
                original_ptrs->aligned_free( ptr );
            }
            return NULL;
        }
    }
#else
    // As original_realloc can't align the result, and there is no way to find the
    // size of the object being reallocated, we give up.
    suppress_unused_warning(orig_function);
#endif
    if (!tmp) errno = ENOMEM;
    return tmp;
}

extern "C" void scalable_aligned_free(void *ptr)
{
    internalFree(ptr);
}

/********* end code for aligned allocation API **********/

/********* Code for scalable_msize       ***********/

/*
 * Returns the size of a memory block allocated in the heap.
 */
extern "C" size_t scalable_msize(void* ptr)
{
    if (ptr) {
        MALLOC_ASSERT(isRecognized(ptr), "Invalid pointer in scalable_msize detected.");
        return internalMsize(ptr);
    }
    errno = EINVAL;
    // Unlike _msize, return 0 in case of parameter error.
    // Returning size_t(-1) would be more likely to lead to trouble.
    return 0;
}
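
/* Usage sketch (illustrative only, excluded from the build): scalable_msize
   reports the usable size of a block, which may exceed the requested size
   because requests are rounded up to the allocator's size classes. */
#if 0
static void exampleMsize() {
    void *p = scalable_malloc(100);
    if (p) {
        size_t usable = scalable_msize(p);   // >= 100; the whole usable size may be written to
        (void)usable;
        scalable_free(p);
    }
}
#endif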

/*
 * A variant that provides additional memory safety, by checking whether the given address
 * was obtained with this allocator, and if not redirecting to the provided alternative call.
 */
extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_msize(void *object, size_t (*original_msize)(void*))
{
    if (object) {
        // Check if the memory was allocated by scalable_malloc
        if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
            return internalMsize(object);
        else if (original_msize)
            return original_msize(object);
    }
    // object is NULL or unknown, or foreign and no original_msize
#if USE_WINTHREAD
    errno = EINVAL; // errno expected to be set only on this platform
#endif
    return 0;
}

/*
 * The same as above but for the _aligned_msize case
 */
extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_aligned_msize(void *object, size_t alignment, size_t offset, size_t (*orig_aligned_msize)(void*,size_t,size_t))
{
    if (object) {
        // Check if the memory was allocated by scalable_malloc
        if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
            return internalMsize(object);
        else if (orig_aligned_msize)
            return orig_aligned_msize(object,alignment,offset);
    }
    // object is NULL or unknown
    errno = EINVAL;
    return 0;
}

/********* End code for scalable_msize   ***********/

extern "C" int scalable_allocation_mode(int param, intptr_t value)
{
    if (param == TBBMALLOC_SET_SOFT_HEAP_LIMIT) {
        defaultMemPool->extMemPool.backend.setRecommendedMaxSize((size_t)value);
        return TBBMALLOC_OK;
    } else if (param == USE_HUGE_PAGES) {
#if __unix__
        switch (value) {
        case 0:
        case 1:
            hugePages.setMode(value);
            return TBBMALLOC_OK;
        default:
            return TBBMALLOC_INVALID_PARAM;
        }
#else
        return TBBMALLOC_NO_EFFECT;
#endif
#if __TBB_SOURCE_DIRECTLY_INCLUDED
    } else if (param == TBBMALLOC_INTERNAL_SOURCE_INCLUDED) {
        switch (value) {
        case 0: // used by dynamic library
        case 1: // used by static library or directly included sources
            usedBySrcIncluded = value;
            return TBBMALLOC_OK;
        default:
            return TBBMALLOC_INVALID_PARAM;
        }
#endif
    } else if (param == TBBMALLOC_SET_HUGE_SIZE_THRESHOLD) {
        defaultMemPool->extMemPool.loc.setHugeSizeThreshold((size_t)value);
        return TBBMALLOC_OK;
    }
    return TBBMALLOC_INVALID_PARAM;
}

extern "C" int scalable_allocation_command(int cmd, void *param)
{
    if (param)
        return TBBMALLOC_INVALID_PARAM;

    bool released = false;
    switch(cmd) {
    case TBBMALLOC_CLEAN_THREAD_BUFFERS:
        if (TLSData *tls = defaultMemPool->getTLS(/*create=*/false))
            released = tls->externalCleanup(/*cleanOnlyUnused*/false, /*cleanBins=*/true);
        break;
    case TBBMALLOC_CLEAN_ALL_BUFFERS:
        released = defaultMemPool->extMemPool.hardCachesCleanup();
        break;
    default:
        return TBBMALLOC_INVALID_PARAM;
    }
    return released ? TBBMALLOC_OK : TBBMALLOC_NO_EFFECT;
}
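
/* Usage sketch (illustrative only, excluded from the build): runtime tuning via
   scalable_allocation_mode and an explicit cache cleanup via
   scalable_allocation_command; the 512 MB soft limit is an arbitrary example value. */
#if 0
static void exampleAllocatorTuning() {
    // Ask the backend to keep no more than ~512 MB of unused memory cached.
    scalable_allocation_mode(TBBMALLOC_SET_SOFT_HEAP_LIMIT, 512*1024*1024);

    // Request huge pages; on platforms without support this reports TBBMALLOC_NO_EFFECT.
    scalable_allocation_mode(USE_HUGE_PAGES, 1);

    // Return memory cached in all internal buffers back to the backend/OS if possible.
    int res = scalable_allocation_command(TBBMALLOC_CLEAN_ALL_BUFFERS, NULL);
    (void)res;  // TBBMALLOC_OK if something was released, TBBMALLOC_NO_EFFECT otherwise
}
#endif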