1 /*
2 * FastAlloc.cpp
3 *
4 * This source file is part of the FoundationDB open source project
5 *
6 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
7 *
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20
21 #include "flow/FastAlloc.h"
22
23 #include "flow/ThreadPrimitives.h"
24 #include "flow/Trace.h"
25 #include "flow/Error.h"
26 #include "flow/Knobs.h"
27 #include "flow/flow.h"
28
29 #include <cstdint>
30 #include <unordered_map>
31
32 #ifdef WIN32
33 #include <windows.h>
34 #undef min
35 #undef max
36 #endif
37
38 #ifdef __linux__
39 #include <sys/mman.h>
40 #include <linux/mman.h>
41 #endif
42
43 #define FAST_ALLOCATOR_DEBUG 0
44
45 #ifdef _MSC_VER
46 // warning 4073 warns about "initializers put in library initialization area", which is our intent
47 #pragma warning (disable: 4073)
48 #pragma init_seg(lib)
49 #define INIT_SEG
50 #elif defined(__GNUG__)
51 #ifdef __linux__
52 #define INIT_SEG __attribute__ ((init_priority (1000)))
53 #elif defined(__APPLE__)
54 #pragma message "init_priority is not supported on this platform; will this be a problem?"
55 #define INIT_SEG
56 #else
57 #error Where am I?
58 #endif
59 #else
60 #error Port me? (init_seg(lib))
61 #endif
62
// Per-thread magazine state for each FastAllocator<Size> instantiation.
// INIT_SEG forces early (library-phase) construction so the allocator is
// usable from other static initializers.
template<int Size>
INIT_SEG thread_local typename FastAllocator<Size>::ThreadData FastAllocator<Size>::threadData;

// True once this thread has run initThread() for this size class.
template<int Size>
thread_local bool FastAllocator<Size>::threadInitialized = false;

#ifdef VALGRIND
// NOTE(review): appears to be a lock id for Valgrind annotations; it is not
// referenced elsewhere in this file — confirm usage in the header.
template<int Size>
unsigned long FastAllocator<Size>::vLock = 1;
#endif

// Global freelist head used only by the non-FASTALLOC_THREAD_SAFE paths of
// allocate()/release().
template<int Size>
void* FastAllocator<Size>::freelist = nullptr;

typedef void (*ThreadInitFunction)();

ThreadInitFunction threadInitFunction = 0; // See ThreadCleanup.cpp in the C binding
setFastAllocatorThreadInitFunction(ThreadInitFunction f)80 void setFastAllocatorThreadInitFunction( ThreadInitFunction f ) {
81 ASSERT( !threadInitFunction );
82 threadInitFunction = f;
83 }
84
// Bytes currently held by "huge" arenas.  Not modified in this file —
// presumably maintained by the Arena code; TODO confirm against callers.
int64_t g_hugeArenaMemory = 0;

// Throttling state for hugeArenaSample(): time of the last trace flush, and
// per-backtrace accumulators of (sample count, total sampled bytes).
double hugeArenaLastLogged = 0;
std::map<std::string, std::pair<int,int>> hugeArenaTraces;
89
hugeArenaSample(int size)90 void hugeArenaSample(int size) {
91 auto& info = hugeArenaTraces[platform::get_backtrace()];
92 info.first++;
93 info.second+=size;
94 if(now() - hugeArenaLastLogged > FLOW_KNOBS->HUGE_ARENA_LOGGING_INTERVAL) {
95 for(auto& it : hugeArenaTraces) {
96 TraceEvent("HugeArenaSample").detail("Count", it.second.first).detail("Size", it.second.second).detail("Backtrace", it.first);
97 }
98 hugeArenaLastLogged = now();
99 hugeArenaTraces.clear();
100 }
101 }
102
#ifdef ALLOC_INSTRUMENTATION
// Instrumentation globals; INIT_SEG forces construction before other static
// initializers that may allocate.
INIT_SEG std::map<const char*, AllocInstrInfo> allocInstr;
// Maps a sampled allocation's address to (backtrace hash, allocation size).
INIT_SEG std::unordered_map<int64_t, std::pair<uint32_t, size_t>> memSample;
// Maps a backtrace hash to its accumulated sample statistics.
INIT_SEG std::unordered_map<uint32_t, BackTraceAccount> backTraceLookup;
// Protects memSample and backTraceLookup.
INIT_SEG ThreadSpinLock memLock;
// Sampling granularity: on average one sample per ~1e7 allocated bytes (see
// the probability computation in recordAllocation()).
const size_t SAMPLE_BYTES = 1e7;
template<int Size>
volatile int32_t FastAllocator<Size>::pageCount;
// Reentrancy guard for recordAllocation()/recordDeallocation().
thread_local bool memSample_entered = false;
#endif

#ifdef ALLOC_INSTRUMENTATION_STDOUT
// Reentrancy guard for the printf-based instrumentation path.
thread_local bool inRecordAllocation = false;
#endif
117
recordAllocation(void * ptr,size_t size)118 void recordAllocation( void *ptr, size_t size ) {
119 #ifdef ALLOC_INSTRUMENTATION_STDOUT
120 if( inRecordAllocation )
121 return;
122 inRecordAllocation = true;
123 std::string trace = platform::get_backtrace();
124 printf("Alloc\t%p\t%d\t%s\n", ptr, size, trace.c_str());
125 inRecordAllocation = false;
126 #endif
127 #ifdef ALLOC_INSTRUMENTATION
128 if( memSample_entered )
129 return;
130 memSample_entered = true;
131
132 if(((double)rand()) / RAND_MAX < ((double)size) / SAMPLE_BYTES) {
133 void *buffer[100];
134 #if defined(__linux__)
135 int nptrs = backtrace( buffer, 100 );
136 #elif defined(_WIN32)
137 // We could be using fourth parameter to get a hash, but we'll do this
138 // in a unified way between platforms
139 int nptrs = CaptureStackBackTrace( 1, 100, buffer, NULL );
140 #else
141 #error Instrumentation not supported on this platform
142 #endif
143
144 uint32_t a = 0, b = 0;
145 if( nptrs > 0 ) {
146 hashlittle2( buffer, nptrs * sizeof(void *), &a, &b );
147 }
148
149 double countDelta = std::max(1.0, ((double)SAMPLE_BYTES) / size);
150 size_t sizeDelta = std::max(SAMPLE_BYTES, size);
151 ThreadSpinLockHolder holder( memLock );
152 auto it = backTraceLookup.find( a );
153 if( it == backTraceLookup.end() ) {
154 auto& bt = backTraceLookup[ a ];
155 bt.backTrace = new std::vector<void*>();
156 for (int j = 0; j < nptrs; j++) {
157 bt.backTrace->push_back( buffer[j] );
158 }
159 bt.totalSize = sizeDelta;
160 bt.count = countDelta;
161 bt.sampleCount = 1;
162 } else {
163 it->second.totalSize += sizeDelta;
164 it->second.count += countDelta;
165 it->second.sampleCount++;
166 }
167 memSample[(int64_t)ptr] = std::make_pair(a, size);
168 }
169 memSample_entered = false;
170 #endif
171 }
172
recordDeallocation(void * ptr)173 void recordDeallocation( void *ptr ) {
174 #ifdef ALLOC_INSTRUMENTATION_STDOUT
175 if( inRecordAllocation )
176 return;
177 printf("Dealloc\t%p\n", ptr);
178 inRecordAllocation = false;
179 #endif
180 #ifdef ALLOC_INSTRUMENTATION
181 if( memSample_entered ) // could this lead to deallocations not being recorded?
182 return;
183 memSample_entered = true;
184 {
185 ThreadSpinLockHolder holder( memLock );
186
187 auto it = memSample.find( (int64_t)ptr );
188 if( it == memSample.end() ) {
189 memSample_entered = false;
190 return;
191 }
192 auto bti = backTraceLookup.find( it->second.first );
193 ASSERT( bti != backTraceLookup.end() );
194
195 size_t sizeDelta = std::max(SAMPLE_BYTES, it->second.second);
196 double countDelta = std::max(1.0, ((double)SAMPLE_BYTES) / it->second.second);
197
198 bti->second.totalSize -= sizeDelta;
199 bti->second.count -= countDelta;
200 bti->second.sampleCount--;
201 memSample.erase( it );
202 }
203 memSample_entered = false;
204 #endif
205 }
206
207 template <int Size>
208 struct FastAllocator<Size>::GlobalData {
209 CRITICAL_SECTION mutex;
210 std::vector<void*> magazines; // These magazines are always exactly magazine_size ("full")
211 std::vector<std::pair<int, void*>> partial_magazines; // Magazines that are not "full" and their counts. Only created by releaseThreadMagazines().
212 long long totalMemory;
213 long long partialMagazineUnallocatedMemory;
214 long long activeThreads;
GlobalDataFastAllocator::GlobalData215 GlobalData() : totalMemory(0), partialMagazineUnallocatedMemory(0), activeThreads(0) {
216 InitializeCriticalSection(&mutex);
217 }
218 };
219
// Total bytes this size class has obtained from the system allocator.
// Read without taking the mutex, so the value may be slightly stale.
template <int Size>
long long FastAllocator<Size>::getTotalMemory() {
	return globalData()->totalMemory;
}
224
225 // This does not include memory held by various threads that's available for allocation
226 template <int Size>
getApproximateMemoryUnused()227 long long FastAllocator<Size>::getApproximateMemoryUnused() {
228 return globalData()->magazines.size() * magazine_size * Size + globalData()->partialMagazineUnallocatedMemory;
229 }
230
// Number of threads that have run initThread() for this size class and have
// not yet called releaseThreadMagazines().  Read without locking.
template <int Size>
long long FastAllocator<Size>::getActiveThreads() {
	return globalData()->activeThreads;
}
235
236 #if FAST_ALLOCATOR_DEBUG
getSizeCode(int i)237 static int64_t getSizeCode(int i) {
238 switch (i) {
239 case 16: return 1;
240 case 32: return 2;
241 case 64: return 3;
242 case 128: return 4;
243 case 256: return 5;
244 case 512: return 6;
245 case 1024: return 7;
246 case 2048: return 8;
247 case 4096: return 9;
248 case 8192: return 10;
249 default: return 11;
250 }
251 }
252 #endif
253
// Pop one Size-byte block from the calling thread's freelist, refilling from
// the locally cached alternate magazine, the global pool, or the system
// allocator (via getMagazine()) when the thread-local list is empty.
template<int Size>
void *FastAllocator<Size>::allocate() {
	if(!threadInitialized) {
		initThread();
	}

#if FASTALLOC_THREAD_SAFE
	ThreadData& thr = threadData;
	if (!thr.freelist) {
		ASSERT(thr.count == 0);
		// Prefer the thread's cached full magazine before touching the
		// global pool.
		if (thr.alternate) {
			thr.freelist = thr.alternate;
			thr.alternate = nullptr;
			thr.count = magazine_size;
		} else {
			getMagazine();
		}
	}
	--thr.count;
	void* p = thr.freelist;
#if VALGRIND
	VALGRIND_MAKE_MEM_DEFINED(p, sizeof(void*));
#endif
	// The first word of every free block stores the next-free pointer.
	thr.freelist = *(void**)p;
	ASSERT(!thr.freelist == (thr.count == 0)); // freelist is empty if and only if count is 0
	//check( p, true );
#else
	// NOTE(review): if freelist is empty here, getMagazine() refills
	// threadData.freelist, not this global freelist, and p stays null yet is
	// dereferenced below.  This non-thread-safe path looks broken — confirm
	// it is never built.
	void* p = freelist;
	if (!p) getMagazine();
#if VALGRIND
	VALGRIND_MAKE_MEM_DEFINED(p, sizeof(void*));
#endif
	freelist = *(void**)p;
#endif
#if VALGRIND
	VALGRIND_MALLOCLIKE_BLOCK( p, Size, 0, 0 );
#endif
#if defined(ALLOC_INSTRUMENTATION) || defined(ALLOC_INSTRUMENTATION_STDOUT)
	recordAllocation(p, Size);
#endif
	return p;
}
296
// Push a Size-byte block back onto the calling thread's freelist.  When the
// current magazine is full, it is rotated into the alternate slot, and the
// previous alternate (if any) is returned to the global pool.
template<int Size>
void FastAllocator<Size>::release(void *ptr) {
	if(!threadInitialized) {
		initThread();
	}

#if FASTALLOC_THREAD_SAFE
	ThreadData& thr = threadData;
	if (thr.count == magazine_size) {
		if (thr.alternate) // Two full magazines, return one
			releaseMagazine( thr.alternate );
		// The just-completed magazine becomes the alternate; start a new one.
		thr.alternate = thr.freelist;
		thr.freelist = nullptr;
		thr.count = 0;
	}

	ASSERT(!thr.freelist == (thr.count == 0)); // freelist is empty if and only if count is 0

	// Link ptr in: its first word stores the previous freelist head.
	++thr.count;
	*(void**)ptr = thr.freelist;
	//check(ptr, false);
	thr.freelist = ptr;
#else
	*(void**)ptr = freelist;
	freelist = ptr;
#endif

#if VALGRIND
	VALGRIND_FREELIKE_BLOCK( ptr, 0 );
#endif
#if defined(ALLOC_INSTRUMENTATION) || defined(ALLOC_INSTRUMENTATION_STDOUT)
	recordDeallocation( ptr );
#endif
}
331
// Debug-only sanity checks on a block being allocated (alloc == true) or
// freed (alloc == false).  Compiled out unless FAST_ALLOCATOR_DEBUG; relies
// on magazines being mapped at fixed, size-code-derived addresses (see the
// FAST_ALLOCATOR_DEBUG branch of getMagazine()).
template <int Size>
void FastAllocator<Size>::check(void* ptr, bool alloc) {
#if FAST_ALLOCATOR_DEBUG
	//if (ptr == (void*)0x400200180)
	//	printf("%c%p\n", alloc?'+':'-', ptr);

	// Check for pointers that aren't part of this FastAllocator
	if (ptr < (void*)(((getSizeCode(Size)<<11) + 0) * magazine_size*Size) ||
		ptr > (void*)(((getSizeCode(Size)<<11) + 4000) * magazine_size*Size) ||
		(int64_t(ptr)&(Size-1)))
	{
		printf("Bad ptr: %p\n", ptr);
		abort();
	}

	// Redundant freelist pointers to detect outright smashing of the freelist
	// (free blocks carry the next pointer duplicated in words 0 and 1).
	if (alloc) {
		if ( *((void**)ptr+1) != *(void**)ptr ) {
			printf("Freelist corruption? %p %p\n", *(void**)ptr, *((void**)ptr+1));
			abort();
		}
		*((void**)ptr+1) = (void*)0;
	} else {
		*((void**)ptr+1) = *(void**)ptr;
	}

	// Track allocated/free status in a completely separate data structure to detect double frees
	// NOTE(review): i+1 > isFreed.size() compares int with size_t; fine while
	// i is non-negative, which the range check above should guarantee.
	int i = (int)((int64_t)ptr - ((getSizeCode(Size)<<11) + 0) * magazine_size*Size) / Size;
	static std::vector<bool> isFreed;
	if (!alloc) {
		if (i+1 > isFreed.size())
			isFreed.resize(i+1, false);
		if (isFreed[i]) {
			printf("Double free: %p\n", ptr);
			abort();
		}
		isFreed[i] = true;
	} else {
		if (i+1 > isFreed.size()) {
			printf("Allocate beyond end: %p\n", ptr);
			abort();
		}
		if (!isFreed[i]) {
			printf("Allocate non-freed: %p\n", ptr);
			abort();
		}
		isFreed[i] = false;
	}
#endif
}
382
383 template <int Size>
initThread()384 void FastAllocator<Size>::initThread() {
385 threadInitialized = true;
386 if (threadInitFunction) {
387 threadInitFunction();
388 }
389
390 EnterCriticalSection(&globalData()->mutex);
391 ++globalData()->activeThreads;
392 LeaveCriticalSection(&globalData()->mutex);
393
394 threadData.freelist = nullptr;
395 threadData.alternate = nullptr;
396 threadData.count = 0;
397 }
398
// Refill the calling thread's empty freelist with one magazine of blocks,
// taken (in order of preference) from the global full-magazine pool, the
// global partial-magazine pool, or freshly from the system allocator.
template <int Size>
void FastAllocator<Size>::getMagazine() {
	ASSERT(threadInitialized);
	ASSERT(!threadData.freelist && !threadData.alternate && threadData.count == 0);

	EnterCriticalSection(&globalData()->mutex);
	if (globalData()->magazines.size()) {
		// Reuse a cached full magazine.
		void* m = globalData()->magazines.back();
		globalData()->magazines.pop_back();
		LeaveCriticalSection(&globalData()->mutex);
		threadData.freelist = m;
		threadData.count = magazine_size;
		return;
	} else if (globalData()->partial_magazines.size()) {
		// Reuse a partial magazine left behind by an exiting thread.
		std::pair<int, void*> p = globalData()->partial_magazines.back();
		globalData()->partial_magazines.pop_back();
		globalData()->partialMagazineUnallocatedMemory -= p.first * Size;
		LeaveCriticalSection(&globalData()->mutex);
		threadData.freelist = p.second;
		threadData.count = p.first;
		return;
	}
	globalData()->totalMemory += magazine_size*Size;
	LeaveCriticalSection(&globalData()->mutex);

	// Allocate a new page of data from the system allocator
#ifdef ALLOC_INSTRUMENTATION
	interlockedIncrement(&pageCount);
#endif

	void** block = nullptr;
#if FAST_ALLOCATOR_DEBUG
	// Debug builds map each magazine at a fixed address derived from the size
	// code so check() can validate pointers by range.
#ifdef WIN32
	static int alt = 0; alt++;
	block = (void**)VirtualAllocEx( GetCurrentProcess(),
		(void*)( ((getSizeCode(Size)<<11) + alt) * magazine_size*Size), magazine_size*Size, MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE );
#else
	static int alt = 0; alt++;
	void* desiredBlock = (void*)( ((getSizeCode(Size)<<11) + alt) * magazine_size*Size);
	block = (void**)mmap( desiredBlock, magazine_size*Size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 );
	ASSERT( block == desiredBlock );
#endif
#else
	// FIXME: We should be able to allocate larger magazine sizes here if we
	// detect that the underlying system supports hugepages. Using hugepages
	// with smaller-than-2MiB magazine sizes strands memory. See issue #909.
	if(FLOW_KNOBS && g_nondeterministic_random && g_nondeterministic_random->random01() < (magazine_size * Size)/FLOW_KNOBS->FAST_ALLOC_LOGGING_BYTES) {
		TraceEvent("GetMagazineSample").detail("Size", Size).backtrace();
	}
	block = (void **)::allocate(magazine_size * Size, false);
#endif

	// Thread the new blocks into a singly linked freelist: word 0 of each free
	// block points at the next block (duplicated into word 1 for check()'s
	// debug-build corruption detection).
	//void** block = new void*[ magazine_size * PSize ];
	for(int i=0; i<magazine_size-1; i++) {
		block[i*PSize+1] = block[i*PSize] = &block[(i+1)*PSize];
		check( &block[i*PSize], false );
	}

	block[(magazine_size-1)*PSize+1] = block[(magazine_size-1)*PSize] = nullptr;
	check( &block[(magazine_size-1)*PSize], false );
	threadData.freelist = block;
	threadData.count = magazine_size;
}
// Return one full magazine to the global pool.
template <int Size>
void FastAllocator<Size>::releaseMagazine(void* mag) {
	ASSERT(threadInitialized);
	EnterCriticalSection(&globalData()->mutex);
	globalData()->magazines.push_back(mag);
	LeaveCriticalSection(&globalData()->mutex);
}
// Hand this thread's cached magazines back to the global pool (called at
// thread shutdown via releaseAllThreadMagazines()).  The current, possibly
// partial, freelist goes to partial_magazines; the alternate magazine — which
// release() only ever fills completely — goes to the full-magazine pool.
template <int Size>
void FastAllocator<Size>::releaseThreadMagazines() {
	if(threadInitialized) {
		// Clear the flag first; a later allocation on this thread would
		// re-run initThread().
		threadInitialized = false;
		ThreadData& thr = threadData;

		EnterCriticalSection(&globalData()->mutex);
		if (thr.freelist || thr.alternate) {
			if (thr.freelist) {
				ASSERT(thr.count > 0 && thr.count <= magazine_size);
				globalData()->partial_magazines.push_back( std::make_pair(thr.count, thr.freelist) );
				globalData()->partialMagazineUnallocatedMemory += thr.count * Size;
			}
			if (thr.alternate) {
				globalData()->magazines.push_back(thr.alternate);
			}
		}
		--globalData()->activeThreads;
		LeaveCriticalSection(&globalData()->mutex);

		thr.count = 0;
		thr.alternate = nullptr;
		thr.freelist = nullptr;
	}
}
494
releaseAllThreadMagazines()495 void releaseAllThreadMagazines() {
496 FastAllocator<16>::releaseThreadMagazines();
497 FastAllocator<32>::releaseThreadMagazines();
498 FastAllocator<64>::releaseThreadMagazines();
499 FastAllocator<128>::releaseThreadMagazines();
500 FastAllocator<256>::releaseThreadMagazines();
501 FastAllocator<512>::releaseThreadMagazines();
502 FastAllocator<1024>::releaseThreadMagazines();
503 FastAllocator<2048>::releaseThreadMagazines();
504 FastAllocator<4096>::releaseThreadMagazines();
505 FastAllocator<8192>::releaseThreadMagazines();
506 }
507
getTotalUnusedAllocatedMemory()508 int64_t getTotalUnusedAllocatedMemory() {
509 int64_t unusedMemory = 0;
510
511 unusedMemory += FastAllocator<16>::getApproximateMemoryUnused();
512 unusedMemory += FastAllocator<32>::getApproximateMemoryUnused();
513 unusedMemory += FastAllocator<64>::getApproximateMemoryUnused();
514 unusedMemory += FastAllocator<128>::getApproximateMemoryUnused();
515 unusedMemory += FastAllocator<256>::getApproximateMemoryUnused();
516 unusedMemory += FastAllocator<512>::getApproximateMemoryUnused();
517 unusedMemory += FastAllocator<1024>::getApproximateMemoryUnused();
518 unusedMemory += FastAllocator<2048>::getApproximateMemoryUnused();
519 unusedMemory += FastAllocator<4096>::getApproximateMemoryUnused();
520 unusedMemory += FastAllocator<8192>::getApproximateMemoryUnused();
521
522 return unusedMemory;
523 }
524
// Explicit instantiations for the supported power-of-two size classes (this
// translation unit is the only place the member definitions are compiled).
template class FastAllocator<16>;
template class FastAllocator<32>;
template class FastAllocator<64>;
template class FastAllocator<128>;
template class FastAllocator<256>;
template class FastAllocator<512>;
template class FastAllocator<1024>;
template class FastAllocator<2048>;
template class FastAllocator<4096>;
template class FastAllocator<8192>;
535
536