/*
 * FastAlloc.cpp
 *
 * This source file is part of the FoundationDB open source project
 *
 * Copyright 2013-2018 Apple Inc. and the FoundationDB project authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "flow/FastAlloc.h"

#include "flow/ThreadPrimitives.h"
#include "flow/Trace.h"
#include "flow/Error.h"
#include "flow/Knobs.h"
#include "flow/flow.h"

#include <cstdint>
#include <unordered_map>

#ifdef WIN32
#include <windows.h>
#undef min
#undef max
#endif

#ifdef __linux__
#include <sys/mman.h>
#include <linux/mman.h>
#endif

#define FAST_ALLOCATOR_DEBUG 0

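// The allocator's static and thread-local state below is tagged so that it is
// constructed in the library-initialization phase, ahead of ordinary static
// initializers that may already allocate through FastAllocator.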
#ifdef _MSC_VER
// warning 4073 warns about "initializers put in library initialization area", which is our intent
#pragma warning (disable: 4073)
#pragma init_seg(lib)
#define INIT_SEG
#elif defined(__GNUG__)
#ifdef __linux__
#define INIT_SEG __attribute__ ((init_priority (1000)))
#elif defined(__APPLE__)
#pragma message "init_priority is not supported on this platform; will this be a problem?"
#define INIT_SEG
#else
#error Where am I?
#endif
#else
#error Port me? (init_seg(lib))
#endif

template<int Size>
INIT_SEG thread_local typename FastAllocator<Size>::ThreadData FastAllocator<Size>::threadData;

template<int Size>
thread_local bool FastAllocator<Size>::threadInitialized = false;

#ifdef VALGRIND
template<int Size>
unsigned long FastAllocator<Size>::vLock = 1;
#endif

template<int Size>
void* FastAllocator<Size>::freelist = nullptr;

typedef void (*ThreadInitFunction)();

ThreadInitFunction threadInitFunction = nullptr;  // See ThreadCleanup.cpp in the C binding
void setFastAllocatorThreadInitFunction( ThreadInitFunction f ) {
	ASSERT( !threadInitFunction );
	threadInitFunction = f;
}

int64_t g_hugeArenaMemory = 0;

double hugeArenaLastLogged = 0;
std::map<std::string, std::pair<int,int>> hugeArenaTraces;

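// Records one sample per huge-arena allocation, keyed by backtrace, and
// periodically (every HUGE_ARENA_LOGGING_INTERVAL seconds) flushes the
// aggregated per-backtrace counts and byte totals as HugeArenaSample events.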
void hugeArenaSample(int size) {
	auto& info = hugeArenaTraces[platform::get_backtrace()];
	info.first++;
	info.second += size;
	if(now() - hugeArenaLastLogged > FLOW_KNOBS->HUGE_ARENA_LOGGING_INTERVAL) {
		for(auto& it : hugeArenaTraces) {
			TraceEvent("HugeArenaSample").detail("Count", it.second.first).detail("Size", it.second.second).detail("Backtrace", it.first);
		}
		hugeArenaLastLogged = now();
		hugeArenaTraces.clear();
	}
}

#ifdef ALLOC_INSTRUMENTATION
INIT_SEG std::map<const char*, AllocInstrInfo> allocInstr;
INIT_SEG std::unordered_map<int64_t, std::pair<uint32_t, size_t>> memSample;
INIT_SEG std::unordered_map<uint32_t, BackTraceAccount> backTraceLookup;
INIT_SEG ThreadSpinLock memLock;
const size_t SAMPLE_BYTES = 1e7;
template<int Size>
volatile int32_t FastAllocator<Size>::pageCount;
thread_local bool memSample_entered = false;
#endif

#ifdef ALLOC_INSTRUMENTATION_STDOUT
thread_local bool inRecordAllocation = false;
#endif

void recordAllocation( void *ptr, size_t size ) {
#ifdef ALLOC_INSTRUMENTATION_STDOUT
	if( inRecordAllocation )
		return;
	inRecordAllocation = true;
	std::string trace = platform::get_backtrace();
	printf("Alloc\t%p\t%zu\t%s\n", ptr, size, trace.c_str());  // %zu: size is a size_t
	inRecordAllocation = false;
#endif
#ifdef ALLOC_INSTRUMENTATION
	if( memSample_entered )
		return;
	memSample_entered = true;

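	// Sample an allocation with probability proportional to its size
	// (size / SAMPLE_BYTES), then weight each sample by the reciprocal of that
	// probability (countDelta, sizeDelta below) so the per-backtrace totals
	// remain unbiased estimates of the true allocation counts and bytes.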
	if(((double)rand()) / RAND_MAX < ((double)size) / SAMPLE_BYTES) {
		void *buffer[100];
#if defined(__linux__)
		int nptrs = backtrace( buffer, 100 );
#elif defined(_WIN32)
		// We could be using fourth parameter to get a hash, but we'll do this
		//  in a unified way between platforms
		int nptrs = CaptureStackBackTrace( 1, 100, buffer, NULL );
#else
#error Instrumentation not supported on this platform
#endif

		uint32_t a = 0, b = 0;
		if( nptrs > 0 ) {
			hashlittle2( buffer, nptrs * sizeof(void *), &a, &b );
		}

		double countDelta = std::max(1.0, ((double)SAMPLE_BYTES) / size);
		size_t sizeDelta = std::max(SAMPLE_BYTES, size);
		ThreadSpinLockHolder holder( memLock );
		auto it = backTraceLookup.find( a );
		if( it == backTraceLookup.end() ) {
			auto& bt = backTraceLookup[ a ];
			bt.backTrace = new std::vector<void*>();
			for (int j = 0; j < nptrs; j++) {
				bt.backTrace->push_back( buffer[j] );
			}
			bt.totalSize = sizeDelta;
			bt.count = countDelta;
			bt.sampleCount = 1;
		} else {
			it->second.totalSize += sizeDelta;
			it->second.count += countDelta;
			it->second.sampleCount++;
		}
		memSample[(int64_t)ptr] = std::make_pair(a, size);
	}
	memSample_entered = false;
#endif
}

void recordDeallocation( void *ptr ) {
#ifdef ALLOC_INSTRUMENTATION_STDOUT
	if( inRecordAllocation )
		return;
	inRecordAllocation = true;  // guard against re-entrant allocation hooks, mirroring recordAllocation
	printf("Dealloc\t%p\n", ptr);
	inRecordAllocation = false;
#endif
#ifdef ALLOC_INSTRUMENTATION
	if( memSample_entered ) // could this lead to deallocations not being recorded?
		return;
	memSample_entered = true;
	{
		ThreadSpinLockHolder holder( memLock );

		auto it = memSample.find( (int64_t)ptr );
		if( it == memSample.end() ) {
			memSample_entered = false;
			return;
		}
		auto bti = backTraceLookup.find( it->second.first );
		ASSERT( bti != backTraceLookup.end() );

		size_t sizeDelta = std::max(SAMPLE_BYTES, it->second.second);
		double countDelta = std::max(1.0, ((double)SAMPLE_BYTES) / it->second.second);

		bti->second.totalSize -= sizeDelta;
		bti->second.count -= countDelta;
		bti->second.sampleCount--;
		memSample.erase( it );
	}
	memSample_entered = false;
#endif
}

template <int Size>
struct FastAllocator<Size>::GlobalData {
	CRITICAL_SECTION mutex;
	std::vector<void*> magazines;   // These magazines are always exactly magazine_size ("full")
	std::vector<std::pair<int, void*>> partial_magazines;  // Magazines that are not "full" and their counts.  Only created by releaseThreadMagazines().
	long long totalMemory;
	long long partialMagazineUnallocatedMemory;
	long long activeThreads;
	GlobalData() : totalMemory(0), partialMagazineUnallocatedMemory(0), activeThreads(0) {
		InitializeCriticalSection(&mutex);
	}
};

template <int Size>
long long FastAllocator<Size>::getTotalMemory() {
	return globalData()->totalMemory;
}

// This does not include memory held by various threads that's available for allocation
template <int Size>
long long FastAllocator<Size>::getApproximateMemoryUnused() {
	return globalData()->magazines.size() * magazine_size * Size + globalData()->partialMagazineUnallocatedMemory;
}

template <int Size>
long long FastAllocator<Size>::getActiveThreads() {
	return globalData()->activeThreads;
}

#if FAST_ALLOCATOR_DEBUG
static int64_t getSizeCode(int i) {
	switch (i) {
		case 16: return 1;
		case 32: return 2;
		case 64: return 3;
		case 128: return 4;
		case 256: return 5;
		case 512: return 6;
		case 1024: return 7;
		case 2048: return 8;
		case 4096: return 9;
		case 8192: return 10;
		default: return 11;
	}
}
#endif

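// Each thread owns up to two magazines of blocks: a current freelist holding
// `count` blocks and, once releases refill it, a second full "alternate"
// magazine.  allocate() pops from the thread-local freelist and only touches
// the global pool (getMagazine) when both are empty, so the common path takes
// no locks.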
template<int Size>
void *FastAllocator<Size>::allocate() {
	if(!threadInitialized) {
		initThread();
	}

#if FASTALLOC_THREAD_SAFE
	ThreadData& thr = threadData;
	if (!thr.freelist) {
		ASSERT(thr.count == 0);
		if (thr.alternate) {
			thr.freelist = thr.alternate;
			thr.alternate = nullptr;
			thr.count = magazine_size;
		} else {
			getMagazine();
		}
	}
	--thr.count;
	void* p = thr.freelist;
#if VALGRIND
	VALGRIND_MAKE_MEM_DEFINED(p, sizeof(void*));
#endif
	thr.freelist = *(void**)p;
	ASSERT(!thr.freelist == (thr.count == 0)); // freelist is empty if and only if count is 0
	//check( p, true );
#else
	void* p = freelist;
	if (!p) {
		getMagazine();
		p = freelist;  // reload after the refill rather than dereferencing a stale null pointer
	}
#if VALGRIND
	VALGRIND_MAKE_MEM_DEFINED(p, sizeof(void*));
#endif
	freelist = *(void**)p;
#endif
#if VALGRIND
	VALGRIND_MALLOCLIKE_BLOCK( p, Size, 0, 0 );
#endif
#if defined(ALLOC_INSTRUMENTATION) || defined(ALLOC_INSTRUMENTATION_STDOUT)
	recordAllocation(p, Size);
#endif
	return p;
}

template<int Size>
void FastAllocator<Size>::release(void *ptr) {
	if(!threadInitialized) {
		initThread();
	}

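	// When the thread-local freelist fills to a complete magazine, any
	// previously banked full magazine is returned to the global pool and the
	// full freelist becomes the alternate.  Keeping one full magazine in
	// reserve avoids bouncing a magazine to and from the global pool when a
	// thread's live-allocation count oscillates around a magazine boundary.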
#if FASTALLOC_THREAD_SAFE
	ThreadData& thr = threadData;
	if (thr.count == magazine_size) {
		if (thr.alternate)		// Two full magazines, return one
			releaseMagazine( thr.alternate );
		thr.alternate = thr.freelist;
		thr.freelist = nullptr;
		thr.count = 0;
	}

	ASSERT(!thr.freelist == (thr.count == 0)); // freelist is empty if and only if count is 0

	++thr.count;
	*(void**)ptr = thr.freelist;
	//check(ptr, false);
	thr.freelist = ptr;
#else
	*(void**)ptr = freelist;
	freelist = ptr;
#endif

#if VALGRIND
	VALGRIND_FREELIKE_BLOCK( ptr, 0 );
#endif
#if defined(ALLOC_INSTRUMENTATION) || defined(ALLOC_INSTRUMENTATION_STDOUT)
	recordDeallocation( ptr );
#endif
}

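// Debug-only consistency checks.  These rely on FAST_ALLOCATOR_DEBUG builds
// mapping magazines at deterministic addresses derived from getSizeCode(Size)
// (see getMagazine below): each free block stores its next-pointer twice so
// that freelist smashing can be detected, and a separate bitmap of
// free/allocated states catches double frees and allocations of slots that
// were never freed.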
template <int Size>
void FastAllocator<Size>::check(void* ptr, bool alloc) {
#if FAST_ALLOCATOR_DEBUG
	//if (ptr == (void*)0x400200180)
	//	printf("%c%p\n", alloc?'+':'-', ptr);

	// Check for pointers that aren't part of this FastAllocator
	if (ptr < (void*)(((getSizeCode(Size)<<11) + 0) * magazine_size*Size) ||
		ptr > (void*)(((getSizeCode(Size)<<11) + 4000) * magazine_size*Size) ||
		(int64_t(ptr)&(Size-1)))
	{
		printf("Bad ptr: %p\n", ptr);
		abort();
	}

	// Redundant freelist pointers to detect outright smashing of the freelist
	if (alloc) {
		if ( *((void**)ptr+1) != *(void**)ptr ) {
			printf("Freelist corruption? %p %p\n", *(void**)ptr, *((void**)ptr+1));
			abort();
		}
		*((void**)ptr+1) = (void*)0;
	} else {
		*((void**)ptr+1) = *(void**)ptr;
	}

	// Track allocated/free status in a completely separate data structure to detect double frees
	int i = (int)((int64_t)ptr - ((getSizeCode(Size)<<11) + 0) * magazine_size*Size) / Size;
	static std::vector<bool> isFreed;
	if (!alloc) {
		if (i+1 > isFreed.size())
			isFreed.resize(i+1, false);
		if (isFreed[i]) {
			printf("Double free: %p\n", ptr);
			abort();
		}
		isFreed[i] = true;
	} else {
		if (i+1 > isFreed.size()) {
			printf("Allocate beyond end: %p\n", ptr);
			abort();
		}
		if (!isFreed[i]) {
			printf("Allocate non-freed: %p\n", ptr);
			abort();
		}
		isFreed[i] = false;
	}
#endif
}

template <int Size>
void FastAllocator<Size>::initThread() {
	threadInitialized = true;
	if (threadInitFunction) {
		threadInitFunction();
	}

	EnterCriticalSection(&globalData()->mutex);
	++globalData()->activeThreads;
	LeaveCriticalSection(&globalData()->mutex);

	threadData.freelist = nullptr;
	threadData.alternate = nullptr;
	threadData.count = 0;
}

template <int Size>
void FastAllocator<Size>::getMagazine() {
	ASSERT(threadInitialized);
	ASSERT(!threadData.freelist && !threadData.alternate && threadData.count == 0);

	EnterCriticalSection(&globalData()->mutex);
	if (globalData()->magazines.size()) {
		void* m = globalData()->magazines.back();
		globalData()->magazines.pop_back();
		LeaveCriticalSection(&globalData()->mutex);
		threadData.freelist = m;
		threadData.count = magazine_size;
		return;
	} else if (globalData()->partial_magazines.size()) {
		std::pair<int, void*> p = globalData()->partial_magazines.back();
		globalData()->partial_magazines.pop_back();
		globalData()->partialMagazineUnallocatedMemory -= p.first * Size;
		LeaveCriticalSection(&globalData()->mutex);
		threadData.freelist = p.second;
		threadData.count = p.first;
		return;
	}
	globalData()->totalMemory += magazine_size*Size;
	LeaveCriticalSection(&globalData()->mutex);

	// Allocate a new page of data from the system allocator
	#ifdef ALLOC_INSTRUMENTATION
	interlockedIncrement(&pageCount);
	#endif

	void** block = nullptr;
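	// In debug builds, each magazine is mapped at a deterministic address
	// derived from getSizeCode(Size) and a per-magazine counter, so check()
	// can verify by arithmetic alone that a pointer belongs to this size class.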
#if FAST_ALLOCATOR_DEBUG
#ifdef WIN32
	static int alt = 0; alt++;
	block = (void**)VirtualAllocEx( GetCurrentProcess(),
									(void*)( ((getSizeCode(Size)<<11) + alt) * magazine_size*Size), magazine_size*Size, MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE );
#else
	static int alt = 0; alt++;
	void* desiredBlock = (void*)( ((getSizeCode(Size)<<11) + alt) * magazine_size*Size);
	block = (void**)mmap( desiredBlock, magazine_size*Size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 );
	ASSERT( block == desiredBlock );
#endif
#else
	// FIXME: We should be able to allocate larger magazine sizes here if we
	// detect that the underlying system supports hugepages.  Using hugepages
	// with smaller-than-2MiB magazine sizes strands memory.  See issue #909.
	if(FLOW_KNOBS && g_nondeterministic_random && g_nondeterministic_random->random01() < (magazine_size * Size)/FLOW_KNOBS->FAST_ALLOC_LOGGING_BYTES) {
		TraceEvent("GetMagazineSample").detail("Size", Size).backtrace();
	}
	block = (void **)::allocate(magazine_size * Size, false);
#endif

	//void** block = new void*[ magazine_size * PSize ];
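	// Thread the fresh block into a singly linked freelist: each free slot's
	// first word points at the next slot, and the second word duplicates it
	// so the debug-build check() can detect freelist corruption.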
	for(int i=0; i<magazine_size-1; i++) {
		block[i*PSize+1] = block[i*PSize] = &block[(i+1)*PSize];
		check( &block[i*PSize], false );
	}

	block[(magazine_size-1)*PSize+1] = block[(magazine_size-1)*PSize] = nullptr;
	check( &block[(magazine_size-1)*PSize], false );
	threadData.freelist = block;
	threadData.count = magazine_size;
}
template <int Size>
void FastAllocator<Size>::releaseMagazine(void* mag) {
	ASSERT(threadInitialized);
	EnterCriticalSection(&globalData()->mutex);
	globalData()->magazines.push_back(mag);
	LeaveCriticalSection(&globalData()->mutex);
}
template <int Size>
void FastAllocator<Size>::releaseThreadMagazines() {
	if(threadInitialized) {
		threadInitialized = false;
		ThreadData& thr = threadData;

		EnterCriticalSection(&globalData()->mutex);
		if (thr.freelist || thr.alternate) {
			if (thr.freelist) {
				ASSERT(thr.count > 0 && thr.count <= magazine_size);
				globalData()->partial_magazines.push_back( std::make_pair(thr.count, thr.freelist) );
				globalData()->partialMagazineUnallocatedMemory += thr.count * Size;
			}
			if (thr.alternate) {
				globalData()->magazines.push_back(thr.alternate);
			}
		}
		--globalData()->activeThreads;
		LeaveCriticalSection(&globalData()->mutex);

		thr.count = 0;
		thr.alternate = nullptr;
		thr.freelist = nullptr;
	}
}

void releaseAllThreadMagazines() {
	FastAllocator<16>::releaseThreadMagazines();
	FastAllocator<32>::releaseThreadMagazines();
	FastAllocator<64>::releaseThreadMagazines();
	FastAllocator<128>::releaseThreadMagazines();
	FastAllocator<256>::releaseThreadMagazines();
	FastAllocator<512>::releaseThreadMagazines();
	FastAllocator<1024>::releaseThreadMagazines();
	FastAllocator<2048>::releaseThreadMagazines();
	FastAllocator<4096>::releaseThreadMagazines();
	FastAllocator<8192>::releaseThreadMagazines();
}

int64_t getTotalUnusedAllocatedMemory() {
	int64_t unusedMemory = 0;

	unusedMemory += FastAllocator<16>::getApproximateMemoryUnused();
	unusedMemory += FastAllocator<32>::getApproximateMemoryUnused();
	unusedMemory += FastAllocator<64>::getApproximateMemoryUnused();
	unusedMemory += FastAllocator<128>::getApproximateMemoryUnused();
	unusedMemory += FastAllocator<256>::getApproximateMemoryUnused();
	unusedMemory += FastAllocator<512>::getApproximateMemoryUnused();
	unusedMemory += FastAllocator<1024>::getApproximateMemoryUnused();
	unusedMemory += FastAllocator<2048>::getApproximateMemoryUnused();
	unusedMemory += FastAllocator<4096>::getApproximateMemoryUnused();
	unusedMemory += FastAllocator<8192>::getApproximateMemoryUnused();

	return unusedMemory;
}

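// Explicit instantiations for the supported size classes.  A minimal usage
// sketch (callers normally go through the wrappers declared in
// flow/FastAlloc.h rather than the raw interface shown here):
//
//   void* p = FastAllocator<64>::allocate();  // a 64-byte block
//   // ... use the block ...
//   FastAllocator<64>::release(p);            // must return to the same size class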
template class FastAllocator<16>;
template class FastAllocator<32>;
template class FastAllocator<64>;
template class FastAllocator<128>;
template class FastAllocator<256>;
template class FastAllocator<512>;
template class FastAllocator<1024>;
template class FastAllocator<2048>;
template class FastAllocator<4096>;
template class FastAllocator<8192>;