xref: /dragonfly/lib/libc/stdlib/dmalloc.c (revision 17183580)
1 /*
2  * DMALLOC.C	- Dillon's malloc
3  *
4  * Copyright (c) 2011,2017 The DragonFly Project. All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 /*
37  * This module implements a modified slab allocator as a drop-in replacement
38  * for the libc malloc().  The slab algorithm has been adjusted to support
39  * dynamic sizing of slabs which effectively allows slabs to be used for
40  * allocations of any size.  Because of this we neither have a small-block
41  * allocator or a big-block allocator and the code paths are simplified.
42  *
43  * To support dynamic slab sizing available user virtual memory is broken
44  * down into ~1024 regions.  Each region has fixed slab size whos value is
45  * set when the region is opened up for use.  The free() path simply applies
46  * a mask based on the region to the pointer to acquire the base of the
47  * governing slab structure.
48  *
49  * Regions[NREGIONS]	(1024)
50  *
51  * Slab management and locking is done on a per-zone basis.
52  *
53  *	Alloc Size	Chunking        Number of zones
54  *	0-127		8		16
55  *	128-255		16		8
56  *	256-511		32		8
57  *	512-1023	64		8
58  *	1024-2047	128		8
59  *	2048-4095	256		8
60  *	4096-8191	512		8
61  *	8192-16383	1024		8
62  *	16384-32767	2048		8
63  *	32768-65535	4096		8
64  *	... continues forever ...	4 zones
65  *
66  *	For a 2^63 memory space each doubling >= 64K is broken down into
67  *	4 chunking zones, so we support 88 + (48 * 4) = 280 zones.
68  *
69  *			   API FEATURES AND SIDE EFFECTS
70  *
71  *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
72  *	Above that power-of-2 sized allocations are page-aligned.  Non
73  *	power-of-2 sized allocations are aligned the same as the chunk
74  *	size for their zone.
75  *    + ability to allocate arbitrarily large chunks of memory
76  *    + realloc will reuse the passed pointer if possible, within the
77  *	limitations of the zone chunking.
78  *
79  * On top of the slab allocator we also implement a 16-entry-per-thread
80  * magazine cache for allocations <= NOMSLABSIZE.
81  *
82  *				FUTURE FEATURES
83  *
84  *    + [better] garbage collection
85  *    + better initial sizing.
86  *
87  * TUNING
88  *
89  * The value of the environment variable MALLOC_OPTIONS is a character string
90  * containing various flags to tune nmalloc.  Upper case letters enabled
91  * or increase the feature, lower case disables or decreases the feature.
92  *
93  * U		Enable UTRACE for all operations, observable with ktrace.
94  *		Diasbled by default.
95  *
96  * Z		Zero out allocations, otherwise allocations (except for
97  *		calloc) will contain garbage.
98  *		Disabled by default.
99  *
100  * H		Pass a hint with madvise() about unused pages.
101  *		Disabled by default.
102  *		Not currently implemented.
103  *
104  * F		Disable local per-thread caching.
105  *		Disabled by default.
106  *
107  * C		Increase (decrease) how much excess cache to retain.
108  *		Set to 4 by default.
109  */
110 
111 /* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */
112 
113 #ifndef STANDALONE_DEBUG
114 #include "libc_private.h"
115 #endif
116 
117 #include <sys/param.h>
118 #include <sys/types.h>
119 #include <sys/mman.h>
120 #include <sys/queue.h>
121 #include <sys/ktrace.h>
122 #include <stdio.h>
123 #include <stdint.h>
124 #include <stdlib.h>
125 #include <stdarg.h>
126 #include <stddef.h>
127 #include <unistd.h>
128 #include <string.h>
129 #include <fcntl.h>
130 #include <errno.h>
131 #include <pthread.h>
132 #include <limits.h>
133 
134 #include <machine/atomic.h>
135 #include <machine/cpufunc.h>
136 
137 #ifdef STANDALONE_DEBUG
138 void _nmalloc_thr_init(void);
139 #else
140 #include "spinlock.h"
141 #include "un-namespace.h"
142 #endif
143 
144 #ifndef MAP_SIZEALIGN
145 #define MAP_SIZEALIGN	0
146 #endif
147 
148 #if SSIZE_MAX == 0x7FFFFFFF
149 #define ADDRBITS	32
150 #define UVM_BITS	32	/* worst case */
151 #else
152 #define ADDRBITS	64
153 #define UVM_BITS	48	/* worst case XXX */
154 #endif
155 
156 #if LONG_MAX == 0x7FFFFFFF
157 #define LONG_BITS	32
158 #define LONG_BITS_SHIFT	5
159 #else
160 #define LONG_BITS	64
161 #define LONG_BITS_SHIFT	6
162 #endif
163 
164 #define LOCKEDPTR	((void *)(intptr_t)-1)
165 
166 /*
167  * Regions[]
168  */
169 #define NREGIONS_BITS	10
170 #define NREGIONS	(1 << NREGIONS_BITS)
171 #define NREGIONS_MASK	(NREGIONS - 1)
172 #define NREGIONS_SHIFT	(UVM_BITS - NREGIONS_BITS)
173 #define NREGIONS_SIZE	(1LU << NREGIONS_SHIFT)
174 
175 typedef struct region *region_t;
176 typedef struct slglobaldata *slglobaldata_t;
177 typedef struct slab *slab_t;
178 
179 struct region {
180 	uintptr_t	mask;
181 	slab_t		slab;	/* conditional out of band slab */
182 };
183 
184 static struct region Regions[NREGIONS];
185 
186 /*
187  * Number of chunking zones available
188  */
189 #define CHUNKFACTOR	8
190 #if ADDRBITS == 32
191 #define NZONES		(16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
192 #else
193 #define NZONES		(16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
194 #endif
195 
196 static int MaxChunks[NZONES];
197 
198 #define NDEPOTS		8		/* must be power of 2 */
199 
200 /*
201  * Maximum number of chunks per slab, governed by the allocation bitmap in
202  * each slab.  The maximum is reduced for large chunk sizes.
203  */
204 #define MAXCHUNKS	(LONG_BITS * LONG_BITS)
205 #define MAXCHUNKS_BITS	(LONG_BITS_SHIFT * LONG_BITS_SHIFT)
206 #define LITSLABSIZE	(32 * 1024)
207 #define NOMSLABSIZE	(2 * 1024 * 1024)
208 #define BIGSLABSIZE	(128 * 1024 * 1024)
209 
210 #define ZALLOC_SLAB_MAGIC	0x736c6162	/* magic sanity */
211 
212 TAILQ_HEAD(slab_list, slab);
213 
214 /*
215  * A slab structure
216  */
217 struct slab {
218 	struct slab	*next;		/* slabs with available space */
219 	TAILQ_ENTRY(slab) entry;
220 	int32_t		magic;		/* magic number for sanity check */
221 	u_int		navail;		/* number of free elements available */
222 	u_int		nmax;
223 	u_int		free_bit;	/* free hint bitno */
224 	u_int		free_index;	/* free hint index */
225 	u_long		bitmap[LONG_BITS]; /* free chunks */
226 	size_t		slab_size;	/* size of entire slab */
227 	size_t		chunk_size;	/* chunk size for validation */
228 	int		zone_index;
229 	enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
230 	int		flags;
231 	region_t	region;		/* related region */
232 	char		*chunks;	/* chunk base */
233 	slglobaldata_t	slgd;		/* localized to thread else NULL */
234 };
235 
236 /*
237  * per-thread data + global depot
238  *
239  * NOTE: The magazine shortcut is only used for per-thread data.
240  */
241 #define NMAGSHORTCUT	16
242 
243 struct slglobaldata {
244 	spinlock_t	lock;		/* only used by slglobaldepot */
245 	struct zoneinfo {
246 		slab_t	avail_base;
247 		slab_t	empty_base;
248 		int	best_region;
249 		int	mag_index;
250 		int	avail_count;
251 		int	empty_count;
252 		void	*mag_shortcut[NMAGSHORTCUT];
253 	} zone[NZONES];
254 	struct slab_list full_zones;	/* via entry */
255 	int		masked;
256 	int		biggest_index;
257 	size_t		nslabs;
258 };
259 
260 #define SLAB_ZEROD		0x0001
261 
262 /*
263  * Misc constants.  Note that allocations that are exact multiples of
264  * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
265  * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
266  */
267 #define MIN_CHUNK_SIZE		8		/* in bytes */
268 #define MIN_CHUNK_MASK		(MIN_CHUNK_SIZE - 1)
269 
270 #define SAFLAG_ZERO	0x00000001
271 
272 /*
273  * The WEIRD_ADDR is used as known text to copy into free objects to
274  * try to create deterministic failure cases if the data is accessed after
275  * free.
276  *
277  * WARNING: A limited number of spinlocks are available, BIGXSIZE should
278  *	    not be larger then 64.
279  */
280 #ifdef INVARIANTS
281 #define WEIRD_ADDR      0xdeadc0de
282 #endif
283 
284 /*
285  * Thread control
286  */
287 
288 #define MASSERT(exp)	do { if (__predict_false(!(exp)))	\
289 				_mpanic("assertion: %s in %s",	\
290 				#exp, __func__);		\
291 			    } while (0)
292 
293 /*
294  * With this attribute set, do not require a function call for accessing
295  * this variable when the code is compiled -fPIC.
296  *
297  * Must be empty for libc_rtld (similar to __thread)
298  */
299 #if defined(__LIBC_RTLD)
300 #define TLS_ATTRIBUTE
301 #else
302 #define TLS_ATTRIBUTE __attribute__ ((tls_model ("initial-exec")));
303 #endif
304 
305 static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
306 static pthread_key_t thread_malloc_key;
307 static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
308 static struct slglobaldata slglobaldepot;
309 
310 static int opt_madvise = 0;
311 static int opt_free = 0;
312 static int opt_cache = 4;
313 static int opt_utrace = 0;
314 static int g_malloc_flags = 0;
315 static int malloc_panic;
316 
317 #ifdef INVARIANTS
318 static const int32_t weirdary[16] = {
319 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
320 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
321 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
322 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
323 };
324 #endif
325 
326 static void *memalloc(size_t size, int flags);
327 static void *memrealloc(void *ptr, size_t size);
328 static void memfree(void *ptr, int);
329 static int memalign(void **memptr, size_t alignment, size_t size);
330 static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
331 static void slabfree(slab_t slab);
332 static void slabterm(slglobaldata_t slgd, slab_t slab);
333 static void *_vmem_alloc(int ri, size_t slab_size);
334 static void _vmem_free(void *ptr, size_t slab_size);
335 static void _mpanic(const char *ctl, ...) __printflike(1, 2);
336 #ifndef STANDALONE_DEBUG
337 static void malloc_init(void) __constructor(101);
338 #else
339 static void malloc_init(void) __constructor(101);
340 #endif
341 
342 
343 struct nmalloc_utrace {
344 	void *p;
345 	size_t s;
346 	void *r;
347 };
348 
349 #define UTRACE(a, b, c)						\
350 	if (opt_utrace) {					\
351 		struct nmalloc_utrace ut = {			\
352 			.p = (a),				\
353 			.s = (b),				\
354 			.r = (c)				\
355 		};						\
356 		utrace(&ut, sizeof(ut));			\
357 	}
358 
359 #ifdef INVARIANTS
360 /*
361  * If enabled any memory allocated without M_ZERO is initialized to -1.
362  */
363 static int  use_malloc_pattern;
364 #endif
365 
366 static void
malloc_init(void)367 malloc_init(void)
368 {
369 	const char *p = NULL;
370 
371 	TAILQ_INIT(&slglobal.full_zones);
372 
373 	Regions[0].mask = -1; /* disallow activity in lowest region */
374 
375 	if (issetugid() == 0)
376 		p = getenv("MALLOC_OPTIONS");
377 
378 	for (; p != NULL && *p != '\0'; p++) {
379 		switch(*p) {
380 		case 'u':
381 			opt_utrace = 0;
382 			break;
383 		case 'U':
384 			opt_utrace = 1;
385 			break;
386 		case 'h':
387 			opt_madvise = 0;
388 			break;
389 		case 'H':
390 			opt_madvise = 1;
391 			break;
392 		case 'c':
393 			if (opt_cache > 0)
394 				--opt_cache;
395 			break;
396 		case 'C':
397 			++opt_cache;
398 			break;
399 		case 'f':
400 			opt_free = 0;
401 			break;
402 		case 'F':
403 			opt_free = 1;
404 			break;
405 		case 'z':
406 			g_malloc_flags = 0;
407 			break;
408 		case 'Z':
409 			g_malloc_flags = SAFLAG_ZERO;
410 			break;
411 		default:
412 			break;
413 		}
414 	}
415 
416 	UTRACE((void *) -1, 0, NULL);
417 }
418 
419 /*
420  * We have to install a handler for nmalloc thread teardowns when
421  * the thread is created.  We cannot delay this because destructors in
422  * sophisticated userland programs can call malloc() for the first time
423  * during their thread exit.
424  *
425  * This routine is called directly from pthreads.
426  */
427 static void _nmalloc_thr_init_once(void);
428 static void _nmalloc_thr_destructor(void *thrp);
429 
430 void
_nmalloc_thr_init(void)431 _nmalloc_thr_init(void)
432 {
433 	static int did_init;
434 
435 	TAILQ_INIT(&slglobal.full_zones);
436 
437 	if (slglobal.masked)
438 		return;
439 
440 	slglobal.masked = 1;
441 	if (did_init == 0) {
442 		did_init = 1;
443 		pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
444 	}
445 	pthread_setspecific(thread_malloc_key, &slglobal);
446 	slglobal.masked = 0;
447 }
448 
449 void
_nmalloc_thr_prepfork(void)450 _nmalloc_thr_prepfork(void)
451 {
452 	if (__isthreaded)
453 		_SPINLOCK(&slglobaldepot.lock);
454 }
455 
456 void
_nmalloc_thr_parentfork(void)457 _nmalloc_thr_parentfork(void)
458 {
459 	if (__isthreaded)
460 		_SPINUNLOCK(&slglobaldepot.lock);
461 }
462 
463 void
_nmalloc_thr_childfork(void)464 _nmalloc_thr_childfork(void)
465 {
466 	if (__isthreaded)
467 		_SPINUNLOCK(&slglobaldepot.lock);
468 }
469 
470 /*
471  * Called just once
472  */
473 static void
_nmalloc_thr_init_once(void)474 _nmalloc_thr_init_once(void)
475 {
476 	/* ignore error from stub if not threaded */
477 	pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
478 }
479 
480 /*
481  * Called for each thread undergoing exit
482  *
483  * Move all of the thread's slabs into a depot.
484  */
485 static void
_nmalloc_thr_destructor(void * thrp)486 _nmalloc_thr_destructor(void *thrp)
487 {
488 	slglobaldata_t slgd = thrp;
489 	struct zoneinfo *zinfo;
490 	slab_t slab;
491 	void *ptr;
492 	int i;
493 	int j;
494 
495 	slgd->masked = 1;
496 
497 	for (i = 0; i <= slgd->biggest_index; i++) {
498 		zinfo = &slgd->zone[i];
499 
500 		while ((j = zinfo->mag_index) > 0) {
501 			--j;
502 			ptr = zinfo->mag_shortcut[j];
503 			zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
504 			zinfo->mag_index = j;
505 			memfree(ptr, 0);
506 		}
507 
508 		while ((slab = zinfo->empty_base) != NULL) {
509 			zinfo->empty_base = slab->next;
510 			--zinfo->empty_count;
511 			slabterm(slgd, slab);
512 		}
513 
514 		while ((slab = zinfo->avail_base) != NULL) {
515 			zinfo->avail_base = slab->next;
516 			--zinfo->avail_count;
517 			slabterm(slgd, slab);
518 		}
519 
520 		while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
521 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
522 			slabterm(slgd, slab);
523 		}
524 	}
525 }
526 
527 /*
528  * Calculate the zone index for the allocation request size and set the
529  * allocation request size to that particular zone's chunk size.
530  *
531  * Minimum alignment is 16 bytes for allocations >= 16 bytes to conform
532  * with malloc requirements for intel/amd.
533  */
534 static __inline int
zoneindex(size_t * bytes,size_t * chunking)535 zoneindex(size_t *bytes, size_t *chunking)
536 {
537 	size_t n = (size_t)*bytes;
538 	size_t x;
539 	size_t c;
540 	int i;
541 
542 	if (n < 128) {
543 		if (n < 16) {
544 			*bytes = n = (n + 7) & ~7;
545 			*chunking = 8;
546 			return(n / 8 - 1);	/* 8 byte chunks, 2 zones */
547 		} else {
548 			*bytes = n = (n + 15) & ~15;
549 			*chunking = 16;
550 			return(n / 16 + 2);	/* 16 byte chunks, 8 zones */
551 		}
552 	}
553 	if (n < 4096) {
554 		x = 256;
555 		c = x / (CHUNKFACTOR * 2);
556 		i = 16;
557 	} else {
558 		x = 8192;
559 		c = x / (CHUNKFACTOR * 2);
560 		i = 16 + CHUNKFACTOR * 5;  /* 256->512,1024,2048,4096,8192 */
561 	}
562 	while (n >= x) {
563 		x <<= 1;
564 		c <<= 1;
565 		i += CHUNKFACTOR;
566 		if (x == 0)
567 			_mpanic("slaballoc: byte value too high");
568 	}
569 	*bytes = n = roundup2(n, c);
570 	*chunking = c;
571 	return (i + n / c - CHUNKFACTOR);
572 #if 0
573 	*bytes = n = (n + c - 1) & ~(c - 1);
574 	*chunking = c;
575 	return (n / c + i);
576 
577 	if (n < 256) {
578 		*bytes = n = (n + 15) & ~15;
579 		*chunking = 16;
580 		return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);
581 	}
582 	if (n < 8192) {
583 		if (n < 512) {
584 			*bytes = n = (n + 31) & ~31;
585 			*chunking = 32;
586 			return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);
587 		}
588 		if (n < 1024) {
589 			*bytes = n = (n + 63) & ~63;
590 			*chunking = 64;
591 			return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);
592 		}
593 		if (n < 2048) {
594 			*bytes = n = (n + 127) & ~127;
595 			*chunking = 128;
596 			return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);
597 		}
598 		if (n < 4096) {
599 			*bytes = n = (n + 255) & ~255;
600 			*chunking = 256;
601 			return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);
602 		}
603 		*bytes = n = (n + 511) & ~511;
604 		*chunking = 512;
605 		return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);
606 	}
607 	if (n < 16384) {
608 		*bytes = n = (n + 1023) & ~1023;
609 		*chunking = 1024;
610 		return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);
611 	}
612 	if (n < 32768) {				/* 16384-32767 */
613 		*bytes = n = (n + 2047) & ~2047;
614 		*chunking = 2048;
615 		return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
616 	}
617 	if (n < 65536) {
618 		*bytes = n = (n + 4095) & ~4095;	/* 32768-65535 */
619 		*chunking = 4096;
620 		return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);
621 	}
622 
623 	x = 131072;
624 	c = 8192;
625 	i = CHUNKINGLO*10 - 1;
626 
627 	while (n >= x) {
628 		x <<= 1;
629 		c <<= 1;
630 		i += CHUNKINGHI;
631 		if (x == 0)
632 			_mpanic("slaballoc: byte value too high");
633 	}
634 	*bytes = n = (n + c - 1) & ~(c - 1);
635 	*chunking = c;
636 	return (n / c + i);
637 #endif
638 }
639 
640 /*
641  * malloc() - call internal slab allocator
642  */
643 void *
__malloc(size_t size)644 __malloc(size_t size)
645 {
646 	void *ptr;
647 
648 	ptr = memalloc(size, 0);
649 	if (ptr == NULL)
650 		errno = ENOMEM;
651 	else
652 		UTRACE(0, size, ptr);
653 	return(ptr);
654 }
655 
656 /*
657  * calloc() - call internal slab allocator
658  */
659 void *
__calloc(size_t number,size_t size)660 __calloc(size_t number, size_t size)
661 {
662 	void *ptr;
663 
664 	ptr = memalloc(number * size, SAFLAG_ZERO);
665 	if (ptr == NULL)
666 		errno = ENOMEM;
667 	else
668 		UTRACE(0, number * size, ptr);
669 	return(ptr);
670 }
671 
672 /*
673  * realloc() (SLAB ALLOCATOR)
674  *
675  * We do not attempt to optimize this routine beyond reusing the same
676  * pointer if the new size fits within the chunking of the old pointer's
677  * zone.
678  */
679 void *
__realloc(void * ptr,size_t size)680 __realloc(void *ptr, size_t size)
681 {
682 	void *ret;
683 
684 	if (ptr == NULL)
685 		ret = memalloc(size, 0);
686 	else
687 		ret = memrealloc(ptr, size);
688 	if (ret == NULL)
689 		errno = ENOMEM;
690 	else
691 		UTRACE(ptr, size, ret);
692 	return(ret);
693 }
694 
695 /*
696  * aligned_alloc()
697  *
698  * Allocate (size) bytes with a alignment of (alignment).
699  */
700 void *
__aligned_alloc(size_t alignment,size_t size)701 __aligned_alloc(size_t alignment, size_t size)
702 {
703 	void *ptr;
704 	int rc;
705 
706 	ptr = NULL;
707 	rc = memalign(&ptr, alignment, size);
708 	if (rc)
709 		errno = rc;
710 
711 	return (ptr);
712 }
713 
714 /*
715  * posix_memalign()
716  *
717  * Allocate (size) bytes with a alignment of (alignment), where (alignment)
718  * is a power of 2 >= sizeof(void *).
719  */
720 int
__posix_memalign(void ** memptr,size_t alignment,size_t size)721 __posix_memalign(void **memptr, size_t alignment, size_t size)
722 {
723 	int rc;
724 
725 	/*
726 	 * OpenGroup spec issue 6 check
727 	 */
728 	if (alignment < sizeof(void *)) {
729 		*memptr = NULL;
730 		return(EINVAL);
731 	}
732 
733 	rc = memalign(memptr, alignment, size);
734 
735 	return (rc);
736 }
737 
738 /*
739  * The slab allocator will allocate on power-of-2 boundaries up to at least
740  * PAGE_SIZE.  Otherwise we use the zoneindex mechanic to find a zone
741  * matching the requirements.
742  */
743 static int
memalign(void ** memptr,size_t alignment,size_t size)744 memalign(void **memptr, size_t alignment, size_t size)
745 {
746 
747 	if (alignment < 1) {
748 		*memptr = NULL;
749 		return(EINVAL);
750 	}
751 
752 	/*
753 	 * OpenGroup spec issue 6 check
754 	 */
755 	if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
756 		*memptr = NULL;
757 		return(EINVAL);
758 	}
759 
760 	/*
761 	 * XXX for now just find the nearest power of 2 >= size and also
762 	 * >= alignment and allocate that.
763 	 */
764 	while (alignment < size) {
765 		alignment <<= 1;
766 		if (alignment == 0)
767 			_mpanic("posix_memalign: byte value too high");
768 	}
769 	*memptr = memalloc(alignment, 0);
770 	return(*memptr ? 0 : ENOMEM);
771 }
772 
773 /*
774  * free() (SLAB ALLOCATOR) - do the obvious
775  */
776 void
__free(void * ptr)777 __free(void *ptr)
778 {
779 	if (ptr) {
780 		UTRACE(ptr, 0, 0);
781 		memfree(ptr, 0);
782 	}
783 }
784 
785 /*
786  * memalloc()	(SLAB ALLOCATOR)
787  *
788  *	Allocate memory via the slab allocator.
789  */
790 static void *
memalloc(size_t size,int flags)791 memalloc(size_t size, int flags)
792 {
793 	slglobaldata_t slgd;
794 	struct zoneinfo *zinfo;
795 	slab_t slab;
796 	size_t chunking;
797 	int bmi;
798 	int bno;
799 	u_long *bmp;
800 	int zi;
801 #ifdef INVARIANTS
802 	int i;
803 #endif
804 	int j;
805 	char *obj;
806 
807 	/*
808 	 * If 0 bytes is requested we have to return a unique pointer, allocate
809 	 * at least one byte.
810 	 */
811 	if (size == 0)
812 		size = 1;
813 
814 	/* Capture global flags */
815 	flags |= g_malloc_flags;
816 
817 	/* Compute allocation zone; zoneindex will panic on excessive sizes */
818 	zi = zoneindex(&size, &chunking);
819 	MASSERT(zi < NZONES);
820 	if (size == 0)
821 		return(NULL);
822 
823 	/*
824 	 * Try magazine shortcut first
825 	 */
826 	slgd = &slglobal;
827 	zinfo = &slgd->zone[zi];
828 
829 	if ((j = zinfo->mag_index) != 0) {
830 		zinfo->mag_index = --j;
831 		obj = zinfo->mag_shortcut[j];
832 		zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
833 		if (flags & SAFLAG_ZERO)
834 			bzero(obj, size);
835 		return obj;
836 	}
837 
838 	/*
839 	 * Locate a slab with available space.  If no slabs are available
840 	 * back-off to the empty list and if we still come up dry allocate
841 	 * a new slab (which will try the depot first).
842 	 */
843 retry:
844 	if ((slab = zinfo->avail_base) == NULL) {
845 		if ((slab = zinfo->empty_base) == NULL) {
846 			/*
847 			 * Still dry
848 			 */
849 			slab = slaballoc(zi, chunking, size);
850 			if (slab == NULL)
851 				return(NULL);
852 			slab->next = zinfo->avail_base;
853 			zinfo->avail_base = slab;
854 			++zinfo->avail_count;
855 			slab->state = AVAIL;
856 			if (slgd->biggest_index < zi)
857 				slgd->biggest_index = zi;
858 			++slgd->nslabs;
859 		} else {
860 			/*
861 			 * Pulled from empty list
862 			 */
863 			zinfo->empty_base = slab->next;
864 			slab->next = zinfo->avail_base;
865 			zinfo->avail_base = slab;
866 			++zinfo->avail_count;
867 			slab->state = AVAIL;
868 			--zinfo->empty_count;
869 		}
870 	}
871 
872 	/*
873 	 * Allocate a chunk out of the slab.  HOT PATH
874 	 *
875 	 * Only the thread owning the slab can allocate out of it.
876 	 *
877 	 * NOTE: The last bit in the bitmap is always marked allocated so
878 	 *	 we cannot overflow here.
879 	 */
880 	bno = slab->free_bit;
881 	bmi = slab->free_index;
882 	bmp = &slab->bitmap[bmi];
883 	if (*bmp & (1LU << bno)) {
884 		atomic_clear_long(bmp, 1LU << bno);
885 		obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
886 		slab->free_bit = (bno + 1) & (LONG_BITS - 1);
887 		atomic_add_int(&slab->navail, -1);
888 		if (flags & SAFLAG_ZERO)
889 			bzero(obj, size);
890 		return (obj);
891 	}
892 
893 	/*
894 	 * Allocate a chunk out of a slab.  COLD PATH
895 	 */
896 	if (slab->navail == 0) {
897 		zinfo->avail_base = slab->next;
898 		--zinfo->avail_count;
899 		slab->state = FULL;
900 		TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
901 		goto retry;
902 	}
903 
904 	while (bmi < LONG_BITS) {
905 		bmp = &slab->bitmap[bmi];
906 		if (*bmp) {
907 			bno = bsflong(*bmp);
908 			atomic_clear_long(bmp, 1LU << bno);
909 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
910 					     size;
911 			slab->free_index = bmi;
912 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
913 			atomic_add_int(&slab->navail, -1);
914 			if (flags & SAFLAG_ZERO)
915 				bzero(obj, size);
916 			return (obj);
917 		}
918 		++bmi;
919 	}
920 	bmi = 0;
921 	while (bmi < LONG_BITS) {
922 		bmp = &slab->bitmap[bmi];
923 		if (*bmp) {
924 			bno = bsflong(*bmp);
925 			atomic_clear_long(bmp, 1LU << bno);
926 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
927 					     size;
928 			slab->free_index = bmi;
929 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
930 			atomic_add_int(&slab->navail, -1);
931 			if (flags & SAFLAG_ZERO)
932 				bzero(obj, size);
933 			return (obj);
934 		}
935 		++bmi;
936 	}
937 	_mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
938 	/* not reached */
939 	return NULL;
940 }
941 
942 /*
943  * Reallocate memory within the chunk
944  */
945 static void *
memrealloc(void * ptr,size_t nsize)946 memrealloc(void *ptr, size_t nsize)
947 {
948 	region_t region;
949 	slab_t slab;
950 	size_t osize;
951 	char *obj;
952 	int flags = 0;
953 
954 	/*
955 	 * If 0 bytes is requested we have to return a unique pointer, allocate
956 	 * at least one byte.
957 	 */
958 	if (nsize == 0)
959 		nsize = 1;
960 
961 	/* Capture global flags */
962 	flags |= g_malloc_flags;
963 
964 	/*
965 	 * Locate the zone by looking up the dynamic slab size mask based
966 	 * on the memory region the allocation resides in.
967 	 */
968 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
969 	if ((slab = region->slab) == NULL)
970 		slab = (void *)((uintptr_t)ptr & region->mask);
971 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
972 	osize = slab->chunk_size;
973 	if (nsize <= osize) {
974 		if (osize < 32 || nsize >= osize / 2) {
975 			obj = ptr;
976 			if ((flags & SAFLAG_ZERO) && nsize < osize)
977 				bzero(obj + nsize, osize - nsize);
978 			return(obj);
979 		}
980 	}
981 
982 	/*
983 	 * Otherwise resize the object
984 	 */
985 	obj = memalloc(nsize, 0);
986 	if (obj) {
987 		if (nsize > osize)
988 			nsize = osize;
989 		bcopy(ptr, obj, nsize);
990 		memfree(ptr, 0);
991 	}
992 	return (obj);
993 }
994 
995 /*
996  * free (SLAB ALLOCATOR)
997  *
998  * Free a memory block previously allocated by malloc.
999  *
1000  * MPSAFE
1001  */
1002 static void
memfree(void * ptr,int flags)1003 memfree(void *ptr, int flags)
1004 {
1005 	region_t region;
1006 	slglobaldata_t slgd;
1007 	slab_t slab;
1008 	slab_t stmp;
1009 	slab_t *slabp;
1010 	int bmi;
1011 	int bno;
1012 	int j;
1013 	u_long *bmp;
1014 
1015 	/*
1016 	 * Locate the zone by looking up the dynamic slab size mask based
1017 	 * on the memory region the allocation resides in.
1018 	 *
1019 	 * WARNING!  The slab may be owned by another thread!
1020 	 */
1021 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
1022 	if ((slab = region->slab) == NULL)
1023 		slab = (void *)((uintptr_t)ptr & region->mask);
1024 	MASSERT(slab != NULL);
1025 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
1026 
1027 #ifdef INVARIANTS
1028 	/*
1029 	 * Put weird data into the memory to detect modifications after
1030 	 * freeing, illegal pointer use after freeing (we should fault on
1031 	 * the odd address), and so forth.
1032 	 */
1033 	if (slab->chunk_size < sizeof(weirdary))
1034 		bcopy(weirdary, ptr, slab->chunk_size);
1035 	else
1036 		bcopy(weirdary, ptr, sizeof(weirdary));
1037 #endif
1038 	slgd = &slglobal;
1039 
1040 	/*
1041 	 * Use mag_shortcut[] when possible
1042 	 */
1043 	if (slgd->masked == 0 && slab->chunk_size <= NOMSLABSIZE) {
1044 		struct zoneinfo *zinfo;
1045 
1046 		zinfo = &slgd->zone[slab->zone_index];
1047 		j = zinfo->mag_index;
1048 		if (j < NMAGSHORTCUT) {
1049 			zinfo->mag_shortcut[j] = ptr;
1050 			zinfo->mag_index = j + 1;
1051 			return;
1052 		}
1053 	}
1054 
1055 	/*
1056 	 * Free to slab and increment navail.  We can delay incrementing
1057 	 * navail to prevent the slab from being destroyed out from under
1058 	 * us while we do other optimizations.
1059 	 */
1060 	bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
1061 	bmi = bno >> LONG_BITS_SHIFT;
1062 	bno &= (LONG_BITS - 1);
1063 	bmp = &slab->bitmap[bmi];
1064 
1065 	MASSERT(bmi >= 0 && bmi < slab->nmax);
1066 	MASSERT((*bmp & (1LU << bno)) == 0);
1067 	atomic_set_long(bmp, 1LU << bno);
1068 
1069 	if (slab->slgd == slgd) {
1070 		/*
1071 		 * We can only do the following if we own the slab.  Note
1072 		 * that navail can be incremented by any thread even if
1073 		 * we own the slab.
1074 		 */
1075 		struct zoneinfo *zinfo;
1076 
1077 		atomic_add_int(&slab->navail, 1);
1078 		if (slab->free_index > bmi) {
1079 			slab->free_index = bmi;
1080 			slab->free_bit = bno;
1081 		} else if (slab->free_index == bmi &&
1082 			   slab->free_bit > bno) {
1083 			slab->free_bit = bno;
1084 		}
1085 		zinfo = &slgd->zone[slab->zone_index];
1086 
1087 		/*
1088 		 * Freeing an object from a full slab makes it less than
1089 		 * full.  The slab must be moved to the available list.
1090 		 *
1091 		 * If the available list has too many slabs, release some
1092 		 * to the depot.
1093 		 */
1094 		if (slab->state == FULL) {
1095 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
1096 			slab->state = AVAIL;
1097 			stmp = zinfo->avail_base;
1098 			slab->next = stmp;
1099 			zinfo->avail_base = slab;
1100 			++zinfo->avail_count;
1101 			while (zinfo->avail_count > opt_cache) {
1102 				slab = zinfo->avail_base;
1103 				zinfo->avail_base = slab->next;
1104 				--zinfo->avail_count;
1105 				slabterm(slgd, slab);
1106 			}
1107 			goto done;
1108 		}
1109 
1110 		/*
1111 		 * If the slab becomes completely empty dispose of it in
1112 		 * some manner.  By default each thread caches up to 4
1113 		 * empty slabs.  Only small slabs are cached.
1114 		 */
1115 		if (slab->navail == slab->nmax && slab->state == AVAIL) {
1116 			/*
1117 			 * Remove slab from available queue
1118 			 */
1119 			slabp = &zinfo->avail_base;
1120 			while ((stmp = *slabp) != slab)
1121 				slabp = &stmp->next;
1122 			*slabp = slab->next;
1123 			--zinfo->avail_count;
1124 
1125 			if (opt_free || opt_cache == 0) {
1126 				/*
1127 				 * If local caching is disabled cache the
1128 				 * slab in the depot (or free it).
1129 				 */
1130 				slabterm(slgd, slab);
1131 			} else if (slab->slab_size > BIGSLABSIZE) {
1132 				/*
1133 				 * We do not try to retain large slabs
1134 				 * in per-thread caches.
1135 				 */
1136 				slabterm(slgd, slab);
1137 			} else if (zinfo->empty_count > opt_cache) {
1138 				/*
1139 				 * We have too many slabs cached, but
1140 				 * instead of freeing this one free
1141 				 * an empty slab that's been idle longer.
1142 				 *
1143 				 * (empty_count does not change)
1144 				 */
1145 				stmp = zinfo->empty_base;
1146 				slab->state = EMPTY;
1147 				slab->next = stmp->next;
1148 				zinfo->empty_base = slab;
1149 				slabterm(slgd, stmp);
1150 			} else {
1151 				/*
1152 				 * Cache the empty slab in our thread local
1153 				 * empty list.
1154 				 */
1155 				++zinfo->empty_count;
1156 				slab->state = EMPTY;
1157 				slab->next = zinfo->empty_base;
1158 				zinfo->empty_base = slab;
1159 			}
1160 		}
1161 	} else if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1162 		slglobaldata_t sldepot;
1163 
1164 		/*
1165 		 * If freeing to a slab owned by the global depot, and
1166 		 * the slab becomes completely EMPTY, try to move it to
1167 		 * the correct list.
1168 		 */
1169 		sldepot = &slglobaldepot;
1170 		if (__isthreaded)
1171 			_SPINLOCK(&sldepot->lock);
1172 		if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1173 			struct zoneinfo *zinfo;
1174 
1175 			/*
1176 			 * Move the slab to the empty list
1177 			 */
1178 			MASSERT(slab->state == AVAIL);
1179 			atomic_add_int(&slab->navail, 1);
1180 			zinfo = &sldepot->zone[slab->zone_index];
1181 			slabp = &zinfo->avail_base;
1182 			while (slab != *slabp)
1183 				slabp = &(*slabp)->next;
1184 			*slabp = slab->next;
1185 			--zinfo->avail_count;
1186 
1187 			/*
1188 			 * Clean out excessive empty entries from the
1189 			 * depot.
1190 			 */
1191 			slab->state = EMPTY;
1192 			slab->next = zinfo->empty_base;
1193 			zinfo->empty_base = slab;
1194 			++zinfo->empty_count;
1195 			while (zinfo->empty_count > opt_cache) {
1196 				slab = zinfo->empty_base;
1197 				zinfo->empty_base = slab->next;
1198 				--zinfo->empty_count;
1199 				slab->state = UNKNOWN;
1200 				if (__isthreaded)
1201 					_SPINUNLOCK(&sldepot->lock);
1202 				slabfree(slab);
1203 				if (__isthreaded)
1204 					_SPINLOCK(&sldepot->lock);
1205 			}
1206 		} else {
1207 			atomic_add_int(&slab->navail, 1);
1208 		}
1209 		if (__isthreaded)
1210 			_SPINUNLOCK(&sldepot->lock);
1211 	} else {
1212 		/*
1213 		 * We can't act on the slab other than by adjusting navail
1214 		 * (and the bitmap which we did in the common code at the
1215 		 * top).
1216 		 */
1217 		atomic_add_int(&slab->navail, 1);
1218 	}
1219 done:
1220 	;
1221 }
1222 
1223 /*
1224  * Allocate a new slab holding objects of size chunk_size.
1225  */
1226 static slab_t
slaballoc(int zi,size_t chunking,size_t chunk_size)1227 slaballoc(int zi, size_t chunking, size_t chunk_size)
1228 {
1229 	slglobaldata_t slgd;
1230 	slglobaldata_t sldepot;
1231 	struct zoneinfo *zinfo;
1232 	region_t region;
1233 	void *save;
1234 	slab_t slab;
1235 	size_t slab_desire;
1236 	size_t slab_size;
1237 	size_t region_mask;
1238 	uintptr_t chunk_offset;
1239 	ssize_t maxchunks;
1240 	ssize_t tmpchunks;
1241 	int ispower2;
1242 	int power;
1243 	int ri;
1244 	int rx;
1245 	int nswath;
1246 	int j;
1247 
1248 	/*
1249 	 * First look in the depot.  Any given zone in the depot may be
1250 	 * locked by being set to -1.  We have to do this instead of simply
1251 	 * removing the entire chain because removing the entire chain can
1252 	 * cause racing threads to allocate local slabs for large objects,
1253 	 * resulting in a large VSZ.
1254 	 */
1255 	slgd = &slglobal;
1256 	sldepot = &slglobaldepot;
1257 	zinfo = &sldepot->zone[zi];
1258 
1259 	if (zinfo->avail_base) {
1260 		if (__isthreaded)
1261 			_SPINLOCK(&sldepot->lock);
1262 		slab = zinfo->avail_base;
1263 		if (slab) {
1264 			MASSERT(slab->slgd == NULL);
1265 			slab->slgd = slgd;
1266 			zinfo->avail_base = slab->next;
1267 			--zinfo->avail_count;
1268 			if (__isthreaded)
1269 				_SPINUNLOCK(&sldepot->lock);
1270 			return slab;
1271 		}
1272 		if (__isthreaded)
1273 			_SPINUNLOCK(&sldepot->lock);
1274 	}
1275 
1276 	/*
1277 	 * Nothing in the depot, allocate a new slab by locating or assigning
1278 	 * a region and then using the system virtual memory allocator.
1279 	 */
1280 	slab = NULL;
1281 
1282 	/*
1283 	 * Calculate the start of the data chunks relative to the start
1284 	 * of the slab.  If chunk_size is a power of 2 we guarantee
1285 	 * power of 2 alignment.  If it is not we guarantee alignment
1286 	 * to the chunk size.
1287 	 */
1288 	if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
1289 		ispower2 = 1;
1290 		chunk_offset = roundup2(sizeof(*slab), chunk_size);
1291 	} else {
1292 		ispower2 = 0;
1293 		chunk_offset = sizeof(*slab) + chunking - 1;
1294 		chunk_offset -= chunk_offset % chunking;
1295 	}
1296 
1297 	/*
1298 	 * Calculate a reasonable number of chunks for the slab.
1299 	 *
1300 	 * Once initialized the MaxChunks[] array can only ever be
1301 	 * reinitialized to the same value.
1302 	 */
1303 	maxchunks = MaxChunks[zi];
1304 	if (maxchunks == 0) {
1305 		/*
1306 		 * First calculate how many chunks would fit in 1/1024
1307 		 * available memory.  This is around 2MB on a 32 bit
1308 		 * system and 128G on a 64-bit (48-bits available) system.
1309 		 */
1310 		maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
1311 			    (ssize_t)chunk_size;
1312 		if (maxchunks <= 0)
1313 			maxchunks = 1;
1314 
1315 		/*
1316 		 * A slab cannot handle more than MAXCHUNKS chunks, but
1317 		 * limit us to approximately MAXCHUNKS / 2 here because
1318 		 * we may have to expand maxchunks when we calculate the
1319 		 * actual power-of-2 slab.
1320 		 */
1321 		if (maxchunks > MAXCHUNKS / 2)
1322 			maxchunks = MAXCHUNKS / 2;
1323 
1324 		/*
1325 		 * Try to limit the slabs to BIGSLABSIZE (~128MB).  Larger
1326 		 * slabs will be created if the allocation does not fit.
1327 		 */
1328 		if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
1329 			tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
1330 				    (ssize_t)chunk_size;
1331 			if (tmpchunks <= 0)
1332 				tmpchunks = 1;
1333 			if (maxchunks > tmpchunks)
1334 				maxchunks = tmpchunks;
1335 		}
1336 
1337 		/*
1338 		 * If the slab calculates to greater than 2MB see if we
1339 		 * can cut it down to ~2MB.  This controls VSZ but has
1340 		 * no effect on run-time size or performance.
1341 		 *
1342 		 * This is very important in case you core dump and also
1343 		 * important to reduce unnecessary region allocations.
1344 		 */
1345 		if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
1346 			tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
1347 				    (ssize_t)chunk_size;
1348 			if (tmpchunks < 1)
1349 				tmpchunks = 1;
1350 			if (maxchunks > tmpchunks)
1351 				maxchunks = tmpchunks;
1352 		}
1353 
1354 		/*
1355 		 * If the slab calculates to greater than 128K see if we
1356 		 * can cut it down to ~128K while still maintaining a
1357 		 * reasonably large number of chunks in each slab.  This
1358 		 * controls VSZ but has no effect on run-time size or
1359 		 * performance.
1360 		 *
1361 		 * This is very important in case you core dump and also
1362 		 * important to reduce unnecessary region allocations.
1363 		 */
1364 		if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
1365 			tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
1366 				    (ssize_t)chunk_size;
1367 			if (tmpchunks < 32)
1368 				tmpchunks = 32;
1369 			if (maxchunks > tmpchunks)
1370 				maxchunks = tmpchunks;
1371 		}
1372 
1373 		MaxChunks[zi] = maxchunks;
1374 	}
1375 	MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);
1376 
1377 	/*
1378 	 * Calculate the actual slab size.  maxchunks will be recalculated
1379 	 * a little later.
1380 	 */
1381 	slab_desire = chunk_offset + chunk_size * maxchunks;
1382 	slab_size = 8 * MAXCHUNKS;
1383 	power = 3 + MAXCHUNKS_BITS;
1384 	while (slab_size < slab_desire) {
1385 		slab_size <<= 1;
1386 		++power;
1387 	}
1388 
1389 	/*
1390 	 * Do a quick recalculation based on the actual slab size but not
1391 	 * yet dealing with whether the slab header is in-band or out-of-band.
1392 	 * The purpose here is to see if we can reasonably reduce slab_size
1393 	 * to a power of 4 to allow more chunk sizes to use the same slab
1394 	 * size.
1395 	 */
1396 	if ((power & 1) && slab_size > 32768) {
1397 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1398 		if (maxchunks >= MAXCHUNKS / 8) {
1399 			slab_size >>= 1;
1400 			--power;
1401 		}
1402 	}
1403 	if ((power & 2) && slab_size > 32768 * 4) {
1404 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1405 		if (maxchunks >= MAXCHUNKS / 4) {
1406 			slab_size >>= 2;
1407 			power -= 2;
1408 		}
1409 	}
1410 	/*
1411 	 * This case occurs when the slab_size is larger than 1/1024 available
1412 	 * UVM.
1413 	 */
1414 	nswath = slab_size / NREGIONS_SIZE;
1415 	if (nswath > NREGIONS)
1416 		return (NULL);
1417 
1418 
1419 	/*
1420 	 * Try to allocate from our current best region for this zi
1421 	 */
1422 	region_mask = ~(slab_size - 1);
1423 	ri = slgd->zone[zi].best_region;
1424 	if (Regions[ri].mask == region_mask) {
1425 		if ((slab = _vmem_alloc(ri, slab_size)) != NULL)
1426 			goto found;
1427 	}
1428 
1429 	/*
1430 	 * Try to find an existing region to allocate from.  The normal
1431 	 * case will be for allocations that are less than 1/1024 available
1432 	 * UVM, which fit into a single Regions[] entry.
1433 	 */
1434 	while (slab_size <= NREGIONS_SIZE) {
1435 		rx = -1;
1436 		for (ri = 0; ri < NREGIONS; ++ri) {
1437 			if (rx < 0 && Regions[ri].mask == 0)
1438 				rx = ri;
1439 			if (Regions[ri].mask == region_mask) {
1440 				slab = _vmem_alloc(ri, slab_size);
1441 				if (slab) {
1442 					slgd->zone[zi].best_region = ri;
1443 					goto found;
1444 				}
1445 			}
1446 		}
1447 
1448 		if (rx < 0)
1449 			return(NULL);
1450 
1451 		/*
1452 		 * This can fail, retry either way
1453 		 */
1454 		atomic_cmpset_ptr((void **)&Regions[rx].mask,
1455 				  NULL,
1456 				  (void *)region_mask);
1457 	}
1458 
1459 	for (;;) {
1460 		rx = -1;
1461 		for (ri = 0; ri < NREGIONS; ri += nswath) {
1462 			if (Regions[ri].mask == region_mask) {
1463 				slab = _vmem_alloc(ri, slab_size);
1464 				if (slab) {
1465 					slgd->zone[zi].best_region = ri;
1466 					goto found;
1467 				}
1468 			}
1469 			if (rx < 0) {
1470 				for (j = nswath - 1; j >= 0; --j) {
1471 					if (Regions[ri+j].mask != 0)
1472 						break;
1473 				}
1474 				if (j < 0)
1475 					rx = ri;
1476 			}
1477 		}
1478 
1479 		/*
1480 		 * We found a candidate, try to allocate it backwards so
1481 		 * another thread racing a slaballoc() does not see the
1482 		 * mask in the base index position until we are done.
1483 		 *
1484 		 * We can safely zero-out any partial allocations because
1485 		 * the mask is only accessed from the base index.  Any other
1486 		 * threads racing us will fail prior to us clearing the mask.
1487 		 */
1488 		if (rx < 0)
1489 			return(NULL);
1490 		for (j = nswath - 1; j >= 0; --j) {
1491 			if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
1492 					       NULL, (void *)region_mask)) {
1493 				while (++j < nswath)
1494 					Regions[rx+j].mask = 0;
1495 				break;
1496 			}
1497 		}
1498 		/* retry */
1499 	}
1500 
1501 	/*
1502 	 * Fill in the new slab in region ri.  If the slab_size completely
1503 	 * fills one or more region slots we move the slab structure out of
1504 	 * band which should optimize the chunking (particularly for a power
1505 	 * of 2).
1506 	 */
1507 found:
1508 	region = &Regions[ri];
1509 	MASSERT(region->slab == NULL);
1510 	if (slab_size >= NREGIONS_SIZE) {
1511 		save = slab;
1512 		slab = memalloc(sizeof(*slab), 0);
1513 		bzero(slab, sizeof(*slab));
1514 		slab->chunks = save;
1515 		for (j = 0; j < nswath; ++j)
1516 			region[j].slab = slab;
1517 		chunk_offset = 0;
1518 	} else {
1519 		bzero(slab, sizeof(*slab));
1520 		slab->chunks = (char *)slab + chunk_offset;
1521 	}
1522 
1523 	/*
1524 	 * Calculate the start of the chunks memory and recalculate the
1525 	 * actual number of chunks the slab can hold.
1526 	 */
1527 	maxchunks = (slab_size - chunk_offset) / chunk_size;
1528 	if (maxchunks > MAXCHUNKS)
1529 		maxchunks = MAXCHUNKS;
1530 
1531 	/*
1532 	 * And fill in the rest
1533 	 */
1534 	slab->magic = ZALLOC_SLAB_MAGIC;
1535 	slab->navail = maxchunks;
1536 	slab->nmax = maxchunks;
1537 	slab->slab_size = slab_size;
1538 	slab->chunk_size = chunk_size;
1539 	slab->zone_index = zi;
1540 	slab->slgd = slgd;
1541 	slab->state = UNKNOWN;
1542 	slab->region = region;
1543 
1544 	for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
1545 		if (ri + LONG_BITS <= maxchunks)
1546 			slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
1547 		else
1548 			slab->bitmap[ri >> LONG_BITS_SHIFT] =
1549 						(1LU << (maxchunks - ri)) - 1;
1550 	}
1551 	return (slab);
1552 }
1553 
1554 /*
1555  * Free a slab.
1556  */
1557 static void
slabfree(slab_t slab)1558 slabfree(slab_t slab)
1559 {
1560 	int nswath;
1561 	int j;
1562 
1563 	if (slab->region->slab == slab) {
1564 		/*
1565 		 * Out-of-band slab.
1566 		 */
1567 		nswath = slab->slab_size / NREGIONS_SIZE;
1568 		for (j = 0; j < nswath; ++j)
1569 			slab->region[j].slab = NULL;
1570 		slab->magic = 0;
1571 		_vmem_free(slab->chunks, slab->slab_size);
1572 		memfree(slab, 0);
1573 	} else {
1574 		/*
1575 		 * In-band slab.
1576 		 */
1577 		slab->magic = 0;
1578 		_vmem_free(slab, slab->slab_size);
1579 	}
1580 }
1581 
1582 /*
1583  * Terminate a slab's use in the current thread.  The slab may still have
1584  * outstanding allocations and thus not be deallocatable.
1585  */
1586 static void
slabterm(slglobaldata_t slgd,slab_t slab)1587 slabterm(slglobaldata_t slgd, slab_t slab)
1588 {
1589 	slglobaldata_t sldepot;
1590 	struct zoneinfo *zinfo;
1591 	int zi = slab->zone_index;
1592 
1593 	slab->slgd = NULL;
1594 	--slgd->nslabs;
1595 	sldepot = &slglobaldepot;
1596 	zinfo = &sldepot->zone[zi];
1597 
1598 	/*
1599 	 * Move the slab to the avail list or the empty list.
1600 	 */
1601 	if (__isthreaded)
1602 		_SPINLOCK(&sldepot->lock);
1603 	if (slab->navail == slab->nmax) {
1604 		slab->state = EMPTY;
1605 		slab->next = zinfo->empty_base;
1606 		zinfo->empty_base = slab;
1607 		++zinfo->empty_count;
1608 	} else {
1609 		slab->state = AVAIL;
1610 		slab->next = zinfo->avail_base;
1611 		zinfo->avail_base = slab;
1612 		++zinfo->avail_count;
1613 	}
1614 
1615 	/*
1616 	 * Clean extra slabs out of the empty list
1617 	 */
1618 	while (zinfo->empty_count > opt_cache) {
1619 		slab = zinfo->empty_base;
1620 		zinfo->empty_base = slab->next;
1621 		--zinfo->empty_count;
1622 		slab->state = UNKNOWN;
1623 		if (__isthreaded)
1624 			_SPINUNLOCK(&sldepot->lock);
1625 		slabfree(slab);
1626 		if (__isthreaded)
1627 			_SPINLOCK(&sldepot->lock);
1628 	}
1629 	if (__isthreaded)
1630 		_SPINUNLOCK(&sldepot->lock);
1631 }
1632 
1633 /*
1634  * _vmem_alloc()
1635  *
1636  *	Directly map memory in PAGE_SIZE'd chunks with the specified
1637  *	alignment.
1638  *
1639  *	Alignment must be a multiple of PAGE_SIZE.
1640  *
1641  *	Size must be >= alignment.
1642  */
1643 static void *
_vmem_alloc(int ri,size_t slab_size)1644 _vmem_alloc(int ri, size_t slab_size)
1645 {
1646 	char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);
1647 	char *eaddr;
1648 	char *addr;
1649 	char *save;
1650 	uintptr_t excess;
1651 
1652 	if (slab_size < NREGIONS_SIZE)
1653 		eaddr = baddr + NREGIONS_SIZE;
1654 	else
1655 		eaddr = baddr + slab_size;
1656 
1657 	/*
1658 	 * This usually just works but might not.
1659 	 */
1660 	addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
1661 		    MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
1662 	if (addr == MAP_FAILED) {
1663 		return (NULL);
1664 	}
1665 	if (addr < baddr || addr + slab_size > eaddr) {
1666 		munmap(addr, slab_size);
1667 		return (NULL);
1668 	}
1669 
1670 	/*
1671 	 * Check alignment.  The misaligned offset is also the excess
1672 	 * amount.  If misaligned unmap the excess so we have a chance of
1673 	 * mapping at the next alignment point and recursively try again.
1674 	 *
1675 	 * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB	block alignment
1676 	 *   aaaaaaaaa aaaaaaaaaaa aa		mis-aligned allocation
1677 	 *   xxxxxxxxx				final excess calculation
1678 	 *   ^ returned address
1679 	 */
1680 	excess = (uintptr_t)addr & (slab_size - 1);
1681 	while (excess) {
1682 		excess = slab_size - excess;
1683 		save = addr;
1684 
1685 		munmap(save + excess, slab_size - excess);
1686 		addr = _vmem_alloc(ri, slab_size);
1687 		munmap(save, excess);
1688 		if (addr == NULL)
1689 			return (NULL);
1690 		if (addr < baddr || addr + slab_size > eaddr) {
1691 			munmap(addr, slab_size);
1692 			return (NULL);
1693 		}
1694 		excess = (uintptr_t)addr & (slab_size - 1);
1695 	}
1696 	return (addr);
1697 }
1698 
1699 /*
1700  * _vmem_free()
1701  *
1702  *	Free a chunk of memory allocated with _vmem_alloc()
1703  */
1704 static void
_vmem_free(void * ptr,size_t size)1705 _vmem_free(void *ptr, size_t size)
1706 {
1707 	munmap(ptr, size);
1708 }
1709 
1710 /*
1711  * Panic on fatal conditions
1712  */
1713 static void
_mpanic(const char * ctl,...)1714 _mpanic(const char *ctl, ...)
1715 {
1716 	va_list va;
1717 
1718 	if (malloc_panic == 0) {
1719 		malloc_panic = 1;
1720 		va_start(va, ctl);
1721 		vfprintf(stderr, ctl, va);
1722 		fprintf(stderr, "\n");
1723 		fflush(stderr);
1724 		va_end(va);
1725 	}
1726 	abort();
1727 }
1728 
1729 __weak_reference(__aligned_alloc, aligned_alloc);
1730 __weak_reference(__malloc, malloc);
1731 __weak_reference(__calloc, calloc);
1732 __weak_reference(__posix_memalign, posix_memalign);
1733 __weak_reference(__realloc, realloc);
1734 __weak_reference(__free, free);
1735