1 /*
2  * DMALLOC.C	- Dillon's malloc
3  *
4  * Copyright (c) 2011,2017 The DragonFly Project. All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 /*
37  * This module implements a modified slab allocator as a drop-in replacement
38  * for the libc malloc().  The slab algorithm has been adjusted to support
39  * dynamic sizing of slabs which effectively allows slabs to be used for
40  * allocations of any size.  Because of this we have neither a small-block
41  * allocator nor a big-block allocator, and the code paths are simplified.
42  *
43  * To support dynamic slab sizing, available user virtual memory is broken
44  * down into ~1024 regions.  Each region has a fixed slab size whose value is
45  * set when the region is opened up for use.  The free() path simply applies
46  * a mask based on the region to the pointer to acquire the base of the
47  * governing slab structure.
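 *
 *	For example, the lookup that free() performs (out-of-band slabs
 *	aside) reduces to:
 *
 *	    region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) &
 *			      NREGIONS_MASK];
 *	    slab = (slab_t)((uintptr_t)ptr & region->mask);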
48  *
49  * Regions[NREGIONS]	(1024)
50  *
51  * Slab management and locking is done on a per-zone basis.
52  *
53  *	Alloc Size	Chunking        Number of zones
54  *	0-127		8		16
55  *	128-255		16		8
56  *	256-511		32		8
57  *	512-1023	64		8
58  *	1024-2047	128		8
59  *	2048-4095	256		8
60  *	4096-8191	512		8
61  *	8192-16383	1024		8
62  *	16384-32767	2048		8
63  *	32768-65535	4096		8
64  *	... continues forever ...	4 zones
65  *
66  *	For a 2^63 memory space each doubling >= 64K is broken down into
67  *	4 chunking zones, so we support 88 + (48 * 4) = 280 zones.
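 *
 *	Example: a 300 byte request falls in the 256-511 band and is
 *	rounded up to 320 bytes (32 byte chunking) before being served
 *	from that zone.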
68  *
69  *			   API FEATURES AND SIDE EFFECTS
70  *
71  *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
72  *	Above that, power-of-2 sized allocations are page-aligned.  Non-
73  *	power-of-2 sized allocations are aligned the same as the chunk
74  *	size for their zone.
75  *    + ability to allocate arbitrarily large chunks of memory
76  *    + realloc will reuse the passed pointer if possible, within the
77  *	limitations of the zone chunking.
78  *
79  * On top of the slab allocator we also implement a 16-entry-per-thread
80  * magazine cache for allocations <= NOMSLABSIZE.
81  *
82  *				FUTURE FEATURES
83  *
84  *    + [better] garbage collection
85  *    + better initial sizing.
86  *
87  * TUNING
88  *
89  * The value of the environment variable MALLOC_OPTIONS is a character string
90  * containing various flags to tune nmalloc.  Upper case letters enable
91  * or increase a feature, lower case letters disable or decrease it.
92  *
93  * U		Enable UTRACE for all operations, observable with ktrace.
94  *		Disabled by default.
95  *
96  * Z		Zero out allocations, otherwise allocations (except for
97  *		calloc) will contain garbage.
98  *		Disabled by default.
99  *
100  * H		Pass a hint with madvise() about unused pages.
101  *		Disabled by default.
102  *		Not currently implemented.
103  *
104  * F		Disable local per-thread caching.
105  *		Disabled by default.
106  *
107  * C		Increase (decrease) how much excess cache to retain.
108  *		Set to 4 by default.
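 *
 * Example (illustration only): running a program with MALLOC_OPTIONS=ZCC
 * in its environment zeroes all allocations and raises the excess-cache
 * setting from its default of 4 to 6.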
109  */
110 
111 /* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */
112 
113 #ifndef STANDALONE_DEBUG
114 #include "libc_private.h"
115 #endif
116 
117 #include <sys/param.h>
118 #include <sys/types.h>
119 #include <sys/mman.h>
120 #include <sys/queue.h>
121 #include <sys/uio.h>
122 #include <sys/ktrace.h>
123 #include <stdio.h>
124 #include <stdint.h>
125 #include <stdlib.h>
126 #include <stdarg.h>
127 #include <stddef.h>
128 #include <unistd.h>
129 #include <string.h>
130 #include <fcntl.h>
131 #include <errno.h>
132 #include <pthread.h>
133 #include <limits.h>
134 
135 #include <machine/atomic.h>
136 #include <machine/cpufunc.h>
137 
138 #ifdef STANDALONE_DEBUG
139 void _nmalloc_thr_init(void);
140 #else
141 #include "spinlock.h"
142 #include "un-namespace.h"
143 #endif
144 
145 #ifndef MAP_SIZEALIGN
146 #define MAP_SIZEALIGN	0
147 #endif
148 
149 #if SSIZE_MAX == 0x7FFFFFFF
150 #define ADDRBITS	32
151 #define UVM_BITS	32	/* worst case */
152 #else
153 #define ADDRBITS	64
154 #define UVM_BITS	48	/* worst case XXX */
155 #endif
156 
157 #if LONG_MAX == 0x7FFFFFFF
158 #define LONG_BITS	32
159 #define LONG_BITS_SHIFT	5
160 #else
161 #define LONG_BITS	64
162 #define LONG_BITS_SHIFT	6
163 #endif
164 
165 #define LOCKEDPTR	((void *)(intptr_t)-1)
166 
167 /*
168  * Regions[]
169  */
170 #define NREGIONS_BITS	10
171 #define NREGIONS	(1 << NREGIONS_BITS)
172 #define NREGIONS_MASK	(NREGIONS - 1)
173 #define NREGIONS_SHIFT	(UVM_BITS - NREGIONS_BITS)
174 #define NREGIONS_SIZE	(1LU << NREGIONS_SHIFT)
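/*
 * Example: with UVM_BITS == 48, NREGIONS_SHIFT is 38 and each Regions[]
 * slot covers a 1 << 38 byte (256GB) swath of address space; with
 * UVM_BITS == 32 each slot covers 4MB.
 */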
175 
176 typedef struct region *region_t;
177 typedef struct slglobaldata *slglobaldata_t;
178 typedef struct slab *slab_t;
179 
180 struct region {
181 	uintptr_t	mask;
182 	slab_t		slab;	/* conditional out of band slab */
183 };
184 
185 static struct region Regions[NREGIONS];
186 
187 /*
188  * Number of chunking zones available
189  */
190 #define CHUNKFACTOR	8
191 #if ADDRBITS == 32
192 #define NZONES		(16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
193 #else
194 #define NZONES		(16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
195 #endif
196 
197 static int MaxChunks[NZONES];
198 
199 #define NDEPOTS		8		/* must be power of 2 */
200 
201 /*
202  * Maximum number of chunks per slab, governed by the allocation bitmap in
203  * each slab.  The maximum is reduced for large chunk sizes.
204  */
205 #define MAXCHUNKS	(LONG_BITS * LONG_BITS)
206 #define MAXCHUNKS_BITS	(LONG_BITS_SHIFT * LONG_BITS_SHIFT)
207 #define LITSLABSIZE	(32 * 1024)
208 #define NOMSLABSIZE	(2 * 1024 * 1024)
209 #define BIGSLABSIZE	(128 * 1024 * 1024)
210 
211 #define ZALLOC_SLAB_MAGIC	0x736c6162	/* magic sanity */
212 
213 TAILQ_HEAD(slab_list, slab);
214 
215 /*
216  * A slab structure
217  */
218 struct slab {
219 	struct slab	*next;		/* slabs with available space */
220 	TAILQ_ENTRY(slab) entry;
221 	int32_t		magic;		/* magic number for sanity check */
222 	u_int		navail;		/* number of free elements available */
223 	u_int		nmax;
224 	u_int		free_bit;	/* free hint bitno */
225 	u_int		free_index;	/* free hint index */
226 	u_long		bitmap[LONG_BITS]; /* free chunks */
227 	size_t		slab_size;	/* size of entire slab */
228 	size_t		chunk_size;	/* chunk size for validation */
229 	int		zone_index;
230 	enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
231 	int		flags;
232 	region_t	region;		/* related region */
233 	char		*chunks;	/* chunk base */
234 	slglobaldata_t	slgd;		/* localized to thread else NULL */
235 };
236 
237 /*
238  * per-thread data + global depot
239  *
240  * NOTE: The magazine shortcut is only used for per-thread data.
241  */
242 #define NMAGSHORTCUT	16
243 
244 struct slglobaldata {
245 	spinlock_t	lock;		/* only used by slglobaldepot */
246 	struct zoneinfo {
247 		slab_t	avail_base;
248 		slab_t	empty_base;
249 		int	best_region;
250 		int	mag_index;
251 		int	avail_count;
252 		int	empty_count;
253 		void	*mag_shortcut[NMAGSHORTCUT];
254 	} zone[NZONES];
255 	struct slab_list full_zones;	/* via entry */
256 	int		masked;
257 	int		biggest_index;
258 	size_t		nslabs;
259 };
260 
261 #define SLAB_ZEROD		0x0001
262 
263 /*
264  * Misc constants.  Note that allocations that are exact multiples of
265  * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
266  * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
267  */
268 #define MIN_CHUNK_SIZE		8		/* in bytes */
269 #define MIN_CHUNK_MASK		(MIN_CHUNK_SIZE - 1)
270 
271 #define SAFLAG_ZERO	0x00000001
272 
273 /*
274  * The WEIRD_ADDR is used as known text to copy into free objects to
275  * try to create deterministic failure cases if the data is accessed after
276  * free.
277  *
278  * WARNING: A limited number of spinlocks are available, BIGXSIZE should
279  *	    not be larger than 64.
280  */
281 #ifdef INVARIANTS
282 #define WEIRD_ADDR      0xdeadc0de
283 #endif
284 
285 /*
286  * Thread control
287  */
288 
289 #define MASSERT(exp)	do { if (__predict_false(!(exp)))	\
290 				_mpanic("assertion: %s in %s",	\
291 				#exp, __func__);		\
292 			    } while (0)
293 
294 /*
295  * With this attribute set, do not require a function call for accessing
296  * this variable when the code is compiled -fPIC.
297  *
298  * Must be empty for libc_rtld (similar to __thread)
299  */
300 #if defined(__LIBC_RTLD)
301 #define TLS_ATTRIBUTE
302 #else
303 #define TLS_ATTRIBUTE __attribute__ ((tls_model ("initial-exec")))
304 #endif
305 
306 static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
307 static pthread_key_t thread_malloc_key;
308 static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
309 static struct slglobaldata slglobaldepot;
310 
311 static int opt_madvise = 0;
312 static int opt_free = 0;
313 static int opt_cache = 4;
314 static int opt_utrace = 0;
315 static int g_malloc_flags = 0;
316 static int malloc_panic;
317 
318 #ifdef INVARIANTS
319 static const int32_t weirdary[16] = {
320 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
321 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
322 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
323 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
324 };
325 #endif
326 
327 static void *memalloc(size_t size, int flags);
328 static void *memrealloc(void *ptr, size_t size);
329 static void memfree(void *ptr, int);
330 static int memalign(void **memptr, size_t alignment, size_t size);
331 static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
332 static void slabfree(slab_t slab);
333 static void slabterm(slglobaldata_t slgd, slab_t slab);
334 static void *_vmem_alloc(int ri, size_t slab_size);
335 static void _vmem_free(void *ptr, size_t slab_size);
336 static void _mpanic(const char *ctl, ...) __printflike(1, 2);
337 #ifndef STANDALONE_DEBUG
338 static void malloc_init(void) __constructor(101);
339 #else
340 static void malloc_init(void) __constructor(101);
341 #endif
342 
343 
344 struct nmalloc_utrace {
345 	void *p;
346 	size_t s;
347 	void *r;
348 };
349 
350 #define UTRACE(a, b, c)						\
351 	if (opt_utrace) {					\
352 		struct nmalloc_utrace ut = {			\
353 			.p = (a),				\
354 			.s = (b),				\
355 			.r = (c)				\
356 		};						\
357 		utrace(&ut, sizeof(ut));			\
358 	}
359 
360 #ifdef INVARIANTS
361 /*
362  * If enabled any memory allocated without M_ZERO is initialized to -1.
363  */
364 static int  use_malloc_pattern;
365 #endif
366 
367 static void
368 malloc_init(void)
369 {
370 	const char *p = NULL;
371 
372 	TAILQ_INIT(&slglobal.full_zones);
373 
374 	Regions[0].mask = -1; /* disallow activity in lowest region */
375 
376 	if (issetugid() == 0)
377 		p = getenv("MALLOC_OPTIONS");
378 
379 	for (; p != NULL && *p != '\0'; p++) {
380 		switch(*p) {
381 		case 'u':
382 			opt_utrace = 0;
383 			break;
384 		case 'U':
385 			opt_utrace = 1;
386 			break;
387 		case 'h':
388 			opt_madvise = 0;
389 			break;
390 		case 'H':
391 			opt_madvise = 1;
392 			break;
393 		case 'c':
394 			if (opt_cache > 0)
395 				--opt_cache;
396 			break;
397 		case 'C':
398 			++opt_cache;
399 			break;
400 		case 'f':
401 			opt_free = 0;
402 			break;
403 		case 'F':
404 			opt_free = 1;
405 			break;
406 		case 'z':
407 			g_malloc_flags = 0;
408 			break;
409 		case 'Z':
410 			g_malloc_flags = SAFLAG_ZERO;
411 			break;
412 		default:
413 			break;
414 		}
415 	}
416 
417 	UTRACE((void *) -1, 0, NULL);
418 }
419 
420 /*
421  * We have to install a handler for nmalloc thread teardowns when
422  * the thread is created.  We cannot delay this because destructors in
423  * sophisticated userland programs can call malloc() for the first time
424  * during their thread exit.
425  *
426  * This routine is called directly from pthreads.
427  */
428 static void _nmalloc_thr_init_once(void);
429 static void _nmalloc_thr_destructor(void *thrp);
430 
431 void
432 _nmalloc_thr_init(void)
433 {
434 	static int did_init;
435 
436 	TAILQ_INIT(&slglobal.full_zones);
437 
438 	if (slglobal.masked)
439 		return;
440 
441 	slglobal.masked = 1;
442 	if (did_init == 0) {
443 		did_init = 1;
444 		pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
445 	}
446 	pthread_setspecific(thread_malloc_key, &slglobal);
447 	slglobal.masked = 0;
448 }
449 
450 void
451 _nmalloc_thr_prepfork(void)
452 {
453 	if (__isthreaded)
454 		_SPINLOCK(&slglobaldepot.lock);
455 }
456 
457 void
458 _nmalloc_thr_parentfork(void)
459 {
460 	if (__isthreaded)
461 		_SPINUNLOCK(&slglobaldepot.lock);
462 }
463 
464 void
465 _nmalloc_thr_childfork(void)
466 {
467 	if (__isthreaded)
468 		_SPINUNLOCK(&slglobaldepot.lock);
469 }
470 
471 /*
472  * Called just once
473  */
474 static void
475 _nmalloc_thr_init_once(void)
476 {
477 	int error;
478 
479 	error = pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
480 	if (error)
481 		abort();
482 }
483 
484 /*
485  * Called for each thread undergoing exit
486  *
487  * Move all of the thread's slabs into a depot.
488  */
489 static void
490 _nmalloc_thr_destructor(void *thrp)
491 {
492 	slglobaldata_t slgd = thrp;
493 	struct zoneinfo *zinfo;
494 	slab_t slab;
495 	void *ptr;
496 	int i;
497 	int j;
498 
499 	slgd->masked = 1;
500 
501 	for (i = 0; i <= slgd->biggest_index; i++) {
502 		zinfo = &slgd->zone[i];
503 
504 		while ((j = zinfo->mag_index) > 0) {
505 			--j;
506 			ptr = zinfo->mag_shortcut[j];
507 			zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
508 			zinfo->mag_index = j;
509 			memfree(ptr, 0);
510 		}
511 
512 		while ((slab = zinfo->empty_base) != NULL) {
513 			zinfo->empty_base = slab->next;
514 			--zinfo->empty_count;
515 			slabterm(slgd, slab);
516 		}
517 
518 		while ((slab = zinfo->avail_base) != NULL) {
519 			zinfo->avail_base = slab->next;
520 			--zinfo->avail_count;
521 			slabterm(slgd, slab);
522 		}
523 
524 		while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
525 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
526 			slabterm(slgd, slab);
527 		}
528 	}
529 }
530 
531 /*
532  * Calculate the zone index for the allocation request size and set the
533  * allocation request size to that particular zone's chunk size.
534  *
535  * Minimum alignment is 16 bytes for allocations >= 16 bytes to conform
536  * with malloc requirements for intel/amd.
537  */
538 static __inline int
539 zoneindex(size_t *bytes, size_t *chunking)
540 {
541 	size_t n = (size_t)*bytes;
542 	size_t x;
543 	size_t c;
544 	int i;
545 
546 	if (n < 128) {
547 		if (n < 16) {
548 			*bytes = n = (n + 7) & ~7;
549 			*chunking = 8;
550 			return(n / 8 - 1);	/* 8 byte chunks, 2 zones */
551 		} else {
552 			*bytes = n = (n + 15) & ~15;
553 			*chunking = 16;
554 			return(n / 16 + 2);	/* 16 byte chunks, 8 zones */
555 		}
556 	}
557 	if (n < 4096) {
558 		x = 256;
559 		c = x / (CHUNKFACTOR * 2);
560 		i = 16;
561 	} else {
562 		x = 8192;
563 		c = x / (CHUNKFACTOR * 2);
564 		i = 16 + CHUNKFACTOR * 5;  /* 256->512,1024,2048,4096,8192 */
565 	}
566 	while (n >= x) {
567 		x <<= 1;
568 		c <<= 1;
569 		i += CHUNKFACTOR;
570 		if (x == 0)
571 			_mpanic("slaballoc: byte value too high");
572 	}
573 	*bytes = n = roundup2(n, c);
574 	*chunking = c;
575 	return (i + n / c - CHUNKFACTOR);
576 #if 0
577 	*bytes = n = (n + c - 1) & ~(c - 1);
578 	*chunking = c;
579 	return (n / c + i);
580 
581 	if (n < 256) {
582 		*bytes = n = (n + 15) & ~15;
583 		*chunking = 16;
584 		return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);
585 	}
586 	if (n < 8192) {
587 		if (n < 512) {
588 			*bytes = n = (n + 31) & ~31;
589 			*chunking = 32;
590 			return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);
591 		}
592 		if (n < 1024) {
593 			*bytes = n = (n + 63) & ~63;
594 			*chunking = 64;
595 			return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);
596 		}
597 		if (n < 2048) {
598 			*bytes = n = (n + 127) & ~127;
599 			*chunking = 128;
600 			return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);
601 		}
602 		if (n < 4096) {
603 			*bytes = n = (n + 255) & ~255;
604 			*chunking = 256;
605 			return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);
606 		}
607 		*bytes = n = (n + 511) & ~511;
608 		*chunking = 512;
609 		return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);
610 	}
611 	if (n < 16384) {
612 		*bytes = n = (n + 1023) & ~1023;
613 		*chunking = 1024;
614 		return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);
615 	}
616 	if (n < 32768) {				/* 16384-32767 */
617 		*bytes = n = (n + 2047) & ~2047;
618 		*chunking = 2048;
619 		return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
620 	}
621 	if (n < 65536) {
622 		*bytes = n = (n + 4095) & ~4095;	/* 32768-65535 */
623 		*chunking = 4096;
624 		return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);
625 	}
626 
627 	x = 131072;
628 	c = 8192;
629 	i = CHUNKINGLO*10 - 1;
630 
631 	while (n >= x) {
632 		x <<= 1;
633 		c <<= 1;
634 		i += CHUNKINGHI;
635 		if (x == 0)
636 			_mpanic("slaballoc: byte value too high");
637 	}
638 	*bytes = n = (n + c - 1) & ~(c - 1);
639 	*chunking = c;
640 	return (n / c + i);
641 #endif
642 }
643 
644 /*
645  * malloc() - call internal slab allocator
646  */
647 void *
648 __malloc(size_t size)
649 {
650 	void *ptr;
651 
652 	ptr = memalloc(size, 0);
653 	if (ptr == NULL)
654 		errno = ENOMEM;
655 	else
656 		UTRACE(0, size, ptr);
657 	return(ptr);
658 }
659 
660 /*
661  * calloc() - call internal slab allocator
662  */
663 void *
664 __calloc(size_t number, size_t size)
665 {
666 	void *ptr;
667 
	/* guard against number * size overflowing the multiplication */
	if (size != 0 && number > SIZE_MAX / size) {
		errno = ENOMEM;
		return(NULL);
	}
668 	ptr = memalloc(number * size, SAFLAG_ZERO);
669 	if (ptr == NULL)
670 		errno = ENOMEM;
671 	else
672 		UTRACE(0, number * size, ptr);
673 	return(ptr);
674 }
675 
676 /*
677  * realloc() (SLAB ALLOCATOR)
678  *
679  * We do not attempt to optimize this routine beyond reusing the same
680  * pointer if the new size fits within the chunking of the old pointer's
681  * zone.
682  */
683 void *
684 __realloc(void *ptr, size_t size)
685 {
686 	void *ret;
687 
688 	if (ptr == NULL)
689 		ret = memalloc(size, 0);
690 	else
691 		ret = memrealloc(ptr, size);
692 	if (ret == NULL)
693 		errno = ENOMEM;
694 	else
695 		UTRACE(ptr, size, ret);
696 	return(ret);
697 }
698 
699 /*
700  * aligned_alloc()
701  *
702  * Allocate (size) bytes with an alignment of (alignment).
703  */
704 void *
705 __aligned_alloc(size_t alignment, size_t size)
706 {
707 	void *ptr;
708 	int rc;
709 
710 	ptr = NULL;
711 	rc = memalign(&ptr, alignment, size);
712 	if (rc)
713 		errno = rc;
714 
715 	return (ptr);
716 }
717 
718 /*
719  * posix_memalign()
720  *
721  * Allocate (size) bytes with an alignment of (alignment), where (alignment)
722  * is a power of 2 >= sizeof(void *).
723  */
724 int
725 __posix_memalign(void **memptr, size_t alignment, size_t size)
726 {
727 	int rc;
728 
729 	/*
730 	 * OpenGroup spec issue 6 check
731 	 */
732 	if (alignment < sizeof(void *)) {
733 		*memptr = NULL;
734 		return(EINVAL);
735 	}
736 
737 	rc = memalign(memptr, alignment, size);
738 
739 	return (rc);
740 }
741 
742 /*
743  * The slab allocator will allocate on power-of-2 boundaries up to at least
744  * PAGE_SIZE.  Otherwise we use the zoneindex mechanic to find a zone
745  * matching the requirements.
746  */
747 static int
748 memalign(void **memptr, size_t alignment, size_t size)
749 {
750 
751 	if (alignment < 1) {
752 		*memptr = NULL;
753 		return(EINVAL);
754 	}
755 
756 	/*
757 	 * OpenGroup spec issue 6 check
758 	 */
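	/*
	 * For a power of 2, (alignment | (alignment - 1)) sets every bit
	 * below the top bit, so adding 1 yields alignment * 2; any other
	 * value fails the comparison below.
	 */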
759 	if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
760 		*memptr = NULL;
761 		return(EINVAL);
762 	}
763 
764 	/*
765 	 * XXX for now just find the nearest power of 2 >= size and also
766 	 * >= alignment and allocate that.
767 	 */
768 	while (alignment < size) {
769 		alignment <<= 1;
770 		if (alignment == 0)
771 			_mpanic("posix_memalign: byte value too high");
772 	}
773 	*memptr = memalloc(alignment, 0);
774 	return(*memptr ? 0 : ENOMEM);
775 }
776 
777 /*
778  * free() (SLAB ALLOCATOR) - do the obvious
779  */
780 void
781 __free(void *ptr)
782 {
783 	if (ptr) {
784 		UTRACE(ptr, 0, 0);
785 		memfree(ptr, 0);
786 	}
787 }
788 
789 /*
790  * memalloc()	(SLAB ALLOCATOR)
791  *
792  *	Allocate memory via the slab allocator.
793  */
794 static void *
795 memalloc(size_t size, int flags)
796 {
797 	slglobaldata_t slgd;
798 	struct zoneinfo *zinfo;
799 	slab_t slab;
800 	size_t chunking;
801 	int bmi;
802 	int bno;
803 	u_long *bmp;
804 	int zi;
805 #ifdef INVARIANTS
806 	int i;
807 #endif
808 	int j;
809 	char *obj;
810 
811 	/*
812 	 * If 0 bytes is requested we have to return a unique pointer, so allocate
813 	 * at least one byte.
814 	 */
815 	if (size == 0)
816 		size = 1;
817 
818 	/* Capture global flags */
819 	flags |= g_malloc_flags;
820 
821 	/* Compute allocation zone; zoneindex will panic on excessive sizes */
822 	zi = zoneindex(&size, &chunking);
823 	MASSERT(zi < NZONES);
824 	if (size == 0)
825 		return(NULL);
826 
827 	/*
828 	 * Try magazine shortcut first
829 	 */
830 	slgd = &slglobal;
831 	zinfo = &slgd->zone[zi];
832 
833 	if ((j = zinfo->mag_index) != 0) {
834 		zinfo->mag_index = --j;
835 		obj = zinfo->mag_shortcut[j];
836 		zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
837 		if (flags & SAFLAG_ZERO)
838 			bzero(obj, size);
839 		return obj;
840 	}
841 
842 	/*
843 	 * Locate a slab with available space.  If no slabs are available
844 	 * back-off to the empty list and if we still come up dry allocate
845 	 * a new slab (which will try the depot first).
846 	 */
847 retry:
848 	if ((slab = zinfo->avail_base) == NULL) {
849 		if ((slab = zinfo->empty_base) == NULL) {
850 			/*
851 			 * Still dry
852 			 */
853 			slab = slaballoc(zi, chunking, size);
854 			if (slab == NULL)
855 				return(NULL);
856 			slab->next = zinfo->avail_base;
857 			zinfo->avail_base = slab;
858 			++zinfo->avail_count;
859 			slab->state = AVAIL;
860 			if (slgd->biggest_index < zi)
861 				slgd->biggest_index = zi;
862 			++slgd->nslabs;
863 		} else {
864 			/*
865 			 * Pulled from empty list
866 			 */
867 			zinfo->empty_base = slab->next;
868 			slab->next = zinfo->avail_base;
869 			zinfo->avail_base = slab;
870 			++zinfo->avail_count;
871 			slab->state = AVAIL;
872 			--zinfo->empty_count;
873 		}
874 	}
875 
876 	/*
877 	 * Allocate a chunk out of the slab.  HOT PATH
878 	 *
879 	 * Only the thread owning the slab can allocate out of it.
880 	 *
881 	 * NOTE: The last bit in the bitmap is always marked allocated so
882 	 *	 we cannot overflow here.
883 	 */
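	/*
	 * The chunk index is (bmi * LONG_BITS + bno); multiplied by the
	 * zone's chunk size it yields the byte offset from slab->chunks.
	 */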
884 	bno = slab->free_bit;
885 	bmi = slab->free_index;
886 	bmp = &slab->bitmap[bmi];
887 	if (*bmp & (1LU << bno)) {
888 		atomic_clear_long(bmp, 1LU << bno);
889 		obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
890 		slab->free_bit = (bno + 1) & (LONG_BITS - 1);
891 		atomic_add_int(&slab->navail, -1);
892 		if (flags & SAFLAG_ZERO)
893 			bzero(obj, size);
894 		return (obj);
895 	}
896 
897 	/*
898 	 * Allocate a chunk out of a slab.  COLD PATH
899 	 */
900 	if (slab->navail == 0) {
901 		zinfo->avail_base = slab->next;
902 		--zinfo->avail_count;
903 		slab->state = FULL;
904 		TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
905 		goto retry;
906 	}
907 
908 	while (bmi < LONG_BITS) {
909 		bmp = &slab->bitmap[bmi];
910 		if (*bmp) {
911 			bno = bsflong(*bmp);
912 			atomic_clear_long(bmp, 1LU << bno);
913 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
914 					     size;
915 			slab->free_index = bmi;
916 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
917 			atomic_add_int(&slab->navail, -1);
918 			if (flags & SAFLAG_ZERO)
919 				bzero(obj, size);
920 			return (obj);
921 		}
922 		++bmi;
923 	}
924 	bmi = 0;
925 	while (bmi < LONG_BITS) {
926 		bmp = &slab->bitmap[bmi];
927 		if (*bmp) {
928 			bno = bsflong(*bmp);
929 			atomic_clear_long(bmp, 1LU << bno);
930 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
931 					     size;
932 			slab->free_index = bmi;
933 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
934 			atomic_add_int(&slab->navail, -1);
935 			if (flags & SAFLAG_ZERO)
936 				bzero(obj, size);
937 			return (obj);
938 		}
939 		++bmi;
940 	}
941 	_mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
942 	/* not reached */
943 	return NULL;
944 }
945 
946 /*
947  * Reallocate memory within the chunk
948  */
949 static void *
950 memrealloc(void *ptr, size_t nsize)
951 {
952 	region_t region;
953 	slab_t slab;
954 	size_t osize;
955 	char *obj;
956 	int flags = 0;
957 
958 	/*
959 	 * If 0 bytes is requested we have to return a unique pointer, so allocate
960 	 * at least one byte.
961 	 */
962 	if (nsize == 0)
963 		nsize = 1;
964 
965 	/* Capture global flags */
966 	flags |= g_malloc_flags;
967 
968 	/*
969 	 * Locate the zone by looking up the dynamic slab size mask based
970 	 * on the memory region the allocation resides in.
971 	 */
972 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
973 	if ((slab = region->slab) == NULL)
974 		slab = (void *)((uintptr_t)ptr & region->mask);
975 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
976 	osize = slab->chunk_size;
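	/*
	 * Reuse the existing chunk when the new size still uses at least
	 * half of the old chunk (chunks smaller than 32 bytes are always
	 * reused when shrinking).
	 */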
977 	if (nsize <= osize) {
978 		if (osize < 32 || nsize >= osize / 2) {
979 			obj = ptr;
980 			if ((flags & SAFLAG_ZERO) && nsize < osize)
981 				bzero(obj + nsize, osize - nsize);
982 			return(obj);
983 		}
984 	}
985 
986 	/*
987 	 * Otherwise resize the object
988 	 */
989 	obj = memalloc(nsize, 0);
990 	if (obj) {
991 		if (nsize > osize)
992 			nsize = osize;
993 		bcopy(ptr, obj, nsize);
994 		memfree(ptr, 0);
995 	}
996 	return (obj);
997 }
998 
999 /*
1000  * free (SLAB ALLOCATOR)
1001  *
1002  * Free a memory block previously allocated by malloc.
1003  *
1004  * MPSAFE
1005  */
1006 static void
1007 memfree(void *ptr, int flags)
1008 {
1009 	region_t region;
1010 	slglobaldata_t slgd;
1011 	slab_t slab;
1012 	slab_t stmp;
1013 	slab_t *slabp;
1014 	int bmi;
1015 	int bno;
1016 	int j;
1017 	u_long *bmp;
1018 
1019 	/*
1020 	 * Locate the zone by looking up the dynamic slab size mask based
1021 	 * on the memory region the allocation resides in.
1022 	 *
1023 	 * WARNING!  The slab may be owned by another thread!
1024 	 */
1025 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
1026 	if ((slab = region->slab) == NULL)
1027 		slab = (void *)((uintptr_t)ptr & region->mask);
1028 	MASSERT(slab != NULL);
1029 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
1030 
1031 #ifdef INVARIANTS
1032 	/*
1033 	 * Put weird data into the memory to detect modifications after
1034 	 * freeing, illegal pointer use after freeing (we should fault on
1035 	 * the odd address), and so forth.
1036 	 */
1037 	if (slab->chunk_size < sizeof(weirdary))
1038 		bcopy(weirdary, ptr, slab->chunk_size);
1039 	else
1040 		bcopy(weirdary, ptr, sizeof(weirdary));
1041 #endif
1042 	slgd = &slglobal;
1043 
1044 	/*
1045 	 * Use mag_shortcut[] when possible
1046 	 */
1047 	if (slgd->masked == 0 && slab->chunk_size <= NOMSLABSIZE) {
1048 		struct zoneinfo *zinfo;
1049 
1050 		zinfo = &slgd->zone[slab->zone_index];
1051 		j = zinfo->mag_index;
1052 		if (j < NMAGSHORTCUT) {
1053 			zinfo->mag_shortcut[j] = ptr;
1054 			zinfo->mag_index = j + 1;
1055 			return;
1056 		}
1057 	}
1058 
1059 	/*
1060 	 * Free to slab and increment navail.  We can delay incrementing
1061 	 * navail to prevent the slab from being destroyed out from under
1062 	 * us while we do other optimizations.
1063 	 */
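	/* convert ptr back to a chunk index, then split into word/bit */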
1064 	bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
1065 	bmi = bno >> LONG_BITS_SHIFT;
1066 	bno &= (LONG_BITS - 1);
1067 	bmp = &slab->bitmap[bmi];
1068 
1069 	MASSERT(bmi >= 0 && bmi < slab->nmax);
1070 	MASSERT((*bmp & (1LU << bno)) == 0);
1071 	atomic_set_long(bmp, 1LU << bno);
1072 
1073 	if (slab->slgd == slgd) {
1074 		/*
1075 		 * We can only do the following if we own the slab.  Note
1076 		 * that navail can be incremented by any thread even if
1077 		 * we own the slab.
1078 		 */
1079 		struct zoneinfo *zinfo;
1080 
1081 		atomic_add_int(&slab->navail, 1);
1082 		if (slab->free_index > bmi) {
1083 			slab->free_index = bmi;
1084 			slab->free_bit = bno;
1085 		} else if (slab->free_index == bmi &&
1086 			   slab->free_bit > bno) {
1087 			slab->free_bit = bno;
1088 		}
1089 		zinfo = &slgd->zone[slab->zone_index];
1090 
1091 		/*
1092 		 * Freeing an object from a full slab makes it less than
1093 		 * full.  The slab must be moved to the available list.
1094 		 *
1095 		 * If the available list has too many slabs, release some
1096 		 * to the depot.
1097 		 */
1098 		if (slab->state == FULL) {
1099 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
1100 			slab->state = AVAIL;
1101 			stmp = zinfo->avail_base;
1102 			slab->next = stmp;
1103 			zinfo->avail_base = slab;
1104 			++zinfo->avail_count;
1105 			while (zinfo->avail_count > opt_cache) {
1106 				slab = zinfo->avail_base;
1107 				zinfo->avail_base = slab->next;
1108 				--zinfo->avail_count;
1109 				slabterm(slgd, slab);
1110 			}
1111 			goto done;
1112 		}
1113 
1114 		/*
1115 		 * If the slab becomes completely empty dispose of it in
1116 		 * some manner.  By default each thread caches up to 4
1117 		 * empty slabs.  Only small slabs are cached.
1118 		 */
1119 		if (slab->navail == slab->nmax && slab->state == AVAIL) {
1120 			/*
1121 			 * Remove slab from available queue
1122 			 */
1123 			slabp = &zinfo->avail_base;
1124 			while ((stmp = *slabp) != slab)
1125 				slabp = &stmp->next;
1126 			*slabp = slab->next;
1127 			--zinfo->avail_count;
1128 
1129 			if (opt_free || opt_cache == 0) {
1130 				/*
1131 				 * If local caching is disabled cache the
1132 				 * slab in the depot (or free it).
1133 				 */
1134 				slabterm(slgd, slab);
1135 			} else if (slab->slab_size > BIGSLABSIZE) {
1136 				/*
1137 				 * We do not try to retain large slabs
1138 				 * in per-thread caches.
1139 				 */
1140 				slabterm(slgd, slab);
1141 			} else if (zinfo->empty_count > opt_cache) {
1142 				/*
1143 				 * We have too many slabs cached, but
1144 				 * instead of freeing this one, free
1145 				 * an empty slab that's been idle longer.
1146 				 *
1147 				 * (empty_count does not change)
1148 				 */
1149 				stmp = zinfo->empty_base;
1150 				slab->state = EMPTY;
1151 				slab->next = stmp->next;
1152 				zinfo->empty_base = slab;
1153 				slabterm(slgd, stmp);
1154 			} else {
1155 				/*
1156 				 * Cache the empty slab in our thread local
1157 				 * empty list.
1158 				 */
1159 				++zinfo->empty_count;
1160 				slab->state = EMPTY;
1161 				slab->next = zinfo->empty_base;
1162 				zinfo->empty_base = slab;
1163 			}
1164 		}
1165 	} else if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1166 		slglobaldata_t sldepot;
1167 
1168 		/*
1169 		 * If freeing to a slab owned by the global depot, and
1170 		 * the slab becomes completely EMPTY, try to move it to
1171 		 * the correct list.
1172 		 */
1173 		sldepot = &slglobaldepot;
1174 		if (__isthreaded)
1175 			_SPINLOCK(&sldepot->lock);
1176 		if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1177 			struct zoneinfo *zinfo;
1178 
1179 			/*
1180 			 * Move the slab to the empty list
1181 			 */
1182 			MASSERT(slab->state == AVAIL);
1183 			atomic_add_int(&slab->navail, 1);
1184 			zinfo = &sldepot->zone[slab->zone_index];
1185 			slabp = &zinfo->avail_base;
1186 			while (slab != *slabp)
1187 				slabp = &(*slabp)->next;
1188 			*slabp = slab->next;
1189 			--zinfo->avail_count;
1190 
1191 			/*
1192 			 * Clean out excessive empty entries from the
1193 			 * depot.
1194 			 */
1195 			slab->state = EMPTY;
1196 			slab->next = zinfo->empty_base;
1197 			zinfo->empty_base = slab;
1198 			++zinfo->empty_count;
1199 			while (zinfo->empty_count > opt_cache) {
1200 				slab = zinfo->empty_base;
1201 				zinfo->empty_base = slab->next;
1202 				--zinfo->empty_count;
1203 				slab->state = UNKNOWN;
1204 				if (__isthreaded)
1205 					_SPINUNLOCK(&sldepot->lock);
1206 				slabfree(slab);
1207 				if (__isthreaded)
1208 					_SPINLOCK(&sldepot->lock);
1209 			}
1210 		} else {
1211 			atomic_add_int(&slab->navail, 1);
1212 		}
1213 		if (__isthreaded)
1214 			_SPINUNLOCK(&sldepot->lock);
1215 	} else {
1216 		/*
1217 		 * We can't act on the slab other than by adjusting navail
1218 		 * (and the bitmap which we did in the common code at the
1219 		 * top).
1220 		 */
1221 		atomic_add_int(&slab->navail, 1);
1222 	}
1223 done:
1224 	;
1225 }
1226 
1227 /*
1228  * Allocate a new slab holding objects of size chunk_size.
1229  */
1230 static slab_t
1231 slaballoc(int zi, size_t chunking, size_t chunk_size)
1232 {
1233 	slglobaldata_t slgd;
1234 	slglobaldata_t sldepot;
1235 	struct zoneinfo *zinfo;
1236 	region_t region;
1237 	void *save;
1238 	slab_t slab;
1239 	size_t slab_desire;
1240 	size_t slab_size;
1241 	size_t region_mask;
1242 	uintptr_t chunk_offset;
1243 	ssize_t maxchunks;
1244 	ssize_t tmpchunks;
1245 	int ispower2;
1246 	int power;
1247 	int ri;
1248 	int rx;
1249 	int nswath;
1250 	int j;
1251 
1252 	/*
1253 	 * First look in the depot.  Any given zone in the depot may be
1254 	 * locked by being set to -1.  We have to do this instead of simply
1255 	 * removing the entire chain because removing the entire chain can
1256 	 * cause racing threads to allocate local slabs for large objects,
1257 	 * resulting in a large VSZ.
1258 	 */
1259 	slgd = &slglobal;
1260 	sldepot = &slglobaldepot;
1261 	zinfo = &sldepot->zone[zi];
1262 
1263 	if (zinfo->avail_base) {
1264 		if (__isthreaded)
1265 			_SPINLOCK(&sldepot->lock);
1266 		slab = zinfo->avail_base;
1267 		if (slab) {
1268 			MASSERT(slab->slgd == NULL);
1269 			slab->slgd = slgd;
1270 			zinfo->avail_base = slab->next;
1271 			--zinfo->avail_count;
1272 			if (__isthreaded)
1273 				_SPINUNLOCK(&sldepot->lock);
1274 			return slab;
1275 		}
1276 		if (__isthreaded)
1277 			_SPINUNLOCK(&sldepot->lock);
1278 	}
1279 
1280 	/*
1281 	 * Nothing in the depot, allocate a new slab by locating or assigning
1282 	 * a region and then using the system virtual memory allocator.
1283 	 */
1284 	slab = NULL;
1285 
1286 	/*
1287 	 * Calculate the start of the data chunks relative to the start
1288 	 * of the slab.  If chunk_size is a power of 2 we guarantee
1289 	 * power of 2 alignment.  If it is not we guarantee alignment
1290 	 * to the chunk size.
1291 	 */
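	/*
	 * chunk_size ^ (chunk_size - 1) equals (chunk_size << 1) - 1 only
	 * when chunk_size has exactly one bit set, i.e. is a power of 2.
	 */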
1292 	if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
1293 		ispower2 = 1;
1294 		chunk_offset = roundup2(sizeof(*slab), chunk_size);
1295 	} else {
1296 		ispower2 = 0;
1297 		chunk_offset = sizeof(*slab) + chunking - 1;
1298 		chunk_offset -= chunk_offset % chunking;
1299 	}
1300 
1301 	/*
1302 	 * Calculate a reasonable number of chunks for the slab.
1303 	 *
1304 	 * Once initialized the MaxChunks[] array can only ever be
1305 	 * reinitialized to the same value.
1306 	 */
1307 	maxchunks = MaxChunks[zi];
1308 	if (maxchunks == 0) {
1309 		/*
1310 		 * First calculate how many chunks would fit in 1/1024
1311 		 * available memory.  This is around 2MB on a 32 bit
1312 		 * system and 128G on a 64-bit (48-bits available) system.
1313 		 */
1314 		maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
1315 			    (ssize_t)chunk_size;
1316 		if (maxchunks <= 0)
1317 			maxchunks = 1;
1318 
1319 		/*
1320 		 * A slab cannot handle more than MAXCHUNKS chunks, but
1321 		 * limit us to approximately MAXCHUNKS / 2 here because
1322 		 * we may have to expand maxchunks when we calculate the
1323 		 * actual power-of-2 slab.
1324 		 */
1325 		if (maxchunks > MAXCHUNKS / 2)
1326 			maxchunks = MAXCHUNKS / 2;
1327 
1328 		/*
1329 		 * Try to limit the slabs to BIGSLABSIZE (~128MB).  Larger
1330 		 * slabs will be created if the allocation does not fit.
1331 		 */
1332 		if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
1333 			tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
1334 				    (ssize_t)chunk_size;
1335 			if (tmpchunks <= 0)
1336 				tmpchunks = 1;
1337 			if (maxchunks > tmpchunks)
1338 				maxchunks = tmpchunks;
1339 		}
1340 
1341 		/*
1342 		 * If the slab calculates to greater than 2MB see if we
1343 		 * can cut it down to ~2MB.  This controls VSZ but has
1344 		 * no effect on run-time size or performance.
1345 		 *
1346 		 * This is very important in case you core dump and also
1347 		 * important to reduce unnecessary region allocations.
1348 		 */
1349 		if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
1350 			tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
1351 				    (ssize_t)chunk_size;
1352 			if (tmpchunks < 1)
1353 				tmpchunks = 1;
1354 			if (maxchunks > tmpchunks)
1355 				maxchunks = tmpchunks;
1356 		}
1357 
1358 		/*
1359 		 * If the slab calculates to greater than LITSLABSIZE (32KB) see
1360 		 * if we can cut it down to ~LITSLABSIZE while still maintaining a
1361 		 * reasonably large number of chunks in each slab.  This
1362 		 * controls VSZ but has no effect on run-time size or
1363 		 * performance.
1364 		 *
1365 		 * This is very important in case you core dump and also
1366 		 * important to reduce unnecessary region allocations.
1367 		 */
1368 		if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
1369 			tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
1370 				    (ssize_t)chunk_size;
1371 			if (tmpchunks < 32)
1372 				tmpchunks = 32;
1373 			if (maxchunks > tmpchunks)
1374 				maxchunks = tmpchunks;
1375 		}
1376 
1377 		MaxChunks[zi] = maxchunks;
1378 	}
1379 	MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);
1380 
1381 	/*
1382 	 * Calculate the actual slab size.  maxchunks will be recalculated
1383 	 * a little later.
1384 	 */
1385 	slab_desire = chunk_offset + chunk_size * maxchunks;
1386 	slab_size = 8 * MAXCHUNKS;
1387 	power = 3 + MAXCHUNKS_BITS;
1388 	while (slab_size < slab_desire) {
1389 		slab_size <<= 1;
1390 		++power;
1391 	}
1392 
1393 	/*
1394 	 * Do a quick recalculation based on the actual slab size but not
1395 	 * yet dealing with whether the slab header is in-band or out-of-band.
1396 	 * The purpose here is to see if we can reasonably reduce slab_size
1397 	 * to a power of 4 to allow more chunk sizes to use the same slab
1398 	 * size.
1399 	 */
1400 	if ((power & 1) && slab_size > 32768) {
1401 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1402 		if (maxchunks >= MAXCHUNKS / 8) {
1403 			slab_size >>= 1;
1404 			--power;
1405 		}
1406 	}
1407 	if ((power & 2) && slab_size > 32768 * 4) {
1408 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1409 		if (maxchunks >= MAXCHUNKS / 4) {
1410 			slab_size >>= 2;
1411 			power -= 2;
1412 		}
1413 	}
1414 	/*
1415 	 * This case occurs when the slab_size is larger than 1/1024 available
1416 	 * UVM.
1417 	 */
1418 	nswath = slab_size / NREGIONS_SIZE;
1419 	if (nswath > NREGIONS)
1420 		return (NULL);
1421 
1422 
1423 	/*
1424 	 * Try to allocate from our current best region for this zi
1425 	 */
1426 	region_mask = ~(slab_size - 1);
1427 	ri = slgd->zone[zi].best_region;
1428 	if (Regions[ri].mask == region_mask) {
1429 		if ((slab = _vmem_alloc(ri, slab_size)) != NULL)
1430 			goto found;
1431 	}
1432 
1433 	/*
1434 	 * Try to find an existing region to allocate from.  The normal
1435 	 * case will be for allocations that are less than 1/1024 available
1436 	 * UVM, which fit into a single Regions[] entry.
1437 	 */
1438 	while (slab_size <= NREGIONS_SIZE) {
1439 		rx = -1;
1440 		for (ri = 0; ri < NREGIONS; ++ri) {
1441 			if (rx < 0 && Regions[ri].mask == 0)
1442 				rx = ri;
1443 			if (Regions[ri].mask == region_mask) {
1444 				slab = _vmem_alloc(ri, slab_size);
1445 				if (slab) {
1446 					slgd->zone[zi].best_region = ri;
1447 					goto found;
1448 				}
1449 			}
1450 		}
1451 
1452 		if (rx < 0)
1453 			return(NULL);
1454 
1455 		/*
1456 		 * This can fail, retry either way
1457 		 */
1458 		atomic_cmpset_ptr((void **)&Regions[rx].mask,
1459 				  NULL,
1460 				  (void *)region_mask);
1461 	}
1462 
1463 	for (;;) {
1464 		rx = -1;
1465 		for (ri = 0; ri < NREGIONS; ri += nswath) {
1466 			if (Regions[ri].mask == region_mask) {
1467 				slab = _vmem_alloc(ri, slab_size);
1468 				if (slab) {
1469 					slgd->zone[zi].best_region = ri;
1470 					goto found;
1471 				}
1472 			}
1473 			if (rx < 0) {
1474 				for (j = nswath - 1; j >= 0; --j) {
1475 					if (Regions[ri+j].mask != 0)
1476 						break;
1477 				}
1478 				if (j < 0)
1479 					rx = ri;
1480 			}
1481 		}
1482 
1483 		/*
1484 		 * We found a candidate, try to allocate it backwards so
1485 		 * another thread racing a slaballoc() does not see the
1486 		 * mask in the base index position until we are done.
1487 		 *
1488 		 * We can safely zero-out any partial allocations because
1489 		 * the mask is only accessed from the base index.  Any other
1490 		 * threads racing us will fail prior to us clearing the mask.
1491 		 */
1492 		if (rx < 0)
1493 			return(NULL);
1494 		for (j = nswath - 1; j >= 0; --j) {
1495 			if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
1496 					       NULL, (void *)region_mask)) {
1497 				while (++j < nswath)
1498 					Regions[rx+j].mask = 0;
1499 				break;
1500 			}
1501 		}
1502 		/* retry */
1503 	}
1504 
1505 	/*
1506 	 * Fill in the new slab in region ri.  If the slab_size completely
1507 	 * fills one or more region slots we move the slab structure out of
1508 	 * band which should optimize the chunking (particularly for a power
1509 	 * of 2).
1510 	 */
1511 found:
1512 	region = &Regions[ri];
1513 	MASSERT(region->slab == NULL);
1514 	if (slab_size >= NREGIONS_SIZE) {
1515 		save = slab;
1516 		slab = memalloc(sizeof(*slab), 0);
1517 		bzero(slab, sizeof(*slab));
1518 		slab->chunks = save;
1519 		for (j = 0; j < nswath; ++j)
1520 			region[j].slab = slab;
1521 		chunk_offset = 0;
1522 	} else {
1523 		bzero(slab, sizeof(*slab));
1524 		slab->chunks = (char *)slab + chunk_offset;
1525 	}
1526 
1527 	/*
1528 	 * Calculate the start of the chunks memory and recalculate the
1529 	 * actual number of chunks the slab can hold.
1530 	 */
1531 	maxchunks = (slab_size - chunk_offset) / chunk_size;
1532 	if (maxchunks > MAXCHUNKS)
1533 		maxchunks = MAXCHUNKS;
1534 
1535 	/*
1536 	 * And fill in the rest
1537 	 */
1538 	slab->magic = ZALLOC_SLAB_MAGIC;
1539 	slab->navail = maxchunks;
1540 	slab->nmax = maxchunks;
1541 	slab->slab_size = slab_size;
1542 	slab->chunk_size = chunk_size;
1543 	slab->zone_index = zi;
1544 	slab->slgd = slgd;
1545 	slab->state = UNKNOWN;
1546 	slab->region = region;
1547 
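	/*
	 * Mark the first maxchunks chunks free.  Bits past maxchunks stay
	 * zero (allocated) so allocation scans cannot run off the end.
	 */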
1548 	for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
1549 		if (ri + LONG_BITS <= maxchunks)
1550 			slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
1551 		else
1552 			slab->bitmap[ri >> LONG_BITS_SHIFT] =
1553 						(1LU << (maxchunks - ri)) - 1;
1554 	}
1555 	return (slab);
1556 }
1557 
1558 /*
1559  * Free a slab.
1560  */
1561 static void
1562 slabfree(slab_t slab)
1563 {
1564 	int nswath;
1565 	int j;
1566 
1567 	if (slab->region->slab == slab) {
1568 		/*
1569 		 * Out-of-band slab.
1570 		 */
1571 		nswath = slab->slab_size / NREGIONS_SIZE;
1572 		for (j = 0; j < nswath; ++j)
1573 			slab->region[j].slab = NULL;
1574 		slab->magic = 0;
1575 		_vmem_free(slab->chunks, slab->slab_size);
1576 		memfree(slab, 0);
1577 	} else {
1578 		/*
1579 		 * In-band slab.
1580 		 */
1581 		slab->magic = 0;
1582 		_vmem_free(slab, slab->slab_size);
1583 	}
1584 }
1585 
1586 /*
1587  * Terminate a slab's use in the current thread.  The slab may still have
1588  * outstanding allocations and thus not be deallocatable.
1589  */
1590 static void
1591 slabterm(slglobaldata_t slgd, slab_t slab)
1592 {
1593 	slglobaldata_t sldepot;
1594 	struct zoneinfo *zinfo;
1595 	int zi = slab->zone_index;
1596 
1597 	slab->slgd = NULL;
1598 	--slgd->nslabs;
1599 	sldepot = &slglobaldepot;
1600 	zinfo = &sldepot->zone[zi];
1601 
1602 	/*
1603 	 * Move the slab to the avail list or the empty list.
1604 	 */
1605 	if (__isthreaded)
1606 		_SPINLOCK(&sldepot->lock);
1607 	if (slab->navail == slab->nmax) {
1608 		slab->state = EMPTY;
1609 		slab->next = zinfo->empty_base;
1610 		zinfo->empty_base = slab;
1611 		++zinfo->empty_count;
1612 	} else {
1613 		slab->state = AVAIL;
1614 		slab->next = zinfo->avail_base;
1615 		zinfo->avail_base = slab;
1616 		++zinfo->avail_count;
1617 	}
1618 
1619 	/*
1620 	 * Clean extra slabs out of the empty list
1621 	 */
1622 	while (zinfo->empty_count > opt_cache) {
1623 		slab = zinfo->empty_base;
1624 		zinfo->empty_base = slab->next;
1625 		--zinfo->empty_count;
1626 		slab->state = UNKNOWN;
1627 		if (__isthreaded)
1628 			_SPINUNLOCK(&sldepot->lock);
1629 		slabfree(slab);
1630 		if (__isthreaded)
1631 			_SPINLOCK(&sldepot->lock);
1632 	}
1633 	if (__isthreaded)
1634 		_SPINUNLOCK(&sldepot->lock);
1635 }
1636 
1637 /*
1638  * _vmem_alloc()
1639  *
1640  *	Directly map memory in PAGE_SIZE'd chunks with the specified
1641  *	alignment.
1642  *
1643  *	Alignment must be a multiple of PAGE_SIZE.
1644  *
1645  *	Size must be >= alignment.
1646  */
1647 static void *
1648 _vmem_alloc(int ri, size_t slab_size)
1649 {
1650 	char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);
1651 	char *eaddr;
1652 	char *addr;
1653 	char *save;
1654 	uintptr_t excess;
1655 
1656 	if (slab_size < NREGIONS_SIZE)
1657 		eaddr = baddr + NREGIONS_SIZE;
1658 	else
1659 		eaddr = baddr + slab_size;
1660 
1661 	/*
1662 	 * This usually just works but might not.
1663 	 */
1664 	addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
1665 		    MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
1666 	if (addr == MAP_FAILED) {
1667 		return (NULL);
1668 	}
1669 	if (addr < baddr || addr + slab_size > eaddr) {
1670 		munmap(addr, slab_size);
1671 		return (NULL);
1672 	}
1673 
1674 	/*
1675 	 * Check alignment.  The misaligned offset is also the excess
1676 	 * amount.  If misaligned unmap the excess so we have a chance of
1677 	 * mapping at the next alignment point and recursively try again.
1678 	 *
1679 	 * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB	block alignment
1680 	 *   aaaaaaaaa aaaaaaaaaaa aa		mis-aligned allocation
1681 	 *   xxxxxxxxx				final excess calculation
1682 	 *   ^ returned address
1683 	 */
1684 	excess = (uintptr_t)addr & (slab_size - 1);
1685 	while (excess) {
1686 		excess = slab_size - excess;
1687 		save = addr;
1688 
1689 		munmap(save + excess, slab_size - excess);
1690 		addr = _vmem_alloc(ri, slab_size);
1691 		munmap(save, excess);
1692 		if (addr == NULL)
1693 			return (NULL);
1694 		if (addr < baddr || addr + slab_size > eaddr) {
1695 			munmap(addr, slab_size);
1696 			return (NULL);
1697 		}
1698 		excess = (uintptr_t)addr & (slab_size - 1);
1699 	}
1700 	return (addr);
1701 }
1702 
1703 /*
1704  * _vmem_free()
1705  *
1706  *	Free a chunk of memory allocated with _vmem_alloc()
1707  */
1708 static void
1709 _vmem_free(void *ptr, size_t size)
1710 {
1711 	munmap(ptr, size);
1712 }
1713 
1714 /*
1715  * Panic on fatal conditions
1716  */
1717 static void
1718 _mpanic(const char *ctl, ...)
1719 {
1720 	va_list va;
1721 
1722 	if (malloc_panic == 0) {
1723 		malloc_panic = 1;
1724 		va_start(va, ctl);
1725 		vfprintf(stderr, ctl, va);
1726 		fprintf(stderr, "\n");
1727 		fflush(stderr);
1728 		va_end(va);
1729 	}
1730 	abort();
1731 }
1732 
1733 __weak_reference(__aligned_alloc, aligned_alloc);
1734 __weak_reference(__malloc, malloc);
1735 __weak_reference(__calloc, calloc);
1736 __weak_reference(__posix_memalign, posix_memalign);
1737 __weak_reference(__realloc, realloc);
1738 __weak_reference(__free, free);
1739