xref: /dragonfly/lib/libc/stdlib/dmalloc.c (revision 51871435)
1 /*
2  * DMALLOC.C	- Dillon's malloc
3  *
4  * Copyright (c) 2011 The DragonFly Project. All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 /*
37  * This module implements a modified slab allocator drop-in replacement for
38  * the libc malloc().  The slab algorithm has been adjusted to support dynamic
39  * sizing of slabs which effectively allows slabs to be used for allocations of
40  * any size.  Because of this we have neither a small-block allocator nor a
41  * big-block allocator and the code paths are simplified to the point where
42  * allocations, caching, and freeing are screaming fast.
43  *
44  * There is very little interaction between threads.  A global depot accessed
45  * via atomic cmpxchg instructions (only! no spinlocks!) is used as a
46  * catch-all and to deal with thread exits and such.
47  *
48  * To support dynamic slab sizing available user virtual memory is broken
49  * down into ~1024 regions.  Each region has a fixed slab size whose value is
50  * set when the region is opened up for use.  The free() path simply applies
51  * a mask based on the region to the pointer to acquire the base of the
52  * governing slab structure.
53  *
54  * Regions[NREGIONS]	(1024)
55  *
56  * Slab management and locking is done on a per-zone basis.
57  *
58  *	Alloc Size	Chunking        Number of zones
59  *	0-127		8		16
60  *	128-255		16		8
61  *	256-511		32		8
62  *	512-1023	64		8
63  *	1024-2047	128		8
64  *	2048-4095	256		8
65  *	4096-8191	512		8
66  *	8192-16383	1024		8
67  *	16384-32767	2048		8
68  *	32768-65535	4096		8
69  *	... continues unlimited ...	4 zones
70  *
71  *	For a 2^63 memory space each doubling >= 64K is broken down into
72  *	4 chunking zones, so we support 88 + (48 * 4) = 280 zones.
73  *
74  *			   API FEATURES AND SIDE EFFECTS
75  *
76  *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
77  *	Above that, power-of-2 sized allocations are page-aligned.  Non-
78  *	power-of-2 sized allocations are aligned the same as the chunk
79  *	size for their zone.
80  *    + malloc(0) returns a special non-NULL value
81  *    + ability to allocate arbitrarily large chunks of memory
82  *    + realloc will reuse the passed pointer if possible, within the
83  *	limitations of the zone chunking.
84  *
85  *				FUTURE FEATURES
86  *
87  *    + [better] garbage collection
88  *    + better initial sizing.
89  *
90  * TUNING
91  *
92  * The value of the environment variable MALLOC_OPTIONS is a character string
93  * containing various flags to tune the allocator.  Upper case letters
94  * enable or increase a feature, lower case letters disable or decrease it.
95  *
96  * U		Enable UTRACE for all operations, observable with ktrace.
97  *		Disabled by default.
98  *
99  * Z		Zero out allocations, otherwise allocations (except for
100  *		calloc) will contain garbage.
101  *		Disabled by default.
102  *
103  * H		Pass a hint with madvise() about unused pages.
104  *		Disabled by default.
105  *		Not currently implemented.
106  *
107  * F		Disable local per-thread caching.
108  *		Disabled by default.
109  *
110  * C		Increase (decrease) how much excess cache to retain.
111  *		Set to 4 by default.
112  */
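
/*
 * Example (illustrative): running a program as
 *
 *	MALLOC_OPTIONS=ZCC ./prog
 *
 * enables zeroing of all allocations and raises the retained excess
 * cache from the default of 4 to 6, while MALLOC_OPTIONS=zF would
 * disable zeroing and disable local per-thread caching.
 */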
113 
114 /* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */
115 
116 #ifndef STANDALONE_DEBUG
117 #include "libc_private.h"
118 #endif
119 
120 #include <sys/param.h>
121 #include <sys/types.h>
122 #include <sys/mman.h>
123 #include <sys/queue.h>
124 #include <sys/uio.h>
125 #include <sys/ktrace.h>
126 #include <stdio.h>
127 #include <stdint.h>
128 #include <stdlib.h>
129 #include <stdarg.h>
130 #include <stddef.h>
131 #include <unistd.h>
132 #include <string.h>
133 #include <fcntl.h>
134 #include <errno.h>
135 #include <pthread.h>
136 #include <limits.h>
137 
138 #include <machine/atomic.h>
139 #include <machine/cpufunc.h>
140 
141 #ifdef STANDALONE_DEBUG
142 void _nmalloc_thr_init(void);
143 #else
144 #include "spinlock.h"
145 #include "un-namespace.h"
146 #endif
147 
148 #ifndef MAP_SIZEALIGN
149 #define MAP_SIZEALIGN	0
150 #endif
151 
152 #if SSIZE_MAX == 0x7FFFFFFF
153 #define ADDRBITS	32
154 #define UVM_BITS	32	/* worst case */
155 #else
156 #define ADDRBITS	64
157 #define UVM_BITS	48	/* worst case XXX */
158 #endif
159 
160 #if LONG_MAX == 0x7FFFFFFF
161 #define LONG_BITS	32
162 #define LONG_BITS_SHIFT	5
163 #else
164 #define LONG_BITS	64
165 #define LONG_BITS_SHIFT	6
166 #endif
167 
168 #define LOCKEDPTR	((void *)(intptr_t)-1)
169 
170 /*
171  * Regions[]
172  */
173 #define NREGIONS_BITS	10
174 #define NREGIONS	(1 << NREGIONS_BITS)
175 #define NREGIONS_MASK	(NREGIONS - 1)
176 #define NREGIONS_SHIFT	(UVM_BITS - NREGIONS_BITS)
177 #define NREGIONS_SIZE	(1LU << NREGIONS_SHIFT)
178 
179 typedef struct region *region_t;
180 typedef struct slglobaldata *slglobaldata_t;
181 typedef struct slab *slab_t;
182 
183 struct region {
184 	uintptr_t	mask;
185 	slab_t		slab;	/* conditional out of band slab */
186 };
187 
188 static struct region Regions[NREGIONS];
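
/*
 * Illustrative sketch (not used by the allocator itself): how a raw
 * pointer is mapped back to its governing slab.  The pointer's high bits
 * select a region; the region either points at an out-of-band slab
 * header or supplies the mask that yields the in-band slab base.  This
 * mirrors the lookups in memfree() and memrealloc() below.
 */
#if 0
static slab_t
ptr2slab(void *ptr)
{
	region_t region;
	slab_t slab;

	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
	if ((slab = region->slab) == NULL)
		slab = (slab_t)((uintptr_t)ptr & region->mask);
	return (slab);
}
#endif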
189 
190 /*
191  * Number of chunking zones available
192  */
193 #define CHUNKFACTOR	8
194 #if ADDRBITS == 32
195 #define NZONES		(16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
196 #else
197 #define NZONES		(16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
198 #endif
199 
200 static int MaxChunks[NZONES];
201 
202 #define NDEPOTS		8		/* must be power of 2 */
203 
204 /*
205  * Maximum number of chunks per slab, governed by the allocation bitmap in
206  * each slab.  The maximum is reduced for large chunk sizes.
207  */
208 #define MAXCHUNKS	(LONG_BITS * LONG_BITS)
209 #define MAXCHUNKS_BITS	(LONG_BITS_SHIFT * LONG_BITS_SHIFT)
210 #define LITSLABSIZE	(32 * 1024)
211 #define NOMSLABSIZE	(2 * 1024 * 1024)
212 #define BIGSLABSIZE	(128 * 1024 * 1024)
213 
214 #define ZALLOC_SLAB_MAGIC	0x736c6162	/* magic sanity */
215 
216 TAILQ_HEAD(slab_list, slab);
217 
218 /*
219  * A slab structure
220  */
221 struct slab {
222 	struct slab	*next;		/* slabs with available space */
223 	TAILQ_ENTRY(slab) entry;
224 	int32_t		magic;		/* magic number for sanity check */
225 	u_int		navail;		/* number of free elements available */
226 	u_int		nmax;
227 	u_int		free_bit;	/* free hint bitno */
228 	u_int		free_index;	/* free hint index */
229 	u_long		bitmap[LONG_BITS]; /* free chunks */
230 	size_t		slab_size;	/* size of entire slab */
231 	size_t		chunk_size;	/* chunk size for validation */
232 	int		zone_index;
233 	enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
234 	int		flags;
235 	region_t	region;		/* related region */
236 	char		*chunks;	/* chunk base */
237 	slglobaldata_t	slgd;
238 };
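
/*
 * Illustrative sketch (not part of the allocator): how a bitmap position
 * maps to a chunk address.  Bit (bmi, bno) covers the
 * ((bmi << LONG_BITS_SHIFT) + bno)'th chunk, which is the same address
 * math used by memalloc() and reversed by memfree().
 */
#if 0
static __inline void *
chunk_addr(slab_t slab, int bmi, int bno)
{
	return (slab->chunks +
		(size_t)((bmi << LONG_BITS_SHIFT) + bno) * slab->chunk_size);
}
#endif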
239 
240 /*
241  * per-thread data
242  */
243 struct slglobaldata {
244 	struct zoneinfo {
245 		slab_t	avail_base;
246 		slab_t	empty_base;
247 		int	best_region;
248 		int	empty_count;
249 	} zone[NZONES];
250 	struct slab_list full_zones;		/* via entry */
251 	int		masked;
252 	int		biggest_index;
253 	size_t		nslabs;
254 	slglobaldata_t	sldepot;
255 };
256 
257 #define SLAB_ZEROD		0x0001
258 
259 /*
260  * Misc constants.  Note that allocations that are exact multiples of
261  * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
262  * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
263  */
264 #define MIN_CHUNK_SIZE		8		/* in bytes */
265 #define MIN_CHUNK_MASK		(MIN_CHUNK_SIZE - 1)
266 
267 #define SAFLAG_ZERO	0x00000001
268 
269 /*
270  * The WEIRD_ADDR is used as known text to copy into free objects to
271  * try to create deterministic failure cases if the data is accessed after
272  * free.
273  *
274  * WARNING: A limited number of spinlocks are available, BIGXSIZE should
275  *	    not be larger than 64.
276  */
277 #define WEIRD_ADDR      0xdeadc0de
278 #define MAX_COPY        sizeof(weirdary)
279 
280 #define arysize(ary)	(sizeof(ary)/sizeof((ary)[0]))
281 
282 /*
283  * Thread control
284  */
285 
286 #define MASSERT(exp)	do { if (__predict_false(!(exp)))	\
287 				_mpanic("assertion: %s in %s",	\
288 				#exp, __func__);		\
289 			    } while (0)
290 
291 /* With this attribute set, no function call is required to access this
292  * variable when the code is compiled with -fPIC. */
293 #define TLS_ATTRIBUTE __attribute__ ((tls_model ("initial-exec")));
294 
295 static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
296 static pthread_key_t thread_malloc_key;
297 static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
298 static struct slglobaldata sldepots[NDEPOTS];
299 
300 static int opt_madvise = 0;
301 static int opt_free = 0;
302 static int opt_cache = 4;
303 static int opt_utrace = 0;
304 static int g_malloc_flags = 0;
305 static int malloc_panic;
306 static int malloc_started = 0;
307 
308 static const int32_t weirdary[16] = {
309 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
310 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
311 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
312 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
313 };
314 
315 static void *memalloc(size_t size, int flags);
316 static void *memrealloc(void *ptr, size_t size);
317 static void memfree(void *ptr, int);
318 static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
319 static void slabfree(slab_t slab);
320 static void slabterm(slglobaldata_t slgd, slab_t slab);
321 static void *_vmem_alloc(int ri, size_t slab_size);
322 static void _vmem_free(void *ptr, size_t slab_size);
323 static void _mpanic(const char *ctl, ...) __printflike(1, 2);
324 #ifndef STANDALONE_DEBUG
325 static void malloc_init(void) __constructor(0);
326 #else
327 static void malloc_init(void) __constructor(101);
328 #endif
329 
330 
331 struct nmalloc_utrace {
332 	void *p;
333 	size_t s;
334 	void *r;
335 };
336 
337 #define UTRACE(a, b, c)						\
338 	if (opt_utrace) {					\
339 		struct nmalloc_utrace ut = {			\
340 			.p = (a),				\
341 			.s = (b),				\
342 			.r = (c)				\
343 		};						\
344 		utrace(&ut, sizeof(ut));			\
345 	}
346 
347 #ifdef INVARIANTS
348 /*
349  * If enabled any memory allocated without M_ZERO is initialized to -1.
350  */
351 static int  use_malloc_pattern;
352 #endif
353 
354 static void
355 malloc_init(void)
356 {
357 	const char *p = NULL;
358 	static spinlock_t malloc_init_lock;
359 
360 	if (malloc_started)
361 		return;
362 
363 	if (__isthreaded) {
364 		_SPINLOCK(&malloc_init_lock);
365 		if (malloc_started) {
366 			_SPINUNLOCK(&malloc_init_lock);
367 			return;
368 		}
369 	}
370 
371 	Regions[0].mask = -1; /* disallow activity in lowest region */
372 
373 	if (issetugid() == 0)
374 		p = getenv("MALLOC_OPTIONS");
375 
376 	for (; p != NULL && *p != '\0'; p++) {
377 		switch(*p) {
378 		case 'u':
379 			opt_utrace = 0;
380 			break;
381 		case 'U':
382 			opt_utrace = 1;
383 			break;
384 		case 'h':
385 			opt_madvise = 0;
386 			break;
387 		case 'H':
388 			opt_madvise = 1;
389 			break;
390 		case 'c':
391 			if (opt_cache > 0)
392 				--opt_cache;
393 			break;
394 		case 'C':
395 			++opt_cache;
396 			break;
397 		case 'f':
398 			opt_free = 0;
399 			break;
400 		case 'F':
401 			opt_free = 1;
402 			break;
403 		case 'z':
404 			g_malloc_flags = 0;
405 			break;
406 		case 'Z':
407 			g_malloc_flags = SAFLAG_ZERO;
408 			break;
409 		default:
410 			break;
411 		}
412 	}
413 
414 	UTRACE((void *) -1, 0, NULL);
415 	_nmalloc_thr_init();
416 	malloc_started = 1;
417 
418 	if (__isthreaded)
419 		_SPINUNLOCK(&malloc_init_lock);
420 }
421 
422 /*
423  * We have to install a handler for nmalloc thread teardowns when
424  * the thread is created.  We cannot delay this because destructors in
425  * sophisticated userland programs can call malloc() for the first time
426  * during their thread exit.
427  *
428  * This routine is called directly from pthreads.
429  */
430 static void _nmalloc_thr_init_once(void);
431 static void _nmalloc_thr_destructor(void *thrp);
432 
433 void
434 _nmalloc_thr_init(void)
435 {
436 	static int did_init;
437 	static int SLGI;
438 	int slgi;
439 
440 	slgi = SLGI++;
441 	cpu_ccfence();
442 	TAILQ_INIT(&slglobal.full_zones);
443 	slglobal.sldepot = &sldepots[slgi & (NDEPOTS - 1)];
444 
445 	if (slglobal.masked)
446 		return;
447 
448 	slglobal.masked = 1;
449 	if (did_init == 0) {
450 		did_init = 1;
451 		pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
452 	}
453 	pthread_setspecific(thread_malloc_key, &slglobal);
454 	slglobal.masked = 0;
455 }
456 
457 /*
458  * Called just once
459  */
460 static void
461 _nmalloc_thr_init_once(void)
462 {
463 	int error;
464 
465 	error = pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
466 	if (error)
467 		abort();
468 }
469 
470 /*
471  * Called for each thread undergoing exit
472  *
473  * Move all of the thread's slabs into a depot.
474  */
475 static void
476 _nmalloc_thr_destructor(void *thrp)
477 {
478 	slglobaldata_t slgd = thrp;
479 	slab_t slab;
480 	int i;
481 
482 	slgd->masked = 1;
483 
484 	for (i = 0; i <= slgd->biggest_index; i++) {
485 		while ((slab = slgd->zone[i].empty_base) != NULL) {
486 			slgd->zone[i].empty_base = slab->next;
487 			slabterm(slgd, slab);
488 		}
489 
490 		while ((slab = slgd->zone[i].avail_base) != NULL) {
491 			slgd->zone[i].avail_base = slab->next;
492 			slabterm(slgd, slab);
493 		}
494 
495 		while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
496 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
497 			slabterm(slgd, slab);
498 		}
499 	}
500 }
501 
502 /*
503  * Calculate the zone index for the allocation request size and set the
504  * allocation request size to that particular zone's chunk size.
505  */
506 static __inline int
507 zoneindex(size_t *bytes, size_t *chunking)
508 {
509 	size_t n = (size_t)*bytes;
510 	size_t x;
511 	size_t c;
512 	int i;
513 
514 	if (n < 128) {
515 		*bytes = n = (n + 7) & ~7;
516 		*chunking = 8;
517 		return(n / 8);			/* 8 byte chunks, 16 zones */
518 	}
519 	if (n < 4096) {
520 		x = 256;
521 		c = x / (CHUNKFACTOR * 2);
522 		i = 16;
523 	} else {
524 		x = 8192;
525 		c = x / (CHUNKFACTOR * 2);
526 		i = 16 + CHUNKFACTOR * 5;  /* 256->512,1024,2048,4096,8192 */
527 	}
528 	while (n >= x) {
529 		x <<= 1;
530 		c <<= 1;
531 		i += CHUNKFACTOR;
532 		if (x == 0)
533 			_mpanic("slaballoc: byte value too high");
534 	}
535 	*bytes = n = (n + c - 1) & ~(c - 1);
536 	*chunking = c;
537 	return (i + n / c - CHUNKFACTOR);
538 #if 0
539 	*bytes = n = (n + c - 1) & ~(c - 1);
540 	*chunking = c;
541 	return (n / c + i);
542 
543 	if (n < 256) {
544 		*bytes = n = (n + 15) & ~15;
545 		*chunking = 16;
546 		return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);
547 	}
548 	if (n < 8192) {
549 		if (n < 512) {
550 			*bytes = n = (n + 31) & ~31;
551 			*chunking = 32;
552 			return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);
553 		}
554 		if (n < 1024) {
555 			*bytes = n = (n + 63) & ~63;
556 			*chunking = 64;
557 			return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);
558 		}
559 		if (n < 2048) {
560 			*bytes = n = (n + 127) & ~127;
561 			*chunking = 128;
562 			return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);
563 		}
564 		if (n < 4096) {
565 			*bytes = n = (n + 255) & ~255;
566 			*chunking = 256;
567 			return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);
568 		}
569 		*bytes = n = (n + 511) & ~511;
570 		*chunking = 512;
571 		return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);
572 	}
573 	if (n < 16384) {
574 		*bytes = n = (n + 1023) & ~1023;
575 		*chunking = 1024;
576 		return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);
577 	}
578 	if (n < 32768) {				/* 16384-32767 */
579 		*bytes = n = (n + 2047) & ~2047;
580 		*chunking = 2048;
581 		return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
582 	}
583 	if (n < 65536) {
584 		*bytes = n = (n + 4095) & ~4095;	/* 32768-65535 */
585 		*chunking = 4096;
586 		return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);
587 	}
588 
589 	x = 131072;
590 	c = 8192;
591 	i = CHUNKINGLO*10 - 1;
592 
593 	while (n >= x) {
594 		x <<= 1;
595 		c <<= 1;
596 		i += CHUNKINGHI;
597 		if (x == 0)
598 			_mpanic("slaballoc: byte value too high");
599 	}
600 	*bytes = n = (n + c - 1) & ~(c - 1);
601 	*chunking = c;
602 	return (n / c + i);
603 #endif
604 }
605 
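/*
 * Worked examples, hand-computed from zoneindex() above (illustrative
 * only):
 */
#if 0
	size_t bytes, chunking;

	bytes = 100;   zoneindex(&bytes, &chunking); /* ->104,   chunk 8,    zone 13 */
	bytes = 3000;  zoneindex(&bytes, &chunking); /* ->3072,  chunk 256,  zone 52 */
	bytes = 20000; zoneindex(&bytes, &chunking); /* ->20480, chunk 2048, zone 74 */
#endif
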
606 /*
607  * malloc() - call internal slab allocator
608  */
609 void *
610 malloc(size_t size)
611 {
612 	void *ptr;
613 
614 	ptr = memalloc(size, 0);
615 	if (ptr == NULL)
616 		errno = ENOMEM;
617 	else
618 		UTRACE(0, size, ptr);
619 	return(ptr);
620 }
621 
622 /*
623  * calloc() - call internal slab allocator
624  */
625 void *
626 calloc(size_t number, size_t size)
627 {
628 	void *ptr;
629 
630 	if (size != 0 && number > SIZE_MAX / size) {
		errno = ENOMEM;
		return (NULL);
	}
	ptr = memalloc(number * size, SAFLAG_ZERO);
631 	if (ptr == NULL)
632 		errno = ENOMEM;
633 	else
634 		UTRACE(0, number * size, ptr);
635 	return(ptr);
636 }
637 
638 /*
639  * realloc() (SLAB ALLOCATOR)
640  *
641  * We do not attempt to optimize this routine beyond reusing the same
642  * pointer if the new size fits within the chunking of the old pointer's
643  * zone.
644  */
645 void *
646 realloc(void *ptr, size_t size)
647 {
648 	void *ret;
649 
650 	if (ptr == NULL)
651 		ret = memalloc(size, 0);
652 	else
653 		ret = memrealloc(ptr, size);
654 	if (ret == NULL)
655 		errno = ENOMEM;
656 	else
657 		UTRACE(ptr, size, ret);
658 	return(ret);
659 }
660 
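/*
 * Worked example (illustrative, hand-computed from memrealloc() below):
 * a 200 byte request lands in the 208 byte chunk zone, so a modest
 * resize stays in place while a large shrink reallocates.
 */
#if 0
	void *p, *q;

	p = malloc(200);	/* rounded up to a 208 byte chunk */
	q = realloc(p, 150);	/* q == p: 150 <= 208 and 150 >= 208/2 */
	q = realloc(q, 50);	/* new 56 byte chunk, data copied, old freed */
#endif
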
661 /*
662  * posix_memalign()
663  *
664  * Allocate (size) bytes with an alignment of (alignment), where (alignment)
665  * is a power of 2 >= sizeof(void *).
666  *
667  * The slab allocator will allocate on power-of-2 boundaries up to at least
668  * PAGE_SIZE.  Otherwise we use the zoneindex mechanic to find a zone
669  * matching the requirements.
670  */
671 int
672 posix_memalign(void **memptr, size_t alignment, size_t size)
673 {
674 	size_t chunking;
675 	int zi;
676 
677 	/*
678 	 * OpenGroup spec issue 6 checks
679 	 */
680 	if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
681 		*memptr = NULL;
682 		return(EINVAL);
683 	}
684 	if (alignment < sizeof(void *)) {
685 		*memptr = NULL;
686 		return(EINVAL);
687 	}
688 
689 	/*
690 	 * XXX for now just find the nearest power of 2 >= size and also
691 	 * >= alignment and allocate that.
692 	 */
693 	while (alignment < size) {
694 		alignment <<= 1;
695 		if (alignment == 0)
696 			_mpanic("posix_memalign: byte value too high");
697 	}
698 	*memptr = memalloc(alignment, 0);
699 	return(*memptr ? 0 : ENOMEM);
700 }
701 
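/*
 * Worked example (illustrative): posix_memalign(&p, 64, 100) rounds the
 * request up to the next power of 2 >= both values (128), and per the
 * API notes at the top of the file a 128 byte allocation is 128-byte
 * aligned, which satisfies the requested 64-byte alignment.
 */
#if 0
	void *p;
	int error;

	error = posix_memalign(&p, 64, 100);	/* 128 bytes, 128-byte aligned */
#endif
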
702 /*
703  * free() (SLAB ALLOCATOR) - do the obvious
704  */
705 void
706 free(void *ptr)
707 {
708 	if (ptr) {
709 		UTRACE(ptr, 0, 0);
710 		memfree(ptr, 0);
711 	}
712 }
713 
714 /*
715  * memalloc()	(SLAB ALLOCATOR)
716  *
717  *	Allocate memory via the slab allocator.
718  */
719 static void *
720 memalloc(size_t size, int flags)
721 {
722 	slglobaldata_t slgd;
723 	struct zoneinfo *zinfo;
724 	slab_t slab;
725 	size_t chunking;
726 	int bmi;
727 	int bno;
728 	u_long *bmp;
729 	int zi;
730 #ifdef INVARIANTS
731 	int i;
732 #endif
733 	size_t off;
734 	char *obj;
735 
736 	if (!malloc_started)
737 		malloc_init();
738 
739 	/*
740 	 * If 0 bytes is requested we have to return a unique pointer, so
741 	 * allocate at least one byte.
742 	 */
743 	if (size == 0)
744 		size = 1;
745 
746 	/* Capture global flags */
747 	flags |= g_malloc_flags;
748 
749 	/* Compute allocation zone; zoneindex will panic on excessive sizes */
750 	zi = zoneindex(&size, &chunking);
751 	MASSERT(zi < NZONES);
752 	if (size == 0)
753 		return(NULL);
754 
755 	/*
756 	 * Locate a slab with available space.  If no slabs are available
757 	 * back off to the empty list, and if we still come up dry allocate
758 	 * a new slab (which will try the depot first).
759 	 */
760 retry:
761 	slgd = &slglobal;
762 	zinfo = &slgd->zone[zi];
763 	if ((slab = zinfo->avail_base) == NULL) {
764 		if ((slab = zinfo->empty_base) == NULL) {
765 			/*
766 			 * Still dry
767 			 */
768 			slab = slaballoc(zi, chunking, size);
769 			if (slab == NULL)
770 				return(NULL);
771 			slab->next = zinfo->avail_base;
772 			zinfo->avail_base = slab;
773 			slab->state = AVAIL;
774 			if (slgd->biggest_index < zi)
775 				slgd->biggest_index = zi;
776 			++slgd->nslabs;
777 		} else {
778 			/*
779 			 * Pulled from empty list
780 			 */
781 			zinfo->empty_base = slab->next;
782 			slab->next = zinfo->avail_base;
783 			zinfo->avail_base = slab;
784 			slab->state = AVAIL;
785 			--zinfo->empty_count;
786 		}
787 	}
788 
789 	/*
790 	 * Allocate a chunk out of the slab.  HOT PATH
791 	 *
792 	 * Only the thread owning the slab can allocate out of it.
793 	 *
794 	 * NOTE: The last bit in the bitmap is always marked allocated so
795 	 *	 we cannot overflow here.
796 	 */
797 	bno = slab->free_bit;
798 	bmi = slab->free_index;
799 	bmp = &slab->bitmap[bmi];
800 	if (*bmp & (1LU << bno)) {
801 		atomic_clear_long(bmp, 1LU << bno);
802 		obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
803 		slab->free_bit = (bno + 1) & (LONG_BITS - 1);
804 		atomic_add_int(&slab->navail, -1);
805 		if (flags & SAFLAG_ZERO)
806 			bzero(obj, size);
807 		return (obj);
808 	}
809 
810 	/*
811 	 * Allocate a chunk out of a slab.  COLD PATH
812 	 */
813 	if (slab->navail == 0) {
814 		zinfo->avail_base = slab->next;
815 		slab->state = FULL;
816 		TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
817 		goto retry;
818 	}
819 
820 	while (bmi < LONG_BITS) {
821 		bmp = &slab->bitmap[bmi];
822 		if (*bmp) {
823 			bno = bsflong(*bmp);
824 			atomic_clear_long(bmp, 1LU << bno);
825 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
826 					     size;
827 			slab->free_index = bmi;
828 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
829 			atomic_add_int(&slab->navail, -1);
830 			if (flags & SAFLAG_ZERO)
831 				bzero(obj, size);
832 			return (obj);
833 		}
834 		++bmi;
835 	}
836 	bmi = 0;
837 	while (bmi < LONG_BITS) {
838 		bmp = &slab->bitmap[bmi];
839 		if (*bmp) {
840 			bno = bsflong(*bmp);
841 			atomic_clear_long(bmp, 1LU << bno);
842 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
843 					     size;
844 			slab->free_index = bmi;
845 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
846 			atomic_add_int(&slab->navail, -1);
847 			if (flags & SAFLAG_ZERO)
848 				bzero(obj, size);
849 			return (obj);
850 		}
851 		++bmi;
852 	}
853 	_mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
854 	/* not reached */
855 	return NULL;
856 }
857 
858 /*
859  * Reallocate memory within the chunk
860  */
861 static void *
862 memrealloc(void *ptr, size_t nsize)
863 {
864 	region_t region;
865 	slglobaldata_t slgd;
866 	slab_t slab;
867 	size_t osize;
868 	char *obj;
869 	int flags;
870 
871 	/*
872 	 * If 0 bytes is requested we have to return a unique pointer, so
873 	 * allocate at least one byte.
874 	 */
875 	if (nsize == 0)
876 		nsize = 1;
877 
878 	/* Capture global flags */
879 	flags = g_malloc_flags;
880 
881 	/*
882 	 * Locate the zone by looking up the dynamic slab size mask based
883 	 * on the memory region the allocation resides in.
884 	 */
885 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
886 	if ((slab = region->slab) == NULL)
887 		slab = (void *)((uintptr_t)ptr & region->mask);
888 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
889 	osize = slab->chunk_size;
890 	if (nsize <= osize) {
891 		if (osize < 32 || nsize >= osize / 2) {
892 			obj = ptr;
893 			if ((flags & SAFLAG_ZERO) && nsize < osize)
894 				bzero(obj + nsize, osize - nsize);
895 			return(obj);
896 		}
897 	}
898 
899 	/*
900 	 * Otherwise resize the object
901 	 */
902 	obj = memalloc(nsize, 0);
903 	if (obj) {
904 		if (nsize > osize)
905 			nsize = osize;
906 		bcopy(ptr, obj, nsize);
907 		memfree(ptr, 0);
908 	}
909 	return (obj);
910 }
911 
912 /*
913  * free (SLAB ALLOCATOR)
914  *
915  * Free a memory block previously allocated by malloc.
916  *
917  * MPSAFE
918  */
919 static void
920 memfree(void *ptr, int flags)
921 {
922 	region_t region;
923 	slglobaldata_t slgd;
924 	slab_t slab;
925 	slab_t stmp;
926 	slab_t *slabp;
927 	char *obj;
928 	int bmi;
929 	int bno;
930 	u_long *bmp;
931 
932 	/*
933 	 * Locate the zone by looking up the dynamic slab size mask based
934 	 * on the memory region the allocation resides in.
935 	 *
936 	 * WARNING!  The slab may be owned by another thread!
937 	 */
938 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
939 	if ((slab = region->slab) == NULL)
940 		slab = (void *)((uintptr_t)ptr & region->mask);
941 	MASSERT(slab != NULL);
942 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
943 
944 #ifdef INVARIANTS
945 	/*
946 	 * Put weird data into the memory to detect modifications after
947 	 * freeing, illegal pointer use after freeing (we should fault on
948 	 * the odd address), and so forth.
949 	 */
950 	if (slab->chunk_size < sizeof(weirdary))
951 		bcopy(weirdary, ptr, slab->chunk_size);
952 	else
953 		bcopy(weirdary, ptr, sizeof(weirdary));
954 #endif
955 
956 	bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
957 	bmi = bno >> LONG_BITS_SHIFT;
958 	bno &= (LONG_BITS - 1);
959 	bmp = &slab->bitmap[bmi];
960 
961 	MASSERT(bmi >= 0 && bmi < slab->nmax);
962 	MASSERT((*bmp & (1LU << bno)) == 0);
963 	atomic_set_long(bmp, 1LU << bno);
964 	atomic_add_int(&slab->navail, 1);
965 
966 	/*
967 	 * We can only do the following if we own the slab
968 	 */
969 	slgd = &slglobal;
970 	if (slab->slgd == slgd) {
971 		struct zoneinfo *zinfo;
972 
973 		if (slab->free_index > bmi) {
974 			slab->free_index = bmi;
975 			slab->free_bit = bno;
976 		} else if (slab->free_index == bmi &&
977 			   slab->free_bit > bno) {
978 			slab->free_bit = bno;
979 		}
980 		zinfo = &slgd->zone[slab->zone_index];
981 
982 		/*
983 		 * Freeing an object from a full slab will move it to the
984 		 * available list.  If the available list already has a
985 		 * slab we terminate the full slab instead, moving it to
986 		 * the depot.
987 		 */
988 		if (slab->state == FULL) {
989 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
990 			if (zinfo->avail_base == NULL) {
991 				slab->state = AVAIL;
992 				stmp = zinfo->avail_base;
993 				slab->next = stmp;
994 				zinfo->avail_base = slab;
995 			} else {
996 				slabterm(slgd, slab);
997 				goto done;
998 			}
999 		}
1000 
1001 		/*
1002 		 * If the slab becomes completely empty dispose of it in
1003 		 * some manner.  By default each thread caches up to 4
1004 		 * empty slabs.  Only small slabs are cached.
1005 		 */
1006 		if (slab->navail == slab->nmax && slab->state == AVAIL) {
1007 			/*
1008 			 * Remove slab from available queue
1009 			 */
1010 			slabp = &zinfo->avail_base;
1011 			while ((stmp = *slabp) != slab)
1012 				slabp = &stmp->next;
1013 			*slabp = slab->next;
1014 
1015 			if (opt_free || opt_cache == 0) {
1016 				/*
1017 				 * If local caching is disabled cache the
1018 				 * slab in the depot (or free it).
1019 				 */
1020 				slabterm(slgd, slab);
1021 			} else if (slab->slab_size > BIGSLABSIZE) {
1022 				/*
1023 				 * We do not try to retain large slabs
1024 				 * in per-thread caches.
1025 				 */
1026 				slabterm(slgd, slab);
1027 			} else if (zinfo->empty_count > opt_cache) {
1028 				/*
1029 				 * We have too many slabs cached, but
1030 				 * instead of freeing this one free
1031 				 * an empty slab that's been idle longer.
1032 				 *
1033 				 * (empty_count does not change)
1034 				 */
1035 				stmp = zinfo->empty_base;
1036 				slab->state = EMPTY;
1037 				slab->next = stmp->next;
1038 				zinfo->empty_base = slab;
1039 				slabterm(slgd, stmp);
1040 			} else {
1041 				/*
1042 				 * Cache the empty slab in our thread local
1043 				 * empty list.
1044 				 */
1045 				++zinfo->empty_count;
1046 				slab->state = EMPTY;
1047 				slab->next = zinfo->empty_base;
1048 				zinfo->empty_base = slab;
1049 			}
1050 		}
1051 	}
1052 done:
1053 	;
1054 }
1055 
1056 /*
1057  * Allocate a new slab holding objects of size chunk_size.
1058  */
1059 static slab_t
1060 slaballoc(int zi, size_t chunking, size_t chunk_size)
1061 {
1062 	slglobaldata_t slgd;
1063 	slglobaldata_t sldepot;
1064 	struct zoneinfo *zinfo;
1065 	region_t region;
1066 	void *save;
1067 	slab_t slab;
1068 	slab_t stmp;
1069 	size_t slab_desire;
1070 	size_t slab_size;
1071 	size_t region_mask;
1072 	uintptr_t chunk_offset;
1073 	ssize_t maxchunks;
1074 	ssize_t tmpchunks;
1075 	int ispower2;
1076 	int power;
1077 	int ri;
1078 	int rx;
1079 	int nswath;
1080 	int j;
1081 
1082 	/*
1083 	 * First look in the depot.  Any given zone in the depot may be
1084 	 * locked by being set to -1.  We have to do this instead of simply
1085 	 * removing the entire chain because removing the entire chain can
1086 	 * cause racing threads to allocate local slabs for large objects,
1087 	 * resulting in a large VSZ.
1088 	 */
1089 	slgd = &slglobal;
1090 	sldepot = slgd->sldepot;
1091 	zinfo = &sldepot->zone[zi];
1092 
1093 	while ((slab = zinfo->avail_base) != NULL) {
1094 		if ((void *)slab == LOCKEDPTR) {
1095 			cpu_pause();
1096 			continue;
1097 		}
1098 		if (atomic_cmpset_ptr(&zinfo->avail_base, slab, LOCKEDPTR)) {
1099 			MASSERT(slab->slgd == NULL);
1100 			slab->slgd = slgd;
1101 			zinfo->avail_base = slab->next;
1102 			return(slab);
1103 		}
1104 	}
1105 
1106 	/*
1107 	 * Nothing in the depot, allocate a new slab by locating or assigning
1108 	 * a region and then using the system virtual memory allocator.
1109 	 */
1110 	slab = NULL;
1111 
1112 	/*
1113 	 * Calculate the start of the data chunks relative to the start
1114 	 * of the slab.
1115 	 */
1116 	if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
1117 		ispower2 = 1;
1118 		chunk_offset = (sizeof(*slab) + (chunk_size - 1)) &
1119 			       ~(chunk_size - 1);
1120 	} else {
1121 		ispower2 = 0;
1122 		chunk_offset = sizeof(*slab) + chunking - 1;
1123 		chunk_offset -= chunk_offset % chunking;
1124 	}
1125 
1126 	/*
1127 	 * Calculate a reasonable number of chunks for the slab.
1128 	 *
1129 	 * Once initialized the MaxChunks[] array can only ever be
1130 	 * reinitialized to the same value.
1131 	 */
1132 	maxchunks = MaxChunks[zi];
1133 	if (maxchunks == 0) {
1134 		/*
1135 		 * First calculate how many chunks would fit in 1/1024
1136 		 * available memory.  This is around 2MB on a 32 bit
1137 		 * system and 128G on a 64-bit (48-bits available) system.
1138 		 */
1139 		maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
1140 			    (ssize_t)chunk_size;
1141 		if (maxchunks <= 0)
1142 			maxchunks = 1;
1143 
1144 		/*
1145 		 * A slab cannot handle more than MAXCHUNKS chunks, but
1146 		 * limit us to approximately MAXCHUNKS / 2 here because
1147 		 * we may have to expand maxchunks when we calculate the
1148 		 * actual power-of-2 slab.
1149 		 */
1150 		if (maxchunks > MAXCHUNKS / 2)
1151 			maxchunks = MAXCHUNKS / 2;
1152 
1153 		/*
1154 		 * Try to limit the slabs to BIGSLABSIZE (~128MB).  Larger
1155 		 * slabs will be created if the allocation does not fit.
1156 		 */
1157 		if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
1158 			tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
1159 				    (ssize_t)chunk_size;
1160 			if (tmpchunks <= 0)
1161 				tmpchunks = 1;
1162 			if (maxchunks > tmpchunks)
1163 				maxchunks = tmpchunks;
1164 		}
1165 
1166 		/*
1167 		 * If the slab calculates to greater than 2MB see if we
1168 		 * can cut it down to ~2MB.  This controls VSZ but has
1169 		 * no effect on run-time size or performance.
1170 		 *
1171 		 * This is very important in case you core dump and also
1172 		 * important to reduce unnecessary region allocations.
1173 		 */
1174 		if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
1175 			tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
1176 				    (ssize_t)chunk_size;
1177 			if (tmpchunks < 1)
1178 				tmpchunks = 1;
1179 			if (maxchunks > tmpchunks)
1180 				maxchunks = tmpchunks;
1181 		}
1182 
1183 		/*
1184 		 * If the slab calculates to greater than 128K see if we
1185 		 * can cut it down to ~128K while still maintaining a
1186 		 * reasonably large number of chunks in each slab.  This
1187 		 * controls VSZ but has no effect on run-time size or
1188 		 * performance.
1189 		 *
1190 		 * This is very important in case you core dump and also
1191 		 * important to reduce unnecessary region allocations.
1192 		 */
1193 		if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
1194 			tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
1195 				    (ssize_t)chunk_size;
1196 			if (tmpchunks < 32)
1197 				tmpchunks = 32;
1198 			if (maxchunks > tmpchunks)
1199 				maxchunks = tmpchunks;
1200 		}
1201 
1202 		MaxChunks[zi] = maxchunks;
1203 	}
1204 	MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);
1205 
1206 	/*
1207 	 * Calculate the actual slab size.  maxchunks will be recalculated
1208 	 * a little later.
1209 	 */
1210 	slab_desire = chunk_offset + chunk_size * maxchunks;
1211 	slab_size = 8 * MAXCHUNKS;
1212 	power = 3 + MAXCHUNKS_BITS;
1213 	while (slab_size < slab_desire) {
1214 		slab_size <<= 1;
1215 		++power;
1216 	}
1217 
1218 	/*
1219 	 * Do a quick recalculation based on the actual slab size but not
1220 	 * yet dealing with whether the slab header is in-band or out-of-band.
1221 	 * The purpose here is to see if we can reasonably reduce slab_size
1222 	 * to a power of 4 to allow more chunk sizes to use the same slab
1223 	 * size.
1224 	 */
1225 	if ((power & 1) && slab_size > 32768) {
1226 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1227 		if (maxchunks >= MAXCHUNKS / 8) {
1228 			slab_size >>= 1;
1229 			--power;
1230 		}
1231 	}
1232 	if ((power & 2) && slab_size > 32768 * 4) {
1233 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1234 		if (maxchunks >= MAXCHUNKS / 4) {
1235 			slab_size >>= 2;
1236 			power -= 2;
1237 		}
1238 	}
1239 	/*
1240 	 * This case occurs when the slab_size is larger than 1/1024 available
1241 	 * UVM.
1242 	 */
1243 	nswath = slab_size / NREGIONS_SIZE;
1244 	if (nswath > NREGIONS)
1245 		return (NULL);
1246 
1248 	/*
1249 	 * Try to allocate from our current best region for this zi
1250 	 */
1251 	region_mask = ~(slab_size - 1);
1252 	ri = slgd->zone[zi].best_region;
1253 	if (Regions[ri].mask == region_mask) {
1254 		if ((slab = _vmem_alloc(ri, slab_size)) != NULL)
1255 			goto found;
1256 	}
1257 
1258 	/*
1259 	 * Try to find an existing region to allocate from.  The normal
1260 	 * case will be for allocations that are less than 1/1024 available
1261 	 * UVM, which fit into a single Regions[] entry.
1262 	 */
1263 	while (slab_size <= NREGIONS_SIZE) {
1264 		rx = -1;
1265 		for (ri = 0; ri < NREGIONS; ++ri) {
1266 			if (rx < 0 && Regions[ri].mask == 0)
1267 				rx = ri;
1268 			if (Regions[ri].mask == region_mask) {
1269 				slab = _vmem_alloc(ri, slab_size);
1270 				if (slab) {
1271 					slgd->zone[zi].best_region = ri;
1272 					goto found;
1273 				}
1274 			}
1275 		}
1276 
1277 		if (rx < 0)
1278 			return(NULL);
1279 
1280 		/*
1281 		 * This can fail, retry either way
1282 		 */
1283 		atomic_cmpset_ptr((void **)&Regions[rx].mask,
1284 				  NULL,
1285 				  (void *)region_mask);
1286 	}
1287 
1288 	for (;;) {
1289 		rx = -1;
1290 		for (ri = 0; ri < NREGIONS; ri += nswath) {
1291 			if (Regions[ri].mask == region_mask) {
1292 				slab = _vmem_alloc(ri, slab_size);
1293 				if (slab) {
1294 					slgd->zone[zi].best_region = ri;
1295 					goto found;
1296 				}
1297 			}
1298 			if (rx < 0) {
1299 				for (j = nswath - 1; j >= 0; --j) {
1300 					if (Regions[ri+j].mask != 0)
1301 						break;
1302 				}
1303 				if (j < 0)
1304 					rx = ri;
1305 			}
1306 		}
1307 
1308 		/*
1309 		 * We found a candidate, try to allocate it backwards so
1310 		 * another thread racing a slaballoc() does not see the
1311 		 * mask in the base index position until we are done.
1312 		 *
1313 		 * We can safely zero-out any partial allocations because
1314 		 * the mask is only accessed from the base index.  Any other
1315 		 * threads racing us will fail prior to us clearing the mask.
1316 		 */
1317 		if (rx < 0)
1318 			return(NULL);
1319 		for (j = nswath - 1; j >= 0; --j) {
1320 			if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
1321 					       NULL, (void *)region_mask)) {
1322 				while (++j < nswath)
1323 					Regions[rx+j].mask = 0;
1324 				break;
1325 			}
1326 		}
1327 		/* retry */
1328 	}
1329 
1330 	/*
1331 	 * Fill in the new slab in region ri.  If the slab_size completely
1332 	 * fills one or more region slots we move the slab structure out of
1333 	 * band which should optimize the chunking (particularly for a power
1334 	 * of 2).
1335 	 */
1336 found:
1337 	region = &Regions[ri];
1338 	MASSERT(region->slab == NULL);
1339 	if (slab_size >= NREGIONS_SIZE) {
1340 		save = slab;
1341 		slab = memalloc(sizeof(*slab), 0);
1342 		bzero(slab, sizeof(*slab));
1343 		slab->chunks = save;
1344 		for (j = 0; j < nswath; ++j)
1345 			region[j].slab = slab;
1346 		chunk_offset = 0;
1347 	} else {
1348 		bzero(slab, sizeof(*slab));
1349 		slab->chunks = (char *)slab + chunk_offset;
1350 	}
1351 
1352 	/*
1353 	 * Calculate the start of the chunks memory and recalculate the
1354 	 * actual number of chunks the slab can hold.
1355 	 */
1356 	maxchunks = (slab_size - chunk_offset) / chunk_size;
1357 	if (maxchunks > MAXCHUNKS)
1358 		maxchunks = MAXCHUNKS;
1359 
1360 	/*
1361 	 * And fill in the rest
1362 	 */
1363 	slab->magic = ZALLOC_SLAB_MAGIC;
1364 	slab->navail = maxchunks;
1365 	slab->nmax = maxchunks;
1366 	slab->slab_size = slab_size;
1367 	slab->chunk_size = chunk_size;
1368 	slab->zone_index = zi;
1369 	slab->slgd = slgd;
1370 	slab->state = UNKNOWN;
1371 	slab->region = region;
1372 
1373 	for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
1374 		if (ri + LONG_BITS <= maxchunks)
1375 			slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
1376 		else
1377 			slab->bitmap[ri >> LONG_BITS_SHIFT] =
1378 						(1LU << (maxchunks - ri)) - 1;
1379 	}
1380 	return (slab);
1381 }
1382 
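/*
 * Example of the bitmap initialization above (illustrative): on a 64-bit
 * system a slab with maxchunks == 100 ends up with bitmap[0] = ULONG_MAX
 * (chunks 0-63 free) and bitmap[1] = (1LU << 36) - 1 (chunks 64-99 free),
 * while the remaining words stay zero from the earlier bzero().
 */
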
1383 /*
1384  * Free a slab.
1385  */
1386 static void
1387 slabfree(slab_t slab)
1388 {
1389 	int nswath;
1390 	int j;
1391 
1392 	if (slab->region->slab == slab) {
1393 		/*
1394 		 * Out-of-band slab.
1395 		 */
1396 		nswath = slab->slab_size / NREGIONS_SIZE;
1397 		for (j = 0; j < nswath; ++j)
1398 			slab->region[j].slab = NULL;
1399 		slab->magic = 0;
1400 		_vmem_free(slab->chunks, slab->slab_size);
1401 		memfree(slab, 0);
1402 	} else {
1403 		/*
1404 		 * In-band slab.
1405 		 */
1406 		slab->magic = 0;
1407 		_vmem_free(slab, slab->slab_size);
1408 	}
1409 }
1410 
1411 /*
1412  * Terminate a slab's use in the current thread.  The slab may still have
1413  * outstanding allocations and thus not be deallocatable.
1414  */
1415 static void
1416 slabterm(slglobaldata_t slgd, slab_t slab)
1417 {
1418 	slglobaldata_t sldepot = slgd->sldepot;
1419 	struct zoneinfo *zinfo;
1420 	slab_t dnext;
1421 	int zi = slab->zone_index;
1422 
1423 	slab->slgd = NULL;
1424 	--slgd->nslabs;
1425 	zinfo = &sldepot->zone[zi];
1426 
1427 	/*
1428 	 * If the slab can be freed and the depot is either locked or not
1429 	 * empty, then free the slab.
1430 	 */
1431 	if (slab->navail == slab->nmax && zinfo->avail_base) {
1432 		slab->state = UNKNOWN;
1433 		slabfree(slab);
1434 		return;
1435 	}
1436 	slab->state = AVAIL;
1437 
1438 	/*
1439 	 * Link the slab into the depot
1440 	 */
1441 	for (;;) {
1442 		dnext = zinfo->avail_base;
1443 		cpu_ccfence();
1444 		if ((void *)dnext == LOCKEDPTR) {
1445 			cpu_pause();
1446 			continue;
1447 		}
1448 		slab->next = dnext;
1449 		if (atomic_cmpset_ptr(&zinfo->avail_base, dnext, slab))
1450 			break;
1451 	}
1452 }
1453 
1454 /*
1455  * _vmem_alloc()
1456  *
1457  *	Directly map memory in PAGE_SIZE'd chunks with the specified
1458  *	alignment.
1459  *
1460  *	Alignment must be a multiple of PAGE_SIZE.
1461  *
1462  *	Size must be >= alignment.
1463  */
1464 static void *
1465 _vmem_alloc(int ri, size_t slab_size)
1466 {
1467 	char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);
1468 	char *eaddr;
1469 	char *addr;
1470 	char *save;
1471 	uintptr_t excess;
1472 
1473 	if (slab_size < NREGIONS_SIZE)
1474 		eaddr = baddr + NREGIONS_SIZE;
1475 	else
1476 		eaddr = baddr + slab_size;
1477 
1478 	/*
1479 	 * This usually just works but might not.
1480 	 */
1481 	addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
1482 		    MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
1483 	if (addr == MAP_FAILED) {
1484 		return (NULL);
1485 	}
1486 	if (addr < baddr || addr + slab_size > eaddr) {
1487 		munmap(addr, slab_size);
1488 		return (NULL);
1489 	}
1490 
1491 	/*
1492 	 * Check alignment.  The misaligned offset is also the excess
1493 	 * amount.  If misaligned unmap the excess so we have a chance of
1494 	 * mapping at the next alignment point and recursively try again.
1495 	 *
1496 	 * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB	block alignment
1497 	 *   aaaaaaaaa aaaaaaaaaaa aa		mis-aligned allocation
1498 	 *   xxxxxxxxx				final excess calculation
1499 	 *   ^ returned address
1500 	 */
1501 	excess = (uintptr_t)addr & (slab_size - 1);
1502 	while (excess) {
1503 		excess = slab_size - excess;
1504 		save = addr;
1505 
1506 		munmap(save + excess, slab_size - excess);
1507 		addr = _vmem_alloc(ri, slab_size);
1508 		munmap(save, excess);
1509 		if (addr == NULL)
1510 			return (NULL);
1511 		if (addr < baddr || addr + slab_size > eaddr) {
1512 			munmap(addr, slab_size);
1513 			return (NULL);
1514 		}
1515 		excess = (uintptr_t)addr & (slab_size - 1);
1516 	}
1517 	return (addr);
1518 }
1519 
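/*
 * Worked example of the alignment trimming above (illustrative): with
 * slab_size = 64KB, suppose mmap() returns an address 0x4000 bytes past
 * a 64KB boundary.  excess becomes 0xC000, the 0x4000 bytes of the
 * mapping at and beyond the next 64KB boundary are unmapped so the
 * recursive attempt can land there, and the misaligned 0xC000 byte head
 * is then unmapped as well.
 */
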
1520 /*
1521  * _vmem_free()
1522  *
1523  *	Free a chunk of memory allocated with _vmem_alloc()
1524  */
1525 static void
1526 _vmem_free(void *ptr, size_t size)
1527 {
1528 	munmap(ptr, size);
1529 }
1530 
1531 /*
1532  * Panic on fatal conditions
1533  */
1534 static void
1535 _mpanic(const char *ctl, ...)
1536 {
1537 	va_list va;
1538 
1539 	if (malloc_panic == 0) {
1540 		malloc_panic = 1;
1541 		va_start(va, ctl);
1542 		vfprintf(stderr, ctl, va);
1543 		fprintf(stderr, "\n");
1544 		fflush(stderr);
1545 		va_end(va);
1546 	}
1547 	abort();
1548 }
1549