1 /*
2  * DMALLOC.C	- Dillon's malloc
3  *
4  * Copyright (c) 2011,2017 The DragonFly Project. All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 /*
37  * This module implements a modified slab allocator as a drop-in replacement
38  * for the libc malloc().  The slab algorithm has been adjusted to support
39  * dynamic sizing of slabs which effectively allows slabs to be used for
 * allocations of any size.  Because of this we have neither a small-block
 * allocator nor a big-block allocator, and the code paths are simplified.
42  *
 * To support dynamic slab sizing, available user virtual memory is broken
 * down into ~1024 regions.  Each region has a fixed slab size whose value is
45  * set when the region is opened up for use.  The free() path simply applies
46  * a mask based on the region to the pointer to acquire the base of the
47  * governing slab structure.
48  *
49  * Regions[NREGIONS]	(1024)
50  *
51  * Slab management and locking is done on a per-zone basis.
52  *
53  *	Alloc Size	Chunking        Number of zones
54  *	0-127		8		16
55  *	128-255		16		8
56  *	256-511		32		8
57  *	512-1023	64		8
58  *	1024-2047	128		8
59  *	2048-4095	256		8
60  *	4096-8191	512		8
61  *	8192-16383	1024		8
62  *	16384-32767	2048		8
63  *	32768-65535	4096		8
64  *	... continues forever ...	4 zones
65  *
66  *	For a 2^63 memory space each doubling >= 64K is broken down into
67  *	4 chunking zones, so we support 88 + (48 * 4) = 280 zones.
68  *
69  *			   API FEATURES AND SIDE EFFECTS
70  *
71  *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
 *	Above that, power-of-2 sized allocations are page-aligned.  Non-
 *	power-of-2 sized allocations are aligned the same as the chunk
74  *	size for their zone.
75  *    + malloc(0) returns a special non-NULL value
76  *    + ability to allocate arbitrarily large chunks of memory
77  *    + realloc will reuse the passed pointer if possible, within the
78  *	limitations of the zone chunking.
79  *
 * On top of the slab allocator we also implement a 16-entry-per-thread
81  * magazine cache for allocations <= NOMSLABSIZE.
82  *
83  *				FUTURE FEATURES
84  *
85  *    + [better] garbage collection
86  *    + better initial sizing.
87  *
88  * TUNING
89  *
90  * The value of the environment variable MALLOC_OPTIONS is a character string
 * containing various flags to tune the allocator.  Upper case letters enable
 * or increase a feature, lower case letters disable or decrease it.
93  *
94  * U		Enable UTRACE for all operations, observable with ktrace.
 *		Disabled by default.
96  *
97  * Z		Zero out allocations, otherwise allocations (except for
98  *		calloc) will contain garbage.
99  *		Disabled by default.
100  *
101  * H		Pass a hint with madvise() about unused pages.
102  *		Disabled by default.
103  *		Not currently implemented.
104  *
105  * F		Disable local per-thread caching.
106  *		Disabled by default.
107  *
108  * C		Increase (decrease) how much excess cache to retain.
109  *		Set to 4 by default.
110  */
111 
112 /* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */
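/*
 * Illustrative usage (assumed invocation; paths and program name are
 * placeholders): preload the standalone build and tune it through
 * MALLOC_OPTIONS, e.g.
 *
 *	MALLOC_OPTIONS=UZ LD_PRELOAD=./dmalloc.so ./prog
 *
 * enables utrace records and zeroing of allocations as described in the
 * TUNING section above.
 */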
113 
114 #ifndef STANDALONE_DEBUG
115 #include "libc_private.h"
116 #endif
117 
118 #include <sys/param.h>
119 #include <sys/types.h>
120 #include <sys/mman.h>
121 #include <sys/queue.h>
122 #include <sys/uio.h>
123 #include <sys/ktrace.h>
124 #include <stdio.h>
125 #include <stdint.h>
126 #include <stdlib.h>
127 #include <stdarg.h>
128 #include <stddef.h>
129 #include <unistd.h>
130 #include <string.h>
131 #include <fcntl.h>
132 #include <errno.h>
133 #include <pthread.h>
134 #include <limits.h>
135 
136 #include <machine/atomic.h>
137 #include <machine/cpufunc.h>
138 
139 #ifdef STANDALONE_DEBUG
140 void _nmalloc_thr_init(void);
141 #else
142 #include "spinlock.h"
143 #include "un-namespace.h"
144 #endif
145 
146 #ifndef MAP_SIZEALIGN
147 #define MAP_SIZEALIGN	0
148 #endif
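/*
 * NOTE: MAP_SIZEALIGN is a DragonFly-specific mmap(2) hint requesting a
 * mapping aligned to its size.  Defining it to 0 above lets builds that
 * lack the flag fall back to plain mmap() plus the manual alignment retry
 * done in _vmem_alloc().
 */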
149 
150 #if SSIZE_MAX == 0x7FFFFFFF
151 #define ADDRBITS	32
152 #define UVM_BITS	32	/* worst case */
153 #else
154 #define ADDRBITS	64
155 #define UVM_BITS	48	/* worst case XXX */
156 #endif
157 
158 #if LONG_MAX == 0x7FFFFFFF
159 #define LONG_BITS	32
160 #define LONG_BITS_SHIFT	5
161 #else
162 #define LONG_BITS	64
163 #define LONG_BITS_SHIFT	6
164 #endif
165 
166 #define LOCKEDPTR	((void *)(intptr_t)-1)
167 
168 /*
169  * Regions[]
170  */
171 #define NREGIONS_BITS	10
172 #define NREGIONS	(1 << NREGIONS_BITS)
173 #define NREGIONS_MASK	(NREGIONS - 1)
174 #define NREGIONS_SHIFT	(UVM_BITS - NREGIONS_BITS)
175 #define NREGIONS_SIZE	(1LU << NREGIONS_SHIFT)
176 
177 typedef struct region *region_t;
178 typedef struct slglobaldata *slglobaldata_t;
179 typedef struct slab *slab_t;
180 
181 struct region {
182 	uintptr_t	mask;
183 	slab_t		slab;	/* conditional out of band slab */
184 };
185 
186 static struct region Regions[NREGIONS];
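/*
 * Minimal sketch (not compiled in) of the pointer-to-slab lookup described
 * in the header comment: the upper address bits select a region and the
 * region's mask (or out-of-band slab pointer) recovers the governing slab
 * header.  It mirrors the lookups in memfree() and memrealloc() below; the
 * function name is illustrative only.
 */
#if 0
static slab_t
ptr_to_slab_example(void *ptr)
{
	region_t region;
	slab_t slab;

	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
	if ((slab = region->slab) == NULL)	/* in-band slab header */
		slab = (void *)((uintptr_t)ptr & region->mask);
	return (slab);
}
#endif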
187 
188 /*
189  * Number of chunking zones available
190  */
191 #define CHUNKFACTOR	8
192 #if ADDRBITS == 32
193 #define NZONES		(16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
194 #else
195 #define NZONES		(16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
196 #endif
197 
198 static int MaxChunks[NZONES];
199 
200 #define NDEPOTS		8		/* must be power of 2 */
201 
202 /*
203  * Maximum number of chunks per slab, governed by the allocation bitmap in
204  * each slab.  The maximum is reduced for large chunk sizes.
205  */
206 #define MAXCHUNKS	(LONG_BITS * LONG_BITS)
207 #define MAXCHUNKS_BITS	(LONG_BITS_SHIFT * LONG_BITS_SHIFT)
208 #define LITSLABSIZE	(32 * 1024)
209 #define NOMSLABSIZE	(2 * 1024 * 1024)
210 #define BIGSLABSIZE	(128 * 1024 * 1024)
211 
212 #define ZALLOC_SLAB_MAGIC	0x736c6162	/* magic sanity */
213 
214 TAILQ_HEAD(slab_list, slab);
215 
216 /*
217  * A slab structure
218  */
219 struct slab {
220 	struct slab	*next;		/* slabs with available space */
221 	TAILQ_ENTRY(slab) entry;
222 	int32_t		magic;		/* magic number for sanity check */
223 	u_int		navail;		/* number of free elements available */
224 	u_int		nmax;
225 	u_int		free_bit;	/* free hint bitno */
226 	u_int		free_index;	/* free hint index */
227 	u_long		bitmap[LONG_BITS]; /* free chunks */
228 	size_t		slab_size;	/* size of entire slab */
229 	size_t		chunk_size;	/* chunk size for validation */
230 	int		zone_index;
231 	enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
232 	int		flags;
233 	region_t	region;		/* related region */
234 	char		*chunks;	/* chunk base */
235 	slglobaldata_t	slgd;		/* localized to thread else NULL */
236 };
237 
238 /*
239  * per-thread data + global depot
240  *
241  * NOTE: The magazine shortcut is only used for per-thread data.
242  */
243 #define NMAGSHORTCUT	16
244 
245 struct slglobaldata {
246 	spinlock_t	lock;		/* only used by slglobaldepot */
247 	struct zoneinfo {
248 		slab_t	avail_base;
249 		slab_t	empty_base;
250 		int	best_region;
251 		int	mag_index;
252 		int	avail_count;
253 		int	empty_count;
254 		void	*mag_shortcut[NMAGSHORTCUT];
255 	} zone[NZONES];
256 	struct slab_list full_zones;	/* via entry */
257 	int		masked;
258 	int		biggest_index;
259 	size_t		nslabs;
260 };
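/*
 * Minimal sketch (not compiled in) of the per-thread magazine shortcut
 * mentioned in the header comment: memfree() pushes small freed objects
 * onto mag_shortcut[] and memalloc() pops them in LIFO order, bypassing
 * the slab bitmaps entirely.  The function name is illustrative only.
 */
#if 0
static void *
mag_pop_example(struct zoneinfo *zinfo)
{
	int j;

	if ((j = zinfo->mag_index) != 0) {
		zinfo->mag_index = --j;
		return (zinfo->mag_shortcut[j]);
	}
	return (NULL);
}
#endif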
261 
262 #define SLAB_ZEROD		0x0001
263 
/*
 * Misc constants.  MIN_CHUNK_SIZE is the smallest chunking ever returned
 * by zoneindex(), so every allocation is at least MIN_CHUNK_SIZE aligned.
 */
269 #define MIN_CHUNK_SIZE		8		/* in bytes */
270 #define MIN_CHUNK_MASK		(MIN_CHUNK_SIZE - 1)
271 
272 #define SAFLAG_ZERO	0x00000001
273 
274 /*
275  * The WEIRD_ADDR is used as known text to copy into free objects to
276  * try to create deterministic failure cases if the data is accessed after
277  * free.
278  *
279  * WARNING: A limited number of spinlocks are available, BIGXSIZE should
280  *	    not be larger then 64.
 *	    not be larger than 64.
282 #define WEIRD_ADDR      0xdeadc0de
283 #define MAX_COPY        sizeof(weirdary)
284 
285 #define arysize(ary)	(sizeof(ary)/sizeof((ary)[0]))
286 
287 /*
288  * Thread control
289  */
290 
291 #define MASSERT(exp)	do { if (__predict_false(!(exp)))	\
292 				_mpanic("assertion: %s in %s",	\
293 				#exp, __func__);		\
294 			    } while (0)
295 
296 /*
 * With this attribute set, accessing this variable does not require a
 * function call even when the code is compiled -fPIC.
299  *
300  * Must be empty for libc_rtld (similar to __thread)
301  */
302 #if defined(__LIBC_RTLD)
303 #define TLS_ATTRIBUTE
304 #else
#define TLS_ATTRIBUTE __attribute__ ((tls_model ("initial-exec")))
306 #endif
307 
308 static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
309 static pthread_key_t thread_malloc_key;
310 static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
311 static struct slglobaldata slglobaldepot;
312 
313 static int opt_madvise = 0;
314 static int opt_free = 0;
315 static int opt_cache = 4;
316 static int opt_utrace = 0;
317 static int g_malloc_flags = 0;
318 static int malloc_panic;
319 
320 static const int32_t weirdary[16] = {
321 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
322 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
323 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
324 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
325 };
326 
327 static void *memalloc(size_t size, int flags);
328 static void *memrealloc(void *ptr, size_t size);
329 static void memfree(void *ptr, int);
330 static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
331 static void slabfree(slab_t slab);
332 static void slabterm(slglobaldata_t slgd, slab_t slab);
333 static void *_vmem_alloc(int ri, size_t slab_size);
334 static void _vmem_free(void *ptr, size_t slab_size);
335 static void _mpanic(const char *ctl, ...) __printflike(1, 2);
static void malloc_init(void) __constructor(101);
341 
342 
343 struct nmalloc_utrace {
344 	void *p;
345 	size_t s;
346 	void *r;
347 };
348 
/* do { } while (0) so UTRACE(); is safe as a single statement in if/else */
#define UTRACE(a, b, c)						\
	do {							\
		if (opt_utrace) {				\
			struct nmalloc_utrace ut = {		\
				.p = (a),			\
				.s = (b),			\
				.r = (c)			\
			};					\
			utrace(&ut, sizeof(ut));		\
		}						\
	} while (0)
358 
359 #ifdef INVARIANTS
360 /*
 * If enabled, any memory allocated without SAFLAG_ZERO is initialized to -1.
362  */
363 static int  use_malloc_pattern;
364 #endif
365 
366 static void
367 malloc_init(void)
368 {
369 	const char *p = NULL;
370 
371 	TAILQ_INIT(&slglobal.full_zones);
372 
373 	Regions[0].mask = -1; /* disallow activity in lowest region */
374 
375 	if (issetugid() == 0)
376 		p = getenv("MALLOC_OPTIONS");
377 
378 	for (; p != NULL && *p != '\0'; p++) {
379 		switch(*p) {
380 		case 'u':
381 			opt_utrace = 0;
382 			break;
383 		case 'U':
384 			opt_utrace = 1;
385 			break;
386 		case 'h':
387 			opt_madvise = 0;
388 			break;
389 		case 'H':
390 			opt_madvise = 1;
391 			break;
392 		case 'c':
393 			if (opt_cache > 0)
394 				--opt_cache;
395 			break;
396 		case 'C':
397 			++opt_cache;
398 			break;
399 		case 'f':
400 			opt_free = 0;
401 			break;
402 		case 'F':
403 			opt_free = 1;
404 			break;
405 		case 'z':
406 			g_malloc_flags = 0;
407 			break;
408 		case 'Z':
409 			g_malloc_flags = SAFLAG_ZERO;
410 			break;
411 		default:
412 			break;
413 		}
414 	}
415 
416 	UTRACE((void *) -1, 0, NULL);
417 }
418 
419 /*
420  * We have to install a handler for nmalloc thread teardowns when
421  * the thread is created.  We cannot delay this because destructors in
422  * sophisticated userland programs can call malloc() for the first time
423  * during their thread exit.
424  *
425  * This routine is called directly from pthreads.
426  */
427 static void _nmalloc_thr_init_once(void);
428 static void _nmalloc_thr_destructor(void *thrp);
429 
430 void
431 _nmalloc_thr_init(void)
432 {
433 	static int did_init;
434 
435 	TAILQ_INIT(&slglobal.full_zones);
436 
437 	if (slglobal.masked)
438 		return;
439 
440 	slglobal.masked = 1;
441 	if (did_init == 0) {
442 		did_init = 1;
443 		pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
444 	}
445 	pthread_setspecific(thread_malloc_key, &slglobal);
446 	slglobal.masked = 0;
447 }
448 
449 void
450 _nmalloc_thr_prepfork(void)
451 {
452 	if (__isthreaded)
453 		_SPINLOCK(&slglobaldepot.lock);
454 }
455 
456 void
457 _nmalloc_thr_parentfork(void)
458 {
459 	if (__isthreaded)
460 		_SPINUNLOCK(&slglobaldepot.lock);
461 }
462 
463 void
464 _nmalloc_thr_childfork(void)
465 {
466 	if (__isthreaded)
467 		_SPINUNLOCK(&slglobaldepot.lock);
468 }
469 
470 /*
471  * Called just once
472  */
473 static void
474 _nmalloc_thr_init_once(void)
475 {
476 	int error;
477 
478 	error = pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
479 	if (error)
480 		abort();
481 }
482 
483 /*
484  * Called for each thread undergoing exit
485  *
486  * Move all of the thread's slabs into a depot.
487  */
488 static void
489 _nmalloc_thr_destructor(void *thrp)
490 {
491 	slglobaldata_t slgd = thrp;
492 	struct zoneinfo *zinfo;
493 	slab_t slab;
494 	void *ptr;
495 	int i;
496 	int j;
497 
498 	slgd->masked = 1;
499 
500 	for (i = 0; i <= slgd->biggest_index; i++) {
501 		zinfo = &slgd->zone[i];
502 
503 		while ((j = zinfo->mag_index) > 0) {
504 			--j;
505 			ptr = zinfo->mag_shortcut[j];
506 			zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
507 			zinfo->mag_index = j;
508 			memfree(ptr, 0);
509 		}
510 
511 		while ((slab = zinfo->empty_base) != NULL) {
512 			zinfo->empty_base = slab->next;
513 			--zinfo->empty_count;
514 			slabterm(slgd, slab);
515 		}
516 
517 		while ((slab = zinfo->avail_base) != NULL) {
518 			zinfo->avail_base = slab->next;
519 			--zinfo->avail_count;
520 			slabterm(slgd, slab);
521 		}
522 
523 		while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
524 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
525 			slabterm(slgd, slab);
526 		}
527 	}
528 }
529 
530 /*
531  * Calculate the zone index for the allocation request size and set the
532  * allocation request size to that particular zone's chunk size.
533  *
534  * Minimum alignment is 16 bytes for allocations >= 16 bytes to conform
535  * with malloc requirements for intel/amd.
536  */
537 static __inline int
538 zoneindex(size_t *bytes, size_t *chunking)
539 {
540 	size_t n = (size_t)*bytes;
541 	size_t x;
542 	size_t c;
543 	int i;
544 
545 	if (n < 128) {
546 		if (n < 16) {
547 			*bytes = n = (n + 7) & ~7;
548 			*chunking = 8;
549 			return(n / 8 - 1);	/* 8 byte chunks, 2 zones */
550 		} else {
551 			*bytes = n = (n + 15) & ~15;
552 			*chunking = 16;
553 			return(n / 16 + 2);	/* 16 byte chunks, 8 zones */
554 		}
555 	}
556 	if (n < 4096) {
557 		x = 256;
558 		c = x / (CHUNKFACTOR * 2);
559 		i = 16;
560 	} else {
561 		x = 8192;
562 		c = x / (CHUNKFACTOR * 2);
563 		i = 16 + CHUNKFACTOR * 5;  /* 256->512,1024,2048,4096,8192 */
564 	}
565 	while (n >= x) {
566 		x <<= 1;
567 		c <<= 1;
568 		i += CHUNKFACTOR;
569 		if (x == 0)
570 			_mpanic("slaballoc: byte value too high");
571 	}
572 	*bytes = n = roundup2(n, c);
573 	*chunking = c;
574 	return (i + n / c - CHUNKFACTOR);
575 #if 0
576 	*bytes = n = (n + c - 1) & ~(c - 1);
577 	*chunking = c;
578 	return (n / c + i);
579 
580 	if (n < 256) {
581 		*bytes = n = (n + 15) & ~15;
582 		*chunking = 16;
583 		return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);
584 	}
585 	if (n < 8192) {
586 		if (n < 512) {
587 			*bytes = n = (n + 31) & ~31;
588 			*chunking = 32;
589 			return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);
590 		}
591 		if (n < 1024) {
592 			*bytes = n = (n + 63) & ~63;
593 			*chunking = 64;
594 			return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);
595 		}
596 		if (n < 2048) {
597 			*bytes = n = (n + 127) & ~127;
598 			*chunking = 128;
599 			return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);
600 		}
601 		if (n < 4096) {
602 			*bytes = n = (n + 255) & ~255;
603 			*chunking = 256;
604 			return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);
605 		}
606 		*bytes = n = (n + 511) & ~511;
607 		*chunking = 512;
608 		return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);
609 	}
610 	if (n < 16384) {
611 		*bytes = n = (n + 1023) & ~1023;
612 		*chunking = 1024;
613 		return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);
614 	}
615 	if (n < 32768) {				/* 16384-32767 */
616 		*bytes = n = (n + 2047) & ~2047;
617 		*chunking = 2048;
618 		return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
619 	}
620 	if (n < 65536) {
621 		*bytes = n = (n + 4095) & ~4095;	/* 32768-65535 */
622 		*chunking = 4096;
623 		return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);
624 	}
625 
626 	x = 131072;
627 	c = 8192;
628 	i = CHUNKINGLO*10 - 1;
629 
630 	while (n >= x) {
631 		x <<= 1;
632 		c <<= 1;
633 		i += CHUNKINGHI;
634 		if (x == 0)
635 			_mpanic("slaballoc: byte value too high");
636 	}
637 	*bytes = n = (n + c - 1) & ~(c - 1);
638 	*chunking = c;
639 	return (n / c + i);
640 #endif
641 }
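/*
 * Illustrative sketch (not compiled in): zoneindex() rounds the request up
 * to its zone's chunk size and returns the zone number; e.g. a 100 byte
 * request becomes 112 bytes in a 16-byte-chunk zone.  The function name is
 * illustrative only.
 */
#if 0
static void
zoneindex_example(void)
{
	size_t bytes = 100;
	size_t chunking;
	int zi;

	zi = zoneindex(&bytes, &chunking);
	MASSERT(zi >= 0 && zi < NZONES);
	MASSERT(chunking != 0 && (bytes % chunking) == 0);
}
#endif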
642 
643 /*
644  * malloc() - call internal slab allocator
645  */
646 void *
647 __malloc(size_t size)
648 {
649 	void *ptr;
650 
651 	ptr = memalloc(size, 0);
652 	if (ptr == NULL)
653 		errno = ENOMEM;
654 	else
655 		UTRACE(0, size, ptr);
656 	return(ptr);
657 }
658 
659 /*
660  * calloc() - call internal slab allocator
661  */
662 void *
663 __calloc(size_t number, size_t size)
664 {
665 	void *ptr;
666 
	/* guard against number * size overflowing size_t */
	if (size != 0 && number > SIZE_MAX / size) {
		errno = ENOMEM;
		return(NULL);
	}
	ptr = memalloc(number * size, SAFLAG_ZERO);
668 	if (ptr == NULL)
669 		errno = ENOMEM;
670 	else
671 		UTRACE(0, number * size, ptr);
672 	return(ptr);
673 }
674 
675 /*
676  * realloc() (SLAB ALLOCATOR)
677  *
678  * We do not attempt to optimize this routine beyond reusing the same
679  * pointer if the new size fits within the chunking of the old pointer's
680  * zone.
681  */
682 void *
683 __realloc(void *ptr, size_t size)
684 {
685 	void *ret;
686 
687 	if (ptr == NULL)
688 		ret = memalloc(size, 0);
689 	else
690 		ret = memrealloc(ptr, size);
691 	if (ret == NULL)
692 		errno = ENOMEM;
693 	else
694 		UTRACE(ptr, size, ret);
695 	return(ret);
696 }
697 
698 /*
699  * posix_memalign()
700  *
 * Allocate (size) bytes with an alignment of (alignment), where (alignment)
702  * is a power of 2 >= sizeof(void *).
703  *
704  * The slab allocator will allocate on power-of-2 boundaries up to at least
705  * PAGE_SIZE.  Otherwise we use the zoneindex mechanic to find a zone
706  * matching the requirements.
707  */
708 int
709 __posix_memalign(void **memptr, size_t alignment, size_t size)
710 {
711 	/*
712 	 * OpenGroup spec issue 6 checks
713 	 */
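	/*
	 * For a power of 2, (alignment | (alignment - 1)) sets every bit
	 * below the alignment bit, so adding 1 yields exactly
	 * (alignment << 1); any other non-zero value fails this test.
	 */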
714 	if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
715 		*memptr = NULL;
716 		return(EINVAL);
717 	}
718 	if (alignment < sizeof(void *)) {
719 		*memptr = NULL;
720 		return(EINVAL);
721 	}
722 
723 	/*
724 	 * XXX for now just find the nearest power of 2 >= size and also
725 	 * >= alignment and allocate that.
726 	 */
727 	while (alignment < size) {
728 		alignment <<= 1;
729 		if (alignment == 0)
730 			_mpanic("posix_memalign: byte value too high");
731 	}
732 	*memptr = memalloc(alignment, 0);
733 	return(*memptr ? 0 : ENOMEM);
734 }
735 
736 /*
737  * free() (SLAB ALLOCATOR) - do the obvious
738  */
739 void
740 __free(void *ptr)
741 {
742 	if (ptr) {
743 		UTRACE(ptr, 0, 0);
744 		memfree(ptr, 0);
745 	}
746 }
747 
748 /*
749  * memalloc()	(SLAB ALLOCATOR)
750  *
751  *	Allocate memory via the slab allocator.
752  */
753 static void *
754 memalloc(size_t size, int flags)
755 {
756 	slglobaldata_t slgd;
757 	struct zoneinfo *zinfo;
758 	slab_t slab;
759 	size_t chunking;
760 	int bmi;
761 	int bno;
762 	u_long *bmp;
763 	int zi;
764 #ifdef INVARIANTS
765 	int i;
766 #endif
767 	int j;
768 	char *obj;
769 
770 	/*
	 * If 0 bytes are requested we still have to return a unique pointer,
	 * so allocate at least one byte.
773 	 */
774 	if (size == 0)
775 		size = 1;
776 
777 	/* Capture global flags */
778 	flags |= g_malloc_flags;
779 
780 	/* Compute allocation zone; zoneindex will panic on excessive sizes */
781 	zi = zoneindex(&size, &chunking);
782 	MASSERT(zi < NZONES);
783 	if (size == 0)
784 		return(NULL);
785 
786 	/*
787 	 * Try magazine shortcut first
788 	 */
789 	slgd = &slglobal;
790 	zinfo = &slgd->zone[zi];
791 
792 	if ((j = zinfo->mag_index) != 0) {
793 		zinfo->mag_index = --j;
794 		obj = zinfo->mag_shortcut[j];
795 		zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
796 		if (flags & SAFLAG_ZERO)
797 			bzero(obj, size);
798 		return obj;
799 	}
800 
801 	/*
802 	 * Locate a slab with available space.  If no slabs are available
	 * back off to the empty list, and if we still come up dry allocate
804 	 * a new slab (which will try the depot first).
805 	 */
806 retry:
807 	if ((slab = zinfo->avail_base) == NULL) {
808 		if ((slab = zinfo->empty_base) == NULL) {
809 			/*
810 			 * Still dry
811 			 */
812 			slab = slaballoc(zi, chunking, size);
813 			if (slab == NULL)
814 				return(NULL);
815 			slab->next = zinfo->avail_base;
816 			zinfo->avail_base = slab;
817 			++zinfo->avail_count;
818 			slab->state = AVAIL;
819 			if (slgd->biggest_index < zi)
820 				slgd->biggest_index = zi;
821 			++slgd->nslabs;
822 		} else {
823 			/*
824 			 * Pulled from empty list
825 			 */
826 			zinfo->empty_base = slab->next;
827 			slab->next = zinfo->avail_base;
828 			zinfo->avail_base = slab;
829 			++zinfo->avail_count;
830 			slab->state = AVAIL;
831 			--zinfo->empty_count;
832 		}
833 	}
834 
835 	/*
836 	 * Allocate a chunk out of the slab.  HOT PATH
837 	 *
838 	 * Only the thread owning the slab can allocate out of it.
839 	 *
840 	 * NOTE: The last bit in the bitmap is always marked allocated so
841 	 *	 we cannot overflow here.
842 	 */
843 	bno = slab->free_bit;
844 	bmi = slab->free_index;
845 	bmp = &slab->bitmap[bmi];
846 	if (*bmp & (1LU << bno)) {
847 		atomic_clear_long(bmp, 1LU << bno);
848 		obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
849 		slab->free_bit = (bno + 1) & (LONG_BITS - 1);
850 		atomic_add_int(&slab->navail, -1);
851 		if (flags & SAFLAG_ZERO)
852 			bzero(obj, size);
853 		return (obj);
854 	}
855 
856 	/*
857 	 * Allocate a chunk out of a slab.  COLD PATH
858 	 */
859 	if (slab->navail == 0) {
860 		zinfo->avail_base = slab->next;
861 		--zinfo->avail_count;
862 		slab->state = FULL;
863 		TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
864 		goto retry;
865 	}
866 
867 	while (bmi < LONG_BITS) {
868 		bmp = &slab->bitmap[bmi];
869 		if (*bmp) {
870 			bno = bsflong(*bmp);
871 			atomic_clear_long(bmp, 1LU << bno);
872 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
873 					     size;
874 			slab->free_index = bmi;
875 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
876 			atomic_add_int(&slab->navail, -1);
877 			if (flags & SAFLAG_ZERO)
878 				bzero(obj, size);
879 			return (obj);
880 		}
881 		++bmi;
882 	}
883 	bmi = 0;
884 	while (bmi < LONG_BITS) {
885 		bmp = &slab->bitmap[bmi];
886 		if (*bmp) {
887 			bno = bsflong(*bmp);
888 			atomic_clear_long(bmp, 1LU << bno);
889 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
890 					     size;
891 			slab->free_index = bmi;
892 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
893 			atomic_add_int(&slab->navail, -1);
894 			if (flags & SAFLAG_ZERO)
895 				bzero(obj, size);
896 			return (obj);
897 		}
898 		++bmi;
899 	}
900 	_mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
901 	/* not reached */
902 	return NULL;
903 }
904 
905 /*
906  * Reallocate memory within the chunk
907  */
908 static void *
909 memrealloc(void *ptr, size_t nsize)
910 {
911 	region_t region;
912 	slab_t slab;
913 	size_t osize;
914 	char *obj;
915 	int flags = 0;
916 
917 	/*
	 * If 0 bytes are requested we still have to return a unique pointer,
	 * so allocate at least one byte.
920 	 */
921 	if (nsize == 0)
922 		nsize = 1;
923 
924 	/* Capture global flags */
925 	flags |= g_malloc_flags;
926 
927 	/*
928 	 * Locate the zone by looking up the dynamic slab size mask based
929 	 * on the memory region the allocation resides in.
930 	 */
931 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
932 	if ((slab = region->slab) == NULL)
933 		slab = (void *)((uintptr_t)ptr & region->mask);
934 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
935 	osize = slab->chunk_size;
936 	if (nsize <= osize) {
937 		if (osize < 32 || nsize >= osize / 2) {
938 			obj = ptr;
939 			if ((flags & SAFLAG_ZERO) && nsize < osize)
940 				bzero(obj + nsize, osize - nsize);
941 			return(obj);
942 		}
943 	}
944 
945 	/*
946 	 * Otherwise resize the object
947 	 */
948 	obj = memalloc(nsize, 0);
949 	if (obj) {
950 		if (nsize > osize)
951 			nsize = osize;
952 		bcopy(ptr, obj, nsize);
953 		memfree(ptr, 0);
954 	}
955 	return (obj);
956 }
957 
958 /*
959  * free (SLAB ALLOCATOR)
960  *
961  * Free a memory block previously allocated by malloc.
962  *
963  * MPSAFE
964  */
965 static void
966 memfree(void *ptr, int flags)
967 {
968 	region_t region;
969 	slglobaldata_t slgd;
970 	slab_t slab;
971 	slab_t stmp;
972 	slab_t *slabp;
973 	int bmi;
974 	int bno;
975 	int j;
976 	u_long *bmp;
977 
978 	/*
979 	 * Locate the zone by looking up the dynamic slab size mask based
980 	 * on the memory region the allocation resides in.
981 	 *
982 	 * WARNING!  The slab may be owned by another thread!
983 	 */
984 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
985 	if ((slab = region->slab) == NULL)
986 		slab = (void *)((uintptr_t)ptr & region->mask);
987 	MASSERT(slab != NULL);
988 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
989 
990 #ifdef INVARIANTS
991 	/*
992 	 * Put weird data into the memory to detect modifications after
993 	 * freeing, illegal pointer use after freeing (we should fault on
994 	 * the odd address), and so forth.
995 	 */
996 	if (slab->chunk_size < sizeof(weirdary))
997 		bcopy(weirdary, ptr, slab->chunk_size);
998 	else
999 		bcopy(weirdary, ptr, sizeof(weirdary));
1000 #endif
1001 	slgd = &slglobal;
1002 
1003 	/*
1004 	 * Use mag_shortcut[] when possible
1005 	 */
1006 	if (slgd->masked == 0 && slab->chunk_size <= NOMSLABSIZE) {
1007 		struct zoneinfo *zinfo;
1008 
1009 		zinfo = &slgd->zone[slab->zone_index];
1010 		j = zinfo->mag_index;
1011 		if (j < NMAGSHORTCUT) {
1012 			zinfo->mag_shortcut[j] = ptr;
1013 			zinfo->mag_index = j + 1;
1014 			return;
1015 		}
1016 	}
1017 
1018 	/*
1019 	 * Free to slab and increment navail.  We can delay incrementing
1020 	 * navail to prevent the slab from being destroyed out from under
1021 	 * us while we do other optimizations.
1022 	 */
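	/*
	 * The chunk index is the inverse of the address computation in
	 * memalloc(): (ptr - chunks) / chunk_size, split into a bitmap
	 * word index (bmi) and a bit number (bno).
	 */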
1023 	bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
1024 	bmi = bno >> LONG_BITS_SHIFT;
1025 	bno &= (LONG_BITS - 1);
1026 	bmp = &slab->bitmap[bmi];
1027 
1028 	MASSERT(bmi >= 0 && bmi < slab->nmax);
1029 	MASSERT((*bmp & (1LU << bno)) == 0);
1030 	atomic_set_long(bmp, 1LU << bno);
1031 
1032 	if (slab->slgd == slgd) {
1033 		/*
1034 		 * We can only do the following if we own the slab.  Note
1035 		 * that navail can be incremented by any thread even if
1036 		 * we own the slab.
1037 		 */
1038 		struct zoneinfo *zinfo;
1039 
1040 		atomic_add_int(&slab->navail, 1);
1041 		if (slab->free_index > bmi) {
1042 			slab->free_index = bmi;
1043 			slab->free_bit = bno;
1044 		} else if (slab->free_index == bmi &&
1045 			   slab->free_bit > bno) {
1046 			slab->free_bit = bno;
1047 		}
1048 		zinfo = &slgd->zone[slab->zone_index];
1049 
1050 		/*
1051 		 * Freeing an object from a full slab makes it less than
1052 		 * full.  The slab must be moved to the available list.
1053 		 *
1054 		 * If the available list has too many slabs, release some
1055 		 * to the depot.
1056 		 */
1057 		if (slab->state == FULL) {
1058 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
1059 			slab->state = AVAIL;
1060 			stmp = zinfo->avail_base;
1061 			slab->next = stmp;
1062 			zinfo->avail_base = slab;
1063 			++zinfo->avail_count;
1064 			while (zinfo->avail_count > opt_cache) {
1065 				slab = zinfo->avail_base;
1066 				zinfo->avail_base = slab->next;
1067 				--zinfo->avail_count;
1068 				slabterm(slgd, slab);
1069 			}
1070 			goto done;
1071 		}
1072 
1073 		/*
1074 		 * If the slab becomes completely empty dispose of it in
1075 		 * some manner.  By default each thread caches up to 4
1076 		 * empty slabs.  Only small slabs are cached.
1077 		 */
1078 		if (slab->navail == slab->nmax && slab->state == AVAIL) {
1079 			/*
1080 			 * Remove slab from available queue
1081 			 */
1082 			slabp = &zinfo->avail_base;
1083 			while ((stmp = *slabp) != slab)
1084 				slabp = &stmp->next;
1085 			*slabp = slab->next;
1086 			--zinfo->avail_count;
1087 
1088 			if (opt_free || opt_cache == 0) {
1089 				/*
1090 				 * If local caching is disabled cache the
1091 				 * slab in the depot (or free it).
1092 				 */
1093 				slabterm(slgd, slab);
1094 			} else if (slab->slab_size > BIGSLABSIZE) {
1095 				/*
1096 				 * We do not try to retain large slabs
1097 				 * in per-thread caches.
1098 				 */
1099 				slabterm(slgd, slab);
1100 			} else if (zinfo->empty_count > opt_cache) {
1101 				/*
1102 				 * We have too many slabs cached, but
1103 				 * instead of freeing this one free
1104 				 * an empty slab that's been idle longer.
1105 				 *
1106 				 * (empty_count does not change)
1107 				 */
1108 				stmp = zinfo->empty_base;
1109 				slab->state = EMPTY;
1110 				slab->next = stmp->next;
1111 				zinfo->empty_base = slab;
1112 				slabterm(slgd, stmp);
1113 			} else {
1114 				/*
1115 				 * Cache the empty slab in our thread local
1116 				 * empty list.
1117 				 */
1118 				++zinfo->empty_count;
1119 				slab->state = EMPTY;
1120 				slab->next = zinfo->empty_base;
1121 				zinfo->empty_base = slab;
1122 			}
1123 		}
1124 	} else if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1125 		slglobaldata_t sldepot;
1126 
1127 		/*
1128 		 * If freeing to a slab owned by the global depot, and
1129 		 * the slab becomes completely EMPTY, try to move it to
1130 		 * the correct list.
1131 		 */
1132 		sldepot = &slglobaldepot;
1133 		if (__isthreaded)
1134 			_SPINLOCK(&sldepot->lock);
1135 		if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1136 			struct zoneinfo *zinfo;
1137 
1138 			/*
1139 			 * Move the slab to the empty list
1140 			 */
1141 			MASSERT(slab->state == AVAIL);
1142 			atomic_add_int(&slab->navail, 1);
1143 			zinfo = &sldepot->zone[slab->zone_index];
1144 			slabp = &zinfo->avail_base;
1145 			while (slab != *slabp)
1146 				slabp = &(*slabp)->next;
1147 			*slabp = slab->next;
1148 			--zinfo->avail_count;
1149 
1150 			/*
1151 			 * Clean out excessive empty entries from the
1152 			 * depot.
1153 			 */
1154 			slab->state = EMPTY;
1155 			slab->next = zinfo->empty_base;
1156 			zinfo->empty_base = slab;
1157 			++zinfo->empty_count;
1158 			while (zinfo->empty_count > opt_cache) {
1159 				slab = zinfo->empty_base;
1160 				zinfo->empty_base = slab->next;
1161 				--zinfo->empty_count;
1162 				slab->state = UNKNOWN;
1163 				if (__isthreaded)
1164 					_SPINUNLOCK(&sldepot->lock);
1165 				slabfree(slab);
1166 				if (__isthreaded)
1167 					_SPINLOCK(&sldepot->lock);
1168 			}
1169 		} else {
1170 			atomic_add_int(&slab->navail, 1);
1171 		}
1172 		if (__isthreaded)
1173 			_SPINUNLOCK(&sldepot->lock);
1174 	} else {
1175 		/*
1176 		 * We can't act on the slab other than by adjusting navail
1177 		 * (and the bitmap which we did in the common code at the
1178 		 * top).
1179 		 */
1180 		atomic_add_int(&slab->navail, 1);
1181 	}
1182 done:
1183 	;
1184 }
1185 
1186 /*
1187  * Allocate a new slab holding objects of size chunk_size.
1188  */
1189 static slab_t
1190 slaballoc(int zi, size_t chunking, size_t chunk_size)
1191 {
1192 	slglobaldata_t slgd;
1193 	slglobaldata_t sldepot;
1194 	struct zoneinfo *zinfo;
1195 	region_t region;
1196 	void *save;
1197 	slab_t slab;
1198 	size_t slab_desire;
1199 	size_t slab_size;
1200 	size_t region_mask;
1201 	uintptr_t chunk_offset;
1202 	ssize_t maxchunks;
1203 	ssize_t tmpchunks;
1204 	int ispower2;
1205 	int power;
1206 	int ri;
1207 	int rx;
1208 	int nswath;
1209 	int j;
1210 
1211 	/*
1212 	 * First look in the depot.  Any given zone in the depot may be
1213 	 * locked by being set to -1.  We have to do this instead of simply
1214 	 * removing the entire chain because removing the entire chain can
1215 	 * cause racing threads to allocate local slabs for large objects,
1216 	 * resulting in a large VSZ.
1217 	 */
1218 	slgd = &slglobal;
1219 	sldepot = &slglobaldepot;
1220 	zinfo = &sldepot->zone[zi];
1221 
1222 	if (zinfo->avail_base) {
1223 		if (__isthreaded)
1224 			_SPINLOCK(&sldepot->lock);
1225 		slab = zinfo->avail_base;
1226 		if (slab) {
1227 			MASSERT(slab->slgd == NULL);
1228 			slab->slgd = slgd;
1229 			zinfo->avail_base = slab->next;
1230 			--zinfo->avail_count;
1231 			if (__isthreaded)
1232 				_SPINUNLOCK(&sldepot->lock);
1233 			return slab;
1234 		}
1235 		if (__isthreaded)
1236 			_SPINUNLOCK(&sldepot->lock);
1237 	}
1238 
1239 	/*
1240 	 * Nothing in the depot, allocate a new slab by locating or assigning
1241 	 * a region and then using the system virtual memory allocator.
1242 	 */
1243 	slab = NULL;
1244 
1245 	/*
1246 	 * Calculate the start of the data chunks relative to the start
1247 	 * of the slab.  If chunk_size is a power of 2 we guarantee
1248 	 * power of 2 alignment.  If it is not we guarantee alignment
1249 	 * to the chunk size.
1250 	 */
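	/*
	 * chunk_size ^ (chunk_size - 1) equals (chunk_size << 1) - 1, i.e.
	 * all bits through the top bit set, only when chunk_size is a
	 * power of 2.
	 */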
1251 	if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
1252 		ispower2 = 1;
1253 		chunk_offset = roundup2(sizeof(*slab), chunk_size);
1254 	} else {
1255 		ispower2 = 0;
1256 		chunk_offset = sizeof(*slab) + chunking - 1;
1257 		chunk_offset -= chunk_offset % chunking;
1258 	}
1259 
1260 	/*
1261 	 * Calculate a reasonable number of chunks for the slab.
1262 	 *
1263 	 * Once initialized the MaxChunks[] array can only ever be
1264 	 * reinitialized to the same value.
1265 	 */
1266 	maxchunks = MaxChunks[zi];
1267 	if (maxchunks == 0) {
1268 		/*
		 * First calculate how many chunks would fit in 1/1024 of the
		 * available memory.  This is around 2MB on a 32-bit
1271 		 * system and 128G on a 64-bit (48-bits available) system.
1272 		 */
1273 		maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
1274 			    (ssize_t)chunk_size;
1275 		if (maxchunks <= 0)
1276 			maxchunks = 1;
1277 
1278 		/*
1279 		 * A slab cannot handle more than MAXCHUNKS chunks, but
1280 		 * limit us to approximately MAXCHUNKS / 2 here because
1281 		 * we may have to expand maxchunks when we calculate the
1282 		 * actual power-of-2 slab.
1283 		 */
1284 		if (maxchunks > MAXCHUNKS / 2)
1285 			maxchunks = MAXCHUNKS / 2;
1286 
1287 		/*
1288 		 * Try to limit the slabs to BIGSLABSIZE (~128MB).  Larger
1289 		 * slabs will be created if the allocation does not fit.
1290 		 */
1291 		if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
1292 			tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
1293 				    (ssize_t)chunk_size;
1294 			if (tmpchunks <= 0)
1295 				tmpchunks = 1;
1296 			if (maxchunks > tmpchunks)
1297 				maxchunks = tmpchunks;
1298 		}
1299 
1300 		/*
1301 		 * If the slab calculates to greater than 2MB see if we
1302 		 * can cut it down to ~2MB.  This controls VSZ but has
1303 		 * no effect on run-time size or performance.
1304 		 *
		 * This matters for core dump size and also reduces
		 * unnecessary region allocations.
1307 		 */
1308 		if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
1309 			tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
1310 				    (ssize_t)chunk_size;
1311 			if (tmpchunks < 1)
1312 				tmpchunks = 1;
1313 			if (maxchunks > tmpchunks)
1314 				maxchunks = tmpchunks;
1315 		}
1316 
1317 		/*
1318 		 * If the slab calculates to greater than 128K see if we
1319 		 * can cut it down to ~128K while still maintaining a
1320 		 * reasonably large number of chunks in each slab.  This
1321 		 * controls VSZ but has no effect on run-time size or
1322 		 * performance.
1323 		 *
		 * This matters for core dump size and also reduces
		 * unnecessary region allocations.
1326 		 */
1327 		if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
1328 			tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
1329 				    (ssize_t)chunk_size;
1330 			if (tmpchunks < 32)
1331 				tmpchunks = 32;
1332 			if (maxchunks > tmpchunks)
1333 				maxchunks = tmpchunks;
1334 		}
1335 
1336 		MaxChunks[zi] = maxchunks;
1337 	}
1338 	MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);
1339 
1340 	/*
1341 	 * Calculate the actual slab size.  maxchunks will be recalculated
1342 	 * a little later.
1343 	 */
1344 	slab_desire = chunk_offset + chunk_size * maxchunks;
1345 	slab_size = 8 * MAXCHUNKS;
1346 	power = 3 + MAXCHUNKS_BITS;
1347 	while (slab_size < slab_desire) {
1348 		slab_size <<= 1;
1349 		++power;
1350 	}
1351 
1352 	/*
1353 	 * Do a quick recalculation based on the actual slab size but not
1354 	 * yet dealing with whether the slab header is in-band or out-of-band.
1355 	 * The purpose here is to see if we can reasonably reduce slab_size
1356 	 * to a power of 4 to allow more chunk sizes to use the same slab
1357 	 * size.
1358 	 */
1359 	if ((power & 1) && slab_size > 32768) {
1360 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1361 		if (maxchunks >= MAXCHUNKS / 8) {
1362 			slab_size >>= 1;
1363 			--power;
1364 		}
1365 	}
1366 	if ((power & 2) && slab_size > 32768 * 4) {
1367 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1368 		if (maxchunks >= MAXCHUNKS / 4) {
1369 			slab_size >>= 2;
1370 			power -= 2;
1371 		}
1372 	}
1373 	/*
1374 	 * This case occurs when the slab_size is larger than 1/1024 available
1375 	 * UVM.
1376 	 */
1377 	nswath = slab_size / NREGIONS_SIZE;
1378 	if (nswath > NREGIONS)
1379 		return (NULL);
1380 
1381 
1382 	/*
1383 	 * Try to allocate from our current best region for this zi
1384 	 */
1385 	region_mask = ~(slab_size - 1);
1386 	ri = slgd->zone[zi].best_region;
1387 	if (Regions[ri].mask == region_mask) {
1388 		if ((slab = _vmem_alloc(ri, slab_size)) != NULL)
1389 			goto found;
1390 	}
1391 
1392 	/*
1393 	 * Try to find an existing region to allocate from.  The normal
1394 	 * case will be for allocations that are less than 1/1024 available
1395 	 * UVM, which fit into a single Regions[] entry.
1396 	 */
1397 	while (slab_size <= NREGIONS_SIZE) {
1398 		rx = -1;
1399 		for (ri = 0; ri < NREGIONS; ++ri) {
1400 			if (rx < 0 && Regions[ri].mask == 0)
1401 				rx = ri;
1402 			if (Regions[ri].mask == region_mask) {
1403 				slab = _vmem_alloc(ri, slab_size);
1404 				if (slab) {
1405 					slgd->zone[zi].best_region = ri;
1406 					goto found;
1407 				}
1408 			}
1409 		}
1410 
1411 		if (rx < 0)
1412 			return(NULL);
1413 
1414 		/*
1415 		 * This can fail, retry either way
1416 		 */
1417 		atomic_cmpset_ptr((void **)&Regions[rx].mask,
1418 				  NULL,
1419 				  (void *)region_mask);
1420 	}
1421 
1422 	for (;;) {
1423 		rx = -1;
1424 		for (ri = 0; ri < NREGIONS; ri += nswath) {
1425 			if (Regions[ri].mask == region_mask) {
1426 				slab = _vmem_alloc(ri, slab_size);
1427 				if (slab) {
1428 					slgd->zone[zi].best_region = ri;
1429 					goto found;
1430 				}
1431 			}
1432 			if (rx < 0) {
1433 				for (j = nswath - 1; j >= 0; --j) {
1434 					if (Regions[ri+j].mask != 0)
1435 						break;
1436 				}
1437 				if (j < 0)
1438 					rx = ri;
1439 			}
1440 		}
1441 
1442 		/*
1443 		 * We found a candidate, try to allocate it backwards so
1444 		 * another thread racing a slaballoc() does not see the
1445 		 * mask in the base index position until we are done.
1446 		 *
1447 		 * We can safely zero-out any partial allocations because
1448 		 * the mask is only accessed from the base index.  Any other
1449 		 * threads racing us will fail prior to us clearing the mask.
1450 		 */
1451 		if (rx < 0)
1452 			return(NULL);
1453 		for (j = nswath - 1; j >= 0; --j) {
1454 			if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
1455 					       NULL, (void *)region_mask)) {
1456 				while (++j < nswath)
1457 					Regions[rx+j].mask = 0;
1458 				break;
1459 			}
1460 		}
1461 		/* retry */
1462 	}
1463 
1464 	/*
1465 	 * Fill in the new slab in region ri.  If the slab_size completely
1466 	 * fills one or more region slots we move the slab structure out of
1467 	 * band which should optimize the chunking (particularly for a power
1468 	 * of 2).
1469 	 */
1470 found:
1471 	region = &Regions[ri];
1472 	MASSERT(region->slab == NULL);
1473 	if (slab_size >= NREGIONS_SIZE) {
1474 		save = slab;
1475 		slab = memalloc(sizeof(*slab), 0);
1476 		bzero(slab, sizeof(*slab));
1477 		slab->chunks = save;
1478 		for (j = 0; j < nswath; ++j)
1479 			region[j].slab = slab;
1480 		chunk_offset = 0;
1481 	} else {
1482 		bzero(slab, sizeof(*slab));
1483 		slab->chunks = (char *)slab + chunk_offset;
1484 	}
1485 
1486 	/*
1487 	 * Calculate the start of the chunks memory and recalculate the
1488 	 * actual number of chunks the slab can hold.
1489 	 */
1490 	maxchunks = (slab_size - chunk_offset) / chunk_size;
1491 	if (maxchunks > MAXCHUNKS)
1492 		maxchunks = MAXCHUNKS;
1493 
1494 	/*
1495 	 * And fill in the rest
1496 	 */
1497 	slab->magic = ZALLOC_SLAB_MAGIC;
1498 	slab->navail = maxchunks;
1499 	slab->nmax = maxchunks;
1500 	slab->slab_size = slab_size;
1501 	slab->chunk_size = chunk_size;
1502 	slab->zone_index = zi;
1503 	slab->slgd = slgd;
1504 	slab->state = UNKNOWN;
1505 	slab->region = region;
1506 
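	/*
	 * Mark one free bit per existing chunk; bits in the trailing partial
	 * word beyond maxchunks stay zero and are never handed out.
	 */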
1507 	for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
1508 		if (ri + LONG_BITS <= maxchunks)
1509 			slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
1510 		else
1511 			slab->bitmap[ri >> LONG_BITS_SHIFT] =
1512 						(1LU << (maxchunks - ri)) - 1;
1513 	}
1514 	return (slab);
1515 }
1516 
1517 /*
1518  * Free a slab.
1519  */
1520 static void
1521 slabfree(slab_t slab)
1522 {
1523 	int nswath;
1524 	int j;
1525 
1526 	if (slab->region->slab == slab) {
1527 		/*
1528 		 * Out-of-band slab.
1529 		 */
1530 		nswath = slab->slab_size / NREGIONS_SIZE;
1531 		for (j = 0; j < nswath; ++j)
1532 			slab->region[j].slab = NULL;
1533 		slab->magic = 0;
1534 		_vmem_free(slab->chunks, slab->slab_size);
1535 		memfree(slab, 0);
1536 	} else {
1537 		/*
1538 		 * In-band slab.
1539 		 */
1540 		slab->magic = 0;
1541 		_vmem_free(slab, slab->slab_size);
1542 	}
1543 }
1544 
1545 /*
1546  * Terminate a slab's use in the current thread.  The slab may still have
1547  * outstanding allocations and thus not be deallocatable.
1548  */
1549 static void
1550 slabterm(slglobaldata_t slgd, slab_t slab)
1551 {
1552 	slglobaldata_t sldepot;
1553 	struct zoneinfo *zinfo;
1554 	int zi = slab->zone_index;
1555 
1556 	slab->slgd = NULL;
1557 	--slgd->nslabs;
1558 	sldepot = &slglobaldepot;
1559 	zinfo = &sldepot->zone[zi];
1560 
1561 	/*
1562 	 * Move the slab to the avail list or the empty list.
1563 	 */
1564 	if (__isthreaded)
1565 		_SPINLOCK(&sldepot->lock);
1566 	if (slab->navail == slab->nmax) {
1567 		slab->state = EMPTY;
1568 		slab->next = zinfo->empty_base;
1569 		zinfo->empty_base = slab;
1570 		++zinfo->empty_count;
1571 	} else {
1572 		slab->state = AVAIL;
1573 		slab->next = zinfo->avail_base;
1574 		zinfo->avail_base = slab;
1575 		++zinfo->avail_count;
1576 	}
1577 
1578 	/*
1579 	 * Clean extra slabs out of the empty list
1580 	 */
1581 	while (zinfo->empty_count > opt_cache) {
1582 		slab = zinfo->empty_base;
1583 		zinfo->empty_base = slab->next;
1584 		--zinfo->empty_count;
1585 		slab->state = UNKNOWN;
1586 		if (__isthreaded)
1587 			_SPINUNLOCK(&sldepot->lock);
1588 		slabfree(slab);
1589 		if (__isthreaded)
1590 			_SPINLOCK(&sldepot->lock);
1591 	}
1592 	if (__isthreaded)
1593 		_SPINUNLOCK(&sldepot->lock);
1594 }
1595 
1596 /*
1597  * _vmem_alloc()
1598  *
1599  *	Directly map memory in PAGE_SIZE'd chunks with the specified
1600  *	alignment.
1601  *
1602  *	Alignment must be a multiple of PAGE_SIZE.
1603  *
1604  *	Size must be >= alignment.
1605  */
1606 static void *
1607 _vmem_alloc(int ri, size_t slab_size)
1608 {
1609 	char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);
1610 	char *eaddr;
1611 	char *addr;
1612 	char *save;
1613 	uintptr_t excess;
1614 
1615 	if (slab_size < NREGIONS_SIZE)
1616 		eaddr = baddr + NREGIONS_SIZE;
1617 	else
1618 		eaddr = baddr + slab_size;
1619 
1620 	/*
1621 	 * This usually just works but might not.
1622 	 */
1623 	addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
1624 		    MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
1625 	if (addr == MAP_FAILED) {
1626 		return (NULL);
1627 	}
1628 	if (addr < baddr || addr + slab_size > eaddr) {
1629 		munmap(addr, slab_size);
1630 		return (NULL);
1631 	}
1632 
1633 	/*
1634 	 * Check alignment.  The misaligned offset is also the excess
1635 	 * amount.  If misaligned unmap the excess so we have a chance of
1636 	 * mapping at the next alignment point and recursively try again.
1637 	 *
1638 	 * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB	block alignment
1639 	 *   aaaaaaaaa aaaaaaaaaaa aa		mis-aligned allocation
1640 	 *   xxxxxxxxx				final excess calculation
1641 	 *   ^ returned address
1642 	 */
1643 	excess = (uintptr_t)addr & (slab_size - 1);
1644 	while (excess) {
1645 		excess = slab_size - excess;
1646 		save = addr;
1647 
1648 		munmap(save + excess, slab_size - excess);
1649 		addr = _vmem_alloc(ri, slab_size);
1650 		munmap(save, excess);
1651 		if (addr == NULL)
1652 			return (NULL);
1653 		if (addr < baddr || addr + slab_size > eaddr) {
1654 			munmap(addr, slab_size);
1655 			return (NULL);
1656 		}
1657 		excess = (uintptr_t)addr & (slab_size - 1);
1658 	}
1659 	return (addr);
1660 }
1661 
1662 /*
1663  * _vmem_free()
1664  *
1665  *	Free a chunk of memory allocated with _vmem_alloc()
1666  */
1667 static void
1668 _vmem_free(void *ptr, size_t size)
1669 {
1670 	munmap(ptr, size);
1671 }
1672 
1673 /*
1674  * Panic on fatal conditions
1675  */
1676 static void
1677 _mpanic(const char *ctl, ...)
1678 {
1679 	va_list va;
1680 
1681 	if (malloc_panic == 0) {
1682 		malloc_panic = 1;
1683 		va_start(va, ctl);
1684 		vfprintf(stderr, ctl, va);
1685 		fprintf(stderr, "\n");
1686 		fflush(stderr);
1687 		va_end(va);
1688 	}
1689 	abort();
1690 }
1691 
1692 __weak_reference(__malloc, malloc);
1693 __weak_reference(__calloc, calloc);
1694 __weak_reference(__posix_memalign, posix_memalign);
1695 __weak_reference(__realloc, realloc);
1696 __weak_reference(__free, free);
1697