xref: /dragonfly/lib/libc/stdlib/dmalloc.c (revision 45914ee7)
1 /*
2  * DMALLOC.C	- Dillon's malloc
3  *
4  * Copyright (c) 2011,2017 The DragonFly Project. All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 /*
37  * This module implements a modified slab allocator as a drop-in replacement
38  * for the libc malloc().  The slab algorithm has been adjusted to support
39  * dynamic sizing of slabs which effectively allows slabs to be used for
40  * allocations of any size.  Because of this we have neither a small-block
41  * allocator nor a big-block allocator, and the code paths are simplified.
42  *
43  * To support dynamic slab sizing, available user virtual memory is broken
44  * down into ~1024 regions.  Each region has a fixed slab size whose value is
45  * set when the region is opened up for use.  The free() path simply applies
46  * a mask based on the region to the pointer to acquire the base of the
47  * governing slab structure.
48  *
49  * Regions[NREGIONS]	(1024)
50  *
51  * Slab management and locking is done on a per-zone basis.
52  *
53  *	Alloc Size	Chunking        Number of zones
54  *	0-127		8		16
55  *	128-255		16		8
56  *	256-511		32		8
57  *	512-1023	64		8
58  *	1024-2047	128		8
59  *	2048-4095	256		8
60  *	4096-8191	512		8
61  *	8192-16383	1024		8
62  *	16384-32767	2048		8
63  *	32768-65535	4096		8
64  *	... continues forever ...	8 zones
65  *
66  *	For a 2^63 memory space each doubling >= 64K is broken down into
67  *	CHUNKFACTOR (8) chunking zones, so we support 88 + (48 * 8) = 472 zones.
68  *
69  *			   API FEATURES AND SIDE EFFECTS
70  *
71  *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
72  *	Above that, power-of-2 sized allocations are page-aligned.  Non-
73  *	power-of-2 sized allocations are aligned the same as the chunk
74  *	size for their zone.
75  *    + ability to allocate arbitrarily large chunks of memory
76  *    + realloc will reuse the passed pointer if possible, within the
77  *	limitations of the zone chunking.
78  *
79  * On top of the slab allocator we also implement a 16-entry-per-thread
80  * magazine cache for allocations <= NOMSLABSIZE.
81  *
82  *				FUTURE FEATURES
83  *
84  *    + [better] garbage collection
85  *    + better initial sizing.
86  *
87  * TUNING
88  *
89  * The value of the environment variable MALLOC_OPTIONS is a character string
90  * containing various flags to tune dmalloc.  Upper case letters enable or
91  * increase a feature, lower case letters disable or decrease it (see the
 * usage sketch following this comment block).
92  *
93  * U		Enable UTRACE for all operations, observable with ktrace.
94  *		Disabled by default.
95  *
96  * Z		Zero out allocations, otherwise allocations (except for
97  *		calloc) will contain garbage.
98  *		Disabled by default.
99  *
100  * H		Pass a hint with madvise() about unused pages.
101  *		Disabled by default.
102  *		Not currently implemented.
103  *
104  * F		Disable local per-thread caching.
105  *		Disabled by default.
106  *
107  * C		Increase (decrease) how much excess cache to retain.
108  *		Set to 4 by default.
109  */
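
/*
 * Illustrative sketch (not part of the original source): a minimal way a
 * parent process might set MALLOC_OPTIONS before exec'ing a child, using
 * the option letters documented above.  The function name and the argv
 * handling are hypothetical.
 */
#if 0
#include <stdlib.h>
#include <unistd.h>

static int
example_spawn_tuned(char *const argv[])
{
	/* "ZU": zero all allocations and enable utrace (observable with ktrace) */
	if (setenv("MALLOC_OPTIONS", "ZU", 1) != 0)
		return (-1);
	return (execv(argv[0], argv));
}
#endif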
110 
111 /* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */
112 
113 #ifndef STANDALONE_DEBUG
114 #include "libc_private.h"
115 #endif
116 
117 #include <sys/param.h>
118 #include <sys/types.h>
119 #include <sys/mman.h>
120 #include <sys/queue.h>
121 #include <sys/uio.h>
122 #include <sys/ktrace.h>
123 #include <stdio.h>
124 #include <stdint.h>
125 #include <stdlib.h>
126 #include <stdarg.h>
127 #include <stddef.h>
128 #include <unistd.h>
129 #include <string.h>
130 #include <fcntl.h>
131 #include <errno.h>
132 #include <pthread.h>
133 #include <limits.h>
134 
135 #include <machine/atomic.h>
136 #include <machine/cpufunc.h>
137 
138 #ifdef STANDALONE_DEBUG
139 void _nmalloc_thr_init(void);
140 #else
141 #include "spinlock.h"
142 #include "un-namespace.h"
143 #endif
144 
145 #ifndef MAP_SIZEALIGN
146 #define MAP_SIZEALIGN	0
147 #endif
148 
149 #if SSIZE_MAX == 0x7FFFFFFF
150 #define ADDRBITS	32
151 #define UVM_BITS	32	/* worst case */
152 #else
153 #define ADDRBITS	64
154 #define UVM_BITS	48	/* worst case XXX */
155 #endif
156 
157 #if LONG_MAX == 0x7FFFFFFF
158 #define LONG_BITS	32
159 #define LONG_BITS_SHIFT	5
160 #else
161 #define LONG_BITS	64
162 #define LONG_BITS_SHIFT	6
163 #endif
164 
165 #define LOCKEDPTR	((void *)(intptr_t)-1)
166 
167 /*
168  * Regions[]
169  */
170 #define NREGIONS_BITS	10
171 #define NREGIONS	(1 << NREGIONS_BITS)
172 #define NREGIONS_MASK	(NREGIONS - 1)
173 #define NREGIONS_SHIFT	(UVM_BITS - NREGIONS_BITS)
174 #define NREGIONS_SIZE	(1LU << NREGIONS_SHIFT)
175 
176 typedef struct region *region_t;
177 typedef struct slglobaldata *slglobaldata_t;
178 typedef struct slab *slab_t;
179 
180 struct region {
181 	uintptr_t	mask;
182 	slab_t		slab;	/* conditional out of band slab */
183 };
184 
185 static struct region Regions[NREGIONS];
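
/*
 * Illustrative sketch (not part of the original source): how a pointer is
 * mapped back to its governing slab.  The top NREGIONS_BITS of the user
 * address select a Regions[] entry; the entry either points at an
 * out-of-band slab header or supplies a mask that rounds the pointer down
 * to the in-band slab base.  This mirrors the lookup performed in
 * memrealloc() and memfree() below.
 */
#if 0
static slab_t
example_ptr_to_slab(void *ptr)
{
	region_t region;
	slab_t slab;

	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
	if ((slab = region->slab) == NULL)
		slab = (void *)((uintptr_t)ptr & region->mask);
	return (slab);
}
#endif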
186 
187 /*
188  * Number of chunking zones available
189  */
190 #define CHUNKFACTOR	8
191 #if ADDRBITS == 32
192 #define NZONES		(16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
193 #else
194 #define NZONES		(16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
195 #endif
196 
197 static int MaxChunks[NZONES];
198 
199 #define NDEPOTS		8		/* must be power of 2 */
200 
201 /*
202  * Maximum number of chunks per slab, governed by the allocation bitmap in
203  * each slab.  The maximum is reduced for large chunk sizes.
204  */
205 #define MAXCHUNKS	(LONG_BITS * LONG_BITS)
206 #define MAXCHUNKS_BITS	(LONG_BITS_SHIFT + LONG_BITS_SHIFT)	/* log2(MAXCHUNKS) */
207 #define LITSLABSIZE	(32 * 1024)
208 #define NOMSLABSIZE	(2 * 1024 * 1024)
209 #define BIGSLABSIZE	(128 * 1024 * 1024)
210 
211 #define ZALLOC_SLAB_MAGIC	0x736c6162	/* magic sanity */
212 
213 TAILQ_HEAD(slab_list, slab);
214 
215 /*
216  * A slab structure
217  */
218 struct slab {
219 	struct slab	*next;		/* slabs with available space */
220 	TAILQ_ENTRY(slab) entry;
221 	int32_t		magic;		/* magic number for sanity check */
222 	u_int		navail;		/* number of free elements available */
223 	u_int		nmax;
224 	u_int		free_bit;	/* free hint bitno */
225 	u_int		free_index;	/* free hint index */
226 	u_long		bitmap[LONG_BITS]; /* free chunks */
227 	size_t		slab_size;	/* size of entire slab */
228 	size_t		chunk_size;	/* chunk size for validation */
229 	int		zone_index;
230 	enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
231 	int		flags;
232 	region_t	region;		/* related region */
233 	char		*chunks;	/* chunk base */
234 	slglobaldata_t	slgd;		/* localized to thread else NULL */
235 };
236 
237 /*
238  * per-thread data + global depot
239  *
240  * NOTE: The magazine shortcut is only used for per-thread data.
241  */
242 #define NMAGSHORTCUT	16
243 
244 struct slglobaldata {
245 	spinlock_t	lock;		/* only used by slglobaldepot */
246 	struct zoneinfo {
247 		slab_t	avail_base;
248 		slab_t	empty_base;
249 		int	best_region;
250 		int	mag_index;
251 		int	avail_count;
252 		int	empty_count;
253 		void	*mag_shortcut[NMAGSHORTCUT];
254 	} zone[NZONES];
255 	struct slab_list full_zones;	/* via entry */
256 	int		masked;
257 	int		biggest_index;
258 	size_t		nslabs;
259 };
260 
261 #define SLAB_ZEROD		0x0001
262 
263 /*
264  * Misc constants.
265  */
268 #define MIN_CHUNK_SIZE		8		/* in bytes */
269 #define MIN_CHUNK_MASK		(MIN_CHUNK_SIZE - 1)
270 
271 #define SAFLAG_ZERO	0x00000001
272 
273 /*
274  * The WEIRD_ADDR is used as known text to copy into free objects to
275  * try to create deterministic failure cases if the data is accessed after
276  * free.
277  *
278  * WARNING: A limited number of spinlocks are available, BIGXSIZE should
279  *	    not be larger than 64.
280  */
281 #define WEIRD_ADDR      0xdeadc0de
282 #define MAX_COPY        sizeof(weirdary)
283 
284 #define arysize(ary)	(sizeof(ary)/sizeof((ary)[0]))
285 
286 /*
287  * Thread control
288  */
289 
290 #define MASSERT(exp)	do { if (__predict_false(!(exp)))	\
291 				_mpanic("assertion: %s in %s",	\
292 				#exp, __func__);		\
293 			    } while (0)
294 
295 /*
296  * With this attribute set, accessing this variable does not require a
297  * function call when the code is compiled with -fPIC.
298  *
299  * Must be empty for libc_rtld (similar to __thread)
300  */
301 #if defined(__LIBC_RTLD)
302 #define TLS_ATTRIBUTE
303 #else
304 #define TLS_ATTRIBUTE __attribute__ ((tls_model ("initial-exec")));
305 #endif
306 
307 static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
308 static pthread_key_t thread_malloc_key;
309 static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
310 static struct slglobaldata slglobaldepot;
311 
312 static int opt_madvise = 0;
313 static int opt_free = 0;
314 static int opt_cache = 4;
315 static int opt_utrace = 0;
316 static int g_malloc_flags = 0;
317 static int malloc_panic;
318 
319 static const int32_t weirdary[16] = {
320 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
321 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
322 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
323 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
324 };
325 
326 static void *memalloc(size_t size, int flags);
327 static void *memrealloc(void *ptr, size_t size);
328 static void memfree(void *ptr, int);
329 static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
330 static void slabfree(slab_t slab);
331 static void slabterm(slglobaldata_t slgd, slab_t slab);
332 static void *_vmem_alloc(int ri, size_t slab_size);
333 static void _vmem_free(void *ptr, size_t slab_size);
334 static void _mpanic(const char *ctl, ...) __printflike(1, 2);
335 #ifndef STANDALONE_DEBUG
336 static void malloc_init(void) __constructor(101);
337 #else
338 static void malloc_init(void) __constructor(101);
339 #endif
340 
341 
342 struct nmalloc_utrace {
343 	void *p;
344 	size_t s;
345 	void *r;
346 };
347 
348 #define UTRACE(a, b, c)						\
349 	if (opt_utrace) {					\
350 		struct nmalloc_utrace ut = {			\
351 			.p = (a),				\
352 			.s = (b),				\
353 			.r = (c)				\
354 		};						\
355 		utrace(&ut, sizeof(ut));			\
356 	}
357 
358 #ifdef INVARIANTS
359 /*
360  * If enabled, any memory allocated without SAFLAG_ZERO is initialized to -1.
361  */
362 static int  use_malloc_pattern;
363 #endif
364 
365 static void
366 malloc_init(void)
367 {
368 	const char *p = NULL;
369 
370 	TAILQ_INIT(&slglobal.full_zones);
371 
372 	Regions[0].mask = -1; /* disallow activity in lowest region */
373 
374 	if (issetugid() == 0)
375 		p = getenv("MALLOC_OPTIONS");
376 
377 	for (; p != NULL && *p != '\0'; p++) {
378 		switch(*p) {
379 		case 'u':
380 			opt_utrace = 0;
381 			break;
382 		case 'U':
383 			opt_utrace = 1;
384 			break;
385 		case 'h':
386 			opt_madvise = 0;
387 			break;
388 		case 'H':
389 			opt_madvise = 1;
390 			break;
391 		case 'c':
392 			if (opt_cache > 0)
393 				--opt_cache;
394 			break;
395 		case 'C':
396 			++opt_cache;
397 			break;
398 		case 'f':
399 			opt_free = 0;
400 			break;
401 		case 'F':
402 			opt_free = 1;
403 			break;
404 		case 'z':
405 			g_malloc_flags = 0;
406 			break;
407 		case 'Z':
408 			g_malloc_flags = SAFLAG_ZERO;
409 			break;
410 		default:
411 			break;
412 		}
413 	}
414 
415 	UTRACE((void *) -1, 0, NULL);
416 }
417 
418 /*
419  * We have to install a handler for nmalloc thread teardowns when
420  * the thread is created.  We cannot delay this because destructors in
421  * sophisticated userland programs can call malloc() for the first time
422  * during their thread exit.
423  *
424  * This routine is called directly from pthreads.
425  */
426 static void _nmalloc_thr_init_once(void);
427 static void _nmalloc_thr_destructor(void *thrp);
428 
429 void
430 _nmalloc_thr_init(void)
431 {
432 	static int did_init;
433 
434 	TAILQ_INIT(&slglobal.full_zones);
435 
436 	if (slglobal.masked)
437 		return;
438 
439 	slglobal.masked = 1;
440 	if (did_init == 0) {
441 		did_init = 1;
442 		pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
443 	}
444 	pthread_setspecific(thread_malloc_key, &slglobal);
445 	slglobal.masked = 0;
446 }
447 
448 void
449 _nmalloc_thr_prepfork(void)
450 {
451 	if (__isthreaded)
452 		_SPINLOCK(&slglobaldepot.lock);
453 }
454 
455 void
456 _nmalloc_thr_parentfork(void)
457 {
458 	if (__isthreaded)
459 		_SPINUNLOCK(&slglobaldepot.lock);
460 }
461 
462 void
463 _nmalloc_thr_childfork(void)
464 {
465 	if (__isthreaded)
466 		_SPINUNLOCK(&slglobaldepot.lock);
467 }
468 
469 /*
470  * Called just once
471  */
472 static void
473 _nmalloc_thr_init_once(void)
474 {
475 	int error;
476 
477 	error = pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
478 	if (error)
479 		abort();
480 }
481 
482 /*
483  * Called for each thread undergoing exit
484  *
485  * Move all of the thread's slabs into a depot.
486  */
487 static void
488 _nmalloc_thr_destructor(void *thrp)
489 {
490 	slglobaldata_t slgd = thrp;
491 	struct zoneinfo *zinfo;
492 	slab_t slab;
493 	void *ptr;
494 	int i;
495 	int j;
496 
497 	slgd->masked = 1;
498 
499 	for (i = 0; i <= slgd->biggest_index; i++) {
500 		zinfo = &slgd->zone[i];
501 
502 		while ((j = zinfo->mag_index) > 0) {
503 			--j;
504 			ptr = zinfo->mag_shortcut[j];
505 			zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
506 			zinfo->mag_index = j;
507 			memfree(ptr, 0);
508 		}
509 
510 		while ((slab = zinfo->empty_base) != NULL) {
511 			zinfo->empty_base = slab->next;
512 			--zinfo->empty_count;
513 			slabterm(slgd, slab);
514 		}
515 
516 		while ((slab = zinfo->avail_base) != NULL) {
517 			zinfo->avail_base = slab->next;
518 			--zinfo->avail_count;
519 			slabterm(slgd, slab);
520 		}
521 
522 		while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
523 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
524 			slabterm(slgd, slab);
525 		}
526 	}
527 }
528 
529 /*
530  * Calculate the zone index for the allocation request size and set the
531  * allocation request size to that particular zone's chunk size.
532  *
533  * Minimum alignment is 16 bytes for allocations >= 16 bytes to conform
534  * with malloc requirements for intel/amd.
535  */
536 static __inline int
537 zoneindex(size_t *bytes, size_t *chunking)
538 {
539 	size_t n = (size_t)*bytes;
540 	size_t x;
541 	size_t c;
542 	int i;
543 
544 	if (n < 128) {
545 		if (n < 16) {
546 			*bytes = n = (n + 7) & ~7;
547 			*chunking = 8;
548 			return(n / 8 - 1);	/* 8 byte chunks, 2 zones */
549 		} else {
550 			*bytes = n = (n + 15) & ~15;
551 			*chunking = 16;
552 			return(n / 16 + 2);	/* 16 byte chunks, 8 zones */
553 		}
554 	}
555 	if (n < 4096) {
556 		x = 256;
557 		c = x / (CHUNKFACTOR * 2);
558 		i = 16;
559 	} else {
560 		x = 8192;
561 		c = x / (CHUNKFACTOR * 2);
562 		i = 16 + CHUNKFACTOR * 5;  /* 256->512,1024,2048,4096,8192 */
563 	}
564 	while (n >= x) {
565 		x <<= 1;
566 		c <<= 1;
567 		i += CHUNKFACTOR;
568 		if (x == 0)
569 			_mpanic("slaballoc: byte value too high");
570 	}
571 	*bytes = n = roundup2(n, c);
572 	*chunking = c;
573 	return (i + n / c - CHUNKFACTOR);
574 #if 0
575 	*bytes = n = (n + c - 1) & ~(c - 1);
576 	*chunking = c;
577 	return (n / c + i);
578 
579 	if (n < 256) {
580 		*bytes = n = (n + 15) & ~15;
581 		*chunking = 16;
582 		return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);
583 	}
584 	if (n < 8192) {
585 		if (n < 512) {
586 			*bytes = n = (n + 31) & ~31;
587 			*chunking = 32;
588 			return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);
589 		}
590 		if (n < 1024) {
591 			*bytes = n = (n + 63) & ~63;
592 			*chunking = 64;
593 			return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);
594 		}
595 		if (n < 2048) {
596 			*bytes = n = (n + 127) & ~127;
597 			*chunking = 128;
598 			return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);
599 		}
600 		if (n < 4096) {
601 			*bytes = n = (n + 255) & ~255;
602 			*chunking = 256;
603 			return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);
604 		}
605 		*bytes = n = (n + 511) & ~511;
606 		*chunking = 512;
607 		return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);
608 	}
609 	if (n < 16384) {
610 		*bytes = n = (n + 1023) & ~1023;
611 		*chunking = 1024;
612 		return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);
613 	}
614 	if (n < 32768) {				/* 16384-32767 */
615 		*bytes = n = (n + 2047) & ~2047;
616 		*chunking = 2048;
617 		return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
618 	}
619 	if (n < 65536) {
620 		*bytes = n = (n + 4095) & ~4095;	/* 32768-65535 */
621 		*chunking = 4096;
622 		return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);
623 	}
624 
625 	x = 131072;
626 	c = 8192;
627 	i = CHUNKINGLO*10 - 1;
628 
629 	while (n >= x) {
630 		x <<= 1;
631 		c <<= 1;
632 		i += CHUNKINGHI;
633 		if (x == 0)
634 			_mpanic("slaballoc: byte value too high");
635 	}
636 	*bytes = n = (n + c - 1) & ~(c - 1);
637 	*chunking = c;
638 	return (n / c + i);
639 #endif
640 }
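
/*
 * Illustrative sketch (not part of the original source): exercising
 * zoneindex() for a few request sizes.  For example, a 100-byte request
 * rounds up to 112 bytes in a 16-byte-chunking zone.  The request sizes
 * used here are arbitrary.
 */
#if 0
static void
example_zoneindex(void)
{
	static const size_t reqs[] = { 1, 24, 100, 1000, 70000 };
	size_t bytes;
	size_t chunking;
	int i;
	int zi;

	for (i = 0; i < (int)arysize(reqs); ++i) {
		bytes = reqs[i];
		zi = zoneindex(&bytes, &chunking);
		printf("req %zu -> zone %d (rounded to %zu, chunking %zu)\n",
		       reqs[i], zi, bytes, chunking);
	}
}
#endif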
641 
642 /*
643  * malloc() - call internal slab allocator
644  */
645 void *
646 __malloc(size_t size)
647 {
648 	void *ptr;
649 
650 	ptr = memalloc(size, 0);
651 	if (ptr == NULL)
652 		errno = ENOMEM;
653 	else
654 		UTRACE(0, size, ptr);
655 	return(ptr);
656 }
657 
658 /*
659  * calloc() - call internal slab allocator
660  */
661 void *
662 __calloc(size_t number, size_t size)
663 {
664 	void *ptr;
665 
	/* Guard against overflow in number * size */
	if (size != 0 && number > SIZE_MAX / size) {
		errno = ENOMEM;
		return(NULL);
	}
666 	ptr = memalloc(number * size, SAFLAG_ZERO);
667 	if (ptr == NULL)
668 		errno = ENOMEM;
669 	else
670 		UTRACE(0, number * size, ptr);
671 	return(ptr);
672 }
673 
674 /*
675  * realloc() (SLAB ALLOCATOR)
676  *
677  * We do not attempt to optimize this routine beyond reusing the same
678  * pointer if the new size fits within the chunking of the old pointer's
679  * zone.
680  */
681 void *
682 __realloc(void *ptr, size_t size)
683 {
684 	void *ret;
685 
686 	if (ptr == NULL)
687 		ret = memalloc(size, 0);
688 	else
689 		ret = memrealloc(ptr, size);
690 	if (ret == NULL)
691 		errno = ENOMEM;
692 	else
693 		UTRACE(ptr, size, ret);
694 	return(ret);
695 }
696 
697 /*
698  * posix_memalign()
699  *
700  * Allocate (size) bytes with an alignment of (alignment), where (alignment)
701  * is a power of 2 >= sizeof(void *).
702  *
703  * The slab allocator will allocate on power-of-2 boundaries up to at least
704  * PAGE_SIZE.  Otherwise we use the zoneindex mechanic to find a zone
705  * matching the requirements.
706  */
707 int
708 __posix_memalign(void **memptr, size_t alignment, size_t size)
709 {
710 	/*
711 	 * OpenGroup spec issue 6 checks
712 	 */
713 	if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
714 		*memptr = NULL;
715 		return(EINVAL);
716 	}
717 	if (alignment < sizeof(void *)) {
718 		*memptr = NULL;
719 		return(EINVAL);
720 	}
721 
722 	/*
723 	 * XXX for now just find the nearest power of 2 >= size and also
724 	 * >= alignment and allocate that.
725 	 */
726 	while (alignment < size) {
727 		alignment <<= 1;
728 		if (alignment == 0)
729 			_mpanic("posix_memalign: byte value too high");
730 	}
731 	*memptr = memalloc(alignment, 0);
732 	return(*memptr ? 0 : ENOMEM);
733 }
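
/*
 * Illustrative sketch (not part of the original source): requesting a
 * 4096-byte-aligned buffer of 3000 bytes.  Because the routine above finds
 * the nearest power of 2 that is >= both the size and the alignment, this
 * returns a 4096-byte, 4096-byte-aligned chunk.  The function name is
 * hypothetical.
 */
#if 0
static void *
example_aligned_buffer(void)
{
	void *p;

	if (posix_memalign(&p, 4096, 3000) != 0)
		return (NULL);
	return (p);
}
#endif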
734 
735 /*
736  * free() (SLAB ALLOCATOR) - do the obvious
737  */
738 void
739 __free(void *ptr)
740 {
741 	if (ptr) {
742 		UTRACE(ptr, 0, 0);
743 		memfree(ptr, 0);
744 	}
745 }
746 
747 /*
748  * memalloc()	(SLAB ALLOCATOR)
749  *
750  *	Allocate memory via the slab allocator.
751  */
752 static void *
753 memalloc(size_t size, int flags)
754 {
755 	slglobaldata_t slgd;
756 	struct zoneinfo *zinfo;
757 	slab_t slab;
758 	size_t chunking;
759 	int bmi;
760 	int bno;
761 	u_long *bmp;
762 	int zi;
763 #ifdef INVARIANTS
764 	int i;
765 #endif
766 	int j;
767 	char *obj;
768 
769 	/*
770 	 * If 0 bytes are requested we still have to return a unique pointer,
771 	 * so allocate at least one byte.
772 	 */
773 	if (size == 0)
774 		size = 1;
775 
776 	/* Capture global flags */
777 	flags |= g_malloc_flags;
778 
779 	/* Compute allocation zone; zoneindex will panic on excessive sizes */
780 	zi = zoneindex(&size, &chunking);
781 	MASSERT(zi < NZONES);
782 	if (size == 0)
783 		return(NULL);
784 
785 	/*
786 	 * Try magazine shortcut first
787 	 */
788 	slgd = &slglobal;
789 	zinfo = &slgd->zone[zi];
790 
791 	if ((j = zinfo->mag_index) != 0) {
792 		zinfo->mag_index = --j;
793 		obj = zinfo->mag_shortcut[j];
794 		zinfo->mag_shortcut[j] = NULL;	/* SAFETY */
795 		if (flags & SAFLAG_ZERO)
796 			bzero(obj, size);
797 		return obj;
798 	}
799 
800 	/*
801 	 * Locate a slab with available space.  If no slabs are available,
802 	 * back off to the empty list, and if we still come up dry allocate
803 	 * a new slab (which will try the depot first).
804 	 */
805 retry:
806 	if ((slab = zinfo->avail_base) == NULL) {
807 		if ((slab = zinfo->empty_base) == NULL) {
808 			/*
809 			 * Still dry
810 			 */
811 			slab = slaballoc(zi, chunking, size);
812 			if (slab == NULL)
813 				return(NULL);
814 			slab->next = zinfo->avail_base;
815 			zinfo->avail_base = slab;
816 			++zinfo->avail_count;
817 			slab->state = AVAIL;
818 			if (slgd->biggest_index < zi)
819 				slgd->biggest_index = zi;
820 			++slgd->nslabs;
821 		} else {
822 			/*
823 			 * Pulled from empty list
824 			 */
825 			zinfo->empty_base = slab->next;
826 			slab->next = zinfo->avail_base;
827 			zinfo->avail_base = slab;
828 			++zinfo->avail_count;
829 			slab->state = AVAIL;
830 			--zinfo->empty_count;
831 		}
832 	}
833 
834 	/*
835 	 * Allocate a chunk out of the slab.  HOT PATH
836 	 *
837 	 * Only the thread owning the slab can allocate out of it.
838 	 *
839 	 * NOTE: The last bit in the bitmap is always marked allocated so
840 	 *	 we cannot overflow here.
841 	 */
842 	bno = slab->free_bit;
843 	bmi = slab->free_index;
844 	bmp = &slab->bitmap[bmi];
845 	if (*bmp & (1LU << bno)) {
846 		atomic_clear_long(bmp, 1LU << bno);
847 		obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
848 		slab->free_bit = (bno + 1) & (LONG_BITS - 1);
849 		atomic_add_int(&slab->navail, -1);
850 		if (flags & SAFLAG_ZERO)
851 			bzero(obj, size);
852 		return (obj);
853 	}
854 
855 	/*
856 	 * Allocate a chunk out of a slab.  COLD PATH
857 	 */
858 	if (slab->navail == 0) {
859 		zinfo->avail_base = slab->next;
860 		--zinfo->avail_count;
861 		slab->state = FULL;
862 		TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
863 		goto retry;
864 	}
865 
866 	while (bmi < LONG_BITS) {
867 		bmp = &slab->bitmap[bmi];
868 		if (*bmp) {
869 			bno = bsflong(*bmp);
870 			atomic_clear_long(bmp, 1LU << bno);
871 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
872 					     size;
873 			slab->free_index = bmi;
874 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
875 			atomic_add_int(&slab->navail, -1);
876 			if (flags & SAFLAG_ZERO)
877 				bzero(obj, size);
878 			return (obj);
879 		}
880 		++bmi;
881 	}
882 	bmi = 0;
883 	while (bmi < LONG_BITS) {
884 		bmp = &slab->bitmap[bmi];
885 		if (*bmp) {
886 			bno = bsflong(*bmp);
887 			atomic_clear_long(bmp, 1LU << bno);
888 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
889 					     size;
890 			slab->free_index = bmi;
891 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
892 			atomic_add_int(&slab->navail, -1);
893 			if (flags & SAFLAG_ZERO)
894 				bzero(obj, size);
895 			return (obj);
896 		}
897 		++bmi;
898 	}
899 	_mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
900 	/* not reached */
901 	return NULL;
902 }
903 
904 /*
905  * Reallocate memory within the chunk
906  */
907 static void *
908 memrealloc(void *ptr, size_t nsize)
909 {
910 	region_t region;
911 	slab_t slab;
912 	size_t osize;
913 	char *obj;
914 	int flags = 0;
915 
916 	/*
917 	 * If 0 bytes are requested we still have to return a unique pointer,
918 	 * so allocate at least one byte.
919 	 */
920 	if (nsize == 0)
921 		nsize = 1;
922 
923 	/* Capture global flags */
924 	flags |= g_malloc_flags;
925 
926 	/*
927 	 * Locate the zone by looking up the dynamic slab size mask based
928 	 * on the memory region the allocation resides in.
929 	 */
930 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
931 	if ((slab = region->slab) == NULL)
932 		slab = (void *)((uintptr_t)ptr & region->mask);
933 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
934 	osize = slab->chunk_size;
935 	if (nsize <= osize) {
936 		if (osize < 32 || nsize >= osize / 2) {
937 			obj = ptr;
938 			if ((flags & SAFLAG_ZERO) && nsize < osize)
939 				bzero(obj + nsize, osize - nsize);
940 			return(obj);
941 		}
942 	}
943 
944 	/*
945 	 * Otherwise resize the object
946 	 */
947 	obj = memalloc(nsize, 0);
948 	if (obj) {
949 		if (nsize > osize)
950 			nsize = osize;
951 		bcopy(ptr, obj, nsize);
952 		memfree(ptr, 0);
953 	}
954 	return (obj);
955 }
956 
957 /*
958  * free (SLAB ALLOCATOR)
959  *
960  * Free a memory block previously allocated by malloc.
961  *
962  * MPSAFE
963  */
964 static void
965 memfree(void *ptr, int flags)
966 {
967 	region_t region;
968 	slglobaldata_t slgd;
969 	slab_t slab;
970 	slab_t stmp;
971 	slab_t *slabp;
972 	int bmi;
973 	int bno;
974 	int j;
975 	u_long *bmp;
976 
977 	/*
978 	 * Locate the zone by looking up the dynamic slab size mask based
979 	 * on the memory region the allocation resides in.
980 	 *
981 	 * WARNING!  The slab may be owned by another thread!
982 	 */
983 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
984 	if ((slab = region->slab) == NULL)
985 		slab = (void *)((uintptr_t)ptr & region->mask);
986 	MASSERT(slab != NULL);
987 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
988 
989 #ifdef INVARIANTS
990 	/*
991 	 * Put weird data into the memory to detect modifications after
992 	 * freeing, illegal pointer use after freeing (we should fault on
993 	 * the odd address), and so forth.
994 	 */
995 	if (slab->chunk_size < sizeof(weirdary))
996 		bcopy(weirdary, ptr, slab->chunk_size);
997 	else
998 		bcopy(weirdary, ptr, sizeof(weirdary));
999 #endif
1000 	slgd = &slglobal;
1001 
1002 	/*
1003 	 * Use mag_shortcut[] when possible
1004 	 */
1005 	if (slgd->masked == 0 && slab->chunk_size <= NOMSLABSIZE) {
1006 		struct zoneinfo *zinfo;
1007 
1008 		zinfo = &slgd->zone[slab->zone_index];
1009 		j = zinfo->mag_index;
1010 		if (j < NMAGSHORTCUT) {
1011 			zinfo->mag_shortcut[j] = ptr;
1012 			zinfo->mag_index = j + 1;
1013 			return;
1014 		}
1015 	}
1016 
1017 	/*
1018 	 * Free to slab and increment navail.  We can delay incrementing
1019 	 * navail to prevent the slab from being destroyed out from under
1020 	 * us while we do other optimizations.
1021 	 */
1022 	bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
1023 	bmi = bno >> LONG_BITS_SHIFT;
1024 	bno &= (LONG_BITS - 1);
1025 	bmp = &slab->bitmap[bmi];
1026 
1027 	MASSERT(bmi >= 0 && bmi < slab->nmax);
1028 	MASSERT((*bmp & (1LU << bno)) == 0);
1029 	atomic_set_long(bmp, 1LU << bno);
1030 
1031 	if (slab->slgd == slgd) {
1032 		/*
1033 		 * We can only do the following if we own the slab.  Note
1034 		 * that navail can be incremented by any thread even if
1035 		 * we own the slab.
1036 		 */
1037 		struct zoneinfo *zinfo;
1038 
1039 		atomic_add_int(&slab->navail, 1);
1040 		if (slab->free_index > bmi) {
1041 			slab->free_index = bmi;
1042 			slab->free_bit = bno;
1043 		} else if (slab->free_index == bmi &&
1044 			   slab->free_bit > bno) {
1045 			slab->free_bit = bno;
1046 		}
1047 		zinfo = &slgd->zone[slab->zone_index];
1048 
1049 		/*
1050 		 * Freeing an object from a full slab makes it less than
1051 		 * full.  The slab must be moved to the available list.
1052 		 *
1053 		 * If the available list has too many slabs, release some
1054 		 * to the depot.
1055 		 */
1056 		if (slab->state == FULL) {
1057 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
1058 			slab->state = AVAIL;
1059 			stmp = zinfo->avail_base;
1060 			slab->next = stmp;
1061 			zinfo->avail_base = slab;
1062 			++zinfo->avail_count;
1063 			while (zinfo->avail_count > opt_cache) {
1064 				slab = zinfo->avail_base;
1065 				zinfo->avail_base = slab->next;
1066 				--zinfo->avail_count;
1067 				slabterm(slgd, slab);
1068 			}
1069 			goto done;
1070 		}
1071 
1072 		/*
1073 		 * If the slab becomes completely empty dispose of it in
1074 		 * some manner.  By default each thread caches up to 4
1075 		 * empty slabs.  Only small slabs are cached.
1076 		 */
1077 		if (slab->navail == slab->nmax && slab->state == AVAIL) {
1078 			/*
1079 			 * Remove slab from available queue
1080 			 */
1081 			slabp = &zinfo->avail_base;
1082 			while ((stmp = *slabp) != slab)
1083 				slabp = &stmp->next;
1084 			*slabp = slab->next;
1085 			--zinfo->avail_count;
1086 
1087 			if (opt_free || opt_cache == 0) {
1088 				/*
1089 				 * If local caching is disabled cache the
1090 				 * slab in the depot (or free it).
1091 				 */
1092 				slabterm(slgd, slab);
1093 			} else if (slab->slab_size > BIGSLABSIZE) {
1094 				/*
1095 				 * We do not try to retain large slabs
1096 				 * in per-thread caches.
1097 				 */
1098 				slabterm(slgd, slab);
1099 			} else if (zinfo->empty_count > opt_cache) {
1100 				/*
1101 				 * We have too many slabs cached, but
1102 				 * instead of freeing this one free
1103 				 * an empty slab that's been idle longer.
1104 				 *
1105 				 * (empty_count does not change)
1106 				 */
1107 				stmp = zinfo->empty_base;
1108 				slab->state = EMPTY;
1109 				slab->next = stmp->next;
1110 				zinfo->empty_base = slab;
1111 				slabterm(slgd, stmp);
1112 			} else {
1113 				/*
1114 				 * Cache the empty slab in our thread local
1115 				 * empty list.
1116 				 */
1117 				++zinfo->empty_count;
1118 				slab->state = EMPTY;
1119 				slab->next = zinfo->empty_base;
1120 				zinfo->empty_base = slab;
1121 			}
1122 		}
1123 	} else if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1124 		slglobaldata_t sldepot;
1125 
1126 		/*
1127 		 * If freeing to a slab owned by the global depot, and
1128 		 * the slab becomes completely EMPTY, try to move it to
1129 		 * the correct list.
1130 		 */
1131 		sldepot = &slglobaldepot;
1132 		if (__isthreaded)
1133 			_SPINLOCK(&sldepot->lock);
1134 		if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1135 			struct zoneinfo *zinfo;
1136 
1137 			/*
1138 			 * Move the slab to the empty list
1139 			 */
1140 			MASSERT(slab->state == AVAIL);
1141 			atomic_add_int(&slab->navail, 1);
1142 			zinfo = &sldepot->zone[slab->zone_index];
1143 			slabp = &zinfo->avail_base;
1144 			while (slab != *slabp)
1145 				slabp = &(*slabp)->next;
1146 			*slabp = slab->next;
1147 			--zinfo->avail_count;
1148 
1149 			/*
1150 			 * Clean out excessive empty entries from the
1151 			 * depot.
1152 			 */
1153 			slab->state = EMPTY;
1154 			slab->next = zinfo->empty_base;
1155 			zinfo->empty_base = slab;
1156 			++zinfo->empty_count;
1157 			while (zinfo->empty_count > opt_cache) {
1158 				slab = zinfo->empty_base;
1159 				zinfo->empty_base = slab->next;
1160 				--zinfo->empty_count;
1161 				slab->state = UNKNOWN;
1162 				if (__isthreaded)
1163 					_SPINUNLOCK(&sldepot->lock);
1164 				slabfree(slab);
1165 				if (__isthreaded)
1166 					_SPINLOCK(&sldepot->lock);
1167 			}
1168 		} else {
1169 			atomic_add_int(&slab->navail, 1);
1170 		}
1171 		if (__isthreaded)
1172 			_SPINUNLOCK(&sldepot->lock);
1173 	} else {
1174 		/*
1175 		 * We can't act on the slab other than by adjusting navail
1176 		 * (and the bitmap which we did in the common code at the
1177 		 * top).
1178 		 */
1179 		atomic_add_int(&slab->navail, 1);
1180 	}
1181 done:
1182 	;
1183 }
1184 
1185 /*
1186  * Allocate a new slab holding objects of size chunk_size.
1187  */
1188 static slab_t
1189 slaballoc(int zi, size_t chunking, size_t chunk_size)
1190 {
1191 	slglobaldata_t slgd;
1192 	slglobaldata_t sldepot;
1193 	struct zoneinfo *zinfo;
1194 	region_t region;
1195 	void *save;
1196 	slab_t slab;
1197 	size_t slab_desire;
1198 	size_t slab_size;
1199 	size_t region_mask;
1200 	uintptr_t chunk_offset;
1201 	ssize_t maxchunks;
1202 	ssize_t tmpchunks;
1203 	int ispower2;
1204 	int power;
1205 	int ri;
1206 	int rx;
1207 	int nswath;
1208 	int j;
1209 
1210 	/*
1211 	 * First look in the depot.  Any given zone in the depot may be
1212 	 * locked by being set to -1.  We have to do this instead of simply
1213 	 * removing the entire chain because removing the entire chain can
1214 	 * cause racing threads to allocate local slabs for large objects,
1215 	 * resulting in a large VSZ.
1216 	 */
1217 	slgd = &slglobal;
1218 	sldepot = &slglobaldepot;
1219 	zinfo = &sldepot->zone[zi];
1220 
1221 	if (zinfo->avail_base) {
1222 		if (__isthreaded)
1223 			_SPINLOCK(&sldepot->lock);
1224 		slab = zinfo->avail_base;
1225 		if (slab) {
1226 			MASSERT(slab->slgd == NULL);
1227 			slab->slgd = slgd;
1228 			zinfo->avail_base = slab->next;
1229 			--zinfo->avail_count;
1230 			if (__isthreaded)
1231 				_SPINUNLOCK(&sldepot->lock);
1232 			return slab;
1233 		}
1234 		if (__isthreaded)
1235 			_SPINUNLOCK(&sldepot->lock);
1236 	}
1237 
1238 	/*
1239 	 * Nothing in the depot, allocate a new slab by locating or assigning
1240 	 * a region and then using the system virtual memory allocator.
1241 	 */
1242 	slab = NULL;
1243 
1244 	/*
1245 	 * Calculate the start of the data chunks relative to the start
1246 	 * of the slab.  If chunk_size is a power of 2 we guarantee
1247 	 * power of 2 alignment.  If it is not we guarantee alignment
1248 	 * to the chunk size.
1249 	 */
1250 	if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
1251 		ispower2 = 1;
1252 		chunk_offset = roundup2(sizeof(*slab), chunk_size);
1253 	} else {
1254 		ispower2 = 0;
1255 		chunk_offset = sizeof(*slab) + chunking - 1;
1256 		chunk_offset -= chunk_offset % chunking;
1257 	}
1258 
1259 	/*
1260 	 * Calculate a reasonable number of chunks for the slab.
1261 	 *
1262 	 * Once initialized the MaxChunks[] array can only ever be
1263 	 * reinitialized to the same value.
1264 	 */
1265 	maxchunks = MaxChunks[zi];
1266 	if (maxchunks == 0) {
1267 		/*
1268 		 * First calculate how many chunks would fit in 1/1024
1269 		 * available memory.  This is around 2MB on a 32 bit
1270 		 * system and 128G on a 64-bit (48-bits available) system.
1271 		 */
1272 		maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
1273 			    (ssize_t)chunk_size;
1274 		if (maxchunks <= 0)
1275 			maxchunks = 1;
1276 
1277 		/*
1278 		 * A slab cannot handle more than MAXCHUNKS chunks, but
1279 		 * limit us to approximately MAXCHUNKS / 2 here because
1280 		 * we may have to expand maxchunks when we calculate the
1281 		 * actual power-of-2 slab.
1282 		 */
1283 		if (maxchunks > MAXCHUNKS / 2)
1284 			maxchunks = MAXCHUNKS / 2;
1285 
1286 		/*
1287 		 * Try to limit the slabs to BIGSLABSIZE (~128MB).  Larger
1288 		 * slabs will be created if the allocation does not fit.
1289 		 */
1290 		if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
1291 			tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
1292 				    (ssize_t)chunk_size;
1293 			if (tmpchunks <= 0)
1294 				tmpchunks = 1;
1295 			if (maxchunks > tmpchunks)
1296 				maxchunks = tmpchunks;
1297 		}
1298 
1299 		/*
1300 		 * If the slab calculates to greater than 2MB see if we
1301 		 * can cut it down to ~2MB.  This controls VSZ but has
1302 		 * no effect on run-time size or performance.
1303 		 *
1304 		 * This is very important in case you core dump and also
1305 		 * important to reduce unnecessary region allocations.
1306 		 */
1307 		if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
1308 			tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
1309 				    (ssize_t)chunk_size;
1310 			if (tmpchunks < 1)
1311 				tmpchunks = 1;
1312 			if (maxchunks > tmpchunks)
1313 				maxchunks = tmpchunks;
1314 		}
1315 
1316 		/*
1317 		 * If the slab calculates to greater than 128K see if we
1318 		 * can cut it down to ~128K while still maintaining a
1319 		 * reasonably large number of chunks in each slab.  This
1320 		 * controls VSZ but has no effect on run-time size or
1321 		 * performance.
1322 		 *
1323 		 * This is very important in case you core dump and also
1324 		 * important to reduce unnecessary region allocations.
1325 		 */
1326 		if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
1327 			tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
1328 				    (ssize_t)chunk_size;
1329 			if (tmpchunks < 32)
1330 				tmpchunks = 32;
1331 			if (maxchunks > tmpchunks)
1332 				maxchunks = tmpchunks;
1333 		}
1334 
1335 		MaxChunks[zi] = maxchunks;
1336 	}
1337 	MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);
1338 
1339 	/*
1340 	 * Calculate the actual slab size.  maxchunks will be recalculated
1341 	 * a little later.
1342 	 */
1343 	slab_desire = chunk_offset + chunk_size * maxchunks;
1344 	slab_size = 8 * MAXCHUNKS;
1345 	power = 3 + MAXCHUNKS_BITS;
1346 	while (slab_size < slab_desire) {
1347 		slab_size <<= 1;
1348 		++power;
1349 	}
1350 
1351 	/*
1352 	 * Do a quick recalculation based on the actual slab size but not
1353 	 * yet dealing with whether the slab header is in-band or out-of-band.
1354 	 * The purpose here is to see if we can reasonably reduce slab_size
1355 	 * to a power of 4 to allow more chunk sizes to use the same slab
1356 	 * size.
1357 	 */
1358 	if ((power & 1) && slab_size > 32768) {
1359 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1360 		if (maxchunks >= MAXCHUNKS / 8) {
1361 			slab_size >>= 1;
1362 			--power;
1363 		}
1364 	}
1365 	if ((power & 2) && slab_size > 32768 * 4) {
1366 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1367 		if (maxchunks >= MAXCHUNKS / 4) {
1368 			slab_size >>= 2;
1369 			power -= 2;
1370 		}
1371 	}
1372 	/*
1373 	 * This case occurs when the slab_size is larger than 1/1024 available
1374 	 * UVM.
1375 	 */
1376 	nswath = slab_size / NREGIONS_SIZE;
1377 	if (nswath > NREGIONS)
1378 		return (NULL);
1379 
1380 
1381 	/*
1382 	 * Try to allocate from our current best region for this zi
1383 	 */
1384 	region_mask = ~(slab_size - 1);
1385 	ri = slgd->zone[zi].best_region;
1386 	if (Regions[ri].mask == region_mask) {
1387 		if ((slab = _vmem_alloc(ri, slab_size)) != NULL)
1388 			goto found;
1389 	}
1390 
1391 	/*
1392 	 * Try to find an existing region to allocate from.  The normal
1393 	 * case will be for allocations that are less than 1/1024 available
1394 	 * UVM, which fit into a single Regions[] entry.
1395 	 */
1396 	while (slab_size <= NREGIONS_SIZE) {
1397 		rx = -1;
1398 		for (ri = 0; ri < NREGIONS; ++ri) {
1399 			if (rx < 0 && Regions[ri].mask == 0)
1400 				rx = ri;
1401 			if (Regions[ri].mask == region_mask) {
1402 				slab = _vmem_alloc(ri, slab_size);
1403 				if (slab) {
1404 					slgd->zone[zi].best_region = ri;
1405 					goto found;
1406 				}
1407 			}
1408 		}
1409 
1410 		if (rx < 0)
1411 			return(NULL);
1412 
1413 		/*
1414 		 * This can fail, retry either way
1415 		 */
1416 		atomic_cmpset_ptr((void **)&Regions[rx].mask,
1417 				  NULL,
1418 				  (void *)region_mask);
1419 	}
1420 
1421 	for (;;) {
1422 		rx = -1;
1423 		for (ri = 0; ri < NREGIONS; ri += nswath) {
1424 			if (Regions[ri].mask == region_mask) {
1425 				slab = _vmem_alloc(ri, slab_size);
1426 				if (slab) {
1427 					slgd->zone[zi].best_region = ri;
1428 					goto found;
1429 				}
1430 			}
1431 			if (rx < 0) {
1432 				for (j = nswath - 1; j >= 0; --j) {
1433 					if (Regions[ri+j].mask != 0)
1434 						break;
1435 				}
1436 				if (j < 0)
1437 					rx = ri;
1438 			}
1439 		}
1440 
1441 		/*
1442 		 * We found a candidate, try to allocate it backwards so
1443 		 * another thread racing a slaballoc() does not see the
1444 		 * mask in the base index position until we are done.
1445 		 *
1446 		 * We can safely zero-out any partial allocations because
1447 		 * the mask is only accessed from the base index.  Any other
1448 		 * threads racing us will fail prior to us clearing the mask.
1449 		 */
1450 		if (rx < 0)
1451 			return(NULL);
1452 		for (j = nswath - 1; j >= 0; --j) {
1453 			if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
1454 					       NULL, (void *)region_mask)) {
1455 				while (++j < nswath)
1456 					Regions[rx+j].mask = 0;
1457 				break;
1458 			}
1459 		}
1460 		/* retry */
1461 	}
1462 
1463 	/*
1464 	 * Fill in the new slab in region ri.  If the slab_size completely
1465 	 * fills one or more region slots we move the slab structure out of
1466 	 * band which should optimize the chunking (particularly for a power
1467 	 * of 2).
1468 	 */
1469 found:
1470 	region = &Regions[ri];
1471 	MASSERT(region->slab == NULL);
1472 	if (slab_size >= NREGIONS_SIZE) {
1473 		save = slab;
1474 		slab = memalloc(sizeof(*slab), 0);
1475 		bzero(slab, sizeof(*slab));
1476 		slab->chunks = save;
1477 		for (j = 0; j < nswath; ++j)
1478 			region[j].slab = slab;
1479 		chunk_offset = 0;
1480 	} else {
1481 		bzero(slab, sizeof(*slab));
1482 		slab->chunks = (char *)slab + chunk_offset;
1483 	}
1484 
1485 	/*
1486 	 * Calculate the start of the chunks memory and recalculate the
1487 	 * actual number of chunks the slab can hold.
1488 	 */
1489 	maxchunks = (slab_size - chunk_offset) / chunk_size;
1490 	if (maxchunks > MAXCHUNKS)
1491 		maxchunks = MAXCHUNKS;
1492 
1493 	/*
1494 	 * And fill in the rest
1495 	 */
1496 	slab->magic = ZALLOC_SLAB_MAGIC;
1497 	slab->navail = maxchunks;
1498 	slab->nmax = maxchunks;
1499 	slab->slab_size = slab_size;
1500 	slab->chunk_size = chunk_size;
1501 	slab->zone_index = zi;
1502 	slab->slgd = slgd;
1503 	slab->state = UNKNOWN;
1504 	slab->region = region;
1505 
1506 	for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
1507 		if (ri + LONG_BITS <= maxchunks)
1508 			slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
1509 		else
1510 			slab->bitmap[ri >> LONG_BITS_SHIFT] =
1511 						(1LU << (maxchunks - ri)) - 1;
1512 	}
1513 	return (slab);
1514 }
1515 
1516 /*
1517  * Free a slab.
1518  */
1519 static void
1520 slabfree(slab_t slab)
1521 {
1522 	int nswath;
1523 	int j;
1524 
1525 	if (slab->region->slab == slab) {
1526 		/*
1527 		 * Out-of-band slab.
1528 		 */
1529 		nswath = slab->slab_size / NREGIONS_SIZE;
1530 		for (j = 0; j < nswath; ++j)
1531 			slab->region[j].slab = NULL;
1532 		slab->magic = 0;
1533 		_vmem_free(slab->chunks, slab->slab_size);
1534 		memfree(slab, 0);
1535 	} else {
1536 		/*
1537 		 * In-band slab.
1538 		 */
1539 		slab->magic = 0;
1540 		_vmem_free(slab, slab->slab_size);
1541 	}
1542 }
1543 
1544 /*
1545  * Terminate a slab's use in the current thread.  The slab may still have
1546  * outstanding allocations and thus not be deallocatable.
1547  */
1548 static void
1549 slabterm(slglobaldata_t slgd, slab_t slab)
1550 {
1551 	slglobaldata_t sldepot;
1552 	struct zoneinfo *zinfo;
1553 	int zi = slab->zone_index;
1554 
1555 	slab->slgd = NULL;
1556 	--slgd->nslabs;
1557 	sldepot = &slglobaldepot;
1558 	zinfo = &sldepot->zone[zi];
1559 
1560 	/*
1561 	 * Move the slab to the avail list or the empty list.
1562 	 */
1563 	if (__isthreaded)
1564 		_SPINLOCK(&sldepot->lock);
1565 	if (slab->navail == slab->nmax) {
1566 		slab->state = EMPTY;
1567 		slab->next = zinfo->empty_base;
1568 		zinfo->empty_base = slab;
1569 		++zinfo->empty_count;
1570 	} else {
1571 		slab->state = AVAIL;
1572 		slab->next = zinfo->avail_base;
1573 		zinfo->avail_base = slab;
1574 		++zinfo->avail_count;
1575 	}
1576 
1577 	/*
1578 	 * Clean extra slabs out of the empty list
1579 	 */
1580 	while (zinfo->empty_count > opt_cache) {
1581 		slab = zinfo->empty_base;
1582 		zinfo->empty_base = slab->next;
1583 		--zinfo->empty_count;
1584 		slab->state = UNKNOWN;
1585 		if (__isthreaded)
1586 			_SPINUNLOCK(&sldepot->lock);
1587 		slabfree(slab);
1588 		if (__isthreaded)
1589 			_SPINLOCK(&sldepot->lock);
1590 	}
1591 	if (__isthreaded)
1592 		_SPINUNLOCK(&sldepot->lock);
1593 }
1594 
1595 /*
1596  * _vmem_alloc()
1597  *
1598  *	Directly map memory in PAGE_SIZE'd chunks with the specified
1599  *	alignment.
1600  *
1601  *	Alignment must be a multiple of PAGE_SIZE.
1602  *
1603  *	Size must be >= alignment.
1604  */
1605 static void *
1606 _vmem_alloc(int ri, size_t slab_size)
1607 {
1608 	char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);
1609 	char *eaddr;
1610 	char *addr;
1611 	char *save;
1612 	uintptr_t excess;
1613 
1614 	if (slab_size < NREGIONS_SIZE)
1615 		eaddr = baddr + NREGIONS_SIZE;
1616 	else
1617 		eaddr = baddr + slab_size;
1618 
1619 	/*
1620 	 * This usually just works but might not.
1621 	 */
1622 	addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
1623 		    MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
1624 	if (addr == MAP_FAILED) {
1625 		return (NULL);
1626 	}
1627 	if (addr < baddr || addr + slab_size > eaddr) {
1628 		munmap(addr, slab_size);
1629 		return (NULL);
1630 	}
1631 
1632 	/*
1633 	 * Check alignment.  The misaligned offset is also the excess
1634 	 * amount.  If misaligned unmap the excess so we have a chance of
1635 	 * mapping at the next alignment point and recursively try again.
1636 	 *
1637 	 * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB	block alignment
1638 	 *   aaaaaaaaa aaaaaaaaaaa aa		mis-aligned allocation
1639 	 *   xxxxxxxxx				final excess calculation
1640 	 *   ^ returned address
1641 	 */
1642 	excess = (uintptr_t)addr & (slab_size - 1);
1643 	while (excess) {
1644 		excess = slab_size - excess;
1645 		save = addr;
1646 
1647 		munmap(save + excess, slab_size - excess);
1648 		addr = _vmem_alloc(ri, slab_size);
1649 		munmap(save, excess);
1650 		if (addr == NULL)
1651 			return (NULL);
1652 		if (addr < baddr || addr + slab_size > eaddr) {
1653 			munmap(addr, slab_size);
1654 			return (NULL);
1655 		}
1656 		excess = (uintptr_t)addr & (slab_size - 1);
1657 	}
1658 	return (addr);
1659 }
1660 
1661 /*
1662  * _vmem_free()
1663  *
1664  *	Free a chunk of memory allocated with _vmem_alloc()
1665  */
1666 static void
1667 _vmem_free(void *ptr, size_t size)
1668 {
1669 	munmap(ptr, size);
1670 }
1671 
1672 /*
1673  * Panic on fatal conditions
1674  */
1675 static void
1676 _mpanic(const char *ctl, ...)
1677 {
1678 	va_list va;
1679 
1680 	if (malloc_panic == 0) {
1681 		malloc_panic = 1;
1682 		va_start(va, ctl);
1683 		vfprintf(stderr, ctl, va);
1684 		fprintf(stderr, "\n");
1685 		fflush(stderr);
1686 		va_end(va);
1687 	}
1688 	abort();
1689 }
1690 
1691 __weak_reference(__malloc, malloc);
1692 __weak_reference(__calloc, calloc);
1693 __weak_reference(__posix_memalign, posix_memalign);
1694 __weak_reference(__realloc, realloc);
1695 __weak_reference(__free, free);
1696