xref: /dragonfly/lib/libc/stdlib/dmalloc.c (revision 73610d44)
1 /*
2  * DMALLOC.C	- Dillon's malloc
3  *
4  * Copyright (c) 2011 The DragonFly Project. All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 /*
37  * This module implements a modified slab allocator drop-in replacement for
38  * the libc malloc().  The slab algorithm has been adjusted to support dynamic
39  * sizing of slabs which effectively allows slabs to be used for allocations of
40  * any size.  Because of this we have neither a small-block allocator nor a
41  * big-block allocator, and the code paths are simplified to the point where
42  * allocation, caching, and freeing are screaming fast.
43  *
44  * There is very little interaction between threads.  A global depot accessed
45  * via atomic cmpxchg instructions (only! no spinlocks!) is used as a
46  * catch-all and to deal with thread exits and such.
47  *
48  * To support dynamic slab sizing available user virtual memory is broken
49  * down into ~1024 regions.  Each region has a fixed slab size whose value is
50  * set when the region is opened up for use.  The free() path simply applies
51  * a mask based on the region to the pointer to acquire the base of the
52  * governing slab structure.
53  *
54  * Regions[NREGIONS]	(1024)
55  *
56  * Slab management and locking is done on a per-zone basis.
57  *
58  *	Alloc Size	Chunking        Number of zones
59  *	0-127		8		16
60  *	128-255		16		8
61  *	256-511		32		8
62  *	512-1023	64		8
63  *	1024-2047	128		8
64  *	2048-4095	256		8
65  *	4096-8191	512		8
66  *	8192-16383	1024		8
67  *	16384-32767	2048		8
68  *	32768-65535	4096		8
69  *	... continues unlimited ...	8 zones
70  *
71  *	For a 2^63 memory space each doubling >= 64K is broken down into
72  *	8 chunking zones (CHUNKFACTOR), so we support 88 + (48 * 8) = 472 zones.
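 *
 *	For example, a 200 byte request falls in the 128-255 row: it is
 *	rounded up to a multiple of its 16 byte chunking (208 bytes) and is
 *	served from the zone sized for 208 byte chunks.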
73  *
74  *			   API FEATURES AND SIDE EFFECTS
75  *
76  *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
77  *	Above that, power-of-2 sized allocations are page-aligned.  Non
78  *	power-of-2 sized allocations are aligned the same as the chunk
79  *	size for their zone.
80  *    + malloc(0) returns a special non-NULL value
81  *    + ability to allocate arbitrarily large chunks of memory
82  *    + realloc will reuse the passed pointer if possible, within the
83  *	limitations of the zone chunking.
84  *
85  *				FUTURE FEATURES
86  *
87  *    + [better] garbage collection
88  *    + better initial sizing.
89  *
90  * TUNING
91  *
92  * The value of the environment variable MALLOC_OPTIONS is a character string
93  * containing various flags to tune nmalloc.  Upper case letters enable
94  * or increase a feature, lower case letters disable or decrease it.
95  *
96  * U		Enable UTRACE for all operations, observable with ktrace.
97  *		Disabled by default.
98  *
99  * Z		Zero out allocations, otherwise allocations (except for
100  *		calloc) will contain garbage.
101  *		Disabled by default.
102  *
103  * H		Pass a hint with madvise() about unused pages.
104  *		Disabled by default.
105  *		Not currently implemented.
106  *
107  * F		Disable local per-thread caching.
108  *		Disabled by default.
109  *
110  * C		Increase (decrease) how much excess cache to retain.
111  *		Set to 4 by default.
112  */
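
/*
 * For example (illustrative), running a program with
 *
 *	MALLOC_OPTIONS=UZ ./prog
 *
 * enables utrace records and zeroes all allocations, while a string of
 * "cc" would reduce the retained per-thread excess cache from 4 to 2.
 */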
113 
114 /* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */
115 
116 #ifndef STANDALONE_DEBUG
117 #include "libc_private.h"
118 #endif
119 
120 #include <sys/param.h>
121 #include <sys/types.h>
122 #include <sys/mman.h>
123 #include <sys/queue.h>
124 #include <sys/uio.h>
125 #include <sys/ktrace.h>
126 #include <stdio.h>
127 #include <stdint.h>
128 #include <stdlib.h>
129 #include <stdarg.h>
130 #include <stddef.h>
131 #include <unistd.h>
132 #include <string.h>
133 #include <fcntl.h>
134 #include <errno.h>
135 #include <pthread.h>
136 #include <limits.h>
137 
138 #include <machine/atomic.h>
139 #include <machine/cpufunc.h>
140 
141 #ifdef STANDALONE_DEBUG
142 void _nmalloc_thr_init(void);
143 #else
144 #include "spinlock.h"
145 #include "un-namespace.h"
146 #endif
147 
148 #ifndef MAP_SIZEALIGN
149 #define MAP_SIZEALIGN	0
150 #endif
151 
152 #if SSIZE_MAX == 0x7FFFFFFF
153 #define ADDRBITS	32
154 #define UVM_BITS	32	/* worst case */
155 #else
156 #define ADDRBITS	64
157 #define UVM_BITS	48	/* worst case XXX */
158 #endif
159 
160 #if LONG_MAX == 0x7FFFFFFF
161 #define LONG_BITS	32
162 #define LONG_BITS_SHIFT	5
163 #else
164 #define LONG_BITS	64
165 #define LONG_BITS_SHIFT	6
166 #endif
167 
168 #define LOCKEDPTR	((void *)(intptr_t)-1)
169 
170 /*
171  * Regions[]
172  */
173 #define NREGIONS_BITS	10
174 #define NREGIONS	(1 << NREGIONS_BITS)
175 #define NREGIONS_MASK	(NREGIONS - 1)
176 #define NREGIONS_SHIFT	(UVM_BITS - NREGIONS_BITS)
177 #define NREGIONS_SIZE	(1LU << NREGIONS_SHIFT)
178 
179 typedef struct region *region_t;
180 typedef struct slglobaldata *slglobaldata_t;
181 typedef struct slab *slab_t;
182 
183 struct region {
184 	uintptr_t	mask;
185 	slab_t		slab;	/* conditional out of band slab */
186 };
187 
188 static struct region Regions[NREGIONS];
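
/*
 * Regions[i].mask is 0 while the region slot is unassigned, -1 to disallow
 * use (the lowest region), and otherwise ~(slab_size - 1) for the slab
 * size assigned to that region.
 */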
189 
190 /*
191  * Number of chunking zones available
192  */
193 #define CHUNKFACTOR	8
194 #if ADDRBITS == 32
195 #define NZONES		(16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
196 #else
197 #define NZONES		(16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
198 #endif
199 
200 static int MaxChunks[NZONES];
201 
202 #define NDEPOTS		8		/* must be power of 2 */
203 
204 /*
205  * Maximum number of chunks per slab, governed by the allocation bitmap in
206  * each slab.  The maximum is reduced for large chunk sizes.
207  */
208 #define MAXCHUNKS	(LONG_BITS * LONG_BITS)
209 #define MAXCHUNKS_BITS	(LONG_BITS_SHIFT + LONG_BITS_SHIFT)	/* log2(MAXCHUNKS) */
210 #define LITSLABSIZE	(32 * 1024)
211 #define NOMSLABSIZE	(2 * 1024 * 1024)
212 #define BIGSLABSIZE	(128 * 1024 * 1024)
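
/*
 * Slab size targets used when sizing new slabs in slaballoc(): ~32K where
 * that still leaves a reasonable number of chunks, ~2MB nominal.  Slabs
 * larger than ~128MB are never cached in the per-thread lists by memfree().
 */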
213 
214 #define ZALLOC_SLAB_MAGIC	0x736c6162	/* magic sanity */
215 
216 TAILQ_HEAD(slab_list, slab);
217 
218 /*
219  * A slab structure
220  */
221 struct slab {
222 	struct slab	*next;		/* slabs with available space */
223 	TAILQ_ENTRY(slab) entry;
224 	int32_t		magic;		/* magic number for sanity check */
225 	u_int		navail;		/* number of free elements available */
226 	u_int		nmax;
227 	u_int		free_bit;	/* free hint bitno */
228 	u_int		free_index;	/* free hint index */
229 	u_long		bitmap[LONG_BITS]; /* free chunks */
230 	size_t		slab_size;	/* size of entire slab */
231 	size_t		chunk_size;	/* chunk size for validation */
232 	int		zone_index;
233 	enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
234 	int		flags;
235 	region_t	region;		/* related region */
236 	char		*chunks;	/* chunk base */
237 	slglobaldata_t	slgd;
238 };
239 
240 /*
241  * per-thread data
242  */
243 struct slglobaldata {
244 	struct zoneinfo {
245 		slab_t	avail_base;
246 		slab_t	empty_base;
247 		int	best_region;
248 		int	empty_count;
249 	} zone[NZONES];
250 	struct slab_list full_zones;		/* via entry */
251 	int		masked;
252 	int		biggest_index;
253 	size_t		nslabs;
254 	slglobaldata_t	sldepot;
255 };
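
/*
 * Each thread has its own slglobaldata (the TLS variable slglobal below).
 * Exiting threads hand their slabs to one of NDEPOTS shared depots, which
 * are accessed lock-free with atomic_cmpset_ptr().
 */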
256 
257 #define SLAB_ZEROD		0x0001
258 
259 /*
260  * Misc constants.
263  */
264 #define MIN_CHUNK_SIZE		8		/* in bytes */
265 #define MIN_CHUNK_MASK		(MIN_CHUNK_SIZE - 1)
266 
267 #define SAFLAG_ZERO	0x00000001
268 
269 /*
270  * The WEIRD_ADDR is used as known text to copy into free objects to
271  * try to create deterministic failure cases if the data is accessed after
272  * free.
273  *
274  * WARNING: A limited number of spinlocks are available, BIGXSIZE should
275  *	    not be larger then 64.
276  */
277 #define WEIRD_ADDR      0xdeadc0de
278 #define MAX_COPY        sizeof(weirdary)
279 
280 #define arysize(ary)	(sizeof(ary)/sizeof((ary)[0]))
281 
282 /*
283  * Thread control
284  */
285 
286 #define MASSERT(exp)	do { if (__predict_false(!(exp)))	\
287 				_mpanic("assertion: %s in %s",	\
288 				#exp, __func__);		\
289 			    } while (0)
290 
291 /* With this attribute set, accessing this variable does not require a
292  * function call even when the code is compiled with -fPIC. */
293 #define TLS_ATTRIBUTE __attribute__ ((tls_model ("initial-exec")))
294 
295 static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
296 static pthread_key_t thread_malloc_key;
297 static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
298 static struct slglobaldata sldepots[NDEPOTS];
299 
300 static int opt_madvise = 0;
301 static int opt_free = 0;
302 static int opt_cache = 4;
303 static int opt_utrace = 0;
304 static int g_malloc_flags = 0;
305 static int malloc_panic;
306 static int malloc_started = 0;
307 
308 static const int32_t weirdary[16] = {
309 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
310 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
311 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
312 	WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
313 };
314 
315 static void *memalloc(size_t size, int flags);
316 static void *memrealloc(void *ptr, size_t size);
317 static void memfree(void *ptr, int);
318 static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
319 static void slabfree(slab_t slab);
320 static void slabterm(slglobaldata_t slgd, slab_t slab);
321 static void *_vmem_alloc(int ri, size_t slab_size);
322 static void _vmem_free(void *ptr, size_t slab_size);
323 static void _mpanic(const char *ctl, ...) __printflike(1, 2);
325 static void malloc_init(void) __constructor(101);
329 
330 
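/*
 * Record written with utrace(2) when the 'U' option is enabled: p is the
 * input pointer (free/realloc), s the requested size and r the returned
 * pointer.  The records are observable with ktrace.
 */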
331 struct nmalloc_utrace {
332 	void *p;
333 	size_t s;
334 	void *r;
335 };
336 
#define UTRACE(a, b, c)						\
	do {							\
		if (opt_utrace) {				\
			struct nmalloc_utrace ut = {		\
				.p = (a),			\
				.s = (b),			\
				.r = (c)			\
			};					\
			utrace(&ut, sizeof(ut));		\
		}						\
	} while (0)
346 
347 #ifdef INVARIANTS
348 /*
349  * If enabled any memory allocated without M_ZERO is initialized to -1.
350  */
351 static int  use_malloc_pattern;
352 #endif
353 
354 static void
355 malloc_init(void)
356 {
357 	const char *p = NULL;
358 	static spinlock_t malloc_init_lock;
359 
360 	if (malloc_started)
361 		return;
362 
363 	if (__isthreaded) {
364 		_SPINLOCK(&malloc_init_lock);
365 		if (malloc_started) {
366 			_SPINUNLOCK(&malloc_init_lock);
367 			return;
368 		}
369 	}
370 
371 	Regions[0].mask = -1; /* disallow activity in lowest region */
372 
373 	if (issetugid() == 0)
374 		p = getenv("MALLOC_OPTIONS");
375 
376 	for (; p != NULL && *p != '\0'; p++) {
377 		switch(*p) {
378 		case 'u':
379 			opt_utrace = 0;
380 			break;
381 		case 'U':
382 			opt_utrace = 1;
383 			break;
384 		case 'h':
385 			opt_madvise = 0;
386 			break;
387 		case 'H':
388 			opt_madvise = 1;
389 			break;
390 		case 'c':
391 			if (opt_cache > 0)
392 				--opt_cache;
393 			break;
394 		case 'C':
395 			++opt_cache;
396 			break;
397 		case 'f':
398 			opt_free = 0;
399 			break;
400 		case 'F':
401 			opt_free = 1;
402 			break;
403 		case 'z':
404 			g_malloc_flags = 0;
405 			break;
406 		case 'Z':
407 			g_malloc_flags = SAFLAG_ZERO;
408 			break;
409 		default:
410 			break;
411 		}
412 	}
413 
414 	UTRACE((void *) -1, 0, NULL);
415 	_nmalloc_thr_init();
416 	malloc_started = 1;
417 
418 	if (__isthreaded)
419 		_SPINUNLOCK(&malloc_init_lock);
420 }
421 
422 /*
423  * We have to install a handler for nmalloc thread teardowns when
424  * the thread is created.  We cannot delay this because destructors in
425  * sophisticated userland programs can call malloc() for the first time
426  * during their thread exit.
427  *
428  * This routine is called directly from pthreads.
429  */
430 static void _nmalloc_thr_init_once(void);
431 static void _nmalloc_thr_destructor(void *thrp);
432 
433 void
434 _nmalloc_thr_init(void)
435 {
436 	static int did_init;
437 	static int SLGI;
438 	int slgi;
439 
440 	slgi = SLGI++;
441 	cpu_ccfence();
442 	TAILQ_INIT(&slglobal.full_zones);
443 	slglobal.sldepot = &sldepots[slgi & (NDEPOTS - 1)];
444 
445 	if (slglobal.masked)
446 		return;
447 
448 	slglobal.masked = 1;
449 	if (did_init == 0) {
450 		did_init = 1;
451 		pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
452 	}
453 	pthread_setspecific(thread_malloc_key, &slglobal);
454 	slglobal.masked = 0;
455 }
456 
457 /*
458  * Called just once
459  */
460 static void
461 _nmalloc_thr_init_once(void)
462 {
463 	int error;
464 
465 	error = pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
466 	if (error)
467 		abort();
468 }
469 
470 /*
471  * Called for each thread undergoing exit
472  *
473  * Move all of the thread's slabs into a depot.
474  */
475 static void
476 _nmalloc_thr_destructor(void *thrp)
477 {
478 	slglobaldata_t slgd = thrp;
479 	slab_t slab;
480 	int i;
481 
482 	slgd->masked = 1;
483 
484 	for (i = 0; i <= slgd->biggest_index; i++) {
485 		while ((slab = slgd->zone[i].empty_base) != NULL) {
486 			slgd->zone[i].empty_base = slab->next;
487 			slabterm(slgd, slab);
488 		}
489 
490 		while ((slab = slgd->zone[i].avail_base) != NULL) {
491 			slgd->zone[i].avail_base = slab->next;
492 			slabterm(slgd, slab);
493 		}
494 
495 		while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
496 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
497 			slabterm(slgd, slab);
498 		}
499 	}
500 }
501 
502 /*
503  * Calculate the zone index for the allocation request size and round the
504  * request size up to that zone's chunking.
505  */
506 static __inline int
507 zoneindex(size_t *bytes, size_t *chunking)
508 {
509 	size_t n = (size_t)*bytes;
510 	size_t x;
511 	size_t c;
512 	int i;
513 
514 	if (n < 128) {
515 		*bytes = n = (n + 7) & ~7;
516 		*chunking = 8;
517 		return(n / 8);			/* 8 byte chunks, 16 zones */
518 	}
519 	if (n < 4096) {
520 		x = 256;
521 		c = x / (CHUNKFACTOR * 2);
522 		i = 16;
523 	} else {
524 		x = 8192;
525 		c = x / (CHUNKFACTOR * 2);
526 		i = 16 + CHUNKFACTOR * 5;  /* 256->512,1024,2048,4096,8192 */
527 	}
528 	while (n >= x) {
529 		x <<= 1;
530 		c <<= 1;
531 		i += CHUNKFACTOR;
532 		if (x == 0)
533 			_mpanic("slaballoc: byte value too high");
534 	}
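	/*
	 * Here i is the first zone index of the current doubling and n/c
	 * ranges from CHUNKFACTOR to 2*CHUNKFACTOR, so each doubling
	 * contributes CHUNKFACTOR zones.
	 */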
535 	*bytes = n = roundup2(n, c);
536 	*chunking = c;
537 	return (i + n / c - CHUNKFACTOR);
538 #if 0
539 	*bytes = n = (n + c - 1) & ~(c - 1);
540 	*chunking = c;
541 	return (n / c + i);
542 
543 	if (n < 256) {
544 		*bytes = n = (n + 15) & ~15;
545 		*chunking = 16;
546 		return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);
547 	}
548 	if (n < 8192) {
549 		if (n < 512) {
550 			*bytes = n = (n + 31) & ~31;
551 			*chunking = 32;
552 			return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);
553 		}
554 		if (n < 1024) {
555 			*bytes = n = (n + 63) & ~63;
556 			*chunking = 64;
557 			return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);
558 		}
559 		if (n < 2048) {
560 			*bytes = n = (n + 127) & ~127;
561 			*chunking = 128;
562 			return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);
563 		}
564 		if (n < 4096) {
565 			*bytes = n = (n + 255) & ~255;
566 			*chunking = 256;
567 			return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);
568 		}
569 		*bytes = n = (n + 511) & ~511;
570 		*chunking = 512;
571 		return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);
572 	}
573 	if (n < 16384) {
574 		*bytes = n = (n + 1023) & ~1023;
575 		*chunking = 1024;
576 		return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);
577 	}
578 	if (n < 32768) {				/* 16384-32767 */
579 		*bytes = n = (n + 2047) & ~2047;
580 		*chunking = 2048;
581 		return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
582 	}
583 	if (n < 65536) {
584 		*bytes = n = (n + 4095) & ~4095;	/* 32768-65535 */
585 		*chunking = 4096;
586 		return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);
587 	}
588 
589 	x = 131072;
590 	c = 8192;
591 	i = CHUNKINGLO*10 - 1;
592 
593 	while (n >= x) {
594 		x <<= 1;
595 		c <<= 1;
596 		i += CHUNKINGHI;
597 		if (x == 0)
598 			_mpanic("slaballoc: byte value too high");
599 	}
600 	*bytes = n = (n + c - 1) & ~(c - 1);
601 	*chunking = c;
602 	return (n / c + i);
603 #endif
604 }
605 
606 /*
607  * malloc() - call internal slab allocator
608  */
609 void *
610 malloc(size_t size)
611 {
612 	void *ptr;
613 
614 	ptr = memalloc(size, 0);
615 	if (ptr == NULL)
616 		errno = ENOMEM;
617 	else
618 		UTRACE(0, size, ptr);
619 	return(ptr);
620 }
621 
622 /*
623  * calloc() - call internal slab allocator
624  */
625 void *
626 calloc(size_t number, size_t size)
627 {
628 	void *ptr;
629 
	/* guard against number * size overflowing a size_t */
	if (number != 0 && SIZE_MAX / number < size) {
		errno = ENOMEM;
		return(NULL);
	}
	ptr = memalloc(number * size, SAFLAG_ZERO);
631 	if (ptr == NULL)
632 		errno = ENOMEM;
633 	else
634 		UTRACE(0, number * size, ptr);
635 	return(ptr);
636 }
637 
638 /*
639  * realloc() (SLAB ALLOCATOR)
640  *
641  * We do not attempt to optimize this routine beyond reusing the same
642  * pointer if the new size fits within the chunking of the old pointer's
643  * zone.
644  */
645 void *
646 realloc(void *ptr, size_t size)
647 {
648 	void *ret;
649 
650 	if (ptr == NULL)
651 		ret = memalloc(size, 0);
652 	else
653 		ret = memrealloc(ptr, size);
654 	if (ret == NULL)
655 		errno = ENOMEM;
656 	else
657 		UTRACE(ptr, size, ret);
658 	return(ret);
659 }
660 
661 /*
662  * posix_memalign()
663  *
664  * Allocate (size) bytes with a alignment of (alignment), where (alignment)
665  * is a power of 2 >= sizeof(void *).
666  *
667  * The slab allocator will allocate on power-of-2 boundaries up to at least
668  * PAGE_SIZE.  Otherwise we use the zoneindex mechanic to find a zone
669  * matching the requirements.
670  */
671 int
672 posix_memalign(void **memptr, size_t alignment, size_t size)
673 {
674 	/*
675 	 * OpenGroup spec issue 6 checks
676 	 */
677 	if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
678 		*memptr = NULL;
679 		return(EINVAL);
680 	}
681 	if (alignment < sizeof(void *)) {
682 		*memptr = NULL;
683 		return(EINVAL);
684 	}
685 
686 	/*
687 	 * XXX for now just find the nearest power of 2 >= size and also
688 	 * >= alignment and allocate that.
689 	 */
690 	while (alignment < size) {
691 		alignment <<= 1;
692 		if (alignment == 0)
693 			_mpanic("posix_memalign: byte value too high");
694 	}
695 	*memptr = memalloc(alignment, 0);
696 	return(*memptr ? 0 : ENOMEM);
697 }
698 
699 /*
700  * free() (SLAB ALLOCATOR) - do the obvious
701  */
702 void
703 free(void *ptr)
704 {
705 	if (ptr) {
706 		UTRACE(ptr, 0, 0);
707 		memfree(ptr, 0);
708 	}
709 }
710 
711 /*
712  * memalloc()	(SLAB ALLOCATOR)
713  *
714  *	Allocate memory via the slab allocator.
715  */
716 static void *
717 memalloc(size_t size, int flags)
718 {
719 	slglobaldata_t slgd;
720 	struct zoneinfo *zinfo;
721 	slab_t slab;
722 	size_t chunking;
723 	int bmi;
724 	int bno;
725 	u_long *bmp;
726 	int zi;
727 #ifdef INVARIANTS
728 	int i;
729 #endif
730 	size_t off;
731 	char *obj;
732 
733 	if (!malloc_started)
734 		malloc_init();
735 
736 	/*
737 	 * If 0 bytes is requested we still have to return a unique pointer,
738 	 * so allocate at least one byte.
739 	 */
740 	if (size == 0)
741 		size = 1;
742 
743 	/* Capture global flags */
744 	flags |= g_malloc_flags;
745 
746 	/* Compute allocation zone; zoneindex will panic on excessive sizes */
747 	zi = zoneindex(&size, &chunking);
748 	MASSERT(zi < NZONES);
749 	if (size == 0)
750 		return(NULL);
751 
752 	/*
753 	 * Locate a slab with available space.  If no slabs are available,
754 	 * back off to the empty list, and if we still come up dry allocate
755 	 * a new slab (which will try the depot first).
756 	 */
757 retry:
758 	slgd = &slglobal;
759 	zinfo = &slgd->zone[zi];
760 	if ((slab = zinfo->avail_base) == NULL) {
761 		if ((slab = zinfo->empty_base) == NULL) {
762 			/*
763 			 * Still dry
764 			 */
765 			slab = slaballoc(zi, chunking, size);
766 			if (slab == NULL)
767 				return(NULL);
768 			slab->next = zinfo->avail_base;
769 			zinfo->avail_base = slab;
770 			slab->state = AVAIL;
771 			if (slgd->biggest_index < zi)
772 				slgd->biggest_index = zi;
773 			++slgd->nslabs;
774 		} else {
775 			/*
776 			 * Pulled from empty list
777 			 */
778 			zinfo->empty_base = slab->next;
779 			slab->next = zinfo->avail_base;
780 			zinfo->avail_base = slab;
781 			slab->state = AVAIL;
782 			--zinfo->empty_count;
783 		}
784 	}
785 
786 	/*
787 	 * Allocate a chunk out of the slab.  HOT PATH
788 	 *
789 	 * Only the thread owning the slab can allocate out of it.
790 	 *
791 	 * NOTE: The last bit in the bitmap is always marked allocated so
792 	 *	 we cannot overflow here.
793 	 */
794 	bno = slab->free_bit;
795 	bmi = slab->free_index;
796 	bmp = &slab->bitmap[bmi];
797 	if (*bmp & (1LU << bno)) {
798 		atomic_clear_long(bmp, 1LU << bno);
799 		obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
800 		slab->free_bit = (bno + 1) & (LONG_BITS - 1);
801 		atomic_add_int(&slab->navail, -1);
802 		if (flags & SAFLAG_ZERO)
803 			bzero(obj, size);
804 		return (obj);
805 	}
806 
807 	/*
808 	 * Allocate a chunk out of a slab.  COLD PATH
809 	 */
810 	if (slab->navail == 0) {
811 		zinfo->avail_base = slab->next;
812 		slab->state = FULL;
813 		TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
814 		goto retry;
815 	}
816 
817 	while (bmi < LONG_BITS) {
818 		bmp = &slab->bitmap[bmi];
819 		if (*bmp) {
820 			bno = bsflong(*bmp);
821 			atomic_clear_long(bmp, 1LU << bno);
822 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
823 					     size;
824 			slab->free_index = bmi;
825 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
826 			atomic_add_int(&slab->navail, -1);
827 			if (flags & SAFLAG_ZERO)
828 				bzero(obj, size);
829 			return (obj);
830 		}
831 		++bmi;
832 	}
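	/*
	 * Wrap around and rescan from the beginning of the bitmap to pick
	 * up any free bits below the cached free_index hint.
	 */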
833 	bmi = 0;
834 	while (bmi < LONG_BITS) {
835 		bmp = &slab->bitmap[bmi];
836 		if (*bmp) {
837 			bno = bsflong(*bmp);
838 			atomic_clear_long(bmp, 1LU << bno);
839 			obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
840 					     size;
841 			slab->free_index = bmi;
842 			slab->free_bit = (bno + 1) & (LONG_BITS - 1);
843 			atomic_add_int(&slab->navail, -1);
844 			if (flags & SAFLAG_ZERO)
845 				bzero(obj, size);
846 			return (obj);
847 		}
848 		++bmi;
849 	}
850 	_mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
851 	/* not reached */
852 	return NULL;
853 }
854 
855 /*
856  * Reallocate memory within the chunk
857  */
858 static void *
859 memrealloc(void *ptr, size_t nsize)
860 {
861 	region_t region;
862 	slab_t slab;
863 	size_t osize;
864 	char *obj;
865 	int flags = 0;
866 
867 	/*
868 	 * If 0 bytes is requested we still have to return a unique pointer,
869 	 * so allocate at least one byte.
870 	 */
871 	if (nsize == 0)
872 		nsize = 1;
873 
874 	/* Capture global flags */
875 	flags |= g_malloc_flags;
876 
877 	/*
878 	 * Locate the zone by looking up the dynamic slab size mask based
879 	 * on the memory region the allocation resides in.
880 	 */
881 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
882 	if ((slab = region->slab) == NULL)
883 		slab = (void *)((uintptr_t)ptr & region->mask);
884 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
885 	osize = slab->chunk_size;
886 	if (nsize <= osize) {
887 		if (osize < 32 || nsize >= osize / 2) {
888 			obj = ptr;
889 			if ((flags & SAFLAG_ZERO) && nsize < osize)
890 				bzero(obj + nsize, osize - nsize);
891 			return(obj);
892 		}
893 	}
894 
895 	/*
896 	 * Otherwise resize the object
897 	 */
898 	obj = memalloc(nsize, 0);
899 	if (obj) {
900 		if (nsize > osize)
901 			nsize = osize;
902 		bcopy(ptr, obj, nsize);
903 		memfree(ptr, 0);
904 	}
905 	return (obj);
906 }
907 
908 /*
909  * free (SLAB ALLOCATOR)
910  *
911  * Free a memory block previously allocated by malloc.
912  *
913  * MPSAFE
914  */
915 static void
916 memfree(void *ptr, int flags)
917 {
918 	region_t region;
919 	slglobaldata_t slgd;
920 	slab_t slab;
921 	slab_t stmp;
922 	slab_t *slabp;
923 	char *obj;
924 	int bmi;
925 	int bno;
926 	u_long *bmp;
927 
928 	/*
929 	 * Locate the zone by looking up the dynamic slab size mask based
930 	 * on the memory region the allocation resides in.
931 	 *
932 	 * WARNING!  The slab may be owned by another thread!
933 	 */
934 	region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
935 	if ((slab = region->slab) == NULL)
936 		slab = (void *)((uintptr_t)ptr & region->mask);
937 	MASSERT(slab != NULL);
938 	MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
939 
940 #ifdef INVARIANTS
941 	/*
942 	 * Put weird data into the memory to detect modifications after
943 	 * freeing, illegal pointer use after freeing (we should fault on
944 	 * the odd address), and so forth.
945 	 */
946 	if (slab->chunk_size < sizeof(weirdary))
947 		bcopy(weirdary, ptr, slab->chunk_size);
948 	else
949 		bcopy(weirdary, ptr, sizeof(weirdary));
950 #endif
951 
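	/*
	 * Convert the chunk address into a bitmap word index (bmi) and bit
	 * number (bno) within the slab's allocation bitmap.
	 */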
952 	bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
953 	bmi = bno >> LONG_BITS_SHIFT;
954 	bno &= (LONG_BITS - 1);
955 	bmp = &slab->bitmap[bmi];
956 
957 	MASSERT(bmi >= 0 && bmi < slab->nmax);
958 	MASSERT((*bmp & (1LU << bno)) == 0);
959 	atomic_set_long(bmp, 1LU << bno);
960 	atomic_add_int(&slab->navail, 1);
961 
962 	/*
963 	 * We can only do the following if we own the slab
964 	 */
965 	slgd = &slglobal;
966 	if (slab->slgd == slgd) {
967 		struct zoneinfo *zinfo;
968 
969 		if (slab->free_index > bmi) {
970 			slab->free_index = bmi;
971 			slab->free_bit = bno;
972 		} else if (slab->free_index == bmi &&
973 			   slab->free_bit > bno) {
974 			slab->free_bit = bno;
975 		}
976 		zinfo = &slgd->zone[slab->zone_index];
977 
978 		/*
979 		 * Freeing an object from a full slab will move it to the
980 		 * available list.  If the available list already has a
981 		 * slab we terminate the full slab instead, moving it to
982 		 * the depot.
983 		 */
984 		if (slab->state == FULL) {
985 			TAILQ_REMOVE(&slgd->full_zones, slab, entry);
986 			if (zinfo->avail_base == NULL) {
987 				slab->state = AVAIL;
988 				stmp = zinfo->avail_base;
989 				slab->next = stmp;
990 				zinfo->avail_base = slab;
991 			} else {
992 				slabterm(slgd, slab);
993 				goto done;
994 			}
995 		}
996 
997 		/*
998 		 * If the slab becomes completely empty dispose of it in
999 		 * some manner.  By default each thread caches up to 4
1000 		 * empty slabs.  Only small slabs are cached.
1001 		 */
1002 		if (slab->navail == slab->nmax && slab->state == AVAIL) {
1003 			/*
1004 			 * Remove slab from available queue
1005 			 */
1006 			slabp = &zinfo->avail_base;
1007 			while ((stmp = *slabp) != slab)
1008 				slabp = &stmp->next;
1009 			*slabp = slab->next;
1010 
1011 			if (opt_free || opt_cache == 0) {
1012 				/*
1013 				 * If local caching is disabled cache the
1014 				 * slab in the depot (or free it).
1015 				 */
1016 				slabterm(slgd, slab);
1017 			} else if (slab->slab_size > BIGSLABSIZE) {
1018 				/*
1019 				 * We do not try to retain large slabs
1020 				 * in per-thread caches.
1021 				 */
1022 				slabterm(slgd, slab);
1023 			} else if (zinfo->empty_count > opt_cache) {
1024 				/*
1025 				 * We have too many slabs cached, but
1026 				 * instead of freeing this one free
1027 				 * an empty slab that's been idle longer.
1028 				 *
1029 				 * (empty_count does not change)
1030 				 */
1031 				stmp = zinfo->empty_base;
1032 				slab->state = EMPTY;
1033 				slab->next = stmp->next;
1034 				zinfo->empty_base = slab;
1035 				slabterm(slgd, stmp);
1036 			} else {
1037 				/*
1038 				 * Cache the empty slab in our thread local
1039 				 * empty list.
1040 				 */
1041 				++zinfo->empty_count;
1042 				slab->state = EMPTY;
1043 				slab->next = zinfo->empty_base;
1044 				zinfo->empty_base = slab;
1045 			}
1046 		}
1047 	}
1048 done:
1049 	;
1050 }
1051 
1052 /*
1053  * Allocate a new slab holding objects of size chunk_size.
1054  */
1055 static slab_t
1056 slaballoc(int zi, size_t chunking, size_t chunk_size)
1057 {
1058 	slglobaldata_t slgd;
1059 	slglobaldata_t sldepot;
1060 	struct zoneinfo *zinfo;
1061 	region_t region;
1062 	void *save;
1063 	slab_t slab;
1064 	slab_t stmp;
1065 	size_t slab_desire;
1066 	size_t slab_size;
1067 	size_t region_mask;
1068 	uintptr_t chunk_offset;
1069 	ssize_t maxchunks;
1070 	ssize_t tmpchunks;
1071 	int ispower2;
1072 	int power;
1073 	int ri;
1074 	int rx;
1075 	int nswath;
1076 	int j;
1077 
1078 	/*
1079 	 * First look in the depot.  Any given zone in the depot may be
1080 	 * locked by setting its head to LOCKEDPTR (-1).  We lock the head
1081 	 * instead of simply removing the entire chain because doing so can
1082 	 * cause racing threads to allocate local slabs for large objects,
1083 	 * resulting in a large VSZ.
1084 	 */
1085 	slgd = &slglobal;
1086 	sldepot = slgd->sldepot;
1087 	zinfo = &sldepot->zone[zi];
1088 
1089 	while ((slab = zinfo->avail_base) != NULL) {
1090 		if ((void *)slab == LOCKEDPTR) {
1091 			cpu_pause();
1092 			continue;
1093 		}
1094 		if (atomic_cmpset_ptr(&zinfo->avail_base, slab, LOCKEDPTR)) {
1095 			MASSERT(slab->slgd == NULL);
1096 			slab->slgd = slgd;
1097 			zinfo->avail_base = slab->next;
1098 			return(slab);
1099 		}
1100 	}
1101 
1102 	/*
1103 	 * Nothing in the depot, allocate a new slab by locating or assigning
1104 	 * a region and then using the system virtual memory allocator.
1105 	 */
1106 	slab = NULL;
1107 
1108 	/*
1109 	 * Calculate the start of the data chunks relative to the start
1110 	 * of the slab.
1111 	 */
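	/*
	 * (x ^ (x - 1)) == (x << 1) - 1 holds only when x is a power of 2,
	 * so this selects the power-of-2 alignment path for the chunk base.
	 */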
1112 	if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
1113 		ispower2 = 1;
1114 		chunk_offset = roundup2(sizeof(*slab), chunk_size);
1115 	} else {
1116 		ispower2 = 0;
1117 		chunk_offset = sizeof(*slab) + chunking - 1;
1118 		chunk_offset -= chunk_offset % chunking;
1119 	}
1120 
1121 	/*
1122 	 * Calculate a reasonable number of chunks for the slab.
1123 	 *
1124 	 * Once initialized the MaxChunks[] array can only ever be
1125 	 * reinitialized to the same value.
1126 	 */
1127 	maxchunks = MaxChunks[zi];
1128 	if (maxchunks == 0) {
1129 		/*
1130 		 * First calculate how many chunks would fit in 1/1024
1131 		 * of the region size (NREGIONS_SIZE).  This is about 4MB on
1132 		 * a 32 bit system and 256G on a 64-bit (48-bit UVM) system.
1133 		 */
1134 		maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
1135 			    (ssize_t)chunk_size;
1136 		if (maxchunks <= 0)
1137 			maxchunks = 1;
1138 
1139 		/*
1140 		 * A slab cannot handle more than MAXCHUNKS chunks, but
1141 		 * limit us to approximately MAXCHUNKS / 2 here because
1142 		 * we may have to expand maxchunks when we calculate the
1143 		 * actual power-of-2 slab.
1144 		 */
1145 		if (maxchunks > MAXCHUNKS / 2)
1146 			maxchunks = MAXCHUNKS / 2;
1147 
1148 		/*
1149 		 * Try to limit the slabs to BIGSLABSIZE (~128MB).  Larger
1150 		 * slabs will be created if the allocation does not fit.
1151 		 */
1152 		if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
1153 			tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
1154 				    (ssize_t)chunk_size;
1155 			if (tmpchunks <= 0)
1156 				tmpchunks = 1;
1157 			if (maxchunks > tmpchunks)
1158 				maxchunks = tmpchunks;
1159 		}
1160 
1161 		/*
1162 		 * If the slab calculates to greater than 2MB see if we
1163 		 * can cut it down to ~2MB.  This controls VSZ but has
1164 		 * no effect on run-time size or performance.
1165 		 *
1166 		 * This is very important in case you core dump and also
1167 		 * important to reduce unnecessary region allocations.
1168 		 */
1169 		if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
1170 			tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
1171 				    (ssize_t)chunk_size;
1172 			if (tmpchunks < 1)
1173 				tmpchunks = 1;
1174 			if (maxchunks > tmpchunks)
1175 				maxchunks = tmpchunks;
1176 		}
1177 
1178 		/*
1179 		 * If the slab calculates to greater than LITSLABSIZE (~32K)
1180 		 * see if we can cut it down to ~32K while still maintaining a
1181 		 * reasonably large number of chunks in each slab.  This
1182 		 * controls VSZ but has no effect on run-time size or
1183 		 * performance.
1184 		 *
1185 		 * This is very important in case you core dump and also
1186 		 * important to reduce unnecessary region allocations.
1187 		 */
1188 		if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
1189 			tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
1190 				    (ssize_t)chunk_size;
1191 			if (tmpchunks < 32)
1192 				tmpchunks = 32;
1193 			if (maxchunks > tmpchunks)
1194 				maxchunks = tmpchunks;
1195 		}
1196 
1197 		MaxChunks[zi] = maxchunks;
1198 	}
1199 	MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);
1200 
1201 	/*
1202 	 * Calculate the actual slab size.  maxchunks will be recalculated
1203 	 * a little later.
1204 	 */
1205 	slab_desire = chunk_offset + chunk_size * maxchunks;
1206 	slab_size = 8 * MAXCHUNKS;
1207 	power = 3 + MAXCHUNKS_BITS;
1208 	while (slab_size < slab_desire) {
1209 		slab_size <<= 1;
1210 		++power;
1211 	}
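	/* power now tracks log2(slab_size) for the power-of-4 trim below */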
1212 
1213 	/*
1214 	 * Do a quick recalculation based on the actual slab size but not
1215 	 * yet dealing with whether the slab header is in-band or out-of-band.
1216 	 * The purpose here is to see if we can reasonably reduce slab_size
1217 	 * to a power of 4 to allow more chunk sizes to use the same slab
1218 	 * size.
1219 	 */
1220 	if ((power & 1) && slab_size > 32768) {
1221 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1222 		if (maxchunks >= MAXCHUNKS / 8) {
1223 			slab_size >>= 1;
1224 			--power;
1225 		}
1226 	}
1227 	if ((power & 2) && slab_size > 32768 * 4) {
1228 		maxchunks = (slab_size - chunk_offset) / chunk_size;
1229 		if (maxchunks >= MAXCHUNKS / 4) {
1230 			slab_size >>= 2;
1231 			power -= 2;
1232 		}
1233 	}
1234 	/*
1235 	 * This case occurs when the slab_size is larger than 1/1024 available
1236 	 * UVM.
1237 	 */
1238 	nswath = slab_size / NREGIONS_SIZE;
1239 	if (nswath > NREGIONS)
1240 		return (NULL);
1241 
1242 
1243 	/*
1244 	 * Try to allocate from our current best region for this zi
1245 	 */
1246 	region_mask = ~(slab_size - 1);
1247 	ri = slgd->zone[zi].best_region;
1248 	if (Regions[ri].mask == region_mask) {
1249 		if ((slab = _vmem_alloc(ri, slab_size)) != NULL)
1250 			goto found;
1251 	}
1252 
1253 	/*
1254 	 * Try to find an existing region to allocate from.  The normal
1255 	 * case will be for allocations that are less than 1/1024 available
1256 	 * UVM, which fit into a single Regions[] entry.
1257 	 */
1258 	while (slab_size <= NREGIONS_SIZE) {
1259 		rx = -1;
1260 		for (ri = 0; ri < NREGIONS; ++ri) {
1261 			if (rx < 0 && Regions[ri].mask == 0)
1262 				rx = ri;
1263 			if (Regions[ri].mask == region_mask) {
1264 				slab = _vmem_alloc(ri, slab_size);
1265 				if (slab) {
1266 					slgd->zone[zi].best_region = ri;
1267 					goto found;
1268 				}
1269 			}
1270 		}
1271 
1272 		if (rx < 0)
1273 			return(NULL);
1274 
1275 		/*
1276 		 * This can fail, retry either way
1277 		 */
1278 		atomic_cmpset_ptr((void **)&Regions[rx].mask,
1279 				  NULL,
1280 				  (void *)region_mask);
1281 	}
1282 
1283 	for (;;) {
1284 		rx = -1;
1285 		for (ri = 0; ri < NREGIONS; ri += nswath) {
1286 			if (Regions[ri].mask == region_mask) {
1287 				slab = _vmem_alloc(ri, slab_size);
1288 				if (slab) {
1289 					slgd->zone[zi].best_region = ri;
1290 					goto found;
1291 				}
1292 			}
1293 			if (rx < 0) {
1294 				for (j = nswath - 1; j >= 0; --j) {
1295 					if (Regions[ri+j].mask != 0)
1296 						break;
1297 				}
1298 				if (j < 0)
1299 					rx = ri;
1300 			}
1301 		}
1302 
1303 		/*
1304 		 * We found a candidate, try to allocate it backwards so
1305 		 * another thread racing a slaballoc() does not see the
1306 		 * mask in the base index position until we are done.
1307 		 *
1308 		 * We can safely zero-out any partial allocations because
1309 		 * the mask is only accessed from the base index.  Any other
1310 		 * threads racing us will fail prior to us clearing the mask.
1311 		 */
1312 		if (rx < 0)
1313 			return(NULL);
1314 		for (j = nswath - 1; j >= 0; --j) {
1315 			if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
1316 					       NULL, (void *)region_mask)) {
1317 				while (++j < nswath)
1318 					Regions[rx+j].mask = 0;
1319 				break;
1320 			}
1321 		}
1322 		/* retry */
1323 	}
1324 
1325 	/*
1326 	 * Fill in the new slab in region ri.  If the slab_size completely
1327 	 * fills one or more region slots we move the slab structure out of
1328 	 * band which should optimize the chunking (particularly for a power
1329 	 * of 2).
1330 	 */
1331 found:
1332 	region = &Regions[ri];
1333 	MASSERT(region->slab == NULL);
1334 	if (slab_size >= NREGIONS_SIZE) {
1335 		save = slab;
1336 		slab = memalloc(sizeof(*slab), 0);
1337 		bzero(slab, sizeof(*slab));
1338 		slab->chunks = save;
1339 		for (j = 0; j < nswath; ++j)
1340 			region[j].slab = slab;
1341 		chunk_offset = 0;
1342 	} else {
1343 		bzero(slab, sizeof(*slab));
1344 		slab->chunks = (char *)slab + chunk_offset;
1345 	}
1346 
1347 	/*
1348 	 * Calculate the start of the chunks memory and recalculate the
1349 	 * actual number of chunks the slab can hold.
1350 	 */
1351 	maxchunks = (slab_size - chunk_offset) / chunk_size;
1352 	if (maxchunks > MAXCHUNKS)
1353 		maxchunks = MAXCHUNKS;
1354 
1355 	/*
1356 	 * And fill in the rest
1357 	 */
1358 	slab->magic = ZALLOC_SLAB_MAGIC;
1359 	slab->navail = maxchunks;
1360 	slab->nmax = maxchunks;
1361 	slab->slab_size = slab_size;
1362 	slab->chunk_size = chunk_size;
1363 	slab->zone_index = zi;
1364 	slab->slgd = slgd;
1365 	slab->state = UNKNOWN;
1366 	slab->region = region;
1367 
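	/*
	 * Mark all chunks free: whole bitmap words are set to ULONG_MAX and
	 * any final partial word gets only its low (maxchunks - ri) bits set.
	 */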
1368 	for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
1369 		if (ri + LONG_BITS <= maxchunks)
1370 			slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
1371 		else
1372 			slab->bitmap[ri >> LONG_BITS_SHIFT] =
1373 						(1LU << (maxchunks - ri)) - 1;
1374 	}
1375 	return (slab);
1376 }
1377 
1378 /*
1379  * Free a slab.
1380  */
1381 static void
1382 slabfree(slab_t slab)
1383 {
1384 	int nswath;
1385 	int j;
1386 
1387 	if (slab->region->slab == slab) {
1388 		/*
1389 		 * Out-of-band slab.
1390 		 */
1391 		nswath = slab->slab_size / NREGIONS_SIZE;
1392 		for (j = 0; j < nswath; ++j)
1393 			slab->region[j].slab = NULL;
1394 		slab->magic = 0;
1395 		_vmem_free(slab->chunks, slab->slab_size);
1396 		memfree(slab, 0);
1397 	} else {
1398 		/*
1399 		 * In-band slab.
1400 		 */
1401 		slab->magic = 0;
1402 		_vmem_free(slab, slab->slab_size);
1403 	}
1404 }
1405 
1406 /*
1407  * Terminate a slab's use in the current thread.  The slab may still have
1408  * outstanding allocations and thus not be deallocatable.
1409  */
1410 static void
1411 slabterm(slglobaldata_t slgd, slab_t slab)
1412 {
1413 	slglobaldata_t sldepot = slgd->sldepot;
1414 	struct zoneinfo *zinfo;
1415 	slab_t dnext;
1416 	int zi = slab->zone_index;
1417 
1418 	slab->slgd = NULL;
1419 	--slgd->nslabs;
1420 	zinfo = &sldepot->zone[zi];
1421 
1422 	/*
1423 	 * If the slab can be freed and the depot is either locked or not
1424 	 * empty, then free the slab.
1425 	 */
1426 	if (slab->navail == slab->nmax && zinfo->avail_base) {
1427 		slab->state = UNKNOWN;
1428 		slabfree(slab);
1429 		return;
1430 	}
1431 	slab->state = AVAIL;
1432 
1433 	/*
1434 	 * Link the slab into the depot
1435 	 */
1436 	for (;;) {
1437 		dnext = zinfo->avail_base;
1438 		cpu_ccfence();
1439 		if ((void *)dnext == LOCKEDPTR) {
1440 			cpu_pause();
1441 			continue;
1442 		}
1443 		slab->next = dnext;
1444 		if (atomic_cmpset_ptr(&zinfo->avail_base, dnext, slab))
1445 			break;
1446 	}
1447 }
1448 
1449 /*
1450  * _vmem_alloc()
1451  *
1452  *	Directly map memory in PAGE_SIZE'd chunks with the specified
1453  *	alignment.
1454  *
1455  *	Alignment must be a multiple of PAGE_SIZE.
1456  *
1457  *	Size must be >= alignment.
1458  */
1459 static void *
1460 _vmem_alloc(int ri, size_t slab_size)
1461 {
1462 	char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);
1463 	char *eaddr;
1464 	char *addr;
1465 	char *save;
1466 	uintptr_t excess;
1467 
1468 	if (slab_size < NREGIONS_SIZE)
1469 		eaddr = baddr + NREGIONS_SIZE;
1470 	else
1471 		eaddr = baddr + slab_size;
1472 
1473 	/*
1474 	 * This usually just works but might not.
1475 	 */
1476 	addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
1477 		    MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
1478 	if (addr == MAP_FAILED) {
1479 		return (NULL);
1480 	}
1481 	if (addr < baddr || addr + slab_size > eaddr) {
1482 		munmap(addr, slab_size);
1483 		return (NULL);
1484 	}
1485 
1486 	/*
1487 	 * Check alignment.  The misaligned offset is also the excess
1488 	 * amount.  If misaligned unmap the excess so we have a chance of
1489 	 * mapping at the next alignment point and recursively try again.
1490 	 *
1491 	 * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB	block alignment
1492 	 *   aaaaaaaaa aaaaaaaaaaa aa		mis-aligned allocation
1493 	 *   xxxxxxxxx				final excess calculation
1494 	 *   ^ returned address
1495 	 */
1496 	excess = (uintptr_t)addr & (slab_size - 1);
1497 	while (excess) {
1498 		excess = slab_size - excess;
1499 		save = addr;
1500 
1501 		munmap(save + excess, slab_size - excess);
1502 		addr = _vmem_alloc(ri, slab_size);
1503 		munmap(save, excess);
1504 		if (addr == NULL)
1505 			return (NULL);
1506 		if (addr < baddr || addr + slab_size > eaddr) {
1507 			munmap(addr, slab_size);
1508 			return (NULL);
1509 		}
1510 		excess = (uintptr_t)addr & (slab_size - 1);
1511 	}
1512 	return (addr);
1513 }
1514 
1515 /*
1516  * _vmem_free()
1517  *
1518  *	Free a chunk of memory allocated with _vmem_alloc()
1519  */
1520 static void
1521 _vmem_free(void *ptr, size_t size)
1522 {
1523 	munmap(ptr, size);
1524 }
1525 
1526 /*
1527  * Panic on fatal conditions
1528  */
1529 static void
1530 _mpanic(const char *ctl, ...)
1531 {
1532 	va_list va;
1533 
1534 	if (malloc_panic == 0) {
1535 		malloc_panic = 1;
1536 		va_start(va, ctl);
1537 		vfprintf(stderr, ctl, va);
1538 		fprintf(stderr, "\n");
1539 		fflush(stderr);
1540 		va_end(va);
1541 	}
1542 	abort();
1543 }
1544