xref: /freebsd/sys/vm/uma_core.c (revision 2a01feab)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
5  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
6  * Copyright (c) 2004-2006 Robert N. M. Watson
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice unmodified, this list of conditions, and the following
14  *    disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 /*
32  * uma_core.c  Implementation of the Universal Memory Allocator (UMA)
33  *
34  * This allocator is intended to replace the multitude of similar object caches
35  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
36  * efficient.  A primary design goal is to return unused memory to the rest of
37  * the system.  This will make the system as a whole more flexible due to the
38  * ability to move memory to subsystems which most need it instead of leaving
39  * pools of reserved memory unused.
40  *
41  * The basic ideas stem from similar slab/zone based allocators whose algorithms
42  * are well known.
43  *
44  */
45 
46 /*
47  * TODO:
48  *	- Improve memory usage for large allocations
49  *	- Investigate cache size adjustments
50  */
51 
52 #include <sys/cdefs.h>
53 __FBSDID("$FreeBSD$");
54 
55 #include "opt_ddb.h"
56 #include "opt_param.h"
57 #include "opt_vm.h"
58 
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/bitset.h>
62 #include <sys/eventhandler.h>
63 #include <sys/kernel.h>
64 #include <sys/types.h>
65 #include <sys/limits.h>
66 #include <sys/queue.h>
67 #include <sys/malloc.h>
68 #include <sys/ktr.h>
69 #include <sys/lock.h>
70 #include <sys/sysctl.h>
71 #include <sys/mutex.h>
72 #include <sys/proc.h>
73 #include <sys/random.h>
74 #include <sys/rwlock.h>
75 #include <sys/sbuf.h>
76 #include <sys/sched.h>
77 #include <sys/smp.h>
78 #include <sys/taskqueue.h>
79 #include <sys/vmmeter.h>
80 
81 #include <vm/vm.h>
82 #include <vm/vm_object.h>
83 #include <vm/vm_page.h>
84 #include <vm/vm_pageout.h>
85 #include <vm/vm_param.h>
86 #include <vm/vm_phys.h>
87 #include <vm/vm_pagequeue.h>
88 #include <vm/vm_map.h>
89 #include <vm/vm_kern.h>
90 #include <vm/vm_extern.h>
91 #include <vm/uma.h>
92 #include <vm/uma_int.h>
93 #include <vm/uma_dbg.h>
94 
95 #include <ddb/ddb.h>
96 
97 #ifdef DEBUG_MEMGUARD
98 #include <vm/memguard.h>
99 #endif
100 
101 /*
102  * These are the zones from which all kegs and zones are allocated.
103  */
104 static uma_zone_t kegs;
105 static uma_zone_t zones;
106 
107 /* This is the zone from which all offpage uma_slab_ts are allocated. */
108 static uma_zone_t slabzone;
109 
110 /*
111  * The initial hash tables come out of this zone so they can be allocated
112  * prior to malloc coming up.
113  */
114 static uma_zone_t hashzone;
115 
116 /* The boot-time adjusted value for cache line alignment. */
117 int uma_align_cache = 64 - 1;
118 
119 static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
120 
121 /*
122  * Are we allowed to allocate buckets?
123  */
124 static int bucketdisable = 1;
125 
126 /* Linked list of all kegs in the system */
127 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
128 
129 /* Linked list of all cache-only zones in the system */
130 static LIST_HEAD(,uma_zone) uma_cachezones =
131     LIST_HEAD_INITIALIZER(uma_cachezones);
132 
133 /* This RW lock protects the keg list */
134 static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
135 
136 /*
137  * Pointer to and counter for the pool of pages that is preallocated at
138  * startup to bootstrap UMA.
139  */
140 static char *bootmem;
141 static int boot_pages;
142 
143 static struct sx uma_drain_lock;
144 
145 /* kmem soft limit. */
146 static unsigned long uma_kmem_limit = LONG_MAX;
147 static volatile unsigned long uma_kmem_total;
148 
149 /* Is the VM done starting up? */
150 static enum { BOOT_COLD = 0, BOOT_STRAPPED, BOOT_PAGEALLOC, BOOT_BUCKETS,
151     BOOT_RUNNING } booted = BOOT_COLD;
152 
153 /*
154  * This is the handle used to schedule events that need to happen
155  * outside of the allocation fast path.
156  */
157 static struct callout uma_callout;
158 #define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
159 
160 /*
161  * This structure is passed as the zone ctor arg so that I don't have to create
162  * a special allocation function just for zones.
163  */
164 struct uma_zctor_args {
165 	const char *name;
166 	size_t size;
167 	uma_ctor ctor;
168 	uma_dtor dtor;
169 	uma_init uminit;
170 	uma_fini fini;
171 	uma_import import;
172 	uma_release release;
173 	void *arg;
174 	uma_keg_t keg;
175 	int align;
176 	uint32_t flags;
177 };
178 
179 struct uma_kctor_args {
180 	uma_zone_t zone;
181 	size_t size;
182 	uma_init uminit;
183 	uma_fini fini;
184 	int align;
185 	uint32_t flags;
186 };
187 
188 struct uma_bucket_zone {
189 	uma_zone_t	ubz_zone;
190 	char		*ubz_name;
191 	int		ubz_entries;	/* Number of items it can hold. */
192 	int		ubz_maxsize;	/* Maximum allocation size per-item. */
193 };
194 
195 /*
196  * Compute the actual number of bucket entries so that buckets pack into
197  * power-of-two sizes for more efficient space utilization.
198  */
199 #define	BUCKET_SIZE(n)						\
200     (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
201 
202 #define	BUCKET_MAX	BUCKET_SIZE(256)
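
/*
 * Illustrative arithmetic (not part of the original source): on LP64, with
 * 8-byte pointers and a struct uma_bucket header assumed to be 24 bytes for
 * the sake of example, BUCKET_SIZE(128) = (128 * 8 - 24) / 8 = 125, so a
 * "128 Bucket" (header plus 125 item pointers) fills exactly 128
 * pointer-sized words and packs into a power-of-two allocation.
 */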
203 
204 struct uma_bucket_zone bucket_zones[] = {
205 	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
206 	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
207 	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
208 	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
209 	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
210 	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
211 	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
212 	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
213 	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
214 	{ NULL, NULL, 0}
215 };
216 
217 /*
218  * Flags and enumerations to be passed to internal functions.
219  */
220 enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
221 
222 #define	UMA_ANYDOMAIN	-1	/* Special value for domain search. */
223 
224 /* Prototypes. */
225 
226 int	uma_startup_count(int);
227 void	uma_startup(void *, int);
228 void	uma_startup1(void);
229 void	uma_startup2(void);
230 
231 static void *noobj_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
232 static void *page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
233 static void *pcpu_page_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
234 static void *startup_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
235 static void page_free(void *, vm_size_t, uint8_t);
236 static void pcpu_page_free(void *, vm_size_t, uint8_t);
237 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int, int);
238 static void cache_drain(uma_zone_t);
239 static void bucket_drain(uma_zone_t, uma_bucket_t);
240 static void bucket_cache_drain(uma_zone_t zone);
241 static int keg_ctor(void *, int, void *, int);
242 static void keg_dtor(void *, int, void *);
243 static int zone_ctor(void *, int, void *, int);
244 static void zone_dtor(void *, int, void *);
245 static int zero_init(void *, int, int);
246 static void keg_small_init(uma_keg_t keg);
247 static void keg_large_init(uma_keg_t keg);
248 static void zone_foreach(void (*zfunc)(uma_zone_t));
249 static void zone_timeout(uma_zone_t zone);
250 static int hash_alloc(struct uma_hash *);
251 static int hash_expand(struct uma_hash *, struct uma_hash *);
252 static void hash_free(struct uma_hash *hash);
253 static void uma_timeout(void *);
254 static void uma_startup3(void);
255 static void *zone_alloc_item(uma_zone_t, void *, int, int);
256 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
257 static void bucket_enable(void);
258 static void bucket_init(void);
259 static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
260 static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
261 static void bucket_zone_drain(void);
262 static uma_bucket_t zone_alloc_bucket(uma_zone_t, void *, int, int);
263 static uma_slab_t zone_fetch_slab(uma_zone_t, uma_keg_t, int, int);
264 static uma_slab_t zone_fetch_slab_multi(uma_zone_t, uma_keg_t, int, int);
265 static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
266 static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
267 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
268     uma_fini fini, int align, uint32_t flags);
269 static int zone_import(uma_zone_t, void **, int, int, int);
270 static void zone_release(uma_zone_t, void **, int);
271 static void uma_zero_item(void *, uma_zone_t);
272 
273 void uma_print_zone(uma_zone_t);
274 void uma_print_stats(void);
275 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
276 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
277 
278 #ifdef INVARIANTS
279 static bool uma_dbg_kskip(uma_keg_t keg, void *mem);
280 static bool uma_dbg_zskip(uma_zone_t zone, void *mem);
281 static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
282 static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
283 
284 static SYSCTL_NODE(_vm, OID_AUTO, debug, CTLFLAG_RD, 0,
285     "Memory allocation debugging");
286 
287 static u_int dbg_divisor = 1;
288 SYSCTL_UINT(_vm_debug, OID_AUTO, divisor,
289     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &dbg_divisor, 0,
290     "Debug & thrash every nth item in the memory allocator");
291 
292 static counter_u64_t uma_dbg_cnt = EARLY_COUNTER;
293 static counter_u64_t uma_skip_cnt = EARLY_COUNTER;
294 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, trashed, CTLFLAG_RD,
295     &uma_dbg_cnt, "memory items debugged");
296 SYSCTL_COUNTER_U64(_vm_debug, OID_AUTO, skipped, CTLFLAG_RD,
297     &uma_skip_cnt, "memory items skipped, not debugged");
298 #endif
299 
300 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
301 
302 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
303     0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
304 
305 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
306     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
307 
308 static int zone_warnings = 1;
309 SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
310     "Warn when a UMA zone becomes full");
311 
312 /* Adjust bytes under management by UMA. */
313 static inline void
314 uma_total_dec(unsigned long size)
315 {
316 
317 	atomic_subtract_long(&uma_kmem_total, size);
318 }
319 
320 static inline void
321 uma_total_inc(unsigned long size)
322 {
323 
324 	if (atomic_fetchadd_long(&uma_kmem_total, size) > uma_kmem_limit)
325 		uma_reclaim_wakeup();
326 }
327 
328 /*
329  * This routine checks whether it's safe to enable buckets.
330  */
331 static void
332 bucket_enable(void)
333 {
334 	bucketdisable = vm_page_count_min();
335 }
336 
337 /*
338  * Initialize bucket_zones, the array of zones of buckets of various sizes.
339  *
340  * For each zone, calculate the memory required for each bucket, consisting
341  * of the header and an array of pointers.
342  */
343 static void
344 bucket_init(void)
345 {
346 	struct uma_bucket_zone *ubz;
347 	int size;
348 
349 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
350 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
351 		size += sizeof(void *) * ubz->ubz_entries;
352 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
353 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
354 		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET | UMA_ZONE_NUMA);
355 	}
356 }
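
/*
 * Note (illustrative, not in the original source): because ubz_entries was
 * derived with BUCKET_SIZE(), the size computed in bucket_init() works back
 * out to n pointer-sized words for each "n Bucket" zone, e.g. 128 bytes for
 * the "16 Bucket" zone on LP64, assuming sizeof(struct uma_bucket) rounds up
 * to a multiple of the pointer size.
 */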
357 
358 /*
359  * Given a desired number of entries for a bucket, return the zone from which
360  * to allocate the bucket.
361  */
362 static struct uma_bucket_zone *
363 bucket_zone_lookup(int entries)
364 {
365 	struct uma_bucket_zone *ubz;
366 
367 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
368 		if (ubz->ubz_entries >= entries)
369 			return (ubz);
370 	ubz--;
371 	return (ubz);
372 }
373 
374 static int
375 bucket_select(int size)
376 {
377 	struct uma_bucket_zone *ubz;
378 
379 	ubz = &bucket_zones[0];
380 	if (size > ubz->ubz_maxsize)
381 		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
382 
383 	for (; ubz->ubz_entries != 0; ubz++)
384 		if (ubz->ubz_maxsize < size)
385 			break;
386 	ubz--;
387 	return (ubz->ubz_entries);
388 }
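
/*
 * Example of the sizing policy above (illustrative numbers, not from the
 * original source): a 16 KB item exceeds the 4096-byte ubz_maxsize of the
 * first bucket zone, so bucket_select() returns
 * MAX((4096 * BUCKET_SIZE(4)) / 16384, 1), clamping very large items to
 * tiny (possibly single-entry) buckets.  A 100-byte item instead falls
 * through the loop and gets the largest bucket whose ubz_maxsize still
 * covers it, here the "128 Bucket" zone.
 */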
389 
390 static uma_bucket_t
391 bucket_alloc(uma_zone_t zone, void *udata, int flags)
392 {
393 	struct uma_bucket_zone *ubz;
394 	uma_bucket_t bucket;
395 
396 	/*
397 	 * This is to stop us from allocating per cpu buckets while we're
398 	 * running out of vm.boot_pages.  Otherwise, we would exhaust the
399 	 * boot pages.  This also prevents us from allocating buckets in
400 	 * low memory situations.
401 	 */
402 	if (bucketdisable)
403 		return (NULL);
404 	/*
405 	 * To limit bucket recursion we store the original zone flags
406 	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
407 	 * NOVM flag to persist even through deep recursions.  We also
408 	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
409 	 * a bucket for a bucket zone so we do not allow infinite bucket
410 	 * recursion.  This cookie will even persist to frees of unused
411 	 * buckets via the allocation path or bucket allocations in the
412 	 * free path.
413 	 */
414 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
415 		udata = (void *)(uintptr_t)zone->uz_flags;
416 	else {
417 		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
418 			return (NULL);
419 		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
420 	}
421 	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
422 		flags |= M_NOVM;
423 	ubz = bucket_zone_lookup(zone->uz_count);
424 	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
425 		ubz++;
426 	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
427 	if (bucket) {
428 #ifdef INVARIANTS
429 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
430 #endif
431 		bucket->ub_cnt = 0;
432 		bucket->ub_entries = ubz->ubz_entries;
433 	}
434 
435 	return (bucket);
436 }
437 
438 static void
439 bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
440 {
441 	struct uma_bucket_zone *ubz;
442 
443 	KASSERT(bucket->ub_cnt == 0,
444 	    ("bucket_free: Freeing a non free bucket."));
445 	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
446 		udata = (void *)(uintptr_t)zone->uz_flags;
447 	ubz = bucket_zone_lookup(bucket->ub_entries);
448 	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
449 }
450 
451 static void
452 bucket_zone_drain(void)
453 {
454 	struct uma_bucket_zone *ubz;
455 
456 	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
457 		zone_drain(ubz->ubz_zone);
458 }
459 
460 static void
461 zone_log_warning(uma_zone_t zone)
462 {
463 	static const struct timeval warninterval = { 300, 0 };
464 
465 	if (!zone_warnings || zone->uz_warning == NULL)
466 		return;
467 
468 	if (ratecheck(&zone->uz_ratecheck, &warninterval))
469 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
470 }
471 
472 static inline void
473 zone_maxaction(uma_zone_t zone)
474 {
475 
476 	if (zone->uz_maxaction.ta_func != NULL)
477 		taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
478 }
479 
480 static void
481 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
482 {
483 	uma_klink_t klink;
484 
485 	LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
486 		kegfn(klink->kl_keg);
487 }
488 
489 /*
490  * Routine called by the callout subsystem to fire off periodic,
491  * time-interval-based calculations (stats, hash size, etc.).
492  *
493  * Arguments:
494  *	arg   Unused
495  *
496  * Returns:
497  *	Nothing
498  */
499 static void
500 uma_timeout(void *unused)
501 {
502 	bucket_enable();
503 	zone_foreach(zone_timeout);
504 
505 	/* Reschedule this event */
506 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
507 }
508 
509 /*
510  * Routine to perform timeout-driven calculations.  This expands the
511  * keg hash table when the slab count outgrows it.
512  *
513  *  Returns nothing.
514  */
515 static void
516 keg_timeout(uma_keg_t keg)
517 {
518 
519 	KEG_LOCK(keg);
520 	/*
521 	 * Expand the keg hash table.
522 	 *
523 	 * This is done if the number of slabs is larger than the hash size.
524  * What I'm trying to do here is eliminate collisions entirely.  This
525 	 * may be a little aggressive.  Should I allow for two collisions max?
526 	 */
527 	if (keg->uk_flags & UMA_ZONE_HASH &&
528 	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
529 		struct uma_hash newhash;
530 		struct uma_hash oldhash;
531 		int ret;
532 
533 		/*
534 		 * This is so involved because allocating and freeing
535 		 * while the keg lock is held will lead to deadlock.
536 		 * I have to do everything in stages and check for
537 		 * races.
538 		 */
539 		newhash = keg->uk_hash;
540 		KEG_UNLOCK(keg);
541 		ret = hash_alloc(&newhash);
542 		KEG_LOCK(keg);
543 		if (ret) {
544 			if (hash_expand(&keg->uk_hash, &newhash)) {
545 				oldhash = keg->uk_hash;
546 				keg->uk_hash = newhash;
547 			} else
548 				oldhash = newhash;
549 
550 			KEG_UNLOCK(keg);
551 			hash_free(&oldhash);
552 			return;
553 		}
554 	}
555 	KEG_UNLOCK(keg);
556 }
557 
558 static void
559 zone_timeout(uma_zone_t zone)
560 {
561 
562 	zone_foreach_keg(zone, &keg_timeout);
563 }
564 
565 /*
566  * Allocate and zero fill the next sized hash table from the appropriate
567  * backing store.
568  *
569  * Arguments:
570  *	hash  A new hash structure with the old hash size in uh_hashsize
571  *
572  * Returns:
573  *	1 on success and 0 on failure.
574  */
575 static int
576 hash_alloc(struct uma_hash *hash)
577 {
578 	int oldsize;
579 	int alloc;
580 
581 	oldsize = hash->uh_hashsize;
582 
583 	/* We're just going to go to the next greater power of two. */
584 	if (oldsize)  {
585 		hash->uh_hashsize = oldsize * 2;
586 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
587 		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
588 		    M_UMAHASH, M_NOWAIT);
589 	} else {
590 		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
591 		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
592 		    UMA_ANYDOMAIN, M_WAITOK);
593 		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
594 	}
595 	if (hash->uh_slab_hash) {
596 		bzero(hash->uh_slab_hash, alloc);
597 		hash->uh_hashmask = hash->uh_hashsize - 1;
598 		return (1);
599 	}
600 
601 	return (0);
602 }
603 
604 /*
605  * Expands the hash table for HASH zones.  This is done from zone_timeout
606  * to reduce collisions.  This must not be done in the regular allocation
607  * path; otherwise we could recurse on the VM while allocating pages.
608  *
609  * Arguments:
610  *	oldhash  The hash you want to expand
611  *	newhash  The hash structure for the new table
612  *
613  * Returns:
614  *	1 if the table was expanded, 0 otherwise.
615  *
616  * Discussion:
617  */
618 static int
619 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
620 {
621 	uma_slab_t slab;
622 	int hval;
623 	int i;
624 
625 	if (!newhash->uh_slab_hash)
626 		return (0);
627 
628 	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
629 		return (0);
630 
631 	/*
632 	 * I need to investigate hash algorithms for resizing without a
633 	 * full rehash.
634 	 */
635 
636 	for (i = 0; i < oldhash->uh_hashsize; i++)
637 		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
638 			slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
639 			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
640 			hval = UMA_HASH(newhash, slab->us_data);
641 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
642 			    slab, us_hlink);
643 		}
644 
645 	return (1);
646 }
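
/*
 * Illustrative walk-through (not part of the original source): the first
 * expansion grows the table from UMA_HASH_SIZE_INIT buckets to twice that,
 * and every slab is rehashed via UMA_HASH(newhash, slab->us_data) into the
 * new array, so lookups done with the new mask find it again.  Subsequent
 * zone_timeout() passes keep doubling the table whenever the keg's slab
 * count outgrows the bucket count.
 */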
647 
648 /*
649  * Free the hash bucket to the appropriate backing store.
650  *
651  * Arguments:
652  *	hash  The hash structure whose slab hash array we're freeing;
653  *	      its uh_hashsize determines which backing store it returns to
654  *
655  * Returns:
656  *	Nothing
657  */
658 static void
659 hash_free(struct uma_hash *hash)
660 {
661 	if (hash->uh_slab_hash == NULL)
662 		return;
663 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
664 		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
665 	else
666 		free(hash->uh_slab_hash, M_UMAHASH);
667 }
668 
669 /*
670  * Frees all outstanding items in a bucket
671  *
672  * Arguments:
673  *	zone   The zone to free to, must be unlocked.
674  *	bucket The free/alloc bucket with items, cpu queue must be locked.
675  *
676  * Returns:
677  *	Nothing
678  */
679 
680 static void
681 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
682 {
683 	int i;
684 
685 	if (bucket == NULL)
686 		return;
687 
688 	if (zone->uz_fini)
689 		for (i = 0; i < bucket->ub_cnt; i++)
690 			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
691 	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
692 	bucket->ub_cnt = 0;
693 }
694 
695 /*
696  * Drains the per cpu caches for a zone.
697  *
698  * NOTE: This may only be called while the zone is being torn down, and not
699  * during normal operation.  This is necessary in order that we do not have
700  * to migrate CPUs to drain the per-CPU caches.
701  *
702  * Arguments:
703  *	zone     The zone to drain, must be unlocked.
704  *
705  * Returns:
706  *	Nothing
707  */
708 static void
709 cache_drain(uma_zone_t zone)
710 {
711 	uma_cache_t cache;
712 	int cpu;
713 
714 	/*
715 	 * XXX: It is safe to not lock the per-CPU caches, because we're
716 	 * tearing down the zone anyway.  I.e., there will be no further use
717 	 * of the caches at this point.
718 	 *
719 	 * XXX: It would be good to be able to assert that the zone is being
720 	 * torn down to prevent improper use of cache_drain().
721 	 *
722 	 * XXX: We lock the zone before passing into bucket_cache_drain() as
723 	 * it is used elsewhere.  Should the tear-down path be made special
724 	 * there in some form?
725 	 */
726 	CPU_FOREACH(cpu) {
727 		cache = &zone->uz_cpu[cpu];
728 		bucket_drain(zone, cache->uc_allocbucket);
729 		bucket_drain(zone, cache->uc_freebucket);
730 		if (cache->uc_allocbucket != NULL)
731 			bucket_free(zone, cache->uc_allocbucket, NULL);
732 		if (cache->uc_freebucket != NULL)
733 			bucket_free(zone, cache->uc_freebucket, NULL);
734 		cache->uc_allocbucket = cache->uc_freebucket = NULL;
735 	}
736 	ZONE_LOCK(zone);
737 	bucket_cache_drain(zone);
738 	ZONE_UNLOCK(zone);
739 }
740 
741 static void
742 cache_shrink(uma_zone_t zone)
743 {
744 
745 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
746 		return;
747 
748 	ZONE_LOCK(zone);
749 	zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
750 	ZONE_UNLOCK(zone);
751 }
752 
753 static void
754 cache_drain_safe_cpu(uma_zone_t zone)
755 {
756 	uma_cache_t cache;
757 	uma_bucket_t b1, b2;
758 	int domain;
759 
760 	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
761 		return;
762 
763 	b1 = b2 = NULL;
764 	ZONE_LOCK(zone);
765 	critical_enter();
766 	if (zone->uz_flags & UMA_ZONE_NUMA)
767 		domain = PCPU_GET(domain);
768 	else
769 		domain = 0;
770 	cache = &zone->uz_cpu[curcpu];
771 	if (cache->uc_allocbucket) {
772 		if (cache->uc_allocbucket->ub_cnt != 0)
773 			LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
774 			    cache->uc_allocbucket, ub_link);
775 		else
776 			b1 = cache->uc_allocbucket;
777 		cache->uc_allocbucket = NULL;
778 	}
779 	if (cache->uc_freebucket) {
780 		if (cache->uc_freebucket->ub_cnt != 0)
781 			LIST_INSERT_HEAD(&zone->uz_domain[domain].uzd_buckets,
782 			    cache->uc_freebucket, ub_link);
783 		else
784 			b2 = cache->uc_freebucket;
785 		cache->uc_freebucket = NULL;
786 	}
787 	critical_exit();
788 	ZONE_UNLOCK(zone);
789 	if (b1)
790 		bucket_free(zone, b1, NULL);
791 	if (b2)
792 		bucket_free(zone, b2, NULL);
793 }
794 
795 /*
796  * Safely drain the per-CPU caches of a zone (or of all zones if NULL).
797  * This is an expensive call because it needs to bind to all CPUs
798  * one by one and enter a critical section on each of them in order
799  * to safely access their cache buckets.
800  * The zone lock must not be held when calling this function.
801  */
802 static void
803 cache_drain_safe(uma_zone_t zone)
804 {
805 	int cpu;
806 
807 	/*
808 	 * Polite bucket size shrinking was not enough; shrink aggressively.
809 	 */
810 	if (zone)
811 		cache_shrink(zone);
812 	else
813 		zone_foreach(cache_shrink);
814 
815 	CPU_FOREACH(cpu) {
816 		thread_lock(curthread);
817 		sched_bind(curthread, cpu);
818 		thread_unlock(curthread);
819 
820 		if (zone)
821 			cache_drain_safe_cpu(zone);
822 		else
823 			zone_foreach(cache_drain_safe_cpu);
824 	}
825 	thread_lock(curthread);
826 	sched_unbind(curthread);
827 	thread_unlock(curthread);
828 }
829 
830 /*
831  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
832  */
833 static void
834 bucket_cache_drain(uma_zone_t zone)
835 {
836 	uma_zone_domain_t zdom;
837 	uma_bucket_t bucket;
838 	int i;
839 
840 	/*
841 	 * Drain the bucket queues and free the buckets.
842 	 */
843 	for (i = 0; i < vm_ndomains; i++) {
844 		zdom = &zone->uz_domain[i];
845 		while ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
846 			LIST_REMOVE(bucket, ub_link);
847 			ZONE_UNLOCK(zone);
848 			bucket_drain(zone, bucket);
849 			bucket_free(zone, bucket, NULL);
850 			ZONE_LOCK(zone);
851 		}
852 	}
853 
854 	/*
855 	 * Shrink the bucket size further.  The price of a single zone lock
856 	 * collision is probably lower than the price of a global cache drain.
857 	 */
858 	if (zone->uz_count > zone->uz_count_min)
859 		zone->uz_count--;
860 }
861 
862 static void
863 keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
864 {
865 	uint8_t *mem;
866 	int i;
867 	uint8_t flags;
868 
869 	CTR4(KTR_UMA, "keg_free_slab keg %s(%p) slab %p, returning %d bytes",
870 	    keg->uk_name, keg, slab, PAGE_SIZE * keg->uk_ppera);
871 
872 	mem = slab->us_data;
873 	flags = slab->us_flags;
874 	i = start;
875 	if (keg->uk_fini != NULL) {
876 		for (i--; i > -1; i--)
877 #ifdef INVARIANTS
878 		/*
879 		 * trash_fini implies that dtor was trash_dtor. trash_fini
880 		 * would check that memory hasn't been modified since free,
881 		 * which executed trash_dtor.
882 		 * That's why we need to run uma_dbg_kskip() check here,
883 		 * albeit we don't make skip check for other init/fini
884 		 * invocations.
885 		 */
886 		if (!uma_dbg_kskip(keg, slab->us_data + (keg->uk_rsize * i)) ||
887 		    keg->uk_fini != trash_fini)
888 #endif
889 			keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
890 			    keg->uk_size);
891 	}
892 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
893 		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
894 	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
895 	uma_total_dec(PAGE_SIZE * keg->uk_ppera);
896 }
897 
898 /*
899  * Frees pages from a keg back to the system.  This is done on demand from
900  * the pageout daemon.
901  *
902  * Returns nothing.
903  */
904 static void
905 keg_drain(uma_keg_t keg)
906 {
907 	struct slabhead freeslabs = { 0 };
908 	uma_domain_t dom;
909 	uma_slab_t slab, tmp;
910 	int i;
911 
912 	/*
913 	 * We don't want to take pages from statically allocated kegs at this
914 	 * time.
915 	 */
916 	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
917 		return;
918 
919 	CTR3(KTR_UMA, "keg_drain %s(%p) free items: %u",
920 	    keg->uk_name, keg, keg->uk_free);
921 	KEG_LOCK(keg);
922 	if (keg->uk_free == 0)
923 		goto finished;
924 
925 	for (i = 0; i < vm_ndomains; i++) {
926 		dom = &keg->uk_domain[i];
927 		LIST_FOREACH_SAFE(slab, &dom->ud_free_slab, us_link, tmp) {
928 			/* We have nowhere to free these to. */
929 			if (slab->us_flags & UMA_SLAB_BOOT)
930 				continue;
931 
932 			LIST_REMOVE(slab, us_link);
933 			keg->uk_pages -= keg->uk_ppera;
934 			keg->uk_free -= keg->uk_ipers;
935 
936 			if (keg->uk_flags & UMA_ZONE_HASH)
937 				UMA_HASH_REMOVE(&keg->uk_hash, slab,
938 				    slab->us_data);
939 
940 			SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
941 		}
942 	}
943 
944 finished:
945 	KEG_UNLOCK(keg);
946 
947 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
948 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
949 		keg_free_slab(keg, slab, keg->uk_ipers);
950 	}
951 }
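
/*
 * Note (descriptive, added for clarity): keg_drain() only unlinks free
 * slabs while holding the keg lock, collecting them on a private list;
 * the keg_free_slab() calls happen after KEG_UNLOCK() so that page frees
 * never occur with the keg lock held.
 */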
952 
953 static void
954 zone_drain_wait(uma_zone_t zone, int waitok)
955 {
956 
957 	/*
958 	 * Set draining to interlock with zone_dtor() so we can release our
959 	 * locks as we go.  Only dtor() should do a WAITOK call since it
960 	 * is the only call that knows the structure will still be available
961 	 * when it wakes up.
962 	 */
963 	ZONE_LOCK(zone);
964 	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
965 		if (waitok == M_NOWAIT)
966 			goto out;
967 		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
968 	}
969 	zone->uz_flags |= UMA_ZFLAG_DRAINING;
970 	bucket_cache_drain(zone);
971 	ZONE_UNLOCK(zone);
972 	/*
973 	 * The DRAINING flag protects us from being freed while
974 	 * we're running.  Normally the uma_rwlock would protect us but we
975 	 * must be able to release and acquire the right lock for each keg.
976 	 */
977 	zone_foreach_keg(zone, &keg_drain);
978 	ZONE_LOCK(zone);
979 	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
980 	wakeup(zone);
981 out:
982 	ZONE_UNLOCK(zone);
983 }
984 
985 void
986 zone_drain(uma_zone_t zone)
987 {
988 
989 	zone_drain_wait(zone, M_NOWAIT);
990 }
991 
992 /*
993  * Allocate a new slab for a keg.  This does not insert the slab onto a list.
994  *
995  * Arguments:
996  *	wait  Shall we wait?
997  *
998  * Returns:
999  *	The slab that was allocated or NULL if there is no memory and the
1000  *	caller specified M_NOWAIT.
1001  */
1002 static uma_slab_t
1003 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int domain, int wait)
1004 {
1005 	uma_alloc allocf;
1006 	uma_slab_t slab;
1007 	unsigned long size;
1008 	uint8_t *mem;
1009 	uint8_t flags;
1010 	int i;
1011 
1012 	KASSERT(domain >= 0 && domain < vm_ndomains,
1013 	    ("keg_alloc_slab: domain %d out of range", domain));
1014 	mtx_assert(&keg->uk_lock, MA_OWNED);
1015 	slab = NULL;
1016 	mem = NULL;
1017 
1018 	allocf = keg->uk_allocf;
1019 	KEG_UNLOCK(keg);
1020 	size = keg->uk_ppera * PAGE_SIZE;
1021 
1022 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1023 		slab = zone_alloc_item(keg->uk_slabzone, NULL, domain, wait);
1024 		if (slab == NULL)
1025 			goto out;
1026 	}
1027 
1028 	/*
1029 	 * This reproduces the old vm_zone behavior of zero filling pages the
1030 	 * first time they are added to a zone.
1031 	 *
1032 	 * Malloced items are zeroed in uma_zalloc.
1033 	 */
1034 
1035 	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
1036 		wait |= M_ZERO;
1037 	else
1038 		wait &= ~M_ZERO;
1039 
1040 	if (keg->uk_flags & UMA_ZONE_NODUMP)
1041 		wait |= M_NODUMP;
1042 
1043 	/* zone is passed for legacy reasons. */
1044 	mem = allocf(zone, size, domain, &flags, wait);
1045 	if (mem == NULL) {
1046 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1047 			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
1048 		slab = NULL;
1049 		goto out;
1050 	}
1051 	uma_total_inc(size);
1052 
1053 	/* Point the slab into the allocated memory */
1054 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
1055 		slab = (uma_slab_t )(mem + keg->uk_pgoff);
1056 
1057 	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
1058 		for (i = 0; i < keg->uk_ppera; i++)
1059 			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
1060 
1061 	slab->us_keg = keg;
1062 	slab->us_data = mem;
1063 	slab->us_freecount = keg->uk_ipers;
1064 	slab->us_flags = flags;
1065 	slab->us_domain = domain;
1066 	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
1067 #ifdef INVARIANTS
1068 	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
1069 #endif
1070 
1071 	if (keg->uk_init != NULL) {
1072 		for (i = 0; i < keg->uk_ipers; i++)
1073 			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
1074 			    keg->uk_size, wait) != 0)
1075 				break;
1076 		if (i != keg->uk_ipers) {
1077 			keg_free_slab(keg, slab, i);
1078 			slab = NULL;
1079 			goto out;
1080 		}
1081 	}
1082 out:
1083 	KEG_LOCK(keg);
1084 
1085 	CTR3(KTR_UMA, "keg_alloc_slab: allocated slab %p for %s(%p)",
1086 	    slab, keg->uk_name, keg);
1087 
1088 	if (slab != NULL) {
1089 		if (keg->uk_flags & UMA_ZONE_HASH)
1090 			UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
1091 
1092 		keg->uk_pages += keg->uk_ppera;
1093 		keg->uk_free += keg->uk_ipers;
1094 	}
1095 
1096 	return (slab);
1097 }
1098 
1099 /*
1100  * This function is intended to be used early on in place of page_alloc() so
1101  * that we may use the boot time page cache to satisfy allocations before
1102  * the VM is ready.
1103  */
1104 static void *
1105 startup_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1106     int wait)
1107 {
1108 	uma_keg_t keg;
1109 	void *mem;
1110 	int pages;
1111 
1112 	keg = zone_first_keg(zone);
1113 
1114 	/*
1115 	 * If we are in BOOT_BUCKETS or higher, then switch to the real
1116 	 * allocator.  Zones with page-sized slabs switch at BOOT_PAGEALLOC.
1117 	 */
1118 	switch (booted) {
1119 		case BOOT_COLD:
1120 		case BOOT_STRAPPED:
1121 			break;
1122 		case BOOT_PAGEALLOC:
1123 			if (keg->uk_ppera > 1)
1124 				break;
1125 		case BOOT_BUCKETS:
1126 		case BOOT_RUNNING:
1127 #ifdef UMA_MD_SMALL_ALLOC
1128 			keg->uk_allocf = (keg->uk_ppera > 1) ?
1129 			    page_alloc : uma_small_alloc;
1130 #else
1131 			keg->uk_allocf = page_alloc;
1132 #endif
1133 			return keg->uk_allocf(zone, bytes, domain, pflag, wait);
1134 	}
1135 
1136 	/*
1137 	 * Check our small startup cache to see if it has pages remaining.
1138 	 */
1139 	pages = howmany(bytes, PAGE_SIZE);
1140 	KASSERT(pages > 0, ("%s can't reserve 0 pages", __func__));
1141 	if (pages > boot_pages)
1142 		panic("UMA zone \"%s\": Increase vm.boot_pages", zone->uz_name);
1143 #ifdef DIAGNOSTIC
1144 	printf("%s from \"%s\", %d boot pages left\n", __func__, zone->uz_name,
1145 	    boot_pages);
1146 #endif
1147 	mem = bootmem;
1148 	boot_pages -= pages;
1149 	bootmem += pages * PAGE_SIZE;
1150 	*pflag = UMA_SLAB_BOOT;
1151 
1152 	return (mem);
1153 }
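
/*
 * Bookkeeping sketch (illustrative, assuming 4 KB pages): a two-page (8 KB)
 * startup request consumes 2 of boot_pages and advances bootmem by
 * 2 * PAGE_SIZE, so the preallocated pool is handed out linearly and never
 * returned; slabs carved from it are tagged UMA_SLAB_BOOT and are skipped
 * by keg_drain().
 */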
1154 
1155 /*
1156  * Allocates a number of pages from the system
1157  *
1158  * Arguments:
1159  *	bytes  The number of bytes requested
1160  *	wait  Shall we wait?
1161  *
1162  * Returns:
1163  *	A pointer to the alloced memory or possibly
1164  *	NULL if M_NOWAIT is set.
1165  */
1166 static void *
1167 page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1168     int wait)
1169 {
1170 	void *p;	/* Returned page */
1171 
1172 	*pflag = UMA_SLAB_KERNEL;
1173 	p = (void *) kmem_malloc_domain(domain, bytes, wait);
1174 
1175 	return (p);
1176 }
1177 
1178 static void *
1179 pcpu_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
1180     int wait)
1181 {
1182 	struct pglist alloctail;
1183 	vm_offset_t addr, zkva;
1184 	int cpu, flags;
1185 	vm_page_t p, p_next;
1186 #ifdef NUMA
1187 	struct pcpu *pc;
1188 #endif
1189 
1190 	MPASS(bytes == (mp_maxid + 1) * PAGE_SIZE);
1191 
1192 	TAILQ_INIT(&alloctail);
1193 	flags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1194 	    malloc2vm_flags(wait);
1195 	*pflag = UMA_SLAB_KERNEL;
1196 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
1197 		if (CPU_ABSENT(cpu)) {
1198 			p = vm_page_alloc(NULL, 0, flags);
1199 		} else {
1200 #ifndef NUMA
1201 			p = vm_page_alloc(NULL, 0, flags);
1202 #else
1203 			pc = pcpu_find(cpu);
1204 			p = vm_page_alloc_domain(NULL, 0, pc->pc_domain, flags);
1205 			if (__predict_false(p == NULL))
1206 				p = vm_page_alloc(NULL, 0, flags);
1207 #endif
1208 		}
1209 		if (__predict_false(p == NULL))
1210 			goto fail;
1211 		TAILQ_INSERT_TAIL(&alloctail, p, listq);
1212 	}
1213 	if ((addr = kva_alloc(bytes)) == 0)
1214 		goto fail;
1215 	zkva = addr;
1216 	TAILQ_FOREACH(p, &alloctail, listq) {
1217 		pmap_qenter(zkva, &p, 1);
1218 		zkva += PAGE_SIZE;
1219 	}
1220 	return ((void*)addr);
1221  fail:
1222 	TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1223 		vm_page_unwire(p, PQ_NONE);
1224 		vm_page_free(p);
1225 	}
1226 	return (NULL);
1227 }
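
/*
 * Layout note (descriptive, added for clarity): pages are queued in CPU-id
 * order above, so after pmap_qenter() CPU n's portion of a per-CPU
 * allocation lives at addr + n * PAGE_SIZE; absent CPU ids still get a
 * backing page so that the arithmetic stays uniform.
 */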
1228 
1229 /*
1230  * Allocates a number of pages not belonging to a VM object
1231  *
1232  * Arguments:
1233  *	bytes  The number of bytes requested
1234  *	wait   Shall we wait?
1235  *
1236  * Returns:
1237  *	A pointer to the alloced memory or possibly
1238  *	NULL if M_NOWAIT is set.
1239  */
1240 static void *
1241 noobj_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
1242     int wait)
1243 {
1244 	TAILQ_HEAD(, vm_page) alloctail;
1245 	u_long npages;
1246 	vm_offset_t retkva, zkva;
1247 	vm_page_t p, p_next;
1248 	uma_keg_t keg;
1249 
1250 	TAILQ_INIT(&alloctail);
1251 	keg = zone_first_keg(zone);
1252 
1253 	npages = howmany(bytes, PAGE_SIZE);
1254 	while (npages > 0) {
1255 		p = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_INTERRUPT |
1256 		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
1257 		    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
1258 		    VM_ALLOC_NOWAIT));
1259 		if (p != NULL) {
1260 			/*
1261 			 * Since the page does not belong to an object, its
1262 			 * listq is unused.
1263 			 */
1264 			TAILQ_INSERT_TAIL(&alloctail, p, listq);
1265 			npages--;
1266 			continue;
1267 		}
1268 		/*
1269 		 * Page allocation failed, free intermediate pages and
1270 		 * exit.
1271 		 */
1272 		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1273 			vm_page_unwire(p, PQ_NONE);
1274 			vm_page_free(p);
1275 		}
1276 		return (NULL);
1277 	}
1278 	*flags = UMA_SLAB_PRIV;
1279 	zkva = keg->uk_kva +
1280 	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1281 	retkva = zkva;
1282 	TAILQ_FOREACH(p, &alloctail, listq) {
1283 		pmap_qenter(zkva, &p, 1);
1284 		zkva += PAGE_SIZE;
1285 	}
1286 
1287 	return ((void *)retkva);
1288 }
1289 
1290 /*
1291  * Frees a number of pages to the system
1292  *
1293  * Arguments:
1294  *	mem   A pointer to the memory to be freed
1295  *	size  The size of the memory being freed
1296  *	flags The original p->us_flags field
1297  *
1298  * Returns:
1299  *	Nothing
1300  */
1301 static void
1302 page_free(void *mem, vm_size_t size, uint8_t flags)
1303 {
1304 
1305 	if ((flags & UMA_SLAB_KERNEL) == 0)
1306 		panic("UMA: page_free used with invalid flags %x", flags);
1307 
1308 	kmem_free((vm_offset_t)mem, size);
1309 }
1310 
1311 /*
1312  * Frees pcpu zone allocations
1313  *
1314  * Arguments:
1315  *	mem   A pointer to the memory to be freed
1316  *	size  The size of the memory being freed
1317  *	flags The original p->us_flags field
1318  *
1319  * Returns:
1320  *	Nothing
1321  */
1322 static void
1323 pcpu_page_free(void *mem, vm_size_t size, uint8_t flags)
1324 {
1325 	vm_offset_t sva, curva;
1326 	vm_paddr_t paddr;
1327 	vm_page_t m;
1328 
1329 	MPASS(size == (mp_maxid+1)*PAGE_SIZE);
1330 	sva = (vm_offset_t)mem;
1331 	for (curva = sva; curva < sva + size; curva += PAGE_SIZE) {
1332 		paddr = pmap_kextract(curva);
1333 		m = PHYS_TO_VM_PAGE(paddr);
1334 		vm_page_unwire(m, PQ_NONE);
1335 		vm_page_free(m);
1336 	}
1337 	pmap_qremove(sva, size >> PAGE_SHIFT);
1338 	kva_free(sva, size);
1339 }
1340 
1341 
1342 /*
1343  * Zero fill initializer
1344  *
1345  * Arguments/Returns follow uma_init specifications
1346  */
1347 static int
1348 zero_init(void *mem, int size, int flags)
1349 {
1350 	bzero(mem, size);
1351 	return (0);
1352 }
1353 
1354 /*
1355  * Finish creating a small uma keg.  This calculates ipers and the keg size.
1356  *
1357  * Arguments
1358  *	keg  The keg we should initialize
1359  *
1360  * Returns
1361  *	Nothing
1362  */
1363 static void
1364 keg_small_init(uma_keg_t keg)
1365 {
1366 	u_int rsize;
1367 	u_int memused;
1368 	u_int wastedspace;
1369 	u_int shsize;
1370 	u_int slabsize;
1371 
1372 	if (keg->uk_flags & UMA_ZONE_PCPU) {
1373 		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
1374 
1375 		slabsize = UMA_PCPU_ALLOC_SIZE;
1376 		keg->uk_ppera = ncpus;
1377 	} else {
1378 		slabsize = UMA_SLAB_SIZE;
1379 		keg->uk_ppera = 1;
1380 	}
1381 
1382 	/*
1383 	 * Calculate the size of each allocation (rsize) according to
1384 	 * alignment.  If the requested size is smaller than the smallest unit
1385 	 * we have allocation bits for, we round it up.
1386 	 */
1387 	rsize = keg->uk_size;
1388 	if (rsize < slabsize / SLAB_SETSIZE)
1389 		rsize = slabsize / SLAB_SETSIZE;
1390 	if (rsize & keg->uk_align)
1391 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1392 	keg->uk_rsize = rsize;
1393 
1394 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1395 	    keg->uk_rsize < UMA_PCPU_ALLOC_SIZE,
1396 	    ("%s: size %u too large", __func__, keg->uk_rsize));
1397 
1398 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1399 		shsize = 0;
1400 	else
1401 		shsize = sizeof(struct uma_slab);
1402 
1403 	if (rsize <= slabsize - shsize)
1404 		keg->uk_ipers = (slabsize - shsize) / rsize;
1405 	else {
1406 		/* Handle the special case when we have 1 item per slab, so
1407 		 * the alignment requirement can be relaxed. */
1408 		KASSERT(keg->uk_size <= slabsize - shsize,
1409 		    ("%s: size %u greater than slab", __func__, keg->uk_size));
1410 		keg->uk_ipers = 1;
1411 	}
1412 	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1413 	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1414 
1415 	memused = keg->uk_ipers * rsize + shsize;
1416 	wastedspace = slabsize - memused;
1417 
1418 	/*
1419 	 * We can't do OFFPAGE if we're internal or if we've been
1420 	 * asked not to go to the VM for buckets.  If we did, we could
1421 	 * end up going to the VM for slabs, which we do not want to do
1422 	 * if we're UMA_ZFLAG_CACHEONLY as a result of UMA_ZONE_VM,
1423 	 * which clearly forbids it.
1424 	 */
1425 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1426 	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1427 		return;
1428 
1429 	/*
1430 	 * See if using an OFFPAGE slab will limit our waste.  Only do
1431 	 * this if it permits more items per-slab.
1432 	 *
1433 	 * XXX We could try growing slabsize to limit max waste as well.
1434 	 * Historically this was not done because the VM could not
1435 	 * efficiently handle contiguous allocations.
1436 	 */
1437 	if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
1438 	    (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
1439 		keg->uk_ipers = slabsize / keg->uk_rsize;
1440 		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1441 		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1442 		CTR6(KTR_UMA, "UMA decided we need offpage slab headers for "
1443 		    "keg: %s(%p), calculated wastedspace = %d, "
1444 		    "maximum wasted space allowed = %d, "
1445 		    "calculated ipers = %d, "
1446 		    "new wasted space = %d\n", keg->uk_name, keg, wastedspace,
1447 		    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1448 		    slabsize - keg->uk_ipers * keg->uk_rsize);
1449 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1450 	}
1451 
1452 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1453 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1454 		keg->uk_flags |= UMA_ZONE_HASH;
1455 }
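
/*
 * Worked example of the sizing above (illustrative figures only): take a
 * 100-byte item with 8-byte alignment on a 4096-byte slab and assume a
 * 32-byte in-page slab header.  rsize rounds up to 104, so
 * ipers = (4096 - 32) / 104 = 39 and wastedspace = 4096 - (39 * 104 + 32) =
 * 8 bytes, well under slabsize / UMA_MAX_WASTE, so the keg keeps its
 * in-page (non-OFFPAGE) header.
 */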
1456 
1457 /*
1458  * Finish creating a large (> UMA_SLAB_SIZE) uma keg.  Just give in and do
1459  * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1460  * more complicated.
1461  *
1462  * Arguments
1463  *	keg  The keg we should initialize
1464  *
1465  * Returns
1466  *	Nothing
1467  */
1468 static void
1469 keg_large_init(uma_keg_t keg)
1470 {
1471 	u_int shsize;
1472 
1473 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1474 	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1475 	    ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
1476 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1477 	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1478 
1479 	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1480 	keg->uk_ipers = 1;
1481 	keg->uk_rsize = keg->uk_size;
1482 
1483 	/* Check whether we have enough space to not do OFFPAGE. */
1484 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
1485 		shsize = sizeof(struct uma_slab);
1486 		if (shsize & UMA_ALIGN_PTR)
1487 			shsize = (shsize & ~UMA_ALIGN_PTR) +
1488 			    (UMA_ALIGN_PTR + 1);
1489 
1490 		if (PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < shsize) {
1491 			/*
1492 			 * We can't do OFFPAGE if we're internal, in which case
1493 			 * we need an extra page per allocation to contain the
1494 			 * slab header.
1495 			 */
1496 			if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
1497 				keg->uk_flags |= UMA_ZONE_OFFPAGE;
1498 			else
1499 				keg->uk_ppera++;
1500 		}
1501 	}
1502 
1503 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1504 	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1505 		keg->uk_flags |= UMA_ZONE_HASH;
1506 }
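
/*
 * Example (illustrative, assuming 4 KB pages): a 5000-byte item needs
 * howmany(5000, 4096) = 2 pages per allocation with a single item per slab;
 * since 2 * 4096 - 5000 leaves ample room for the aligned slab header, such
 * a keg can keep its header in-page rather than going OFFPAGE.
 */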
1507 
1508 static void
1509 keg_cachespread_init(uma_keg_t keg)
1510 {
1511 	int alignsize;
1512 	int trailer;
1513 	int pages;
1514 	int rsize;
1515 
1516 	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1517 	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1518 
1519 	alignsize = keg->uk_align + 1;
1520 	rsize = keg->uk_size;
1521 	/*
1522 	 * We want one item to start on every align boundary in a page.  To
1523 	 * do this we will span pages.  We will also extend the item by the
1524 	 * size of align if it is an even multiple of align.  Otherwise, it
1525 	 * would fall on the same boundary every time.
1526 	 */
1527 	if (rsize & keg->uk_align)
1528 		rsize = (rsize & ~keg->uk_align) + alignsize;
1529 	if ((rsize & alignsize) == 0)
1530 		rsize += alignsize;
1531 	trailer = rsize - keg->uk_size;
1532 	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1533 	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1534 	keg->uk_rsize = rsize;
1535 	keg->uk_ppera = pages;
1536 	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1537 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1538 	KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
1539 	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1540 	    keg->uk_ipers));
1541 }
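
/*
 * Illustrative example of the spreading math (numbers assumed, 4 KB pages):
 * with 64-byte alignment and a 200-byte item, rsize is first rounded up to
 * 256 and then, because 256 is an even multiple of the alignment, bumped to
 * 320 so that successive items do not keep starting on the same boundary.
 * That yields pages = (320 * 64) / 4096 = 5 and
 * ipers = (5 * 4096 + 120) / 320 = 64 items per (OFFPAGE, VTOSLAB) slab.
 */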
1542 
1543 /*
1544  * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1545  * the keg onto the global keg list.
1546  *
1547  * Arguments/Returns follow uma_ctor specifications
1548  *	udata  Actually uma_kctor_args
1549  */
1550 static int
1551 keg_ctor(void *mem, int size, void *udata, int flags)
1552 {
1553 	struct uma_kctor_args *arg = udata;
1554 	uma_keg_t keg = mem;
1555 	uma_zone_t zone;
1556 
1557 	bzero(keg, size);
1558 	keg->uk_size = arg->size;
1559 	keg->uk_init = arg->uminit;
1560 	keg->uk_fini = arg->fini;
1561 	keg->uk_align = arg->align;
1562 	keg->uk_cursor = 0;
1563 	keg->uk_free = 0;
1564 	keg->uk_reserve = 0;
1565 	keg->uk_pages = 0;
1566 	keg->uk_flags = arg->flags;
1567 	keg->uk_slabzone = NULL;
1568 
1569 	/*
1570 	 * The master zone is passed to us at keg-creation time.
1571 	 */
1572 	zone = arg->zone;
1573 	keg->uk_name = zone->uz_name;
1574 
1575 	if (arg->flags & UMA_ZONE_VM)
1576 		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1577 
1578 	if (arg->flags & UMA_ZONE_ZINIT)
1579 		keg->uk_init = zero_init;
1580 
1581 	if (arg->flags & UMA_ZONE_MALLOC)
1582 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
1583 
1584 	if (arg->flags & UMA_ZONE_PCPU)
1585 #ifdef SMP
1586 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1587 #else
1588 		keg->uk_flags &= ~UMA_ZONE_PCPU;
1589 #endif
1590 
1591 	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1592 		keg_cachespread_init(keg);
1593 	} else {
1594 		if (keg->uk_size > UMA_SLAB_SPACE)
1595 			keg_large_init(keg);
1596 		else
1597 			keg_small_init(keg);
1598 	}
1599 
1600 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1601 		keg->uk_slabzone = slabzone;
1602 
1603 	/*
1604 	 * If we haven't booted yet we need allocations to go through the
1605 	 * startup cache until the vm is ready.
1606 	 */
1607 	if (booted < BOOT_PAGEALLOC)
1608 		keg->uk_allocf = startup_alloc;
1609 #ifdef UMA_MD_SMALL_ALLOC
1610 	else if (keg->uk_ppera == 1)
1611 		keg->uk_allocf = uma_small_alloc;
1612 #endif
1613 	else if (keg->uk_flags & UMA_ZONE_PCPU)
1614 		keg->uk_allocf = pcpu_page_alloc;
1615 	else
1616 		keg->uk_allocf = page_alloc;
1617 #ifdef UMA_MD_SMALL_ALLOC
1618 	if (keg->uk_ppera == 1)
1619 		keg->uk_freef = uma_small_free;
1620 	else
1621 #endif
1622 	if (keg->uk_flags & UMA_ZONE_PCPU)
1623 		keg->uk_freef = pcpu_page_free;
1624 	else
1625 		keg->uk_freef = page_free;
1626 
1627 	/*
1628 	 * Initialize keg's lock
1629 	 */
1630 	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1631 
1632 	/*
1633 	 * If we're putting the slab header in the actual page we need to
1634 	 * figure out where in each page it goes.  This calculates a right
1635 	 * justified offset into the memory on an ALIGN_PTR boundary.
1636 	 */
1637 	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1638 		u_int totsize;
1639 
1640 		/* Size of the slab struct and free list */
1641 		totsize = sizeof(struct uma_slab);
1642 
1643 		if (totsize & UMA_ALIGN_PTR)
1644 			totsize = (totsize & ~UMA_ALIGN_PTR) +
1645 			    (UMA_ALIGN_PTR + 1);
1646 		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize;
1647 
1648 		/*
1649 		 * The only way the following is possible is if, with our
1650 		 * UMA_ALIGN_PTR adjustments, we are now bigger than
1651 		 * UMA_SLAB_SIZE.  I haven't checked whether this is
1652 		 * mathematically possible for all cases, so we make
1653 		 * sure here anyway.
1654 		 */
1655 		totsize = keg->uk_pgoff + sizeof(struct uma_slab);
1656 		if (totsize > PAGE_SIZE * keg->uk_ppera) {
1657 			printf("zone %s ipers %d rsize %d size %d\n",
1658 			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1659 			    keg->uk_size);
1660 			panic("UMA slab won't fit.");
1661 		}
1662 	}
1663 
1664 	if (keg->uk_flags & UMA_ZONE_HASH)
1665 		hash_alloc(&keg->uk_hash);
1666 
1667 	CTR5(KTR_UMA, "keg_ctor %p zone %s(%p) out %d free %d\n",
1668 	    keg, zone->uz_name, zone,
1669 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
1670 	    keg->uk_free);
1671 
1672 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1673 
1674 	rw_wlock(&uma_rwlock);
1675 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1676 	rw_wunlock(&uma_rwlock);
1677 	return (0);
1678 }
1679 
1680 /*
1681  * Zone header ctor.  This initializes all fields, locks, etc.
1682  *
1683  * Arguments/Returns follow uma_ctor specifications
1684  *	udata  Actually uma_zctor_args
1685  */
1686 static int
1687 zone_ctor(void *mem, int size, void *udata, int flags)
1688 {
1689 	struct uma_zctor_args *arg = udata;
1690 	uma_zone_t zone = mem;
1691 	uma_zone_t z;
1692 	uma_keg_t keg;
1693 
1694 	bzero(zone, size);
1695 	zone->uz_name = arg->name;
1696 	zone->uz_ctor = arg->ctor;
1697 	zone->uz_dtor = arg->dtor;
1698 	zone->uz_slab = zone_fetch_slab;
1699 	zone->uz_init = NULL;
1700 	zone->uz_fini = NULL;
1701 	zone->uz_allocs = 0;
1702 	zone->uz_frees = 0;
1703 	zone->uz_fails = 0;
1704 	zone->uz_sleeps = 0;
1705 	zone->uz_count = 0;
1706 	zone->uz_count_min = 0;
1707 	zone->uz_flags = 0;
1708 	zone->uz_warning = NULL;
1709 	/* The domain structures follow the cpu structures. */
1710 	zone->uz_domain = (struct uma_zone_domain *)&zone->uz_cpu[mp_ncpus];
1711 	timevalclear(&zone->uz_ratecheck);
1712 	keg = arg->keg;
1713 
1714 	ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
1715 
1716 	/*
1717 	 * This is a pure cache zone, no kegs.
1718 	 */
1719 	if (arg->import) {
1720 		if (arg->flags & UMA_ZONE_VM)
1721 			arg->flags |= UMA_ZFLAG_CACHEONLY;
1722 		zone->uz_flags = arg->flags;
1723 		zone->uz_size = arg->size;
1724 		zone->uz_import = arg->import;
1725 		zone->uz_release = arg->release;
1726 		zone->uz_arg = arg->arg;
1727 		zone->uz_lockptr = &zone->uz_lock;
1728 		rw_wlock(&uma_rwlock);
1729 		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
1730 		rw_wunlock(&uma_rwlock);
1731 		goto out;
1732 	}
1733 
1734 	/*
1735 	 * Use the regular zone/keg/slab allocator.
1736 	 */
1737 	zone->uz_import = (uma_import)zone_import;
1738 	zone->uz_release = (uma_release)zone_release;
1739 	zone->uz_arg = zone;
1740 
1741 	if (arg->flags & UMA_ZONE_SECONDARY) {
1742 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1743 		zone->uz_init = arg->uminit;
1744 		zone->uz_fini = arg->fini;
1745 		zone->uz_lockptr = &keg->uk_lock;
1746 		zone->uz_flags |= UMA_ZONE_SECONDARY;
1747 		rw_wlock(&uma_rwlock);
1748 		ZONE_LOCK(zone);
1749 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1750 			if (LIST_NEXT(z, uz_link) == NULL) {
1751 				LIST_INSERT_AFTER(z, zone, uz_link);
1752 				break;
1753 			}
1754 		}
1755 		ZONE_UNLOCK(zone);
1756 		rw_wunlock(&uma_rwlock);
1757 	} else if (keg == NULL) {
1758 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1759 		    arg->align, arg->flags)) == NULL)
1760 			return (ENOMEM);
1761 	} else {
1762 		struct uma_kctor_args karg;
1763 		int error;
1764 
1765 		/* We should only be here from uma_startup() */
1766 		karg.size = arg->size;
1767 		karg.uminit = arg->uminit;
1768 		karg.fini = arg->fini;
1769 		karg.align = arg->align;
1770 		karg.flags = arg->flags;
1771 		karg.zone = zone;
1772 		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1773 		    flags);
1774 		if (error)
1775 			return (error);
1776 	}
1777 
1778 	/*
1779 	 * Link in the first keg.
1780 	 */
1781 	zone->uz_klink.kl_keg = keg;
1782 	LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
1783 	zone->uz_lockptr = &keg->uk_lock;
1784 	zone->uz_size = keg->uk_size;
1785 	zone->uz_flags |= (keg->uk_flags &
1786 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1787 
1788 	/*
1789 	 * Some internal zones don't have room allocated for the per cpu
1790 	 * caches.  If we're internal, bail out here.
1791 	 */
1792 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1793 		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1794 		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1795 		return (0);
1796 	}
1797 
1798 out:
1799 	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
1800 	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
1801 	    ("Invalid zone flag combination"));
1802 	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
1803 		zone->uz_count = BUCKET_MAX;
1804 	else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
1805 		zone->uz_count = 0;
1806 	else
1807 		zone->uz_count = bucket_select(zone->uz_size);
1808 	zone->uz_count_min = zone->uz_count;
1809 
1810 	return (0);
1811 }
1812 
1813 /*
1814  * Keg header dtor.  This frees all data, destroys locks, frees the hash
1815  * table and removes the keg from the global list.
1816  *
1817  * Arguments/Returns follow uma_dtor specifications
1818  *	udata  unused
1819  */
1820 static void
1821 keg_dtor(void *arg, int size, void *udata)
1822 {
1823 	uma_keg_t keg;
1824 
1825 	keg = (uma_keg_t)arg;
1826 	KEG_LOCK(keg);
1827 	if (keg->uk_free != 0) {
1828 		printf("Freed UMA keg (%s) was not empty (%d items). "
1829 		    "Lost %d pages of memory.\n",
1830 		    keg->uk_name ? keg->uk_name : "",
1831 		    keg->uk_free, keg->uk_pages);
1832 	}
1833 	KEG_UNLOCK(keg);
1834 
1835 	hash_free(&keg->uk_hash);
1836 
1837 	KEG_LOCK_FINI(keg);
1838 }
1839 
1840 /*
1841  * Zone header dtor.
1842  *
1843  * Arguments/Returns follow uma_dtor specifications
1844  *	udata  unused
1845  */
1846 static void
1847 zone_dtor(void *arg, int size, void *udata)
1848 {
1849 	uma_klink_t klink;
1850 	uma_zone_t zone;
1851 	uma_keg_t keg;
1852 
1853 	zone = (uma_zone_t)arg;
1854 	keg = zone_first_keg(zone);
1855 
1856 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1857 		cache_drain(zone);
1858 
1859 	rw_wlock(&uma_rwlock);
1860 	LIST_REMOVE(zone, uz_link);
1861 	rw_wunlock(&uma_rwlock);
1862 	/*
1863 	 * XXX there are some races here where
1864 	 * the zone can be drained but zone lock
1865 	 * released and then refilled before we
1866 	 * remove it... we don't care for now.
1867 	 */
1868 	zone_drain_wait(zone, M_WAITOK);
1869 	/*
1870 	 * Unlink all of our kegs.
1871 	 */
1872 	while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
1873 		klink->kl_keg = NULL;
1874 		LIST_REMOVE(klink, kl_link);
1875 		if (klink == &zone->uz_klink)
1876 			continue;
1877 		free(klink, M_TEMP);
1878 	}
1879 	/*
1880 	 * We only destroy kegs from non secondary zones.
1881 	 */
1882 	if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
1883 		rw_wlock(&uma_rwlock);
1884 		LIST_REMOVE(keg, uk_link);
1885 		rw_wunlock(&uma_rwlock);
1886 		zone_free_item(kegs, keg, NULL, SKIP_NONE);
1887 	}
1888 	ZONE_LOCK_FINI(zone);
1889 }
1890 
1891 /*
1892  * Traverses every zone in the system and calls a callback
1893  *
1894  * Arguments:
1895  *	zfunc  A pointer to a function which accepts a zone
1896  *		as an argument.
1897  *
1898  * Returns:
1899  *	Nothing
1900  */
1901 static void
1902 zone_foreach(void (*zfunc)(uma_zone_t))
1903 {
1904 	uma_keg_t keg;
1905 	uma_zone_t zone;
1906 
1907 	rw_rlock(&uma_rwlock);
1908 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
1909 		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1910 			zfunc(zone);
1911 	}
1912 	rw_runlock(&uma_rwlock);
1913 }
1914 
1915 /*
1916  * Count how many pages we need to bootstrap.  VM supplies its need
1917  * for early zones in the argument; we add our own zones, which
1918  * consist of: UMA Slabs, UMA Hash and 9 Bucket zones.  The zone of
1919  * zones and zone of kegs are accounted for separately.
1920  */
1921 #define	UMA_BOOT_ZONES	11
1922 /* Zone of zones and zone of kegs have arbitrary alignment. */
1923 #define	UMA_BOOT_ALIGN	32
1924 static int zsize, ksize;
1925 int
1926 uma_startup_count(int vm_zones)
1927 {
1928 	int zones, pages;
1929 
1930 	ksize = sizeof(struct uma_keg) +
1931 	    (sizeof(struct uma_domain) * vm_ndomains);
1932 	zsize = sizeof(struct uma_zone) +
1933 	    (sizeof(struct uma_cache) * (mp_maxid + 1)) +
1934 	    (sizeof(struct uma_zone_domain) * vm_ndomains);
1935 
1936 	/*
1937 	 * Memory for the zone of kegs and its keg,
1938 	 * and for the zone of zones.
1939 	 */
1940 	pages = howmany(roundup(zsize, CACHE_LINE_SIZE) * 2 +
1941 	    roundup(ksize, CACHE_LINE_SIZE), PAGE_SIZE);
1942 
1943 #ifdef	UMA_MD_SMALL_ALLOC
1944 	zones = UMA_BOOT_ZONES;
1945 #else
1946 	zones = UMA_BOOT_ZONES + vm_zones;
1947 	vm_zones = 0;
1948 #endif
1949 
1950 	/* Memory for the rest of the startup zones, UMA and VM, ... */
1951 	if (zsize > UMA_SLAB_SPACE)
1952 		pages += (zones + vm_zones) *
1953 		    howmany(roundup2(zsize, UMA_BOOT_ALIGN), UMA_SLAB_SIZE);
1954 	else if (roundup2(zsize, UMA_BOOT_ALIGN) > UMA_SLAB_SPACE)
1955 		pages += zones;
1956 	else
1957 		pages += howmany(zones,
1958 		    UMA_SLAB_SPACE / roundup2(zsize, UMA_BOOT_ALIGN));
1959 
1960 	/* ... and their kegs. Note that zone of zones allocates a keg! */
1961 	pages += howmany(zones + 1,
1962 	    UMA_SLAB_SPACE / roundup2(ksize, UMA_BOOT_ALIGN));
1963 
1964 	/*
1965 	 * Most of the startup zones are not going to be offpage, which is
1966 	 * why we use UMA_SLAB_SPACE instead of UMA_SLAB_SIZE in all the
1967 	 * calculations.  Some large bucket zones will be offpage, and
1968 	 * thus will allocate hashes.  We take a conservative approach
1969 	 * and assume that all zones may allocate a hash.  This may give
1970 	 * us some positive inaccuracy, usually just an extra page.
1971 	 */
1972 	pages += howmany(zones, UMA_SLAB_SPACE /
1973 	    (sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT));
1974 
1975 	return (pages);
1976 }
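
/*
 * Illustrative worked example of the accounting above (a sketch; the
 * concrete numbers are assumptions, not taken from any real machine):
 * with PAGE_SIZE 4096, CACHE_LINE_SIZE 64, a hypothetical zsize of 1024
 * and ksize of 256, the first term is
 * howmany(roundup(1024, 64) * 2 + roundup(256, 64), 4096) = 1 page for
 * the zone of zones, the zone of kegs and the master keg.  When
 * roundup2(zsize, UMA_BOOT_ALIGN) fits within UMA_SLAB_SPACE, several
 * startup zones share each slab page, so the next term becomes
 * howmany(zones, UMA_SLAB_SPACE / 1024) pages, with similar terms for
 * the kegs and for the worst-case hash allocations.
 */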
1977 
1978 void
1979 uma_startup(void *mem, int npages)
1980 {
1981 	struct uma_zctor_args args;
1982 	uma_keg_t masterkeg;
1983 	uintptr_t m;
1984 
1985 #ifdef DIAGNOSTIC
1986 	printf("Entering %s with %d boot pages configured\n", __func__, npages);
1987 #endif
1988 
1989 	rw_init(&uma_rwlock, "UMA lock");
1990 
1991 	/* Use bootpages memory for the zone of zones and zone of kegs. */
1992 	m = (uintptr_t)mem;
1993 	zones = (uma_zone_t)m;
1994 	m += roundup(zsize, CACHE_LINE_SIZE);
1995 	kegs = (uma_zone_t)m;
1996 	m += roundup(zsize, CACHE_LINE_SIZE);
1997 	masterkeg = (uma_keg_t)m;
1998 	m += roundup(ksize, CACHE_LINE_SIZE);
1999 	m = roundup(m, PAGE_SIZE);
2000 	npages -= (m - (uintptr_t)mem) / PAGE_SIZE;
2001 	mem = (void *)m;
2002 
2003 	/* "manually" create the initial zone */
2004 	memset(&args, 0, sizeof(args));
2005 	args.name = "UMA Kegs";
2006 	args.size = ksize;
2007 	args.ctor = keg_ctor;
2008 	args.dtor = keg_dtor;
2009 	args.uminit = zero_init;
2010 	args.fini = NULL;
2011 	args.keg = masterkeg;
2012 	args.align = UMA_BOOT_ALIGN - 1;
2013 	args.flags = UMA_ZFLAG_INTERNAL;
2014 	zone_ctor(kegs, zsize, &args, M_WAITOK);
2015 
2016 	bootmem = mem;
2017 	boot_pages = npages;
2018 
2019 	args.name = "UMA Zones";
2020 	args.size = zsize;
2021 	args.ctor = zone_ctor;
2022 	args.dtor = zone_dtor;
2023 	args.uminit = zero_init;
2024 	args.fini = NULL;
2025 	args.keg = NULL;
2026 	args.align = UMA_BOOT_ALIGN - 1;
2027 	args.flags = UMA_ZFLAG_INTERNAL;
2028 	zone_ctor(zones, zsize, &args, M_WAITOK);
2029 
2030 	/* Now make a zone for slab headers */
2031 	slabzone = uma_zcreate("UMA Slabs",
2032 				sizeof(struct uma_slab),
2033 				NULL, NULL, NULL, NULL,
2034 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2035 
2036 	hashzone = uma_zcreate("UMA Hash",
2037 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
2038 	    NULL, NULL, NULL, NULL,
2039 	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
2040 
2041 	bucket_init();
2042 
2043 	booted = BOOT_STRAPPED;
2044 }
2045 
2046 void
2047 uma_startup1(void)
2048 {
2049 
2050 #ifdef DIAGNOSTIC
2051 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2052 #endif
2053 	booted = BOOT_PAGEALLOC;
2054 }
2055 
2056 void
2057 uma_startup2(void)
2058 {
2059 
2060 #ifdef DIAGNOSTIC
2061 	printf("Entering %s with %d boot pages left\n", __func__, boot_pages);
2062 #endif
2063 	booted = BOOT_BUCKETS;
2064 	sx_init(&uma_drain_lock, "umadrain");
2065 	bucket_enable();
2066 }
2067 
2068 /*
2069  * Initialize our callout handle and, under INVARIANTS, the
2070  * allocation-debugging counters.
2071  */
2072 static void
2073 uma_startup3(void)
2074 {
2075 
2076 #ifdef INVARIANTS
2077 	TUNABLE_INT_FETCH("vm.debug.divisor", &dbg_divisor);
2078 	uma_dbg_cnt = counter_u64_alloc(M_WAITOK);
2079 	uma_skip_cnt = counter_u64_alloc(M_WAITOK);
2080 #endif
2081 	callout_init(&uma_callout, 1);
2082 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
2083 	booted = BOOT_RUNNING;
2084 }
2085 
2086 static uma_keg_t
2087 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
2088 		int align, uint32_t flags)
2089 {
2090 	struct uma_kctor_args args;
2091 
2092 	args.size = size;
2093 	args.uminit = uminit;
2094 	args.fini = fini;
2095 	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
2096 	args.flags = flags;
2097 	args.zone = zone;
2098 	return (zone_alloc_item(kegs, &args, UMA_ANYDOMAIN, M_WAITOK));
2099 }
2100 
2101 /* Public functions */
2102 /* See uma.h */
2103 void
2104 uma_set_align(int align)
2105 {
2106 
2107 	if (align != UMA_ALIGN_CACHE)
2108 		uma_align_cache = align;
2109 }
2110 
2111 /* See uma.h */
2112 uma_zone_t
2113 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
2114 		uma_init uminit, uma_fini fini, int align, uint32_t flags)
2115 
2116 {
2117 	struct uma_zctor_args args;
2118 	uma_zone_t res;
2119 	bool locked;
2120 
2121 	KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
2122 	    align, name));
2123 
2124 	/* This stuff is essential for the zone ctor */
2125 	memset(&args, 0, sizeof(args));
2126 	args.name = name;
2127 	args.size = size;
2128 	args.ctor = ctor;
2129 	args.dtor = dtor;
2130 	args.uminit = uminit;
2131 	args.fini = fini;
2132 #ifdef  INVARIANTS
2133 	/*
2134 	 * If a zone is being created with an empty constructor and
2135 	 * destructor, pass the UMA trash constructor/destructor, which
2136 	 * check for memory use after free.
2137 	 */
2138 	if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
2139 	    ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
2140 		args.ctor = trash_ctor;
2141 		args.dtor = trash_dtor;
2142 		args.uminit = trash_init;
2143 		args.fini = trash_fini;
2144 	}
2145 #endif
2146 	args.align = align;
2147 	args.flags = flags;
2148 	args.keg = NULL;
2149 
2150 	if (booted < BOOT_BUCKETS) {
2151 		locked = false;
2152 	} else {
2153 		sx_slock(&uma_drain_lock);
2154 		locked = true;
2155 	}
2156 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2157 	if (locked)
2158 		sx_sunlock(&uma_drain_lock);
2159 	return (res);
2160 }
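
/*
 * Example usage (an illustrative sketch, not code used by this file; the
 * "foo" names are hypothetical): a subsystem that manages fixed-size
 * objects typically creates its zone once at initialization time and then
 * allocates and frees items through it:
 *
 *	static uma_zone_t foo_zone;
 *
 *	foo_zone = uma_zcreate("foo", sizeof(struct foo),
 *	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 *	...
 *	fp = uma_zalloc(foo_zone, M_WAITOK | M_ZERO);
 *	...
 *	uma_zfree(foo_zone, fp);
 */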
2161 
2162 /* See uma.h */
2163 uma_zone_t
2164 uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
2165 		    uma_init zinit, uma_fini zfini, uma_zone_t master)
2166 {
2167 	struct uma_zctor_args args;
2168 	uma_keg_t keg;
2169 	uma_zone_t res;
2170 	bool locked;
2171 
2172 	keg = zone_first_keg(master);
2173 	memset(&args, 0, sizeof(args));
2174 	args.name = name;
2175 	args.size = keg->uk_size;
2176 	args.ctor = ctor;
2177 	args.dtor = dtor;
2178 	args.uminit = zinit;
2179 	args.fini = zfini;
2180 	args.align = keg->uk_align;
2181 	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
2182 	args.keg = keg;
2183 
2184 	if (booted < BOOT_BUCKETS) {
2185 		locked = false;
2186 	} else {
2187 		sx_slock(&uma_drain_lock);
2188 		locked = true;
2189 	}
2190 	/* XXX Attaches only one keg of potentially many. */
2191 	res = zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK);
2192 	if (locked)
2193 		sx_sunlock(&uma_drain_lock);
2194 	return (res);
2195 }
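
/*
 * Illustrative sketch (the "foo" names are hypothetical): a secondary zone
 * shares the master's keg but layers its own ctor/dtor and zinit/zfini on
 * top of it, which is how, for example, a packet-style zone can be built
 * over an mbuf master zone:
 *
 *	foo_pkt_zone = uma_zsecond_create("foo_pkt", foo_pkt_ctor,
 *	    foo_pkt_dtor, foo_pkt_init, foo_pkt_fini, foo_zone);
 */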
2196 
2197 /* See uma.h */
2198 uma_zone_t
2199 uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
2200 		    uma_init zinit, uma_fini zfini, uma_import zimport,
2201 		    uma_release zrelease, void *arg, int flags)
2202 {
2203 	struct uma_zctor_args args;
2204 
2205 	memset(&args, 0, sizeof(args));
2206 	args.name = name;
2207 	args.size = size;
2208 	args.ctor = ctor;
2209 	args.dtor = dtor;
2210 	args.uminit = zinit;
2211 	args.fini = zfini;
2212 	args.import = zimport;
2213 	args.release = zrelease;
2214 	args.arg = arg;
2215 	args.align = 0;
2216 	args.flags = flags;
2217 
2218 	return (zone_alloc_item(zones, &args, UMA_ANYDOMAIN, M_WAITOK));
2219 }
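
/*
 * Illustrative sketch (the "foo" callbacks are hypothetical): a cache-only
 * zone has no keg; the caller supplies import/release methods that move
 * items in bulk between the bucket layer and some backing store.  Matching
 * the way uz_import and uz_release are invoked in this file, the callbacks
 * and the zone creation would look like:
 *
 *	static int  foo_import(void *arg, void **store, int count,
 *		    int domain, int flags);
 *	static void foo_release(void *arg, void **store, int count);
 *
 *	foo_cache = uma_zcache_create("foo_cache", sizeof(struct foo),
 *	    NULL, NULL, NULL, NULL, foo_import, foo_release, foo_arg, 0);
 */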
2220 
2221 static void
2222 zone_lock_pair(uma_zone_t a, uma_zone_t b)
2223 {
2224 	if (a < b) {
2225 		ZONE_LOCK(a);
2226 		mtx_lock_flags(b->uz_lockptr, MTX_DUPOK);
2227 	} else {
2228 		ZONE_LOCK(b);
2229 		mtx_lock_flags(a->uz_lockptr, MTX_DUPOK);
2230 	}
2231 }
2232 
2233 static void
2234 zone_unlock_pair(uma_zone_t a, uma_zone_t b)
2235 {
2236 
2237 	ZONE_UNLOCK(a);
2238 	ZONE_UNLOCK(b);
2239 }
2240 
2241 int
2242 uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
2243 {
2244 	uma_klink_t klink;
2245 	uma_klink_t kl;
2246 	int error;
2247 
2248 	error = 0;
2249 	klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
2250 
2251 	zone_lock_pair(zone, master);
2252 	/*
2253 	 * The zone must use vtoslab() to resolve objects and must already
2254 	 * be a secondary zone.
2255 	 */
2256 	if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
2257 	    != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
2258 		error = EINVAL;
2259 		goto out;
2260 	}
2261 	/*
2262 	 * The new master must also use vtoslab().
2263 	 */
2264 	if ((master->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
2265 		error = EINVAL;
2266 		goto out;
2267 	}
2268 
2269 	/*
2270 	 * The underlying object must be the same size.  rsize
2271 	 * may be different.
2272 	 */
2273 	if (master->uz_size != zone->uz_size) {
2274 		error = E2BIG;
2275 		goto out;
2276 	}
2277 	/*
2278 	 * Put it at the end of the list.
2279 	 */
2280 	klink->kl_keg = zone_first_keg(master);
2281 	LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
2282 		if (LIST_NEXT(kl, kl_link) == NULL) {
2283 			LIST_INSERT_AFTER(kl, klink, kl_link);
2284 			break;
2285 		}
2286 	}
2287 	klink = NULL;
2288 	zone->uz_flags |= UMA_ZFLAG_MULTI;
2289 	zone->uz_slab = zone_fetch_slab_multi;
2290 
2291 out:
2292 	zone_unlock_pair(zone, master);
2293 	if (klink != NULL)
2294 		free(klink, M_TEMP);
2295 
2296 	return (error);
2297 }
2298 
2299 
2300 /* See uma.h */
2301 void
2302 uma_zdestroy(uma_zone_t zone)
2303 {
2304 
2305 	sx_slock(&uma_drain_lock);
2306 	zone_free_item(zones, zone, NULL, SKIP_NONE);
2307 	sx_sunlock(&uma_drain_lock);
2308 }
2309 
2310 void
2311 uma_zwait(uma_zone_t zone)
2312 {
2313 	void *item;
2314 
2315 	item = uma_zalloc_arg(zone, NULL, M_WAITOK);
2316 	uma_zfree(zone, item);
2317 }
2318 
2319 void *
2320 uma_zalloc_pcpu_arg(uma_zone_t zone, void *udata, int flags)
2321 {
2322 	void *item;
2323 #ifdef SMP
2324 	int i;
2325 
2326 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2327 #endif
2328 	item = uma_zalloc_arg(zone, udata, flags & ~M_ZERO);
2329 	if (item != NULL && (flags & M_ZERO)) {
2330 #ifdef SMP
2331 		for (i = 0; i <= mp_maxid; i++)
2332 			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
2333 #else
2334 		bzero(item, zone->uz_size);
2335 #endif
2336 	}
2337 	return (item);
2338 }
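
/*
 * Illustrative sketch (the zone name is hypothetical): a UMA_ZONE_PCPU zone
 * returns one item replicated per CPU, which is why M_ZERO above has to be
 * applied to every CPU's copy.  A caller would typically do something like:
 *
 *	p = uma_zalloc_pcpu_arg(foo_pcpu_zone, NULL, M_WAITOK | M_ZERO);
 *	... each CPU reaches its private copy via zpcpu_get(p) ...
 *	uma_zfree_pcpu_arg(foo_pcpu_zone, p, NULL);
 */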
2339 
2340 /*
2341  * A stub while both regular and pcpu cases are identical.
2342  */
2343 void
2344 uma_zfree_pcpu_arg(uma_zone_t zone, void *item, void *udata)
2345 {
2346 
2347 #ifdef SMP
2348 	MPASS(zone->uz_flags & UMA_ZONE_PCPU);
2349 #endif
2350 	uma_zfree_arg(zone, item, udata);
2351 }
2352 
2353 /* See uma.h */
2354 void *
2355 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
2356 {
2357 	uma_zone_domain_t zdom;
2358 	uma_bucket_t bucket;
2359 	uma_cache_t cache;
2360 	void *item;
2361 	int cpu, domain, lockfail;
2362 #ifdef INVARIANTS
2363 	bool skipdbg;
2364 #endif
2365 
2366 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2367 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2368 
2369 	/* This is the fast path allocation */
2370 	CTR4(KTR_UMA, "uma_zalloc_arg thread %x zone %s(%p) flags %d",
2371 	    curthread, zone->uz_name, zone, flags);
2372 
2373 	if (flags & M_WAITOK) {
2374 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2375 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2376 	}
2377 	KASSERT((flags & M_EXEC) == 0, ("uma_zalloc_arg: called with M_EXEC"));
2378 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2379 	    ("uma_zalloc_arg: called with spinlock or critical section held"));
2380 	if (zone->uz_flags & UMA_ZONE_PCPU)
2381 		KASSERT((flags & M_ZERO) == 0, ("allocating from a pcpu zone "
2382 		    "with M_ZERO passed"));
2383 
2384 #ifdef DEBUG_MEMGUARD
2385 	if (memguard_cmp_zone(zone)) {
2386 		item = memguard_alloc(zone->uz_size, flags);
2387 		if (item != NULL) {
2388 			if (zone->uz_init != NULL &&
2389 			    zone->uz_init(item, zone->uz_size, flags) != 0)
2390 				return (NULL);
2391 			if (zone->uz_ctor != NULL &&
2392 			    zone->uz_ctor(item, zone->uz_size, udata,
2393 			    flags) != 0) {
2394 			    	zone->uz_fini(item, zone->uz_size);
2395 				return (NULL);
2396 			}
2397 			return (item);
2398 		}
2399 		/* This is unfortunate but should not be fatal. */
2400 	}
2401 #endif
2402 	/*
2403 	 * If possible, allocate from the per-CPU cache.  There are two
2404 	 * requirements for safe access to the per-CPU cache: (1) the thread
2405 	 * accessing the cache must not be preempted or yield during access,
2406 	 * and (2) the thread must not migrate CPUs without switching which
2407 	 * cache it accesses.  We rely on a critical section to prevent
2408 	 * preemption and migration.  We release the critical section in
2409 	 * order to acquire the zone mutex if we are unable to allocate from
2410 	 * the current cache; when we re-acquire the critical section, we
2411 	 * must detect and handle migration if it has occurred.
2412 	 */
2413 	critical_enter();
2414 	cpu = curcpu;
2415 	cache = &zone->uz_cpu[cpu];
2416 
2417 zalloc_start:
2418 	bucket = cache->uc_allocbucket;
2419 	if (bucket != NULL && bucket->ub_cnt > 0) {
2420 		bucket->ub_cnt--;
2421 		item = bucket->ub_bucket[bucket->ub_cnt];
2422 #ifdef INVARIANTS
2423 		bucket->ub_bucket[bucket->ub_cnt] = NULL;
2424 #endif
2425 		KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2426 		cache->uc_allocs++;
2427 		critical_exit();
2428 #ifdef INVARIANTS
2429 		skipdbg = uma_dbg_zskip(zone, item);
2430 #endif
2431 		if (zone->uz_ctor != NULL &&
2432 #ifdef INVARIANTS
2433 		    (!skipdbg || zone->uz_ctor != trash_ctor ||
2434 		    zone->uz_dtor != trash_dtor) &&
2435 #endif
2436 		    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2437 			atomic_add_long(&zone->uz_fails, 1);
2438 			zone_free_item(zone, item, udata, SKIP_DTOR);
2439 			return (NULL);
2440 		}
2441 #ifdef INVARIANTS
2442 		if (!skipdbg)
2443 			uma_dbg_alloc(zone, NULL, item);
2444 #endif
2445 		if (flags & M_ZERO)
2446 			uma_zero_item(item, zone);
2447 		return (item);
2448 	}
2449 
2450 	/*
2451 	 * We have run out of items in our alloc bucket.
2452 	 * See if we can switch with our free bucket.
2453 	 */
2454 	bucket = cache->uc_freebucket;
2455 	if (bucket != NULL && bucket->ub_cnt > 0) {
2456 		CTR2(KTR_UMA,
2457 		    "uma_zalloc: zone %s(%p) swapping empty with alloc",
2458 		    zone->uz_name, zone);
2459 		cache->uc_freebucket = cache->uc_allocbucket;
2460 		cache->uc_allocbucket = bucket;
2461 		goto zalloc_start;
2462 	}
2463 
2464 	/*
2465 	 * Discard any empty allocation bucket while we hold no locks.
2466 	 */
2467 	bucket = cache->uc_allocbucket;
2468 	cache->uc_allocbucket = NULL;
2469 	critical_exit();
2470 	if (bucket != NULL)
2471 		bucket_free(zone, bucket, udata);
2472 
2473 	if (zone->uz_flags & UMA_ZONE_NUMA) {
2474 		domain = PCPU_GET(domain);
2475 		if (VM_DOMAIN_EMPTY(domain))
2476 			domain = UMA_ANYDOMAIN;
2477 	} else
2478 		domain = UMA_ANYDOMAIN;
2479 
2480 	/* Short-circuit for zones without buckets and low memory. */
2481 	if (zone->uz_count == 0 || bucketdisable)
2482 		goto zalloc_item;
2483 
2484 	/*
2485 	 * The attempt to retrieve the item from the per-CPU cache failed, so
2486 	 * we must go back to the zone.  This requires the zone lock, so we
2487 	 * must drop the critical section, then re-acquire it when we go back
2488 	 * to the cache.  Since the critical section is released, we may be
2489 	 * preempted or migrate.  As such, make sure not to maintain any
2490 	 * thread-local state specific to the cache from prior to releasing
2491 	 * the critical section.
2492 	 */
2493 	lockfail = 0;
2494 	if (ZONE_TRYLOCK(zone) == 0) {
2495 		/* Record contention to size the buckets. */
2496 		ZONE_LOCK(zone);
2497 		lockfail = 1;
2498 	}
2499 	critical_enter();
2500 	cpu = curcpu;
2501 	cache = &zone->uz_cpu[cpu];
2502 
2503 	/* See if we lost the race to fill the cache. */
2504 	if (cache->uc_allocbucket != NULL) {
2505 		ZONE_UNLOCK(zone);
2506 		goto zalloc_start;
2507 	}
2508 
2509 	/*
2510 	 * Check the zone's cache of buckets.
2511 	 */
2512 	if (domain == UMA_ANYDOMAIN)
2513 		zdom = &zone->uz_domain[0];
2514 	else
2515 		zdom = &zone->uz_domain[domain];
2516 	if ((bucket = LIST_FIRST(&zdom->uzd_buckets)) != NULL) {
2517 		KASSERT(bucket->ub_cnt != 0,
2518 		    ("uma_zalloc_arg: Returning an empty bucket."));
2519 
2520 		LIST_REMOVE(bucket, ub_link);
2521 		cache->uc_allocbucket = bucket;
2522 		ZONE_UNLOCK(zone);
2523 		goto zalloc_start;
2524 	}
2525 	/* We are no longer associated with this CPU. */
2526 	critical_exit();
2527 
2528 	/*
2529 	 * We bump the uz count when the cache size is insufficient to
2530 	 * handle the working set.
2531 	 */
2532 	if (lockfail && zone->uz_count < BUCKET_MAX)
2533 		zone->uz_count++;
2534 	ZONE_UNLOCK(zone);
2535 
2536 	/*
2537 	 * Now let's just fill a bucket and put it on the free list.  If that
2538 	 * works we'll restart the allocation from the beginning, and it
2539 	 * will use the just-filled bucket.
2540 	 */
2541 	bucket = zone_alloc_bucket(zone, udata, domain, flags);
2542 	CTR3(KTR_UMA, "uma_zalloc: zone %s(%p) bucket zone returned %p",
2543 	    zone->uz_name, zone, bucket);
2544 	if (bucket != NULL) {
2545 		ZONE_LOCK(zone);
2546 		critical_enter();
2547 		cpu = curcpu;
2548 		cache = &zone->uz_cpu[cpu];
2549 		/*
2550 		 * See if we lost the race or were migrated.  Cache the
2551 		 * initialized bucket to make this less likely or claim
2552 		 * the memory directly.
2553 		 */
2554 		if (cache->uc_allocbucket != NULL ||
2555 		    (zone->uz_flags & UMA_ZONE_NUMA &&
2556 		    domain != PCPU_GET(domain)))
2557 			LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
2558 		else
2559 			cache->uc_allocbucket = bucket;
2560 		ZONE_UNLOCK(zone);
2561 		goto zalloc_start;
2562 	}
2563 
2564 	/*
2565 	 * We may not be able to get a bucket so return an actual item.
2566 	 */
2567 zalloc_item:
2568 	item = zone_alloc_item(zone, udata, domain, flags);
2569 
2570 	return (item);
2571 }
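
/*
 * Illustrative sketch of the caller-visible contract (hypothetical names):
 * with M_NOWAIT the path above may return NULL once the per-CPU buckets,
 * the zone's bucket cache and the keg are all exhausted, so callers must
 * be prepared for failure:
 *
 *	fp = uma_zalloc(foo_zone, M_NOWAIT);
 *	if (fp == NULL)
 *		return (ENOBUFS);
 *
 * or fall back to a slower path and retry later.
 */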
2572 
2573 void *
2574 uma_zalloc_domain(uma_zone_t zone, void *udata, int domain, int flags)
2575 {
2576 
2577 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
2578 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
2579 
2580 	/* This is the fast path allocation */
2581 	CTR5(KTR_UMA,
2582 	    "uma_zalloc_domain thread %x zone %s(%p) domain %d flags %d",
2583 	    curthread, zone->uz_name, zone, domain, flags);
2584 
2585 	if (flags & M_WAITOK) {
2586 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2587 		    "uma_zalloc_domain: zone \"%s\"", zone->uz_name);
2588 	}
2589 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
2590 	    ("uma_zalloc_domain: called with spinlock or critical section held"));
2591 
2592 	return (zone_alloc_item(zone, udata, domain, flags));
2593 }
2594 
2595 /*
2596  * Find a slab with some space.  Prefer slabs that are partially used over
2597  * those that are completely free.  This helps to reduce fragmentation.
2598  *
2599  * If 'rr' is 1, search all domains starting from 'domain'.  Otherwise check
2600  * only 'domain'.
2601  */
2602 static uma_slab_t
2603 keg_first_slab(uma_keg_t keg, int domain, int rr)
2604 {
2605 	uma_domain_t dom;
2606 	uma_slab_t slab;
2607 	int start;
2608 
2609 	KASSERT(domain >= 0 && domain < vm_ndomains,
2610 	    ("keg_first_slab: domain %d out of range", domain));
2611 
2612 	slab = NULL;
2613 	start = domain;
2614 	do {
2615 		dom = &keg->uk_domain[domain];
2616 		if (!LIST_EMPTY(&dom->ud_part_slab))
2617 			return (LIST_FIRST(&dom->ud_part_slab));
2618 		if (!LIST_EMPTY(&dom->ud_free_slab)) {
2619 			slab = LIST_FIRST(&dom->ud_free_slab);
2620 			LIST_REMOVE(slab, us_link);
2621 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2622 			return (slab);
2623 		}
2624 		if (rr)
2625 			domain = (domain + 1) % vm_ndomains;
2626 	} while (domain != start);
2627 
2628 	return (NULL);
2629 }
2630 
2631 static uma_slab_t
2632 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int rdomain, int flags)
2633 {
2634 	uma_domain_t dom;
2635 	uma_slab_t slab;
2636 	int allocflags, domain, reserve, rr, start;
2637 
2638 	mtx_assert(&keg->uk_lock, MA_OWNED);
2639 	slab = NULL;
2640 	reserve = 0;
2641 	allocflags = flags;
2642 	if ((flags & M_USE_RESERVE) == 0)
2643 		reserve = keg->uk_reserve;
2644 
2645 	/*
2646 	 * Round-robin for non-first-touch zones when there is more than one
2647 	 * domain.
2648 	 */
2649 	if (vm_ndomains == 1)
2650 		rdomain = 0;
2651 	rr = rdomain == UMA_ANYDOMAIN;
2652 	if (rr) {
2653 		start = keg->uk_cursor;
2654 		do {
2655 			keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
2656 			domain = keg->uk_cursor;
2657 		} while (VM_DOMAIN_EMPTY(domain) && domain != start);
2658 		domain = start = keg->uk_cursor;
2659 		/* Only block on the second pass. */
2660 		if ((flags & (M_WAITOK | M_NOVM)) == M_WAITOK)
2661 			allocflags = (allocflags & ~M_WAITOK) | M_NOWAIT;
2662 	} else
2663 		domain = start = rdomain;
2664 
2665 again:
2666 	do {
2667 		if (keg->uk_free > reserve &&
2668 		    (slab = keg_first_slab(keg, domain, rr)) != NULL) {
2669 			MPASS(slab->us_keg == keg);
2670 			return (slab);
2671 		}
2672 
2673 		/*
2674 		 * M_NOVM means don't ask at all!
2675 		 */
2676 		if (flags & M_NOVM)
2677 			break;
2678 
2679 		if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
2680 			keg->uk_flags |= UMA_ZFLAG_FULL;
2681 			/*
2682 			 * If this is not a multi-zone, set the FULL bit.
2683 			 * Otherwise slab_multi() takes care of it.
2684 			 */
2685 			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
2686 				zone->uz_flags |= UMA_ZFLAG_FULL;
2687 				zone_log_warning(zone);
2688 				zone_maxaction(zone);
2689 			}
2690 			if (flags & M_NOWAIT)
2691 				return (NULL);
2692 			zone->uz_sleeps++;
2693 			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
2694 			continue;
2695 		}
2696 		slab = keg_alloc_slab(keg, zone, domain, allocflags);
2697 		/*
2698 		 * If we got a slab here it's safe to mark it partially used
2699 		 * and return.  We assume that the caller is going to remove
2700 		 * at least one item.
2701 		 */
2702 		if (slab) {
2703 			MPASS(slab->us_keg == keg);
2704 			dom = &keg->uk_domain[slab->us_domain];
2705 			LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
2706 			return (slab);
2707 		}
2708 		if (rr) {
2709 			do {
2710 				domain = (domain + 1) % vm_ndomains;
2711 			} while (VM_DOMAIN_EMPTY(domain) && domain != start);
2712 		}
2713 	} while (domain != start);
2714 
2715 	/* Retry domain scan with blocking. */
2716 	if (allocflags != flags) {
2717 		allocflags = flags;
2718 		goto again;
2719 	}
2720 
2721 	/*
2722 	 * We might not have been able to get a slab, but another CPU
2723 	 * could have while we were unlocked.  Check again before we
2724 	 * fail.
2725 	 */
2726 	if (keg->uk_free > reserve &&
2727 	    (slab = keg_first_slab(keg, domain, rr)) != NULL) {
2728 		MPASS(slab->us_keg == keg);
2729 		return (slab);
2730 	}
2731 	return (NULL);
2732 }
2733 
2734 static uma_slab_t
2735 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int domain, int flags)
2736 {
2737 	uma_slab_t slab;
2738 
2739 	if (keg == NULL) {
2740 		keg = zone_first_keg(zone);
2741 		KEG_LOCK(keg);
2742 	}
2743 
2744 	for (;;) {
2745 		slab = keg_fetch_slab(keg, zone, domain, flags);
2746 		if (slab)
2747 			return (slab);
2748 		if (flags & (M_NOWAIT | M_NOVM))
2749 			break;
2750 	}
2751 	KEG_UNLOCK(keg);
2752 	return (NULL);
2753 }
2754 
2755 /*
2756  * zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
2757  * with the keg locked.  If NULL is returned, no lock is held.
2758  *
2759  * The last pointer is used to seed the search.  It is not required.
2760  */
2761 static uma_slab_t
2762 zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int domain, int rflags)
2763 {
2764 	uma_klink_t klink;
2765 	uma_slab_t slab;
2766 	uma_keg_t keg;
2767 	int flags;
2768 	int empty;
2769 	int full;
2770 
2771 	/*
2772 	 * Don't wait on the first pass.  This will skip limit tests
2773 	 * as well.  We don't want to block if we can find a provider
2774 	 * without blocking.
2775 	 */
2776 	flags = (rflags & ~M_WAITOK) | M_NOWAIT;
2777 	/*
2778 	 * Use the last slab allocated as a hint for where to start
2779 	 * the search.
2780 	 */
2781 	if (last != NULL) {
2782 		slab = keg_fetch_slab(last, zone, domain, flags);
2783 		if (slab)
2784 			return (slab);
2785 		KEG_UNLOCK(last);
2786 	}
2787 	/*
2788 	 * Loop until we have a slab, in case of transient failures
2789 	 * while M_WAITOK is specified.  I'm not sure this is 100%
2790 	 * required, but we've done it this way for so long now.
2791 	 */
2792 	for (;;) {
2793 		empty = 0;
2794 		full = 0;
2795 		/*
2796 		 * Search the available kegs for slabs.  Be careful to hold the
2797 		 * correct lock while calling into the keg layer.
2798 		 */
2799 		LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
2800 			keg = klink->kl_keg;
2801 			KEG_LOCK(keg);
2802 			if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
2803 				slab = keg_fetch_slab(keg, zone, domain, flags);
2804 				if (slab)
2805 					return (slab);
2806 			}
2807 			if (keg->uk_flags & UMA_ZFLAG_FULL)
2808 				full++;
2809 			else
2810 				empty++;
2811 			KEG_UNLOCK(keg);
2812 		}
2813 		if (rflags & (M_NOWAIT | M_NOVM))
2814 			break;
2815 		flags = rflags;
2816 		/*
2817 		 * All kegs are full.  XXX We can't atomically check all kegs
2818 		 * and sleep so just sleep for a short period and retry.
2819 		 */
2820 		if (full && !empty) {
2821 			ZONE_LOCK(zone);
2822 			zone->uz_flags |= UMA_ZFLAG_FULL;
2823 			zone->uz_sleeps++;
2824 			zone_log_warning(zone);
2825 			zone_maxaction(zone);
2826 			msleep(zone, zone->uz_lockptr, PVM,
2827 			    "zonelimit", hz/100);
2828 			zone->uz_flags &= ~UMA_ZFLAG_FULL;
2829 			ZONE_UNLOCK(zone);
2830 			continue;
2831 		}
2832 	}
2833 	return (NULL);
2834 }
2835 
2836 static void *
2837 slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
2838 {
2839 	uma_domain_t dom;
2840 	void *item;
2841 	uint8_t freei;
2842 
2843 	MPASS(keg == slab->us_keg);
2844 	mtx_assert(&keg->uk_lock, MA_OWNED);
2845 
2846 	freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
2847 	BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
2848 	item = slab->us_data + (keg->uk_rsize * freei);
2849 	slab->us_freecount--;
2850 	keg->uk_free--;
2851 
2852 	/* Move this slab to the full list */
2853 	if (slab->us_freecount == 0) {
2854 		LIST_REMOVE(slab, us_link);
2855 		dom = &keg->uk_domain[slab->us_domain];
2856 		LIST_INSERT_HEAD(&dom->ud_full_slab, slab, us_link);
2857 	}
2858 
2859 	return (item);
2860 }
2861 
2862 static int
2863 zone_import(uma_zone_t zone, void **bucket, int max, int domain, int flags)
2864 {
2865 	uma_slab_t slab;
2866 	uma_keg_t keg;
2867 #ifdef NUMA
2868 	int stripe;
2869 #endif
2870 	int i;
2871 
2872 	slab = NULL;
2873 	keg = NULL;
2874 	/* Try to keep the buckets totally full */
2875 	for (i = 0; i < max; ) {
2876 		if ((slab = zone->uz_slab(zone, keg, domain, flags)) == NULL)
2877 			break;
2878 		keg = slab->us_keg;
2879 #ifdef NUMA
2880 		stripe = howmany(max, vm_ndomains);
2881 #endif
2882 		while (slab->us_freecount && i < max) {
2883 			bucket[i++] = slab_alloc_item(keg, slab);
2884 			if (keg->uk_free <= keg->uk_reserve)
2885 				break;
2886 #ifdef NUMA
2887 			/*
2888 			 * If the zone is striped we pick a new slab for every
2889 			 * N allocations.  Eliminating this conditional will
2890 			 * instead pick a new domain for each bucket rather
2891 			 * than stripe within each bucket.  The current option
2892 			 * produces more fragmentation and requires more CPU
2893 			 * time but yields better distribution.
2894 			 */
2895 			if ((zone->uz_flags & UMA_ZONE_NUMA) == 0 &&
2896 			    vm_ndomains > 1 && --stripe == 0)
2897 				break;
2898 #endif
2899 		}
2900 		/* Don't block if we allocated any successfully. */
2901 		flags &= ~M_WAITOK;
2902 		flags |= M_NOWAIT;
2903 	}
2904 	if (slab != NULL)
2905 		KEG_UNLOCK(keg);
2906 
2907 	return (i);
2908 }
2909 
2910 static uma_bucket_t
2911 zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags)
2912 {
2913 	uma_bucket_t bucket;
2914 	int max;
2915 
2916 	CTR1(KTR_UMA, "zone_alloc_bucket: domain %d", domain);
2917 
2918 	/* Don't wait for buckets, preserve caller's NOVM setting. */
2919 	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
2920 	if (bucket == NULL)
2921 		return (NULL);
2922 
2923 	max = MIN(bucket->ub_entries, zone->uz_count);
2924 	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
2925 	    max, domain, flags);
2926 
2927 	/*
2928 	 * Initialize the memory if necessary.
2929 	 */
2930 	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
2931 		int i;
2932 
2933 		for (i = 0; i < bucket->ub_cnt; i++)
2934 			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2935 			    flags) != 0)
2936 				break;
2937 		/*
2938 		 * If we couldn't initialize the whole bucket, put the
2939 		 * rest back onto the freelist.
2940 		 */
2941 		if (i != bucket->ub_cnt) {
2942 			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
2943 			    bucket->ub_cnt - i);
2944 #ifdef INVARIANTS
2945 			bzero(&bucket->ub_bucket[i],
2946 			    sizeof(void *) * (bucket->ub_cnt - i));
2947 #endif
2948 			bucket->ub_cnt = i;
2949 		}
2950 	}
2951 
2952 	if (bucket->ub_cnt == 0) {
2953 		bucket_free(zone, bucket, udata);
2954 		atomic_add_long(&zone->uz_fails, 1);
2955 		return (NULL);
2956 	}
2957 
2958 	return (bucket);
2959 }
2960 
2961 /*
2962  * Allocates a single item from a zone.
2963  *
2964  * Arguments
2965  *	zone   The zone to alloc for.
2966  *	udata  The data to be passed to the constructor.
2967  *	domain The domain to allocate from or UMA_ANYDOMAIN.
2968  *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
2969  *
2970  * Returns
2971  *	NULL if there is no memory and M_NOWAIT is set
2972  *	An item if successful
2973  */
2974 
2975 static void *
2976 zone_alloc_item(uma_zone_t zone, void *udata, int domain, int flags)
2977 {
2978 	void *item;
2979 #ifdef INVARIANTS
2980 	bool skipdbg;
2981 #endif
2982 
2983 	item = NULL;
2984 
2985 	if (domain != UMA_ANYDOMAIN) {
2986 		/* avoid allocs targeting empty domains */
2987 		if (VM_DOMAIN_EMPTY(domain))
2988 			domain = UMA_ANYDOMAIN;
2989 	}
2990 	if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
2991 		goto fail;
2992 	atomic_add_long(&zone->uz_allocs, 1);
2993 
2994 #ifdef INVARIANTS
2995 	skipdbg = uma_dbg_zskip(zone, item);
2996 #endif
2997 	/*
2998 	 * We have to call both the zone's init (not the keg's init)
2999 	 * and the zone's ctor.  This is because the item is going from
3000 	 * a keg slab directly to the user, and the user is expecting it
3001 	 * to be both zone-init'd as well as zone-ctor'd.
3002 	 */
3003 	if (zone->uz_init != NULL) {
3004 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
3005 			zone_free_item(zone, item, udata, SKIP_FINI);
3006 			goto fail;
3007 		}
3008 	}
3009 	if (zone->uz_ctor != NULL &&
3010 #ifdef INVARIANTS
3011 	    (!skipdbg || zone->uz_ctor != trash_ctor ||
3012 	    zone->uz_dtor != trash_dtor) &&
3013 #endif
3014 	    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
3015 		zone_free_item(zone, item, udata, SKIP_DTOR);
3016 		goto fail;
3017 	}
3018 #ifdef INVARIANTS
3019 	if (!skipdbg)
3020 		uma_dbg_alloc(zone, NULL, item);
3021 #endif
3022 	if (flags & M_ZERO)
3023 		uma_zero_item(item, zone);
3024 
3025 	CTR3(KTR_UMA, "zone_alloc_item item %p from %s(%p)", item,
3026 	    zone->uz_name, zone);
3027 
3028 	return (item);
3029 
3030 fail:
3031 	CTR2(KTR_UMA, "zone_alloc_item failed from %s(%p)",
3032 	    zone->uz_name, zone);
3033 	atomic_add_long(&zone->uz_fails, 1);
3034 	return (NULL);
3035 }
3036 
3037 /* See uma.h */
3038 void
3039 uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
3040 {
3041 	uma_cache_t cache;
3042 	uma_bucket_t bucket;
3043 	uma_zone_domain_t zdom;
3044 	int cpu, domain, lockfail;
3045 #ifdef INVARIANTS
3046 	bool skipdbg;
3047 #endif
3048 
3049 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3050 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3051 
3052 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
3053 	    zone->uz_name);
3054 
3055 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3056 	    ("uma_zfree_arg: called with spinlock or critical section held"));
3057 
3058 	/* uma_zfree(..., NULL) does nothing, to match free(9). */
3059 	if (item == NULL)
3060 		return;
3061 #ifdef DEBUG_MEMGUARD
3062 	if (is_memguard_addr(item)) {
3063 		if (zone->uz_dtor != NULL)
3064 			zone->uz_dtor(item, zone->uz_size, udata);
3065 		if (zone->uz_fini != NULL)
3066 			zone->uz_fini(item, zone->uz_size);
3067 		memguard_free(item);
3068 		return;
3069 	}
3070 #endif
3071 #ifdef INVARIANTS
3072 	skipdbg = uma_dbg_zskip(zone, item);
3073 	if (skipdbg == false) {
3074 		if (zone->uz_flags & UMA_ZONE_MALLOC)
3075 			uma_dbg_free(zone, udata, item);
3076 		else
3077 			uma_dbg_free(zone, NULL, item);
3078 	}
3079 	if (zone->uz_dtor != NULL && (!skipdbg ||
3080 	    zone->uz_dtor != trash_dtor || zone->uz_ctor != trash_ctor))
3081 #else
3082 	if (zone->uz_dtor != NULL)
3083 #endif
3084 		zone->uz_dtor(item, zone->uz_size, udata);
3085 
3086 	/*
3087 	 * The race here is acceptable.  If we miss it we'll just have to wait
3088 	 * a little longer for the limits to be reset.
3089 	 */
3090 	if (zone->uz_flags & UMA_ZFLAG_FULL)
3091 		goto zfree_item;
3092 
3093 	/*
3094 	 * If possible, free to the per-CPU cache.  There are two
3095 	 * requirements for safe access to the per-CPU cache: (1) the thread
3096 	 * accessing the cache must not be preempted or yield during access,
3097 	 * and (2) the thread must not migrate CPUs without switching which
3098 	 * cache it accesses.  We rely on a critical section to prevent
3099 	 * preemption and migration.  We release the critical section in
3100 	 * order to acquire the zone mutex if we are unable to free to the
3101 	 * current cache; when we re-acquire the critical section, we must
3102 	 * detect and handle migration if it has occurred.
3103 	 */
3104 zfree_restart:
3105 	critical_enter();
3106 	cpu = curcpu;
3107 	cache = &zone->uz_cpu[cpu];
3108 
3109 zfree_start:
3110 	/*
3111 	 * Try to free into the allocbucket first to give LIFO ordering
3112 	 * for cache-hot data structures.  Spill over into the freebucket
3113 	 * if necessary.  Alloc will swap them if one runs dry.
3114 	 */
3115 	bucket = cache->uc_allocbucket;
3116 	if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
3117 		bucket = cache->uc_freebucket;
3118 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3119 		KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
3120 		    ("uma_zfree: Freeing to non free bucket index."));
3121 		bucket->ub_bucket[bucket->ub_cnt] = item;
3122 		bucket->ub_cnt++;
3123 		cache->uc_frees++;
3124 		critical_exit();
3125 		return;
3126 	}
3127 
3128 	/*
3129 	 * We must go back to the zone, which requires acquiring the zone lock,
3130 	 * which in turn means we must release and re-acquire the critical
3131 	 * section.  Since the critical section is released, we may be
3132 	 * preempted or migrate.  As such, make sure not to maintain any
3133 	 * thread-local state specific to the cache from prior to releasing
3134 	 * the critical section.
3135 	 */
3136 	critical_exit();
3137 	if (zone->uz_count == 0 || bucketdisable)
3138 		goto zfree_item;
3139 
3140 	lockfail = 0;
3141 	if (ZONE_TRYLOCK(zone) == 0) {
3142 		/* Record contention to size the buckets. */
3143 		ZONE_LOCK(zone);
3144 		lockfail = 1;
3145 	}
3146 	critical_enter();
3147 	cpu = curcpu;
3148 	cache = &zone->uz_cpu[cpu];
3149 
3150 	bucket = cache->uc_freebucket;
3151 	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
3152 		ZONE_UNLOCK(zone);
3153 		goto zfree_start;
3154 	}
3155 	cache->uc_freebucket = NULL;
3156 	/* We are no longer associated with this CPU. */
3157 	critical_exit();
3158 
3159 	if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
3160 		domain = PCPU_GET(domain);
3161 		if (VM_DOMAIN_EMPTY(domain))
3162 			domain = UMA_ANYDOMAIN;
3163 	} else
3164 		domain = 0;
3165 	zdom = &zone->uz_domain[0];
3166 
3167 	/* Can we throw this on the zone full list? */
3168 	if (bucket != NULL) {
3169 		CTR3(KTR_UMA,
3170 		    "uma_zfree: zone %s(%p) putting bucket %p on free list",
3171 		    zone->uz_name, zone, bucket);
3172 		/* ub_cnt is pointing to the last free item */
3173 		KASSERT(bucket->ub_cnt != 0,
3174 		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
3175 		if ((zone->uz_flags & UMA_ZONE_NOBUCKETCACHE) != 0) {
3176 			ZONE_UNLOCK(zone);
3177 			bucket_drain(zone, bucket);
3178 			bucket_free(zone, bucket, udata);
3179 			goto zfree_restart;
3180 		} else
3181 			LIST_INSERT_HEAD(&zdom->uzd_buckets, bucket, ub_link);
3182 	}
3183 
3184 	/*
3185 	 * We bump the uz count when the cache size is insufficient to
3186 	 * handle the working set.
3187 	 */
3188 	if (lockfail && zone->uz_count < BUCKET_MAX)
3189 		zone->uz_count++;
3190 	ZONE_UNLOCK(zone);
3191 
3192 	bucket = bucket_alloc(zone, udata, M_NOWAIT);
3193 	CTR3(KTR_UMA, "uma_zfree: zone %s(%p) allocated bucket %p",
3194 	    zone->uz_name, zone, bucket);
3195 	if (bucket) {
3196 		critical_enter();
3197 		cpu = curcpu;
3198 		cache = &zone->uz_cpu[cpu];
3199 		if (cache->uc_freebucket == NULL &&
3200 		    ((zone->uz_flags & UMA_ZONE_NUMA) == 0 ||
3201 		    domain == PCPU_GET(domain))) {
3202 			cache->uc_freebucket = bucket;
3203 			goto zfree_start;
3204 		}
3205 		/*
3206 		 * We lost the race, start over.  We have to drop our
3207 		 * critical section to free the bucket.
3208 		 */
3209 		critical_exit();
3210 		bucket_free(zone, bucket, udata);
3211 		goto zfree_restart;
3212 	}
3213 
3214 	/*
3215 	 * If nothing else caught this, we'll just do an internal free.
3216 	 */
3217 zfree_item:
3218 	zone_free_item(zone, item, udata, SKIP_DTOR);
3219 
3220 	return;
3221 }
3222 
3223 void
3224 uma_zfree_domain(uma_zone_t zone, void *item, void *udata)
3225 {
3226 
3227 	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
3228 	random_harvest_fast_uma(&zone, sizeof(zone), RANDOM_UMA);
3229 
3230 	CTR2(KTR_UMA, "uma_zfree_domain thread %x zone %s", curthread,
3231 	    zone->uz_name);
3232 
3233 	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
3234 	    ("uma_zfree_domain: called with spinlock or critical section held"));
3235 
3236 	/* uma_zfree(..., NULL) does nothing, to match free(9). */
3237 	if (item == NULL)
3238 		return;
3239 	zone_free_item(zone, item, udata, SKIP_NONE);
3240 }
3241 
3242 static void
3243 slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
3244 {
3245 	uma_domain_t dom;
3246 	uint8_t freei;
3247 
3248 	mtx_assert(&keg->uk_lock, MA_OWNED);
3249 	MPASS(keg == slab->us_keg);
3250 
3251 	dom = &keg->uk_domain[slab->us_domain];
3252 
3253 	/* Do we need to remove from any lists? */
3254 	if (slab->us_freecount+1 == keg->uk_ipers) {
3255 		LIST_REMOVE(slab, us_link);
3256 		LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3257 	} else if (slab->us_freecount == 0) {
3258 		LIST_REMOVE(slab, us_link);
3259 		LIST_INSERT_HEAD(&dom->ud_part_slab, slab, us_link);
3260 	}
3261 
3262 	/* Slab management. */
3263 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3264 	BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
3265 	slab->us_freecount++;
3266 
3267 	/* Keg statistics. */
3268 	keg->uk_free++;
3269 }
3270 
3271 static void
3272 zone_release(uma_zone_t zone, void **bucket, int cnt)
3273 {
3274 	void *item;
3275 	uma_slab_t slab;
3276 	uma_keg_t keg;
3277 	uint8_t *mem;
3278 	int clearfull;
3279 	int i;
3280 
3281 	clearfull = 0;
3282 	keg = zone_first_keg(zone);
3283 	KEG_LOCK(keg);
3284 	for (i = 0; i < cnt; i++) {
3285 		item = bucket[i];
3286 		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
3287 			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
3288 			if (zone->uz_flags & UMA_ZONE_HASH) {
3289 				slab = hash_sfind(&keg->uk_hash, mem);
3290 			} else {
3291 				mem += keg->uk_pgoff;
3292 				slab = (uma_slab_t)mem;
3293 			}
3294 		} else {
3295 			slab = vtoslab((vm_offset_t)item);
3296 			if (slab->us_keg != keg) {
3297 				KEG_UNLOCK(keg);
3298 				keg = slab->us_keg;
3299 				KEG_LOCK(keg);
3300 			}
3301 		}
3302 		slab_free_item(keg, slab, item);
3303 		if (keg->uk_flags & UMA_ZFLAG_FULL) {
3304 			if (keg->uk_pages < keg->uk_maxpages) {
3305 				keg->uk_flags &= ~UMA_ZFLAG_FULL;
3306 				clearfull = 1;
3307 			}
3308 
3309 			/*
3310 			 * We can handle one more allocation. Since we're
3311 			 * clearing ZFLAG_FULL, wake up all procs blocked
3312 			 * on pages. This should be uncommon, so keeping this
3313 			 * simple for now (rather than adding count of blocked
3314 			 * threads etc).
3315 			 */
3316 			wakeup(keg);
3317 		}
3318 	}
3319 	KEG_UNLOCK(keg);
3320 	if (clearfull) {
3321 		ZONE_LOCK(zone);
3322 		zone->uz_flags &= ~UMA_ZFLAG_FULL;
3323 		wakeup(zone);
3324 		ZONE_UNLOCK(zone);
3325 	}
3326 
3327 }
3328 
3329 /*
3330  * Frees a single item to any zone.
3331  *
3332  * Arguments:
3333  *	zone   The zone to free to
3334  *	item   The item we're freeing
3335  *	udata  User supplied data for the dtor
3336  *	skip   Skip dtors and finis
3337  */
3338 static void
3339 zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
3340 {
3341 #ifdef INVARIANTS
3342 	bool skipdbg;
3343 
3344 	skipdbg = uma_dbg_zskip(zone, item);
3345 	if (skip == SKIP_NONE && !skipdbg) {
3346 		if (zone->uz_flags & UMA_ZONE_MALLOC)
3347 			uma_dbg_free(zone, udata, item);
3348 		else
3349 			uma_dbg_free(zone, NULL, item);
3350 	}
3351 
3352 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL &&
3353 	    (!skipdbg || zone->uz_dtor != trash_dtor ||
3354 	    zone->uz_ctor != trash_ctor))
3355 #else
3356 	if (skip < SKIP_DTOR && zone->uz_dtor != NULL)
3357 #endif
3358 		zone->uz_dtor(item, zone->uz_size, udata);
3359 
3360 	if (skip < SKIP_FINI && zone->uz_fini)
3361 		zone->uz_fini(item, zone->uz_size);
3362 
3363 	atomic_add_long(&zone->uz_frees, 1);
3364 	zone->uz_release(zone->uz_arg, &item, 1);
3365 }
3366 
3367 /* See uma.h */
3368 int
3369 uma_zone_set_max(uma_zone_t zone, int nitems)
3370 {
3371 	uma_keg_t keg;
3372 
3373 	keg = zone_first_keg(zone);
3374 	if (keg == NULL)
3375 		return (0);
3376 	KEG_LOCK(keg);
3377 	keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
3378 	if (keg->uk_maxpages * keg->uk_ipers < nitems)
3379 		keg->uk_maxpages += keg->uk_ppera;
3380 	nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
3381 	KEG_UNLOCK(keg);
3382 
3383 	return (nitems);
3384 }
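
/*
 * Worked example of the rounding above (the numbers are assumptions for
 * illustration only): with uk_ipers = 50 items per slab and uk_ppera = 1
 * page per slab, uma_zone_set_max(zone, 120) first computes uk_maxpages =
 * (120 / 50) * 1 = 2, notices that 2 * 50 < 120 and adds one more page,
 * and finally returns the effective limit (3 / 1) * 50 = 150 items.
 */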
3385 
3386 /* See uma.h */
3387 int
3388 uma_zone_get_max(uma_zone_t zone)
3389 {
3390 	int nitems;
3391 	uma_keg_t keg;
3392 
3393 	keg = zone_first_keg(zone);
3394 	if (keg == NULL)
3395 		return (0);
3396 	KEG_LOCK(keg);
3397 	nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
3398 	KEG_UNLOCK(keg);
3399 
3400 	return (nitems);
3401 }
3402 
3403 /* See uma.h */
3404 void
3405 uma_zone_set_warning(uma_zone_t zone, const char *warning)
3406 {
3407 
3408 	ZONE_LOCK(zone);
3409 	zone->uz_warning = warning;
3410 	ZONE_UNLOCK(zone);
3411 }
3412 
3413 /* See uma.h */
3414 void
3415 uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
3416 {
3417 
3418 	ZONE_LOCK(zone);
3419 	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
3420 	ZONE_UNLOCK(zone);
3421 }
3422 
3423 /* See uma.h */
3424 int
3425 uma_zone_get_cur(uma_zone_t zone)
3426 {
3427 	int64_t nitems;
3428 	u_int i;
3429 
3430 	ZONE_LOCK(zone);
3431 	nitems = zone->uz_allocs - zone->uz_frees;
3432 	CPU_FOREACH(i) {
3433 		/*
3434 		 * See the comment in sysctl_vm_zone_stats() regarding the
3435 		 * safety of accessing the per-cpu caches. With the zone lock
3436 		 * held, it is safe, but can potentially result in stale data.
3437 		 */
3438 		nitems += zone->uz_cpu[i].uc_allocs -
3439 		    zone->uz_cpu[i].uc_frees;
3440 	}
3441 	ZONE_UNLOCK(zone);
3442 
3443 	return (nitems < 0 ? 0 : nitems);
3444 }
3445 
3446 /* See uma.h */
3447 void
3448 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
3449 {
3450 	uma_keg_t keg;
3451 
3452 	keg = zone_first_keg(zone);
3453 	KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type"));
3454 	KEG_LOCK(keg);
3455 	KASSERT(keg->uk_pages == 0,
3456 	    ("uma_zone_set_init on non-empty keg"));
3457 	keg->uk_init = uminit;
3458 	KEG_UNLOCK(keg);
3459 }
3460 
3461 /* See uma.h */
3462 void
3463 uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
3464 {
3465 	uma_keg_t keg;
3466 
3467 	keg = zone_first_keg(zone);
3468 	KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type"));
3469 	KEG_LOCK(keg);
3470 	KASSERT(keg->uk_pages == 0,
3471 	    ("uma_zone_set_fini on non-empty keg"));
3472 	keg->uk_fini = fini;
3473 	KEG_UNLOCK(keg);
3474 }
3475 
3476 /* See uma.h */
3477 void
3478 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
3479 {
3480 
3481 	ZONE_LOCK(zone);
3482 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
3483 	    ("uma_zone_set_zinit on non-empty keg"));
3484 	zone->uz_init = zinit;
3485 	ZONE_UNLOCK(zone);
3486 }
3487 
3488 /* See uma.h */
3489 void
3490 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
3491 {
3492 
3493 	ZONE_LOCK(zone);
3494 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
3495 	    ("uma_zone_set_zfini on non-empty keg"));
3496 	zone->uz_fini = zfini;
3497 	ZONE_UNLOCK(zone);
3498 }
3499 
3500 /* See uma.h */
3501 /* XXX uk_freef is not actually used with the zone locked */
3502 void
3503 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
3504 {
3505 	uma_keg_t keg;
3506 
3507 	keg = zone_first_keg(zone);
3508 	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
3509 	KEG_LOCK(keg);
3510 	keg->uk_freef = freef;
3511 	KEG_UNLOCK(keg);
3512 }
3513 
3514 /* See uma.h */
3515 /* XXX uk_allocf is not actually used with the zone locked */
3516 void
3517 uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
3518 {
3519 	uma_keg_t keg;
3520 
3521 	keg = zone_first_keg(zone);
3522 	KEG_LOCK(keg);
3523 	keg->uk_allocf = allocf;
3524 	KEG_UNLOCK(keg);
3525 }
3526 
3527 /* See uma.h */
3528 void
3529 uma_zone_reserve(uma_zone_t zone, int items)
3530 {
3531 	uma_keg_t keg;
3532 
3533 	keg = zone_first_keg(zone);
3534 	if (keg == NULL)
3535 		return;
3536 	KEG_LOCK(keg);
3537 	keg->uk_reserve = items;
3538 	KEG_UNLOCK(keg);
3539 
3540 	return;
3541 }
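
/*
 * Illustrative sketch (the zone name is hypothetical): the reserve set here
 * is honored by keg_fetch_slab() above, so ordinary allocations will not
 * consume the last 'items' free items of the keg; only callers passing
 * M_USE_RESERVE may dip into them, e.g. in a path that must not fail:
 *
 *	uma_zone_reserve(foo_zone, 32);
 *	...
 *	fp = uma_zalloc(foo_zone, M_NOWAIT | M_USE_RESERVE);
 */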
3542 
3543 /* See uma.h */
3544 int
3545 uma_zone_reserve_kva(uma_zone_t zone, int count)
3546 {
3547 	uma_keg_t keg;
3548 	vm_offset_t kva;
3549 	u_int pages;
3550 
3551 	keg = zone_first_keg(zone);
3552 	if (keg == NULL)
3553 		return (0);
3554 	pages = count / keg->uk_ipers;
3555 
3556 	if (pages * keg->uk_ipers < count)
3557 		pages++;
3558 	pages *= keg->uk_ppera;
3559 
3560 #ifdef UMA_MD_SMALL_ALLOC
3561 	if (keg->uk_ppera > 1) {
3562 #else
3563 	if (1) {
3564 #endif
3565 		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
3566 		if (kva == 0)
3567 			return (0);
3568 	} else
3569 		kva = 0;
3570 	KEG_LOCK(keg);
3571 	keg->uk_kva = kva;
3572 	keg->uk_offset = 0;
3573 	keg->uk_maxpages = pages;
3574 #ifdef UMA_MD_SMALL_ALLOC
3575 	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
3576 #else
3577 	keg->uk_allocf = noobj_alloc;
3578 #endif
3579 	keg->uk_flags |= UMA_ZONE_NOFREE;
3580 	KEG_UNLOCK(keg);
3581 
3582 	return (1);
3583 }
3584 
3585 /* See uma.h */
3586 void
3587 uma_prealloc(uma_zone_t zone, int items)
3588 {
3589 	uma_domain_t dom;
3590 	uma_slab_t slab;
3591 	uma_keg_t keg;
3592 	int domain, slabs;
3593 
3594 	keg = zone_first_keg(zone);
3595 	if (keg == NULL)
3596 		return;
3597 	KEG_LOCK(keg);
3598 	slabs = items / keg->uk_ipers;
3599 	domain = 0;
3600 	if (slabs * keg->uk_ipers < items)
3601 		slabs++;
3602 	while (slabs > 0) {
3603 		slab = keg_alloc_slab(keg, zone, domain, M_WAITOK);
3604 		if (slab == NULL)
3605 			break;
3606 		MPASS(slab->us_keg == keg);
3607 		dom = &keg->uk_domain[slab->us_domain];
3608 		LIST_INSERT_HEAD(&dom->ud_free_slab, slab, us_link);
3609 		slabs--;
3610 		do {
3611 			domain = (domain + 1) % vm_ndomains;
3612 		} while (VM_DOMAIN_EMPTY(domain));
3613 	}
3614 	KEG_UNLOCK(keg);
3615 }
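
/*
 * Illustrative usage (the zone name is hypothetical): uma_prealloc()
 * allocates enough full slabs up front for at least 'items' objects,
 * spreading the slabs round-robin across the non-empty NUMA domains.
 * Pairing it with uma_zone_reserve() ensures the reserved items actually
 * exist ahead of time:
 *
 *	uma_zone_reserve(foo_zone, 128);
 *	uma_prealloc(foo_zone, 128);
 */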
3616 
3617 /* See uma.h */
3618 static void
3619 uma_reclaim_locked(bool kmem_danger)
3620 {
3621 
3622 	CTR0(KTR_UMA, "UMA: vm asked us to release pages!");
3623 	sx_assert(&uma_drain_lock, SA_XLOCKED);
3624 	bucket_enable();
3625 	zone_foreach(zone_drain);
3626 	if (vm_page_count_min() || kmem_danger) {
3627 		cache_drain_safe(NULL);
3628 		zone_foreach(zone_drain);
3629 	}
3630 	/*
3631 	 * Some slabs may have been freed but this zone will be visited early;
3632 	 * we visit it again so that we can free pages that are empty once other
3633 	 * zones are drained.  We have to do the same for buckets.
3634 	 */
3635 	zone_drain(slabzone);
3636 	bucket_zone_drain();
3637 }
3638 
3639 void
3640 uma_reclaim(void)
3641 {
3642 
3643 	sx_xlock(&uma_drain_lock);
3644 	uma_reclaim_locked(false);
3645 	sx_xunlock(&uma_drain_lock);
3646 }
3647 
3648 static volatile int uma_reclaim_needed;
3649 
3650 void
3651 uma_reclaim_wakeup(void)
3652 {
3653 
3654 	if (atomic_fetchadd_int(&uma_reclaim_needed, 1) == 0)
3655 		wakeup(uma_reclaim);
3656 }
3657 
3658 void
3659 uma_reclaim_worker(void *arg __unused)
3660 {
3661 
3662 	for (;;) {
3663 		sx_xlock(&uma_drain_lock);
3664 		while (atomic_load_int(&uma_reclaim_needed) == 0)
3665 			sx_sleep(uma_reclaim, &uma_drain_lock, PVM, "umarcl",
3666 			    hz);
3667 		sx_xunlock(&uma_drain_lock);
3668 		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
3669 		sx_xlock(&uma_drain_lock);
3670 		uma_reclaim_locked(true);
3671 		atomic_store_int(&uma_reclaim_needed, 0);
3672 		sx_xunlock(&uma_drain_lock);
3673 		/* Don't fire more than once per second. */
3674 		pause("umarclslp", hz);
3675 	}
3676 }
3677 
3678 /* See uma.h */
3679 int
3680 uma_zone_exhausted(uma_zone_t zone)
3681 {
3682 	int full;
3683 
3684 	ZONE_LOCK(zone);
3685 	full = (zone->uz_flags & UMA_ZFLAG_FULL);
3686 	ZONE_UNLOCK(zone);
3687 	return (full);
3688 }
3689 
3690 int
3691 uma_zone_exhausted_nolock(uma_zone_t zone)
3692 {
3693 	return (zone->uz_flags & UMA_ZFLAG_FULL);
3694 }
3695 
3696 void *
3697 uma_large_malloc_domain(vm_size_t size, int domain, int wait)
3698 {
3699 	vm_offset_t addr;
3700 	uma_slab_t slab;
3701 
3702 	if (domain != UMA_ANYDOMAIN) {
3703 		/* avoid allocs targeting empty domains */
3704 		if (VM_DOMAIN_EMPTY(domain))
3705 			domain = UMA_ANYDOMAIN;
3706 	}
3707 	slab = zone_alloc_item(slabzone, NULL, domain, wait);
3708 	if (slab == NULL)
3709 		return (NULL);
3710 	if (domain == UMA_ANYDOMAIN)
3711 		addr = kmem_malloc(size, wait);
3712 	else
3713 		addr = kmem_malloc_domain(domain, size, wait);
3714 	if (addr != 0) {
3715 		vsetslab(addr, slab);
3716 		slab->us_data = (void *)addr;
3717 		slab->us_flags = UMA_SLAB_KERNEL | UMA_SLAB_MALLOC;
3718 		slab->us_size = size;
3719 		slab->us_domain = vm_phys_domain(PHYS_TO_VM_PAGE(
3720 		    pmap_kextract(addr)));
3721 		uma_total_inc(size);
3722 	} else {
3723 		zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3724 	}
3725 
3726 	return ((void *)addr);
3727 }
3728 
3729 void *
3730 uma_large_malloc(vm_size_t size, int wait)
3731 {
3732 
3733 	return uma_large_malloc_domain(size, UMA_ANYDOMAIN, wait);
3734 }
3735 
3736 void
3737 uma_large_free(uma_slab_t slab)
3738 {
3739 
3740 	KASSERT((slab->us_flags & UMA_SLAB_KERNEL) != 0,
3741 	    ("uma_large_free:  Memory not allocated with uma_large_malloc."));
3742 	kmem_free((vm_offset_t)slab->us_data, slab->us_size);
3743 	uma_total_dec(slab->us_size);
3744 	zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3745 }
3746 
3747 static void
3748 uma_zero_item(void *item, uma_zone_t zone)
3749 {
3750 
3751 	bzero(item, zone->uz_size);
3752 }
3753 
3754 unsigned long
3755 uma_limit(void)
3756 {
3757 
3758 	return (uma_kmem_limit);
3759 }
3760 
3761 void
3762 uma_set_limit(unsigned long limit)
3763 {
3764 
3765 	uma_kmem_limit = limit;
3766 }
3767 
3768 unsigned long
3769 uma_size(void)
3770 {
3771 
3772 	return (uma_kmem_total);
3773 }
3774 
3775 long
3776 uma_avail(void)
3777 {
3778 
3779 	return (uma_kmem_limit - uma_kmem_total);
3780 }
3781 
3782 void
3783 uma_print_stats(void)
3784 {
3785 	zone_foreach(uma_print_zone);
3786 }
3787 
3788 static void
3789 slab_print(uma_slab_t slab)
3790 {
3791 	printf("slab: keg %p, data %p, freecount %d\n",
3792 		slab->us_keg, slab->us_data, slab->us_freecount);
3793 }
3794 
3795 static void
3796 cache_print(uma_cache_t cache)
3797 {
3798 	printf("alloc: %p(%d), free: %p(%d)\n",
3799 		cache->uc_allocbucket,
3800 		cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
3801 		cache->uc_freebucket,
3802 		cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
3803 }
3804 
3805 static void
3806 uma_print_keg(uma_keg_t keg)
3807 {
3808 	uma_domain_t dom;
3809 	uma_slab_t slab;
3810 	int i;
3811 
3812 	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3813 	    "out %d free %d limit %d\n",
3814 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3815 	    keg->uk_ipers, keg->uk_ppera,
3816 	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
3817 	    keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
3818 	for (i = 0; i < vm_ndomains; i++) {
3819 		dom = &keg->uk_domain[i];
3820 		printf("Part slabs:\n");
3821 		LIST_FOREACH(slab, &dom->ud_part_slab, us_link)
3822 			slab_print(slab);
3823 		printf("Free slabs:\n");
3824 		LIST_FOREACH(slab, &dom->ud_free_slab, us_link)
3825 			slab_print(slab);
3826 		printf("Full slabs:\n");
3827 		LIST_FOREACH(slab, &dom->ud_full_slab, us_link)
3828 			slab_print(slab);
3829 	}
3830 }
3831 
3832 void
3833 uma_print_zone(uma_zone_t zone)
3834 {
3835 	uma_cache_t cache;
3836 	uma_klink_t kl;
3837 	int i;
3838 
3839 	printf("zone: %s(%p) size %d flags %#x\n",
3840 	    zone->uz_name, zone, zone->uz_size, zone->uz_flags);
3841 	LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
3842 		uma_print_keg(kl->kl_keg);
3843 	CPU_FOREACH(i) {
3844 		cache = &zone->uz_cpu[i];
3845 		printf("CPU %d Cache:\n", i);
3846 		cache_print(cache);
3847 	}
3848 }
3849 
3850 #ifdef DDB
3851 /*
3852  * Generate statistics across both the zone and its per-CPU caches.  Return
3853  * only those statistics for which a non-NULL pointer was supplied.
3854  *
3855  * Note: this does not update the zone statistics, as it cannot safely clear
3856  * the per-CPU cache statistics.
3857  *
3858  * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
3859  * safe from off-CPU; we should modify the caches to track this information
3860  * directly so that we don't have to.
3861  */
3862 static void
3863 uma_zone_sumstat(uma_zone_t z, int *cachefreep, uint64_t *allocsp,
3864     uint64_t *freesp, uint64_t *sleepsp)
3865 {
3866 	uma_cache_t cache;
3867 	uint64_t allocs, frees, sleeps;
3868 	int cachefree, cpu;
3869 
3870 	allocs = frees = sleeps = 0;
3871 	cachefree = 0;
3872 	CPU_FOREACH(cpu) {
3873 		cache = &z->uz_cpu[cpu];
3874 		if (cache->uc_allocbucket != NULL)
3875 			cachefree += cache->uc_allocbucket->ub_cnt;
3876 		if (cache->uc_freebucket != NULL)
3877 			cachefree += cache->uc_freebucket->ub_cnt;
3878 		allocs += cache->uc_allocs;
3879 		frees += cache->uc_frees;
3880 	}
3881 	allocs += z->uz_allocs;
3882 	frees += z->uz_frees;
3883 	sleeps += z->uz_sleeps;
3884 	if (cachefreep != NULL)
3885 		*cachefreep = cachefree;
3886 	if (allocsp != NULL)
3887 		*allocsp = allocs;
3888 	if (freesp != NULL)
3889 		*freesp = frees;
3890 	if (sleepsp != NULL)
3891 		*sleepsp = sleeps;
3892 }
3893 #endif /* DDB */
3894 
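/*
 * Sysctl handler reporting the total number of zones across all kegs.
 */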
3895 static int
3896 sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
3897 {
3898 	uma_keg_t kz;
3899 	uma_zone_t z;
3900 	int count;
3901 
3902 	count = 0;
3903 	rw_rlock(&uma_rwlock);
3904 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3905 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
3906 			count++;
3907 	}
3908 	rw_runlock(&uma_rwlock);
3909 	return (sysctl_handle_int(oidp, &count, 0, req));
3910 }
3911 
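/*
 * Sysctl handler exporting zone statistics to userland as a binary stream:
 * a uma_stream_header followed, for each zone, by a uma_type_header and one
 * uma_percpu_stat record per CPU slot.
 */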
3912 static int
3913 sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
3914 {
3915 	struct uma_stream_header ush;
3916 	struct uma_type_header uth;
3917 	struct uma_percpu_stat *ups;
3918 	uma_bucket_t bucket;
3919 	uma_zone_domain_t zdom;
3920 	struct sbuf sbuf;
3921 	uma_cache_t cache;
3922 	uma_klink_t kl;
3923 	uma_keg_t kz;
3924 	uma_zone_t z;
3925 	uma_keg_t k;
3926 	int count, error, i;
3927 
3928 	error = sysctl_wire_old_buffer(req, 0);
3929 	if (error != 0)
3930 		return (error);
3931 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
3932 	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
3933 	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
3934 
3935 	count = 0;
3936 	rw_rlock(&uma_rwlock);
3937 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3938 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
3939 			count++;
3940 	}
3941 
3942 	/*
3943 	 * Insert stream header.
3944 	 */
3945 	bzero(&ush, sizeof(ush));
3946 	ush.ush_version = UMA_STREAM_VERSION;
3947 	ush.ush_maxcpus = (mp_maxid + 1);
3948 	ush.ush_count = count;
3949 	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
3950 
3951 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3952 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3953 			bzero(&uth, sizeof(uth));
3954 			ZONE_LOCK(z);
3955 			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
3956 			uth.uth_align = kz->uk_align;
3957 			uth.uth_size = kz->uk_size;
3958 			uth.uth_rsize = kz->uk_rsize;
3959 			LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
3960 				k = kl->kl_keg;
3961 				uth.uth_maxpages += k->uk_maxpages;
3962 				uth.uth_pages += k->uk_pages;
3963 				uth.uth_keg_free += k->uk_free;
3964 				uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
3965 				    * k->uk_ipers;
3966 			}
3967 
3968 			/*
3969 			 * A zone is secondary if it is not the first entry
3970 			 * on the keg's zone list.
3971 			 */
3972 			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
3973 			    (LIST_FIRST(&kz->uk_zones) != z))
3974 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
3975 
3976 			for (i = 0; i < vm_ndomains; i++) {
3977 				zdom = &z->uz_domain[i];
3978 				LIST_FOREACH(bucket, &zdom->uzd_buckets,
3979 				    ub_link)
3980 					uth.uth_zone_free += bucket->ub_cnt;
3981 			}
3982 			uth.uth_allocs = z->uz_allocs;
3983 			uth.uth_frees = z->uz_frees;
3984 			uth.uth_fails = z->uz_fails;
3985 			uth.uth_sleeps = z->uz_sleeps;
3986 			/*
3987 			 * While it is not normally safe to access a cache's
3988 			 * bucket pointers from off the owning CPU, the
3989 			 * pointers may only be exchanged, never invalidated,
3990 			 * without the zone lock held, so we accept the
3991 			 * possible race associated with bucket exchange
3992 			 * during monitoring.
3993 			 */
3994 			for (i = 0; i < mp_maxid + 1; i++) {
3995 				bzero(&ups[i], sizeof(*ups));
3996 				if (kz->uk_flags & UMA_ZFLAG_INTERNAL ||
3997 				    CPU_ABSENT(i))
3998 					continue;
3999 				cache = &z->uz_cpu[i];
4000 				if (cache->uc_allocbucket != NULL)
4001 					ups[i].ups_cache_free +=
4002 					    cache->uc_allocbucket->ub_cnt;
4003 				if (cache->uc_freebucket != NULL)
4004 					ups[i].ups_cache_free +=
4005 					    cache->uc_freebucket->ub_cnt;
4006 				ups[i].ups_allocs = cache->uc_allocs;
4007 				ups[i].ups_frees = cache->uc_frees;
4008 			}
4009 			ZONE_UNLOCK(z);
4010 			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
4011 			for (i = 0; i < mp_maxid + 1; i++)
4012 				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
4013 		}
4014 	}
4015 	rw_runlock(&uma_rwlock);
4016 	error = sbuf_finish(&sbuf);
4017 	sbuf_delete(&sbuf);
4018 	free(ups, M_TEMP);
4019 	return (error);
4020 }
4021 
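/*
 * Generic sysctl handler for a zone's maximum item limit; arg1 must point to
 * the uma_zone_t.  Reads report uma_zone_get_max() and writes are applied
 * with uma_zone_set_max().  A typical (hypothetical) registration might look
 * like:
 *
 *	SYSCTL_PROC(_vm, OID_AUTO, foo_zone_max,
 *	    CTLTYPE_INT | CTLFLAG_RW, &foo_zone, 0,
 *	    sysctl_handle_uma_zone_max, "I", "Maximum foo items");
 */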
4022 int
4023 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
4024 {
4025 	uma_zone_t zone = *(uma_zone_t *)arg1;
4026 	int error, max;
4027 
4028 	max = uma_zone_get_max(zone);
4029 	error = sysctl_handle_int(oidp, &max, 0, req);
4030 	if (error || !req->newptr)
4031 		return (error);
4032 
4033 	uma_zone_set_max(zone, max);
4034 
4035 	return (0);
4036 }
4037 
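/*
 * Generic read-only sysctl handler reporting a zone's current item count, as
 * returned by uma_zone_get_cur(); arg1 must point to the uma_zone_t.
 */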
4038 int
4039 sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
4040 {
4041 	uma_zone_t zone = *(uma_zone_t *)arg1;
4042 	int cur;
4043 
4044 	cur = uma_zone_get_cur(zone);
4045 	return (sysctl_handle_int(oidp, &cur, 0, req));
4046 }
4047 
4048 #ifdef INVARIANTS
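/*
 * Look up the slab backing an item: VTOSLAB zones use the mapping recorded
 * by vsetslab(), while keg-backed zones are found through the keg's hash
 * table or the fixed in-slab page offset.
 */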
4049 static uma_slab_t
4050 uma_dbg_getslab(uma_zone_t zone, void *item)
4051 {
4052 	uma_slab_t slab;
4053 	uma_keg_t keg;
4054 	uint8_t *mem;
4055 
4056 	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
4057 	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
4058 		slab = vtoslab((vm_offset_t)mem);
4059 	} else {
4060 		/*
4061 		 * It is safe to return the slab here even though the
4062 		 * zone is unlocked because the item's allocation state
4063 		 * essentially holds a reference.
4064 		 */
4065 		ZONE_LOCK(zone);
4066 		keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
4067 		if (keg->uk_flags & UMA_ZONE_HASH)
4068 			slab = hash_sfind(&keg->uk_hash, mem);
4069 		else
4070 			slab = (uma_slab_t)(mem + keg->uk_pgoff);
4071 		ZONE_UNLOCK(zone);
4072 	}
4073 
4074 	return (slab);
4075 }
4076 
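/*
 * Decide whether debugging checks should be skipped for an item belonging to
 * this zone; cache-only zones (which have no keg) are always skipped.
 */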
4077 static bool
4078 uma_dbg_zskip(uma_zone_t zone, void *mem)
4079 {
4080 	uma_keg_t keg;
4081 
4082 	if ((keg = zone_first_keg(zone)) == NULL)
4083 		return (true);
4084 
4085 	return (uma_dbg_kskip(keg, mem));
4086 }
4087 
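/*
 * Decide whether debugging checks should be skipped for an item in this keg.
 * dbg_divisor == 0 disables checking entirely, 1 checks every item, and
 * larger values check roughly one in every dbg_divisor items, selected by
 * the item's index within its slab page.
 */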
4088 static bool
4089 uma_dbg_kskip(uma_keg_t keg, void *mem)
4090 {
4091 	uintptr_t idx;
4092 
4093 	if (dbg_divisor == 0)
4094 		return (true);
4095 
4096 	if (dbg_divisor == 1)
4097 		return (false);
4098 
4099 	idx = (uintptr_t)mem >> PAGE_SHIFT;
4100 	if (keg->uk_ipers > 1) {
4101 		idx *= keg->uk_ipers;
4102 		idx += ((uintptr_t)mem & PAGE_MASK) / keg->uk_rsize;
4103 	}
4104 
4105 	if ((idx / dbg_divisor) * dbg_divisor != idx) {
4106 		counter_u64_add(uma_skip_cnt, 1);
4107 		return (true);
4108 	}
4109 	counter_u64_add(uma_dbg_cnt, 1);
4110 
4111 	return (false);
4112 }
4113 
4114 /*
4115  * Set up the slab's freei data such that uma_dbg_free can function.
4116  */
4118 static void
4119 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
4120 {
4121 	uma_keg_t keg;
4122 	int freei;
4123 
4124 	if (slab == NULL) {
4125 		slab = uma_dbg_getslab(zone, item);
4126 		if (slab == NULL)
4127 			panic("uma: item %p did not belong to zone %s\n",
4128 			    item, zone->uz_name);
4129 	}
4130 	keg = slab->us_keg;
4131 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4132 
4133 	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4134 		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
4135 		    item, zone, zone->uz_name, slab, freei);
4136 	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4139 }
4140 
4141 /*
4142  * Verifies freed addresses.  Checks for alignment, valid slab membership
4143  * and duplicate frees.
4144  */
4146 static void
4147 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
4148 {
4149 	uma_keg_t keg;
4150 	int freei;
4151 
4152 	if (slab == NULL) {
4153 		slab = uma_dbg_getslab(zone, item);
4154 		if (slab == NULL)
4155 			panic("uma: Freed item %p did not belong to zone %s\n",
4156 			    item, zone->uz_name);
4157 	}
4158 	keg = slab->us_keg;
4159 	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
4160 
4161 	if (freei >= keg->uk_ipers)
4162 		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
4163 		    item, zone, zone->uz_name, slab, freei);
4164 
4165 	if (((freei * keg->uk_rsize) + slab->us_data) != item)
4166 		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
4167 		    item, zone, zone->uz_name, slab, freei);
4168 
4169 	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
4170 		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
4171 		    item, zone, zone->uz_name, slab, freei);
4172 
4173 	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
4174 }
4175 #endif /* INVARIANTS */
4176 
4177 #ifdef DDB
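/*
 * "show uma": print a one-line summary (size, used, free, requests, sleeps,
 * bucket size) for every keg-backed zone.
 */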
4178 DB_SHOW_COMMAND(uma, db_show_uma)
4179 {
4180 	uma_bucket_t bucket;
4181 	uma_keg_t kz;
4182 	uma_zone_t z;
4183 	uma_zone_domain_t zdom;
4184 	uint64_t allocs, frees, sleeps;
4185 	int cachefree, i;
4186 
4187 	db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
4188 	    "Free", "Requests", "Sleeps", "Bucket");
4189 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
4190 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
4191 			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
4192 				allocs = z->uz_allocs;
4193 				frees = z->uz_frees;
4194 				sleeps = z->uz_sleeps;
4195 				cachefree = 0;
4196 			} else
4197 				uma_zone_sumstat(z, &cachefree, &allocs,
4198 				    &frees, &sleeps);
4199 			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
4200 			    (LIST_FIRST(&kz->uk_zones) != z)))
4201 				cachefree += kz->uk_free;
4202 			for (i = 0; i < vm_ndomains; i++) {
4203 				zdom = &z->uz_domain[i];
4204 				LIST_FOREACH(bucket, &zdom->uzd_buckets,
4205 				    ub_link)
4206 					cachefree += bucket->ub_cnt;
4207 			}
4208 			db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n",
4209 			    z->uz_name, (uintmax_t)kz->uk_size,
4210 			    (intmax_t)(allocs - frees), cachefree,
4211 			    (uintmax_t)allocs, sleeps, z->uz_count);
4212 			if (db_pager_quit)
4213 				return;
4214 		}
4215 	}
4216 }
4217 
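/*
 * "show umacache": the same summary for cache-only zones, which are not
 * attached to a keg and so do not appear in "show uma".
 */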
4218 DB_SHOW_COMMAND(umacache, db_show_umacache)
4219 {
4220 	uma_bucket_t bucket;
4221 	uma_zone_t z;
4222 	uma_zone_domain_t zdom;
4223 	uint64_t allocs, frees;
4224 	int cachefree, i;
4225 
4226 	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
4227 	    "Requests", "Bucket");
4228 	LIST_FOREACH(z, &uma_cachezones, uz_link) {
4229 		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
4230 		for (i = 0; i < vm_ndomains; i++) {
4231 			zdom = &z->uz_domain[i];
4232 			LIST_FOREACH(bucket, &zdom->uzd_buckets, ub_link)
4233 				cachefree += bucket->ub_cnt;
4234 		}
4235 		db_printf("%18s %8ju %8jd %8d %12ju %8u\n",
4236 		    z->uz_name, (uintmax_t)z->uz_size,
4237 		    (intmax_t)(allocs - frees), cachefree,
4238 		    (uintmax_t)allocs, z->uz_count);
4239 		if (db_pager_quit)
4240 			return;
4241 	}
4242 }
4243 #endif	/* DDB */
4244