/*	$OpenBSD: subr_pool.c,v 1.233 2021/03/10 10:21:47 jsg Exp $	*/
/*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/

/*-
 * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
 * Simulation Facility, NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/task.h>
#include <sys/time.h>
#include <sys/timeout.h>
#include <sys/percpu.h>
#include <sys/tracepoint.h>

#include <uvm/uvm_extern.h>

/*
 * Pool resource management utility.
 *
 * Memory is allocated in pages which are split into pieces according to
 * the pool item size. Each page is kept on one of three lists in the
 * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
 * for empty, full and partially-full pages respectively. The individual
 * pool items are on a linked list headed by `ph_items' in each page
 * header. The memory for building the page list is either taken from
 * the allocated pages themselves (for small pool items) or taken from
 * an internal pool of page headers (`phpool').
 */
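
/*
 * Example usage (illustrative sketch only; "struct foo", "foo_pool",
 * "foopl" and the flags chosen here are hypothetical, not part of
 * this file):
 *
 *	struct pool foo_pool;
 *
 *	pool_init(&foo_pool, sizeof(struct foo), 0, IPL_NONE,
 *	    PR_WAITOK, "foopl", NULL);
 *
 *	struct foo *f = pool_get(&foo_pool, PR_WAITOK | PR_ZERO);
 *	...
 *	pool_put(&foo_pool, f);
 */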

/* List of all pools */
SIMPLEQ_HEAD(,pool) pool_head = SIMPLEQ_HEAD_INITIALIZER(pool_head);

/*
 * Every pool gets a unique serial number assigned to it. If this counter
 * wraps, we're screwed, but we shouldn't create so many pools anyway.
 */
unsigned int pool_serial;
unsigned int pool_count;

/* Protects the preceding variables that make up the global pool state */
struct rwlock pool_lock = RWLOCK_INITIALIZER("pools");

/* Private pool for page header structures */
struct pool phpool;

struct pool_lock_ops {
	void	(*pl_init)(struct pool *, union pool_lock *,
		    const struct lock_type *);
	void	(*pl_enter)(union pool_lock *);
	int	(*pl_enter_try)(union pool_lock *);
	void	(*pl_leave)(union pool_lock *);
	void	(*pl_assert_locked)(union pool_lock *);
	void	(*pl_assert_unlocked)(union pool_lock *);
	int	(*pl_sleep)(void *, union pool_lock *, int, const char *);
};

static const struct pool_lock_ops pool_lock_ops_mtx;
static const struct pool_lock_ops pool_lock_ops_rw;

#ifdef WITNESS
#define pl_init(pp, pl) do {						\
	static const struct lock_type __lock_type = { .lt_name = #pl };	\
	(pp)->pr_lock_ops->pl_init(pp, pl, &__lock_type);		\
} while (0)
#else /* WITNESS */
#define pl_init(pp, pl)		(pp)->pr_lock_ops->pl_init(pp, pl, NULL)
#endif /* WITNESS */

static inline void
pl_enter(struct pool *pp, union pool_lock *pl)
{
	pp->pr_lock_ops->pl_enter(pl);
}
static inline int
pl_enter_try(struct pool *pp, union pool_lock *pl)
{
	return pp->pr_lock_ops->pl_enter_try(pl);
}
static inline void
pl_leave(struct pool *pp, union pool_lock *pl)
{
	pp->pr_lock_ops->pl_leave(pl);
}
static inline void
pl_assert_locked(struct pool *pp, union pool_lock *pl)
{
	pp->pr_lock_ops->pl_assert_locked(pl);
}
static inline void
pl_assert_unlocked(struct pool *pp, union pool_lock *pl)
{
	pp->pr_lock_ops->pl_assert_unlocked(pl);
}
static inline int
pl_sleep(struct pool *pp, void *ident, union pool_lock *lock, int priority,
    const char *wmesg)
{
	return pp->pr_lock_ops->pl_sleep(ident, lock, priority, wmesg);
}

struct pool_item {
	u_long				pi_magic;
	XSIMPLEQ_ENTRY(pool_item)	pi_list;
};
#define POOL_IMAGIC(ph, pi) ((u_long)(pi) ^ (ph)->ph_magic)
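
/*
 * Illustrative sketch (not code from this file): an item sitting on a
 * page's free list is considered intact iff
 *
 *	pi->pi_magic == POOL_IMAGIC(ph, pi)
 *
 * i.e. the stored magic is the item's own address XORed with the page's
 * random ph_magic, so a stray write over the free item or a corrupted
 * list pointer is caught the next time the item is taken off the list.
 */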

struct pool_page_header {
	/* Page headers */
	TAILQ_ENTRY(pool_page_header)
				ph_entry;	/* pool page list */
	XSIMPLEQ_HEAD(, pool_item)
				ph_items;	/* free items on the page */
	RBT_ENTRY(pool_page_header)
				ph_node;	/* off-page page headers */
	unsigned int		ph_nmissing;	/* # of chunks in use */
	caddr_t			ph_page;	/* this page's address */
	caddr_t			ph_colored;	/* page's colored address */
	unsigned long		ph_magic;
	uint64_t		ph_timestamp;
};
#define POOL_MAGICBIT (1 << 3) /* keep away from perturbed low bits */
#define POOL_PHPOISON(ph) ISSET((ph)->ph_magic, POOL_MAGICBIT)

#ifdef MULTIPROCESSOR
struct pool_cache_item {
	struct pool_cache_item	*ci_next;	/* next item in list */
	unsigned long		 ci_nitems;	/* number of items in list */
	TAILQ_ENTRY(pool_cache_item)
				 ci_nextl;	/* entry in list of lists */
};

/* we store whether the cached item is poisoned in the high bit of the 28-bit nitems value */
#define POOL_CACHE_ITEM_NITEMS_MASK	0x7ffffffUL
#define POOL_CACHE_ITEM_NITEMS_POISON	0x8000000UL

#define POOL_CACHE_ITEM_NITEMS(_ci)					\
    ((_ci)->ci_nitems & POOL_CACHE_ITEM_NITEMS_MASK)

#define POOL_CACHE_ITEM_POISONED(_ci)					\
    ISSET((_ci)->ci_nitems, POOL_CACHE_ITEM_NITEMS_POISON)

struct pool_cache {
	struct pool_cache_item	*pc_actv;	/* active list of items */
	unsigned long		 pc_nactv;	/* actv head nitems cache */
	struct pool_cache_item	*pc_prev;	/* previous list of items */

	uint64_t		 pc_gen;	/* generation number */
	uint64_t		 pc_nget;	/* # of successful requests */
	uint64_t		 pc_nfail;	/* # of unsuccessful reqs */
	uint64_t		 pc_nput;	/* # of releases */
	uint64_t		 pc_nlget;	/* # of list requests */
	uint64_t		 pc_nlfail;	/* # of fails getting a list */
	uint64_t		 pc_nlput;	/* # of list releases */

	int			 pc_nout;
};

void	*pool_cache_get(struct pool *);
void	 pool_cache_put(struct pool *, void *);
void	 pool_cache_destroy(struct pool *);
void	 pool_cache_gc(struct pool *);
#endif
void	 pool_cache_pool_info(struct pool *, struct kinfo_pool *);
int	 pool_cache_info(struct pool *, void *, size_t *);
int	 pool_cache_cpus_info(struct pool *, void *, size_t *);

#ifdef POOL_DEBUG
int	pool_debug = 1;
#else
int	pool_debug = 0;
#endif

#define POOL_INPGHDR(pp) ((pp)->pr_phoffset != 0)

struct pool_page_header *
	 pool_p_alloc(struct pool *, int, int *);
void	 pool_p_insert(struct pool *, struct pool_page_header *);
void	 pool_p_remove(struct pool *, struct pool_page_header *);
void	 pool_p_free(struct pool *, struct pool_page_header *);

void	 pool_update_curpage(struct pool *);
void	*pool_do_get(struct pool *, int, int *);
void	 pool_do_put(struct pool *, void *);
int	 pool_chk_page(struct pool *, struct pool_page_header *, int);
int	 pool_chk(struct pool *);
void	 pool_get_done(struct pool *, void *, void *);
void	 pool_runqueue(struct pool *, int);

void	*pool_allocator_alloc(struct pool *, int, int *);
void	 pool_allocator_free(struct pool *, void *);

/*
 * The default pool allocator.
 */
void	*pool_page_alloc(struct pool *, int, int *);
void	pool_page_free(struct pool *, void *);

/*
 * safe for interrupts; this is the default allocator
 */
struct pool_allocator pool_allocator_single = {
	pool_page_alloc,
	pool_page_free,
	POOL_ALLOC_SIZE(PAGE_SIZE, POOL_ALLOC_ALIGNED)
};

void	*pool_multi_alloc(struct pool *, int, int *);
void	pool_multi_free(struct pool *, void *);

struct pool_allocator pool_allocator_multi = {
	pool_multi_alloc,
	pool_multi_free,
	POOL_ALLOC_SIZES(PAGE_SIZE, (1UL << 31), POOL_ALLOC_ALIGNED)
};

void	*pool_multi_alloc_ni(struct pool *, int, int *);
void	pool_multi_free_ni(struct pool *, void *);

struct pool_allocator pool_allocator_multi_ni = {
	pool_multi_alloc_ni,
	pool_multi_free_ni,
	POOL_ALLOC_SIZES(PAGE_SIZE, (1UL << 31), POOL_ALLOC_ALIGNED)
};
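
/*
 * A pool may also supply its own backend allocator at pool_init()
 * time. Sketch only; "my_alloc", "my_free", "my_allocator", "my_pool"
 * and "mypl" are hypothetical names:
 *
 *	void *my_alloc(struct pool *, int, int *);
 *	void my_free(struct pool *, void *);
 *
 *	struct pool_allocator my_allocator = {
 *		my_alloc,
 *		my_free,
 *		POOL_ALLOC_SIZE(PAGE_SIZE, POOL_ALLOC_ALIGNED)
 *	};
 *
 *	pool_init(&my_pool, sizeof(struct foo), 0, IPL_NONE, 0,
 *	    "mypl", &my_allocator);
 */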

#ifdef DDB
void	 pool_print_pagelist(struct pool_pagelist *, int (*)(const char *, ...)
	     __attribute__((__format__(__kprintf__,1,2))));
void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...)
	     __attribute__((__format__(__kprintf__,1,2))));
#endif

/* stale page garbage collectors */
void	pool_gc_sched(void *);
struct timeout pool_gc_tick = TIMEOUT_INITIALIZER(pool_gc_sched, NULL);
void	pool_gc_pages(void *);
struct task pool_gc_task = TASK_INITIALIZER(pool_gc_pages, NULL);

#define POOL_WAIT_FREE	SEC_TO_NSEC(1)
#define POOL_WAIT_GC	SEC_TO_NSEC(8)

/*
 * TODO Move getnsecuptime() to kern_tc.c and document it when we
 * have callers in other modules.
 */
static uint64_t
getnsecuptime(void)
{
	struct timespec now;

	getnanouptime(&now);
	return TIMESPEC_TO_NSEC(&now);
}

RBT_PROTOTYPE(phtree, pool_page_header, ph_node, phtree_compare);

static inline int
phtree_compare(const struct pool_page_header *a,
    const struct pool_page_header *b)
{
	vaddr_t va = (vaddr_t)a->ph_page;
	vaddr_t vb = (vaddr_t)b->ph_page;

	/*
	 * the reversed compares sort the tree in descending address order,
	 * so the RBT_NFIND below returns the header with the largest
	 * ph_page <= the search key
	 */
	if (vb < va)
		return (-1);
	if (vb > va)
		return (1);

	return (0);
}

RBT_GENERATE(phtree, pool_page_header, ph_node, phtree_compare);

/*
 * Return the pool page header based on page address.
 */
static inline struct pool_page_header *
pr_find_pagehead(struct pool *pp, void *v)
{
	struct pool_page_header *ph, key;

	if (POOL_INPGHDR(pp)) {
		caddr_t page;

		page = (caddr_t)((vaddr_t)v & pp->pr_pgmask);

		return ((struct pool_page_header *)(page + pp->pr_phoffset));
	}

	key.ph_page = v;
	ph = RBT_NFIND(phtree, &pp->pr_phtree, &key);
	if (ph == NULL)
		panic("%s: %s: page header missing", __func__, pp->pr_wchan);

	KASSERT(ph->ph_page <= (caddr_t)v);
	if (ph->ph_page + pp->pr_pgsize <= (caddr_t)v)
		panic("%s: %s: incorrect page", __func__, pp->pr_wchan);

	return (ph);
}

/*
 * Initialize the given pool resource structure.
 *
 * We export this routine to allow other kernel parts to declare
 * static pools that must be initialized before malloc() is available.
 */
void
pool_init(struct pool *pp, size_t size, u_int align, int ipl, int flags,
    const char *wchan, struct pool_allocator *palloc)
{
	int off = 0, space;
	unsigned int pgsize = PAGE_SIZE, items;
	size_t pa_pagesz;
#ifdef DIAGNOSTIC
	struct pool *iter;
#endif

	if (align == 0)
		align = ALIGN(1);

	if (size < sizeof(struct pool_item))
		size = sizeof(struct pool_item);

	size = roundup(size, align);

	while (size * 8 > pgsize)
		pgsize <<= 1;

	if (palloc == NULL) {
		if (pgsize > PAGE_SIZE) {
			palloc = ISSET(flags, PR_WAITOK) ?
			    &pool_allocator_multi_ni : &pool_allocator_multi;
		} else
			palloc = &pool_allocator_single;

		pa_pagesz = palloc->pa_pagesz;
	} else {
		size_t pgsizes;

		pa_pagesz = palloc->pa_pagesz;
		if (pa_pagesz == 0)
			pa_pagesz = POOL_ALLOC_DEFAULT;

		pgsizes = pa_pagesz & ~POOL_ALLOC_ALIGNED;

		/* make sure the allocator can fit at least one item */
		if (size > pgsizes) {
			panic("%s: pool %s item size 0x%zx > "
			    "allocator %p sizes 0x%zx", __func__, wchan,
			    size, palloc, pgsizes);
		}

		/* shrink pgsize until it fits into the range */
		while (!ISSET(pgsizes, pgsize))
			pgsize >>= 1;
	}
	KASSERT(ISSET(pa_pagesz, pgsize));

	items = pgsize / size;

	/*
	 * Decide whether to put the page header off page to avoid
	 * wasting too large a part of the page. Off-page page headers
	 * go into an RB tree, so we can match a returned item with
	 * its header based on the page address.
	 */
	if (ISSET(pa_pagesz, POOL_ALLOC_ALIGNED)) {
		if (pgsize - (size * items) >
		    sizeof(struct pool_page_header)) {
			off = pgsize - sizeof(struct pool_page_header);
		} else if (sizeof(struct pool_page_header) * 2 >= size) {
			off = pgsize - sizeof(struct pool_page_header);
			items = off / size;
		}
	}

	KASSERT(items > 0);

	/*
	 * Initialize the pool structure.
	 */
	memset(pp, 0, sizeof(*pp));
	if (ISSET(flags, PR_RWLOCK)) {
		KASSERT(flags & PR_WAITOK);
		pp->pr_lock_ops = &pool_lock_ops_rw;
	} else
		pp->pr_lock_ops = &pool_lock_ops_mtx;
	TAILQ_INIT(&pp->pr_emptypages);
	TAILQ_INIT(&pp->pr_fullpages);
	TAILQ_INIT(&pp->pr_partpages);
	pp->pr_curpage = NULL;
	pp->pr_npages = 0;
	pp->pr_minitems = 0;
	pp->pr_minpages = 0;
	pp->pr_maxpages = 8;
	pp->pr_size = size;
	pp->pr_pgsize = pgsize;
	pp->pr_pgmask = ~0UL ^ (pgsize - 1);
	pp->pr_phoffset = off;
	pp->pr_itemsperpage = items;
	pp->pr_wchan = wchan;
	pp->pr_alloc = palloc;
	pp->pr_nitems = 0;
	pp->pr_nout = 0;
	pp->pr_hardlimit = UINT_MAX;
	pp->pr_hardlimit_warning = NULL;
	pp->pr_hardlimit_ratecap.tv_sec = 0;
	pp->pr_hardlimit_ratecap.tv_usec = 0;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;
	RBT_INIT(phtree, &pp->pr_phtree);

	/*
	 * Use the space between the chunks and the page header
	 * for cache coloring.
	 */
	space = POOL_INPGHDR(pp) ? pp->pr_phoffset : pp->pr_pgsize;
	space -= pp->pr_itemsperpage * pp->pr_size;
	pp->pr_align = align;
	pp->pr_maxcolors = (space / align) + 1;

	pp->pr_nget = 0;
	pp->pr_nfail = 0;
	pp->pr_nput = 0;
	pp->pr_npagealloc = 0;
	pp->pr_npagefree = 0;
	pp->pr_hiwat = 0;
	pp->pr_nidle = 0;

	pp->pr_ipl = ipl;
	pp->pr_flags = flags;

	pl_init(pp, &pp->pr_lock);
	pl_init(pp, &pp->pr_requests_lock);
	TAILQ_INIT(&pp->pr_requests);

	if (phpool.pr_size == 0) {
		pool_init(&phpool, sizeof(struct pool_page_header), 0,
		    IPL_HIGH, 0, "phpool", NULL);

		/* make sure phpool won't "recurse" */
		KASSERT(POOL_INPGHDR(&phpool));
	}

	/* pglistalloc/constraint parameters */
	pp->pr_crange = &kp_dirty;

	/* Insert this into the list of all pools. */
	rw_enter_write(&pool_lock);
#ifdef DIAGNOSTIC
	SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
		if (iter == pp)
			panic("%s: pool %s already on list", __func__, wchan);
	}
#endif

	pp->pr_serial = ++pool_serial;
	if (pool_serial == 0)
		panic("%s: too much uptime", __func__);

	SIMPLEQ_INSERT_HEAD(&pool_head, pp, pr_poollist);
	pool_count++;
	rw_exit_write(&pool_lock);
}

/*
 * Decommission a pool resource.
 */
void
pool_destroy(struct pool *pp)
{
	struct pool_page_header *ph;
	struct pool *prev, *iter;

#ifdef MULTIPROCESSOR
	if (pp->pr_cache != NULL)
		pool_cache_destroy(pp);
#endif

#ifdef DIAGNOSTIC
	if (pp->pr_nout != 0)
		panic("%s: pool busy: still out: %u", __func__, pp->pr_nout);
#endif

	/* Remove from global pool list */
	rw_enter_write(&pool_lock);
	pool_count--;
	if (pp == SIMPLEQ_FIRST(&pool_head))
		SIMPLEQ_REMOVE_HEAD(&pool_head, pr_poollist);
	else {
		prev = SIMPLEQ_FIRST(&pool_head);
		SIMPLEQ_FOREACH(iter, &pool_head, pr_poollist) {
			if (iter == pp) {
				SIMPLEQ_REMOVE_AFTER(&pool_head, prev,
				    pr_poollist);
				break;
			}
			prev = iter;
		}
	}
	rw_exit_write(&pool_lock);

	/* Remove all pages */
	while ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL) {
		pl_enter(pp, &pp->pr_lock);
		pool_p_remove(pp, ph);
		pl_leave(pp, &pp->pr_lock);
		pool_p_free(pp, ph);
	}
	KASSERT(TAILQ_EMPTY(&pp->pr_fullpages));
	KASSERT(TAILQ_EMPTY(&pp->pr_partpages));
}

void
pool_request_init(struct pool_request *pr,
    void (*handler)(struct pool *, void *, void *), void *cookie)
{
	pr->pr_handler = handler;
	pr->pr_cookie = cookie;
	pr->pr_item = NULL;
}

void
pool_request(struct pool *pp, struct pool_request *pr)
{
	pl_enter(pp, &pp->pr_requests_lock);
	TAILQ_INSERT_TAIL(&pp->pr_requests, pr, pr_entry);
	pool_runqueue(pp, PR_NOWAIT);
	pl_leave(pp, &pp->pr_requests_lock);
}
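
/*
 * Example of the asynchronous request API above (sketch only;
 * "struct softc", "sc_pr" and "sc_buf" are hypothetical names):
 *
 *	void
 *	foo_fill(struct pool *pp, void *cookie, void *item)
 *	{
 *		struct softc *sc = cookie;
 *
 *		sc->sc_buf = item;
 *		wakeup(sc);
 *	}
 *
 *	pool_request_init(&sc->sc_pr, foo_fill, sc);
 *	pool_request(pp, &sc->sc_pr);
 *
 * foo_fill() runs once an item becomes available, possibly from a
 * later pool_put() if the pool is currently depleted.
 */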

struct pool_get_memory {
	union pool_lock lock;
	void * volatile v;
};

/*
 * Grab an item from the pool.
 */
void *
pool_get(struct pool *pp, int flags)
{
	void *v = NULL;
	int slowdown = 0;

	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
	if (pp->pr_flags & PR_RWLOCK)
		KASSERT(flags & PR_WAITOK);

#ifdef MULTIPROCESSOR
	if (pp->pr_cache != NULL) {
		v = pool_cache_get(pp);
		if (v != NULL)
			goto good;
	}
#endif

	pl_enter(pp, &pp->pr_lock);
	if (pp->pr_nout >= pp->pr_hardlimit) {
		if (ISSET(flags, PR_NOWAIT|PR_LIMITFAIL))
			goto fail;
	} else if ((v = pool_do_get(pp, flags, &slowdown)) == NULL) {
		if (ISSET(flags, PR_NOWAIT))
			goto fail;
	}
	pl_leave(pp, &pp->pr_lock);

	if ((slowdown || pool_debug == 2) && ISSET(flags, PR_WAITOK))
		yield();

	if (v == NULL) {
		struct pool_get_memory mem = { .v = NULL };
		struct pool_request pr;

#ifdef DIAGNOSTIC
		if (ISSET(flags, PR_WAITOK) && curproc == &proc0)
			panic("%s: cannot sleep for memory during boot",
			    __func__);
#endif
		pl_init(pp, &mem.lock);
		pool_request_init(&pr, pool_get_done, &mem);
		pool_request(pp, &pr);

		pl_enter(pp, &mem.lock);
		while (mem.v == NULL)
			pl_sleep(pp, &mem, &mem.lock, PSWP, pp->pr_wchan);
		pl_leave(pp, &mem.lock);

		v = mem.v;
	}

#ifdef MULTIPROCESSOR
good:
#endif
	if (ISSET(flags, PR_ZERO))
		memset(v, 0, pp->pr_size);

	TRACEPOINT(uvm, pool_get, pp, v, flags);

	return (v);

fail:
	pp->pr_nfail++;
	pl_leave(pp, &pp->pr_lock);
	return (NULL);
}

void
pool_get_done(struct pool *pp, void *xmem, void *v)
{
	struct pool_get_memory *mem = xmem;

	pl_enter(pp, &mem->lock);
	mem->v = v;
	pl_leave(pp, &mem->lock);

	wakeup_one(mem);
}

void
pool_runqueue(struct pool *pp, int flags)
{
	struct pool_requests prl = TAILQ_HEAD_INITIALIZER(prl);
	struct pool_request *pr;

	pl_assert_unlocked(pp, &pp->pr_lock);
	pl_assert_locked(pp, &pp->pr_requests_lock);

	if (pp->pr_requesting++)
		return;

	do {
		pp->pr_requesting = 1;

		TAILQ_CONCAT(&prl, &pp->pr_requests, pr_entry);
		if (TAILQ_EMPTY(&prl))
			continue;

		pl_leave(pp, &pp->pr_requests_lock);

		pl_enter(pp, &pp->pr_lock);
		pr = TAILQ_FIRST(&prl);
		while (pr != NULL) {
			int slowdown = 0;

			if (pp->pr_nout >= pp->pr_hardlimit)
				break;

			pr->pr_item = pool_do_get(pp, flags, &slowdown);
			if (pr->pr_item == NULL) /* || slowdown ? */
				break;

			pr = TAILQ_NEXT(pr, pr_entry);
		}
		pl_leave(pp, &pp->pr_lock);

		while ((pr = TAILQ_FIRST(&prl)) != NULL &&
		    pr->pr_item != NULL) {
			TAILQ_REMOVE(&prl, pr, pr_entry);
			(*pr->pr_handler)(pp, pr->pr_cookie, pr->pr_item);
		}

		pl_enter(pp, &pp->pr_requests_lock);
	} while (--pp->pr_requesting);

	TAILQ_CONCAT(&pp->pr_requests, &prl, pr_entry);
}

void *
pool_do_get(struct pool *pp, int flags, int *slowdown)
{
	struct pool_item *pi;
	struct pool_page_header *ph;

	pl_assert_locked(pp, &pp->pr_lock);

	splassert(pp->pr_ipl);

	/*
	 * Account for this item now to avoid races if we need to give up
	 * pr_lock to allocate a page.
	 */
	pp->pr_nout++;

	if (pp->pr_curpage == NULL) {
		pl_leave(pp, &pp->pr_lock);
		ph = pool_p_alloc(pp, flags, slowdown);
		pl_enter(pp, &pp->pr_lock);

		if (ph == NULL) {
			pp->pr_nout--;
			return (NULL);
		}

		pool_p_insert(pp, ph);
	}

	ph = pp->pr_curpage;
	pi = XSIMPLEQ_FIRST(&ph->ph_items);
	if (__predict_false(pi == NULL))
		panic("%s: %s: page empty", __func__, pp->pr_wchan);

	if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
		panic("%s: %s free list modified: "
		    "page %p; item addr %p; offset 0x%x=0x%lx != 0x%lx",
		    __func__, pp->pr_wchan, ph->ph_page, pi,
		    0, pi->pi_magic, POOL_IMAGIC(ph, pi));
	}

	XSIMPLEQ_REMOVE_HEAD(&ph->ph_items, pi_list);

#ifdef DIAGNOSTIC
	if (pool_debug && POOL_PHPOISON(ph)) {
		size_t pidx;
		uint32_t pval;
		if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
		    &pidx, &pval)) {
			int *ip = (int *)(pi + 1);
			panic("%s: %s free list modified: "
			    "page %p; item addr %p; offset 0x%zx=0x%x",
			    __func__, pp->pr_wchan, ph->ph_page, pi,
			    (pidx * sizeof(int)) + sizeof(*pi), ip[pidx]);
		}
	}
#endif /* DIAGNOSTIC */

	if (ph->ph_nmissing++ == 0) {
		/*
		 * This page was previously empty.  Move it to the list of
		 * partially-full pages.  This page is already curpage.
		 */
		TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_entry);
		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_entry);

		pp->pr_nidle--;
	}

	if (ph->ph_nmissing == pp->pr_itemsperpage) {
		/*
		 * This page is now full.  Move it to the full list
		 * and select a new current page.
		 */
		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_entry);
		TAILQ_INSERT_TAIL(&pp->pr_fullpages, ph, ph_entry);
		pool_update_curpage(pp);
	}

	pp->pr_nget++;

	return (pi);
}

/*
 * Return resource to the pool.
 */
void
pool_put(struct pool *pp, void *v)
{
	struct pool_page_header *ph, *freeph = NULL;

#ifdef DIAGNOSTIC
	if (v == NULL)
		panic("%s: NULL item", __func__);
#endif

	TRACEPOINT(uvm, pool_put, pp, v);

#ifdef MULTIPROCESSOR
	if (pp->pr_cache != NULL && TAILQ_EMPTY(&pp->pr_requests)) {
		pool_cache_put(pp, v);
		return;
	}
#endif

	pl_enter(pp, &pp->pr_lock);

	pool_do_put(pp, v);

	pp->pr_nout--;
	pp->pr_nput++;

	/* is it time to free a page? */
	if (pp->pr_nidle > pp->pr_maxpages &&
	    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
	    getnsecuptime() - ph->ph_timestamp > POOL_WAIT_FREE) {
		freeph = ph;
		pool_p_remove(pp, freeph);
	}

	pl_leave(pp, &pp->pr_lock);

	if (freeph != NULL)
		pool_p_free(pp, freeph);

	pool_wakeup(pp);
}

void
pool_wakeup(struct pool *pp)
{
	if (!TAILQ_EMPTY(&pp->pr_requests)) {
		pl_enter(pp, &pp->pr_requests_lock);
		pool_runqueue(pp, PR_NOWAIT);
		pl_leave(pp, &pp->pr_requests_lock);
	}
}

void
pool_do_put(struct pool *pp, void *v)
{
	struct pool_item *pi = v;
	struct pool_page_header *ph;

	splassert(pp->pr_ipl);

	ph = pr_find_pagehead(pp, v);

#ifdef DIAGNOSTIC
	if (pool_debug) {
		struct pool_item *qi;
		XSIMPLEQ_FOREACH(qi, &ph->ph_items, pi_list) {
			if (pi == qi) {
				panic("%s: %s: double pool_put: %p", __func__,
				    pp->pr_wchan, pi);
			}
		}
	}
#endif /* DIAGNOSTIC */

	pi->pi_magic = POOL_IMAGIC(ph, pi);
	XSIMPLEQ_INSERT_HEAD(&ph->ph_items, pi, pi_list);
#ifdef DIAGNOSTIC
	if (POOL_PHPOISON(ph))
		poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
#endif /* DIAGNOSTIC */

	if (ph->ph_nmissing-- == pp->pr_itemsperpage) {
		/*
		 * The page was previously completely full, move it to the
		 * partially-full list.
		 */
		TAILQ_REMOVE(&pp->pr_fullpages, ph, ph_entry);
		TAILQ_INSERT_TAIL(&pp->pr_partpages, ph, ph_entry);
	}

	if (ph->ph_nmissing == 0) {
		/*
		 * The page is now empty, so move it to the empty page list.
		 */
		pp->pr_nidle++;

		ph->ph_timestamp = getnsecuptime();
		TAILQ_REMOVE(&pp->pr_partpages, ph, ph_entry);
		TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_entry);
		pool_update_curpage(pp);
	}
}

/*
 * Add N items to the pool.
 */
int
pool_prime(struct pool *pp, int n)
{
	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
	struct pool_page_header *ph;
	int newpages;

	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	while (newpages-- > 0) {
		int slowdown = 0;

		ph = pool_p_alloc(pp, PR_NOWAIT, &slowdown);
		if (ph == NULL) /* or slowdown? */
			break;

		TAILQ_INSERT_TAIL(&pl, ph, ph_entry);
	}

	pl_enter(pp, &pp->pr_lock);
	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
		TAILQ_REMOVE(&pl, ph, ph_entry);
		pool_p_insert(pp, ph);
	}
	pl_leave(pp, &pp->pr_lock);

	return (0);
}

struct pool_page_header *
pool_p_alloc(struct pool *pp, int flags, int *slowdown)
{
	struct pool_page_header *ph;
	struct pool_item *pi;
	caddr_t addr;
	unsigned int order;
	int o;
	int n;

	pl_assert_unlocked(pp, &pp->pr_lock);
	KASSERT(pp->pr_size >= sizeof(*pi));

	addr = pool_allocator_alloc(pp, flags, slowdown);
	if (addr == NULL)
		return (NULL);

	if (POOL_INPGHDR(pp))
		ph = (struct pool_page_header *)(addr + pp->pr_phoffset);
	else {
		ph = pool_get(&phpool, flags);
		if (ph == NULL) {
			pool_allocator_free(pp, addr);
			return (NULL);
		}
	}

	XSIMPLEQ_INIT(&ph->ph_items);
	ph->ph_page = addr;
	addr += pp->pr_align * (pp->pr_npagealloc % pp->pr_maxcolors);
	ph->ph_colored = addr;
	ph->ph_nmissing = 0;
	arc4random_buf(&ph->ph_magic, sizeof(ph->ph_magic));
#ifdef DIAGNOSTIC
	/* use a bit in ph_magic to record if we poison page items */
	if (pool_debug)
		SET(ph->ph_magic, POOL_MAGICBIT);
	else
		CLR(ph->ph_magic, POOL_MAGICBIT);
#endif /* DIAGNOSTIC */

	n = pp->pr_itemsperpage;
	o = 32;
	while (n--) {
		pi = (struct pool_item *)addr;
		pi->pi_magic = POOL_IMAGIC(ph, pi);

		if (o == 32) {
			order = arc4random();
			o = 0;
		}
		if (ISSET(order, 1 << o++))
			XSIMPLEQ_INSERT_TAIL(&ph->ph_items, pi, pi_list);
		else
			XSIMPLEQ_INSERT_HEAD(&ph->ph_items, pi, pi_list);

#ifdef DIAGNOSTIC
		if (POOL_PHPOISON(ph))
			poison_mem(pi + 1, pp->pr_size - sizeof(*pi));
#endif /* DIAGNOSTIC */

		addr += pp->pr_size;
	}

	return (ph);
}

void
pool_p_free(struct pool *pp, struct pool_page_header *ph)
{
	struct pool_item *pi;

	pl_assert_unlocked(pp, &pp->pr_lock);
	KASSERT(ph->ph_nmissing == 0);

	XSIMPLEQ_FOREACH(pi, &ph->ph_items, pi_list) {
		if (__predict_false(pi->pi_magic != POOL_IMAGIC(ph, pi))) {
			panic("%s: %s free list modified: "
			    "page %p; item addr %p; offset 0x%x=0x%lx",
			    __func__, pp->pr_wchan, ph->ph_page, pi,
			    0, pi->pi_magic);
		}

#ifdef DIAGNOSTIC
		if (POOL_PHPOISON(ph)) {
			size_t pidx;
			uint32_t pval;
			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
			    &pidx, &pval)) {
				int *ip = (int *)(pi + 1);
				panic("%s: %s free list modified: "
				    "page %p; item addr %p; offset 0x%zx=0x%x",
				    __func__, pp->pr_wchan, ph->ph_page, pi,
				    pidx * sizeof(int), ip[pidx]);
			}
		}
#endif
	}

	pool_allocator_free(pp, ph->ph_page);

	if (!POOL_INPGHDR(pp))
		pool_put(&phpool, ph);
}

void
pool_p_insert(struct pool *pp, struct pool_page_header *ph)
{
	pl_assert_locked(pp, &pp->pr_lock);

	/* If the pool was depleted, point at the new page */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	TAILQ_INSERT_TAIL(&pp->pr_emptypages, ph, ph_entry);
	if (!POOL_INPGHDR(pp))
		RBT_INSERT(phtree, &pp->pr_phtree, ph);

	pp->pr_nitems += pp->pr_itemsperpage;
	pp->pr_nidle++;

	pp->pr_npagealloc++;
	if (++pp->pr_npages > pp->pr_hiwat)
		pp->pr_hiwat = pp->pr_npages;
}

void
pool_p_remove(struct pool *pp, struct pool_page_header *ph)
{
	pl_assert_locked(pp, &pp->pr_lock);

	pp->pr_npagefree++;
	pp->pr_npages--;
	pp->pr_nidle--;
	pp->pr_nitems -= pp->pr_itemsperpage;

	if (!POOL_INPGHDR(pp))
		RBT_REMOVE(phtree, &pp->pr_phtree, ph);
	TAILQ_REMOVE(&pp->pr_emptypages, ph, ph_entry);

	pool_update_curpage(pp);
}

void
pool_update_curpage(struct pool *pp)
{
	pp->pr_curpage = TAILQ_LAST(&pp->pr_partpages, pool_pagelist);
	if (pp->pr_curpage == NULL) {
		pp->pr_curpage = TAILQ_LAST(&pp->pr_emptypages, pool_pagelist);
	}
}

void
pool_setlowat(struct pool *pp, int n)
{
	int prime = 0;

	pl_enter(pp, &pp->pr_lock);
	pp->pr_minitems = n;
	pp->pr_minpages = (n == 0)
		? 0
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;

	if (pp->pr_nitems < n)
		prime = n - pp->pr_nitems;
	pl_leave(pp, &pp->pr_lock);

	if (prime > 0)
		pool_prime(pp, prime);
}

void
pool_sethiwat(struct pool *pp, int n)
{
	pp->pr_maxpages = (n == 0)
		? 0
		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
}
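
/*
 * Example: a subsystem that wants items ready before its interrupt
 * handlers run can prime the pool through the watermarks (sketch;
 * "sc_pool" and the numbers are hypothetical):
 *
 *	pool_setlowat(&sc_pool, 32);	keep at least 32 items allocated
 *	pool_sethiwat(&sc_pool, 512);	let idle pages above this be freed
 */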

int
pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
{
	int error = 0;

	if (n < pp->pr_nout) {
		error = EINVAL;
		goto done;
	}

	pp->pr_hardlimit = n;
	pp->pr_hardlimit_warning = warnmsg;
	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
	pp->pr_hardlimit_warning_last.tv_sec = 0;
	pp->pr_hardlimit_warning_last.tv_usec = 0;

done:
	return (error);
}

void
pool_set_constraints(struct pool *pp, const struct kmem_pa_mode *mode)
{
	pp->pr_crange = mode;
}
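
/*
 * Example: a pool backing DMA-capable memory can swap in one of the
 * kmem_pa_mode constraint templates exported by uvm (sketch;
 * "sc_pool" is hypothetical):
 *
 *	pool_set_constraints(&sc_pool, &kp_dma_contig);
 */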

/*
 * Release all complete pages that have not been used recently.
 *
 * Returns non-zero if any pages have been reclaimed.
 */
int
pool_reclaim(struct pool *pp)
{
	struct pool_page_header *ph, *phnext;
	struct pool_pagelist pl = TAILQ_HEAD_INITIALIZER(pl);

	pl_enter(pp, &pp->pr_lock);
	for (ph = TAILQ_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
		phnext = TAILQ_NEXT(ph, ph_entry);

		/* Check our minimum page claim */
		if (pp->pr_npages <= pp->pr_minpages)
			break;

		/*
		 * If freeing this page would put us below
		 * the low water mark, stop now.
		 */
		if ((pp->pr_nitems - pp->pr_itemsperpage) <
		    pp->pr_minitems)
			break;

		pool_p_remove(pp, ph);
		TAILQ_INSERT_TAIL(&pl, ph, ph_entry);
	}
	pl_leave(pp, &pp->pr_lock);

	if (TAILQ_EMPTY(&pl))
		return (0);

	while ((ph = TAILQ_FIRST(&pl)) != NULL) {
		TAILQ_REMOVE(&pl, ph, ph_entry);
		pool_p_free(pp, ph);
	}

	return (1);
}

/*
 * Release all complete pages that have not been used recently
 * from all pools.
 */
void
pool_reclaim_all(void)
{
	struct pool	*pp;

	rw_enter_read(&pool_lock);
	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist)
		pool_reclaim(pp);
	rw_exit_read(&pool_lock);
}

#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_output.h>

/*
 * Diagnostic helpers.
 */
void
pool_printit(struct pool *pp, const char *modif,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	pool_print1(pp, modif, pr);
}

void
pool_print_pagelist(struct pool_pagelist *pl,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct pool_page_header *ph;
	struct pool_item *pi;

	TAILQ_FOREACH(ph, pl, ph_entry) {
		(*pr)("\t\tpage %p, color %p, nmissing %d\n",
		    ph->ph_page, ph->ph_colored, ph->ph_nmissing);
		XSIMPLEQ_FOREACH(pi, &ph->ph_items, pi_list) {
			if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
				(*pr)("\t\t\titem %p, magic 0x%lx\n",
				    pi, pi->pi_magic);
			}
		}
	}
}

void
pool_print1(struct pool *pp, const char *modif,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	struct pool_page_header *ph;
	int print_pagelist = 0;
	char c;

	while ((c = *modif++) != '\0') {
		if (c == 'p')
			print_pagelist = 1;
		modif++;
	}

	(*pr)("POOL %s: size %u maxcolors %u\n", pp->pr_wchan, pp->pr_size,
	    pp->pr_maxcolors);
	(*pr)("\talloc %p\n", pp->pr_alloc);
	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);

	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);

	if (print_pagelist == 0)
		return;

	if ((ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL)
		(*pr)("\n\tempty page list:\n");
	pool_print_pagelist(&pp->pr_emptypages, pr);
	if ((ph = TAILQ_FIRST(&pp->pr_fullpages)) != NULL)
		(*pr)("\n\tfull page list:\n");
	pool_print_pagelist(&pp->pr_fullpages, pr);
	if ((ph = TAILQ_FIRST(&pp->pr_partpages)) != NULL)
		(*pr)("\n\tpartial-page list:\n");
	pool_print_pagelist(&pp->pr_partpages, pr);

	if (pp->pr_curpage == NULL)
		(*pr)("\tno current page\n");
	else
		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
}

void
db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
{
	struct pool *pp;
	char maxp[16];
	int ovflw;
	char mode;

	mode = modif[0];
	if (mode != '\0' && mode != 'a') {
		db_printf("usage: show all pools [/a]\n");
		return;
	}

	if (mode == '\0')
		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
		    "Name",
		    "Size",
		    "Requests",
		    "Fail",
		    "Releases",
		    "Pgreq",
		    "Pgrel",
		    "Npage",
		    "Hiwat",
		    "Minpg",
		    "Maxpg",
		    "Idle");
	else
		db_printf("%-12s %18s %18s\n",
		    "Name", "Address", "Allocator");

	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
		if (mode == 'a') {
			db_printf("%-12s %18p %18p\n", pp->pr_wchan, pp,
			    pp->pr_alloc);
			continue;
		}

		if (!pp->pr_nget)
			continue;

		if (pp->pr_maxpages == UINT_MAX)
			snprintf(maxp, sizeof maxp, "inf");
		else
			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);

#define PRWORD(ovflw, fmt, width, fixed, val) do {	\
	(ovflw) += db_printf((fmt),			\
	    (width) - (fixed) - (ovflw) > 0 ?		\
	    (width) - (fixed) - (ovflw) : 0,		\
	    (val)) - (width);				\
	if ((ovflw) < 0)				\
		(ovflw) = 0;				\
} while (/* CONSTCOND */0)

		ovflw = 0;
		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
		PRWORD(ovflw, " %*s", 6, 1, maxp);
		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);

		pool_chk(pp);
	}
}
#endif /* DDB */

#if defined(POOL_DEBUG) || defined(DDB)
int
pool_chk_page(struct pool *pp, struct pool_page_header *ph, int expected)
{
	struct pool_item *pi;
	caddr_t page;
	int n;
	const char *label = pp->pr_wchan;

	page = (caddr_t)((u_long)ph & pp->pr_pgmask);
	if (page != ph->ph_page && POOL_INPGHDR(pp)) {
		printf("%s: ", label);
		printf("pool(%p:%s): page inconsistency: page %p; "
		    "at page head addr %p (p %p)\n",
		    pp, pp->pr_wchan, ph->ph_page, ph, page);
		return 1;
	}

	for (pi = XSIMPLEQ_FIRST(&ph->ph_items), n = 0;
	     pi != NULL;
	     pi = XSIMPLEQ_NEXT(&ph->ph_items, pi, pi_list), n++) {
		if ((caddr_t)pi < ph->ph_page ||
		    (caddr_t)pi >= ph->ph_page + pp->pr_pgsize) {
			printf("%s: ", label);
			printf("pool(%p:%s): page inconsistency: page %p;"
			    " item ordinal %d; addr %p\n", pp,
			    pp->pr_wchan, ph->ph_page, n, pi);
			return (1);
		}

		if (pi->pi_magic != POOL_IMAGIC(ph, pi)) {
			printf("%s: ", label);
			printf("pool(%p:%s): free list modified: "
			    "page %p; item ordinal %d; addr %p "
			    "(p %p); offset 0x%x=0x%lx\n",
			    pp, pp->pr_wchan, ph->ph_page, n, pi, page,
			    0, pi->pi_magic);
		}

#ifdef DIAGNOSTIC
		if (POOL_PHPOISON(ph)) {
			size_t pidx;
			uint32_t pval;
			if (poison_check(pi + 1, pp->pr_size - sizeof(*pi),
			    &pidx, &pval)) {
				int *ip = (int *)(pi + 1);
				printf("pool(%s): free list modified: "
				    "page %p; item ordinal %d; addr %p "
				    "(p %p); offset 0x%zx=0x%x\n",
				    pp->pr_wchan, ph->ph_page, n, pi,
				    page, pidx * sizeof(int), ip[pidx]);
			}
		}
#endif /* DIAGNOSTIC */
	}
	if (n + ph->ph_nmissing != pp->pr_itemsperpage) {
		printf("pool(%p:%s): page inconsistency: page %p;"
		    " %d on list, %d missing, %d items per page\n", pp,
		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
		    pp->pr_itemsperpage);
		return 1;
	}
	if (expected >= 0 && n != expected) {
		printf("pool(%p:%s): page inconsistency: page %p;"
		    " %d on list, %d missing, %d expected\n", pp,
		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
		    expected);
		return 1;
	}
	return 0;
}

int
pool_chk(struct pool *pp)
{
	struct pool_page_header *ph;
	int r = 0;

	TAILQ_FOREACH(ph, &pp->pr_emptypages, ph_entry)
		r += pool_chk_page(pp, ph, pp->pr_itemsperpage);
	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_entry)
		r += pool_chk_page(pp, ph, 0);
	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_entry)
		r += pool_chk_page(pp, ph, -1);

	return (r);
}
#endif /* defined(POOL_DEBUG) || defined(DDB) */

#ifdef DDB
void
pool_walk(struct pool *pp, int full,
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))),
    void (*func)(void *, int, int (*)(const char *, ...)
	    __attribute__((__format__(__kprintf__,1,2)))))
{
	struct pool_page_header *ph;
	struct pool_item *pi;
	caddr_t cp;
	int n;

	TAILQ_FOREACH(ph, &pp->pr_fullpages, ph_entry) {
		cp = ph->ph_colored;
		n = ph->ph_nmissing;

		while (n--) {
			func(cp, full, pr);
			cp += pp->pr_size;
		}
	}

	TAILQ_FOREACH(ph, &pp->pr_partpages, ph_entry) {
		cp = ph->ph_colored;
		n = ph->ph_nmissing;

		do {
			XSIMPLEQ_FOREACH(pi, &ph->ph_items, pi_list) {
				if (cp == (caddr_t)pi)
					break;
			}
			if (cp != (caddr_t)pi) {
				func(cp, full, pr);
				n--;
			}

			cp += pp->pr_size;
		} while (n > 0);
	}
}
#endif

/*
 * We have five different sysctls.
 * kern.pool.npools - the number of pools.
 * kern.pool.pool.<pool#> - the pool struct for the pool#.
 * kern.pool.name.<pool#> - the name for pool#.
 * kern.pool.cache.<pool#> - the per-pool cache info for pool#.
 * kern.pool.cache_cpus.<pool#> - the per-CPU cache info for pool#.
 */
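
/*
 * Userland sketch (hypothetical, error handling omitted) reading the
 * pool count with sysctl(2):
 *
 *	int mib[] = { CTL_KERN, KERN_POOL, KERN_POOL_NPOOLS };
 *	int npools;
 *	size_t len = sizeof(npools);
 *
 *	sysctl(mib, 3, &npools, &len, NULL, 0);
 */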
int
sysctl_dopool(int *name, u_int namelen, char *oldp, size_t *oldlenp)
{
	struct kinfo_pool pi;
	struct pool *pp;
	int rv = ENOENT;

	switch (name[0]) {
	case KERN_POOL_NPOOLS:
		if (namelen != 1)
			return (ENOTDIR);
		return (sysctl_rdint(oldp, oldlenp, NULL, pool_count));

	case KERN_POOL_NAME:
	case KERN_POOL_POOL:
	case KERN_POOL_CACHE:
	case KERN_POOL_CACHE_CPUS:
		break;
	default:
		return (EOPNOTSUPP);
	}

	if (namelen != 2)
		return (ENOTDIR);

	rw_enter_read(&pool_lock);

	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
		if (name[1] == pp->pr_serial)
			break;
	}

	if (pp == NULL)
		goto done;

	switch (name[0]) {
	case KERN_POOL_NAME:
		rv = sysctl_rdstring(oldp, oldlenp, NULL, pp->pr_wchan);
		break;
	case KERN_POOL_POOL:
		memset(&pi, 0, sizeof(pi));

		pl_enter(pp, &pp->pr_lock);
		pi.pr_size = pp->pr_size;
		pi.pr_pgsize = pp->pr_pgsize;
		pi.pr_itemsperpage = pp->pr_itemsperpage;
		pi.pr_npages = pp->pr_npages;
		pi.pr_minpages = pp->pr_minpages;
		pi.pr_maxpages = pp->pr_maxpages;
		pi.pr_hardlimit = pp->pr_hardlimit;
		pi.pr_nout = pp->pr_nout;
		pi.pr_nitems = pp->pr_nitems;
		pi.pr_nget = pp->pr_nget;
		pi.pr_nput = pp->pr_nput;
		pi.pr_nfail = pp->pr_nfail;
		pi.pr_npagealloc = pp->pr_npagealloc;
		pi.pr_npagefree = pp->pr_npagefree;
		pi.pr_hiwat = pp->pr_hiwat;
		pi.pr_nidle = pp->pr_nidle;
		pl_leave(pp, &pp->pr_lock);

		pool_cache_pool_info(pp, &pi);

		rv = sysctl_rdstruct(oldp, oldlenp, NULL, &pi, sizeof(pi));
		break;

	case KERN_POOL_CACHE:
		rv = pool_cache_info(pp, oldp, oldlenp);
		break;

	case KERN_POOL_CACHE_CPUS:
		rv = pool_cache_cpus_info(pp, oldp, oldlenp);
		break;
	}

done:
	rw_exit_read(&pool_lock);

	return (rv);
}

void
pool_gc_sched(void *null)
{
	task_add(systqmp, &pool_gc_task);
}

void
pool_gc_pages(void *null)
{
	struct pool *pp;
	struct pool_page_header *ph, *freeph;
	int s;

	rw_enter_read(&pool_lock);
	s = splvm(); /* XXX go to splvm until all pools _setipl properly */
	SIMPLEQ_FOREACH(pp, &pool_head, pr_poollist) {
#ifdef MULTIPROCESSOR
		if (pp->pr_cache != NULL)
			pool_cache_gc(pp);
#endif

		if (pp->pr_nidle <= pp->pr_minpages || /* guess */
		    !pl_enter_try(pp, &pp->pr_lock)) /* try */
			continue;

		/* is it time to free a page? */
		if (pp->pr_nidle > pp->pr_minpages &&
		    (ph = TAILQ_FIRST(&pp->pr_emptypages)) != NULL &&
		    getnsecuptime() - ph->ph_timestamp > POOL_WAIT_GC) {
			freeph = ph;
			pool_p_remove(pp, freeph);
		} else
			freeph = NULL;

		pl_leave(pp, &pp->pr_lock);

		if (freeph != NULL)
			pool_p_free(pp, freeph);
	}
	splx(s);
	rw_exit_read(&pool_lock);

	timeout_add_sec(&pool_gc_tick, 1);
}

/*
 * Pool backend allocators.
 */

void *
pool_allocator_alloc(struct pool *pp, int flags, int *slowdown)
{
	void *v;

	v = (*pp->pr_alloc->pa_alloc)(pp, flags, slowdown);

#ifdef DIAGNOSTIC
	if (v != NULL && POOL_INPGHDR(pp)) {
		vaddr_t addr = (vaddr_t)v;
		if ((addr & pp->pr_pgmask) != addr) {
			panic("%s: %s page address %p isn't aligned to %u",
			    __func__, pp->pr_wchan, v, pp->pr_pgsize);
		}
	}
#endif

	return (v);
}

void
pool_allocator_free(struct pool *pp, void *v)
{
	struct pool_allocator *pa = pp->pr_alloc;

	(*pa->pa_free)(pp, v);
}

void *
pool_page_alloc(struct pool *pp, int flags, int *slowdown)
{
	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;

	kd.kd_waitok = ISSET(flags, PR_WAITOK);
	kd.kd_slowdown = slowdown;

	return (km_alloc(pp->pr_pgsize, &kv_page, pp->pr_crange, &kd));
}

void
pool_page_free(struct pool *pp, void *v)
{
	km_free(v, pp->pr_pgsize, &kv_page, pp->pr_crange);
}

void *
pool_multi_alloc(struct pool *pp, int flags, int *slowdown)
{
	struct kmem_va_mode kv = kv_intrsafe;
	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
	void *v;
	int s;

	if (POOL_INPGHDR(pp))
		kv.kv_align = pp->pr_pgsize;

	kd.kd_waitok = ISSET(flags, PR_WAITOK);
	kd.kd_slowdown = slowdown;

	s = splvm();
	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
	splx(s);

	return (v);
}

void
pool_multi_free(struct pool *pp, void *v)
{
	struct kmem_va_mode kv = kv_intrsafe;
	int s;

	if (POOL_INPGHDR(pp))
		kv.kv_align = pp->pr_pgsize;

	s = splvm();
	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
	splx(s);
}

void *
pool_multi_alloc_ni(struct pool *pp, int flags, int *slowdown)
{
	struct kmem_va_mode kv = kv_any;
	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
	void *v;

	if (POOL_INPGHDR(pp))
		kv.kv_align = pp->pr_pgsize;

	kd.kd_waitok = ISSET(flags, PR_WAITOK);
	kd.kd_slowdown = slowdown;

	KERNEL_LOCK();
	v = km_alloc(pp->pr_pgsize, &kv, pp->pr_crange, &kd);
	KERNEL_UNLOCK();

	return (v);
}

void
pool_multi_free_ni(struct pool *pp, void *v)
{
	struct kmem_va_mode kv = kv_any;

	if (POOL_INPGHDR(pp))
		kv.kv_align = pp->pr_pgsize;

	KERNEL_LOCK();
	km_free(v, pp->pr_pgsize, &kv, pp->pr_crange);
	KERNEL_UNLOCK();
}

#ifdef MULTIPROCESSOR

struct pool pool_caches; /* per cpu cache entries */

void
pool_cache_init(struct pool *pp)
{
	struct cpumem *cm;
	struct pool_cache *pc;
	struct cpumem_iter i;

	if (pool_caches.pr_size == 0) {
		pool_init(&pool_caches, sizeof(struct pool_cache),
		    CACHELINESIZE, IPL_NONE, PR_WAITOK | PR_RWLOCK,
		    "plcache", NULL);
	}

	/* must be able to use the pool items as cache list items */
	KASSERT(pp->pr_size >= sizeof(struct pool_cache_item));

	cm = cpumem_get(&pool_caches);

	pl_init(pp, &pp->pr_cache_lock);
	arc4random_buf(pp->pr_cache_magic, sizeof(pp->pr_cache_magic));
	TAILQ_INIT(&pp->pr_cache_lists);
	pp->pr_cache_nitems = 0;
	pp->pr_cache_timestamp = getnsecuptime();
	pp->pr_cache_items = 8;
	pp->pr_cache_contention = 0;
	pp->pr_cache_ngc = 0;

	CPUMEM_FOREACH(pc, &i, cm) {
		pc->pc_actv = NULL;
		pc->pc_nactv = 0;
		pc->pc_prev = NULL;

		pc->pc_nget = 0;
		pc->pc_nfail = 0;
		pc->pc_nput = 0;
		pc->pc_nlget = 0;
		pc->pc_nlfail = 0;
		pc->pc_nlput = 0;
		pc->pc_nout = 0;
	}

	membar_producer();

	pp->pr_cache = cm;
}

static inline void
pool_cache_item_magic(struct pool *pp, struct pool_cache_item *ci)
{
	unsigned long *entry = (unsigned long *)&ci->ci_nextl;

	entry[0] = pp->pr_cache_magic[0] ^ (u_long)ci;
	entry[1] = pp->pr_cache_magic[1] ^ (u_long)ci->ci_next;
}

static inline void
pool_cache_item_magic_check(struct pool *pp, struct pool_cache_item *ci)
{
	unsigned long *entry;
	unsigned long val;

	entry = (unsigned long *)&ci->ci_nextl;
	val = pp->pr_cache_magic[0] ^ (u_long)ci;
	if (*entry != val)
		goto fail;

	entry++;
	val = pp->pr_cache_magic[1] ^ (u_long)ci->ci_next;
	if (*entry != val)
		goto fail;

	return;

fail:
	panic("%s: %s cpu free list modified: item addr %p+%zu 0x%lx!=0x%lx",
	    __func__, pp->pr_wchan, ci, (caddr_t)entry - (caddr_t)ci,
	    *entry, val);
}

static inline void
pool_list_enter(struct pool *pp)
{
	if (pl_enter_try(pp, &pp->pr_cache_lock) == 0) {
		pl_enter(pp, &pp->pr_cache_lock);
		pp->pr_cache_contention++;
	}
}

static inline void
pool_list_leave(struct pool *pp)
{
	pl_leave(pp, &pp->pr_cache_lock);
}

static inline struct pool_cache_item *
pool_cache_list_alloc(struct pool *pp, struct pool_cache *pc)
{
	struct pool_cache_item *pl;

	pool_list_enter(pp);
	pl = TAILQ_FIRST(&pp->pr_cache_lists);
	if (pl != NULL) {
		TAILQ_REMOVE(&pp->pr_cache_lists, pl, ci_nextl);
		pp->pr_cache_nitems -= POOL_CACHE_ITEM_NITEMS(pl);

		pool_cache_item_magic(pp, pl);

		pc->pc_nlget++;
	} else
		pc->pc_nlfail++;

	/* fold this CPU's nout into the global while we have the lock */
	pp->pr_cache_nout += pc->pc_nout;
	pc->pc_nout = 0;
	pool_list_leave(pp);

	return (pl);
}

static inline void
pool_cache_list_free(struct pool *pp, struct pool_cache *pc,
    struct pool_cache_item *ci)
{
	pool_list_enter(pp);
	if (TAILQ_EMPTY(&pp->pr_cache_lists))
		pp->pr_cache_timestamp = getnsecuptime();

	pp->pr_cache_nitems += POOL_CACHE_ITEM_NITEMS(ci);
	TAILQ_INSERT_TAIL(&pp->pr_cache_lists, ci, ci_nextl);

	pc->pc_nlput++;

	/* fold this CPU's nout into the global while we have the lock */
	pp->pr_cache_nout += pc->pc_nout;
	pc->pc_nout = 0;
	pool_list_leave(pp);
}

static inline struct pool_cache *
pool_cache_enter(struct pool *pp, int *s)
{
	struct pool_cache *pc;

	pc = cpumem_enter(pp->pr_cache);
	*s = splraise(pp->pr_ipl);
	pc->pc_gen++;

	return (pc);
}

static inline void
pool_cache_leave(struct pool *pp, struct pool_cache *pc, int s)
{
	pc->pc_gen++;
	splx(s);
	cpumem_leave(pp->pr_cache, pc);
}

void *
pool_cache_get(struct pool *pp)
{
	struct pool_cache *pc;
	struct pool_cache_item *ci;
	int s;

	pc = pool_cache_enter(pp, &s);

	if (pc->pc_actv != NULL) {
		ci = pc->pc_actv;
	} else if (pc->pc_prev != NULL) {
		ci = pc->pc_prev;
		pc->pc_prev = NULL;
	} else if ((ci = pool_cache_list_alloc(pp, pc)) == NULL) {
		pc->pc_nfail++;
		goto done;
	}

	pool_cache_item_magic_check(pp, ci);
#ifdef DIAGNOSTIC
	if (pool_debug && POOL_CACHE_ITEM_POISONED(ci)) {
		size_t pidx;
		uint32_t pval;

		if (poison_check(ci + 1, pp->pr_size - sizeof(*ci),
		    &pidx, &pval)) {
			int *ip = (int *)(ci + 1);
			ip += pidx;

			panic("%s: %s cpu free list modified: "
			    "item addr %p+%zu 0x%x!=0x%x",
			    __func__, pp->pr_wchan, ci,
			    (caddr_t)ip - (caddr_t)ci, *ip, pval);
		}
	}
#endif

	pc->pc_actv = ci->ci_next;
	pc->pc_nactv = POOL_CACHE_ITEM_NITEMS(ci) - 1;
	pc->pc_nget++;
	pc->pc_nout++;

done:
	pool_cache_leave(pp, pc, s);

	return (ci);
}

void
pool_cache_put(struct pool *pp, void *v)
{
	struct pool_cache *pc;
	struct pool_cache_item *ci = v;
	unsigned long nitems;
	int s;
#ifdef DIAGNOSTIC
	int poison = pool_debug && pp->pr_size > sizeof(*ci);

	if (poison)
		poison_mem(ci + 1, pp->pr_size - sizeof(*ci));
#endif

	pc = pool_cache_enter(pp, &s);

	nitems = pc->pc_nactv;
	if (nitems >= pp->pr_cache_items) {
		if (pc->pc_prev != NULL)
			pool_cache_list_free(pp, pc, pc->pc_prev);

		pc->pc_prev = pc->pc_actv;

		pc->pc_actv = NULL;
		pc->pc_nactv = 0;
		nitems = 0;
	}

	ci->ci_next = pc->pc_actv;
	ci->ci_nitems = ++nitems;
#ifdef DIAGNOSTIC
	ci->ci_nitems |= poison ? POOL_CACHE_ITEM_NITEMS_POISON : 0;
#endif
	pool_cache_item_magic(pp, ci);

	pc->pc_actv = ci;
	pc->pc_nactv = nitems;

	pc->pc_nput++;
	pc->pc_nout--;

	pool_cache_leave(pp, pc, s);
}

struct pool_cache_item *
pool_cache_list_put(struct pool *pp, struct pool_cache_item *pl)
{
	struct pool_cache_item *rpl, *next;

	if (pl == NULL)
		return (NULL);

	rpl = TAILQ_NEXT(pl, ci_nextl);

	pl_enter(pp, &pp->pr_lock);
	do {
		next = pl->ci_next;
		pool_do_put(pp, pl);
		pl = next;
	} while (pl != NULL);
	pl_leave(pp, &pp->pr_lock);

	return (rpl);
}

void
pool_cache_destroy(struct pool *pp)
{
	struct pool_cache *pc;
	struct pool_cache_item *pl;
	struct cpumem_iter i;
	struct cpumem *cm;

	rw_enter_write(&pool_lock); /* serialise with the gc */
	cm = pp->pr_cache;
	pp->pr_cache = NULL; /* make pool_put avoid the cache */
	rw_exit_write(&pool_lock);

	CPUMEM_FOREACH(pc, &i, cm) {
		pool_cache_list_put(pp, pc->pc_actv);
		pool_cache_list_put(pp, pc->pc_prev);
	}

	cpumem_put(&pool_caches, cm);

	pl = TAILQ_FIRST(&pp->pr_cache_lists);
	while (pl != NULL)
		pl = pool_cache_list_put(pp, pl);
}

void
pool_cache_gc(struct pool *pp)
{
	unsigned int contention, delta;

	if (getnsecuptime() - pp->pr_cache_timestamp > POOL_WAIT_GC &&
	    !TAILQ_EMPTY(&pp->pr_cache_lists) &&
	    pl_enter_try(pp, &pp->pr_cache_lock)) {
		struct pool_cache_item *pl = NULL;

		pl = TAILQ_FIRST(&pp->pr_cache_lists);
		if (pl != NULL) {
			TAILQ_REMOVE(&pp->pr_cache_lists, pl, ci_nextl);
			pp->pr_cache_nitems -= POOL_CACHE_ITEM_NITEMS(pl);
			pp->pr_cache_timestamp = getnsecuptime();

			pp->pr_cache_ngc++;
		}

		pl_leave(pp, &pp->pr_cache_lock);

		pool_cache_list_put(pp, pl);
	}

	/*
	 * If there's a lot of contention on the pr_cache_lock then
	 * consider growing the length of the per-cpu lists to reduce
	 * the need to access the global pool.
	 */

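	/*
	 * For example, with the magic numbers below: an 8 cpu machine
	 * only grows its lists once the cached lists already hold at
	 * least 8 * 8 * 2 == 128 items, and while the lock stays
	 * uncontended the lists shrink by one item per run, but never
	 * below 8 items.
	 */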
	contention = pp->pr_cache_contention;
	delta = contention - pp->pr_cache_contention_prev;
	if (delta > 8 /* magic */) {
		if ((ncpusfound * 8 * 2) <= pp->pr_cache_nitems)
			pp->pr_cache_items += 8;
	} else if (delta == 0) {
		if (pp->pr_cache_items > 8)
			pp->pr_cache_items--;
	}
	pp->pr_cache_contention_prev = contention;
}

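/*
 * Fold the per-cpu cache counters into the kinfo_pool record for
 * sysctl(2). pc_gen is a generation counter: it is odd while the
 * owning cpu is between pool_cache_enter() and pool_cache_leave(), so
 * a reader that observes the same even value before and after copying
 * the counters knows it took a consistent snapshot without a lock.
 */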
void
pool_cache_pool_info(struct pool *pp, struct kinfo_pool *pi)
{
	struct pool_cache *pc;
	struct cpumem_iter i;

	if (pp->pr_cache == NULL)
		return;

	/* loop through the caches twice to collect stats */

	/* once without the lock so we can yield while reading nget/nput */
	CPUMEM_FOREACH(pc, &i, pp->pr_cache) {
		uint64_t gen, nget, nput;

		do {
			while ((gen = pc->pc_gen) & 1)
				yield();

			nget = pc->pc_nget;
			nput = pc->pc_nput;
		} while (gen != pc->pc_gen);

		pi->pr_nget += nget;
		pi->pr_nput += nput;
	}

	/* and once with the lock so we can get consistent nout values */
	pl_enter(pp, &pp->pr_cache_lock);
	CPUMEM_FOREACH(pc, &i, pp->pr_cache)
		pi->pr_nout += pc->pc_nout;

	pi->pr_nout += pp->pr_cache_nout;
	pl_leave(pp, &pp->pr_cache_lock);
}

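/*
 * Export the pool's list cache state as a struct kinfo_pool_cache for
 * sysctl(2). The counters are read under the cache lock so they are
 * consistent with each other.
 */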
int
pool_cache_info(struct pool *pp, void *oldp, size_t *oldlenp)
{
	struct kinfo_pool_cache kpc;

	if (pp->pr_cache == NULL)
		return (EOPNOTSUPP);

	memset(&kpc, 0, sizeof(kpc)); /* don't leak padding */

	pl_enter(pp, &pp->pr_cache_lock);
	kpc.pr_ngc = pp->pr_cache_ngc;
	kpc.pr_len = pp->pr_cache_items;
	kpc.pr_nitems = pp->pr_cache_nitems;
	kpc.pr_contention = pp->pr_cache_contention;
	pl_leave(pp, &pp->pr_cache_lock);

	return (sysctl_rdstruct(oldp, oldlenp, NULL, &kpc, sizeof(kpc)));
}

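/*
 * Export one struct kinfo_pool_cache_cpu per cpu for sysctl(2). The
 * per-cpu counters are snapshotted with the same pc_gen scheme used
 * by pool_cache_pool_info() above.
 */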
int
pool_cache_cpus_info(struct pool *pp, void *oldp, size_t *oldlenp)
{
	struct pool_cache *pc;
	struct kinfo_pool_cache_cpu *kpcc, *info;
	unsigned int cpu = 0;
	struct cpumem_iter i;
	int error = 0;
	size_t len;

	if (pp->pr_cache == NULL)
		return (EOPNOTSUPP);
	if (*oldlenp % sizeof(*kpcc))
		return (EINVAL);

	kpcc = mallocarray(ncpusfound, sizeof(*kpcc), M_TEMP,
	    M_WAITOK|M_CANFAIL|M_ZERO);
	if (kpcc == NULL)
		return (EIO);

	len = ncpusfound * sizeof(*kpcc);

	CPUMEM_FOREACH(pc, &i, pp->pr_cache) {
		uint64_t gen;

		if (cpu >= ncpusfound) {
			error = EIO;
			goto err;
		}

		info = &kpcc[cpu];
		info->pr_cpu = cpu;

		do {
			while ((gen = pc->pc_gen) & 1)
				yield();

			info->pr_nget = pc->pc_nget;
			info->pr_nfail = pc->pc_nfail;
			info->pr_nput = pc->pc_nput;
			info->pr_nlget = pc->pc_nlget;
			info->pr_nlfail = pc->pc_nlfail;
			info->pr_nlput = pc->pc_nlput;
		} while (gen != pc->pc_gen);

		cpu++;
	}

	error = sysctl_rdstruct(oldp, oldlenp, NULL, kpcc, len);
err:
	free(kpcc, M_TEMP, len);

	return (error);
}
#else /* MULTIPROCESSOR */
void
pool_cache_init(struct pool *pp)
{
	/* nop */
}

void
pool_cache_pool_info(struct pool *pp, struct kinfo_pool *pi)
{
	/* nop */
}

int
pool_cache_info(struct pool *pp, void *oldp, size_t *oldlenp)
{
	return (EOPNOTSUPP);
}

int
pool_cache_cpus_info(struct pool *pp, void *oldp, size_t *oldlenp)
{
	return (EOPNOTSUPP);
}
#endif /* MULTIPROCESSOR */

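/*
 * Concrete implementations of the pool_lock_ops interface. A pool is
 * normally protected by a mutex initialised at the pool's IPL; pools
 * created with the PR_RWLOCK flag use a write lock on an rwlock
 * instead. The choice is made once, in pool_init() earlier in this
 * file, roughly along these lines:
 *
 *	pp->pr_lock_ops = ISSET(flags, PR_RWLOCK) ?
 *	    &pool_lock_ops_rw : &pool_lock_ops_mtx;
 */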
void
pool_lock_mtx_init(struct pool *pp, union pool_lock *lock,
    const struct lock_type *type)
{
	_mtx_init_flags(&lock->prl_mtx, pp->pr_ipl, pp->pr_wchan, 0, type);
}

void
pool_lock_mtx_enter(union pool_lock *lock)
{
	mtx_enter(&lock->prl_mtx);
}

int
pool_lock_mtx_enter_try(union pool_lock *lock)
{
	return (mtx_enter_try(&lock->prl_mtx));
}

void
pool_lock_mtx_leave(union pool_lock *lock)
{
	mtx_leave(&lock->prl_mtx);
}

void
pool_lock_mtx_assert_locked(union pool_lock *lock)
{
	MUTEX_ASSERT_LOCKED(&lock->prl_mtx);
}

void
pool_lock_mtx_assert_unlocked(union pool_lock *lock)
{
	MUTEX_ASSERT_UNLOCKED(&lock->prl_mtx);
}

int
pool_lock_mtx_sleep(void *ident, union pool_lock *lock, int priority,
    const char *wmesg)
{
	return msleep_nsec(ident, &lock->prl_mtx, priority, wmesg, INFSLP);
}

static const struct pool_lock_ops pool_lock_ops_mtx = {
	pool_lock_mtx_init,
	pool_lock_mtx_enter,
	pool_lock_mtx_enter_try,
	pool_lock_mtx_leave,
	pool_lock_mtx_assert_locked,
	pool_lock_mtx_assert_unlocked,
	pool_lock_mtx_sleep,
};

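/*
 * The rwlock-backed variant, selected with PR_RWLOCK. The try
 * operation passes RW_NOSLEEP so it fails immediately rather than
 * sleeping when the lock is contended, matching the semantics of
 * mtx_enter_try() above.
 */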
void
pool_lock_rw_init(struct pool *pp, union pool_lock *lock,
    const struct lock_type *type)
{
	_rw_init_flags(&lock->prl_rwlock, pp->pr_wchan, 0, type);
}

void
pool_lock_rw_enter(union pool_lock *lock)
{
	rw_enter_write(&lock->prl_rwlock);
}

int
pool_lock_rw_enter_try(union pool_lock *lock)
{
	return (rw_enter(&lock->prl_rwlock, RW_WRITE | RW_NOSLEEP) == 0);
}

void
pool_lock_rw_leave(union pool_lock *lock)
{
	rw_exit_write(&lock->prl_rwlock);
}

void
pool_lock_rw_assert_locked(union pool_lock *lock)
{
	rw_assert_wrlock(&lock->prl_rwlock);
}

void
pool_lock_rw_assert_unlocked(union pool_lock *lock)
{
	KASSERT(rw_status(&lock->prl_rwlock) != RW_WRITE);
}

int
pool_lock_rw_sleep(void *ident, union pool_lock *lock, int priority,
    const char *wmesg)
{
	return rwsleep_nsec(ident, &lock->prl_rwlock, priority, wmesg, INFSLP);
}

static const struct pool_lock_ops pool_lock_ops_rw = {
	pool_lock_rw_init,
	pool_lock_rw_enter,
	pool_lock_rw_enter_try,
	pool_lock_rw_leave,
	pool_lock_rw_assert_locked,
	pool_lock_rw_assert_unlocked,
	pool_lock_rw_sleep,
};