xref: /openbsd/sys/kern/subr_pool.c (revision 898184e3)
1 /*	$OpenBSD: subr_pool.c,v 1.114 2013/02/17 17:39:29 miod Exp $	*/
2 /*	$NetBSD: subr_pool.c,v 1.61 2001/09/26 07:14:56 chs Exp $	*/
3 
4 /*-
5  * Copyright (c) 1997, 1999, 2000 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
25  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31  * POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/proc.h>
37 #include <sys/errno.h>
38 #include <sys/kernel.h>
39 #include <sys/malloc.h>
40 #include <sys/pool.h>
41 #include <sys/syslog.h>
42 #include <sys/sysctl.h>
43 
44 #include <uvm/uvm.h>
45 #include <dev/rndvar.h>
46 
47 /*
48  * Pool resource management utility.
49  *
50  * Memory is allocated in pages which are split into pieces according to
51  * the pool item size. Each page is kept on one of three lists in the
52  * pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
53  * for empty, full and partially-full pages respectively. The individual
54  * pool items are on a linked list headed by `ph_itemlist' in each page
55  * header. The memory for building the page list is either taken from
56  * the allocated pages themselves (for small pool items) or taken from
57  * an internal pool of page headers (`phpool').
58  */
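
/*
 * Illustrative usage sketch (not part of this file; "struct foo",
 * "foopl" and IPL_BIO are placeholders): a typical consumer creates a
 * pool once and then gets and puts fixed-size items from it.
 *
 *	struct pool foopool;
 *	struct foo *f;
 *
 *	pool_init(&foopool, sizeof(struct foo), 0, 0, 0, "foopl", NULL);
 *	pool_setipl(&foopool, IPL_BIO);
 *
 *	f = pool_get(&foopool, PR_WAITOK | PR_ZERO);
 *	...
 *	pool_put(&foopool, f);
 */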
59 
60 /* List of all pools */
61 TAILQ_HEAD(,pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);
62 
63 /* Private pool for page header structures */
64 struct pool phpool;
65 
66 struct pool_item_header {
67 	/* Page headers */
68 	LIST_ENTRY(pool_item_header)
69 				ph_pagelist;	/* pool page list */
70 	TAILQ_HEAD(,pool_item)	ph_itemlist;	/* chunk list for this page */
71 	RB_ENTRY(pool_item_header)
72 				ph_node;	/* Off-page page headers */
73 	int			ph_nmissing;	/* # of chunks in use */
74 	caddr_t			ph_page;	/* this page's address */
75 	caddr_t			ph_colored;	/* page's colored address */
76 	int			ph_pagesize;
77 	int			ph_magic;
78 };
79 
80 struct pool_item {
81 #ifdef DIAGNOSTIC
82 	u_int32_t pi_magic;
83 #endif
84 	/* Other entries use only this list entry */
85 	TAILQ_ENTRY(pool_item)	pi_list;
86 };
87 
88 #ifdef DEADBEEF1
89 #define	PI_MAGIC DEADBEEF1
90 #else
91 #define	PI_MAGIC 0xdeafbeef
92 #endif
93 
94 #ifdef POOL_DEBUG
95 int	pool_debug = 1;
96 #else
97 int	pool_debug = 0;
98 #endif
99 
100 #define	POOL_NEEDS_CATCHUP(pp)						\
101 	((pp)->pr_nitems < (pp)->pr_minitems)
102 
103 /*
104  * Every pool gets a unique serial number assigned to it. If this counter
105  * wraps, we're screwed, but we shouldn't create so many pools anyway.
106  */
107 unsigned int pool_serial;
108 
109 int	 pool_catchup(struct pool *);
110 void	 pool_prime_page(struct pool *, caddr_t, struct pool_item_header *);
111 void	 pool_update_curpage(struct pool *);
112 void	*pool_do_get(struct pool *, int);
113 void	 pool_do_put(struct pool *, void *);
114 void	 pr_rmpage(struct pool *, struct pool_item_header *,
115 	    struct pool_pagelist *);
116 int	 pool_chk_page(struct pool *, struct pool_item_header *, int);
117 int	 pool_chk(struct pool *);
118 struct pool_item_header *pool_alloc_item_header(struct pool *, caddr_t , int);
119 
120 void	*pool_allocator_alloc(struct pool *, int, int *);
121 void	 pool_allocator_free(struct pool *, void *);
122 
123 /*
124  * XXX - quick hack. For pools with large items we want to use a special
125  *       allocator. For now, instead of having the allocator derive the
126  *       allocation size from the pool (which would be trivial with
127  *       round_page(pr_itemsperpage * pr_size), but would require changes
128  *       all over the place), we just create an allocator for each size.
129  *       We limit those to 128 pages.
130  */
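/*
 * Worked example of the per-size selection done in pool_init() below
 * (illustrative, assuming a 4096-byte PAGE_SIZE): a 5000-byte item is
 * not a multiple of the page size, so size % PAGE_SIZE = 904,
 * roundup(904, 1024) = 1024 and psize = 4096 / 1024 = 4;
 * pool_allocator_large[3] then hands out 16384-byte allocations, each
 * holding three 5000-byte items.  An 8192-byte item is exactly two
 * pages, so psize = 2 and each allocation holds a single item.
 */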
131 #define POOL_LARGE_MAXPAGES 128
132 struct pool_allocator pool_allocator_large[POOL_LARGE_MAXPAGES];
133 struct pool_allocator pool_allocator_large_ni[POOL_LARGE_MAXPAGES];
134 void	*pool_large_alloc(struct pool *, int, int *);
135 void	pool_large_free(struct pool *, void *);
136 void	*pool_large_alloc_ni(struct pool *, int, int *);
137 void	pool_large_free_ni(struct pool *, void *);
138 
139 
140 #ifdef DDB
141 void	 pool_print_pagelist(struct pool_pagelist *, int (*)(const char *, ...)
142 	    /* __attribute__((__format__(__kprintf__,1,2))) */);
143 void	 pool_print1(struct pool *, const char *, int (*)(const char *, ...)
144 	    /* __attribute__((__format__(__kprintf__,1,2))) */);
145 #endif
146 
147 #define pool_sleep(pl) msleep(pl, &pl->pr_mtx, PSWP, pl->pr_wchan, 0)
148 
149 static __inline int
150 phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
151 {
152 	long diff = (vaddr_t)a->ph_page - (vaddr_t)b->ph_page;
153 	if (diff < 0)
154 		return -(-diff >= a->ph_pagesize);
155 	else if (diff > 0)
156 		return (diff >= b->ph_pagesize);
157 	else
158 		return (0);
159 }
160 
161 RB_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
162 RB_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
163 
164 /*
165  * Return the pool page header based on page address.
166  */
167 static __inline struct pool_item_header *
168 pr_find_pagehead(struct pool *pp, void *v)
169 {
170 	struct pool_item_header *ph, tmp;
171 
172 	if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
173 		caddr_t page;
174 
175 		page = (caddr_t)((vaddr_t)v & pp->pr_alloc->pa_pagemask);
176 
177 		return ((struct pool_item_header *)(page + pp->pr_phoffset));
178 	}
179 
180 	/*
181 	 * The trick we're using in the tree compare function is to consider
182 	 * two elements equal when they overlap. We want to return the
183 	 * page header that belongs to the element just before this address.
184 	 * We don't want this element to compare equal to the next element,
185 	 * so the compare function takes the pagesize from the lower element.
186 	 * If this header is the lower, its pagesize is zero, so it can't
187 	 * overlap with the next header. But if the header we're looking for
188 	 * is lower, we'll use its pagesize and it will overlap and return
189 	 * equal.
190 	 */
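	/*
	 * Illustrative walk-through (made-up addresses, 4 KB pages): say a
	 * header with ph_page = 0x1000 and ph_pagesize = 0x1000 is in the
	 * tree and we look up v = 0x1234.  Comparing tmp (0x1234, pagesize
	 * 0) against that header gives diff = 0x234 > 0, and 0x234 is less
	 * than the header's pagesize, so they compare equal and RB_FIND
	 * returns it.  Against the next header (ph_page = 0x2000) diff is
	 * negative and tmp's pagesize of zero can never overlap it, so we
	 * cannot match the wrong page.
	 */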
191 	tmp.ph_page = v;
192 	tmp.ph_pagesize = 0;
193 	ph = RB_FIND(phtree, &pp->pr_phtree, &tmp);
194 
195 	if (ph) {
196 		KASSERT(ph->ph_page <= (caddr_t)v);
197 		KASSERT(ph->ph_page + ph->ph_pagesize > (caddr_t)v);
198 	}
199 	return ph;
200 }
201 
202 /*
203  * Remove a page from the pool.
204  */
205 void
206 pr_rmpage(struct pool *pp, struct pool_item_header *ph,
207     struct pool_pagelist *pq)
208 {
209 
210 	/*
211 	 * If the page was idle, decrement the idle page count.
212 	 */
213 	if (ph->ph_nmissing == 0) {
214 #ifdef DIAGNOSTIC
215 		if (pp->pr_nidle == 0)
216 			panic("pr_rmpage: nidle inconsistent");
217 		if (pp->pr_nitems < pp->pr_itemsperpage)
218 			panic("pr_rmpage: nitems inconsistent");
219 #endif
220 		pp->pr_nidle--;
221 	}
222 
223 	pp->pr_nitems -= pp->pr_itemsperpage;
224 
225 	/*
226 	 * Unlink a page from the pool and release it (or queue it for release).
227 	 */
228 	LIST_REMOVE(ph, ph_pagelist);
229 	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
230 		RB_REMOVE(phtree, &pp->pr_phtree, ph);
231 	pp->pr_npages--;
232 	pp->pr_npagefree++;
233 	pool_update_curpage(pp);
234 
235 	if (pq) {
236 		LIST_INSERT_HEAD(pq, ph, ph_pagelist);
237 	} else {
238 		pool_allocator_free(pp, ph->ph_page);
239 		if ((pp->pr_roflags & PR_PHINPAGE) == 0)
240 			pool_put(&phpool, ph);
241 	}
242 }
243 
244 /*
245  * Initialize the given pool resource structure.
246  *
247  * We export this routine to allow other kernel parts to declare
248  * static pools that must be initialized before malloc() is available.
249  */
250 void
251 pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
252     const char *wchan, struct pool_allocator *palloc)
253 {
254 	int off, slack;
255 
256 #ifdef MALLOC_DEBUG
257 	if ((flags & PR_DEBUG) && (ioff != 0 || align != 0))
258 		flags &= ~PR_DEBUG;
259 #endif
260 	/*
261 	 * Check arguments and construct default values.
262 	 */
263 	if (palloc == NULL) {
264 		if (size > PAGE_SIZE) {
265 			int psize;
266 
267 			/*
268 			 * XXX - should take align into account as well.
269 			 */
270 			if (size == round_page(size))
271 				psize = size / PAGE_SIZE;
272 			else
273 				psize = PAGE_SIZE / roundup(size % PAGE_SIZE,
274 				    1024);
275 			if (psize > POOL_LARGE_MAXPAGES)
276 				psize = POOL_LARGE_MAXPAGES;
277 			if (flags & PR_WAITOK)
278 				palloc = &pool_allocator_large_ni[psize-1];
279 			else
280 				palloc = &pool_allocator_large[psize-1];
281 			if (palloc->pa_pagesz == 0) {
282 				palloc->pa_pagesz = psize * PAGE_SIZE;
283 				if (flags & PR_WAITOK) {
284 					palloc->pa_alloc = pool_large_alloc_ni;
285 					palloc->pa_free = pool_large_free_ni;
286 				} else {
287 					palloc->pa_alloc = pool_large_alloc;
288 					palloc->pa_free = pool_large_free;
289 				}
290 			}
291 		} else {
292 			palloc = &pool_allocator_nointr;
293 		}
294 	}
295 	if (palloc->pa_pagesz == 0) {
296 		palloc->pa_pagesz = PAGE_SIZE;
297 	}
298 	if (palloc->pa_pagemask == 0) {
299 		palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
300 		palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
301 	}
302 
303 	if (align == 0)
304 		align = ALIGN(1);
305 
306 	if (size < sizeof(struct pool_item))
307 		size = sizeof(struct pool_item);
308 
309 	size = roundup(size, align);
310 #ifdef DIAGNOSTIC
311 	if (size > palloc->pa_pagesz)
312 		panic("pool_init: pool item size (%lu) too large",
313 		    (u_long)size);
314 #endif
315 
316 	/*
317 	 * Initialize the pool structure.
318 	 */
319 	LIST_INIT(&pp->pr_emptypages);
320 	LIST_INIT(&pp->pr_fullpages);
321 	LIST_INIT(&pp->pr_partpages);
322 	pp->pr_curpage = NULL;
323 	pp->pr_npages = 0;
324 	pp->pr_minitems = 0;
325 	pp->pr_minpages = 0;
326 	pp->pr_maxpages = 8;
327 	pp->pr_roflags = flags;
328 	pp->pr_flags = 0;
329 	pp->pr_size = size;
330 	pp->pr_align = align;
331 	pp->pr_wchan = wchan;
332 	pp->pr_alloc = palloc;
333 	pp->pr_nitems = 0;
334 	pp->pr_nout = 0;
335 	pp->pr_hardlimit = UINT_MAX;
336 	pp->pr_hardlimit_warning = NULL;
337 	pp->pr_hardlimit_ratecap.tv_sec = 0;
338 	pp->pr_hardlimit_ratecap.tv_usec = 0;
339 	pp->pr_hardlimit_warning_last.tv_sec = 0;
340 	pp->pr_hardlimit_warning_last.tv_usec = 0;
341 	pp->pr_serial = ++pool_serial;
342 	if (pool_serial == 0)
343 		panic("pool_init: too much uptime");
344 
345 	/* constructor, destructor, and arg */
346 	pp->pr_ctor = NULL;
347 	pp->pr_dtor = NULL;
348 	pp->pr_arg = NULL;
349 
350 	/*
351 	 * Decide whether to put the page header off page to avoid
352 	 * wasting too large a part of the page. Off-page page headers
353 	 * go into an RB tree, so we can match a returned item with
354 	 * its header based on the page address.
355 	 * We use 1/16 of the page size as the threshold (XXX: tune)
356 	 */
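	/*
	 * For example, with 4096-byte pages items smaller than 256 bytes
	 * keep their header inside the page itself; anything larger gets
	 * an off-page header allocated from phpool.
	 */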
357 	if (pp->pr_size < palloc->pa_pagesz/16 && pp->pr_size < PAGE_SIZE) {
358 		/* Use the end of the page for the page header */
359 		pp->pr_roflags |= PR_PHINPAGE;
360 		pp->pr_phoffset = off = palloc->pa_pagesz -
361 		    ALIGN(sizeof(struct pool_item_header));
362 	} else {
363 		/* The page header will be taken from our page header pool */
364 		pp->pr_phoffset = 0;
365 		off = palloc->pa_pagesz;
366 		RB_INIT(&pp->pr_phtree);
367 	}
368 
369 	/*
370 	 * Alignment is to take place at `ioff' within the item. This means
371 	 * we must reserve up to `align - 1' bytes on the page to allow
372 	 * appropriate positioning of each item.
373 	 *
374 	 * Silently enforce `0 <= ioff < align'.
375 	 */
376 	pp->pr_itemoffset = ioff = ioff % align;
377 	pp->pr_itemsperpage = (off - ((align - ioff) % align)) / pp->pr_size;
378 	KASSERT(pp->pr_itemsperpage != 0);
379 
380 	/*
381 	 * Use the slack between the chunks and the page header
382 	 * for "cache coloring".
383 	 */
384 	slack = off - pp->pr_itemsperpage * pp->pr_size;
385 	pp->pr_maxcolor = (slack / align) * align;
386 	pp->pr_curcolor = 0;
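
	/*
	 * Worked example of the two computations above (illustrative
	 * numbers: 4096-byte page, in-page header rounded to 64 bytes,
	 * align of 8): off = 4032, so 120-byte items give
	 * pr_itemsperpage = 33, slack = 4032 - 33 * 120 = 72 and
	 * pr_maxcolor = 72; successive pages then start their items at
	 * offsets 0, 8, 16, ... 72 before the color wraps back to 0.
	 */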
387 
388 	pp->pr_nget = 0;
389 	pp->pr_nfail = 0;
390 	pp->pr_nput = 0;
391 	pp->pr_npagealloc = 0;
392 	pp->pr_npagefree = 0;
393 	pp->pr_hiwat = 0;
394 	pp->pr_nidle = 0;
395 
396 	pp->pr_ipl = -1;
397 	mtx_init(&pp->pr_mtx, IPL_NONE);
398 
399 	if (phpool.pr_size == 0) {
400 		pool_init(&phpool, sizeof(struct pool_item_header), 0, 0,
401 		    0, "phpool", NULL);
402 		pool_setipl(&phpool, IPL_HIGH);
403 	}
404 
405 	/* pglistalloc/constraint parameters */
406 	pp->pr_crange = &kp_dirty;
407 
408 	/* Insert this into the list of all pools. */
409 	TAILQ_INSERT_HEAD(&pool_head, pp, pr_poollist);
410 }
411 
412 void
413 pool_setipl(struct pool *pp, int ipl)
414 {
415 	pp->pr_ipl = ipl;
416 	mtx_init(&pp->pr_mtx, ipl);
417 }
418 
419 /*
420  * Decommission a pool resource.
421  */
422 void
423 pool_destroy(struct pool *pp)
424 {
425 	struct pool_item_header *ph;
426 
427 #ifdef DIAGNOSTIC
428 	if (pp->pr_nout != 0)
429 		panic("pool_destroy: pool busy: still out: %u", pp->pr_nout);
430 #endif
431 
432 	/* Remove all pages */
433 	while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
434 		pr_rmpage(pp, ph, NULL);
435 	KASSERT(LIST_EMPTY(&pp->pr_fullpages));
436 	KASSERT(LIST_EMPTY(&pp->pr_partpages));
437 
438 	/* Remove from global pool list */
439 	TAILQ_REMOVE(&pool_head, pp, pr_poollist);
440 }
441 
442 struct pool_item_header *
443 pool_alloc_item_header(struct pool *pp, caddr_t storage, int flags)
444 {
445 	struct pool_item_header *ph;
446 
447 	if ((pp->pr_roflags & PR_PHINPAGE) != 0)
448 		ph = (struct pool_item_header *)(storage + pp->pr_phoffset);
449 	else
450 		ph = pool_get(&phpool, (flags & ~(PR_WAITOK | PR_ZERO)) |
451 		    PR_NOWAIT);
452 	if (pool_debug && ph != NULL)
453 		ph->ph_magic = PI_MAGIC;
454 	return (ph);
455 }
456 
457 /*
458  * Grab an item from the pool; must be called at appropriate spl level
459  */
460 void *
461 pool_get(struct pool *pp, int flags)
462 {
463 	void *v;
464 
465 	KASSERT(flags & (PR_WAITOK | PR_NOWAIT));
466 
467 #ifdef DIAGNOSTIC
468 	if ((flags & PR_WAITOK) != 0)
469 		assertwaitok();
470 #endif /* DIAGNOSTIC */
471 
472 	mtx_enter(&pp->pr_mtx);
473 #ifdef POOL_DEBUG
474 	if (pp->pr_roflags & PR_DEBUGCHK) {
475 		if (pool_chk(pp))
476 			panic("before pool_get");
477 	}
478 #endif
479 	v = pool_do_get(pp, flags);
480 #ifdef POOL_DEBUG
481 	if (pp->pr_roflags & PR_DEBUGCHK) {
482 		if (pool_chk(pp))
483 			panic("after pool_get");
484 	}
485 #endif
486 	if (v != NULL)
487 		pp->pr_nget++;
488 	mtx_leave(&pp->pr_mtx);
489 	if (v == NULL)
490 		return (v);
491 
492 	if (pp->pr_ctor) {
493 		if (flags & PR_ZERO)
494 			panic("pool_get: PR_ZERO when ctor set");
495 		if (pp->pr_ctor(pp->pr_arg, v, flags)) {
496 			mtx_enter(&pp->pr_mtx);
497 			pp->pr_nget--;
498 			pool_do_put(pp, v);
499 			mtx_leave(&pp->pr_mtx);
500 			v = NULL;
501 		}
502 	} else {
503 		if (flags & PR_ZERO)
504 			memset(v, 0, pp->pr_size);
505 	}
506 	return (v);
507 }
508 
509 void *
510 pool_do_get(struct pool *pp, int flags)
511 {
512 	struct pool_item *pi;
513 	struct pool_item_header *ph;
514 	void *v;
515 	int slowdown = 0;
516 #if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
517 	int i, *ip;
518 #endif
519 
520 #ifdef MALLOC_DEBUG
521 	if (pp->pr_roflags & PR_DEBUG) {
522 		void *addr;
523 
524 		addr = NULL;
525 		debug_malloc(pp->pr_size, M_DEBUG,
526 		    (flags & PR_WAITOK) ? M_WAITOK : M_NOWAIT, &addr);
527 		return (addr);
528 	}
529 #endif
530 
531 startover:
532 	/*
533 	 * Check to see if we've reached the hard limit.  If we have,
534 	 * and we can wait, then wait until an item has been returned to
535 	 * the pool.
536 	 */
537 #ifdef DIAGNOSTIC
538 	if (pp->pr_nout > pp->pr_hardlimit)
539 		panic("pool_do_get: %s: crossed hard limit", pp->pr_wchan);
540 #endif
541 	if (pp->pr_nout == pp->pr_hardlimit) {
542 		if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
543 			/*
544 			 * XXX: A warning isn't logged in this case.  Should
545 			 * it be?
546 			 */
547 			pp->pr_flags |= PR_WANTED;
548 			pool_sleep(pp);
549 			goto startover;
550 		}
551 
552 		/*
553 		 * Log a message that the hard limit has been hit.
554 		 */
555 		if (pp->pr_hardlimit_warning != NULL &&
556 		    ratecheck(&pp->pr_hardlimit_warning_last,
557 		    &pp->pr_hardlimit_ratecap))
558 			log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);
559 
560 		pp->pr_nfail++;
561 		return (NULL);
562 	}
563 
564 	/*
565 	 * The convention we use is that if `curpage' is not NULL, then
566 	 * it points at a non-empty bucket. In particular, `curpage'
567 	 * never points at a page header which has PR_PHINPAGE set and
568 	 * has no items in its bucket.
569 	 */
570 	if ((ph = pp->pr_curpage) == NULL) {
571 #ifdef DIAGNOSTIC
572 		if (pp->pr_nitems != 0) {
573 			printf("pool_do_get: %s: curpage NULL, nitems %u\n",
574 			    pp->pr_wchan, pp->pr_nitems);
575 			panic("pool_do_get: nitems inconsistent");
576 		}
577 #endif
578 
579 		/*
580 		 * Call the back-end page allocator for more memory.
581 		 */
582 		v = pool_allocator_alloc(pp, flags, &slowdown);
583 		if (v != NULL)
584 			ph = pool_alloc_item_header(pp, v, flags);
585 
586 		if (v == NULL || ph == NULL) {
587 			if (v != NULL)
588 				pool_allocator_free(pp, v);
589 
590 			if ((flags & PR_WAITOK) == 0) {
591 				pp->pr_nfail++;
592 				return (NULL);
593 			}
594 
595 			/*
596 			 * Wait for items to be returned to this pool.
597 			 *
598 			 * XXX: maybe we should wake up once a second and
599 			 * try again?
600 			 */
601 			pp->pr_flags |= PR_WANTED;
602 			pool_sleep(pp);
603 			goto startover;
604 		}
605 
606 		/* We have more memory; add it to the pool */
607 		pool_prime_page(pp, v, ph);
608 		pp->pr_npagealloc++;
609 
610 		if (slowdown && (flags & PR_WAITOK)) {
611 			mtx_leave(&pp->pr_mtx);
612 			yield();
613 			mtx_enter(&pp->pr_mtx);
614 		}
615 
616 		/* Start the allocation process over. */
617 		goto startover;
618 	}
619 	if ((v = pi = TAILQ_FIRST(&ph->ph_itemlist)) == NULL) {
620 		panic("pool_do_get: %s: page empty", pp->pr_wchan);
621 	}
622 #ifdef DIAGNOSTIC
623 	if (pp->pr_nitems == 0) {
624 		printf("pool_do_get: %s: items on itemlist, nitems %u\n",
625 		    pp->pr_wchan, pp->pr_nitems);
626 		panic("pool_do_get: nitems inconsistent");
627 	}
628 #endif
629 
630 #ifdef DIAGNOSTIC
631 	if (pi->pi_magic != PI_MAGIC)
632 		panic("pool_do_get(%s): free list modified: "
633 		    "page %p; item addr %p; offset 0x%x=0x%x",
634 		    pp->pr_wchan, ph->ph_page, pi, 0, pi->pi_magic);
635 #ifdef POOL_DEBUG
636 	if (pool_debug && ph->ph_magic) {
637 		for (ip = (int *)pi, i = sizeof(*pi) / sizeof(int);
638 		    i < pp->pr_size / sizeof(int); i++) {
639 			if (ip[i] != ph->ph_magic) {
640 				panic("pool_do_get(%s): free list modified: "
641 				    "page %p; item addr %p; offset 0x%zx=0x%x",
642 				    pp->pr_wchan, ph->ph_page, pi,
643 				    i * sizeof(int), ip[i]);
644 			}
645 		}
646 	}
647 #endif /* POOL_DEBUG */
648 #endif /* DIAGNOSTIC */
649 
650 	/*
651 	 * Remove from item list.
652 	 */
653 	TAILQ_REMOVE(&ph->ph_itemlist, pi, pi_list);
654 	pp->pr_nitems--;
655 	pp->pr_nout++;
656 	if (ph->ph_nmissing == 0) {
657 #ifdef DIAGNOSTIC
658 		if (pp->pr_nidle == 0)
659 			panic("pool_do_get: nidle inconsistent");
660 #endif
661 		pp->pr_nidle--;
662 
663 		/*
664 		 * This page was previously empty.  Move it to the list of
665 		 * partially-full pages.  This page is already curpage.
666 		 */
667 		LIST_REMOVE(ph, ph_pagelist);
668 		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
669 	}
670 	ph->ph_nmissing++;
671 	if (TAILQ_EMPTY(&ph->ph_itemlist)) {
672 #ifdef DIAGNOSTIC
673 		if (ph->ph_nmissing != pp->pr_itemsperpage) {
674 			panic("pool_do_get: %s: nmissing inconsistent",
675 			    pp->pr_wchan);
676 		}
677 #endif
678 		/*
679 		 * This page is now full.  Move it to the full list
680 		 * and select a new current page.
681 		 */
682 		LIST_REMOVE(ph, ph_pagelist);
683 		LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
684 		pool_update_curpage(pp);
685 	}
686 
687 	/*
688 	 * If we have a low water mark and we are now below that low
689 	 * water mark, add more items to the pool.
690 	 */
691 	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
692 		/*
693 		 * XXX: Should we log a warning?  Should we set up a timeout
694 		 * to try again in a second or so?  The latter could break
695 		 * a caller's assumptions about interrupt protection, etc.
696 		 */
697 	}
698 	return (v);
699 }
700 
701 /*
702  * Return resource to the pool; must be called at appropriate spl level
703  */
704 void
705 pool_put(struct pool *pp, void *v)
706 {
707 	if (pp->pr_dtor)
708 		pp->pr_dtor(pp->pr_arg, v);
709 	mtx_enter(&pp->pr_mtx);
710 #ifdef POOL_DEBUG
711 	if (pp->pr_roflags & PR_DEBUGCHK) {
712 		if (pool_chk(pp))
713 			panic("before pool_put");
714 	}
715 #endif
716 	pool_do_put(pp, v);
717 #ifdef POOL_DEBUG
718 	if (pp->pr_roflags & PR_DEBUGCHK) {
719 		if (pool_chk(pp))
720 			panic("after pool_put");
721 	}
722 #endif
723 	pp->pr_nput++;
724 	mtx_leave(&pp->pr_mtx);
725 }
726 
727 /*
728  * Internal version of pool_put().
729  */
730 void
731 pool_do_put(struct pool *pp, void *v)
732 {
733 	struct pool_item *pi = v;
734 	struct pool_item_header *ph;
735 #if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
736 	int i, *ip;
737 #endif
738 
739 	if (v == NULL)
740 		panic("pool_put of NULL");
741 
742 #ifdef MALLOC_DEBUG
743 	if (pp->pr_roflags & PR_DEBUG) {
744 		debug_free(v, M_DEBUG);
745 		return;
746 	}
747 #endif
748 
749 #ifdef DIAGNOSTIC
750 	if (pp->pr_ipl != -1)
751 		splassert(pp->pr_ipl);
752 
753 	if (pp->pr_nout == 0) {
754 		printf("pool %s: putting with none out\n",
755 		    pp->pr_wchan);
756 		panic("pool_do_put");
757 	}
758 #endif
759 
760 	if ((ph = pr_find_pagehead(pp, v)) == NULL) {
761 		panic("pool_do_put: %s: page header missing", pp->pr_wchan);
762 	}
763 
764 	/*
765 	 * Return to item list.
766 	 */
767 #ifdef DIAGNOSTIC
768 	pi->pi_magic = PI_MAGIC;
769 #ifdef POOL_DEBUG
770 	if (ph->ph_magic) {
771 		for (ip = (int *)pi, i = sizeof(*pi)/sizeof(int);
772 		    i < pp->pr_size / sizeof(int); i++)
773 			ip[i] = ph->ph_magic;
774 	}
775 #endif /* POOL_DEBUG */
776 #endif /* DIAGNOSTIC */
777 
778 	TAILQ_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
779 	ph->ph_nmissing--;
780 	pp->pr_nitems++;
781 	pp->pr_nout--;
782 
783 	/* Cancel "pool empty" condition if it exists */
784 	if (pp->pr_curpage == NULL)
785 		pp->pr_curpage = ph;
786 
787 	if (pp->pr_flags & PR_WANTED) {
788 		pp->pr_flags &= ~PR_WANTED;
789 		wakeup(pp);
790 	}
791 
792 	/*
793 	 * If this page is now empty, do one of two things:
794 	 *
795 	 *	(1) If we have more pages than the page high water mark,
796 	 *	    free the page back to the system.
797 	 *
798 	 *	(2) Otherwise, move the page to the empty page list.
799 	 *
800 	 * Either way, select a new current page (so we use a partially-full
801 	 * page if one is available).
802 	 */
803 	if (ph->ph_nmissing == 0) {
804 		pp->pr_nidle++;
805 		if (pp->pr_nidle > pp->pr_maxpages) {
806 			pr_rmpage(pp, ph, NULL);
807 		} else {
808 			LIST_REMOVE(ph, ph_pagelist);
809 			LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
810 			pool_update_curpage(pp);
811 		}
812 	}
813 
814 	/*
815 	 * If the page was previously completely full, move it to the
816 	 * partially-full list and make it the current page.  The next
817 	 * allocation will get the item from this page, instead of
818 	 * further fragmenting the pool.
819 	 */
820 	else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
821 		LIST_REMOVE(ph, ph_pagelist);
822 		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
823 		pp->pr_curpage = ph;
824 	}
825 }
826 
827 /*
828  * Add N items to the pool.
829  */
830 int
831 pool_prime(struct pool *pp, int n)
832 {
833 	struct pool_item_header *ph;
834 	caddr_t cp;
835 	int newpages;
836 	int slowdown;
837 
838 	mtx_enter(&pp->pr_mtx);
839 	newpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
840 
841 	while (newpages-- > 0) {
842 		cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown);
843 		if (cp != NULL)
844 			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
845 		if (cp == NULL || ph == NULL) {
846 			if (cp != NULL)
847 				pool_allocator_free(pp, cp);
848 			break;
849 		}
850 
851 		pool_prime_page(pp, cp, ph);
852 		pp->pr_npagealloc++;
853 		pp->pr_minpages++;
854 	}
855 
856 	if (pp->pr_minpages >= pp->pr_maxpages)
857 		pp->pr_maxpages = pp->pr_minpages + 1;	/* XXX */
858 
859 	mtx_leave(&pp->pr_mtx);
860 	return (0);
861 }
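
/*
 * Illustrative sketch (names are placeholders): a subsystem expecting an
 * early burst of allocations can pre-fill its pool right after
 * pool_init() so the first pool_get() calls never go to the back-end
 * allocator:
 *
 *	pool_prime(&foopool, 32);
 *
 * The request is rounded up to whole pages, and pr_minpages is raised so
 * that pool_reclaim() will leave those pages in place.
 */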
862 
863 /*
864  * Add a page worth of items to the pool.
865  *
866  * Note, we must be called with the pool descriptor LOCKED.
867  */
868 void
869 pool_prime_page(struct pool *pp, caddr_t storage, struct pool_item_header *ph)
870 {
871 	struct pool_item *pi;
872 	caddr_t cp = storage;
873 	unsigned int align = pp->pr_align;
874 	unsigned int ioff = pp->pr_itemoffset;
875 	int n;
876 #if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
877 	int i, *ip;
878 #endif
879 
880 	/*
881 	 * Insert page header.
882 	 */
883 	LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
884 	TAILQ_INIT(&ph->ph_itemlist);
885 	ph->ph_page = storage;
886 	ph->ph_pagesize = pp->pr_alloc->pa_pagesz;
887 	ph->ph_nmissing = 0;
888 	if ((pp->pr_roflags & PR_PHINPAGE) == 0)
889 		RB_INSERT(phtree, &pp->pr_phtree, ph);
890 
891 	pp->pr_nidle++;
892 
893 	/*
894 	 * Color this page.
895 	 */
896 	cp = (caddr_t)(cp + pp->pr_curcolor);
897 	if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
898 		pp->pr_curcolor = 0;
899 
900 	/*
901 	 * Adjust storage to apply alignment to `pr_itemoffset' in each item.
902 	 */
903 	if (ioff != 0)
904 		cp = (caddr_t)(cp + (align - ioff));
905 	ph->ph_colored = cp;
906 
907 	/*
908 	 * Insert remaining chunks on the bucket list.
909 	 */
910 	n = pp->pr_itemsperpage;
911 	pp->pr_nitems += n;
912 
913 	while (n--) {
914 		pi = (struct pool_item *)cp;
915 
916 		KASSERT(((((vaddr_t)pi) + ioff) & (align - 1)) == 0);
917 
918 		/* Insert on page list */
919 		TAILQ_INSERT_TAIL(&ph->ph_itemlist, pi, pi_list);
920 
921 #ifdef DIAGNOSTIC
922 		pi->pi_magic = PI_MAGIC;
923 #ifdef POOL_DEBUG
924 		if (ph->ph_magic) {
925 			for (ip = (int *)pi, i = sizeof(*pi)/sizeof(int);
926 			    i < pp->pr_size / sizeof(int); i++)
927 				ip[i] = ph->ph_magic;
928 		}
929 #endif /* POOL_DEBUG */
930 #endif /* DIAGNOSTIC */
931 		cp = (caddr_t)(cp + pp->pr_size);
932 	}
933 
934 	/*
935 	 * If the pool was depleted, point at the new page.
936 	 */
937 	if (pp->pr_curpage == NULL)
938 		pp->pr_curpage = ph;
939 
940 	if (++pp->pr_npages > pp->pr_hiwat)
941 		pp->pr_hiwat = pp->pr_npages;
942 }
943 
944 /*
945  * Used by pool_get() when nitems drops below the low water mark.  This
946  * brings pr_nitems back up to the low water mark.
947  *
948  * Note we never wait for memory here, we let the caller decide what to do.
949  */
950 int
951 pool_catchup(struct pool *pp)
952 {
953 	struct pool_item_header *ph;
954 	caddr_t cp;
955 	int error = 0;
956 	int slowdown;
957 
958 	while (POOL_NEEDS_CATCHUP(pp)) {
959 		/*
960 		 * Call the page back-end allocator for more memory.
961 		 */
962 		cp = pool_allocator_alloc(pp, PR_NOWAIT, &slowdown);
963 		if (cp != NULL)
964 			ph = pool_alloc_item_header(pp, cp, PR_NOWAIT);
965 		if (cp == NULL || ph == NULL) {
966 			if (cp != NULL)
967 				pool_allocator_free(pp, cp);
968 			error = ENOMEM;
969 			break;
970 		}
971 		pool_prime_page(pp, cp, ph);
972 		pp->pr_npagealloc++;
973 	}
974 
975 	return (error);
976 }
977 
978 void
979 pool_update_curpage(struct pool *pp)
980 {
981 
982 	pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
983 	if (pp->pr_curpage == NULL) {
984 		pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
985 	}
986 }
987 
988 void
989 pool_setlowat(struct pool *pp, int n)
990 {
991 
992 	pp->pr_minitems = n;
993 	pp->pr_minpages = (n == 0)
994 		? 0
995 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
996 
997 	mtx_enter(&pp->pr_mtx);
998 	/* Make sure we're caught up with the newly-set low water mark. */
999 	if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
1000 		/*
1001 		 * XXX: Should we log a warning?  Should we set up a timeout
1002 		 * to try again in a second or so?  The latter could break
1003 		 * a caller's assumptions about interrupt protection, etc.
1004 		 */
1005 	}
1006 	mtx_leave(&pp->pr_mtx);
1007 }
1008 
1009 void
1010 pool_sethiwat(struct pool *pp, int n)
1011 {
1012 
1013 	pp->pr_maxpages = (n == 0)
1014 		? 0
1015 		: roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
1016 }
1017 
1018 int
1019 pool_sethardlimit(struct pool *pp, u_int n, const char *warnmsg, int ratecap)
1020 {
1021 	int error = 0;
1022 
1023 	if (n < pp->pr_nout) {
1024 		error = EINVAL;
1025 		goto done;
1026 	}
1027 
1028 	pp->pr_hardlimit = n;
1029 	pp->pr_hardlimit_warning = warnmsg;
1030 	pp->pr_hardlimit_ratecap.tv_sec = ratecap;
1031 	pp->pr_hardlimit_warning_last.tv_sec = 0;
1032 	pp->pr_hardlimit_warning_last.tv_usec = 0;
1033 
1034 done:
1035 	return (error);
1036 }
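
/*
 * Illustrative sketch (names and numbers are placeholders) of how the
 * water marks and the hard limit are typically combined: keep at least
 * 64 items worth of pages primed, start freeing idle pages once more
 * than about 256 items worth sit unused, and fail (or sleep) once 1024
 * items are outstanding, logging the warning at most once a minute:
 *
 *	pool_setlowat(&foopool, 64);
 *	pool_sethiwat(&foopool, 256);
 *	pool_sethardlimit(&foopool, 1024, "out of foo items", 60);
 */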
1037 
1038 void
1039 pool_set_constraints(struct pool *pp, const struct kmem_pa_mode *mode)
1040 {
1041 	pp->pr_crange = mode;
1042 }
1043 
1044 void
1045 pool_set_ctordtor(struct pool *pp, int (*ctor)(void *, void *, int),
1046     void (*dtor)(void *, void *), void *arg)
1047 {
1048 	pp->pr_ctor = ctor;
1049 	pp->pr_dtor = dtor;
1050 	pp->pr_arg = arg;
1051 }
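
/*
 * Illustrative sketch (foo_ctor, foo_dtor and f_refcnt are hypothetical):
 * the constructor runs on every item handed out by pool_get() and may
 * fail by returning non-zero; the destructor runs in pool_put() before
 * the item goes back on the free list.  pool_get() panics if PR_ZERO is
 * combined with a constructor.
 *
 *	int
 *	foo_ctor(void *arg, void *v, int flags)
 *	{
 *		struct foo *f = v;
 *
 *		f->f_refcnt = 0;
 *		return (0);
 *	}
 *
 *	pool_set_ctordtor(&foopool, foo_ctor, foo_dtor, NULL);
 */
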
1052 /*
1053  * Release all complete pages that have not been used recently.
1054  *
1055  * Returns non-zero if any pages have been reclaimed.
1056  */
1057 int
1058 pool_reclaim(struct pool *pp)
1059 {
1060 	struct pool_item_header *ph, *phnext;
1061 	struct pool_pagelist pq;
1062 
1063 	LIST_INIT(&pq);
1064 
1065 	mtx_enter(&pp->pr_mtx);
1066 	for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
1067 		phnext = LIST_NEXT(ph, ph_pagelist);
1068 
1069 		/* Check our minimum page claim */
1070 		if (pp->pr_npages <= pp->pr_minpages)
1071 			break;
1072 
1073 		KASSERT(ph->ph_nmissing == 0);
1074 
1075 		/*
1076 		 * If freeing this page would put us below
1077 		 * the low water mark, stop now.
1078 		 */
1079 		if ((pp->pr_nitems - pp->pr_itemsperpage) <
1080 		    pp->pr_minitems)
1081 			break;
1082 
1083 		pr_rmpage(pp, ph, &pq);
1084 	}
1085 	mtx_leave(&pp->pr_mtx);
1086 
1087 	if (LIST_EMPTY(&pq))
1088 		return (0);
1089 	while ((ph = LIST_FIRST(&pq)) != NULL) {
1090 		LIST_REMOVE(ph, ph_pagelist);
1091 		pool_allocator_free(pp, ph->ph_page);
1092 		if (pp->pr_roflags & PR_PHINPAGE)
1093 			continue;
1094 		pool_put(&phpool, ph);
1095 	}
1096 
1097 	return (1);
1098 }
1099 
1100 /*
1101  * Release all complete pages that have not been used recently
1102  * from all pools.
1103  */
1104 void
1105 pool_reclaim_all(void)
1106 {
1107 	struct pool	*pp;
1108 	int		s;
1109 
1110 	s = splhigh();
1111 	TAILQ_FOREACH(pp, &pool_head, pr_poollist)
1112 		pool_reclaim(pp);
1113 	splx(s);
1114 }
1115 
1116 #ifdef DDB
1117 #include <machine/db_machdep.h>
1118 #include <ddb/db_interface.h>
1119 #include <ddb/db_output.h>
1120 
1121 /*
1122  * Diagnostic helpers.
1123  */
1124 void
1125 pool_printit(struct pool *pp, const char *modif,
1126     int (*pr)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */)
1127 {
1128 	pool_print1(pp, modif, pr);
1129 }
1130 
1131 void
1132 pool_print_pagelist(struct pool_pagelist *pl,
1133     int (*pr)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */)
1134 {
1135 	struct pool_item_header *ph;
1136 #ifdef DIAGNOSTIC
1137 	struct pool_item *pi;
1138 #endif
1139 
1140 	LIST_FOREACH(ph, pl, ph_pagelist) {
1141 		(*pr)("\t\tpage %p, nmissing %d\n",
1142 		    ph->ph_page, ph->ph_nmissing);
1143 #ifdef DIAGNOSTIC
1144 		TAILQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1145 			if (pi->pi_magic != PI_MAGIC) {
1146 				(*pr)("\t\t\titem %p, magic 0x%x\n",
1147 				    pi, pi->pi_magic);
1148 			}
1149 		}
1150 #endif
1151 	}
1152 }
1153 
1154 void
1155 pool_print1(struct pool *pp, const char *modif,
1156     int (*pr)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */)
1157 {
1158 	struct pool_item_header *ph;
1159 	int print_pagelist = 0;
1160 	char c;
1161 
1162 	while ((c = *modif++) != '\0') {
1163 		if (c == 'p')
1164 			print_pagelist = 1;
1166 	}
1167 
1168 	(*pr)("POOL %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
1169 	    pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
1170 	    pp->pr_roflags);
1171 	(*pr)("\talloc %p\n", pp->pr_alloc);
1172 	(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
1173 	    pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
1174 	(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
1175 	    pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
1176 
1177 	(*pr)("\n\tnget %lu, nfail %lu, nput %lu\n",
1178 	    pp->pr_nget, pp->pr_nfail, pp->pr_nput);
1179 	(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
1180 	    pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
1181 
1182 	if (print_pagelist == 0)
1183 		return;
1184 
1185 	if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
1186 		(*pr)("\n\tempty page list:\n");
1187 	pool_print_pagelist(&pp->pr_emptypages, pr);
1188 	if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
1189 		(*pr)("\n\tfull page list:\n");
1190 	pool_print_pagelist(&pp->pr_fullpages, pr);
1191 	if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
1192 		(*pr)("\n\tpartial-page list:\n");
1193 	pool_print_pagelist(&pp->pr_partpages, pr);
1194 
1195 	if (pp->pr_curpage == NULL)
1196 		(*pr)("\tno current page\n");
1197 	else
1198 		(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
1199 }
1200 
1201 void
1202 db_show_all_pools(db_expr_t expr, int haddr, db_expr_t count, char *modif)
1203 {
1204 	struct pool *pp;
1205 	char maxp[16];
1206 	int ovflw;
1207 	char mode;
1208 
1209 	mode = modif[0];
1210 	if (mode != '\0' && mode != 'a') {
1211 		db_printf("usage: show all pools [/a]\n");
1212 		return;
1213 	}
1214 
1215 	if (mode == '\0')
1216 		db_printf("%-10s%4s%9s%5s%9s%6s%6s%6s%6s%6s%6s%5s\n",
1217 		    "Name",
1218 		    "Size",
1219 		    "Requests",
1220 		    "Fail",
1221 		    "Releases",
1222 		    "Pgreq",
1223 		    "Pgrel",
1224 		    "Npage",
1225 		    "Hiwat",
1226 		    "Minpg",
1227 		    "Maxpg",
1228 		    "Idle");
1229 	else
1230 		db_printf("%-12s %18s %18s\n",
1231 		    "Name", "Address", "Allocator");
1232 
1233 	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
1234 		if (mode == 'a') {
1235 			db_printf("%-12s %18p %18p\n", pp->pr_wchan, pp,
1236 			    pp->pr_alloc);
1237 			continue;
1238 		}
1239 
1240 		if (!pp->pr_nget)
1241 			continue;
1242 
1243 		if (pp->pr_maxpages == UINT_MAX)
1244 			snprintf(maxp, sizeof maxp, "inf");
1245 		else
1246 			snprintf(maxp, sizeof maxp, "%u", pp->pr_maxpages);
1247 
1248 #define PRWORD(ovflw, fmt, width, fixed, val) do {	\
1249 	(ovflw) += db_printf((fmt),			\
1250 	    (width) - (fixed) - (ovflw) > 0 ?		\
1251 	    (width) - (fixed) - (ovflw) : 0,		\
1252 	    (val)) - (width);				\
1253 	if ((ovflw) < 0)				\
1254 		(ovflw) = 0;				\
1255 } while (/* CONSTCOND */0)
1256 
1257 		ovflw = 0;
1258 		PRWORD(ovflw, "%-*s", 10, 0, pp->pr_wchan);
1259 		PRWORD(ovflw, " %*u", 4, 1, pp->pr_size);
1260 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nget);
1261 		PRWORD(ovflw, " %*lu", 5, 1, pp->pr_nfail);
1262 		PRWORD(ovflw, " %*lu", 9, 1, pp->pr_nput);
1263 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagealloc);
1264 		PRWORD(ovflw, " %*lu", 6, 1, pp->pr_npagefree);
1265 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_npages);
1266 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_hiwat);
1267 		PRWORD(ovflw, " %*d", 6, 1, pp->pr_minpages);
1268 		PRWORD(ovflw, " %*s", 6, 1, maxp);
1269 		PRWORD(ovflw, " %*lu\n", 5, 1, pp->pr_nidle);
1270 
1271 		pool_chk(pp);
1272 	}
1273 }
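
/*
 * From ddb(4) this is reached as, e.g.:
 *
 *	ddb> show all pools
 *	ddb> show all pools /a
 *
 * The first form prints the statistics table; the /a modifier prints
 * each pool's address and allocator instead.
 */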
1274 #endif /* DDB */
1275 
1276 #if defined(POOL_DEBUG) || defined(DDB)
1277 int
1278 pool_chk_page(struct pool *pp, struct pool_item_header *ph, int expected)
1279 {
1280 	struct pool_item *pi;
1281 	caddr_t page;
1282 	int n;
1283 #if defined(DIAGNOSTIC) && defined(POOL_DEBUG)
1284 	int i, *ip;
1285 #endif
1286 	const char *label = pp->pr_wchan;
1287 
1288 	page = (caddr_t)((u_long)ph & pp->pr_alloc->pa_pagemask);
1289 	if (page != ph->ph_page &&
1290 	    (pp->pr_roflags & PR_PHINPAGE) != 0) {
1291 		printf("%s: ", label);
1292 		printf("pool(%p:%s): page inconsistency: page %p; "
1293 		    "at page head addr %p (p %p)\n",
1294 		    pp, pp->pr_wchan, ph->ph_page, ph, page);
1295 		return 1;
1296 	}
1297 
1298 	for (pi = TAILQ_FIRST(&ph->ph_itemlist), n = 0;
1299 	     pi != NULL;
1300 	     pi = TAILQ_NEXT(pi,pi_list), n++) {
1301 
1302 #ifdef DIAGNOSTIC
1303 		if (pi->pi_magic != PI_MAGIC) {
1304 			printf("%s: ", label);
1305 			printf("pool(%s): free list modified: "
1306 			    "page %p; item ordinal %d; addr %p "
1307 			    "(p %p); offset 0x%x=0x%x\n",
1308 			    pp->pr_wchan, ph->ph_page, n, pi, page,
1309 			    0, pi->pi_magic);
1310 		}
1311 #ifdef POOL_DEBUG
1312 		if (pool_debug && ph->ph_magic) {
1313 			for (ip = (int *)pi, i = sizeof(*pi) / sizeof(int);
1314 			    i < pp->pr_size / sizeof(int); i++) {
1315 				if (ip[i] != ph->ph_magic) {
1316 					printf("pool(%s): free list modified: "
1317 					    "page %p; item ordinal %d; addr %p "
1318 					    "(p %p); offset 0x%zx=0x%x\n",
1319 					    pp->pr_wchan, ph->ph_page, n, pi,
1320 					    page, i * sizeof(int), ip[i]);
1321 				}
1322 			}
1323 		}
1324 
1325 #endif /* POOL_DEBUG */
1326 #endif /* DIAGNOSTIC */
1327 		page =
1328 		    (caddr_t)((u_long)pi & pp->pr_alloc->pa_pagemask);
1329 		if (page == ph->ph_page)
1330 			continue;
1331 
1332 		printf("%s: ", label);
1333 		printf("pool(%p:%s): page inconsistency: page %p;"
1334 		    " item ordinal %d; addr %p (p %p)\n", pp,
1335 		    pp->pr_wchan, ph->ph_page, n, pi, page);
1336 		return 1;
1337 	}
1338 	if (n + ph->ph_nmissing != pp->pr_itemsperpage) {
1339 		printf("pool(%p:%s): page inconsistency: page %p;"
1340 		    " %d on list, %d missing, %d items per page\n", pp,
1341 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1342 		    pp->pr_itemsperpage);
1343 		return 1;
1344 	}
1345 	if (expected >= 0 && n != expected) {
1346 		printf("pool(%p:%s): page inconsistency: page %p;"
1347 		    " %d on list, %d missing, %d expected\n", pp,
1348 		    pp->pr_wchan, ph->ph_page, n, ph->ph_nmissing,
1349 		    expected);
1350 		return 1;
1351 	}
1352 	return 0;
1353 }
1354 
1355 int
1356 pool_chk(struct pool *pp)
1357 {
1358 	struct pool_item_header *ph;
1359 	int r = 0;
1360 
1361 	LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist)
1362 		r += pool_chk_page(pp, ph, pp->pr_itemsperpage);
1363 	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist)
1364 		r += pool_chk_page(pp, ph, 0);
1365 	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist)
1366 		r += pool_chk_page(pp, ph, -1);
1367 
1368 	return (r);
1369 }
1370 #endif /* defined(POOL_DEBUG) || defined(DDB) */
1371 
1372 #ifdef DDB
1373 void
1374 pool_walk(struct pool *pp, int full,
1375     int (*pr)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */,
1376     void (*func)(void *, int, int (*)(const char *, ...) /* __attribute__((__format__(__kprintf__,1,2))) */))
1377 {
1378 	struct pool_item_header *ph;
1379 	struct pool_item *pi;
1380 	caddr_t cp;
1381 	int n;
1382 
1383 	LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
1384 		cp = ph->ph_colored;
1385 		n = ph->ph_nmissing;
1386 
1387 		while (n--) {
1388 			func(cp, full, pr);
1389 			cp += pp->pr_size;
1390 		}
1391 	}
1392 
1393 	LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
1394 		cp = ph->ph_colored;
1395 		n = ph->ph_nmissing;
1396 
1397 		do {
1398 			TAILQ_FOREACH(pi, &ph->ph_itemlist, pi_list) {
1399 				if (cp == (caddr_t)pi)
1400 					break;
1401 			}
1402 			if (cp != (caddr_t)pi) {
1403 				func(cp, full, pr);
1404 				n--;
1405 			}
1406 
1407 			cp += pp->pr_size;
1408 		} while (n > 0);
1409 	}
1410 }
1411 #endif
1412 
1413 /*
1414  * We have three different sysctls.
1415  * kern.pool.npools - the number of pools.
1416  * kern.pool.pool.<pool#> - the pool struct for pool <pool#>.
1417  * kern.pool.name.<pool#> - the name of pool <pool#>.
1418  */
1419 int
1420 sysctl_dopool(int *name, u_int namelen, char *where, size_t *sizep)
1421 {
1422 	struct pool *pp, *foundpool = NULL;
1423 	size_t buflen = where != NULL ? *sizep : 0;
1424 	int npools = 0, s;
1425 	unsigned int lookfor;
1426 	size_t len;
1427 
1428 	switch (*name) {
1429 	case KERN_POOL_NPOOLS:
1430 		if (namelen != 1 || buflen != sizeof(int))
1431 			return (EINVAL);
1432 		lookfor = 0;
1433 		break;
1434 	case KERN_POOL_NAME:
1435 		if (namelen != 2 || buflen < 1)
1436 			return (EINVAL);
1437 		lookfor = name[1];
1438 		break;
1439 	case KERN_POOL_POOL:
1440 		if (namelen != 2 || buflen != sizeof(struct pool))
1441 			return (EINVAL);
1442 		lookfor = name[1];
1443 		break;
1444 	default:
1445 		return (EINVAL);
1446 	}
1447 
1448 	s = splvm();
1449 
1450 	TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
1451 		npools++;
1452 		if (lookfor == pp->pr_serial) {
1453 			foundpool = pp;
1454 			break;
1455 		}
1456 	}
1457 
1458 	splx(s);
1459 
1460 	if (*name != KERN_POOL_NPOOLS && foundpool == NULL)
1461 		return (ENOENT);
1462 
1463 	switch (*name) {
1464 	case KERN_POOL_NPOOLS:
1465 		return copyout(&npools, where, buflen);
1466 	case KERN_POOL_NAME:
1467 		len = strlen(foundpool->pr_wchan) + 1;
1468 		if (*sizep < len)
1469 			return (ENOMEM);
1470 		*sizep = len;
1471 		return copyout(foundpool->pr_wchan, where, len);
1472 	case KERN_POOL_POOL:
1473 		return copyout(foundpool, where, buflen);
1474 	}
1475 	/* NOTREACHED */
1476 	return (0); /* XXX - Stupid gcc */
1477 }
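
/*
 * Illustrative userland sketch (error handling omitted, <sys/param.h>
 * and <sys/sysctl.h> included) of reading the pool count through this
 * interface; KERN_POOL_NAME and KERN_POOL_POOL take a pool's serial
 * number as the final name component instead:
 *
 *	int mib[] = { CTL_KERN, KERN_POOL, KERN_POOL_NPOOLS };
 *	int npools;
 *	size_t len = sizeof(npools);
 *
 *	sysctl(mib, 3, &npools, &len, NULL, 0);
 */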
1478 
1479 /*
1480  * Pool backend allocators.
1481  *
1482  * Each pool has a backend allocator that handles page allocation and deallocation.
1483  */
1484 void	*pool_page_alloc(struct pool *, int, int *);
1485 void	pool_page_free(struct pool *, void *);
1486 
1487 /*
1488  * Safe for interrupts; the name is preserved for compatibility.
1489  * This is the default allocator.
1490  */
1491 struct pool_allocator pool_allocator_nointr = {
1492 	pool_page_alloc, pool_page_free, 0,
1493 };
1494 
1495 /*
1496  * XXX - we have at least three different resources for the same allocation
1497  *  and each resource can be depleted. First we have the ready elements in
1498  *  the pool. Then we have the resource (typically a vm_map) for this
1499  *  allocator, then we have physical memory. Waiting for any of these can
1500  *  be unnecessary when any other is freed, but the kernel doesn't support
1501  *  sleeping on multiple addresses, so we have to fake it. The caller sleeps on
1502  *  the pool (so that we can be awakened when an item is returned to the pool),
1503  *  but we set PA_WANT on the allocator. When a page is returned to
1504  *  the allocator and PA_WANT is set pool_allocator_free will wakeup all
1505  *  sleeping pools belonging to this allocator. (XXX - thundering herd).
1506  *  We also wake up the allocator in case someone without a pool (malloc)
1507  *  is sleeping waiting for this allocator.
1508  */
1509 
1510 void *
1511 pool_allocator_alloc(struct pool *pp, int flags, int *slowdown)
1512 {
1513 	boolean_t waitok = (flags & PR_WAITOK) ? TRUE : FALSE;
1514 	void *v;
1515 
1516 	if (waitok)
1517 		mtx_leave(&pp->pr_mtx);
1518 	v = pp->pr_alloc->pa_alloc(pp, flags, slowdown);
1519 	if (waitok)
1520 		mtx_enter(&pp->pr_mtx);
1521 
1522 	return (v);
1523 }
1524 
1525 void
1526 pool_allocator_free(struct pool *pp, void *v)
1527 {
1528 	struct pool_allocator *pa = pp->pr_alloc;
1529 
1530 	(*pa->pa_free)(pp, v);
1531 }
1532 
1533 void *
1534 pool_page_alloc(struct pool *pp, int flags, int *slowdown)
1535 {
1536 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1537 
1538 	kd.kd_waitok = (flags & PR_WAITOK);
1539 	kd.kd_slowdown = slowdown;
1540 
1541 	return (km_alloc(PAGE_SIZE, &kv_page, pp->pr_crange, &kd));
1542 }
1543 
1544 void
1545 pool_page_free(struct pool *pp, void *v)
1546 {
1547 	km_free(v, PAGE_SIZE, &kv_page, pp->pr_crange);
1548 }
1549 
1550 void *
1551 pool_large_alloc(struct pool *pp, int flags, int *slowdown)
1552 {
1553 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1554 	void *v;
1555 	int s;
1556 
1557 	kd.kd_waitok = (flags & PR_WAITOK);
1558 	kd.kd_slowdown = slowdown;
1559 
1560 	s = splvm();
1561 	v = km_alloc(pp->pr_alloc->pa_pagesz, &kv_intrsafe, pp->pr_crange,
1562 	    &kd);
1563 	splx(s);
1564 
1565 	return (v);
1566 }
1567 
1568 void
1569 pool_large_free(struct pool *pp, void *v)
1570 {
1571 	int s;
1572 
1573 	s = splvm();
1574 	km_free(v, pp->pr_alloc->pa_pagesz, &kv_intrsafe, pp->pr_crange);
1575 	splx(s);
1576 }
1577 
1578 void *
1579 pool_large_alloc_ni(struct pool *pp, int flags, int *slowdown)
1580 {
1581 	struct kmem_dyn_mode kd = KMEM_DYN_INITIALIZER;
1582 
1583 	kd.kd_waitok = (flags & PR_WAITOK);
1584 	kd.kd_slowdown = slowdown;
1585 
1586 	return (km_alloc(pp->pr_alloc->pa_pagesz, &kv_any, pp->pr_crange, &kd));
1587 }
1588 
1589 void
1590 pool_large_free_ni(struct pool *pp, void *v)
1591 {
1592 	km_free(v, pp->pr_alloc->pa_pagesz, &kv_any, pp->pr_crange);
1593 }
1594