xref: /dragonfly/sys/vm/vm_zone.c (revision 91dc43dd)
1 /*
2  * Copyright (c) 1997, 1998 John S. Dyson.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *	notice immediately at the beginning of the file, without modification,
9  *	this list of conditions, and the following disclaimer.
10  * 2. Absolutely no warranty of function or purpose is made by the author
11  *	John S. Dyson.
12  *
13  * $FreeBSD: src/sys/vm/vm_zone.c,v 1.30.2.6 2002/10/10 19:50:16 dillon Exp $
14  *
15  * Copyright (c) 2003-2017,2019 The DragonFly Project.  All rights reserved.
16  *
17  * This code is derived from software contributed to The DragonFly Project
18  * by Matthew Dillon <dillon@backplane.com>
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  *
24  * 1. Redistributions of source code must retain the above copyright
25  *    notice, this list of conditions and the following disclaimer.
26  * 2. Redistributions in binary form must reproduce the above copyright
27  *    notice, this list of conditions and the following disclaimer in
28  *    the documentation and/or other materials provided with the
29  *    distribution.
30  * 3. Neither the name of The DragonFly Project nor the names of its
31  *    contributors may be used to endorse or promote products derived
32  *    from this software without specific, prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
37  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
38  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
39  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
40  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
41  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
42  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
43  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
44  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
45  * SUCH DAMAGE.
46  */
47 
48 #include <sys/param.h>
49 #include <sys/queue.h>
50 #include <sys/systm.h>
51 #include <sys/kernel.h>
52 #include <sys/lock.h>
53 #include <sys/malloc.h>
54 #include <sys/sysctl.h>
55 #include <sys/vmmeter.h>
56 
57 #include <vm/vm.h>
58 #include <vm/vm_object.h>
59 #include <vm/vm_page.h>
60 #include <vm/vm_map.h>
61 #include <vm/vm_kern.h>
62 #include <vm/vm_extern.h>
63 #include <vm/vm_zone.h>
64 
65 #include <sys/spinlock2.h>
66 #include <vm/vm_page2.h>
67 
68 static MALLOC_DEFINE(M_ZONE, "ZONE", "Zone header");
69 
70 #define	ZONE_ERROR_INVALID 0
71 #define	ZONE_ERROR_NOTFREE 1
72 #define	ZONE_ERROR_ALREADYFREE 2
73 
74 #define ZONE_ROUNDING	32
75 
76 #define	ZENTRY_FREE	0x12342378
77 
78 long zone_burst = 128;
79 
80 static void *zget(vm_zone_t z);
81 
82 /*
83  * Return an item from the specified zone.   This function is non-blocking for
84  * ZONE_INTERRUPT zones.
85  *
86  * No requirements.
87  */
88 void *
89 zalloc(vm_zone_t z)
90 {
91 	globaldata_t gd = mycpu;
92 	vm_zpcpu_t *zpcpu;
93 	void *item;
94 	long n;
95 
96 #ifdef INVARIANTS
97 	if (z == NULL)
98 		zerror(ZONE_ERROR_INVALID);
99 #endif
100 	zpcpu = &z->zpcpu[gd->gd_cpuid];
101 retry:
102 	/*
103 	 * Avoid spinlock contention by allocating from a per-cpu queue
104 	 */
105 	if (zpcpu->zfreecnt > 0) {
106 		crit_enter_gd(gd);
107 		if (zpcpu->zfreecnt > 0) {
108 			item = zpcpu->zitems;
109 #ifdef INVARIANTS
110 			KASSERT(item != NULL,
111 				("zitems_pcpu unexpectedly NULL"));
112 			if (((void **)item)[1] != (void *)ZENTRY_FREE)
113 				zerror(ZONE_ERROR_NOTFREE);
114 			((void **)item)[1] = NULL;
115 #endif
116 			zpcpu->zitems = ((void **) item)[0];
117 			--zpcpu->zfreecnt;
118 			++zpcpu->znalloc;
119 			crit_exit_gd(gd);
120 
121 			return item;
122 		}
123 		crit_exit_gd(gd);
124 	}
125 
126 	/*
127 	 * Per-zone spinlock for the remainder.  Always load at least one
128 	 * item.
129 	 */
130 	spin_lock(&z->zspin);
131 	if (z->zfreecnt > z->zfreemin) {
132 		n = zone_burst;
133 		do {
134 			item = z->zitems;
135 #ifdef INVARIANTS
136 			KASSERT(item != NULL, ("zitems unexpectedly NULL"));
137 			if (((void **)item)[1] != (void *)ZENTRY_FREE)
138 				zerror(ZONE_ERROR_NOTFREE);
139 #endif
140 			z->zitems = ((void **)item)[0];
141 			--z->zfreecnt;
142 			((void **)item)[0] = zpcpu->zitems;
143 			zpcpu->zitems = item;
144 			++zpcpu->zfreecnt;
145 		} while (--n > 0 && z->zfreecnt > z->zfreemin);
146 		spin_unlock(&z->zspin);
147 		goto retry;
148 	} else {
149 		spin_unlock(&z->zspin);
150 		item = zget(z);
151 		/*
152 		 * PANICFAIL allows the caller to assume that the zalloc()
153 		 * will always succeed.  If it doesn't, we panic here.
154 		 */
155 		if (item == NULL && (z->zflags & ZONE_PANICFAIL))
156 			panic("zalloc(%s) failed", z->zname);
157 	}
158 	return item;
159 }
160 
161 /*
162  * Free an item to the specified zone.
163  *
164  * No requirements.
165  */
166 void
167 zfree(vm_zone_t z, void *item)
168 {
169 	globaldata_t gd = mycpu;
170 	vm_zpcpu_t *zpcpu;
171 	void *tail_item;
172 	long count;
173 	long zmax;
174 
175 	zpcpu = &z->zpcpu[gd->gd_cpuid];
176 
177 	/*
178 	 * Avoid spinlock contention by freeing into a per-cpu queue
179 	 */
180 	zmax = z->zmax_pcpu;
181 	if (zmax < 1024)
182 		zmax = 1024;
183 
184 	/*
185 	 * Add to pcpu cache
186 	 */
187 	crit_enter_gd(gd);
188 	((void **)item)[0] = zpcpu->zitems;
189 #ifdef INVARIANTS
190 	if (((void **)item)[1] == (void *)ZENTRY_FREE)
191 		zerror(ZONE_ERROR_ALREADYFREE);
192 	((void **)item)[1] = (void *)ZENTRY_FREE;
193 #endif
194 	zpcpu->zitems = item;
195 	++zpcpu->zfreecnt;
196 
197 	if (zpcpu->zfreecnt < zmax) {
198 		crit_exit_gd(gd);
199 		return;
200 	}
201 
202 	/*
203 	 * Hystereis, move (zmax) (calculated below) items to the pool.
204 	 */
205 	zmax = zmax / 2;
206 	if (zmax > zone_burst)
207 		zmax = zone_burst;
208 	tail_item = item;
209 	count = 1;
210 
211 	while (count < zmax) {
212 		tail_item = ((void **)tail_item)[0];
213 		++count;
214 	}
215 	zpcpu->zitems = ((void **)tail_item)[0];
216 	zpcpu->zfreecnt -= count;
217 
218 	/*
219 	 * Per-zone spinlock for the remainder.
220 	 *
221 	 * Also implement hysteresis by freeing a number of pcpu
222 	 * entries.
223 	 */
224 	spin_lock(&z->zspin);
225 	((void **)tail_item)[0] = z->zitems;
226 	z->zitems = item;
227 	z->zfreecnt += count;
228 	spin_unlock(&z->zspin);
229 
230 	crit_exit_gd(gd);
231 }
232 
233 /*
234  * This file comprises a very simple zone allocator.  This is used
235  * in lieu of the malloc allocator, where needed or more optimal.
236  *
237  * Note that the initial implementation of this had coloring, and
238  * absolutely no improvement (actually perf degradation) occurred.
239  *
240  * Note also that the zones are type stable.  The only restriction is
241  * that the first two longwords of a data structure can be changed
242  * between allocations.  Any data that must be stable between allocations
243  * must reside in areas after the first two longwords.
244  *
245  * zinitna, zinit, zbootinit are the initialization routines.
246  * zalloc, zfree, are the allocation/free routines.
247  */
248 
249 LIST_HEAD(zlist, vm_zone) zlist = LIST_HEAD_INITIALIZER(zlist);
250 static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);
251 static vm_pindex_t zone_kmem_pages, zone_kern_pages;
252 static long zone_kmem_kvaspace;
253 
254 /*
255  * Create a zone, but don't allocate the zone structure.  If the
256  * zone had been previously created by the zone boot code, initialize
257  * various parts of the zone code.
258  *
259  * If waits are not allowed during allocation (e.g. during interrupt
260  * code), a-priori allocate the kernel virtual space, and allocate
261  * only pages when needed.
262  *
263  * Arguments:
264  * z		pointer to zone structure.
265  * obj		pointer to VM object (opt).
266  * name		name of zone.
267  * size		size of zone entries.
268  * nentries	number of zone entries allocated (only ZONE_INTERRUPT.)
269  * flags	ZONE_INTERRUPT -- items can be allocated at interrupt time.
270  * zalloc	number of pages allocated when memory is needed.
271  *
272  * Note that when using ZONE_INTERRUPT, the size of the zone is limited
273  * by the nentries argument.  The size of the memory allocatable is
274  * unlimited if ZONE_INTERRUPT is not set.
275  *
276  * No requirements.
277  */
278 int
279 zinitna(vm_zone_t z, char *name, size_t size, long nentries, uint32_t flags)
280 {
281 	size_t totsize;
282 
283 	/*
284 	 * Only zones created with zinit() are destroyable.
285 	 */
286 	if (z->zflags & ZONE_DESTROYABLE)
287 		panic("zinitna: can't create destroyable zone");
288 
289 	/*
290 	 * NOTE: We can only adjust zsize if we previously did not
291 	 * 	 use zbootinit().
292 	 */
293 	if ((z->zflags & ZONE_BOOT) == 0) {
294 		z->zsize = roundup2(size, ZONE_ROUNDING);
295 		spin_init(&z->zspin, "zinitna");
296 		z->zfreecnt = 0;
297 		z->ztotal = 0;
298 		z->zmax = 0;
299 		z->zname = name;
300 		z->zitems = NULL;
301 
302 		lwkt_gettoken(&vm_token);
303 		LIST_INSERT_HEAD(&zlist, z, zlink);
304 		lwkt_reltoken(&vm_token);
305 
306 		bzero(z->zpcpu, sizeof(z->zpcpu));
307 	}
308 
309 	z->zkmvec = NULL;
310 	z->zkmcur = z->zkmmax = 0;
311 	z->zflags |= flags;
312 
313 	/*
314 	 * If we cannot wait, allocate KVA space up front, and we will fill
315 	 * in pages as needed.  This is particularly required when creating
316 	 * an allocation space for map entries in kernel_map, because we
317 	 * do not want to go into a recursion deadlock with
318 	 * vm_map_entry_reserve().
319 	 */
320 	if (z->zflags & ZONE_INTERRUPT) {
321 		totsize = round_page((size_t)z->zsize * nentries);
322 		atomic_add_long(&zone_kmem_kvaspace, totsize);
323 
324 		z->zkva = kmem_alloc_pageable(&kernel_map, totsize,
325 					      VM_SUBSYS_ZALLOC);
326 		if (z->zkva == 0) {
327 			LIST_REMOVE(z, zlink);
328 			return 0;
329 		}
330 
331 		z->zpagemax = totsize / PAGE_SIZE;
332 		z->zallocflag = VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT |
333 				VM_ALLOC_NORMAL | VM_ALLOC_RETRY;
334 		z->zmax += nentries;
335 
336 		/*
337 		 * Set reasonable pcpu cache bounds.  Low-memory systems
338 		 * might try to cache too little, large-memory systems
339 		 * might try to cache more than necessarsy.
340 		 *
341 		 * In particular, pvzone can wind up being excessive and
342 		 * waste memory unnecessarily.
343 		 */
344 		z->zmax_pcpu = z->zmax / ncpus / 64;
345 		if (z->zmax_pcpu < 1024)
346 			z->zmax_pcpu = 1024;
347 		if (z->zmax_pcpu * z->zsize > 16*1024*1024)
348 			z->zmax_pcpu = 16*1024*1024 / z->zsize;
349 	} else {
350 		z->zallocflag = VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM;
351 		z->zmax = 0;
352 		z->zmax_pcpu = 8192;
353 	}
354 
355 
356 	if (z->zsize > PAGE_SIZE)
357 		z->zfreemin = 1;
358 	else
359 		z->zfreemin = PAGE_SIZE / z->zsize;
360 
361 	z->zpagecount = 0;
362 
363 	/*
364 	 * Reduce kernel_map spam by allocating in chunks.
365 	 */
366 	z->zalloc = ZONE_MAXPGLOAD;
367 
368 	/*
369 	 * Populate the interrrupt zone at creation time rather than
370 	 * on first allocation, as this is a potentially long operation.
371 	 */
372 	if (z->zflags & ZONE_INTERRUPT) {
373 		void *buf;
374 
375 		buf = zget(z);
376 		if (buf)
377 			zfree(z, buf);
378 	}
379 
380 	return 1;
381 }
382 
383 /*
384  * Subroutine same as zinitna, except zone data structure is allocated
385  * automatically by malloc.  This routine should normally be used, except
386  * in certain tricky startup conditions in the VM system -- then
387  * zbootinit and zinitna can be used.  Zinit is the standard zone
388  * initialization call.
389  *
390  * No requirements.
391  */
392 vm_zone_t
393 zinit(char *name, size_t size, long nentries, uint32_t flags)
394 {
395 	vm_zone_t z;
396 
397 	z = (vm_zone_t) kmalloc(sizeof (struct vm_zone), M_ZONE, M_NOWAIT);
398 	if (z == NULL)
399 		return NULL;
400 
401 	z->zflags = 0;
402 	if (zinitna(z, name, size, nentries, flags & ~ZONE_DESTROYABLE) == 0) {
403 		kfree(z, M_ZONE);
404 		return NULL;
405 	}
406 
407 	if (flags & ZONE_DESTROYABLE)
408 		z->zflags |= ZONE_DESTROYABLE;
409 
410 	return z;
411 }
412 
413 /*
414  * Initialize a zone before the system is fully up.  This routine should
415  * only be called before full VM startup.
416  *
417  * Called from the low level boot code only.
418  */
419 void
420 zbootinit(vm_zone_t z, char *name, size_t size, void *item, long nitems)
421 {
422 	long i;
423 
424 	spin_init(&z->zspin, "zbootinit");
425 	bzero(z->zpcpu, sizeof(z->zpcpu));
426 	z->zname = name;
427 	z->zsize = size;
428 	z->zpagemax = 0;
429 	z->zflags = ZONE_BOOT;
430 	z->zfreemin = 0;
431 	z->zallocflag = 0;
432 	z->zpagecount = 0;
433 	z->zalloc = 0;
434 
435 	bzero(item, (size_t)nitems * z->zsize);
436 	z->zitems = NULL;
437 	for (i = 0; i < nitems; i++) {
438 		((void **)item)[0] = z->zitems;
439 #ifdef INVARIANTS
440 		((void **)item)[1] = (void *)ZENTRY_FREE;
441 #endif
442 		z->zitems = item;
443 		item = (uint8_t *)item + z->zsize;
444 	}
445 	z->zfreecnt = nitems;
446 	z->zmax = nitems;
447 	z->ztotal = nitems;
448 
449 	lwkt_gettoken(&vm_token);
450 	LIST_INSERT_HEAD(&zlist, z, zlink);
451 	lwkt_reltoken(&vm_token);
452 }
453 
454 /*
455  * Release all resources owned by zone created with zinit().
456  *
457  * No requirements.
458  */
459 void
460 zdestroy(vm_zone_t z)
461 {
462 	vm_pindex_t i;
463 
464 	if (z == NULL)
465 		panic("zdestroy: null zone");
466 	if ((z->zflags & ZONE_DESTROYABLE) == 0)
467 		panic("zdestroy: undestroyable zone");
468 
469 	lwkt_gettoken(&vm_token);
470 	LIST_REMOVE(z, zlink);
471 	lwkt_reltoken(&vm_token);
472 
473 	/*
474 	 * Release virtual mappings, physical memory and update sysctl stats.
475 	 */
476 	KKASSERT((z->zflags & ZONE_INTERRUPT) == 0);
477 	for (i = 0; i < z->zkmcur; i++) {
478 		kmem_free(&kernel_map, z->zkmvec[i],
479 			  (size_t)z->zalloc * PAGE_SIZE);
480 		atomic_subtract_long(&zone_kern_pages, z->zalloc);
481 	}
482 	if (z->zkmvec != NULL)
483 		kfree(z->zkmvec, M_ZONE);
484 
485 	spin_uninit(&z->zspin);
486 	kfree(z, M_ZONE);
487 }
488 
489 
490 /*
491  * void *zalloc(vm_zone_t zone) --
492  *	Returns an item from a specified zone.  May not be called from a
493  *	FAST interrupt or IPI function.
494  *
495  * void zfree(vm_zone_t zone, void *item) --
496  *	Frees an item back to a specified zone.  May not be called from a
497  *	FAST interrupt or IPI function.
498  */
499 
500 /*
501  * Internal zone routine.  Not to be called from external (non vm_zone) code.
502  *
503  * This function may return NULL.
504  *
505  * No requirements.
506  */
507 static void *
508 zget(vm_zone_t z)
509 {
510 	vm_page_t pgs[ZONE_MAXPGLOAD];
511 	vm_page_t m;
512 	long nitems;
513 	long savezpc;
514 	size_t nbytes;
515 	size_t noffset;
516 	void *item;
517 	vm_pindex_t npages;
518 	vm_pindex_t nalloc;
519 	vm_pindex_t i;
520 
521 	if (z == NULL)
522 		panic("zget: null zone");
523 
524 	if (z->zflags & ZONE_INTERRUPT) {
525 		/*
526 		 * Interrupt zones do not mess with the kernel_map, they
527 		 * simply populate an existing mapping.
528 		 *
529 		 * First allocate as many pages as we can, stopping at
530 		 * our limit or if the page allocation fails.  Try to
531 		 * avoid exhausting the interrupt free minimum by backing
532 		 * off to normal page allocations after a certain point.
533 		 */
534 		for (i = 0; i < ZONE_MAXPGLOAD && i < z->zalloc; ++i) {
535 			if (i < 4) {
536 				m = vm_page_alloc(NULL,
537 						  mycpu->gd_rand_incr++,
538 						  z->zallocflag);
539 			} else {
540 				m = vm_page_alloc(NULL,
541 						  mycpu->gd_rand_incr++,
542 						  VM_ALLOC_NORMAL |
543 						  VM_ALLOC_SYSTEM);
544 			}
545 			if (m == NULL)
546 				break;
547 			pgs[i] = m;
548 		}
549 		nalloc = i;
550 
551 		/*
552 		 * Account for the pages.
553 		 *
554 		 * NOTE! Do not allow overlap with a prior page as it
555 		 *	 may still be undergoing allocation on another
556 		 *	 cpu.
557 		 */
558 		spin_lock(&z->zspin);
559 		noffset = (size_t)z->zpagecount * PAGE_SIZE;
560 		/* noffset -= noffset % z->zsize; */
561 		savezpc = z->zpagecount;
562 
563 		/*
564 		 * Track total memory use and kmem offset.
565 		 */
566 		if (z->zpagecount + nalloc > z->zpagemax)
567 			z->zpagecount = z->zpagemax;
568 		else
569 			z->zpagecount += nalloc;
570 
571 		item = (char *)z->zkva + noffset;
572 		npages = z->zpagecount - savezpc;
573 		nitems = ((size_t)(savezpc + npages) * PAGE_SIZE - noffset) /
574 			 z->zsize;
575 		atomic_add_long(&zone_kmem_pages, npages);
576 		spin_unlock(&z->zspin);
577 
578 		/*
579 		 * Enter the pages into the reserved KVA space.
580 		 */
581 		for (i = 0; i < npages; ++i) {
582 			vm_offset_t zkva;
583 
584 			m = pgs[i];
585 			KKASSERT(m->queue == PQ_NONE);
586 			m->valid = VM_PAGE_BITS_ALL;
587 			vm_page_wire(m);
588 			vm_page_wakeup(m);
589 
590 			zkva = z->zkva + (size_t)(savezpc + i) * PAGE_SIZE;
591 			pmap_kenter(zkva, VM_PAGE_TO_PHYS(m));
592 			bzero((void *)zkva, PAGE_SIZE);
593 		}
594 		for (i = npages; i < nalloc; ++i) {
595 			m = pgs[i];
596 			vm_page_free(m);
597 		}
598 	} else if (z->zflags & ZONE_SPECIAL) {
599 		/*
600 		 * The special zone is the one used for vm_map_entry_t's.
601 		 * We have to avoid an infinite recursion in
602 		 * vm_map_entry_reserve() by using vm_map_entry_kreserve()
603 		 * instead.  The map entries are pre-reserved by the kernel
604 		 * by vm_map_entry_reserve_cpu_init().
605 		 */
606 		nbytes = (size_t)z->zalloc * PAGE_SIZE;
607 		z->zpagecount += z->zalloc;	/* Track total memory use */
608 
609 		item = (void *)kmem_alloc3(&kernel_map, nbytes,
610 					   VM_SUBSYS_ZALLOC, KM_KRESERVE);
611 
612 		/* note: z might be modified due to blocking */
613 		if (item != NULL) {
614 			atomic_add_long(&zone_kern_pages, z->zalloc);
615 			bzero(item, nbytes);
616 		} else {
617 			nbytes = 0;
618 		}
619 		nitems = nbytes / z->zsize;
620 	} else {
621 		/*
622 		 * Otherwise allocate KVA from the kernel_map.
623 		 */
624 		nbytes = (size_t)z->zalloc * PAGE_SIZE;
625 		z->zpagecount += z->zalloc;	/* Track total memory use */
626 
627 		item = (void *)kmem_alloc3(&kernel_map, nbytes,
628 					   VM_SUBSYS_ZALLOC, 0);
629 
630 		/* note: z might be modified due to blocking */
631 		if (item != NULL) {
632 			atomic_add_long(&zone_kern_pages, z->zalloc);
633 			bzero(item, nbytes);
634 
635 			if (z->zflags & ZONE_DESTROYABLE) {
636 				if (z->zkmcur == z->zkmmax) {
637 					z->zkmmax =
638 						z->zkmmax==0 ? 1 : z->zkmmax*2;
639 					z->zkmvec = krealloc(z->zkmvec,
640 					    z->zkmmax * sizeof(z->zkmvec[0]),
641 					    M_ZONE, M_WAITOK);
642 				}
643 				z->zkmvec[z->zkmcur++] = (vm_offset_t)item;
644 			}
645 		} else {
646 			nbytes = 0;
647 		}
648 		nitems = nbytes / z->zsize;
649 	}
650 
651 	/*
652 	 * Enter any new pages into the pool, reserving one, or get the
653 	 * item from the existing pool.
654 	 */
655 	spin_lock(&z->zspin);
656 	z->ztotal += nitems;
657 
658 	/*
659 	 * The zone code may need to allocate kernel memory, which can
660 	 * recurse zget() infinitely if we do not handle it properly.
661 	 * We deal with this by directly repopulating the pcpu vm_map_entry
662 	 * cache.
663 	 */
664 	if (nitems > 1 && (z->zflags & ZONE_SPECIAL)) {
665 		struct globaldata *gd = mycpu;
666 		vm_map_entry_t entry;
667 
668 		/*
669 		 * Make sure we have enough structures in gd_vme_base to handle
670 		 * the reservation request.
671 		 *
672 		 * The critical section protects access to the per-cpu gd.
673 		 */
674 		crit_enter();
675 		while (gd->gd_vme_avail < 2 && nitems > 1) {
676 			entry = item;
677 			MAPENT_FREELIST(entry) = gd->gd_vme_base;
678 			gd->gd_vme_base = entry;
679 			atomic_add_int(&gd->gd_vme_avail, 1);
680 			item = (uint8_t *)item + z->zsize;
681 			--nitems;
682 		}
683 		crit_exit();
684 	}
685 
686 	if (nitems != 0) {
687 		/*
688 		 * Enter pages into the pool saving one for immediate
689 		 * allocation.
690 		 */
691 		nitems -= 1;
692 		for (i = 0; i < nitems; i++) {
693 			((void **)item)[0] = z->zitems;
694 #ifdef INVARIANTS
695 			((void **)item)[1] = (void *)ZENTRY_FREE;
696 #endif
697 			z->zitems = item;
698 			item = (uint8_t *)item + z->zsize;
699 		}
700 		z->zfreecnt += nitems;
701 		++z->znalloc;
702 	} else if (z->zfreecnt > 0) {
703 		/*
704 		 * Get an item from the existing pool.
705 		 */
706 		item = z->zitems;
707 		z->zitems = ((void **)item)[0];
708 #ifdef INVARIANTS
709 		if (((void **)item)[1] != (void *)ZENTRY_FREE)
710 			zerror(ZONE_ERROR_NOTFREE);
711 		((void **) item)[1] = NULL;
712 #endif
713 		--z->zfreecnt;
714 		++z->znalloc;
715 	} else {
716 		/*
717 		 * No items available.
718 		 */
719 		item = NULL;
720 	}
721 	spin_unlock(&z->zspin);
722 
723 	return item;
724 }
725 
726 /*
727  * No requirements.
728  */
729 static int
730 sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
731 {
732 	vm_zone_t curzone;
733 	char tmpbuf[128];
734 	char tmpname[14];
735 	int error = 0;
736 
737 	ksnprintf(tmpbuf, sizeof(tmpbuf),
738 	    "\nITEM            SIZE     LIMIT    USED    FREE  REQUESTS\n");
739 	error = SYSCTL_OUT(req, tmpbuf, strlen(tmpbuf));
740 	if (error)
741 		return (error);
742 
743 	lwkt_gettoken(&vm_token);
744 	LIST_FOREACH(curzone, &zlist, zlink) {
745 		size_t i;
746 		size_t len;
747 		int offset;
748 		long freecnt;
749 		long znalloc;
750 		int n;
751 
752 		len = strlen(curzone->zname);
753 		if (len >= (sizeof(tmpname) - 1))
754 			len = (sizeof(tmpname) - 1);
755 		for(i = 0; i < sizeof(tmpname) - 1; i++)
756 			tmpname[i] = ' ';
757 		tmpname[i] = 0;
758 		memcpy(tmpname, curzone->zname, len);
759 		tmpname[len] = ':';
760 		offset = 0;
761 		if (curzone == LIST_FIRST(&zlist)) {
762 			offset = 1;
763 			tmpbuf[0] = '\n';
764 		}
765 		freecnt = curzone->zfreecnt;
766 		znalloc = curzone->znalloc;
767 		for (n = 0; n < ncpus; ++n) {
768 			freecnt += curzone->zpcpu[n].zfreecnt;
769 			znalloc += curzone->zpcpu[n].znalloc;
770 		}
771 
772 		ksnprintf(tmpbuf + offset, sizeof(tmpbuf) - offset,
773 			"%s %6.6lu, %8.8lu, %6.6lu, %6.6lu, %8.8lu\n",
774 			tmpname, curzone->zsize, curzone->zmax,
775 			(curzone->ztotal - freecnt),
776 			freecnt, znalloc);
777 
778 		len = strlen((char *)tmpbuf);
779 		if (LIST_NEXT(curzone, zlink) == NULL)
780 			tmpbuf[len - 1] = 0;
781 
782 		error = SYSCTL_OUT(req, tmpbuf, len);
783 
784 		if (error)
785 			break;
786 	}
787 	lwkt_reltoken(&vm_token);
788 	return (error);
789 }
790 
791 #if defined(INVARIANTS)
792 
793 /*
794  * Debugging only.
795  */
796 void
797 zerror(int error)
798 {
799 	char *msg;
800 
801 	switch (error) {
802 	case ZONE_ERROR_INVALID:
803 		msg = "zone: invalid zone";
804 		break;
805 	case ZONE_ERROR_NOTFREE:
806 		msg = "zone: entry not free";
807 		break;
808 	case ZONE_ERROR_ALREADYFREE:
809 		msg = "zone: freeing free entry";
810 		break;
811 	default:
812 		msg = "zone: invalid error";
813 		break;
814 	}
815 	panic("%s", msg);
816 }
817 #endif
818 
819 SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD, \
820 	NULL, 0, sysctl_vm_zone, "A", "Zone Info");
821 
822 SYSCTL_LONG(_vm, OID_AUTO, zone_kmem_pages,
823 	CTLFLAG_RD, &zone_kmem_pages, 0, "Number of interrupt safe pages allocated by zone");
824 SYSCTL_LONG(_vm, OID_AUTO, zone_burst,
825 	CTLFLAG_RW, &zone_burst, 0, "Burst from depot to pcpu cache");
826 SYSCTL_LONG(_vm, OID_AUTO, zone_kmem_kvaspace,
827 	CTLFLAG_RD, &zone_kmem_kvaspace, 0, "KVA space allocated by zone");
828 SYSCTL_LONG(_vm, OID_AUTO, zone_kern_pages,
829 	CTLFLAG_RD, &zone_kern_pages, 0, "Number of non-interrupt safe pages allocated by zone");
830