xref: /dragonfly/sys/vm/vm_page.c (revision a3127495)
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
33  * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
34  */
35 
36 /*
37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38  * All rights reserved.
39  *
40  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41  *
42  * Permission to use, copy, modify and distribute this software and
43  * its documentation is hereby granted, provided that both the copyright
44  * notice and this permission notice appear in all copies of the
45  * software, derivative works or modified versions, and any portions
46  * thereof, and that both notices appear in supporting documentation.
47  *
48  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51  *
52  * Carnegie Mellon requests users of this software to return to
53  *
54  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
55  *  School of Computer Science
56  *  Carnegie Mellon University
57  *  Pittsburgh PA 15213-3890
58  *
59  * any improvements or extensions that they make and grant Carnegie the
60  * rights to redistribute these changes.
61  */
62 /*
63  * Resident memory management module.  The module manipulates 'VM pages'.
64  * A VM page is the core building block for memory management.
65  */
66 
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/malloc.h>
70 #include <sys/proc.h>
71 #include <sys/vmmeter.h>
72 #include <sys/vnode.h>
73 #include <sys/kernel.h>
74 #include <sys/alist.h>
75 #include <sys/sysctl.h>
76 #include <sys/cpu_topology.h>
77 
78 #include <vm/vm.h>
79 #include <vm/vm_param.h>
80 #include <sys/lock.h>
81 #include <vm/vm_kern.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_pager.h>
88 #include <vm/vm_extern.h>
89 #include <vm/swap_pager.h>
90 
91 #include <machine/inttypes.h>
92 #include <machine/md_var.h>
93 #include <machine/specialreg.h>
94 #include <machine/bus_dma.h>
95 
96 #include <vm/vm_page2.h>
97 #include <sys/spinlock2.h>
98 
99 /*
100  * SET - Minimum required set associative size, must be a power of 2.  We
101  *	 want this to match or exceed the set-associativeness of the cpu.
102  *
103  * GRP - A larger set that allows bleed-over into the domains of other
104  *	 nearby cpus.  Also must be a power of 2.  Used by the page zeroing
105  *	 code to smooth things out a bit.
106  */
107 #define PQ_SET_ASSOC		16
108 #define PQ_SET_ASSOC_MASK	(PQ_SET_ASSOC - 1)
109 
110 #define PQ_GRP_ASSOC		(PQ_SET_ASSOC * 2)
111 #define PQ_GRP_ASSOC_MASK	(PQ_GRP_ASSOC - 1)
112 
113 static void vm_page_queue_init(void);
114 static void vm_page_free_wakeup(void);
115 static vm_page_t vm_page_select_cache(u_short pg_color);
116 static vm_page_t _vm_page_list_find2(int basequeue, int index);
117 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
118 
119 /*
120  * Array of tailq lists
121  */
122 __cachealign struct vpgqueues vm_page_queues[PQ_COUNT];
123 
124 static volatile int vm_pages_waiting;
125 static struct alist vm_contig_alist;
126 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
127 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
128 
129 static u_long vm_dma_reserved = 0;
130 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
131 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
132 	    "Memory reserved for DMA");
133 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
134 	    &vm_contig_alist.bl_free, 0, "Memory reserved for DMA");
135 
136 static int vm_contig_verbose = 0;
137 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
138 
139 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
140 	     vm_pindex_t, pindex);
141 
142 static void
143 vm_page_queue_init(void)
144 {
145 	int i;
146 
147 	for (i = 0; i < PQ_L2_SIZE; i++)
148 		vm_page_queues[PQ_FREE+i].cnt_offset =
149 			offsetof(struct vmstats, v_free_count);
150 	for (i = 0; i < PQ_L2_SIZE; i++)
151 		vm_page_queues[PQ_CACHE+i].cnt_offset =
152 			offsetof(struct vmstats, v_cache_count);
153 	for (i = 0; i < PQ_L2_SIZE; i++)
154 		vm_page_queues[PQ_INACTIVE+i].cnt_offset =
155 			offsetof(struct vmstats, v_inactive_count);
156 	for (i = 0; i < PQ_L2_SIZE; i++)
157 		vm_page_queues[PQ_ACTIVE+i].cnt_offset =
158 			offsetof(struct vmstats, v_active_count);
159 	for (i = 0; i < PQ_L2_SIZE; i++)
160 		vm_page_queues[PQ_HOLD+i].cnt_offset =
161 			offsetof(struct vmstats, v_active_count);
162 	/* PQ_NONE has no queue */
163 
164 	for (i = 0; i < PQ_COUNT; i++) {
165 		TAILQ_INIT(&vm_page_queues[i].pl);
166 		spin_init(&vm_page_queues[i].spin, "vm_page_queue_init");
167 	}
168 }
169 
170 /*
171  * note: place in initialized data section?  Is this necessary?
172  */
173 vm_pindex_t first_page = 0;
174 vm_pindex_t vm_page_array_size = 0;
175 vm_page_t vm_page_array = NULL;
176 vm_paddr_t vm_low_phys_reserved;
177 
178 /*
179  * (low level boot)
180  *
181  * Sets the page size, perhaps based upon the memory size.
182  * Must be called before any use of page-size dependent functions.
183  */
184 void
185 vm_set_page_size(void)
186 {
187 	if (vmstats.v_page_size == 0)
188 		vmstats.v_page_size = PAGE_SIZE;
189 	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
190 		panic("vm_set_page_size: page size not a power of two");
191 }
192 
193 /*
194  * (low level boot)
195  *
196  * Add a new page to the freelist for use by the system.  New pages
197  * are added to both the head and tail of the associated free page
198  * queue in a bottom-up fashion, so both zero'd and non-zero'd page
199  * requests pull 'recent' adds (higher physical addresses) first.
200  *
201  * Beware that the page zeroing daemon will also be running soon after
202  * boot, moving pages from the head to the tail of the PQ_FREE queues.
203  *
204  * Must be called in a critical section.
205  */
206 static void
207 vm_add_new_page(vm_paddr_t pa)
208 {
209 	struct vpgqueues *vpq;
210 	vm_page_t m;
211 
212 	m = PHYS_TO_VM_PAGE(pa);
213 	m->phys_addr = pa;
214 	m->flags = 0;
215 	m->pat_mode = PAT_WRITE_BACK;
216 	m->pc = (pa >> PAGE_SHIFT);
217 
218 	/*
219 	 * Twist for cpu localization in addition to page coloring, so
220 	 * different cpus selecting by m->queue get different page colors.
221 	 */
222 	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
223 	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
224 	m->pc &= PQ_L2_MASK;
225 
226 	/*
227 	 * Reserve a certain number of contiguous low memory pages for
228 	 * contigmalloc() to use.
229 	 */
230 	if (pa < vm_low_phys_reserved) {
231 		atomic_add_long(&vmstats.v_page_count, 1);
232 		atomic_add_long(&vmstats.v_dma_pages, 1);
233 		m->queue = PQ_NONE;
234 		m->wire_count = 1;
235 		atomic_add_long(&vmstats.v_wire_count, 1);
236 		alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
237 		return;
238 	}
239 
240 	/*
241 	 * General page
242 	 */
243 	m->queue = m->pc + PQ_FREE;
244 	KKASSERT(m->dirty == 0);
245 
246 	atomic_add_long(&vmstats.v_page_count, 1);
247 	atomic_add_long(&vmstats.v_free_count, 1);
248 	vpq = &vm_page_queues[m->queue];
249 	TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
250 	++vpq->lcnt;
251 }
252 
253 /*
254  * (low level boot)
255  *
256  * Initializes the resident memory module.
257  *
258  * Preallocates memory for critical VM structures and arrays prior to
259  * kernel_map becoming available.
260  *
261  * Memory is allocated from (virtual2_start, virtual2_end) if available,
262  * otherwise memory is allocated from (virtual_start, virtual_end).
263  *
264  * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
265  * large enough to hold vm_page_array & other structures for machines with
266  * large amounts of ram, so we want to use virtual2* when available.
267  */
268 void
269 vm_page_startup(void)
270 {
271 	vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
272 	vm_offset_t mapped;
273 	vm_pindex_t npages;
274 	vm_paddr_t page_range;
275 	vm_paddr_t new_end;
276 	int i;
277 	vm_paddr_t pa;
278 	vm_paddr_t last_pa;
279 	vm_paddr_t end;
280 	vm_paddr_t biggestone, biggestsize;
281 	vm_paddr_t total;
282 	vm_page_t m;
283 
284 	total = 0;
285 	biggestsize = 0;
286 	biggestone = 0;
287 	vaddr = round_page(vaddr);
288 
289 	/*
290 	 * Make sure ranges are page-aligned.
291 	 */
292 	for (i = 0; phys_avail[i].phys_end; ++i) {
293 		phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
294 		phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
295 		if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
296 			phys_avail[i].phys_end = phys_avail[i].phys_beg;
297 	}
298 
299 	/*
300 	 * Locate largest block
301 	 */
302 	for (i = 0; phys_avail[i].phys_end; ++i) {
303 		vm_paddr_t size = phys_avail[i].phys_end -
304 				  phys_avail[i].phys_beg;
305 
306 		if (size > biggestsize) {
307 			biggestone = i;
308 			biggestsize = size;
309 		}
310 		total += size;
311 	}
312 	--i;	/* adjust to last entry for use down below */
313 
314 	end = phys_avail[biggestone].phys_end;
315 	end = trunc_page(end);
316 
317 	/*
318 	 * Initialize the queue headers for the free queue, the active queue
319 	 * and the inactive queue.
320 	 */
321 	vm_page_queue_init();
322 
323 #if !defined(_KERNEL_VIRTUAL)
324 	/*
325 	 * VKERNELs don't support minidumps and as such don't need
326 	 * vm_page_dump
327 	 *
328 	 * Allocate a bitmap to indicate that a random physical page
329 	 * needs to be included in a minidump.
330 	 *
331 	 * The amd64 port needs this to indicate which direct map pages
332 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
333 	 *
334 	 * However, i386 still needs this workspace internally within the
335 	 * minidump code.  In theory, they are not needed on i386, but are
336 	 * included should the sf_buf code decide to use them.
337 	 */
338 	page_range = phys_avail[i].phys_end / PAGE_SIZE;
339 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
340 	end -= vm_page_dump_size;
341 	vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
342 					VM_PROT_READ | VM_PROT_WRITE);
343 	bzero((void *)vm_page_dump, vm_page_dump_size);
344 #endif
345 	/*
346 	 * Compute the number of pages of memory that will be available for
347 	 * use (taking into account the overhead of a page structure per
348 	 * page).
349 	 */
350 	first_page = phys_avail[0].phys_beg / PAGE_SIZE;
351 	page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
352 	npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
353 
354 #ifndef _KERNEL_VIRTUAL
355 	/*
356 	 * (only applies to real kernels)
357 	 *
358 	 * Reserve a large amount of low memory for potential 32-bit DMA
359 	 * space allocations.  Once device initialization is complete we
360 	 * release most of it, but keep (vm_dma_reserved) memory reserved
361 	 * for later use.  Typically for X / graphics.  Through trial and
362 	 * error we find that GPUs usually requires ~60-100MB or so.
363 	 *
364 	 * By default, 128M is left in reserve on machines with 2G+ of ram.
365 	 */
366 	vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
367 	if (vm_low_phys_reserved > total / 4)
368 		vm_low_phys_reserved = total / 4;
369 	if (vm_dma_reserved == 0) {
370 		vm_dma_reserved = 128 * 1024 * 1024;	/* 128MB */
371 		if (vm_dma_reserved > total / 16)
372 			vm_dma_reserved = total / 16;
373 	}
374 #endif
375 	alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
376 		   ALIST_RECORDS_65536);
377 
378 	/*
379 	 * Initialize the mem entry structures now, and put them in the free
380 	 * queue.
381 	 */
382 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
383 	mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
384 	vm_page_array = (vm_page_t)mapped;
385 
386 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
387 	/*
388 	 * since pmap_map on amd64 returns stuff out of a direct-map region,
389 	 * we have to manually add these pages to the minidump tracking so
390 	 * that they can be dumped, including the vm_page_array.
391 	 */
392 	for (pa = new_end;
393 	     pa < phys_avail[biggestone].phys_end;
394 	     pa += PAGE_SIZE) {
395 		dump_add_page(pa);
396 	}
397 #endif
398 
399 	/*
400 	 * Clear all of the page structures, run basic initialization so
401 	 * PHYS_TO_VM_PAGE() operates properly even on pages not in the
402 	 * map.
403 	 */
404 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
405 	vm_page_array_size = page_range;
406 
407 	m = &vm_page_array[0];
408 	pa = ptoa(first_page);
409 	for (i = 0; i < page_range; ++i) {
410 		spin_init(&m->spin, "vm_page");
411 		m->phys_addr = pa;
412 		pa += PAGE_SIZE;
413 		++m;
414 	}
415 
416 	/*
417 	 * Construct the free queue(s) in ascending order (by physical
418 	 * address) so that the first 16MB of physical memory is allocated
419 	 * last rather than first.  On large-memory machines, this avoids
420 	 * the exhaustion of low physical memory before isa_dma_init has run.
421 	 */
422 	vmstats.v_page_count = 0;
423 	vmstats.v_free_count = 0;
424 	for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
425 		pa = phys_avail[i].phys_beg;
426 		if (i == biggestone)
427 			last_pa = new_end;
428 		else
429 			last_pa = phys_avail[i].phys_end;
430 		while (pa < last_pa && npages-- > 0) {
431 			vm_add_new_page(pa);
432 			pa += PAGE_SIZE;
433 		}
434 	}
435 	if (virtual2_start)
436 		virtual2_start = vaddr;
437 	else
438 		virtual_start = vaddr;
439 	mycpu->gd_vmstats = vmstats;
440 }
441 
442 /*
443  * Reorganize VM pages based on numa data.  May be called as many times as
444  * necessary.  Will reorganize the vm_page_t page color and related queue(s)
445  * to allow vm_page_alloc() to choose pages based on socket affinity.
446  *
447  * NOTE: This function is only called while we are still in UP mode, so
448  *	 we only need a critical section to protect the queues (which
449  *	 saves a lot of time, there are likely a ton of pages).
450  */
451 void
452 vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
453 {
454 	vm_paddr_t scan_beg;
455 	vm_paddr_t scan_end;
456 	vm_paddr_t ran_end;
457 	struct vpgqueues *vpq;
458 	vm_page_t m;
459 	vm_page_t mend;
460 	int i;
461 	int socket_mod;
462 	int socket_value;
463 
464 	/*
465 	 * Check if no physical information, or there was only one socket
466 	 * (so don't waste time doing nothing!).
467 	 */
468 	if (cpu_topology_phys_ids <= 1 ||
469 	    cpu_topology_core_ids == 0) {
470 		return;
471 	}
472 
473 	/*
474 	 * Setup for our iteration.  Note that ACPI may iterate CPU
475 	 * sockets starting at 0 or 1 or some other number.  The
476 	 * cpu_topology code mod's it against the socket count.
477 	 */
478 	ran_end = ran_beg + bytes;
479 	physid %= cpu_topology_phys_ids;
480 
481 	socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
482 	socket_value = physid * socket_mod;
483 	mend = &vm_page_array[vm_page_array_size];
484 
485 	crit_enter();
486 
487 	/*
488 	 * Adjust vm_page->pc and requeue all affected pages.  The
489 	 * allocator will then be able to localize memory allocations
490 	 * to some degree.
491 	 */
492 	for (i = 0; phys_avail[i].phys_end; ++i) {
493 		scan_beg = phys_avail[i].phys_beg;
494 		scan_end = phys_avail[i].phys_end;
495 		if (scan_end <= ran_beg)
496 			continue;
497 		if (scan_beg >= ran_end)
498 			continue;
499 		if (scan_beg < ran_beg)
500 			scan_beg = ran_beg;
501 		if (scan_end > ran_end)
502 			scan_end = ran_end;
503 		if (atop(scan_end) > first_page + vm_page_array_size)
504 			scan_end = ptoa(first_page + vm_page_array_size);
505 
506 		m = PHYS_TO_VM_PAGE(scan_beg);
507 		while (scan_beg < scan_end) {
508 			KKASSERT(m < mend);
509 			if (m->queue != PQ_NONE) {
510 				vpq = &vm_page_queues[m->queue];
511 				TAILQ_REMOVE(&vpq->pl, m, pageq);
512 				--vpq->lcnt;
513 				/* queue doesn't change, no need to adj cnt */
514 				m->queue -= m->pc;
515 				m->pc %= socket_mod;
516 				m->pc += socket_value;
517 				m->pc &= PQ_L2_MASK;
518 				m->queue += m->pc;
519 				vpq = &vm_page_queues[m->queue];
520 				TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
521 				++vpq->lcnt;
522 				/* queue doesn't change, no need to adj cnt */
523 			} else {
524 				m->pc %= socket_mod;
525 				m->pc += socket_value;
526 				m->pc &= PQ_L2_MASK;
527 			}
528 			scan_beg += PAGE_SIZE;
529 			++m;
530 		}
531 	}
532 	crit_exit();
533 }
534 
535 /*
536  * We tended to reserve a ton of memory for contigmalloc().  Now that most
537  * drivers have initialized we want to return most the remaining free
538  * reserve back to the VM page queues so they can be used for normal
539  * allocations.
540  *
541  * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
542  */
543 static void
544 vm_page_startup_finish(void *dummy __unused)
545 {
546 	alist_blk_t blk;
547 	alist_blk_t rblk;
548 	alist_blk_t count;
549 	alist_blk_t xcount;
550 	alist_blk_t bfree;
551 	vm_page_t m;
552 
553 	spin_lock(&vm_contig_spin);
554 	for (;;) {
555 		bfree = alist_free_info(&vm_contig_alist, &blk, &count);
556 		if (bfree <= vm_dma_reserved / PAGE_SIZE)
557 			break;
558 		if (count == 0)
559 			break;
560 
561 		/*
562 		 * Figure out how much of the initial reserve we have to
563 		 * free in order to reach our target.
564 		 */
565 		bfree -= vm_dma_reserved / PAGE_SIZE;
566 		if (count > bfree) {
567 			blk += count - bfree;
568 			count = bfree;
569 		}
570 
571 		/*
572 		 * Calculate the nearest power of 2 <= count.
573 		 */
574 		for (xcount = 1; xcount <= count; xcount <<= 1)
575 			;
576 		xcount >>= 1;
577 		blk += count - xcount;
578 		count = xcount;
579 
580 		/*
581 		 * Allocate the pages from the alist, then free them to
582 		 * the normal VM page queues.
583 		 *
584 		 * Pages allocated from the alist are wired.  We have to
585 		 * busy, unwire, and free them.  We must also adjust
586 		 * vm_low_phys_reserved before freeing any pages to prevent
587 		 * confusion.
588 		 */
589 		rblk = alist_alloc(&vm_contig_alist, blk, count);
590 		if (rblk != blk) {
591 			kprintf("vm_page_startup_finish: Unable to return "
592 				"dma space @0x%08x/%d -> 0x%08x\n",
593 				blk, count, rblk);
594 			break;
595 		}
596 		atomic_add_long(&vmstats.v_dma_pages, -(long)count);
597 		spin_unlock(&vm_contig_spin);
598 
599 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
600 		vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
601 		while (count) {
602 			vm_page_busy_wait(m, FALSE, "cpgfr");
603 			vm_page_unwire(m, 0);
604 			vm_page_free(m);
605 			--count;
606 			++m;
607 		}
608 		spin_lock(&vm_contig_spin);
609 	}
610 	spin_unlock(&vm_contig_spin);
611 
612 	/*
613 	 * Print out how much DMA space drivers have already allocated and
614 	 * how much is left over.
615 	 */
616 	kprintf("DMA space used: %jdk, remaining available: %jdk\n",
617 		(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
618 		(PAGE_SIZE / 1024),
619 		(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
620 }
621 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
622 	vm_page_startup_finish, NULL);
623 
624 
625 /*
626  * Scan comparison function for Red-Black tree scans.  An inclusive
627  * (start,end) is expected.  Other fields are not used.
628  */
629 int
630 rb_vm_page_scancmp(struct vm_page *p, void *data)
631 {
632 	struct rb_vm_page_scan_info *info = data;
633 
634 	if (p->pindex < info->start_pindex)
635 		return(-1);
636 	if (p->pindex > info->end_pindex)
637 		return(1);
638 	return(0);
639 }
640 
641 int
642 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
643 {
644 	if (p1->pindex < p2->pindex)
645 		return(-1);
646 	if (p1->pindex > p2->pindex)
647 		return(1);
648 	return(0);
649 }
650 
651 void
652 vm_page_init(vm_page_t m)
653 {
654 	/* do nothing for now.  Called from pmap_page_init() */
655 }
656 
657 /*
658  * Each page queue has its own spin lock, which is fairly optimal for
659  * allocating and freeing pages at least.
660  *
661  * The caller must hold the vm_page_spin_lock() before locking a vm_page's
662  * queue spinlock via this function.  Also note that m->queue cannot change
663  * unless both the page and queue are locked.
664  */
665 static __inline
666 void
667 _vm_page_queue_spin_lock(vm_page_t m)
668 {
669 	u_short queue;
670 
671 	queue = m->queue;
672 	if (queue != PQ_NONE) {
673 		spin_lock(&vm_page_queues[queue].spin);
674 		KKASSERT(queue == m->queue);
675 	}
676 }
677 
678 static __inline
679 void
680 _vm_page_queue_spin_unlock(vm_page_t m)
681 {
682 	u_short queue;
683 
684 	queue = m->queue;
685 	cpu_ccfence();
686 	if (queue != PQ_NONE)
687 		spin_unlock(&vm_page_queues[queue].spin);
688 }
689 
690 static __inline
691 void
692 _vm_page_queues_spin_lock(u_short queue)
693 {
694 	cpu_ccfence();
695 	if (queue != PQ_NONE)
696 		spin_lock(&vm_page_queues[queue].spin);
697 }
698 
699 
700 static __inline
701 void
702 _vm_page_queues_spin_unlock(u_short queue)
703 {
704 	cpu_ccfence();
705 	if (queue != PQ_NONE)
706 		spin_unlock(&vm_page_queues[queue].spin);
707 }
708 
709 void
710 vm_page_queue_spin_lock(vm_page_t m)
711 {
712 	_vm_page_queue_spin_lock(m);
713 }
714 
715 void
716 vm_page_queues_spin_lock(u_short queue)
717 {
718 	_vm_page_queues_spin_lock(queue);
719 }
720 
721 void
722 vm_page_queue_spin_unlock(vm_page_t m)
723 {
724 	_vm_page_queue_spin_unlock(m);
725 }
726 
727 void
728 vm_page_queues_spin_unlock(u_short queue)
729 {
730 	_vm_page_queues_spin_unlock(queue);
731 }
732 
733 /*
734  * This locks the specified vm_page and its queue in the proper order
735  * (page first, then queue).  The queue may change so the caller must
736  * recheck on return.
737  */
738 static __inline
739 void
740 _vm_page_and_queue_spin_lock(vm_page_t m)
741 {
742 	vm_page_spin_lock(m);
743 	_vm_page_queue_spin_lock(m);
744 }
745 
746 static __inline
747 void
748 _vm_page_and_queue_spin_unlock(vm_page_t m)
749 {
750 	_vm_page_queues_spin_unlock(m->queue);
751 	vm_page_spin_unlock(m);
752 }
753 
754 void
755 vm_page_and_queue_spin_unlock(vm_page_t m)
756 {
757 	_vm_page_and_queue_spin_unlock(m);
758 }
759 
760 void
761 vm_page_and_queue_spin_lock(vm_page_t m)
762 {
763 	_vm_page_and_queue_spin_lock(m);
764 }
765 
766 /*
767  * Helper function removes vm_page from its current queue.
768  * Returns the base queue the page used to be on.
769  *
770  * The vm_page and the queue must be spinlocked.
771  * This function will unlock the queue but leave the page spinlocked.
772  */
773 static __inline u_short
774 _vm_page_rem_queue_spinlocked(vm_page_t m)
775 {
776 	struct vpgqueues *pq;
777 	u_short queue;
778 	u_short oqueue;
779 	long *cnt;
780 
781 	queue = m->queue;
782 	if (queue != PQ_NONE) {
783 		pq = &vm_page_queues[queue];
784 		TAILQ_REMOVE(&pq->pl, m, pageq);
785 
786 		/*
787 		 * Adjust our pcpu stats.  In order for the nominal low-memory
788 		 * algorithms to work properly we don't let any pcpu stat get
789 		 * too negative before we force it to be rolled-up into the
790 		 * global stats.  Otherwise our pageout and vm_wait tests
791 		 * will fail badly.
792 		 *
793 		 * The idea here is to reduce unnecessary SMP cache
794 		 * mastership changes in the global vmstats, which can be
795 		 * particularly bad in multi-socket systems.
796 		 */
797 		cnt = (long *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
798 		atomic_add_long(cnt, -1);
799 		if (*cnt < -VMMETER_SLOP_COUNT) {
800 			u_long copy = atomic_swap_long(cnt, 0);
801 			cnt = (long *)((char *)&vmstats + pq->cnt_offset);
802 			atomic_add_long(cnt, copy);
803 			cnt = (long *)((char *)&mycpu->gd_vmstats +
804 				      pq->cnt_offset);
805 			atomic_add_long(cnt, copy);
806 		}
807 		pq->lcnt--;
808 		m->queue = PQ_NONE;
809 		oqueue = queue;
810 		queue -= m->pc;
811 		vm_page_queues_spin_unlock(oqueue);	/* intended */
812 	}
813 	return queue;
814 }
815 
816 /*
817  * Helper function places the vm_page on the specified queue.  Generally
818  * speaking only PQ_FREE pages are placed at the head, to allow them to
819  * be allocated sooner rather than later on the assumption that they
820  * are cache-hot.
821  *
822  * The vm_page must be spinlocked.
823  * This function will return with both the page and the queue locked.
824  */
825 static __inline void
826 _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
827 {
828 	struct vpgqueues *pq;
829 	u_long *cnt;
830 
831 	KKASSERT(m->queue == PQ_NONE);
832 
833 	if (queue != PQ_NONE) {
834 		vm_page_queues_spin_lock(queue);
835 		pq = &vm_page_queues[queue];
836 		++pq->lcnt;
837 
838 		/*
839 		 * Adjust our pcpu stats.  If a system entity really needs
840 		 * to incorporate the count it will call vmstats_rollup()
841 		 * to roll it all up into the global vmstats strufture.
842 		 */
843 		cnt = (long *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
844 		atomic_add_long(cnt, 1);
845 
846 		/*
847 		 * PQ_FREE is always handled LIFO style to try to provide
848 		 * cache-hot pages to programs.
849 		 */
850 		m->queue = queue;
851 		if (queue - m->pc == PQ_FREE) {
852 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
853 		} else if (athead) {
854 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
855 		} else {
856 			TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
857 		}
858 		/* leave the queue spinlocked */
859 	}
860 }
861 
862 /*
863  * Wait until page is no longer BUSY.  If also_m_busy is TRUE we wait
864  * until the page is no longer BUSY or SBUSY (busy_count field is 0).
865  *
866  * Returns TRUE if it had to sleep, FALSE if we did not.  Only one sleep
867  * call will be made before returning.
868  *
869  * This function does NOT busy the page and on return the page is not
870  * guaranteed to be available.
871  */
872 void
873 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
874 {
875 	u_int32_t busy_count;
876 
877 	for (;;) {
878 		busy_count = m->busy_count;
879 		cpu_ccfence();
880 
881 		if ((busy_count & PBUSY_LOCKED) == 0 &&
882 		    (also_m_busy == 0 || (busy_count & PBUSY_MASK) == 0)) {
883 			break;
884 		}
885 		tsleep_interlock(m, 0);
886 		if (atomic_cmpset_int(&m->busy_count, busy_count,
887 				      busy_count | PBUSY_WANTED)) {
888 			atomic_set_int(&m->flags, PG_REFERENCED);
889 			tsleep(m, PINTERLOCKED, msg, 0);
890 			break;
891 		}
892 	}
893 }
894 
895 /*
896  * This calculates and returns a page color given an optional VM object and
897  * either a pindex or an iterator.  We attempt to return a cpu-localized
898  * pg_color that is still roughly 16-way set-associative.  The CPU topology
899  * is used if it was probed.
900  *
901  * The caller may use the returned value to index into e.g. PQ_FREE when
902  * allocating a page in order to nominally obtain pages that are hopefully
903  * already localized to the requesting cpu.  This function is not able to
904  * provide any sort of guarantee of this, but does its best to improve
905  * hardware cache management performance.
906  *
907  * WARNING! The caller must mask the returned value with PQ_L2_MASK.
908  */
909 u_short
910 vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
911 {
912 	u_short pg_color;
913 	int phys_id;
914 	int core_id;
915 	int object_pg_color;
916 
917 	phys_id = get_cpu_phys_id(cpuid);
918 	core_id = get_cpu_core_id(cpuid);
919 	object_pg_color = object ? object->pg_color : 0;
920 
921 	if (cpu_topology_phys_ids && cpu_topology_core_ids) {
922 		int grpsize;
923 
924 		/*
925 		 * Break us down by socket and cpu
926 		 */
927 		pg_color = phys_id * PQ_L2_SIZE / cpu_topology_phys_ids;
928 		pg_color += core_id * PQ_L2_SIZE /
929 			    (cpu_topology_core_ids * cpu_topology_phys_ids);
930 
931 		/*
932 		 * Calculate remaining component for object/queue color
933 		 */
934 		grpsize = PQ_L2_SIZE / (cpu_topology_core_ids *
935 					cpu_topology_phys_ids);
936 		if (grpsize >= 8) {
937 			pg_color += (pindex + object_pg_color) % grpsize;
938 		} else {
939 			if (grpsize <= 2) {
940 				grpsize = 8;
941 			} else {
942 				/* 3->9, 4->8, 5->10, 6->12, 7->14 */
943 				grpsize += grpsize;
944 				if (grpsize < 8)
945 					grpsize += grpsize;
946 			}
947 			pg_color += (pindex + object_pg_color) % grpsize;
948 		}
949 	} else {
950 		/*
951 		 * Unknown topology, distribute things evenly.
952 		 */
953 		pg_color = cpuid * PQ_L2_SIZE / ncpus;
954 		pg_color += pindex + object_pg_color;
955 	}
956 	return (pg_color & PQ_L2_MASK);
957 }
958 
959 /*
960  * Wait until BUSY can be set, then set it.  If also_m_busy is TRUE we
961  * also wait for m->busy_count to become 0 before setting PBUSY_LOCKED.
962  */
963 void
964 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
965 				     int also_m_busy, const char *msg
966 				     VM_PAGE_DEBUG_ARGS)
967 {
968 	u_int32_t busy_count;
969 
970 	for (;;) {
971 		busy_count = m->busy_count;
972 		cpu_ccfence();
973 		if (busy_count & PBUSY_LOCKED) {
974 			tsleep_interlock(m, 0);
975 			if (atomic_cmpset_int(&m->busy_count, busy_count,
976 					  busy_count | PBUSY_WANTED)) {
977 				atomic_set_int(&m->flags, PG_REFERENCED);
978 				tsleep(m, PINTERLOCKED, msg, 0);
979 			}
980 		} else if (also_m_busy && busy_count) {
981 			tsleep_interlock(m, 0);
982 			if (atomic_cmpset_int(&m->busy_count, busy_count,
983 					  busy_count | PBUSY_WANTED)) {
984 				atomic_set_int(&m->flags, PG_REFERENCED);
985 				tsleep(m, PINTERLOCKED, msg, 0);
986 			}
987 		} else {
988 			if (atomic_cmpset_int(&m->busy_count, busy_count,
989 					      busy_count | PBUSY_LOCKED)) {
990 #ifdef VM_PAGE_DEBUG
991 				m->busy_func = func;
992 				m->busy_line = lineno;
993 #endif
994 				break;
995 			}
996 		}
997 	}
998 }
999 
1000 /*
1001  * Attempt to set BUSY.  If also_m_busy is TRUE we only succeed if
1002  * m->busy_count is also 0.
1003  *
1004  * Returns non-zero on failure.
1005  */
1006 int
1007 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
1008 				    VM_PAGE_DEBUG_ARGS)
1009 {
1010 	u_int32_t busy_count;
1011 
1012 	for (;;) {
1013 		busy_count = m->busy_count;
1014 		cpu_ccfence();
1015 		if (busy_count & PBUSY_LOCKED)
1016 			return TRUE;
1017 		if (also_m_busy && (busy_count & PBUSY_MASK) != 0)
1018 			return TRUE;
1019 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1020 				      busy_count | PBUSY_LOCKED)) {
1021 #ifdef VM_PAGE_DEBUG
1022 				m->busy_func = func;
1023 				m->busy_line = lineno;
1024 #endif
1025 			return FALSE;
1026 		}
1027 	}
1028 }
1029 
1030 /*
1031  * Clear the BUSY flag and return non-zero to indicate to the caller
1032  * that a wakeup() should be performed.
1033  *
1034  * The vm_page must be spinlocked and will remain spinlocked on return.
1035  * The related queue must NOT be spinlocked (which could deadlock us).
1036  *
1037  * (inline version)
1038  */
1039 static __inline
1040 int
1041 _vm_page_wakeup(vm_page_t m)
1042 {
1043 	u_int32_t busy_count;
1044 
1045 	for (;;) {
1046 		busy_count = m->busy_count;
1047 		cpu_ccfence();
1048 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1049 				      busy_count &
1050 				      ~(PBUSY_LOCKED | PBUSY_WANTED))) {
1051 			break;
1052 		}
1053 	}
1054 	return((int)(busy_count & PBUSY_WANTED));
1055 }
1056 
1057 /*
1058  * Clear the BUSY flag and wakeup anyone waiting for the page.  This
1059  * is typically the last call you make on a page before moving onto
1060  * other things.
1061  */
1062 void
1063 vm_page_wakeup(vm_page_t m)
1064 {
1065         KASSERT(m->busy_count & PBUSY_LOCKED,
1066 		("vm_page_wakeup: page not busy!!!"));
1067 	vm_page_spin_lock(m);
1068 	if (_vm_page_wakeup(m)) {
1069 		vm_page_spin_unlock(m);
1070 		wakeup(m);
1071 	} else {
1072 		vm_page_spin_unlock(m);
1073 	}
1074 }
1075 
1076 /*
1077  * Holding a page keeps it from being reused.  Other parts of the system
1078  * can still disassociate the page from its current object and free it, or
1079  * perform read or write I/O on it and/or otherwise manipulate the page,
1080  * but if the page is held the VM system will leave the page and its data
1081  * intact and not reuse the page for other purposes until the last hold
1082  * reference is released.  (see vm_page_wire() if you want to prevent the
1083  * page from being disassociated from its object too).
1084  *
1085  * The caller must still validate the contents of the page and, if necessary,
1086  * wait for any pending I/O (e.g. vm_page_sleep_busy() loop) to complete
1087  * before manipulating the page.
1088  *
1089  * XXX get vm_page_spin_lock() here and move FREE->HOLD if necessary
1090  */
1091 void
1092 vm_page_hold(vm_page_t m)
1093 {
1094 	vm_page_spin_lock(m);
1095 	atomic_add_int(&m->hold_count, 1);
1096 	if (m->queue - m->pc == PQ_FREE) {
1097 		_vm_page_queue_spin_lock(m);
1098 		_vm_page_rem_queue_spinlocked(m);
1099 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
1100 		_vm_page_queue_spin_unlock(m);
1101 	}
1102 	vm_page_spin_unlock(m);
1103 }
1104 
1105 /*
1106  * The opposite of vm_page_hold().  If the page is on the HOLD queue
1107  * it was freed while held and must be moved back to the FREE queue.
1108  */
1109 void
1110 vm_page_unhold(vm_page_t m)
1111 {
1112 	KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
1113 		("vm_page_unhold: pg %p illegal hold_count (%d) or on FREE queue (%d)",
1114 		 m, m->hold_count, m->queue - m->pc));
1115 	vm_page_spin_lock(m);
1116 	atomic_add_int(&m->hold_count, -1);
1117 	if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
1118 		_vm_page_queue_spin_lock(m);
1119 		_vm_page_rem_queue_spinlocked(m);
1120 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
1121 		_vm_page_queue_spin_unlock(m);
1122 	}
1123 	vm_page_spin_unlock(m);
1124 }
1125 
1126 /*
1127  *	vm_page_getfake:
1128  *
1129  *	Create a fictitious page with the specified physical address and
1130  *	memory attribute.  The memory attribute is the only the machine-
1131  *	dependent aspect of a fictitious page that must be initialized.
1132  */
1133 
1134 void
1135 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1136 {
1137 
1138 	if ((m->flags & PG_FICTITIOUS) != 0) {
1139 		/*
1140 		 * The page's memattr might have changed since the
1141 		 * previous initialization.  Update the pmap to the
1142 		 * new memattr.
1143 		 */
1144 		goto memattr;
1145 	}
1146 	m->phys_addr = paddr;
1147 	m->queue = PQ_NONE;
1148 	/* Fictitious pages don't use "segind". */
1149 	/* Fictitious pages don't use "order" or "pool". */
1150 	m->flags = PG_FICTITIOUS | PG_UNMANAGED;
1151 	m->busy_count = PBUSY_LOCKED;
1152 	m->wire_count = 1;
1153 	spin_init(&m->spin, "fake_page");
1154 	pmap_page_init(m);
1155 memattr:
1156 	pmap_page_set_memattr(m, memattr);
1157 }
1158 
1159 /*
1160  * Inserts the given vm_page into the object and object list.
1161  *
1162  * The pagetables are not updated but will presumably fault the page
1163  * in if necessary, or if a kernel page the caller will at some point
1164  * enter the page into the kernel's pmap.  We are not allowed to block
1165  * here so we *can't* do this anyway.
1166  *
1167  * This routine may not block.
1168  * This routine must be called with the vm_object held.
1169  * This routine must be called with a critical section held.
1170  *
1171  * This routine returns TRUE if the page was inserted into the object
1172  * successfully, and FALSE if the page already exists in the object.
1173  */
1174 int
1175 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1176 {
1177 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
1178 	if (m->object != NULL)
1179 		panic("vm_page_insert: already inserted");
1180 
1181 	atomic_add_int(&object->generation, 1);
1182 
1183 	/*
1184 	 * Record the object/offset pair in this page and add the
1185 	 * pv_list_count of the page to the object.
1186 	 *
1187 	 * The vm_page spin lock is required for interactions with the pmap.
1188 	 */
1189 	vm_page_spin_lock(m);
1190 	m->object = object;
1191 	m->pindex = pindex;
1192 	if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
1193 		m->object = NULL;
1194 		m->pindex = 0;
1195 		vm_page_spin_unlock(m);
1196 		return FALSE;
1197 	}
1198 	++object->resident_page_count;
1199 	++mycpu->gd_vmtotal.t_rm;
1200 	vm_page_spin_unlock(m);
1201 
1202 	/*
1203 	 * Since we are inserting a new and possibly dirty page,
1204 	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
1205 	 */
1206 	if ((m->valid & m->dirty) ||
1207 	    (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
1208 		vm_object_set_writeable_dirty(object);
1209 
1210 	/*
1211 	 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
1212 	 */
1213 	swap_pager_page_inserted(m);
1214 	return TRUE;
1215 }
1216 
1217 /*
1218  * Removes the given vm_page_t from the (object,index) table
1219  *
1220  * The underlying pmap entry (if any) is NOT removed here.
1221  * This routine may not block.
1222  *
1223  * The page must be BUSY and will remain BUSY on return.
1224  * No other requirements.
1225  *
1226  * NOTE: FreeBSD side effect was to unbusy the page on return.  We leave
1227  *	 it busy.
1228  */
1229 void
1230 vm_page_remove(vm_page_t m)
1231 {
1232 	vm_object_t object;
1233 
1234 	if (m->object == NULL) {
1235 		return;
1236 	}
1237 
1238 	if ((m->busy_count & PBUSY_LOCKED) == 0)
1239 		panic("vm_page_remove: page not busy");
1240 
1241 	object = m->object;
1242 
1243 	vm_object_hold(object);
1244 
1245 	/*
1246 	 * Remove the page from the object and update the object.
1247 	 *
1248 	 * The vm_page spin lock is required for interactions with the pmap.
1249 	 */
1250 	vm_page_spin_lock(m);
1251 	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
1252 	--object->resident_page_count;
1253 	--mycpu->gd_vmtotal.t_rm;
1254 	m->object = NULL;
1255 	atomic_add_int(&object->generation, 1);
1256 	vm_page_spin_unlock(m);
1257 
1258 	vm_object_drop(object);
1259 }
1260 
1261 /*
1262  * Locate and return the page at (object, pindex), or NULL if the
1263  * page could not be found.
1264  *
1265  * The caller must hold the vm_object token.
1266  */
1267 vm_page_t
1268 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1269 {
1270 	vm_page_t m;
1271 
1272 	/*
1273 	 * Search the hash table for this object/offset pair
1274 	 */
1275 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1276 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1277 	KKASSERT(m == NULL || (m->object == object && m->pindex == pindex));
1278 	return(m);
1279 }
1280 
1281 vm_page_t
1282 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
1283 					    vm_pindex_t pindex,
1284 					    int also_m_busy, const char *msg
1285 					    VM_PAGE_DEBUG_ARGS)
1286 {
1287 	u_int32_t busy_count;
1288 	vm_page_t m;
1289 
1290 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1291 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1292 	while (m) {
1293 		KKASSERT(m->object == object && m->pindex == pindex);
1294 		busy_count = m->busy_count;
1295 		cpu_ccfence();
1296 		if (busy_count & PBUSY_LOCKED) {
1297 			tsleep_interlock(m, 0);
1298 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1299 					  busy_count | PBUSY_WANTED)) {
1300 				atomic_set_int(&m->flags, PG_REFERENCED);
1301 				tsleep(m, PINTERLOCKED, msg, 0);
1302 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1303 							      pindex);
1304 			}
1305 		} else if (also_m_busy && busy_count) {
1306 			tsleep_interlock(m, 0);
1307 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1308 					  busy_count | PBUSY_WANTED)) {
1309 				atomic_set_int(&m->flags, PG_REFERENCED);
1310 				tsleep(m, PINTERLOCKED, msg, 0);
1311 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1312 							      pindex);
1313 			}
1314 		} else if (atomic_cmpset_int(&m->busy_count, busy_count,
1315 					     busy_count | PBUSY_LOCKED)) {
1316 #ifdef VM_PAGE_DEBUG
1317 			m->busy_func = func;
1318 			m->busy_line = lineno;
1319 #endif
1320 			break;
1321 		}
1322 	}
1323 	return m;
1324 }
1325 
1326 /*
1327  * Attempt to lookup and busy a page.
1328  *
1329  * Returns NULL if the page could not be found
1330  *
1331  * Returns a vm_page and error == TRUE if the page exists but could not
1332  * be busied.
1333  *
1334  * Returns a vm_page and error == FALSE on success.
1335  */
1336 vm_page_t
1337 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
1338 					   vm_pindex_t pindex,
1339 					   int also_m_busy, int *errorp
1340 					   VM_PAGE_DEBUG_ARGS)
1341 {
1342 	u_int32_t busy_count;
1343 	vm_page_t m;
1344 
1345 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1346 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1347 	*errorp = FALSE;
1348 	while (m) {
1349 		KKASSERT(m->object == object && m->pindex == pindex);
1350 		busy_count = m->busy_count;
1351 		cpu_ccfence();
1352 		if (busy_count & PBUSY_LOCKED) {
1353 			*errorp = TRUE;
1354 			break;
1355 		}
1356 		if (also_m_busy && busy_count) {
1357 			*errorp = TRUE;
1358 			break;
1359 		}
1360 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1361 				      busy_count | PBUSY_LOCKED)) {
1362 #ifdef VM_PAGE_DEBUG
1363 			m->busy_func = func;
1364 			m->busy_line = lineno;
1365 #endif
1366 			break;
1367 		}
1368 	}
1369 	return m;
1370 }
1371 
1372 /*
1373  * Returns a page that is only soft-busied for use by the caller in
1374  * a read-only fashion.  Returns NULL if the page could not be found,
1375  * the soft busy could not be obtained, or the page data is invalid.
1376  */
1377 vm_page_t
1378 vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex,
1379 			 int pgoff, int pgbytes)
1380 {
1381 	vm_page_t m;
1382 
1383 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1384 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1385 	if (m) {
1386 		if ((m->valid != VM_PAGE_BITS_ALL &&
1387 		     !vm_page_is_valid(m, pgoff, pgbytes)) ||
1388 		    (m->flags & PG_FICTITIOUS)) {
1389 			m = NULL;
1390 		} else if (vm_page_sbusy_try(m)) {
1391 			m = NULL;
1392 		} else if ((m->valid != VM_PAGE_BITS_ALL &&
1393 			    !vm_page_is_valid(m, pgoff, pgbytes)) ||
1394 			   (m->flags & PG_FICTITIOUS)) {
1395 			vm_page_sbusy_drop(m);
1396 			m = NULL;
1397 		}
1398 	}
1399 	return m;
1400 }
1401 
1402 /*
1403  * Caller must hold the related vm_object
1404  */
1405 vm_page_t
1406 vm_page_next(vm_page_t m)
1407 {
1408 	vm_page_t next;
1409 
1410 	next = vm_page_rb_tree_RB_NEXT(m);
1411 	if (next && next->pindex != m->pindex + 1)
1412 		next = NULL;
1413 	return (next);
1414 }
1415 
1416 /*
1417  * vm_page_rename()
1418  *
1419  * Move the given vm_page from its current object to the specified
1420  * target object/offset.  The page must be busy and will remain so
1421  * on return.
1422  *
1423  * new_object must be held.
1424  * This routine might block. XXX ?
1425  *
1426  * NOTE: Swap associated with the page must be invalidated by the move.  We
1427  *       have to do this for several reasons:  (1) we aren't freeing the
1428  *       page, (2) we are dirtying the page, (3) the VM system is probably
1429  *       moving the page from object A to B, and will then later move
1430  *       the backing store from A to B and we can't have a conflict.
1431  *
1432  * NOTE: We *always* dirty the page.  It is necessary both for the
1433  *       fact that we moved it, and because we may be invalidating
1434  *	 swap.  If the page is on the cache, we have to deactivate it
1435  *	 or vm_page_dirty() will panic.  Dirty pages are not allowed
1436  *	 on the cache.
1437  */
1438 void
1439 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1440 {
1441 	KKASSERT(m->busy_count & PBUSY_LOCKED);
1442 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
1443 	if (m->object) {
1444 		ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
1445 		vm_page_remove(m);
1446 	}
1447 	if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
1448 		panic("vm_page_rename: target exists (%p,%"PRIu64")",
1449 		      new_object, new_pindex);
1450 	}
1451 	if (m->queue - m->pc == PQ_CACHE)
1452 		vm_page_deactivate(m);
1453 	vm_page_dirty(m);
1454 }
1455 
1456 /*
1457  * vm_page_unqueue() without any wakeup.  This routine is used when a page
1458  * is to remain BUSYied by the caller.
1459  *
1460  * This routine may not block.
1461  */
1462 void
1463 vm_page_unqueue_nowakeup(vm_page_t m)
1464 {
1465 	vm_page_and_queue_spin_lock(m);
1466 	(void)_vm_page_rem_queue_spinlocked(m);
1467 	vm_page_spin_unlock(m);
1468 }
1469 
1470 /*
1471  * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedemon
1472  * if necessary.
1473  *
1474  * This routine may not block.
1475  */
1476 void
1477 vm_page_unqueue(vm_page_t m)
1478 {
1479 	u_short queue;
1480 
1481 	vm_page_and_queue_spin_lock(m);
1482 	queue = _vm_page_rem_queue_spinlocked(m);
1483 	if (queue == PQ_FREE || queue == PQ_CACHE) {
1484 		vm_page_spin_unlock(m);
1485 		pagedaemon_wakeup();
1486 	} else {
1487 		vm_page_spin_unlock(m);
1488 	}
1489 }
1490 
1491 /*
1492  * vm_page_list_find()
1493  *
1494  * Find a page on the specified queue with color optimization.
1495  *
1496  * The page coloring optimization attempts to locate a page that does
1497  * not overload other nearby pages in the object in the cpu's L1 or L2
1498  * caches.  We need this optimization because cpu caches tend to be
1499  * physical caches, while object spaces tend to be virtual.
1500  *
1501  * The page coloring optimization also, very importantly, tries to localize
1502  * memory to cpus and physical sockets.
1503  *
1504  * On MP systems each PQ_FREE and PQ_CACHE color queue has its own spinlock
1505  * and the algorithm is adjusted to localize allocations on a per-core basis.
1506  * This is done by 'twisting' the colors.
1507  *
1508  * The page is returned spinlocked and removed from its queue (it will
1509  * be on PQ_NONE), or NULL. The page is not BUSY'd.  The caller
1510  * is responsible for dealing with the busy-page case (usually by
1511  * deactivating the page and looping).
1512  *
1513  * NOTE:  This routine is carefully inlined.  A non-inlined version
1514  *	  is available for outside callers but the only critical path is
1515  *	  from within this source file.
1516  *
1517  * NOTE:  This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
1518  *	  represent stable storage, allowing us to order our locks vm_page
1519  *	  first, then queue.
1520  */
1521 static __inline
1522 vm_page_t
1523 _vm_page_list_find(int basequeue, int index)
1524 {
1525 	vm_page_t m;
1526 
1527 	for (;;) {
1528 		m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl);
1529 		if (m == NULL) {
1530 			m = _vm_page_list_find2(basequeue, index);
1531 			return(m);
1532 		}
1533 		vm_page_and_queue_spin_lock(m);
1534 		if (m->queue == basequeue + index) {
1535 			_vm_page_rem_queue_spinlocked(m);
1536 			/* vm_page_t spin held, no queue spin */
1537 			break;
1538 		}
1539 		vm_page_and_queue_spin_unlock(m);
1540 	}
1541 	return(m);
1542 }
1543 
1544 /*
1545  * If we could not find the page in the desired queue try to find it in
1546  * a nearby queue.
1547  */
1548 static vm_page_t
1549 _vm_page_list_find2(int basequeue, int index)
1550 {
1551 	struct vpgqueues *pq;
1552 	vm_page_t m = NULL;
1553 	int pqmask = PQ_SET_ASSOC_MASK >> 1;
1554 	int pqi;
1555 	int i;
1556 
1557 	index &= PQ_L2_MASK;
1558 	pq = &vm_page_queues[basequeue];
1559 
1560 	/*
1561 	 * Run local sets of 16, 32, 64, 128, and the whole queue if all
1562 	 * else fails (PQ_L2_MASK which is 255).
1563 	 */
1564 	do {
1565 		pqmask = (pqmask << 1) | 1;
1566 		for (i = 0; i <= pqmask; ++i) {
1567 			pqi = (index & ~pqmask) | ((index + i) & pqmask);
1568 			m = TAILQ_FIRST(&pq[pqi].pl);
1569 			if (m) {
1570 				_vm_page_and_queue_spin_lock(m);
1571 				if (m->queue == basequeue + pqi) {
1572 					_vm_page_rem_queue_spinlocked(m);
1573 					return(m);
1574 				}
1575 				_vm_page_and_queue_spin_unlock(m);
1576 				--i;
1577 				continue;
1578 			}
1579 		}
1580 	} while (pqmask != PQ_L2_MASK);
1581 
1582 	return(m);
1583 }
1584 
1585 /*
1586  * Returns a vm_page candidate for allocation.  The page is not busied so
1587  * it can move around.  The caller must busy the page (and typically
1588  * deactivate it if it cannot be busied!)
1589  *
1590  * Returns a spinlocked vm_page that has been removed from its queue.
1591  */
1592 vm_page_t
1593 vm_page_list_find(int basequeue, int index)
1594 {
1595 	return(_vm_page_list_find(basequeue, index));
1596 }
1597 
1598 /*
1599  * Find a page on the cache queue with color optimization, remove it
1600  * from the queue, and busy it.  The returned page will not be spinlocked.
1601  *
1602  * A candidate failure will be deactivated.  Candidates can fail due to
1603  * being busied by someone else, in which case they will be deactivated.
1604  *
1605  * This routine may not block.
1606  *
1607  */
1608 static vm_page_t
1609 vm_page_select_cache(u_short pg_color)
1610 {
1611 	vm_page_t m;
1612 
1613 	for (;;) {
1614 		m = _vm_page_list_find(PQ_CACHE, pg_color & PQ_L2_MASK);
1615 		if (m == NULL)
1616 			break;
1617 		/*
1618 		 * (m) has been removed from its queue and spinlocked
1619 		 */
1620 		if (vm_page_busy_try(m, TRUE)) {
1621 			_vm_page_deactivate_locked(m, 0);
1622 			vm_page_spin_unlock(m);
1623 		} else {
1624 			/*
1625 			 * We successfully busied the page
1626 			 */
1627 			if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) == 0 &&
1628 			    m->hold_count == 0 &&
1629 			    m->wire_count == 0 &&
1630 			    (m->dirty & m->valid) == 0) {
1631 				vm_page_spin_unlock(m);
1632 				pagedaemon_wakeup();
1633 				return(m);
1634 			}
1635 
1636 			/*
1637 			 * The page cannot be recycled, deactivate it.
1638 			 */
1639 			_vm_page_deactivate_locked(m, 0);
1640 			if (_vm_page_wakeup(m)) {
1641 				vm_page_spin_unlock(m);
1642 				wakeup(m);
1643 			} else {
1644 				vm_page_spin_unlock(m);
1645 			}
1646 		}
1647 	}
1648 	return (m);
1649 }
1650 
1651 /*
1652  * Find a free page.  We attempt to inline the nominal case and fall back
1653  * to _vm_page_select_free() otherwise.  A busied page is removed from
1654  * the queue and returned.
1655  *
1656  * This routine may not block.
1657  */
1658 static __inline vm_page_t
1659 vm_page_select_free(u_short pg_color)
1660 {
1661 	vm_page_t m;
1662 
1663 	for (;;) {
1664 		m = _vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK);
1665 		if (m == NULL)
1666 			break;
1667 		if (vm_page_busy_try(m, TRUE)) {
1668 			/*
1669 			 * Various mechanisms such as a pmap_collect can
1670 			 * result in a busy page on the free queue.  We
1671 			 * have to move the page out of the way so we can
1672 			 * retry the allocation.  If the other thread is not
1673 			 * allocating the page then m->valid will remain 0 and
1674 			 * the pageout daemon will free the page later on.
1675 			 *
1676 			 * Since we could not busy the page, however, we
1677 			 * cannot make assumptions as to whether the page
1678 			 * will be allocated by the other thread or not,
1679 			 * so all we can do is deactivate it to move it out
1680 			 * of the way.  In particular, if the other thread
1681 			 * wires the page it may wind up on the inactive
1682 			 * queue and the pageout daemon will have to deal
1683 			 * with that case too.
1684 			 */
1685 			_vm_page_deactivate_locked(m, 0);
1686 			vm_page_spin_unlock(m);
1687 		} else {
1688 			/*
1689 			 * Theoretically if we are able to busy the page
1690 			 * atomic with the queue removal (using the vm_page
1691 			 * lock) nobody else should be able to mess with the
1692 			 * page before us.
1693 			 */
1694 			KKASSERT((m->flags & (PG_UNMANAGED |
1695 					      PG_NEED_COMMIT)) == 0);
1696 			KASSERT(m->hold_count == 0, ("m->hold_count is not zero "
1697 						     "pg %p q=%d flags=%08x hold=%d wire=%d",
1698 						     m, m->queue, m->flags, m->hold_count, m->wire_count));
1699 			KKASSERT(m->wire_count == 0);
1700 			vm_page_spin_unlock(m);
1701 			pagedaemon_wakeup();
1702 
1703 			/* return busied and removed page */
1704 			return(m);
1705 		}
1706 	}
1707 	return(m);
1708 }
1709 
1710 /*
1711  * vm_page_alloc()
1712  *
1713  * Allocate and return a memory cell associated with this VM object/offset
1714  * pair.  If object is NULL an unassociated page will be allocated.
1715  *
1716  * The returned page will be busied and removed from its queues.  This
1717  * routine can block and may return NULL if a race occurs and the page
1718  * is found to already exist at the specified (object, pindex).
1719  *
1720  *	VM_ALLOC_NORMAL		allow use of cache pages, nominal free drain
1721  *	VM_ALLOC_QUICK		like normal but cannot use cache
1722  *	VM_ALLOC_SYSTEM		greater free drain
1723  *	VM_ALLOC_INTERRUPT	allow free list to be completely drained
1724  *	VM_ALLOC_ZERO		advisory request for pre-zero'd page only
1725  *	VM_ALLOC_FORCE_ZERO	advisory request for pre-zero'd page only
1726  *	VM_ALLOC_NULL_OK	ok to return NULL on insertion collision
1727  *				(see vm_page_grab())
1728  *	VM_ALLOC_USE_GD		ok to use per-gd cache
1729  *
1730  *	VM_ALLOC_CPU(n)		allocate using specified cpu localization
1731  *
1732  * The object must be held if not NULL
1733  * This routine may not block
1734  *
1735  * Additional special handling is required when called from an interrupt
1736  * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
1737  * in this case.
1738  */
1739 vm_page_t
1740 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
1741 {
1742 	globaldata_t gd;
1743 	vm_object_t obj;
1744 	vm_page_t m;
1745 	u_short pg_color;
1746 	int cpuid_local;
1747 
1748 #if 0
1749 	/*
1750 	 * Special per-cpu free VM page cache.  The pages are pre-busied
1751 	 * and pre-zerod for us.
1752 	 */
1753 	if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
1754 		crit_enter_gd(gd);
1755 		if (gd->gd_vmpg_count) {
1756 			m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
1757 			crit_exit_gd(gd);
1758 			goto done;
1759                 }
1760 		crit_exit_gd(gd);
1761         }
1762 #endif
1763 	m = NULL;
1764 
1765 	/*
1766 	 * CPU LOCALIZATION
1767 	 *
1768 	 * CPU localization algorithm.  Break the page queues up by physical
1769 	 * id and core id (note that two cpu threads will have the same core
1770 	 * id, and core_id != gd_cpuid).
1771 	 *
1772 	 * This is nowhere near perfect, for example the last pindex in a
1773 	 * subgroup will overflow into the next cpu or package.  But this
1774 	 * should get us good page reuse locality in heavy mixed loads.
1775 	 *
1776 	 * (may be executed before the APs are started, so other GDs might
1777 	 *  not exist!)
1778 	 */
1779 	if (page_req & VM_ALLOC_CPU_SPEC)
1780 		cpuid_local = VM_ALLOC_GETCPU(page_req);
1781 	else
1782 		cpuid_local = mycpu->gd_cpuid;
1783 
1784 	pg_color = vm_get_pg_color(cpuid_local, object, pindex);
1785 
1786 	KKASSERT(page_req &
1787 		(VM_ALLOC_NORMAL|VM_ALLOC_QUICK|
1788 		 VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
1789 
1790 	/*
1791 	 * Certain system threads (pageout daemon, buf_daemon's) are
1792 	 * allowed to eat deeper into the free page list.
1793 	 */
1794 	if (curthread->td_flags & TDF_SYSTHREAD)
1795 		page_req |= VM_ALLOC_SYSTEM;
1796 
1797 	/*
1798 	 * Impose various limitations.  Note that the v_free_reserved test
1799 	 * must match the opposite of vm_page_count_target() to avoid
1800 	 * livelocks, be careful.
1801 	 */
1802 loop:
1803 	gd = mycpu;
1804 	if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
1805 	    ((page_req & VM_ALLOC_INTERRUPT) &&
1806 	     gd->gd_vmstats.v_free_count > 0) ||
1807 	    ((page_req & VM_ALLOC_SYSTEM) &&
1808 	     gd->gd_vmstats.v_cache_count == 0 &&
1809 		gd->gd_vmstats.v_free_count >
1810 		gd->gd_vmstats.v_interrupt_free_min)
1811 	) {
1812 		/*
1813 		 * The free queue has sufficient free pages to take one out.
1814 		 */
1815 		m = vm_page_select_free(pg_color);
1816 	} else if (page_req & VM_ALLOC_NORMAL) {
1817 		/*
1818 		 * Allocatable from the cache (non-interrupt only).  On
1819 		 * success, we must free the page and try again, thus
1820 		 * ensuring that vmstats.v_*_free_min counters are replenished.
1821 		 */
1822 #ifdef INVARIANTS
1823 		if (curthread->td_preempted) {
1824 			kprintf("vm_page_alloc(): warning, attempt to allocate"
1825 				" cache page from preempting interrupt\n");
1826 			m = NULL;
1827 		} else {
1828 			m = vm_page_select_cache(pg_color);
1829 		}
1830 #else
1831 		m = vm_page_select_cache(pg_color);
1832 #endif
1833 		/*
1834 		 * On success move the page into the free queue and loop.
1835 		 *
1836 		 * Only do this if we can safely acquire the vm_object lock,
1837 		 * because this is effectively a random page and the caller
1838 		 * might be holding the lock shared, we don't want to
1839 		 * deadlock.
1840 		 */
1841 		if (m != NULL) {
1842 			KASSERT(m->dirty == 0,
1843 				("Found dirty cache page %p", m));
1844 			if ((obj = m->object) != NULL) {
1845 				if (vm_object_hold_try(obj)) {
1846 					vm_page_protect(m, VM_PROT_NONE);
1847 					vm_page_free(m);
1848 					/* m->object NULL here */
1849 					vm_object_drop(obj);
1850 				} else {
1851 					vm_page_deactivate(m);
1852 					vm_page_wakeup(m);
1853 				}
1854 			} else {
1855 				vm_page_protect(m, VM_PROT_NONE);
1856 				vm_page_free(m);
1857 			}
1858 			goto loop;
1859 		}
1860 
1861 		/*
1862 		 * On failure return NULL
1863 		 */
1864 		atomic_add_int(&vm_pageout_deficit, 1);
1865 		pagedaemon_wakeup();
1866 		return (NULL);
1867 	} else {
1868 		/*
1869 		 * No pages available, wakeup the pageout daemon and give up.
1870 		 */
1871 		atomic_add_int(&vm_pageout_deficit, 1);
1872 		pagedaemon_wakeup();
1873 		return (NULL);
1874 	}
1875 
1876 	/*
1877 	 * v_free_count can race so loop if we don't find the expected
1878 	 * page.
1879 	 */
1880 	if (m == NULL) {
1881 		vmstats_rollup();
1882 		goto loop;
1883 	}
1884 
1885 	/*
1886 	 * Good page found.  The page has already been busied for us and
1887 	 * removed from its queues.
1888 	 */
1889 	KASSERT(m->dirty == 0,
1890 		("vm_page_alloc: free/cache page %p was dirty", m));
1891 	KKASSERT(m->queue == PQ_NONE);
1892 
1893 #if 0
1894 done:
1895 #endif
1896 	/*
1897 	 * Initialize the structure, inheriting some flags but clearing
1898 	 * all the rest.  The page has already been busied for us.
1899 	 */
1900 	vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
1901 
1902 	KKASSERT(m->wire_count == 0);
1903 	KKASSERT((m->busy_count & PBUSY_MASK) == 0);
1904 	m->act_count = 0;
1905 	m->valid = 0;
1906 
1907 	/*
1908 	 * Caller must be holding the object lock (asserted by
1909 	 * vm_page_insert()).
1910 	 *
1911 	 * NOTE: Inserting a page here does not insert it into any pmaps
1912 	 *	 (which could cause us to block allocating memory).
1913 	 *
1914 	 * NOTE: If no object an unassociated page is allocated, m->pindex
1915 	 *	 can be used by the caller for any purpose.
1916 	 */
1917 	if (object) {
1918 		if (vm_page_insert(m, object, pindex) == FALSE) {
1919 			vm_page_free(m);
1920 			if ((page_req & VM_ALLOC_NULL_OK) == 0)
1921 				panic("PAGE RACE %p[%ld]/%p",
1922 				      object, (long)pindex, m);
1923 			m = NULL;
1924 		}
1925 	} else {
1926 		m->pindex = pindex;
1927 	}
1928 
1929 	/*
1930 	 * Don't wakeup too often - wakeup the pageout daemon when
1931 	 * we would be nearly out of memory.
1932 	 */
1933 	pagedaemon_wakeup();
1934 
1935 	/*
1936 	 * A BUSY page is returned.
1937 	 */
1938 	return (m);
1939 }
1940 
1941 /*
1942  * Returns number of pages available in our DMA memory reserve
1943  * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf)
1944  */
1945 vm_size_t
1946 vm_contig_avail_pages(void)
1947 {
1948 	alist_blk_t blk;
1949 	alist_blk_t count;
1950 	alist_blk_t bfree;
1951 	spin_lock(&vm_contig_spin);
1952 	bfree = alist_free_info(&vm_contig_alist, &blk, &count);
1953 	spin_unlock(&vm_contig_spin);
1954 
1955 	return bfree;
1956 }
1957 
1958 /*
1959  * Attempt to allocate contiguous physical memory with the specified
1960  * requirements.
1961  */
1962 vm_page_t
1963 vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
1964 		     unsigned long alignment, unsigned long boundary,
1965 		     unsigned long size, vm_memattr_t memattr)
1966 {
1967 	alist_blk_t blk;
1968 	vm_page_t m;
1969 	vm_pindex_t i;
1970 	static vm_pindex_t contig_rover;
1971 
1972 	alignment >>= PAGE_SHIFT;
1973 	if (alignment == 0)
1974 		alignment = 1;
1975 	boundary >>= PAGE_SHIFT;
1976 	if (boundary == 0)
1977 		boundary = 1;
1978 	size = (size + PAGE_MASK) >> PAGE_SHIFT;
1979 
1980 	if (high == BUS_SPACE_MAXADDR && alignment <= PAGE_SIZE &&
1981 	    boundary <= PAGE_SIZE && size == 1 &&
1982 	    memattr == VM_MEMATTR_DEFAULT) {
1983 		/*
1984 		 * Any page will work, use vm_page_alloc()
1985 		 * (e.g. when used from kmem_alloc_attr())
1986 		 */
1987 		m = vm_page_alloc(NULL, (contig_rover++) & 0x7FFFFFFF,
1988 				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
1989 				  VM_ALLOC_INTERRUPT);
1990 		m->valid = VM_PAGE_BITS_ALL;
1991 		vm_page_wire(m);
1992 		vm_page_wakeup(m);
1993 	} else {
1994 		/*
1995 		 * Use the low-memory dma reserve
1996 		 */
1997 		spin_lock(&vm_contig_spin);
1998 		blk = alist_alloc(&vm_contig_alist, 0, size);
1999 		if (blk == ALIST_BLOCK_NONE) {
2000 			spin_unlock(&vm_contig_spin);
2001 			if (bootverbose) {
2002 				kprintf("vm_page_alloc_contig: %ldk nospace\n",
2003 					(size << PAGE_SHIFT) / 1024);
2004 				print_backtrace(5);
2005 			}
2006 			return(NULL);
2007 		}
2008 		if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
2009 			alist_free(&vm_contig_alist, blk, size);
2010 			spin_unlock(&vm_contig_spin);
2011 			if (bootverbose) {
2012 				kprintf("vm_page_alloc_contig: %ldk high "
2013 					"%016jx failed\n",
2014 					(size << PAGE_SHIFT) / 1024,
2015 					(intmax_t)high);
2016 			}
2017 			return(NULL);
2018 		}
2019 		spin_unlock(&vm_contig_spin);
2020 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
2021 	}
2022 	if (vm_contig_verbose) {
2023 		kprintf("vm_page_alloc_contig: %016jx/%ldk "
2024 			"(%016jx-%016jx al=%lu bo=%lu pgs=%lu attr=%d\n",
2025 			(intmax_t)m->phys_addr,
2026 			(size << PAGE_SHIFT) / 1024,
2027 			low, high, alignment, boundary, size, memattr);
2028 	}
2029 	if (memattr != VM_MEMATTR_DEFAULT) {
2030 		for (i = 0;i < size; i++)
2031 			pmap_page_set_memattr(&m[i], memattr);
2032 	}
2033 	return m;
2034 }
2035 
2036 /*
2037  * Free contiguously allocated pages.  The pages will be wired but not busy.
2038  * When freeing to the alist we leave them wired and not busy.
2039  */
2040 void
2041 vm_page_free_contig(vm_page_t m, unsigned long size)
2042 {
2043 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
2044 	vm_pindex_t start = pa >> PAGE_SHIFT;
2045 	vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
2046 
2047 	if (vm_contig_verbose) {
2048 		kprintf("vm_page_free_contig:  %016jx/%ldk\n",
2049 			(intmax_t)pa, size / 1024);
2050 	}
2051 	if (pa < vm_low_phys_reserved) {
2052 		KKASSERT(pa + size <= vm_low_phys_reserved);
2053 		spin_lock(&vm_contig_spin);
2054 		alist_free(&vm_contig_alist, start, pages);
2055 		spin_unlock(&vm_contig_spin);
2056 	} else {
2057 		while (pages) {
2058 			vm_page_busy_wait(m, FALSE, "cpgfr");
2059 			vm_page_unwire(m, 0);
2060 			vm_page_free(m);
2061 			--pages;
2062 			++m;
2063 		}
2064 
2065 	}
2066 }
2067 
2068 
2069 /*
2070  * Wait for sufficient free memory for nominal heavy memory use kernel
2071  * operations.
2072  *
2073  * WARNING!  Be sure never to call this in any vm_pageout code path, which
2074  *	     will trivially deadlock the system.
2075  */
2076 void
2077 vm_wait_nominal(void)
2078 {
2079 	while (vm_page_count_min(0))
2080 		vm_wait(0);
2081 }
2082 
2083 /*
2084  * Test if vm_wait_nominal() would block.
2085  */
2086 int
2087 vm_test_nominal(void)
2088 {
2089 	if (vm_page_count_min(0))
2090 		return(1);
2091 	return(0);
2092 }
2093 
2094 /*
2095  * Block until free pages are available for allocation, called in various
2096  * places before memory allocations.
2097  *
2098  * The caller may loop if vm_page_count_min() == FALSE so we cannot be
2099  * more generous then that.
2100  */
2101 void
2102 vm_wait(int timo)
2103 {
2104 	/*
2105 	 * never wait forever
2106 	 */
2107 	if (timo == 0)
2108 		timo = hz;
2109 	lwkt_gettoken(&vm_token);
2110 
2111 	if (curthread == pagethread ||
2112 	    curthread == emergpager) {
2113 		/*
2114 		 * The pageout daemon itself needs pages, this is bad.
2115 		 */
2116 		if (vm_page_count_min(0)) {
2117 			vm_pageout_pages_needed = 1;
2118 			tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
2119 		}
2120 	} else {
2121 		/*
2122 		 * Wakeup the pageout daemon if necessary and wait.
2123 		 *
2124 		 * Do not wait indefinitely for the target to be reached,
2125 		 * as load might prevent it from being reached any time soon.
2126 		 * But wait a little to try to slow down page allocations
2127 		 * and to give more important threads (the pagedaemon)
2128 		 * allocation priority.
2129 		 */
2130 		if (vm_page_count_target()) {
2131 			if (vm_pages_needed == 0) {
2132 				vm_pages_needed = 1;
2133 				wakeup(&vm_pages_needed);
2134 			}
2135 			++vm_pages_waiting;	/* SMP race ok */
2136 			tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
2137 		}
2138 	}
2139 	lwkt_reltoken(&vm_token);
2140 }
2141 
2142 /*
2143  * Block until free pages are available for allocation
2144  *
2145  * Called only from vm_fault so that processes page faulting can be
2146  * easily tracked.
2147  */
2148 void
2149 vm_wait_pfault(void)
2150 {
2151 	/*
2152 	 * Wakeup the pageout daemon if necessary and wait.
2153 	 *
2154 	 * Do not wait indefinitely for the target to be reached,
2155 	 * as load might prevent it from being reached any time soon.
2156 	 * But wait a little to try to slow down page allocations
2157 	 * and to give more important threads (the pagedaemon)
2158 	 * allocation priority.
2159 	 */
2160 	if (vm_page_count_min(0)) {
2161 		lwkt_gettoken(&vm_token);
2162 		while (vm_page_count_severe()) {
2163 			if (vm_page_count_target()) {
2164 				thread_t td;
2165 
2166 				if (vm_pages_needed == 0) {
2167 					vm_pages_needed = 1;
2168 					wakeup(&vm_pages_needed);
2169 				}
2170 				++vm_pages_waiting;	/* SMP race ok */
2171 				tsleep(&vmstats.v_free_count, 0, "pfault", hz);
2172 
2173 				/*
2174 				 * Do not stay stuck in the loop if the system is trying
2175 				 * to kill the process.
2176 				 */
2177 				td = curthread;
2178 				if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL))
2179 					break;
2180 			}
2181 		}
2182 		lwkt_reltoken(&vm_token);
2183 	}
2184 }
2185 
2186 /*
2187  * Put the specified page on the active list (if appropriate).  Ensure
2188  * that act_count is at least ACT_INIT but do not otherwise mess with it.
2189  *
2190  * The caller should be holding the page busied ? XXX
2191  * This routine may not block.
2192  */
2193 void
2194 vm_page_activate(vm_page_t m)
2195 {
2196 	u_short oqueue;
2197 
2198 	vm_page_spin_lock(m);
2199 	if (m->queue - m->pc != PQ_ACTIVE) {
2200 		_vm_page_queue_spin_lock(m);
2201 		oqueue = _vm_page_rem_queue_spinlocked(m);
2202 		/* page is left spinlocked, queue is unlocked */
2203 
2204 		if (oqueue == PQ_CACHE)
2205 			mycpu->gd_cnt.v_reactivated++;
2206 		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
2207 			if (m->act_count < ACT_INIT)
2208 				m->act_count = ACT_INIT;
2209 			_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
2210 		}
2211 		_vm_page_and_queue_spin_unlock(m);
2212 		if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
2213 			pagedaemon_wakeup();
2214 	} else {
2215 		if (m->act_count < ACT_INIT)
2216 			m->act_count = ACT_INIT;
2217 		vm_page_spin_unlock(m);
2218 	}
2219 }
2220 
2221 /*
2222  * Helper routine for vm_page_free_toq() and vm_page_cache().  This
2223  * routine is called when a page has been added to the cache or free
2224  * queues.
2225  *
2226  * This routine may not block.
2227  */
2228 static __inline void
2229 vm_page_free_wakeup(void)
2230 {
2231 	globaldata_t gd = mycpu;
2232 
2233 	/*
2234 	 * If the pageout daemon itself needs pages, then tell it that
2235 	 * there are some free.
2236 	 */
2237 	if (vm_pageout_pages_needed &&
2238 	    gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
2239 	    gd->gd_vmstats.v_pageout_free_min
2240 	) {
2241 		vm_pageout_pages_needed = 0;
2242 		wakeup(&vm_pageout_pages_needed);
2243 	}
2244 
2245 	/*
2246 	 * Wakeup processes that are waiting on memory.
2247 	 *
2248 	 * Generally speaking we want to wakeup stuck processes as soon as
2249 	 * possible.  !vm_page_count_min(0) is the absolute minimum point
2250 	 * where we can do this.  Wait a bit longer to reduce degenerate
2251 	 * re-blocking (vm_page_free_hysteresis).  The target check is just
2252 	 * to make sure the min-check w/hysteresis does not exceed the
2253 	 * normal target.
2254 	 */
2255 	if (vm_pages_waiting) {
2256 		if (!vm_page_count_min(vm_page_free_hysteresis) ||
2257 		    !vm_page_count_target()) {
2258 			vm_pages_waiting = 0;
2259 			wakeup(&vmstats.v_free_count);
2260 			++mycpu->gd_cnt.v_ppwakeups;
2261 		}
2262 #if 0
2263 		if (!vm_page_count_target()) {
2264 			/*
2265 			 * Plenty of pages are free, wakeup everyone.
2266 			 */
2267 			vm_pages_waiting = 0;
2268 			wakeup(&vmstats.v_free_count);
2269 			++mycpu->gd_cnt.v_ppwakeups;
2270 		} else if (!vm_page_count_min(0)) {
2271 			/*
2272 			 * Some pages are free, wakeup someone.
2273 			 */
2274 			int wcount = vm_pages_waiting;
2275 			if (wcount > 0)
2276 				--wcount;
2277 			vm_pages_waiting = wcount;
2278 			wakeup_one(&vmstats.v_free_count);
2279 			++mycpu->gd_cnt.v_ppwakeups;
2280 		}
2281 #endif
2282 	}
2283 }
2284 
2285 /*
2286  * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
2287  * it from its VM object.
2288  *
2289  * The vm_page must be BUSY on entry.  BUSY will be released on
2290  * return (the page will have been freed).
2291  */
2292 void
2293 vm_page_free_toq(vm_page_t m)
2294 {
2295 	mycpu->gd_cnt.v_tfree++;
2296 	KKASSERT((m->flags & PG_MAPPED) == 0);
2297 	KKASSERT(m->busy_count & PBUSY_LOCKED);
2298 
2299 	if ((m->busy_count & PBUSY_MASK) || ((m->queue - m->pc) == PQ_FREE)) {
2300 		kprintf("vm_page_free: pindex(%lu), busy %08x, "
2301 			"hold(%d)\n",
2302 			(u_long)m->pindex, m->busy_count, m->hold_count);
2303 		if ((m->queue - m->pc) == PQ_FREE)
2304 			panic("vm_page_free: freeing free page");
2305 		else
2306 			panic("vm_page_free: freeing busy page");
2307 	}
2308 
2309 	/*
2310 	 * Remove from object, spinlock the page and its queues and
2311 	 * remove from any queue.  No queue spinlock will be held
2312 	 * after this section (because the page was removed from any
2313 	 * queue).
2314 	 */
2315 	vm_page_remove(m);
2316 	vm_page_and_queue_spin_lock(m);
2317 	_vm_page_rem_queue_spinlocked(m);
2318 
2319 	/*
2320 	 * No further management of fictitious pages occurs beyond object
2321 	 * and queue removal.
2322 	 */
2323 	if ((m->flags & PG_FICTITIOUS) != 0) {
2324 		vm_page_spin_unlock(m);
2325 		vm_page_wakeup(m);
2326 		return;
2327 	}
2328 
2329 	m->valid = 0;
2330 	vm_page_undirty(m);
2331 
2332 	if (m->wire_count != 0) {
2333 		if (m->wire_count > 1) {
2334 		    panic(
2335 			"vm_page_free: invalid wire count (%d), pindex: 0x%lx",
2336 			m->wire_count, (long)m->pindex);
2337 		}
2338 		panic("vm_page_free: freeing wired page");
2339 	}
2340 
2341 	/*
2342 	 * Clear the UNMANAGED flag when freeing an unmanaged page.
2343 	 * Clear the NEED_COMMIT flag
2344 	 */
2345 	if (m->flags & PG_UNMANAGED)
2346 		vm_page_flag_clear(m, PG_UNMANAGED);
2347 	if (m->flags & PG_NEED_COMMIT)
2348 		vm_page_flag_clear(m, PG_NEED_COMMIT);
2349 
2350 	if (m->hold_count != 0) {
2351 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
2352 	} else {
2353 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
2354 	}
2355 
2356 	/*
2357 	 * This sequence allows us to clear BUSY while still holding
2358 	 * its spin lock, which reduces contention vs allocators.  We
2359 	 * must not leave the queue locked or _vm_page_wakeup() may
2360 	 * deadlock.
2361 	 */
2362 	_vm_page_queue_spin_unlock(m);
2363 	if (_vm_page_wakeup(m)) {
2364 		vm_page_spin_unlock(m);
2365 		wakeup(m);
2366 	} else {
2367 		vm_page_spin_unlock(m);
2368 	}
2369 	vm_page_free_wakeup();
2370 }
2371 
2372 /*
2373  * vm_page_unmanage()
2374  *
2375  * Prevent PV management from being done on the page.  The page is
2376  * removed from the paging queues as if it were wired, and as a
2377  * consequence of no longer being managed the pageout daemon will not
2378  * touch it (since there is no way to locate the pte mappings for the
2379  * page).  madvise() calls that mess with the pmap will also no longer
2380  * operate on the page.
2381  *
2382  * Beyond that the page is still reasonably 'normal'.  Freeing the page
2383  * will clear the flag.
2384  *
2385  * This routine is used by OBJT_PHYS objects - objects using unswappable
2386  * physical memory as backing store rather then swap-backed memory and
2387  * will eventually be extended to support 4MB unmanaged physical
2388  * mappings.
2389  *
2390  * Caller must be holding the page busy.
2391  */
2392 void
2393 vm_page_unmanage(vm_page_t m)
2394 {
2395 	KKASSERT(m->busy_count & PBUSY_LOCKED);
2396 	if ((m->flags & PG_UNMANAGED) == 0) {
2397 		if (m->wire_count == 0)
2398 			vm_page_unqueue(m);
2399 	}
2400 	vm_page_flag_set(m, PG_UNMANAGED);
2401 }
2402 
2403 /*
2404  * Mark this page as wired down by yet another map, removing it from
2405  * paging queues as necessary.
2406  *
2407  * Caller must be holding the page busy.
2408  */
2409 void
2410 vm_page_wire(vm_page_t m)
2411 {
2412 	/*
2413 	 * Only bump the wire statistics if the page is not already wired,
2414 	 * and only unqueue the page if it is on some queue (if it is unmanaged
2415 	 * it is already off the queues).  Don't do anything with fictitious
2416 	 * pages because they are always wired.
2417 	 */
2418 	KKASSERT(m->busy_count & PBUSY_LOCKED);
2419 	if ((m->flags & PG_FICTITIOUS) == 0) {
2420 		if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
2421 			if ((m->flags & PG_UNMANAGED) == 0)
2422 				vm_page_unqueue(m);
2423 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count, 1);
2424 		}
2425 		KASSERT(m->wire_count != 0,
2426 			("vm_page_wire: wire_count overflow m=%p", m));
2427 	}
2428 }
2429 
2430 /*
2431  * Release one wiring of this page, potentially enabling it to be paged again.
2432  *
2433  * Many pages placed on the inactive queue should actually go
2434  * into the cache, but it is difficult to figure out which.  What
2435  * we do instead, if the inactive target is well met, is to put
2436  * clean pages at the head of the inactive queue instead of the tail.
2437  * This will cause them to be moved to the cache more quickly and
2438  * if not actively re-referenced, freed more quickly.  If we just
2439  * stick these pages at the end of the inactive queue, heavy filesystem
2440  * meta-data accesses can cause an unnecessary paging load on memory bound
2441  * processes.  This optimization causes one-time-use metadata to be
2442  * reused more quickly.
2443  *
2444  * Pages marked PG_NEED_COMMIT are always activated and never placed on
2445  * the inactive queue.  This helps the pageout daemon determine memory
2446  * pressure and act on out-of-memory situations more quickly.
2447  *
2448  * BUT, if we are in a low-memory situation we have no choice but to
2449  * put clean pages on the cache queue.
2450  *
2451  * A number of routines use vm_page_unwire() to guarantee that the page
2452  * will go into either the inactive or active queues, and will NEVER
2453  * be placed in the cache - for example, just after dirtying a page.
2454  * dirty pages in the cache are not allowed.
2455  *
2456  * This routine may not block.
2457  */
2458 void
2459 vm_page_unwire(vm_page_t m, int activate)
2460 {
2461 	KKASSERT(m->busy_count & PBUSY_LOCKED);
2462 	if (m->flags & PG_FICTITIOUS) {
2463 		/* do nothing */
2464 	} else if (m->wire_count <= 0) {
2465 		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
2466 	} else {
2467 		if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
2468 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count,-1);
2469 			if (m->flags & PG_UNMANAGED) {
2470 				;
2471 			} else if (activate || (m->flags & PG_NEED_COMMIT)) {
2472 				vm_page_spin_lock(m);
2473 				_vm_page_add_queue_spinlocked(m,
2474 							PQ_ACTIVE + m->pc, 0);
2475 				_vm_page_and_queue_spin_unlock(m);
2476 			} else {
2477 				vm_page_spin_lock(m);
2478 				vm_page_flag_clear(m, PG_WINATCFLS);
2479 				_vm_page_add_queue_spinlocked(m,
2480 							PQ_INACTIVE + m->pc, 0);
2481 				++vm_swapcache_inactive_heuristic;
2482 				_vm_page_and_queue_spin_unlock(m);
2483 			}
2484 		}
2485 	}
2486 }
2487 
2488 /*
2489  * Move the specified page to the inactive queue.  If the page has
2490  * any associated swap, the swap is deallocated.
2491  *
2492  * Normally athead is 0 resulting in LRU operation.  athead is set
2493  * to 1 if we want this page to be 'as if it were placed in the cache',
2494  * except without unmapping it from the process address space.
2495  *
2496  * vm_page's spinlock must be held on entry and will remain held on return.
2497  * This routine may not block.
2498  */
2499 static void
2500 _vm_page_deactivate_locked(vm_page_t m, int athead)
2501 {
2502 	u_short oqueue;
2503 
2504 	/*
2505 	 * Ignore if already inactive.
2506 	 */
2507 	if (m->queue - m->pc == PQ_INACTIVE)
2508 		return;
2509 	_vm_page_queue_spin_lock(m);
2510 	oqueue = _vm_page_rem_queue_spinlocked(m);
2511 
2512 	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
2513 		if (oqueue == PQ_CACHE)
2514 			mycpu->gd_cnt.v_reactivated++;
2515 		vm_page_flag_clear(m, PG_WINATCFLS);
2516 		_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
2517 		if (athead == 0)
2518 			++vm_swapcache_inactive_heuristic;
2519 	}
2520 	/* NOTE: PQ_NONE if condition not taken */
2521 	_vm_page_queue_spin_unlock(m);
2522 	/* leaves vm_page spinlocked */
2523 }
2524 
2525 /*
2526  * Attempt to deactivate a page.
2527  *
2528  * No requirements.
2529  */
2530 void
2531 vm_page_deactivate(vm_page_t m)
2532 {
2533 	vm_page_spin_lock(m);
2534 	_vm_page_deactivate_locked(m, 0);
2535 	vm_page_spin_unlock(m);
2536 }
2537 
2538 void
2539 vm_page_deactivate_locked(vm_page_t m)
2540 {
2541 	_vm_page_deactivate_locked(m, 0);
2542 }
2543 
2544 /*
2545  * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy it.
2546  *
2547  * This function returns non-zero if it successfully moved the page to
2548  * PQ_CACHE.
2549  *
2550  * This function unconditionally unbusies the page on return.
2551  */
2552 int
2553 vm_page_try_to_cache(vm_page_t m)
2554 {
2555 	vm_page_spin_lock(m);
2556 	if (m->dirty || m->hold_count || m->wire_count ||
2557 	    (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT))) {
2558 		if (_vm_page_wakeup(m)) {
2559 			vm_page_spin_unlock(m);
2560 			wakeup(m);
2561 		} else {
2562 			vm_page_spin_unlock(m);
2563 		}
2564 		return(0);
2565 	}
2566 	vm_page_spin_unlock(m);
2567 
2568 	/*
2569 	 * Page busied by us and no longer spinlocked.  Dirty pages cannot
2570 	 * be moved to the cache.
2571 	 */
2572 	vm_page_test_dirty(m);
2573 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2574 		vm_page_wakeup(m);
2575 		return(0);
2576 	}
2577 	vm_page_cache(m);
2578 	return(1);
2579 }
2580 
2581 /*
2582  * Attempt to free the page.  If we cannot free it, we do nothing.
2583  * 1 is returned on success, 0 on failure.
2584  *
2585  * No requirements.
2586  */
2587 int
2588 vm_page_try_to_free(vm_page_t m)
2589 {
2590 	vm_page_spin_lock(m);
2591 	if (vm_page_busy_try(m, TRUE)) {
2592 		vm_page_spin_unlock(m);
2593 		return(0);
2594 	}
2595 
2596 	/*
2597 	 * The page can be in any state, including already being on the free
2598 	 * queue.  Check to see if it really can be freed.
2599 	 */
2600 	if (m->dirty ||				/* can't free if it is dirty */
2601 	    m->hold_count ||			/* or held (XXX may be wrong) */
2602 	    m->wire_count ||			/* or wired */
2603 	    (m->flags & (PG_UNMANAGED |		/* or unmanaged */
2604 			 PG_NEED_COMMIT)) ||	/* or needs a commit */
2605 	    m->queue - m->pc == PQ_FREE ||	/* already on PQ_FREE */
2606 	    m->queue - m->pc == PQ_HOLD) {	/* already on PQ_HOLD */
2607 		if (_vm_page_wakeup(m)) {
2608 			vm_page_spin_unlock(m);
2609 			wakeup(m);
2610 		} else {
2611 			vm_page_spin_unlock(m);
2612 		}
2613 		return(0);
2614 	}
2615 	vm_page_spin_unlock(m);
2616 
2617 	/*
2618 	 * We can probably free the page.
2619 	 *
2620 	 * Page busied by us and no longer spinlocked.  Dirty pages will
2621 	 * not be freed by this function.    We have to re-test the
2622 	 * dirty bit after cleaning out the pmaps.
2623 	 */
2624 	vm_page_test_dirty(m);
2625 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2626 		vm_page_wakeup(m);
2627 		return(0);
2628 	}
2629 	vm_page_protect(m, VM_PROT_NONE);
2630 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2631 		vm_page_wakeup(m);
2632 		return(0);
2633 	}
2634 	vm_page_free(m);
2635 	return(1);
2636 }
2637 
2638 /*
2639  * vm_page_cache
2640  *
2641  * Put the specified page onto the page cache queue (if appropriate).
2642  *
2643  * The page must be busy, and this routine will release the busy and
2644  * possibly even free the page.
2645  */
2646 void
2647 vm_page_cache(vm_page_t m)
2648 {
2649 	/*
2650 	 * Not suitable for the cache
2651 	 */
2652 	if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
2653 	    (m->busy_count & PBUSY_MASK) ||
2654 	    m->wire_count || m->hold_count) {
2655 		vm_page_wakeup(m);
2656 		return;
2657 	}
2658 
2659 	/*
2660 	 * Already in the cache (and thus not mapped)
2661 	 */
2662 	if ((m->queue - m->pc) == PQ_CACHE) {
2663 		KKASSERT((m->flags & PG_MAPPED) == 0);
2664 		vm_page_wakeup(m);
2665 		return;
2666 	}
2667 
2668 	/*
2669 	 * Caller is required to test m->dirty, but note that the act of
2670 	 * removing the page from its maps can cause it to become dirty
2671 	 * on an SMP system due to another cpu running in usermode.
2672 	 */
2673 	if (m->dirty) {
2674 		panic("vm_page_cache: caching a dirty page, pindex: %ld",
2675 			(long)m->pindex);
2676 	}
2677 
2678 	/*
2679 	 * Remove all pmaps and indicate that the page is not
2680 	 * writeable or mapped.  Our vm_page_protect() call may
2681 	 * have blocked (especially w/ VM_PROT_NONE), so recheck
2682 	 * everything.
2683 	 */
2684 	vm_page_protect(m, VM_PROT_NONE);
2685 	if ((m->flags & (PG_UNMANAGED | PG_MAPPED)) ||
2686 	    (m->busy_count & PBUSY_MASK) ||
2687 	    m->wire_count || m->hold_count) {
2688 		vm_page_wakeup(m);
2689 	} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2690 		vm_page_deactivate(m);
2691 		vm_page_wakeup(m);
2692 	} else {
2693 		_vm_page_and_queue_spin_lock(m);
2694 		_vm_page_rem_queue_spinlocked(m);
2695 		_vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
2696 		_vm_page_queue_spin_unlock(m);
2697 		if (_vm_page_wakeup(m)) {
2698 			vm_page_spin_unlock(m);
2699 			wakeup(m);
2700 		} else {
2701 			vm_page_spin_unlock(m);
2702 		}
2703 		vm_page_free_wakeup();
2704 	}
2705 }
2706 
2707 /*
2708  * vm_page_dontneed()
2709  *
2710  * Cache, deactivate, or do nothing as appropriate.  This routine
2711  * is typically used by madvise() MADV_DONTNEED.
2712  *
2713  * Generally speaking we want to move the page into the cache so
2714  * it gets reused quickly.  However, this can result in a silly syndrome
2715  * due to the page recycling too quickly.  Small objects will not be
2716  * fully cached.  On the otherhand, if we move the page to the inactive
2717  * queue we wind up with a problem whereby very large objects
2718  * unnecessarily blow away our inactive and cache queues.
2719  *
2720  * The solution is to move the pages based on a fixed weighting.  We
2721  * either leave them alone, deactivate them, or move them to the cache,
2722  * where moving them to the cache has the highest weighting.
2723  * By forcing some pages into other queues we eventually force the
2724  * system to balance the queues, potentially recovering other unrelated
2725  * space from active.  The idea is to not force this to happen too
2726  * often.
2727  *
2728  * The page must be busied.
2729  */
2730 void
2731 vm_page_dontneed(vm_page_t m)
2732 {
2733 	static int dnweight;
2734 	int dnw;
2735 	int head;
2736 
2737 	dnw = ++dnweight;
2738 
2739 	/*
2740 	 * occassionally leave the page alone
2741 	 */
2742 	if ((dnw & 0x01F0) == 0 ||
2743 	    m->queue - m->pc == PQ_INACTIVE ||
2744 	    m->queue - m->pc == PQ_CACHE
2745 	) {
2746 		if (m->act_count >= ACT_INIT)
2747 			--m->act_count;
2748 		return;
2749 	}
2750 
2751 	/*
2752 	 * If vm_page_dontneed() is inactivating a page, it must clear
2753 	 * the referenced flag; otherwise the pagedaemon will see references
2754 	 * on the page in the inactive queue and reactivate it. Until the
2755 	 * page can move to the cache queue, madvise's job is not done.
2756 	 */
2757 	vm_page_flag_clear(m, PG_REFERENCED);
2758 	pmap_clear_reference(m);
2759 
2760 	if (m->dirty == 0)
2761 		vm_page_test_dirty(m);
2762 
2763 	if (m->dirty || (dnw & 0x0070) == 0) {
2764 		/*
2765 		 * Deactivate the page 3 times out of 32.
2766 		 */
2767 		head = 0;
2768 	} else {
2769 		/*
2770 		 * Cache the page 28 times out of every 32.  Note that
2771 		 * the page is deactivated instead of cached, but placed
2772 		 * at the head of the queue instead of the tail.
2773 		 */
2774 		head = 1;
2775 	}
2776 	vm_page_spin_lock(m);
2777 	_vm_page_deactivate_locked(m, head);
2778 	vm_page_spin_unlock(m);
2779 }
2780 
2781 /*
2782  * These routines manipulate the 'soft busy' count for a page.  A soft busy
2783  * is almost like a hard BUSY except that it allows certain compatible
2784  * operations to occur on the page while it is busy.  For example, a page
2785  * undergoing a write can still be mapped read-only.
2786  *
2787  * We also use soft-busy to quickly pmap_enter shared read-only pages
2788  * without having to hold the page locked.
2789  *
2790  * The soft-busy count can be > 1 in situations where multiple threads
2791  * are pmap_enter()ing the same page simultaneously, or when two buffer
2792  * cache buffers overlap the same page.
2793  *
2794  * The caller must hold the page BUSY when making these two calls.
2795  */
2796 void
2797 vm_page_io_start(vm_page_t m)
2798 {
2799 	uint32_t ocount;
2800 
2801 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
2802 	KKASSERT(ocount & PBUSY_LOCKED);
2803 }
2804 
2805 void
2806 vm_page_io_finish(vm_page_t m)
2807 {
2808 	uint32_t ocount;
2809 
2810 	ocount = atomic_fetchadd_int(&m->busy_count, -1);
2811 	KKASSERT(ocount & PBUSY_MASK);
2812 #if 0
2813 	if (((ocount - 1) & (PBUSY_LOCKED | PBUSY_MASK)) == 0)
2814 		wakeup(m);
2815 #endif
2816 }
2817 
2818 /*
2819  * Attempt to soft-busy a page.  The page must not be PBUSY_LOCKED.
2820  *
2821  * Returns 0 on success, non-zero on failure.
2822  */
2823 int
2824 vm_page_sbusy_try(vm_page_t m)
2825 {
2826 	uint32_t ocount;
2827 
2828 	if (m->busy_count & PBUSY_LOCKED)
2829 		return 1;
2830 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
2831 	if (ocount & PBUSY_LOCKED) {
2832 		vm_page_sbusy_drop(m);
2833 		return 1;
2834 	}
2835 	return 0;
2836 }
2837 
2838 /*
2839  * Indicate that a clean VM page requires a filesystem commit and cannot
2840  * be reused.  Used by tmpfs.
2841  */
2842 void
2843 vm_page_need_commit(vm_page_t m)
2844 {
2845 	vm_page_flag_set(m, PG_NEED_COMMIT);
2846 	vm_object_set_writeable_dirty(m->object);
2847 }
2848 
2849 void
2850 vm_page_clear_commit(vm_page_t m)
2851 {
2852 	vm_page_flag_clear(m, PG_NEED_COMMIT);
2853 }
2854 
2855 /*
2856  * Grab a page, blocking if it is busy and allocating a page if necessary.
2857  * A busy page is returned or NULL.  The page may or may not be valid and
2858  * might not be on a queue (the caller is responsible for the disposition of
2859  * the page).
2860  *
2861  * If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the
2862  * page will be zero'd and marked valid.
2863  *
2864  * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
2865  * valid even if it already exists.
2866  *
2867  * If VM_ALLOC_RETRY is specified this routine will never return NULL.  Also
2868  * note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified.
2869  * VM_ALLOC_NULL_OK is implied when VM_ALLOC_RETRY is specified.
2870  *
2871  * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
2872  * always returned if we had blocked.
2873  *
2874  * This routine may not be called from an interrupt.
2875  *
2876  * No other requirements.
2877  */
2878 vm_page_t
2879 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2880 {
2881 	vm_page_t m;
2882 	int error;
2883 	int shared = 1;
2884 
2885 	KKASSERT(allocflags &
2886 		(VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
2887 	vm_object_hold_shared(object);
2888 	for (;;) {
2889 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
2890 		if (error) {
2891 			vm_page_sleep_busy(m, TRUE, "pgrbwt");
2892 			if ((allocflags & VM_ALLOC_RETRY) == 0) {
2893 				m = NULL;
2894 				break;
2895 			}
2896 			/* retry */
2897 		} else if (m == NULL) {
2898 			if (shared) {
2899 				vm_object_upgrade(object);
2900 				shared = 0;
2901 			}
2902 			if (allocflags & VM_ALLOC_RETRY)
2903 				allocflags |= VM_ALLOC_NULL_OK;
2904 			m = vm_page_alloc(object, pindex,
2905 					  allocflags & ~VM_ALLOC_RETRY);
2906 			if (m)
2907 				break;
2908 			vm_wait(0);
2909 			if ((allocflags & VM_ALLOC_RETRY) == 0)
2910 				goto failed;
2911 		} else {
2912 			/* m found */
2913 			break;
2914 		}
2915 	}
2916 
2917 	/*
2918 	 * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
2919 	 *
2920 	 * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
2921 	 * valid even if already valid.
2922 	 *
2923 	 * NOTE!  We have removed all of the PG_ZERO optimizations and also
2924 	 *	  removed the idle zeroing code.  These optimizations actually
2925 	 *	  slow things down on modern cpus because the zerod area is
2926 	 *	  likely uncached, placing a memory-access burden on the
2927 	 *	  accesors taking the fault.
2928 	 *
2929 	 *	  By always zeroing the page in-line with the fault, no
2930 	 *	  dynamic ram reads are needed and the caches are hot, ready
2931 	 *	  for userland to access the memory.
2932 	 */
2933 	if (m->valid == 0) {
2934 		if (allocflags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
2935 			pmap_zero_page(VM_PAGE_TO_PHYS(m));
2936 			m->valid = VM_PAGE_BITS_ALL;
2937 		}
2938 	} else if (allocflags & VM_ALLOC_FORCE_ZERO) {
2939 		pmap_zero_page(VM_PAGE_TO_PHYS(m));
2940 		m->valid = VM_PAGE_BITS_ALL;
2941 	}
2942 failed:
2943 	vm_object_drop(object);
2944 	return(m);
2945 }
2946 
2947 /*
2948  * Mapping function for valid bits or for dirty bits in
2949  * a page.  May not block.
2950  *
2951  * Inputs are required to range within a page.
2952  *
2953  * No requirements.
2954  * Non blocking.
2955  */
2956 int
2957 vm_page_bits(int base, int size)
2958 {
2959 	int first_bit;
2960 	int last_bit;
2961 
2962 	KASSERT(
2963 	    base + size <= PAGE_SIZE,
2964 	    ("vm_page_bits: illegal base/size %d/%d", base, size)
2965 	);
2966 
2967 	if (size == 0)		/* handle degenerate case */
2968 		return(0);
2969 
2970 	first_bit = base >> DEV_BSHIFT;
2971 	last_bit = (base + size - 1) >> DEV_BSHIFT;
2972 
2973 	return ((2 << last_bit) - (1 << first_bit));
2974 }
2975 
2976 /*
2977  * Sets portions of a page valid and clean.  The arguments are expected
2978  * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2979  * of any partial chunks touched by the range.  The invalid portion of
2980  * such chunks will be zero'd.
2981  *
2982  * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
2983  *	 align base to DEV_BSIZE so as not to mark clean a partially
2984  *	 truncated device block.  Otherwise the dirty page status might be
2985  *	 lost.
2986  *
2987  * This routine may not block.
2988  *
2989  * (base + size) must be less then or equal to PAGE_SIZE.
2990  */
2991 static void
2992 _vm_page_zero_valid(vm_page_t m, int base, int size)
2993 {
2994 	int frag;
2995 	int endoff;
2996 
2997 	if (size == 0)	/* handle degenerate case */
2998 		return;
2999 
3000 	/*
3001 	 * If the base is not DEV_BSIZE aligned and the valid
3002 	 * bit is clear, we have to zero out a portion of the
3003 	 * first block.
3004 	 */
3005 
3006 	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
3007 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
3008 	) {
3009 		pmap_zero_page_area(
3010 		    VM_PAGE_TO_PHYS(m),
3011 		    frag,
3012 		    base - frag
3013 		);
3014 	}
3015 
3016 	/*
3017 	 * If the ending offset is not DEV_BSIZE aligned and the
3018 	 * valid bit is clear, we have to zero out a portion of
3019 	 * the last block.
3020 	 */
3021 
3022 	endoff = base + size;
3023 
3024 	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
3025 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
3026 	) {
3027 		pmap_zero_page_area(
3028 		    VM_PAGE_TO_PHYS(m),
3029 		    endoff,
3030 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
3031 		);
3032 	}
3033 }
3034 
3035 /*
3036  * Set valid, clear dirty bits.  If validating the entire
3037  * page we can safely clear the pmap modify bit.  We also
3038  * use this opportunity to clear the PG_NOSYNC flag.  If a process
3039  * takes a write fault on a MAP_NOSYNC memory area the flag will
3040  * be set again.
3041  *
3042  * We set valid bits inclusive of any overlap, but we can only
3043  * clear dirty bits for DEV_BSIZE chunks that are fully within
3044  * the range.
3045  *
3046  * Page must be busied?
3047  * No other requirements.
3048  */
3049 void
3050 vm_page_set_valid(vm_page_t m, int base, int size)
3051 {
3052 	_vm_page_zero_valid(m, base, size);
3053 	m->valid |= vm_page_bits(base, size);
3054 }
3055 
3056 
3057 /*
3058  * Set valid bits and clear dirty bits.
3059  *
3060  * Page must be busied by caller.
3061  *
3062  * NOTE: This function does not clear the pmap modified bit.
3063  *	 Also note that e.g. NFS may use a byte-granular base
3064  *	 and size.
3065  *
3066  * No other requirements.
3067  */
3068 void
3069 vm_page_set_validclean(vm_page_t m, int base, int size)
3070 {
3071 	int pagebits;
3072 
3073 	_vm_page_zero_valid(m, base, size);
3074 	pagebits = vm_page_bits(base, size);
3075 	m->valid |= pagebits;
3076 	m->dirty &= ~pagebits;
3077 	if (base == 0 && size == PAGE_SIZE) {
3078 		/*pmap_clear_modify(m);*/
3079 		vm_page_flag_clear(m, PG_NOSYNC);
3080 	}
3081 }
3082 
3083 /*
3084  * Set valid & dirty.  Used by buwrite()
3085  *
3086  * Page must be busied by caller.
3087  */
3088 void
3089 vm_page_set_validdirty(vm_page_t m, int base, int size)
3090 {
3091 	int pagebits;
3092 
3093 	pagebits = vm_page_bits(base, size);
3094 	m->valid |= pagebits;
3095 	m->dirty |= pagebits;
3096 	if (m->object)
3097 	       vm_object_set_writeable_dirty(m->object);
3098 }
3099 
3100 /*
3101  * Clear dirty bits.
3102  *
3103  * NOTE: This function does not clear the pmap modified bit.
3104  *	 Also note that e.g. NFS may use a byte-granular base
3105  *	 and size.
3106  *
3107  * Page must be busied?
3108  * No other requirements.
3109  */
3110 void
3111 vm_page_clear_dirty(vm_page_t m, int base, int size)
3112 {
3113 	m->dirty &= ~vm_page_bits(base, size);
3114 	if (base == 0 && size == PAGE_SIZE) {
3115 		/*pmap_clear_modify(m);*/
3116 		vm_page_flag_clear(m, PG_NOSYNC);
3117 	}
3118 }
3119 
3120 /*
3121  * Make the page all-dirty.
3122  *
3123  * Also make sure the related object and vnode reflect the fact that the
3124  * object may now contain a dirty page.
3125  *
3126  * Page must be busied?
3127  * No other requirements.
3128  */
3129 void
3130 vm_page_dirty(vm_page_t m)
3131 {
3132 #ifdef INVARIANTS
3133         int pqtype = m->queue - m->pc;
3134 #endif
3135         KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
3136                 ("vm_page_dirty: page in free/cache queue!"));
3137 	if (m->dirty != VM_PAGE_BITS_ALL) {
3138 		m->dirty = VM_PAGE_BITS_ALL;
3139 		if (m->object)
3140 			vm_object_set_writeable_dirty(m->object);
3141 	}
3142 }
3143 
3144 /*
3145  * Invalidates DEV_BSIZE'd chunks within a page.  Both the
3146  * valid and dirty bits for the effected areas are cleared.
3147  *
3148  * Page must be busied?
3149  * Does not block.
3150  * No other requirements.
3151  */
3152 void
3153 vm_page_set_invalid(vm_page_t m, int base, int size)
3154 {
3155 	int bits;
3156 
3157 	bits = vm_page_bits(base, size);
3158 	m->valid &= ~bits;
3159 	m->dirty &= ~bits;
3160 	atomic_add_int(&m->object->generation, 1);
3161 }
3162 
3163 /*
3164  * The kernel assumes that the invalid portions of a page contain
3165  * garbage, but such pages can be mapped into memory by user code.
3166  * When this occurs, we must zero out the non-valid portions of the
3167  * page so user code sees what it expects.
3168  *
3169  * Pages are most often semi-valid when the end of a file is mapped
3170  * into memory and the file's size is not page aligned.
3171  *
3172  * Page must be busied?
3173  * No other requirements.
3174  */
3175 void
3176 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
3177 {
3178 	int b;
3179 	int i;
3180 
3181 	/*
3182 	 * Scan the valid bits looking for invalid sections that
3183 	 * must be zerod.  Invalid sub-DEV_BSIZE'd areas ( where the
3184 	 * valid bit may be set ) have already been zerod by
3185 	 * vm_page_set_validclean().
3186 	 */
3187 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
3188 		if (i == (PAGE_SIZE / DEV_BSIZE) ||
3189 		    (m->valid & (1 << i))
3190 		) {
3191 			if (i > b) {
3192 				pmap_zero_page_area(
3193 				    VM_PAGE_TO_PHYS(m),
3194 				    b << DEV_BSHIFT,
3195 				    (i - b) << DEV_BSHIFT
3196 				);
3197 			}
3198 			b = i + 1;
3199 		}
3200 	}
3201 
3202 	/*
3203 	 * setvalid is TRUE when we can safely set the zero'd areas
3204 	 * as being valid.  We can do this if there are no cache consistency
3205 	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
3206 	 */
3207 	if (setvalid)
3208 		m->valid = VM_PAGE_BITS_ALL;
3209 }
3210 
3211 /*
3212  * Is a (partial) page valid?  Note that the case where size == 0
3213  * will return FALSE in the degenerate case where the page is entirely
3214  * invalid, and TRUE otherwise.
3215  *
3216  * Does not block.
3217  * No other requirements.
3218  */
3219 int
3220 vm_page_is_valid(vm_page_t m, int base, int size)
3221 {
3222 	int bits = vm_page_bits(base, size);
3223 
3224 	if (m->valid && ((m->valid & bits) == bits))
3225 		return 1;
3226 	else
3227 		return 0;
3228 }
3229 
3230 /*
3231  * update dirty bits from pmap/mmu.  May not block.
3232  *
3233  * Caller must hold the page busy
3234  */
3235 void
3236 vm_page_test_dirty(vm_page_t m)
3237 {
3238 	if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
3239 		vm_page_dirty(m);
3240 	}
3241 }
3242 
3243 #include "opt_ddb.h"
3244 #ifdef DDB
3245 #include <ddb/ddb.h>
3246 
3247 DB_SHOW_COMMAND(page, vm_page_print_page_info)
3248 {
3249 	db_printf("vmstats.v_free_count: %ld\n", vmstats.v_free_count);
3250 	db_printf("vmstats.v_cache_count: %ld\n", vmstats.v_cache_count);
3251 	db_printf("vmstats.v_inactive_count: %ld\n", vmstats.v_inactive_count);
3252 	db_printf("vmstats.v_active_count: %ld\n", vmstats.v_active_count);
3253 	db_printf("vmstats.v_wire_count: %ld\n", vmstats.v_wire_count);
3254 	db_printf("vmstats.v_free_reserved: %ld\n", vmstats.v_free_reserved);
3255 	db_printf("vmstats.v_free_min: %ld\n", vmstats.v_free_min);
3256 	db_printf("vmstats.v_free_target: %ld\n", vmstats.v_free_target);
3257 	db_printf("vmstats.v_cache_min: %ld\n", vmstats.v_cache_min);
3258 	db_printf("vmstats.v_inactive_target: %ld\n",
3259 		  vmstats.v_inactive_target);
3260 }
3261 
3262 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
3263 {
3264 	int i;
3265 	db_printf("PQ_FREE:");
3266 	for (i = 0; i < PQ_L2_SIZE; i++) {
3267 		db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
3268 	}
3269 	db_printf("\n");
3270 
3271 	db_printf("PQ_CACHE:");
3272 	for(i = 0; i < PQ_L2_SIZE; i++) {
3273 		db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
3274 	}
3275 	db_printf("\n");
3276 
3277 	db_printf("PQ_ACTIVE:");
3278 	for(i = 0; i < PQ_L2_SIZE; i++) {
3279 		db_printf(" %d", vm_page_queues[PQ_ACTIVE + i].lcnt);
3280 	}
3281 	db_printf("\n");
3282 
3283 	db_printf("PQ_INACTIVE:");
3284 	for(i = 0; i < PQ_L2_SIZE; i++) {
3285 		db_printf(" %d", vm_page_queues[PQ_INACTIVE + i].lcnt);
3286 	}
3287 	db_printf("\n");
3288 }
3289 #endif /* DDB */
3290