xref: /dragonfly/sys/vm/vm_page.c (revision 9348a738)
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
33  * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
34  */
35 
36 /*
37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38  * All rights reserved.
39  *
40  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41  *
42  * Permission to use, copy, modify and distribute this software and
43  * its documentation is hereby granted, provided that both the copyright
44  * notice and this permission notice appear in all copies of the
45  * software, derivative works or modified versions, and any portions
46  * thereof, and that both notices appear in supporting documentation.
47  *
48  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51  *
52  * Carnegie Mellon requests users of this software to return to
53  *
54  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
55  *  School of Computer Science
56  *  Carnegie Mellon University
57  *  Pittsburgh PA 15213-3890
58  *
59  * any improvements or extensions that they make and grant Carnegie the
60  * rights to redistribute these changes.
61  */
62 /*
63  * Resident memory management module.  The module manipulates 'VM pages'.
64  * A VM page is the core building block for memory management.
65  */
66 
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/malloc.h>
70 #include <sys/proc.h>
71 #include <sys/vmmeter.h>
72 #include <sys/vnode.h>
73 #include <sys/kernel.h>
74 #include <sys/alist.h>
75 #include <sys/sysctl.h>
76 #include <sys/cpu_topology.h>
77 
78 #include <vm/vm.h>
79 #include <vm/vm_param.h>
80 #include <sys/lock.h>
81 #include <vm/vm_kern.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_pager.h>
88 #include <vm/vm_extern.h>
89 #include <vm/swap_pager.h>
90 
91 #include <machine/inttypes.h>
92 #include <machine/md_var.h>
93 #include <machine/specialreg.h>
94 
95 #include <vm/vm_page2.h>
96 #include <sys/spinlock2.h>
97 
98 /*
99  * Action hash for user umtx support.
100  */
101 #define VMACTION_HSIZE		256
102 #define VMACTION_HMASK		(VMACTION_HSIZE - 1)
103 
104 /*
105  * SET - Minimum required set associative size, must be a power of 2.  We
106  *	 want this to match or exceed the set-associativeness of the cpu.
107  *
108  * GRP - A larger set that allows bleed-over into the domains of other
109  *	 nearby cpus.  Also must be a power of 2.  Used by the page zeroing
110  *	 code to smooth things out a bit.
111  */
112 #define PQ_SET_ASSOC		16
113 #define PQ_SET_ASSOC_MASK	(PQ_SET_ASSOC - 1)
114 
115 #define PQ_GRP_ASSOC		(PQ_SET_ASSOC * 2)
116 #define PQ_GRP_ASSOC_MASK	(PQ_GRP_ASSOC - 1)
117 
118 static void vm_page_queue_init(void);
119 static void vm_page_free_wakeup(void);
120 static vm_page_t vm_page_select_cache(u_short pg_color);
121 static vm_page_t _vm_page_list_find2(int basequeue, int index);
122 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
123 
124 /*
125  * Array of tailq lists
126  */
127 __cachealign struct vpgqueues vm_page_queues[PQ_COUNT];
128 
129 LIST_HEAD(vm_page_action_list, vm_page_action);
130 
131 struct vm_page_action_hash {
132 	struct vm_page_action_list list;
133 	struct lock	lk;
134 } __cachealign;
135 
136 struct vm_page_action_hash	action_hash[VMACTION_HSIZE];
137 static volatile int vm_pages_waiting;
138 
139 static struct alist vm_contig_alist;
140 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
141 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
142 
143 static u_long vm_dma_reserved = 0;
144 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
145 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
146 	    "Memory reserved for DMA");
147 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
148 	    &vm_contig_alist.bl_free, 0, "Free pages in the DMA reserve");
149 
150 static int vm_contig_verbose = 0;
151 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
152 
153 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
154 	     vm_pindex_t, pindex);
155 
156 static void
157 vm_page_queue_init(void)
158 {
159 	int i;
160 
161 	for (i = 0; i < PQ_L2_SIZE; i++)
162 		vm_page_queues[PQ_FREE+i].cnt_offset =
163 			offsetof(struct vmstats, v_free_count);
164 	for (i = 0; i < PQ_L2_SIZE; i++)
165 		vm_page_queues[PQ_CACHE+i].cnt_offset =
166 			offsetof(struct vmstats, v_cache_count);
167 	for (i = 0; i < PQ_L2_SIZE; i++)
168 		vm_page_queues[PQ_INACTIVE+i].cnt_offset =
169 			offsetof(struct vmstats, v_inactive_count);
170 	for (i = 0; i < PQ_L2_SIZE; i++)
171 		vm_page_queues[PQ_ACTIVE+i].cnt_offset =
172 			offsetof(struct vmstats, v_active_count);
173 	for (i = 0; i < PQ_L2_SIZE; i++)
174 		vm_page_queues[PQ_HOLD+i].cnt_offset =
175 			offsetof(struct vmstats, v_active_count);
176 	/* PQ_NONE has no queue */
177 
178 	for (i = 0; i < PQ_COUNT; i++) {
179 		TAILQ_INIT(&vm_page_queues[i].pl);
180 		spin_init(&vm_page_queues[i].spin, "vm_page_queue_init");
181 	}
182 
183 	/*
184 	 * NOTE: Action lock might recurse due to callback, so allow
185 	 *	 recursion.
186 	 */
187 	for (i = 0; i < VMACTION_HSIZE; i++) {
188 		LIST_INIT(&action_hash[i].list);
189 		lockinit(&action_hash[i].lk, "actlk", 0, LK_CANRECURSE);
190 	}
191 }
192 
193 /*
194  * note: place in initialized data section?  Is this necessary?
195  */
196 long first_page = 0;
197 int vm_page_array_size = 0;
198 vm_page_t vm_page_array = NULL;
199 vm_paddr_t vm_low_phys_reserved;
200 
201 /*
202  * (low level boot)
203  *
204  * Sets the page size, perhaps based upon the memory size.
205  * Must be called before any use of page-size dependent functions.
206  */
207 void
208 vm_set_page_size(void)
209 {
210 	if (vmstats.v_page_size == 0)
211 		vmstats.v_page_size = PAGE_SIZE;
212 	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
213 		panic("vm_set_page_size: page size not a power of two");
214 }
215 
216 /*
217  * (low level boot)
218  *
219  * Add a new page to the freelist for use by the system.  New pages
220  * are added to both the head and tail of the associated free page
221  * queue in a bottom-up fashion, so both zero'd and non-zero'd page
222  * requests pull 'recent' adds (higher physical addresses) first.
223  *
224  * Beware that the page zeroing daemon will also be running soon after
225  * boot, moving pages from the head to the tail of the PQ_FREE queues.
226  *
227  * Must be called in a critical section.
228  */
229 static void
230 vm_add_new_page(vm_paddr_t pa)
231 {
232 	struct vpgqueues *vpq;
233 	vm_page_t m;
234 
235 	m = PHYS_TO_VM_PAGE(pa);
236 	m->phys_addr = pa;
237 	m->flags = 0;
238 	m->pat_mode = PAT_WRITE_BACK;
239 	m->pc = (pa >> PAGE_SHIFT);
240 
241 	/*
242 	 * Twist for cpu localization in addition to page coloring, so
243 	 * different cpus selecting by m->queue get different page colors.
244 	 */
245 	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
246 	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
247 	m->pc &= PQ_L2_MASK;
248 
249 	/*
250 	 * Reserve a certain number of contiguous low memory pages for
251 	 * contigmalloc() to use.
252 	 */
253 	if (pa < vm_low_phys_reserved) {
254 		atomic_add_int(&vmstats.v_page_count, 1);
255 		atomic_add_int(&vmstats.v_dma_pages, 1);
256 		m->queue = PQ_NONE;
257 		m->wire_count = 1;
258 		atomic_add_int(&vmstats.v_wire_count, 1);
259 		alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
260 		return;
261 	}
262 
263 	/*
264 	 * General page
265 	 */
266 	m->queue = m->pc + PQ_FREE;
267 	KKASSERT(m->dirty == 0);
268 
269 	atomic_add_int(&vmstats.v_page_count, 1);
270 	atomic_add_int(&vmstats.v_free_count, 1);
271 	vpq = &vm_page_queues[m->queue];
272 	TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
273 	++vpq->lcnt;
274 }
275 
276 /*
277  * (low level boot)
278  *
279  * Initializes the resident memory module.
280  *
281  * Preallocates memory for critical VM structures and arrays prior to
282  * kernel_map becoming available.
283  *
284  * Memory is allocated from (virtual2_start, virtual2_end) if available,
285  * otherwise memory is allocated from (virtual_start, virtual_end).
286  *
287  * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
288  * large enough to hold vm_page_array & other structures for machines with
289  * large amounts of ram, so we want to use virtual2* when available.
290  */
291 void
292 vm_page_startup(void)
293 {
294 	vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
295 	vm_offset_t mapped;
296 	vm_size_t npages;
297 	vm_paddr_t page_range;
298 	vm_paddr_t new_end;
299 	int i;
300 	vm_paddr_t pa;
301 	vm_paddr_t last_pa;
302 	vm_paddr_t end;
303 	vm_paddr_t biggestone, biggestsize;
304 	vm_paddr_t total;
305 	vm_page_t m;
306 
307 	total = 0;
308 	biggestsize = 0;
309 	biggestone = 0;
310 	vaddr = round_page(vaddr);
311 
312 	/*
313 	 * Make sure ranges are page-aligned.
314 	 */
315 	for (i = 0; phys_avail[i].phys_end; ++i) {
316 		phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
317 		phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
318 		if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
319 			phys_avail[i].phys_end = phys_avail[i].phys_beg;
320 	}
321 
322 	/*
323 	 * Locate largest block
324 	 */
325 	for (i = 0; phys_avail[i].phys_end; ++i) {
326 		vm_paddr_t size = phys_avail[i].phys_end -
327 				  phys_avail[i].phys_beg;
328 
329 		if (size > biggestsize) {
330 			biggestone = i;
331 			biggestsize = size;
332 		}
333 		total += size;
334 	}
335 	--i;	/* adjust to last entry for use down below */
336 
337 	end = phys_avail[biggestone].phys_end;
338 	end = trunc_page(end);
339 
340 	/*
341 	 * Initialize the queue headers for the free queue, the active queue
342 	 * and the inactive queue.
343 	 */
344 	vm_page_queue_init();
345 
346 #if !defined(_KERNEL_VIRTUAL)
347 	/*
348 	 * VKERNELs don't support minidumps and as such don't need
349 	 * vm_page_dump
350 	 *
351 	 * Allocate a bitmap to indicate that a random physical page
352 	 * needs to be included in a minidump.
353 	 *
354 	 * The amd64 port needs this to indicate which direct map pages
355 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
356 	 *
357 	 * However, i386 still needs this workspace internally within the
358 	 * minidump code.  In theory, they are not needed on i386, but are
359 	 * included should the sf_buf code decide to use them.
360 	 */
361 	page_range = phys_avail[i].phys_end / PAGE_SIZE;
362 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
363 	end -= vm_page_dump_size;
364 	vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
365 					VM_PROT_READ | VM_PROT_WRITE);
366 	bzero((void *)vm_page_dump, vm_page_dump_size);
367 #endif
368 	/*
369 	 * Compute the number of pages of memory that will be available for
370 	 * use (taking into account the overhead of a page structure per
371 	 * page).
372 	 */
373 	first_page = phys_avail[0].phys_beg / PAGE_SIZE;
374 	page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
375 	npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
376 
377 #ifndef _KERNEL_VIRTUAL
378 	/*
379 	 * (only applies to real kernels)
380 	 *
381 	 * Reserve a large amount of low memory for potential 32-bit DMA
382 	 * space allocations.  Once device initialization is complete we
383 	 * release most of it, but keep (vm_dma_reserved) memory reserved
384 	 * for later use.  Typically for X / graphics.  Through trial and
385 	 * error we find that GPUs usually require ~60-100MB or so.
386 	 *
387 	 * By default, 128M is left in reserve on machines with 2G+ of ram.
388 	 */
389 	vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
390 	if (vm_low_phys_reserved > total / 4)
391 		vm_low_phys_reserved = total / 4;
392 	if (vm_dma_reserved == 0) {
393 		vm_dma_reserved = 128 * 1024 * 1024;	/* 128MB */
394 		if (vm_dma_reserved > total / 16)
395 			vm_dma_reserved = total / 16;
396 	}
397 #endif
398 	alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
399 		   ALIST_RECORDS_65536);
400 
401 	/*
402 	 * Initialize the mem entry structures now, and put them in the free
403 	 * queue.
404 	 */
405 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
406 	mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
407 	vm_page_array = (vm_page_t)mapped;
408 
409 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
410 	/*
411 	 * since pmap_map on amd64 returns stuff out of a direct-map region,
412 	 * we have to manually add these pages to the minidump tracking so
413 	 * that they can be dumped, including the vm_page_array.
414 	 */
415 	for (pa = new_end;
416 	     pa < phys_avail[biggestone].phys_end;
417 	     pa += PAGE_SIZE) {
418 		dump_add_page(pa);
419 	}
420 #endif
421 
422 	/*
423 	 * Clear all of the page structures, run basic initialization so
424 	 * PHYS_TO_VM_PAGE() operates properly even on pages not in the
425 	 * map.
426 	 */
427 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
428 	vm_page_array_size = page_range;
429 
430 	m = &vm_page_array[0];
431 	pa = ptoa(first_page);
432 	for (i = 0; i < page_range; ++i) {
433 		spin_init(&m->spin, "vm_page");
434 		m->phys_addr = pa;
435 		pa += PAGE_SIZE;
436 		++m;
437 	}
438 
439 	/*
440 	 * Construct the free queue(s) in ascending order (by physical
441 	 * address) so that the first 16MB of physical memory is allocated
442 	 * last rather than first.  On large-memory machines, this avoids
443 	 * the exhaustion of low physical memory before isa_dmainit has run.
444 	 */
445 	vmstats.v_page_count = 0;
446 	vmstats.v_free_count = 0;
447 	for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
448 		pa = phys_avail[i].phys_beg;
449 		if (i == biggestone)
450 			last_pa = new_end;
451 		else
452 			last_pa = phys_avail[i].phys_end;
453 		while (pa < last_pa && npages-- > 0) {
454 			vm_add_new_page(pa);
455 			pa += PAGE_SIZE;
456 		}
457 	}
458 	if (virtual2_start)
459 		virtual2_start = vaddr;
460 	else
461 		virtual_start = vaddr;
462 	mycpu->gd_vmstats = vmstats;
463 }
464 
465 /*
466  * Reorganize VM pages based on numa data.  May be called as many times as
467  * necessary.  Will reorganize the vm_page_t page color and related queue(s)
468  * to allow vm_page_alloc() to choose pages based on socket affinity.
469  *
470  * NOTE: This function is only called while we are still in UP mode, so
471  *	 we only need a critical section to protect the queues (which
472  *	 saves a lot of time, there are likely a ton of pages).
473  */
474 void
475 vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
476 {
477 	vm_paddr_t scan_beg;
478 	vm_paddr_t scan_end;
479 	vm_paddr_t ran_end;
480 	struct vpgqueues *vpq;
481 	vm_page_t m;
482 	vm_page_t mend;
483 	int i;
484 	int socket_mod;
485 	int socket_value;
486 
487 	/*
488 	 * If there is no physical topology information, or there is only
489 	 * one socket, there is nothing to do.
490 	 */
491 	if (cpu_topology_phys_ids <= 1 ||
492 	    cpu_topology_core_ids == 0) {
493 		return;
494 	}
495 
496 	/*
497 	 * Setup for our iteration.  Note that ACPI may iterate CPU
498 	 * sockets starting at 0 or 1 or some other number.  The
499 	 * cpu_topology code mod's it against the socket count.
500 	 */
501 	ran_end = ran_beg + bytes;
502 	physid %= cpu_topology_phys_ids;
503 
504 	socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
505 	socket_value = physid * socket_mod;
506 	mend = &vm_page_array[vm_page_array_size];
507 
508 	crit_enter();
509 
510 	/*
511 	 * Adjust vm_page->pc and requeue all affected pages.  The
512 	 * allocator will then be able to localize memory allocations
513 	 * to some degree.
514 	 */
515 	for (i = 0; phys_avail[i].phys_end; ++i) {
516 		scan_beg = phys_avail[i].phys_beg;
517 		scan_end = phys_avail[i].phys_end;
518 		if (scan_end <= ran_beg)
519 			continue;
520 		if (scan_beg >= ran_end)
521 			continue;
522 		if (scan_beg < ran_beg)
523 			scan_beg = ran_beg;
524 		if (scan_end > ran_end)
525 			scan_end = ran_end;
526 		if (atop(scan_end) > first_page + vm_page_array_size)
527 			scan_end = ptoa(first_page + vm_page_array_size);
528 
529 		m = PHYS_TO_VM_PAGE(scan_beg);
530 		while (scan_beg < scan_end) {
531 			KKASSERT(m < mend);
532 			if (m->queue != PQ_NONE) {
533 				vpq = &vm_page_queues[m->queue];
534 				TAILQ_REMOVE(&vpq->pl, m, pageq);
535 				--vpq->lcnt;
536 				/* queue doesn't change, no need to adj cnt */
537 				m->queue -= m->pc;
538 				m->pc %= socket_mod;
539 				m->pc += socket_value;
540 				m->pc &= PQ_L2_MASK;
541 				m->queue += m->pc;
542 				vpq = &vm_page_queues[m->queue];
543 				TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
544 				++vpq->lcnt;
545 				/* queue doesn't change, no need to adj cnt */
546 			} else {
547 				m->pc %= socket_mod;
548 				m->pc += socket_value;
549 				m->pc &= PQ_L2_MASK;
550 			}
551 			scan_beg += PAGE_SIZE;
552 			++m;
553 		}
554 	}
555 	crit_exit();
556 }
557 
558 /*
559  * We tended to reserve a ton of memory for contigmalloc().  Now that most
560  * drivers have initialized we want to return most of the remaining free
561  * reserve back to the VM page queues so they can be used for normal
562  * allocations.
563  *
564  * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
565  */
566 static void
567 vm_page_startup_finish(void *dummy __unused)
568 {
569 	alist_blk_t blk;
570 	alist_blk_t rblk;
571 	alist_blk_t count;
572 	alist_blk_t xcount;
573 	alist_blk_t bfree;
574 	vm_page_t m;
575 
576 	spin_lock(&vm_contig_spin);
577 	for (;;) {
578 		bfree = alist_free_info(&vm_contig_alist, &blk, &count);
579 		if (bfree <= vm_dma_reserved / PAGE_SIZE)
580 			break;
581 		if (count == 0)
582 			break;
583 
584 		/*
585 		 * Figure out how much of the initial reserve we have to
586 		 * free in order to reach our target.
587 		 */
588 		bfree -= vm_dma_reserved / PAGE_SIZE;
589 		if (count > bfree) {
590 			blk += count - bfree;
591 			count = bfree;
592 		}
593 
594 		/*
595 		 * Calculate the nearest power of 2 <= count.
596 		 */
597 		for (xcount = 1; xcount <= count; xcount <<= 1)
598 			;
599 		xcount >>= 1;
600 		blk += count - xcount;
601 		count = xcount;
602 
603 		/*
604 		 * Allocate the pages from the alist, then free them to
605 		 * the normal VM page queues.
606 		 *
607 		 * Pages allocated from the alist are wired.  We have to
608 		 * busy, unwire, and free them.  We must also adjust
609 		 * vm_low_phys_reserved before freeing any pages to prevent
610 		 * confusion.
611 		 */
612 		rblk = alist_alloc(&vm_contig_alist, blk, count);
613 		if (rblk != blk) {
614 			kprintf("vm_page_startup_finish: Unable to return "
615 				"dma space @0x%08x/%d -> 0x%08x\n",
616 				blk, count, rblk);
617 			break;
618 		}
619 		atomic_add_int(&vmstats.v_dma_pages, -count);
620 		spin_unlock(&vm_contig_spin);
621 
622 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
623 		vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
624 		while (count) {
625 			vm_page_busy_wait(m, FALSE, "cpgfr");
626 			vm_page_unwire(m, 0);
627 			vm_page_free(m);
628 			--count;
629 			++m;
630 		}
631 		spin_lock(&vm_contig_spin);
632 	}
633 	spin_unlock(&vm_contig_spin);
634 
635 	/*
636 	 * Print out how much DMA space drivers have already allocated and
637 	 * how much is left over.
638 	 */
639 	kprintf("DMA space used: %jdk, remaining available: %jdk\n",
640 		(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
641 		(PAGE_SIZE / 1024),
642 		(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
643 }
644 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
645 	vm_page_startup_finish, NULL);
646 
647 
648 /*
649  * Scan comparison function for Red-Black tree scans.  An inclusive
650  * (start,end) is expected.  Other fields are not used.
651  */
652 int
653 rb_vm_page_scancmp(struct vm_page *p, void *data)
654 {
655 	struct rb_vm_page_scan_info *info = data;
656 
657 	if (p->pindex < info->start_pindex)
658 		return(-1);
659 	if (p->pindex > info->end_pindex)
660 		return(1);
661 	return(0);
662 }
663 
664 int
665 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
666 {
667 	if (p1->pindex < p2->pindex)
668 		return(-1);
669 	if (p1->pindex > p2->pindex)
670 		return(1);
671 	return(0);
672 }
673 
674 void
675 vm_page_init(vm_page_t m)
676 {
677 	/* do nothing for now.  Called from pmap_page_init() */
678 }
679 
680 /*
681  * Each page queue has its own spin lock, which is fairly optimal for
682  * allocating and freeing pages at least.
683  *
684  * The caller must hold the vm_page_spin_lock() before locking a vm_page's
685  * queue spinlock via this function.  Also note that m->queue cannot change
686  * unless both the page and queue are locked.
687  */
688 static __inline
689 void
690 _vm_page_queue_spin_lock(vm_page_t m)
691 {
692 	u_short queue;
693 
694 	queue = m->queue;
695 	if (queue != PQ_NONE) {
696 		spin_lock(&vm_page_queues[queue].spin);
697 		KKASSERT(queue == m->queue);
698 	}
699 }
700 
701 static __inline
702 void
703 _vm_page_queue_spin_unlock(vm_page_t m)
704 {
705 	u_short queue;
706 
707 	queue = m->queue;
708 	cpu_ccfence();
709 	if (queue != PQ_NONE)
710 		spin_unlock(&vm_page_queues[queue].spin);
711 }
712 
713 static __inline
714 void
715 _vm_page_queues_spin_lock(u_short queue)
716 {
717 	cpu_ccfence();
718 	if (queue != PQ_NONE)
719 		spin_lock(&vm_page_queues[queue].spin);
720 }
721 
722 
723 static __inline
724 void
725 _vm_page_queues_spin_unlock(u_short queue)
726 {
727 	cpu_ccfence();
728 	if (queue != PQ_NONE)
729 		spin_unlock(&vm_page_queues[queue].spin);
730 }
731 
732 void
733 vm_page_queue_spin_lock(vm_page_t m)
734 {
735 	_vm_page_queue_spin_lock(m);
736 }
737 
738 void
739 vm_page_queues_spin_lock(u_short queue)
740 {
741 	_vm_page_queues_spin_lock(queue);
742 }
743 
744 void
745 vm_page_queue_spin_unlock(vm_page_t m)
746 {
747 	_vm_page_queue_spin_unlock(m);
748 }
749 
750 void
751 vm_page_queues_spin_unlock(u_short queue)
752 {
753 	_vm_page_queues_spin_unlock(queue);
754 }
755 
756 /*
757  * This locks the specified vm_page and its queue in the proper order
758  * (page first, then queue).  The queue may change so the caller must
759  * recheck on return.
760  */
761 static __inline
762 void
763 _vm_page_and_queue_spin_lock(vm_page_t m)
764 {
765 	vm_page_spin_lock(m);
766 	_vm_page_queue_spin_lock(m);
767 }
768 
769 static __inline
770 void
771 _vm_page_and_queue_spin_unlock(vm_page_t m)
772 {
773 	_vm_page_queues_spin_unlock(m->queue);
774 	vm_page_spin_unlock(m);
775 }
776 
777 void
778 vm_page_and_queue_spin_unlock(vm_page_t m)
779 {
780 	_vm_page_and_queue_spin_unlock(m);
781 }
782 
783 void
784 vm_page_and_queue_spin_lock(vm_page_t m)
785 {
786 	_vm_page_and_queue_spin_lock(m);
787 }
788 
789 /*
790  * Helper function removes vm_page from its current queue.
791  * Returns the base queue the page used to be on.
792  *
793  * The vm_page and the queue must be spinlocked.
794  * This function will unlock the queue but leave the page spinlocked.
795  */
796 static __inline u_short
797 _vm_page_rem_queue_spinlocked(vm_page_t m)
798 {
799 	struct vpgqueues *pq;
800 	u_short queue;
801 	u_short oqueue;
802 	int *cnt;
803 
804 	queue = m->queue;
805 	if (queue != PQ_NONE) {
806 		pq = &vm_page_queues[queue];
807 		TAILQ_REMOVE(&pq->pl, m, pageq);
808 
809 		/*
810 		 * Adjust our pcpu stats.  In order for the nominal low-memory
811 		 * algorithms to work properly we don't let any pcpu stat get
812 		 * too negative before we force it to be rolled-up into the
813 		 * global stats.  Otherwise our pageout and vm_wait tests
814 		 * will fail badly.
815 		 *
816 		 * The idea here is to reduce unnecessary SMP cache
817 		 * mastership changes in the global vmstats, which can be
818 		 * particularly bad in multi-socket systems.
819 		 */
820 		cnt = (int *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
821 		atomic_add_int(cnt, -1);
822 		if (*cnt < -VMMETER_SLOP_COUNT) {
823 			u_int copy = atomic_swap_int(cnt, 0);
824 			cnt = (int *)((char *)&vmstats + pq->cnt_offset);
825 			atomic_add_int(cnt, copy);
826 			cnt = (int *)((char *)&mycpu->gd_vmstats +
827 				      pq->cnt_offset);
828 			atomic_add_int(cnt, copy);
829 		}
830 		pq->lcnt--;
831 		m->queue = PQ_NONE;
832 		oqueue = queue;
833 		queue -= m->pc;
834 		vm_page_queues_spin_unlock(oqueue);	/* intended */
835 	}
836 	return queue;
837 }
838 
839 /*
840  * Helper function places the vm_page on the specified queue.  Generally
841  * speaking only PQ_FREE pages are placed at the head, to allow them to
842  * be allocated sooner rather than later on the assumption that they
843  * are cache-hot.
844  *
845  * The vm_page must be spinlocked.
846  * This function will return with both the page and the queue locked.
847  */
848 static __inline void
849 _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
850 {
851 	struct vpgqueues *pq;
852 	int *cnt;
853 
854 	KKASSERT(m->queue == PQ_NONE);
855 
856 	if (queue != PQ_NONE) {
857 		vm_page_queues_spin_lock(queue);
858 		pq = &vm_page_queues[queue];
859 		++pq->lcnt;
860 
861 		/*
862 		 * Adjust our pcpu stats.  If a system entity really needs
863 		 * to incorporate the count it will call vmstats_rollup()
864 		 * to roll it all up into the global vmstats structure.
865 		 */
866 		cnt = (int *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
867 		atomic_add_int(cnt, 1);
868 
869 		/*
870 		 * PQ_FREE is always handled LIFO style to try to provide
871 		 * cache-hot pages to programs.
872 		 */
873 		m->queue = queue;
874 		if (queue - m->pc == PQ_FREE) {
875 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
876 		} else if (athead) {
877 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
878 		} else {
879 			TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
880 		}
881 		/* leave the queue spinlocked */
882 	}
883 }
884 
885 /*
886  * Wait until page is no longer PG_BUSY or (if also_m_busy is TRUE)
887  * m->busy is zero.  At most one sleep call will be made before
888  * returning.
889  *
890  * This function does NOT busy the page and on return the page is not
891  * guaranteed to be available.
892  */
893 void
894 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
895 {
896 	u_int32_t flags;
897 
898 	for (;;) {
899 		flags = m->flags;
900 		cpu_ccfence();
901 
902 		if ((flags & PG_BUSY) == 0 &&
903 		    (also_m_busy == 0 || (flags & PG_SBUSY) == 0)) {
904 			break;
905 		}
906 		tsleep_interlock(m, 0);
907 		if (atomic_cmpset_int(&m->flags, flags,
908 				      flags | PG_WANTED | PG_REFERENCED)) {
909 			tsleep(m, PINTERLOCKED, msg, 0);
910 			break;
911 		}
912 	}
913 }
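
/*
 * Example usage (sketch only, not used by this file): a caller that merely
 * needs to wait out a hard-busied page, without acquiring PG_BUSY itself,
 * might do something like the following.  The wmesg string is arbitrary and
 * the page must be re-validated afterwards because this function does not
 * busy it on our behalf:
 *
 *	if (m->flags & PG_BUSY)
 *		vm_page_sleep_busy(m, FALSE, "pgwait");
 *	(re-check m->flags / m->object here, the page may have changed)
 */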
914 
915 /*
916  * This calculates and returns a page color given a cpu, an optional VM
917  * object, and a pindex.  We attempt to return a cpu-localized
918  * pg_color that is still roughly 16-way set-associative.  The CPU topology
919  * is used if it was probed.
920  *
921  * The caller may use the returned value to index into e.g. PQ_FREE when
922  * allocating a page in order to nominally obtain pages that are hopefully
923  * already localized to the requesting cpu.  This function is not able to
924  * provide any sort of guarantee of this, but does its best to improve
925  * hardware cache management performance.
926  *
927  * WARNING! The caller must mask the returned value with PQ_L2_MASK.
928  */
929 u_short
930 vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
931 {
932 	u_short pg_color;
933 	int phys_id;
934 	int core_id;
935 	int object_pg_color;
936 
937 	phys_id = get_cpu_phys_id(cpuid);
938 	core_id = get_cpu_core_id(cpuid);
939 	object_pg_color = object ? object->pg_color : 0;
940 
941 	if (cpu_topology_phys_ids && cpu_topology_core_ids) {
942 		int grpsize;
943 
944 		/*
945 		 * Break us down by socket and cpu
946 		 */
947 		pg_color = phys_id * PQ_L2_SIZE / cpu_topology_phys_ids;
948 		pg_color += core_id * PQ_L2_SIZE /
949 			    (cpu_topology_core_ids * cpu_topology_phys_ids);
950 
951 		/*
952 		 * Calculate remaining component for object/queue color
953 		 */
954 		grpsize = PQ_L2_SIZE / (cpu_topology_core_ids *
955 					cpu_topology_phys_ids);
956 		if (grpsize >= 8) {
957 			pg_color += (pindex + object_pg_color) % grpsize;
958 		} else {
959 			if (grpsize <= 2) {
960 				grpsize = 8;
961 			} else {
962 				/* 3->12, 4->8, 5->10, 6->12, 7->14 */
963 				grpsize += grpsize;
964 				if (grpsize < 8)
965 					grpsize += grpsize;
966 			}
967 			pg_color += (pindex + object_pg_color) % grpsize;
968 		}
969 	} else {
970 		/*
971 		 * Unknown topology, distribute things evenly.
972 		 */
973 		pg_color = cpuid * PQ_L2_SIZE / ncpus;
974 		pg_color += pindex + object_pg_color;
975 	}
976 	return (pg_color & PQ_L2_MASK);
977 }
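
/*
 * Example (sketch): a caller allocating from the free queues might localize
 * its search roughly as follows, mirroring what vm_page_select_free() does
 * further below.  The cpu id shown is simply the current cpu:
 *
 *	pg_color = vm_get_pg_color(mycpu->gd_cpuid, object, pindex);
 *	m = vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK);
 *	(m, if not NULL, is returned spinlocked and removed from its queue)
 */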
978 
979 /*
980  * Wait until PG_BUSY can be set, then set it.  If also_m_busy is TRUE we
981  * also wait for m->busy to become 0 before setting PG_BUSY.
982  */
983 void
984 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
985 				     int also_m_busy, const char *msg
986 				     VM_PAGE_DEBUG_ARGS)
987 {
988 	u_int32_t flags;
989 
990 	for (;;) {
991 		flags = m->flags;
992 		cpu_ccfence();
993 		if (flags & PG_BUSY) {
994 			tsleep_interlock(m, 0);
995 			if (atomic_cmpset_int(&m->flags, flags,
996 					  flags | PG_WANTED | PG_REFERENCED)) {
997 				tsleep(m, PINTERLOCKED, msg, 0);
998 			}
999 		} else if (also_m_busy && (flags & PG_SBUSY)) {
1000 			tsleep_interlock(m, 0);
1001 			if (atomic_cmpset_int(&m->flags, flags,
1002 					  flags | PG_WANTED | PG_REFERENCED)) {
1003 				tsleep(m, PINTERLOCKED, msg, 0);
1004 			}
1005 		} else {
1006 			if (atomic_cmpset_int(&m->flags, flags,
1007 					      flags | PG_BUSY)) {
1008 #ifdef VM_PAGE_DEBUG
1009 				m->busy_func = func;
1010 				m->busy_line = lineno;
1011 #endif
1012 				break;
1013 			}
1014 		}
1015 	}
1016 }
1017 
1018 /*
1019  * Attempt to set PG_BUSY.  If also_m_busy is TRUE we only succeed if m->busy
1020  * is also 0.
1021  *
1022  * Returns non-zero on failure.
1023  */
1024 int
1025 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
1026 				    VM_PAGE_DEBUG_ARGS)
1027 {
1028 	u_int32_t flags;
1029 
1030 	for (;;) {
1031 		flags = m->flags;
1032 		cpu_ccfence();
1033 		if (flags & PG_BUSY)
1034 			return TRUE;
1035 		if (also_m_busy && (flags & PG_SBUSY))
1036 			return TRUE;
1037 		if (atomic_cmpset_int(&m->flags, flags, flags | PG_BUSY)) {
1038 #ifdef VM_PAGE_DEBUG
1039 			m->busy_func = func;
1040 			m->busy_line = lineno;
1041 #endif
1042 			return FALSE;
1043 		}
1044 	}
1045 }
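
/*
 * Example (sketch): the typical non-blocking pattern pairs
 * vm_page_busy_try() with vm_page_sleep_busy(), which is essentially what
 * vm_page_busy_wait() does on the caller's behalf.  The wmesg string below
 * is arbitrary:
 *
 *	while (vm_page_busy_try(m, TRUE)) {
 *		vm_page_sleep_busy(m, TRUE, "mbwait");
 *		(re-validate the page here, it may have been freed or
 *		 repurposed while we slept)
 *	}
 */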
1046 
1047 /*
1048  * Clear the PG_BUSY flag and return non-zero to indicate to the caller
1049  * that a wakeup() should be performed.
1050  *
1051  * The vm_page must be spinlocked and will remain spinlocked on return.
1052  * The related queue must NOT be spinlocked (which could deadlock us).
1053  *
1054  * (inline version)
1055  */
1056 static __inline
1057 int
1058 _vm_page_wakeup(vm_page_t m)
1059 {
1060 	u_int32_t flags;
1061 
1062 	for (;;) {
1063 		flags = m->flags;
1064 		cpu_ccfence();
1065 		if (atomic_cmpset_int(&m->flags, flags,
1066 				      flags & ~(PG_BUSY | PG_WANTED))) {
1067 			break;
1068 		}
1069 	}
1070 	return(flags & PG_WANTED);
1071 }
1072 
1073 /*
1074  * Clear the PG_BUSY flag and wakeup anyone waiting for the page.  This
1075  * is typically the last call you make on a page before moving onto
1076  * other things.
1077  */
1078 void
1079 vm_page_wakeup(vm_page_t m)
1080 {
1081         KASSERT(m->flags & PG_BUSY, ("vm_page_wakeup: page not busy!!!"));
1082 	vm_page_spin_lock(m);
1083 	if (_vm_page_wakeup(m)) {
1084 		vm_page_spin_unlock(m);
1085 		wakeup(m);
1086 	} else {
1087 		vm_page_spin_unlock(m);
1088 	}
1089 }
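
/*
 * Example (sketch): PG_BUSY acquisition and release normally bracket any
 * manipulation of the page, e.g.:
 *
 *	vm_page_busy_wait(m, FALSE, "pgmod");
 *	(modify the page, e.g. set valid bits or dirty it)
 *	vm_page_wakeup(m);
 */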
1090 
1091 /*
1092  * Holding a page keeps it from being reused.  Other parts of the system
1093  * can still disassociate the page from its current object and free it, or
1094  * perform read or write I/O on it and/or otherwise manipulate the page,
1095  * but if the page is held the VM system will leave the page and its data
1096  * intact and not reuse the page for other purposes until the last hold
1097  * reference is released.  (see vm_page_wire() if you want to prevent the
1098  * page from being disassociated from its object too).
1099  *
1100  * The caller must still validate the contents of the page and, if necessary,
1101  * wait for any pending I/O (e.g. vm_page_sleep_busy() loop) to complete
1102  * before manipulating the page.
1103  *
1104  * XXX get vm_page_spin_lock() here and move FREE->HOLD if necessary
1105  */
1106 void
1107 vm_page_hold(vm_page_t m)
1108 {
1109 	vm_page_spin_lock(m);
1110 	atomic_add_int(&m->hold_count, 1);
1111 	if (m->queue - m->pc == PQ_FREE) {
1112 		_vm_page_queue_spin_lock(m);
1113 		_vm_page_rem_queue_spinlocked(m);
1114 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
1115 		_vm_page_queue_spin_unlock(m);
1116 	}
1117 	vm_page_spin_unlock(m);
1118 }
1119 
1120 /*
1121  * The opposite of vm_page_hold().  If the page is on the HOLD queue
1122  * it was freed while held and must be moved back to the FREE queue.
1123  */
1124 void
1125 vm_page_unhold(vm_page_t m)
1126 {
1127 	KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
1128 		("vm_page_unhold: pg %p illegal hold_count (%d) or on FREE queue (%d)",
1129 		 m, m->hold_count, m->queue - m->pc));
1130 	vm_page_spin_lock(m);
1131 	atomic_add_int(&m->hold_count, -1);
1132 	if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
1133 		_vm_page_queue_spin_lock(m);
1134 		_vm_page_rem_queue_spinlocked(m);
1135 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
1136 		_vm_page_queue_spin_unlock(m);
1137 	}
1138 	vm_page_spin_unlock(m);
1139 }
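
/*
 * Example (sketch): a hold is the light-weight way to keep a page and its
 * data from being reused across a temporary access that does not need
 * PG_BUSY:
 *
 *	vm_page_hold(m);
 *	(access the page contents; others may still manipulate the page
 *	 but it will not be reused out from under us)
 *	vm_page_unhold(m);
 */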
1140 
1141 /*
1142  *	vm_page_initfake:
1143  *
1144  *	Create a fictitious page with the specified physical address and
1145  *	memory attribute.  The memory attribute is the only machine-
1146  *	dependent aspect of a fictitious page that must be initialized.
1147  */
1148 
1149 void
1150 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1151 {
1152 
1153 	if ((m->flags & PG_FICTITIOUS) != 0) {
1154 		/*
1155 		 * The page's memattr might have changed since the
1156 		 * previous initialization.  Update the pmap to the
1157 		 * new memattr.
1158 		 */
1159 		goto memattr;
1160 	}
1161 	m->phys_addr = paddr;
1162 	m->queue = PQ_NONE;
1163 	/* Fictitious pages don't use "segind". */
1164 	/* Fictitious pages don't use "order" or "pool". */
1165 	m->flags = PG_FICTITIOUS | PG_UNMANAGED | PG_BUSY;
1166 	m->wire_count = 1;
1167 	spin_init(&m->spin, "fake_page");
1168 	pmap_page_init(m);
1169 memattr:
1170 	pmap_page_set_memattr(m, memattr);
1171 }
1172 
1173 /*
1174  * Inserts the given vm_page into the object and object list.
1175  *
1176  * The pagetables are not updated but will presumably fault the page
1177  * in if necessary, or if a kernel page the caller will at some point
1178  * enter the page into the kernel's pmap.  We are not allowed to block
1179  * here so we *can't* do this anyway.
1180  *
1181  * This routine may not block.
1182  * This routine must be called with the vm_object held.
1183  * This routine must be called with a critical section held.
1184  *
1185  * This routine returns TRUE if the page was inserted into the object
1186  * successfully, and FALSE if the page already exists in the object.
1187  */
1188 int
1189 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1190 {
1191 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
1192 	if (m->object != NULL)
1193 		panic("vm_page_insert: already inserted");
1194 
1195 	atomic_add_int(&object->generation, 1);
1196 
1197 	/*
1198 	 * Record the object/offset pair in this page and add the
1199 	 * pv_list_count of the page to the object.
1200 	 *
1201 	 * The vm_page spin lock is required for interactions with the pmap.
1202 	 */
1203 	vm_page_spin_lock(m);
1204 	m->object = object;
1205 	m->pindex = pindex;
1206 	if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
1207 		m->object = NULL;
1208 		m->pindex = 0;
1209 		vm_page_spin_unlock(m);
1210 		return FALSE;
1211 	}
1212 	++object->resident_page_count;
1213 	++mycpu->gd_vmtotal.t_rm;
1214 	vm_page_spin_unlock(m);
1215 
1216 	/*
1217 	 * Since we are inserting a new and possibly dirty page,
1218 	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
1219 	 */
1220 	if ((m->valid & m->dirty) ||
1221 	    (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
1222 		vm_object_set_writeable_dirty(object);
1223 
1224 	/*
1225 	 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
1226 	 */
1227 	swap_pager_page_inserted(m);
1228 	return TRUE;
1229 }
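
/*
 * Example (sketch): callers must be prepared for an insertion collision.
 * Assuming m was freshly allocated, is not wired, and is not associated
 * with another object, one common response is to release it and look up
 * the page that is already there:
 *
 *	if (vm_page_insert(m, object, pindex) == FALSE) {
 *		vm_page_free(m);
 *		(look up / busy the existing page at (object, pindex))
 *	}
 */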
1230 
1231 /*
1232  * Removes the given vm_page_t from the (object,index) table
1233  *
1234  * The underlying pmap entry (if any) is NOT removed here.
1235  * This routine may not block.
1236  *
1237  * The page must be BUSY and will remain BUSY on return.
1238  * No other requirements.
1239  *
1240  * NOTE: FreeBSD side effect was to unbusy the page on return.  We leave
1241  *	 it busy.
1242  */
1243 void
1244 vm_page_remove(vm_page_t m)
1245 {
1246 	vm_object_t object;
1247 
1248 	if (m->object == NULL) {
1249 		return;
1250 	}
1251 
1252 	if ((m->flags & PG_BUSY) == 0)
1253 		panic("vm_page_remove: page not busy");
1254 
1255 	object = m->object;
1256 
1257 	vm_object_hold(object);
1258 
1259 	/*
1260 	 * Remove the page from the object and update the object.
1261 	 *
1262 	 * The vm_page spin lock is required for interactions with the pmap.
1263 	 */
1264 	vm_page_spin_lock(m);
1265 	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
1266 	--object->resident_page_count;
1267 	--mycpu->gd_vmtotal.t_rm;
1268 	m->object = NULL;
1269 	atomic_add_int(&object->generation, 1);
1270 	vm_page_spin_unlock(m);
1271 
1272 	vm_object_drop(object);
1273 }
1274 
1275 /*
1276  * Locate and return the page at (object, pindex), or NULL if the
1277  * page could not be found.
1278  *
1279  * The caller must hold the vm_object token.
1280  */
1281 vm_page_t
1282 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1283 {
1284 	vm_page_t m;
1285 
1286 	/*
1287 	 * Search the hash table for this object/offset pair
1288 	 */
1289 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1290 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1291 	KKASSERT(m == NULL || (m->object == object && m->pindex == pindex));
1292 	return(m);
1293 }
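
/*
 * Example (sketch): vm_page_lookup() does not busy the page, so a typical
 * caller holds the object and then busies the result itself.  The wmesg
 * string is arbitrary:
 *
 *	vm_object_hold(object);
 *	m = vm_page_lookup(object, pindex);
 *	if (m)
 *		vm_page_busy_wait(m, FALSE, "pglkp");
 *	vm_object_drop(object);
 */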
1294 
1295 vm_page_t
1296 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
1297 					    vm_pindex_t pindex,
1298 					    int also_m_busy, const char *msg
1299 					    VM_PAGE_DEBUG_ARGS)
1300 {
1301 	u_int32_t flags;
1302 	vm_page_t m;
1303 
1304 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1305 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1306 	while (m) {
1307 		KKASSERT(m->object == object && m->pindex == pindex);
1308 		flags = m->flags;
1309 		cpu_ccfence();
1310 		if (flags & PG_BUSY) {
1311 			tsleep_interlock(m, 0);
1312 			if (atomic_cmpset_int(&m->flags, flags,
1313 					  flags | PG_WANTED | PG_REFERENCED)) {
1314 				tsleep(m, PINTERLOCKED, msg, 0);
1315 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1316 							      pindex);
1317 			}
1318 		} else if (also_m_busy && (flags & PG_SBUSY)) {
1319 			tsleep_interlock(m, 0);
1320 			if (atomic_cmpset_int(&m->flags, flags,
1321 					  flags | PG_WANTED | PG_REFERENCED)) {
1322 				tsleep(m, PINTERLOCKED, msg, 0);
1323 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1324 							      pindex);
1325 			}
1326 		} else if (atomic_cmpset_int(&m->flags, flags,
1327 					     flags | PG_BUSY)) {
1328 #ifdef VM_PAGE_DEBUG
1329 			m->busy_func = func;
1330 			m->busy_line = lineno;
1331 #endif
1332 			break;
1333 		}
1334 	}
1335 	return m;
1336 }
1337 
1338 /*
1339  * Attempt to lookup and busy a page.
1340  *
1341  * Returns NULL if the page could not be found
1342  *
1343  * Returns a vm_page and error == TRUE if the page exists but could not
1344  * be busied.
1345  *
1346  * Returns a vm_page and error == FALSE on success.
1347  */
1348 vm_page_t
1349 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
1350 					   vm_pindex_t pindex,
1351 					   int also_m_busy, int *errorp
1352 					   VM_PAGE_DEBUG_ARGS)
1353 {
1354 	u_int32_t flags;
1355 	vm_page_t m;
1356 
1357 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1358 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1359 	*errorp = FALSE;
1360 	while (m) {
1361 		KKASSERT(m->object == object && m->pindex == pindex);
1362 		flags = m->flags;
1363 		cpu_ccfence();
1364 		if (flags & PG_BUSY) {
1365 			*errorp = TRUE;
1366 			break;
1367 		}
1368 		if (also_m_busy && (flags & PG_SBUSY)) {
1369 			*errorp = TRUE;
1370 			break;
1371 		}
1372 		if (atomic_cmpset_int(&m->flags, flags, flags | PG_BUSY)) {
1373 #ifdef VM_PAGE_DEBUG
1374 			m->busy_func = func;
1375 			m->busy_line = lineno;
1376 #endif
1377 			break;
1378 		}
1379 	}
1380 	return m;
1381 }
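
/*
 * Example (sketch) of handling the error case: when the page exists but
 * could not be busied the caller usually sleeps on it and retries the
 * lookup (the variable name and wmesg are arbitrary):
 *
 *	int error;
 *
 *	m = vm_page_lookup_busy_try(object, pindex, FALSE, &error);
 *	if (m && error) {
 *		vm_page_sleep_busy(m, FALSE, "pgbsy");
 *		(retry the lookup, the page may have changed or gone away)
 *	}
 */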
1382 
1383 /*
1384  * Attempt to repurpose the passed-in page.  If the passed-in page cannot
1385  * be repurposed it will be released, *must_reenter will be set to 1, and
1386  * this function will fall-through to vm_page_lookup_busy_try().
1387  *
1388  * The passed-in page must be wired and not busy.  The returned page will
1389  * be busied and not wired.
1390  *
1391  * A different page may be returned.  The returned page will be busied and
1392  * not wired.
1393  *
1394  * NULL can be returned.  If so, the required page could not be busied.
1395  * The passed-in page will be unwired.
1396  */
1397 vm_page_t
1398 vm_page_repurpose(struct vm_object *object, vm_pindex_t pindex,
1399 		  int also_m_busy, int *errorp, vm_page_t m,
1400 		  int *must_reenter, int *iswired)
1401 {
1402 	if (m) {
1403 		/*
1404 		 * Do not mess with pages in a complex state, such as pages
1405 		 * which are mapped, as repurposing such pages can be more
1406 		 * expensive than simply allocating a new one.
1407 		 *
1408 		 * NOTE: Soft-busying can deadlock against putpages or I/O
1409 		 *	 so we only allow hard-busying here.
1410 		 */
1411 		KKASSERT(also_m_busy == FALSE);
1412 		vm_page_busy_wait(m, also_m_busy, "biodep");
1413 
1414 		if ((m->flags & (PG_UNMANAGED | PG_MAPPED |
1415 				 PG_FICTITIOUS | PG_SBUSY)) ||
1416 		    m->busy || m->wire_count != 1 || m->hold_count) {
1417 			vm_page_unwire(m, 0);
1418 			vm_page_wakeup(m);
1419 			/* fall through to normal lookup */
1420 		} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
1421 			vm_page_unwire(m, 0);
1422 			vm_page_deactivate(m);
1423 			vm_page_wakeup(m);
1424 			/* fall through to normal lookup */
1425 		} else {
1426 			/*
1427 			 * We can safely repurpose the page.  It should
1428 			 * already be unqueued.
1429 			 */
1430 			KKASSERT(m->queue == PQ_NONE && m->dirty == 0);
1431 			vm_page_remove(m);
1432 			m->valid = 0;
1433 			m->act_count = 0;
1434 			if (vm_page_insert(m, object, pindex)) {
1435 				*errorp = 0;
1436 				*iswired = 1;
1437 
1438 				return m;
1439 			}
1440 			vm_page_unwire(m, 0);
1441 			vm_page_free(m);
1442 			/* fall through to normal lookup */
1443 		}
1444 	}
1445 
1446 	/*
1447 	 * Cannot repurpose page, attempt to locate the desired page.  May
1448 	 * return NULL.
1449 	 */
1450 	*must_reenter = 1;
1451 	*iswired = 0;
1452 	m = vm_page_lookup_busy_try(object, pindex, also_m_busy, errorp);
1453 
1454 	return m;
1455 }
1456 
1457 /*
1458  * Caller must hold the related vm_object
1459  */
1460 vm_page_t
1461 vm_page_next(vm_page_t m)
1462 {
1463 	vm_page_t next;
1464 
1465 	next = vm_page_rb_tree_RB_NEXT(m);
1466 	if (next && next->pindex != m->pindex + 1)
1467 		next = NULL;
1468 	return (next);
1469 }
1470 
1471 /*
1472  * vm_page_rename()
1473  *
1474  * Move the given vm_page from its current object to the specified
1475  * target object/offset.  The page must be busy and will remain so
1476  * on return.
1477  *
1478  * new_object must be held.
1479  * This routine might block. XXX ?
1480  *
1481  * NOTE: Swap associated with the page must be invalidated by the move.  We
1482  *       have to do this for several reasons:  (1) we aren't freeing the
1483  *       page, (2) we are dirtying the page, (3) the VM system is probably
1484  *       moving the page from object A to B, and will then later move
1485  *       the backing store from A to B and we can't have a conflict.
1486  *
1487  * NOTE: We *always* dirty the page.  It is necessary both for the
1488  *       fact that we moved it, and because we may be invalidating
1489  *	 swap.  If the page is on the cache, we have to deactivate it
1490  *	 or vm_page_dirty() will panic.  Dirty pages are not allowed
1491  *	 on the cache.
1492  */
1493 void
1494 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1495 {
1496 	KKASSERT(m->flags & PG_BUSY);
1497 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
1498 	if (m->object) {
1499 		ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
1500 		vm_page_remove(m);
1501 	}
1502 	if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
1503 		panic("vm_page_rename: target exists (%p,%"PRIu64")",
1504 		      new_object, new_pindex);
1505 	}
1506 	if (m->queue - m->pc == PQ_CACHE)
1507 		vm_page_deactivate(m);
1508 	vm_page_dirty(m);
1509 }
1510 
1511 /*
1512  * vm_page_unqueue() without any wakeup.  This routine is used when a page
1513  * is to remain BUSYied by the caller.
1514  *
1515  * This routine may not block.
1516  */
1517 void
1518 vm_page_unqueue_nowakeup(vm_page_t m)
1519 {
1520 	vm_page_and_queue_spin_lock(m);
1521 	(void)_vm_page_rem_queue_spinlocked(m);
1522 	vm_page_spin_unlock(m);
1523 }
1524 
1525 /*
1526  * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedemon
1527  * if necessary.
1528  *
1529  * This routine may not block.
1530  */
1531 void
1532 vm_page_unqueue(vm_page_t m)
1533 {
1534 	u_short queue;
1535 
1536 	vm_page_and_queue_spin_lock(m);
1537 	queue = _vm_page_rem_queue_spinlocked(m);
1538 	if (queue == PQ_FREE || queue == PQ_CACHE) {
1539 		vm_page_spin_unlock(m);
1540 		pagedaemon_wakeup();
1541 	} else {
1542 		vm_page_spin_unlock(m);
1543 	}
1544 }
1545 
1546 /*
1547  * vm_page_list_find()
1548  *
1549  * Find a page on the specified queue with color optimization.
1550  *
1551  * The page coloring optimization attempts to locate a page that does
1552  * not overload other nearby pages in the object in the cpu's L1 or L2
1553  * caches.  We need this optimization because cpu caches tend to be
1554  * physical caches, while object spaces tend to be virtual.
1555  *
1556  * The page coloring optimization also, very importantly, tries to localize
1557  * memory to cpus and physical sockets.
1558  *
1559  * On MP systems each PQ_FREE and PQ_CACHE color queue has its own spinlock
1560  * and the algorithm is adjusted to localize allocations on a per-core basis.
1561  * This is done by 'twisting' the colors.
1562  *
1563  * The page is returned spinlocked and removed from its queue (it will
1564  * be on PQ_NONE), or NULL. The page is not PG_BUSY'd.  The caller
1565  * is responsible for dealing with the busy-page case (usually by
1566  * deactivating the page and looping).
1567  *
1568  * NOTE:  This routine is carefully inlined.  A non-inlined version
1569  *	  is available for outside callers but the only critical path is
1570  *	  from within this source file.
1571  *
1572  * NOTE:  This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
1573  *	  represent stable storage, allowing us to order our locks vm_page
1574  *	  first, then queue.
1575  */
1576 static __inline
1577 vm_page_t
1578 _vm_page_list_find(int basequeue, int index)
1579 {
1580 	vm_page_t m;
1581 
1582 	for (;;) {
1583 		m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl);
1584 		if (m == NULL) {
1585 			m = _vm_page_list_find2(basequeue, index);
1586 			return(m);
1587 		}
1588 		vm_page_and_queue_spin_lock(m);
1589 		if (m->queue == basequeue + index) {
1590 			_vm_page_rem_queue_spinlocked(m);
1591 			/* vm_page_t spin held, no queue spin */
1592 			break;
1593 		}
1594 		vm_page_and_queue_spin_unlock(m);
1595 	}
1596 	return(m);
1597 }
1598 
1599 /*
1600  * If we could not find the page in the desired queue try to find it in
1601  * a nearby queue.
1602  */
1603 static vm_page_t
1604 _vm_page_list_find2(int basequeue, int index)
1605 {
1606 	struct vpgqueues *pq;
1607 	vm_page_t m = NULL;
1608 	int pqmask = PQ_SET_ASSOC_MASK >> 1;
1609 	int pqi;
1610 	int i;
1611 
1612 	index &= PQ_L2_MASK;
1613 	pq = &vm_page_queues[basequeue];
1614 
1615 	/*
1616 	 * Run local sets of 16, 32, 64, 128, and the whole queue if all
1617 	 * else fails (PQ_L2_MASK which is 255).
1618 	 */
1619 	do {
1620 		pqmask = (pqmask << 1) | 1;
1621 		for (i = 0; i <= pqmask; ++i) {
1622 			pqi = (index & ~pqmask) | ((index + i) & pqmask);
1623 			m = TAILQ_FIRST(&pq[pqi].pl);
1624 			if (m) {
1625 				_vm_page_and_queue_spin_lock(m);
1626 				if (m->queue == basequeue + pqi) {
1627 					_vm_page_rem_queue_spinlocked(m);
1628 					return(m);
1629 				}
1630 				_vm_page_and_queue_spin_unlock(m);
1631 				--i;
1632 				continue;
1633 			}
1634 		}
1635 	} while (pqmask != PQ_L2_MASK);
1636 
1637 	return(m);
1638 }
1639 
1640 /*
1641  * Returns a vm_page candidate for allocation.  The page is not busied so
1642  * it can move around.  The caller must busy the page (and typically
1643  * deactivate it if it cannot be busied!)
1644  *
1645  * Returns a spinlocked vm_page that has been removed from its queue.
1646  */
1647 vm_page_t
1648 vm_page_list_find(int basequeue, int index)
1649 {
1650 	return(_vm_page_list_find(basequeue, index));
1651 }
1652 
1653 /*
1654  * Find a page on the cache queue with color optimization, remove it
1655  * from the queue, and busy it.  The returned page will not be spinlocked.
1656  *
1657  * A candidate failure will be deactivated.  Candidates can fail due to
1658  * being busied by someone else, in which case they will be deactivated.
1659  *
1660  * This routine may not block.
1661  *
1662  */
1663 static vm_page_t
1664 vm_page_select_cache(u_short pg_color)
1665 {
1666 	vm_page_t m;
1667 
1668 	for (;;) {
1669 		m = _vm_page_list_find(PQ_CACHE, pg_color & PQ_L2_MASK);
1670 		if (m == NULL)
1671 			break;
1672 		/*
1673 		 * (m) has been removed from its queue and spinlocked
1674 		 */
1675 		if (vm_page_busy_try(m, TRUE)) {
1676 			_vm_page_deactivate_locked(m, 0);
1677 			vm_page_spin_unlock(m);
1678 		} else {
1679 			/*
1680 			 * We successfully busied the page
1681 			 */
1682 			if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) == 0 &&
1683 			    m->hold_count == 0 &&
1684 			    m->wire_count == 0 &&
1685 			    (m->dirty & m->valid) == 0) {
1686 				vm_page_spin_unlock(m);
1687 				pagedaemon_wakeup();
1688 				return(m);
1689 			}
1690 
1691 			/*
1692 			 * The page cannot be recycled, deactivate it.
1693 			 */
1694 			_vm_page_deactivate_locked(m, 0);
1695 			if (_vm_page_wakeup(m)) {
1696 				vm_page_spin_unlock(m);
1697 				wakeup(m);
1698 			} else {
1699 				vm_page_spin_unlock(m);
1700 			}
1701 		}
1702 	}
1703 	return (m);
1704 }
1705 
1706 /*
1707  * Find a free page.  We attempt to inline the nominal case and fall back
1708  * to _vm_page_select_free() otherwise.  A busied page is removed from
1709  * the queue and returned.
1710  *
1711  * This routine may not block.
1712  */
1713 static __inline vm_page_t
1714 vm_page_select_free(u_short pg_color)
1715 {
1716 	vm_page_t m;
1717 
1718 	for (;;) {
1719 		m = _vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK);
1720 		if (m == NULL)
1721 			break;
1722 		if (vm_page_busy_try(m, TRUE)) {
1723 			/*
1724 			 * Various mechanisms such as a pmap_collect can
1725 			 * result in a busy page on the free queue.  We
1726 			 * have to move the page out of the way so we can
1727 			 * retry the allocation.  If the other thread is not
1728 			 * allocating the page then m->valid will remain 0 and
1729 			 * the pageout daemon will free the page later on.
1730 			 *
1731 			 * Since we could not busy the page, however, we
1732 			 * cannot make assumptions as to whether the page
1733 			 * will be allocated by the other thread or not,
1734 			 * so all we can do is deactivate it to move it out
1735 			 * of the way.  In particular, if the other thread
1736 			 * wires the page it may wind up on the inactive
1737 			 * queue and the pageout daemon will have to deal
1738 			 * with that case too.
1739 			 */
1740 			_vm_page_deactivate_locked(m, 0);
1741 			vm_page_spin_unlock(m);
1742 		} else {
1743 			/*
1744 			 * Theoretically if we are able to busy the page
1745 			 * atomic with the queue removal (using the vm_page
1746 			 * lock) nobody else should be able to mess with the
1747 			 * page before us.
1748 			 */
1749 			KKASSERT((m->flags & (PG_UNMANAGED |
1750 					      PG_NEED_COMMIT)) == 0);
1751 			KASSERT(m->hold_count == 0, ("m->hold_count is not zero "
1752 						     "pg %p q=%d flags=%08x hold=%d wire=%d",
1753 						     m, m->queue, m->flags, m->hold_count, m->wire_count));
1754 			KKASSERT(m->wire_count == 0);
1755 			vm_page_spin_unlock(m);
1756 			pagedaemon_wakeup();
1757 
1758 			/* return busied and removed page */
1759 			return(m);
1760 		}
1761 	}
1762 	return(m);
1763 }
1764 
1765 /*
1766  * vm_page_alloc()
1767  *
1768  * Allocate and return a memory cell associated with this VM object/offset
1769  * pair.  If object is NULL an unassociated page will be allocated.
1770  *
1771  * The returned page will be busied and removed from its queues.  This
1772  * routine can block and may return NULL if a race occurs and the page
1773  * is found to already exist at the specified (object, pindex).
1774  *
1775  *	VM_ALLOC_NORMAL		allow use of cache pages, nominal free drain
1776  *	VM_ALLOC_QUICK		like normal but cannot use cache
1777  *	VM_ALLOC_SYSTEM		greater free drain
1778  *	VM_ALLOC_INTERRUPT	allow free list to be completely drained
1779  *	VM_ALLOC_ZERO		advisory request for pre-zero'd page only
1780  *	VM_ALLOC_FORCE_ZERO	advisory request for pre-zero'd page only
1781  *	VM_ALLOC_NULL_OK	ok to return NULL on insertion collision
1782  *				(see vm_page_grab())
1783  *	VM_ALLOC_USE_GD		ok to use per-gd cache
1784  *
1785  *	VM_ALLOC_CPU(n)		allocate using specified cpu localization
1786  *
1787  * The object must be held if not NULL
1788  * This routine may not block
1789  *
1790  * Additional special handling is required when called from an interrupt
1791  * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
1792  * in this case.
1793  */
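
/*
 * Example (sketch): a typical object-backed allocation that tolerates an
 * insertion race might look like the following; how the caller waits for
 * memory or retries on NULL is left to its own policy:
 *
 *	m = vm_page_alloc(object, pindex,
 *			  VM_ALLOC_NORMAL | VM_ALLOC_NULL_OK);
 *	if (m == NULL)
 *		(either the insertion collided or free pages are scarce;
 *		 wait and/or retry as appropriate)
 */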
1794 vm_page_t
1795 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
1796 {
1797 	globaldata_t gd;
1798 	vm_object_t obj;
1799 	vm_page_t m;
1800 	u_short pg_color;
1801 	int cpuid_local;
1802 
1803 #if 0
1804 	/*
1805 	 * Special per-cpu free VM page cache.  The pages are pre-busied
1806 	 * and pre-zerod for us.
1807 	 */
1808 	if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
1809 		crit_enter_gd(gd);
1810 		if (gd->gd_vmpg_count) {
1811 			m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
1812 			crit_exit_gd(gd);
1813 			goto done;
1814                 }
1815 		crit_exit_gd(gd);
1816         }
1817 #endif
1818 	m = NULL;
1819 
1820 	/*
1821 	 * CPU LOCALIZATION
1822 	 *
1823 	 * CPU localization algorithm.  Break the page queues up by physical
1824 	 * id and core id (note that two cpu threads will have the same core
1825 	 * id, and core_id != gd_cpuid).
1826 	 *
1827 	 * This is nowhere near perfect; for example, the last pindex in a
1828 	 * subgroup will overflow into the next cpu or package.  But this
1829 	 * should get us good page reuse locality in heavy mixed loads.
1830 	 *
1831 	 * (may be executed before the APs are started, so other GDs might
1832 	 *  not exist!)
1833 	 */
1834 	if (page_req & VM_ALLOC_CPU_SPEC)
1835 		cpuid_local = VM_ALLOC_GETCPU(page_req);
1836 	else
1837 		cpuid_local = mycpu->gd_cpuid;
1838 
1839 	pg_color = vm_get_pg_color(cpuid_local, object, pindex);
1840 
1841 	KKASSERT(page_req &
1842 		(VM_ALLOC_NORMAL|VM_ALLOC_QUICK|
1843 		 VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
1844 
1845 	/*
1846 	 * Certain system threads (pageout daemon, buf_daemon's) are
1847 	 * allowed to eat deeper into the free page list.
1848 	 */
1849 	if (curthread->td_flags & TDF_SYSTHREAD)
1850 		page_req |= VM_ALLOC_SYSTEM;
1851 
1852 	/*
1853 	 * Impose various limitations.  Note that the v_free_reserved test
1854 	 * must match the opposite of vm_page_count_target() to avoid
1855 	 * livelocks, be careful.
1856 	 */
1857 loop:
1858 	gd = mycpu;
1859 	if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
1860 	    ((page_req & VM_ALLOC_INTERRUPT) &&
1861 	     gd->gd_vmstats.v_free_count > 0) ||
1862 	    ((page_req & VM_ALLOC_SYSTEM) &&
1863 	     gd->gd_vmstats.v_cache_count == 0 &&
1864 	     gd->gd_vmstats.v_free_count >
1865 	     gd->gd_vmstats.v_interrupt_free_min)
1866 	) {
1867 		/*
1868 		 * The free queue has sufficient free pages to take one out.
1869 		 */
1870 		m = vm_page_select_free(pg_color);
1871 	} else if (page_req & VM_ALLOC_NORMAL) {
1872 		/*
1873 		 * Allocatable from the cache (non-interrupt only).  On
1874 		 * success, we must free the page and try again, thus
1875 		 * ensuring that vmstats.v_*_free_min counters are replenished.
1876 		 */
1877 #ifdef INVARIANTS
1878 		if (curthread->td_preempted) {
1879 			kprintf("vm_page_alloc(): warning, attempt to allocate"
1880 				" cache page from preempting interrupt\n");
1881 			m = NULL;
1882 		} else {
1883 			m = vm_page_select_cache(pg_color);
1884 		}
1885 #else
1886 		m = vm_page_select_cache(pg_color);
1887 #endif
1888 		/*
1889 		 * On success move the page into the free queue and loop.
1890 		 *
1891 		 * Only do this if we can safely acquire the vm_object lock,
1892 		 * because this is effectively a random page and the caller
1893 		 * might be holding the lock shared; we don't want to
1894 		 * deadlock.
1895 		 */
1896 		if (m != NULL) {
1897 			KASSERT(m->dirty == 0,
1898 				("Found dirty cache page %p", m));
1899 			if ((obj = m->object) != NULL) {
1900 				if (vm_object_hold_try(obj)) {
1901 					vm_page_protect(m, VM_PROT_NONE);
1902 					vm_page_free(m);
1903 					/* m->object NULL here */
1904 					vm_object_drop(obj);
1905 				} else {
1906 					vm_page_deactivate(m);
1907 					vm_page_wakeup(m);
1908 				}
1909 			} else {
1910 				vm_page_protect(m, VM_PROT_NONE);
1911 				vm_page_free(m);
1912 			}
1913 			goto loop;
1914 		}
1915 
1916 		/*
1917 		 * On failure return NULL
1918 		 */
1919 		atomic_add_int(&vm_pageout_deficit, 1);
1920 		pagedaemon_wakeup();
1921 		return (NULL);
1922 	} else {
1923 		/*
1924 		 * No pages available, wakeup the pageout daemon and give up.
1925 		 */
1926 		atomic_add_int(&vm_pageout_deficit, 1);
1927 		pagedaemon_wakeup();
1928 		return (NULL);
1929 	}
1930 
1931 	/*
1932 	 * v_free_count can race so loop if we don't find the expected
1933 	 * page.
1934 	 */
1935 	if (m == NULL) {
1936 		vmstats_rollup();
1937 		goto loop;
1938 	}
1939 
1940 	/*
1941 	 * Good page found.  The page has already been busied for us and
1942 	 * removed from its queues.
1943 	 */
1944 	KASSERT(m->dirty == 0,
1945 		("vm_page_alloc: free/cache page %p was dirty", m));
1946 	KKASSERT(m->queue == PQ_NONE);
1947 
1948 #if 0
1949 done:
1950 #endif
1951 	/*
1952 	 * Initialize the structure, inheriting some flags but clearing
1953 	 * all the rest.  The page has already been busied for us.
1954 	 */
1955 	vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
1956 
1957 	KKASSERT(m->wire_count == 0);
1958 	KKASSERT(m->busy == 0);
1959 	m->act_count = 0;
1960 	m->valid = 0;
1961 
1962 	/*
1963 	 * Caller must be holding the object lock (asserted by
1964 	 * vm_page_insert()).
1965 	 *
1966 	 * NOTE: Inserting a page here does not insert it into any pmaps
1967 	 *	 (which could cause us to block allocating memory).
1968 	 *
1969 	 * NOTE: If no object is specified an unassociated page is allocated
1970 	 *	 and m->pindex can be used by the caller for any purpose.
1971 	 */
1972 	if (object) {
1973 		if (vm_page_insert(m, object, pindex) == FALSE) {
1974 			vm_page_free(m);
1975 			if ((page_req & VM_ALLOC_NULL_OK) == 0)
1976 				panic("PAGE RACE %p[%ld]/%p",
1977 				      object, (long)pindex, m);
1978 			m = NULL;
1979 		}
1980 	} else {
1981 		m->pindex = pindex;
1982 	}
1983 
1984 	/*
1985 	 * Don't wakeup too often - wakeup the pageout daemon when
1986 	 * we would be nearly out of memory.
1987 	 */
1988 	pagedaemon_wakeup();
1989 
1990 	/*
1991 	 * A PG_BUSY page is returned.
1992 	 */
1993 	return (m);
1994 }
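
/*
 * Example (sketch): allocating an unassociated page for private kernel
 * use, wiring it, and releasing PG_BUSY.  The retry loop and the policy
 * flags used here are illustrative only; real callers pick the
 * VM_ALLOC_* combination appropriate to their context.
 *
 *	vm_page_t m;
 *
 *	do {
 *		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL);
 *		if (m == NULL)
 *			vm_wait(0);
 *	} while (m == NULL);
 *	pmap_zero_page(VM_PAGE_TO_PHYS(m));
 *	m->valid = VM_PAGE_BITS_ALL;
 *	vm_page_wire(m);
 *	vm_page_wakeup(m);
 *
 * The page is later returned via vm_page_busy_wait(), vm_page_unwire()
 * and vm_page_free(), much as vm_page_free_contig() does below.
 */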
1995 
1996 /*
1997  * Returns number of pages available in our DMA memory reserve
1998  * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf)
1999  */
2000 vm_size_t
2001 vm_contig_avail_pages(void)
2002 {
2003 	alist_blk_t blk;
2004 	alist_blk_t count;
2005 	alist_blk_t bfree;
2006 	spin_lock(&vm_contig_spin);
2007 	bfree = alist_free_info(&vm_contig_alist, &blk, &count);
2008 	spin_unlock(&vm_contig_spin);
2009 
2010 	return bfree;
2011 }
2012 
2013 /*
2014  * Attempt to allocate contiguous physical memory with the specified
2015  * requirements.
2016  */
2017 vm_page_t
2018 vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
2019 		     unsigned long alignment, unsigned long boundary,
2020 		     unsigned long size, vm_memattr_t memattr)
2021 {
2022 	alist_blk_t blk;
2023 	vm_page_t m;
2024 	int i;
2025 
2026 	alignment >>= PAGE_SHIFT;
2027 	if (alignment == 0)
2028 		alignment = 1;
2029 	boundary >>= PAGE_SHIFT;
2030 	if (boundary == 0)
2031 		boundary = 1;
2032 	size = (size + PAGE_MASK) >> PAGE_SHIFT;
2033 
2034 	spin_lock(&vm_contig_spin);
2035 	blk = alist_alloc(&vm_contig_alist, 0, size);
2036 	if (blk == ALIST_BLOCK_NONE) {
2037 		spin_unlock(&vm_contig_spin);
2038 		if (bootverbose) {
2039 			kprintf("vm_page_alloc_contig: %ldk nospace\n",
2040 				size * (PAGE_SIZE / 1024));
2041 		}
2042 		return(NULL);
2043 	}
2044 	if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
2045 		alist_free(&vm_contig_alist, blk, size);
2046 		spin_unlock(&vm_contig_spin);
2047 		if (bootverbose) {
2048 			kprintf("vm_page_alloc_contig: %ldk high "
2049 				"%016jx failed\n",
2050 				size * (PAGE_SIZE / 1024),
2051 				(intmax_t)high);
2052 		}
2053 		return(NULL);
2054 	}
2055 	spin_unlock(&vm_contig_spin);
2056 	if (vm_contig_verbose) {
2057 		kprintf("vm_page_alloc_contig: %016jx/%ldk\n",
2058 			(intmax_t)(vm_paddr_t)blk << PAGE_SHIFT,
2059 			size * (PAGE_SIZE / 1024));
2060 	}
2061 
2062 	m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
2063 	if (memattr != VM_MEMATTR_DEFAULT)
2064 		for (i = 0; i < size; i++)
2065 			pmap_page_set_memattr(&m[i], memattr);
2066 	return m;
2067 }
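
/*
 * Example (sketch): reserving a physically contiguous 64KB buffer below
 * 4GB for DMA and releasing it again.  The constants are illustrative;
 * vm_page_free_contig() is passed the same byte size that was
 * originally requested.
 *
 *	vm_page_t m;
 *
 *	m = vm_page_alloc_contig(0, 0xFFFFFFFFULL, PAGE_SIZE, 0,
 *				 64 * 1024, VM_MEMATTR_DEFAULT);
 *	if (m != NULL) {
 *		... program VM_PAGE_TO_PHYS(m) into the device ...
 *		vm_page_free_contig(m, 64 * 1024);
 *	}
 */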
2068 
2069 /*
2070  * Free contiguously allocated pages.  The pages will be wired but not busy.
2071  * When freeing to the alist we leave them wired and not busy.
2072  */
2073 void
2074 vm_page_free_contig(vm_page_t m, unsigned long size)
2075 {
2076 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
2077 	vm_pindex_t start = pa >> PAGE_SHIFT;
2078 	vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
2079 
2080 	if (vm_contig_verbose) {
2081 		kprintf("vm_page_free_contig:  %016jx/%ldk\n",
2082 			(intmax_t)pa, size / 1024);
2083 	}
2084 	if (pa < vm_low_phys_reserved) {
2085 		KKASSERT(pa + size <= vm_low_phys_reserved);
2086 		spin_lock(&vm_contig_spin);
2087 		alist_free(&vm_contig_alist, start, pages);
2088 		spin_unlock(&vm_contig_spin);
2089 	} else {
2090 		while (pages) {
2091 			vm_page_busy_wait(m, FALSE, "cpgfr");
2092 			vm_page_unwire(m, 0);
2093 			vm_page_free(m);
2094 			--pages;
2095 			++m;
2096 		}
2097 
2098 	}
2099 }
2100 
2101 
2102 /*
2103  * Wait for sufficient free memory for nominal heavy memory use kernel
2104  * operations.
2105  *
2106  * WARNING!  Be sure never to call this in any vm_pageout code path, which
2107  *	     will trivially deadlock the system.
2108  */
2109 void
2110 vm_wait_nominal(void)
2111 {
2112 	while (vm_page_count_min(0))
2113 		vm_wait(0);
2114 }
2115 
2116 /*
2117  * Test if vm_wait_nominal() would block.
2118  */
2119 int
2120 vm_test_nominal(void)
2121 {
2122 	if (vm_page_count_min(0))
2123 		return(1);
2124 	return(0);
2125 }
2126 
2127 /*
2128  * Block until free pages are available for allocation, called in various
2129  * places before memory allocations.
2130  *
2131  * The caller may loop if vm_page_count_min() == FALSE so we cannot be
2132  * more generous than that.
2133  */
2134 void
2135 vm_wait(int timo)
2136 {
2137 	/*
2138 	 * never wait forever
2139 	 */
2140 	if (timo == 0)
2141 		timo = hz;
2142 	lwkt_gettoken(&vm_token);
2143 
2144 	if (curthread == pagethread) {
2145 		/*
2146 		 * The pageout daemon itself needs pages, this is bad.
2147 		 */
2148 		if (vm_page_count_min(0)) {
2149 			vm_pageout_pages_needed = 1;
2150 			tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
2151 		}
2152 	} else {
2153 		/*
2154 		 * Wakeup the pageout daemon if necessary and wait.
2155 		 *
2156 		 * Do not wait indefinitely for the target to be reached,
2157 		 * as load might prevent it from being reached any time soon.
2158 		 * But wait a little to try to slow down page allocations
2159 		 * and to give more important threads (the pagedaemon)
2160 		 * allocation priority.
2161 		 */
2162 		if (vm_page_count_target()) {
2163 			if (vm_pages_needed == 0) {
2164 				vm_pages_needed = 1;
2165 				wakeup(&vm_pages_needed);
2166 			}
2167 			++vm_pages_waiting;	/* SMP race ok */
2168 			tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
2169 		}
2170 	}
2171 	lwkt_reltoken(&vm_token);
2172 }
2173 
2174 /*
2175  * Block until free pages are available for allocation
2176  *
2177  * Called only from vm_fault so that processes page faulting can be
2178  * easily tracked.
2179  */
2180 void
2181 vm_wait_pfault(void)
2182 {
2183 	/*
2184 	 * Wakeup the pageout daemon if necessary and wait.
2185 	 *
2186 	 * Do not wait indefinitely for the target to be reached,
2187 	 * as load might prevent it from being reached any time soon.
2188 	 * But wait a little to try to slow down page allocations
2189 	 * and to give more important threads (the pagedaemon)
2190 	 * allocation priority.
2191 	 */
2192 	if (vm_page_count_min(0)) {
2193 		lwkt_gettoken(&vm_token);
2194 		while (vm_page_count_severe()) {
2195 			if (vm_page_count_target()) {
2196 				thread_t td;
2197 
2198 				if (vm_pages_needed == 0) {
2199 					vm_pages_needed = 1;
2200 					wakeup(&vm_pages_needed);
2201 				}
2202 				++vm_pages_waiting;	/* SMP race ok */
2203 				tsleep(&vmstats.v_free_count, 0, "pfault", hz);
2204 
2205 				/*
2206 				 * Do not stay stuck in the loop if the
2207 				 * system is trying to kill the process.
2208 				 */
2209 				td = curthread;
2210 				if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL))
2211 					break;
2212 			}
2213 		}
2214 		lwkt_reltoken(&vm_token);
2215 	}
2216 }
2217 
2218 /*
2219  * Put the specified page on the active list (if appropriate).  Ensure
2220  * that act_count is at least ACT_INIT but do not otherwise mess with it.
2221  *
2222  * The caller should be holding the page busied?  XXX
2223  * This routine may not block.
2224  */
2225 void
2226 vm_page_activate(vm_page_t m)
2227 {
2228 	u_short oqueue;
2229 
2230 	vm_page_spin_lock(m);
2231 	if (m->queue - m->pc != PQ_ACTIVE) {
2232 		_vm_page_queue_spin_lock(m);
2233 		oqueue = _vm_page_rem_queue_spinlocked(m);
2234 		/* page is left spinlocked, queue is unlocked */
2235 
2236 		if (oqueue == PQ_CACHE)
2237 			mycpu->gd_cnt.v_reactivated++;
2238 		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
2239 			if (m->act_count < ACT_INIT)
2240 				m->act_count = ACT_INIT;
2241 			_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
2242 		}
2243 		_vm_page_and_queue_spin_unlock(m);
2244 		if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
2245 			pagedaemon_wakeup();
2246 	} else {
2247 		if (m->act_count < ACT_INIT)
2248 			m->act_count = ACT_INIT;
2249 		vm_page_spin_unlock(m);
2250 	}
2251 }
2252 
2253 /*
2254  * Helper routine for vm_page_free_toq() and vm_page_cache().  This
2255  * routine is called when a page has been added to the cache or free
2256  * queues.
2257  *
2258  * This routine may not block.
2259  */
2260 static __inline void
2261 vm_page_free_wakeup(void)
2262 {
2263 	globaldata_t gd = mycpu;
2264 
2265 	/*
2266 	 * If the pageout daemon itself needs pages, then tell it that
2267 	 * there are some free.
2268 	 */
2269 	if (vm_pageout_pages_needed &&
2270 	    gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
2271 	    gd->gd_vmstats.v_pageout_free_min
2272 	) {
2273 		vm_pageout_pages_needed = 0;
2274 		wakeup(&vm_pageout_pages_needed);
2275 	}
2276 
2277 	/*
2278 	 * Wakeup processes that are waiting on memory.
2279 	 *
2280 	 * Generally speaking we want to wakeup stuck processes as soon as
2281 	 * possible.  !vm_page_count_min(0) is the absolute minimum point
2282 	 * where we can do this.  Wait a bit longer to reduce degenerate
2283 	 * re-blocking (vm_page_free_hysteresis).  The target check is just
2284 	 * to make sure the min-check w/hysteresis does not exceed the
2285 	 * normal target.
2286 	 */
2287 	if (vm_pages_waiting) {
2288 		if (!vm_page_count_min(vm_page_free_hysteresis) ||
2289 		    !vm_page_count_target()) {
2290 			vm_pages_waiting = 0;
2291 			wakeup(&vmstats.v_free_count);
2292 			++mycpu->gd_cnt.v_ppwakeups;
2293 		}
2294 #if 0
2295 		if (!vm_page_count_target()) {
2296 			/*
2297 			 * Plenty of pages are free, wakeup everyone.
2298 			 */
2299 			vm_pages_waiting = 0;
2300 			wakeup(&vmstats.v_free_count);
2301 			++mycpu->gd_cnt.v_ppwakeups;
2302 		} else if (!vm_page_count_min(0)) {
2303 			/*
2304 			 * Some pages are free, wakeup someone.
2305 			 */
2306 			int wcount = vm_pages_waiting;
2307 			if (wcount > 0)
2308 				--wcount;
2309 			vm_pages_waiting = wcount;
2310 			wakeup_one(&vmstats.v_free_count);
2311 			++mycpu->gd_cnt.v_ppwakeups;
2312 		}
2313 #endif
2314 	}
2315 }
2316 
2317 /*
2318  * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
2319  * it from its VM object.
2320  *
2321  * The vm_page must be PG_BUSY on entry.  PG_BUSY will be released on
2322  * return (the page will have been freed).
2323  */
2324 void
2325 vm_page_free_toq(vm_page_t m)
2326 {
2327 	mycpu->gd_cnt.v_tfree++;
2328 	KKASSERT((m->flags & PG_MAPPED) == 0);
2329 	KKASSERT(m->flags & PG_BUSY);
2330 
2331 	if (m->busy || ((m->queue - m->pc) == PQ_FREE)) {
2332 		kprintf("vm_page_free: pindex(%lu), busy(%d), "
2333 			"PG_BUSY(%d), hold(%d)\n",
2334 			(u_long)m->pindex, m->busy,
2335 			((m->flags & PG_BUSY) ? 1 : 0), m->hold_count);
2336 		if ((m->queue - m->pc) == PQ_FREE)
2337 			panic("vm_page_free: freeing free page");
2338 		else
2339 			panic("vm_page_free: freeing busy page");
2340 	}
2341 
2342 	/*
2343 	 * Remove from object, spinlock the page and its queues and
2344 	 * remove from any queue.  No queue spinlock will be held
2345 	 * after this section (because the page was removed from any
2346 	 * queue).
2347 	 */
2348 	vm_page_remove(m);
2349 	vm_page_and_queue_spin_lock(m);
2350 	_vm_page_rem_queue_spinlocked(m);
2351 
2352 	/*
2353 	 * No further management of fictitious pages occurs beyond object
2354 	 * and queue removal.
2355 	 */
2356 	if ((m->flags & PG_FICTITIOUS) != 0) {
2357 		vm_page_spin_unlock(m);
2358 		vm_page_wakeup(m);
2359 		return;
2360 	}
2361 
2362 	m->valid = 0;
2363 	vm_page_undirty(m);
2364 
2365 	if (m->wire_count != 0) {
2366 		if (m->wire_count > 1) {
2367 		    panic(
2368 			"vm_page_free: invalid wire count (%d), pindex: 0x%lx",
2369 			m->wire_count, (long)m->pindex);
2370 		}
2371 		panic("vm_page_free: freeing wired page");
2372 	}
2373 
2374 	/*
2375 	 * Clear the UNMANAGED flag when freeing an unmanaged page.
2376 	 * Clear the NEED_COMMIT flag
2377 	 */
2378 	if (m->flags & PG_UNMANAGED)
2379 		vm_page_flag_clear(m, PG_UNMANAGED);
2380 	if (m->flags & PG_NEED_COMMIT)
2381 		vm_page_flag_clear(m, PG_NEED_COMMIT);
2382 
2383 	if (m->hold_count != 0) {
2384 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
2385 	} else {
2386 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
2387 	}
2388 
2389 	/*
2390 	 * This sequence allows us to clear PG_BUSY while still holding
2391 	 * its spin lock, which reduces contention vs allocators.  We
2392 	 * must not leave the queue locked or _vm_page_wakeup() may
2393 	 * deadlock.
2394 	 */
2395 	_vm_page_queue_spin_unlock(m);
2396 	if (_vm_page_wakeup(m)) {
2397 		vm_page_spin_unlock(m);
2398 		wakeup(m);
2399 	} else {
2400 		vm_page_spin_unlock(m);
2401 	}
2402 	vm_page_free_wakeup();
2403 }
2404 
2405 /*
2406  * vm_page_unmanage()
2407  *
2408  * Prevent PV management from being done on the page.  The page is
2409  * removed from the paging queues as if it were wired, and as a
2410  * consequence of no longer being managed the pageout daemon will not
2411  * touch it (since there is no way to locate the pte mappings for the
2412  * page).  madvise() calls that mess with the pmap will also no longer
2413  * operate on the page.
2414  *
2415  * Beyond that the page is still reasonably 'normal'.  Freeing the page
2416  * will clear the flag.
2417  *
2418  * This routine is used by OBJT_PHYS objects - objects using unswappable
2419  * physical memory as backing store rather than swap-backed memory and
2420  * will eventually be extended to support 4MB unmanaged physical
2421  * mappings.
2422  *
2423  * Caller must be holding the page busy.
2424  */
2425 void
2426 vm_page_unmanage(vm_page_t m)
2427 {
2428 	KKASSERT(m->flags & PG_BUSY);
2429 	if ((m->flags & PG_UNMANAGED) == 0) {
2430 		if (m->wire_count == 0)
2431 			vm_page_unqueue(m);
2432 	}
2433 	vm_page_flag_set(m, PG_UNMANAGED);
2434 }
2435 
2436 /*
2437  * Mark this page as wired down by yet another map, removing it from
2438  * paging queues as necessary.
2439  *
2440  * Caller must be holding the page busy.
2441  */
2442 void
2443 vm_page_wire(vm_page_t m)
2444 {
2445 	/*
2446 	 * Only bump the wire statistics if the page is not already wired,
2447 	 * and only unqueue the page if it is on some queue (if it is unmanaged
2448 	 * it is already off the queues).  Don't do anything with fictitious
2449 	 * pages because they are always wired.
2450 	 */
2451 	KKASSERT(m->flags & PG_BUSY);
2452 	if ((m->flags & PG_FICTITIOUS) == 0) {
2453 		if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
2454 			if ((m->flags & PG_UNMANAGED) == 0)
2455 				vm_page_unqueue(m);
2456 			atomic_add_int(&mycpu->gd_vmstats_adj.v_wire_count, 1);
2457 		}
2458 		KASSERT(m->wire_count != 0,
2459 			("vm_page_wire: wire_count overflow m=%p", m));
2460 	}
2461 }
2462 
2463 /*
2464  * Release one wiring of this page, potentially enabling it to be paged again.
2465  *
2466  * Many pages placed on the inactive queue should actually go
2467  * into the cache, but it is difficult to figure out which.  What
2468  * we do instead, if the inactive target is well met, is to put
2469  * clean pages at the head of the inactive queue instead of the tail.
2470  * This will cause them to be moved to the cache more quickly and
2471  * if not actively re-referenced, freed more quickly.  If we just
2472  * stick these pages at the end of the inactive queue, heavy filesystem
2473  * meta-data accesses can cause an unnecessary paging load on memory bound
2474  * processes.  This optimization causes one-time-use metadata to be
2475  * reused more quickly.
2476  *
2477  * Pages marked PG_NEED_COMMIT are always activated and never placed on
2478  * the inactive queue.  This helps the pageout daemon determine memory
2479  * pressure and act on out-of-memory situations more quickly.
2480  *
2481  * BUT, if we are in a low-memory situation we have no choice but to
2482  * put clean pages on the cache queue.
2483  *
2484  * A number of routines use vm_page_unwire() to guarantee that the page
2485  * will go into either the inactive or active queues, and will NEVER
2486  * be placed in the cache - for example, just after dirtying a page.
2487  * Dirty pages in the cache are not allowed.
2488  *
2489  * This routine may not block.
2490  */
2491 void
2492 vm_page_unwire(vm_page_t m, int activate)
2493 {
2494 	KKASSERT(m->flags & PG_BUSY);
2495 	if (m->flags & PG_FICTITIOUS) {
2496 		/* do nothing */
2497 	} else if (m->wire_count <= 0) {
2498 		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
2499 	} else {
2500 		if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
2501 			atomic_add_int(&mycpu->gd_vmstats_adj.v_wire_count, -1);
2502 			if (m->flags & PG_UNMANAGED) {
2503 				;
2504 			} else if (activate || (m->flags & PG_NEED_COMMIT)) {
2505 				vm_page_spin_lock(m);
2506 				_vm_page_add_queue_spinlocked(m,
2507 							PQ_ACTIVE + m->pc, 0);
2508 				_vm_page_and_queue_spin_unlock(m);
2509 			} else {
2510 				vm_page_spin_lock(m);
2511 				vm_page_flag_clear(m, PG_WINATCFLS);
2512 				_vm_page_add_queue_spinlocked(m,
2513 							PQ_INACTIVE + m->pc, 0);
2514 				++vm_swapcache_inactive_heuristic;
2515 				_vm_page_and_queue_spin_unlock(m);
2516 			}
2517 		}
2518 	}
2519 }
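
/*
 * Example (sketch): pinning a page across an operation that must not
 * lose it to the pageout daemon, then unwiring it back onto the
 * inactive queue.  'm' is assumed to be held busied by the caller at
 * entry, and "exmpin" is just an illustrative wmesg.
 *
 *	vm_page_wire(m);
 *	vm_page_wakeup(m);
 *	... operate on the wired page ...
 *	vm_page_busy_wait(m, FALSE, "exmpin");
 *	vm_page_unwire(m, 0);
 *	vm_page_wakeup(m);
 */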
2520 
2521 /*
2522  * Move the specified page to the inactive queue.  If the page has
2523  * any associated swap, the swap is deallocated.
2524  *
2525  * Normally athead is 0 resulting in LRU operation.  athead is set
2526  * to 1 if we want this page to be 'as if it were placed in the cache',
2527  * except without unmapping it from the process address space.
2528  *
2529  * vm_page's spinlock must be held on entry and will remain held on return.
2530  * This routine may not block.
2531  */
2532 static void
2533 _vm_page_deactivate_locked(vm_page_t m, int athead)
2534 {
2535 	u_short oqueue;
2536 
2537 	/*
2538 	 * Ignore if already inactive.
2539 	 */
2540 	if (m->queue - m->pc == PQ_INACTIVE)
2541 		return;
2542 	_vm_page_queue_spin_lock(m);
2543 	oqueue = _vm_page_rem_queue_spinlocked(m);
2544 
2545 	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
2546 		if (oqueue == PQ_CACHE)
2547 			mycpu->gd_cnt.v_reactivated++;
2548 		vm_page_flag_clear(m, PG_WINATCFLS);
2549 		_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
2550 		if (athead == 0)
2551 			++vm_swapcache_inactive_heuristic;
2552 	}
2553 	/* NOTE: PQ_NONE if condition not taken */
2554 	_vm_page_queue_spin_unlock(m);
2555 	/* leaves vm_page spinlocked */
2556 }
2557 
2558 /*
2559  * Attempt to deactivate a page.
2560  *
2561  * No requirements.
2562  */
2563 void
2564 vm_page_deactivate(vm_page_t m)
2565 {
2566 	vm_page_spin_lock(m);
2567 	_vm_page_deactivate_locked(m, 0);
2568 	vm_page_spin_unlock(m);
2569 }
2570 
2571 void
2572 vm_page_deactivate_locked(vm_page_t m)
2573 {
2574 	_vm_page_deactivate_locked(m, 0);
2575 }
2576 
2577 /*
2578  * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy it.
2579  *
2580  * This function returns non-zero if it successfully moved the page to
2581  * PQ_CACHE.
2582  *
2583  * This function unconditionally unbusies the page on return.
2584  */
2585 int
2586 vm_page_try_to_cache(vm_page_t m)
2587 {
2588 	vm_page_spin_lock(m);
2589 	if (m->dirty || m->hold_count || m->wire_count ||
2590 	    (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT))) {
2591 		if (_vm_page_wakeup(m)) {
2592 			vm_page_spin_unlock(m);
2593 			wakeup(m);
2594 		} else {
2595 			vm_page_spin_unlock(m);
2596 		}
2597 		return(0);
2598 	}
2599 	vm_page_spin_unlock(m);
2600 
2601 	/*
2602 	 * Page busied by us and no longer spinlocked.  Dirty pages cannot
2603 	 * be moved to the cache.
2604 	 */
2605 	vm_page_test_dirty(m);
2606 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2607 		vm_page_wakeup(m);
2608 		return(0);
2609 	}
2610 	vm_page_cache(m);
2611 	return(1);
2612 }
2613 
2614 /*
2615  * Attempt to free the page.  If we cannot free it, we do nothing.
2616  * 1 is returned on success, 0 on failure.
2617  *
2618  * No requirements.
2619  */
2620 int
2621 vm_page_try_to_free(vm_page_t m)
2622 {
2623 	vm_page_spin_lock(m);
2624 	if (vm_page_busy_try(m, TRUE)) {
2625 		vm_page_spin_unlock(m);
2626 		return(0);
2627 	}
2628 
2629 	/*
2630 	 * The page can be in any state, including already being on the free
2631 	 * queue.  Check to see if it really can be freed.
2632 	 */
2633 	if (m->dirty ||				/* can't free if it is dirty */
2634 	    m->hold_count ||			/* or held (XXX may be wrong) */
2635 	    m->wire_count ||			/* or wired */
2636 	    (m->flags & (PG_UNMANAGED |		/* or unmanaged */
2637 			 PG_NEED_COMMIT)) ||	/* or needs a commit */
2638 	    m->queue - m->pc == PQ_FREE ||	/* already on PQ_FREE */
2639 	    m->queue - m->pc == PQ_HOLD) {	/* already on PQ_HOLD */
2640 		if (_vm_page_wakeup(m)) {
2641 			vm_page_spin_unlock(m);
2642 			wakeup(m);
2643 		} else {
2644 			vm_page_spin_unlock(m);
2645 		}
2646 		return(0);
2647 	}
2648 	vm_page_spin_unlock(m);
2649 
2650 	/*
2651 	 * We can probably free the page.
2652 	 *
2653 	 * Page busied by us and no longer spinlocked.  Dirty pages will
2654 	 * not be freed by this function.  We have to re-test the
2655 	 * dirty bit after cleaning out the pmaps.
2656 	 */
2657 	vm_page_test_dirty(m);
2658 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2659 		vm_page_wakeup(m);
2660 		return(0);
2661 	}
2662 	vm_page_protect(m, VM_PROT_NONE);
2663 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2664 		vm_page_wakeup(m);
2665 		return(0);
2666 	}
2667 	vm_page_free(m);
2668 	return(1);
2669 }
2670 
2671 /*
2672  * vm_page_cache
2673  *
2674  * Put the specified page onto the page cache queue (if appropriate).
2675  *
2676  * The page must be busy, and this routine will release the busy and
2677  * possibly even free the page.
2678  */
2679 void
2680 vm_page_cache(vm_page_t m)
2681 {
2682 	/*
2683 	 * Not suitable for the cache
2684 	 */
2685 	if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
2686 	    m->busy || m->wire_count || m->hold_count) {
2687 		vm_page_wakeup(m);
2688 		return;
2689 	}
2690 
2691 	/*
2692 	 * Already in the cache (and thus not mapped)
2693 	 */
2694 	if ((m->queue - m->pc) == PQ_CACHE) {
2695 		KKASSERT((m->flags & PG_MAPPED) == 0);
2696 		vm_page_wakeup(m);
2697 		return;
2698 	}
2699 
2700 	/*
2701 	 * Caller is required to test m->dirty, but note that the act of
2702 	 * removing the page from its maps can cause it to become dirty
2703 	 * on an SMP system due to another cpu running in usermode.
2704 	 */
2705 	if (m->dirty) {
2706 		panic("vm_page_cache: caching a dirty page, pindex: %ld",
2707 			(long)m->pindex);
2708 	}
2709 
2710 	/*
2711 	 * Remove all pmaps and indicate that the page is not
2712 	 * writeable or mapped.  Our vm_page_protect() call may
2713 	 * have blocked (especially w/ VM_PROT_NONE), so recheck
2714 	 * everything.
2715 	 */
2716 	vm_page_protect(m, VM_PROT_NONE);
2717 	if ((m->flags & (PG_UNMANAGED | PG_MAPPED)) ||
2718 	    m->busy || m->wire_count || m->hold_count) {
2719 		vm_page_wakeup(m);
2720 	} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2721 		vm_page_deactivate(m);
2722 		vm_page_wakeup(m);
2723 	} else {
2724 		_vm_page_and_queue_spin_lock(m);
2725 		_vm_page_rem_queue_spinlocked(m);
2726 		_vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
2727 		_vm_page_queue_spin_unlock(m);
2728 		if (_vm_page_wakeup(m)) {
2729 			vm_page_spin_unlock(m);
2730 			wakeup(m);
2731 		} else {
2732 			vm_page_spin_unlock(m);
2733 		}
2734 		vm_page_free_wakeup();
2735 	}
2736 }
2737 
2738 /*
2739  * vm_page_dontneed()
2740  *
2741  * Cache, deactivate, or do nothing as appropriate.  This routine
2742  * is typically used by madvise() MADV_DONTNEED.
2743  *
2744  * Generally speaking we want to move the page into the cache so
2745  * it gets reused quickly.  However, this can result in a silly syndrome
2746  * due to the page recycling too quickly.  Small objects will not be
2747  * fully cached.  On the other hand, if we move the page to the inactive
2748  * queue we wind up with a problem whereby very large objects
2749  * unnecessarily blow away our inactive and cache queues.
2750  *
2751  * The solution is to move the pages based on a fixed weighting.  We
2752  * either leave them alone, deactivate them, or move them to the cache,
2753  * where moving them to the cache has the highest weighting.
2754  * By forcing some pages into other queues we eventually force the
2755  * system to balance the queues, potentially recovering other unrelated
2756  * space from active.  The idea is to not force this to happen too
2757  * often.
2758  *
2759  * The page must be busied.
2760  */
2761 void
2762 vm_page_dontneed(vm_page_t m)
2763 {
2764 	static int dnweight;
2765 	int dnw;
2766 	int head;
2767 
2768 	dnw = ++dnweight;
2769 
2770 	/*
2771 	 * occasionally leave the page alone
2772 	 */
2773 	if ((dnw & 0x01F0) == 0 ||
2774 	    m->queue - m->pc == PQ_INACTIVE ||
2775 	    m->queue - m->pc == PQ_CACHE
2776 	) {
2777 		if (m->act_count >= ACT_INIT)
2778 			--m->act_count;
2779 		return;
2780 	}
2781 
2782 	/*
2783 	 * If vm_page_dontneed() is inactivating a page, it must clear
2784 	 * the referenced flag; otherwise the pagedaemon will see references
2785 	 * on the page in the inactive queue and reactivate it. Until the
2786 	 * page can move to the cache queue, madvise's job is not done.
2787 	 */
2788 	vm_page_flag_clear(m, PG_REFERENCED);
2789 	pmap_clear_reference(m);
2790 
2791 	if (m->dirty == 0)
2792 		vm_page_test_dirty(m);
2793 
2794 	if (m->dirty || (dnw & 0x0070) == 0) {
2795 		/*
2796 		 * Deactivate the page 3 times out of 32.
2797 		 */
2798 		head = 0;
2799 	} else {
2800 		/*
2801 		 * Cache the page 28 times out of every 32.  Note that
2802 		 * the page is deactivated instead of cached, but placed
2803 		 * at the head of the queue instead of the tail.
2804 		 */
2805 		head = 1;
2806 	}
2807 	vm_page_spin_lock(m);
2808 	_vm_page_deactivate_locked(m, head);
2809 	vm_page_spin_unlock(m);
2810 }
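
/*
 * Worked breakdown of the weighting above (approximate, since dnweight
 * is shared by all callers): (dnw & 0x01F0) == 0 selects 16 of every
 * 512 counter values, i.e. 1/32 of calls leave the page alone.
 * (dnw & 0x0070) == 0 selects 16 of every 128 values, i.e. 4/32 of
 * calls; minus the 1/32 already handled, 3/32 of calls deactivate a
 * clean page at the tail.  The remaining 28/32 deactivate it at the
 * head, which approximates caching it.
 */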
2811 
2812 /*
2813  * These routines manipulate the 'soft busy' count for a page.  A soft busy
2814  * is almost like PG_BUSY except that it allows certain compatible operations
2815  * to occur on the page while it is busy.  For example, a page undergoing a
2816  * write can still be mapped read-only.
2817  *
2818  * Because vm_pages can overlap buffers m->busy can be > 1.  m->busy is only
2819  * Because vm_pages can overlap buffers, m->busy can be > 1.  m->busy is only
2820  * busy bit is cleared.
2821  *
2822  * The caller must hold the page BUSY when making these two calls.
2823  */
2824 void
2825 vm_page_io_start(vm_page_t m)
2826 {
2827 	KASSERT(m->flags & PG_BUSY, ("vm_page_io_start: page not busy!!!"));
2828 	atomic_add_char(&m->busy, 1);
2829 	vm_page_flag_set(m, PG_SBUSY);
2830 }
2831 
2832 void
2833 vm_page_io_finish(vm_page_t m)
2834 {
2835 	KASSERT(m->flags & PG_BUSY, ("vm_page_io_finish: page not busy!!!"));
2836 	atomic_subtract_char(&m->busy, 1);
2837 	if (m->busy == 0)
2838 		vm_page_flag_clear(m, PG_SBUSY);
2839 }
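
/*
 * Example (sketch): bracketing asynchronous I/O with the soft-busy
 * count.  Both calls are made while the page is hard-busied, per the
 * assertions above.
 *
 *	vm_page_io_start(m);
 *	... issue the write; the page may stay mapped read-only ...
 *	vm_page_io_finish(m);
 */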
2840 
2841 /*
2842  * Indicate that a clean VM page requires a filesystem commit and cannot
2843  * be reused.  Used by tmpfs.
2844  */
2845 void
2846 vm_page_need_commit(vm_page_t m)
2847 {
2848 	vm_page_flag_set(m, PG_NEED_COMMIT);
2849 	vm_object_set_writeable_dirty(m->object);
2850 }
2851 
2852 void
2853 vm_page_clear_commit(vm_page_t m)
2854 {
2855 	vm_page_flag_clear(m, PG_NEED_COMMIT);
2856 }
2857 
2858 /*
2859  * Grab a page, blocking if it is busy and allocating a page if necessary.
2860  * A busy page is returned or NULL.  The page may or may not be valid and
2861  * might not be on a queue (the caller is responsible for the disposition of
2862  * the page).
2863  *
2864  * If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the
2865  * page will be zero'd and marked valid.
2866  *
2867  * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
2868  * valid even if it already exists.
2869  *
2870  * If VM_ALLOC_RETRY is specified this routine will never return NULL.  Also
2871  * note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified.
2872  * VM_ALLOC_NULL_OK is implied when VM_ALLOC_RETRY is specified.
2873  *
2874  * This routine may block, but if VM_ALLOC_RETRY is not set it will
2875  * always return NULL if it had to block.
2876  *
2877  * This routine may not be called from an interrupt.
2878  *
2879  * No other requirements.
2880  */
2881 vm_page_t
2882 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2883 {
2884 	vm_page_t m;
2885 	int error;
2886 	int shared = 1;
2887 
2888 	KKASSERT(allocflags &
2889 		(VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
2890 	vm_object_hold_shared(object);
2891 	for (;;) {
2892 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
2893 		if (error) {
2894 			vm_page_sleep_busy(m, TRUE, "pgrbwt");
2895 			if ((allocflags & VM_ALLOC_RETRY) == 0) {
2896 				m = NULL;
2897 				break;
2898 			}
2899 			/* retry */
2900 		} else if (m == NULL) {
2901 			if (shared) {
2902 				vm_object_upgrade(object);
2903 				shared = 0;
2904 			}
2905 			if (allocflags & VM_ALLOC_RETRY)
2906 				allocflags |= VM_ALLOC_NULL_OK;
2907 			m = vm_page_alloc(object, pindex,
2908 					  allocflags & ~VM_ALLOC_RETRY);
2909 			if (m)
2910 				break;
2911 			vm_wait(0);
2912 			if ((allocflags & VM_ALLOC_RETRY) == 0)
2913 				goto failed;
2914 		} else {
2915 			/* m found */
2916 			break;
2917 		}
2918 	}
2919 
2920 	/*
2921 	 * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
2922 	 *
2923 	 * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
2924 	 * valid even if already valid.
2925 	 *
2926 	 * NOTE!  We have removed all of the PG_ZERO optimizations and also
2927 	 *	  removed the idle zeroing code.  These optimizations actually
2928 	 *	  slow things down on modern cpus because the zero'd area is
2929 	 *	  likely uncached, placing a memory-access burden on the
2930 	 *	  accessors taking the fault.
2931 	 *
2932 	 *	  By always zeroing the page in-line with the fault, no
2933 	 *	  dynamic ram reads are needed and the caches are hot, ready
2934 	 *	  for userland to access the memory.
2935 	 */
2936 	if (m->valid == 0) {
2937 		if (allocflags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
2938 			pmap_zero_page(VM_PAGE_TO_PHYS(m));
2939 			m->valid = VM_PAGE_BITS_ALL;
2940 		}
2941 	} else if (allocflags & VM_ALLOC_FORCE_ZERO) {
2942 		pmap_zero_page(VM_PAGE_TO_PHYS(m));
2943 		m->valid = VM_PAGE_BITS_ALL;
2944 	}
2945 failed:
2946 	vm_object_drop(object);
2947 	return(m);
2948 }
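
/*
 * Example (sketch): grabbing a zero-filled, valid page at (obj, pi),
 * where obj and pi are supplied by the caller.  With VM_ALLOC_RETRY
 * the call blocks as needed and never returns NULL.
 *
 *	vm_page_t m;
 *
 *	m = vm_page_grab(obj, pi, VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
 *				  VM_ALLOC_ZERO);
 *	... m is returned busied; use it, then ...
 *	vm_page_wakeup(m);
 */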
2949 
2950 /*
2951  * Mapping function for valid bits or for dirty bits in
2952  * a page.  May not block.
2953  *
2954  * Inputs are required to range within a page.
2955  *
2956  * No requirements.
2957  * Non blocking.
2958  */
2959 int
2960 vm_page_bits(int base, int size)
2961 {
2962 	int first_bit;
2963 	int last_bit;
2964 
2965 	KASSERT(
2966 	    base + size <= PAGE_SIZE,
2967 	    ("vm_page_bits: illegal base/size %d/%d", base, size)
2968 	);
2969 
2970 	if (size == 0)		/* handle degenerate case */
2971 		return(0);
2972 
2973 	first_bit = base >> DEV_BSHIFT;
2974 	last_bit = (base + size - 1) >> DEV_BSHIFT;
2975 
2976 	return ((2 << last_bit) - (1 << first_bit));
2977 }
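
/*
 * Worked example, assuming DEV_BSIZE is 512 (DEV_BSHIFT == 9):
 *
 *	vm_page_bits(512, 1024)
 *	    first_bit = 512 >> 9              = 1
 *	    last_bit  = (512 + 1024 - 1) >> 9 = 2
 *	    result    = (2 << 2) - (1 << 1)   = 0x06
 *
 * i.e. bits 1 and 2 are set, covering byte offsets 512-1535, the two
 * DEV_BSIZE chunks touched by the range.
 */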
2978 
2979 /*
2980  * Sets portions of a page valid and clean.  The arguments are expected
2981  * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
2982  * of any partial chunks touched by the range.  The invalid portion of
2983  * such chunks will be zero'd.
2984  *
2985  * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
2986  *	 align base to DEV_BSIZE so as not to mark clean a partially
2987  *	 truncated device block.  Otherwise the dirty page status might be
2988  *	 lost.
2989  *
2990  * This routine may not block.
2991  *
2992  * (base + size) must be less than or equal to PAGE_SIZE.
2993  */
2994 static void
2995 _vm_page_zero_valid(vm_page_t m, int base, int size)
2996 {
2997 	int frag;
2998 	int endoff;
2999 
3000 	if (size == 0)	/* handle degenerate case */
3001 		return;
3002 
3003 	/*
3004 	 * If the base is not DEV_BSIZE aligned and the valid
3005 	 * bit is clear, we have to zero out a portion of the
3006 	 * first block.
3007 	 */
3008 
3009 	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
3010 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
3011 	) {
3012 		pmap_zero_page_area(
3013 		    VM_PAGE_TO_PHYS(m),
3014 		    frag,
3015 		    base - frag
3016 		);
3017 	}
3018 
3019 	/*
3020 	 * If the ending offset is not DEV_BSIZE aligned and the
3021 	 * valid bit is clear, we have to zero out a portion of
3022 	 * the last block.
3023 	 */
3024 
3025 	endoff = base + size;
3026 
3027 	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
3028 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
3029 	) {
3030 		pmap_zero_page_area(
3031 		    VM_PAGE_TO_PHYS(m),
3032 		    endoff,
3033 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
3034 		);
3035 	}
3036 }
3037 
3038 /*
3039  * Set valid, clear dirty bits.  If validating the entire
3040  * page we can safely clear the pmap modify bit.  We also
3041  * use this opportunity to clear the PG_NOSYNC flag.  If a process
3042  * takes a write fault on a MAP_NOSYNC memory area the flag will
3043  * be set again.
3044  *
3045  * We set valid bits inclusive of any overlap, but we can only
3046  * clear dirty bits for DEV_BSIZE chunks that are fully within
3047  * the range.
3048  *
3049  * Page must be busied?
3050  * No other requirements.
3051  */
3052 void
3053 vm_page_set_valid(vm_page_t m, int base, int size)
3054 {
3055 	_vm_page_zero_valid(m, base, size);
3056 	m->valid |= vm_page_bits(base, size);
3057 }
3058 
3059 
3060 /*
3061  * Set valid bits and clear dirty bits.
3062  *
3063  * Page must be busied by caller.
3064  *
3065  * NOTE: This function does not clear the pmap modified bit.
3066  *	 Also note that e.g. NFS may use a byte-granular base
3067  *	 and size.
3068  *
3069  * No other requirements.
3070  */
3071 void
3072 vm_page_set_validclean(vm_page_t m, int base, int size)
3073 {
3074 	int pagebits;
3075 
3076 	_vm_page_zero_valid(m, base, size);
3077 	pagebits = vm_page_bits(base, size);
3078 	m->valid |= pagebits;
3079 	m->dirty &= ~pagebits;
3080 	if (base == 0 && size == PAGE_SIZE) {
3081 		/*pmap_clear_modify(m);*/
3082 		vm_page_flag_clear(m, PG_NOSYNC);
3083 	}
3084 }
3085 
3086 /*
3087  * Set valid & dirty.  Used by buwrite()
3088  *
3089  * Page must be busied by caller.
3090  */
3091 void
3092 vm_page_set_validdirty(vm_page_t m, int base, int size)
3093 {
3094 	int pagebits;
3095 
3096 	pagebits = vm_page_bits(base, size);
3097 	m->valid |= pagebits;
3098 	m->dirty |= pagebits;
3099 	if (m->object)
3100 	       vm_object_set_writeable_dirty(m->object);
3101 }
3102 
3103 /*
3104  * Clear dirty bits.
3105  *
3106  * NOTE: This function does not clear the pmap modified bit.
3107  *	 Also note that e.g. NFS may use a byte-granular base
3108  *	 and size.
3109  *
3110  * Page must be busied?
3111  * No other requirements.
3112  */
3113 void
3114 vm_page_clear_dirty(vm_page_t m, int base, int size)
3115 {
3116 	m->dirty &= ~vm_page_bits(base, size);
3117 	if (base == 0 && size == PAGE_SIZE) {
3118 		/*pmap_clear_modify(m);*/
3119 		vm_page_flag_clear(m, PG_NOSYNC);
3120 	}
3121 }
3122 
3123 /*
3124  * Make the page all-dirty.
3125  *
3126  * Also make sure the related object and vnode reflect the fact that the
3127  * object may now contain a dirty page.
3128  *
3129  * Page must be busied?
3130  * No other requirements.
3131  */
3132 void
3133 vm_page_dirty(vm_page_t m)
3134 {
3135 #ifdef INVARIANTS
3136 	int pqtype = m->queue - m->pc;
3137 #endif
3138 	KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
3139 		("vm_page_dirty: page in free/cache queue!"));
3140 	if (m->dirty != VM_PAGE_BITS_ALL) {
3141 		m->dirty = VM_PAGE_BITS_ALL;
3142 		if (m->object)
3143 			vm_object_set_writeable_dirty(m->object);
3144 	}
3145 }
3146 
3147 /*
3148  * Invalidates DEV_BSIZE'd chunks within a page.  Both the
3149  * valid and dirty bits for the effected areas are cleared.
3150  * valid and dirty bits for the affected areas are cleared.
3151  * Page must be busied?
3152  * Does not block.
3153  * No other requirements.
3154  */
3155 void
3156 vm_page_set_invalid(vm_page_t m, int base, int size)
3157 {
3158 	int bits;
3159 
3160 	bits = vm_page_bits(base, size);
3161 	m->valid &= ~bits;
3162 	m->dirty &= ~bits;
3163 	atomic_add_int(&m->object->generation, 1);
3164 }
3165 
3166 /*
3167  * The kernel assumes that the invalid portions of a page contain
3168  * garbage, but such pages can be mapped into memory by user code.
3169  * When this occurs, we must zero out the non-valid portions of the
3170  * page so user code sees what it expects.
3171  *
3172  * Pages are most often semi-valid when the end of a file is mapped
3173  * into memory and the file's size is not page aligned.
3174  *
3175  * Page must be busied?
3176  * No other requirements.
3177  */
3178 void
3179 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
3180 {
3181 	int b;
3182 	int i;
3183 
3184 	/*
3185 	 * Scan the valid bits looking for invalid sections that
3186 	 * must be zero'd.  Invalid sub-DEV_BSIZE'd areas (where the
3187 	 * valid bit may be set) have already been zero'd by
3188 	 * vm_page_set_validclean().
3189 	 */
3190 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
3191 		if (i == (PAGE_SIZE / DEV_BSIZE) ||
3192 		    (m->valid & (1 << i))
3193 		) {
3194 			if (i > b) {
3195 				pmap_zero_page_area(
3196 				    VM_PAGE_TO_PHYS(m),
3197 				    b << DEV_BSHIFT,
3198 				    (i - b) << DEV_BSHIFT
3199 				);
3200 			}
3201 			b = i + 1;
3202 		}
3203 	}
3204 
3205 	/*
3206 	 * setvalid is TRUE when we can safely set the zero'd areas
3207 	 * as being valid.  We can do this if there are no cache consistency
3208 	 * issues, e.g. it is ok to do with UFS but not ok to do with NFS.
3209 	 */
3210 	if (setvalid)
3211 		m->valid = VM_PAGE_BITS_ALL;
3212 }
3213 
3214 /*
3215  * Is a (partial) page valid?  Note that in the degenerate case where
3216  * size == 0 this returns FALSE if the page is entirely invalid, and
3217  * TRUE otherwise.
3218  *
3219  * Does not block.
3220  * No other requirements.
3221  */
3222 int
3223 vm_page_is_valid(vm_page_t m, int base, int size)
3224 {
3225 	int bits = vm_page_bits(base, size);
3226 
3227 	if (m->valid && ((m->valid & bits) == bits))
3228 		return 1;
3229 	else
3230 		return 0;
3231 }
3232 
3233 /*
3234  * update dirty bits from pmap/mmu.  May not block.
3235  *
3236  * Caller must hold the page busy
3237  */
3238 void
3239 vm_page_test_dirty(vm_page_t m)
3240 {
3241 	if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
3242 		vm_page_dirty(m);
3243 	}
3244 }
3245 
3246 /*
3247  * Register an action, associating it with its vm_page
3248  */
3249 void
3250 vm_page_register_action(vm_page_action_t action, vm_page_event_t event)
3251 {
3252 	struct vm_page_action_hash *hash;
3253 	int hv;
3254 
3255 	hv = (int)((intptr_t)action->m >> 8) & VMACTION_HMASK;
3256 	hash = &action_hash[hv];
3257 
3258 	lockmgr(&hash->lk, LK_EXCLUSIVE);
3259 	vm_page_flag_set(action->m, PG_ACTIONLIST);
3260 	action->event = event;
3261 	LIST_INSERT_HEAD(&hash->list, action, entry);
3262 	lockmgr(&hash->lk, LK_RELEASE);
3263 }
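
/*
 * Example (sketch): hooking a callback to a page event.  The callback
 * name is illustrative; the action structure is assumed to be embedded
 * in a caller-owned structure and to remain valid until the event fires
 * or the action is unregistered.
 *
 *	static void
 *	my_page_callback(vm_page_t m, vm_page_action_t action)
 *	{
 *		... react to the event ...
 *	}
 *
 *	action->m = m;
 *	action->func = my_page_callback;
 *	vm_page_register_action(action, VMEVENT_COW);
 */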
3264 
3265 /*
3266  * Unregister an action, disassociating it from its related vm_page
3267  */
3268 void
3269 vm_page_unregister_action(vm_page_action_t action)
3270 {
3271 	struct vm_page_action_hash *hash;
3272 	int hv;
3273 
3274 	hv = (int)((intptr_t)action->m >> 8) & VMACTION_HMASK;
3275 	hash = &action_hash[hv];
3276 	lockmgr(&hash->lk, LK_EXCLUSIVE);
3277 	if (action->event != VMEVENT_NONE) {
3278 		action->event = VMEVENT_NONE;
3279 		LIST_REMOVE(action, entry);
3280 
3281 		if (LIST_EMPTY(&hash->list))
3282 			vm_page_flag_clear(action->m, PG_ACTIONLIST);
3283 	}
3284 	lockmgr(&hash->lk, LK_RELEASE);
3285 }
3286 
3287 /*
3288  * Issue an event on a VM page.  Corresponding action structures are
3289  * removed from the page's list and called.
3290  *
3291  * If the vm_page has no more pending action events we clear its
3292  * PG_ACTIONLIST flag.
3293  */
3294 void
3295 vm_page_event_internal(vm_page_t m, vm_page_event_t event)
3296 {
3297 	struct vm_page_action_hash *hash;
3298 	struct vm_page_action *scan;
3299 	struct vm_page_action *next;
3300 	int hv;
3301 	int all;
3302 
3303 	hv = (int)((intptr_t)m >> 8) & VMACTION_HMASK;
3304 	hash = &action_hash[hv];
3305 	all = 1;
3306 
3307 	lockmgr(&hash->lk, LK_EXCLUSIVE);
3308 	LIST_FOREACH_MUTABLE(scan, &hash->list, entry, next) {
3309 		if (scan->m == m) {
3310 			if (scan->event == event) {
3311 				scan->event = VMEVENT_NONE;
3312 				LIST_REMOVE(scan, entry);
3313 				scan->func(m, scan);
3314 				/* XXX */
3315 			} else {
3316 				all = 0;
3317 			}
3318 		}
3319 	}
3320 	if (all)
3321 		vm_page_flag_clear(m, PG_ACTIONLIST);
3322 	lockmgr(&hash->lk, LK_RELEASE);
3323 }
3324 
3325 #include "opt_ddb.h"
3326 #ifdef DDB
3327 #include <ddb/ddb.h>
3328 
3329 DB_SHOW_COMMAND(page, vm_page_print_page_info)
3330 {
3331 	db_printf("vmstats.v_free_count: %d\n", vmstats.v_free_count);
3332 	db_printf("vmstats.v_cache_count: %d\n", vmstats.v_cache_count);
3333 	db_printf("vmstats.v_inactive_count: %d\n", vmstats.v_inactive_count);
3334 	db_printf("vmstats.v_active_count: %d\n", vmstats.v_active_count);
3335 	db_printf("vmstats.v_wire_count: %d\n", vmstats.v_wire_count);
3336 	db_printf("vmstats.v_free_reserved: %d\n", vmstats.v_free_reserved);
3337 	db_printf("vmstats.v_free_min: %d\n", vmstats.v_free_min);
3338 	db_printf("vmstats.v_free_target: %d\n", vmstats.v_free_target);
3339 	db_printf("vmstats.v_cache_min: %d\n", vmstats.v_cache_min);
3340 	db_printf("vmstats.v_inactive_target: %d\n", vmstats.v_inactive_target);
3341 }
3342 
3343 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
3344 {
3345 	int i;
3346 	db_printf("PQ_FREE:");
3347 	for (i = 0; i < PQ_L2_SIZE; i++) {
3348 		db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
3349 	}
3350 	db_printf("\n");
3351 
3352 	db_printf("PQ_CACHE:");
3353 	for (i = 0; i < PQ_L2_SIZE; i++) {
3354 		db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
3355 	}
3356 	db_printf("\n");
3357 
3358 	db_printf("PQ_ACTIVE:");
3359 	for (i = 0; i < PQ_L2_SIZE; i++) {
3360 		db_printf(" %d", vm_page_queues[PQ_ACTIVE + i].lcnt);
3361 	}
3362 	db_printf("\n");
3363 
3364 	db_printf("PQ_INACTIVE:");
3365 	for (i = 0; i < PQ_L2_SIZE; i++) {
3366 		db_printf(" %d", vm_page_queues[PQ_INACTIVE + i].lcnt);
3367 	}
3368 	db_printf("\n");
3369 }
3370 #endif /* DDB */
3371