xref: /dragonfly/sys/vm/vm_page.c (revision b97d93a4)
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
37  * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
38  */
39 
40 /*
41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42  * All rights reserved.
43  *
44  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45  *
46  * Permission to use, copy, modify and distribute this software and
47  * its documentation is hereby granted, provided that both the copyright
48  * notice and this permission notice appear in all copies of the
49  * software, derivative works or modified versions, and any portions
50  * thereof, and that both notices appear in supporting documentation.
51  *
52  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55  *
56  * Carnegie Mellon requests users of this software to return to
57  *
58  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59  *  School of Computer Science
60  *  Carnegie Mellon University
61  *  Pittsburgh PA 15213-3890
62  *
63  * any improvements or extensions that they make and grant Carnegie the
64  * rights to redistribute these changes.
65  */
66 /*
67  * Resident memory management module.  The module manipulates 'VM pages'.
68  * A VM page is the core building block for memory management.
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/malloc.h>
74 #include <sys/proc.h>
75 #include <sys/vmmeter.h>
76 #include <sys/vnode.h>
77 #include <sys/kernel.h>
78 #include <sys/alist.h>
79 #include <sys/sysctl.h>
80 #include <sys/cpu_topology.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <sys/lock.h>
85 #include <vm/vm_kern.h>
86 #include <vm/pmap.h>
87 #include <vm/vm_map.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_pageout.h>
91 #include <vm/vm_pager.h>
92 #include <vm/vm_extern.h>
93 #include <vm/swap_pager.h>
94 
95 #include <machine/inttypes.h>
96 #include <machine/md_var.h>
97 #include <machine/specialreg.h>
98 #include <machine/bus_dma.h>
99 
100 #include <vm/vm_page2.h>
101 #include <sys/spinlock2.h>
102 
103 /*
104  * SET - Minimum required set associative size, must be a power of 2.  We
105  *	 want this to match or exceed the set associativity of the cpu.
106  *
107  * GRP - A larger set that allows bleed-over into the domains of other
108  *	 nearby cpus.  Also must be a power of 2.  Used by the page zeroing
109  *	 code to smooth things out a bit.
110  */
111 #define PQ_SET_ASSOC		16
112 #define PQ_SET_ASSOC_MASK	(PQ_SET_ASSOC - 1)
113 
114 #define PQ_GRP_ASSOC		(PQ_SET_ASSOC * 2)
115 #define PQ_GRP_ASSOC_MASK	(PQ_GRP_ASSOC - 1)
116 
117 static void vm_page_queue_init(void);
118 static void vm_page_free_wakeup(void);
119 static vm_page_t vm_page_select_cache(u_short pg_color);
120 static vm_page_t _vm_page_list_find2(int basequeue, int index);
121 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
122 static void vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes);
123 
124 /*
125  * Array of tailq lists
126  */
127 __cachealign struct vpgqueues vm_page_queues[PQ_COUNT];
128 
129 static volatile int vm_pages_waiting;
130 static struct alist vm_contig_alist;
131 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
132 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
133 
134 static u_long vm_dma_reserved = 0;
135 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
136 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
137 	    "Memory reserved for DMA");
138 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
139 	    &vm_contig_alist.bl_free, 0, "Free pages in DMA reserve");
140 
141 static int vm_contig_verbose = 0;
142 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
143 
144 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
145 	     vm_pindex_t, pindex);
146 
147 static void
148 vm_page_queue_init(void)
149 {
150 	int i;
151 
152 	for (i = 0; i < PQ_L2_SIZE; i++)
153 		vm_page_queues[PQ_FREE+i].cnt_offset =
154 			offsetof(struct vmstats, v_free_count);
155 	for (i = 0; i < PQ_L2_SIZE; i++)
156 		vm_page_queues[PQ_CACHE+i].cnt_offset =
157 			offsetof(struct vmstats, v_cache_count);
158 	for (i = 0; i < PQ_L2_SIZE; i++)
159 		vm_page_queues[PQ_INACTIVE+i].cnt_offset =
160 			offsetof(struct vmstats, v_inactive_count);
161 	for (i = 0; i < PQ_L2_SIZE; i++)
162 		vm_page_queues[PQ_ACTIVE+i].cnt_offset =
163 			offsetof(struct vmstats, v_active_count);
164 	for (i = 0; i < PQ_L2_SIZE; i++)
165 		vm_page_queues[PQ_HOLD+i].cnt_offset =
166 			offsetof(struct vmstats, v_active_count);
167 	/* PQ_NONE has no queue */
168 
169 	for (i = 0; i < PQ_COUNT; i++) {
170 		TAILQ_INIT(&vm_page_queues[i].pl);
171 		spin_init(&vm_page_queues[i].spin, "vm_page_queue_init");
172 	}
173 }
174 
175 /*
176  * note: place in initialized data section?  Is this necessary?
177  */
178 vm_pindex_t first_page = 0;
179 vm_pindex_t vm_page_array_size = 0;
180 vm_page_t vm_page_array = NULL;
181 vm_paddr_t vm_low_phys_reserved;
182 
183 /*
184  * (low level boot)
185  *
186  * Sets the page size, perhaps based upon the memory size.
187  * Must be called before any use of page-size dependent functions.
188  */
189 void
190 vm_set_page_size(void)
191 {
192 	if (vmstats.v_page_size == 0)
193 		vmstats.v_page_size = PAGE_SIZE;
194 	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
195 		panic("vm_set_page_size: page size not a power of two");
196 }
197 
198 /*
199  * (low level boot)
200  *
201  * Add a new page to the freelist for use by the system.  New pages
202  * are added to both the head and tail of the associated free page
203  * queue in a bottom-up fashion, so both zero'd and non-zero'd page
204  * requests pull 'recent' adds (higher physical addresses) first.
205  *
206  * Beware that the page zeroing daemon will also be running soon after
207  * boot, moving pages from the head to the tail of the PQ_FREE queues.
208  *
209  * Must be called in a critical section.
210  */
211 static void
212 vm_add_new_page(vm_paddr_t pa)
213 {
214 	struct vpgqueues *vpq;
215 	vm_page_t m;
216 
217 	m = PHYS_TO_VM_PAGE(pa);
218 	m->phys_addr = pa;
219 	m->flags = 0;
220 	m->pat_mode = PAT_WRITE_BACK;
221 	m->pc = (pa >> PAGE_SHIFT);
222 
223 	/*
224 	 * Twist for cpu localization in addition to page coloring, so
225 	 * different cpus selecting by m->queue get different page colors.
226 	 */
227 	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
228 	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
229 	m->pc &= PQ_L2_MASK;
230 
231 	/*
232 	 * Reserve a certain number of contiguous low memory pages for
233 	 * contigmalloc() to use.
234 	 */
235 	if (pa < vm_low_phys_reserved) {
236 		atomic_add_long(&vmstats.v_page_count, 1);
237 		atomic_add_long(&vmstats.v_dma_pages, 1);
238 		m->queue = PQ_NONE;
239 		m->wire_count = 1;
240 		atomic_add_long(&vmstats.v_wire_count, 1);
241 		alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
242 		return;
243 	}
244 
245 	/*
246 	 * General page
247 	 */
248 	m->queue = m->pc + PQ_FREE;
249 	KKASSERT(m->dirty == 0);
250 
251 	atomic_add_long(&vmstats.v_page_count, 1);
252 	atomic_add_long(&vmstats.v_free_count, 1);
253 	vpq = &vm_page_queues[m->queue];
254 	TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
255 	++vpq->lcnt;
256 }
257 
258 /*
259  * (low level boot)
260  *
261  * Initializes the resident memory module.
262  *
263  * Preallocates memory for critical VM structures and arrays prior to
264  * kernel_map becoming available.
265  *
266  * Memory is allocated from (virtual2_start, virtual2_end) if available,
267  * otherwise memory is allocated from (virtual_start, virtual_end).
268  *
269  * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
270  * large enough to hold vm_page_array & other structures for machines with
271  * large amounts of ram, so we want to use virtual2* when available.
272  */
273 void
274 vm_page_startup(void)
275 {
276 	vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
277 	vm_offset_t mapped;
278 	vm_pindex_t npages;
279 	vm_paddr_t page_range;
280 	vm_paddr_t new_end;
281 	int i;
282 	vm_paddr_t pa;
283 	vm_paddr_t last_pa;
284 	vm_paddr_t end;
285 	vm_paddr_t biggestone, biggestsize;
286 	vm_paddr_t total;
287 	vm_page_t m;
288 
289 	total = 0;
290 	biggestsize = 0;
291 	biggestone = 0;
292 	vaddr = round_page(vaddr);
293 
294 	/*
295 	 * Make sure ranges are page-aligned.
296 	 */
297 	for (i = 0; phys_avail[i].phys_end; ++i) {
298 		phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
299 		phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
300 		if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
301 			phys_avail[i].phys_end = phys_avail[i].phys_beg;
302 	}
303 
304 	/*
305 	 * Locate largest block
306 	 */
307 	for (i = 0; phys_avail[i].phys_end; ++i) {
308 		vm_paddr_t size = phys_avail[i].phys_end -
309 				  phys_avail[i].phys_beg;
310 
311 		if (size > biggestsize) {
312 			biggestone = i;
313 			biggestsize = size;
314 		}
315 		total += size;
316 	}
317 	--i;	/* adjust to last entry for use down below */
318 
319 	end = phys_avail[biggestone].phys_end;
320 	end = trunc_page(end);
321 
322 	/*
323 	 * Initialize the queue headers for the free queue, the active queue
324 	 * and the inactive queue.
325 	 */
326 	vm_page_queue_init();
327 
328 #if !defined(_KERNEL_VIRTUAL)
329 	/*
330 	 * VKERNELs don't support minidumps and as such don't need
331 	 * vm_page_dump
332 	 *
333 	 * Allocate a bitmap to indicate that a random physical page
334 	 * needs to be included in a minidump.
335 	 *
336 	 * The amd64 port needs this to indicate which direct map pages
337 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
338 	 *
339 	 * However, x86 still needs this workspace internally within the
340 	 * minidump code.  In theory, they are not needed on x86, but are
341 	 * included should the sf_buf code decide to use them.
342 	 */
343 	page_range = phys_avail[i].phys_end / PAGE_SIZE;
344 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
345 	end -= vm_page_dump_size;
346 	vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
347 					VM_PROT_READ | VM_PROT_WRITE);
348 	bzero((void *)vm_page_dump, vm_page_dump_size);
349 #endif
350 	/*
351 	 * Compute the number of pages of memory that will be available for
352 	 * use (taking into account the overhead of a page structure per
353 	 * page).
354 	 */
355 	first_page = phys_avail[0].phys_beg / PAGE_SIZE;
356 	page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
357 	npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
358 
359 #ifndef _KERNEL_VIRTUAL
360 	/*
361 	 * (only applies to real kernels)
362 	 *
363 	 * Reserve a large amount of low memory for potential 32-bit DMA
364 	 * space allocations.  Once device initialization is complete we
365 	 * release most of it, but keep (vm_dma_reserved) memory reserved
366 	 * for later use.  Typically for X / graphics.  Through trial and
367  * error we find that GPUs usually require ~60-100MB or so.
368 	 *
369 	 * By default, 128M is left in reserve on machines with 2G+ of ram.
370 	 */
371 	vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
372 	if (vm_low_phys_reserved > total / 4)
373 		vm_low_phys_reserved = total / 4;
374 	if (vm_dma_reserved == 0) {
375 		vm_dma_reserved = 128 * 1024 * 1024;	/* 128MB */
376 		if (vm_dma_reserved > total / 16)
377 			vm_dma_reserved = total / 16;
378 	}
379 #endif
380 	alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
381 		   ALIST_RECORDS_65536);
382 
383 	/*
384 	 * Initialize the mem entry structures now, and put them in the free
385 	 * queue.
386 	 */
387 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
388 		kprintf("initializing vm_page_array ");
389 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
390 	mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
391 	vm_page_array = (vm_page_t)mapped;
392 
393 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
394 	/*
395 	 * since pmap_map on amd64 returns stuff out of a direct-map region,
396 	 * we have to manually add these pages to the minidump tracking so
397 	 * that they can be dumped, including the vm_page_array.
398 	 */
399 	for (pa = new_end;
400 	     pa < phys_avail[biggestone].phys_end;
401 	     pa += PAGE_SIZE) {
402 		dump_add_page(pa);
403 	}
404 #endif
405 
406 	/*
407 	 * Clear all of the page structures, run basic initialization so
408 	 * PHYS_TO_VM_PAGE() operates properly even on pages not in the
409 	 * map.
410 	 */
411 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
412 	vm_page_array_size = page_range;
413 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
414 		kprintf("size = 0x%zx\n", vm_page_array_size);
415 
416 	m = &vm_page_array[0];
417 	pa = ptoa(first_page);
418 	for (i = 0; i < page_range; ++i) {
419 		spin_init(&m->spin, "vm_page");
420 		m->phys_addr = pa;
421 		pa += PAGE_SIZE;
422 		++m;
423 	}
424 
425 	/*
426 	 * Construct the free queue(s) in ascending order (by physical
427 	 * address) so that the first 16MB of physical memory is allocated
428 	 * last rather than first.  On large-memory machines, this avoids
429 	 * the exhaustion of low physical memory before isa_dma_init has run.
430 	 */
431 	vmstats.v_page_count = 0;
432 	vmstats.v_free_count = 0;
433 	for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
434 		pa = phys_avail[i].phys_beg;
435 		if (i == biggestone)
436 			last_pa = new_end;
437 		else
438 			last_pa = phys_avail[i].phys_end;
439 		while (pa < last_pa && npages-- > 0) {
440 			vm_add_new_page(pa);
441 			pa += PAGE_SIZE;
442 		}
443 	}
444 	if (virtual2_start)
445 		virtual2_start = vaddr;
446 	else
447 		virtual_start = vaddr;
448 	mycpu->gd_vmstats = vmstats;
449 }
450 
451 /*
452  * Reorganize VM pages based on numa data.  May be called as many times as
453  * necessary.  Will reorganize the vm_page_t page color and related queue(s)
454  * to allow vm_page_alloc() to choose pages based on socket affinity.
455  *
456  * NOTE: This function is only called while we are still in UP mode, so
457  *	 we only need a critical section to protect the queues (which
458  *	 saves a lot of time; there are likely a ton of pages).
459  */
460 void
461 vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
462 {
463 	vm_paddr_t scan_beg;
464 	vm_paddr_t scan_end;
465 	vm_paddr_t ran_end;
466 	struct vpgqueues *vpq;
467 	vm_page_t m;
468 	vm_page_t mend;
469 	int i;
470 	int socket_mod;
471 	int socket_value;
472 
473 	/*
474 	 * Nothing to do if there is no physical topology information or
475 	 * if there is only one socket.
476 	 */
477 	if (cpu_topology_phys_ids <= 1 ||
478 	    cpu_topology_core_ids == 0) {
479 		return;
480 	}
481 
482 	/*
483 	 * Setup for our iteration.  Note that ACPI may iterate CPU
484 	 * sockets starting at 0 or 1 or some other number.  The
485 	 * cpu_topology code mod's it against the socket count.
486 	 */
487 	ran_end = ran_beg + bytes;
488 
489 	socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
490 	socket_value = (physid % cpu_topology_phys_ids) * socket_mod;
491 	mend = &vm_page_array[vm_page_array_size];
492 
493 	crit_enter();
494 
495 	/*
496 	 * Adjust cpu_topology's phys_mem parameter
497 	 */
498 	if (root_cpu_node)
499 		vm_numa_add_topology_mem(root_cpu_node, physid, (long)bytes);
500 
501 	/*
502 	 * Adjust vm_page->pc and requeue all affected pages.  The
503 	 * allocator will then be able to localize memory allocations
504 	 * to some degree.
505 	 */
506 	for (i = 0; phys_avail[i].phys_end; ++i) {
507 		scan_beg = phys_avail[i].phys_beg;
508 		scan_end = phys_avail[i].phys_end;
509 		if (scan_end <= ran_beg)
510 			continue;
511 		if (scan_beg >= ran_end)
512 			continue;
513 		if (scan_beg < ran_beg)
514 			scan_beg = ran_beg;
515 		if (scan_end > ran_end)
516 			scan_end = ran_end;
517 		if (atop(scan_end) > first_page + vm_page_array_size)
518 			scan_end = ptoa(first_page + vm_page_array_size);
519 
520 		m = PHYS_TO_VM_PAGE(scan_beg);
521 		while (scan_beg < scan_end) {
522 			KKASSERT(m < mend);
523 			if (m->queue != PQ_NONE) {
524 				vpq = &vm_page_queues[m->queue];
525 				TAILQ_REMOVE(&vpq->pl, m, pageq);
526 				--vpq->lcnt;
527 				/* queue doesn't change, no need to adj cnt */
528 				m->queue -= m->pc;
529 				m->pc %= socket_mod;
530 				m->pc += socket_value;
531 				m->pc &= PQ_L2_MASK;
532 				m->queue += m->pc;
533 				vpq = &vm_page_queues[m->queue];
534 				TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
535 				++vpq->lcnt;
536 				/* queue doesn't change, no need to adj cnt */
537 			} else {
538 				m->pc %= socket_mod;
539 				m->pc += socket_value;
540 				m->pc &= PQ_L2_MASK;
541 			}
542 			scan_beg += PAGE_SIZE;
543 			++m;
544 		}
545 	}
546 	crit_exit();
547 }
548 
549 static
550 void
551 vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes)
552 {
553 	int cpuid;
554 	int i;
555 
556 	switch(cpup->type) {
557 	case PACKAGE_LEVEL:
558 		cpup->phys_mem += bytes;
559 		break;
560 	case CHIP_LEVEL:
561 		/*
562 		 * All members should have the same chipid, so we only need
563 		 * to pull out one member.
564 		 */
565 		if (CPUMASK_TESTNZERO(cpup->members)) {
566 			cpuid = BSFCPUMASK(cpup->members);
567 			if (physid ==
568 			    get_chip_ID_from_APICID(CPUID_TO_APICID(cpuid))) {
569 				cpup->phys_mem += bytes;
570 			}
571 		}
572 		break;
573 	case CORE_LEVEL:
574 	case THREAD_LEVEL:
575 		/*
576 		 * Just inherit from the parent node
577 		 */
578 		cpup->phys_mem = cpup->parent_node->phys_mem;
579 		break;
580 	}
581 	for (i = 0; i < MAXCPU && cpup->child_node[i]; ++i)
582 		vm_numa_add_topology_mem(cpup->child_node[i], physid, bytes);
583 }
584 
585 /*
586  * We tended to reserve a ton of memory for contigmalloc().  Now that most
587  * drivers have initialized we want to return most of the remaining free
588  * reserve back to the VM page queues so they can be used for normal
589  * allocations.
590  *
591  * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
592  */
593 static void
594 vm_page_startup_finish(void *dummy __unused)
595 {
596 	alist_blk_t blk;
597 	alist_blk_t rblk;
598 	alist_blk_t count;
599 	alist_blk_t xcount;
600 	alist_blk_t bfree;
601 	vm_page_t m;
602 
603 	spin_lock(&vm_contig_spin);
604 	for (;;) {
605 		bfree = alist_free_info(&vm_contig_alist, &blk, &count);
606 		if (bfree <= vm_dma_reserved / PAGE_SIZE)
607 			break;
608 		if (count == 0)
609 			break;
610 
611 		/*
612 		 * Figure out how much of the initial reserve we have to
613 		 * free in order to reach our target.
614 		 */
615 		bfree -= vm_dma_reserved / PAGE_SIZE;
616 		if (count > bfree) {
617 			blk += count - bfree;
618 			count = bfree;
619 		}
620 
621 		/*
622 		 * Calculate the nearest power of 2 <= count.
623 		 */
624 		for (xcount = 1; xcount <= count; xcount <<= 1)
625 			;
626 		xcount >>= 1;
627 		blk += count - xcount;
628 		count = xcount;
629 
630 		/*
631 		 * Allocate the pages from the alist, then free them to
632 		 * the normal VM page queues.
633 		 *
634 		 * Pages allocated from the alist are wired.  We have to
635 		 * busy, unwire, and free them.  We must also adjust
636 		 * vm_low_phys_reserved before freeing any pages to prevent
637 		 * confusion.
638 		 */
639 		rblk = alist_alloc(&vm_contig_alist, blk, count);
640 		if (rblk != blk) {
641 			kprintf("vm_page_startup_finish: Unable to return "
642 				"dma space @0x%08x/%d -> 0x%08x\n",
643 				blk, count, rblk);
644 			break;
645 		}
646 		atomic_add_long(&vmstats.v_dma_pages, -(long)count);
647 		spin_unlock(&vm_contig_spin);
648 
649 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
650 		vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
651 		while (count) {
652 			vm_page_busy_wait(m, FALSE, "cpgfr");
653 			vm_page_unwire(m, 0);
654 			vm_page_free(m);
655 			--count;
656 			++m;
657 		}
658 		spin_lock(&vm_contig_spin);
659 	}
660 	spin_unlock(&vm_contig_spin);
661 
662 	/*
663 	 * Print out how much DMA space drivers have already allocated and
664 	 * how much is left over.
665 	 */
666 	kprintf("DMA space used: %jdk, remaining available: %jdk\n",
667 		(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
668 		(PAGE_SIZE / 1024),
669 		(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
670 }
671 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
672 	vm_page_startup_finish, NULL);
673 
674 
675 /*
676  * Scan comparison function for Red-Black tree scans.  An inclusive
677  * (start,end) is expected.  Other fields are not used.
678  */
679 int
680 rb_vm_page_scancmp(struct vm_page *p, void *data)
681 {
682 	struct rb_vm_page_scan_info *info = data;
683 
684 	if (p->pindex < info->start_pindex)
685 		return(-1);
686 	if (p->pindex > info->end_pindex)
687 		return(1);
688 	return(0);
689 }
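
/*
 * Hypothetical sketch (the example_* helpers are illustrative and not
 * part of this file): scanning an inclusive pindex range of an object's
 * page tree using the comparison function above.  Assumes the standard
 * RB_SCAN interface generated for vm_page_rb_tree and that the caller
 * holds the object token.
 */
#if 0
static int
example_scan_callback(vm_page_t p, void *data __unused)
{
	kprintf("resident page at pindex %"PRIu64"\n", p->pindex);
	return(0);
}

static void
example_scan_range(vm_object_t object, vm_pindex_t beg, vm_pindex_t end)
{
	struct rb_vm_page_scan_info info;

	info.start_pindex = beg;
	info.end_pindex = end;
	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
				example_scan_callback, &info);
}
#endif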
690 
691 int
692 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
693 {
694 	if (p1->pindex < p2->pindex)
695 		return(-1);
696 	if (p1->pindex > p2->pindex)
697 		return(1);
698 	return(0);
699 }
700 
701 void
702 vm_page_init(vm_page_t m)
703 {
704 	/* do nothing for now.  Called from pmap_page_init() */
705 }
706 
707 /*
708  * Each page queue has its own spin lock, which is fairly optimal for
709  * allocating and freeing pages at least.
710  *
711  * The caller must hold the vm_page_spin_lock() before locking a vm_page's
712  * queue spinlock via this function.  Also note that m->queue cannot change
713  * unless both the page and queue are locked.
714  */
715 static __inline
716 void
717 _vm_page_queue_spin_lock(vm_page_t m)
718 {
719 	u_short queue;
720 
721 	queue = m->queue;
722 	if (queue != PQ_NONE) {
723 		spin_lock(&vm_page_queues[queue].spin);
724 		KKASSERT(queue == m->queue);
725 	}
726 }
727 
728 static __inline
729 void
730 _vm_page_queue_spin_unlock(vm_page_t m)
731 {
732 	u_short queue;
733 
734 	queue = m->queue;
735 	cpu_ccfence();
736 	if (queue != PQ_NONE)
737 		spin_unlock(&vm_page_queues[queue].spin);
738 }
739 
740 static __inline
741 void
742 _vm_page_queues_spin_lock(u_short queue)
743 {
744 	cpu_ccfence();
745 	if (queue != PQ_NONE)
746 		spin_lock(&vm_page_queues[queue].spin);
747 }
748 
749 
750 static __inline
751 void
752 _vm_page_queues_spin_unlock(u_short queue)
753 {
754 	cpu_ccfence();
755 	if (queue != PQ_NONE)
756 		spin_unlock(&vm_page_queues[queue].spin);
757 }
758 
759 void
760 vm_page_queue_spin_lock(vm_page_t m)
761 {
762 	_vm_page_queue_spin_lock(m);
763 }
764 
765 void
766 vm_page_queues_spin_lock(u_short queue)
767 {
768 	_vm_page_queues_spin_lock(queue);
769 }
770 
771 void
772 vm_page_queue_spin_unlock(vm_page_t m)
773 {
774 	_vm_page_queue_spin_unlock(m);
775 }
776 
777 void
778 vm_page_queues_spin_unlock(u_short queue)
779 {
780 	_vm_page_queues_spin_unlock(queue);
781 }
782 
783 /*
784  * This locks the specified vm_page and its queue in the proper order
785  * (page first, then queue).  The queue may change so the caller must
786  * recheck on return.
787  */
788 static __inline
789 void
790 _vm_page_and_queue_spin_lock(vm_page_t m)
791 {
792 	vm_page_spin_lock(m);
793 	_vm_page_queue_spin_lock(m);
794 }
795 
796 static __inline
797 void
798 _vm_page_and_queue_spin_unlock(vm_page_t m)
799 {
800 	_vm_page_queues_spin_unlock(m->queue);
801 	vm_page_spin_unlock(m);
802 }
803 
804 void
805 vm_page_and_queue_spin_unlock(vm_page_t m)
806 {
807 	_vm_page_and_queue_spin_unlock(m);
808 }
809 
810 void
811 vm_page_and_queue_spin_lock(vm_page_t m)
812 {
813 	_vm_page_and_queue_spin_lock(m);
814 }
815 
816 /*
817  * Helper function removes vm_page from its current queue.
818  * Returns the base queue the page used to be on.
819  *
820  * The vm_page and the queue must be spinlocked.
821  * This function will unlock the queue but leave the page spinlocked.
822  */
823 static __inline u_short
824 _vm_page_rem_queue_spinlocked(vm_page_t m)
825 {
826 	struct vpgqueues *pq;
827 	u_short queue;
828 	u_short oqueue;
829 	long *cnt;
830 
831 	queue = m->queue;
832 	if (queue != PQ_NONE) {
833 		pq = &vm_page_queues[queue];
834 		TAILQ_REMOVE(&pq->pl, m, pageq);
835 
836 		/*
837 		 * Adjust our pcpu stats.  In order for the nominal low-memory
838 		 * algorithms to work properly we don't let any pcpu stat get
839 		 * too negative before we force it to be rolled-up into the
840 		 * global stats.  Otherwise our pageout and vm_wait tests
841 		 * will fail badly.
842 		 *
843 		 * The idea here is to reduce unnecessary SMP cache
844 		 * mastership changes in the global vmstats, which can be
845 		 * particularly bad in multi-socket systems.
846 		 */
847 		cnt = (long *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
848 		atomic_add_long(cnt, -1);
849 		if (*cnt < -VMMETER_SLOP_COUNT) {
850 			u_long copy = atomic_swap_long(cnt, 0);
851 			cnt = (long *)((char *)&vmstats + pq->cnt_offset);
852 			atomic_add_long(cnt, copy);
853 			cnt = (long *)((char *)&mycpu->gd_vmstats +
854 				      pq->cnt_offset);
855 			atomic_add_long(cnt, copy);
856 		}
857 		pq->lcnt--;
858 		m->queue = PQ_NONE;
859 		oqueue = queue;
860 		queue -= m->pc;
861 		vm_page_queues_spin_unlock(oqueue);	/* intended */
862 	}
863 	return queue;
864 }
865 
866 /*
867  * Helper function places the vm_page on the specified queue.  Generally
868  * speaking only PQ_FREE pages are placed at the head, to allow them to
869  * be allocated sooner rather than later on the assumption that they
870  * are cache-hot.
871  *
872  * The vm_page must be spinlocked.
873  * This function will return with both the page and the queue locked.
874  */
875 static __inline void
876 _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
877 {
878 	struct vpgqueues *pq;
879 	u_long *cnt;
880 
881 	KKASSERT(m->queue == PQ_NONE);
882 
883 	if (queue != PQ_NONE) {
884 		vm_page_queues_spin_lock(queue);
885 		pq = &vm_page_queues[queue];
886 		++pq->lcnt;
887 
888 		/*
889 		 * Adjust our pcpu stats.  If a system entity really needs
890 		 * to incorporate the count it will call vmstats_rollup()
891 		 * to roll it all up into the global vmstats structure.
892 		 */
893 		cnt = (long *)((char *)&mycpu->gd_vmstats_adj + pq->cnt_offset);
894 		atomic_add_long(cnt, 1);
895 
896 		/*
897 		 * PQ_FREE is always handled LIFO style to try to provide
898 		 * cache-hot pages to programs.
899 		 */
900 		m->queue = queue;
901 		if (queue - m->pc == PQ_FREE) {
902 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
903 		} else if (athead) {
904 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
905 		} else {
906 			TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
907 		}
908 		/* leave the queue spinlocked */
909 	}
910 }
911 
912 /*
913  * Wait until page is no longer BUSY.  If also_m_busy is TRUE we wait
914  * until the page is no longer BUSY or SBUSY (busy_count field is 0).
915  *
916  * At most one sleep call will be made before returning.
918  *
919  * This function does NOT busy the page and on return the page is not
920  * guaranteed to be available.
921  */
922 void
923 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
924 {
925 	u_int32_t busy_count;
926 
927 	for (;;) {
928 		busy_count = m->busy_count;
929 		cpu_ccfence();
930 
931 		if ((busy_count & PBUSY_LOCKED) == 0 &&
932 		    (also_m_busy == 0 || (busy_count & PBUSY_MASK) == 0)) {
933 			break;
934 		}
935 		tsleep_interlock(m, 0);
936 		if (atomic_cmpset_int(&m->busy_count, busy_count,
937 				      busy_count | PBUSY_WANTED)) {
938 			atomic_set_int(&m->flags, PG_REFERENCED);
939 			tsleep(m, PINTERLOCKED, msg, 0);
940 			break;
941 		}
942 	}
943 }
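
/*
 * Hypothetical caller sketch (the example_* name is illustrative only):
 * because vm_page_sleep_busy() does not busy the page, callers that
 * need the page hard-busied typically retry with vm_page_busy_try()
 * after each sleep.
 */
#if 0
static void
example_busy_with_retry(vm_page_t m)
{
	while (vm_page_busy_try(m, TRUE)) {
		/* could not busy the page, sleep and retry */
		vm_page_sleep_busy(m, TRUE, "expgbw");
	}
	/* ... page is now hard-busied, operate on it ... */
	vm_page_wakeup(m);
}
#endif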
944 
945 /*
946  * This calculates and returns a page color given an optional VM object and
947  * either a pindex or an iterator.  We attempt to return a cpu-localized
948  * pg_color that is still roughly 16-way set-associative.  The CPU topology
949  * is used if it was probed.
950  *
951  * The caller may use the returned value to index into e.g. PQ_FREE when
952  * allocating a page in order to nominally obtain pages that are hopefully
953  * already localized to the requesting cpu.  This function is not able to
954  * provide any sort of guarantee of this, but does its best to improve
955  * hardware cache management performance.
956  *
957  * WARNING! The caller must mask the returned value with PQ_L2_MASK.
958  */
959 u_short
960 vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
961 {
962 	u_short pg_color;
963 	int phys_id;
964 	int core_id;
965 	int object_pg_color;
966 
967 	phys_id = get_cpu_phys_id(cpuid);
968 	core_id = get_cpu_core_id(cpuid);
969 	object_pg_color = object ? object->pg_color : 0;
970 
971 	if (cpu_topology_phys_ids && cpu_topology_core_ids) {
972 		int grpsize;
973 
974 		/*
975 		 * Break us down by socket and cpu
976 		 */
977 		pg_color = phys_id * PQ_L2_SIZE / cpu_topology_phys_ids;
978 		pg_color += core_id * PQ_L2_SIZE /
979 			    (cpu_topology_core_ids * cpu_topology_phys_ids);
980 
981 		/*
982 		 * Calculate remaining component for object/queue color
983 		 */
984 		grpsize = PQ_L2_SIZE / (cpu_topology_core_ids *
985 					cpu_topology_phys_ids);
986 		if (grpsize >= 8) {
987 			pg_color += (pindex + object_pg_color) % grpsize;
988 		} else {
989 			if (grpsize <= 2) {
990 				grpsize = 8;
991 			} else {
992 				/* 3->9, 4->8, 5->10, 6->12, 7->14 */
993 				grpsize += grpsize;
994 				if (grpsize < 8)
995 					grpsize += grpsize;
996 			}
997 			pg_color += (pindex + object_pg_color) % grpsize;
998 		}
999 	} else {
1000 		/*
1001 		 * Unknown topology, distribute things evenly.
1002 		 */
1003 		pg_color = cpuid * PQ_L2_SIZE / ncpus;
1004 		pg_color += pindex + object_pg_color;
1005 	}
1006 	return (pg_color & PQ_L2_MASK);
1007 }
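
/*
 * Hypothetical illustration (the example_* name is not part of this
 * file): using the computed color to probe the localized PQ_FREE queue
 * via vm_page_list_find().  The real consumer of this value is
 * vm_page_alloc() further below; this only demonstrates the masking
 * convention described above.
 */
#if 0
static vm_page_t
example_localized_free_page(vm_object_t object, vm_pindex_t pindex)
{
	u_short pg_color;

	pg_color = vm_get_pg_color(mycpu->gd_cpuid, object, pindex);

	/* returned page, if any, is spinlocked and removed from its queue */
	return(vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK));
}
#endif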
1008 
1009 /*
1010  * Wait until BUSY can be set, then set it.  If also_m_busy is TRUE we
1011  * also wait for m->busy_count to become 0 before setting PBUSY_LOCKED.
1012  */
1013 void
1014 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
1015 				     int also_m_busy, const char *msg
1016 				     VM_PAGE_DEBUG_ARGS)
1017 {
1018 	u_int32_t busy_count;
1019 
1020 	for (;;) {
1021 		busy_count = m->busy_count;
1022 		cpu_ccfence();
1023 		if (busy_count & PBUSY_LOCKED) {
1024 			tsleep_interlock(m, 0);
1025 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1026 					  busy_count | PBUSY_WANTED)) {
1027 				atomic_set_int(&m->flags, PG_REFERENCED);
1028 				tsleep(m, PINTERLOCKED, msg, 0);
1029 			}
1030 		} else if (also_m_busy && busy_count) {
1031 			tsleep_interlock(m, 0);
1032 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1033 					  busy_count | PBUSY_WANTED)) {
1034 				atomic_set_int(&m->flags, PG_REFERENCED);
1035 				tsleep(m, PINTERLOCKED, msg, 0);
1036 			}
1037 		} else {
1038 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1039 					      busy_count | PBUSY_LOCKED)) {
1040 #ifdef VM_PAGE_DEBUG
1041 				m->busy_func = func;
1042 				m->busy_line = lineno;
1043 #endif
1044 				break;
1045 			}
1046 		}
1047 	}
1048 }
1049 
1050 /*
1051  * Attempt to set BUSY.  If also_m_busy is TRUE we only succeed if
1052  * m->busy_count is also 0.
1053  *
1054  * Returns non-zero on failure.
1055  */
1056 int
1057 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
1058 				    VM_PAGE_DEBUG_ARGS)
1059 {
1060 	u_int32_t busy_count;
1061 
1062 	for (;;) {
1063 		busy_count = m->busy_count;
1064 		cpu_ccfence();
1065 		if (busy_count & PBUSY_LOCKED)
1066 			return TRUE;
1067 		if (also_m_busy && (busy_count & PBUSY_MASK) != 0)
1068 			return TRUE;
1069 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1070 				      busy_count | PBUSY_LOCKED)) {
1071 #ifdef VM_PAGE_DEBUG
1072 				m->busy_func = func;
1073 				m->busy_line = lineno;
1074 #endif
1075 			return FALSE;
1076 		}
1077 	}
1078 }
1079 
1080 /*
1081  * Clear the BUSY flag and return non-zero to indicate to the caller
1082  * that a wakeup() should be performed.
1083  *
1084  * The vm_page must be spinlocked and will remain spinlocked on return.
1085  * The related queue must NOT be spinlocked (which could deadlock us).
1086  *
1087  * (inline version)
1088  */
1089 static __inline
1090 int
1091 _vm_page_wakeup(vm_page_t m)
1092 {
1093 	u_int32_t busy_count;
1094 
1095 	for (;;) {
1096 		busy_count = m->busy_count;
1097 		cpu_ccfence();
1098 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1099 				      busy_count &
1100 				      ~(PBUSY_LOCKED | PBUSY_WANTED))) {
1101 			break;
1102 		}
1103 	}
1104 	return((int)(busy_count & PBUSY_WANTED));
1105 }
1106 
1107 /*
1108  * Clear the BUSY flag and wakeup anyone waiting for the page.  This
1109  * is typically the last call you make on a page before moving onto
1110  * other things.
1111  */
1112 void
1113 vm_page_wakeup(vm_page_t m)
1114 {
1115         KASSERT(m->busy_count & PBUSY_LOCKED,
1116 		("vm_page_wakeup: page not busy!!!"));
1117 	vm_page_spin_lock(m);
1118 	if (_vm_page_wakeup(m)) {
1119 		vm_page_spin_unlock(m);
1120 		wakeup(m);
1121 	} else {
1122 		vm_page_spin_unlock(m);
1123 	}
1124 }
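
/*
 * Hypothetical sketch of the usual hard-busy discipline (the example_*
 * name is illustrative only): busy the page while operating on it, then
 * make vm_page_wakeup() the last call on the page.
 */
#if 0
static void
example_busy_discipline(vm_page_t m)
{
	vm_page_busy_wait(m, TRUE, "expgbs");
	/* ... page is exclusively busied, safe to manipulate ... */
	vm_page_wakeup(m);
}
#endif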
1125 
1126 /*
1127  * Holding a page keeps it from being reused.  Other parts of the system
1128  * can still disassociate the page from its current object and free it, or
1129  * perform read or write I/O on it and/or otherwise manipulate the page,
1130  * but if the page is held the VM system will leave the page and its data
1131  * intact and not reuse the page for other purposes until the last hold
1132  * reference is released.  (see vm_page_wire() if you want to prevent the
1133  * page from being disassociated from its object too).
1134  *
1135  * The caller must still validate the contents of the page and, if necessary,
1136  * wait for any pending I/O (e.g. vm_page_sleep_busy() loop) to complete
1137  * before manipulating the page.
1138  *
1139  * XXX get vm_page_spin_lock() here and move FREE->HOLD if necessary
1140  */
1141 void
1142 vm_page_hold(vm_page_t m)
1143 {
1144 	vm_page_spin_lock(m);
1145 	atomic_add_int(&m->hold_count, 1);
1146 	if (m->queue - m->pc == PQ_FREE) {
1147 		_vm_page_queue_spin_lock(m);
1148 		_vm_page_rem_queue_spinlocked(m);
1149 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
1150 		_vm_page_queue_spin_unlock(m);
1151 	}
1152 	vm_page_spin_unlock(m);
1153 }
1154 
1155 /*
1156  * The opposite of vm_page_hold().  If the page is on the HOLD queue
1157  * it was freed while held and must be moved back to the FREE queue.
1158  */
1159 void
1160 vm_page_unhold(vm_page_t m)
1161 {
1162 	KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
1163 		("vm_page_unhold: pg %p illegal hold_count (%d) or on FREE queue (%d)",
1164 		 m, m->hold_count, m->queue - m->pc));
1165 	vm_page_spin_lock(m);
1166 	atomic_add_int(&m->hold_count, -1);
1167 	if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
1168 		_vm_page_queue_spin_lock(m);
1169 		_vm_page_rem_queue_spinlocked(m);
1170 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
1171 		_vm_page_queue_spin_unlock(m);
1172 	}
1173 	vm_page_spin_unlock(m);
1174 }
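
/*
 * Hypothetical sketch (the example_* name is illustrative only): a hold
 * keeps the page and its data intact across a temporary access without
 * busying it.  Every vm_page_hold() must be paired with a
 * vm_page_unhold().
 */
#if 0
static void
example_hold_access(vm_page_t m)
{
	vm_page_hold(m);
	/* ... read page data; validate and wait for pending I/O first ... */
	vm_page_unhold(m);
}
#endif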
1175 
1176 /*
1177  *	vm_page_getfake:
1178  *
1179  *	Create a fictitious page with the specified physical address and
1180  *	memory attribute.  The memory attribute is the only machine-
1181  *	dependent aspect of a fictitious page that must be initialized.
1182  */
1183 
1184 void
1185 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1186 {
1187 
1188 	if ((m->flags & PG_FICTITIOUS) != 0) {
1189 		/*
1190 		 * The page's memattr might have changed since the
1191 		 * previous initialization.  Update the pmap to the
1192 		 * new memattr.
1193 		 */
1194 		goto memattr;
1195 	}
1196 	m->phys_addr = paddr;
1197 	m->queue = PQ_NONE;
1198 	/* Fictitious pages don't use "segind". */
1199 	/* Fictitious pages don't use "order" or "pool". */
1200 	m->flags = PG_FICTITIOUS | PG_UNMANAGED;
1201 	m->busy_count = PBUSY_LOCKED;
1202 	m->wire_count = 1;
1203 	spin_init(&m->spin, "fake_page");
1204 	pmap_page_init(m);
1205 memattr:
1206 	pmap_page_set_memattr(m, memattr);
1207 }
1208 
1209 /*
1210  * Inserts the given vm_page into the object and object list.
1211  *
1212  * The pagetables are not updated but will presumably fault the page
1213  * in if necessary, or if a kernel page the caller will at some point
1214  * enter the page into the kernel's pmap.  We are not allowed to block
1215  * here so we *can't* do this anyway.
1216  *
1217  * This routine may not block.
1218  * This routine must be called with the vm_object held.
1219  * This routine must be called with a critical section held.
1220  *
1221  * This routine returns TRUE if the page was inserted into the object
1222  * successfully, and FALSE if the page already exists in the object.
1223  */
1224 int
1225 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1226 {
1227 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
1228 	if (m->object != NULL)
1229 		panic("vm_page_insert: already inserted");
1230 
1231 	atomic_add_int(&object->generation, 1);
1232 
1233 	/*
1234 	 * Record the object/offset pair in this page and add the
1235 	 * pv_list_count of the page to the object.
1236 	 *
1237 	 * The vm_page spin lock is required for interactions with the pmap.
1238 	 */
1239 	vm_page_spin_lock(m);
1240 	m->object = object;
1241 	m->pindex = pindex;
1242 	if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
1243 		m->object = NULL;
1244 		m->pindex = 0;
1245 		vm_page_spin_unlock(m);
1246 		return FALSE;
1247 	}
1248 	++object->resident_page_count;
1249 	++mycpu->gd_vmtotal.t_rm;
1250 	vm_page_spin_unlock(m);
1251 
1252 	/*
1253 	 * Since we are inserting a new and possibly dirty page,
1254 	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
1255 	 */
1256 	if ((m->valid & m->dirty) ||
1257 	    (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
1258 		vm_object_set_writeable_dirty(object);
1259 
1260 	/*
1261 	 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
1262 	 */
1263 	swap_pager_page_inserted(m);
1264 	return TRUE;
1265 }
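
/*
 * Hypothetical sketch of the insertion contract (the example_* name is
 * illustrative only): the object must be held exclusively and the page
 * busied by the caller.  A FALSE return means another page already
 * exists at (object, pindex) and the caller must dispose of its page.
 */
#if 0
static int
example_insert_page(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
	int rv;

	/* (m) is assumed to be busied by the caller */
	vm_object_hold(object);
	rv = vm_page_insert(m, object, pindex);
	vm_object_drop(object);
	if (rv == FALSE) {
		/* collision: another page is already resident there */
		vm_page_free(m);
	}
	return(rv);
}
#endif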
1266 
1267 /*
1268  * Removes the given vm_page_t from the (object,index) table
1269  *
1270  * The underlying pmap entry (if any) is NOT removed here.
1271  * This routine may not block.
1272  *
1273  * The page must be BUSY and will remain BUSY on return.
1274  * No other requirements.
1275  *
1276  * NOTE: FreeBSD side effect was to unbusy the page on return.  We leave
1277  *	 it busy.
1278  */
1279 void
1280 vm_page_remove(vm_page_t m)
1281 {
1282 	vm_object_t object;
1283 
1284 	if (m->object == NULL) {
1285 		return;
1286 	}
1287 
1288 	if ((m->busy_count & PBUSY_LOCKED) == 0)
1289 		panic("vm_page_remove: page not busy");
1290 
1291 	object = m->object;
1292 
1293 	vm_object_hold(object);
1294 
1295 	/*
1296 	 * Remove the page from the object and update the object.
1297 	 *
1298 	 * The vm_page spin lock is required for interactions with the pmap.
1299 	 */
1300 	vm_page_spin_lock(m);
1301 	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
1302 	--object->resident_page_count;
1303 	--mycpu->gd_vmtotal.t_rm;
1304 	m->object = NULL;
1305 	atomic_add_int(&object->generation, 1);
1306 	vm_page_spin_unlock(m);
1307 
1308 	vm_object_drop(object);
1309 }
1310 
1311 /*
1312  * Locate and return the page at (object, pindex), or NULL if the
1313  * page could not be found.
1314  *
1315  * The caller must hold the vm_object token.
1316  */
1317 vm_page_t
1318 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1319 {
1320 	vm_page_t m;
1321 
1322 	/*
1323 	 * Search the object's RB tree for this object/offset pair
1324 	 */
1325 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1326 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1327 	KKASSERT(m == NULL || (m->object == object && m->pindex == pindex));
1328 	return(m);
1329 }
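
/*
 * Hypothetical sketch (the example_* name is illustrative only): lookup
 * requires the object token; the returned page is not busied, so a
 * caller that wants it to remain stable after dropping the object can
 * hold it (or use one of the lookup_busy variants below).
 */
#if 0
static vm_page_t
example_lookup_and_hold(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	vm_object_hold(object);
	m = vm_page_lookup(object, pindex);
	if (m)
		vm_page_hold(m);	/* keep (m) from being reused */
	vm_object_drop(object);
	return(m);			/* caller must vm_page_unhold(m) */
}
#endif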
1330 
1331 vm_page_t
1332 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
1333 					    vm_pindex_t pindex,
1334 					    int also_m_busy, const char *msg
1335 					    VM_PAGE_DEBUG_ARGS)
1336 {
1337 	u_int32_t busy_count;
1338 	vm_page_t m;
1339 
1340 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1341 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1342 	while (m) {
1343 		KKASSERT(m->object == object && m->pindex == pindex);
1344 		busy_count = m->busy_count;
1345 		cpu_ccfence();
1346 		if (busy_count & PBUSY_LOCKED) {
1347 			tsleep_interlock(m, 0);
1348 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1349 					  busy_count | PBUSY_WANTED)) {
1350 				atomic_set_int(&m->flags, PG_REFERENCED);
1351 				tsleep(m, PINTERLOCKED, msg, 0);
1352 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1353 							      pindex);
1354 			}
1355 		} else if (also_m_busy && busy_count) {
1356 			tsleep_interlock(m, 0);
1357 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1358 					  busy_count | PBUSY_WANTED)) {
1359 				atomic_set_int(&m->flags, PG_REFERENCED);
1360 				tsleep(m, PINTERLOCKED, msg, 0);
1361 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1362 							      pindex);
1363 			}
1364 		} else if (atomic_cmpset_int(&m->busy_count, busy_count,
1365 					     busy_count | PBUSY_LOCKED)) {
1366 #ifdef VM_PAGE_DEBUG
1367 			m->busy_func = func;
1368 			m->busy_line = lineno;
1369 #endif
1370 			break;
1371 		}
1372 	}
1373 	return m;
1374 }
1375 
1376 /*
1377  * Attempt to lookup and busy a page.
1378  *
1379  * Returns NULL if the page could not be found
1380  *
1381  * Returns a vm_page and error == TRUE if the page exists but could not
1382  * be busied.
1383  *
1384  * Returns a vm_page and error == FALSE on success.
1385  */
1386 vm_page_t
1387 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
1388 					   vm_pindex_t pindex,
1389 					   int also_m_busy, int *errorp
1390 					   VM_PAGE_DEBUG_ARGS)
1391 {
1392 	u_int32_t busy_count;
1393 	vm_page_t m;
1394 
1395 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1396 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1397 	*errorp = FALSE;
1398 	while (m) {
1399 		KKASSERT(m->object == object && m->pindex == pindex);
1400 		busy_count = m->busy_count;
1401 		cpu_ccfence();
1402 		if (busy_count & PBUSY_LOCKED) {
1403 			*errorp = TRUE;
1404 			break;
1405 		}
1406 		if (also_m_busy && busy_count) {
1407 			*errorp = TRUE;
1408 			break;
1409 		}
1410 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1411 				      busy_count | PBUSY_LOCKED)) {
1412 #ifdef VM_PAGE_DEBUG
1413 			m->busy_func = func;
1414 			m->busy_line = lineno;
1415 #endif
1416 			break;
1417 		}
1418 	}
1419 	return m;
1420 }
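
/*
 * Hypothetical sketch of the error convention above (the example_* name
 * is illustrative only): a NULL return means no page exists, while a
 * non-NULL page with error == TRUE means the page exists but could not
 * be busied, so the caller sleeps and retries.
 */
#if 0
static vm_page_t
example_lookup_busy(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;
	int error;

	for (;;) {
		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
		if (m == NULL || error == 0)
			break;		/* busied page, or no page at all */
		vm_page_sleep_busy(m, TRUE, "expglk");
	}
	return(m);
}
#endif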
1421 
1422 /*
1423  * Returns a page that is only soft-busied for use by the caller in
1424  * a read-only fashion.  Returns NULL if the page could not be found,
1425  * the soft busy could not be obtained, or the page data is invalid.
1426  */
1427 vm_page_t
1428 vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex,
1429 			 int pgoff, int pgbytes)
1430 {
1431 	vm_page_t m;
1432 
1433 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1434 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1435 	if (m) {
1436 		if ((m->valid != VM_PAGE_BITS_ALL &&
1437 		     !vm_page_is_valid(m, pgoff, pgbytes)) ||
1438 		    (m->flags & PG_FICTITIOUS)) {
1439 			m = NULL;
1440 		} else if (vm_page_sbusy_try(m)) {
1441 			m = NULL;
1442 		} else if ((m->valid != VM_PAGE_BITS_ALL &&
1443 			    !vm_page_is_valid(m, pgoff, pgbytes)) ||
1444 			   (m->flags & PG_FICTITIOUS)) {
1445 			vm_page_sbusy_drop(m);
1446 			m = NULL;
1447 		}
1448 	}
1449 	return m;
1450 }
1451 
1452 /*
1453  * Caller must hold the related vm_object
1454  */
1455 vm_page_t
1456 vm_page_next(vm_page_t m)
1457 {
1458 	vm_page_t next;
1459 
1460 	next = vm_page_rb_tree_RB_NEXT(m);
1461 	if (next && next->pindex != m->pindex + 1)
1462 		next = NULL;
1463 	return (next);
1464 }
1465 
1466 /*
1467  * vm_page_rename()
1468  *
1469  * Move the given vm_page from its current object to the specified
1470  * target object/offset.  The page must be busy and will remain so
1471  * on return.
1472  *
1473  * new_object must be held.
1474  * This routine might block. XXX ?
1475  *
1476  * NOTE: Swap associated with the page must be invalidated by the move.  We
1477  *       have to do this for several reasons:  (1) we aren't freeing the
1478  *       page, (2) we are dirtying the page, (3) the VM system is probably
1479  *       moving the page from object A to B, and will then later move
1480  *       the backing store from A to B and we can't have a conflict.
1481  *
1482  * NOTE: We *always* dirty the page.  It is necessary both for the
1483  *       fact that we moved it, and because we may be invalidating
1484  *	 swap.  If the page is on the cache, we have to deactivate it
1485  *	 or vm_page_dirty() will panic.  Dirty pages are not allowed
1486  *	 on the cache.
1487  */
1488 void
1489 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1490 {
1491 	KKASSERT(m->busy_count & PBUSY_LOCKED);
1492 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
1493 	if (m->object) {
1494 		ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
1495 		vm_page_remove(m);
1496 	}
1497 	if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
1498 		panic("vm_page_rename: target exists (%p,%"PRIu64")",
1499 		      new_object, new_pindex);
1500 	}
1501 	if (m->queue - m->pc == PQ_CACHE)
1502 		vm_page_deactivate(m);
1503 	vm_page_dirty(m);
1504 }
1505 
1506 /*
1507  * vm_page_unqueue() without any wakeup.  This routine is used when a page
1508  * is to remain BUSYied by the caller.
1509  *
1510  * This routine may not block.
1511  */
1512 void
1513 vm_page_unqueue_nowakeup(vm_page_t m)
1514 {
1515 	vm_page_and_queue_spin_lock(m);
1516 	(void)_vm_page_rem_queue_spinlocked(m);
1517 	vm_page_spin_unlock(m);
1518 }
1519 
1520 /*
1521  * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedaemon
1522  * if necessary.
1523  *
1524  * This routine may not block.
1525  */
1526 void
1527 vm_page_unqueue(vm_page_t m)
1528 {
1529 	u_short queue;
1530 
1531 	vm_page_and_queue_spin_lock(m);
1532 	queue = _vm_page_rem_queue_spinlocked(m);
1533 	if (queue == PQ_FREE || queue == PQ_CACHE) {
1534 		vm_page_spin_unlock(m);
1535 		pagedaemon_wakeup();
1536 	} else {
1537 		vm_page_spin_unlock(m);
1538 	}
1539 }
1540 
1541 /*
1542  * vm_page_list_find()
1543  *
1544  * Find a page on the specified queue with color optimization.
1545  *
1546  * The page coloring optimization attempts to locate a page that does
1547  * not overload other nearby pages in the object in the cpu's L1 or L2
1548  * caches.  We need this optimization because cpu caches tend to be
1549  * physical caches, while object spaces tend to be virtual.
1550  *
1551  * The page coloring optimization also, very importantly, tries to localize
1552  * memory to cpus and physical sockets.
1553  *
1554  * On MP systems each PQ_FREE and PQ_CACHE color queue has its own spinlock
1555  * and the algorithm is adjusted to localize allocations on a per-core basis.
1556  * This is done by 'twisting' the colors.
1557  *
1558  * The page is returned spinlocked and removed from its queue (it will
1559  * be on PQ_NONE), or NULL. The page is not BUSY'd.  The caller
1560  * is responsible for dealing with the busy-page case (usually by
1561  * deactivating the page and looping).
1562  *
1563  * NOTE:  This routine is carefully inlined.  A non-inlined version
1564  *	  is available for outside callers but the only critical path is
1565  *	  from within this source file.
1566  *
1567  * NOTE:  This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
1568  *	  represent stable storage, allowing us to order our locks vm_page
1569  *	  first, then queue.
1570  */
1571 static __inline
1572 vm_page_t
1573 _vm_page_list_find(int basequeue, int index)
1574 {
1575 	vm_page_t m;
1576 
1577 	for (;;) {
1578 		m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl);
1579 		if (m == NULL) {
1580 			m = _vm_page_list_find2(basequeue, index);
1581 			return(m);
1582 		}
1583 		vm_page_and_queue_spin_lock(m);
1584 		if (m->queue == basequeue + index) {
1585 			_vm_page_rem_queue_spinlocked(m);
1586 			/* vm_page_t spin held, no queue spin */
1587 			break;
1588 		}
1589 		vm_page_and_queue_spin_unlock(m);
1590 	}
1591 	return(m);
1592 }
1593 
1594 /*
1595  * If we could not find the page in the desired queue try to find it in
1596  * a nearby queue.
1597  */
1598 static vm_page_t
1599 _vm_page_list_find2(int basequeue, int index)
1600 {
1601 	struct vpgqueues *pq;
1602 	vm_page_t m = NULL;
1603 	int pqmask = PQ_SET_ASSOC_MASK >> 1;
1604 	int pqi;
1605 	int i;
1606 
1607 	index &= PQ_L2_MASK;
1608 	pq = &vm_page_queues[basequeue];
1609 
1610 	/*
1611 	 * Run local sets of 16, 32, 64, 128, and the whole queue if all
1612 	 * else fails (PQ_L2_MASK which is 255).
1613 	 */
1614 	do {
1615 		pqmask = (pqmask << 1) | 1;
1616 		for (i = 0; i <= pqmask; ++i) {
1617 			pqi = (index & ~pqmask) | ((index + i) & pqmask);
1618 			m = TAILQ_FIRST(&pq[pqi].pl);
1619 			if (m) {
1620 				_vm_page_and_queue_spin_lock(m);
1621 				if (m->queue == basequeue + pqi) {
1622 					_vm_page_rem_queue_spinlocked(m);
1623 					return(m);
1624 				}
1625 				_vm_page_and_queue_spin_unlock(m);
1626 				--i;
1627 				continue;
1628 			}
1629 		}
1630 	} while (pqmask != PQ_L2_MASK);
1631 
1632 	return(m);
1633 }
1634 
1635 /*
1636  * Returns a vm_page candidate for allocation.  The page is not busied so
1637  * it can move around.  The caller must busy the page (and typically
1638  * deactivate it if it cannot be busied!)
1639  *
1640  * Returns a spinlocked vm_page that has been removed from its queue.
1641  */
1642 vm_page_t
1643 vm_page_list_find(int basequeue, int index)
1644 {
1645 	return(_vm_page_list_find(basequeue, index));
1646 }
1647 
1648 /*
1649  * Find a page on the cache queue with color optimization, remove it
1650  * from the queue, and busy it.  The returned page will not be spinlocked.
1651  *
1652  * Candidates can fail due to being busied by someone else, in which
1653  * case they will be deactivated and the scan continues.
1654  *
1655  * This routine may not block.
1656  *
1657  */
1658 static vm_page_t
1659 vm_page_select_cache(u_short pg_color)
1660 {
1661 	vm_page_t m;
1662 
1663 	for (;;) {
1664 		m = _vm_page_list_find(PQ_CACHE, pg_color & PQ_L2_MASK);
1665 		if (m == NULL)
1666 			break;
1667 		/*
1668 		 * (m) has been removed from its queue and spinlocked
1669 		 */
1670 		if (vm_page_busy_try(m, TRUE)) {
1671 			_vm_page_deactivate_locked(m, 0);
1672 			vm_page_spin_unlock(m);
1673 		} else {
1674 			/*
1675 			 * We successfully busied the page
1676 			 */
1677 			if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) == 0 &&
1678 			    m->hold_count == 0 &&
1679 			    m->wire_count == 0 &&
1680 			    (m->dirty & m->valid) == 0) {
1681 				vm_page_spin_unlock(m);
1682 				pagedaemon_wakeup();
1683 				return(m);
1684 			}
1685 
1686 			/*
1687 			 * The page cannot be recycled, deactivate it.
1688 			 */
1689 			_vm_page_deactivate_locked(m, 0);
1690 			if (_vm_page_wakeup(m)) {
1691 				vm_page_spin_unlock(m);
1692 				wakeup(m);
1693 			} else {
1694 				vm_page_spin_unlock(m);
1695 			}
1696 		}
1697 	}
1698 	return (m);
1699 }
1700 
1701 /*
1702  * Find a free page.  We attempt to inline the nominal case and fall back
1703  * to _vm_page_select_free() otherwise.  A busied page is removed from
1704  * the queue and returned.
1705  *
1706  * This routine may not block.
1707  */
1708 static __inline vm_page_t
1709 vm_page_select_free(u_short pg_color)
1710 {
1711 	vm_page_t m;
1712 
1713 	for (;;) {
1714 		m = _vm_page_list_find(PQ_FREE, pg_color & PQ_L2_MASK);
1715 		if (m == NULL)
1716 			break;
1717 		if (vm_page_busy_try(m, TRUE)) {
1718 			/*
1719 			 * Various mechanisms such as a pmap_collect can
1720 			 * result in a busy page on the free queue.  We
1721 			 * have to move the page out of the way so we can
1722 			 * retry the allocation.  If the other thread is not
1723 			 * allocating the page then m->valid will remain 0 and
1724 			 * the pageout daemon will free the page later on.
1725 			 *
1726 			 * Since we could not busy the page, however, we
1727 			 * cannot make assumptions as to whether the page
1728 			 * will be allocated by the other thread or not,
1729 			 * so all we can do is deactivate it to move it out
1730 			 * of the way.  In particular, if the other thread
1731 			 * wires the page it may wind up on the inactive
1732 			 * queue and the pageout daemon will have to deal
1733 			 * with that case too.
1734 			 */
1735 			_vm_page_deactivate_locked(m, 0);
1736 			vm_page_spin_unlock(m);
1737 		} else {
1738 			/*
1739 			 * Theoretically if we are able to busy the page
1740 			 * atomic with the queue removal (using the vm_page
1741 			 * lock) nobody else should be able to mess with the
1742 			 * page before us.
1743 			 */
1744 			KKASSERT((m->flags & (PG_UNMANAGED |
1745 					      PG_NEED_COMMIT)) == 0);
1746 			KASSERT(m->hold_count == 0, ("m->hold_count is not zero "
1747 						     "pg %p q=%d flags=%08x hold=%d wire=%d",
1748 						     m, m->queue, m->flags, m->hold_count, m->wire_count));
1749 			KKASSERT(m->wire_count == 0);
1750 			vm_page_spin_unlock(m);
1751 			pagedaemon_wakeup();
1752 
1753 			/* return busied and removed page */
1754 			return(m);
1755 		}
1756 	}
1757 	return(m);
1758 }
1759 
1760 /*
1761  * vm_page_alloc()
1762  *
1763  * Allocate and return a memory cell associated with this VM object/offset
1764  * pair.  If object is NULL an unassociated page will be allocated.
1765  *
1766  * The returned page will be busied and removed from its queues.  This
1767  * routine can block and may return NULL if a race occurs and the page
1768  * is found to already exist at the specified (object, pindex).
1769  *
1770  *	VM_ALLOC_NORMAL		allow use of cache pages, nominal free drain
1771  *	VM_ALLOC_QUICK		like normal but cannot use cache
1772  *	VM_ALLOC_SYSTEM		greater free drain
1773  *	VM_ALLOC_INTERRUPT	allow free list to be completely drained
1774  *	VM_ALLOC_ZERO		advisory request for pre-zero'd page only
1775  *	VM_ALLOC_FORCE_ZERO	advisory request for pre-zero'd page only
1776  *	VM_ALLOC_NULL_OK	ok to return NULL on insertion collision
1777  *				(see vm_page_grab())
1778  *	VM_ALLOC_USE_GD		ok to use per-gd cache
1779  *
1780  *	VM_ALLOC_CPU(n)		allocate using specified cpu localization
1781  *
1782  * The object must be held if not NULL
1783  * This routine may not block
1784  *
1785  * Additional special handling is required when called from an interrupt
1786  * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
1787  * in this case.
1788  */
1789 vm_page_t
1790 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
1791 {
1792 	globaldata_t gd;
1793 	vm_object_t obj;
1794 	vm_page_t m;
1795 	u_short pg_color;
1796 	int cpuid_local;
1797 
1798 #if 0
1799 	/*
1800 	 * Special per-cpu free VM page cache.  The pages are pre-busied
1801 	 * and pre-zeroed for us.
1802 	 */
1803 	if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
1804 		crit_enter_gd(gd);
1805 		if (gd->gd_vmpg_count) {
1806 			m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
1807 			crit_exit_gd(gd);
1808 			goto done;
1809 		}
1810 		crit_exit_gd(gd);
1811 	}
1812 #endif
1813 	m = NULL;
1814 
1815 	/*
1816 	 * CPU LOCALIZATION
1817 	 *
1818 	 * CPU localization algorithm.  Break the page queues up by physical
1819 	 * id and core id (note that two cpu threads will have the same core
1820 	 * id, and core_id != gd_cpuid).
1821 	 *
1822 	 * This is nowhere near perfect; for example, the last pindex in a
1823 	 * subgroup will overflow into the next cpu or package.  But this
1824 	 * should get us good page reuse locality in heavy mixed loads.
1825 	 *
1826 	 * (may be executed before the APs are started, so other GDs might
1827 	 *  not exist!)
1828 	 */
1829 	if (page_req & VM_ALLOC_CPU_SPEC)
1830 		cpuid_local = VM_ALLOC_GETCPU(page_req);
1831 	else
1832 		cpuid_local = mycpu->gd_cpuid;
1833 
1834 	pg_color = vm_get_pg_color(cpuid_local, object, pindex);
1835 
1836 	KKASSERT(page_req &
1837 		(VM_ALLOC_NORMAL|VM_ALLOC_QUICK|
1838 		 VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
1839 
1840 	/*
1841 	 * Certain system threads (pageout daemon, buf_daemon's) are
1842 	 * allowed to eat deeper into the free page list.
1843 	 */
1844 	if (curthread->td_flags & TDF_SYSTHREAD)
1845 		page_req |= VM_ALLOC_SYSTEM;
1846 
1847 	/*
1848 	 * Impose various limitations.  Note that the v_free_reserved test
1849 	 * must match the opposite of vm_page_count_target() to avoid
1850 	 * livelocks; be careful.
1851 	 */
1852 loop:
1853 	gd = mycpu;
1854 	if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
1855 	    ((page_req & VM_ALLOC_INTERRUPT) &&
1856 	     gd->gd_vmstats.v_free_count > 0) ||
1857 	    ((page_req & VM_ALLOC_SYSTEM) &&
1858 	     gd->gd_vmstats.v_cache_count == 0 &&
1859 		gd->gd_vmstats.v_free_count >
1860 		gd->gd_vmstats.v_interrupt_free_min)
1861 	) {
1862 		/*
1863 		 * The free queue has sufficient free pages to take one out.
1864 		 */
1865 		m = vm_page_select_free(pg_color);
1866 	} else if (page_req & VM_ALLOC_NORMAL) {
1867 		/*
1868 		 * Allocatable from the cache (non-interrupt only).  On
1869 		 * success, we must free the page and try again, thus
1870 		 * ensuring that vmstats.v_*_free_min counters are replenished.
1871 		 */
1872 #ifdef INVARIANTS
1873 		if (curthread->td_preempted) {
1874 			kprintf("vm_page_alloc(): warning, attempt to allocate"
1875 				" cache page from preempting interrupt\n");
1876 			m = NULL;
1877 		} else {
1878 			m = vm_page_select_cache(pg_color);
1879 		}
1880 #else
1881 		m = vm_page_select_cache(pg_color);
1882 #endif
1883 		/*
1884 		 * On success move the page into the free queue and loop.
1885 		 *
1886 		 * Only do this if we can safely acquire the vm_object lock,
1887 		 * because this is effectively a random page and the caller
1888 		 * might be holding the lock shared; we don't want to
1889 		 * deadlock.
1890 		 */
1891 		if (m != NULL) {
1892 			KASSERT(m->dirty == 0,
1893 				("Found dirty cache page %p", m));
1894 			if ((obj = m->object) != NULL) {
1895 				if (vm_object_hold_try(obj)) {
1896 					vm_page_protect(m, VM_PROT_NONE);
1897 					vm_page_free(m);
1898 					/* m->object NULL here */
1899 					vm_object_drop(obj);
1900 				} else {
1901 					vm_page_deactivate(m);
1902 					vm_page_wakeup(m);
1903 				}
1904 			} else {
1905 				vm_page_protect(m, VM_PROT_NONE);
1906 				vm_page_free(m);
1907 			}
1908 			goto loop;
1909 		}
1910 
1911 		/*
1912 		 * On failure return NULL
1913 		 */
1914 		atomic_add_int(&vm_pageout_deficit, 1);
1915 		pagedaemon_wakeup();
1916 		return (NULL);
1917 	} else {
1918 		/*
1919 		 * No pages available, wakeup the pageout daemon and give up.
1920 		 */
1921 		atomic_add_int(&vm_pageout_deficit, 1);
1922 		pagedaemon_wakeup();
1923 		return (NULL);
1924 	}
1925 
1926 	/*
1927 	 * v_free_count can race so loop if we don't find the expected
1928 	 * page.
1929 	 */
1930 	if (m == NULL) {
1931 		vmstats_rollup();
1932 		goto loop;
1933 	}
1934 
1935 	/*
1936 	 * Good page found.  The page has already been busied for us and
1937 	 * removed from its queues.
1938 	 */
1939 	KASSERT(m->dirty == 0,
1940 		("vm_page_alloc: free/cache page %p was dirty", m));
1941 	KKASSERT(m->queue == PQ_NONE);
1942 
1943 #if 0
1944 done:
1945 #endif
1946 	/*
1947 	 * Initialize the structure, inheriting some flags but clearing
1948 	 * all the rest.  The page has already been busied for us.
1949 	 */
1950 	vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
1951 
1952 	KKASSERT(m->wire_count == 0);
1953 	KKASSERT((m->busy_count & PBUSY_MASK) == 0);
1954 	m->act_count = 0;
1955 	m->valid = 0;
1956 
1957 	/*
1958 	 * Caller must be holding the object lock (asserted by
1959 	 * vm_page_insert()).
1960 	 *
1961 	 * NOTE: Inserting a page here does not insert it into any pmaps
1962 	 *	 (which could cause us to block allocating memory).
1963 	 *
1964 	 * NOTE: If no object is specified, an unassociated page is allocated
1965 	 *	 and m->pindex can be used by the caller for any purpose.
1966 	 */
1967 	if (object) {
1968 		if (vm_page_insert(m, object, pindex) == FALSE) {
1969 			vm_page_free(m);
1970 			if ((page_req & VM_ALLOC_NULL_OK) == 0)
1971 				panic("PAGE RACE %p[%ld]/%p",
1972 				      object, (long)pindex, m);
1973 			m = NULL;
1974 		}
1975 	} else {
1976 		m->pindex = pindex;
1977 	}
1978 
1979 	/*
1980 	 * Don't wakeup too often - wakeup the pageout daemon when
1981 	 * we would be nearly out of memory.
1982 	 */
1983 	pagedaemon_wakeup();
1984 
1985 	/*
1986 	 * A BUSY page is returned.
1987 	 */
1988 	return (m);
1989 }
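/*
 * Illustrative sketch (hypothetical caller, names invented for the
 * example): a minimal unassociated allocation using the flags documented
 * above.  With object == NULL a NULL return can only mean a free page
 * shortage, so the caller may simply vm_wait() and retry.
 */
#if 0
static vm_page_t
example_alloc_unassociated(void)
{
	vm_page_t m;

	while ((m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL)) == NULL)
		vm_wait(0);		/* shortage, block briefly and retry */
	/* m is returned hard-busied and removed from all queues */
	vm_page_wakeup(m);		/* drop the hard busy when done */
	return (m);
}
#endif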
1990 
1991 /*
1992  * Returns the number of pages available in our DMA memory reserve
1993  * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf)
1994  */
1995 vm_size_t
1996 vm_contig_avail_pages(void)
1997 {
1998 	alist_blk_t blk;
1999 	alist_blk_t count;
2000 	alist_blk_t bfree;
2001 	spin_lock(&vm_contig_spin);
2002 	bfree = alist_free_info(&vm_contig_alist, &blk, &count);
2003 	spin_unlock(&vm_contig_spin);
2004 
2005 	return bfree;
2006 }
2007 
2008 /*
2009  * Attempt to allocate contiguous physical memory with the specified
2010  * requirements.
2011  */
2012 vm_page_t
2013 vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
2014 		     unsigned long alignment, unsigned long boundary,
2015 		     unsigned long size, vm_memattr_t memattr)
2016 {
2017 	alist_blk_t blk;
2018 	vm_page_t m;
2019 	vm_pindex_t i;
2020 #if 0
2021 	static vm_pindex_t contig_rover;
2022 #endif
2023 
2024 	alignment >>= PAGE_SHIFT;
2025 	if (alignment == 0)
2026 		alignment = 1;
2027 	boundary >>= PAGE_SHIFT;
2028 	if (boundary == 0)
2029 		boundary = 1;
2030 	size = (size + PAGE_MASK) >> PAGE_SHIFT;
2031 
2032 #if 0
2033 	/*
2034 	 * Disabled temporarily until we find a solution for DRM (a flag
2035 	 * to always use the free space reserve, for performance).
2036 	 */
2037 	if (high == BUS_SPACE_MAXADDR && alignment <= PAGE_SIZE &&
2038 	    boundary <= PAGE_SIZE && size == 1 &&
2039 	    memattr == VM_MEMATTR_DEFAULT) {
2040 		/*
2041 		 * Any page will work, use vm_page_alloc()
2042 		 * (e.g. when used from kmem_alloc_attr())
2043 		 */
2044 		m = vm_page_alloc(NULL, (contig_rover++) & 0x7FFFFFFF,
2045 				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
2046 				  VM_ALLOC_INTERRUPT);
2047 		m->valid = VM_PAGE_BITS_ALL;
2048 		vm_page_wire(m);
2049 		vm_page_wakeup(m);
2050 	} else
2051 #endif
2052 	{
2053 		/*
2054 		 * Use the low-memory dma reserve
2055 		 */
2056 		spin_lock(&vm_contig_spin);
2057 		blk = alist_alloc(&vm_contig_alist, 0, size);
2058 		if (blk == ALIST_BLOCK_NONE) {
2059 			spin_unlock(&vm_contig_spin);
2060 			if (bootverbose) {
2061 				kprintf("vm_page_alloc_contig: %ldk nospace\n",
2062 					(size << PAGE_SHIFT) / 1024);
2063 				print_backtrace(5);
2064 			}
2065 			return(NULL);
2066 		}
2067 		if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
2068 			alist_free(&vm_contig_alist, blk, size);
2069 			spin_unlock(&vm_contig_spin);
2070 			if (bootverbose) {
2071 				kprintf("vm_page_alloc_contig: %ldk high "
2072 					"%016jx failed\n",
2073 					(size << PAGE_SHIFT) / 1024,
2074 					(intmax_t)high);
2075 			}
2076 			return(NULL);
2077 		}
2078 		spin_unlock(&vm_contig_spin);
2079 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
2080 	}
2081 	if (vm_contig_verbose) {
2082 		kprintf("vm_page_alloc_contig: %016jx/%ldk "
2083 			"(%016jx-%016jx al=%lu bo=%lu pgs=%lu attr=%d)\n",
2084 			(intmax_t)m->phys_addr,
2085 			(size << PAGE_SHIFT) / 1024,
2086 			low, high, alignment, boundary, size, memattr);
2087 	}
2088 	if (memattr != VM_MEMATTR_DEFAULT) {
2089 		for (i = 0; i < size; i++)
2090 			pmap_page_set_memattr(&m[i], memattr);
2091 	}
2092 	return m;
2093 }
2094 
2095 /*
2096  * Free contiguously allocated pages.  The pages are expected to be wired
2097  * but not busy.  When freeing to the alist we leave them wired and not busy.
2098  */
2099 void
2100 vm_page_free_contig(vm_page_t m, unsigned long size)
2101 {
2102 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
2103 	vm_pindex_t start = pa >> PAGE_SHIFT;
2104 	vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
2105 
2106 	if (vm_contig_verbose) {
2107 		kprintf("vm_page_free_contig:  %016jx/%ldk\n",
2108 			(intmax_t)pa, size / 1024);
2109 	}
2110 	if (pa < vm_low_phys_reserved) {
2111 		KKASSERT(pa + size <= vm_low_phys_reserved);
2112 		spin_lock(&vm_contig_spin);
2113 		alist_free(&vm_contig_alist, start, pages);
2114 		spin_unlock(&vm_contig_spin);
2115 	} else {
2116 		while (pages) {
2117 			vm_page_busy_wait(m, FALSE, "cpgfr");
2118 			vm_page_unwire(m, 0);
2119 			vm_page_free(m);
2120 			--pages;
2121 			++m;
2122 		}
2123 
2124 	}
2125 }
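/*
 * Illustrative sketch (hypothetical, names invented for the example):
 * allocating a small physically contiguous buffer from the low-memory
 * DMA reserve and later returning it.  Per the comment on
 * vm_page_free_contig() the pages are handed back wired and not busied.
 */
#if 0
static void
example_contig_buffer(void)
{
	unsigned long bytes = 4 * PAGE_SIZE;	/* arbitrary example size */
	vm_page_t m;

	m = vm_page_alloc_contig(0, BUS_SPACE_MAXADDR, PAGE_SIZE, 0,
				 bytes, VM_MEMATTR_DEFAULT);
	if (m == NULL)
		return;				/* reserve exhausted */
	/* ... use the pages starting at VM_PAGE_TO_PHYS(m) ... */
	vm_page_free_contig(m, bytes);
}
#endif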
2126 
2127 
2128 /*
2129  * Wait for sufficient free memory for nominal heavy memory use kernel
2130  * operations.
2131  *
2132  * WARNING!  Be sure never to call this in any vm_pageout code path, which
2133  *	     will trivially deadlock the system.
2134  */
2135 void
2136 vm_wait_nominal(void)
2137 {
2138 	while (vm_page_count_min(0))
2139 		vm_wait(0);
2140 }
2141 
2142 /*
2143  * Test if vm_wait_nominal() would block.
2144  */
2145 int
2146 vm_test_nominal(void)
2147 {
2148 	if (vm_page_count_min(0))
2149 		return(1);
2150 	return(0);
2151 }
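/*
 * Illustrative sketch (hypothetical; 'can_block' and the error handling
 * are invented for the example): a subsystem throttling its own bulk page
 * consumption.  Where blocking is allowed it calls vm_wait_nominal() up
 * front; where it cannot block it uses vm_test_nominal() to decide
 * whether to defer the work.
 */
#if 0
static int
example_throttle(int can_block)
{
	if (can_block) {
		vm_wait_nominal();	/* block until free pages recover */
		return (0);
	}
	if (vm_test_nominal())
		return (EAGAIN);	/* would have blocked, defer instead */
	return (0);
}
#endif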
2152 
2153 /*
2154  * Block until free pages are available for allocation, called in various
2155  * places before memory allocations.
2156  *
2157  * The caller may loop if vm_page_count_min() == FALSE, so we cannot be
2158  * more generous than that.
2159  */
2160 void
2161 vm_wait(int timo)
2162 {
2163 	/*
2164 	 * never wait forever
2165 	 */
2166 	if (timo == 0)
2167 		timo = hz;
2168 	lwkt_gettoken(&vm_token);
2169 
2170 	if (curthread == pagethread ||
2171 	    curthread == emergpager) {
2172 		/*
2173 		 * The pageout daemon itself needs pages, this is bad.
2174 		 */
2175 		if (vm_page_count_min(0)) {
2176 			vm_pageout_pages_needed = 1;
2177 			tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
2178 		}
2179 	} else {
2180 		/*
2181 		 * Wakeup the pageout daemon if necessary and wait.
2182 		 *
2183 		 * Do not wait indefinitely for the target to be reached,
2184 		 * as load might prevent it from being reached any time soon.
2185 		 * But wait a little to try to slow down page allocations
2186 		 * and to give more important threads (the pagedaemon)
2187 		 * allocation priority.
2188 		 */
2189 		if (vm_page_count_target()) {
2190 			if (vm_pages_needed == 0) {
2191 				vm_pages_needed = 1;
2192 				wakeup(&vm_pages_needed);
2193 			}
2194 			++vm_pages_waiting;	/* SMP race ok */
2195 			tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
2196 		}
2197 	}
2198 	lwkt_reltoken(&vm_token);
2199 }
2200 
2201 /*
2202  * Block until free pages are available for allocation
2203  *
2204  * Called only from vm_fault so that processes page faulting can be
2205  * easily tracked.
2206  */
2207 void
2208 vm_wait_pfault(void)
2209 {
2210 	/*
2211 	 * Wakeup the pageout daemon if necessary and wait.
2212 	 *
2213 	 * Do not wait indefinitely for the target to be reached,
2214 	 * as load might prevent it from being reached any time soon.
2215 	 * But wait a little to try to slow down page allocations
2216 	 * and to give more important threads (the pagedaemon)
2217 	 * allocation priority.
2218 	 */
2219 	if (vm_page_count_min(0)) {
2220 		lwkt_gettoken(&vm_token);
2221 		while (vm_page_count_severe()) {
2222 			if (vm_page_count_target()) {
2223 				thread_t td;
2224 
2225 				if (vm_pages_needed == 0) {
2226 					vm_pages_needed = 1;
2227 					wakeup(&vm_pages_needed);
2228 				}
2229 				++vm_pages_waiting;	/* SMP race ok */
2230 				tsleep(&vmstats.v_free_count, 0, "pfault", hz);
2231 
2232 				/*
2233 				 * Do not stay stuck in the loop if the
2234 				 * system is trying to kill the process.
2235 				 */
2236 				td = curthread;
2237 				if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL))
2238 					break;
2239 			}
2240 		}
2241 		lwkt_reltoken(&vm_token);
2242 	}
2243 }
2244 
2245 /*
2246  * Put the specified page on the active list (if appropriate).  Ensure
2247  * that act_count is at least ACT_INIT but do not otherwise mess with it.
2248  *
2249  * The caller should be holding the page busied ? XXX
2250  * This routine may not block.
2251  */
2252 void
2253 vm_page_activate(vm_page_t m)
2254 {
2255 	u_short oqueue;
2256 
2257 	vm_page_spin_lock(m);
2258 	if (m->queue - m->pc != PQ_ACTIVE) {
2259 		_vm_page_queue_spin_lock(m);
2260 		oqueue = _vm_page_rem_queue_spinlocked(m);
2261 		/* page is left spinlocked, queue is unlocked */
2262 
2263 		if (oqueue == PQ_CACHE)
2264 			mycpu->gd_cnt.v_reactivated++;
2265 		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
2266 			if (m->act_count < ACT_INIT)
2267 				m->act_count = ACT_INIT;
2268 			_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
2269 		}
2270 		_vm_page_and_queue_spin_unlock(m);
2271 		if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
2272 			pagedaemon_wakeup();
2273 	} else {
2274 		if (m->act_count < ACT_INIT)
2275 			m->act_count = ACT_INIT;
2276 		vm_page_spin_unlock(m);
2277 	}
2278 }
2279 
2280 /*
2281  * Helper routine for vm_page_free_toq() and vm_page_cache().  This
2282  * routine is called when a page has been added to the cache or free
2283  * queues.
2284  *
2285  * This routine may not block.
2286  */
2287 static __inline void
2288 vm_page_free_wakeup(void)
2289 {
2290 	globaldata_t gd = mycpu;
2291 
2292 	/*
2293 	 * If the pageout daemon itself needs pages, then tell it that
2294 	 * there are some free.
2295 	 */
2296 	if (vm_pageout_pages_needed &&
2297 	    gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
2298 	    gd->gd_vmstats.v_pageout_free_min
2299 	) {
2300 		vm_pageout_pages_needed = 0;
2301 		wakeup(&vm_pageout_pages_needed);
2302 	}
2303 
2304 	/*
2305 	 * Wakeup processes that are waiting on memory.
2306 	 *
2307 	 * Generally speaking we want to wakeup stuck processes as soon as
2308 	 * possible.  !vm_page_count_min(0) is the absolute minimum point
2309 	 * where we can do this.  Wait a bit longer to reduce degenerate
2310 	 * re-blocking (vm_page_free_hysteresis).  The target check is just
2311 	 * to make sure the min-check w/hysteresis does not exceed the
2312 	 * normal target.
2313 	 */
2314 	if (vm_pages_waiting) {
2315 		if (!vm_page_count_min(vm_page_free_hysteresis) ||
2316 		    !vm_page_count_target()) {
2317 			vm_pages_waiting = 0;
2318 			wakeup(&vmstats.v_free_count);
2319 			++mycpu->gd_cnt.v_ppwakeups;
2320 		}
2321 #if 0
2322 		if (!vm_page_count_target()) {
2323 			/*
2324 			 * Plenty of pages are free, wakeup everyone.
2325 			 */
2326 			vm_pages_waiting = 0;
2327 			wakeup(&vmstats.v_free_count);
2328 			++mycpu->gd_cnt.v_ppwakeups;
2329 		} else if (!vm_page_count_min(0)) {
2330 			/*
2331 			 * Some pages are free, wakeup someone.
2332 			 */
2333 			int wcount = vm_pages_waiting;
2334 			if (wcount > 0)
2335 				--wcount;
2336 			vm_pages_waiting = wcount;
2337 			wakeup_one(&vmstats.v_free_count);
2338 			++mycpu->gd_cnt.v_ppwakeups;
2339 		}
2340 #endif
2341 	}
2342 }
2343 
2344 /*
2345  * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
2346  * it from its VM object.
2347  *
2348  * The vm_page must be BUSY on entry.  BUSY will be released on
2349  * return (the page will have been freed).
2350  */
2351 void
2352 vm_page_free_toq(vm_page_t m)
2353 {
2354 	mycpu->gd_cnt.v_tfree++;
2355 	KKASSERT((m->flags & PG_MAPPED) == 0);
2356 	KKASSERT(m->busy_count & PBUSY_LOCKED);
2357 
2358 	if ((m->busy_count & PBUSY_MASK) || ((m->queue - m->pc) == PQ_FREE)) {
2359 		kprintf("vm_page_free: pindex(%lu), busy %08x, "
2360 			"hold(%d)\n",
2361 			(u_long)m->pindex, m->busy_count, m->hold_count);
2362 		if ((m->queue - m->pc) == PQ_FREE)
2363 			panic("vm_page_free: freeing free page");
2364 		else
2365 			panic("vm_page_free: freeing busy page");
2366 	}
2367 
2368 	/*
2369 	 * Remove from object, spinlock the page and its queues and
2370 	 * remove from any queue.  No queue spinlock will be held
2371 	 * after this section (because the page was removed from any
2372 	 * queue).
2373 	 */
2374 	vm_page_remove(m);
2375 	vm_page_and_queue_spin_lock(m);
2376 	_vm_page_rem_queue_spinlocked(m);
2377 
2378 	/*
2379 	 * No further management of fictitious pages occurs beyond object
2380 	 * and queue removal.
2381 	 */
2382 	if ((m->flags & PG_FICTITIOUS) != 0) {
2383 		vm_page_spin_unlock(m);
2384 		vm_page_wakeup(m);
2385 		return;
2386 	}
2387 
2388 	m->valid = 0;
2389 	vm_page_undirty(m);
2390 
2391 	if (m->wire_count != 0) {
2392 		if (m->wire_count > 1) {
2393 		    panic(
2394 			"vm_page_free: invalid wire count (%d), pindex: 0x%lx",
2395 			m->wire_count, (long)m->pindex);
2396 		}
2397 		panic("vm_page_free: freeing wired page");
2398 	}
2399 
2400 	/*
2401 	 * Clear the UNMANAGED flag when freeing an unmanaged page.
2402 	 * Clear the NEED_COMMIT flag
2403 	 */
2404 	if (m->flags & PG_UNMANAGED)
2405 		vm_page_flag_clear(m, PG_UNMANAGED);
2406 	if (m->flags & PG_NEED_COMMIT)
2407 		vm_page_flag_clear(m, PG_NEED_COMMIT);
2408 
2409 	if (m->hold_count != 0) {
2410 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
2411 	} else {
2412 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
2413 	}
2414 
2415 	/*
2416 	 * This sequence allows us to clear BUSY while still holding
2417 	 * its spin lock, which reduces contention vs allocators.  We
2418 	 * must not leave the queue locked or _vm_page_wakeup() may
2419 	 * deadlock.
2420 	 */
2421 	_vm_page_queue_spin_unlock(m);
2422 	if (_vm_page_wakeup(m)) {
2423 		vm_page_spin_unlock(m);
2424 		wakeup(m);
2425 	} else {
2426 		vm_page_spin_unlock(m);
2427 	}
2428 	vm_page_free_wakeup();
2429 }
2430 
2431 /*
2432  * vm_page_unmanage()
2433  *
2434  * Prevent PV management from being done on the page.  The page is
2435  * removed from the paging queues as if it were wired, and as a
2436  * consequence of no longer being managed the pageout daemon will not
2437  * touch it (since there is no way to locate the pte mappings for the
2438  * page).  madvise() calls that mess with the pmap will also no longer
2439  * operate on the page.
2440  *
2441  * Beyond that the page is still reasonably 'normal'.  Freeing the page
2442  * will clear the flag.
2443  *
2444  * This routine is used by OBJT_PHYS objects - objects using unswappable
2445  * physical memory as backing store rather than swap-backed memory and
2446  * will eventually be extended to support 4MB unmanaged physical
2447  * mappings.
2448  *
2449  * Caller must be holding the page busy.
2450  */
2451 void
2452 vm_page_unmanage(vm_page_t m)
2453 {
2454 	KKASSERT(m->busy_count & PBUSY_LOCKED);
2455 	if ((m->flags & PG_UNMANAGED) == 0) {
2456 		if (m->wire_count == 0)
2457 			vm_page_unqueue(m);
2458 	}
2459 	vm_page_flag_set(m, PG_UNMANAGED);
2460 }
2461 
2462 /*
2463  * Mark this page as wired down by yet another map, removing it from
2464  * paging queues as necessary.
2465  *
2466  * Caller must be holding the page busy.
2467  */
2468 void
2469 vm_page_wire(vm_page_t m)
2470 {
2471 	/*
2472 	 * Only bump the wire statistics if the page is not already wired,
2473 	 * and only unqueue the page if it is on some queue (if it is unmanaged
2474 	 * it is already off the queues).  Don't do anything with fictitious
2475 	 * pages because they are always wired.
2476 	 */
2477 	KKASSERT(m->busy_count & PBUSY_LOCKED);
2478 	if ((m->flags & PG_FICTITIOUS) == 0) {
2479 		if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
2480 			if ((m->flags & PG_UNMANAGED) == 0)
2481 				vm_page_unqueue(m);
2482 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count, 1);
2483 		}
2484 		KASSERT(m->wire_count != 0,
2485 			("vm_page_wire: wire_count overflow m=%p", m));
2486 	}
2487 }
2488 
2489 /*
2490  * Release one wiring of this page, potentially enabling it to be paged again.
2491  *
2492  * Many pages placed on the inactive queue should actually go
2493  * into the cache, but it is difficult to figure out which.  What
2494  * we do instead, if the inactive target is well met, is to put
2495  * clean pages at the head of the inactive queue instead of the tail.
2496  * This will cause them to be moved to the cache more quickly and
2497  * if not actively re-referenced, freed more quickly.  If we just
2498  * stick these pages at the end of the inactive queue, heavy filesystem
2499  * meta-data accesses can cause an unnecessary paging load on memory bound
2500  * processes.  This optimization causes one-time-use metadata to be
2501  * reused more quickly.
2502  *
2503  * Pages marked PG_NEED_COMMIT are always activated and never placed on
2504  * the inactive queue.  This helps the pageout daemon determine memory
2505  * pressure and act on out-of-memory situations more quickly.
2506  *
2507  * BUT, if we are in a low-memory situation we have no choice but to
2508  * put clean pages on the cache queue.
2509  *
2510  * A number of routines use vm_page_unwire() to guarantee that the page
2511  * will go into either the inactive or active queues, and will NEVER
2512  * be placed in the cache - for example, just after dirtying a page.
2513  * Dirty pages in the cache are not allowed.
2514  *
2515  * This routine may not block.
2516  */
2517 void
2518 vm_page_unwire(vm_page_t m, int activate)
2519 {
2520 	KKASSERT(m->busy_count & PBUSY_LOCKED);
2521 	if (m->flags & PG_FICTITIOUS) {
2522 		/* do nothing */
2523 	} else if (m->wire_count <= 0) {
2524 		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
2525 	} else {
2526 		if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
2527 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count,-1);
2528 			if (m->flags & PG_UNMANAGED) {
2529 				;
2530 			} else if (activate || (m->flags & PG_NEED_COMMIT)) {
2531 				vm_page_spin_lock(m);
2532 				_vm_page_add_queue_spinlocked(m,
2533 							PQ_ACTIVE + m->pc, 0);
2534 				_vm_page_and_queue_spin_unlock(m);
2535 			} else {
2536 				vm_page_spin_lock(m);
2537 				vm_page_flag_clear(m, PG_WINATCFLS);
2538 				_vm_page_add_queue_spinlocked(m,
2539 							PQ_INACTIVE + m->pc, 0);
2540 				++vm_swapcache_inactive_heuristic;
2541 				_vm_page_and_queue_spin_unlock(m);
2542 			}
2543 		}
2544 	}
2545 }
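/*
 * Illustrative sketch (hypothetical; the wmesg strings are invented for
 * the example): bracketing a long-lived reference with a wire/unwire
 * pair.  Both calls require the page to be hard-busied.
 */
#if 0
static void
example_wire_unwire(vm_page_t m)
{
	vm_page_busy_wait(m, FALSE, "exwire");
	vm_page_wire(m);		/* keep the page off the paging queues */
	vm_page_wakeup(m);

	/* ... the page is referenced for an extended period ... */

	vm_page_busy_wait(m, FALSE, "exunwr");
	vm_page_unwire(m, 0);		/* 0: prefer the inactive queue */
	vm_page_wakeup(m);
}
#endif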
2546 
2547 /*
2548  * Move the specified page to the inactive queue.  If the page has
2549  * any associated swap, the swap is deallocated.
2550  *
2551  * Normally athead is 0 resulting in LRU operation.  athead is set
2552  * to 1 if we want this page to be 'as if it were placed in the cache',
2553  * except without unmapping it from the process address space.
2554  *
2555  * vm_page's spinlock must be held on entry and will remain held on return.
2556  * This routine may not block.
2557  */
2558 static void
2559 _vm_page_deactivate_locked(vm_page_t m, int athead)
2560 {
2561 	u_short oqueue;
2562 
2563 	/*
2564 	 * Ignore if already inactive.
2565 	 */
2566 	if (m->queue - m->pc == PQ_INACTIVE)
2567 		return;
2568 	_vm_page_queue_spin_lock(m);
2569 	oqueue = _vm_page_rem_queue_spinlocked(m);
2570 
2571 	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
2572 		if (oqueue == PQ_CACHE)
2573 			mycpu->gd_cnt.v_reactivated++;
2574 		vm_page_flag_clear(m, PG_WINATCFLS);
2575 		_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
2576 		if (athead == 0)
2577 			++vm_swapcache_inactive_heuristic;
2578 	}
2579 	/* NOTE: PQ_NONE if condition not taken */
2580 	_vm_page_queue_spin_unlock(m);
2581 	/* leaves vm_page spinlocked */
2582 }
2583 
2584 /*
2585  * Attempt to deactivate a page.
2586  *
2587  * No requirements.
2588  */
2589 void
2590 vm_page_deactivate(vm_page_t m)
2591 {
2592 	vm_page_spin_lock(m);
2593 	_vm_page_deactivate_locked(m, 0);
2594 	vm_page_spin_unlock(m);
2595 }
2596 
2597 void
2598 vm_page_deactivate_locked(vm_page_t m)
2599 {
2600 	_vm_page_deactivate_locked(m, 0);
2601 }
2602 
2603 /*
2604  * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy it.
2605  *
2606  * This function returns non-zero if it successfully moved the page to
2607  * PQ_CACHE.
2608  *
2609  * This function unconditionally unbusies the page on return.
2610  */
2611 int
2612 vm_page_try_to_cache(vm_page_t m)
2613 {
2614 	vm_page_spin_lock(m);
2615 	if (m->dirty || m->hold_count || m->wire_count ||
2616 	    (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT))) {
2617 		if (_vm_page_wakeup(m)) {
2618 			vm_page_spin_unlock(m);
2619 			wakeup(m);
2620 		} else {
2621 			vm_page_spin_unlock(m);
2622 		}
2623 		return(0);
2624 	}
2625 	vm_page_spin_unlock(m);
2626 
2627 	/*
2628 	 * Page busied by us and no longer spinlocked.  Dirty pages cannot
2629 	 * be moved to the cache.
2630 	 */
2631 	vm_page_test_dirty(m);
2632 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2633 		vm_page_wakeup(m);
2634 		return(0);
2635 	}
2636 	vm_page_cache(m);
2637 	return(1);
2638 }
2639 
2640 /*
2641  * Attempt to free the page.  If we cannot free it, we do nothing.
2642  * 1 is returned on success, 0 on failure.
2643  *
2644  * No requirements.
2645  */
2646 int
2647 vm_page_try_to_free(vm_page_t m)
2648 {
2649 	vm_page_spin_lock(m);
2650 	if (vm_page_busy_try(m, TRUE)) {
2651 		vm_page_spin_unlock(m);
2652 		return(0);
2653 	}
2654 
2655 	/*
2656 	 * The page can be in any state, including already being on the free
2657 	 * queue.  Check to see if it really can be freed.
2658 	 */
2659 	if (m->dirty ||				/* can't free if it is dirty */
2660 	    m->hold_count ||			/* or held (XXX may be wrong) */
2661 	    m->wire_count ||			/* or wired */
2662 	    (m->flags & (PG_UNMANAGED |		/* or unmanaged */
2663 			 PG_NEED_COMMIT)) ||	/* or needs a commit */
2664 	    m->queue - m->pc == PQ_FREE ||	/* already on PQ_FREE */
2665 	    m->queue - m->pc == PQ_HOLD) {	/* already on PQ_HOLD */
2666 		if (_vm_page_wakeup(m)) {
2667 			vm_page_spin_unlock(m);
2668 			wakeup(m);
2669 		} else {
2670 			vm_page_spin_unlock(m);
2671 		}
2672 		return(0);
2673 	}
2674 	vm_page_spin_unlock(m);
2675 
2676 	/*
2677 	 * We can probably free the page.
2678 	 *
2679 	 * Page busied by us and no longer spinlocked.  Dirty pages will
2680 	 * not be freed by this function.    We have to re-test the
2681 	 * not be freed by this function.  We have to re-test the
2682 	 */
2683 	vm_page_test_dirty(m);
2684 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2685 		vm_page_wakeup(m);
2686 		return(0);
2687 	}
2688 	vm_page_protect(m, VM_PROT_NONE);
2689 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2690 		vm_page_wakeup(m);
2691 		return(0);
2692 	}
2693 	vm_page_free(m);
2694 	return(1);
2695 }
2696 
2697 /*
2698  * vm_page_cache
2699  *
2700  * Put the specified page onto the page cache queue (if appropriate).
2701  *
2702  * The page must be busy, and this routine will release the busy and
2703  * possibly even free the page.
2704  */
2705 void
2706 vm_page_cache(vm_page_t m)
2707 {
2708 	/*
2709 	 * Not suitable for the cache
2710 	 */
2711 	if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
2712 	    (m->busy_count & PBUSY_MASK) ||
2713 	    m->wire_count || m->hold_count) {
2714 		vm_page_wakeup(m);
2715 		return;
2716 	}
2717 
2718 	/*
2719 	 * Already in the cache (and thus not mapped)
2720 	 */
2721 	if ((m->queue - m->pc) == PQ_CACHE) {
2722 		KKASSERT((m->flags & PG_MAPPED) == 0);
2723 		vm_page_wakeup(m);
2724 		return;
2725 	}
2726 
2727 	/*
2728 	 * Caller is required to test m->dirty, but note that the act of
2729 	 * removing the page from its maps can cause it to become dirty
2730 	 * on an SMP system due to another cpu running in usermode.
2731 	 */
2732 	if (m->dirty) {
2733 		panic("vm_page_cache: caching a dirty page, pindex: %ld",
2734 			(long)m->pindex);
2735 	}
2736 
2737 	/*
2738 	 * Remove all pmaps and indicate that the page is not
2739 	 * writeable or mapped.  Our vm_page_protect() call may
2740 	 * have blocked (especially w/ VM_PROT_NONE), so recheck
2741 	 * everything.
2742 	 */
2743 	vm_page_protect(m, VM_PROT_NONE);
2744 	if ((m->flags & (PG_UNMANAGED | PG_MAPPED)) ||
2745 	    (m->busy_count & PBUSY_MASK) ||
2746 	    m->wire_count || m->hold_count) {
2747 		vm_page_wakeup(m);
2748 	} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
2749 		vm_page_deactivate(m);
2750 		vm_page_wakeup(m);
2751 	} else {
2752 		_vm_page_and_queue_spin_lock(m);
2753 		_vm_page_rem_queue_spinlocked(m);
2754 		_vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
2755 		_vm_page_queue_spin_unlock(m);
2756 		if (_vm_page_wakeup(m)) {
2757 			vm_page_spin_unlock(m);
2758 			wakeup(m);
2759 		} else {
2760 			vm_page_spin_unlock(m);
2761 		}
2762 		vm_page_free_wakeup();
2763 	}
2764 }
2765 
2766 /*
2767  * vm_page_dontneed()
2768  *
2769  * Cache, deactivate, or do nothing as appropriate.  This routine
2770  * is typically used by madvise() MADV_DONTNEED.
2771  *
2772  * Generally speaking we want to move the page into the cache so
2773  * it gets reused quickly.  However, this can result in a silly syndrome
2774  * due to the page recycling too quickly.  Small objects will not be
2775  * fully cached.  On the other hand, if we move the page to the inactive
2776  * queue we wind up with a problem whereby very large objects
2777  * unnecessarily blow away our inactive and cache queues.
2778  *
2779  * The solution is to move the pages based on a fixed weighting.  We
2780  * either leave them alone, deactivate them, or move them to the cache,
2781  * where moving them to the cache has the highest weighting.
2782  * By forcing some pages into other queues we eventually force the
2783  * system to balance the queues, potentially recovering other unrelated
2784  * space from active.  The idea is to not force this to happen too
2785  * often.
2786  *
2787  * The page must be busied.
2788  */
2789 void
2790 vm_page_dontneed(vm_page_t m)
2791 {
2792 	static int dnweight;
2793 	int dnw;
2794 	int head;
2795 
2796 	dnw = ++dnweight;
2797 
2798 	/*
2799 	 * Occasionally leave the page alone
2800 	 */
2801 	if ((dnw & 0x01F0) == 0 ||
2802 	    m->queue - m->pc == PQ_INACTIVE ||
2803 	    m->queue - m->pc == PQ_CACHE
2804 	) {
2805 		if (m->act_count >= ACT_INIT)
2806 			--m->act_count;
2807 		return;
2808 	}
2809 
2810 	/*
2811 	 * If vm_page_dontneed() is inactivating a page, it must clear
2812 	 * the referenced flag; otherwise the pagedaemon will see references
2813 	 * on the page in the inactive queue and reactivate it. Until the
2814 	 * page can move to the cache queue, madvise's job is not done.
2815 	 */
2816 	vm_page_flag_clear(m, PG_REFERENCED);
2817 	pmap_clear_reference(m);
2818 
2819 	if (m->dirty == 0)
2820 		vm_page_test_dirty(m);
2821 
2822 	if (m->dirty || (dnw & 0x0070) == 0) {
2823 		/*
2824 		 * Deactivate the page 3 times out of 32.
2825 		 */
2826 		head = 0;
2827 	} else {
2828 		/*
2829 		 * Cache the page 28 times out of every 32.  Note that
2830 		 * the page is deactivated instead of cached, but placed
2831 		 * at the head of the queue instead of the tail.
2832 		 */
2833 		head = 1;
2834 	}
2835 	vm_page_spin_lock(m);
2836 	_vm_page_deactivate_locked(m, head);
2837 	vm_page_spin_unlock(m);
2838 }
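/*
 * Worked arithmetic for the weighting above (illustration): dnweight is a
 * free-running counter, so out of any 512 consecutive calls on clean pages
 * that are not already inactive or cached, (dnw & 0x01F0) == 0 holds for
 * 16 of them (bits 4-8 clear), i.e. 1/32 are left alone.
 * (dnw & 0x0070) == 0 holds for 64 of them (bits 4-6 clear); excluding the
 * 16 already left alone, 48/512 = 3/32 are deactivated normally, and the
 * remaining 448/512 = 28/32 are pushed toward the cache by deactivating at
 * the head of the inactive queue.
 */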
2839 
2840 /*
2841  * These routines manipulate the 'soft busy' count for a page.  A soft busy
2842  * is almost like a hard BUSY except that it allows certain compatible
2843  * operations to occur on the page while it is busy.  For example, a page
2844  * undergoing a write can still be mapped read-only.
2845  *
2846  * We also use soft-busy to quickly pmap_enter shared read-only pages
2847  * without having to hold the page locked.
2848  *
2849  * The soft-busy count can be > 1 in situations where multiple threads
2850  * are pmap_enter()ing the same page simultaneously, or when two buffer
2851  * cache buffers overlap the same page.
2852  *
2853  * The caller must hold the page BUSY when making these two calls.
2854  */
2855 void
2856 vm_page_io_start(vm_page_t m)
2857 {
2858 	uint32_t ocount;
2859 
2860 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
2861 	KKASSERT(ocount & PBUSY_LOCKED);
2862 }
2863 
2864 void
2865 vm_page_io_finish(vm_page_t m)
2866 {
2867 	uint32_t ocount;
2868 
2869 	ocount = atomic_fetchadd_int(&m->busy_count, -1);
2870 	KKASSERT(ocount & PBUSY_MASK);
2871 #if 0
2872 	if (((ocount - 1) & (PBUSY_LOCKED | PBUSY_MASK)) == 0)
2873 		wakeup(m);
2874 #endif
2875 }
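/*
 * Illustrative sketch (hypothetical; the wmesg string is invented for the
 * example): the usual bracketing of page I/O with a soft-busy while the
 * caller holds the hard busy, as required above.
 */
#if 0
static void
example_page_io(vm_page_t m)
{
	vm_page_busy_wait(m, FALSE, "exsbsy");	/* hard-busy the page */
	vm_page_io_start(m);	/* soft-busy for the duration of the I/O */
	/* ... issue the I/O; compatible operations may still occur ... */
	vm_page_io_finish(m);	/* drop the soft-busy */
	vm_page_wakeup(m);	/* drop the hard busy */
}
#endif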
2876 
2877 /*
2878  * Attempt to soft-busy a page.  The page must not be PBUSY_LOCKED.
2879  *
2880  * We can't use fetchadd here because we might race a hard-busy and the
2881  * page freeing code asserts on a non-zero soft-busy count (even if only
2882  * temporary).
2883  *
2884  * Returns 0 on success, non-zero on failure.
2885  */
2886 int
2887 vm_page_sbusy_try(vm_page_t m)
2888 {
2889 	uint32_t ocount;
2890 
2891 	for (;;) {
2892 		ocount = m->busy_count;
2893 		cpu_ccfence();
2894 		if (ocount & PBUSY_LOCKED)
2895 			return 1;
2896 		if (atomic_cmpset_int(&m->busy_count, ocount, ocount + 1))
2897 			break;
2898 	}
2899 	return 0;
2900 #if 0
2901 	if (m->busy_count & PBUSY_LOCKED)
2902 		return 1;
2903 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
2904 	if (ocount & PBUSY_LOCKED) {
2905 		vm_page_sbusy_drop(m);
2906 		return 1;
2907 	}
2908 	return 0;
2909 #endif
2910 }
2911 
2912 /*
2913  * Indicate that a clean VM page requires a filesystem commit and cannot
2914  * be reused.  Used by tmpfs.
2915  */
2916 void
2917 vm_page_need_commit(vm_page_t m)
2918 {
2919 	vm_page_flag_set(m, PG_NEED_COMMIT);
2920 	vm_object_set_writeable_dirty(m->object);
2921 }
2922 
2923 void
2924 vm_page_clear_commit(vm_page_t m)
2925 {
2926 	vm_page_flag_clear(m, PG_NEED_COMMIT);
2927 }
2928 
2929 /*
2930  * Grab a page, blocking if it is busy and allocating a page if necessary.
2931  * A busy page is returned or NULL.  The page may or may not be valid and
2932  * might not be on a queue (the caller is responsible for the disposition of
2933  * the page).
2934  *
2935  * If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the
2936  * page will be zero'd and marked valid.
2937  *
2938  * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
2939  * valid even if it already exists.
2940  *
2941  * If VM_ALLOC_RETRY is specified this routine will never return NULL.  Also
2942  * note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified.
2943  * VM_ALLOC_NULL_OK is implied when VM_ALLOC_RETRY is specified.
2944  *
2945  * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
2946  * always returned if we had blocked.
2947  *
2948  * This routine may not be called from an interrupt.
2949  *
2950  * No other requirements.
2951  */
2952 vm_page_t
2953 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
2954 {
2955 	vm_page_t m;
2956 	int error;
2957 	int shared = 1;
2958 
2959 	KKASSERT(allocflags &
2960 		(VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
2961 	vm_object_hold_shared(object);
2962 	for (;;) {
2963 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
2964 		if (error) {
2965 			vm_page_sleep_busy(m, TRUE, "pgrbwt");
2966 			if ((allocflags & VM_ALLOC_RETRY) == 0) {
2967 				m = NULL;
2968 				break;
2969 			}
2970 			/* retry */
2971 		} else if (m == NULL) {
2972 			if (shared) {
2973 				vm_object_upgrade(object);
2974 				shared = 0;
2975 			}
2976 			if (allocflags & VM_ALLOC_RETRY)
2977 				allocflags |= VM_ALLOC_NULL_OK;
2978 			m = vm_page_alloc(object, pindex,
2979 					  allocflags & ~VM_ALLOC_RETRY);
2980 			if (m)
2981 				break;
2982 			vm_wait(0);
2983 			if ((allocflags & VM_ALLOC_RETRY) == 0)
2984 				goto failed;
2985 		} else {
2986 			/* m found */
2987 			break;
2988 		}
2989 	}
2990 
2991 	/*
2992 	 * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
2993 	 *
2994 	 * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
2995 	 * valid even if already valid.
2996 	 *
2997 	 * NOTE!  We have removed all of the PG_ZERO optimizations and also
2998 	 *	  removed the idle zeroing code.  These optimizations actually
2999 	 *	  slow things down on modern cpus because the zeroed area is
3000 	 *	  likely uncached, placing a memory-access burden on the
3001 	 *	  accessors taking the fault.
3002 	 *
3003 	 *	  By always zeroing the page in-line with the fault, no
3004 	 *	  dynamic ram reads are needed and the caches are hot, ready
3005 	 *	  for userland to access the memory.
3006 	 */
3007 	if (m->valid == 0) {
3008 		if (allocflags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
3009 			pmap_zero_page(VM_PAGE_TO_PHYS(m));
3010 			m->valid = VM_PAGE_BITS_ALL;
3011 		}
3012 	} else if (allocflags & VM_ALLOC_FORCE_ZERO) {
3013 		pmap_zero_page(VM_PAGE_TO_PHYS(m));
3014 		m->valid = VM_PAGE_BITS_ALL;
3015 	}
3016 failed:
3017 	vm_object_drop(object);
3018 	return(m);
3019 }
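/*
 * Illustrative sketch (hypothetical object/pindex, names invented for the
 * example): with VM_ALLOC_RETRY the grab never returns NULL, and with
 * VM_ALLOC_ZERO a newly allocated page is returned zeroed and valid.
 */
#if 0
static vm_page_t
example_grab_zeroed(vm_object_t obj, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_page_grab(obj, pindex,
			 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO);
	/* m is returned busied; drop the busy when the caller is done */
	vm_page_wakeup(m);
	return (m);
}
#endif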
3020 
3021 /*
3022  * Mapping function for valid bits or for dirty bits in
3023  * a page.  May not block.
3024  *
3025  * Inputs are required to range within a page.
3026  *
3027  * No requirements.
3028  * Non blocking.
3029  */
3030 int
3031 vm_page_bits(int base, int size)
3032 {
3033 	int first_bit;
3034 	int last_bit;
3035 
3036 	KASSERT(
3037 	    base + size <= PAGE_SIZE,
3038 	    ("vm_page_bits: illegal base/size %d/%d", base, size)
3039 	);
3040 
3041 	if (size == 0)		/* handle degenerate case */
3042 		return(0);
3043 
3044 	first_bit = base >> DEV_BSHIFT;
3045 	last_bit = (base + size - 1) >> DEV_BSHIFT;
3046 
3047 	return ((2 << last_bit) - (1 << first_bit));
3048 }
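/*
 * Worked example (assuming DEV_BSIZE == 512, DEV_BSHIFT == 9 and a
 * 4096 byte page, i.e. eight DEV_BSIZE chunks per page):
 *
 *	vm_page_bits(0, PAGE_SIZE) -> first_bit 0, last_bit 7,
 *				      (2 << 7) - (1 << 0) = 0xFF (all chunks)
 *	vm_page_bits(512, 1024)    -> first_bit 1, last_bit 2,
 *				      (2 << 2) - (1 << 1) = 0x06 (chunks 1-2)
 */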
3049 
3050 /*
3051  * Sets portions of a page valid and clean.  The arguments are expected
3052  * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
3053  * of any partial chunks touched by the range.  The invalid portion of
3054  * such chunks will be zero'd.
3055  *
3056  * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
3057  *	 align base to DEV_BSIZE so as not to mark clean a partially
3058  *	 truncated device block.  Otherwise the dirty page status might be
3059  *	 lost.
3060  *
3061  * This routine may not block.
3062  *
3063  * (base + size) must be less than or equal to PAGE_SIZE.
3064  */
3065 static void
3066 _vm_page_zero_valid(vm_page_t m, int base, int size)
3067 {
3068 	int frag;
3069 	int endoff;
3070 
3071 	if (size == 0)	/* handle degenerate case */
3072 		return;
3073 
3074 	/*
3075 	 * If the base is not DEV_BSIZE aligned and the valid
3076 	 * bit is clear, we have to zero out a portion of the
3077 	 * first block.
3078 	 */
3079 
3080 	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
3081 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
3082 	) {
3083 		pmap_zero_page_area(
3084 		    VM_PAGE_TO_PHYS(m),
3085 		    frag,
3086 		    base - frag
3087 		);
3088 	}
3089 
3090 	/*
3091 	 * If the ending offset is not DEV_BSIZE aligned and the
3092 	 * valid bit is clear, we have to zero out a portion of
3093 	 * the last block.
3094 	 */
3095 
3096 	endoff = base + size;
3097 
3098 	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
3099 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
3100 	) {
3101 		pmap_zero_page_area(
3102 		    VM_PAGE_TO_PHYS(m),
3103 		    endoff,
3104 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
3105 		);
3106 	}
3107 }
3108 
3109 /*
3110  * Set valid, clear dirty bits.  If validating the entire
3111  * page we can safely clear the pmap modify bit.  We also
3112  * use this opportunity to clear the PG_NOSYNC flag.  If a process
3113  * takes a write fault on a MAP_NOSYNC memory area the flag will
3114  * be set again.
3115  *
3116  * We set valid bits inclusive of any overlap, but we can only
3117  * clear dirty bits for DEV_BSIZE chunks that are fully within
3118  * the range.
3119  *
3120  * Page must be busied?
3121  * No other requirements.
3122  */
3123 void
3124 vm_page_set_valid(vm_page_t m, int base, int size)
3125 {
3126 	_vm_page_zero_valid(m, base, size);
3127 	m->valid |= vm_page_bits(base, size);
3128 }
3129 
3130 
3131 /*
3132  * Set valid bits and clear dirty bits.
3133  *
3134  * Page must be busied by caller.
3135  *
3136  * NOTE: This function does not clear the pmap modified bit.
3137  *	 Also note that e.g. NFS may use a byte-granular base
3138  *	 and size.
3139  *
3140  * No other requirements.
3141  */
3142 void
3143 vm_page_set_validclean(vm_page_t m, int base, int size)
3144 {
3145 	int pagebits;
3146 
3147 	_vm_page_zero_valid(m, base, size);
3148 	pagebits = vm_page_bits(base, size);
3149 	m->valid |= pagebits;
3150 	m->dirty &= ~pagebits;
3151 	if (base == 0 && size == PAGE_SIZE) {
3152 		/*pmap_clear_modify(m);*/
3153 		vm_page_flag_clear(m, PG_NOSYNC);
3154 	}
3155 }
3156 
3157 /*
3158  * Set valid & dirty.  Used by buwrite()
3159  *
3160  * Page must be busied by caller.
3161  */
3162 void
3163 vm_page_set_validdirty(vm_page_t m, int base, int size)
3164 {
3165 	int pagebits;
3166 
3167 	pagebits = vm_page_bits(base, size);
3168 	m->valid |= pagebits;
3169 	m->dirty |= pagebits;
3170 	if (m->object)
3171 		vm_object_set_writeable_dirty(m->object);
3172 }
3173 
3174 /*
3175  * Clear dirty bits.
3176  *
3177  * NOTE: This function does not clear the pmap modified bit.
3178  *	 Also note that e.g. NFS may use a byte-granular base
3179  *	 and size.
3180  *
3181  * Page must be busied?
3182  * No other requirements.
3183  */
3184 void
3185 vm_page_clear_dirty(vm_page_t m, int base, int size)
3186 {
3187 	m->dirty &= ~vm_page_bits(base, size);
3188 	if (base == 0 && size == PAGE_SIZE) {
3189 		/*pmap_clear_modify(m);*/
3190 		vm_page_flag_clear(m, PG_NOSYNC);
3191 	}
3192 }
3193 
3194 /*
3195  * Make the page all-dirty.
3196  *
3197  * Also make sure the related object and vnode reflect the fact that the
3198  * object may now contain a dirty page.
3199  *
3200  * Page must be busied?
3201  * No other requirements.
3202  */
3203 void
3204 vm_page_dirty(vm_page_t m)
3205 {
3206 #ifdef INVARIANTS
3207 	int pqtype = m->queue - m->pc;
3208 #endif
3209 	KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
3210 		("vm_page_dirty: page in free/cache queue!"));
3211 	if (m->dirty != VM_PAGE_BITS_ALL) {
3212 		m->dirty = VM_PAGE_BITS_ALL;
3213 		if (m->object)
3214 			vm_object_set_writeable_dirty(m->object);
3215 	}
3216 }
3217 
3218 /*
3219  * Invalidates DEV_BSIZE'd chunks within a page.  Both the
3220  * valid and dirty bits for the affected areas are cleared.
3221  *
3222  * Page must be busied?
3223  * Does not block.
3224  * No other requirements.
3225  */
3226 void
3227 vm_page_set_invalid(vm_page_t m, int base, int size)
3228 {
3229 	int bits;
3230 
3231 	bits = vm_page_bits(base, size);
3232 	m->valid &= ~bits;
3233 	m->dirty &= ~bits;
3234 	atomic_add_int(&m->object->generation, 1);
3235 }
3236 
3237 /*
3238  * The kernel assumes that the invalid portions of a page contain
3239  * garbage, but such pages can be mapped into memory by user code.
3240  * When this occurs, we must zero out the non-valid portions of the
3241  * page so user code sees what it expects.
3242  *
3243  * Pages are most often semi-valid when the end of a file is mapped
3244  * into memory and the file's size is not page aligned.
3245  *
3246  * Page must be busied?
3247  * No other requirements.
3248  */
3249 void
3250 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
3251 {
3252 	int b;
3253 	int i;
3254 
3255 	/*
3256 	 * Scan the valid bits looking for invalid sections that
3257 	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
3258 	 * valid bit may be set) have already been zeroed by
3259 	 * vm_page_set_validclean().
3260 	 */
3261 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
3262 		if (i == (PAGE_SIZE / DEV_BSIZE) ||
3263 		    (m->valid & (1 << i))
3264 		) {
3265 			if (i > b) {
3266 				pmap_zero_page_area(
3267 				    VM_PAGE_TO_PHYS(m),
3268 				    b << DEV_BSHIFT,
3269 				    (i - b) << DEV_BSHIFT
3270 				);
3271 			}
3272 			b = i + 1;
3273 		}
3274 	}
3275 
3276 	/*
3277 	 * setvalid is TRUE when we can safely set the zero'd areas
3278 	 * as being valid.  We can do this if there are no cache consistency
3279 	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
3280 	 */
3281 	if (setvalid)
3282 		m->valid = VM_PAGE_BITS_ALL;
3283 }
3284 
3285 /*
3286  * Is a (partial) page valid?  Note that in the degenerate case where
3287  * size == 0 this returns FALSE if the page is entirely invalid, and
3288  * TRUE otherwise.
3289  *
3290  * Does not block.
3291  * No other requirements.
3292  */
3293 int
3294 vm_page_is_valid(vm_page_t m, int base, int size)
3295 {
3296 	int bits = vm_page_bits(base, size);
3297 
3298 	if (m->valid && ((m->valid & bits) == bits))
3299 		return 1;
3300 	else
3301 		return 0;
3302 }
3303 
3304 /*
3305  * Update dirty bits from pmap/mmu.  May not block.
3306  *
3307  * Caller must hold the page busy
3308  */
3309 void
3310 vm_page_test_dirty(vm_page_t m)
3311 {
3312 	if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
3313 		vm_page_dirty(m);
3314 	}
3315 }
3316 
3317 #include "opt_ddb.h"
3318 #ifdef DDB
3319 #include <ddb/ddb.h>
3320 
3321 DB_SHOW_COMMAND(page, vm_page_print_page_info)
3322 {
3323 	db_printf("vmstats.v_free_count: %ld\n", vmstats.v_free_count);
3324 	db_printf("vmstats.v_cache_count: %ld\n", vmstats.v_cache_count);
3325 	db_printf("vmstats.v_inactive_count: %ld\n", vmstats.v_inactive_count);
3326 	db_printf("vmstats.v_active_count: %ld\n", vmstats.v_active_count);
3327 	db_printf("vmstats.v_wire_count: %ld\n", vmstats.v_wire_count);
3328 	db_printf("vmstats.v_free_reserved: %ld\n", vmstats.v_free_reserved);
3329 	db_printf("vmstats.v_free_min: %ld\n", vmstats.v_free_min);
3330 	db_printf("vmstats.v_free_target: %ld\n", vmstats.v_free_target);
3331 	db_printf("vmstats.v_cache_min: %ld\n", vmstats.v_cache_min);
3332 	db_printf("vmstats.v_inactive_target: %ld\n",
3333 		  vmstats.v_inactive_target);
3334 }
3335 
3336 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
3337 {
3338 	int i;
3339 	db_printf("PQ_FREE:");
3340 	for (i = 0; i < PQ_L2_SIZE; i++) {
3341 		db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
3342 	}
3343 	db_printf("\n");
3344 
3345 	db_printf("PQ_CACHE:");
3346 	for(i = 0; i < PQ_L2_SIZE; i++) {
3347 		db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
3348 	}
3349 	db_printf("\n");
3350 
3351 	db_printf("PQ_ACTIVE:");
3352 	for(i = 0; i < PQ_L2_SIZE; i++) {
3353 		db_printf(" %d", vm_page_queues[PQ_ACTIVE + i].lcnt);
3354 	}
3355 	db_printf("\n");
3356 
3357 	db_printf("PQ_INACTIVE:");
3358 	for(i = 0; i < PQ_L2_SIZE; i++) {
3359 		db_printf(" %d", vm_page_queues[PQ_INACTIVE + i].lcnt);
3360 	}
3361 	db_printf("\n");
3362 }
3363 #endif /* DDB */
3364