xref: /dragonfly/sys/vm/vm_page.c (revision 13c79986)
1 /*
2  * Copyright (c) 2003-2019 The DragonFly Project.  All rights reserved.
3  * Copyright (c) 1991 Regents of the University of California.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
37  * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
38  */
39 
40 /*
41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42  * All rights reserved.
43  *
44  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45  *
46  * Permission to use, copy, modify and distribute this software and
47  * its documentation is hereby granted, provided that both the copyright
48  * notice and this permission notice appear in all copies of the
49  * software, derivative works or modified versions, and any portions
50  * thereof, and that both notices appear in supporting documentation.
51  *
52  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55  *
56  * Carnegie Mellon requests users of this software to return to
57  *
58  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59  *  School of Computer Science
60  *  Carnegie Mellon University
61  *  Pittsburgh PA 15213-3890
62  *
63  * any improvements or extensions that they make and grant Carnegie the
64  * rights to redistribute these changes.
65  */
66 /*
67  * Resident memory management module.  The module manipulates 'VM pages'.
68  * A VM page is the core building block for memory management.
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/malloc.h>
74 #include <sys/proc.h>
75 #include <sys/vmmeter.h>
76 #include <sys/vnode.h>
77 #include <sys/kernel.h>
78 #include <sys/alist.h>
79 #include <sys/sysctl.h>
80 #include <sys/cpu_topology.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <sys/lock.h>
85 #include <vm/vm_kern.h>
86 #include <vm/pmap.h>
87 #include <vm/vm_map.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_pageout.h>
91 #include <vm/vm_pager.h>
92 #include <vm/vm_extern.h>
93 #include <vm/swap_pager.h>
94 
95 #include <machine/inttypes.h>
96 #include <machine/md_var.h>
97 #include <machine/specialreg.h>
98 #include <machine/bus_dma.h>
99 
100 #include <vm/vm_page2.h>
101 #include <sys/spinlock2.h>
102 
103 struct vm_page_hash_elm {
104 	vm_page_t	m;
105 	int		ticks;
106 	int		unused01;
107 };
108 
109 #define VM_PAGE_HASH_SET	4		    /* power of 2, set-assoc */
110 #define VM_PAGE_HASH_MAX	(16 * 1024 * 1024)  /* power of 2, max size */
111 
112 /*
113  * SET - Minimum required set associative size, must be a power of 2.  We
114  *	 want this to match or exceed the set-associativity of the cpu,
115  *	 up to a reasonable limit (we will use 16).
116  */
117 __read_mostly static int set_assoc_mask = 16 - 1;
118 
119 static void vm_page_queue_init(void);
120 static void vm_page_free_wakeup(void);
121 static vm_page_t vm_page_select_cache(u_short pg_color);
122 static vm_page_t _vm_page_list_find_wide(int basequeue, int index, int *lastp);
123 static vm_page_t _vm_page_list_find2_wide(int bq1, int bq2, int index,
124 			int *lastp1, int *lastp);
125 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
126 static void vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes);
127 
128 /*
129  * Array of tailq lists
130  */
131 struct vpgqueues vm_page_queues[PQ_COUNT];
132 
133 static volatile int vm_pages_waiting;
134 static struct alist vm_contig_alist;
135 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
136 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
137 
138 __read_mostly static int vm_page_hash_vnode_only;
139 __read_mostly static int vm_page_hash_size;
140 __read_mostly static struct vm_page_hash_elm *vm_page_hash;
141 
142 static u_long vm_dma_reserved = 0;
143 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
144 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
145 	    "Memory reserved for DMA");
146 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
147 	    &vm_contig_alist.bl_free, 0, "DMA reserve pages still free");
148 
149 SYSCTL_INT(_vm, OID_AUTO, page_hash_vnode_only, CTLFLAG_RW,
150 	    &vm_page_hash_vnode_only, 0, "Only hash vnode pages");
151 #if 0
152 static int vm_page_hash_debug;
153 SYSCTL_INT(_vm, OID_AUTO, page_hash_debug, CTLFLAG_RW,
154 	    &vm_page_hash_debug, 0, "Debug the vm_page hash table");
155 #endif
156 
157 static int vm_contig_verbose = 0;
158 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
159 
160 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
161 	     vm_pindex_t, pindex);
162 
163 static void
164 vm_page_queue_init(void)
165 {
166 	int i;
167 
168 	for (i = 0; i < PQ_L2_SIZE; i++)
169 		vm_page_queues[PQ_FREE+i].cnt_offset =
170 			offsetof(struct vmstats, v_free_count);
171 	for (i = 0; i < PQ_L2_SIZE; i++)
172 		vm_page_queues[PQ_CACHE+i].cnt_offset =
173 			offsetof(struct vmstats, v_cache_count);
174 	for (i = 0; i < PQ_L2_SIZE; i++)
175 		vm_page_queues[PQ_INACTIVE+i].cnt_offset =
176 			offsetof(struct vmstats, v_inactive_count);
177 	for (i = 0; i < PQ_L2_SIZE; i++)
178 		vm_page_queues[PQ_ACTIVE+i].cnt_offset =
179 			offsetof(struct vmstats, v_active_count);
180 	for (i = 0; i < PQ_L2_SIZE; i++)
181 		vm_page_queues[PQ_HOLD+i].cnt_offset =
182 			offsetof(struct vmstats, v_active_count);
183 	/* PQ_NONE has no queue */
184 
185 	for (i = 0; i < PQ_COUNT; i++) {
186 		vm_page_queues[i].lastq = -1;
187 		TAILQ_INIT(&vm_page_queues[i].pl);
188 		spin_init(&vm_page_queues[i].spin, "vm_page_queue_init");
189 	}
190 }
191 
192 /*
193  * note: place in initialized data section?  Is this necessary?
194  */
195 vm_pindex_t first_page = 0;
196 vm_pindex_t vm_page_array_size = 0;
197 vm_page_t vm_page_array = NULL;
198 vm_paddr_t vm_low_phys_reserved;
199 
200 /*
201  * (low level boot)
202  *
203  * Sets the page size, perhaps based upon the memory size.
204  * Must be called before any use of page-size dependent functions.
205  */
206 void
207 vm_set_page_size(void)
208 {
209 	if (vmstats.v_page_size == 0)
210 		vmstats.v_page_size = PAGE_SIZE;
211 	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
212 		panic("vm_set_page_size: page size not a power of two");
213 }
214 
215 /*
216  * (low level boot)
217  *
218  * Add a new page to the freelist for use by the system.  New pages
219  * are added to both the head and tail of the associated free page
220  * queue in a bottom-up fashion, so both zero'd and non-zero'd page
221  * requests pull 'recent' adds (higher physical addresses) first.
222  *
223  * Beware that the page zeroing daemon will also be running soon after
224  * boot, moving pages from the head to the tail of the PQ_FREE queues.
225  *
226  * Must be called in a critical section.
227  */
228 static void
229 vm_add_new_page(vm_paddr_t pa, int *badcountp)
230 {
231 	struct vpgqueues *vpq;
232 	vm_page_t m;
233 
234 	m = PHYS_TO_VM_PAGE(pa);
235 
236 	/*
237 	 * Make sure it isn't a duplicate (due to BIOS page range overlaps,
238 	 * which we consider bugs... but don't crash).  Note that m->phys_addr
239 	 * is pre-initialized, so use m->queue as a check.
240 	 */
241 	if (m->queue) {
242 		if (*badcountp < 10) {
243 			kprintf("vm_add_new_page: duplicate pa %016jx\n",
244 				(intmax_t)pa);
245 			++*badcountp;
246 		} else if (*badcountp == 10) {
247 			kprintf("vm_add_new_page: duplicate pa (many more)\n");
248 			++*badcountp;
249 		}
250 		return;
251 	}
252 
253 	m->phys_addr = pa;
254 	m->flags = 0;
255 	m->pat_mode = PAT_WRITE_BACK;
256 	m->pc = (pa >> PAGE_SHIFT);
257 
258 	/*
259 	 * Twist for cpu localization in addition to page coloring, so
260 	 * different cpus selecting by m->queue get different page colors.
261 	 */
262 	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
263 	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
264 	m->pc &= PQ_L2_MASK;
265 
266 	/*
267 	 * Reserve a certain number of contiguous low memory pages for
268 	 * contigmalloc() to use.
269 	 *
270 	 * Even though these pages represent real ram and can be
271 	 * reverse-mapped, we set PG_FICTITIOUS and PG_UNQUEUED
272 	 * because their use is special-cased.
273 	 *
274 	 * WARNING! Once PG_FICTITIOUS is set, vm_page_wire*()
275 	 *	    and vm_page_unwire*() calls have no effect.
276 	 */
277 	if (pa < vm_low_phys_reserved) {
278 		atomic_add_long(&vmstats.v_page_count, 1);
279 		atomic_add_long(&vmstats.v_dma_pages, 1);
280 		m->flags |= PG_FICTITIOUS | PG_UNQUEUED;
281 		m->queue = PQ_NONE;
282 		m->wire_count = 1;
283 		atomic_add_long(&vmstats.v_wire_count, 1);
284 		alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
285 		return;
286 	}
287 
288 	/*
289 	 * General page
290 	 */
291 	m->queue = m->pc + PQ_FREE;
292 	KKASSERT(m->dirty == 0);
293 
294 	atomic_add_long(&vmstats.v_page_count, 1);
295 	atomic_add_long(&vmstats.v_free_count, 1);
296 	vpq = &vm_page_queues[m->queue];
297 	TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
298 	++vpq->lcnt;
299 }
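
/*
 * Worked example of the color twist above (illustrative only; assumes
 * PQ_L2_SIZE = 1024, PQ_L2_MASK = 1023, PAGE_SHIFT = 12):
 *
 *	pa = 0x12345000  ->  page index 0x12345 (74565)
 *
 *	pc  = 0x12345
 *	pc ^= 0x12345 / 1024          = 0x48
 *	pc ^= 0x12345 / (1024 * 1024) = 0
 *	pc &= 1023                    -> 0x30D (781)
 *
 * Without the twist the color would simply be 0x12345 & 1023 = 0x345
 * (837), and pages exactly PQ_L2_SIZE apart would always share a color.
 * With the twist they generally land on different colors, so cpus
 * selecting by m->queue spread out better.
 */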
300 
301 /*
302  * (low level boot)
303  *
304  * Initializes the resident memory module.
305  *
306  * Preallocates memory for critical VM structures and arrays prior to
307  * kernel_map becoming available.
308  *
309  * Memory is allocated from (virtual2_start, virtual2_end) if available,
310  * otherwise memory is allocated from (virtual_start, virtual_end).
311  *
312  * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
313  * large enough to hold vm_page_array & other structures for machines with
314  * large amounts of ram, so we want to use virtual2* when available.
315  */
316 void
317 vm_page_startup(void)
318 {
319 	vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
320 	vm_offset_t mapped;
321 	vm_pindex_t npages;
322 	vm_paddr_t page_range;
323 	vm_paddr_t new_end;
324 	int i;
325 	vm_paddr_t pa;
326 	vm_paddr_t last_pa;
327 	vm_paddr_t end;
328 	vm_paddr_t biggestone, biggestsize;
329 	vm_paddr_t total;
330 	vm_page_t m;
331 	int badcount;
332 
333 	total = 0;
334 	badcount = 0;
335 	biggestsize = 0;
336 	biggestone = 0;
337 	vaddr = round_page(vaddr);
338 
339 	/*
340 	 * Make sure ranges are page-aligned.
341 	 */
342 	for (i = 0; phys_avail[i].phys_end; ++i) {
343 		phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
344 		phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
345 		if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
346 			phys_avail[i].phys_end = phys_avail[i].phys_beg;
347 	}
348 
349 	/*
350 	 * Locate largest block
351 	 */
352 	for (i = 0; phys_avail[i].phys_end; ++i) {
353 		vm_paddr_t size = phys_avail[i].phys_end -
354 				  phys_avail[i].phys_beg;
355 
356 		if (size > biggestsize) {
357 			biggestone = i;
358 			biggestsize = size;
359 		}
360 		total += size;
361 	}
362 	--i;	/* adjust to last entry for use down below */
363 
364 	end = phys_avail[biggestone].phys_end;
365 	end = trunc_page(end);
366 
367 	/*
368 	 * Initialize the queue headers for the free queue, the active queue
369 	 * and the inactive queue.
370 	 */
371 	vm_page_queue_init();
372 
373 #if !defined(_KERNEL_VIRTUAL)
374 	/*
375 	 * VKERNELs don't support minidumps and as such don't need
376 	 * vm_page_dump
377 	 *
378 	 * Allocate a bitmap to indicate that a random physical page
379 	 * needs to be included in a minidump.
380 	 *
381 	 * The amd64 port needs this to indicate which direct map pages
382 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
383 	 *
384 	 * However, x86 still needs this workspace internally within the
385 	 * minidump code.  In theory, they are not needed on x86, but are
386 	 * included should the sf_buf code decide to use them.
387 	 */
388 	page_range = phys_avail[i].phys_end / PAGE_SIZE;
389 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
390 	end -= vm_page_dump_size;
391 	vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
392 					VM_PROT_READ | VM_PROT_WRITE);
393 	bzero((void *)vm_page_dump, vm_page_dump_size);
394 #endif
395 	/*
396 	 * Compute the number of pages of memory that will be available for
397 	 * use (taking into account the overhead of a page structure per
398 	 * page).
399 	 */
400 	first_page = phys_avail[0].phys_beg / PAGE_SIZE;
401 	page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
402 	npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
403 
404 #ifndef _KERNEL_VIRTUAL
405 	/*
406 	 * (only applies to real kernels)
407 	 *
408 	 * Reserve a large amount of low memory for potential 32-bit DMA
409 	 * space allocations.  Once device initialization is complete we
410 	 * release most of it, but keep (vm_dma_reserved) memory reserved
411 	 * for later use.  Typically for X / graphics.  Through trial and
412  * error we find that GPUs usually require ~60-100MB or so.
413 	 *
414 	 * By default, 128M is left in reserve on machines with 2G+ of ram.
415 	 */
416 	vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
417 	if (vm_low_phys_reserved > total / 4)
418 		vm_low_phys_reserved = total / 4;
419 	if (vm_dma_reserved == 0) {
420 		vm_dma_reserved = 128 * 1024 * 1024;	/* 128MB */
421 		if (vm_dma_reserved > total / 16)
422 			vm_dma_reserved = total / 16;
423 	}
424 #endif
425 	alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
426 		   ALIST_RECORDS_65536);
427 
428 	/*
429 	 * Initialize the mem entry structures now, and put them in the free
430 	 * queue.
431 	 */
432 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
433 		kprintf("initializing vm_page_array ");
434 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
435 	mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
436 	vm_page_array = (vm_page_t)mapped;
437 
438 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
439 	/*
440 	 * since pmap_map on amd64 returns stuff out of a direct-map region,
441 	 * we have to manually add these pages to the minidump tracking so
442 	 * that they can be dumped, including the vm_page_array.
443 	 */
444 	for (pa = new_end;
445 	     pa < phys_avail[biggestone].phys_end;
446 	     pa += PAGE_SIZE) {
447 		dump_add_page(pa);
448 	}
449 #endif
450 
451 	/*
452 	 * Clear all of the page structures, run basic initialization so
453 	 * PHYS_TO_VM_PAGE() operates properly even on pages not in the
454 	 * map.
455 	 */
456 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
457 	vm_page_array_size = page_range;
458 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
459 		kprintf("size = 0x%zx\n", vm_page_array_size);
460 
461 	m = &vm_page_array[0];
462 	pa = ptoa(first_page);
463 	for (i = 0; i < page_range; ++i) {
464 		spin_init(&m->spin, "vm_page");
465 		m->phys_addr = pa;
466 		pa += PAGE_SIZE;
467 		++m;
468 	}
469 
470 	/*
471 	 * Construct the free queue(s) in ascending order (by physical
472 	 * address) so that the first 16MB of physical memory is allocated
473 	 * last rather than first.  On large-memory machines, this avoids
474 	 * the exhaustion of low physical memory before isa_dma_init has run.
475 	 */
476 	vmstats.v_page_count = 0;
477 	vmstats.v_free_count = 0;
478 	for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
479 		pa = phys_avail[i].phys_beg;
480 		if (i == biggestone)
481 			last_pa = new_end;
482 		else
483 			last_pa = phys_avail[i].phys_end;
484 		while (pa < last_pa && npages-- > 0) {
485 			vm_add_new_page(pa, &badcount);
486 			pa += PAGE_SIZE;
487 		}
488 	}
489 	if (virtual2_start)
490 		virtual2_start = vaddr;
491 	else
492 		virtual_start = vaddr;
493 	mycpu->gd_vmstats = vmstats;
494 }
495 
496 /*
497  * (called from early boot only)
498  *
499  * Reorganize VM pages based on numa data.  May be called as many times as
500  * necessary.  Will reorganize the vm_page_t page color and related queue(s)
501  * to allow vm_page_alloc() to choose pages based on socket affinity.
502  *
503  * NOTE: This function is only called while we are still in UP mode, so
504  *	 we only need a critical section to protect the queues (which
505  *	 saves a lot of time, there are likely a ton of pages).
506  */
507 void
508 vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
509 {
510 	vm_paddr_t scan_beg;
511 	vm_paddr_t scan_end;
512 	vm_paddr_t ran_end;
513 	struct vpgqueues *vpq;
514 	vm_page_t m;
515 	vm_page_t mend;
516 	int socket_mod;
517 	int socket_value;
518 	int i;
519 
520 	/*
521 	 * Check if no physical information, or there was only one socket
522 	 * (so don't waste time doing nothing!).
523 	 */
524 	if (cpu_topology_phys_ids <= 1 ||
525 	    cpu_topology_core_ids == 0) {
526 		return;
527 	}
528 
529 	/*
530 	 * Setup for our iteration.  Note that ACPI may iterate CPU
531 	 * sockets starting at 0 or 1 or some other number.  The
532 	 * cpu_topology code mod's it against the socket count.
533 	 */
534 	ran_end = ran_beg + bytes;
535 
536 	socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
537 	socket_value = (physid % cpu_topology_phys_ids) * socket_mod;
538 	mend = &vm_page_array[vm_page_array_size];
539 
540 	crit_enter();
541 
542 	/*
543 	 * Adjust cpu_topology's phys_mem parameter
544 	 */
545 	if (root_cpu_node)
546 		vm_numa_add_topology_mem(root_cpu_node, physid, (long)bytes);
547 
548 	/*
549 	 * Adjust vm_page->pc and requeue all affected pages.  The
550 	 * allocator will then be able to localize memory allocations
551 	 * to some degree.
552 	 */
553 	for (i = 0; phys_avail[i].phys_end; ++i) {
554 		scan_beg = phys_avail[i].phys_beg;
555 		scan_end = phys_avail[i].phys_end;
556 		if (scan_end <= ran_beg)
557 			continue;
558 		if (scan_beg >= ran_end)
559 			continue;
560 		if (scan_beg < ran_beg)
561 			scan_beg = ran_beg;
562 		if (scan_end > ran_end)
563 			scan_end = ran_end;
564 		if (atop(scan_end) > first_page + vm_page_array_size)
565 			scan_end = ptoa(first_page + vm_page_array_size);
566 
567 		m = PHYS_TO_VM_PAGE(scan_beg);
568 		while (scan_beg < scan_end) {
569 			KKASSERT(m < mend);
570 			if (m->queue != PQ_NONE) {
571 				vpq = &vm_page_queues[m->queue];
572 				TAILQ_REMOVE(&vpq->pl, m, pageq);
573 				--vpq->lcnt;
574 				/* queue doesn't change, no need to adj cnt */
575 				m->queue -= m->pc;
576 				m->pc %= socket_mod;
577 				m->pc += socket_value;
578 				m->pc &= PQ_L2_MASK;
579 				m->queue += m->pc;
580 				vpq = &vm_page_queues[m->queue];
581 				TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
582 				++vpq->lcnt;
583 				/* queue doesn't change, no need to adj cnt */
584 			} else {
585 				m->pc %= socket_mod;
586 				m->pc += socket_value;
587 				m->pc &= PQ_L2_MASK;
588 			}
589 			scan_beg += PAGE_SIZE;
590 			++m;
591 		}
592 	}
593 
594 	crit_exit();
595 }
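
/*
 * Worked example of the remapping above (illustrative only; assumes
 * PQ_L2_SIZE = 1024 and cpu_topology_phys_ids = 2):
 *
 *	socket_mod = 1024 / 2 = 512
 *	socket 0:  socket_value = 0,    pc %= 512              -> [  0, 511]
 *	socket 1:  socket_value = 512,  pc %= 512, pc += 512    -> [512,1023]
 *
 * e.g. a page with pc = 300 whose physical range belongs to socket 1 is
 * remapped to 300 % 512 + 512 = 812.  vm_get_pg_color() gives socket-1
 * cpus colors in the same upper half, so vm_page_alloc() on those cpus
 * finds the page in a local queue.
 */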
596 
597 /*
598  * (called from early boot only)
599  *
600  * Don't allow the NUMA organization to leave vm_page_queues[] nodes
601  * completely empty for a logical cpu.  Doing so would force allocations
602  * on that cpu to always borrow from a nearby cpu, create unnecessary
603  * contention, and cause vm_page_alloc() to iterate more queues and run more
604  * slowly.
605  *
606  * This situation can occur when memory sticks are not entirely populated,
607  * populated at different densities, or in naturally asymmetric systems
608  * such as the 2990WX.  There could very well be many vm_page_queues[]
609  * entries with *NO* pages assigned to them.
610  *
611  * Fixing this up ensures that each logical CPU has roughly the same
612  * sized memory pool, and more importantly ensures that logical CPUs
613  * do not wind up with an empty memory pool.
614  *
615  * At the moment we just iterate the other queues and borrow pages,
616  * moving them into the queues for cpus with severe deficits even though
617  * the memory might not be local to those cpus.  I am not doing this in
618  * a 'smart' way; it's effectively UMA style (sorta, since it's page-by-page
619  * whereas real UMA typically exchanges address bits 8-10 with high address
620  * bits).  But it works extremely well and gives us fairly good deterministic
621  * results on the cpu cores associated with these secondary nodes.
622  */
623 void
624 vm_numa_organize_finalize(void)
625 {
626 	struct vpgqueues *vpq;
627 	vm_page_t m;
628 	long lcnt_lo;
629 	long lcnt_hi;
630 	int iter;
631 	int i;
632 	int scale_lim;
633 
634 	crit_enter();
635 
636 	/*
637 	 * Machines might not use an exact power of 2 for phys_ids,
638 	 * core_ids, ht_ids, etc.  This can slightly reduce the actual
639 	 * range of indices in vm_page_queues[] that are nominally used.
640 	 */
641 	if (cpu_topology_ht_ids) {
642 		scale_lim = PQ_L2_SIZE / cpu_topology_phys_ids;
643 		scale_lim = scale_lim / cpu_topology_core_ids;
644 		scale_lim = scale_lim / cpu_topology_ht_ids;
645 		scale_lim = scale_lim * cpu_topology_ht_ids;
646 		scale_lim = scale_lim * cpu_topology_core_ids;
647 		scale_lim = scale_lim * cpu_topology_phys_ids;
648 	} else {
649 		scale_lim = PQ_L2_SIZE;
650 	}
651 
652 	/*
653 	 * Calculate an average, set hysteresis for balancing from
654 	 * 10% below the average to the average.
655 	 */
656 	lcnt_hi = 0;
657 	for (i = 0; i < scale_lim; ++i) {
658 		lcnt_hi += vm_page_queues[i].lcnt;
659 	}
660 	lcnt_hi /= scale_lim;
661 	lcnt_lo = lcnt_hi - lcnt_hi / 10;
662 
663 	kprintf("vm_page: avg %ld pages per queue, %d queues\n",
664 		lcnt_hi, scale_lim);
665 
666 	iter = 0;
667 	for (i = 0; i < scale_lim; ++i) {
668 		vpq = &vm_page_queues[PQ_FREE + i];
669 		while (vpq->lcnt < lcnt_lo) {
670 			struct vpgqueues *vptmp;
671 
672 			iter = (iter + 1) & PQ_L2_MASK;
673 			vptmp = &vm_page_queues[PQ_FREE + iter];
674 			if (vptmp->lcnt < lcnt_hi)
675 				continue;
676 			m = TAILQ_FIRST(&vptmp->pl);
677 			KKASSERT(m->queue == PQ_FREE + iter);
678 			TAILQ_REMOVE(&vptmp->pl, m, pageq);
679 			--vptmp->lcnt;
680 			/* queue doesn't change, no need to adj cnt */
681 			m->queue -= m->pc;
682 			m->pc = i;
683 			m->queue += m->pc;
684 			TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
685 			++vpq->lcnt;
686 		}
687 	}
688 	crit_exit();
689 }
690 
691 static
692 void
693 vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes)
694 {
695 	int cpuid;
696 	int i;
697 
698 	switch(cpup->type) {
699 	case PACKAGE_LEVEL:
700 		cpup->phys_mem += bytes;
701 		break;
702 	case CHIP_LEVEL:
703 		/*
704 		 * All members should have the same chipid, so we only need
705 		 * to pull out one member.
706 		 */
707 		if (CPUMASK_TESTNZERO(cpup->members)) {
708 			cpuid = BSFCPUMASK(cpup->members);
709 			if (physid ==
710 			    get_chip_ID_from_APICID(CPUID_TO_APICID(cpuid))) {
711 				cpup->phys_mem += bytes;
712 			}
713 		}
714 		break;
715 	case CORE_LEVEL:
716 	case THREAD_LEVEL:
717 		/*
718 		 * Just inherit from the parent node
719 		 */
720 		cpup->phys_mem = cpup->parent_node->phys_mem;
721 		break;
722 	}
723 	for (i = 0; i < MAXCPU && cpup->child_node[i]; ++i)
724 		vm_numa_add_topology_mem(cpup->child_node[i], physid, bytes);
725 }
726 
727 /*
728  * We tended to reserve a ton of memory for contigmalloc().  Now that most
729  * drivers have initialized we want to return most of the remaining free
730  * reserve back to the VM page queues so they can be used for normal
731  * allocations.
732  *
733  * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
734  */
735 static void
736 vm_page_startup_finish(void *dummy __unused)
737 {
738 	alist_blk_t blk;
739 	alist_blk_t rblk;
740 	alist_blk_t count;
741 	alist_blk_t xcount;
742 	alist_blk_t bfree;
743 	vm_page_t m;
744 	struct vm_page_hash_elm *mp;
745 	int mask;
746 
747 	/*
748 	 * Set the set_assoc_mask based on the fitted number of CPUs.
749 	 * This is a mask, so we subtract 1.
750 	 *
751 	 * w/PQ_L2_SIZE = 1024, Don't let the associativity drop below 8.
752 	 * So if we have 256 CPUs, two hyper-threads will wind up sharing.
753 	 *
754 	 * The maximum is PQ_L2_SIZE.  However, we limit the starting
755 	 * maximum to 16 (mask = 15) in order to improve the cache locality
756 	 * of related kernel data structures.
757 	 */
758 	mask = PQ_L2_SIZE / ncpus_fit - 1;
759 	if (mask < 7)		/* minimum is 8-way w/256 CPU threads */
760 		mask = 7;
761 	if (mask < 15)
762 		mask = 15;
763 	cpu_ccfence();
764 	set_assoc_mask = mask;
765 
766 	/*
767 	 * Return part of the initial reserve back to the system
768 	 */
769 	spin_lock(&vm_contig_spin);
770 	for (;;) {
771 		bfree = alist_free_info(&vm_contig_alist, &blk, &count);
772 		if (bfree <= vm_dma_reserved / PAGE_SIZE)
773 			break;
774 		if (count == 0)
775 			break;
776 
777 		/*
778 		 * Figure out how much of the initial reserve we have to
779 		 * free in order to reach our target.
780 		 */
781 		bfree -= vm_dma_reserved / PAGE_SIZE;
782 		if (count > bfree) {
783 			blk += count - bfree;
784 			count = bfree;
785 		}
786 
787 		/*
788 		 * Calculate the nearest power of 2 <= count.
789 		 */
790 		for (xcount = 1; xcount <= count; xcount <<= 1)
791 			;
792 		xcount >>= 1;
793 		blk += count - xcount;
794 		count = xcount;
795 
796 		/*
797 		 * Allocate the pages from the alist, then free them to
798 		 * the normal VM page queues.
799 		 *
800 		 * Pages allocated from the alist are wired.  We have to
801 		 * busy, unwire, and free them.  We must also adjust
802 		 * vm_low_phys_reserved before freeing any pages to prevent
803 		 * confusion.
804 		 */
805 		rblk = alist_alloc(&vm_contig_alist, blk, count);
806 		if (rblk != blk) {
807 			kprintf("vm_page_startup_finish: Unable to return "
808 				"dma space @0x%08x/%d -> 0x%08x\n",
809 				blk, count, rblk);
810 			break;
811 		}
812 		atomic_add_long(&vmstats.v_dma_pages, -(long)count);
813 		spin_unlock(&vm_contig_spin);
814 
815 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
816 		vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
817 		while (count) {
818 			vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);
819 			vm_page_busy_wait(m, FALSE, "cpgfr");
820 			vm_page_unwire(m, 0);
821 			vm_page_free(m);
822 			--count;
823 			++m;
824 		}
825 		spin_lock(&vm_contig_spin);
826 	}
827 	spin_unlock(&vm_contig_spin);
828 
829 	/*
830 	 * Print out how much DMA space drivers have already allocated and
831 	 * how much is left over.
832 	 */
833 	kprintf("DMA space used: %jdk, remaining available: %jdk\n",
834 		(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
835 		(PAGE_SIZE / 1024),
836 		(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
837 
838 	/*
839 	 * Power of 2
840 	 */
841 	vm_page_hash_size = 4096;
842 	while (vm_page_hash_size < (vm_page_array_size / 16))
843 		vm_page_hash_size <<= 1;
844 	if (vm_page_hash_size > VM_PAGE_HASH_MAX)
845 		vm_page_hash_size = VM_PAGE_HASH_MAX;
846 
847 	/*
848 	 * hash table for vm_page_lookup_quick()
849 	 */
850 	mp = (void *)kmem_alloc3(&kernel_map,
851 				 vm_page_hash_size * sizeof(*vm_page_hash),
852 				 VM_SUBSYS_VMPGHASH, KM_CPU(0));
853 	bzero(mp, vm_page_hash_size * sizeof(*mp));
854 	cpu_sfence();
855 	vm_page_hash = mp;
856 }
857 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
858 	vm_page_startup_finish, NULL);
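
/*
 * Sizing example for the vm_page_hash[] allocation above (illustrative
 * only; assumes 16GB of ram and a 16-byte struct vm_page_hash_elm):
 *
 *	vm_page_array_size ~= 16GB / 4KB   = 4194304 pages
 *	target             = 4194304 / 16  = 262144
 *	vm_page_hash_size  = 4096 doubled up to 262144 entries
 *	memory use         = 262144 * 16   = 4MB
 *
 * With VM_PAGE_HASH_SET = 4 this yields 65536 4-way sets for the
 * lockless heuristic used by vm_page_hash_get().
 */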
859 
860 
861 /*
862  * Scan comparison function for Red-Black tree scans.  An inclusive
863  * (start,end) is expected.  Other fields are not used.
864  */
865 int
866 rb_vm_page_scancmp(struct vm_page *p, void *data)
867 {
868 	struct rb_vm_page_scan_info *info = data;
869 
870 	if (p->pindex < info->start_pindex)
871 		return(-1);
872 	if (p->pindex > info->end_pindex)
873 		return(1);
874 	return(0);
875 }
876 
877 int
878 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
879 {
880 	if (p1->pindex < p2->pindex)
881 		return(-1);
882 	if (p1->pindex > p2->pindex)
883 		return(1);
884 	return(0);
885 }
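
#if 0
/*
 * Usage sketch for the scan comparison function above (illustrative
 * only, not compiled).  The function names and surrounding code are
 * hypothetical; the RB_SCAN pattern mirrors the way the VM object code
 * walks an inclusive pindex range of an object's rb_memq tree.
 */
static int
example_page_callback(vm_page_t p, void *data)
{
	/* operate on p; a non-zero return value aborts the scan */
	return (0);
}

static void
example_scan_range(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
	struct rb_vm_page_scan_info info;

	info.start_pindex = start;
	info.end_pindex = end;
	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
				example_page_callback, &info);
}
#endif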
886 
887 void
888 vm_page_init(vm_page_t m)
889 {
890 	/* do nothing for now.  Called from pmap_page_init() */
891 }
892 
893 /*
894  * Each page queue has its own spin lock, which is fairly optimal for
895  * allocating and freeing pages at least.
896  *
897  * The caller must hold the vm_page_spin_lock() before locking a vm_page's
898  * queue spinlock via this function.  Also note that m->queue cannot change
899  * unless both the page and queue are locked.
900  */
901 static __inline
902 void
903 _vm_page_queue_spin_lock(vm_page_t m)
904 {
905 	u_short queue;
906 
907 	queue = m->queue;
908 	if (queue != PQ_NONE) {
909 		spin_lock(&vm_page_queues[queue].spin);
910 		KKASSERT(queue == m->queue);
911 	}
912 }
913 
914 static __inline
915 void
916 _vm_page_queue_spin_unlock(vm_page_t m)
917 {
918 	u_short queue;
919 
920 	queue = m->queue;
921 	cpu_ccfence();
922 	if (queue != PQ_NONE)
923 		spin_unlock(&vm_page_queues[queue].spin);
924 }
925 
926 static __inline
927 void
928 _vm_page_queues_spin_lock(u_short queue)
929 {
930 	cpu_ccfence();
931 	if (queue != PQ_NONE)
932 		spin_lock(&vm_page_queues[queue].spin);
933 }
934 
935 
936 static __inline
937 void
938 _vm_page_queues_spin_unlock(u_short queue)
939 {
940 	cpu_ccfence();
941 	if (queue != PQ_NONE)
942 		spin_unlock(&vm_page_queues[queue].spin);
943 }
944 
945 void
946 vm_page_queue_spin_lock(vm_page_t m)
947 {
948 	_vm_page_queue_spin_lock(m);
949 }
950 
951 void
952 vm_page_queues_spin_lock(u_short queue)
953 {
954 	_vm_page_queues_spin_lock(queue);
955 }
956 
957 void
958 vm_page_queue_spin_unlock(vm_page_t m)
959 {
960 	_vm_page_queue_spin_unlock(m);
961 }
962 
963 void
964 vm_page_queues_spin_unlock(u_short queue)
965 {
966 	_vm_page_queues_spin_unlock(queue);
967 }
968 
969 /*
970  * This locks the specified vm_page and its queue in the proper order
971  * (page first, then queue).  The queue may change so the caller must
972  * recheck on return.
973  */
974 static __inline
975 void
976 _vm_page_and_queue_spin_lock(vm_page_t m)
977 {
978 	vm_page_spin_lock(m);
979 	_vm_page_queue_spin_lock(m);
980 }
981 
982 static __inline
983 void
984 _vm_page_and_queue_spin_unlock(vm_page_t m)
985 {
986 	_vm_page_queues_spin_unlock(m->queue);
987 	vm_page_spin_unlock(m);
988 }
989 
990 void
991 vm_page_and_queue_spin_unlock(vm_page_t m)
992 {
993 	_vm_page_and_queue_spin_unlock(m);
994 }
995 
996 void
997 vm_page_and_queue_spin_lock(vm_page_t m)
998 {
999 	_vm_page_and_queue_spin_lock(m);
1000 }
1001 
1002 /*
1003  * Helper function removes vm_page from its current queue.
1004  * Returns the base queue the page used to be on.
1005  *
1006  * The vm_page and the queue must be spinlocked.
1007  * This function will unlock the queue but leave the page spinlocked.
1008  */
1009 static __inline u_short
1010 _vm_page_rem_queue_spinlocked(vm_page_t m)
1011 {
1012 	struct vpgqueues *pq;
1013 	u_short queue;
1014 	u_short oqueue;
1015 	long *cnt_adj;
1016 	long *cnt_gd;
1017 
1018 	queue = m->queue;
1019 	if (queue != PQ_NONE) {
1020 		pq = &vm_page_queues[queue];
1021 		TAILQ_REMOVE(&pq->pl, m, pageq);
1022 
1023 		/*
1024 		 * Primarily adjust our pcpu stats for rollup, which is
1025 		 * (mycpu->gd_vmstats_adj + offset).  This is normally
1026 		 * synchronized on every hardclock().
1027 		 *
1028 		 * However, in order for the nominal low-memory algorithms
1029 		 * to work properly, if the unsynchronized adjustment gets
1030 		 * negative enough that it might trigger the pageout daemon,
1031 		 * we immediately synchronize with the global structure.
1032 		 *
1033 		 * The idea here is to reduce unnecessary SMP cache mastership
1034 		 * changes in the global vmstats, which can be particularly
1035 		 * bad in multi-socket systems.
1036 		 *
1037 		 * WARNING! In systems with low amounts of memory the
1038 		 *	    vm_paging_needed(-1024 * ncpus) test could
1039 		 *	    wind up testing a value above the paging target,
1040 		 *	    meaning it would almost always return TRUE.  In
1041 		 *	    that situation we synchronize every time the
1042 		 *	    cumulative adjustment falls below -1024.
1043 		 */
1044 		cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1045 				   pq->cnt_offset);
1046 		cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1047 				   pq->cnt_offset);
1048 		atomic_add_long(cnt_adj, -1);
1049 		atomic_add_long(cnt_gd, -1);
1050 
1051 		if (*cnt_adj < -1024 && vm_paging_needed(-1024 * ncpus)) {
1052 			u_long copy = atomic_swap_long(cnt_adj, 0);
1053 			cnt_adj = (long *)((char *)&vmstats + pq->cnt_offset);
1054 			atomic_add_long(cnt_adj, copy);
1055 		}
1056 		pq->lcnt--;
1057 		m->queue = PQ_NONE;
1058 		oqueue = queue;
1059 		queue -= m->pc;
1060 		vm_page_queues_spin_unlock(oqueue);	/* intended */
1061 	}
1062 	return queue;
1063 }
1064 
1065 /*
1066  * Helper function places the vm_page on the specified queue.  Generally
1067  * speaking only PQ_FREE pages are placed at the head, to allow them to
1068  * be allocated sooner rather than later on the assumption that they
1069  * are cache-hot.
1070  *
1071  * The vm_page must be spinlocked.
1072  * The vm_page must NOT be FICTITIOUS (that would be a disaster)
1073  * This function will return with both the page and the queue locked.
1074  */
1075 static __inline void
1076 _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
1077 {
1078 	struct vpgqueues *pq;
1079 	u_long *cnt_adj;
1080 	u_long *cnt_gd;
1081 
1082 	KKASSERT(m->queue == PQ_NONE &&
1083 		 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0);
1084 
1085 	if (queue != PQ_NONE) {
1086 		vm_page_queues_spin_lock(queue);
1087 		pq = &vm_page_queues[queue];
1088 		++pq->lcnt;
1089 
1090 		/*
1091 		 * Adjust our pcpu stats.  If a system entity really needs
1092 		 * to incorporate the count it will call vmstats_rollup()
1093 		 * to roll it all up into the global vmstats structure.
1094 		 */
1095 		cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1096 				   pq->cnt_offset);
1097 		cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1098 				   pq->cnt_offset);
1099 		atomic_add_long(cnt_adj, 1);
1100 		atomic_add_long(cnt_gd, 1);
1101 
1102 		/*
1103 		 * PQ_FREE is always handled LIFO style to try to provide
1104 		 * cache-hot pages to programs.
1105 		 */
1106 		m->queue = queue;
1107 		if (queue - m->pc == PQ_FREE) {
1108 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1109 		} else if (athead) {
1110 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1111 		} else {
1112 			TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
1113 		}
1114 		/* leave the queue spinlocked */
1115 	}
1116 }
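
#if 0
/*
 * Usage sketch (illustrative only, not compiled; the function name is
 * hypothetical).  This is the canonical requeue sequence used by e.g.
 * vm_page_unhold(): the page spinlock is acquired first, then the queue
 * spinlock.  _vm_page_rem_queue_spinlocked() drops the old queue's
 * spinlock and _vm_page_add_queue_spinlocked() returns with the new
 * queue's spinlock held.
 */
static void
example_requeue_to_free(vm_page_t m)
{
	vm_page_spin_lock(m);
	_vm_page_queue_spin_lock(m);
	_vm_page_rem_queue_spinlocked(m);
	_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
	_vm_page_queue_spin_unlock(m);
	vm_page_spin_unlock(m);
}
#endif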
1117 
1118 /*
1119  * Wait until page is no longer BUSY.  If also_m_busy is TRUE we wait
1120  * until the page is no longer BUSY or SBUSY (busy_count field is 0).
1121  *
1122  * Returns TRUE if it had to sleep, FALSE if we did not.  Only one sleep
1123  * call will be made before returning.
1124  *
1125  * This function does NOT busy the page and on return the page is not
1126  * guaranteed to be available.
1127  */
1128 void
1129 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
1130 {
1131 	u_int32_t busy_count;
1132 
1133 	for (;;) {
1134 		busy_count = m->busy_count;
1135 		cpu_ccfence();
1136 
1137 		if ((busy_count & PBUSY_LOCKED) == 0 &&
1138 		    (also_m_busy == 0 || (busy_count & PBUSY_MASK) == 0)) {
1139 			break;
1140 		}
1141 		tsleep_interlock(m, 0);
1142 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1143 				      busy_count | PBUSY_WANTED)) {
1144 			atomic_set_int(&m->flags, PG_REFERENCED);
1145 			tsleep(m, PINTERLOCKED, msg, 0);
1146 			break;
1147 		}
1148 	}
1149 }
1150 
1151 /*
1152  * This calculates and returns a page color given an optional VM object and
1153  * either a pindex or an iterator.  We attempt to return a cpu-localized
1154  * pg_color that is still roughly 16-way set-associative.  The CPU topology
1155  * is used if it was probed.
1156  *
1157  * The caller may use the returned value to index into e.g. PQ_FREE when
1158  * allocating a page in order to nominally obtain pages that are hopefully
1159  * already localized to the requesting cpu.  This function is not able to
1160  * provide any sort of guarantee of this, but does its best to improve
1161  * hardware cache management performance.
1162  *
1163  * WARNING! The caller must mask the returned value with PQ_L2_MASK.
1164  */
1165 u_short
1166 vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
1167 {
1168 	u_short pg_color;
1169 	int object_pg_color;
1170 
1171 	/*
1172 	 * WARNING! cpu_topology_core_ids might not be a power of two.
1173 	 *	    We also shouldn't make assumptions about
1174 	 *	    cpu_topology_phys_ids either.
1175 	 *
1176 	 * WARNING! ncpus might not be known at this time (during early
1177 	 *	    boot), and might be set to 1.
1178 	 *
1179 	 * General format: [phys_id][core_id][cpuid][set-associativity]
1180 	 * (but uses modulo, so not necessarily precise bit masks)
1181 	 */
1182 	object_pg_color = object ? object->pg_color : 0;
1183 
1184 	if (cpu_topology_ht_ids) {
1185 		int phys_id;
1186 		int core_id;
1187 		int ht_id;
1188 		int physcale;
1189 		int grpscale;
1190 		int cpuscale;
1191 
1192 		/*
1193 		 * Translate cpuid to socket, core, and hyperthread id.
1194 		 */
1195 		phys_id = get_cpu_phys_id(cpuid);
1196 		core_id = get_cpu_core_id(cpuid);
1197 		ht_id = get_cpu_ht_id(cpuid);
1198 
1199 		/*
1200 		 * Calculate pg_color for our array index.
1201 		 *
1202 		 * physcale - socket multiplier.
1203 		 * grpscale - core multiplier (cores per socket)
1204 		 * cpu*	    - cpus per core
1205 		 *
1206 		 * WARNING! In early boot, ncpus has not yet been
1207 		 *	    initialized and may be set to (1).
1208 		 *
1209 		 * WARNING! physcale must match the organization that
1210 		 *	    vm_numa_organize() creates to ensure that
1211 		 *	    we properly localize allocations to the
1212 		 *	    requested cpuid.
1213 		 */
1214 		physcale = PQ_L2_SIZE / cpu_topology_phys_ids;
1215 		grpscale = physcale / cpu_topology_core_ids;
1216 		cpuscale = grpscale / cpu_topology_ht_ids;
1217 
1218 		pg_color = phys_id * physcale;
1219 		pg_color += core_id * grpscale;
1220 		pg_color += ht_id * cpuscale;
1221 		pg_color += (pindex + object_pg_color) % cpuscale;
1222 
1223 #if 0
1224 		if (grpsize >= 8) {
1225 			pg_color += (pindex + object_pg_color) % grpsize;
1226 		} else {
1227 			if (grpsize <= 2) {
1228 				grpsize = 8;
1229 			} else {
1230 				/* 3->9, 4->8, 5->10, 6->12, 7->14 */
1231 				grpsize += grpsize;
1232 				if (grpsize < 8)
1233 					grpsize += grpsize;
1234 			}
1235 			pg_color += (pindex + object_pg_color) % grpsize;
1236 		}
1237 #endif
1238 	} else {
1239 		/*
1240 		 * Unknown topology, distribute things evenly.
1241 		 *
1242 		 * WARNING! In early boot, ncpus has not yet been
1243 		 *	    initialized and may be set to (1).
1244 		 */
1245 		int cpuscale;
1246 
1247 		cpuscale = PQ_L2_SIZE / ncpus;
1248 
1249 		pg_color = cpuid * cpuscale;
1250 		pg_color += (pindex + object_pg_color) % cpuscale;
1251 	}
1252 	return (pg_color & PQ_L2_MASK);
1253 }
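
#if 0
/*
 * Usage sketch (illustrative only, not compiled; the function name is
 * hypothetical).  A typical allocation path masks the returned color
 * with PQ_L2_MASK, as the WARNING above requires, and uses it to index
 * a per-color queue such as PQ_FREE.
 */
static struct vpgqueues *
example_pick_free_queue(vm_object_t object, vm_pindex_t pindex)
{
	u_short pg_color;

	pg_color = vm_get_pg_color(mycpuid, object, pindex) & PQ_L2_MASK;
	return (&vm_page_queues[PQ_FREE + pg_color]);
}
#endif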
1254 
1255 /*
1256  * Wait until BUSY can be set, then set it.  If also_m_busy is TRUE we
1257  * also wait for m->busy_count to become 0 before setting PBUSY_LOCKED.
1258  */
1259 void
1260 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
1261 				     int also_m_busy, const char *msg
1262 				     VM_PAGE_DEBUG_ARGS)
1263 {
1264 	u_int32_t busy_count;
1265 
1266 	for (;;) {
1267 		busy_count = m->busy_count;
1268 		cpu_ccfence();
1269 		if (busy_count & PBUSY_LOCKED) {
1270 			tsleep_interlock(m, 0);
1271 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1272 					  busy_count | PBUSY_WANTED)) {
1273 				atomic_set_int(&m->flags, PG_REFERENCED);
1274 				tsleep(m, PINTERLOCKED, msg, 0);
1275 			}
1276 		} else if (also_m_busy && busy_count) {
1277 			tsleep_interlock(m, 0);
1278 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1279 					  busy_count | PBUSY_WANTED)) {
1280 				atomic_set_int(&m->flags, PG_REFERENCED);
1281 				tsleep(m, PINTERLOCKED, msg, 0);
1282 			}
1283 		} else {
1284 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1285 					      busy_count | PBUSY_LOCKED)) {
1286 #ifdef VM_PAGE_DEBUG
1287 				m->busy_func = func;
1288 				m->busy_line = lineno;
1289 #endif
1290 				break;
1291 			}
1292 		}
1293 	}
1294 }
1295 
1296 /*
1297  * Attempt to set BUSY.  If also_m_busy is TRUE we only succeed if
1298  * m->busy_count is also 0.
1299  *
1300  * Returns non-zero on failure.
1301  */
1302 int
1303 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
1304 				    VM_PAGE_DEBUG_ARGS)
1305 {
1306 	u_int32_t busy_count;
1307 
1308 	for (;;) {
1309 		busy_count = m->busy_count;
1310 		cpu_ccfence();
1311 		if (busy_count & PBUSY_LOCKED)
1312 			return TRUE;
1313 		if (also_m_busy && (busy_count & PBUSY_MASK) != 0)
1314 			return TRUE;
1315 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1316 				      busy_count | PBUSY_LOCKED)) {
1317 #ifdef VM_PAGE_DEBUG
1318 			m->busy_func = func;
1319 			m->busy_line = lineno;
1320 #endif
1321 			return FALSE;
1322 		}
1323 	}
1324 }
1325 
1326 /*
1327  * Clear the BUSY flag and return non-zero to indicate to the caller
1328  * that a wakeup() should be performed.
1329  *
1330  * (inline version)
1331  */
1332 static __inline
1333 int
1334 _vm_page_wakeup(vm_page_t m)
1335 {
1336 	u_int32_t busy_count;
1337 
1338 	busy_count = m->busy_count;
1339 	cpu_ccfence();
1340 	for (;;) {
1341 		if (atomic_fcmpset_int(&m->busy_count, &busy_count,
1342 				      busy_count &
1343 				      ~(PBUSY_LOCKED | PBUSY_WANTED))) {
1344 			return((int)(busy_count & PBUSY_WANTED));
1345 		}
1346 	}
1347 	/* not reached */
1348 }
1349 
1350 /*
1351  * Clear the BUSY flag and wakeup anyone waiting for the page.  This
1352  * is typically the last call you make on a page before moving onto
1353  * other things.
1354  */
1355 void
1356 vm_page_wakeup(vm_page_t m)
1357 {
1358 	KASSERT(m->busy_count & PBUSY_LOCKED,
1359 		("vm_page_wakeup: page not busy!!!"));
1360 	if (_vm_page_wakeup(m))
1361 		wakeup(m);
1362 }
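
#if 0
/*
 * Usage sketch (illustrative only, not compiled; the function name is
 * hypothetical).  A caller that wins the hard-busy via
 * vm_page_busy_try() (or vm_page_busy_wait()) must pair it with
 * vm_page_wakeup() so that any PBUSY_WANTED waiters are woken up.
 */
static void
example_busy_bracket(vm_page_t m)
{
	if (vm_page_busy_try(m, TRUE) == FALSE) {
		/* page is exclusively busied, safe to manipulate */
		vm_page_wakeup(m);
	} else {
		/* could not busy the page, back off or retry */
	}
}
#endif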
1363 
1364 /*
1365  * Hold a page, preventing reuse.  This is typically only called on pages
1366  * in a known state (either held busy, special, or interlocked in some
1367  * manner).  Holding a page does not ensure that it remains valid, it only
1368  * prevents reuse.  The page must not already be on the FREE queue or in
1369  * any danger of being moved to the FREE queue concurrent with this call.
1370  *
1371  * Other parts of the system can still disassociate the page from its object
1372  * and attempt to free it, or perform read or write I/O on it and/or otherwise
1373  * manipulate the page, but if the page is held the VM system will leave the
1374  * page and its data intact and not cycle it through the FREE queue until
1375  * the last hold has been released.
1376  *
1377  * (see vm_page_wire() if you want to prevent the page from being
1378  *  disassociated from its object too).
1379  */
1380 void
1381 vm_page_hold(vm_page_t m)
1382 {
1383 	atomic_add_int(&m->hold_count, 1);
1384 	KKASSERT(m->queue - m->pc != PQ_FREE);
1385 }
1386 
1387 /*
1388  * The opposite of vm_page_hold().  If the page is on the HOLD queue
1389  * it was freed while held and must be moved back to the FREE queue.
1390  *
1391  * To avoid racing against vm_page_free*() we must re-test conditions
1392  * after obtaining the spin-lock.  The initial test can also race a
1393  * vm_page_free*() that is in the middle of moving a page to PQ_HOLD,
1394  * leaving the page on PQ_HOLD with hold_count == 0.  Rather than
1395  * throw a spin-lock in the critical path, we rely on the pageout
1396  * daemon to clean-up these loose ends.
1397  *
1398  * More critically, the 'easy movement' between queues without busying
1399  * a vm_page is only allowed for PQ_FREE<->PQ_HOLD.
1400  */
1401 void
1402 vm_page_unhold(vm_page_t m)
1403 {
1404 	KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
1405 		("vm_page_unhold: pg %p illegal hold_count (%d) or "
1406 		 "on FREE queue (%d)",
1407 		 m, m->hold_count, m->queue - m->pc));
1408 
1409 	if (atomic_fetchadd_int(&m->hold_count, -1) == 1 &&
1410 	    m->queue - m->pc == PQ_HOLD) {
1411 		vm_page_spin_lock(m);
1412 		if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
1413 			_vm_page_queue_spin_lock(m);
1414 			_vm_page_rem_queue_spinlocked(m);
1415 			_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
1416 			_vm_page_queue_spin_unlock(m);
1417 		}
1418 		vm_page_spin_unlock(m);
1419 	}
1420 }
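
#if 0
/*
 * Usage sketch (illustrative only, not compiled; the function name is
 * hypothetical).  A hold keeps the page from being freed or reused
 * across a blocking operation without preventing other manipulation
 * the way a hard-busy would.
 */
static void
example_hold_bracket(vm_page_t m)
{
	vm_page_hold(m);
	/* ... possibly-blocking access to the page ... */
	vm_page_unhold(m);
}
#endif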
1421 
1422 /*
1423  * Create a fictitious page with the specified physical address and
1424  * memory attribute.  The memory attribute is the only machine-
1425  * dependent aspect of a fictitious page that must be initialized.
1426  */
1427 void
1428 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1429 {
1430 	/*
1431 	 * The page's memattr might have changed since the
1432 	 * previous initialization.  Update the pmap to the
1433 	 * new memattr.
1434 	 */
1435 	if ((m->flags & PG_FICTITIOUS) != 0)
1436 		goto memattr;
1437 	m->phys_addr = paddr;
1438 	m->queue = PQ_NONE;
1439 	/* Fictitious pages don't use "segind". */
1440 	/* Fictitious pages don't use "order" or "pool". */
1441 	m->flags = PG_FICTITIOUS | PG_UNQUEUED;
1442 	m->busy_count = PBUSY_LOCKED;
1443 	m->wire_count = 1;
1444 	spin_init(&m->spin, "fake_page");
1445 	pmap_page_init(m);
1446 memattr:
1447 	pmap_page_set_memattr(m, memattr);
1448 }
1449 
1450 /*
1451  * Inserts the given vm_page into the object and object list.
1452  *
1453  * The pagetables are not updated but will presumably fault the page
1454  * in if necessary, or if a kernel page the caller will at some point
1455  * enter the page into the kernel's pmap.  We are not allowed to block
1456  * here so we *can't* do this anyway.
1457  *
1458  * This routine may not block.
1459  * This routine must be called with the vm_object held.
1460  * This routine must be called with a critical section held.
1461  *
1462  * This routine returns TRUE if the page was inserted into the object
1463  * successfully, and FALSE if the page already exists in the object.
1464  */
1465 int
1466 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1467 {
1468 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
1469 	if (m->object != NULL)
1470 		panic("vm_page_insert: already inserted");
1471 
1472 	atomic_add_int(&object->generation, 1);
1473 
1474 	/*
1475 	 * Associate the VM page with an (object, offset).
1476 	 *
1477 	 * The vm_page spin lock is required for interactions with the pmap.
1478 	 * XXX vm_page_spin_lock() might not be needed for this any more.
1479 	 */
1480 	vm_page_spin_lock(m);
1481 	m->object = object;
1482 	m->pindex = pindex;
1483 	if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
1484 		m->object = NULL;
1485 		m->pindex = 0;
1486 		vm_page_spin_unlock(m);
1487 		return FALSE;
1488 	}
1489 	++object->resident_page_count;
1490 	++mycpu->gd_vmtotal.t_rm;
1491 	vm_page_spin_unlock(m);
1492 
1493 	/*
1494 	 * Since we are inserting a new and possibly dirty page,
1495 	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
1496 	 */
1497 	if ((m->valid & m->dirty) ||
1498 	    (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
1499 		vm_object_set_writeable_dirty(object);
1500 
1501 	/*
1502 	 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
1503 	 */
1504 	swap_pager_page_inserted(m);
1505 	return TRUE;
1506 }
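
#if 0
/*
 * Usage sketch (illustrative only, not compiled; the function name is
 * hypothetical).  The object token must be held exclusively and the
 * page must already be busied.  A FALSE return means another page
 * already resides at that pindex and the caller must dispose of the
 * new page itself.
 */
static int
example_insert_page(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
{
	int rv;

	vm_object_hold(object);
	rv = vm_page_insert(m, object, pindex);
	vm_object_drop(object);
	return (rv);
}
#endif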
1507 
1508 /*
1509  * Removes the given vm_page_t from the (object,index) table
1510  *
1511  * The page must be BUSY and will remain BUSY on return.
1512  * No other requirements.
1513  *
1514  * NOTE: FreeBSD side effect was to unbusy the page on return.  We leave
1515  *	 it busy.
1516  *
1517  * NOTE: Caller is responsible for any pmap disposition prior to the
1518  *	 rename (as the pmap code will not be able to find the entries
1519  *	 once the object has been disassociated).  The caller may choose
1520  *	 to leave the pmap association intact if this routine is being
1521  *	 called as part of a rename between shadowed objects.
1522  *
1523  * This routine may not block.
1524  */
1525 void
1526 vm_page_remove(vm_page_t m)
1527 {
1528 	vm_object_t object;
1529 
1530 	if (m->object == NULL) {
1531 		return;
1532 	}
1533 
1534 	if ((m->busy_count & PBUSY_LOCKED) == 0)
1535 		panic("vm_page_remove: page not busy");
1536 
1537 	object = m->object;
1538 
1539 	vm_object_hold(object);
1540 
1541 	/*
1542 	 * Remove the page from the object and update the object.
1543 	 *
1544 	 * The vm_page spin lock is required for interactions with the pmap.
1545 	 * XXX vm_page_spin_lock() might not be needed for this any more.
1546 	 */
1547 	vm_page_spin_lock(m);
1548 	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
1549 	--object->resident_page_count;
1550 	--mycpu->gd_vmtotal.t_rm;
1551 	m->object = NULL;
1552 	atomic_add_int(&object->generation, 1);
1553 	vm_page_spin_unlock(m);
1554 
1555 	vm_object_drop(object);
1556 }
1557 
1558 /*
1559  * Calculate the hash position for the vm_page hash heuristic.
1560  *
1561  * Mask by ~3 to offer 4-way set-assoc
1562  */
1563 static __inline
1564 struct vm_page_hash_elm *
1565 vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
1566 {
1567 	size_t hi;
1568 
1569 	/* mix it up */
1570 	hi = (intptr_t)object ^ object->pg_color ^ pindex;
1571 	hi += object->pg_color * pindex;
1572 	hi = hi ^ (hi >> 20);
1573 	hi &= vm_page_hash_size - 1;		/* bounds */
1574 	hi &= ~(VM_PAGE_HASH_SET - 1);		/* set-assoc */
1575 	return (&vm_page_hash[hi]);
1576 }
1577 
1578 /*
1579  * Heuristical page lookup that does not require any locks.  Returns
1580  * a soft-busied page on success, NULL on failure.
1581  *
1582  * Caller must lookup the page the slow way if NULL is returned.
1583  */
1584 vm_page_t
1585 vm_page_hash_get(vm_object_t object, vm_pindex_t pindex)
1586 {
1587 	struct vm_page_hash_elm *mp;
1588 	vm_page_t m;
1589 	int i;
1590 
1591 	if (vm_page_hash == NULL)
1592 		return NULL;
1593 	mp = vm_page_hash_hash(object, pindex);
1594 	for (i = 0; i < VM_PAGE_HASH_SET; ++i) {
1595 		m = mp[i].m;
1596 		cpu_ccfence();
1597 		if (m == NULL)
1598 			continue;
1599 		if (m->object != object || m->pindex != pindex)
1600 			continue;
1601 		if (vm_page_sbusy_try(m))
1602 			continue;
1603 		if (m->object == object && m->pindex == pindex) {
1604 			mp[i].ticks = ticks;
1605 			return m;
1606 		}
1607 		vm_page_sbusy_drop(m);
1608 	}
1609 	return NULL;
1610 }
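
#if 0
/*
 * Usage sketch (illustrative only, not compiled; the function name is
 * hypothetical).  A read-only consumer tries the lockless heuristic
 * first and falls back to the locked lookup on a miss.  The fast path
 * returns a soft-busied page which must be released with
 * vm_page_sbusy_drop() when the caller is done with it.
 */
static void
example_fast_read(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_page_hash_get(object, pindex);
	if (m) {
		/* read-only access to the soft-busied page's data */
		vm_page_sbusy_drop(m);
	} else {
		/* miss: fall back to the locked lookup */
		vm_object_hold(object);
		m = vm_page_lookup(object, pindex);
		/* ... busy / validate / use m as required ... */
		vm_object_drop(object);
	}
}
#endif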
1611 
1612 /*
1613  * Enter page onto vm_page_hash[].  This is a heuristic, SMP collisions
1614  * are allowed.
1615  */
1616 static __inline
1617 void
1618 vm_page_hash_enter(vm_page_t m)
1619 {
1620 	struct vm_page_hash_elm *mp;
1621 	struct vm_page_hash_elm *best;
1622 	int i;
1623 
1624 	/*
1625 	 * Only enter type-stable vm_pages with well-shared objects.
1626 	 */
1627 	if (vm_page_hash == NULL ||
1628 	    m < &vm_page_array[0] ||
1629 	    m >= &vm_page_array[vm_page_array_size])
1630 		return;
1631 	if (m->object == NULL)
1632 		return;
1633 #if 0
1634 	/*
1635 	 * Disabled at the moment, there are some degenerate conditions
1636 	 * with often-exec'd programs that get ignored.  In particular,
1637 	 * the kernel's elf loader does a vn_rdwr() on the first page of
1638 	 * a binary.
1639 	 */
1640 	if (m->object->ref_count <= 2 || (m->object->flags & OBJ_ONEMAPPING))
1641 		return;
1642 #endif
1643 	if (vm_page_hash_vnode_only && m->object->type != OBJT_VNODE)
1644 		return;
1645 
1646 	/*
1647 	 * Find best entry
1648 	 */
1649 	mp = vm_page_hash_hash(m->object, m->pindex);
1650 	best = mp;
1651 	for (i = 0; i < VM_PAGE_HASH_SET; ++i) {
1652 		if (mp[i].m == m) {
1653 			mp[i].ticks = ticks;
1654 			return;
1655 		}
1656 
1657 		/*
1658 		 * The best choice is the oldest entry.
1659 		 *
1660 		 * Also check for a field overflow, using -1 instead of 0
1661 		 * to deal with SMP races on accessing the 'ticks' global.
1662 		 */
1663 		if ((ticks - best->ticks) < (ticks - mp[i].ticks) ||
1664 		    (int)(ticks - mp[i].ticks) < -1) {
1665 			best = &mp[i];
1666 		}
1667 	}
1668 	best->m = m;
1669 	best->ticks = ticks;
1670 }
1671 
1672 /*
1673  * Locate and return the page at (object, pindex), or NULL if the
1674  * page could not be found.
1675  *
1676  * The caller must hold the vm_object token.
1677  */
1678 vm_page_t
1679 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1680 {
1681 	vm_page_t m;
1682 
1683 	/*
1684 	 * Search the hash table for this object/offset pair
1685 	 */
1686 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1687 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1688 	if (m) {
1689 		KKASSERT(m->object == object && m->pindex == pindex);
1690 		vm_page_hash_enter(m);
1691 	}
1692 	return(m);
1693 }
1694 
1695 vm_page_t
1696 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
1697 					    vm_pindex_t pindex,
1698 					    int also_m_busy, const char *msg
1699 					    VM_PAGE_DEBUG_ARGS)
1700 {
1701 	u_int32_t busy_count;
1702 	vm_page_t m;
1703 
1704 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1705 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1706 	while (m) {
1707 		KKASSERT(m->object == object && m->pindex == pindex);
1708 		busy_count = m->busy_count;
1709 		cpu_ccfence();
1710 		if (busy_count & PBUSY_LOCKED) {
1711 			tsleep_interlock(m, 0);
1712 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1713 					  busy_count | PBUSY_WANTED)) {
1714 				atomic_set_int(&m->flags, PG_REFERENCED);
1715 				tsleep(m, PINTERLOCKED, msg, 0);
1716 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1717 							      pindex);
1718 			}
1719 		} else if (also_m_busy && busy_count) {
1720 			tsleep_interlock(m, 0);
1721 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1722 					  busy_count | PBUSY_WANTED)) {
1723 				atomic_set_int(&m->flags, PG_REFERENCED);
1724 				tsleep(m, PINTERLOCKED, msg, 0);
1725 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1726 							      pindex);
1727 			}
1728 		} else if (atomic_cmpset_int(&m->busy_count, busy_count,
1729 					     busy_count | PBUSY_LOCKED)) {
1730 #ifdef VM_PAGE_DEBUG
1731 			m->busy_func = func;
1732 			m->busy_line = lineno;
1733 #endif
1734 			vm_page_hash_enter(m);
1735 			break;
1736 		}
1737 	}
1738 	return m;
1739 }
1740 
1741 /*
1742  * Attempt to lookup and busy a page.
1743  *
1744  * Returns NULL if the page could not be found
1745  *
1746  * Returns a vm_page and error == TRUE if the page exists but could not
1747  * be busied.
1748  *
1749  * Returns a vm_page and error == FALSE on success.
1750  */
1751 vm_page_t
1752 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
1753 					   vm_pindex_t pindex,
1754 					   int also_m_busy, int *errorp
1755 					   VM_PAGE_DEBUG_ARGS)
1756 {
1757 	u_int32_t busy_count;
1758 	vm_page_t m;
1759 
1760 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1761 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1762 	*errorp = FALSE;
1763 	while (m) {
1764 		KKASSERT(m->object == object && m->pindex == pindex);
1765 		busy_count = m->busy_count;
1766 		cpu_ccfence();
1767 		if (busy_count & PBUSY_LOCKED) {
1768 			*errorp = TRUE;
1769 			break;
1770 		}
1771 		if (also_m_busy && busy_count) {
1772 			*errorp = TRUE;
1773 			break;
1774 		}
1775 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1776 				      busy_count | PBUSY_LOCKED)) {
1777 #ifdef VM_PAGE_DEBUG
1778 			m->busy_func = func;
1779 			m->busy_line = lineno;
1780 #endif
1781 			vm_page_hash_enter(m);
1782 			break;
1783 		}
1784 	}
1785 	return m;
1786 }
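
/*
 * Example usage (illustrative sketch only, kept under #if 0 so it is not
 * compiled).  Typical vm_page_lookup_busy_try() loop: on error the page
 * exists but could not be busied, so sleep on it and retry.  This mirrors
 * the pattern used by vm_page_grab() later in this file.  The helper name
 * and wmesg are hypothetical; the object token is assumed to be held.
 */
#if 0
static vm_page_t
example_lookup_busy(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;
	int error;

	for (;;) {
		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
		if (error == 0)
			break;			/* NULL, or busied page */
		vm_page_sleep_busy(m, TRUE, "explkp");
	}
	return m;				/* busied by us, or NULL */
}
#endif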
1787 
1788 /*
1789  * Returns a page that is only soft-busied for use by the caller in
1790  * a read-only fashion.  Returns NULL if the page could not be found,
1791  * the soft busy could not be obtained, or the page data is invalid.
1792  *
1793  * XXX Doesn't handle PG_FICTITIOUS pages at the moment, but there is
1794  *     no reason why we couldn't.
1795  */
1796 vm_page_t
1797 vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex,
1798 			 int pgoff, int pgbytes)
1799 {
1800 	vm_page_t m;
1801 
1802 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1803 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1804 	if (m) {
1805 		if ((m->valid != VM_PAGE_BITS_ALL &&
1806 		     !vm_page_is_valid(m, pgoff, pgbytes)) ||
1807 		    (m->flags & PG_FICTITIOUS)) {
1808 			m = NULL;
1809 		} else if (vm_page_sbusy_try(m)) {
1810 			m = NULL;
1811 		} else if ((m->valid != VM_PAGE_BITS_ALL &&
1812 			    !vm_page_is_valid(m, pgoff, pgbytes)) ||
1813 			   (m->flags & PG_FICTITIOUS)) {
1814 			vm_page_sbusy_drop(m);
1815 			m = NULL;
1816 		} else {
1817 			vm_page_hash_enter(m);
1818 		}
1819 	}
1820 	return m;
1821 }
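
/*
 * Example usage (illustrative sketch only, kept under #if 0 so it is not
 * compiled).  Read-only access to fully valid page data via a soft busy.
 * The object token is assumed to be held; the helper name is hypothetical.
 */
#if 0
static int
example_read_only(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_page_lookup_sbusy_try(object, pindex, 0, PAGE_SIZE);
	if (m == NULL)
		return 0;		/* not found, invalid, or busy */
	/* ... read the page contents; do not modify or unmap ... */
	vm_page_sbusy_drop(m);
	return 1;
}
#endif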
1822 
1823 /*
1824  * Caller must hold the related vm_object
1825  */
1826 vm_page_t
1827 vm_page_next(vm_page_t m)
1828 {
1829 	vm_page_t next;
1830 
1831 	next = vm_page_rb_tree_RB_NEXT(m);
1832 	if (next && next->pindex != m->pindex + 1)
1833 		next = NULL;
1834 	return (next);
1835 }
1836 
1837 /*
1838  * vm_page_rename()
1839  *
1840  * Move the given vm_page from its current object to the specified
1841  * target object/offset.  The page must be busy and will remain so
1842  * on return.
1843  *
1844  * new_object must be held.
1845  * This routine might block. XXX ?
1846  *
1847  * NOTE: Swap associated with the page must be invalidated by the move.  We
1848  *       have to do this for several reasons:  (1) we aren't freeing the
1849  *       page, (2) we are dirtying the page, (3) the VM system is probably
1850  *       moving the page from object A to B, and will then later move
1851  *       the backing store from A to B and we can't have a conflict.
1852  *
1853  * NOTE: We *always* dirty the page.  It is necessary both for the
1854  *       fact that we moved it, and because we may be invalidating
1855  *	 swap.  If the page is on the cache, we have to deactivate it
1856  *	 or vm_page_dirty() will panic.  Dirty pages are not allowed
1857  *	 on the cache.
1858  *
1859  * NOTE: Caller is responsible for any pmap disposition prior to the
1860  *	 rename (as the pmap code will not be able to find the entries
1861  *	 once the object has been disassociated or changed).  Nominally
1862  *	 the caller is moving a page between shadowed objects and so the
1863  *	 pmap association is retained without having to remove the page
1864  *	 from it.
1865  */
1866 void
1867 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1868 {
1869 	KKASSERT(m->busy_count & PBUSY_LOCKED);
1870 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
1871 	if (m->object) {
1872 		ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
1873 		vm_page_remove(m);
1874 	}
1875 	if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
1876 		panic("vm_page_rename: target exists (%p,%"PRIu64")",
1877 		      new_object, new_pindex);
1878 	}
1879 	if (m->queue - m->pc == PQ_CACHE)
1880 		vm_page_deactivate(m);
1881 	vm_page_dirty(m);
1882 }
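
/*
 * Example usage (illustrative sketch only, kept under #if 0 so it is not
 * compiled).  Moving a busied page between two held objects per the notes
 * above: both object tokens held exclusively, any pmap disposition left to
 * the caller.  The helper name and wmesg are hypothetical and the exclusive
 * vm_object_hold()/vm_object_drop() pairing is assumed.
 */
#if 0
static void
example_rename(vm_page_t m, vm_object_t old_obj,
	       vm_object_t new_obj, vm_pindex_t new_pindex)
{
	vm_object_hold(old_obj);
	vm_object_hold(new_obj);
	vm_page_busy_wait(m, FALSE, "exrnam");
	vm_page_rename(m, new_obj, new_pindex);	/* page remains busied */
	vm_page_wakeup(m);
	vm_object_drop(new_obj);
	vm_object_drop(old_obj);
}
#endif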
1883 
1884 /*
1885  * vm_page_unqueue() without any wakeup.  This routine is used when a page
1886  * is to remain BUSYied by the caller.
1887  *
1888  * This routine may not block.
1889  */
1890 void
1891 vm_page_unqueue_nowakeup(vm_page_t m)
1892 {
1893 	vm_page_and_queue_spin_lock(m);
1894 	(void)_vm_page_rem_queue_spinlocked(m);
1895 	vm_page_spin_unlock(m);
1896 }
1897 
1898 /*
1899  * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedaemon
1900  * if necessary.
1901  *
1902  * This routine may not block.
1903  */
1904 void
1905 vm_page_unqueue(vm_page_t m)
1906 {
1907 	u_short queue;
1908 
1909 	vm_page_and_queue_spin_lock(m);
1910 	queue = _vm_page_rem_queue_spinlocked(m);
1911 	if (queue == PQ_FREE || queue == PQ_CACHE) {
1912 		vm_page_spin_unlock(m);
1913 		pagedaemon_wakeup();
1914 	} else {
1915 		vm_page_spin_unlock(m);
1916 	}
1917 }
1918 
1919 /*
1920  * vm_page_list_find()
1921  *
1922  * Find a page on the specified queue with color optimization.
1923  *
1924  * The page coloring optimization attempts to locate a page that does
1925  * not overload other nearby pages in the object in the cpu's L1 or L2
1926  * caches.  We need this optimization because cpu caches tend to be
1927  * physical caches, while object spaces tend to be virtual.
1928  *
1929  * The page coloring optimization also, very importantly, tries to localize
1930  * memory to cpus and physical sockets.
1931  *
1932  * Each PQ_FREE and PQ_CACHE color queue has its own spinlock and the
1933  * algorithm is adjusted to localize allocations on a per-core basis.
1934  * This is done by 'twisting' the colors.
1935  *
1936  * The page is returned spinlocked and removed from its queue (it will
1937  * be on PQ_NONE), or NULL. The page is not BUSY'd.  The caller
1938  * is responsible for dealing with the busy-page case (usually by
1939  * deactivating the page and looping).
1940  *
1941  * NOTE:  This routine is carefully inlined.  A non-inlined version
1942  *	  is available for outside callers but the only critical path is
1943  *	  from within this source file.
1944  *
1945  * NOTE:  This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
1946  *	  represent stable storage, allowing us to order our locks vm_page
1947  *	  first, then queue.
1948  */
1949 static __inline
1950 vm_page_t
1951 _vm_page_list_find(int basequeue, int index)
1952 {
1953 	struct vpgqueues *pq;
1954 	vm_page_t m;
1955 
1956 	index &= PQ_L2_MASK;
1957 	pq = &vm_page_queues[basequeue + index];
1958 
1959 	/*
1960 	 * Try this cpu's colored queue first.  Test for a page unlocked,
1961 	 * then lock the queue and locate a page.  Note that the lock order
1962 	 * is reversed, but we do not want to dawdle on the page spinlock
1963 	 * anyway as it is held significantly longer than the queue spinlock.
1964 	 */
1965 	if (TAILQ_FIRST(&pq->pl)) {
1966 		spin_lock(&pq->spin);
1967 		TAILQ_FOREACH(m, &pq->pl, pageq) {
1968 			if (spin_trylock(&m->spin) == 0)
1969 				continue;
1970 			KKASSERT(m->queue == basequeue + index);
1971 			pq->lastq = -1;
1972 			return(m);
1973 		}
1974 		spin_unlock(&pq->spin);
1975 	}
1976 
1977 	m = _vm_page_list_find_wide(basequeue, index, &pq->lastq);
1978 
1979 	return(m);
1980 }
1981 
1982 /*
1983  * If we could not find the page in the desired queue try to find it in
1984  * a nearby (NUMA-aware) queue, spreading out as we go.
1985  */
1986 static vm_page_t
1987 _vm_page_list_find_wide(int basequeue, int index, int *lastp)
1988 {
1989 	struct vpgqueues *pq;
1990 	vm_page_t m = NULL;
1991 	int pqmask = set_assoc_mask >> 1;
1992 	int pqi;
1993 	int range;
1994 	int skip_start;
1995 	int skip_next;
1996 	int count;
1997 
1998 	/*
1999 	 * Avoid re-searching empty queues over and over again; skip ahead
2000 	 * to *lastp if appropriate.
2001 	 */
2002 	if (*lastp >= 0)
2003 		index = *lastp;
2004 
2005 	index &= PQ_L2_MASK;
2006 	pq = &vm_page_queues[basequeue];
2007 	count = 0;
2008 	skip_start = -1;
2009 	skip_next = -1;
2010 
2011 	/*
2012 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2013 	 * else fails (PQ_L2_MASK).
2014 	 *
2015 	 * pqmask is a mask, 15, 31, 63, etc.
2016 	 *
2017 	 * Test each queue unlocked first, then lock the queue and locate
2018 	 * a page.  Note that the lock order is reversed, but we do not want
2019 	 * to dawdle on the page spinlock anyway as it is held significantly
2020 	 * longer than the queue spinlock.
2021 	 */
2022 	do {
2023 		pqmask = (pqmask << 1) | 1;
2024 
2025 		pqi = index;
2026 		range = pqmask + 1;
2027 
2028 		while (range > 0) {
2029 			if (pqi >= skip_start && pqi < skip_next) {
2030 				range -= skip_next - pqi;
2031 				pqi = (pqi & ~pqmask) | (skip_next & pqmask);
2032 			}
2033 			if (range > 0 && TAILQ_FIRST(&pq[pqi].pl)) {
2034 				spin_lock(&pq[pqi].spin);
2035 				TAILQ_FOREACH(m, &pq[pqi].pl, pageq) {
2036 					if (spin_trylock(&m->spin) == 0)
2037 						continue;
2038 					KKASSERT(m->queue == basequeue + pqi);
2039 
2040 					/*
2041 					 * If we had to wander too far, set
2042 					 * *lastp to skip past empty queues.
2043 					 */
2044 					if (count >= 8)
2045 						*lastp = pqi & PQ_L2_MASK;
2046 					return(m);
2047 				}
2048 				spin_unlock(&pq[pqi].spin);
2049 			}
2050 			--range;
2051 			++count;
2052 			pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
2053 		}
2054 		skip_start = pqi & ~pqmask;
2055 		skip_next = (pqi | pqmask) + 1;
2056 	} while (pqmask != PQ_L2_MASK);
2057 
2058 	return(m);
2059 }
2060 
2061 static __inline
2062 vm_page_t
2063 _vm_page_list_find2(int bq1, int bq2, int index)
2064 {
2065 	struct vpgqueues *pq1;
2066 	struct vpgqueues *pq2;
2067 	vm_page_t m;
2068 
2069 	index &= PQ_L2_MASK;
2070 	pq1 = &vm_page_queues[bq1 + index];
2071 	pq2 = &vm_page_queues[bq2 + index];
2072 
2073 	/*
2074 	 * Try this cpu's colored queue first.  Test for a page unlocked,
2075 	 * then lock the queue and locate a page.  Note that the lock order
2076 	 * is reversed, but we do not want to dawdle on the page spinlock
2077 	 * anyway as it is held significantly longer than the queue spinlock.
2078 	 */
2079 	if (TAILQ_FIRST(&pq1->pl)) {
2080 		spin_lock(&pq1->spin);
2081 		TAILQ_FOREACH(m, &pq1->pl, pageq) {
2082 			if (spin_trylock(&m->spin) == 0)
2083 				continue;
2084 			KKASSERT(m->queue == bq1 + index);
2085 			pq1->lastq = -1;
2086 			pq2->lastq = -1;
2087 			return(m);
2088 		}
2089 		spin_unlock(&pq1->spin);
2090 	}
2091 
2092 	m = _vm_page_list_find2_wide(bq1, bq2, index, &pq1->lastq, &pq2->lastq);
2093 
2094 	return(m);
2095 }
2096 
2097 
2098 /*
2099  * This version checks two queues at the same time, widening its search
2100  * as it progresses, preferring basequeue1 and starting on basequeue2
2101  * after exhausting the first set.  The idea is to try to stay localized
2102  * to the cpu.
2103  */
2104 static vm_page_t
2105 _vm_page_list_find2_wide(int basequeue1, int basequeue2, int index,
2106 			 int *lastp1, int *lastp2)
2107 {
2108 	struct vpgqueues *pq1;
2109 	struct vpgqueues *pq2;
2110 	vm_page_t m = NULL;
2111 	int pqmask1, pqmask2;
2112 	int pqi;
2113 	int range;
2114 	int skip_start1, skip_start2;
2115 	int skip_next1, skip_next2;
2116 	int count1, count2;
2117 
2118 	/*
2119 	 * Avoid re-searching empty queues over and over again; skip ahead
2120 	 * to *lastp1 if appropriate.
2121 	 */
2122 	if (*lastp1 >= 0)
2123 		index = *lastp1;
2124 
2125 	index &= PQ_L2_MASK;
2126 
2127 	pqmask1 = set_assoc_mask >> 1;
2128 	pq1 = &vm_page_queues[basequeue1];
2129 	count1 = 0;
2130 	skip_start1 = -1;
2131 	skip_next1 = -1;
2132 
2133 	pqmask2 = set_assoc_mask >> 1;
2134 	pq2 = &vm_page_queues[basequeue2];
2135 	count2 = 0;
2136 	skip_start2 = -1;
2137 	skip_next2 = -1;
2138 
2139 	/*
2140 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2141 	 * else fails (PQ_L2_MASK).
2142 	 *
2143 	 * pqmask is a mask, 15, 31, 63, etc.
2144 	 *
2145 	 * Test each queue unlocked first, then lock the queue and locate
2146 	 * a page.  Note that the lock order is reversed, but we do not want
2147 	 * to dawdle on the page spinlock anyway as it is held significantly
2148 	 * longer than the queue spinlock.
2149 	 */
2150 	do {
2151 		if (pqmask1 == PQ_L2_MASK)
2152 			goto skip2;
2153 
2154 		pqmask1 = (pqmask1 << 1) | 1;
2155 		pqi = index;
2156 		range = pqmask1 + 1;
2157 
2158 		while (range > 0) {
2159 			if (pqi >= skip_start1 && pqi < skip_next1) {
2160 				range -= skip_next1 - pqi;
2161 				pqi = (pqi & ~pqmask1) | (skip_next1 & pqmask1);
2162 			}
2163 			if (range > 0 && TAILQ_FIRST(&pq1[pqi].pl)) {
2164 				spin_lock(&pq1[pqi].spin);
2165 				TAILQ_FOREACH(m, &pq1[pqi].pl, pageq) {
2166 					if (spin_trylock(&m->spin) == 0)
2167 						continue;
2168 					KKASSERT(m->queue == basequeue1 + pqi);
2169 
2170 					/*
2171 					 * If we had to wander too far, set
2172 					 * *lastp to skip past empty queues.
2173 					 */
2174 					if (count1 >= 8)
2175 						*lastp1 = pqi & PQ_L2_MASK;
2176 					return(m);
2177 				}
2178 				spin_unlock(&pq1[pqi].spin);
2179 			}
2180 			--range;
2181 			++count1;
2182 			pqi = (pqi & ~pqmask1) | ((pqi + 1) & pqmask1);
2183 		}
2184 		skip_start1 = pqi & ~pqmask1;
2185 		skip_next1 = (pqi | pqmask1) + 1;
2186 skip2:
2187 		if (pqmask1 < ((set_assoc_mask << 1) | 1))
2188 			continue;
2189 
2190 		pqmask2 = (pqmask2 << 1) | 1;
2191 		pqi = index;
2192 		range = pqmask2 + 1;
2193 
2194 		while (range > 0) {
2195 			if (pqi >= skip_start2 && pqi < skip_next2) {
2196 				range -= skip_next2 - pqi;
2197 				pqi = (pqi & ~pqmask2) | (skip_next2 & pqmask2);
2198 			}
2199 			if (range > 0 && TAILQ_FIRST(&pq2[pqi].pl)) {
2200 				spin_lock(&pq2[pqi].spin);
2201 				TAILQ_FOREACH(m, &pq2[pqi].pl, pageq) {
2202 					if (spin_trylock(&m->spin) == 0)
2203 						continue;
2204 					KKASSERT(m->queue == basequeue2 + pqi);
2205 
2206 					/*
2207 					 * If we had to wander too far, set
2208 					 * *lastp to skip past empty queues.
2209 					 */
2210 					if (count2 >= 8)
2211 						*lastp2 = pqi & PQ_L2_MASK;
2212 					return(m);
2213 				}
2214 				spin_unlock(&pq2[pqi].spin);
2215 			}
2216 			--range;
2217 			++count2;
2218 			pqi = (pqi & ~pqmask2) | ((pqi + 1) & pqmask2);
2219 		}
2220 		skip_start2 = pqi & ~pqmask2;
2221 		skip_next2 = (pqi | pqmask2) + 1;
2222 	} while (pqmask1 != PQ_L2_MASK && pqmask2 != PQ_L2_MASK);
2223 
2224 	return(m);
2225 }
2226 
2227 /*
2228  * Returns a vm_page candidate for allocation.  The page is not busied so
2229  * it can move around.  The caller must busy the page (and typically
2230  * deactivate it if it cannot be busied!)
2231  *
2232  * Returns a spinlocked vm_page that has been removed from its queue.
2233  * (note that _vm_page_list_find() does not remove the page from its
2234  *  queue).
2235  */
2236 vm_page_t
2237 vm_page_list_find(int basequeue, int index)
2238 {
2239 	vm_page_t m;
2240 
2241 	m = _vm_page_list_find(basequeue, index);
2242 	if (m)
2243 		_vm_page_rem_queue_spinlocked(m);
2244 	return m;
2245 }
2246 
2247 /*
2248  * Find a page on the cache queue with color optimization, remove it
2249  * from the queue, and busy it.  The returned page will not be spinlocked.
2250  *
2251  * Candidates can fail, for example due to being busied by somebody else,
2252  * in which case they are deactivated and the scan continues.
2253  *
2254  * This routine may not block.
2255  *
2256  */
2257 static vm_page_t
2258 vm_page_select_cache(u_short pg_color)
2259 {
2260 	vm_page_t m;
2261 
2262 	for (;;) {
2263 		m = _vm_page_list_find(PQ_CACHE, pg_color);
2264 		if (m == NULL)
2265 			break;
2266 		/*
2267 		 * (m) has been spinlocked
2268 		 */
2269 		_vm_page_rem_queue_spinlocked(m);
2270 		if (vm_page_busy_try(m, TRUE)) {
2271 			_vm_page_deactivate_locked(m, 0);
2272 			vm_page_spin_unlock(m);
2273 		} else {
2274 			/*
2275 			 * We successfully busied the page
2276 			 */
2277 			if ((m->flags & PG_NEED_COMMIT) == 0 &&
2278 			    m->hold_count == 0 &&
2279 			    m->wire_count == 0 &&
2280 			    (m->dirty & m->valid) == 0) {
2281 				vm_page_spin_unlock(m);
2282 				KKASSERT((m->flags & PG_UNQUEUED) == 0);
2283 				pagedaemon_wakeup();
2284 				return(m);
2285 			}
2286 
2287 			/*
2288 			 * The page cannot be recycled, deactivate it.
2289 			 */
2290 			_vm_page_deactivate_locked(m, 0);
2291 			if (_vm_page_wakeup(m)) {
2292 				vm_page_spin_unlock(m);
2293 				wakeup(m);
2294 			} else {
2295 				vm_page_spin_unlock(m);
2296 			}
2297 		}
2298 	}
2299 	return (m);
2300 }
2301 
2302 /*
2303  * Find a free page.  We attempt to inline the nominal case and fall back
2304  * to _vm_page_select_free() otherwise.  A busied page is removed from
2305  * the queue and returned.
2306  *
2307  * This routine may not block.
2308  */
2309 static __inline vm_page_t
2310 vm_page_select_free(u_short pg_color)
2311 {
2312 	vm_page_t m;
2313 
2314 	for (;;) {
2315 		m = _vm_page_list_find(PQ_FREE, pg_color);
2316 		if (m == NULL)
2317 			break;
2318 		_vm_page_rem_queue_spinlocked(m);
2319 		if (vm_page_busy_try(m, TRUE)) {
2320 			/*
2321 			 * Various mechanisms such as a pmap_collect can
2322 			 * result in a busy page on the free queue.  We
2323 			 * have to move the page out of the way so we can
2324 			 * retry the allocation.  If the other thread is not
2325 			 * allocating the page then m->valid will remain 0 and
2326 			 * the pageout daemon will free the page later on.
2327 			 *
2328 			 * Since we could not busy the page, however, we
2329 			 * cannot make assumptions as to whether the page
2330 			 * will be allocated by the other thread or not,
2331 			 * so all we can do is deactivate it to move it out
2332 			 * of the way.  In particular, if the other thread
2333 			 * wires the page it may wind up on the inactive
2334 			 * queue and the pageout daemon will have to deal
2335 			 * with that case too.
2336 			 */
2337 			_vm_page_deactivate_locked(m, 0);
2338 			vm_page_spin_unlock(m);
2339 		} else {
2340 			/*
2341 			 * Theoretically if we are able to busy the page
2342 			 * atomic with the queue removal (using the vm_page
2343 			 * lock) nobody else should have been able to mess
2344 			 * with the page before us.
2345 			 *
2346 			 * Assert the page state.  Note that even though
2347 			 * wiring doesn't adjust queues, a page on the free
2348 			 * queue should never be wired at this point.
2349 			 */
2350 			KKASSERT((m->flags & (PG_UNQUEUED |
2351 					      PG_NEED_COMMIT)) == 0);
2352 			KASSERT(m->hold_count == 0,
2353 				("m->hold_count is not zero "
2354 				 "pg %p q=%d flags=%08x hold=%d wire=%d",
2355 				 m, m->queue, m->flags,
2356 				 m->hold_count, m->wire_count));
2357 			KKASSERT(m->wire_count == 0);
2358 			vm_page_spin_unlock(m);
2359 			pagedaemon_wakeup();
2360 
2361 			/* return busied and removed page */
2362 			return(m);
2363 		}
2364 	}
2365 	return(m);
2366 }
2367 
2368 static __inline vm_page_t
2369 vm_page_select_free_or_cache(u_short pg_color, int *fromcachep)
2370 {
2371 	vm_page_t m;
2372 
2373 	*fromcachep = 0;
2374 	for (;;) {
2375 		m = _vm_page_list_find2(PQ_FREE, PQ_CACHE, pg_color);
2376 		if (m == NULL)
2377 			break;
2378 		if (vm_page_busy_try(m, TRUE)) {
2379 			_vm_page_rem_queue_spinlocked(m);
2380 			_vm_page_deactivate_locked(m, 0);
2381 			vm_page_spin_unlock(m);
2382 		} else if (m->queue - m->pc == PQ_FREE) {
2383 			/*
2384 			 * We successfully busied the page, PQ_FREE case
2385 			 */
2386 			_vm_page_rem_queue_spinlocked(m);
2387 			KKASSERT((m->flags & (PG_UNQUEUED |
2388 					      PG_NEED_COMMIT)) == 0);
2389 			KASSERT(m->hold_count == 0,
2390 				("m->hold_count is not zero "
2391 				 "pg %p q=%d flags=%08x hold=%d wire=%d",
2392 				 m, m->queue, m->flags,
2393 				 m->hold_count, m->wire_count));
2394 			KKASSERT(m->wire_count == 0);
2395 			vm_page_spin_unlock(m);
2396 			pagedaemon_wakeup();
2397 
2398 			/* return busied and removed page */
2399 			return(m);
2400 		} else {
2401 			/*
2402 			 * We successfully busied the page, PQ_CACHE case
2403 			 */
2404 			_vm_page_rem_queue_spinlocked(m);
2405 			if ((m->flags & PG_NEED_COMMIT) == 0 &&
2406 			    m->hold_count == 0 &&
2407 			    m->wire_count == 0 &&
2408 			    (m->dirty & m->valid) == 0) {
2409 				vm_page_spin_unlock(m);
2410 				KKASSERT((m->flags & PG_UNQUEUED) == 0);
2411 				pagedaemon_wakeup();
2412 				*fromcachep = 1;
2413 				return(m);
2414 			}
2415 
2416 			/*
2417 			 * The page cannot be recycled, deactivate it.
2418 			 */
2419 			_vm_page_deactivate_locked(m, 0);
2420 			if (_vm_page_wakeup(m)) {
2421 				vm_page_spin_unlock(m);
2422 				wakeup(m);
2423 			} else {
2424 				vm_page_spin_unlock(m);
2425 			}
2426 		}
2427 	}
2428 	return(m);
2429 }
2430 
2431 /*
2432  * vm_page_alloc()
2433  *
2434  * Allocate and return a memory cell associated with this VM object/offset
2435  * pair.  If object is NULL an unassociated page will be allocated.
2436  *
2437  * The returned page will be busied and removed from its queues.  This
2438  * routine can block and may return NULL if a race occurs and the page
2439  * is found to already exist at the specified (object, pindex).
2440  *
2441  *	VM_ALLOC_NORMAL		allow use of cache pages, nominal free drain
2442  *	VM_ALLOC_QUICK		like normal but cannot use cache
2443  *	VM_ALLOC_SYSTEM		greater free drain
2444  *	VM_ALLOC_INTERRUPT	allow free list to be completely drained
2445  *	VM_ALLOC_ZERO		advisory request for pre-zero'd page only
2446  *	VM_ALLOC_FORCE_ZERO	advisory request for pre-zero'd page only
2447  *	VM_ALLOC_NULL_OK	ok to return NULL on insertion collision
2448  *				(see vm_page_grab())
2449  *	VM_ALLOC_USE_GD		ok to use per-gd cache
2450  *
2451  *	VM_ALLOC_CPU(n)		allocate using specified cpu localization
2452  *
2453  * The object must be held if not NULL
2454  * This routine may not block
2455  *
2456  * Additional special handling is required when called from an interrupt
2457  * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
2458  * in this case.
2459  */
2460 vm_page_t
2461 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
2462 {
2463 	globaldata_t gd;
2464 	vm_object_t obj;
2465 	vm_page_t m;
2466 	u_short pg_color;
2467 	int cpuid_local;
2468 	int fromcache;
2469 
2470 #if 0
2471 	/*
2472 	 * Special per-cpu free VM page cache.  The pages are pre-busied
2473 	 * and pre-zeroed for us.
2474 	 */
2475 	if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
2476 		crit_enter_gd(gd);
2477 		if (gd->gd_vmpg_count) {
2478 			m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
2479 			crit_exit_gd(gd);
2480 			goto done;
2481                 }
2482 		crit_exit_gd(gd);
2483         }
2484 #endif
2485 	m = NULL;
2486 
2487 	/*
2488 	 * CPU LOCALIZATION
2489 	 *
2490 	 * CPU localization algorithm.  Break the page queues up by physical
2491 	 * id and core id (note that two cpu threads will have the same core
2492 	 * id, and core_id != gd_cpuid).
2493 	 *
2494 	 * This is nowhere near perfect, for example the last pindex in a
2495 	 * subgroup will overflow into the next cpu or package.  But this
2496 	 * should get us good page reuse locality in heavy mixed loads.
2497 	 *
2498 	 * (may be executed before the APs are started, so other GDs might
2499 	 *  not exist!)
2500 	 */
2501 	if (page_req & VM_ALLOC_CPU_SPEC)
2502 		cpuid_local = VM_ALLOC_GETCPU(page_req);
2503 	else
2504 		cpuid_local = mycpu->gd_cpuid;
2505 
2506 	pg_color = vm_get_pg_color(cpuid_local, object, pindex);
2507 
2508 	KKASSERT(page_req &
2509 		(VM_ALLOC_NORMAL|VM_ALLOC_QUICK|
2510 		 VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
2511 
2512 	/*
2513 	 * Certain system threads (pageout daemon, buf_daemon's) are
2514 	 * allowed to eat deeper into the free page list.
2515 	 */
2516 	if (curthread->td_flags & TDF_SYSTHREAD)
2517 		page_req |= VM_ALLOC_SYSTEM;
2518 
2519 	/*
2520 	 * Impose various limitations.  Note that the v_free_reserved test
2521 	 * must match the opposite of vm_page_count_target() to avoid
2522 	 * livelocks, be careful.
2523 	 */
2524 loop:
2525 	gd = mycpu;
2526 	if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
2527 	    ((page_req & VM_ALLOC_INTERRUPT) &&
2528 	     gd->gd_vmstats.v_free_count > 0) ||
2529 	    ((page_req & VM_ALLOC_SYSTEM) &&
2530 	     gd->gd_vmstats.v_cache_count == 0 &&
2531 	     gd->gd_vmstats.v_free_count >
2532 	     gd->gd_vmstats.v_interrupt_free_min)
2533 	) {
2534 		/*
2535 		 * The free queue has sufficient free pages to take one out.
2536 		 *
2537 		 * However, if the free queue is strained the scan may widen
2538 		 * to the entire queue and cause a great deal of SMP
2539 		 * contention, so we use a double-queue-scan if we can
2540 		 * to avoid this.
2541 		 */
2542 		if (page_req & VM_ALLOC_NORMAL) {
2543 			m = vm_page_select_free_or_cache(pg_color, &fromcache);
2544 			if (m && fromcache)
2545 				goto found_cache;
2546 		} else {
2547 			m = vm_page_select_free(pg_color);
2548 		}
2549 	} else if (page_req & VM_ALLOC_NORMAL) {
2550 		/*
2551 		 * Allocatable from the cache (non-interrupt only).  On
2552 		 * success, we must free the page and try again, thus
2553 		 * ensuring that vmstats.v_*_free_min counters are replenished.
2554 		 */
2555 #ifdef INVARIANTS
2556 		if (curthread->td_preempted) {
2557 			kprintf("vm_page_alloc(): warning, attempt to allocate"
2558 				" cache page from preempting interrupt\n");
2559 			m = NULL;
2560 		} else {
2561 			m = vm_page_select_cache(pg_color);
2562 		}
2563 #else
2564 		m = vm_page_select_cache(pg_color);
2565 #endif
2566 		/*
2567 		 * On success move the page into the free queue and loop.
2568 		 *
2569 		 * Only do this if we can safely acquire the vm_object lock,
2570 		 * because this is effectively a random page and the caller
2571 		 * might be holding the lock shared; we don't want to
2572 		 * deadlock.
2573 		 */
2574 		if (m != NULL) {
2575 found_cache:
2576 			KASSERT(m->dirty == 0,
2577 				("Found dirty cache page %p", m));
2578 			if ((obj = m->object) != NULL) {
2579 				if (vm_object_hold_try(obj)) {
2580 					vm_page_protect(m, VM_PROT_NONE);
2581 					vm_page_free(m);
2582 					/* m->object NULL here */
2583 					vm_object_drop(obj);
2584 				} else {
2585 					vm_page_deactivate(m);
2586 					vm_page_wakeup(m);
2587 				}
2588 			} else {
2589 				vm_page_protect(m, VM_PROT_NONE);
2590 				vm_page_free(m);
2591 			}
2592 			goto loop;
2593 		}
2594 
2595 		/*
2596 		 * On failure return NULL
2597 		 */
2598 		atomic_add_int(&vm_pageout_deficit, 1);
2599 		pagedaemon_wakeup();
2600 		return (NULL);
2601 	} else {
2602 		/*
2603 		 * No pages available, wakeup the pageout daemon and give up.
2604 		 */
2605 		atomic_add_int(&vm_pageout_deficit, 1);
2606 		pagedaemon_wakeup();
2607 		return (NULL);
2608 	}
2609 
2610 	/*
2611 	 * v_free_count can race so loop if we don't find the expected
2612 	 * page.
2613 	 */
2614 	if (m == NULL) {
2615 		vmstats_rollup();
2616 		goto loop;
2617 	}
2618 
2619 	/*
2620 	 * Good page found.  The page has already been busied for us and
2621 	 * removed from its queues.
2622 	 */
2623 	KASSERT(m->dirty == 0,
2624 		("vm_page_alloc: free/cache page %p was dirty", m));
2625 	KKASSERT(m->queue == PQ_NONE);
2626 
2627 #if 0
2628 done:
2629 #endif
2630 	/*
2631 	 * Initialize the structure, inheriting some flags but clearing
2632 	 * all the rest.  The page has already been busied for us.
2633 	 */
2634 	vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
2635 
2636 	KKASSERT(m->wire_count == 0);
2637 	KKASSERT((m->busy_count & PBUSY_MASK) == 0);
2638 	m->act_count = 0;
2639 	m->valid = 0;
2640 
2641 	/*
2642 	 * Caller must be holding the object lock (asserted by
2643 	 * vm_page_insert()).
2644 	 *
2645 	 * NOTE: Inserting a page here does not insert it into any pmaps
2646 	 *	 (which could cause us to block allocating memory).
2647 	 *
2648 	 * NOTE: If no object is specified an unassociated page is allocated
2649 	 *	 and m->pindex can be used by the caller for any purpose.
2650 	 */
2651 	if (object) {
2652 		if (vm_page_insert(m, object, pindex) == FALSE) {
2653 			vm_page_free(m);
2654 			if ((page_req & VM_ALLOC_NULL_OK) == 0)
2655 				panic("PAGE RACE %p[%ld]/%p",
2656 				      object, (long)pindex, m);
2657 			m = NULL;
2658 		}
2659 	} else {
2660 		m->pindex = pindex;
2661 	}
2662 
2663 	/*
2664 	 * Don't wakeup too often - wakeup the pageout daemon when
2665 	 * we would be nearly out of memory.
2666 	 */
2667 	pagedaemon_wakeup();
2668 
2669 	/*
2670 	 * A BUSY page is returned.
2671 	 */
2672 	return (m);
2673 }
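
/*
 * Example usage (illustrative sketch only, kept under #if 0 so it is not
 * compiled).  Allocates an unassociated page, riding out memory shortages
 * with vm_wait(), then marks it valid and wires it, mirroring the disabled
 * contig_rover path above.  The helper name is hypothetical.
 */
#if 0
static vm_page_t
example_alloc_unassociated(void)
{
	vm_page_t m;

	while ((m = vm_page_alloc(NULL, 0,
				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM)) == NULL) {
		vm_wait(0);		/* low memory, block and retry */
	}
	/* m is returned busied and removed from its queues */
	m->valid = VM_PAGE_BITS_ALL;
	vm_page_wire(m);
	vm_page_wakeup(m);
	return m;
}
#endif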
2674 
2675 /*
2676  * Returns the number of pages available in our DMA memory reserve
2677  * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf)
2678  */
2679 vm_size_t
2680 vm_contig_avail_pages(void)
2681 {
2682 	alist_blk_t blk;
2683 	alist_blk_t count;
2684 	alist_blk_t bfree;
2685 	spin_lock(&vm_contig_spin);
2686 	bfree = alist_free_info(&vm_contig_alist, &blk, &count);
2687 	spin_unlock(&vm_contig_spin);
2688 
2689 	return bfree;
2690 }
2691 
2692 /*
2693  * Attempt to allocate contiguous physical memory with the specified
2694  * requirements.
2695  */
2696 vm_page_t
2697 vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
2698 		     unsigned long alignment, unsigned long boundary,
2699 		     unsigned long size, vm_memattr_t memattr)
2700 {
2701 	alist_blk_t blk;
2702 	vm_page_t m;
2703 	vm_pindex_t i;
2704 #if 0
2705 	static vm_pindex_t contig_rover;
2706 #endif
2707 
2708 	alignment >>= PAGE_SHIFT;
2709 	if (alignment == 0)
2710 		alignment = 1;
2711 	boundary >>= PAGE_SHIFT;
2712 	if (boundary == 0)
2713 		boundary = 1;
2714 	size = (size + PAGE_MASK) >> PAGE_SHIFT;
2715 
2716 #if 0
2717 	/*
2718 	 * Disabled temporarily until we find a solution for DRM (a flag
2719 	 * to always use the free space reserve, for performance).
2720 	 */
2721 	if (high == BUS_SPACE_MAXADDR && alignment <= PAGE_SIZE &&
2722 	    boundary <= PAGE_SIZE && size == 1 &&
2723 	    memattr == VM_MEMATTR_DEFAULT) {
2724 		/*
2725 		 * Any page will work, use vm_page_alloc()
2726 		 * (e.g. when used from kmem_alloc_attr())
2727 		 */
2728 		m = vm_page_alloc(NULL, (contig_rover++) & 0x7FFFFFFF,
2729 				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
2730 				  VM_ALLOC_INTERRUPT);
2731 		m->valid = VM_PAGE_BITS_ALL;
2732 		vm_page_wire(m);
2733 		vm_page_wakeup(m);
2734 	} else
2735 #endif
2736 	{
2737 		/*
2738 		 * Use the low-memory dma reserve
2739 		 */
2740 		spin_lock(&vm_contig_spin);
2741 		blk = alist_alloc(&vm_contig_alist, 0, size);
2742 		if (blk == ALIST_BLOCK_NONE) {
2743 			spin_unlock(&vm_contig_spin);
2744 			if (bootverbose) {
2745 				kprintf("vm_page_alloc_contig: %ldk nospace\n",
2746 					(size << PAGE_SHIFT) / 1024);
2747 				print_backtrace(5);
2748 			}
2749 			return(NULL);
2750 		}
2751 		if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
2752 			alist_free(&vm_contig_alist, blk, size);
2753 			spin_unlock(&vm_contig_spin);
2754 			if (bootverbose) {
2755 				kprintf("vm_page_alloc_contig: %ldk high "
2756 					"%016jx failed\n",
2757 					(size << PAGE_SHIFT) / 1024,
2758 					(intmax_t)high);
2759 			}
2760 			return(NULL);
2761 		}
2762 		spin_unlock(&vm_contig_spin);
2763 
2764 		/*
2765 		 * Base vm_page_t of range
2766 		 */
2767 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
2768 	}
2769 	if (vm_contig_verbose) {
2770 		kprintf("vm_page_alloc_contig: %016jx/%ldk "
2771 			"(%016jx-%016jx al=%lu bo=%lu pgs=%lu attr=%d)\n",
2772 			(intmax_t)m->phys_addr,
2773 			(size << PAGE_SHIFT) / 1024,
2774 			low, high, alignment, boundary, size, memattr);
2775 	}
2776 	if (memattr != VM_MEMATTR_DEFAULT) {
2777 		for (i = 0; i < size; ++i) {
2778 			KKASSERT(m[i].flags & PG_FICTITIOUS);
2779 			pmap_page_set_memattr(&m[i], memattr);
2780 		}
2781 	}
2782 	return m;
2783 }
2784 
2785 /*
2786  * Free contiguously allocated pages.  The pages will be wired but not busy.
2787  * When freeing to the alist we leave them wired and not busy.
2788  */
2789 void
2790 vm_page_free_contig(vm_page_t m, unsigned long size)
2791 {
2792 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
2793 	vm_pindex_t start = pa >> PAGE_SHIFT;
2794 	vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
2795 
2796 	if (vm_contig_verbose) {
2797 		kprintf("vm_page_free_contig:  %016jx/%ldk\n",
2798 			(intmax_t)pa, size / 1024);
2799 	}
2800 	if (pa < vm_low_phys_reserved) {
2801 		/*
2802 		 * Just assert check the first page for convenience.
2803 		 */
2804 		KKASSERT(m->wire_count == 1);
2805 		KKASSERT(m->flags & PG_FICTITIOUS);
2806 		KKASSERT(pa + size <= vm_low_phys_reserved);
2807 		spin_lock(&vm_contig_spin);
2808 		alist_free(&vm_contig_alist, start, pages);
2809 		spin_unlock(&vm_contig_spin);
2810 	} else {
2811 		while (pages) {
2812 			/* XXX FUTURE, maybe (pair with vm_pg_contig_alloc()) */
2813 			/*vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);*/
2814 			vm_page_busy_wait(m, FALSE, "cpgfr");
2815 			vm_page_unwire(m, 0);
2816 			vm_page_free(m);
2817 			--pages;
2818 			++m;
2819 		}
2820 
2821 	}
2822 }
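
/*
 * Example usage (illustrative sketch only, kept under #if 0 so it is not
 * compiled).  Allocates a physically contiguous 64KB buffer and later
 * returns it.  The size and constraints are arbitrary for the sketch and
 * the helper name is hypothetical.
 */
#if 0
static void
example_contig(void)
{
	vm_page_t m;

	m = vm_page_alloc_contig(0, BUS_SPACE_MAXADDR,
				 PAGE_SIZE,		/* alignment (bytes) */
				 0,			/* no boundary */
				 64 * 1024,		/* size (bytes) */
				 VM_MEMATTR_DEFAULT);
	if (m == NULL)
		return;
	/* pages are wired, not busied; use VM_PAGE_TO_PHYS(m) for DMA */
	vm_page_free_contig(m, 64 * 1024);
}
#endif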
2823 
2824 
2825 /*
2826  * Wait for sufficient free memory for nominal heavy memory use kernel
2827  * operations.
2828  *
2829  * WARNING!  Be sure never to call this in any vm_pageout code path, which
2830  *	     will trivially deadlock the system.
2831  */
2832 void
2833 vm_wait_nominal(void)
2834 {
2835 	while (vm_page_count_min(0))
2836 		vm_wait(0);
2837 }
2838 
2839 /*
2840  * Test if vm_wait_nominal() would block.
2841  */
2842 int
2843 vm_test_nominal(void)
2844 {
2845 	if (vm_page_count_min(0))
2846 		return(1);
2847 	return(0);
2848 }
2849 
2850 /*
2851  * Block until free pages are available for allocation, called in various
2852  * places before memory allocations.
2853  *
2854  * The caller may loop if vm_page_count_min() == FALSE, so we cannot be
2855  * more generous than that.
2856  */
2857 void
2858 vm_wait(int timo)
2859 {
2860 	/*
2861 	 * never wait forever
2862 	 */
2863 	if (timo == 0)
2864 		timo = hz;
2865 	lwkt_gettoken(&vm_token);
2866 
2867 	if (curthread == pagethread ||
2868 	    curthread == emergpager) {
2869 		/*
2870 		 * The pageout daemon itself needs pages, this is bad.
2871 		 */
2872 		if (vm_page_count_min(0)) {
2873 			vm_pageout_pages_needed = 1;
2874 			tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
2875 		}
2876 	} else {
2877 		/*
2878 		 * Wakeup the pageout daemon if necessary and wait.
2879 		 *
2880 		 * Do not wait indefinitely for the target to be reached,
2881 		 * as load might prevent it from being reached any time soon.
2882 		 * But wait a little to try to slow down page allocations
2883 		 * and to give more important threads (the pagedaemon)
2884 		 * allocation priority.
2885 		 */
2886 		if (vm_page_count_target()) {
2887 			if (vm_pages_needed <= 1) {
2888 				++vm_pages_needed;
2889 				wakeup(&vm_pages_needed);
2890 			}
2891 			++vm_pages_waiting;	/* SMP race ok */
2892 			tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
2893 		}
2894 	}
2895 	lwkt_reltoken(&vm_token);
2896 }
2897 
2898 /*
2899  * Block until free pages are available for allocation
2900  *
2901  * Called only from vm_fault so that processes page faulting can be
2902  * easily tracked.
2903  */
2904 void
2905 vm_wait_pfault(void)
2906 {
2907 	/*
2908 	 * Wakeup the pageout daemon if necessary and wait.
2909 	 *
2910 	 * Do not wait indefinitely for the target to be reached,
2911 	 * as load might prevent it from being reached any time soon.
2912 	 * But wait a little to try to slow down page allocations
2913 	 * and to give more important threads (the pagedaemon)
2914 	 * allocation priority.
2915 	 */
2916 	if (vm_page_count_min(0)) {
2917 		lwkt_gettoken(&vm_token);
2918 		while (vm_page_count_severe()) {
2919 			if (vm_page_count_target()) {
2920 				thread_t td;
2921 
2922 				if (vm_pages_needed <= 1) {
2923 					++vm_pages_needed;
2924 					wakeup(&vm_pages_needed);
2925 				}
2926 				++vm_pages_waiting;	/* SMP race ok */
2927 				tsleep(&vmstats.v_free_count, 0, "pfault", hz);
2928 
2929 				/*
2930 				 * Do not stay stuck in the loop if the
2931 				 * system is trying to kill the process.
2932 				 */
2933 				td = curthread;
2934 				if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL))
2935 					break;
2936 			}
2937 		}
2938 		lwkt_reltoken(&vm_token);
2939 	}
2940 }
2941 
2942 /*
2943  * Put the specified page on the active list (if appropriate).  Ensure
2944  * that act_count is at least ACT_INIT but do not otherwise mess with it.
2945  *
2946  * The caller should be holding the page busied ? XXX
2947  * This routine may not block.
2948  *
2949  * It is ok if the page is wired (so buffer cache operations don't have
2950  * to mess with the page queues).
2951  */
2952 void
2953 vm_page_activate(vm_page_t m)
2954 {
2955 	u_short oqueue;
2956 
2957 	/*
2958 	 * If already active or inappropriate, just set act_count and
2959 	 * return.  We don't have to spin-lock the page.
2960 	 */
2961 	if (m->queue - m->pc == PQ_ACTIVE ||
2962 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
2963 		if (m->act_count < ACT_INIT)
2964 			m->act_count = ACT_INIT;
2965 		return;
2966 	}
2967 
2968 	vm_page_spin_lock(m);
2969 	if (m->queue - m->pc != PQ_ACTIVE &&
2970 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
2971 		_vm_page_queue_spin_lock(m);
2972 		oqueue = _vm_page_rem_queue_spinlocked(m);
2973 		/* page is left spinlocked, queue is unlocked */
2974 
2975 		if (oqueue == PQ_CACHE)
2976 			mycpu->gd_cnt.v_reactivated++;
2977 		if (m->act_count < ACT_INIT)
2978 			m->act_count = ACT_INIT;
2979 		_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
2980 		_vm_page_and_queue_spin_unlock(m);
2981 		if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
2982 			pagedaemon_wakeup();
2983 	} else {
2984 		if (m->act_count < ACT_INIT)
2985 			m->act_count = ACT_INIT;
2986 		vm_page_spin_unlock(m);
2987 	}
2988 }
2989 
2990 void
2991 vm_page_soft_activate(vm_page_t m)
2992 {
2993 	if (m->queue - m->pc == PQ_ACTIVE ||
2994 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
2995 		if (m->act_count < ACT_INIT)
2996 			m->act_count = ACT_INIT;
2997 	} else {
2998 		vm_page_activate(m);
2999 	}
3000 }
3001 
3002 /*
3003  * Helper routine for vm_page_free_toq() and vm_page_cache().  This
3004  * routine is called when a page has been added to the cache or free
3005  * queues.
3006  *
3007  * This routine may not block.
3008  */
3009 static __inline void
3010 vm_page_free_wakeup(void)
3011 {
3012 	globaldata_t gd = mycpu;
3013 
3014 	/*
3015 	 * If the pageout daemon itself needs pages, then tell it that
3016 	 * there are some free.
3017 	 */
3018 	if (vm_pageout_pages_needed &&
3019 	    gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
3020 	    gd->gd_vmstats.v_pageout_free_min
3021 	) {
3022 		vm_pageout_pages_needed = 0;
3023 		wakeup(&vm_pageout_pages_needed);
3024 	}
3025 
3026 	/*
3027 	 * Wakeup processes that are waiting on memory.
3028 	 *
3029 	 * Generally speaking we want to wakeup stuck processes as soon as
3030 	 * possible.  !vm_page_count_min(0) is the absolute minimum point
3031 	 * where we can do this.  Wait a bit longer to reduce degenerate
3032 	 * re-blocking (vm_page_free_hysteresis).  The target check is just
3033 	 * to make sure the min-check w/hysteresis does not exceed the
3034 	 * normal target.
3035 	 */
3036 	if (vm_pages_waiting) {
3037 		if (!vm_page_count_min(vm_page_free_hysteresis) ||
3038 		    !vm_page_count_target()) {
3039 			vm_pages_waiting = 0;
3040 			wakeup(&vmstats.v_free_count);
3041 			++mycpu->gd_cnt.v_ppwakeups;
3042 		}
3043 #if 0
3044 		if (!vm_page_count_target()) {
3045 			/*
3046 			 * Plenty of pages are free, wakeup everyone.
3047 			 */
3048 			vm_pages_waiting = 0;
3049 			wakeup(&vmstats.v_free_count);
3050 			++mycpu->gd_cnt.v_ppwakeups;
3051 		} else if (!vm_page_count_min(0)) {
3052 			/*
3053 			 * Some pages are free, wakeup someone.
3054 			 */
3055 			int wcount = vm_pages_waiting;
3056 			if (wcount > 0)
3057 				--wcount;
3058 			vm_pages_waiting = wcount;
3059 			wakeup_one(&vmstats.v_free_count);
3060 			++mycpu->gd_cnt.v_ppwakeups;
3061 		}
3062 #endif
3063 	}
3064 }
3065 
3066 /*
3067  * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
3068  * it from its VM object.
3069  *
3070  * The vm_page must be BUSY on entry.  BUSY will be released on
3071  * return (the page will have been freed).
3072  */
3073 void
3074 vm_page_free_toq(vm_page_t m)
3075 {
3076 	mycpu->gd_cnt.v_tfree++;
3077 	if (m->flags & (PG_MAPPED | PG_WRITEABLE))
3078 		pmap_mapped_sync(m);
3079 	KKASSERT((m->flags & PG_MAPPED) == 0);
3080 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3081 
3082 	if ((m->busy_count & PBUSY_MASK) || ((m->queue - m->pc) == PQ_FREE)) {
3083 		kprintf("vm_page_free: pindex(%lu), busy %08x, "
3084 			"hold(%d)\n",
3085 			(u_long)m->pindex, m->busy_count, m->hold_count);
3086 		if ((m->queue - m->pc) == PQ_FREE)
3087 			panic("vm_page_free: freeing free page");
3088 		else
3089 			panic("vm_page_free: freeing busy page");
3090 	}
3091 
3092 	/*
3093 	 * Remove from object, spinlock the page and its queues and
3094 	 * remove from any queue.  No queue spinlock will be held
3095 	 * after this section (because the page was removed from any
3096 	 * queue).
3097 	 */
3098 	vm_page_remove(m);
3099 
3100 	/*
3101 	 * No further management of fictitious pages occurs beyond object
3102 	 * and queue removal.
3103 	 */
3104 	if ((m->flags & PG_FICTITIOUS) != 0) {
3105 		KKASSERT(m->queue == PQ_NONE);
3106 		vm_page_wakeup(m);
3107 		return;
3108 	}
3109 	vm_page_and_queue_spin_lock(m);
3110 	_vm_page_rem_queue_spinlocked(m);
3111 
3112 	m->valid = 0;
3113 	vm_page_undirty(m);
3114 
3115 	if (m->wire_count != 0) {
3116 		if (m->wire_count > 1) {
3117 		    panic(
3118 			"vm_page_free: invalid wire count (%d), pindex: 0x%lx",
3119 			m->wire_count, (long)m->pindex);
3120 		}
3121 		panic("vm_page_free: freeing wired page");
3122 	}
3123 
3124 	if (!MD_PAGE_FREEABLE(m))
3125 		panic("vm_page_free: page %p is still mapped!", m);
3126 
3127 	/*
3128 	 * Clear the PG_NEED_COMMIT and the PG_UNQUEUED flags.  The
3129 	 * page returns to normal operation and will be placed in
3130 	 * the PQ_HOLD or PQ_FREE queue.
3131 	 */
3132 	vm_page_flag_clear(m, PG_NEED_COMMIT | PG_UNQUEUED);
3133 
3134 	if (m->hold_count != 0) {
3135 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
3136 	} else {
3137 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
3138 	}
3139 
3140 	/*
3141 	 * This sequence allows us to clear BUSY while still holding
3142 	 * its spin lock, which reduces contention vs allocators.  We
3143 	 * must not leave the queue locked or _vm_page_wakeup() may
3144 	 * deadlock.
3145 	 */
3146 	_vm_page_queue_spin_unlock(m);
3147 	if (_vm_page_wakeup(m)) {
3148 		vm_page_spin_unlock(m);
3149 		wakeup(m);
3150 	} else {
3151 		vm_page_spin_unlock(m);
3152 	}
3153 	vm_page_free_wakeup();
3154 }
3155 
3156 /*
3157  * Mark this page as wired down by yet another map.  We do not adjust the
3158  * queue the page is on; it will be checked for wiring as needed.
3159  *
3160  * This function has no effect on fictitious pages.
3161  *
3162  * Caller must be holding the page busy.
3163  */
3164 void
3165 vm_page_wire(vm_page_t m)
3166 {
3167 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3168 	if ((m->flags & PG_FICTITIOUS) == 0) {
3169 		if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
3170 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count, 1);
3171 		}
3172 		KASSERT(m->wire_count != 0,
3173 			("vm_page_wire: wire_count overflow m=%p", m));
3174 	}
3175 }
3176 
3177 /*
3178  * Release one wiring of this page, potentially enabling it to be paged again.
3179  *
3180  * Note that wired pages are no longer unconditionally removed from the
3181  * paging queues, so the page may already be on a queue.  Move the page
3182  * to the desired queue if necessary.
3183  *
3184  * Many pages placed on the inactive queue should actually go
3185  * into the cache, but it is difficult to figure out which.  What
3186  * we do instead, if the inactive target is well met, is to put
3187  * clean pages at the head of the inactive queue instead of the tail.
3188  * This will cause them to be moved to the cache more quickly and
3189  * if not actively re-referenced, freed more quickly.  If we just
3190  * stick these pages at the end of the inactive queue, heavy filesystem
3191  * meta-data accesses can cause an unnecessary paging load on memory bound
3192  * processes.  This optimization causes one-time-use metadata to be
3193  * reused more quickly.
3194  *
3195  * Pages marked PG_NEED_COMMIT are always activated and never placed on
3196  * the inactive queue.  This helps the pageout daemon determine memory
3197  * pressure and act on out-of-memory situations more quickly.
3198  *
3199  * BUT, if we are in a low-memory situation we have no choice but to
3200  * put clean pages on the cache queue.
3201  *
3202  * A number of routines use vm_page_unwire() to guarantee that the page
3203  * will go into either the inactive or active queues, and will NEVER
3204  * be placed in the cache - for example, just after dirtying a page.
3205  * Dirty pages in the cache are not allowed.
3206  *
3207  * PG_FICTITIOUS or PG_UNQUEUED pages are never moved to any queue, and
3208  * the wire_count will not be adjusted in any way for a PG_FICTITIOUS
3209  * page.
3210  *
3211  * This routine may not block.
3212  */
3213 void
3214 vm_page_unwire(vm_page_t m, int activate)
3215 {
3216 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3217 	if (m->flags & PG_FICTITIOUS) {
3218 		/* do nothing */
3219 	} else if ((int)m->wire_count <= 0) {
3220 		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
3221 	} else {
3222 		if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
3223 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count,-1);
3224 			if (m->flags & PG_UNQUEUED) {
3225 				;
3226 			} else if (activate || (m->flags & PG_NEED_COMMIT)) {
3227 				vm_page_activate(m);
3228 			} else {
3229 				vm_page_deactivate(m);
3230 			}
3231 		}
3232 	}
3233 }
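
/*
 * Example usage (illustrative sketch only, kept under #if 0 so it is not
 * compiled).  The usual wire/unwire pairing around a temporary kernel use
 * of a page; both calls require the page to be busied.  The helper name
 * and wmesg are hypothetical.
 */
#if 0
static void
example_wire_window(vm_page_t m)
{
	vm_page_busy_wait(m, FALSE, "exwire");
	vm_page_wire(m);		/* page can no longer be paged out */
	vm_page_wakeup(m);

	/* ... use the page without holding it busied ... */

	vm_page_busy_wait(m, FALSE, "exwire");
	vm_page_unwire(m, 0);		/* 0: deactivate rather than activate */
	vm_page_wakeup(m);
}
#endif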
3234 
3235 /*
3236  * Move the specified page to the inactive queue.
3237  *
3238  * Normally athead is 0 resulting in LRU operation.  athead is set
3239  * to 1 if we want this page to be 'as if it were placed in the cache',
3240  * except without unmapping it from the process address space.
3241  *
3242  * vm_page's spinlock must be held on entry and will remain held on return.
3243  * This routine may not block.  The caller does not have to hold the page
3244  * busied but should have some sort of interlock on its validity.
3245  *
3246  * It is ok if the page is wired (so buffer cache operations don't have
3247  * to mess with the page queues).
3248  */
3249 static void
3250 _vm_page_deactivate_locked(vm_page_t m, int athead)
3251 {
3252 	u_short oqueue;
3253 
3254 	/*
3255 	 * Ignore if already inactive.
3256 	 */
3257 	if (m->queue - m->pc == PQ_INACTIVE ||
3258 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3259 		return;
3260 	}
3261 
3262 	_vm_page_queue_spin_lock(m);
3263 	oqueue = _vm_page_rem_queue_spinlocked(m);
3264 
3265 	if ((m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3266 		if (oqueue == PQ_CACHE)
3267 			mycpu->gd_cnt.v_reactivated++;
3268 		vm_page_flag_clear(m, PG_WINATCFLS);
3269 		_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
3270 		if (athead == 0) {
3271 			atomic_add_long(
3272 				&vm_page_queues[PQ_INACTIVE + m->pc].adds, 1);
3273 		}
3274 	}
3275 	/* NOTE: PQ_NONE if condition not taken */
3276 	_vm_page_queue_spin_unlock(m);
3277 	/* leaves vm_page spinlocked */
3278 }
3279 
3280 /*
3281  * Attempt to deactivate a page.
3282  *
3283  * No requirements.  We can pre-filter before getting the spinlock.
3284  *
3285  * It is ok if the page is wired (so buffer cache operations don't have
3286  * to mess with the page queues).
3287  */
3288 void
3289 vm_page_deactivate(vm_page_t m)
3290 {
3291 	if (m->queue - m->pc != PQ_INACTIVE &&
3292 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3293 		vm_page_spin_lock(m);
3294 		_vm_page_deactivate_locked(m, 0);
3295 		vm_page_spin_unlock(m);
3296 	}
3297 }
3298 
3299 void
3300 vm_page_deactivate_locked(vm_page_t m)
3301 {
3302 	_vm_page_deactivate_locked(m, 0);
3303 }
3304 
3305 /*
3306  * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy it.
3307  *
3308  * This function returns non-zero if it successfully moved the page to
3309  * PQ_CACHE.
3310  *
3311  * This function unconditionally unbusies the page on return.
3312  */
3313 int
3314 vm_page_try_to_cache(vm_page_t m)
3315 {
3316 	/*
3317 	 * Shortcut if we obviously cannot move the page, or if the
3318 	 * page is already on the cache queue, or it is fictitious.
3319 	 *
3320 	 * Never allow a wired page into the cache.
3321 	 */
3322 	if (m->dirty || m->hold_count || m->wire_count ||
3323 	    m->queue - m->pc == PQ_CACHE ||
3324 	    (m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS))) {
3325 		vm_page_wakeup(m);
3326 		return(0);
3327 	}
3328 
3329 	/*
3330 	 * Page busied by us and no longer spinlocked.  Dirty pages cannot
3331 	 * be moved to the cache, but can be deactivated.  However, users
3332 	 * of this function want to move pages closer to the cache so we
3333 	 * only deactivate it if it is in PQ_ACTIVE.  We do not re-deactivate.
3334 	 */
3335 	vm_page_test_dirty(m);
3336 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3337 		if (m->queue - m->pc == PQ_ACTIVE)
3338 			vm_page_deactivate(m);
3339 		vm_page_wakeup(m);
3340 		return(0);
3341 	}
3342 	vm_page_cache(m);
3343 	return(1);
3344 }
3345 
3346 /*
3347  * Attempt to free the page.  If we cannot free it, we do nothing.
3348  * 1 is returned on success, 0 on failure.
3349  *
3350  * The page can be in any state, including already being on the free
3351  * queue.  Check to see if it really can be freed.  Note that we disallow
3352  * this ad-hoc operation if the page is flagged PG_UNQUEUED.
3353  *
3354  * Caller provides an unlocked/non-busied page.
3355  * No requirements.
3356  */
3357 int
3358 vm_page_try_to_free(vm_page_t m)
3359 {
3360 	if (vm_page_busy_try(m, TRUE))
3361 		return(0);
3362 
3363 	if (m->dirty ||				/* can't free if it is dirty */
3364 	    m->hold_count ||			/* or held (XXX may be wrong) */
3365 	    m->wire_count ||			/* or wired */
3366 	    (m->flags & (PG_UNQUEUED |		/* or unqueued */
3367 			 PG_NEED_COMMIT |	/* or needs a commit */
3368 			 PG_FICTITIOUS)) ||	/* or is fictitious */
3369 	    m->queue - m->pc == PQ_FREE ||	/* already on PQ_FREE */
3370 	    m->queue - m->pc == PQ_HOLD) {	/* already on PQ_HOLD */
3371 		vm_page_wakeup(m);
3372 		return(0);
3373 	}
3374 
3375 	/*
3376 	 * We can probably free the page.
3377 	 *
3378 	 * Page busied by us and no longer spinlocked.  Dirty pages will
3379 	 * not be freed by this function.    We have to re-test the
3380 	 * dirty bit after cleaning out the pmaps.
3381 	 */
3382 	vm_page_test_dirty(m);
3383 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3384 		vm_page_wakeup(m);
3385 		return(0);
3386 	}
3387 	vm_page_protect(m, VM_PROT_NONE);
3388 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3389 		vm_page_wakeup(m);
3390 		return(0);
3391 	}
3392 	vm_page_free(m);
3393 	return(1);
3394 }
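
/*
 * Example usage (illustrative sketch only, kept under #if 0 so it is not
 * compiled).  Opportunistic reclaim of a page found during a scan:
 * vm_page_try_to_free() takes an unbusied page, while vm_page_try_to_cache()
 * expects a busied page and unconditionally unbusies it.  The helper name
 * is hypothetical.
 */
#if 0
static void
example_reclaim(vm_page_t m)
{
	if (vm_page_try_to_free(m))
		return;				/* page was freed */
	if (vm_page_busy_try(m, TRUE) == 0)
		(void)vm_page_try_to_cache(m);	/* unbusies the page */
}
#endif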
3395 
3396 /*
3397  * vm_page_cache
3398  *
3399  * Put the specified page onto the page cache queue (if appropriate).
3400  *
3401  * The page must be busy, and this routine will release the busy and
3402  * possibly even free the page.
3403  */
3404 void
3405 vm_page_cache(vm_page_t m)
3406 {
3407 	/*
3408 	 * Not suitable for the cache
3409 	 */
3410 	if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS)) ||
3411 	    (m->busy_count & PBUSY_MASK) ||
3412 	    m->wire_count || m->hold_count) {
3413 		vm_page_wakeup(m);
3414 		return;
3415 	}
3416 
3417 	/*
3418 	 * Already in the cache (and thus not mapped)
3419 	 */
3420 	if ((m->queue - m->pc) == PQ_CACHE) {
3421 		if (m->flags & (PG_MAPPED | PG_WRITEABLE))
3422 			pmap_mapped_sync(m);
3423 		KKASSERT((m->flags & PG_MAPPED) == 0);
3424 		vm_page_wakeup(m);
3425 		return;
3426 	}
3427 
3428 #if 0
3429 	/*
3430 	 * REMOVED - it is possible for dirty to get set at any time as
3431 	 *	     long as the page is still mapped and writeable.
3432 	 *
3433 	 * Caller is required to test m->dirty, but note that the act of
3434 	 * removing the page from its maps can cause it to become dirty
3435 	 * on an SMP system due to another cpu running in usermode.
3436 	 */
3437 	if (m->dirty) {
3438 		panic("vm_page_cache: caching a dirty page, pindex: %ld",
3439 			(long)m->pindex);
3440 	}
3441 #endif
3442 
3443 	/*
3444 	 * Remove all pmaps and indicate that the page is not
3445 	 * writeable or mapped.  Our vm_page_protect() call may
3446 	 * have blocked (especially w/ VM_PROT_NONE), so recheck
3447 	 * everything.
3448 	 */
3449 	vm_page_protect(m, VM_PROT_NONE);
3450 	pmap_mapped_sync(m);
3451 	if ((m->flags & (PG_UNQUEUED | PG_MAPPED)) ||
3452 	    (m->busy_count & PBUSY_MASK) ||
3453 	    m->wire_count || m->hold_count) {
3454 		vm_page_wakeup(m);
3455 	} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3456 		vm_page_deactivate(m);
3457 		vm_page_wakeup(m);
3458 	} else {
3459 		_vm_page_and_queue_spin_lock(m);
3460 		_vm_page_rem_queue_spinlocked(m);
3461 		_vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
3462 		_vm_page_and_queue_spin_unlock(m);
3463 		vm_page_wakeup(m);
3464 		vm_page_free_wakeup();
3465 	}
3466 }
3467 
3468 /*
3469  * vm_page_dontneed()
3470  *
3471  * Cache, deactivate, or do nothing as appropriate.  This routine
3472  * is typically used by madvise() MADV_DONTNEED.
3473  *
3474  * Generally speaking we want to move the page into the cache so
3475  * it gets reused quickly.  However, this can result in a silly syndrome
3476  * due to the page recycling too quickly.  Small objects will not be
3477  * fully cached.  On the other hand, if we move the page to the inactive
3478  * queue we wind up with a problem whereby very large objects
3479  * unnecessarily blow away our inactive and cache queues.
3480  *
3481  * The solution is to move the pages based on a fixed weighting.  We
3482  * either leave them alone, deactivate them, or move them to the cache,
3483  * where moving them to the cache has the highest weighting.
3484  * By forcing some pages into other queues we eventually force the
3485  * system to balance the queues, potentially recovering other unrelated
3486  * space from active.  The idea is to not force this to happen too
3487  * often.
3488  *
3489  * The page must be busied.
3490  */
3491 void
3492 vm_page_dontneed(vm_page_t m)
3493 {
3494 	static int dnweight;
3495 	int dnw;
3496 	int head;
3497 
3498 	dnw = ++dnweight;
3499 
3500 	/*
3501 	 * occassionally leave the page alone
3502 	 */
3503 	if ((dnw & 0x01F0) == 0 ||
3504 	    m->queue - m->pc == PQ_INACTIVE ||
3505 	    m->queue - m->pc == PQ_CACHE
3506 	) {
3507 		if (m->act_count >= ACT_INIT)
3508 			--m->act_count;
3509 		return;
3510 	}
3511 
3512 	/*
3513 	 * If vm_page_dontneed() is inactivating a page, it must clear
3514 	 * the referenced flag; otherwise the pagedaemon will see references
3515 	 * on the page in the inactive queue and reactivate it. Until the
3516 	 * page can move to the cache queue, madvise's job is not done.
3517 	 */
3518 	vm_page_flag_clear(m, PG_REFERENCED);
3519 	pmap_clear_reference(m);
3520 
3521 	if (m->dirty == 0)
3522 		vm_page_test_dirty(m);
3523 
3524 	if (m->dirty || (dnw & 0x0070) == 0) {
3525 		/*
3526 		 * Deactivate the page 3 times out of 32.
3527 		 */
3528 		head = 0;
3529 	} else {
3530 		/*
3531 		 * Cache the page 28 times out of every 32.  Note that
3532 		 * the page is deactivated instead of cached, but placed
3533 		 * at the head of the queue instead of the tail.
3534 		 */
3535 		head = 1;
3536 	}
3537 	vm_page_spin_lock(m);
3538 	_vm_page_deactivate_locked(m, head);
3539 	vm_page_spin_unlock(m);
3540 }
3541 
3542 /*
3543  * These routines manipulate the 'soft busy' count for a page.  A soft busy
3544  * is almost like a hard BUSY except that it allows certain compatible
3545  * operations to occur on the page while it is busy.  For example, a page
3546  * undergoing a write can still be mapped read-only.
3547  *
3548  * We also use soft-busy to quickly pmap_enter shared read-only pages
3549  * without having to hold the page locked.
3550  *
3551  * The soft-busy count can be > 1 in situations where multiple threads
3552  * are pmap_enter()ing the same page simultaneously, or when two buffer
3553  * cache buffers overlap the same page.
3554  *
3555  * The caller must hold the page BUSY when making these two calls.
3556  */
3557 void
3558 vm_page_io_start(vm_page_t m)
3559 {
3560 	uint32_t ocount;
3561 
3562 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
3563 	KKASSERT(ocount & PBUSY_LOCKED);
3564 }
3565 
3566 void
3567 vm_page_io_finish(vm_page_t m)
3568 {
3569 	uint32_t ocount;
3570 
3571 	ocount = atomic_fetchadd_int(&m->busy_count, -1);
3572 	KKASSERT(ocount & PBUSY_MASK);
3573 #if 0
3574 	if (((ocount - 1) & (PBUSY_LOCKED | PBUSY_MASK)) == 0)
3575 		wakeup(m);
3576 #endif
3577 }
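
/*
 * Illustrative sketch (not part of the build), assuming the caller
 * holds the hard busy as required above:
 *
 *	vm_page_io_start(m);	account one soft-busy (I/O) reference
 *	...	the hard busy may be dropped and re-acquired while the
 *		write is in flight; the soft-busy keeps the page from
 *		being freed and read-only mappings remain legal ...
 *	vm_page_io_finish(m);	drop the soft-busy reference
 */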
3578 
3579 /*
3580  * Attempt to soft-busy a page.  The page must not be PBUSY_LOCKED.
3581  *
3582  * We can't use fetchadd here because we might race a hard-busy and the
3583  * page freeing code asserts on a non-zero soft-busy count (even if only
3584  * temporary).
3585  *
3586  * Returns 0 on success, non-zero on failure.
3587  */
3588 int
3589 vm_page_sbusy_try(vm_page_t m)
3590 {
3591 	uint32_t ocount;
3592 
3593 	for (;;) {
3594 		ocount = m->busy_count;
3595 		cpu_ccfence();
3596 		if (ocount & PBUSY_LOCKED)
3597 			return 1;
3598 		if (atomic_cmpset_int(&m->busy_count, ocount, ocount + 1))
3599 			break;
3600 	}
3601 	return 0;
3602 #if 0
3603 	if (m->busy_count & PBUSY_LOCKED)
3604 		return 1;
3605 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
3606 	if (ocount & PBUSY_LOCKED) {
3607 		vm_page_sbusy_drop(m);
3608 		return 1;
3609 	}
3610 	return 0;
3611 #endif
3612 }
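
/*
 * Illustrative sketch (not part of the build): a caller needing only
 * a shared, read-only view can try the soft-busy and fall back to
 * hard-busying the page on failure:
 *
 *	if (vm_page_sbusy_try(m) == 0) {
 *		... access the page read-only ...
 *		vm_page_sbusy_drop(m);
 *	} else {
 *		... hard-busy path ...
 *	}
 */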
3613 
3614 /*
3615  * Indicate that a clean VM page requires a filesystem commit and cannot
3616  * be reused.  Used by tmpfs.
3617  */
3618 void
3619 vm_page_need_commit(vm_page_t m)
3620 {
3621 	vm_page_flag_set(m, PG_NEED_COMMIT);
3622 	vm_object_set_writeable_dirty(m->object);
3623 }
3624 
3625 void
3626 vm_page_clear_commit(vm_page_t m)
3627 {
3628 	vm_page_flag_clear(m, PG_NEED_COMMIT);
3629 }
3630 
3631 /*
3632  * Grab a page, blocking if it is busy and allocating a page if necessary.
3633  * A busy page is returned or NULL.  The page may or may not be valid and
3634  * might not be on a queue (the caller is responsible for the disposition of
3635  * the page).
3636  *
3637  * If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the
3638  * page will be zero'd and marked valid.
3639  *
3640  * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
3641  * valid even if it already exists.
3642  *
3643  * If VM_ALLOC_RETRY is specified this routine will never return NULL.  Also
3644  * note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified.
3645  * VM_ALLOC_NULL_OK is implied when VM_ALLOC_RETRY is specified.
3646  *
3647  * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
3648  * always returned if this routine had to block.
3649  *
3650  * This routine may not be called from an interrupt.
3651  *
3652  * No other requirements.
3653  */
3654 vm_page_t
3655 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
3656 {
3657 	vm_page_t m;
3658 	int error;
3659 	int shared = 1;
3660 
3661 	KKASSERT(allocflags &
3662 		(VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
3663 	vm_object_hold_shared(object);
3664 	for (;;) {
3665 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
3666 		if (error) {
3667 			vm_page_sleep_busy(m, TRUE, "pgrbwt");
3668 			if ((allocflags & VM_ALLOC_RETRY) == 0) {
3669 				m = NULL;
3670 				break;
3671 			}
3672 			/* retry */
3673 		} else if (m == NULL) {
3674 			if (shared) {
3675 				vm_object_upgrade(object);
3676 				shared = 0;
3677 			}
3678 			if (allocflags & VM_ALLOC_RETRY)
3679 				allocflags |= VM_ALLOC_NULL_OK;
3680 			m = vm_page_alloc(object, pindex,
3681 					  allocflags & ~VM_ALLOC_RETRY);
3682 			if (m)
3683 				break;
3684 			vm_wait(0);
3685 			if ((allocflags & VM_ALLOC_RETRY) == 0)
3686 				goto failed;
3687 		} else {
3688 			/* m found */
3689 			break;
3690 		}
3691 	}
3692 
3693 	/*
3694 	 * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
3695 	 *
3696 	 * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
3697 	 * valid even if already valid.
3698 	 *
3699 	 * NOTE!  We have removed all of the PG_ZERO optimizations and also
3700 	 *	  removed the idle zeroing code.  These optimizations actually
3701 	 *	  slow things down on modern cpus because the zeroed area is
3702 	 *	  likely uncached, placing a memory-access burden on the
3703 	 *	  accessors taking the fault.
3704 	 *
3705 	 *	  By always zeroing the page in-line with the fault, no
3706 	 *	  dynamic ram reads are needed and the caches are hot, ready
3707 	 *	  for userland to access the memory.
3708 	 */
3709 	if (m->valid == 0) {
3710 		if (allocflags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
3711 			pmap_zero_page(VM_PAGE_TO_PHYS(m));
3712 			m->valid = VM_PAGE_BITS_ALL;
3713 		}
3714 	} else if (allocflags & VM_ALLOC_FORCE_ZERO) {
3715 		pmap_zero_page(VM_PAGE_TO_PHYS(m));
3716 		m->valid = VM_PAGE_BITS_ALL;
3717 	}
3718 failed:
3719 	vm_object_drop(object);
3720 	return(m);
3721 }
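
/*
 * Illustrative sketch (not part of the build): a typical caller
 * combines VM_ALLOC_NORMAL with VM_ALLOC_RETRY and VM_ALLOC_ZERO so
 * that a busied page is always returned (zero'd and marked valid if
 * it had to be allocated):
 *
 *	m = vm_page_grab(object, pindex,
 *			 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO);
 *	... use the page ...
 *	vm_page_wakeup(m);
 */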
3722 
3723 /*
3724  * Mapping function for valid bits or for dirty bits in
3725  * a page.  May not block.
3726  *
3727  * Inputs are required to range within a page.
3728  *
3729  * No requirements.
3730  * Non blocking.
3731  */
3732 int
3733 vm_page_bits(int base, int size)
3734 {
3735 	int first_bit;
3736 	int last_bit;
3737 
3738 	KASSERT(
3739 	    base + size <= PAGE_SIZE,
3740 	    ("vm_page_bits: illegal base/size %d/%d", base, size)
3741 	);
3742 
3743 	if (size == 0)		/* handle degenerate case */
3744 		return(0);
3745 
3746 	first_bit = base >> DEV_BSHIFT;
3747 	last_bit = (base + size - 1) >> DEV_BSHIFT;
3748 
3749 	return ((2 << last_bit) - (1 << first_bit));
3750 }
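
/*
 * Worked example (assuming DEV_BSIZE is 512, i.e. DEV_BSHIFT is 9):
 * vm_page_bits(512, 1024) covers device blocks 1 and 2, so
 * first_bit = 1, last_bit = (512 + 1024 - 1) >> 9 = 2, and the
 * result is (2 << 2) - (1 << 1) = 0x06.
 */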
3751 
3752 /*
3753  * Sets portions of a page valid and clean.  The arguments are expected
3754  * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
3755  * of any partial chunks touched by the range.  The invalid portion of
3756  * such chunks will be zero'd.
3757  *
3758  * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
3759  *	 align base to DEV_BSIZE so as not to mark clean a partially
3760  *	 truncated device block.  Otherwise the dirty page status might be
3761  *	 lost.
3762  *
3763  * This routine may not block.
3764  *
3765  * (base + size) must be less than or equal to PAGE_SIZE.
3766  */
3767 static void
3768 _vm_page_zero_valid(vm_page_t m, int base, int size)
3769 {
3770 	int frag;
3771 	int endoff;
3772 
3773 	if (size == 0)	/* handle degenerate case */
3774 		return;
3775 
3776 	/*
3777 	 * If the base is not DEV_BSIZE aligned and the valid
3778 	 * bit is clear, we have to zero out a portion of the
3779 	 * first block.
3780 	 */
3781 
3782 	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
3783 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
3784 	) {
3785 		pmap_zero_page_area(
3786 		    VM_PAGE_TO_PHYS(m),
3787 		    frag,
3788 		    base - frag
3789 		);
3790 	}
3791 
3792 	/*
3793 	 * If the ending offset is not DEV_BSIZE aligned and the
3794 	 * valid bit is clear, we have to zero out a portion of
3795 	 * the last block.
3796 	 */
3797 
3798 	endoff = base + size;
3799 
3800 	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
3801 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
3802 	) {
3803 		pmap_zero_page_area(
3804 		    VM_PAGE_TO_PHYS(m),
3805 		    endoff,
3806 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
3807 		);
3808 	}
3809 }
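
/*
 * Worked example (assuming DEV_BSIZE is 512): for base = 100 and
 * size = 600 the range ends at offset 700.  If block 0 is not yet
 * valid, bytes 0-99 are zero'd; if block 1 is not yet valid, bytes
 * 700-1023 are zero'd.  The caller then marks blocks 0 and 1 valid
 * via vm_page_bits(100, 600).
 */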
3810 
3811 /*
3812  * Set portions of a page valid.  We set valid bits inclusive of
3813  * any partial DEV_BSIZE chunks overlapping the range; the invalid
3814  * portions of such chunks are zero'd so stale data is not exposed.
3815  *
3816  * Unlike vm_page_set_validclean(), this routine does not clear
3817  * dirty bits, the pmap modify bit, or the PG_NOSYNC flag.  Use
3818  * vm_page_set_validclean() when the range should also be marked
3819  * clean.
3820  *
3822  * Page must be busied?
3823  * No other requirements.
3824  */
3825 void
3826 vm_page_set_valid(vm_page_t m, int base, int size)
3827 {
3828 	_vm_page_zero_valid(m, base, size);
3829 	m->valid |= vm_page_bits(base, size);
3830 }
3831 
3832 
3833 /*
3834  * Set valid bits and clear dirty bits.
3835  *
3836  * Page must be busied by caller.
3837  *
3838  * NOTE: This function does not clear the pmap modified bit.
3839  *	 Also note that e.g. NFS may use a byte-granular base
3840  *	 and size.
3841  *
3842  * No other requirements.
3843  */
3844 void
3845 vm_page_set_validclean(vm_page_t m, int base, int size)
3846 {
3847 	int pagebits;
3848 
3849 	_vm_page_zero_valid(m, base, size);
3850 	pagebits = vm_page_bits(base, size);
3851 	m->valid |= pagebits;
3852 	m->dirty &= ~pagebits;
3853 	if (base == 0 && size == PAGE_SIZE) {
3854 		/*pmap_clear_modify(m);*/
3855 		vm_page_flag_clear(m, PG_NOSYNC);
3856 	}
3857 }
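
/*
 * Illustrative sketch (not part of the build): after filling the
 * first nread bytes of a page from backing store (nread is a
 * hypothetical byte count and need not be DEV_BSIZE aligned), a
 * filesystem can mark just that range valid and clean:
 *
 *	vm_page_set_validclean(m, 0, nread);
 */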
3858 
3859 /*
3860  * Set valid & dirty.  Used by buwrite()
3861  *
3862  * Page must be busied by caller.
3863  */
3864 void
3865 vm_page_set_validdirty(vm_page_t m, int base, int size)
3866 {
3867 	int pagebits;
3868 
3869 	pagebits = vm_page_bits(base, size);
3870 	m->valid |= pagebits;
3871 	m->dirty |= pagebits;
3872 	if (m->object)
3873 		vm_object_set_writeable_dirty(m->object);
3874 }
3875 
3876 /*
3877  * Clear dirty bits.
3878  *
3879  * NOTE: This function does not clear the pmap modified bit.
3880  *	 Also note that e.g. NFS may use a byte-granular base
3881  *	 and size.
3882  *
3883  * Page must be busied?
3884  * No other requirements.
3885  */
3886 void
3887 vm_page_clear_dirty(vm_page_t m, int base, int size)
3888 {
3889 	m->dirty &= ~vm_page_bits(base, size);
3890 	if (base == 0 && size == PAGE_SIZE) {
3891 		/*pmap_clear_modify(m);*/
3892 		vm_page_flag_clear(m, PG_NOSYNC);
3893 	}
3894 }
3895 
3896 /*
3897  * Make the page all-dirty.
3898  *
3899  * Also make sure the related object and vnode reflect the fact that the
3900  * object may now contain a dirty page.
3901  *
3902  * Page must be busied?
3903  * No other requirements.
3904  */
3905 void
3906 vm_page_dirty(vm_page_t m)
3907 {
3908 #ifdef INVARIANTS
3909 	int pqtype = m->queue - m->pc;
3910 #endif
3911 	KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
3912 		("vm_page_dirty: page in free/cache queue!"));
3913 	if (m->dirty != VM_PAGE_BITS_ALL) {
3914 		m->dirty = VM_PAGE_BITS_ALL;
3915 		if (m->object)
3916 			vm_object_set_writeable_dirty(m->object);
3917 	}
3918 }
3919 
3920 /*
3921  * Invalidates DEV_BSIZE'd chunks within a page.  Both the
3922  * valid and dirty bits for the affected areas are cleared.
3923  *
3924  * Page must be busied?
3925  * Does not block.
3926  * No other requirements.
3927  */
3928 void
3929 vm_page_set_invalid(vm_page_t m, int base, int size)
3930 {
3931 	int bits;
3932 
3933 	bits = vm_page_bits(base, size);
3934 	m->valid &= ~bits;
3935 	m->dirty &= ~bits;
3936 	atomic_add_int(&m->object->generation, 1);
3937 }
3938 
3939 /*
3940  * The kernel assumes that the invalid portions of a page contain
3941  * garbage, but such pages can be mapped into memory by user code.
3942  * When this occurs, we must zero out the non-valid portions of the
3943  * page so user code sees what it expects.
3944  *
3945  * Pages are most often semi-valid when the end of a file is mapped
3946  * into memory and the file's size is not page aligned.
3947  *
3948  * Page must be busied?
3949  * No other requirements.
3950  */
3951 void
3952 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
3953 {
3954 	int b;
3955 	int i;
3956 
3957 	/*
3958 	 * Scan the valid bits looking for invalid sections that
3959 	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
3960 	 * valid bit may be set) have already been zeroed by
3961 	 * vm_page_set_validclean().
3962 	 */
3963 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
3964 		if (i == (PAGE_SIZE / DEV_BSIZE) ||
3965 		    (m->valid & (1 << i))
3966 		) {
3967 			if (i > b) {
3968 				pmap_zero_page_area(
3969 				    VM_PAGE_TO_PHYS(m),
3970 				    b << DEV_BSHIFT,
3971 				    (i - b) << DEV_BSHIFT
3972 				);
3973 			}
3974 			b = i + 1;
3975 		}
3976 	}
3977 
3978 	/*
3979 	 * setvalid is TRUE when we can safely set the zero'd areas
3980 	 * as being valid.  We can do this if there are no cache consistency
3981 	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
3982 	 */
3983 	if (setvalid)
3984 		m->valid = VM_PAGE_BITS_ALL;
3985 }
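
/*
 * Worked example (assuming PAGE_SIZE is 4096 and DEV_BSIZE is 512):
 * the last page of a 5000 byte file holds 904 bytes of file data, so
 * blocks 0-1 are valid and this routine zero's blocks 2-7 (bytes
 * 1024-4095) before userland sees the page.
 */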
3986 
3987 /*
3988  * Is a (partial) page valid?  Note that the case where size == 0
3989  * will return FALSE in the degenerate case where the page is entirely
3990  * invalid, and TRUE otherwise.
3991  *
3992  * Does not block.
3993  * No other requirements.
3994  */
3995 int
3996 vm_page_is_valid(vm_page_t m, int base, int size)
3997 {
3998 	int bits = vm_page_bits(base, size);
3999 
4000 	if (m->valid && ((m->valid & bits) == bits))
4001 		return 1;
4002 	else
4003 		return 0;
4004 }
4005 
4006 /*
4007  * Update dirty bits from pmap/mmu.  May not block.
4008  *
4009  * Caller must hold the page busy
4010  *
4011  * WARNING! Unless the page has been unmapped, this function only
4012  *	    provides a likely dirty status.
4013  */
4014 void
4015 vm_page_test_dirty(vm_page_t m)
4016 {
4017 	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) {
4018 		vm_page_dirty(m);
4019 	}
4020 }
4021 
4022 #include "opt_ddb.h"
4023 #ifdef DDB
4024 #include <ddb/ddb.h>
4025 
4026 DB_SHOW_COMMAND(page, vm_page_print_page_info)
4027 {
4028 	db_printf("vmstats.v_free_count: %ld\n", vmstats.v_free_count);
4029 	db_printf("vmstats.v_cache_count: %ld\n", vmstats.v_cache_count);
4030 	db_printf("vmstats.v_inactive_count: %ld\n", vmstats.v_inactive_count);
4031 	db_printf("vmstats.v_active_count: %ld\n", vmstats.v_active_count);
4032 	db_printf("vmstats.v_wire_count: %ld\n", vmstats.v_wire_count);
4033 	db_printf("vmstats.v_free_reserved: %ld\n", vmstats.v_free_reserved);
4034 	db_printf("vmstats.v_free_min: %ld\n", vmstats.v_free_min);
4035 	db_printf("vmstats.v_free_target: %ld\n", vmstats.v_free_target);
4036 	db_printf("vmstats.v_cache_min: %ld\n", vmstats.v_cache_min);
4037 	db_printf("vmstats.v_inactive_target: %ld\n",
4038 		  vmstats.v_inactive_target);
4039 }
4040 
4041 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
4042 {
4043 	int i;
4044 	db_printf("PQ_FREE:");
4045 	for (i = 0; i < PQ_L2_SIZE; i++) {
4046 		db_printf(" %ld", vm_page_queues[PQ_FREE + i].lcnt);
4047 	}
4048 	db_printf("\n");
4049 
4050 	db_printf("PQ_CACHE:");
4051 	for (i = 0; i < PQ_L2_SIZE; i++) {
4052 		db_printf(" %ld", vm_page_queues[PQ_CACHE + i].lcnt);
4053 	}
4054 	db_printf("\n");
4055 
4056 	db_printf("PQ_ACTIVE:");
4057 	for (i = 0; i < PQ_L2_SIZE; i++) {
4058 		db_printf(" %ld", vm_page_queues[PQ_ACTIVE + i].lcnt);
4059 	}
4060 	db_printf("\n");
4061 
4062 	db_printf("PQ_INACTIVE:");
4063 	for (i = 0; i < PQ_L2_SIZE; i++) {
4064 		db_printf(" %ld", vm_page_queues[PQ_INACTIVE + i].lcnt);
4065 	}
4066 	db_printf("\n");
4067 }
4068 #endif /* DDB */
4069