xref: /dragonfly/sys/vm/vm_page.c (revision abf903a5)
1 /*
2  * Copyright (c) 2003-2019 The DragonFly Project.  All rights reserved.
3  * Copyright (c) 1991 Regents of the University of California.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
37  * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
38  */
39 
40 /*
41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42  * All rights reserved.
43  *
44  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45  *
46  * Permission to use, copy, modify and distribute this software and
47  * its documentation is hereby granted, provided that both the copyright
48  * notice and this permission notice appear in all copies of the
49  * software, derivative works or modified versions, and any portions
50  * thereof, and that both notices appear in supporting documentation.
51  *
52  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55  *
56  * Carnegie Mellon requests users of this software to return to
57  *
58  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59  *  School of Computer Science
60  *  Carnegie Mellon University
61  *  Pittsburgh PA 15213-3890
62  *
63  * any improvements or extensions that they make and grant Carnegie the
64  * rights to redistribute these changes.
65  */
66 /*
67  * Resident memory management module.  The module manipulates 'VM pages'.
68  * A VM page is the core building block for memory management.
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/malloc.h>
74 #include <sys/proc.h>
75 #include <sys/vmmeter.h>
76 #include <sys/vnode.h>
77 #include <sys/kernel.h>
78 #include <sys/alist.h>
79 #include <sys/sysctl.h>
80 #include <sys/cpu_topology.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <sys/lock.h>
85 #include <vm/vm_kern.h>
86 #include <vm/pmap.h>
87 #include <vm/vm_map.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_pageout.h>
91 #include <vm/vm_pager.h>
92 #include <vm/vm_extern.h>
93 #include <vm/swap_pager.h>
94 
95 #include <machine/inttypes.h>
96 #include <machine/md_var.h>
97 #include <machine/specialreg.h>
98 #include <machine/bus_dma.h>
99 
100 #include <vm/vm_page2.h>
101 #include <sys/spinlock2.h>
102 
103 /*
104  * Cache necessary elements in the hash table itself to avoid indirecting
105  * through random vm_page's when doing a lookup.  The hash table is
106  * heuristical and it is ok for races to mess up any or all fields.
107  */
108 struct vm_page_hash_elm {
109 	vm_page_t	m;
110 	vm_object_t	object;	/* heuristical */
111 	vm_pindex_t	pindex;	/* heuristical */
112 	int		ticks;
113 	int		unused;
114 };
115 
116 #define VM_PAGE_HASH_SET	4		    /* power of 2, set-assoc */
117 #define VM_PAGE_HASH_MAX	(8 * 1024 * 1024)   /* power of 2, max size */
118 
119 /*
120  * SET - Minimum required set associative size, must be a power of 2.  We
121  *	 want this to match or exceed the set-associativeness of the cpu,
122  *	 up to a reasonable limit (we will use 16).
123  */
124 __read_mostly static int set_assoc_mask = 16 - 1;
125 
126 static void vm_page_queue_init(void);
127 static void vm_page_free_wakeup(void);
128 static vm_page_t vm_page_select_cache(u_short pg_color);
129 static vm_page_t _vm_page_list_find_wide(int basequeue, int index, int *lastp);
130 static vm_page_t _vm_page_list_find2_wide(int bq1, int bq2, int index,
131 			int *lastp1, int *lastp);
132 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
133 static void vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes);
134 
135 /*
136  * Array of tailq lists
137  */
138 struct vpgqueues vm_page_queues[PQ_COUNT];
139 
140 static volatile int vm_pages_waiting;
141 static struct alist vm_contig_alist;
142 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
143 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
144 
145 __read_mostly static int vm_page_hash_vnode_only;
146 __read_mostly static int vm_page_hash_size;
147 __read_mostly static struct vm_page_hash_elm *vm_page_hash;
148 
149 static u_long vm_dma_reserved = 0;
150 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
151 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
152 	    "Memory reserved for DMA");
153 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
154 	    &vm_contig_alist.bl_free, 0, "Free pages in the DMA reserve");
155 
156 SYSCTL_INT(_vm, OID_AUTO, page_hash_vnode_only, CTLFLAG_RW,
157 	    &vm_page_hash_vnode_only, 0, "Only hash vnode pages");
158 #if 0
159 static int vm_page_hash_debug;
160 SYSCTL_INT(_vm, OID_AUTO, page_hash_debug, CTLFLAG_RW,
161 	    &vm_page_hash_debug, 0, "Debug the vm_page hash table");
162 #endif
163 
164 static int vm_contig_verbose = 0;
165 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
166 
167 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
168 	     vm_pindex_t, pindex);
169 
170 static void
171 vm_page_queue_init(void)
172 {
173 	int i;
174 
175 	for (i = 0; i < PQ_L2_SIZE; i++)
176 		vm_page_queues[PQ_FREE+i].cnt_offset =
177 			offsetof(struct vmstats, v_free_count);
178 	for (i = 0; i < PQ_L2_SIZE; i++)
179 		vm_page_queues[PQ_CACHE+i].cnt_offset =
180 			offsetof(struct vmstats, v_cache_count);
181 	for (i = 0; i < PQ_L2_SIZE; i++)
182 		vm_page_queues[PQ_INACTIVE+i].cnt_offset =
183 			offsetof(struct vmstats, v_inactive_count);
184 	for (i = 0; i < PQ_L2_SIZE; i++)
185 		vm_page_queues[PQ_ACTIVE+i].cnt_offset =
186 			offsetof(struct vmstats, v_active_count);
187 	for (i = 0; i < PQ_L2_SIZE; i++)
188 		vm_page_queues[PQ_HOLD+i].cnt_offset =
189 			offsetof(struct vmstats, v_active_count);
190 	/* PQ_NONE has no queue */
191 
192 	for (i = 0; i < PQ_COUNT; i++) {
193 		vm_page_queues[i].lastq = -1;
194 		TAILQ_INIT(&vm_page_queues[i].pl);
195 		spin_init(&vm_page_queues[i].spin, "vm_page_queue_init");
196 	}
197 }
198 
199 /*
200  * note: place in initialized data section?  Is this necessary?
201  */
202 vm_pindex_t first_page = 0;
203 vm_pindex_t vm_page_array_size = 0;
204 vm_page_t vm_page_array = NULL;
205 vm_paddr_t vm_low_phys_reserved;
206 
207 /*
208  * (low level boot)
209  *
210  * Sets the page size, perhaps based upon the memory size.
211  * Must be called before any use of page-size dependent functions.
212  */
213 void
214 vm_set_page_size(void)
215 {
216 	if (vmstats.v_page_size == 0)
217 		vmstats.v_page_size = PAGE_SIZE;
218 	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
219 		panic("vm_set_page_size: page size not a power of two");
220 }
221 
222 /*
223  * (low level boot)
224  *
225  * Add a new page to the freelist for use by the system.  New pages
226  * are added to both the head and tail of the associated free page
227  * queue in a bottom-up fashion, so both zero'd and non-zero'd page
228  * requests pull 'recent' adds (higher physical addresses) first.
229  *
230  * Beware that the page zeroing daemon will also be running soon after
231  * boot, moving pages from the head to the tail of the PQ_FREE queues.
232  *
233  * Must be called in a critical section.
234  */
235 static void
236 vm_add_new_page(vm_paddr_t pa, int *badcountp)
237 {
238 	struct vpgqueues *vpq;
239 	vm_page_t m;
240 
241 	m = PHYS_TO_VM_PAGE(pa);
242 
243 	/*
244 	 * Make sure it isn't a duplicate (due to BIOS page range overlaps,
245 	 * which we consider bugs... but don't crash).  Note that m->phys_addr
246 	 * is pre-initialized, so use m->queue as a check.
247 	 */
248 	if (m->queue) {
249 		if (*badcountp < 10) {
250 			kprintf("vm_add_new_page: duplicate pa %016jx\n",
251 				(intmax_t)pa);
252 			++*badcountp;
253 		} else if (*badcountp == 10) {
254 			kprintf("vm_add_new_page: duplicate pa (many more)\n");
255 			++*badcountp;
256 		}
257 		return;
258 	}
259 
260 	m->phys_addr = pa;
261 	m->flags = 0;
262 	m->pat_mode = PAT_WRITE_BACK;
263 	m->pc = (pa >> PAGE_SHIFT);
264 
265 	/*
266 	 * Twist for cpu localization in addition to page coloring, so
267 	 * different cpus selecting by m->queue get different page colors.
268 	 */
269 	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
270 	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
271 	m->pc &= PQ_L2_MASK;
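	/*
	 * Illustrative sketch of the twist (assuming PQ_L2_SIZE were 1024):
	 * physical page N would receive color
	 * (N ^ (N / 1024) ^ (N / 1048576)) & 1023, so each successive
	 * 1024-page group starts its color run at a different offset.
	 */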
272 
273 	/*
274 	 * Reserve a certain number of contiguous low memory pages for
275 	 * contigmalloc() to use.
276 	 *
277 	 * Even though these pages represent real ram and can be
278 	 * reverse-mapped, we set PG_FICTITIOUS and PG_UNQUEUED
279 	 * because their use is special-cased.
280 	 *
281 	 * WARNING! Once PG_FICTITIOUS is set, vm_page_wire*()
282 	 *	    and vm_page_unwire*() calls have no effect.
283 	 */
284 	if (pa < vm_low_phys_reserved) {
285 		atomic_add_long(&vmstats.v_page_count, 1);
286 		atomic_add_long(&vmstats.v_dma_pages, 1);
287 		m->flags |= PG_FICTITIOUS | PG_UNQUEUED;
288 		m->queue = PQ_NONE;
289 		m->wire_count = 1;
290 		atomic_add_long(&vmstats.v_wire_count, 1);
291 		alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
292 		return;
293 	}
294 
295 	/*
296 	 * General page
297 	 */
298 	m->queue = m->pc + PQ_FREE;
299 	KKASSERT(m->dirty == 0);
300 
301 	atomic_add_long(&vmstats.v_page_count, 1);
302 	atomic_add_long(&vmstats.v_free_count, 1);
303 	vpq = &vm_page_queues[m->queue];
304 	TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
305 	++vpq->lcnt;
306 }
307 
308 /*
309  * (low level boot)
310  *
311  * Initializes the resident memory module.
312  *
313  * Preallocates memory for critical VM structures and arrays prior to
314  * kernel_map becoming available.
315  *
316  * Memory is allocated from (virtual2_start, virtual2_end) if available,
317  * otherwise memory is allocated from (virtual_start, virtual_end).
318  *
319  * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
320  * large enough to hold vm_page_array & other structures for machines with
321  * large amounts of ram, so we want to use virtual2* when available.
322  */
323 void
324 vm_page_startup(void)
325 {
326 	vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
327 	vm_offset_t mapped;
328 	vm_pindex_t npages;
329 	vm_paddr_t page_range;
330 	vm_paddr_t new_end;
331 	int i;
332 	vm_paddr_t pa;
333 	vm_paddr_t last_pa;
334 	vm_paddr_t end;
335 	vm_paddr_t biggestone, biggestsize;
336 	vm_paddr_t total;
337 	vm_page_t m;
338 	int badcount;
339 
340 	total = 0;
341 	badcount = 0;
342 	biggestsize = 0;
343 	biggestone = 0;
344 	vaddr = round_page(vaddr);
345 
346 	/*
347 	 * Make sure ranges are page-aligned.
348 	 */
349 	for (i = 0; phys_avail[i].phys_end; ++i) {
350 		phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
351 		phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
352 		if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
353 			phys_avail[i].phys_end = phys_avail[i].phys_beg;
354 	}
355 
356 	/*
357 	 * Locate largest block
358 	 */
359 	for (i = 0; phys_avail[i].phys_end; ++i) {
360 		vm_paddr_t size = phys_avail[i].phys_end -
361 				  phys_avail[i].phys_beg;
362 
363 		if (size > biggestsize) {
364 			biggestone = i;
365 			biggestsize = size;
366 		}
367 		total += size;
368 	}
369 	--i;	/* adjust to last entry for use down below */
370 
371 	end = phys_avail[biggestone].phys_end;
372 	end = trunc_page(end);
373 
374 	/*
375 	 * Initialize the queue headers for the free queue, the active queue
376 	 * and the inactive queue.
377 	 */
378 	vm_page_queue_init();
379 
380 #if !defined(_KERNEL_VIRTUAL)
381 	/*
382 	 * VKERNELs don't support minidumps and as such don't need
383 	 * vm_page_dump
384 	 *
385 	 * Allocate a bitmap to indicate that a random physical page
386 	 * needs to be included in a minidump.
387 	 *
388 	 * The amd64 port needs this to indicate which direct map pages
389 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
390 	 *
391 	 * However, x86 still needs this workspace internally within the
392 	 * minidump code.  In theory, they are not needed on x86, but are
393 	 * included should the sf_buf code decide to use them.
394 	 */
395 	page_range = phys_avail[i].phys_end / PAGE_SIZE;
396 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
397 	end -= vm_page_dump_size;
398 	vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
399 					VM_PROT_READ | VM_PROT_WRITE);
400 	bzero((void *)vm_page_dump, vm_page_dump_size);
401 #endif
402 	/*
403 	 * Compute the number of pages of memory that will be available for
404 	 * use (taking into account the overhead of a page structure per
405 	 * page).
406 	 */
407 	first_page = phys_avail[0].phys_beg / PAGE_SIZE;
408 	page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
409 	npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
410 
411 #ifndef _KERNEL_VIRTUAL
412 	/*
413 	 * (only applies to real kernels)
414 	 *
415 	 * Reserve a large amount of low memory for potential 32-bit DMA
416 	 * space allocations.  Once device initialization is complete we
417 	 * release most of it, but keep (vm_dma_reserved) memory reserved
418 	 * for later use.  Typically for X / graphics.  Through trial and
419 	 * error we find that GPUs usually require ~60-100MB or so.
420 	 *
421 	 * By default, 128M is left in reserve on machines with 2G+ of ram.
422 	 */
423 	vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
424 	if (vm_low_phys_reserved > total / 4)
425 		vm_low_phys_reserved = total / 4;
426 	if (vm_dma_reserved == 0) {
427 		vm_dma_reserved = 128 * 1024 * 1024;	/* 128MB */
428 		if (vm_dma_reserved > total / 16)
429 			vm_dma_reserved = total / 16;
430 	}
431 #endif
432 	alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
433 		   ALIST_RECORDS_65536);
434 
435 	/*
436 	 * Initialize the mem entry structures now, and put them in the free
437 	 * queue.
438 	 */
439 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
440 		kprintf("initializing vm_page_array ");
441 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
442 	mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
443 	vm_page_array = (vm_page_t)mapped;
444 
445 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
446 	/*
447 	 * since pmap_map on amd64 returns stuff out of a direct-map region,
448 	 * we have to manually add these pages to the minidump tracking so
449 	 * that they can be dumped, including the vm_page_array.
450 	 */
451 	for (pa = new_end;
452 	     pa < phys_avail[biggestone].phys_end;
453 	     pa += PAGE_SIZE) {
454 		dump_add_page(pa);
455 	}
456 #endif
457 
458 	/*
459 	 * Clear all of the page structures, run basic initialization so
460 	 * PHYS_TO_VM_PAGE() operates properly even on pages not in the
461 	 * map.
462 	 */
463 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
464 	vm_page_array_size = page_range;
465 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
466 		kprintf("size = 0x%zx\n", vm_page_array_size);
467 
468 	m = &vm_page_array[0];
469 	pa = ptoa(first_page);
470 	for (i = 0; i < page_range; ++i) {
471 		spin_init(&m->spin, "vm_page");
472 		m->phys_addr = pa;
473 		pa += PAGE_SIZE;
474 		++m;
475 	}
476 
477 	/*
478 	 * Construct the free queue(s) in ascending order (by physical
479 	 * address) so that the first 16MB of physical memory is allocated
480 	 * last rather than first.  On large-memory machines, this avoids
481 	 * the exhaustion of low physical memory before isa_dma_init has run.
482 	 */
483 	vmstats.v_page_count = 0;
484 	vmstats.v_free_count = 0;
485 	for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
486 		pa = phys_avail[i].phys_beg;
487 		if (i == biggestone)
488 			last_pa = new_end;
489 		else
490 			last_pa = phys_avail[i].phys_end;
491 		while (pa < last_pa && npages-- > 0) {
492 			vm_add_new_page(pa, &badcount);
493 			pa += PAGE_SIZE;
494 		}
495 	}
496 	if (virtual2_start)
497 		virtual2_start = vaddr;
498 	else
499 		virtual_start = vaddr;
500 	mycpu->gd_vmstats = vmstats;
501 }
502 
503 /*
504  * (called from early boot only)
505  *
506  * Reorganize VM pages based on numa data.  May be called as many times as
507  * necessary.  Will reorganize the vm_page_t page color and related queue(s)
508  * to allow vm_page_alloc() to choose pages based on socket affinity.
509  *
510  * NOTE: This function is only called while we are still in UP mode, so
511  *	 we only need a critical section to protect the queues (which
512  *	 saves a lot of time, there are likely a ton of pages).
513  */
514 void
515 vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
516 {
517 	vm_paddr_t scan_beg;
518 	vm_paddr_t scan_end;
519 	vm_paddr_t ran_end;
520 	struct vpgqueues *vpq;
521 	vm_page_t m;
522 	vm_page_t mend;
523 	int socket_mod;
524 	int socket_value;
525 	int i;
526 
527 	/*
528 	 * Check if no physical information, or there was only one socket
529 	 * (so don't waste time doing nothing!).
530 	 */
531 	if (cpu_topology_phys_ids <= 1 ||
532 	    cpu_topology_core_ids == 0) {
533 		return;
534 	}
535 
536 	/*
537 	 * Setup for our iteration.  Note that ACPI may iterate CPU
538 	 * sockets starting at 0 or 1 or some other number.  The
539 	 * cpu_topology code mod's it against the socket count.
540 	 */
541 	ran_end = ran_beg + bytes;
542 
543 	socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
544 	socket_value = (physid % cpu_topology_phys_ids) * socket_mod;
545 	mend = &vm_page_array[vm_page_array_size];
546 
547 	crit_enter();
548 
549 	/*
550 	 * Adjust cpu_topology's phys_mem parameter
551 	 */
552 	if (root_cpu_node)
553 		vm_numa_add_topology_mem(root_cpu_node, physid, (long)bytes);
554 
555 	/*
556 	 * Adjust vm_page->pc and requeue all affected pages.  The
557 	 * allocator will then be able to localize memory allocations
558 	 * to some degree.
559 	 */
560 	for (i = 0; phys_avail[i].phys_end; ++i) {
561 		scan_beg = phys_avail[i].phys_beg;
562 		scan_end = phys_avail[i].phys_end;
563 		if (scan_end <= ran_beg)
564 			continue;
565 		if (scan_beg >= ran_end)
566 			continue;
567 		if (scan_beg < ran_beg)
568 			scan_beg = ran_beg;
569 		if (scan_end > ran_end)
570 			scan_end = ran_end;
571 		if (atop(scan_end) > first_page + vm_page_array_size)
572 			scan_end = ptoa(first_page + vm_page_array_size);
573 
574 		m = PHYS_TO_VM_PAGE(scan_beg);
575 		while (scan_beg < scan_end) {
576 			KKASSERT(m < mend);
577 			if (m->queue != PQ_NONE) {
578 				vpq = &vm_page_queues[m->queue];
579 				TAILQ_REMOVE(&vpq->pl, m, pageq);
580 				--vpq->lcnt;
581 				/* queue doesn't change, no need to adj cnt */
582 				m->queue -= m->pc;
583 				m->pc %= socket_mod;
584 				m->pc += socket_value;
585 				m->pc &= PQ_L2_MASK;
586 				m->queue += m->pc;
587 				vpq = &vm_page_queues[m->queue];
588 				TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
589 				++vpq->lcnt;
590 				/* queue doesn't change, no need to adj cnt */
591 			} else {
592 				m->pc %= socket_mod;
593 				m->pc += socket_value;
594 				m->pc &= PQ_L2_MASK;
595 			}
596 			scan_beg += PAGE_SIZE;
597 			++m;
598 		}
599 	}
600 
601 	crit_exit();
602 }
603 
604 /*
605  * (called from early boot only)
606  *
607  * Don't allow the NUMA organization to leave vm_page_queues[] nodes
608  * completely empty for a logical cpu.  Doing so would force allocations
609  * on that cpu to always borrow from a nearby cpu, create unnecessary
610  * contention, and cause vm_page_alloc() to iterate more queues and run more
611  * slowly.
612  *
613  * This situation can occur when memory sticks are not entirely populated,
614  * populated at different densities, or in naturally asymmetric systems
615  * such as the 2990WX.  There could very well be many vm_page_queues[]
616  * entries with *NO* pages assigned to them.
617  *
618  * Fixing this up ensures that each logical CPU has roughly the same
619  * sized memory pool, and more importantly ensures that logical CPUs
620  * do not wind up with an empty memory pool.
621  *
622  * At the moment we just iterate the other queues and borrow pages,
623  * moving them into the queues for cpus with severe deficits even though
624  * the memory might not be local to those cpus.  I am not doing this in
625  * a 'smart' way, it's effectively UMA style (sorta, since it's page-by-page
626  * whereas real UMA typically exchanges address bits 8-10 with high address
627  * bits).  But it works extremely well and gives us fairly good deterministic
628  * results on the cpu cores associated with these secondary nodes.
629  */
630 void
631 vm_numa_organize_finalize(void)
632 {
633 	struct vpgqueues *vpq;
634 	vm_page_t m;
635 	long lcnt_lo;
636 	long lcnt_hi;
637 	int iter;
638 	int i;
639 	int scale_lim;
640 
641 	crit_enter();
642 
643 	/*
644 	 * Machines might not use an exact power of 2 for phys_ids,
645 	 * core_ids, ht_ids, etc.  This can slightly reduce the actual
646 	 * range of indices in vm_page_queues[] that are nominally used.
647 	 */
648 	if (cpu_topology_ht_ids) {
649 		scale_lim = PQ_L2_SIZE / cpu_topology_phys_ids;
650 		scale_lim = scale_lim / cpu_topology_core_ids;
651 		scale_lim = scale_lim / cpu_topology_ht_ids;
652 		scale_lim = scale_lim * cpu_topology_ht_ids;
653 		scale_lim = scale_lim * cpu_topology_core_ids;
654 		scale_lim = scale_lim * cpu_topology_phys_ids;
655 	} else {
656 		scale_lim = PQ_L2_SIZE;
657 	}
658 
659 	/*
660 	 * Calculate an average, set hysteresis for balancing from
661 	 * 10% below the average to the average.
662 	 */
663 	lcnt_hi = 0;
664 	for (i = 0; i < scale_lim; ++i) {
665 		lcnt_hi += vm_page_queues[i].lcnt;
666 	}
667 	lcnt_hi /= scale_lim;
668 	lcnt_lo = lcnt_hi - lcnt_hi / 10;
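	/*
	 * For example (illustrative numbers only): an average of 10000
	 * pages per queue yields a band of 9000..10000, so queues holding
	 * fewer than 9000 pages pull from queues at or above the average
	 * until they reach 9000.
	 */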
669 
670 	kprintf("vm_page: avg %ld pages per queue, %d queues\n",
671 		lcnt_hi, scale_lim);
672 
673 	iter = 0;
674 	for (i = 0; i < scale_lim; ++i) {
675 		vpq = &vm_page_queues[PQ_FREE + i];
676 		while (vpq->lcnt < lcnt_lo) {
677 			struct vpgqueues *vptmp;
678 
679 			iter = (iter + 1) & PQ_L2_MASK;
680 			vptmp = &vm_page_queues[PQ_FREE + iter];
681 			if (vptmp->lcnt < lcnt_hi)
682 				continue;
683 			m = TAILQ_FIRST(&vptmp->pl);
684 			KKASSERT(m->queue == PQ_FREE + iter);
685 			TAILQ_REMOVE(&vptmp->pl, m, pageq);
686 			--vptmp->lcnt;
687 			/* queue doesn't change, no need to adj cnt */
688 			m->queue -= m->pc;
689 			m->pc = i;
690 			m->queue += m->pc;
691 			TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
692 			++vpq->lcnt;
693 		}
694 	}
695 	crit_exit();
696 }
697 
698 static
699 void
700 vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes)
701 {
702 	int cpuid;
703 	int i;
704 
705 	switch(cpup->type) {
706 	case PACKAGE_LEVEL:
707 		cpup->phys_mem += bytes;
708 		break;
709 	case CHIP_LEVEL:
710 		/*
711 		 * All members should have the same chipid, so we only need
712 		 * to pull out one member.
713 		 */
714 		if (CPUMASK_TESTNZERO(cpup->members)) {
715 			cpuid = BSFCPUMASK(cpup->members);
716 			if (physid ==
717 			    get_chip_ID_from_APICID(CPUID_TO_APICID(cpuid))) {
718 				cpup->phys_mem += bytes;
719 			}
720 		}
721 		break;
722 	case CORE_LEVEL:
723 	case THREAD_LEVEL:
724 		/*
725 		 * Just inherit from the parent node
726 		 */
727 		cpup->phys_mem = cpup->parent_node->phys_mem;
728 		break;
729 	}
730 	for (i = 0; i < MAXCPU && cpup->child_node[i]; ++i)
731 		vm_numa_add_topology_mem(cpup->child_node[i], physid, bytes);
732 }
733 
734 /*
735  * We tended to reserve a ton of memory for contigmalloc().  Now that most
736  * drivers have initialized we want to return most of the remaining free
737  * reserve back to the VM page queues so they can be used for normal
738  * allocations.
739  *
740  * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
741  */
742 static void
743 vm_page_startup_finish(void *dummy __unused)
744 {
745 	alist_blk_t blk;
746 	alist_blk_t rblk;
747 	alist_blk_t count;
748 	alist_blk_t xcount;
749 	alist_blk_t bfree;
750 	vm_page_t m;
751 	struct vm_page_hash_elm *mp;
752 	int mask;
753 
754 	/*
755 	 * Set the set_assoc_mask based on the fitted number of CPUs.
756 	 * This is a mask, so we subtract 1.
757 	 *
758 	 * w/PQ_L2_SIZE = 1024, don't let the associativity drop below 8.
759 	 * So if we have 256 CPUs, two hyper-threads will wind up sharing.
760 	 *
761 	 * The maximum is PQ_L2_SIZE.  However, we limit the starting
762 	 * maximum to 16 (mask = 15) in order to improve the cache locality
763 	 * of related kernel data structures.
764 	 */
765 	mask = PQ_L2_SIZE / ncpus_fit - 1;
766 	if (mask < 7)		/* minimum is 8-way w/256 CPU threads */
767 		mask = 7;
768 	if (mask < 15)
769 		mask = 15;
770 	cpu_ccfence();
771 	set_assoc_mask = mask;
772 
773 	/*
774 	 * Return part of the initial reserve back to the system
775 	 */
776 	spin_lock(&vm_contig_spin);
777 	for (;;) {
778 		bfree = alist_free_info(&vm_contig_alist, &blk, &count);
779 		if (bfree <= vm_dma_reserved / PAGE_SIZE)
780 			break;
781 		if (count == 0)
782 			break;
783 
784 		/*
785 		 * Figure out how much of the initial reserve we have to
786 		 * free in order to reach our target.
787 		 */
788 		bfree -= vm_dma_reserved / PAGE_SIZE;
789 		if (count > bfree) {
790 			blk += count - bfree;
791 			count = bfree;
792 		}
793 
794 		/*
795 		 * Calculate the nearest power of 2 <= count.
796 		 */
797 		for (xcount = 1; xcount <= count; xcount <<= 1)
798 			;
799 		xcount >>= 1;
800 		blk += count - xcount;
801 		count = xcount;
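		/*
		 * Example of the rounding (illustrative only): count = 1000
		 * gives xcount = 512, so blk advances by 488 and the 512
		 * highest blocks of the run are returned this pass.
		 */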
802 
803 		/*
804 		 * Allocate the pages from the alist, then free them to
805 		 * the normal VM page queues.
806 		 *
807 		 * Pages allocated from the alist are wired.  We have to
808 		 * busy, unwire, and free them.  We must also adjust
809 		 * vm_low_phys_reserved before freeing any pages to prevent
810 		 * confusion.
811 		 */
812 		rblk = alist_alloc(&vm_contig_alist, blk, count);
813 		if (rblk != blk) {
814 			kprintf("vm_page_startup_finish: Unable to return "
815 				"dma space @0x%08x/%d -> 0x%08x\n",
816 				blk, count, rblk);
817 			break;
818 		}
819 		atomic_add_long(&vmstats.v_dma_pages, -(long)count);
820 		spin_unlock(&vm_contig_spin);
821 
822 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
823 		vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
824 		while (count) {
825 			vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);
826 			vm_page_busy_wait(m, FALSE, "cpgfr");
827 			vm_page_unwire(m, 0);
828 			vm_page_free(m);
829 			--count;
830 			++m;
831 		}
832 		spin_lock(&vm_contig_spin);
833 	}
834 	spin_unlock(&vm_contig_spin);
835 
836 	/*
837 	 * Print out how much DMA space drivers have already allocated and
838 	 * how much is left over.
839 	 */
840 	kprintf("DMA space used: %jdk, remaining available: %jdk\n",
841 		(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
842 		(PAGE_SIZE / 1024),
843 		(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
844 
845 	/*
846 	 * Power of 2
847 	 */
848 	vm_page_hash_size = 4096;
849 	while (vm_page_hash_size < (vm_page_array_size / 16))
850 		vm_page_hash_size <<= 1;
851 	if (vm_page_hash_size > VM_PAGE_HASH_MAX)
852 		vm_page_hash_size = VM_PAGE_HASH_MAX;
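	/*
	 * For example (rough numbers, assuming 4KB pages): ~16GB of ram is
	 * about 4M vm_page's, so vm_page_array_size / 16 is 256K and the
	 * loop settles on a 262144-entry table, well under VM_PAGE_HASH_MAX.
	 */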
853 
854 	/*
855 	 * hash table for vm_page_lookup_quick()
856 	 */
857 	mp = (void *)kmem_alloc3(&kernel_map,
858 				 (vm_page_hash_size + VM_PAGE_HASH_SET) *
859 				  sizeof(*vm_page_hash),
860 				 VM_SUBSYS_VMPGHASH, KM_CPU(0));
861 	bzero(mp, (vm_page_hash_size + VM_PAGE_HASH_SET) * sizeof(*mp));
862 	cpu_sfence();
863 	vm_page_hash = mp;
864 }
865 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
866 	vm_page_startup_finish, NULL);
867 
868 
869 /*
870  * Scan comparison function for Red-Black tree scans.  An inclusive
871  * (start,end) is expected.  Other fields are not used.
872  */
873 int
874 rb_vm_page_scancmp(struct vm_page *p, void *data)
875 {
876 	struct rb_vm_page_scan_info *info = data;
877 
878 	if (p->pindex < info->start_pindex)
879 		return(-1);
880 	if (p->pindex > info->end_pindex)
881 		return(1);
882 	return(0);
883 }
884 
885 int
886 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
887 {
888 	if (p1->pindex < p2->pindex)
889 		return(-1);
890 	if (p1->pindex > p2->pindex)
891 		return(1);
892 	return(0);
893 }
894 
895 void
896 vm_page_init(vm_page_t m)
897 {
898 	/* do nothing for now.  Called from pmap_page_init() */
899 }
900 
901 /*
902  * Each page queue has its own spin lock, which is fairly optimal for
903  * allocating and freeing pages at least.
904  *
905  * The caller must hold the vm_page_spin_lock() before locking a vm_page's
906  * queue spinlock via this function.  Also note that m->queue cannot change
907  * unless both the page and queue are locked.
908  */
909 static __inline
910 void
911 _vm_page_queue_spin_lock(vm_page_t m)
912 {
913 	u_short queue;
914 
915 	queue = m->queue;
916 	if (queue != PQ_NONE) {
917 		spin_lock(&vm_page_queues[queue].spin);
918 		KKASSERT(queue == m->queue);
919 	}
920 }
921 
922 static __inline
923 void
924 _vm_page_queue_spin_unlock(vm_page_t m)
925 {
926 	u_short queue;
927 
928 	queue = m->queue;
929 	cpu_ccfence();
930 	if (queue != PQ_NONE)
931 		spin_unlock(&vm_page_queues[queue].spin);
932 }
933 
934 static __inline
935 void
936 _vm_page_queues_spin_lock(u_short queue)
937 {
938 	cpu_ccfence();
939 	if (queue != PQ_NONE)
940 		spin_lock(&vm_page_queues[queue].spin);
941 }
942 
943 
944 static __inline
945 void
946 _vm_page_queues_spin_unlock(u_short queue)
947 {
948 	cpu_ccfence();
949 	if (queue != PQ_NONE)
950 		spin_unlock(&vm_page_queues[queue].spin);
951 }
952 
953 void
954 vm_page_queue_spin_lock(vm_page_t m)
955 {
956 	_vm_page_queue_spin_lock(m);
957 }
958 
959 void
960 vm_page_queues_spin_lock(u_short queue)
961 {
962 	_vm_page_queues_spin_lock(queue);
963 }
964 
965 void
966 vm_page_queue_spin_unlock(vm_page_t m)
967 {
968 	_vm_page_queue_spin_unlock(m);
969 }
970 
971 void
972 vm_page_queues_spin_unlock(u_short queue)
973 {
974 	_vm_page_queues_spin_unlock(queue);
975 }
976 
977 /*
978  * This locks the specified vm_page and its queue in the proper order
979  * (page first, then queue).  The queue may change so the caller must
980  * recheck on return.
981  */
982 static __inline
983 void
984 _vm_page_and_queue_spin_lock(vm_page_t m)
985 {
986 	vm_page_spin_lock(m);
987 	_vm_page_queue_spin_lock(m);
988 }
989 
990 static __inline
991 void
992 _vm_page_and_queue_spin_unlock(vm_page_t m)
993 {
994 	_vm_page_queues_spin_unlock(m->queue);
995 	vm_page_spin_unlock(m);
996 }
997 
998 void
999 vm_page_and_queue_spin_unlock(vm_page_t m)
1000 {
1001 	_vm_page_and_queue_spin_unlock(m);
1002 }
1003 
1004 void
1005 vm_page_and_queue_spin_lock(vm_page_t m)
1006 {
1007 	_vm_page_and_queue_spin_lock(m);
1008 }
1009 
1010 /*
1011  * Helper function removes vm_page from its current queue.
1012  * Returns the base queue the page used to be on.
1013  *
1014  * The vm_page and the queue must be spinlocked.
1015  * This function will unlock the queue but leave the page spinlocked.
1016  */
1017 static __inline u_short
1018 _vm_page_rem_queue_spinlocked(vm_page_t m)
1019 {
1020 	struct vpgqueues *pq;
1021 	u_short queue;
1022 	u_short oqueue;
1023 	long *cnt_adj;
1024 	long *cnt_gd;
1025 
1026 	queue = m->queue;
1027 	if (queue != PQ_NONE) {
1028 		pq = &vm_page_queues[queue];
1029 		TAILQ_REMOVE(&pq->pl, m, pageq);
1030 
1031 		/*
1032 		 * Primarily adjust our pcpu stats for rollup, which is
1033 		 * (mycpu->gd_vmstats_adj + offset).  This is normally
1034 		 * synchronized on every hardclock().
1035 		 *
1036 		 * However, so that the nominal low-memory algorithms keep
1037 		 * working properly when the unsynchronized adjustment gets
1038 		 * too negative (which might trigger the pageout daemon), we
1039 		 * immediately synchronize with the global structure.
1040 		 *
1041 		 * The idea here is to reduce unnecessary SMP cache mastership
1042 		 * changes in the global vmstats, which can be particularly
1043 		 * bad in multi-socket systems.
1044 		 *
1045 		 * WARNING! In systems with low amounts of memory the
1046 		 *	    vm_paging_needed(-1024 * ncpus) test could
1047 		 *	    wind up testing a value above the paging target,
1048 		 *	    meaning it would almost always return TRUE.  In
1049 		 *	    that situation we synchronize every time the
1050 		 *	    cumulative adjustment falls below -1024.
1051 		 */
1052 		cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1053 				   pq->cnt_offset);
1054 		cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1055 				   pq->cnt_offset);
1056 		atomic_add_long(cnt_adj, -1);
1057 		atomic_add_long(cnt_gd, -1);
1058 
1059 		if (*cnt_adj < -1024 && vm_paging_needed(-1024 * ncpus)) {
1060 			u_long copy = atomic_swap_long(cnt_adj, 0);
1061 			cnt_adj = (long *)((char *)&vmstats + pq->cnt_offset);
1062 			atomic_add_long(cnt_adj, copy);
1063 		}
1064 		pq->lcnt--;
1065 		m->queue = PQ_NONE;
1066 		oqueue = queue;
1067 		queue -= m->pc;
1068 		vm_page_queues_spin_unlock(oqueue);	/* intended */
1069 	}
1070 	return queue;
1071 }
1072 
1073 /*
1074  * Helper function places the vm_page on the specified queue.  Generally
1075  * speaking only PQ_FREE pages are placed at the head, to allow them to
1076  * be allocated sooner rather than later on the assumption that they
1077  * are cache-hot.
1078  *
1079  * The vm_page must be spinlocked.
1080  * The vm_page must NOT be FICTITIOUS (that would be a disaster)
1081  * This function will return with both the page and the queue locked.
1082  */
1083 static __inline void
1084 _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
1085 {
1086 	struct vpgqueues *pq;
1087 	u_long *cnt_adj;
1088 	u_long *cnt_gd;
1089 
1090 	KKASSERT(m->queue == PQ_NONE &&
1091 		 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0);
1092 
1093 	if (queue != PQ_NONE) {
1094 		vm_page_queues_spin_lock(queue);
1095 		pq = &vm_page_queues[queue];
1096 		++pq->lcnt;
1097 
1098 		/*
1099 		 * Adjust our pcpu stats.  If a system entity really needs
1100 		 * to incorporate the count it will call vmstats_rollup()
1101 		 * to roll it all up into the global vmstats structure.
1102 		 */
1103 		cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1104 				   pq->cnt_offset);
1105 		cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1106 				   pq->cnt_offset);
1107 		atomic_add_long(cnt_adj, 1);
1108 		atomic_add_long(cnt_gd, 1);
1109 
1110 		/*
1111 		 * PQ_FREE is always handled LIFO style to try to provide
1112 		 * cache-hot pages to programs.
1113 		 */
1114 		m->queue = queue;
1115 		if (queue - m->pc == PQ_FREE) {
1116 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1117 		} else if (athead) {
1118 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1119 		} else {
1120 			TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
1121 		}
1122 		/* leave the queue spinlocked */
1123 	}
1124 }
1125 
1126 /*
1127  * Wait until page is no longer BUSY.  If also_m_busy is TRUE we wait
1128  * until the page is no longer BUSY or SBUSY (busy_count field is 0).
1129  *
1130  * Returns TRUE if it had to sleep, FALSE if we did not.  Only one sleep
1131  * call will be made before returning.
1132  *
1133  * This function does NOT busy the page and on return the page is not
1134  * guaranteed to be available.
1135  */
1136 void
1137 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
1138 {
1139 	u_int32_t busy_count;
1140 
1141 	for (;;) {
1142 		busy_count = m->busy_count;
1143 		cpu_ccfence();
1144 
1145 		if ((busy_count & PBUSY_LOCKED) == 0 &&
1146 		    (also_m_busy == 0 || (busy_count & PBUSY_MASK) == 0)) {
1147 			break;
1148 		}
1149 		tsleep_interlock(m, 0);
1150 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1151 				      busy_count | PBUSY_WANTED)) {
1152 			atomic_set_int(&m->flags, PG_REFERENCED);
1153 			tsleep(m, PINTERLOCKED, msg, 0);
1154 			break;
1155 		}
1156 	}
1157 }
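/*
 * Illustrative usage sketch: callers that fail a vm_page_busy_try() can
 * call vm_page_sleep_busy() and then re-evaluate the page (or redo the
 * lookup), since nothing about the page is guaranteed once this function
 * returns.
 */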
1158 
1159 /*
1160  * This calculates and returns a page color given an optional VM object and
1161  * either a pindex or an iterator.  We attempt to return a cpu-localized
1162  * pg_color that is still roughly 16-way set-associative.  The CPU topology
1163  * is used if it was probed.
1164  *
1165  * The caller may use the returned value to index into e.g. PQ_FREE when
1166  * allocating a page in order to nominally obtain pages that are hopefully
1167  * already localized to the requesting cpu.  This function is not able to
1168  * provide any sort of guarantee of this, but does its best to improve
1169  * hardware cache management performance.
1170  *
1171  * WARNING! The caller must mask the returned value with PQ_L2_MASK.
1172  */
1173 u_short
1174 vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
1175 {
1176 	u_short pg_color;
1177 	int object_pg_color;
1178 
1179 	/*
1180 	 * WARNING! cpu_topology_core_ids might not be a power of two.
1181 	 *	    We also shouldn't make assumptions about
1182 	 *	    cpu_topology_phys_ids either.
1183 	 *
1184 	 * WARNING! ncpus might not be known at this time (during early
1185 	 *	    boot), and might be set to 1.
1186 	 *
1187 	 * General format: [phys_id][core_id][cpuid][set-associativity]
1188 	 * (but uses modulo, so not necessarily precise bit masks)
1189 	 */
1190 	object_pg_color = object ? object->pg_color : 0;
1191 
1192 	if (cpu_topology_ht_ids) {
1193 		int phys_id;
1194 		int core_id;
1195 		int ht_id;
1196 		int physcale;
1197 		int grpscale;
1198 		int cpuscale;
1199 
1200 		/*
1201 		 * Translate cpuid to socket, core, and hyperthread id.
1202 		 */
1203 		phys_id = get_cpu_phys_id(cpuid);
1204 		core_id = get_cpu_core_id(cpuid);
1205 		ht_id = get_cpu_ht_id(cpuid);
1206 
1207 		/*
1208 		 * Calculate pg_color for our array index.
1209 		 *
1210 		 * physcale - socket multiplier.
1211 		 * grpscale - core multiplier (cores per socket)
1212 		 * cpu*	    - cpus per core
1213 		 *
1214 		 * WARNING! In early boot, ncpus has not yet been
1215 		 *	    initialized and may be set to (1).
1216 		 *
1217 		 * WARNING! physcale must match the organization that
1218 		 *	    vm_numa_organize() creates to ensure that
1219 		 *	    we properly localize allocations to the
1220 		 *	    requested cpuid.
1221 		 */
1222 		physcale = PQ_L2_SIZE / cpu_topology_phys_ids;
1223 		grpscale = physcale / cpu_topology_core_ids;
1224 		cpuscale = grpscale / cpu_topology_ht_ids;
1225 
1226 		pg_color = phys_id * physcale;
1227 		pg_color += core_id * grpscale;
1228 		pg_color += ht_id * cpuscale;
1229 		pg_color += (pindex + object_pg_color) % cpuscale;
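		/*
		 * Worked example (hypothetical topology, PQ_L2_SIZE = 1024):
		 * 2 sockets x 8 cores x 2 threads gives physcale = 512,
		 * grpscale = 64 and cpuscale = 32, so socket 1 / core 3 /
		 * ht 1 maps its pages into colors 736..767.
		 */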
1230 
1231 #if 0
1232 		if (grpsize >= 8) {
1233 			pg_color += (pindex + object_pg_color) % grpsize;
1234 		} else {
1235 			if (grpsize <= 2) {
1236 				grpsize = 8;
1237 			} else {
1238 				/* 3->9, 4->8, 5->10, 6->12, 7->14 */
1239 				grpsize += grpsize;
1240 				if (grpsize < 8)
1241 					grpsize += grpsize;
1242 			}
1243 			pg_color += (pindex + object_pg_color) % grpsize;
1244 		}
1245 #endif
1246 	} else {
1247 		/*
1248 		 * Unknown topology, distribute things evenly.
1249 		 *
1250 		 * WARNING! In early boot, ncpus has not yet been
1251 		 *	    initialized and may be set to (1).
1252 		 */
1253 		int cpuscale;
1254 
1255 		cpuscale = PQ_L2_SIZE / ncpus;
1256 
1257 		pg_color = cpuid * cpuscale;
1258 		pg_color += (pindex + object_pg_color) % cpuscale;
1259 	}
1260 	return (pg_color & PQ_L2_MASK);
1261 }
1262 
1263 /*
1264  * Wait until BUSY can be set, then set it.  If also_m_busy is TRUE we
1265  * also wait for m->busy_count to become 0 before setting PBUSY_LOCKED.
1266  */
1267 void
1268 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
1269 				     int also_m_busy, const char *msg
1270 				     VM_PAGE_DEBUG_ARGS)
1271 {
1272 	u_int32_t busy_count;
1273 
1274 	for (;;) {
1275 		busy_count = m->busy_count;
1276 		cpu_ccfence();
1277 		if (busy_count & PBUSY_LOCKED) {
1278 			tsleep_interlock(m, 0);
1279 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1280 					  busy_count | PBUSY_WANTED)) {
1281 				atomic_set_int(&m->flags, PG_REFERENCED);
1282 				tsleep(m, PINTERLOCKED, msg, 0);
1283 			}
1284 		} else if (also_m_busy && busy_count) {
1285 			tsleep_interlock(m, 0);
1286 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1287 					  busy_count | PBUSY_WANTED)) {
1288 				atomic_set_int(&m->flags, PG_REFERENCED);
1289 				tsleep(m, PINTERLOCKED, msg, 0);
1290 			}
1291 		} else {
1292 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1293 					      busy_count | PBUSY_LOCKED)) {
1294 #ifdef VM_PAGE_DEBUG
1295 				m->busy_func = func;
1296 				m->busy_line = lineno;
1297 #endif
1298 				break;
1299 			}
1300 		}
1301 	}
1302 }
1303 
1304 /*
1305  * Attempt to set BUSY.  If also_m_busy is TRUE we only succeed if
1306  * m->busy_count is also 0.
1307  *
1308  * Returns non-zero on failure.
1309  */
1310 int
1311 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
1312 				    VM_PAGE_DEBUG_ARGS)
1313 {
1314 	u_int32_t busy_count;
1315 
1316 	for (;;) {
1317 		busy_count = m->busy_count;
1318 		cpu_ccfence();
1319 		if (busy_count & PBUSY_LOCKED)
1320 			return TRUE;
1321 		if (also_m_busy && (busy_count & PBUSY_MASK) != 0)
1322 			return TRUE;
1323 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1324 				      busy_count | PBUSY_LOCKED)) {
1325 #ifdef VM_PAGE_DEBUG
1326 				m->busy_func = func;
1327 				m->busy_line = lineno;
1328 #endif
1329 			return FALSE;
1330 		}
1331 	}
1332 }
1333 
1334 /*
1335  * Clear the BUSY flag and return non-zero to indicate to the caller
1336  * that a wakeup() should be performed.
1337  *
1338  * (inline version)
1339  */
1340 static __inline
1341 int
1342 _vm_page_wakeup(vm_page_t m)
1343 {
1344 	u_int32_t busy_count;
1345 
1346 	busy_count = m->busy_count;
1347 	cpu_ccfence();
1348 	for (;;) {
1349 		if (atomic_fcmpset_int(&m->busy_count, &busy_count,
1350 				      busy_count &
1351 				      ~(PBUSY_LOCKED | PBUSY_WANTED))) {
1352 			return((int)(busy_count & PBUSY_WANTED));
1353 		}
1354 	}
1355 	/* not reached */
1356 }
1357 
1358 /*
1359  * Clear the BUSY flag and wakeup anyone waiting for the page.  This
1360  * is typically the last call you make on a page before moving onto
1361  * other things.
1362  */
1363 void
1364 vm_page_wakeup(vm_page_t m)
1365 {
1366         KASSERT(m->busy_count & PBUSY_LOCKED,
1367 		("vm_page_wakeup: page not busy!!!"));
1368 	if (_vm_page_wakeup(m))
1369 		wakeup(m);
1370 }
1371 
1372 /*
1373  * Hold a page, preventing reuse.  This is typically only called on pages
1374  * in a known state (either held busy, special, or interlocked in some
1375  * manner).  Holding a page does not ensure that it remains valid, it only
1376  * prevents reuse.  The page must not already be on the FREE queue or in
1377  * any danger of being moved to the FREE queue concurrent with this call.
1378  *
1379  * Other parts of the system can still disassociate the page from its object
1380  * and attempt to free it, or perform read or write I/O on it and/or otherwise
1381  * manipulate the page, but if the page is held the VM system will leave the
1382  * page and its data intact and not cycle it through the FREE queue until
1383  * the last hold has been released.
1384  *
1385  * (see vm_page_wire() if you want to prevent the page from being
1386  *  disassociated from its object too).
1387  */
1388 void
1389 vm_page_hold(vm_page_t m)
1390 {
1391 	atomic_add_int(&m->hold_count, 1);
1392 	KKASSERT(m->queue - m->pc != PQ_FREE);
1393 }
1394 
1395 /*
1396  * The opposite of vm_page_hold().  If the page is on the HOLD queue
1397  * it was freed while held and must be moved back to the FREE queue.
1398  *
1399  * To avoid racing against vm_page_free*() we must re-test conditions
1400  * after obtaining the spin-lock.  The initial test can also race a
1401  * vm_page_free*() that is in the middle of moving a page to PQ_HOLD,
1402  * leaving the page on PQ_HOLD with hold_count == 0.  Rather than
1403  * throw a spin-lock in the critical path, we rely on the pageout
1404  * daemon to clean-up these loose ends.
1405  *
1406  * More critically, the 'easy movement' between queues without busying
1407  * a vm_page is only allowed for PQ_FREE<->PQ_HOLD.
1408  */
1409 void
1410 vm_page_unhold(vm_page_t m)
1411 {
1412 	KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
1413 		("vm_page_unhold: pg %p illegal hold_count (%d) or "
1414 		 "on FREE queue (%d)",
1415 		 m, m->hold_count, m->queue - m->pc));
1416 
1417 	if (atomic_fetchadd_int(&m->hold_count, -1) == 1 &&
1418 	    m->queue - m->pc == PQ_HOLD) {
1419 		vm_page_spin_lock(m);
1420 		if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
1421 			_vm_page_queue_spin_lock(m);
1422 			_vm_page_rem_queue_spinlocked(m);
1423 			_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
1424 			_vm_page_queue_spin_unlock(m);
1425 		}
1426 		vm_page_spin_unlock(m);
1427 	}
1428 }
1429 
1430 /*
1431  * Create a fictitious page with the specified physical address and
1432  * memory attribute.  The memory attribute is the only machine-
1433  * dependent aspect of a fictitious page that must be initialized.
1434  */
1435 void
1436 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1437 {
1438 	/*
1439 	 * The page's memattr might have changed since the
1440 	 * previous initialization.  Update the pmap to the
1441 	 * new memattr.
1442 	 */
1443 	if ((m->flags & PG_FICTITIOUS) != 0)
1444 		goto memattr;
1445 	m->phys_addr = paddr;
1446 	m->queue = PQ_NONE;
1447 	/* Fictitious pages don't use "segind". */
1448 	/* Fictitious pages don't use "order" or "pool". */
1449 	m->flags = PG_FICTITIOUS | PG_UNQUEUED;
1450 	m->busy_count = PBUSY_LOCKED;
1451 	m->wire_count = 1;
1452 	spin_init(&m->spin, "fake_page");
1453 	pmap_page_init(m);
1454 memattr:
1455 	pmap_page_set_memattr(m, memattr);
1456 }
1457 
1458 /*
1459  * Inserts the given vm_page into the object and object list.
1460  *
1461  * The pagetables are not updated but will presumably fault the page
1462  * in if necessary, or if a kernel page the caller will at some point
1463  * enter the page into the kernel's pmap.  We are not allowed to block
1464  * here so we *can't* do this anyway.
1465  *
1466  * This routine may not block.
1467  * This routine must be called with the vm_object held.
1468  * This routine must be called with a critical section held.
1469  *
1470  * This routine returns TRUE if the page was inserted into the object
1471  * successfully, and FALSE if the page already exists in the object.
1472  */
1473 int
1474 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1475 {
1476 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
1477 	if (m->object != NULL)
1478 		panic("vm_page_insert: already inserted");
1479 
1480 	atomic_add_int(&object->generation, 1);
1481 
1482 	/*
1483 	 * Associate the VM page with an (object, offset).
1484 	 *
1485 	 * The vm_page spin lock is required for interactions with the pmap.
1486 	 * XXX vm_page_spin_lock() might not be needed for this any more.
1487 	 */
1488 	vm_page_spin_lock(m);
1489 	m->object = object;
1490 	m->pindex = pindex;
1491 	if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
1492 		m->object = NULL;
1493 		m->pindex = 0;
1494 		vm_page_spin_unlock(m);
1495 		return FALSE;
1496 	}
1497 	++object->resident_page_count;
1498 	++mycpu->gd_vmtotal.t_rm;
1499 	vm_page_spin_unlock(m);
1500 
1501 	/*
1502 	 * Since we are inserting a new and possibly dirty page,
1503 	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
1504 	 */
1505 	if ((m->valid & m->dirty) ||
1506 	    (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
1507 		vm_object_set_writeable_dirty(object);
1508 
1509 	/*
1510 	 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
1511 	 */
1512 	swap_pager_page_inserted(m);
1513 	return TRUE;
1514 }
1515 
1516 /*
1517  * Removes the given vm_page_t from the (object,index) table
1518  *
1519  * The page must be BUSY and will remain BUSY on return.
1520  * No other requirements.
1521  *
1522  * NOTE: FreeBSD side effect was to unbusy the page on return.  We leave
1523  *	 it busy.
1524  *
1525  * NOTE: Caller is responsible for any pmap disposition prior to the
1526  *	 rename (as the pmap code will not be able to find the entries
1527  *	 once the object has been disassociated).  The caller may choose
1528  *	 to leave the pmap association intact if this routine is being
1529  *	 called as part of a rename between shadowed objects.
1530  *
1531  * This routine may not block.
1532  */
1533 void
1534 vm_page_remove(vm_page_t m)
1535 {
1536 	vm_object_t object;
1537 
1538 	if (m->object == NULL) {
1539 		return;
1540 	}
1541 
1542 	if ((m->busy_count & PBUSY_LOCKED) == 0)
1543 		panic("vm_page_remove: page not busy");
1544 
1545 	object = m->object;
1546 
1547 	vm_object_hold(object);
1548 
1549 	/*
1550 	 * Remove the page from the object and update the object.
1551 	 *
1552 	 * The vm_page spin lock is required for interactions with the pmap.
1553 	 * XXX vm_page_spin_lock() might not be needed for this any more.
1554 	 */
1555 	vm_page_spin_lock(m);
1556 	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
1557 	--object->resident_page_count;
1558 	--mycpu->gd_vmtotal.t_rm;
1559 	m->object = NULL;
1560 	atomic_add_int(&object->generation, 1);
1561 	vm_page_spin_unlock(m);
1562 
1563 	vm_object_drop(object);
1564 }
1565 
1566 /*
1567  * Calculate the hash position for the vm_page hash heuristic.  Generally
1568  * speaking we want to localize sequential lookups to reduce memory stalls.
1569  *
1570  * Mask by ~3 to offer 4-way set-assoc
1571  */
1572 static __inline
1573 struct vm_page_hash_elm *
1574 vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
1575 {
1576 	size_t hi;
1577 
1578 	hi = iscsi_crc32(&object, sizeof(object)) << 2;
1579 	hi ^= hi >> (23 - 2);
1580 	hi += pindex * VM_PAGE_HASH_SET;
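	/*
	 * Note: consecutive pindexes on the same object advance the index
	 * by VM_PAGE_HASH_SET, so sequential lookups walk adjacent 4-entry
	 * sets; the table is allocated with VM_PAGE_HASH_SET extra elements
	 * so the set scan cannot run off the end.
	 */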
1581 #if 0
1582 	/* mix it up */
1583 	hi = (intptr_t)object ^ object->pg_color ^ pindex;
1584 	hi += object->pg_color * pindex;
1585 	hi = hi ^ (hi >> 20);
1586 #endif
1587 	hi &= vm_page_hash_size - 1;		/* bounds */
1588 
1589 	return (&vm_page_hash[hi]);
1590 }
1591 
1592 /*
1593  * Heuristical page lookup that does not require any locks.  Returns
1594  * a soft-busied page on success, NULL on failure.
1595  *
1596  * Caller must lookup the page the slow way if NULL is returned.
1597  */
1598 vm_page_t
1599 vm_page_hash_get(vm_object_t object, vm_pindex_t pindex)
1600 {
1601 	struct vm_page_hash_elm *mp;
1602 	vm_page_t m;
1603 	int i;
1604 
1605 	if (__predict_false(vm_page_hash == NULL))
1606 		return NULL;
1607 	mp = vm_page_hash_hash(object, pindex);
1608 	for (i = 0; i < VM_PAGE_HASH_SET; ++i, ++mp) {
1609 		if (mp->object != object ||
1610 		    mp->pindex != pindex) {
1611 			continue;
1612 		}
1613 		m = mp->m;
1614 		cpu_ccfence();
1615 		if (m == NULL)
1616 			continue;
1617 		if (m->object != object || m->pindex != pindex)
1618 			continue;
1619 		if (vm_page_sbusy_try(m))
1620 			continue;
1621 		if (m->object == object && m->pindex == pindex) {
1622 			/*
1623 			 * On-match optimization - do not update ticks
1624 			 * unless we have to (reduce cache coherency traffic)
1625 			 */
1626 			if (mp->ticks != ticks)
1627 				mp->ticks = ticks;
1628 			return m;
1629 		}
1630 		vm_page_sbusy_drop(m);
1631 	}
1632 	return NULL;
1633 }
1634 
1635 /*
1636  * Enter page onto vm_page_hash[].  This is a heuristic, SMP collisions
1637  * are allowed.
1638  */
1639 static __inline
1640 void
1641 vm_page_hash_enter(vm_page_t m)
1642 {
1643 	struct vm_page_hash_elm *mp;
1644 	struct vm_page_hash_elm *best;
1645 	vm_object_t object;
1646 	vm_pindex_t pindex;
1647 	int best_delta;
1648 	int delta;
1649 	int i;
1650 
1651 	/*
1652 	 * Only enter type-stable vm_pages with well-shared objects.
1653 	 */
1654 	if ((m->flags & PG_MAPPEDMULTI) == 0)
1655 		return;
1656 	if (__predict_false(vm_page_hash == NULL ||
1657 			    m < &vm_page_array[0] ||
1658 			    m >= &vm_page_array[vm_page_array_size])) {
1659 		return;
1660 	}
1661 	if (__predict_false(m->object == NULL))
1662 		return;
1663 #if 0
1664 	/*
1665 	 * Disabled at the moment, there are some degenerate conditions
1666 	 * with often-exec'd programs that get ignored.  In particular,
1667 	 * the kernel's elf loader does a vn_rdwr() on the first page of
1668 	 * a binary.
1669 	 */
1670 	if (m->object->ref_count <= 2 || (m->object->flags & OBJ_ONEMAPPING))
1671 		return;
1672 #endif
1673 	if (vm_page_hash_vnode_only && m->object->type != OBJT_VNODE)
1674 		return;
1675 
1676 	/*
1677 	 * Find best entry
1678 	 */
1679 	object = m->object;
1680 	pindex = m->pindex;
1681 
1682 	mp = vm_page_hash_hash(object, pindex);
1683 	best = mp;
1684 	best_delta = ticks - best->ticks;
1685 
1686 	for (i = 0; i < VM_PAGE_HASH_SET; ++i, ++mp) {
1687 		if (mp->m == m &&
1688 		    mp->object == object &&
1689 		    mp->pindex == pindex) {
1690 			/*
1691 			 * On-match optimization - do not update ticks
1692 			 * unless we have to (reduce cache coherency traffic)
1693 			 */
1694 			if (mp->ticks != ticks)
1695 				mp->ticks = ticks;
1696 			return;
1697 		}
1698 
1699 		/*
1700 		 * The best choice is the oldest entry.
1701 		 *
1702 		 * Also check for a field overflow, using -1 instead of 0
1703 		 * to deal with SMP races on accessing the 'ticks' global.
1704 		 */
1705 		delta = ticks - mp->ticks;
1706 		if (delta < -1)
1707 			best = mp;
1708 		if (best_delta < delta)
1709 			best = mp;
1710 	}
1711 
1712 	/*
1713 	 * Load the entry.  Copy a few elements to the hash entry itself
1714 	 * to reduce memory stalls due to memory indirects on lookups.
1715 	 */
1716 	best->m = m;
1717 	best->object = object;
1718 	best->pindex = pindex;
1719 	best->ticks = ticks;
1720 }
1721 
1722 /*
1723  * Locate and return the page at (object, pindex), or NULL if the
1724  * page could not be found.
1725  *
1726  * The caller must hold the vm_object token.
1727  */
1728 vm_page_t
1729 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1730 {
1731 	vm_page_t m;
1732 
1733 	/*
1734 	 * Search the hash table for this object/offset pair
1735 	 */
1736 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1737 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1738 	if (m) {
1739 		KKASSERT(m->object == object && m->pindex == pindex);
1740 		vm_page_hash_enter(m);
1741 	}
1742 	return(m);
1743 }
1744 
1745 vm_page_t
1746 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
1747 					    vm_pindex_t pindex,
1748 					    int also_m_busy, const char *msg
1749 					    VM_PAGE_DEBUG_ARGS)
1750 {
1751 	u_int32_t busy_count;
1752 	vm_page_t m;
1753 
1754 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1755 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1756 	while (m) {
1757 		KKASSERT(m->object == object && m->pindex == pindex);
1758 		busy_count = m->busy_count;
1759 		cpu_ccfence();
1760 		if (busy_count & PBUSY_LOCKED) {
1761 			tsleep_interlock(m, 0);
1762 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1763 					  busy_count | PBUSY_WANTED)) {
1764 				atomic_set_int(&m->flags, PG_REFERENCED);
1765 				tsleep(m, PINTERLOCKED, msg, 0);
1766 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1767 							      pindex);
1768 			}
1769 		} else if (also_m_busy && busy_count) {
1770 			tsleep_interlock(m, 0);
1771 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1772 					  busy_count | PBUSY_WANTED)) {
1773 				atomic_set_int(&m->flags, PG_REFERENCED);
1774 				tsleep(m, PINTERLOCKED, msg, 0);
1775 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1776 							      pindex);
1777 			}
1778 		} else if (atomic_cmpset_int(&m->busy_count, busy_count,
1779 					     busy_count | PBUSY_LOCKED)) {
1780 #ifdef VM_PAGE_DEBUG
1781 			m->busy_func = func;
1782 			m->busy_line = lineno;
1783 #endif
1784 			vm_page_hash_enter(m);
1785 			break;
1786 		}
1787 	}
1788 	return m;
1789 }
1790 
1791 /*
1792  * Attempt to lookup and busy a page.
1793  *
1794  * Returns NULL if the page could not be found
1795  *
1796  * Returns a vm_page and error == TRUE if the page exists but could not
1797  * be busied.
1798  *
1799  * Returns a vm_page and error == FALSE on success.
1800  */
1801 vm_page_t
1802 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
1803 					   vm_pindex_t pindex,
1804 					   int also_m_busy, int *errorp
1805 					   VM_PAGE_DEBUG_ARGS)
1806 {
1807 	u_int32_t busy_count;
1808 	vm_page_t m;
1809 
1810 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1811 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1812 	*errorp = FALSE;
1813 	while (m) {
1814 		KKASSERT(m->object == object && m->pindex == pindex);
1815 		busy_count = m->busy_count;
1816 		cpu_ccfence();
1817 		if (busy_count & PBUSY_LOCKED) {
1818 			*errorp = TRUE;
1819 			break;
1820 		}
1821 		if (also_m_busy && busy_count) {
1822 			*errorp = TRUE;
1823 			break;
1824 		}
1825 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1826 				      busy_count | PBUSY_LOCKED)) {
1827 #ifdef VM_PAGE_DEBUG
1828 			m->busy_func = func;
1829 			m->busy_line = lineno;
1830 #endif
1831 			vm_page_hash_enter(m);
1832 			break;
1833 		}
1834 	}
1835 	return m;
1836 }
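
/*
 * Editor's illustrative sketch (not part of the original source): the
 * usual *errorp pattern for vm_page_lookup_busy_try().  When the page
 * exists but cannot be busied, the caller can fall back to the blocking
 * variant above.  The function name and the "xlkwt" wmesg are
 * hypothetical; the object token is held by the caller.
 */
#if 0
static vm_page_t
example_lookup_busy(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;
	int error;

	m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
	if (m && error) {
		/* page exists but was busy; sleep-wait for it instead */
		m = vm_page_lookup_busy_wait(object, pindex, TRUE, "xlkwt");
	}
	return m;	/* NULL, or a hard-busied page */
}
#endif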
1837 
1838 /*
1839  * Returns a page that is only soft-busied for use by the caller in
1840  * a read-only fashion.  Returns NULL if the page could not be found,
1841  * the soft busy could not be obtained, or the page data is invalid.
1842  *
1843  * XXX Doesn't handle PG_FICTITIOUS pages at the moment, but there is
1844  *     no reason why we couldn't.
1845  */
1846 vm_page_t
1847 vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex,
1848 			 int pgoff, int pgbytes)
1849 {
1850 	vm_page_t m;
1851 
1852 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1853 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1854 	if (m) {
1855 		if ((m->valid != VM_PAGE_BITS_ALL &&
1856 		     !vm_page_is_valid(m, pgoff, pgbytes)) ||
1857 		    (m->flags & PG_FICTITIOUS)) {
1858 			m = NULL;
1859 		} else if (vm_page_sbusy_try(m)) {
1860 			m = NULL;
1861 		} else if ((m->valid != VM_PAGE_BITS_ALL &&
1862 			    !vm_page_is_valid(m, pgoff, pgbytes)) ||
1863 			   (m->flags & PG_FICTITIOUS)) {
1864 			vm_page_sbusy_drop(m);
1865 			m = NULL;
1866 		} else {
1867 			vm_page_hash_enter(m);
1868 		}
1869 	}
1870 	return m;
1871 }
1872 
1873 /*
1874  * Caller must hold the related vm_object
1875  */
1876 vm_page_t
1877 vm_page_next(vm_page_t m)
1878 {
1879 	vm_page_t next;
1880 
1881 	next = vm_page_rb_tree_RB_NEXT(m);
1882 	if (next && next->pindex != m->pindex + 1)
1883 		next = NULL;
1884 	return (next);
1885 }
1886 
1887 /*
1888  * vm_page_rename()
1889  *
1890  * Move the given vm_page from its current object to the specified
1891  * target object/offset.  The page must be busy and will remain so
1892  * on return.
1893  *
1894  * new_object must be held.
1895  * This routine might block. XXX ?
1896  *
1897  * NOTE: Swap associated with the page must be invalidated by the move.  We
1898  *       have to do this for several reasons:  (1) we aren't freeing the
1899  *       page, (2) we are dirtying the page, (3) the VM system is probably
1900  *       moving the page from object A to B, and will then later move
1901  *       the backing store from A to B and we can't have a conflict.
1902  *
1903  * NOTE: We *always* dirty the page.  It is necessary both for the
1904  *       fact that we moved it, and because we may be invalidating
1905  *	 swap.  If the page is on the cache, we have to deactivate it
1906  *	 or vm_page_dirty() will panic.  Dirty pages are not allowed
1907  *	 on the cache.
1908  *
1909  * NOTE: Caller is responsible for any pmap disposition prior to the
1910  *	 rename (as the pmap code will not be able to find the entries
1911  *	 once the object has been disassociated or changed).  Nominally
1912  *	 the caller is moving a page between shadowed objects and so the
1913  *	 pmap association is retained without having to remove the page
1914  *	 from it.
1915  */
1916 void
1917 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1918 {
1919 	KKASSERT(m->busy_count & PBUSY_LOCKED);
1920 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
1921 	if (m->object) {
1922 		ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
1923 		vm_page_remove(m);
1924 	}
1925 	if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
1926 		panic("vm_page_rename: target exists (%p,%"PRIu64")",
1927 		      new_object, new_pindex);
1928 	}
1929 	if (m->queue - m->pc == PQ_CACHE)
1930 		vm_page_deactivate(m);
1931 	vm_page_dirty(m);
1932 }
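
/*
 * Editor's illustrative sketch (not part of the original source): a
 * minimal vm_page_rename() caller.  Both object tokens are taken
 * exclusively and the page is hard-busied, per the requirements above.
 * The function name and wmesg are hypothetical, and real callers must
 * also deal with object lock ordering and prior pmap disposition.
 */
#if 0
static void
example_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
{
	vm_object_t old_object = m->object;

	vm_object_hold(old_object);
	vm_object_hold(new_object);
	vm_page_busy_wait(m, TRUE, "xrenam");
	vm_page_rename(m, new_object, new_pindex);	/* page remains busy */
	vm_page_wakeup(m);
	vm_object_drop(new_object);
	vm_object_drop(old_object);
}
#endif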
1933 
1934 /*
1935  * vm_page_unqueue() without any wakeup.  This routine is used when a page
1936  * is to remain BUSY'd by the caller.
1937  *
1938  * This routine may not block.
1939  */
1940 void
1941 vm_page_unqueue_nowakeup(vm_page_t m)
1942 {
1943 	vm_page_and_queue_spin_lock(m);
1944 	(void)_vm_page_rem_queue_spinlocked(m);
1945 	vm_page_spin_unlock(m);
1946 }
1947 
1948 /*
1949  * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedemon
1950  * if necessary.
1951  *
1952  * This routine may not block.
1953  */
1954 void
1955 vm_page_unqueue(vm_page_t m)
1956 {
1957 	u_short queue;
1958 
1959 	vm_page_and_queue_spin_lock(m);
1960 	queue = _vm_page_rem_queue_spinlocked(m);
1961 	if (queue == PQ_FREE || queue == PQ_CACHE) {
1962 		vm_page_spin_unlock(m);
1963 		pagedaemon_wakeup();
1964 	} else {
1965 		vm_page_spin_unlock(m);
1966 	}
1967 }
1968 
1969 /*
1970  * vm_page_list_find()
1971  *
1972  * Find a page on the specified queue with color optimization.
1973  *
1974  * The page coloring optimization attempts to locate a page that does
1975  * not overload other nearby pages in the object in the cpu's L1 or L2
1976  * caches.  We need this optimization because cpu caches tend to be
1977  * physical caches, while object spaces tend to be virtual.
1978  *
1979  * The page coloring optimization also, very importantly, tries to localize
1980  * memory to cpus and physical sockets.
1981  *
1982  * Each PQ_FREE and PQ_CACHE color queue has its own spinlock and the
1983  * algorithm is adjusted to localize allocations on a per-core basis.
1984  * This is done by 'twisting' the colors.
1985  *
1986  * The page is returned spinlocked and removed from its queue (it will
1987  * be on PQ_NONE), or NULL. The page is not BUSY'd.  The caller
1988  * is responsible for dealing with the busy-page case (usually by
1989  * deactivating the page and looping).
1990  *
1991  * NOTE:  This routine is carefully inlined.  A non-inlined version
1992  *	  is available for outside callers but the only critical path is
1993  *	  from within this source file.
1994  *
1995  * NOTE:  This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
1996  *	  represent stable storage, allowing us to order our locks vm_page
1997  *	  first, then queue.
1998  */
1999 static __inline
2000 vm_page_t
2001 _vm_page_list_find(int basequeue, int index)
2002 {
2003 	struct vpgqueues *pq;
2004 	vm_page_t m;
2005 
2006 	index &= PQ_L2_MASK;
2007 	pq = &vm_page_queues[basequeue + index];
2008 
2009 	/*
2010 	 * Try this cpu's colored queue first.  Test for a page unlocked,
2011 	 * then lock the queue and locate a page.  Note that the lock order
2012 	 * is reversed, but we do not want to dwadle on the page spinlock
2013 	 * is reversed, but we do not want to dawdle on the page spinlock
2014 	 */
2015 	if (TAILQ_FIRST(&pq->pl)) {
2016 		spin_lock(&pq->spin);
2017 		TAILQ_FOREACH(m, &pq->pl, pageq) {
2018 			if (spin_trylock(&m->spin) == 0)
2019 				continue;
2020 			KKASSERT(m->queue == basequeue + index);
2021 			pq->lastq = -1;
2022 			return(m);
2023 		}
2024 		spin_unlock(&pq->spin);
2025 	}
2026 
2027 	m = _vm_page_list_find_wide(basequeue, index, &pq->lastq);
2028 
2029 	return(m);
2030 }
2031 
2032 /*
2033  * If we could not find the page in the desired queue try to find it in
2034  * a nearby (NUMA-aware) queue, spreading out as we go.
2035  */
2036 static vm_page_t
2037 _vm_page_list_find_wide(int basequeue, int index, int *lastp)
2038 {
2039 	struct vpgqueues *pq;
2040 	vm_page_t m = NULL;
2041 	int pqmask = set_assoc_mask >> 1;
2042 	int pqi;
2043 	int range;
2044 	int skip_start;
2045 	int skip_next;
2046 	int count;
2047 
2048 	/*
2049 	 * Avoid re-searching empty queues over and over again; skip to
2050 	 * *lastp if appropriate.
2051 	 */
2052 	if (*lastp >= 0)
2053 		index = *lastp;
2054 
2055 	index &= PQ_L2_MASK;
2056 	pq = &vm_page_queues[basequeue];
2057 	count = 0;
2058 	skip_start = -1;
2059 	skip_next = -1;
2060 
2061 	/*
2062 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2063 	 * else fails (PQ_L2_MASK).
2064 	 *
2065 	 * pqmask is a mask, 15, 31, 63, etc.
2066 	 *
2067 	 * Test each queue unlocked first, then lock the queue and locate
2068 	 * a page.  Note that the lock order is reversed, but we do not want
2069 	 * to dwadle on the page spinlock anyway as it is held significantly
2070 	 * to dawdle on the page spinlock anyway as it is held significantly
2071 	 */
2072 	do {
2073 		pqmask = (pqmask << 1) | 1;
2074 
2075 		pqi = index;
2076 		range = pqmask + 1;
2077 
2078 		while (range > 0) {
2079 			if (pqi >= skip_start && pqi < skip_next) {
2080 				range -= skip_next - pqi;
2081 				pqi = (pqi & ~pqmask) | (skip_next & pqmask);
2082 			}
2083 			if (range > 0 && TAILQ_FIRST(&pq[pqi].pl)) {
2084 				spin_lock(&pq[pqi].spin);
2085 				TAILQ_FOREACH(m, &pq[pqi].pl, pageq) {
2086 					if (spin_trylock(&m->spin) == 0)
2087 						continue;
2088 					KKASSERT(m->queue == basequeue + pqi);
2089 
2090 					/*
2091 					 * If we had to wander too far, set
2092 					 * *lastp to skip past empty queues.
2093 					 */
2094 					if (count >= 8)
2095 						*lastp = pqi & PQ_L2_MASK;
2096 					return(m);
2097 				}
2098 				spin_unlock(&pq[pqi].spin);
2099 			}
2100 			--range;
2101 			++count;
2102 			pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
2103 		}
2104 		skip_start = pqi & ~pqmask;
2105 		skip_next = (pqi | pqmask) + 1;
2106 	} while (pqmask != PQ_L2_MASK);
2107 
2108 	return(m);
2109 }
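
/*
 * Editor's worked example (not part of the original source): with
 * set_assoc_mask nominally 15 (16-way grouping), the loop above first
 * scans the 16 queues sharing (index & ~15), then widens pqmask to 31
 * (32 queues), 63, 127, and so on up to PQ_L2_MASK.  Each completed band
 * records its range in skip_start/skip_next so the wider passes do not
 * re-test queues already found empty, and *lastp is only updated when a
 * page is finally found after 8 or more probes, letting later callers
 * skip the empty leading queues.
 */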
2110 
2111 static __inline
2112 vm_page_t
2113 _vm_page_list_find2(int bq1, int bq2, int index)
2114 {
2115 	struct vpgqueues *pq1;
2116 	struct vpgqueues *pq2;
2117 	vm_page_t m;
2118 
2119 	index &= PQ_L2_MASK;
2120 	pq1 = &vm_page_queues[bq1 + index];
2121 	pq2 = &vm_page_queues[bq2 + index];
2122 
2123 	/*
2124 	 * Try this cpu's colored queue first.  Test for a page unlocked,
2125 	 * then lock the queue and locate a page.  Note that the lock order
2126 	 * is reversed, but we do not want to dawdle on the page spinlock
2127 	 * anyway as it is held significantly longer than the queue spinlock.
2128 	 */
2129 	if (TAILQ_FIRST(&pq1->pl)) {
2130 		spin_lock(&pq1->spin);
2131 		TAILQ_FOREACH(m, &pq1->pl, pageq) {
2132 			if (spin_trylock(&m->spin) == 0)
2133 				continue;
2134 			KKASSERT(m->queue == bq1 + index);
2135 			pq1->lastq = -1;
2136 			pq2->lastq = -1;
2137 			return(m);
2138 		}
2139 		spin_unlock(&pq1->spin);
2140 	}
2141 
2142 	m = _vm_page_list_find2_wide(bq1, bq2, index, &pq1->lastq, &pq2->lastq);
2143 
2144 	return(m);
2145 }
2146 
2147 
2148 /*
2149  * This version checks two queues at the same time, widening its search
2150  * as it progresses, preferring basequeue1 and starting on basequeue2
2151  * after exhausting the first set.  The idea is to try to stay
2152  * localized to the cpu.
2153  */
2154 static vm_page_t
2155 _vm_page_list_find2_wide(int basequeue1, int basequeue2, int index,
2156 			 int *lastp1, int *lastp2)
2157 {
2158 	struct vpgqueues *pq1;
2159 	struct vpgqueues *pq2;
2160 	vm_page_t m = NULL;
2161 	int pqmask1, pqmask2;
2162 	int pqi;
2163 	int range;
2164 	int skip_start1, skip_start2;
2165 	int skip_next1, skip_next2;
2166 	int count1, count2;
2167 
2168 	/*
2169 	 * Avoid re-searching empty queues over and over again; skip to
2170 	 * *lastp1 if appropriate.
2171 	 */
2172 	if (*lastp1 >= 0)
2173 		index = *lastp1;
2174 
2175 	index &= PQ_L2_MASK;
2176 
2177 	pqmask1 = set_assoc_mask >> 1;
2178 	pq1 = &vm_page_queues[basequeue1];
2179 	count1 = 0;
2180 	skip_start1 = -1;
2181 	skip_next1 = -1;
2182 
2183 	pqmask2 = set_assoc_mask >> 1;
2184 	pq2 = &vm_page_queues[basequeue2];
2185 	count2 = 0;
2186 	skip_start2 = -1;
2187 	skip_next2 = -1;
2188 
2189 	/*
2190 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2191 	 * else fails (PQ_L2_MASK).
2192 	 *
2193 	 * pqmask is a mask, 15, 31, 63, etc.
2194 	 *
2195 	 * Test each queue unlocked first, then lock the queue and locate
2196 	 * a page.  Note that the lock order is reversed, but we do not want
2197 	 * to dawdle on the page spinlock anyway as it is held significantly
2198 	 * longer than the queue spinlock.
2199 	 */
2200 	do {
2201 		if (pqmask1 == PQ_L2_MASK)
2202 			goto skip2;
2203 
2204 		pqmask1 = (pqmask1 << 1) | 1;
2205 		pqi = index;
2206 		range = pqmask1 + 1;
2207 
2208 		while (range > 0) {
2209 			if (pqi >= skip_start1 && pqi < skip_next1) {
2210 				range -= skip_next1 - pqi;
2211 				pqi = (pqi & ~pqmask1) | (skip_next1 & pqmask1);
2212 			}
2213 			if (range > 0 && TAILQ_FIRST(&pq1[pqi].pl)) {
2214 				spin_lock(&pq1[pqi].spin);
2215 				TAILQ_FOREACH(m, &pq1[pqi].pl, pageq) {
2216 					if (spin_trylock(&m->spin) == 0)
2217 						continue;
2218 					KKASSERT(m->queue == basequeue1 + pqi);
2219 
2220 					/*
2221 					 * If we had to wander too far, set
2222 					 * *lastp to skip past empty queues.
2223 					 */
2224 					if (count1 >= 8)
2225 						*lastp1 = pqi & PQ_L2_MASK;
2226 					return(m);
2227 				}
2228 				spin_unlock(&pq1[pqi].spin);
2229 			}
2230 			--range;
2231 			++count1;
2232 			pqi = (pqi & ~pqmask1) | ((pqi + 1) & pqmask1);
2233 		}
2234 		skip_start1 = pqi & ~pqmask1;
2235 		skip_next1 = (pqi | pqmask1) + 1;
2236 skip2:
2237 		if (pqmask1 < ((set_assoc_mask << 1) | 1))
2238 			continue;
2239 
2240 		pqmask2 = (pqmask2 << 1) | 1;
2241 		pqi = index;
2242 		range = pqmask2 + 1;
2243 
2244 		while (range > 0) {
2245 			if (pqi >= skip_start2 && pqi < skip_next2) {
2246 				range -= skip_next2 - pqi;
2247 				pqi = (pqi & ~pqmask2) | (skip_next2 & pqmask2);
2248 			}
2249 			if (range > 0 && TAILQ_FIRST(&pq2[pqi].pl)) {
2250 				spin_lock(&pq2[pqi].spin);
2251 				TAILQ_FOREACH(m, &pq2[pqi].pl, pageq) {
2252 					if (spin_trylock(&m->spin) == 0)
2253 						continue;
2254 					KKASSERT(m->queue == basequeue2 + pqi);
2255 
2256 					/*
2257 					 * If we had to wander too far, set
2258 					 * *lastp to skip past empty queues.
2259 					 */
2260 					if (count2 >= 8)
2261 						*lastp2 = pqi & PQ_L2_MASK;
2262 					return(m);
2263 				}
2264 				spin_unlock(&pq2[pqi].spin);
2265 			}
2266 			--range;
2267 			++count2;
2268 			pqi = (pqi & ~pqmask2) | ((pqi + 1) & pqmask2);
2269 		}
2270 		skip_start2 = pqi & ~pqmask2;
2271 		skip_next2 = (pqi | pqmask2) + 1;
2272 	} while (pqmask1 != PQ_L2_MASK && pqmask2 != PQ_L2_MASK);
2273 
2274 	return(m);
2275 }
2276 
2277 /*
2278  * Returns a vm_page candidate for allocation.  The page is not busied so
2279  * it can move around.  The caller must busy the page (and typically
2280  * deactivate it if it cannot be busied!)
2281  *
2282  * Returns a spinlocked vm_page that has been removed from its queue.
2283  * (note that _vm_page_list_find() does not remove the page from its
2284  *  queue).
2285  */
2286 vm_page_t
2287 vm_page_list_find(int basequeue, int index)
2288 {
2289 	vm_page_t m;
2290 
2291 	m = _vm_page_list_find(basequeue, index);
2292 	if (m)
2293 		_vm_page_rem_queue_spinlocked(m);
2294 	return m;
2295 }
2296 
2297 /*
2298  * Find a page on the cache queue with color optimization, remove it
2299  * from the queue, and busy it.  The returned page will not be spinlocked.
2300  *
2301  * Candidates can fail (e.g. they are busied by someone else or cannot
2302  * be recycled), in which case they are deactivated and the scan continues.
2303  *
2304  * This routine may not block.
2305  *
2306  */
2307 static vm_page_t
2308 vm_page_select_cache(u_short pg_color)
2309 {
2310 	vm_page_t m;
2311 
2312 	for (;;) {
2313 		m = _vm_page_list_find(PQ_CACHE, pg_color);
2314 		if (m == NULL)
2315 			break;
2316 		/*
2317 		 * (m) has been spinlocked
2318 		 */
2319 		_vm_page_rem_queue_spinlocked(m);
2320 		if (vm_page_busy_try(m, TRUE)) {
2321 			_vm_page_deactivate_locked(m, 0);
2322 			vm_page_spin_unlock(m);
2323 		} else {
2324 			/*
2325 			 * We successfully busied the page
2326 			 */
2327 			if ((m->flags & PG_NEED_COMMIT) == 0 &&
2328 			    m->hold_count == 0 &&
2329 			    m->wire_count == 0 &&
2330 			    (m->dirty & m->valid) == 0) {
2331 				vm_page_spin_unlock(m);
2332 				KKASSERT((m->flags & PG_UNQUEUED) == 0);
2333 				pagedaemon_wakeup();
2334 				return(m);
2335 			}
2336 
2337 			/*
2338 			 * The page cannot be recycled, deactivate it.
2339 			 */
2340 			_vm_page_deactivate_locked(m, 0);
2341 			if (_vm_page_wakeup(m)) {
2342 				vm_page_spin_unlock(m);
2343 				wakeup(m);
2344 			} else {
2345 				vm_page_spin_unlock(m);
2346 			}
2347 		}
2348 	}
2349 	return (m);
2350 }
2351 
2352 /*
2353  * Find a free page.  We attempt to inline the nominal case and fall back
2354  * to _vm_page_select_free() otherwise.  A busied page is removed from
2355  * the queue and returned.
2356  *
2357  * This routine may not block.
2358  */
2359 static __inline vm_page_t
2360 vm_page_select_free(u_short pg_color)
2361 {
2362 	vm_page_t m;
2363 
2364 	for (;;) {
2365 		m = _vm_page_list_find(PQ_FREE, pg_color);
2366 		if (m == NULL)
2367 			break;
2368 		_vm_page_rem_queue_spinlocked(m);
2369 		if (vm_page_busy_try(m, TRUE)) {
2370 			/*
2371 			 * Various mechanisms such as a pmap_collect can
2372 			 * result in a busy page on the free queue.  We
2373 			 * have to move the page out of the way so we can
2374 			 * retry the allocation.  If the other thread is not
2375 			 * allocating the page then m->valid will remain 0 and
2376 			 * the pageout daemon will free the page later on.
2377 			 *
2378 			 * Since we could not busy the page, however, we
2379 			 * cannot make assumptions as to whether the page
2380 			 * will be allocated by the other thread or not,
2381 			 * so all we can do is deactivate it to move it out
2382 			 * of the way.  In particular, if the other thread
2383 			 * wires the page it may wind up on the inactive
2384 			 * queue and the pageout daemon will have to deal
2385 			 * with that case too.
2386 			 */
2387 			_vm_page_deactivate_locked(m, 0);
2388 			vm_page_spin_unlock(m);
2389 		} else {
2390 			/*
2391 			 * Theoretically if we are able to busy the page
2392 			 * atomic with the queue removal (using the vm_page
2393 			 * lock) nobody else should have been able to mess
2394 			 * with the page before us.
2395 			 *
2396 			 * Assert the page state.  Note that even though
2397 			 * wiring doesn't adjust queues, a page on the free
2398 			 * queue should never be wired at this point.
2399 			 */
2400 			KKASSERT((m->flags & (PG_UNQUEUED |
2401 					      PG_NEED_COMMIT)) == 0);
2402 			KASSERT(m->hold_count == 0,
2403 				("m->hold_count is not zero "
2404 				 "pg %p q=%d flags=%08x hold=%d wire=%d",
2405 				 m, m->queue, m->flags,
2406 				 m->hold_count, m->wire_count));
2407 			KKASSERT(m->wire_count == 0);
2408 			vm_page_spin_unlock(m);
2409 			pagedaemon_wakeup();
2410 
2411 			/* return busied and removed page */
2412 			return(m);
2413 		}
2414 	}
2415 	return(m);
2416 }
2417 
2418 static __inline vm_page_t
2419 vm_page_select_free_or_cache(u_short pg_color, int *fromcachep)
2420 {
2421 	vm_page_t m;
2422 
2423 	*fromcachep = 0;
2424 	for (;;) {
2425 		m = _vm_page_list_find2(PQ_FREE, PQ_CACHE, pg_color);
2426 		if (m == NULL)
2427 			break;
2428 		if (vm_page_busy_try(m, TRUE)) {
2429 			_vm_page_rem_queue_spinlocked(m);
2430 			_vm_page_deactivate_locked(m, 0);
2431 			vm_page_spin_unlock(m);
2432 		} else if (m->queue - m->pc == PQ_FREE) {
2433 			/*
2434 			 * We successfully busied the page, PQ_FREE case
2435 			 */
2436 			_vm_page_rem_queue_spinlocked(m);
2437 			KKASSERT((m->flags & (PG_UNQUEUED |
2438 					      PG_NEED_COMMIT)) == 0);
2439 			KASSERT(m->hold_count == 0,
2440 				("m->hold_count is not zero "
2441 				 "pg %p q=%d flags=%08x hold=%d wire=%d",
2442 				 m, m->queue, m->flags,
2443 				 m->hold_count, m->wire_count));
2444 			KKASSERT(m->wire_count == 0);
2445 			vm_page_spin_unlock(m);
2446 			pagedaemon_wakeup();
2447 
2448 			/* return busied and removed page */
2449 			return(m);
2450 		} else {
2451 			/*
2452 			 * We successfully busied the page, PQ_CACHE case
2453 			 */
2454 			_vm_page_rem_queue_spinlocked(m);
2455 			if ((m->flags & PG_NEED_COMMIT) == 0 &&
2456 			    m->hold_count == 0 &&
2457 			    m->wire_count == 0 &&
2458 			    (m->dirty & m->valid) == 0) {
2459 				vm_page_spin_unlock(m);
2460 				KKASSERT((m->flags & PG_UNQUEUED) == 0);
2461 				pagedaemon_wakeup();
2462 				*fromcachep = 1;
2463 				return(m);
2464 			}
2465 
2466 			/*
2467 			 * The page cannot be recycled, deactivate it.
2468 			 */
2469 			_vm_page_deactivate_locked(m, 0);
2470 			if (_vm_page_wakeup(m)) {
2471 				vm_page_spin_unlock(m);
2472 				wakeup(m);
2473 			} else {
2474 				vm_page_spin_unlock(m);
2475 			}
2476 		}
2477 	}
2478 	return(m);
2479 }
2480 
2481 /*
2482  * vm_page_alloc()
2483  *
2484  * Allocate and return a memory cell associated with this VM object/offset
2485  * pair.  If object is NULL an unassociated page will be allocated.
2486  *
2487  * The returned page will be busied and removed from its queues.  This
2488  * routine can block and may return NULL if a race occurs and the page
2489  * is found to already exist at the specified (object, pindex).
2490  *
2491  *	VM_ALLOC_NORMAL		allow use of cache pages, nominal free drain
2492  *	VM_ALLOC_QUICK		like normal but cannot use cache
2493  *	VM_ALLOC_SYSTEM		greater free drain
2494  *	VM_ALLOC_INTERRUPT	allow free list to be completely drained
2495  *	VM_ALLOC_ZERO		advisory request for pre-zero'd page only
2496  *	VM_ALLOC_FORCE_ZERO	advisory request for pre-zero'd page only
2497  *	VM_ALLOC_NULL_OK	ok to return NULL on insertion collision
2498  *				(see vm_page_grab())
2499  *	VM_ALLOC_USE_GD		ok to use per-gd cache
2500  *
2501  *	VM_ALLOC_CPU(n)		allocate using specified cpu localization
2502  *
2503  * The object must be held if not NULL
2504  * This routine may not block
2505  *
2506  * Additional special handling is required when called from an interrupt
2507  * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
2508  * in this case.
2509  */
2510 vm_page_t
2511 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
2512 {
2513 	globaldata_t gd;
2514 	vm_object_t obj;
2515 	vm_page_t m;
2516 	u_short pg_color;
2517 	int cpuid_local;
2518 	int fromcache;
2519 
2520 #if 0
2521 	/*
2522 	 * Special per-cpu free VM page cache.  The pages are pre-busied
2523 	 * and pre-zeroed for us.
2524 	 */
2525 	if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
2526 		crit_enter_gd(gd);
2527 		if (gd->gd_vmpg_count) {
2528 			m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
2529 			crit_exit_gd(gd);
2530 			goto done;
2531 		}
2532 		crit_exit_gd(gd);
2533 	}
2534 #endif
2535 	m = NULL;
2536 
2537 	/*
2538 	 * CPU LOCALIZATION
2539 	 *
2540 	 * CPU localization algorithm.  Break the page queues up by physical
2541 	 * id and core id (note that two cpu threads will have the same core
2542 	 * id, and core_id != gd_cpuid).
2543 	 *
2544 	 * This is nowhere near perfect, for example the last pindex in a
2545 	 * subgroup will overflow into the next cpu or package.  But this
2546 	 * should get us good page reuse locality in heavy mixed loads.
2547 	 *
2548 	 * (may be executed before the APs are started, so other GDs might
2549 	 *  not exist!)
2550 	 */
2551 	if (page_req & VM_ALLOC_CPU_SPEC)
2552 		cpuid_local = VM_ALLOC_GETCPU(page_req);
2553 	else
2554 		cpuid_local = mycpu->gd_cpuid;
2555 
2556 	pg_color = vm_get_pg_color(cpuid_local, object, pindex);
2557 
2558 	KKASSERT(page_req &
2559 		(VM_ALLOC_NORMAL|VM_ALLOC_QUICK|
2560 		 VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
2561 
2562 	/*
2563 	 * Certain system threads (pageout daemon, buf_daemon's) are
2564 	 * allowed to eat deeper into the free page list.
2565 	 */
2566 	if (curthread->td_flags & TDF_SYSTHREAD)
2567 		page_req |= VM_ALLOC_SYSTEM;
2568 
2569 	/*
2570 	 * Impose various limitations.  Note that the v_free_reserved test
2571 	 * must match the opposite of vm_page_count_target() to avoid
2572 	 * livelocks; be careful.
2573 	 */
2574 loop:
2575 	gd = mycpu;
2576 	if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
2577 	    ((page_req & VM_ALLOC_INTERRUPT) &&
2578 	     gd->gd_vmstats.v_free_count > 0) ||
2579 	    ((page_req & VM_ALLOC_SYSTEM) &&
2580 	     gd->gd_vmstats.v_cache_count == 0 &&
2581 	     gd->gd_vmstats.v_free_count >
2582 	     gd->gd_vmstats.v_interrupt_free_min)
2583 	) {
2584 		/*
2585 		 * The free queue has sufficient free pages to take one out.
2586 		 *
2587 		 * However, if the free queue is strained the scan may widen
2588 		 * to the entire queue and cause a great deal of SMP
2589 		 * contention, so we use a double-queue-scan if we can
2590 		 * to avoid this.
2591 		 */
2592 		if (page_req & VM_ALLOC_NORMAL) {
2593 			m = vm_page_select_free_or_cache(pg_color, &fromcache);
2594 			if (m && fromcache)
2595 				goto found_cache;
2596 		} else {
2597 			m = vm_page_select_free(pg_color);
2598 		}
2599 	} else if (page_req & VM_ALLOC_NORMAL) {
2600 		/*
2601 		 * Allocatable from the cache (non-interrupt only).  On
2602 		 * success, we must free the page and try again, thus
2603 		 * ensuring that vmstats.v_*_free_min counters are replenished.
2604 		 */
2605 #ifdef INVARIANTS
2606 		if (curthread->td_preempted) {
2607 			kprintf("vm_page_alloc(): warning, attempt to allocate"
2608 				" cache page from preempting interrupt\n");
2609 			m = NULL;
2610 		} else {
2611 			m = vm_page_select_cache(pg_color);
2612 		}
2613 #else
2614 		m = vm_page_select_cache(pg_color);
2615 #endif
2616 		/*
2617 		 * On success move the page into the free queue and loop.
2618 		 *
2619 		 * Only do this if we can safely acquire the vm_object lock,
2620 		 * because this is effectively a random page and the caller
2621 		 * might be holding the lock shared; we don't want to
2622 		 * deadlock.
2623 		 */
2624 		if (m != NULL) {
2625 found_cache:
2626 			KASSERT(m->dirty == 0,
2627 				("Found dirty cache page %p", m));
2628 			if ((obj = m->object) != NULL) {
2629 				if (vm_object_hold_try(obj)) {
2630 					if (__predict_false((m->flags & (PG_MAPPED|PG_WRITEABLE)) != 0))
2631 						vm_page_protect(m, VM_PROT_NONE);
2632 					vm_page_free(m);
2633 					/* m->object NULL here */
2634 					vm_object_drop(obj);
2635 				} else {
2636 					vm_page_deactivate(m);
2637 					vm_page_wakeup(m);
2638 				}
2639 			} else {
2640 				if (__predict_false((m->flags & (PG_MAPPED|PG_WRITEABLE)) != 0))
2641 					vm_page_protect(m, VM_PROT_NONE);
2642 				vm_page_free(m);
2643 			}
2644 			goto loop;
2645 		}
2646 
2647 		/*
2648 		 * On failure return NULL
2649 		 */
2650 		atomic_add_int(&vm_pageout_deficit, 1);
2651 		pagedaemon_wakeup();
2652 		return (NULL);
2653 	} else {
2654 		/*
2655 		 * No pages available, wakeup the pageout daemon and give up.
2656 		 */
2657 		atomic_add_int(&vm_pageout_deficit, 1);
2658 		pagedaemon_wakeup();
2659 		return (NULL);
2660 	}
2661 
2662 	/*
2663 	 * v_free_count can race so loop if we don't find the expected
2664 	 * page.
2665 	 */
2666 	if (m == NULL) {
2667 		vmstats_rollup();
2668 		goto loop;
2669 	}
2670 
2671 	/*
2672 	 * Good page found.  The page has already been busied for us and
2673 	 * removed from its queues.
2674 	 */
2675 	KASSERT(m->dirty == 0,
2676 		("vm_page_alloc: free/cache page %p was dirty", m));
2677 	KKASSERT(m->queue == PQ_NONE);
2678 
2679 #if 0
2680 done:
2681 #endif
2682 	/*
2683 	 * Initialize the structure, inheriting some flags but clearing
2684 	 * all the rest.  The page has already been busied for us.
2685 	 */
2686 	vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
2687 
2688 	KKASSERT(m->wire_count == 0);
2689 	KKASSERT((m->busy_count & PBUSY_MASK) == 0);
2690 	m->act_count = 0;
2691 	m->valid = 0;
2692 
2693 	/*
2694 	 * Caller must be holding the object lock (asserted by
2695 	 * vm_page_insert()).
2696 	 *
2697 	 * NOTE: Inserting a page here does not insert it into any pmaps
2698 	 *	 (which could cause us to block allocating memory).
2699 	 *
2700 	 * NOTE: If no object is specified, an unassociated page is allocated
2701 	 *	 and m->pindex can be used by the caller for any purpose.
2702 	 */
2703 	if (object) {
2704 		if (vm_page_insert(m, object, pindex) == FALSE) {
2705 			vm_page_free(m);
2706 			if ((page_req & VM_ALLOC_NULL_OK) == 0)
2707 				panic("PAGE RACE %p[%ld]/%p",
2708 				      object, (long)pindex, m);
2709 			m = NULL;
2710 		}
2711 	} else {
2712 		m->pindex = pindex;
2713 	}
2714 
2715 	/*
2716 	 * Don't wakeup too often - wakeup the pageout daemon when
2717 	 * we would be nearly out of memory.
2718 	 */
2719 	pagedaemon_wakeup();
2720 
2721 	/*
2722 	 * A BUSY page is returned.
2723 	 */
2724 	return (m);
2725 }
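
/*
 * Editor's illustrative sketch (not part of the original source): a
 * typical vm_page_alloc() caller.  The object token is held, the returned
 * page comes back busied, and VM_ALLOC_NULL_OK turns an insertion race
 * into a NULL return instead of a panic.  The function name is
 * hypothetical.
 */
#if 0
static vm_page_t
example_alloc(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	vm_object_hold(object);
	m = vm_page_alloc(object, pindex,
			  VM_ALLOC_NORMAL | VM_ALLOC_NULL_OK);
	if (m == NULL) {
		/* lost a race or memory is tight; caller retries or waits */
		vm_object_drop(object);
		return NULL;
	}
	/* ... fill in or zero the page, set m->valid, etc ... */
	vm_page_wakeup(m);
	vm_object_drop(object);
	return m;
}
#endif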
2726 
2727 /*
2728  * Returns number of pages available in our DMA memory reserve
2729  * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf)
2730  */
2731 vm_size_t
2732 vm_contig_avail_pages(void)
2733 {
2734 	alist_blk_t blk;
2735 	alist_blk_t count;
2736 	alist_blk_t bfree;
2737 	spin_lock(&vm_contig_spin);
2738 	bfree = alist_free_info(&vm_contig_alist, &blk, &count);
2739 	spin_unlock(&vm_contig_spin);
2740 
2741 	return bfree;
2742 }
2743 
2744 /*
2745  * Attempt to allocate contiguous physical memory with the specified
2746  * requirements.
2747  */
2748 vm_page_t
2749 vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
2750 		     unsigned long alignment, unsigned long boundary,
2751 		     unsigned long size, vm_memattr_t memattr)
2752 {
2753 	alist_blk_t blk;
2754 	vm_page_t m;
2755 	vm_pindex_t i;
2756 #if 0
2757 	static vm_pindex_t contig_rover;
2758 #endif
2759 
2760 	alignment >>= PAGE_SHIFT;
2761 	if (alignment == 0)
2762 		alignment = 1;
2763 	boundary >>= PAGE_SHIFT;
2764 	if (boundary == 0)
2765 		boundary = 1;
2766 	size = (size + PAGE_MASK) >> PAGE_SHIFT;
2767 
2768 #if 0
2769 	/*
2770 	 * Disabled temporarily until we find a solution for DRM (a flag
2771 	 * to always use the free space reserve, for performance).
2772 	 */
2773 	if (high == BUS_SPACE_MAXADDR && alignment <= PAGE_SIZE &&
2774 	    boundary <= PAGE_SIZE && size == 1 &&
2775 	    memattr == VM_MEMATTR_DEFAULT) {
2776 		/*
2777 		 * Any page will work, use vm_page_alloc()
2778 		 * (e.g. when used from kmem_alloc_attr())
2779 		 */
2780 		m = vm_page_alloc(NULL, (contig_rover++) & 0x7FFFFFFF,
2781 				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
2782 				  VM_ALLOC_INTERRUPT);
2783 		m->valid = VM_PAGE_BITS_ALL;
2784 		vm_page_wire(m);
2785 		vm_page_wakeup(m);
2786 	} else
2787 #endif
2788 	{
2789 		/*
2790 		 * Use the low-memory dma reserve
2791 		 */
2792 		spin_lock(&vm_contig_spin);
2793 		blk = alist_alloc(&vm_contig_alist, 0, size);
2794 		if (blk == ALIST_BLOCK_NONE) {
2795 			spin_unlock(&vm_contig_spin);
2796 			if (bootverbose) {
2797 				kprintf("vm_page_alloc_contig: %ldk nospace\n",
2798 					(size << PAGE_SHIFT) / 1024);
2799 				print_backtrace(5);
2800 			}
2801 			return(NULL);
2802 		}
2803 		if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
2804 			alist_free(&vm_contig_alist, blk, size);
2805 			spin_unlock(&vm_contig_spin);
2806 			if (bootverbose) {
2807 				kprintf("vm_page_alloc_contig: %ldk high "
2808 					"%016jx failed\n",
2809 					(size << PAGE_SHIFT) / 1024,
2810 					(intmax_t)high);
2811 			}
2812 			return(NULL);
2813 		}
2814 		spin_unlock(&vm_contig_spin);
2815 
2816 		/*
2817 		 * Base vm_page_t of range
2818 		 */
2819 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
2820 	}
2821 	if (vm_contig_verbose) {
2822 		kprintf("vm_page_alloc_contig: %016jx/%ldk "
2823 			"(%016jx-%016jx al=%lu bo=%lu pgs=%lu attr=%d)\n",
2824 			(intmax_t)m->phys_addr,
2825 			(size << PAGE_SHIFT) / 1024,
2826 			low, high, alignment, boundary, size, memattr);
2827 	}
2828 	if (memattr != VM_MEMATTR_DEFAULT) {
2829 		for (i = 0; i < size; ++i) {
2830 			KKASSERT(m[i].flags & PG_FICTITIOUS);
2831 			pmap_page_set_memattr(&m[i], memattr);
2832 		}
2833 	}
2834 	return m;
2835 }
2836 
2837 /*
2838  * Free contiguously allocated pages.  The pages will be wired but not busy.
2839  * When freeing to the alist we leave them wired and not busy.
2840  */
2841 void
2842 vm_page_free_contig(vm_page_t m, unsigned long size)
2843 {
2844 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
2845 	vm_pindex_t start = pa >> PAGE_SHIFT;
2846 	vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
2847 
2848 	if (vm_contig_verbose) {
2849 		kprintf("vm_page_free_contig:  %016jx/%ldk\n",
2850 			(intmax_t)pa, size / 1024);
2851 	}
2852 	if (pa < vm_low_phys_reserved) {
2853 		/*
2854 		 * Just sanity-check the first page (with asserts) for convenience.
2855 		 */
2856 		KKASSERT(m->wire_count == 1);
2857 		KKASSERT(m->flags & PG_FICTITIOUS);
2858 		KKASSERT(pa + size <= vm_low_phys_reserved);
2859 		spin_lock(&vm_contig_spin);
2860 		alist_free(&vm_contig_alist, start, pages);
2861 		spin_unlock(&vm_contig_spin);
2862 	} else {
2863 		while (pages) {
2864 			/* XXX FUTURE, maybe (pair with vm_pg_contig_alloc()) */
2865 			/*vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);*/
2866 			vm_page_busy_wait(m, FALSE, "cpgfr");
2867 			vm_page_unwire(m, 0);
2868 			vm_page_free(m);
2869 			--pages;
2870 			++m;
2871 		}
2873 	}
2874 }
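
/*
 * Editor's illustrative sketch (not part of the original source): pairing
 * vm_page_alloc_contig() with vm_page_free_contig() for a 64KB,
 * page-aligned DMA buffer anywhere in the physical address space.  The
 * function name is hypothetical.
 */
#if 0
static void
example_contig(void)
{
	vm_page_t m;

	m = vm_page_alloc_contig(0, BUS_SPACE_MAXADDR, PAGE_SIZE, 0,
				 64 * 1024, VM_MEMATTR_DEFAULT);
	if (m == NULL)
		return;
	/* ... use the 16 wired pages starting at VM_PAGE_TO_PHYS(m) ... */
	vm_page_free_contig(m, 64 * 1024);
}
#endif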
2875 
2876 
2877 /*
2878  * Wait for sufficient free memory for nominal heavy memory use kernel
2879  * operations.
2880  *
2881  * WARNING!  Be sure never to call this in any vm_pageout code path, which
2882  *	     will trivially deadlock the system.
2883  */
2884 void
2885 vm_wait_nominal(void)
2886 {
2887 	while (vm_page_count_min(0))
2888 		vm_wait(0);
2889 }
2890 
2891 /*
2892  * Test if vm_wait_nominal() would block.
2893  */
2894 int
2895 vm_test_nominal(void)
2896 {
2897 	if (vm_page_count_min(0))
2898 		return(1);
2899 	return(0);
2900 }
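
/*
 * Editor's illustrative sketch (not part of the original source): a kernel
 * consumer throttling itself before a burst of page allocations.  The
 * function name is hypothetical; per the WARNING above, never do this
 * from a vm_pageout code path.
 */
#if 0
static void
example_throttled_prep(void)
{
	if (vm_test_nominal())
		vm_wait_nominal();	/* sleep until the minimum is met */
}
#endif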
2901 
2902 /*
2903  * Block until free pages are available for allocation, called in various
2904  * places before memory allocations.
2905  *
2906  * The caller may loop if vm_page_count_min() == FALSE so we cannot be
2907  * more generous than that.
2908  */
2909 void
2910 vm_wait(int timo)
2911 {
2912 	/*
2913 	 * never wait forever
2914 	 */
2915 	if (timo == 0)
2916 		timo = hz;
2917 	lwkt_gettoken(&vm_token);
2918 
2919 	if (curthread == pagethread ||
2920 	    curthread == emergpager) {
2921 		/*
2922 		 * The pageout daemon itself needs pages, this is bad.
2923 		 */
2924 		if (vm_page_count_min(0)) {
2925 			vm_pageout_pages_needed = 1;
2926 			tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
2927 		}
2928 	} else {
2929 		/*
2930 		 * Wakeup the pageout daemon if necessary and wait.
2931 		 *
2932 		 * Do not wait indefinitely for the target to be reached,
2933 		 * as load might prevent it from being reached any time soon.
2934 		 * But wait a little to try to slow down page allocations
2935 		 * and to give more important threads (the pagedaemon)
2936 		 * allocation priority.
2937 		 */
2938 		if (vm_page_count_target()) {
2939 			if (vm_pages_needed <= 1) {
2940 				++vm_pages_needed;
2941 				wakeup(&vm_pages_needed);
2942 			}
2943 			++vm_pages_waiting;	/* SMP race ok */
2944 			tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
2945 		}
2946 	}
2947 	lwkt_reltoken(&vm_token);
2948 }
2949 
2950 /*
2951  * Block until free pages are available for allocation
2952  *
2953  * Called only from vm_fault so that processes page faulting can be
2954  * easily tracked.
2955  */
2956 void
2957 vm_wait_pfault(void)
2958 {
2959 	/*
2960 	 * Wakeup the pageout daemon if necessary and wait.
2961 	 *
2962 	 * Do not wait indefinitely for the target to be reached,
2963 	 * as load might prevent it from being reached any time soon.
2964 	 * But wait a little to try to slow down page allocations
2965 	 * and to give more important threads (the pagedaemon)
2966 	 * allocation priority.
2967 	 */
2968 	if (vm_page_count_min(0)) {
2969 		lwkt_gettoken(&vm_token);
2970 		while (vm_page_count_severe()) {
2971 			if (vm_page_count_target()) {
2972 				thread_t td;
2973 
2974 				if (vm_pages_needed <= 1) {
2975 					++vm_pages_needed;
2976 					wakeup(&vm_pages_needed);
2977 				}
2978 				++vm_pages_waiting;	/* SMP race ok */
2979 				tsleep(&vmstats.v_free_count, 0, "pfault", hz);
2980 
2981 				/*
2982 				 * Do not stay stuck in the loop if the
2983 				 * system is trying to kill the process.
2984 				 */
2985 				td = curthread;
2986 				if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL))
2987 					break;
2988 			}
2989 		}
2990 		lwkt_reltoken(&vm_token);
2991 	}
2992 }
2993 
2994 /*
2995  * Put the specified page on the active list (if appropriate).  Ensure
2996  * that act_count is at least ACT_INIT but do not otherwise mess with it.
2997  *
2998  * The caller should be holding the page busied ? XXX
2999  * This routine may not block.
3000  *
3001  * It is ok if the page is wired (so buffer cache operations don't have
3002  * to mess with the page queues).
3003  */
3004 void
3005 vm_page_activate(vm_page_t m)
3006 {
3007 	u_short oqueue;
3008 
3009 	/*
3010 	 * If already active or inappropriate, just set act_count and
3011 	 * return.  We don't have to spin-lock the page.
3012 	 */
3013 	if (m->queue - m->pc == PQ_ACTIVE ||
3014 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3015 		if (m->act_count < ACT_INIT)
3016 			m->act_count = ACT_INIT;
3017 		return;
3018 	}
3019 
3020 	vm_page_spin_lock(m);
3021 	if (m->queue - m->pc != PQ_ACTIVE &&
3022 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3023 		_vm_page_queue_spin_lock(m);
3024 		oqueue = _vm_page_rem_queue_spinlocked(m);
3025 		/* page is left spinlocked, queue is unlocked */
3026 
3027 		if (oqueue == PQ_CACHE)
3028 			mycpu->gd_cnt.v_reactivated++;
3029 		if (m->act_count < ACT_INIT)
3030 			m->act_count = ACT_INIT;
3031 		_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
3032 		_vm_page_and_queue_spin_unlock(m);
3033 		if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
3034 			pagedaemon_wakeup();
3035 	} else {
3036 		if (m->act_count < ACT_INIT)
3037 			m->act_count = ACT_INIT;
3038 		vm_page_spin_unlock(m);
3039 	}
3040 }
3041 
3042 void
3043 vm_page_soft_activate(vm_page_t m)
3044 {
3045 	if (m->queue - m->pc == PQ_ACTIVE ||
3046 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3047 		if (m->act_count < ACT_INIT)
3048 			m->act_count = ACT_INIT;
3049 	} else {
3050 		vm_page_activate(m);
3051 	}
3052 }
3053 
3054 /*
3055  * Helper routine for vm_page_free_toq() and vm_page_cache().  This
3056  * routine is called when a page has been added to the cache or free
3057  * queues.
3058  *
3059  * This routine may not block.
3060  */
3061 static __inline void
3062 vm_page_free_wakeup(void)
3063 {
3064 	globaldata_t gd = mycpu;
3065 
3066 	/*
3067 	 * If the pageout daemon itself needs pages, then tell it that
3068 	 * there are some free.
3069 	 */
3070 	if (vm_pageout_pages_needed &&
3071 	    gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
3072 	    gd->gd_vmstats.v_pageout_free_min
3073 	) {
3074 		vm_pageout_pages_needed = 0;
3075 		wakeup(&vm_pageout_pages_needed);
3076 	}
3077 
3078 	/*
3079 	 * Wakeup processes that are waiting on memory.
3080 	 *
3081 	 * Generally speaking we want to wakeup stuck processes as soon as
3082 	 * possible.  !vm_page_count_min(0) is the absolute minimum point
3083 	 * where we can do this.  Wait a bit longer to reduce degenerate
3084 	 * re-blocking (vm_page_free_hysteresis).  The target check is just
3085 	 * to make sure the min-check w/hysteresis does not exceed the
3086 	 * normal target.
3087 	 */
3088 	if (vm_pages_waiting) {
3089 		if (!vm_page_count_min(vm_page_free_hysteresis) ||
3090 		    !vm_page_count_target()) {
3091 			vm_pages_waiting = 0;
3092 			wakeup(&vmstats.v_free_count);
3093 			++mycpu->gd_cnt.v_ppwakeups;
3094 		}
3095 #if 0
3096 		if (!vm_page_count_target()) {
3097 			/*
3098 			 * Plenty of pages are free, wakeup everyone.
3099 			 */
3100 			vm_pages_waiting = 0;
3101 			wakeup(&vmstats.v_free_count);
3102 			++mycpu->gd_cnt.v_ppwakeups;
3103 		} else if (!vm_page_count_min(0)) {
3104 			/*
3105 			 * Some pages are free, wakeup someone.
3106 			 */
3107 			int wcount = vm_pages_waiting;
3108 			if (wcount > 0)
3109 				--wcount;
3110 			vm_pages_waiting = wcount;
3111 			wakeup_one(&vmstats.v_free_count);
3112 			++mycpu->gd_cnt.v_ppwakeups;
3113 		}
3114 #endif
3115 	}
3116 }
3117 
3118 /*
3119  * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
3120  * it from its VM object.
3121  *
3122  * The vm_page must be BUSY on entry.  BUSY will be released on
3123  * return (the page will have been freed).
3124  */
3125 void
3126 vm_page_free_toq(vm_page_t m)
3127 {
3128 	/*
3129 	 * The page must not be mapped when freed, but we may have to call
3130 	 * pmap_mapped_sync() to validate this.
3131 	 */
3132 	mycpu->gd_cnt.v_tfree++;
3133 	if (m->flags & (PG_MAPPED | PG_WRITEABLE))
3134 		pmap_mapped_sync(m);
3135 	KKASSERT((m->flags & PG_MAPPED) == 0);
3136 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3137 
3138 	if ((m->busy_count & PBUSY_MASK) || ((m->queue - m->pc) == PQ_FREE)) {
3139 		kprintf("vm_page_free: pindex(%lu), busy %08x, "
3140 			"hold(%d)\n",
3141 			(u_long)m->pindex, m->busy_count, m->hold_count);
3142 		if ((m->queue - m->pc) == PQ_FREE)
3143 			panic("vm_page_free: freeing free page");
3144 		else
3145 			panic("vm_page_free: freeing busy page");
3146 	}
3147 
3148 	/*
3149 	 * Remove from object, spinlock the page and its queues and
3150 	 * remove from any queue.  No queue spinlock will be held
3151 	 * after this section (because the page was removed from any
3152 	 * queue).
3153 	 */
3154 	vm_page_remove(m);
3155 
3156 	/*
3157 	 * No further management of fictitious pages occurs beyond object
3158 	 * and queue removal.
3159 	 */
3160 	if ((m->flags & PG_FICTITIOUS) != 0) {
3161 		KKASSERT(m->queue == PQ_NONE);
3162 		vm_page_wakeup(m);
3163 		return;
3164 	}
3165 	vm_page_and_queue_spin_lock(m);
3166 	_vm_page_rem_queue_spinlocked(m);
3167 
3168 	m->valid = 0;
3169 	vm_page_undirty(m);
3170 
3171 	if (m->wire_count != 0) {
3172 		if (m->wire_count > 1) {
3173 		    panic(
3174 			"vm_page_free: invalid wire count (%d), pindex: 0x%lx",
3175 			m->wire_count, (long)m->pindex);
3176 		}
3177 		panic("vm_page_free: freeing wired page");
3178 	}
3179 
3180 	if (!MD_PAGE_FREEABLE(m))
3181 		panic("vm_page_free: page %p is still mapped!", m);
3182 
3183 	/*
3184 	 * Clear the PG_NEED_COMMIT and the PG_UNQUEUED flags.  The
3185 	 * page returns to normal operation and will be placed in
3186 	 * the PQ_HOLD or PQ_FREE queue.
3187 	 */
3188 	vm_page_flag_clear(m, PG_NEED_COMMIT | PG_UNQUEUED);
3189 
3190 	if (m->hold_count != 0) {
3191 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
3192 	} else {
3193 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
3194 	}
3195 
3196 	/*
3197 	 * This sequence allows us to clear BUSY while still holding
3198 	 * its spin lock, which reduces contention vs allocators.  We
3199 	 * must not leave the queue locked or _vm_page_wakeup() may
3200 	 * deadlock.
3201 	 */
3202 	_vm_page_queue_spin_unlock(m);
3203 	if (_vm_page_wakeup(m)) {
3204 		vm_page_spin_unlock(m);
3205 		wakeup(m);
3206 	} else {
3207 		vm_page_spin_unlock(m);
3208 	}
3209 	vm_page_free_wakeup();
3210 }
3211 
3212 /*
3213  * Mark this page as wired down by yet another map.  We do not adjust the
3214  * queue the page is on, it will be checked for wiring as-needed.
3215  *
3216  * This function has no effect on fictitious pages.
3217  *
3218  * Caller must be holding the page busy.
3219  */
3220 void
3221 vm_page_wire(vm_page_t m)
3222 {
3223 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3224 	if ((m->flags & PG_FICTITIOUS) == 0) {
3225 		if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
3226 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count, 1);
3227 		}
3228 		KASSERT(m->wire_count != 0,
3229 			("vm_page_wire: wire_count overflow m=%p", m));
3230 	}
3231 }
3232 
3233 /*
3234  * Release one wiring of this page, potentially enabling it to be paged again.
3235  *
3236  * Note that wired pages are no longer unconditionally removed from the
3237  * paging queues, so the page may already be on a queue.  Move the page
3238  * to the desired queue if necessary.
3239  *
3240  * Many pages placed on the inactive queue should actually go
3241  * into the cache, but it is difficult to figure out which.  What
3242  * we do instead, if the inactive target is well met, is to put
3243  * clean pages at the head of the inactive queue instead of the tail.
3244  * This will cause them to be moved to the cache more quickly and
3245  * if not actively re-referenced, freed more quickly.  If we just
3246  * stick these pages at the end of the inactive queue, heavy filesystem
3247  * meta-data accesses can cause an unnecessary paging load on memory bound
3248  * processes.  This optimization causes one-time-use metadata to be
3249  * reused more quickly.
3250  *
3251  * Pages marked PG_NEED_COMMIT are always activated and never placed on
3252  * the inactive queue.  This helps the pageout daemon determine memory
3253  * pressure and act on out-of-memory situations more quickly.
3254  *
3255  * BUT, if we are in a low-memory situation we have no choice but to
3256  * put clean pages on the cache queue.
3257  *
3258  * A number of routines use vm_page_unwire() to guarantee that the page
3259  * will go into either the inactive or active queues, and will NEVER
3260  * be placed in the cache - for example, just after dirtying a page.
3261  * Dirty pages in the cache are not allowed.
3262  *
3263  * PG_FICTITIOUS or PG_UNQUEUED pages are never moved to any queue, and
3264  * the wire_count will not be adjusted in any way for a PG_FICTITIOUS
3265  * page.
3266  *
3267  * This routine may not block.
3268  */
3269 void
3270 vm_page_unwire(vm_page_t m, int activate)
3271 {
3272 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3273 	if (m->flags & PG_FICTITIOUS) {
3274 		/* do nothing */
3275 	} else if ((int)m->wire_count <= 0) {
3276 		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
3277 	} else {
3278 		if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
3279 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count,-1);
3280 			if (m->flags & PG_UNQUEUED) {
3281 				;
3282 			} else if (activate || (m->flags & PG_NEED_COMMIT)) {
3283 				vm_page_activate(m);
3284 			} else {
3285 				vm_page_deactivate(m);
3286 			}
3287 		}
3288 	}
3289 }
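
/*
 * Editor's illustrative sketch (not part of the original source): the
 * common wire/unwire bracket used by buffer-cache-style consumers.  The
 * page is busied across each transition; unwiring with activate=0 lets
 * the page fall to the inactive queue per the policy described above.
 * The function name and wmesgs are hypothetical.
 */
#if 0
static void
example_wire_cycle(vm_page_t m)
{
	vm_page_busy_wait(m, FALSE, "xwire");
	vm_page_wire(m);
	vm_page_wakeup(m);

	/* ... the page cannot be paged out while wired ... */

	vm_page_busy_wait(m, FALSE, "xunwr");
	vm_page_unwire(m, 0);
	vm_page_wakeup(m);
}
#endif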
3290 
3291 /*
3292  * Move the specified page to the inactive queue.
3293  *
3294  * Normally athead is 0 resulting in LRU operation.  athead is set
3295  * to 1 if we want this page to be 'as if it were placed in the cache',
3296  * except without unmapping it from the process address space.
3297  *
3298  * vm_page's spinlock must be held on entry and will remain held on return.
3299  * This routine may not block.  The caller does not have to hold the page
3300  * busied but should have some sort of interlock on its validity.
3301  *
3302  * It is ok if the page is wired (so buffer cache operations don't have
3303  * to mess with the page queues).
3304  */
3305 static void
3306 _vm_page_deactivate_locked(vm_page_t m, int athead)
3307 {
3308 	u_short oqueue;
3309 
3310 	/*
3311 	 * Ignore if already inactive.
3312 	 */
3313 	if (m->queue - m->pc == PQ_INACTIVE ||
3314 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3315 		return;
3316 	}
3317 
3318 	_vm_page_queue_spin_lock(m);
3319 	oqueue = _vm_page_rem_queue_spinlocked(m);
3320 
3321 	if ((m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3322 		if (oqueue == PQ_CACHE)
3323 			mycpu->gd_cnt.v_reactivated++;
3324 		vm_page_flag_clear(m, PG_WINATCFLS);
3325 		_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
3326 		if (athead == 0) {
3327 			atomic_add_long(
3328 				&vm_page_queues[PQ_INACTIVE + m->pc].adds, 1);
3329 		}
3330 	}
3331 	/* NOTE: PQ_NONE if condition not taken */
3332 	_vm_page_queue_spin_unlock(m);
3333 	/* leaves vm_page spinlocked */
3334 }
3335 
3336 /*
3337  * Attempt to deactivate a page.
3338  *
3339  * No requirements.  We can pre-filter before getting the spinlock.
3340  *
3341  * It is ok if the page is wired (so buffer cache operations don't have
3342  * to mess with the page queues).
3343  */
3344 void
3345 vm_page_deactivate(vm_page_t m)
3346 {
3347 	if (m->queue - m->pc != PQ_INACTIVE &&
3348 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3349 		vm_page_spin_lock(m);
3350 		_vm_page_deactivate_locked(m, 0);
3351 		vm_page_spin_unlock(m);
3352 	}
3353 }
3354 
3355 void
3356 vm_page_deactivate_locked(vm_page_t m)
3357 {
3358 	_vm_page_deactivate_locked(m, 0);
3359 }
3360 
3361 /*
3362  * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy it.
3363  *
3364  * This function returns non-zero if it successfully moved the page to
3365  * PQ_CACHE.
3366  *
3367  * This function unconditionally unbusies the page on return.
3368  */
3369 int
3370 vm_page_try_to_cache(vm_page_t m)
3371 {
3372 	/*
3373 	 * Shortcut if we obviously cannot move the page, or if the
3374 	 * page is already on the cache queue, or it is fictitious.
3375 	 *
3376 	 * Never allow a wired page into the cache.
3377 	 */
3378 	if (m->dirty || m->hold_count || m->wire_count ||
3379 	    m->queue - m->pc == PQ_CACHE ||
3380 	    (m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS))) {
3381 		vm_page_wakeup(m);
3382 		return(0);
3383 	}
3384 
3385 	/*
3386 	 * Page busied by us and no longer spinlocked.  Dirty pages cannot
3387 	 * be moved to the cache, but can be deactivated.  However, users
3388 	 * of this function want to move pages closer to the cache so we
3389 	 * only deactivate it if it is in PQ_ACTIVE.  We do not re-deactivate.
3390 	 */
3391 	vm_page_test_dirty(m);
3392 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3393 		if (m->queue - m->pc == PQ_ACTIVE)
3394 			vm_page_deactivate(m);
3395 		vm_page_wakeup(m);
3396 		return(0);
3397 	}
3398 	vm_page_cache(m);
3399 	return(1);
3400 }
3401 
3402 /*
3403  * Attempt to free the page.  If we cannot free it, we do nothing.
3404  * 1 is returned on success, 0 on failure.
3405  *
3406  * The page can be in any state, including already being on the free
3407  * queue.  Check to see if it really can be freed.  Note that we disallow
3408  * this ad-hoc operation if the page is flagged PG_UNQUEUED.
3409  *
3410  * Caller provides an unlocked/non-busied page.
3411  * No requirements.
3412  */
3413 int
3414 vm_page_try_to_free(vm_page_t m)
3415 {
3416 	if (vm_page_busy_try(m, TRUE))
3417 		return(0);
3418 
3419 	if (m->dirty ||				/* can't free if it is dirty */
3420 	    m->hold_count ||			/* or held (XXX may be wrong) */
3421 	    m->wire_count ||			/* or wired */
3422 	    (m->flags & (PG_UNQUEUED |		/* or unqueued */
3423 			 PG_NEED_COMMIT |	/* or needs a commit */
3424 			 PG_FICTITIOUS)) ||	/* or is fictitious */
3425 	    m->queue - m->pc == PQ_FREE ||	/* already on PQ_FREE */
3426 	    m->queue - m->pc == PQ_HOLD) {	/* already on PQ_HOLD */
3427 		vm_page_wakeup(m);
3428 		return(0);
3429 	}
3430 
3431 	/*
3432 	 * We can probably free the page.
3433 	 *
3434 	 * Page busied by us and no longer spinlocked.  Dirty pages will
3435 	 * not be freed by this function.    We have to re-test the
3436 	 * not be freed by this function.  We have to re-test the
3437 	 */
3438 	vm_page_test_dirty(m);
3439 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3440 		vm_page_wakeup(m);
3441 		return(0);
3442 	}
3443 	vm_page_protect(m, VM_PROT_NONE);
3444 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3445 		vm_page_wakeup(m);
3446 		return(0);
3447 	}
3448 	vm_page_free(m);
3449 	return(1);
3450 }
3451 
3452 /*
3453  * vm_page_cache
3454  *
3455  * Put the specified page onto the page cache queue (if appropriate).
3456  *
3457  * The page must be busy, and this routine will release the busy and
3458  * possibly even free the page.
3459  */
3460 void
3461 vm_page_cache(vm_page_t m)
3462 {
3463 	/*
3464 	 * Not suitable for the cache
3465 	 */
3466 	if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS)) ||
3467 	    (m->busy_count & PBUSY_MASK) ||
3468 	    m->wire_count || m->hold_count) {
3469 		vm_page_wakeup(m);
3470 		return;
3471 	}
3472 
3473 	/*
3474 	 * Already in the cache (and thus not mapped)
3475 	 */
3476 	if ((m->queue - m->pc) == PQ_CACHE) {
3477 		KKASSERT((m->flags & PG_MAPPED) == 0);
3478 		vm_page_wakeup(m);
3479 		return;
3480 	}
3481 
3482 #if 0
3483 	/*
3484 	 * REMOVED - it is possible for dirty to get set at any time as
3485 	 *	     long as the page is still mapped and writeable.
3486 	 *
3487 	 * Caller is required to test m->dirty, but note that the act of
3488 	 * removing the page from its maps can cause it to become dirty
3489 	 * on an SMP system due to another cpu running in usermode.
3490 	 */
3491 	if (m->dirty) {
3492 		panic("vm_page_cache: caching a dirty page, pindex: %ld",
3493 			(long)m->pindex);
3494 	}
3495 #endif
3496 
3497 	/*
3498 	 * Remove all pmaps and indicate that the page is not
3499 	 * writeable or mapped.  Our vm_page_protect() call may
3500 	 * have blocked (especially w/ VM_PROT_NONE), so recheck
3501 	 * everything.
3502 	 */
3503 	if (m->flags & (PG_MAPPED | PG_WRITEABLE)) {
3504 		vm_page_protect(m, VM_PROT_NONE);
3505 		pmap_mapped_sync(m);
3506 	}
3507 	if ((m->flags & (PG_UNQUEUED | PG_MAPPED)) ||
3508 	    (m->busy_count & PBUSY_MASK) ||
3509 	    m->wire_count || m->hold_count) {
3510 		vm_page_wakeup(m);
3511 	} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3512 		vm_page_deactivate(m);
3513 		vm_page_wakeup(m);
3514 	} else {
3515 		_vm_page_and_queue_spin_lock(m);
3516 		_vm_page_rem_queue_spinlocked(m);
3517 		_vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
3518 		_vm_page_and_queue_spin_unlock(m);
3519 		vm_page_wakeup(m);
3520 		vm_page_free_wakeup();
3521 	}
3522 }
3523 
3524 /*
3525  * vm_page_dontneed()
3526  *
3527  * Cache, deactivate, or do nothing as appropriate.  This routine
3528  * is typically used by madvise() MADV_DONTNEED.
3529  *
3530  * Generally speaking we want to move the page into the cache so
3531  * it gets reused quickly.  However, this can result in a silly syndrome
3532  * due to the page recycling too quickly.  Small objects will not be
3533  * fully cached.  On the other hand, if we move the page to the inactive
3534  * queue we wind up with a problem whereby very large objects
3535  * unnecessarily blow away our inactive and cache queues.
3536  *
3537  * The solution is to move the pages based on a fixed weighting.  We
3538  * either leave them alone, deactivate them, or move them to the cache,
3539  * where moving them to the cache has the highest weighting.
3540  * By forcing some pages into other queues we eventually force the
3541  * system to balance the queues, potentially recovering other unrelated
3542  * space from active.  The idea is to not force this to happen too
3543  * often.
3544  *
3545  * The page must be busied.
3546  */
3547 void
3548 vm_page_dontneed(vm_page_t m)
3549 {
3550 	static int dnweight;
3551 	int dnw;
3552 	int head;
3553 
3554 	dnw = ++dnweight;
3555 
3556 	/*
3557 	 * occassionally leave the page alone
3558 	 * occasionally leave the page alone
3559 	if ((dnw & 0x01F0) == 0 ||
3560 	    m->queue - m->pc == PQ_INACTIVE ||
3561 	    m->queue - m->pc == PQ_CACHE
3562 	) {
3563 		if (m->act_count >= ACT_INIT)
3564 			--m->act_count;
3565 		return;
3566 	}
3567 
3568 	/*
3569 	 * If vm_page_dontneed() is inactivating a page, it must clear
3570 	 * the referenced flag; otherwise the pagedaemon will see references
3571 	 * on the page in the inactive queue and reactivate it. Until the
3572 	 * page can move to the cache queue, madvise's job is not done.
3573 	 */
3574 	vm_page_flag_clear(m, PG_REFERENCED);
3575 	pmap_clear_reference(m);
3576 
3577 	if (m->dirty == 0)
3578 		vm_page_test_dirty(m);
3579 
3580 	if (m->dirty || (dnw & 0x0070) == 0) {
3581 		/*
3582 		 * Deactivate the page 3 times out of 32.
3583 		 */
3584 		head = 0;
3585 	} else {
3586 		/*
3587 		 * 'Cache' the page 28 times out of every 32.  The page is
3588 		 * actually deactivated rather than cached, but it is placed
3589 		 * at the head of the inactive queue instead of the tail.
3590 		 */
3591 		head = 1;
3592 	}
3593 	vm_page_spin_lock(m);
3594 	_vm_page_deactivate_locked(m, head);
3595 	vm_page_spin_unlock(m);
3596 }
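
/*
 * Worked example for the weighting above (a sketch, not part of the
 * original file).  dnweight simply increments on every call, so over
 * any window of 512 consecutive calls, ignoring the queue checks and
 * the dirty-page override:
 *
 *	(dnw & 0x01F0) == 0	 16/512 =  1/32: leave the page alone
 *	(dnw & 0x0070) == 0	 64/512 =  4/32, of which 1/32 already
 *				 fell into the leave-alone case, leaving
 *				 3/32: deactivate at the tail
 *	everything else		448/512 = 28/32: deactivate at the head
 *				 (the "cache" weighting)
 */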
3597 
3598 /*
3599  * These routines manipulate the 'soft busy' count for a page.  A soft busy
3600  * is almost like a hard BUSY except that it allows certain compatible
3601  * operations to occur on the page while it is busy.  For example, a page
3602  * undergoing a write can still be mapped read-only.
3603  *
3604  * We also use soft-busy to quickly pmap_enter shared read-only pages
3605  * without having to hold the page locked.
3606  *
3607  * The soft-busy count can be > 1 in situations where multiple threads
3608  * are pmap_enter()ing the same page simultaneously, or when two buffer
3609  * cache buffers overlap the same page.
3610  *
3611  * The caller must hold the page BUSY when making these two calls.
3612  */
3613 void
3614 vm_page_io_start(vm_page_t m)
3615 {
3616 	uint32_t ocount;
3617 
3618 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
3619 	KKASSERT(ocount & PBUSY_LOCKED);
3620 }
3621 
3622 void
3623 vm_page_io_finish(vm_page_t m)
3624 {
3625 	uint32_t ocount;
3626 
3627 	ocount = atomic_fetchadd_int(&m->busy_count, -1);
3628 	KKASSERT(ocount & PBUSY_MASK);
3629 #if 0
3630 	if (((ocount - 1) & (PBUSY_LOCKED | PBUSY_MASK)) == 0)
3631 		wakeup(m);
3632 #endif
3633 }
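
/*
 * Illustrative sketch, not part of the original file: the typical
 * pattern brackets an I/O on a hard-busied page with the soft-busy
 * count so related code can tell that I/O is in progress.
 */
#if 0
static void
example_sync_write(vm_page_t m)		/* hypothetical, 'm' is busied */
{
	vm_page_io_start(m);		/* requires PBUSY_LOCKED */
	/* ... perform the write while the page remains busied ... */
	vm_page_io_finish(m);		/* drop the soft-busy again */
	vm_page_wakeup(m);
}
#endif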
3634 
3635 /*
3636  * Attempt to soft-busy a page.  The page must not be PBUSY_LOCKED.
3637  *
3638  * We can't use fetchadd here because we might race a hard-busy and the
3639  * page freeing code asserts on a non-zero soft-busy count (even if only
3640  * temporary).
3641  *
3642  * Returns 0 on success, non-zero on failure.
3643  */
3644 int
3645 vm_page_sbusy_try(vm_page_t m)
3646 {
3647 	uint32_t ocount;
3648 
3649 	for (;;) {
3650 		ocount = m->busy_count;
3651 		cpu_ccfence();
3652 		if (ocount & PBUSY_LOCKED)
3653 			return 1;
3654 		if (atomic_cmpset_int(&m->busy_count, ocount, ocount + 1))
3655 			break;
3656 	}
3657 	return 0;
3658 #if 0
3659 	if (m->busy_count & PBUSY_LOCKED)
3660 		return 1;
3661 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
3662 	if (ocount & PBUSY_LOCKED) {
3663 		vm_page_sbusy_drop(m);
3664 		return 1;
3665 	}
3666 	return 0;
3667 #endif
3668 }
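
/*
 * Illustrative sketch, not part of the original file: opportunistically
 * soft-busy a page and fall back to a slower path if someone holds it
 * hard-busy.  vm_page_sbusy_drop() releases the soft-busy obtained here.
 */
#if 0
static int
example_fast_path(vm_page_t m)
{
	if (vm_page_sbusy_try(m))
		return (1);	/* hard-busied elsewhere, take slow path */
	/* soft-busied: e.g. safe to pmap_enter the page read-only */
	vm_page_sbusy_drop(m);
	return (0);
}
#endif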
3669 
3670 /*
3671  * Indicate that a clean VM page requires a filesystem commit and cannot
3672  * be reused.  Used by tmpfs.
3673  */
3674 void
3675 vm_page_need_commit(vm_page_t m)
3676 {
3677 	vm_page_flag_set(m, PG_NEED_COMMIT);
3678 	vm_object_set_writeable_dirty(m->object);
3679 }
3680 
3681 void
3682 vm_page_clear_commit(vm_page_t m)
3683 {
3684 	vm_page_flag_clear(m, PG_NEED_COMMIT);
3685 }
3686 
3687 /*
3688  * Grab a page, blocking if it is busy and allocating a page if necessary.
3689  * A busy page is returned or NULL.  The page may or may not be valid and
3690  * might not be on a queue (the caller is responsible for the disposition of
3691  * the page).
3692  *
3693  * If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the
3694  * page will be zero'd and marked valid.
3695  *
3696  * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
3697  * valid even if it already exists.
3698  *
3699  * If VM_ALLOC_RETRY is specified this routine will never return NULL.  Also
3700  * note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified.
3701  * VM_ALLOC_NULL_OK is implied when VM_ALLOC_RETRY is specified.
3702  *
3703  * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
3704  * always returned if we had blocked.
3705  *
3706  * This routine may not be called from an interrupt.
3707  *
3708  * No other requirements.
3709  */
3710 vm_page_t
3711 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
3712 {
3713 	vm_page_t m;
3714 	int error;
3715 	int shared = 1;
3716 
3717 	KKASSERT(allocflags &
3718 		(VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
3719 	vm_object_hold_shared(object);
3720 	for (;;) {
3721 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
3722 		if (error) {
3723 			vm_page_sleep_busy(m, TRUE, "pgrbwt");
3724 			if ((allocflags & VM_ALLOC_RETRY) == 0) {
3725 				m = NULL;
3726 				break;
3727 			}
3728 			/* retry */
3729 		} else if (m == NULL) {
3730 			if (shared) {
3731 				vm_object_upgrade(object);
3732 				shared = 0;
3733 			}
3734 			if (allocflags & VM_ALLOC_RETRY)
3735 				allocflags |= VM_ALLOC_NULL_OK;
3736 			m = vm_page_alloc(object, pindex,
3737 					  allocflags & ~VM_ALLOC_RETRY);
3738 			if (m)
3739 				break;
3740 			vm_wait(0);
3741 			if ((allocflags & VM_ALLOC_RETRY) == 0)
3742 				goto failed;
3743 		} else {
3744 			/* m found */
3745 			break;
3746 		}
3747 	}
3748 
3749 	/*
3750 	 * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
3751 	 *
3752 	 * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
3753 	 * valid even if already valid.
3754 	 *
3755 	 * NOTE!  We have removed all of the PG_ZERO optimizations and also
3756 	 *	  removed the idle zeroing code.  These optimizations actually
3757 	 *	  slow things down on modern cpus because the zero'd area is
3758 	 *	  likely uncached, placing a memory-access burden on the
3759 	 *	  accessors taking the fault.
3760 	 *
3761 	 *	  By always zeroing the page in-line with the fault, no
3762 	 *	  dynamic ram reads are needed and the caches are hot, ready
3763 	 *	  for userland to access the memory.
3764 	 */
3765 	if (m->valid == 0) {
3766 		if (allocflags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
3767 			pmap_zero_page(VM_PAGE_TO_PHYS(m));
3768 			m->valid = VM_PAGE_BITS_ALL;
3769 		}
3770 	} else if (allocflags & VM_ALLOC_FORCE_ZERO) {
3771 		pmap_zero_page(VM_PAGE_TO_PHYS(m));
3772 		m->valid = VM_PAGE_BITS_ALL;
3773 	}
3774 failed:
3775 	vm_object_drop(object);
3776 	return(m);
3777 }
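
/*
 * Illustrative sketch, not part of the original file: typical
 * vm_page_grab() usage.  With VM_ALLOC_NORMAL | VM_ALLOC_RETRY the call
 * never returns NULL, and with VM_ALLOC_ZERO a newly allocated page
 * comes back zero'd and fully valid.
 */
#if 0
static vm_page_t
example_grab_zeroed(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_page_grab(object, pindex,
			 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO);
	/* 'm' is returned hard-busied; the caller disposes of it */
	return (m);
}
#endif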
3778 
3779 /*
3780  * Mapping function for valid bits or for dirty bits in
3781  * a page.  May not block.
3782  *
3783  * Inputs are required to range within a page.
3784  *
3785  * No requirements.
3786  * Non blocking.
3787  */
3788 int
3789 vm_page_bits(int base, int size)
3790 {
3791 	int first_bit;
3792 	int last_bit;
3793 
3794 	KASSERT(
3795 	    base + size <= PAGE_SIZE,
3796 	    ("vm_page_bits: illegal base/size %d/%d", base, size)
3797 	);
3798 
3799 	if (size == 0)		/* handle degenerate case */
3800 		return(0);
3801 
3802 	first_bit = base >> DEV_BSHIFT;
3803 	last_bit = (base + size - 1) >> DEV_BSHIFT;
3804 
3805 	return ((2 << last_bit) - (1 << first_bit));
3806 }
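
/*
 * Worked example (a sketch, not part of the original file), assuming
 * DEV_BSIZE = 512 (DEV_BSHIFT = 9) and PAGE_SIZE = 4096, i.e. 8 bits:
 *
 *	vm_page_bits(0, 4096)	-> first_bit 0, last_bit 7 -> 0xff
 *	vm_page_bits(512, 1024)	-> first_bit 1, last_bit 2 -> 0x06
 *	vm_page_bits(100, 1)	-> first_bit 0, last_bit 0 -> 0x01
 */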
3807 
3808 /*
3809  * Sets portions of a page valid and clean.  The arguments are expected
3810  * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
3811  * of any partial chunks touched by the range.  The invalid portion of
3812  * such chunks will be zero'd.
3813  *
3814  * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
3815  *	 align base to DEV_BSIZE so as not to mark clean a partially
3816  *	 truncated device block.  Otherwise the dirty page status might be
3817  *	 lost.
3818  *
3819  * This routine may not block.
3820  *
3821  * (base + size) must be less than or equal to PAGE_SIZE.
3822  */
3823 static void
3824 _vm_page_zero_valid(vm_page_t m, int base, int size)
3825 {
3826 	int frag;
3827 	int endoff;
3828 
3829 	if (size == 0)	/* handle degenerate case */
3830 		return;
3831 
3832 	/*
3833 	 * If the base is not DEV_BSIZE aligned and the valid
3834 	 * bit is clear, we have to zero out a portion of the
3835 	 * first block.
3836 	 */
3837 
3838 	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
3839 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
3840 	) {
3841 		pmap_zero_page_area(
3842 		    VM_PAGE_TO_PHYS(m),
3843 		    frag,
3844 		    base - frag
3845 		);
3846 	}
3847 
3848 	/*
3849 	 * If the ending offset is not DEV_BSIZE aligned and the
3850 	 * valid bit is clear, we have to zero out a portion of
3851 	 * the last block.
3852 	 */
3853 
3854 	endoff = base + size;
3855 
3856 	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
3857 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
3858 	) {
3859 		pmap_zero_page_area(
3860 		    VM_PAGE_TO_PHYS(m),
3861 		    endoff,
3862 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
3863 		);
3864 	}
3865 }
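
/*
 * Worked example (a sketch, not part of the original file), assuming
 * DEV_BSIZE = 512: _vm_page_zero_valid(m, 100, 1000) covers bytes
 * 100-1099.  If the chunk holding byte 100 (block 0) is not yet valid,
 * bytes 0-99 are zero'd; if the chunk holding byte 1099 (block 2) is
 * not yet valid, bytes 1100-1535 are zero'd.
 */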
3866 
3867 /*
3868  * Set portions of the page valid.  The arguments are expected to be
3869  * DEV_BSIZE aligned; if they are not, the invalid portion of any
3870  * partially covered DEV_BSIZE chunk is zero'd first via
3871  * _vm_page_zero_valid() so that stale data is not exposed.
3872  *
3873  * We set valid bits inclusive of any overlap.  Unlike
3874  * vm_page_set_validclean() below, this function does not clear dirty
3875  * bits, does not clear the pmap modify bit, and does not touch the
3876  * PG_NOSYNC flag.
3877  *
3878  * Page must be busied?
3879  * No other requirements.
3880  */
3881 void
3882 vm_page_set_valid(vm_page_t m, int base, int size)
3883 {
3884 	_vm_page_zero_valid(m, base, size);
3885 	m->valid |= vm_page_bits(base, size);
3886 }
3887 
3888 
3889 /*
3890  * Set valid bits and clear dirty bits.
3891  *
3892  * Page must be busied by caller.
3893  *
3894  * NOTE: This function does not clear the pmap modified bit.
3895  *	 Also note that e.g. NFS may use a byte-granular base
3896  *	 and size.
3897  *
3898  * No other requirements.
3899  */
3900 void
3901 vm_page_set_validclean(vm_page_t m, int base, int size)
3902 {
3903 	int pagebits;
3904 
3905 	_vm_page_zero_valid(m, base, size);
3906 	pagebits = vm_page_bits(base, size);
3907 	m->valid |= pagebits;
3908 	m->dirty &= ~pagebits;
3909 	if (base == 0 && size == PAGE_SIZE) {
3910 		/*pmap_clear_modify(m);*/
3911 		vm_page_flag_clear(m, PG_NOSYNC);
3912 	}
3913 }
3914 
3915 /*
3916  * Set valid & dirty.  Used by buwrite()
3917  *
3918  * Page must be busied by caller.
3919  */
3920 void
3921 vm_page_set_validdirty(vm_page_t m, int base, int size)
3922 {
3923 	int pagebits;
3924 
3925 	pagebits = vm_page_bits(base, size);
3926 	m->valid |= pagebits;
3927 	m->dirty |= pagebits;
3928 	if (m->object)
3929 	       vm_object_set_writeable_dirty(m->object);
3930 }
3931 
3932 /*
3933  * Clear dirty bits.
3934  *
3935  * NOTE: This function does not clear the pmap modified bit.
3936  *	 Also note that e.g. NFS may use a byte-granular base
3937  *	 and size.
3938  *
3939  * Page must be busied?
3940  * No other requirements.
3941  */
3942 void
3943 vm_page_clear_dirty(vm_page_t m, int base, int size)
3944 {
3945 	m->dirty &= ~vm_page_bits(base, size);
3946 	if (base == 0 && size == PAGE_SIZE) {
3947 		/*pmap_clear_modify(m);*/
3948 		vm_page_flag_clear(m, PG_NOSYNC);
3949 	}
3950 }
3951 
3952 /*
3953  * Make the page all-dirty.
3954  *
3955  * Also make sure the related object and vnode reflect the fact that the
3956  * object may now contain a dirty page.
3957  *
3958  * Page must be busied?
3959  * No other requirements.
3960  */
3961 void
3962 vm_page_dirty(vm_page_t m)
3963 {
3964 #ifdef INVARIANTS
3965 	int pqtype = m->queue - m->pc;
3966 #endif
3967 	KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
3968 		("vm_page_dirty: page in free/cache queue!"));
3969 	if (m->dirty != VM_PAGE_BITS_ALL) {
3970 		m->dirty = VM_PAGE_BITS_ALL;
3971 		if (m->object)
3972 			vm_object_set_writeable_dirty(m->object);
3973 	}
3974 }
3975 
3976 /*
3977  * Invalidates DEV_BSIZE'd chunks within a page.  Both the
3978  * valid and dirty bits for the affected areas are cleared.
3979  *
3980  * Page must be busied?
3981  * Does not block.
3982  * No other requirements.
3983  */
3984 void
3985 vm_page_set_invalid(vm_page_t m, int base, int size)
3986 {
3987 	int bits;
3988 
3989 	bits = vm_page_bits(base, size);
3990 	m->valid &= ~bits;
3991 	m->dirty &= ~bits;
3992 	atomic_add_int(&m->object->generation, 1);
3993 }
3994 
3995 /*
3996  * The kernel assumes that the invalid portions of a page contain
3997  * garbage, but such pages can be mapped into memory by user code.
3998  * When this occurs, we must zero out the non-valid portions of the
3999  * page so user code sees what it expects.
4000  *
4001  * Pages are most often semi-valid when the end of a file is mapped
4002  * into memory and the file's size is not page aligned.
4003  *
4004  * Page must be busied?
4005  * No other requirements.
4006  */
4007 void
4008 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
4009 {
4010 	int b;
4011 	int i;
4012 
4013 	/*
4014 	 * Scan the valid bits looking for invalid sections that
4015 	 * must be zero'd.  Invalid sub-DEV_BSIZE'd areas (where the
4016 	 * valid bit may be set) have already been zero'd by
4017 	 * vm_page_set_validclean().
4018 	 */
4019 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
4020 		if (i == (PAGE_SIZE / DEV_BSIZE) ||
4021 		    (m->valid & (1 << i))
4022 		) {
4023 			if (i > b) {
4024 				pmap_zero_page_area(
4025 				    VM_PAGE_TO_PHYS(m),
4026 				    b << DEV_BSHIFT,
4027 				    (i - b) << DEV_BSHIFT
4028 				);
4029 			}
4030 			b = i + 1;
4031 		}
4032 	}
4033 
4034 	/*
4035 	 * setvalid is TRUE when we can safely set the zero'd areas
4036 	 * as being valid.  We can do this if there are no cache consistency
4037 	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
4038 	 */
4039 	if (setvalid)
4040 		m->valid = VM_PAGE_BITS_ALL;
4041 }
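
/*
 * Worked example (a sketch, not part of the original file), assuming
 * DEV_BSIZE = 512 and PAGE_SIZE = 4096: with m->valid == 0x07 only the
 * first three 512-byte blocks are valid, so the scan above zeroes the
 * single invalid run covering blocks 3-7 (bytes 1536-4095) with one
 * pmap_zero_page_area() call.
 */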
4042 
4043 /*
4044  * Is a (partial) page valid?  In the degenerate case where size == 0,
4045  * this returns FALSE if the page is entirely invalid and TRUE
4046  * otherwise.
4047  *
4048  * Does not block.
4049  * No other requirements.
4050  */
4051 int
4052 vm_page_is_valid(vm_page_t m, int base, int size)
4053 {
4054 	int bits = vm_page_bits(base, size);
4055 
4056 	if (m->valid && ((m->valid & bits) == bits))
4057 		return 1;
4058 	else
4059 		return 0;
4060 }
4061 
4062 /*
4063  * Update dirty bits from pmap/mmu.  May not block.
4064  *
4065  * Caller must hold the page busy
4066  *
4067  * WARNING! Unless the page has been unmapped, this function only
4068  *	    provides a likely dirty status.
4069  */
4070 void
4071 vm_page_test_dirty(vm_page_t m)
4072 {
4073 	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) {
4074 		vm_page_dirty(m);
4075 	}
4076 }
4077 
4078 #include "opt_ddb.h"
4079 #ifdef DDB
4080 #include <ddb/ddb.h>
4081 
4082 DB_SHOW_COMMAND(page, vm_page_print_page_info)
4083 {
4084 	db_printf("vmstats.v_free_count: %ld\n", vmstats.v_free_count);
4085 	db_printf("vmstats.v_cache_count: %ld\n", vmstats.v_cache_count);
4086 	db_printf("vmstats.v_inactive_count: %ld\n", vmstats.v_inactive_count);
4087 	db_printf("vmstats.v_active_count: %ld\n", vmstats.v_active_count);
4088 	db_printf("vmstats.v_wire_count: %ld\n", vmstats.v_wire_count);
4089 	db_printf("vmstats.v_free_reserved: %ld\n", vmstats.v_free_reserved);
4090 	db_printf("vmstats.v_free_min: %ld\n", vmstats.v_free_min);
4091 	db_printf("vmstats.v_free_target: %ld\n", vmstats.v_free_target);
4092 	db_printf("vmstats.v_cache_min: %ld\n", vmstats.v_cache_min);
4093 	db_printf("vmstats.v_inactive_target: %ld\n",
4094 		  vmstats.v_inactive_target);
4095 }
4096 
4097 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
4098 {
4099 	int i;
4100 	db_printf("PQ_FREE:");
4101 	for (i = 0; i < PQ_L2_SIZE; i++) {
4102 		db_printf(" %ld", vm_page_queues[PQ_FREE + i].lcnt);
4103 	}
4104 	db_printf("\n");
4105 
4106 	db_printf("PQ_CACHE:");
4107 	for (i = 0; i < PQ_L2_SIZE; i++) {
4108 		db_printf(" %ld", vm_page_queues[PQ_CACHE + i].lcnt);
4109 	}
4110 	db_printf("\n");
4111 
4112 	db_printf("PQ_ACTIVE:");
4113 	for (i = 0; i < PQ_L2_SIZE; i++) {
4114 		db_printf(" %ld", vm_page_queues[PQ_ACTIVE + i].lcnt);
4115 	}
4116 	db_printf("\n");
4117 
4118 	db_printf("PQ_INACTIVE:");
4119 	for (i = 0; i < PQ_L2_SIZE; i++) {
4120 		db_printf(" %ld", vm_page_queues[PQ_INACTIVE + i].lcnt);
4121 	}
4122 	db_printf("\n");
4123 }
4124 #endif /* DDB */
4125