xref: /dragonfly/sys/vm/vm_page.c (revision 3a8f8248)
1 /*
2  * Copyright (c) 2003-2019 The DragonFly Project.  All rights reserved.
3  * Copyright (c) 1991 Regents of the University of California.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
37  * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
38  */
39 
40 /*
41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42  * All rights reserved.
43  *
44  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45  *
46  * Permission to use, copy, modify and distribute this software and
47  * its documentation is hereby granted, provided that both the copyright
48  * notice and this permission notice appear in all copies of the
49  * software, derivative works or modified versions, and any portions
50  * thereof, and that both notices appear in supporting documentation.
51  *
52  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55  *
56  * Carnegie Mellon requests users of this software to return to
57  *
58  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59  *  School of Computer Science
60  *  Carnegie Mellon University
61  *  Pittsburgh PA 15213-3890
62  *
63  * any improvements or extensions that they make and grant Carnegie the
64  * rights to redistribute these changes.
65  */
66 /*
67  * Resident memory management module.  The module manipulates 'VM pages'.
68  * A VM page is the core building block for memory management.
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/malloc.h>
74 #include <sys/proc.h>
75 #include <sys/vmmeter.h>
76 #include <sys/vnode.h>
77 #include <sys/kernel.h>
78 #include <sys/alist.h>
79 #include <sys/sysctl.h>
80 #include <sys/cpu_topology.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <sys/lock.h>
85 #include <vm/vm_kern.h>
86 #include <vm/pmap.h>
87 #include <vm/vm_map.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_pageout.h>
91 #include <vm/vm_pager.h>
92 #include <vm/vm_extern.h>
93 #include <vm/swap_pager.h>
94 
95 #include <machine/inttypes.h>
96 #include <machine/md_var.h>
97 #include <machine/specialreg.h>
98 #include <machine/bus_dma.h>
99 
100 #include <vm/vm_page2.h>
101 #include <sys/spinlock2.h>
102 
103 /*
104  * Cache necessary elements in the hash table itself to avoid indirecting
105  * through random vm_page's when doing a lookup.  The hash table is
106  * heuristical and it is ok for races to mess up any or all fields.
107  */
108 struct vm_page_hash_elm {
109 	vm_page_t	m;
110 	vm_object_t	object;	/* heuristical */
111 	vm_pindex_t	pindex;	/* heuristical */
112 	int		ticks;
113 	int		unused;
114 };
115 
116 #define VM_PAGE_HASH_SET	4		    /* power of 2, set-assoc */
117 #define VM_PAGE_HASH_MAX	(8 * 1024 * 1024)   /* power of 2, max size */
118 
119 /*
120  * SET - Minimum required set associative size, must be a power of 2.  We
121  *	 want this to match or exceed the set-associativeness of the cpu,
122  *	 up to a reasonable limit (we will use 16).
123  */
124 __read_mostly static int set_assoc_mask = 16 - 1;
125 
126 static void vm_page_queue_init(void);
127 static void vm_page_free_wakeup(void);
128 static vm_page_t vm_page_select_cache(u_short pg_color);
129 static vm_page_t _vm_page_list_find_wide(int basequeue, int index, int *lastp);
130 static vm_page_t _vm_page_list_find2_wide(int bq1, int bq2, int index,
131 			int *lastp1, int *lastp);
132 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
133 static void vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes);
134 
135 /*
136  * Array of tailq lists
137  */
138 struct vpgqueues vm_page_queues[PQ_COUNT];
139 
140 static volatile int vm_pages_waiting;
141 static struct alist vm_contig_alist;
142 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
143 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
144 
145 __read_mostly static int vm_page_hash_vnode_only;
146 __read_mostly static int vm_page_hash_size;
147 __read_mostly static struct vm_page_hash_elm *vm_page_hash;
148 
149 static u_long vm_dma_reserved = 0;
150 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
151 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
152 	    "Memory reserved for DMA");
153 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
154 	    &vm_contig_alist.bl_free, 0, "Memory reserved for DMA");
155 
156 SYSCTL_INT(_vm, OID_AUTO, page_hash_vnode_only, CTLFLAG_RW,
157 	    &vm_page_hash_vnode_only, 0, "Only hash vnode pages");
158 #if 0
159 static int vm_page_hash_debug;
160 SYSCTL_INT(_vm, OID_AUTO, page_hash_debug, CTLFLAG_RW,
161 	    &vm_page_hash_debug, 0, "Only hash vnode pages");
162 #endif
163 
164 static int vm_contig_verbose = 0;
165 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
166 
167 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
168 	     vm_pindex_t, pindex);
169 
170 static void
171 vm_page_queue_init(void)
172 {
173 	int i;
174 
175 	for (i = 0; i < PQ_L2_SIZE; i++)
176 		vm_page_queues[PQ_FREE+i].cnt_offset =
177 			offsetof(struct vmstats, v_free_count);
178 	for (i = 0; i < PQ_L2_SIZE; i++)
179 		vm_page_queues[PQ_CACHE+i].cnt_offset =
180 			offsetof(struct vmstats, v_cache_count);
181 	for (i = 0; i < PQ_L2_SIZE; i++)
182 		vm_page_queues[PQ_INACTIVE+i].cnt_offset =
183 			offsetof(struct vmstats, v_inactive_count);
184 	for (i = 0; i < PQ_L2_SIZE; i++)
185 		vm_page_queues[PQ_ACTIVE+i].cnt_offset =
186 			offsetof(struct vmstats, v_active_count);
187 	for (i = 0; i < PQ_L2_SIZE; i++)
188 		vm_page_queues[PQ_HOLD+i].cnt_offset =
189 			offsetof(struct vmstats, v_active_count);
190 	/* PQ_NONE has no queue */
191 
192 	for (i = 0; i < PQ_COUNT; i++) {
193 		vm_page_queues[i].lastq = -1;
194 		TAILQ_INIT(&vm_page_queues[i].pl);
195 		spin_init(&vm_page_queues[i].spin, "vm_page_queue_init");
196 	}
197 }
198 
199 /*
200  * note: place in initialized data section?  Is this necessary?
201  */
202 vm_pindex_t first_page = 0;
203 vm_pindex_t vm_page_array_size = 0;
204 vm_page_t vm_page_array = NULL;
205 vm_paddr_t vm_low_phys_reserved;
206 
207 /*
208  * (low level boot)
209  *
210  * Sets the page size, perhaps based upon the memory size.
211  * Must be called before any use of page-size dependent functions.
212  */
213 void
214 vm_set_page_size(void)
215 {
216 	if (vmstats.v_page_size == 0)
217 		vmstats.v_page_size = PAGE_SIZE;
218 	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
219 		panic("vm_set_page_size: page size not a power of two");
220 }
221 
222 /*
223  * (low level boot)
224  *
225  * Add a new page to the freelist for use by the system.  New pages
226  * are added to both the head and tail of the associated free page
227  * queue in a bottom-up fashion, so both zero'd and non-zero'd page
228  * requests pull 'recent' adds (higher physical addresses) first.
229  *
230  * Beware that the page zeroing daemon will also be running soon after
231  * boot, moving pages from the head to the tail of the PQ_FREE queues.
232  *
233  * Must be called in a critical section.
234  */
235 static void
236 vm_add_new_page(vm_paddr_t pa, int *badcountp)
237 {
238 	struct vpgqueues *vpq;
239 	vm_page_t m;
240 
241 	m = PHYS_TO_VM_PAGE(pa);
242 
243 	/*
244 	 * Make sure it isn't a duplicate (due to BIOS page range overlaps,
245 	 * which we consider bugs... but don't crash).  Note that m->phys_addr
246 	 * is pre-initialized, so use m->queue as a check.
247 	 */
248 	if (m->queue) {
249 		if (*badcountp < 10) {
250 			kprintf("vm_add_new_page: duplicate pa %016jx\n",
251 				(intmax_t)pa);
252 			++*badcountp;
253 		} else if (*badcountp == 10) {
254 			kprintf("vm_add_new_page: duplicate pa (many more)\n");
255 			++*badcountp;
256 		}
257 		return;
258 	}
259 
260 	m->phys_addr = pa;
261 	m->flags = 0;
262 	m->pat_mode = PAT_WRITE_BACK;
263 	m->pc = (pa >> PAGE_SHIFT);
264 
265 	/*
266 	 * Twist for cpu localization in addition to page coloring, so
267 	 * different cpus selecting by m->queue get different page colors.
268 	 */
269 	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
270 	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
271 	m->pc &= PQ_L2_MASK;
272 
273 	/*
274 	 * Reserve a certain number of contiguous low memory pages for
275 	 * contigmalloc() to use.
276 	 *
277 	 * Even though these pages represent real ram and can be
278 	 * reverse-mapped, we set PG_FICTITIOUS and PG_UNQUEUED
279 	 * because their use is special-cased.
280 	 *
281 	 * WARNING! Once PG_FICTITIOUS is set, vm_page_wire*()
282 	 *	    and vm_page_unwire*() calls have no effect.
283 	 */
284 	if (pa < vm_low_phys_reserved) {
285 		atomic_add_long(&vmstats.v_page_count, 1);
286 		atomic_add_long(&vmstats.v_dma_pages, 1);
287 		m->flags |= PG_FICTITIOUS | PG_UNQUEUED;
288 		m->queue = PQ_NONE;
289 		m->wire_count = 1;
290 		atomic_add_long(&vmstats.v_wire_count, 1);
291 		alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
292 		return;
293 	}
294 
295 	/*
296 	 * General page
297 	 */
298 	m->queue = m->pc + PQ_FREE;
299 	KKASSERT(m->dirty == 0);
300 
301 	atomic_add_long(&vmstats.v_page_count, 1);
302 	atomic_add_long(&vmstats.v_free_count, 1);
303 	vpq = &vm_page_queues[m->queue];
304 	TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
305 	++vpq->lcnt;
306 }
307 
308 /*
309  * (low level boot)
310  *
311  * Initializes the resident memory module.
312  *
313  * Preallocates memory for critical VM structures and arrays prior to
314  * kernel_map becoming available.
315  *
316  * Memory is allocated from (virtual2_start, virtual2_end) if available,
317  * otherwise memory is allocated from (virtual_start, virtual_end).
318  *
319  * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
320  * large enough to hold vm_page_array & other structures for machines with
321  * large amounts of ram, so we want to use virtual2* when available.
322  */
323 void
324 vm_page_startup(void)
325 {
326 	vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
327 	vm_offset_t mapped;
328 	vm_pindex_t npages;
329 	vm_paddr_t page_range;
330 	vm_paddr_t new_end;
331 	int i;
332 	vm_paddr_t pa;
333 	vm_paddr_t last_pa;
334 	vm_paddr_t end;
335 	vm_paddr_t biggestone, biggestsize;
336 	vm_paddr_t total;
337 	vm_page_t m;
338 	int badcount;
339 
340 	total = 0;
341 	badcount = 0;
342 	biggestsize = 0;
343 	biggestone = 0;
344 	vaddr = round_page(vaddr);
345 
346 	/*
347 	 * Make sure ranges are page-aligned.
348 	 */
349 	for (i = 0; phys_avail[i].phys_end; ++i) {
350 		phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
351 		phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
352 		if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
353 			phys_avail[i].phys_end = phys_avail[i].phys_beg;
354 	}
355 
356 	/*
357 	 * Locate largest block
358 	 */
359 	for (i = 0; phys_avail[i].phys_end; ++i) {
360 		vm_paddr_t size = phys_avail[i].phys_end -
361 				  phys_avail[i].phys_beg;
362 
363 		if (size > biggestsize) {
364 			biggestone = i;
365 			biggestsize = size;
366 		}
367 		total += size;
368 	}
369 	--i;	/* adjust to last entry for use down below */
370 
371 	end = phys_avail[biggestone].phys_end;
372 	end = trunc_page(end);
373 
374 	/*
375 	 * Initialize the queue headers for the free queue, the active queue
376 	 * and the inactive queue.
377 	 */
378 	vm_page_queue_init();
379 
380 #if !defined(_KERNEL_VIRTUAL)
381 	/*
382 	 * VKERNELs don't support minidumps and as such don't need
383 	 * vm_page_dump
384 	 *
385 	 * Allocate a bitmap to indicate that a random physical page
386 	 * needs to be included in a minidump.
387 	 *
388 	 * The amd64 port needs this to indicate which direct map pages
389 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
390 	 *
391 	 * However, x86 still needs this workspace internally within the
392 	 * minidump code.  In theory, they are not needed on x86, but are
393 	 * included should the sf_buf code decide to use them.
394 	 */
395 	page_range = phys_avail[i].phys_end / PAGE_SIZE;
396 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
397 	end -= vm_page_dump_size;
398 	vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
399 					VM_PROT_READ | VM_PROT_WRITE);
400 	bzero((void *)vm_page_dump, vm_page_dump_size);
401 #endif
402 	/*
403 	 * Compute the number of pages of memory that will be available for
404 	 * use (taking into account the overhead of a page structure per
405 	 * page).
406 	 */
407 	first_page = phys_avail[0].phys_beg / PAGE_SIZE;
408 	page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
409 	npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
410 
411 #ifndef _KERNEL_VIRTUAL
412 	/*
413 	 * (only applies to real kernels)
414 	 *
415 	 * Reserve a large amount of low memory for potential 32-bit DMA
416 	 * space allocations.  Once device initialization is complete we
417 	 * release most of it, but keep (vm_dma_reserved) memory reserved
418 	 * for later use.  Typically for X / graphics.  Through trial and
419 	 * error we find that GPUs usually requires ~60-100MB or so.
420 	 *
421 	 * By default, 128M is left in reserve on machines with 2G+ of ram.
422 	 */
423 	vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
424 	if (vm_low_phys_reserved > total / 4)
425 		vm_low_phys_reserved = total / 4;
426 	if (vm_dma_reserved == 0) {
427 		vm_dma_reserved = 128 * 1024 * 1024;	/* 128MB */
428 		if (vm_dma_reserved > total / 16)
429 			vm_dma_reserved = total / 16;
430 	}
431 #endif
432 	alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
433 		   ALIST_RECORDS_65536);
434 
435 	/*
436 	 * Initialize the mem entry structures now, and put them in the free
437 	 * queue.
438 	 */
439 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
440 		kprintf("initializing vm_page_array ");
441 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
442 	mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
443 	vm_page_array = (vm_page_t)mapped;
444 
445 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
446 	/*
447 	 * since pmap_map on amd64 returns stuff out of a direct-map region,
448 	 * we have to manually add these pages to the minidump tracking so
449 	 * that they can be dumped, including the vm_page_array.
450 	 */
451 	for (pa = new_end;
452 	     pa < phys_avail[biggestone].phys_end;
453 	     pa += PAGE_SIZE) {
454 		dump_add_page(pa);
455 	}
456 #endif
457 
458 	/*
459 	 * Clear all of the page structures, run basic initialization so
460 	 * PHYS_TO_VM_PAGE() operates properly even on pages not in the
461 	 * map.
462 	 */
463 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
464 	vm_page_array_size = page_range;
465 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
466 		kprintf("size = 0x%zx\n", vm_page_array_size);
467 
468 	m = &vm_page_array[0];
469 	pa = ptoa(first_page);
470 	for (i = 0; i < page_range; ++i) {
471 		spin_init(&m->spin, "vm_page");
472 		m->phys_addr = pa;
473 		pa += PAGE_SIZE;
474 		++m;
475 	}
476 
477 	/*
478 	 * Construct the free queue(s) in ascending order (by physical
479 	 * address) so that the first 16MB of physical memory is allocated
480 	 * last rather than first.  On large-memory machines, this avoids
481 	 * the exhaustion of low physical memory before isa_dma_init has run.
482 	 */
483 	vmstats.v_page_count = 0;
484 	vmstats.v_free_count = 0;
485 	for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
486 		pa = phys_avail[i].phys_beg;
487 		if (i == biggestone)
488 			last_pa = new_end;
489 		else
490 			last_pa = phys_avail[i].phys_end;
491 		while (pa < last_pa && npages-- > 0) {
492 			vm_add_new_page(pa, &badcount);
493 			pa += PAGE_SIZE;
494 		}
495 	}
496 	if (virtual2_start)
497 		virtual2_start = vaddr;
498 	else
499 		virtual_start = vaddr;
500 	mycpu->gd_vmstats = vmstats;
501 }
502 
503 /*
504  * (called from early boot only)
505  *
506  * Reorganize VM pages based on numa data.  May be called as many times as
507  * necessary.  Will reorganize the vm_page_t page color and related queue(s)
508  * to allow vm_page_alloc() to choose pages based on socket affinity.
509  *
510  * NOTE: This function is only called while we are still in UP mode, so
511  *	 we only need a critical section to protect the queues (which
512  *	 saves a lot of time, there are likely a ton of pages).
513  */
514 void
515 vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
516 {
517 	vm_paddr_t scan_beg;
518 	vm_paddr_t scan_end;
519 	vm_paddr_t ran_end;
520 	struct vpgqueues *vpq;
521 	vm_page_t m;
522 	vm_page_t mend;
523 	int socket_mod;
524 	int socket_value;
525 	int i;
526 
527 	/*
528 	 * Check if no physical information, or there was only one socket
529 	 * (so don't waste time doing nothing!).
530 	 */
531 	if (cpu_topology_phys_ids <= 1 ||
532 	    cpu_topology_core_ids == 0) {
533 		return;
534 	}
535 
536 	/*
537 	 * Setup for our iteration.  Note that ACPI may iterate CPU
538 	 * sockets starting at 0 or 1 or some other number.  The
539 	 * cpu_topology code mod's it against the socket count.
540 	 */
541 	ran_end = ran_beg + bytes;
542 
543 	socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
544 	socket_value = (physid % cpu_topology_phys_ids) * socket_mod;
545 	mend = &vm_page_array[vm_page_array_size];
546 
547 	crit_enter();
548 
549 	/*
550 	 * Adjust cpu_topology's phys_mem parameter
551 	 */
552 	if (root_cpu_node)
553 		vm_numa_add_topology_mem(root_cpu_node, physid, (long)bytes);
554 
555 	/*
556 	 * Adjust vm_page->pc and requeue all affected pages.  The
557 	 * allocator will then be able to localize memory allocations
558 	 * to some degree.
559 	 */
560 	for (i = 0; phys_avail[i].phys_end; ++i) {
561 		scan_beg = phys_avail[i].phys_beg;
562 		scan_end = phys_avail[i].phys_end;
563 		if (scan_end <= ran_beg)
564 			continue;
565 		if (scan_beg >= ran_end)
566 			continue;
567 		if (scan_beg < ran_beg)
568 			scan_beg = ran_beg;
569 		if (scan_end > ran_end)
570 			scan_end = ran_end;
571 		if (atop(scan_end) > first_page + vm_page_array_size)
572 			scan_end = ptoa(first_page + vm_page_array_size);
573 
574 		m = PHYS_TO_VM_PAGE(scan_beg);
575 		while (scan_beg < scan_end) {
576 			KKASSERT(m < mend);
577 			if (m->queue != PQ_NONE) {
578 				vpq = &vm_page_queues[m->queue];
579 				TAILQ_REMOVE(&vpq->pl, m, pageq);
580 				--vpq->lcnt;
581 				/* queue doesn't change, no need to adj cnt */
582 				m->queue -= m->pc;
583 				m->pc %= socket_mod;
584 				m->pc += socket_value;
585 				m->pc &= PQ_L2_MASK;
586 				m->queue += m->pc;
587 				vpq = &vm_page_queues[m->queue];
588 				TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
589 				++vpq->lcnt;
590 				/* queue doesn't change, no need to adj cnt */
591 			} else {
592 				m->pc %= socket_mod;
593 				m->pc += socket_value;
594 				m->pc &= PQ_L2_MASK;
595 			}
596 			scan_beg += PAGE_SIZE;
597 			++m;
598 		}
599 	}
600 
601 	crit_exit();
602 }
603 
604 /*
605  * (called from early boot only)
606  *
607  * Don't allow the NUMA organization to leave vm_page_queues[] nodes
608  * completely empty for a logical cpu.  Doing so would force allocations
609  * on that cpu to always borrow from a nearby cpu, create unnecessary
610  * contention, and cause vm_page_alloc() to iterate more queues and run more
611  * slowly.
612  *
613  * This situation can occur when memory sticks are not entirely populated,
614  * populated at different densities, or in naturally assymetric systems
615  * such as the 2990WX.  There could very well be many vm_page_queues[]
616  * entries with *NO* pages assigned to them.
617  *
618  * Fixing this up ensures that each logical CPU has roughly the same
619  * sized memory pool, and more importantly ensures that logical CPUs
620  * do not wind up with an empty memory pool.
621  *
622  * At them moment we just iterate the other queues and borrow pages,
623  * moving them into the queues for cpus with severe deficits even though
624  * the memory might not be local to those cpus.  I am not doing this in
625  * a 'smart' way, its effectively UMA style (sorta, since its page-by-page
626  * whereas real UMA typically exchanges address bits 8-10 with high address
627  * bits).  But it works extremely well and gives us fairly good deterministic
628  * results on the cpu cores associated with these secondary nodes.
629  */
630 void
631 vm_numa_organize_finalize(void)
632 {
633 	struct vpgqueues *vpq;
634 	vm_page_t m;
635 	long lcnt_lo;
636 	long lcnt_hi;
637 	int iter;
638 	int i;
639 	int scale_lim;
640 
641 	crit_enter();
642 
643 	/*
644 	 * Machines might not use an exact power of 2 for phys_ids,
645 	 * core_ids, ht_ids, etc.  This can slightly reduce the actual
646 	 * range of indices in vm_page_queues[] that are nominally used.
647 	 */
648 	if (cpu_topology_ht_ids) {
649 		scale_lim = PQ_L2_SIZE / cpu_topology_phys_ids;
650 		scale_lim = scale_lim / cpu_topology_core_ids;
651 		scale_lim = scale_lim / cpu_topology_ht_ids;
652 		scale_lim = scale_lim * cpu_topology_ht_ids;
653 		scale_lim = scale_lim * cpu_topology_core_ids;
654 		scale_lim = scale_lim * cpu_topology_phys_ids;
655 	} else {
656 		scale_lim = PQ_L2_SIZE;
657 	}
658 
659 	/*
660 	 * Calculate an average, set hysteresis for balancing from
661 	 * 10% below the average to the average.
662 	 */
663 	lcnt_hi = 0;
664 	for (i = 0; i < scale_lim; ++i) {
665 		lcnt_hi += vm_page_queues[i].lcnt;
666 	}
667 	lcnt_hi /= scale_lim;
668 	lcnt_lo = lcnt_hi - lcnt_hi / 10;
669 
670 	kprintf("vm_page: avg %ld pages per queue, %d queues\n",
671 		lcnt_hi, scale_lim);
672 
673 	iter = 0;
674 	for (i = 0; i < scale_lim; ++i) {
675 		vpq = &vm_page_queues[PQ_FREE + i];
676 		while (vpq->lcnt < lcnt_lo) {
677 			struct vpgqueues *vptmp;
678 
679 			iter = (iter + 1) & PQ_L2_MASK;
680 			vptmp = &vm_page_queues[PQ_FREE + iter];
681 			if (vptmp->lcnt < lcnt_hi)
682 				continue;
683 			m = TAILQ_FIRST(&vptmp->pl);
684 			KKASSERT(m->queue == PQ_FREE + iter);
685 			TAILQ_REMOVE(&vptmp->pl, m, pageq);
686 			--vptmp->lcnt;
687 			/* queue doesn't change, no need to adj cnt */
688 			m->queue -= m->pc;
689 			m->pc = i;
690 			m->queue += m->pc;
691 			TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
692 			++vpq->lcnt;
693 		}
694 	}
695 	crit_exit();
696 }
697 
698 static
699 void
700 vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes)
701 {
702 	int cpuid;
703 	int i;
704 
705 	switch(cpup->type) {
706 	case PACKAGE_LEVEL:
707 		cpup->phys_mem += bytes;
708 		break;
709 	case CHIP_LEVEL:
710 		/*
711 		 * All members should have the same chipid, so we only need
712 		 * to pull out one member.
713 		 */
714 		if (CPUMASK_TESTNZERO(cpup->members)) {
715 			cpuid = BSFCPUMASK(cpup->members);
716 			if (physid ==
717 			    get_chip_ID_from_APICID(CPUID_TO_APICID(cpuid))) {
718 				cpup->phys_mem += bytes;
719 			}
720 		}
721 		break;
722 	case CORE_LEVEL:
723 	case THREAD_LEVEL:
724 		/*
725 		 * Just inherit from the parent node
726 		 */
727 		cpup->phys_mem = cpup->parent_node->phys_mem;
728 		break;
729 	}
730 	for (i = 0; i < MAXCPU && cpup->child_node[i]; ++i)
731 		vm_numa_add_topology_mem(cpup->child_node[i], physid, bytes);
732 }
733 
734 /*
735  * We tended to reserve a ton of memory for contigmalloc().  Now that most
736  * drivers have initialized we want to return most the remaining free
737  * reserve back to the VM page queues so they can be used for normal
738  * allocations.
739  *
740  * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
741  */
742 static void
743 vm_page_startup_finish(void *dummy __unused)
744 {
745 	alist_blk_t blk;
746 	alist_blk_t rblk;
747 	alist_blk_t count;
748 	alist_blk_t xcount;
749 	alist_blk_t bfree;
750 	vm_page_t m;
751 	struct vm_page_hash_elm *mp;
752 	int mask;
753 
754 	/*
755 	 * Set the set_assoc_mask based on the fitted number of CPUs.
756 	 * This is a mask, so we subject 1.
757 	 *
758 	 * w/PQ_L2_SIZE = 1024, Don't let the associativity drop below 8.
759 	 * So if we have 256 CPUs, two hyper-threads will wind up sharing.
760 	 *
761 	 * The maximum is PQ_L2_SIZE.  However, we limit the starting
762 	 * maximum to 16 (mask = 15) in order to improve the cache locality
763 	 * of related kernel data structures.
764 	 */
765 	mask = PQ_L2_SIZE / ncpus_fit - 1;
766 	if (mask < 7)		/* minimum is 8-way w/256 CPU threads */
767 		mask = 7;
768 	if (mask < 15)
769 		mask = 15;
770 	cpu_ccfence();
771 	set_assoc_mask = mask;
772 
773 	/*
774 	 * Return part of the initial reserve back to the system
775 	 */
776 	spin_lock(&vm_contig_spin);
777 	for (;;) {
778 		bfree = alist_free_info(&vm_contig_alist, &blk, &count);
779 		if (bfree <= vm_dma_reserved / PAGE_SIZE)
780 			break;
781 		if (count == 0)
782 			break;
783 
784 		/*
785 		 * Figure out how much of the initial reserve we have to
786 		 * free in order to reach our target.
787 		 */
788 		bfree -= vm_dma_reserved / PAGE_SIZE;
789 		if (count > bfree) {
790 			blk += count - bfree;
791 			count = bfree;
792 		}
793 
794 		/*
795 		 * Calculate the nearest power of 2 <= count.
796 		 */
797 		for (xcount = 1; xcount <= count; xcount <<= 1)
798 			;
799 		xcount >>= 1;
800 		blk += count - xcount;
801 		count = xcount;
802 
803 		/*
804 		 * Allocate the pages from the alist, then free them to
805 		 * the normal VM page queues.
806 		 *
807 		 * Pages allocated from the alist are wired.  We have to
808 		 * busy, unwire, and free them.  We must also adjust
809 		 * vm_low_phys_reserved before freeing any pages to prevent
810 		 * confusion.
811 		 */
812 		rblk = alist_alloc(&vm_contig_alist, blk, count);
813 		if (rblk != blk) {
814 			kprintf("vm_page_startup_finish: Unable to return "
815 				"dma space @0x%08x/%d -> 0x%08x\n",
816 				blk, count, rblk);
817 			break;
818 		}
819 		atomic_add_long(&vmstats.v_dma_pages, -(long)count);
820 		spin_unlock(&vm_contig_spin);
821 
822 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
823 		vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
824 		while (count) {
825 			vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);
826 			vm_page_busy_wait(m, FALSE, "cpgfr");
827 			vm_page_unwire(m, 0);
828 			vm_page_free(m);
829 			--count;
830 			++m;
831 		}
832 		spin_lock(&vm_contig_spin);
833 	}
834 	spin_unlock(&vm_contig_spin);
835 
836 	/*
837 	 * Print out how much DMA space drivers have already allocated and
838 	 * how much is left over.
839 	 */
840 	kprintf("DMA space used: %jdk, remaining available: %jdk\n",
841 		(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
842 		(PAGE_SIZE / 1024),
843 		(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
844 
845 	/*
846 	 * Power of 2
847 	 */
848 	vm_page_hash_size = 4096;
849 	while (vm_page_hash_size < (vm_page_array_size / 16))
850 		vm_page_hash_size <<= 1;
851 	if (vm_page_hash_size > VM_PAGE_HASH_MAX)
852 		vm_page_hash_size = VM_PAGE_HASH_MAX;
853 
854 	/*
855 	 * hash table for vm_page_lookup_quick()
856 	 */
857 	mp = (void *)kmem_alloc3(&kernel_map,
858 				 (vm_page_hash_size + VM_PAGE_HASH_SET) *
859 				  sizeof(*vm_page_hash),
860 				 VM_SUBSYS_VMPGHASH, KM_CPU(0));
861 	bzero(mp, (vm_page_hash_size + VM_PAGE_HASH_SET) * sizeof(*mp));
862 	cpu_sfence();
863 	vm_page_hash = mp;
864 }
865 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
866 	vm_page_startup_finish, NULL);
867 
868 
869 /*
870  * Scan comparison function for Red-Black tree scans.  An inclusive
871  * (start,end) is expected.  Other fields are not used.
872  */
873 int
874 rb_vm_page_scancmp(struct vm_page *p, void *data)
875 {
876 	struct rb_vm_page_scan_info *info = data;
877 
878 	if (p->pindex < info->start_pindex)
879 		return(-1);
880 	if (p->pindex > info->end_pindex)
881 		return(1);
882 	return(0);
883 }
884 
885 int
886 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
887 {
888 	if (p1->pindex < p2->pindex)
889 		return(-1);
890 	if (p1->pindex > p2->pindex)
891 		return(1);
892 	return(0);
893 }
894 
895 void
896 vm_page_init(vm_page_t m)
897 {
898 	/* do nothing for now.  Called from pmap_page_init() */
899 }
900 
901 /*
902  * Each page queue has its own spin lock, which is fairly optimal for
903  * allocating and freeing pages at least.
904  *
905  * The caller must hold the vm_page_spin_lock() before locking a vm_page's
906  * queue spinlock via this function.  Also note that m->queue cannot change
907  * unless both the page and queue are locked.
908  */
909 static __inline
910 void
911 _vm_page_queue_spin_lock(vm_page_t m)
912 {
913 	u_short queue;
914 
915 	queue = m->queue;
916 	if (queue != PQ_NONE) {
917 		spin_lock(&vm_page_queues[queue].spin);
918 		KKASSERT(queue == m->queue);
919 	}
920 }
921 
922 static __inline
923 void
924 _vm_page_queue_spin_unlock(vm_page_t m)
925 {
926 	u_short queue;
927 
928 	queue = m->queue;
929 	cpu_ccfence();
930 	if (queue != PQ_NONE)
931 		spin_unlock(&vm_page_queues[queue].spin);
932 }
933 
934 static __inline
935 void
936 _vm_page_queues_spin_lock(u_short queue)
937 {
938 	cpu_ccfence();
939 	if (queue != PQ_NONE)
940 		spin_lock(&vm_page_queues[queue].spin);
941 }
942 
943 
944 static __inline
945 void
946 _vm_page_queues_spin_unlock(u_short queue)
947 {
948 	cpu_ccfence();
949 	if (queue != PQ_NONE)
950 		spin_unlock(&vm_page_queues[queue].spin);
951 }
952 
953 void
954 vm_page_queue_spin_lock(vm_page_t m)
955 {
956 	_vm_page_queue_spin_lock(m);
957 }
958 
959 void
960 vm_page_queues_spin_lock(u_short queue)
961 {
962 	_vm_page_queues_spin_lock(queue);
963 }
964 
965 void
966 vm_page_queue_spin_unlock(vm_page_t m)
967 {
968 	_vm_page_queue_spin_unlock(m);
969 }
970 
971 void
972 vm_page_queues_spin_unlock(u_short queue)
973 {
974 	_vm_page_queues_spin_unlock(queue);
975 }
976 
977 /*
978  * This locks the specified vm_page and its queue in the proper order
979  * (page first, then queue).  The queue may change so the caller must
980  * recheck on return.
981  */
982 static __inline
983 void
984 _vm_page_and_queue_spin_lock(vm_page_t m)
985 {
986 	vm_page_spin_lock(m);
987 	_vm_page_queue_spin_lock(m);
988 }
989 
990 static __inline
991 void
992 _vm_page_and_queue_spin_unlock(vm_page_t m)
993 {
994 	_vm_page_queues_spin_unlock(m->queue);
995 	vm_page_spin_unlock(m);
996 }
997 
998 void
999 vm_page_and_queue_spin_unlock(vm_page_t m)
1000 {
1001 	_vm_page_and_queue_spin_unlock(m);
1002 }
1003 
1004 void
1005 vm_page_and_queue_spin_lock(vm_page_t m)
1006 {
1007 	_vm_page_and_queue_spin_lock(m);
1008 }
1009 
1010 /*
1011  * Helper function removes vm_page from its current queue.
1012  * Returns the base queue the page used to be on.
1013  *
1014  * The vm_page and the queue must be spinlocked.
1015  * This function will unlock the queue but leave the page spinlocked.
1016  */
1017 static __inline u_short
1018 _vm_page_rem_queue_spinlocked(vm_page_t m)
1019 {
1020 	struct vpgqueues *pq;
1021 	u_short queue;
1022 	u_short oqueue;
1023 	long *cnt_adj;
1024 	long *cnt_gd;
1025 
1026 	queue = m->queue;
1027 	if (queue != PQ_NONE) {
1028 		pq = &vm_page_queues[queue];
1029 		TAILQ_REMOVE(&pq->pl, m, pageq);
1030 
1031 		/*
1032 		 * Primarily adjust our pcpu stats for rollup, which is
1033 		 * (mycpu->gd_vmstats_adj + offset).  This is normally
1034 		 * synchronized on every hardclock().
1035 		 *
1036 		 * However, in order for the nominal low-memory algorithms
1037 		 * to work properly if the unsynchronized adjustment gets
1038 		 * too negative and might trigger the pageout daemon, we
1039 		 * immediately synchronize with the global structure.
1040 		 *
1041 		 * The idea here is to reduce unnecessary SMP cache mastership
1042 		 * changes in the global vmstats, which can be particularly
1043 		 * bad in multi-socket systems.
1044 		 *
1045 		 * WARNING! In systems with low amounts of memory the
1046 		 *	    vm_paging_needed(-1024 * ncpus) test could
1047 		 *	    wind up testing a value above the paging target,
1048 		 *	    meaning it would almost always return TRUE.  In
1049 		 *	    that situation we synchronize every time the
1050 		 *	    cumulative adjustment falls below -1024.
1051 		 */
1052 		cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1053 				   pq->cnt_offset);
1054 		cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1055 				   pq->cnt_offset);
1056 		atomic_add_long(cnt_adj, -1);
1057 		atomic_add_long(cnt_gd, -1);
1058 
1059 		if (*cnt_adj < -1024 && vm_paging_needed(-1024 * ncpus)) {
1060 			u_long copy = atomic_swap_long(cnt_adj, 0);
1061 			cnt_adj = (long *)((char *)&vmstats + pq->cnt_offset);
1062 			atomic_add_long(cnt_adj, copy);
1063 		}
1064 		pq->lcnt--;
1065 		m->queue = PQ_NONE;
1066 		oqueue = queue;
1067 		queue -= m->pc;
1068 		vm_page_queues_spin_unlock(oqueue);	/* intended */
1069 	}
1070 	return queue;
1071 }
1072 
1073 /*
1074  * Helper function places the vm_page on the specified queue.  Generally
1075  * speaking only PQ_FREE pages are placed at the head, to allow them to
1076  * be allocated sooner rather than later on the assumption that they
1077  * are cache-hot.
1078  *
1079  * The vm_page must be spinlocked.
1080  * The vm_page must NOT be FICTITIOUS (that would be a disaster)
1081  * This function will return with both the page and the queue locked.
1082  */
1083 static __inline void
1084 _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
1085 {
1086 	struct vpgqueues *pq;
1087 	u_long *cnt_adj;
1088 	u_long *cnt_gd;
1089 
1090 	KKASSERT(m->queue == PQ_NONE &&
1091 		 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0);
1092 
1093 	if (queue != PQ_NONE) {
1094 		vm_page_queues_spin_lock(queue);
1095 		pq = &vm_page_queues[queue];
1096 		++pq->lcnt;
1097 
1098 		/*
1099 		 * Adjust our pcpu stats.  If a system entity really needs
1100 		 * to incorporate the count it will call vmstats_rollup()
1101 		 * to roll it all up into the global vmstats strufture.
1102 		 */
1103 		cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1104 				   pq->cnt_offset);
1105 		cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1106 				   pq->cnt_offset);
1107 		atomic_add_long(cnt_adj, 1);
1108 		atomic_add_long(cnt_gd, 1);
1109 
1110 		/*
1111 		 * PQ_FREE is always handled LIFO style to try to provide
1112 		 * cache-hot pages to programs.
1113 		 */
1114 		m->queue = queue;
1115 		if (queue - m->pc == PQ_FREE) {
1116 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1117 		} else if (athead) {
1118 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1119 		} else {
1120 			TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
1121 		}
1122 		/* leave the queue spinlocked */
1123 	}
1124 }
1125 
1126 /*
1127  * Wait until page is no longer BUSY.  If also_m_busy is TRUE we wait
1128  * until the page is no longer BUSY or SBUSY (busy_count field is 0).
1129  *
1130  * Returns TRUE if it had to sleep, FALSE if we did not.  Only one sleep
1131  * call will be made before returning.
1132  *
1133  * This function does NOT busy the page and on return the page is not
1134  * guaranteed to be available.
1135  */
1136 void
1137 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
1138 {
1139 	u_int32_t busy_count;
1140 
1141 	for (;;) {
1142 		busy_count = m->busy_count;
1143 		cpu_ccfence();
1144 
1145 		if ((busy_count & PBUSY_LOCKED) == 0 &&
1146 		    (also_m_busy == 0 || (busy_count & PBUSY_MASK) == 0)) {
1147 			break;
1148 		}
1149 		tsleep_interlock(m, 0);
1150 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1151 				      busy_count | PBUSY_WANTED)) {
1152 			atomic_set_int(&m->flags, PG_REFERENCED);
1153 			tsleep(m, PINTERLOCKED, msg, 0);
1154 			break;
1155 		}
1156 	}
1157 }
1158 
1159 /*
1160  * This calculates and returns a page color given an optional VM object and
1161  * either a pindex or an iterator.  We attempt to return a cpu-localized
1162  * pg_color that is still roughly 16-way set-associative.  The CPU topology
1163  * is used if it was probed.
1164  *
1165  * The caller may use the returned value to index into e.g. PQ_FREE when
1166  * allocating a page in order to nominally obtain pages that are hopefully
1167  * already localized to the requesting cpu.  This function is not able to
1168  * provide any sort of guarantee of this, but does its best to improve
1169  * hardware cache management performance.
1170  *
1171  * WARNING! The caller must mask the returned value with PQ_L2_MASK.
1172  */
1173 u_short
1174 vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
1175 {
1176 	u_short pg_color;
1177 	int object_pg_color;
1178 
1179 	/*
1180 	 * WARNING! cpu_topology_core_ids might not be a power of two.
1181 	 *	    We also shouldn't make assumptions about
1182 	 *	    cpu_topology_phys_ids either.
1183 	 *
1184 	 * WARNING! ncpus might not be known at this time (during early
1185 	 *	    boot), and might be set to 1.
1186 	 *
1187 	 * General format: [phys_id][core_id][cpuid][set-associativity]
1188 	 * (but uses modulo, so not necessarily precise bit masks)
1189 	 */
1190 	object_pg_color = object ? object->pg_color : 0;
1191 
1192 	if (cpu_topology_ht_ids) {
1193 		int phys_id;
1194 		int core_id;
1195 		int ht_id;
1196 		int physcale;
1197 		int grpscale;
1198 		int cpuscale;
1199 
1200 		/*
1201 		 * Translate cpuid to socket, core, and hyperthread id.
1202 		 */
1203 		phys_id = get_cpu_phys_id(cpuid);
1204 		core_id = get_cpu_core_id(cpuid);
1205 		ht_id = get_cpu_ht_id(cpuid);
1206 
1207 		/*
1208 		 * Calculate pg_color for our array index.
1209 		 *
1210 		 * physcale - socket multiplier.
1211 		 * grpscale - core multiplier (cores per socket)
1212 		 * cpu*	    - cpus per core
1213 		 *
1214 		 * WARNING! In early boot, ncpus has not yet been
1215 		 *	    initialized and may be set to (1).
1216 		 *
1217 		 * WARNING! physcale must match the organization that
1218 		 *	    vm_numa_organize() creates to ensure that
1219 		 *	    we properly localize allocations to the
1220 		 *	    requested cpuid.
1221 		 */
1222 		physcale = PQ_L2_SIZE / cpu_topology_phys_ids;
1223 		grpscale = physcale / cpu_topology_core_ids;
1224 		cpuscale = grpscale / cpu_topology_ht_ids;
1225 
1226 		pg_color = phys_id * physcale;
1227 		pg_color += core_id * grpscale;
1228 		pg_color += ht_id * cpuscale;
1229 		pg_color += (pindex + object_pg_color) % cpuscale;
1230 
1231 #if 0
1232 		if (grpsize >= 8) {
1233 			pg_color += (pindex + object_pg_color) % grpsize;
1234 		} else {
1235 			if (grpsize <= 2) {
1236 				grpsize = 8;
1237 			} else {
1238 				/* 3->9, 4->8, 5->10, 6->12, 7->14 */
1239 				grpsize += grpsize;
1240 				if (grpsize < 8)
1241 					grpsize += grpsize;
1242 			}
1243 			pg_color += (pindex + object_pg_color) % grpsize;
1244 		}
1245 #endif
1246 	} else {
1247 		/*
1248 		 * Unknown topology, distribute things evenly.
1249 		 *
1250 		 * WARNING! In early boot, ncpus has not yet been
1251 		 *	    initialized and may be set to (1).
1252 		 */
1253 		int cpuscale;
1254 
1255 		cpuscale = PQ_L2_SIZE / ncpus;
1256 
1257 		pg_color = cpuid * cpuscale;
1258 		pg_color += (pindex + object_pg_color) % cpuscale;
1259 	}
1260 	return (pg_color & PQ_L2_MASK);
1261 }
1262 
1263 /*
1264  * Wait until BUSY can be set, then set it.  If also_m_busy is TRUE we
1265  * also wait for m->busy_count to become 0 before setting PBUSY_LOCKED.
1266  */
1267 void
1268 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
1269 				     int also_m_busy, const char *msg
1270 				     VM_PAGE_DEBUG_ARGS)
1271 {
1272 	u_int32_t busy_count;
1273 
1274 	for (;;) {
1275 		busy_count = m->busy_count;
1276 		cpu_ccfence();
1277 		if (busy_count & PBUSY_LOCKED) {
1278 			tsleep_interlock(m, 0);
1279 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1280 					  busy_count | PBUSY_WANTED)) {
1281 				atomic_set_int(&m->flags, PG_REFERENCED);
1282 				tsleep(m, PINTERLOCKED, msg, 0);
1283 			}
1284 		} else if (also_m_busy && busy_count) {
1285 			tsleep_interlock(m, 0);
1286 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1287 					  busy_count | PBUSY_WANTED)) {
1288 				atomic_set_int(&m->flags, PG_REFERENCED);
1289 				tsleep(m, PINTERLOCKED, msg, 0);
1290 			}
1291 		} else {
1292 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1293 					      busy_count | PBUSY_LOCKED)) {
1294 #ifdef VM_PAGE_DEBUG
1295 				m->busy_func = func;
1296 				m->busy_line = lineno;
1297 #endif
1298 				break;
1299 			}
1300 		}
1301 	}
1302 }
1303 
1304 /*
1305  * Attempt to set BUSY.  If also_m_busy is TRUE we only succeed if
1306  * m->busy_count is also 0.
1307  *
1308  * Returns non-zero on failure.
1309  */
1310 int
1311 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
1312 				    VM_PAGE_DEBUG_ARGS)
1313 {
1314 	u_int32_t busy_count;
1315 
1316 	for (;;) {
1317 		busy_count = m->busy_count;
1318 		cpu_ccfence();
1319 		if (busy_count & PBUSY_LOCKED)
1320 			return TRUE;
1321 		if (also_m_busy && (busy_count & PBUSY_MASK) != 0)
1322 			return TRUE;
1323 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1324 				      busy_count | PBUSY_LOCKED)) {
1325 #ifdef VM_PAGE_DEBUG
1326 				m->busy_func = func;
1327 				m->busy_line = lineno;
1328 #endif
1329 			return FALSE;
1330 		}
1331 	}
1332 }
1333 
1334 /*
1335  * Clear the BUSY flag and return non-zero to indicate to the caller
1336  * that a wakeup() should be performed.
1337  *
1338  * (inline version)
1339  */
1340 static __inline
1341 int
1342 _vm_page_wakeup(vm_page_t m)
1343 {
1344 	u_int32_t busy_count;
1345 
1346 	busy_count = m->busy_count;
1347 	cpu_ccfence();
1348 	for (;;) {
1349 		if (atomic_fcmpset_int(&m->busy_count, &busy_count,
1350 				      busy_count &
1351 				      ~(PBUSY_LOCKED | PBUSY_WANTED))) {
1352 			return((int)(busy_count & PBUSY_WANTED));
1353 		}
1354 	}
1355 	/* not reached */
1356 }
1357 
1358 /*
1359  * Clear the BUSY flag and wakeup anyone waiting for the page.  This
1360  * is typically the last call you make on a page before moving onto
1361  * other things.
1362  */
1363 void
1364 vm_page_wakeup(vm_page_t m)
1365 {
1366         KASSERT(m->busy_count & PBUSY_LOCKED,
1367 		("vm_page_wakeup: page not busy!!!"));
1368 	if (_vm_page_wakeup(m))
1369 		wakeup(m);
1370 }
1371 
1372 /*
1373  * Hold a page, preventing reuse.  This is typically only called on pages
1374  * in a known state (either held busy, special, or interlocked in some
1375  * manner).  Holding a page does not ensure that it remains valid, it only
1376  * prevents reuse.  The page must not already be on the FREE queue or in
1377  * any danger of being moved to the FREE queue concurrent with this call.
1378  *
1379  * Other parts of the system can still disassociate the page from its object
1380  * and attempt to free it, or perform read or write I/O on it and/or otherwise
1381  * manipulate the page, but if the page is held the VM system will leave the
1382  * page and its data intact and not cycle it through the FREE queue until
1383  * the last hold has been released.
1384  *
1385  * (see vm_page_wire() if you want to prevent the page from being
1386  *  disassociated from its object too).
1387  */
1388 void
1389 vm_page_hold(vm_page_t m)
1390 {
1391 	atomic_add_int(&m->hold_count, 1);
1392 	KKASSERT(m->queue - m->pc != PQ_FREE);
1393 }
1394 
1395 /*
1396  * The opposite of vm_page_hold().  If the page is on the HOLD queue
1397  * it was freed while held and must be moved back to the FREE queue.
1398  *
1399  * To avoid racing against vm_page_free*() we must re-test conditions
1400  * after obtaining the spin-lock.  The initial test can also race a
1401  * vm_page_free*() that is in the middle of moving a page to PQ_HOLD,
1402  * leaving the page on PQ_HOLD with hold_count == 0.  Rather than
1403  * throw a spin-lock in the critical path, we rely on the pageout
1404  * daemon to clean-up these loose ends.
1405  *
1406  * More critically, the 'easy movement' between queues without busying
1407  * a vm_page is only allowed for PQ_FREE<->PQ_HOLD.
1408  */
1409 void
1410 vm_page_unhold(vm_page_t m)
1411 {
1412 	KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
1413 		("vm_page_unhold: pg %p illegal hold_count (%d) or "
1414 		 "on FREE queue (%d)",
1415 		 m, m->hold_count, m->queue - m->pc));
1416 
1417 	if (atomic_fetchadd_int(&m->hold_count, -1) == 1 &&
1418 	    m->queue - m->pc == PQ_HOLD) {
1419 		vm_page_spin_lock(m);
1420 		if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
1421 			_vm_page_queue_spin_lock(m);
1422 			_vm_page_rem_queue_spinlocked(m);
1423 			_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
1424 			_vm_page_queue_spin_unlock(m);
1425 		}
1426 		vm_page_spin_unlock(m);
1427 	}
1428 }
1429 
1430 /*
1431  * Create a fictitious page with the specified physical address and
1432  * memory attribute.  The memory attribute is the only the machine-
1433  * dependent aspect of a fictitious page that must be initialized.
1434  */
1435 void
1436 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1437 {
1438 	/*
1439 	 * The page's memattr might have changed since the
1440 	 * previous initialization.  Update the pmap to the
1441 	 * new memattr.
1442 	 */
1443 	if ((m->flags & PG_FICTITIOUS) != 0)
1444 		goto memattr;
1445 	m->phys_addr = paddr;
1446 	m->queue = PQ_NONE;
1447 	/* Fictitious pages don't use "segind". */
1448 	/* Fictitious pages don't use "order" or "pool". */
1449 	m->flags = PG_FICTITIOUS | PG_UNQUEUED;
1450 	m->busy_count = PBUSY_LOCKED;
1451 	m->wire_count = 1;
1452 	spin_init(&m->spin, "fake_page");
1453 	pmap_page_init(m);
1454 memattr:
1455 	pmap_page_set_memattr(m, memattr);
1456 }
1457 
1458 /*
1459  * Inserts the given vm_page into the object and object list.
1460  *
1461  * The pagetables are not updated but will presumably fault the page
1462  * in if necessary, or if a kernel page the caller will at some point
1463  * enter the page into the kernel's pmap.  We are not allowed to block
1464  * here so we *can't* do this anyway.
1465  *
1466  * This routine may not block.
1467  * This routine must be called with the vm_object held.
1468  * This routine must be called with a critical section held.
1469  *
1470  * This routine returns TRUE if the page was inserted into the object
1471  * successfully, and FALSE if the page already exists in the object.
1472  */
1473 int
1474 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1475 {
1476 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
1477 	if (m->object != NULL)
1478 		panic("vm_page_insert: already inserted");
1479 
1480 	atomic_add_int(&object->generation, 1);
1481 
1482 	/*
1483 	 * Associate the VM page with an (object, offset).
1484 	 *
1485 	 * The vm_page spin lock is required for interactions with the pmap.
1486 	 * XXX vm_page_spin_lock() might not be needed for this any more.
1487 	 */
1488 	vm_page_spin_lock(m);
1489 	m->object = object;
1490 	m->pindex = pindex;
1491 	if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
1492 		m->object = NULL;
1493 		m->pindex = 0;
1494 		vm_page_spin_unlock(m);
1495 		return FALSE;
1496 	}
1497 	++object->resident_page_count;
1498 	++mycpu->gd_vmtotal.t_rm;
1499 	vm_page_spin_unlock(m);
1500 
1501 	/*
1502 	 * Since we are inserting a new and possibly dirty page,
1503 	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
1504 	 */
1505 	if ((m->valid & m->dirty) ||
1506 	    (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
1507 		vm_object_set_writeable_dirty(object);
1508 
1509 	/*
1510 	 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
1511 	 */
1512 	swap_pager_page_inserted(m);
1513 	return TRUE;
1514 }
1515 
1516 /*
1517  * Removes the given vm_page_t from the (object,index) table
1518  *
1519  * The page must be BUSY and will remain BUSY on return.
1520  * No other requirements.
1521  *
1522  * NOTE: FreeBSD side effect was to unbusy the page on return.  We leave
1523  *	 it busy.
1524  *
1525  * NOTE: Caller is responsible for any pmap disposition prior to the
1526  *	 rename (as the pmap code will not be able to find the entries
1527  *	 once the object has been disassociated).  The caller may choose
1528  *	 to leave the pmap association intact if this routine is being
1529  *	 called as part of a rename between shadowed objects.
1530  *
1531  * This routine may not block.
1532  */
1533 void
1534 vm_page_remove(vm_page_t m)
1535 {
1536 	vm_object_t object;
1537 
1538 	if (m->object == NULL) {
1539 		return;
1540 	}
1541 
1542 	if ((m->busy_count & PBUSY_LOCKED) == 0)
1543 		panic("vm_page_remove: page not busy");
1544 
1545 	object = m->object;
1546 
1547 	vm_object_hold(object);
1548 
1549 	/*
1550 	 * Remove the page from the object and update the object.
1551 	 *
1552 	 * The vm_page spin lock is required for interactions with the pmap.
1553 	 * XXX vm_page_spin_lock() might not be needed for this any more.
1554 	 */
1555 	vm_page_spin_lock(m);
1556 	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
1557 	--object->resident_page_count;
1558 	--mycpu->gd_vmtotal.t_rm;
1559 	m->object = NULL;
1560 	atomic_add_int(&object->generation, 1);
1561 	vm_page_spin_unlock(m);
1562 
1563 	vm_object_drop(object);
1564 }
1565 
1566 /*
1567  * Calculate the hash position for the vm_page hash heuristic.  Generally
1568  * speaking we want to localize sequential lookups to reduce memory stalls.
1569  *
1570  * Mask by ~3 to offer 4-way set-assoc
1571  */
1572 static __inline
1573 struct vm_page_hash_elm *
1574 vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
1575 {
1576 	size_t hi;
1577 
1578 	hi = iscsi_crc32(&object, sizeof(object)) << 2;
1579 	hi ^= hi >> (23 - 2);
1580 	hi += pindex * VM_PAGE_HASH_SET;
1581 #if 0
1582 	/* mix it up */
1583 	hi = (intptr_t)object ^ object->pg_color ^ pindex;
1584 	hi += object->pg_color * pindex;
1585 	hi = hi ^ (hi >> 20);
1586 #endif
1587 	hi &= vm_page_hash_size - 1;		/* bounds */
1588 
1589 	return (&vm_page_hash[hi]);
1590 }
1591 
1592 /*
1593  * Heuristical page lookup that does not require any locks.  Returns
1594  * a soft-busied page on success, NULL on failure.
1595  *
1596  * Caller must lookup the page the slow way if NULL is returned.
1597  */
1598 vm_page_t
1599 vm_page_hash_get(vm_object_t object, vm_pindex_t pindex)
1600 {
1601 	struct vm_page_hash_elm *mp;
1602 	vm_page_t m;
1603 	int i;
1604 
1605 	if (__predict_false(vm_page_hash == NULL))
1606 		return NULL;
1607 	mp = vm_page_hash_hash(object, pindex);
1608 	for (i = 0; i < VM_PAGE_HASH_SET; ++i, ++mp) {
1609 		if (mp->object != object ||
1610 		    mp->pindex != pindex) {
1611 			continue;
1612 		}
1613 		m = mp->m;
1614 		cpu_ccfence();
1615 		if (m == NULL)
1616 			continue;
1617 		if (m->object != object || m->pindex != pindex)
1618 			continue;
1619 		if (vm_page_sbusy_try(m))
1620 			continue;
1621 		if (m->object == object && m->pindex == pindex) {
1622 			/*
1623 			 * On-match optimization - do not update ticks
1624 			 * unless we have to (reduce cache coherency traffic)
1625 			 */
1626 			if (mp->ticks != ticks)
1627 				mp->ticks = ticks;
1628 			return m;
1629 		}
1630 		vm_page_sbusy_drop(m);
1631 	}
1632 	return NULL;
1633 }
1634 
1635 /*
1636  * Enter page onto vm_page_hash[].  This is a heuristic, SMP collisions
1637  * are allowed.
1638  */
1639 static __inline
1640 void
1641 vm_page_hash_enter(vm_page_t m)
1642 {
1643 	struct vm_page_hash_elm *mp;
1644 	struct vm_page_hash_elm *best;
1645 	vm_object_t object;
1646 	vm_pindex_t pindex;
1647 	int best_delta;
1648 	int delta;
1649 	int i;
1650 
1651 	/*
1652 	 * Only enter type-stable vm_pages with well-shared objects.
1653 	 */
1654 	if ((m->flags & PG_MAPPEDMULTI) == 0)
1655 		return;
1656 	if (__predict_false(vm_page_hash == NULL ||
1657 			    m < &vm_page_array[0] ||
1658 			    m >= &vm_page_array[vm_page_array_size])) {
1659 		return;
1660 	}
1661 	if (__predict_false(m->object == NULL))
1662 		return;
1663 #if 0
1664 	/*
1665 	 * Disabled at the moment, there are some degenerate conditions
1666 	 * with often-exec'd programs that get ignored.  In particular,
1667 	 * the kernel's elf loader does a vn_rdwr() on the first page of
1668 	 * a binary.
1669 	 */
1670 	if (m->object->ref_count <= 2 || (m->object->flags & OBJ_ONEMAPPING))
1671 		return;
1672 #endif
1673 	if (vm_page_hash_vnode_only && m->object->type != OBJT_VNODE)
1674 		return;
1675 
1676 	/*
1677 	 * Find best entry
1678 	 */
1679 	object = m->object;
1680 	pindex = m->pindex;
1681 
1682 	mp = vm_page_hash_hash(object, pindex);
1683 	best = mp;
1684 	best_delta = ticks - best->ticks;
1685 
1686 	for (i = 0; i < VM_PAGE_HASH_SET; ++i, ++mp) {
1687 		if (mp->m == m &&
1688 		    mp->object == object &&
1689 		    mp->pindex == pindex) {
1690 			/*
1691 			 * On-match optimization - do not update ticks
1692 			 * unless we have to (reduce cache coherency traffic)
1693 			 */
1694 			if (mp->ticks != ticks)
1695 				mp->ticks = ticks;
1696 			return;
1697 		}
1698 
1699 		/*
1700 		 * The best choice is the oldest entry.
1701 		 *
1702 		 * Also check for a field overflow, using -1 instead of 0
1703 		 * to deal with SMP races on accessing the 'ticks' global.
1704 		 */
1705 		delta = ticks - mp->ticks;
1706 		if (delta < -1)
1707 			best = mp;
1708 		if (best_delta < delta)
1709 			best = mp;
1710 	}
1711 
1712 	/*
1713 	 * Load the entry.  Copy a few elements to the hash entry itself
1714 	 * to reduce memory stalls due to memory indirects on lookups.
1715 	 */
1716 	best->m = m;
1717 	best->object = object;
1718 	best->pindex = pindex;
1719 	best->ticks = ticks;
1720 }
1721 
1722 /*
1723  * Locate and return the page at (object, pindex), or NULL if the
1724  * page could not be found.
1725  *
1726  * The caller must hold the vm_object token.
1727  */
1728 vm_page_t
1729 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1730 {
1731 	vm_page_t m;
1732 
1733 	/*
1734 	 * Search the hash table for this object/offset pair
1735 	 */
1736 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1737 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1738 	if (m) {
1739 		KKASSERT(m->object == object && m->pindex == pindex);
1740 		vm_page_hash_enter(m);
1741 	}
1742 	return(m);
1743 }
1744 
1745 vm_page_t
1746 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
1747 					    vm_pindex_t pindex,
1748 					    int also_m_busy, const char *msg
1749 					    VM_PAGE_DEBUG_ARGS)
1750 {
1751 	u_int32_t busy_count;
1752 	vm_page_t m;
1753 
1754 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1755 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1756 	while (m) {
1757 		KKASSERT(m->object == object && m->pindex == pindex);
1758 		busy_count = m->busy_count;
1759 		cpu_ccfence();
1760 		if (busy_count & PBUSY_LOCKED) {
1761 			tsleep_interlock(m, 0);
1762 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1763 					  busy_count | PBUSY_WANTED)) {
1764 				atomic_set_int(&m->flags, PG_REFERENCED);
1765 				tsleep(m, PINTERLOCKED, msg, 0);
1766 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1767 							      pindex);
1768 			}
1769 		} else if (also_m_busy && busy_count) {
1770 			tsleep_interlock(m, 0);
1771 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1772 					  busy_count | PBUSY_WANTED)) {
1773 				atomic_set_int(&m->flags, PG_REFERENCED);
1774 				tsleep(m, PINTERLOCKED, msg, 0);
1775 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1776 							      pindex);
1777 			}
1778 		} else if (atomic_cmpset_int(&m->busy_count, busy_count,
1779 					     busy_count | PBUSY_LOCKED)) {
1780 #ifdef VM_PAGE_DEBUG
1781 			m->busy_func = func;
1782 			m->busy_line = lineno;
1783 #endif
1784 			vm_page_hash_enter(m);
1785 			break;
1786 		}
1787 	}
1788 	return m;
1789 }
1790 
1791 /*
1792  * Attempt to lookup and busy a page.
1793  *
1794  * Returns NULL if the page could not be found
1795  *
1796  * Returns a vm_page and error == TRUE if the page exists but could not
1797  * be busied.
1798  *
1799  * Returns a vm_page and error == FALSE on success.
1800  */
1801 vm_page_t
1802 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
1803 					   vm_pindex_t pindex,
1804 					   int also_m_busy, int *errorp
1805 					   VM_PAGE_DEBUG_ARGS)
1806 {
1807 	u_int32_t busy_count;
1808 	vm_page_t m;
1809 
1810 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1811 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1812 	*errorp = FALSE;
1813 	while (m) {
1814 		KKASSERT(m->object == object && m->pindex == pindex);
1815 		busy_count = m->busy_count;
1816 		cpu_ccfence();
1817 		if (busy_count & PBUSY_LOCKED) {
1818 			*errorp = TRUE;
1819 			break;
1820 		}
1821 		if (also_m_busy && busy_count) {
1822 			*errorp = TRUE;
1823 			break;
1824 		}
1825 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1826 				      busy_count | PBUSY_LOCKED)) {
1827 #ifdef VM_PAGE_DEBUG
1828 			m->busy_func = func;
1829 			m->busy_line = lineno;
1830 #endif
1831 			vm_page_hash_enter(m);
1832 			break;
1833 		}
1834 	}
1835 	return m;
1836 }
1837 
1838 /*
1839  * Returns a page that is only soft-busied for use by the caller in
1840  * a read-only fashion.  Returns NULL if the page could not be found,
1841  * the soft busy could not be obtained, or the page data is invalid.
1842  *
1843  * XXX Doesn't handle PG_FICTITIOUS pages at the moment, but there is
1844  *     no reason why we couldn't.
1845  */
1846 vm_page_t
1847 vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex,
1848 			 int pgoff, int pgbytes)
1849 {
1850 	vm_page_t m;
1851 
1852 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1853 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1854 	if (m) {
1855 		if ((m->valid != VM_PAGE_BITS_ALL &&
1856 		     !vm_page_is_valid(m, pgoff, pgbytes)) ||
1857 		    (m->flags & PG_FICTITIOUS)) {
1858 			m = NULL;
1859 		} else if (vm_page_sbusy_try(m)) {
1860 			m = NULL;
1861 		} else if ((m->valid != VM_PAGE_BITS_ALL &&
1862 			    !vm_page_is_valid(m, pgoff, pgbytes)) ||
1863 			   (m->flags & PG_FICTITIOUS)) {
1864 			vm_page_sbusy_drop(m);
1865 			m = NULL;
1866 		} else {
1867 			vm_page_hash_enter(m);
1868 		}
1869 	}
1870 	return m;
1871 }
1872 
1873 /*
1874  * Caller must hold the related vm_object
1875  */
1876 vm_page_t
1877 vm_page_next(vm_page_t m)
1878 {
1879 	vm_page_t next;
1880 
1881 	next = vm_page_rb_tree_RB_NEXT(m);
1882 	if (next && next->pindex != m->pindex + 1)
1883 		next = NULL;
1884 	return (next);
1885 }
1886 
1887 /*
1888  * vm_page_rename()
1889  *
1890  * Move the given vm_page from its current object to the specified
1891  * target object/offset.  The page must be busy and will remain so
1892  * on return.
1893  *
1894  * new_object must be held.
1895  * This routine might block. XXX ?
1896  *
1897  * NOTE: Swap associated with the page must be invalidated by the move.  We
1898  *       have to do this for several reasons:  (1) we aren't freeing the
1899  *       page, (2) we are dirtying the page, (3) the VM system is probably
1900  *       moving the page from object A to B, and will then later move
1901  *       the backing store from A to B and we can't have a conflict.
1902  *
1903  * NOTE: We *always* dirty the page.  It is necessary both for the
1904  *       fact that we moved it, and because we may be invalidating
1905  *	 swap.  If the page is on the cache, we have to deactivate it
1906  *	 or vm_page_dirty() will panic.  Dirty pages are not allowed
1907  *	 on the cache.
1908  *
1909  * NOTE: Caller is responsible for any pmap disposition prior to the
1910  *	 rename (as the pmap code will not be able to find the entries
1911  *	 once the object has been disassociated or changed).  Nominally
1912  *	 the caller is moving a page between shadowed objects and so the
1913  *	 pmap association is retained without having to remove the page
1914  *	 from it.
1915  */
1916 void
1917 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1918 {
1919 	KKASSERT(m->busy_count & PBUSY_LOCKED);
1920 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
1921 	if (m->object) {
1922 		ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
1923 		vm_page_remove(m);
1924 	}
1925 	if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
1926 		panic("vm_page_rename: target exists (%p,%"PRIu64")",
1927 		      new_object, new_pindex);
1928 	}
1929 	if (m->queue - m->pc == PQ_CACHE)
1930 		vm_page_deactivate(m);
1931 	vm_page_dirty(m);
1932 }
1933 
1934 /*
1935  * vm_page_unqueue() without any wakeup.  This routine is used when a page
1936  * is to remain BUSYied by the caller.
1937  *
1938  * This routine may not block.
1939  */
1940 void
1941 vm_page_unqueue_nowakeup(vm_page_t m)
1942 {
1943 	vm_page_and_queue_spin_lock(m);
1944 	(void)_vm_page_rem_queue_spinlocked(m);
1945 	vm_page_spin_unlock(m);
1946 }
1947 
1948 /*
1949  * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedemon
1950  * if necessary.
1951  *
1952  * This routine may not block.
1953  */
1954 void
1955 vm_page_unqueue(vm_page_t m)
1956 {
1957 	u_short queue;
1958 
1959 	vm_page_and_queue_spin_lock(m);
1960 	queue = _vm_page_rem_queue_spinlocked(m);
1961 	if (queue == PQ_FREE || queue == PQ_CACHE) {
1962 		vm_page_spin_unlock(m);
1963 		pagedaemon_wakeup();
1964 	} else {
1965 		vm_page_spin_unlock(m);
1966 	}
1967 }
1968 
1969 /*
1970  * vm_page_list_find()
1971  *
1972  * Find a page on the specified queue with color optimization.
1973  *
1974  * The page coloring optimization attempts to locate a page that does
1975  * not overload other nearby pages in the object in the cpu's L1 or L2
1976  * caches.  We need this optimization because cpu caches tend to be
1977  * physical caches, while object spaces tend to be virtual.
1978  *
1979  * The page coloring optimization also, very importantly, tries to localize
1980  * memory to cpus and physical sockets.
1981  *
1982  * Each PQ_FREE and PQ_CACHE color queue has its own spinlock and the
1983  * algorithm is adjusted to localize allocations on a per-core basis.
1984  * This is done by 'twisting' the colors.
1985  *
1986  * The page is returned spinlocked and removed from its queue (it will
1987  * be on PQ_NONE), or NULL. The page is not BUSY'd.  The caller
1988  * is responsible for dealing with the busy-page case (usually by
1989  * deactivating the page and looping).
1990  *
1991  * NOTE:  This routine is carefully inlined.  A non-inlined version
1992  *	  is available for outside callers but the only critical path is
1993  *	  from within this source file.
1994  *
1995  * NOTE:  This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
1996  *	  represent stable storage, allowing us to order our locks vm_page
1997  *	  first, then queue.
1998  */
1999 static __inline
2000 vm_page_t
2001 _vm_page_list_find(int basequeue, int index)
2002 {
2003 	struct vpgqueues *pq;
2004 	vm_page_t m;
2005 
2006 	index &= PQ_L2_MASK;
2007 	pq = &vm_page_queues[basequeue + index];
2008 
2009 	/*
2010 	 * Try this cpu's colored queue first.  Test for a page unlocked,
2011 	 * then lock the queue and locate a page.  Note that the lock order
2012 	 * is reversed, but we do not want to dwadle on the page spinlock
2013 	 * anyway as it is held significantly longer than the queue spinlock.
2014 	 */
2015 	if (TAILQ_FIRST(&pq->pl)) {
2016 		spin_lock(&pq->spin);
2017 		TAILQ_FOREACH(m, &pq->pl, pageq) {
2018 			if (spin_trylock(&m->spin) == 0)
2019 				continue;
2020 			KKASSERT(m->queue == basequeue + index);
2021 			pq->lastq = -1;
2022 			return(m);
2023 		}
2024 		spin_unlock(&pq->spin);
2025 	}
2026 
2027 	m = _vm_page_list_find_wide(basequeue, index, &pq->lastq);
2028 
2029 	return(m);
2030 }
2031 
2032 /*
2033  * If we could not find the page in the desired queue try to find it in
2034  * a nearby (NUMA-aware) queue, spreading out as we go.
2035  */
2036 static vm_page_t
2037 _vm_page_list_find_wide(int basequeue, int index, int *lastp)
2038 {
2039 	struct vpgqueues *pq;
2040 	vm_page_t m = NULL;
2041 	int pqmask = set_assoc_mask >> 1;
2042 	int pqi;
2043 	int range;
2044 	int skip_start;
2045 	int skip_next;
2046 	int count;
2047 
2048 	/*
2049 	 * Avoid re-searching empty queues over and over again skip to
2050 	 * pq->last if appropriate.
2051 	 */
2052 	if (*lastp >= 0)
2053 		index = *lastp;
2054 
2055 	index &= PQ_L2_MASK;
2056 	pq = &vm_page_queues[basequeue];
2057 	count = 0;
2058 	skip_start = -1;
2059 	skip_next = -1;
2060 
2061 	/*
2062 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2063 	 * else fails (PQ_L2_MASK).
2064 	 *
2065 	 * pqmask is a mask, 15, 31, 63, etc.
2066 	 *
2067 	 * Test each queue unlocked first, then lock the queue and locate
2068 	 * a page.  Note that the lock order is reversed, but we do not want
2069 	 * to dwadle on the page spinlock anyway as it is held significantly
2070 	 * longer than the queue spinlock.
2071 	 */
2072 	do {
2073 		pqmask = (pqmask << 1) | 1;
2074 
2075 		pqi = index;
2076 		range = pqmask + 1;
2077 
2078 		while (range > 0) {
2079 			if (pqi >= skip_start && pqi < skip_next) {
2080 				range -= skip_next - pqi;
2081 				pqi = (pqi & ~pqmask) | (skip_next & pqmask);
2082 			}
2083 			if (range > 0 && TAILQ_FIRST(&pq[pqi].pl)) {
2084 				spin_lock(&pq[pqi].spin);
2085 				TAILQ_FOREACH(m, &pq[pqi].pl, pageq) {
2086 					if (spin_trylock(&m->spin) == 0)
2087 						continue;
2088 					KKASSERT(m->queue == basequeue + pqi);
2089 
2090 					/*
2091 					 * If we had to wander too far, set
2092 					 * *lastp to skip past empty queues.
2093 					 */
2094 					if (count >= 8)
2095 						*lastp = pqi & PQ_L2_MASK;
2096 					return(m);
2097 				}
2098 				spin_unlock(&pq[pqi].spin);
2099 			}
2100 			--range;
2101 			++count;
2102 			pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
2103 		}
2104 		skip_start = pqi & ~pqmask;
2105 		skip_next = (pqi | pqmask) + 1;
2106 	} while (pqmask != PQ_L2_MASK);
2107 
2108 	return(m);
2109 }
2110 
2111 static __inline
2112 vm_page_t
2113 _vm_page_list_find2(int bq1, int bq2, int index)
2114 {
2115 	struct vpgqueues *pq1;
2116 	struct vpgqueues *pq2;
2117 	vm_page_t m;
2118 
2119 	index &= PQ_L2_MASK;
2120 	pq1 = &vm_page_queues[bq1 + index];
2121 	pq2 = &vm_page_queues[bq2 + index];
2122 
2123 	/*
2124 	 * Try this cpu's colored queue first.  Test for a page unlocked,
2125 	 * then lock the queue and locate a page.  Note that the lock order
2126 	 * is reversed, but we do not want to dwadle on the page spinlock
2127 	 * anyway as it is held significantly longer than the queue spinlock.
2128 	 */
2129 	if (TAILQ_FIRST(&pq1->pl)) {
2130 		spin_lock(&pq1->spin);
2131 		TAILQ_FOREACH(m, &pq1->pl, pageq) {
2132 			if (spin_trylock(&m->spin) == 0)
2133 				continue;
2134 			KKASSERT(m->queue == bq1 + index);
2135 			pq1->lastq = -1;
2136 			pq2->lastq = -1;
2137 			return(m);
2138 		}
2139 		spin_unlock(&pq1->spin);
2140 	}
2141 
2142 	m = _vm_page_list_find2_wide(bq1, bq2, index, &pq1->lastq, &pq2->lastq);
2143 
2144 	return(m);
2145 }
2146 
2147 
2148 /*
2149  * This version checks two queues at the same time, widening its search
2150  * as we progress.  prefering basequeue1
2151  * and starting on basequeue2 after exhausting the first set.  The idea
2152  * is to try to stay localized to the cpu.
2153  */
2154 static vm_page_t
2155 _vm_page_list_find2_wide(int basequeue1, int basequeue2, int index,
2156 			 int *lastp1, int *lastp2)
2157 {
2158 	struct vpgqueues *pq1;
2159 	struct vpgqueues *pq2;
2160 	vm_page_t m = NULL;
2161 	int pqmask1, pqmask2;
2162 	int pqi;
2163 	int range;
2164 	int skip_start1, skip_start2;
2165 	int skip_next1, skip_next2;
2166 	int count1, count2;
2167 
2168 	/*
2169 	 * Avoid re-searching empty queues over and over again skip to
2170 	 * pq->last if appropriate.
2171 	 */
2172 	if (*lastp1 >= 0)
2173 		index = *lastp1;
2174 
2175 	index &= PQ_L2_MASK;
2176 
2177 	pqmask1 = set_assoc_mask >> 1;
2178 	pq1 = &vm_page_queues[basequeue1];
2179 	count1 = 0;
2180 	skip_start1 = -1;
2181 	skip_next1 = -1;
2182 
2183 	pqmask2 = set_assoc_mask >> 1;
2184 	pq2 = &vm_page_queues[basequeue2];
2185 	count2 = 0;
2186 	skip_start2 = -1;
2187 	skip_next2 = -1;
2188 
2189 	/*
2190 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2191 	 * else fails (PQ_L2_MASK).
2192 	 *
2193 	 * pqmask is a mask, 15, 31, 63, etc.
2194 	 *
2195 	 * Test each queue unlocked first, then lock the queue and locate
2196 	 * a page.  Note that the lock order is reversed, but we do not want
2197 	 * to dwadle on the page spinlock anyway as it is held significantly
2198 	 * longer than the queue spinlock.
2199 	 */
2200 	do {
2201 		if (pqmask1 == PQ_L2_MASK)
2202 			goto skip2;
2203 
2204 		pqmask1 = (pqmask1 << 1) | 1;
2205 		pqi = index;
2206 		range = pqmask1 + 1;
2207 
2208 		while (range > 0) {
2209 			if (pqi >= skip_start1 && pqi < skip_next1) {
2210 				range -= skip_next1 - pqi;
2211 				pqi = (pqi & ~pqmask1) | (skip_next1 & pqmask1);
2212 			}
2213 			if (range > 0 && TAILQ_FIRST(&pq1[pqi].pl)) {
2214 				spin_lock(&pq1[pqi].spin);
2215 				TAILQ_FOREACH(m, &pq1[pqi].pl, pageq) {
2216 					if (spin_trylock(&m->spin) == 0)
2217 						continue;
2218 					KKASSERT(m->queue == basequeue1 + pqi);
2219 
2220 					/*
2221 					 * If we had to wander too far, set
2222 					 * *lastp to skip past empty queues.
2223 					 */
2224 					if (count1 >= 8)
2225 						*lastp1 = pqi & PQ_L2_MASK;
2226 					return(m);
2227 				}
2228 				spin_unlock(&pq1[pqi].spin);
2229 			}
2230 			--range;
2231 			++count1;
2232 			pqi = (pqi & ~pqmask1) | ((pqi + 1) & pqmask1);
2233 		}
2234 		skip_start1 = pqi & ~pqmask1;
2235 		skip_next1 = (pqi | pqmask1) + 1;
2236 skip2:
2237 		if (pqmask1 < ((set_assoc_mask << 1) | 1))
2238 			continue;
2239 
2240 		pqmask2 = (pqmask2 << 1) | 1;
2241 		pqi = index;
2242 		range = pqmask2 + 1;
2243 
2244 		while (range > 0) {
2245 			if (pqi >= skip_start2 && pqi < skip_next2) {
2246 				range -= skip_next2 - pqi;
2247 				pqi = (pqi & ~pqmask2) | (skip_next2 & pqmask2);
2248 			}
2249 			if (range > 0 && TAILQ_FIRST(&pq2[pqi].pl)) {
2250 				spin_lock(&pq2[pqi].spin);
2251 				TAILQ_FOREACH(m, &pq2[pqi].pl, pageq) {
2252 					if (spin_trylock(&m->spin) == 0)
2253 						continue;
2254 					KKASSERT(m->queue == basequeue2 + pqi);
2255 
2256 					/*
2257 					 * If we had to wander too far, set
2258 					 * *lastp to skip past empty queues.
2259 					 */
2260 					if (count2 >= 8)
2261 						*lastp2 = pqi & PQ_L2_MASK;
2262 					return(m);
2263 				}
2264 				spin_unlock(&pq2[pqi].spin);
2265 			}
2266 			--range;
2267 			++count2;
2268 			pqi = (pqi & ~pqmask2) | ((pqi + 1) & pqmask2);
2269 		}
2270 		skip_start2 = pqi & ~pqmask2;
2271 		skip_next2 = (pqi | pqmask2) + 1;
2272 	} while (pqmask1 != PQ_L2_MASK && pqmask2 != PQ_L2_MASK);
2273 
2274 	return(m);
2275 }
2276 
2277 /*
2278  * Returns a vm_page candidate for allocation.  The page is not busied so
2279  * it can move around.  The caller must busy the page (and typically
2280  * deactivate it if it cannot be busied!)
2281  *
2282  * Returns a spinlocked vm_page that has been removed from its queue.
2283  * (note that _vm_page_list_find() does not remove the page from its
2284  *  queue).
2285  */
2286 vm_page_t
2287 vm_page_list_find(int basequeue, int index)
2288 {
2289 	vm_page_t m;
2290 
2291 	m = _vm_page_list_find(basequeue, index);
2292 	if (m)
2293 		_vm_page_rem_queue_spinlocked(m);
2294 	return m;
2295 }
2296 
2297 /*
2298  * Find a page on the cache queue with color optimization, remove it
2299  * from the queue, and busy it.  The returned page will not be spinlocked.
2300  *
2301  * A candidate failure will be deactivated.  Candidates can fail due to
2302  * being busied by someone else, in which case they will be deactivated.
2303  *
2304  * This routine may not block.
2305  *
2306  */
2307 static vm_page_t
2308 vm_page_select_cache(u_short pg_color)
2309 {
2310 	vm_page_t m;
2311 
2312 	for (;;) {
2313 		m = _vm_page_list_find(PQ_CACHE, pg_color);
2314 		if (m == NULL)
2315 			break;
2316 		/*
2317 		 * (m) has been spinlocked
2318 		 */
2319 		_vm_page_rem_queue_spinlocked(m);
2320 		if (vm_page_busy_try(m, TRUE)) {
2321 			_vm_page_deactivate_locked(m, 0);
2322 			vm_page_spin_unlock(m);
2323 		} else {
2324 			/*
2325 			 * We successfully busied the page
2326 			 */
2327 			if ((m->flags & PG_NEED_COMMIT) == 0 &&
2328 			    m->hold_count == 0 &&
2329 			    m->wire_count == 0 &&
2330 			    (m->dirty & m->valid) == 0) {
2331 				vm_page_spin_unlock(m);
2332 				KKASSERT((m->flags & PG_UNQUEUED) == 0);
2333 				pagedaemon_wakeup();
2334 				return(m);
2335 			}
2336 
2337 			/*
2338 			 * The page cannot be recycled, deactivate it.
2339 			 */
2340 			_vm_page_deactivate_locked(m, 0);
2341 			if (_vm_page_wakeup(m)) {
2342 				vm_page_spin_unlock(m);
2343 				wakeup(m);
2344 			} else {
2345 				vm_page_spin_unlock(m);
2346 			}
2347 		}
2348 	}
2349 	return (m);
2350 }
2351 
2352 /*
2353  * Find a free page.  We attempt to inline the nominal case and fall back
2354  * to _vm_page_select_free() otherwise.  A busied page is removed from
2355  * the queue and returned.
2356  *
2357  * This routine may not block.
2358  */
2359 static __inline vm_page_t
2360 vm_page_select_free(u_short pg_color)
2361 {
2362 	vm_page_t m;
2363 
2364 	for (;;) {
2365 		m = _vm_page_list_find(PQ_FREE, pg_color);
2366 		if (m == NULL)
2367 			break;
2368 		_vm_page_rem_queue_spinlocked(m);
2369 		if (vm_page_busy_try(m, TRUE)) {
2370 			/*
2371 			 * Various mechanisms such as a pmap_collect can
2372 			 * result in a busy page on the free queue.  We
2373 			 * have to move the page out of the way so we can
2374 			 * retry the allocation.  If the other thread is not
2375 			 * allocating the page then m->valid will remain 0 and
2376 			 * the pageout daemon will free the page later on.
2377 			 *
2378 			 * Since we could not busy the page, however, we
2379 			 * cannot make assumptions as to whether the page
2380 			 * will be allocated by the other thread or not,
2381 			 * so all we can do is deactivate it to move it out
2382 			 * of the way.  In particular, if the other thread
2383 			 * wires the page it may wind up on the inactive
2384 			 * queue and the pageout daemon will have to deal
2385 			 * with that case too.
2386 			 */
2387 			_vm_page_deactivate_locked(m, 0);
2388 			vm_page_spin_unlock(m);
2389 		} else {
2390 			/*
2391 			 * Theoretically if we are able to busy the page
2392 			 * atomic with the queue removal (using the vm_page
2393 			 * lock) nobody else should have been able to mess
2394 			 * with the page before us.
2395 			 *
2396 			 * Assert the page state.  Note that even though
2397 			 * wiring doesn't adjust queues, a page on the free
2398 			 * queue should never be wired at this point.
2399 			 */
2400 			KKASSERT((m->flags & (PG_UNQUEUED |
2401 					      PG_NEED_COMMIT)) == 0);
2402 			KASSERT(m->hold_count == 0,
2403 				("m->hold_count is not zero "
2404 				 "pg %p q=%d flags=%08x hold=%d wire=%d",
2405 				 m, m->queue, m->flags,
2406 				 m->hold_count, m->wire_count));
2407 			KKASSERT(m->wire_count == 0);
2408 			vm_page_spin_unlock(m);
2409 			pagedaemon_wakeup();
2410 
2411 			/* return busied and removed page */
2412 			return(m);
2413 		}
2414 	}
2415 	return(m);
2416 }
2417 
2418 static __inline vm_page_t
2419 vm_page_select_free_or_cache(u_short pg_color, int *fromcachep)
2420 {
2421 	vm_page_t m;
2422 
2423 	*fromcachep = 0;
2424 	for (;;) {
2425 		m = _vm_page_list_find2(PQ_FREE, PQ_CACHE, pg_color);
2426 		if (m == NULL)
2427 			break;
2428 		if (vm_page_busy_try(m, TRUE)) {
2429 			_vm_page_rem_queue_spinlocked(m);
2430 			_vm_page_deactivate_locked(m, 0);
2431 			vm_page_spin_unlock(m);
2432 		} else if (m->queue - m->pc == PQ_FREE) {
2433 			/*
2434 			 * We successfully busied the page, PQ_FREE case
2435 			 */
2436 			_vm_page_rem_queue_spinlocked(m);
2437 			KKASSERT((m->flags & (PG_UNQUEUED |
2438 					      PG_NEED_COMMIT)) == 0);
2439 			KASSERT(m->hold_count == 0,
2440 				("m->hold_count is not zero "
2441 				 "pg %p q=%d flags=%08x hold=%d wire=%d",
2442 				 m, m->queue, m->flags,
2443 				 m->hold_count, m->wire_count));
2444 			KKASSERT(m->wire_count == 0);
2445 			vm_page_spin_unlock(m);
2446 			pagedaemon_wakeup();
2447 
2448 			/* return busied and removed page */
2449 			return(m);
2450 		} else {
2451 			/*
2452 			 * We successfully busied the page, PQ_CACHE case
2453 			 */
2454 			_vm_page_rem_queue_spinlocked(m);
2455 			if ((m->flags & PG_NEED_COMMIT) == 0 &&
2456 			    m->hold_count == 0 &&
2457 			    m->wire_count == 0 &&
2458 			    (m->dirty & m->valid) == 0) {
2459 				vm_page_spin_unlock(m);
2460 				KKASSERT((m->flags & PG_UNQUEUED) == 0);
2461 				pagedaemon_wakeup();
2462 				*fromcachep = 1;
2463 				return(m);
2464 			}
2465 
2466 			/*
2467 			 * The page cannot be recycled, deactivate it.
2468 			 */
2469 			_vm_page_deactivate_locked(m, 0);
2470 			if (_vm_page_wakeup(m)) {
2471 				vm_page_spin_unlock(m);
2472 				wakeup(m);
2473 			} else {
2474 				vm_page_spin_unlock(m);
2475 			}
2476 		}
2477 	}
2478 	return(m);
2479 }
2480 
2481 /*
2482  * vm_page_alloc()
2483  *
2484  * Allocate and return a memory cell associated with this VM object/offset
2485  * pair.  If object is NULL an unassociated page will be allocated.
2486  *
2487  * The returned page will be busied and removed from its queues.  This
2488  * routine can block and may return NULL if a race occurs and the page
2489  * is found to already exist at the specified (object, pindex).
2490  *
2491  *	VM_ALLOC_NORMAL		allow use of cache pages, nominal free drain
2492  *	VM_ALLOC_QUICK		like normal but cannot use cache
2493  *	VM_ALLOC_SYSTEM		greater free drain
2494  *	VM_ALLOC_INTERRUPT	allow free list to be completely drained
2495  *	VM_ALLOC_ZERO		advisory request for pre-zero'd page only
2496  *	VM_ALLOC_FORCE_ZERO	advisory request for pre-zero'd page only
2497  *	VM_ALLOC_NULL_OK	ok to return NULL on insertion collision
2498  *				(see vm_page_grab())
2499  *	VM_ALLOC_USE_GD		ok to use per-gd cache
2500  *
2501  *	VM_ALLOC_CPU(n)		allocate using specified cpu localization
2502  *
2503  * The object must be held if not NULL
2504  * This routine may not block
2505  *
2506  * Additional special handling is required when called from an interrupt
2507  * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
2508  * in this case.
2509  */
2510 vm_page_t
2511 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
2512 {
2513 	globaldata_t gd;
2514 	vm_object_t obj;
2515 	vm_page_t m;
2516 	u_short pg_color;
2517 	int cpuid_local;
2518 	int fromcache;
2519 
2520 #if 0
2521 	/*
2522 	 * Special per-cpu free VM page cache.  The pages are pre-busied
2523 	 * and pre-zerod for us.
2524 	 */
2525 	if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
2526 		crit_enter_gd(gd);
2527 		if (gd->gd_vmpg_count) {
2528 			m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
2529 			crit_exit_gd(gd);
2530 			goto done;
2531                 }
2532 		crit_exit_gd(gd);
2533         }
2534 #endif
2535 	m = NULL;
2536 
2537 	/*
2538 	 * CPU LOCALIZATION
2539 	 *
2540 	 * CPU localization algorithm.  Break the page queues up by physical
2541 	 * id and core id (note that two cpu threads will have the same core
2542 	 * id, and core_id != gd_cpuid).
2543 	 *
2544 	 * This is nowhere near perfect, for example the last pindex in a
2545 	 * subgroup will overflow into the next cpu or package.  But this
2546 	 * should get us good page reuse locality in heavy mixed loads.
2547 	 *
2548 	 * (may be executed before the APs are started, so other GDs might
2549 	 *  not exist!)
2550 	 */
2551 	if (page_req & VM_ALLOC_CPU_SPEC)
2552 		cpuid_local = VM_ALLOC_GETCPU(page_req);
2553 	else
2554 		cpuid_local = mycpu->gd_cpuid;
2555 
2556 	pg_color = vm_get_pg_color(cpuid_local, object, pindex);
2557 
2558 	KKASSERT(page_req &
2559 		(VM_ALLOC_NORMAL|VM_ALLOC_QUICK|
2560 		 VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
2561 
2562 	/*
2563 	 * Certain system threads (pageout daemon, buf_daemon's) are
2564 	 * allowed to eat deeper into the free page list.
2565 	 */
2566 	if (curthread->td_flags & TDF_SYSTHREAD)
2567 		page_req |= VM_ALLOC_SYSTEM;
2568 
2569 	/*
2570 	 * Impose various limitations.  Note that the v_free_reserved test
2571 	 * must match the opposite of vm_page_count_target() to avoid
2572 	 * livelocks, be careful.
2573 	 */
2574 loop:
2575 	gd = mycpu;
2576 	if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
2577 	    ((page_req & VM_ALLOC_INTERRUPT) &&
2578 	     gd->gd_vmstats.v_free_count > 0) ||
2579 	    ((page_req & VM_ALLOC_SYSTEM) &&
2580 	     gd->gd_vmstats.v_cache_count == 0 &&
2581 	     gd->gd_vmstats.v_free_count >
2582 	     gd->gd_vmstats.v_interrupt_free_min)
2583 	) {
2584 		/*
2585 		 * The free queue has sufficient free pages to take one out.
2586 		 *
2587 		 * However, if the free queue is strained the scan may widen
2588 		 * to the entire queue and cause a great deal of SMP
2589 		 * contention, so we use a double-queue-scan if we can
2590 		 * to avoid this.
2591 		 */
2592 		if (page_req & VM_ALLOC_NORMAL) {
2593 			m = vm_page_select_free_or_cache(pg_color, &fromcache);
2594 			if (m && fromcache)
2595 				goto found_cache;
2596 		} else {
2597 			m = vm_page_select_free(pg_color);
2598 		}
2599 	} else if (page_req & VM_ALLOC_NORMAL) {
2600 		/*
2601 		 * Allocatable from the cache (non-interrupt only).  On
2602 		 * success, we must free the page and try again, thus
2603 		 * ensuring that vmstats.v_*_free_min counters are replenished.
2604 		 */
2605 #ifdef INVARIANTS
2606 		if (curthread->td_preempted) {
2607 			kprintf("vm_page_alloc(): warning, attempt to allocate"
2608 				" cache page from preempting interrupt\n");
2609 			m = NULL;
2610 		} else {
2611 			m = vm_page_select_cache(pg_color);
2612 		}
2613 #else
2614 		m = vm_page_select_cache(pg_color);
2615 #endif
2616 		/*
2617 		 * On success move the page into the free queue and loop.
2618 		 *
2619 		 * Only do this if we can safely acquire the vm_object lock,
2620 		 * because this is effectively a random page and the caller
2621 		 * might be holding the lock shared, we don't want to
2622 		 * deadlock.
2623 		 */
2624 		if (m != NULL) {
2625 found_cache:
2626 			KASSERT(m->dirty == 0,
2627 				("Found dirty cache page %p", m));
2628 			if (__predict_true((m->flags &
2629 					    (PG_MAPPED|PG_WRITEABLE)) == 0)) {
2630 				vm_page_free(m);
2631 			} else if ((obj = m->object) != NULL) {
2632 				if (vm_object_hold_try(obj)) {
2633 					vm_page_protect(m, VM_PROT_NONE);
2634 					vm_page_free(m);
2635 					/* m->object NULL here */
2636 					vm_object_drop(obj);
2637 				} else {
2638 					vm_page_deactivate(m);
2639 					vm_page_wakeup(m);
2640 				}
2641 			} else {
2642 				vm_page_protect(m, VM_PROT_NONE);
2643 				vm_page_free(m);
2644 			}
2645 			goto loop;
2646 		}
2647 
2648 		/*
2649 		 * On failure return NULL
2650 		 */
2651 		atomic_add_int(&vm_pageout_deficit, 1);
2652 		pagedaemon_wakeup();
2653 		return (NULL);
2654 	} else {
2655 		/*
2656 		 * No pages available, wakeup the pageout daemon and give up.
2657 		 */
2658 		atomic_add_int(&vm_pageout_deficit, 1);
2659 		pagedaemon_wakeup();
2660 		return (NULL);
2661 	}
2662 
2663 	/*
2664 	 * v_free_count can race so loop if we don't find the expected
2665 	 * page.
2666 	 */
2667 	if (m == NULL) {
2668 		vmstats_rollup();
2669 		goto loop;
2670 	}
2671 
2672 	/*
2673 	 * Good page found.  The page has already been busied for us and
2674 	 * removed from its queues.
2675 	 */
2676 	KASSERT(m->dirty == 0,
2677 		("vm_page_alloc: free/cache page %p was dirty", m));
2678 	KKASSERT(m->queue == PQ_NONE);
2679 
2680 #if 0
2681 done:
2682 #endif
2683 	/*
2684 	 * Initialize the structure, inheriting some flags but clearing
2685 	 * all the rest.  The page has already been busied for us.
2686 	 */
2687 	vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
2688 
2689 	KKASSERT(m->wire_count == 0);
2690 	KKASSERT((m->busy_count & PBUSY_MASK) == 0);
2691 	m->act_count = 0;
2692 	m->valid = 0;
2693 
2694 	/*
2695 	 * Caller must be holding the object lock (asserted by
2696 	 * vm_page_insert()).
2697 	 *
2698 	 * NOTE: Inserting a page here does not insert it into any pmaps
2699 	 *	 (which could cause us to block allocating memory).
2700 	 *
2701 	 * NOTE: If no object an unassociated page is allocated, m->pindex
2702 	 *	 can be used by the caller for any purpose.
2703 	 */
2704 	if (object) {
2705 		if (vm_page_insert(m, object, pindex) == FALSE) {
2706 			vm_page_free(m);
2707 			if ((page_req & VM_ALLOC_NULL_OK) == 0)
2708 				panic("PAGE RACE %p[%ld]/%p",
2709 				      object, (long)pindex, m);
2710 			m = NULL;
2711 		}
2712 	} else {
2713 		m->pindex = pindex;
2714 	}
2715 
2716 	/*
2717 	 * Don't wakeup too often - wakeup the pageout daemon when
2718 	 * we would be nearly out of memory.
2719 	 */
2720 	pagedaemon_wakeup();
2721 
2722 	/*
2723 	 * A BUSY page is returned.
2724 	 */
2725 	return (m);
2726 }
2727 
2728 /*
2729  * Returns number of pages available in our DMA memory reserve
2730  * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf)
2731  */
2732 vm_size_t
2733 vm_contig_avail_pages(void)
2734 {
2735 	alist_blk_t blk;
2736 	alist_blk_t count;
2737 	alist_blk_t bfree;
2738 	spin_lock(&vm_contig_spin);
2739 	bfree = alist_free_info(&vm_contig_alist, &blk, &count);
2740 	spin_unlock(&vm_contig_spin);
2741 
2742 	return bfree;
2743 }
2744 
2745 /*
2746  * Attempt to allocate contiguous physical memory with the specified
2747  * requirements.
2748  */
2749 vm_page_t
2750 vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
2751 		     unsigned long alignment, unsigned long boundary,
2752 		     unsigned long size, vm_memattr_t memattr)
2753 {
2754 	alist_blk_t blk;
2755 	vm_page_t m;
2756 	vm_pindex_t i;
2757 #if 0
2758 	static vm_pindex_t contig_rover;
2759 #endif
2760 
2761 	alignment >>= PAGE_SHIFT;
2762 	if (alignment == 0)
2763 		alignment = 1;
2764 	boundary >>= PAGE_SHIFT;
2765 	if (boundary == 0)
2766 		boundary = 1;
2767 	size = (size + PAGE_MASK) >> PAGE_SHIFT;
2768 
2769 #if 0
2770 	/*
2771 	 * Disabled temporarily until we find a solution for DRM (a flag
2772 	 * to always use the free space reserve, for performance).
2773 	 */
2774 	if (high == BUS_SPACE_MAXADDR && alignment <= PAGE_SIZE &&
2775 	    boundary <= PAGE_SIZE && size == 1 &&
2776 	    memattr == VM_MEMATTR_DEFAULT) {
2777 		/*
2778 		 * Any page will work, use vm_page_alloc()
2779 		 * (e.g. when used from kmem_alloc_attr())
2780 		 */
2781 		m = vm_page_alloc(NULL, (contig_rover++) & 0x7FFFFFFF,
2782 				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
2783 				  VM_ALLOC_INTERRUPT);
2784 		m->valid = VM_PAGE_BITS_ALL;
2785 		vm_page_wire(m);
2786 		vm_page_wakeup(m);
2787 	} else
2788 #endif
2789 	{
2790 		/*
2791 		 * Use the low-memory dma reserve
2792 		 */
2793 		spin_lock(&vm_contig_spin);
2794 		blk = alist_alloc(&vm_contig_alist, 0, size);
2795 		if (blk == ALIST_BLOCK_NONE) {
2796 			spin_unlock(&vm_contig_spin);
2797 			if (bootverbose) {
2798 				kprintf("vm_page_alloc_contig: %ldk nospace\n",
2799 					(size << PAGE_SHIFT) / 1024);
2800 				print_backtrace(5);
2801 			}
2802 			return(NULL);
2803 		}
2804 		if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
2805 			alist_free(&vm_contig_alist, blk, size);
2806 			spin_unlock(&vm_contig_spin);
2807 			if (bootverbose) {
2808 				kprintf("vm_page_alloc_contig: %ldk high "
2809 					"%016jx failed\n",
2810 					(size << PAGE_SHIFT) / 1024,
2811 					(intmax_t)high);
2812 			}
2813 			return(NULL);
2814 		}
2815 		spin_unlock(&vm_contig_spin);
2816 
2817 		/*
2818 		 * Base vm_page_t of range
2819 		 */
2820 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
2821 	}
2822 	if (vm_contig_verbose) {
2823 		kprintf("vm_page_alloc_contig: %016jx/%ldk "
2824 			"(%016jx-%016jx al=%lu bo=%lu pgs=%lu attr=%d\n",
2825 			(intmax_t)m->phys_addr,
2826 			(size << PAGE_SHIFT) / 1024,
2827 			low, high, alignment, boundary, size, memattr);
2828 	}
2829 	if (memattr != VM_MEMATTR_DEFAULT) {
2830 		for (i = 0; i < size; ++i) {
2831 			KKASSERT(m[i].flags & PG_FICTITIOUS);
2832 			pmap_page_set_memattr(&m[i], memattr);
2833 		}
2834 	}
2835 	return m;
2836 }
2837 
2838 /*
2839  * Free contiguously allocated pages.  The pages will be wired but not busy.
2840  * When freeing to the alist we leave them wired and not busy.
2841  */
2842 void
2843 vm_page_free_contig(vm_page_t m, unsigned long size)
2844 {
2845 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
2846 	vm_pindex_t start = pa >> PAGE_SHIFT;
2847 	vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
2848 
2849 	if (vm_contig_verbose) {
2850 		kprintf("vm_page_free_contig:  %016jx/%ldk\n",
2851 			(intmax_t)pa, size / 1024);
2852 	}
2853 	if (pa < vm_low_phys_reserved) {
2854 		/*
2855 		 * Just assert check the first page for convenience.
2856 		 */
2857 		KKASSERT(m->wire_count == 1);
2858 		KKASSERT(m->flags & PG_FICTITIOUS);
2859 		KKASSERT(pa + size <= vm_low_phys_reserved);
2860 		spin_lock(&vm_contig_spin);
2861 		alist_free(&vm_contig_alist, start, pages);
2862 		spin_unlock(&vm_contig_spin);
2863 	} else {
2864 		while (pages) {
2865 			/* XXX FUTURE, maybe (pair with vm_pg_contig_alloc()) */
2866 			/*vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);*/
2867 			vm_page_busy_wait(m, FALSE, "cpgfr");
2868 			vm_page_unwire(m, 0);
2869 			vm_page_free(m);
2870 			--pages;
2871 			++m;
2872 		}
2873 
2874 	}
2875 }
2876 
2877 
2878 /*
2879  * Wait for sufficient free memory for nominal heavy memory use kernel
2880  * operations.
2881  *
2882  * WARNING!  Be sure never to call this in any vm_pageout code path, which
2883  *	     will trivially deadlock the system.
2884  */
2885 void
2886 vm_wait_nominal(void)
2887 {
2888 	while (vm_page_count_min(0))
2889 		vm_wait(0);
2890 }
2891 
2892 /*
2893  * Test if vm_wait_nominal() would block.
2894  */
2895 int
2896 vm_test_nominal(void)
2897 {
2898 	if (vm_page_count_min(0))
2899 		return(1);
2900 	return(0);
2901 }
2902 
2903 /*
2904  * Block until free pages are available for allocation, called in various
2905  * places before memory allocations.
2906  *
2907  * The caller may loop if vm_page_count_min() == FALSE so we cannot be
2908  * more generous then that.
2909  */
2910 void
2911 vm_wait(int timo)
2912 {
2913 	/*
2914 	 * never wait forever
2915 	 */
2916 	if (timo == 0)
2917 		timo = hz;
2918 	lwkt_gettoken(&vm_token);
2919 
2920 	if (curthread == pagethread ||
2921 	    curthread == emergpager) {
2922 		/*
2923 		 * The pageout daemon itself needs pages, this is bad.
2924 		 */
2925 		if (vm_page_count_min(0)) {
2926 			vm_pageout_pages_needed = 1;
2927 			tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
2928 		}
2929 	} else {
2930 		/*
2931 		 * Wakeup the pageout daemon if necessary and wait.
2932 		 *
2933 		 * Do not wait indefinitely for the target to be reached,
2934 		 * as load might prevent it from being reached any time soon.
2935 		 * But wait a little to try to slow down page allocations
2936 		 * and to give more important threads (the pagedaemon)
2937 		 * allocation priority.
2938 		 */
2939 		if (vm_page_count_target()) {
2940 			if (vm_pages_needed <= 1) {
2941 				++vm_pages_needed;
2942 				wakeup(&vm_pages_needed);
2943 			}
2944 			++vm_pages_waiting;	/* SMP race ok */
2945 			tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
2946 		}
2947 	}
2948 	lwkt_reltoken(&vm_token);
2949 }
2950 
2951 /*
2952  * Block until free pages are available for allocation
2953  *
2954  * Called only from vm_fault so that processes page faulting can be
2955  * easily tracked.
2956  */
2957 void
2958 vm_wait_pfault(void)
2959 {
2960 	/*
2961 	 * Wakeup the pageout daemon if necessary and wait.
2962 	 *
2963 	 * Do not wait indefinitely for the target to be reached,
2964 	 * as load might prevent it from being reached any time soon.
2965 	 * But wait a little to try to slow down page allocations
2966 	 * and to give more important threads (the pagedaemon)
2967 	 * allocation priority.
2968 	 */
2969 	if (vm_page_count_min(0)) {
2970 		lwkt_gettoken(&vm_token);
2971 		while (vm_page_count_severe()) {
2972 			if (vm_page_count_target()) {
2973 				thread_t td;
2974 
2975 				if (vm_pages_needed <= 1) {
2976 					++vm_pages_needed;
2977 					wakeup(&vm_pages_needed);
2978 				}
2979 				++vm_pages_waiting;	/* SMP race ok */
2980 				tsleep(&vmstats.v_free_count, 0, "pfault", hz);
2981 
2982 				/*
2983 				 * Do not stay stuck in the loop if the system is trying
2984 				 * to kill the process.
2985 				 */
2986 				td = curthread;
2987 				if (td->td_proc && (td->td_proc->p_flags & P_LOWMEMKILL))
2988 					break;
2989 			}
2990 		}
2991 		lwkt_reltoken(&vm_token);
2992 	}
2993 }
2994 
2995 /*
2996  * Put the specified page on the active list (if appropriate).  Ensure
2997  * that act_count is at least ACT_INIT but do not otherwise mess with it.
2998  *
2999  * The caller should be holding the page busied ? XXX
3000  * This routine may not block.
3001  *
3002  * It is ok if the page is wired (so buffer cache operations don't have
3003  * to mess with the page queues).
3004  */
3005 void
3006 vm_page_activate(vm_page_t m)
3007 {
3008 	u_short oqueue;
3009 
3010 	/*
3011 	 * If already active or inappropriate, just set act_count and
3012 	 * return.  We don't have to spin-lock the page.
3013 	 */
3014 	if (m->queue - m->pc == PQ_ACTIVE ||
3015 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3016 		if (m->act_count < ACT_INIT)
3017 			m->act_count = ACT_INIT;
3018 		return;
3019 	}
3020 
3021 	vm_page_spin_lock(m);
3022 	if (m->queue - m->pc != PQ_ACTIVE &&
3023 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3024 		_vm_page_queue_spin_lock(m);
3025 		oqueue = _vm_page_rem_queue_spinlocked(m);
3026 		/* page is left spinlocked, queue is unlocked */
3027 
3028 		if (oqueue == PQ_CACHE)
3029 			mycpu->gd_cnt.v_reactivated++;
3030 		if (m->act_count < ACT_INIT)
3031 			m->act_count = ACT_INIT;
3032 		_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
3033 		_vm_page_and_queue_spin_unlock(m);
3034 		if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
3035 			pagedaemon_wakeup();
3036 	} else {
3037 		if (m->act_count < ACT_INIT)
3038 			m->act_count = ACT_INIT;
3039 		vm_page_spin_unlock(m);
3040 	}
3041 }
3042 
3043 void
3044 vm_page_soft_activate(vm_page_t m)
3045 {
3046 	if (m->queue - m->pc == PQ_ACTIVE ||
3047 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3048 		if (m->act_count < ACT_INIT)
3049 			m->act_count = ACT_INIT;
3050 	} else {
3051 		vm_page_activate(m);
3052 	}
3053 }
3054 
3055 /*
3056  * Helper routine for vm_page_free_toq() and vm_page_cache().  This
3057  * routine is called when a page has been added to the cache or free
3058  * queues.
3059  *
3060  * This routine may not block.
3061  */
3062 static __inline void
3063 vm_page_free_wakeup(void)
3064 {
3065 	globaldata_t gd = mycpu;
3066 
3067 	/*
3068 	 * If the pageout daemon itself needs pages, then tell it that
3069 	 * there are some free.
3070 	 */
3071 	if (vm_pageout_pages_needed &&
3072 	    gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
3073 	    gd->gd_vmstats.v_pageout_free_min
3074 	) {
3075 		vm_pageout_pages_needed = 0;
3076 		wakeup(&vm_pageout_pages_needed);
3077 	}
3078 
3079 	/*
3080 	 * Wakeup processes that are waiting on memory.
3081 	 *
3082 	 * Generally speaking we want to wakeup stuck processes as soon as
3083 	 * possible.  !vm_page_count_min(0) is the absolute minimum point
3084 	 * where we can do this.  Wait a bit longer to reduce degenerate
3085 	 * re-blocking (vm_page_free_hysteresis).  The target check is just
3086 	 * to make sure the min-check w/hysteresis does not exceed the
3087 	 * normal target.
3088 	 */
3089 	if (vm_pages_waiting) {
3090 		if (!vm_page_count_min(vm_page_free_hysteresis) ||
3091 		    !vm_page_count_target()) {
3092 			vm_pages_waiting = 0;
3093 			wakeup(&vmstats.v_free_count);
3094 			++mycpu->gd_cnt.v_ppwakeups;
3095 		}
3096 #if 0
3097 		if (!vm_page_count_target()) {
3098 			/*
3099 			 * Plenty of pages are free, wakeup everyone.
3100 			 */
3101 			vm_pages_waiting = 0;
3102 			wakeup(&vmstats.v_free_count);
3103 			++mycpu->gd_cnt.v_ppwakeups;
3104 		} else if (!vm_page_count_min(0)) {
3105 			/*
3106 			 * Some pages are free, wakeup someone.
3107 			 */
3108 			int wcount = vm_pages_waiting;
3109 			if (wcount > 0)
3110 				--wcount;
3111 			vm_pages_waiting = wcount;
3112 			wakeup_one(&vmstats.v_free_count);
3113 			++mycpu->gd_cnt.v_ppwakeups;
3114 		}
3115 #endif
3116 	}
3117 }
3118 
3119 /*
3120  * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
3121  * it from its VM object.
3122  *
3123  * The vm_page must be BUSY on entry.  BUSY will be released on
3124  * return (the page will have been freed).
3125  */
3126 void
3127 vm_page_free_toq(vm_page_t m)
3128 {
3129 	/*
3130 	 * The page must not be mapped when freed, but we may have to call
3131 	 * pmap_mapped_sync() to validate this.
3132 	 */
3133 	mycpu->gd_cnt.v_tfree++;
3134 	if (m->flags & (PG_MAPPED | PG_WRITEABLE))
3135 		pmap_mapped_sync(m);
3136 	KKASSERT((m->flags & PG_MAPPED) == 0);
3137 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3138 
3139 	if ((m->busy_count & PBUSY_MASK) || ((m->queue - m->pc) == PQ_FREE)) {
3140 		kprintf("vm_page_free: pindex(%lu), busy %08x, "
3141 			"hold(%d)\n",
3142 			(u_long)m->pindex, m->busy_count, m->hold_count);
3143 		if ((m->queue - m->pc) == PQ_FREE)
3144 			panic("vm_page_free: freeing free page");
3145 		else
3146 			panic("vm_page_free: freeing busy page");
3147 	}
3148 
3149 	/*
3150 	 * Remove from object, spinlock the page and its queues and
3151 	 * remove from any queue.  No queue spinlock will be held
3152 	 * after this section (because the page was removed from any
3153 	 * queue).
3154 	 */
3155 	vm_page_remove(m);
3156 
3157 	/*
3158 	 * No further management of fictitious pages occurs beyond object
3159 	 * and queue removal.
3160 	 */
3161 	if ((m->flags & PG_FICTITIOUS) != 0) {
3162 		KKASSERT(m->queue == PQ_NONE);
3163 		vm_page_wakeup(m);
3164 		return;
3165 	}
3166 	vm_page_and_queue_spin_lock(m);
3167 	_vm_page_rem_queue_spinlocked(m);
3168 
3169 	m->valid = 0;
3170 	vm_page_undirty(m);
3171 
3172 	if (m->wire_count != 0) {
3173 		if (m->wire_count > 1) {
3174 		    panic(
3175 			"vm_page_free: invalid wire count (%d), pindex: 0x%lx",
3176 			m->wire_count, (long)m->pindex);
3177 		}
3178 		panic("vm_page_free: freeing wired page");
3179 	}
3180 
3181 	if (!MD_PAGE_FREEABLE(m))
3182 		panic("vm_page_free: page %p is still mapped!", m);
3183 
3184 	/*
3185 	 * Clear the PG_NEED_COMMIT and the PG_UNQUEUED flags.  The
3186 	 * page returns to normal operation and will be placed in
3187 	 * the PQ_HOLD or PQ_FREE queue.
3188 	 */
3189 	vm_page_flag_clear(m, PG_NEED_COMMIT | PG_UNQUEUED);
3190 
3191 	if (m->hold_count != 0) {
3192 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
3193 	} else {
3194 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
3195 	}
3196 
3197 	/*
3198 	 * This sequence allows us to clear BUSY while still holding
3199 	 * its spin lock, which reduces contention vs allocators.  We
3200 	 * must not leave the queue locked or _vm_page_wakeup() may
3201 	 * deadlock.
3202 	 */
3203 	_vm_page_queue_spin_unlock(m);
3204 	if (_vm_page_wakeup(m)) {
3205 		vm_page_spin_unlock(m);
3206 		wakeup(m);
3207 	} else {
3208 		vm_page_spin_unlock(m);
3209 	}
3210 	vm_page_free_wakeup();
3211 }
3212 
3213 /*
3214  * Mark this page as wired down by yet another map.  We do not adjust the
3215  * queue the page is on, it will be checked for wiring as-needed.
3216  *
3217  * This function has no effect on fictitious pages.
3218  *
3219  * Caller must be holding the page busy.
3220  */
3221 void
3222 vm_page_wire(vm_page_t m)
3223 {
3224 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3225 	if ((m->flags & PG_FICTITIOUS) == 0) {
3226 		if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
3227 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count, 1);
3228 		}
3229 		KASSERT(m->wire_count != 0,
3230 			("vm_page_wire: wire_count overflow m=%p", m));
3231 	}
3232 }
3233 
3234 /*
3235  * Release one wiring of this page, potentially enabling it to be paged again.
3236  *
3237  * Note that wired pages are no longer unconditionally removed from the
3238  * paging queues, so the page may already be on a queue.  Move the page
3239  * to the desired queue if necessary.
3240  *
3241  * Many pages placed on the inactive queue should actually go
3242  * into the cache, but it is difficult to figure out which.  What
3243  * we do instead, if the inactive target is well met, is to put
3244  * clean pages at the head of the inactive queue instead of the tail.
3245  * This will cause them to be moved to the cache more quickly and
3246  * if not actively re-referenced, freed more quickly.  If we just
3247  * stick these pages at the end of the inactive queue, heavy filesystem
3248  * meta-data accesses can cause an unnecessary paging load on memory bound
3249  * processes.  This optimization causes one-time-use metadata to be
3250  * reused more quickly.
3251  *
3252  * Pages marked PG_NEED_COMMIT are always activated and never placed on
3253  * the inactive queue.  This helps the pageout daemon determine memory
3254  * pressure and act on out-of-memory situations more quickly.
3255  *
3256  * BUT, if we are in a low-memory situation we have no choice but to
3257  * put clean pages on the cache queue.
3258  *
3259  * A number of routines use vm_page_unwire() to guarantee that the page
3260  * will go into either the inactive or active queues, and will NEVER
3261  * be placed in the cache - for example, just after dirtying a page.
3262  * dirty pages in the cache are not allowed.
3263  *
3264  * PG_FICTITIOUS or PG_UNQUEUED pages are never moved to any queue, and
3265  * the wire_count will not be adjusted in any way for a PG_FICTITIOUS
3266  * page.
3267  *
3268  * This routine may not block.
3269  */
3270 void
3271 vm_page_unwire(vm_page_t m, int activate)
3272 {
3273 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3274 	if (m->flags & PG_FICTITIOUS) {
3275 		/* do nothing */
3276 	} else if ((int)m->wire_count <= 0) {
3277 		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
3278 	} else {
3279 		if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
3280 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count,-1);
3281 			if (m->flags & PG_UNQUEUED) {
3282 				;
3283 			} else if (activate || (m->flags & PG_NEED_COMMIT)) {
3284 				vm_page_activate(m);
3285 			} else {
3286 				vm_page_deactivate(m);
3287 			}
3288 		}
3289 	}
3290 }
3291 
3292 /*
3293  * Move the specified page to the inactive queue.
3294  *
3295  * Normally athead is 0 resulting in LRU operation.  athead is set
3296  * to 1 if we want this page to be 'as if it were placed in the cache',
3297  * except without unmapping it from the process address space.
3298  *
3299  * vm_page's spinlock must be held on entry and will remain held on return.
3300  * This routine may not block.  The caller does not have to hold the page
3301  * busied but should have some sort of interlock on its validity.
3302  *
3303  * It is ok if the page is wired (so buffer cache operations don't have
3304  * to mess with the page queues).
3305  */
3306 static void
3307 _vm_page_deactivate_locked(vm_page_t m, int athead)
3308 {
3309 	u_short oqueue;
3310 
3311 	/*
3312 	 * Ignore if already inactive.
3313 	 */
3314 	if (m->queue - m->pc == PQ_INACTIVE ||
3315 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3316 		return;
3317 	}
3318 
3319 	_vm_page_queue_spin_lock(m);
3320 	oqueue = _vm_page_rem_queue_spinlocked(m);
3321 
3322 	if ((m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3323 		if (oqueue == PQ_CACHE)
3324 			mycpu->gd_cnt.v_reactivated++;
3325 		vm_page_flag_clear(m, PG_WINATCFLS);
3326 		_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
3327 		if (athead == 0) {
3328 			atomic_add_long(
3329 				&vm_page_queues[PQ_INACTIVE + m->pc].adds, 1);
3330 		}
3331 	}
3332 	/* NOTE: PQ_NONE if condition not taken */
3333 	_vm_page_queue_spin_unlock(m);
3334 	/* leaves vm_page spinlocked */
3335 }
3336 
3337 /*
3338  * Attempt to deactivate a page.
3339  *
3340  * No requirements.  We can pre-filter before getting the spinlock.
3341  *
3342  * It is ok if the page is wired (so buffer cache operations don't have
3343  * to mess with the page queues).
3344  */
3345 void
3346 vm_page_deactivate(vm_page_t m)
3347 {
3348 	if (m->queue - m->pc != PQ_INACTIVE &&
3349 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3350 		vm_page_spin_lock(m);
3351 		_vm_page_deactivate_locked(m, 0);
3352 		vm_page_spin_unlock(m);
3353 	}
3354 }
3355 
3356 void
3357 vm_page_deactivate_locked(vm_page_t m)
3358 {
3359 	_vm_page_deactivate_locked(m, 0);
3360 }
3361 
3362 /*
3363  * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy it.
3364  *
3365  * This function returns non-zero if it successfully moved the page to
3366  * PQ_CACHE.
3367  *
3368  * This function unconditionally unbusies the page on return.
3369  */
3370 int
3371 vm_page_try_to_cache(vm_page_t m)
3372 {
3373 	/*
3374 	 * Shortcut if we obviously cannot move the page, or if the
3375 	 * page is already on the cache queue, or it is ficitious.
3376 	 *
3377 	 * Never allow a wired page into the cache.
3378 	 */
3379 	if (m->dirty || m->hold_count || m->wire_count ||
3380 	    m->queue - m->pc == PQ_CACHE ||
3381 	    (m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS))) {
3382 		vm_page_wakeup(m);
3383 		return(0);
3384 	}
3385 
3386 	/*
3387 	 * Page busied by us and no longer spinlocked.  Dirty pages cannot
3388 	 * be moved to the cache, but can be deactivated.  However, users
3389 	 * of this function want to move pages closer to the cache so we
3390 	 * only deactivate it if it is in PQ_ACTIVE.  We do not re-deactivate.
3391 	 */
3392 	vm_page_test_dirty(m);
3393 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3394 		if (m->queue - m->pc == PQ_ACTIVE)
3395 			vm_page_deactivate(m);
3396 		vm_page_wakeup(m);
3397 		return(0);
3398 	}
3399 	vm_page_cache(m);
3400 	return(1);
3401 }
3402 
3403 /*
3404  * Attempt to free the page.  If we cannot free it, we do nothing.
3405  * 1 is returned on success, 0 on failure.
3406  *
3407  * The page can be in any state, including already being on the free
3408  * queue.  Check to see if it really can be freed.  Note that we disallow
3409  * this ad-hoc operation if the page is flagged PG_UNQUEUED.
3410  *
3411  * Caller provides an unlocked/non-busied page.
3412  * No requirements.
3413  */
3414 int
3415 vm_page_try_to_free(vm_page_t m)
3416 {
3417 	if (vm_page_busy_try(m, TRUE))
3418 		return(0);
3419 
3420 	if (m->dirty ||				/* can't free if it is dirty */
3421 	    m->hold_count ||			/* or held (XXX may be wrong) */
3422 	    m->wire_count ||			/* or wired */
3423 	    (m->flags & (PG_UNQUEUED |		/* or unqueued */
3424 			 PG_NEED_COMMIT |	/* or needs a commit */
3425 			 PG_FICTITIOUS)) ||	/* or is fictitious */
3426 	    m->queue - m->pc == PQ_FREE ||	/* already on PQ_FREE */
3427 	    m->queue - m->pc == PQ_HOLD) {	/* already on PQ_HOLD */
3428 		vm_page_wakeup(m);
3429 		return(0);
3430 	}
3431 
3432 	/*
3433 	 * We can probably free the page.
3434 	 *
3435 	 * Page busied by us and no longer spinlocked.  Dirty pages will
3436 	 * not be freed by this function.    We have to re-test the
3437 	 * dirty bit after cleaning out the pmaps.
3438 	 */
3439 	vm_page_test_dirty(m);
3440 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3441 		vm_page_wakeup(m);
3442 		return(0);
3443 	}
3444 	vm_page_protect(m, VM_PROT_NONE);
3445 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3446 		vm_page_wakeup(m);
3447 		return(0);
3448 	}
3449 	vm_page_free(m);
3450 	return(1);
3451 }
3452 
3453 /*
3454  * vm_page_cache
3455  *
3456  * Put the specified page onto the page cache queue (if appropriate).
3457  *
3458  * The page must be busy, and this routine will release the busy and
3459  * possibly even free the page.
3460  */
3461 void
3462 vm_page_cache(vm_page_t m)
3463 {
3464 	/*
3465 	 * Not suitable for the cache
3466 	 */
3467 	if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS)) ||
3468 	    (m->busy_count & PBUSY_MASK) ||
3469 	    m->wire_count || m->hold_count) {
3470 		vm_page_wakeup(m);
3471 		return;
3472 	}
3473 
3474 	/*
3475 	 * Already in the cache (and thus not mapped)
3476 	 */
3477 	if ((m->queue - m->pc) == PQ_CACHE) {
3478 		KKASSERT((m->flags & PG_MAPPED) == 0);
3479 		vm_page_wakeup(m);
3480 		return;
3481 	}
3482 
3483 #if 0
3484 	/*
3485 	 * REMOVED - it is possible for dirty to get set at any time as
3486 	 *	     long as the page is still mapped and writeable.
3487 	 *
3488 	 * Caller is required to test m->dirty, but note that the act of
3489 	 * removing the page from its maps can cause it to become dirty
3490 	 * on an SMP system due to another cpu running in usermode.
3491 	 */
3492 	if (m->dirty) {
3493 		panic("vm_page_cache: caching a dirty page, pindex: %ld",
3494 			(long)m->pindex);
3495 	}
3496 #endif
3497 
3498 	/*
3499 	 * Remove all pmaps and indicate that the page is not
3500 	 * writeable or mapped.  Our vm_page_protect() call may
3501 	 * have blocked (especially w/ VM_PROT_NONE), so recheck
3502 	 * everything.
3503 	 */
3504 	if (m->flags & (PG_MAPPED | PG_WRITEABLE)) {
3505 		vm_page_protect(m, VM_PROT_NONE);
3506 		pmap_mapped_sync(m);
3507 	}
3508 	if ((m->flags & (PG_UNQUEUED | PG_MAPPED)) ||
3509 	    (m->busy_count & PBUSY_MASK) ||
3510 	    m->wire_count || m->hold_count) {
3511 		vm_page_wakeup(m);
3512 	} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3513 		vm_page_deactivate(m);
3514 		vm_page_wakeup(m);
3515 	} else {
3516 		_vm_page_and_queue_spin_lock(m);
3517 		_vm_page_rem_queue_spinlocked(m);
3518 		_vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
3519 		_vm_page_and_queue_spin_unlock(m);
3520 		vm_page_wakeup(m);
3521 		vm_page_free_wakeup();
3522 	}
3523 }
3524 
3525 /*
3526  * vm_page_dontneed()
3527  *
3528  * Cache, deactivate, or do nothing as appropriate.  This routine
3529  * is typically used by madvise() MADV_DONTNEED.
3530  *
3531  * Generally speaking we want to move the page into the cache so
3532  * it gets reused quickly.  However, this can result in a silly syndrome
3533  * due to the page recycling too quickly.  Small objects will not be
3534  * fully cached.  On the otherhand, if we move the page to the inactive
3535  * queue we wind up with a problem whereby very large objects
3536  * unnecessarily blow away our inactive and cache queues.
3537  *
3538  * The solution is to move the pages based on a fixed weighting.  We
3539  * either leave them alone, deactivate them, or move them to the cache,
3540  * where moving them to the cache has the highest weighting.
3541  * By forcing some pages into other queues we eventually force the
3542  * system to balance the queues, potentially recovering other unrelated
3543  * space from active.  The idea is to not force this to happen too
3544  * often.
3545  *
3546  * The page must be busied.
3547  */
3548 void
3549 vm_page_dontneed(vm_page_t m)
3550 {
3551 	static int dnweight;
3552 	int dnw;
3553 	int head;
3554 
3555 	dnw = ++dnweight;
3556 
3557 	/*
3558 	 * occassionally leave the page alone
3559 	 */
3560 	if ((dnw & 0x01F0) == 0 ||
3561 	    m->queue - m->pc == PQ_INACTIVE ||
3562 	    m->queue - m->pc == PQ_CACHE
3563 	) {
3564 		if (m->act_count >= ACT_INIT)
3565 			--m->act_count;
3566 		return;
3567 	}
3568 
3569 	/*
3570 	 * If vm_page_dontneed() is inactivating a page, it must clear
3571 	 * the referenced flag; otherwise the pagedaemon will see references
3572 	 * on the page in the inactive queue and reactivate it. Until the
3573 	 * page can move to the cache queue, madvise's job is not done.
3574 	 */
3575 	vm_page_flag_clear(m, PG_REFERENCED);
3576 	pmap_clear_reference(m);
3577 
3578 	if (m->dirty == 0)
3579 		vm_page_test_dirty(m);
3580 
3581 	if (m->dirty || (dnw & 0x0070) == 0) {
3582 		/*
3583 		 * Deactivate the page 3 times out of 32.
3584 		 */
3585 		head = 0;
3586 	} else {
3587 		/*
3588 		 * Cache the page 28 times out of every 32.  Note that
3589 		 * the page is deactivated instead of cached, but placed
3590 		 * at the head of the queue instead of the tail.
3591 		 */
3592 		head = 1;
3593 	}
3594 	vm_page_spin_lock(m);
3595 	_vm_page_deactivate_locked(m, head);
3596 	vm_page_spin_unlock(m);
3597 }
3598 
3599 /*
3600  * These routines manipulate the 'soft busy' count for a page.  A soft busy
3601  * is almost like a hard BUSY except that it allows certain compatible
3602  * operations to occur on the page while it is busy.  For example, a page
3603  * undergoing a write can still be mapped read-only.
3604  *
3605  * We also use soft-busy to quickly pmap_enter shared read-only pages
3606  * without having to hold the page locked.
3607  *
3608  * The soft-busy count can be > 1 in situations where multiple threads
3609  * are pmap_enter()ing the same page simultaneously, or when two buffer
3610  * cache buffers overlap the same page.
3611  *
3612  * The caller must hold the page BUSY when making these two calls.
3613  */
3614 void
3615 vm_page_io_start(vm_page_t m)
3616 {
3617 	uint32_t ocount;
3618 
3619 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
3620 	KKASSERT(ocount & PBUSY_LOCKED);
3621 }
3622 
3623 void
3624 vm_page_io_finish(vm_page_t m)
3625 {
3626 	uint32_t ocount;
3627 
3628 	ocount = atomic_fetchadd_int(&m->busy_count, -1);
3629 	KKASSERT(ocount & PBUSY_MASK);
3630 #if 0
3631 	if (((ocount - 1) & (PBUSY_LOCKED | PBUSY_MASK)) == 0)
3632 		wakeup(m);
3633 #endif
3634 }
3635 
3636 /*
3637  * Attempt to soft-busy a page.  The page must not be PBUSY_LOCKED.
3638  *
3639  * We can't use fetchadd here because we might race a hard-busy and the
3640  * page freeing code asserts on a non-zero soft-busy count (even if only
3641  * temporary).
3642  *
3643  * Returns 0 on success, non-zero on failure.
3644  */
3645 int
3646 vm_page_sbusy_try(vm_page_t m)
3647 {
3648 	uint32_t ocount;
3649 
3650 	for (;;) {
3651 		ocount = m->busy_count;
3652 		cpu_ccfence();
3653 		if (ocount & PBUSY_LOCKED)
3654 			return 1;
3655 		if (atomic_cmpset_int(&m->busy_count, ocount, ocount + 1))
3656 			break;
3657 	}
3658 	return 0;
3659 #if 0
3660 	if (m->busy_count & PBUSY_LOCKED)
3661 		return 1;
3662 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
3663 	if (ocount & PBUSY_LOCKED) {
3664 		vm_page_sbusy_drop(m);
3665 		return 1;
3666 	}
3667 	return 0;
3668 #endif
3669 }
3670 
3671 /*
3672  * Indicate that a clean VM page requires a filesystem commit and cannot
3673  * be reused.  Used by tmpfs.
3674  */
3675 void
3676 vm_page_need_commit(vm_page_t m)
3677 {
3678 	vm_page_flag_set(m, PG_NEED_COMMIT);
3679 	vm_object_set_writeable_dirty(m->object);
3680 }
3681 
3682 void
3683 vm_page_clear_commit(vm_page_t m)
3684 {
3685 	vm_page_flag_clear(m, PG_NEED_COMMIT);
3686 }
3687 
3688 /*
3689  * Grab a page, blocking if it is busy and allocating a page if necessary.
3690  * A busy page is returned or NULL.  The page may or may not be valid and
3691  * might not be on a queue (the caller is responsible for the disposition of
3692  * the page).
3693  *
3694  * If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the
3695  * page will be zero'd and marked valid.
3696  *
3697  * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
3698  * valid even if it already exists.
3699  *
3700  * If VM_ALLOC_RETRY is specified this routine will never return NULL.  Also
3701  * note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified.
3702  * VM_ALLOC_NULL_OK is implied when VM_ALLOC_RETRY is specified.
3703  *
3704  * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
3705  * always returned if we had blocked.
3706  *
3707  * This routine may not be called from an interrupt.
3708  *
3709  * No other requirements.
3710  */
3711 vm_page_t
3712 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
3713 {
3714 	vm_page_t m;
3715 	int error;
3716 	int shared = 1;
3717 
3718 	KKASSERT(allocflags &
3719 		(VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
3720 	vm_object_hold_shared(object);
3721 	for (;;) {
3722 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
3723 		if (error) {
3724 			vm_page_sleep_busy(m, TRUE, "pgrbwt");
3725 			if ((allocflags & VM_ALLOC_RETRY) == 0) {
3726 				m = NULL;
3727 				break;
3728 			}
3729 			/* retry */
3730 		} else if (m == NULL) {
3731 			if (shared) {
3732 				vm_object_upgrade(object);
3733 				shared = 0;
3734 			}
3735 			if (allocflags & VM_ALLOC_RETRY)
3736 				allocflags |= VM_ALLOC_NULL_OK;
3737 			m = vm_page_alloc(object, pindex,
3738 					  allocflags & ~VM_ALLOC_RETRY);
3739 			if (m)
3740 				break;
3741 			vm_wait(0);
3742 			if ((allocflags & VM_ALLOC_RETRY) == 0)
3743 				goto failed;
3744 		} else {
3745 			/* m found */
3746 			break;
3747 		}
3748 	}
3749 
3750 	/*
3751 	 * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
3752 	 *
3753 	 * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
3754 	 * valid even if already valid.
3755 	 *
3756 	 * NOTE!  We have removed all of the PG_ZERO optimizations and also
3757 	 *	  removed the idle zeroing code.  These optimizations actually
3758 	 *	  slow things down on modern cpus because the zerod area is
3759 	 *	  likely uncached, placing a memory-access burden on the
3760 	 *	  accesors taking the fault.
3761 	 *
3762 	 *	  By always zeroing the page in-line with the fault, no
3763 	 *	  dynamic ram reads are needed and the caches are hot, ready
3764 	 *	  for userland to access the memory.
3765 	 */
3766 	if (m->valid == 0) {
3767 		if (allocflags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
3768 			pmap_zero_page(VM_PAGE_TO_PHYS(m));
3769 			m->valid = VM_PAGE_BITS_ALL;
3770 		}
3771 	} else if (allocflags & VM_ALLOC_FORCE_ZERO) {
3772 		pmap_zero_page(VM_PAGE_TO_PHYS(m));
3773 		m->valid = VM_PAGE_BITS_ALL;
3774 	}
3775 failed:
3776 	vm_object_drop(object);
3777 	return(m);
3778 }
3779 
3780 /*
3781  * Mapping function for valid bits or for dirty bits in
3782  * a page.  May not block.
3783  *
3784  * Inputs are required to range within a page.
3785  *
3786  * No requirements.
3787  * Non blocking.
3788  */
3789 int
3790 vm_page_bits(int base, int size)
3791 {
3792 	int first_bit;
3793 	int last_bit;
3794 
3795 	KASSERT(
3796 	    base + size <= PAGE_SIZE,
3797 	    ("vm_page_bits: illegal base/size %d/%d", base, size)
3798 	);
3799 
3800 	if (size == 0)		/* handle degenerate case */
3801 		return(0);
3802 
3803 	first_bit = base >> DEV_BSHIFT;
3804 	last_bit = (base + size - 1) >> DEV_BSHIFT;
3805 
3806 	return ((2 << last_bit) - (1 << first_bit));
3807 }
3808 
3809 /*
3810  * Sets portions of a page valid and clean.  The arguments are expected
3811  * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
3812  * of any partial chunks touched by the range.  The invalid portion of
3813  * such chunks will be zero'd.
3814  *
3815  * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
3816  *	 align base to DEV_BSIZE so as not to mark clean a partially
3817  *	 truncated device block.  Otherwise the dirty page status might be
3818  *	 lost.
3819  *
3820  * This routine may not block.
3821  *
3822  * (base + size) must be less then or equal to PAGE_SIZE.
3823  */
3824 static void
3825 _vm_page_zero_valid(vm_page_t m, int base, int size)
3826 {
3827 	int frag;
3828 	int endoff;
3829 
3830 	if (size == 0)	/* handle degenerate case */
3831 		return;
3832 
3833 	/*
3834 	 * If the base is not DEV_BSIZE aligned and the valid
3835 	 * bit is clear, we have to zero out a portion of the
3836 	 * first block.
3837 	 */
3838 
3839 	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
3840 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
3841 	) {
3842 		pmap_zero_page_area(
3843 		    VM_PAGE_TO_PHYS(m),
3844 		    frag,
3845 		    base - frag
3846 		);
3847 	}
3848 
3849 	/*
3850 	 * If the ending offset is not DEV_BSIZE aligned and the
3851 	 * valid bit is clear, we have to zero out a portion of
3852 	 * the last block.
3853 	 */
3854 
3855 	endoff = base + size;
3856 
3857 	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
3858 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
3859 	) {
3860 		pmap_zero_page_area(
3861 		    VM_PAGE_TO_PHYS(m),
3862 		    endoff,
3863 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
3864 		);
3865 	}
3866 }
3867 
3868 /*
3869  * Set valid, clear dirty bits.  If validating the entire
3870  * page we can safely clear the pmap modify bit.  We also
3871  * use this opportunity to clear the PG_NOSYNC flag.  If a process
3872  * takes a write fault on a MAP_NOSYNC memory area the flag will
3873  * be set again.
3874  *
3875  * We set valid bits inclusive of any overlap, but we can only
3876  * clear dirty bits for DEV_BSIZE chunks that are fully within
3877  * the range.
3878  *
3879  * Page must be busied?
3880  * No other requirements.
3881  */
3882 void
3883 vm_page_set_valid(vm_page_t m, int base, int size)
3884 {
3885 	_vm_page_zero_valid(m, base, size);
3886 	m->valid |= vm_page_bits(base, size);
3887 }
3888 
3889 
3890 /*
3891  * Set valid bits and clear dirty bits.
3892  *
3893  * Page must be busied by caller.
3894  *
3895  * NOTE: This function does not clear the pmap modified bit.
3896  *	 Also note that e.g. NFS may use a byte-granular base
3897  *	 and size.
3898  *
3899  * No other requirements.
3900  */
3901 void
3902 vm_page_set_validclean(vm_page_t m, int base, int size)
3903 {
3904 	int pagebits;
3905 
3906 	_vm_page_zero_valid(m, base, size);
3907 	pagebits = vm_page_bits(base, size);
3908 	m->valid |= pagebits;
3909 	m->dirty &= ~pagebits;
3910 	if (base == 0 && size == PAGE_SIZE) {
3911 		/*pmap_clear_modify(m);*/
3912 		vm_page_flag_clear(m, PG_NOSYNC);
3913 	}
3914 }
3915 
3916 /*
3917  * Set valid & dirty.  Used by buwrite()
3918  *
3919  * Page must be busied by caller.
3920  */
3921 void
3922 vm_page_set_validdirty(vm_page_t m, int base, int size)
3923 {
3924 	int pagebits;
3925 
3926 	pagebits = vm_page_bits(base, size);
3927 	m->valid |= pagebits;
3928 	m->dirty |= pagebits;
3929 	if (m->object)
3930 	       vm_object_set_writeable_dirty(m->object);
3931 }
3932 
3933 /*
3934  * Clear dirty bits.
3935  *
3936  * NOTE: This function does not clear the pmap modified bit.
3937  *	 Also note that e.g. NFS may use a byte-granular base
3938  *	 and size.
3939  *
3940  * Page must be busied?
3941  * No other requirements.
3942  */
3943 void
3944 vm_page_clear_dirty(vm_page_t m, int base, int size)
3945 {
3946 	m->dirty &= ~vm_page_bits(base, size);
3947 	if (base == 0 && size == PAGE_SIZE) {
3948 		/*pmap_clear_modify(m);*/
3949 		vm_page_flag_clear(m, PG_NOSYNC);
3950 	}
3951 }
3952 
3953 /*
3954  * Make the page all-dirty.
3955  *
3956  * Also make sure the related object and vnode reflect the fact that the
3957  * object may now contain a dirty page.
3958  *
3959  * Page must be busied?
3960  * No other requirements.
3961  */
3962 void
3963 vm_page_dirty(vm_page_t m)
3964 {
3965 #ifdef INVARIANTS
3966         int pqtype = m->queue - m->pc;
3967 #endif
3968         KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
3969                 ("vm_page_dirty: page in free/cache queue!"));
3970 	if (m->dirty != VM_PAGE_BITS_ALL) {
3971 		m->dirty = VM_PAGE_BITS_ALL;
3972 		if (m->object)
3973 			vm_object_set_writeable_dirty(m->object);
3974 	}
3975 }
3976 
3977 /*
3978  * Invalidates DEV_BSIZE'd chunks within a page.  Both the
3979  * valid and dirty bits for the effected areas are cleared.
3980  *
3981  * Page must be busied?
3982  * Does not block.
3983  * No other requirements.
3984  */
3985 void
3986 vm_page_set_invalid(vm_page_t m, int base, int size)
3987 {
3988 	int bits;
3989 
3990 	bits = vm_page_bits(base, size);
3991 	m->valid &= ~bits;
3992 	m->dirty &= ~bits;
3993 	atomic_add_int(&m->object->generation, 1);
3994 }
3995 
3996 /*
3997  * The kernel assumes that the invalid portions of a page contain
3998  * garbage, but such pages can be mapped into memory by user code.
3999  * When this occurs, we must zero out the non-valid portions of the
4000  * page so user code sees what it expects.
4001  *
4002  * Pages are most often semi-valid when the end of a file is mapped
4003  * into memory and the file's size is not page aligned.
4004  *
4005  * Page must be busied?
4006  * No other requirements.
4007  */
4008 void
4009 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
4010 {
4011 	int b;
4012 	int i;
4013 
4014 	/*
4015 	 * Scan the valid bits looking for invalid sections that
4016 	 * must be zerod.  Invalid sub-DEV_BSIZE'd areas ( where the
4017 	 * valid bit may be set ) have already been zerod by
4018 	 * vm_page_set_validclean().
4019 	 */
4020 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
4021 		if (i == (PAGE_SIZE / DEV_BSIZE) ||
4022 		    (m->valid & (1 << i))
4023 		) {
4024 			if (i > b) {
4025 				pmap_zero_page_area(
4026 				    VM_PAGE_TO_PHYS(m),
4027 				    b << DEV_BSHIFT,
4028 				    (i - b) << DEV_BSHIFT
4029 				);
4030 			}
4031 			b = i + 1;
4032 		}
4033 	}
4034 
4035 	/*
4036 	 * setvalid is TRUE when we can safely set the zero'd areas
4037 	 * as being valid.  We can do this if there are no cache consistency
4038 	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
4039 	 */
4040 	if (setvalid)
4041 		m->valid = VM_PAGE_BITS_ALL;
4042 }
4043 
4044 /*
4045  * Is a (partial) page valid?  Note that the case where size == 0
4046  * will return FALSE in the degenerate case where the page is entirely
4047  * invalid, and TRUE otherwise.
4048  *
4049  * Does not block.
4050  * No other requirements.
4051  */
4052 int
4053 vm_page_is_valid(vm_page_t m, int base, int size)
4054 {
4055 	int bits = vm_page_bits(base, size);
4056 
4057 	if (m->valid && ((m->valid & bits) == bits))
4058 		return 1;
4059 	else
4060 		return 0;
4061 }
4062 
4063 /*
4064  * Update dirty bits from pmap/mmu.  May not block.
4065  *
4066  * Caller must hold the page busy
4067  *
4068  * WARNING! Unless the page has been unmapped, this function only
4069  *	    provides a likely dirty status.
4070  */
4071 void
4072 vm_page_test_dirty(vm_page_t m)
4073 {
4074 	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) {
4075 		vm_page_dirty(m);
4076 	}
4077 }
4078 
4079 #include "opt_ddb.h"
4080 #ifdef DDB
4081 #include <ddb/ddb.h>
4082 
4083 DB_SHOW_COMMAND(page, vm_page_print_page_info)
4084 {
4085 	db_printf("vmstats.v_free_count: %ld\n", vmstats.v_free_count);
4086 	db_printf("vmstats.v_cache_count: %ld\n", vmstats.v_cache_count);
4087 	db_printf("vmstats.v_inactive_count: %ld\n", vmstats.v_inactive_count);
4088 	db_printf("vmstats.v_active_count: %ld\n", vmstats.v_active_count);
4089 	db_printf("vmstats.v_wire_count: %ld\n", vmstats.v_wire_count);
4090 	db_printf("vmstats.v_free_reserved: %ld\n", vmstats.v_free_reserved);
4091 	db_printf("vmstats.v_free_min: %ld\n", vmstats.v_free_min);
4092 	db_printf("vmstats.v_free_target: %ld\n", vmstats.v_free_target);
4093 	db_printf("vmstats.v_cache_min: %ld\n", vmstats.v_cache_min);
4094 	db_printf("vmstats.v_inactive_target: %ld\n",
4095 		  vmstats.v_inactive_target);
4096 }
4097 
4098 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
4099 {
4100 	int i;
4101 	db_printf("PQ_FREE:");
4102 	for (i = 0; i < PQ_L2_SIZE; i++) {
4103 		db_printf(" %ld", vm_page_queues[PQ_FREE + i].lcnt);
4104 	}
4105 	db_printf("\n");
4106 
4107 	db_printf("PQ_CACHE:");
4108 	for(i = 0; i < PQ_L2_SIZE; i++) {
4109 		db_printf(" %ld", vm_page_queues[PQ_CACHE + i].lcnt);
4110 	}
4111 	db_printf("\n");
4112 
4113 	db_printf("PQ_ACTIVE:");
4114 	for(i = 0; i < PQ_L2_SIZE; i++) {
4115 		db_printf(" %ld", vm_page_queues[PQ_ACTIVE + i].lcnt);
4116 	}
4117 	db_printf("\n");
4118 
4119 	db_printf("PQ_INACTIVE:");
4120 	for(i = 0; i < PQ_L2_SIZE; i++) {
4121 		db_printf(" %ld", vm_page_queues[PQ_INACTIVE + i].lcnt);
4122 	}
4123 	db_printf("\n");
4124 }
4125 #endif /* DDB */
4126