xref: /dragonfly/sys/vm/vm_page.c (revision c8860c9a)
1 /*
2  * Copyright (c) 2003-2019 The DragonFly Project.  All rights reserved.
3  * Copyright (c) 1991 Regents of the University of California.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
37  * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
38  */
39 
40 /*
41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42  * All rights reserved.
43  *
44  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45  *
46  * Permission to use, copy, modify and distribute this software and
47  * its documentation is hereby granted, provided that both the copyright
48  * notice and this permission notice appear in all copies of the
49  * software, derivative works or modified versions, and any portions
50  * thereof, and that both notices appear in supporting documentation.
51  *
52  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55  *
56  * Carnegie Mellon requests users of this software to return to
57  *
58  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59  *  School of Computer Science
60  *  Carnegie Mellon University
61  *  Pittsburgh PA 15213-3890
62  *
63  * any improvements or extensions that they make and grant Carnegie the
64  * rights to redistribute these changes.
65  */
66 /*
67  * Resident memory management module.  The module manipulates 'VM pages'.
68  * A VM page is the core building block for memory management.
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/malloc.h>
74 #include <sys/proc.h>
75 #include <sys/vmmeter.h>
76 #include <sys/vnode.h>
77 #include <sys/kernel.h>
78 #include <sys/alist.h>
79 #include <sys/sysctl.h>
80 #include <sys/cpu_topology.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <sys/lock.h>
85 #include <vm/vm_kern.h>
86 #include <vm/pmap.h>
87 #include <vm/vm_map.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_pageout.h>
91 #include <vm/vm_pager.h>
92 #include <vm/vm_extern.h>
93 #include <vm/swap_pager.h>
94 
95 #include <machine/inttypes.h>
96 #include <machine/md_var.h>
97 #include <machine/specialreg.h>
98 #include <machine/bus_dma.h>
99 
100 #include <vm/vm_page2.h>
101 #include <sys/spinlock2.h>
102 
103 /*
104  * Cache necessary elements in the hash table itself to avoid indirecting
105  * through random vm_page's when doing a lookup.  The hash table is
106  * heuristic and it is ok for races to mess up any or all fields.
107  */
108 struct vm_page_hash_elm {
109 	vm_page_t	m;
110 	vm_object_t	object;	/* heuristical */
111 	vm_pindex_t	pindex;	/* heuristical */
112 	int		ticks;
113 	int		unused;
114 };
115 
116 #define VM_PAGE_HASH_SET	4		    /* power of 2, set-assoc */
117 #define VM_PAGE_HASH_MAX	(8 * 1024 * 1024)   /* power of 2, max size */
118 
119 /*
120  * SET - Minimum required set associative size, must be a power of 2.  We
121  *	 want this to match or exceed the set-associativity of the cpu,
122  *	 up to a reasonable limit (we will use 16).
123  */
124 __read_mostly static int set_assoc_mask = 16 - 1;
125 
126 static void vm_page_queue_init(void);
127 static void vm_page_free_wakeup(void);
128 static vm_page_t vm_page_select_cache(u_short pg_color);
129 static vm_page_t _vm_page_list_find_wide(int basequeue, int index, int *lastp);
130 static vm_page_t _vm_page_list_find2_wide(int bq1, int bq2, int index,
131 			int *lastp1, int *lastp);
132 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
133 static void vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes);
134 
135 /*
136  * Array of tailq lists
137  */
138 struct vpgqueues vm_page_queues[PQ_COUNT];
139 
140 static volatile int vm_pages_waiting;
141 static struct alist vm_contig_alist;
142 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
143 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
144 
145 __read_mostly static int vm_page_hash_vnode_only;
146 __read_mostly static int vm_page_hash_size;
147 __read_mostly static struct vm_page_hash_elm *vm_page_hash;
148 
149 static u_long vm_dma_reserved = 0;
150 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
151 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
152 	    "Memory reserved for DMA");
153 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
154 	    &vm_contig_alist.bl_free, 0, "Free pages remaining in the DMA reserve");
155 
156 SYSCTL_INT(_vm, OID_AUTO, page_hash_vnode_only, CTLFLAG_RW,
157 	    &vm_page_hash_vnode_only, 0, "Only hash vnode pages");
158 #if 0
159 static int vm_page_hash_debug;
160 SYSCTL_INT(_vm, OID_AUTO, page_hash_debug, CTLFLAG_RW,
161 	    &vm_page_hash_debug, 0, "Debug the vm_page hash table");
162 #endif
163 
164 static int vm_contig_verbose = 0;
165 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
166 
167 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
168 	     vm_pindex_t, pindex);
169 
170 static void
171 vm_page_queue_init(void)
172 {
173 	int i;
174 
175 	for (i = 0; i < PQ_L2_SIZE; i++)
176 		vm_page_queues[PQ_FREE+i].cnt_offset =
177 			offsetof(struct vmstats, v_free_count);
178 	for (i = 0; i < PQ_L2_SIZE; i++)
179 		vm_page_queues[PQ_CACHE+i].cnt_offset =
180 			offsetof(struct vmstats, v_cache_count);
181 	for (i = 0; i < PQ_L2_SIZE; i++)
182 		vm_page_queues[PQ_INACTIVE+i].cnt_offset =
183 			offsetof(struct vmstats, v_inactive_count);
184 	for (i = 0; i < PQ_L2_SIZE; i++)
185 		vm_page_queues[PQ_ACTIVE+i].cnt_offset =
186 			offsetof(struct vmstats, v_active_count);
187 	for (i = 0; i < PQ_L2_SIZE; i++)
188 		vm_page_queues[PQ_HOLD+i].cnt_offset =
189 			offsetof(struct vmstats, v_active_count);
190 	/* PQ_NONE has no queue */
191 
192 	for (i = 0; i < PQ_COUNT; i++) {
193 		struct vpgqueues *vpq;
194 
195 		vpq = &vm_page_queues[i];
196 		vpq->lastq = -1;
197 		TAILQ_INIT(&vpq->pl);
198 		spin_init(&vpq->spin, "vm_page_queue_init");
199 	}
200 }
201 
202 /*
203  * note: place in initialized data section?  Is this necessary?
204  */
205 vm_pindex_t first_page = 0;
206 vm_pindex_t vm_page_array_size = 0;
207 vm_page_t vm_page_array = NULL;
208 vm_paddr_t vm_low_phys_reserved;
209 
210 /*
211  * (low level boot)
212  *
213  * Sets the page size, perhaps based upon the memory size.
214  * Must be called before any use of page-size dependent functions.
215  */
216 void
217 vm_set_page_size(void)
218 {
219 	if (vmstats.v_page_size == 0)
220 		vmstats.v_page_size = PAGE_SIZE;
221 	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
222 		panic("vm_set_page_size: page size not a power of two");
223 }
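/*
 * Illustrative note: the check above is the usual (x & (x - 1)) == 0
 * power-of-two test.  With the common 4KB page size, 0x0fff & 0x1000 == 0
 * and no panic occurs; a bogus size such as 0x1800 would fail because
 * 0x17ff & 0x1800 == 0x1000.
 */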
224 
225 /*
226  * (low level boot)
227  *
228  * Add a new page to the freelist for use by the system.  New pages
229  * are added to both the head and tail of the associated free page
230  * queue in a bottom-up fashion, so both zero'd and non-zero'd page
231  * requests pull 'recent' adds (higher physical addresses) first.
232  *
233  * Beware that the page zeroing daemon will also be running soon after
234  * boot, moving pages from the head to the tail of the PQ_FREE queues.
235  *
236  * Must be called in a critical section.
237  */
238 static void
239 vm_add_new_page(vm_paddr_t pa, int *badcountp)
240 {
241 	struct vpgqueues *vpq;
242 	vm_page_t m;
243 
244 	m = PHYS_TO_VM_PAGE(pa);
245 
246 	/*
247 	 * Make sure it isn't a duplicate (due to BIOS page range overlaps,
248 	 * which we consider bugs... but don't crash).  Note that m->phys_addr
249 	 * is pre-initialized, so use m->queue as a check.
250 	 */
251 	if (m->queue) {
252 		if (*badcountp < 10) {
253 			kprintf("vm_add_new_page: duplicate pa %016jx\n",
254 				(intmax_t)pa);
255 			++*badcountp;
256 		} else if (*badcountp == 10) {
257 			kprintf("vm_add_new_page: duplicate pa (many more)\n");
258 			++*badcountp;
259 		}
260 		return;
261 	}
262 
263 	m->phys_addr = pa;
264 	m->flags = 0;
265 	m->pat_mode = PAT_WRITE_BACK;
266 	m->pc = (pa >> PAGE_SHIFT);
267 
268 	/*
269 	 * Twist for cpu localization in addition to page coloring, so
270 	 * different cpus selecting by m->queue get different page colors.
271 	 */
272 	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
273 	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
274 	m->pc &= PQ_L2_MASK;
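	/*
	 * Worked example (illustrative only; assumes PQ_L2_SIZE == 1024 and
	 * PQ_L2_MASK == 1023, per the comments later in this file).  For
	 * pa = 0x12345000 the page frame number is 0x12345 (74565), so:
	 *
	 *	pc  = 0x12345 ^ (74565 / 1024) ^ (74565 / 1048576)
	 *	    = 0x12345 ^ 72 ^ 0 = 0x1230d
	 *	pc &= 1023		-> 0x30d
	 *
	 * Nearby physical pages thus land in nearby-but-different queues,
	 * spreading them across cpus.
	 */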
275 
276 	/*
277 	 * Reserve a certain number of contiguous low memory pages for
278 	 * contigmalloc() to use.
279 	 *
280 	 * Even though these pages represent real ram and can be
281 	 * reverse-mapped, we set PG_FICTITIOUS and PG_UNQUEUED
282 	 * because their use is special-cased.
283 	 *
284 	 * WARNING! Once PG_FICTITIOUS is set, vm_page_wire*()
285 	 *	    and vm_page_unwire*() calls have no effect.
286 	 */
287 	if (pa < vm_low_phys_reserved) {
288 		atomic_add_long(&vmstats.v_page_count, 1);
289 		atomic_add_long(&vmstats.v_dma_pages, 1);
290 		m->flags |= PG_FICTITIOUS | PG_UNQUEUED;
291 		m->queue = PQ_NONE;
292 		m->wire_count = 1;
293 		atomic_add_long(&vmstats.v_wire_count, 1);
294 		alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
295 		return;
296 	}
297 
298 	/*
299 	 * General page
300 	 */
301 	m->queue = m->pc + PQ_FREE;
302 	KKASSERT(m->dirty == 0);
303 
304 	atomic_add_long(&vmstats.v_page_count, 1);
305 	atomic_add_long(&vmstats.v_free_count, 1);
306 	vpq = &vm_page_queues[m->queue];
307 	TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
308 	++vpq->lcnt;
309 }
310 
311 /*
312  * (low level boot)
313  *
314  * Initializes the resident memory module.
315  *
316  * Preallocates memory for critical VM structures and arrays prior to
317  * kernel_map becoming available.
318  *
319  * Memory is allocated from (virtual2_start, virtual2_end) if available,
320  * otherwise memory is allocated from (virtual_start, virtual_end).
321  *
322  * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
323  * large enough to hold vm_page_array & other structures for machines with
324  * large amounts of ram, so we want to use virtual2* when available.
325  */
326 void
327 vm_page_startup(void)
328 {
329 	vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
330 	vm_offset_t mapped;
331 	vm_pindex_t npages;
332 	vm_paddr_t page_range;
333 	vm_paddr_t new_end;
334 	int i;
335 	vm_paddr_t pa;
336 	vm_paddr_t last_pa;
337 	vm_paddr_t end;
338 	vm_paddr_t biggestone, biggestsize;
339 	vm_paddr_t total;
340 	vm_page_t m;
341 	int badcount;
342 
343 	total = 0;
344 	badcount = 0;
345 	biggestsize = 0;
346 	biggestone = 0;
347 	vaddr = round_page(vaddr);
348 
349 	/*
350 	 * Make sure ranges are page-aligned.
351 	 */
352 	for (i = 0; phys_avail[i].phys_end; ++i) {
353 		phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
354 		phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
355 		if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
356 			phys_avail[i].phys_end = phys_avail[i].phys_beg;
357 	}
358 
359 	/*
360 	 * Locate largest block
361 	 */
362 	for (i = 0; phys_avail[i].phys_end; ++i) {
363 		vm_paddr_t size = phys_avail[i].phys_end -
364 				  phys_avail[i].phys_beg;
365 
366 		if (size > biggestsize) {
367 			biggestone = i;
368 			biggestsize = size;
369 		}
370 		total += size;
371 	}
372 	--i;	/* adjust to last entry for use down below */
373 
374 	end = phys_avail[biggestone].phys_end;
375 	end = trunc_page(end);
376 
377 	/*
378 	 * Initialize the queue headers for the free queue, the active queue
379 	 * and the inactive queue.
380 	 */
381 	vm_page_queue_init();
382 
383 #if !defined(_KERNEL_VIRTUAL)
384 	/*
385 	 * VKERNELs don't support minidumps and as such don't need
386 	 * vm_page_dump
387 	 *
388 	 * Allocate a bitmap to indicate that a random physical page
389 	 * needs to be included in a minidump.
390 	 *
391 	 * The amd64 port needs this to indicate which direct map pages
392 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
393 	 *
394 	 * However, x86 still needs this workspace internally within the
395 	 * minidump code.  In theory, they are not needed on x86, but are
396 	 * included should the sf_buf code decide to use them.
397 	 */
398 	page_range = phys_avail[i].phys_end / PAGE_SIZE;
399 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
400 	end -= vm_page_dump_size;
401 	vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
402 					VM_PROT_READ | VM_PROT_WRITE);
403 	bzero((void *)vm_page_dump, vm_page_dump_size);
404 #endif
405 	/*
406 	 * Compute the number of pages of memory that will be available for
407 	 * use (taking into account the overhead of a page structure per
408 	 * page).
409 	 */
410 	first_page = phys_avail[0].phys_beg / PAGE_SIZE;
411 	page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
412 	npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
413 
414 #ifndef _KERNEL_VIRTUAL
415 	/*
416 	 * (only applies to real kernels)
417 	 *
418 	 * Reserve a large amount of low memory for potential 32-bit DMA
419 	 * space allocations.  Once device initialization is complete we
420 	 * release most of it, but keep (vm_dma_reserved) memory reserved
421 	 * for later use.  Typically for X / graphics.  Through trial and
422  * error we find that GPUs usually require ~60-100MB or so.
423 	 *
424 	 * By default, 128M is left in reserve on machines with 2G+ of ram.
425 	 */
426 	vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
427 	if (vm_low_phys_reserved > total / 4)
428 		vm_low_phys_reserved = total / 4;
429 	if (vm_dma_reserved == 0) {
430 		vm_dma_reserved = 128 * 1024 * 1024;	/* 128MB */
431 		if (vm_dma_reserved > total / 16)
432 			vm_dma_reserved = total / 16;
433 	}
434 #endif
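	/*
	 * Tuning sketch (hypothetical value, not a recommendation): the
	 * amount kept in reserve after boot can be changed with the
	 * vm.dma_reserved loader tunable declared near the top of this
	 * file, e.g. in /boot/loader.conf:
	 *
	 *	vm.dma_reserved="268435456"	# keep 256MB instead of 128MB
	 */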
435 	alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
436 		   ALIST_RECORDS_65536);
437 
438 	/*
439 	 * Initialize the mem entry structures now, and put them in the free
440 	 * queue.
441 	 */
442 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
443 		kprintf("initializing vm_page_array ");
444 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
445 	mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
446 	vm_page_array = (vm_page_t)mapped;
447 
448 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
449 	/*
450 	 * since pmap_map on amd64 returns stuff out of a direct-map region,
451 	 * we have to manually add these pages to the minidump tracking so
452 	 * that they can be dumped, including the vm_page_array.
453 	 */
454 	for (pa = new_end;
455 	     pa < phys_avail[biggestone].phys_end;
456 	     pa += PAGE_SIZE) {
457 		dump_add_page(pa);
458 	}
459 #endif
460 
461 	/*
462 	 * Clear all of the page structures, run basic initialization so
463 	 * PHYS_TO_VM_PAGE() operates properly even on pages not in the
464 	 * map.
465 	 */
466 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
467 	vm_page_array_size = page_range;
468 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
469 		kprintf("size = 0x%zx\n", vm_page_array_size);
470 
471 	m = &vm_page_array[0];
472 	pa = ptoa(first_page);
473 	for (i = 0; i < page_range; ++i) {
474 		spin_init(&m->spin, "vm_page");
475 		m->phys_addr = pa;
476 		pa += PAGE_SIZE;
477 		++m;
478 	}
479 
480 	/*
481 	 * Construct the free queue(s) in ascending order (by physical
482 	 * address) so that the first 16MB of physical memory is allocated
483 	 * last rather than first.  On large-memory machines, this avoids
484 	 * the exhaustion of low physical memory before isa_dma_init has run.
485 	 */
486 	vmstats.v_page_count = 0;
487 	vmstats.v_free_count = 0;
488 	for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
489 		pa = phys_avail[i].phys_beg;
490 		if (i == biggestone)
491 			last_pa = new_end;
492 		else
493 			last_pa = phys_avail[i].phys_end;
494 		while (pa < last_pa && npages-- > 0) {
495 			vm_add_new_page(pa, &badcount);
496 			pa += PAGE_SIZE;
497 		}
498 	}
499 	if (virtual2_start)
500 		virtual2_start = vaddr;
501 	else
502 		virtual_start = vaddr;
503 	mycpu->gd_vmstats = vmstats;
504 }
505 
506 /*
507  * (called from early boot only)
508  *
509  * Reorganize VM pages based on numa data.  May be called as many times as
510  * necessary.  Will reorganize the vm_page_t page color and related queue(s)
511  * to allow vm_page_alloc() to choose pages based on socket affinity.
512  *
513  * NOTE: This function is only called while we are still in UP mode, so
514  *	 we only need a critical section to protect the queues (which
515  *	 saves a lot of time, there are likely a ton of pages).
516  */
517 void
518 vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
519 {
520 	vm_paddr_t scan_beg;
521 	vm_paddr_t scan_end;
522 	vm_paddr_t ran_end;
523 	struct vpgqueues *vpq;
524 	vm_page_t m;
525 	vm_page_t mend;
526 	int socket_mod;
527 	int socket_value;
528 	int i;
529 
530 	/*
531 	 * Check if no physical information, or there was only one socket
532 	 * (so don't waste time doing nothing!).
533 	 */
534 	if (cpu_topology_phys_ids <= 1 ||
535 	    cpu_topology_core_ids == 0) {
536 		return;
537 	}
538 
539 	/*
540 	 * Setup for our iteration.  Note that ACPI may iterate CPU
541 	 * sockets starting at 0 or 1 or some other number.  The
542 	 * cpu_topology code mod's it against the socket count.
543 	 */
544 	ran_end = ran_beg + bytes;
545 
546 	socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
547 	socket_value = (physid % cpu_topology_phys_ids) * socket_mod;
548 	mend = &vm_page_array[vm_page_array_size];
549 
550 	crit_enter();
551 
552 	/*
553 	 * Adjust cpu_topology's phys_mem parameter
554 	 */
555 	if (root_cpu_node)
556 		vm_numa_add_topology_mem(root_cpu_node, physid, (long)bytes);
557 
558 	/*
559 	 * Adjust vm_page->pc and requeue all affected pages.  The
560 	 * allocator will then be able to localize memory allocations
561 	 * to some degree.
562 	 */
563 	for (i = 0; phys_avail[i].phys_end; ++i) {
564 		scan_beg = phys_avail[i].phys_beg;
565 		scan_end = phys_avail[i].phys_end;
566 		if (scan_end <= ran_beg)
567 			continue;
568 		if (scan_beg >= ran_end)
569 			continue;
570 		if (scan_beg < ran_beg)
571 			scan_beg = ran_beg;
572 		if (scan_end > ran_end)
573 			scan_end = ran_end;
574 		if (atop(scan_end) > first_page + vm_page_array_size)
575 			scan_end = ptoa(first_page + vm_page_array_size);
576 
577 		m = PHYS_TO_VM_PAGE(scan_beg);
578 		while (scan_beg < scan_end) {
579 			KKASSERT(m < mend);
580 			if (m->queue != PQ_NONE) {
581 				vpq = &vm_page_queues[m->queue];
582 				TAILQ_REMOVE(&vpq->pl, m, pageq);
583 				--vpq->lcnt;
584 				/* queue doesn't change, no need to adj cnt */
585 				m->queue -= m->pc;
586 				m->pc %= socket_mod;
587 				m->pc += socket_value;
588 				m->pc &= PQ_L2_MASK;
589 				m->queue += m->pc;
590 				vpq = &vm_page_queues[m->queue];
591 				TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
592 				++vpq->lcnt;
593 				/* queue doesn't change, no need to adj cnt */
594 			} else {
595 				m->pc %= socket_mod;
596 				m->pc += socket_value;
597 				m->pc &= PQ_L2_MASK;
598 			}
599 			scan_beg += PAGE_SIZE;
600 			++m;
601 		}
602 	}
603 
604 	crit_exit();
605 }
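/*
 * Worked example (illustrative only; assumes PQ_L2_SIZE == 1024 and a
 * 2-socket system, giving socket_mod == 512).  For physid 1 we get
 * socket_value == 512, so a page that previously had pc == 300 becomes:
 *
 *	pc = (300 % 512) + 512 = 812
 *
 * Pages backing socket 1's memory are thus folded into the upper half of
 * the color space, which vm_get_pg_color() hands out to that socket's cpus.
 */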
606 
607 /*
608  * (called from early boot only)
609  *
610  * Don't allow the NUMA organization to leave vm_page_queues[] nodes
611  * completely empty for a logical cpu.  Doing so would force allocations
612  * on that cpu to always borrow from a nearby cpu, create unnecessary
613  * contention, and cause vm_page_alloc() to iterate more queues and run more
614  * slowly.
615  *
616  * This situation can occur when memory sticks are not entirely populated,
617  * populated at different densities, or in naturally asymmetric systems
618  * such as the 2990WX.  There could very well be many vm_page_queues[]
619  * entries with *NO* pages assigned to them.
620  *
621  * Fixing this up ensures that each logical CPU has roughly the same
622  * sized memory pool, and more importantly ensures that logical CPUs
623  * do not wind up with an empty memory pool.
624  *
625  * At the moment we just iterate the other queues and borrow pages,
626  * moving them into the queues for cpus with severe deficits even though
627  * the memory might not be local to those cpus.  I am not doing this in
628  * a 'smart' way, it's effectively UMA style (sorta, since it's page-by-page
629  * whereas real UMA typically exchanges address bits 8-10 with high address
630  * bits).  But it works extremely well and gives us fairly good deterministic
631  * results on the cpu cores associated with these secondary nodes.
632  */
633 void
634 vm_numa_organize_finalize(void)
635 {
636 	struct vpgqueues *vpq;
637 	vm_page_t m;
638 	long lcnt_lo;
639 	long lcnt_hi;
640 	int iter;
641 	int i;
642 	int scale_lim;
643 
644 	crit_enter();
645 
646 	/*
647 	 * Machines might not use an exact power of 2 for phys_ids,
648 	 * core_ids, ht_ids, etc.  This can slightly reduce the actual
649 	 * range of indices in vm_page_queues[] that are nominally used.
650 	 */
651 	if (cpu_topology_ht_ids) {
652 		scale_lim = PQ_L2_SIZE / cpu_topology_phys_ids;
653 		scale_lim = scale_lim / cpu_topology_core_ids;
654 		scale_lim = scale_lim / cpu_topology_ht_ids;
655 		scale_lim = scale_lim * cpu_topology_ht_ids;
656 		scale_lim = scale_lim * cpu_topology_core_ids;
657 		scale_lim = scale_lim * cpu_topology_phys_ids;
658 	} else {
659 		scale_lim = PQ_L2_SIZE;
660 	}
661 
662 	/*
663 	 * Calculate an average, set hysteresis for balancing from
664 	 * 10% below the average to the average.
665 	 */
666 	lcnt_hi = 0;
667 	for (i = 0; i < scale_lim; ++i) {
668 		lcnt_hi += vm_page_queues[i].lcnt;
669 	}
670 	lcnt_hi /= scale_lim;
671 	lcnt_lo = lcnt_hi - lcnt_hi / 10;
672 
673 	kprintf("vm_page: avg %ld pages per queue, %d queues\n",
674 		lcnt_hi, scale_lim);
675 
676 	iter = 0;
677 	for (i = 0; i < scale_lim; ++i) {
678 		vpq = &vm_page_queues[PQ_FREE + i];
679 		while (vpq->lcnt < lcnt_lo) {
680 			struct vpgqueues *vptmp;
681 
682 			iter = (iter + 1) & PQ_L2_MASK;
683 			vptmp = &vm_page_queues[PQ_FREE + iter];
684 			if (vptmp->lcnt < lcnt_hi)
685 				continue;
686 			m = TAILQ_FIRST(&vptmp->pl);
687 			KKASSERT(m->queue == PQ_FREE + iter);
688 			TAILQ_REMOVE(&vptmp->pl, m, pageq);
689 			--vptmp->lcnt;
690 			/* queue doesn't change, no need to adj cnt */
691 			m->queue -= m->pc;
692 			m->pc = i;
693 			m->queue += m->pc;
694 			TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
695 			++vpq->lcnt;
696 		}
697 	}
698 	crit_exit();
699 }
700 
701 static
702 void
703 vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes)
704 {
705 	int cpuid;
706 	int i;
707 
708 	switch(cpup->type) {
709 	case PACKAGE_LEVEL:
710 		cpup->phys_mem += bytes;
711 		break;
712 	case CHIP_LEVEL:
713 		/*
714 		 * All members should have the same chipid, so we only need
715 		 * to pull out one member.
716 		 */
717 		if (CPUMASK_TESTNZERO(cpup->members)) {
718 			cpuid = BSFCPUMASK(cpup->members);
719 			if (physid ==
720 			    get_chip_ID_from_APICID(CPUID_TO_APICID(cpuid))) {
721 				cpup->phys_mem += bytes;
722 			}
723 		}
724 		break;
725 	case CORE_LEVEL:
726 	case THREAD_LEVEL:
727 		/*
728 		 * Just inherit from the parent node
729 		 */
730 		cpup->phys_mem = cpup->parent_node->phys_mem;
731 		break;
732 	}
733 	for (i = 0; i < MAXCPU && cpup->child_node[i]; ++i)
734 		vm_numa_add_topology_mem(cpup->child_node[i], physid, bytes);
735 }
736 
737 /*
738  * We reserve a ton of memory for contigmalloc() early in boot.  Now that
739  * most drivers have initialized we want to return most of the remaining free
740  * reserve back to the VM page queues so they can be used for normal
741  * allocations.
742  *
743  * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
744  */
745 static void
746 vm_page_startup_finish(void *dummy __unused)
747 {
748 	alist_blk_t blk;
749 	alist_blk_t rblk;
750 	alist_blk_t count;
751 	alist_blk_t xcount;
752 	alist_blk_t bfree;
753 	vm_page_t m;
754 	struct vm_page_hash_elm *mp;
755 	int mask;
756 
757 	/*
758 	 * Set the set_assoc_mask based on the fitted number of CPUs.
759 	 * This is a mask, so we subtract 1.
760 	 *
761 	 * w/PQ_L2_SIZE = 1024, don't let the associativity drop below 8.
762 	 * So if we have 256 CPUs, two hyper-threads will wind up sharing.
763 	 *
764 	 * The maximum is PQ_L2_SIZE.  However, we limit the starting
765 	 * maximum to 16 (mask = 15) in order to improve the cache locality
766 	 * of related kernel data structures.
767 	 */
768 	mask = PQ_L2_SIZE / ncpus_fit - 1;
769 	if (mask < 7)		/* minimum is 8-way w/256 CPU threads */
770 		mask = 7;
771 	if (mask < 15)
772 		mask = 15;
773 	cpu_ccfence();
774 	set_assoc_mask = mask;
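	/*
	 * Worked example (assuming PQ_L2_SIZE == 1024 as noted above):
	 * ncpus_fit == 64 gives 1024/64 - 1 == 15 (16-way sets), while
	 * ncpus_fit == 256 gives 3, which the clamps above raise back to
	 * 15, so multiple cpu threads end up sharing a 16-entry set.
	 */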
775 
776 	/*
777 	 * Return part of the initial reserve back to the system
778 	 */
779 	spin_lock(&vm_contig_spin);
780 	for (;;) {
781 		bfree = alist_free_info(&vm_contig_alist, &blk, &count);
782 		if (bfree <= vm_dma_reserved / PAGE_SIZE)
783 			break;
784 		if (count == 0)
785 			break;
786 
787 		/*
788 		 * Figure out how much of the initial reserve we have to
789 		 * free in order to reach our target.
790 		 */
791 		bfree -= vm_dma_reserved / PAGE_SIZE;
792 		if (count > bfree) {
793 			blk += count - bfree;
794 			count = bfree;
795 		}
796 
797 		/*
798 		 * Calculate the nearest power of 2 <= count.
799 		 */
800 		for (xcount = 1; xcount <= count; xcount <<= 1)
801 			;
802 		xcount >>= 1;
803 		blk += count - xcount;
804 		count = xcount;
805 
806 		/*
807 		 * Allocate the pages from the alist, then free them to
808 		 * the normal VM page queues.
809 		 *
810 		 * Pages allocated from the alist are wired.  We have to
811 		 * busy, unwire, and free them.  We must also adjust
812 		 * vm_low_phys_reserved before freeing any pages to prevent
813 		 * confusion.
814 		 */
815 		rblk = alist_alloc(&vm_contig_alist, blk, count);
816 		if (rblk != blk) {
817 			kprintf("vm_page_startup_finish: Unable to return "
818 				"dma space @0x%08x/%d -> 0x%08x\n",
819 				blk, count, rblk);
820 			break;
821 		}
822 		atomic_add_long(&vmstats.v_dma_pages, -(long)count);
823 		spin_unlock(&vm_contig_spin);
824 
825 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
826 		vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
827 		while (count) {
828 			vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);
829 			vm_page_busy_wait(m, FALSE, "cpgfr");
830 			vm_page_unwire(m, 0);
831 			vm_page_free(m);
832 			--count;
833 			++m;
834 		}
835 		spin_lock(&vm_contig_spin);
836 	}
837 	spin_unlock(&vm_contig_spin);
838 
839 	/*
840 	 * Print out how much DMA space drivers have already allocated and
841 	 * how much is left over.
842 	 */
843 	kprintf("DMA space used: %jdk, remaining available: %jdk\n",
844 		(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
845 		(PAGE_SIZE / 1024),
846 		(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
847 
848 	/*
849 	 * Power of 2
850 	 */
851 	vm_page_hash_size = 4096;
852 	while (vm_page_hash_size < (vm_page_array_size / 16))
853 		vm_page_hash_size <<= 1;
854 	if (vm_page_hash_size > VM_PAGE_HASH_MAX)
855 		vm_page_hash_size = VM_PAGE_HASH_MAX;
856 
857 	/*
858 	 * hash table for vm_page_lookup_quick()
859 	 */
860 	mp = (void *)kmem_alloc3(&kernel_map,
861 				 (vm_page_hash_size + VM_PAGE_HASH_SET) *
862 				  sizeof(*vm_page_hash),
863 				 VM_SUBSYS_VMPGHASH, KM_CPU(0));
864 	bzero(mp, (vm_page_hash_size + VM_PAGE_HASH_SET) * sizeof(*mp));
865 	cpu_sfence();
866 	vm_page_hash = mp;
867 }
868 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
869 	vm_page_startup_finish, NULL);
870 
871 
872 /*
873  * Scan comparison function for Red-Black tree scans.  An inclusive
874  * (start,end) is expected.  Other fields are not used.
875  */
876 int
877 rb_vm_page_scancmp(struct vm_page *p, void *data)
878 {
879 	struct rb_vm_page_scan_info *info = data;
880 
881 	if (p->pindex < info->start_pindex)
882 		return(-1);
883 	if (p->pindex > info->end_pindex)
884 		return(1);
885 	return(0);
886 }
887 
888 int
889 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
890 {
891 	if (p1->pindex < p2->pindex)
892 		return(-1);
893 	if (p1->pindex > p2->pindex)
894 		return(1);
895 	return(0);
896 }
897 
898 void
899 vm_page_init(vm_page_t m)
900 {
901 	/* do nothing for now.  Called from pmap_page_init() */
902 }
903 
904 /*
905  * Each page queue has its own spin lock, which is fairly optimal for
906  * allocating and freeing pages at least.
907  *
908  * The caller must hold the vm_page_spin_lock() before locking a vm_page's
909  * queue spinlock via this function.  Also note that m->queue cannot change
910  * unless both the page and queue are locked.
911  */
912 static __inline
913 void
914 _vm_page_queue_spin_lock(vm_page_t m)
915 {
916 	u_short queue;
917 
918 	queue = m->queue;
919 	if (queue != PQ_NONE) {
920 		spin_lock(&vm_page_queues[queue].spin);
921 		KKASSERT(queue == m->queue);
922 	}
923 }
924 
925 static __inline
926 void
927 _vm_page_queue_spin_unlock(vm_page_t m)
928 {
929 	u_short queue;
930 
931 	queue = m->queue;
932 	cpu_ccfence();
933 	if (queue != PQ_NONE)
934 		spin_unlock(&vm_page_queues[queue].spin);
935 }
936 
937 static __inline
938 void
939 _vm_page_queues_spin_lock(u_short queue)
940 {
941 	cpu_ccfence();
942 	if (queue != PQ_NONE)
943 		spin_lock(&vm_page_queues[queue].spin);
944 }
945 
946 
947 static __inline
948 void
949 _vm_page_queues_spin_unlock(u_short queue)
950 {
951 	cpu_ccfence();
952 	if (queue != PQ_NONE)
953 		spin_unlock(&vm_page_queues[queue].spin);
954 }
955 
956 void
957 vm_page_queue_spin_lock(vm_page_t m)
958 {
959 	_vm_page_queue_spin_lock(m);
960 }
961 
962 void
963 vm_page_queues_spin_lock(u_short queue)
964 {
965 	_vm_page_queues_spin_lock(queue);
966 }
967 
968 void
969 vm_page_queue_spin_unlock(vm_page_t m)
970 {
971 	_vm_page_queue_spin_unlock(m);
972 }
973 
974 void
975 vm_page_queues_spin_unlock(u_short queue)
976 {
977 	_vm_page_queues_spin_unlock(queue);
978 }
979 
980 /*
981  * This locks the specified vm_page and its queue in the proper order
982  * (page first, then queue).  The queue may change so the caller must
983  * recheck on return.
984  */
985 static __inline
986 void
987 _vm_page_and_queue_spin_lock(vm_page_t m)
988 {
989 	vm_page_spin_lock(m);
990 	_vm_page_queue_spin_lock(m);
991 }
992 
993 static __inline
994 void
995 _vm_page_and_queue_spin_unlock(vm_page_t m)
996 {
997 	_vm_page_queues_spin_unlock(m->queue);
998 	vm_page_spin_unlock(m);
999 }
1000 
1001 void
1002 vm_page_and_queue_spin_unlock(vm_page_t m)
1003 {
1004 	_vm_page_and_queue_spin_unlock(m);
1005 }
1006 
1007 void
1008 vm_page_and_queue_spin_lock(vm_page_t m)
1009 {
1010 	_vm_page_and_queue_spin_lock(m);
1011 }
1012 
1013 /*
1014  * Helper function removes vm_page from its current queue.
1015  * Returns the base queue the page used to be on.
1016  *
1017  * The vm_page and the queue must be spinlocked.
1018  * This function will unlock the queue but leave the page spinlocked.
1019  */
1020 static __inline u_short
1021 _vm_page_rem_queue_spinlocked(vm_page_t m)
1022 {
1023 	struct vpgqueues *pq;
1024 	u_short queue;
1025 	u_short oqueue;
1026 	long *cnt_adj;
1027 	long *cnt_gd;
1028 
1029 	queue = m->queue;
1030 	if (queue != PQ_NONE) {
1031 		pq = &vm_page_queues[queue];
1032 		TAILQ_REMOVE(&pq->pl, m, pageq);
1033 
1034 		/*
1035 		 * Primarily adjust our pcpu stats for rollup, which is
1036 		 * (mycpu->gd_vmstats_adj + offset).  This is normally
1037 		 * synchronized on every hardclock().
1038 		 *
1039 		 * However, in order for the nominal low-memory algorithms
1040 		 * to work properly if the unsynchronized adjustment gets
1041 		 * too negative and might trigger the pageout daemon, we
1042 		 * immediately synchronize with the global structure.
1043 		 *
1044 		 * The idea here is to reduce unnecessary SMP cache mastership
1045 		 * changes in the global vmstats, which can be particularly
1046 		 * bad in multi-socket systems.
1047 		 *
1048 		 * WARNING! In systems with low amounts of memory the
1049 		 *	    vm_paging_start(-1024 * ncpus) test could
1050 		 *	    wind up testing a value above the paging target,
1051 		 *	    meaning it would almost always return TRUE.  In
1052 		 *	    that situation we synchronize every time the
1053 		 *	    cumulative adjustment falls below -1024.
1054 		 */
1055 		cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1056 				   pq->cnt_offset);
1057 		cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1058 				   pq->cnt_offset);
1059 		atomic_add_long(cnt_adj, -1);
1060 		atomic_add_long(cnt_gd, -1);
1061 
1062 		if (*cnt_adj < -1024 && vm_paging_start(-1024 * ncpus)) {
1063 			u_long copy = atomic_swap_long(cnt_adj, 0);
1064 			cnt_adj = (long *)((char *)&vmstats + pq->cnt_offset);
1065 			atomic_add_long(cnt_adj, copy);
1066 		}
1067 		pq->lcnt--;
1068 		m->queue = PQ_NONE;
1069 		oqueue = queue;
1070 		queue -= m->pc;
1071 		vm_page_queues_spin_unlock(oqueue);	/* intended */
1072 	}
1073 	return queue;
1074 }
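/*
 * Sketch of the cnt_offset indirection used above (illustrative only):
 * vm_page_queue_init() stored offsetof(struct vmstats, v_free_count) in
 * the PQ_FREE queues, so for a PQ_FREE page the pointer math resolves to
 * the equivalent of:
 *
 *	cnt_adj = &mycpu->gd_vmstats_adj.v_free_count;
 *	cnt_gd  = &mycpu->gd_vmstats.v_free_count;
 *
 * (assuming gd_vmstats_adj and gd_vmstats have the struct vmstats layout,
 * which is how they are used throughout this file).
 */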
1075 
1076 /*
1077  * Helper function places the vm_page on the specified queue.  Generally
1078  * speaking only PQ_FREE pages are placed at the head, to allow them to
1079  * be allocated sooner rather than later on the assumption that they
1080  * are cache-hot.
1081  *
1082  * The vm_page must be spinlocked.
1083  * The vm_page must NOT be FICTITIOUS (that would be a disaster)
1084  * This function will return with both the page and the queue locked.
1085  */
1086 static __inline void
1087 _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
1088 {
1089 	struct vpgqueues *pq;
1090 	long *cnt_adj;
1091 	long *cnt_gd;
1092 
1093 	KKASSERT(m->queue == PQ_NONE &&
1094 		 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0);
1095 
1096 	if (queue != PQ_NONE) {
1097 		vm_page_queues_spin_lock(queue);
1098 		pq = &vm_page_queues[queue];
1099 		++pq->lcnt;
1100 
1101 		/*
1102 		 * Adjust our pcpu stats.  If a system entity really needs
1103 		 * to incorporate the count it will call vmstats_rollup()
1104 		 * to roll it all up into the global vmstats structure.
1105 		 */
1106 		cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1107 				   pq->cnt_offset);
1108 		cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1109 				   pq->cnt_offset);
1110 		atomic_add_long(cnt_adj, 1);
1111 		atomic_add_long(cnt_gd, 1);
1112 
1113 		/*
1114 		 * PQ_FREE is always handled LIFO style to try to provide
1115 		 * cache-hot pages to programs.
1116 		 */
1117 		m->queue = queue;
1118 		if (queue - m->pc == PQ_FREE) {
1119 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1120 		} else if (athead) {
1121 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1122 		} else {
1123 			TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
1124 		}
1125 		/* leave the queue spinlocked */
1126 	}
1127 }
1128 
1129 /*
1130  * Wait until page is no longer BUSY.  If also_m_busy is TRUE we wait
1131  * until the page is no longer BUSY or SBUSY (busy_count field is 0).
1132  *
1133  * At most one sleep call will be made before returning (the function
1134  * does not loop until the page actually becomes non-busy).
1135  *
1136  * This function does NOT busy the page and on return the page is not
1137  * guaranteed to be available.
1138  */
1139 void
1140 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
1141 {
1142 	u_int32_t busy_count;
1143 
1144 	for (;;) {
1145 		busy_count = m->busy_count;
1146 		cpu_ccfence();
1147 
1148 		if ((busy_count & PBUSY_LOCKED) == 0 &&
1149 		    (also_m_busy == 0 || (busy_count & PBUSY_MASK) == 0)) {
1150 			break;
1151 		}
1152 		tsleep_interlock(m, 0);
1153 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1154 				      busy_count | PBUSY_WANTED)) {
1155 			atomic_set_int(&m->flags, PG_REFERENCED);
1156 			tsleep(m, PINTERLOCKED, msg, 0);
1157 			break;
1158 		}
1159 	}
1160 }
1161 
1162 /*
1163  * This calculates and returns a page color given an optional VM object and
1164  * either a pindex or an iterator.  We attempt to return a cpu-localized
1165  * pg_color that is still roughly 16-way set-associative.  The CPU topology
1166  * is used if it was probed.
1167  *
1168  * The caller may use the returned value to index into e.g. PQ_FREE when
1169  * allocating a page in order to nominally obtain pages that are hopefully
1170  * already localized to the requesting cpu.  This function is not able to
1171  * provide any sort of guarantee of this, but does its best to improve
1172  * hardware cache management performance.
1173  *
1174  * WARNING! The caller must mask the returned value with PQ_L2_MASK.
1175  */
1176 u_short
1177 vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
1178 {
1179 	u_short pg_color;
1180 	int object_pg_color;
1181 
1182 	/*
1183 	 * WARNING! cpu_topology_core_ids might not be a power of two.
1184 	 *	    We also shouldn't make assumptions about
1185 	 *	    cpu_topology_phys_ids either.
1186 	 *
1187 	 * WARNING! ncpus might not be known at this time (during early
1188 	 *	    boot), and might be set to 1.
1189 	 *
1190 	 * General format: [phys_id][core_id][cpuid][set-associativity]
1191 	 * (but uses modulo, so not necessarily precise bit masks)
1192 	 */
1193 	object_pg_color = object ? object->pg_color : 0;
1194 
1195 	if (cpu_topology_ht_ids) {
1196 		int phys_id;
1197 		int core_id;
1198 		int ht_id;
1199 		int physcale;
1200 		int grpscale;
1201 		int cpuscale;
1202 
1203 		/*
1204 		 * Translate cpuid to socket, core, and hyperthread id.
1205 		 */
1206 		phys_id = get_cpu_phys_id(cpuid);
1207 		core_id = get_cpu_core_id(cpuid);
1208 		ht_id = get_cpu_ht_id(cpuid);
1209 
1210 		/*
1211 		 * Calculate pg_color for our array index.
1212 		 *
1213 		 * physcale - socket multiplier.
1214 		 * grpscale - core multiplier (cores per socket)
1215 		 * cpu*	    - cpus per core
1216 		 *
1217 		 * WARNING! In early boot, ncpus has not yet been
1218 		 *	    initialized and may be set to (1).
1219 		 *
1220 		 * WARNING! physcale must match the organization that
1221 		 *	    vm_numa_organize() creates to ensure that
1222 		 *	    we properly localize allocations to the
1223 		 *	    requested cpuid.
1224 		 */
1225 		physcale = PQ_L2_SIZE / cpu_topology_phys_ids;
1226 		grpscale = physcale / cpu_topology_core_ids;
1227 		cpuscale = grpscale / cpu_topology_ht_ids;
1228 
1229 		pg_color = phys_id * physcale;
1230 		pg_color += core_id * grpscale;
1231 		pg_color += ht_id * cpuscale;
1232 		pg_color += (pindex + object_pg_color) % cpuscale;
1233 
1234 #if 0
1235 		if (grpsize >= 8) {
1236 			pg_color += (pindex + object_pg_color) % grpsize;
1237 		} else {
1238 			if (grpsize <= 2) {
1239 				grpsize = 8;
1240 			} else {
1241 				/* 3->9, 4->8, 5->10, 6->12, 7->14 */
1242 				grpsize += grpsize;
1243 				if (grpsize < 8)
1244 					grpsize += grpsize;
1245 			}
1246 			pg_color += (pindex + object_pg_color) % grpsize;
1247 		}
1248 #endif
1249 	} else {
1250 		/*
1251 		 * Unknown topology, distribute things evenly.
1252 		 *
1253 		 * WARNING! In early boot, ncpus has not yet been
1254 		 *	    initialized and may be set to (1).
1255 		 */
1256 		int cpuscale;
1257 
1258 		cpuscale = PQ_L2_SIZE / ncpus;
1259 
1260 		pg_color = cpuid * cpuscale;
1261 		pg_color += (pindex + object_pg_color) % cpuscale;
1262 	}
1263 	return (pg_color & PQ_L2_MASK);
1264 }
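/*
 * Worked example (illustrative only; assumes PQ_L2_SIZE == 1024 and a
 * probed topology of 2 sockets x 8 cores x 2 threads): physcale == 512,
 * grpscale == 64, cpuscale == 32.  A cpu with phys_id 1, core_id 3 and
 * ht_id 1 therefore gets:
 *
 *	pg_color = 1*512 + 3*64 + 1*32 + (pindex + object_pg_color) % 32
 *		 = 736 ... 767
 *
 * so consecutive pindexes cycle through a 32-color window private to that
 * hyperthread.
 */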
1265 
1266 /*
1267  * Wait until BUSY can be set, then set it.  If also_m_busy is TRUE we
1268  * also wait for m->busy_count to become 0 before setting PBUSY_LOCKED.
1269  */
1270 void
1271 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
1272 				     int also_m_busy, const char *msg
1273 				     VM_PAGE_DEBUG_ARGS)
1274 {
1275 	u_int32_t busy_count;
1276 
1277 	for (;;) {
1278 		busy_count = m->busy_count;
1279 		cpu_ccfence();
1280 		if (busy_count & PBUSY_LOCKED) {
1281 			tsleep_interlock(m, 0);
1282 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1283 					  busy_count | PBUSY_WANTED)) {
1284 				atomic_set_int(&m->flags, PG_REFERENCED);
1285 				tsleep(m, PINTERLOCKED, msg, 0);
1286 			}
1287 		} else if (also_m_busy && busy_count) {
1288 			tsleep_interlock(m, 0);
1289 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1290 					  busy_count | PBUSY_WANTED)) {
1291 				atomic_set_int(&m->flags, PG_REFERENCED);
1292 				tsleep(m, PINTERLOCKED, msg, 0);
1293 			}
1294 		} else {
1295 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1296 					      busy_count | PBUSY_LOCKED)) {
1297 #ifdef VM_PAGE_DEBUG
1298 				m->busy_func = func;
1299 				m->busy_line = lineno;
1300 #endif
1301 				break;
1302 			}
1303 		}
1304 	}
1305 }
1306 
1307 /*
1308  * Attempt to set BUSY.  If also_m_busy is TRUE we only succeed if
1309  * m->busy_count is also 0.
1310  *
1311  * Returns non-zero on failure.
1312  */
1313 int
1314 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
1315 				    VM_PAGE_DEBUG_ARGS)
1316 {
1317 	u_int32_t busy_count;
1318 
1319 	for (;;) {
1320 		busy_count = m->busy_count;
1321 		cpu_ccfence();
1322 		if (busy_count & PBUSY_LOCKED)
1323 			return TRUE;
1324 		if (also_m_busy && (busy_count & PBUSY_MASK) != 0)
1325 			return TRUE;
1326 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1327 				      busy_count | PBUSY_LOCKED)) {
1328 #ifdef VM_PAGE_DEBUG
1329 			m->busy_func = func;
1330 			m->busy_line = lineno;
1331 #endif
1332 			return FALSE;
1333 		}
1334 	}
1335 }
1336 
1337 /*
1338  * Clear the BUSY flag and return non-zero to indicate to the caller
1339  * that a wakeup() should be performed.
1340  *
1341  * (inline version)
1342  */
1343 static __inline
1344 int
1345 _vm_page_wakeup(vm_page_t m)
1346 {
1347 	u_int32_t busy_count;
1348 
1349 	busy_count = m->busy_count;
1350 	cpu_ccfence();
1351 	for (;;) {
1352 		if (atomic_fcmpset_int(&m->busy_count, &busy_count,
1353 				      busy_count &
1354 				      ~(PBUSY_LOCKED | PBUSY_WANTED))) {
1355 			return((int)(busy_count & PBUSY_WANTED));
1356 		}
1357 	}
1358 	/* not reached */
1359 }
1360 
1361 /*
1362  * Clear the BUSY flag and wakeup anyone waiting for the page.  This
1363  * is typically the last call you make on a page before moving onto
1364  * other things.
1365  */
1366 void
1367 vm_page_wakeup(vm_page_t m)
1368 {
1369         KASSERT(m->busy_count & PBUSY_LOCKED,
1370 		("vm_page_wakeup: page not busy!!!"));
1371 	if (_vm_page_wakeup(m))
1372 		wakeup(m);
1373 }
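/*
 * Typical hard-busy usage sketch (not compiled; 'obj', 'pindex' and the
 * "pgexam" wmesg are placeholders):
 */
#if 0
	vm_page_t m;

	/* the object token must be held across the lookup */
	m = vm_page_lookup_busy_wait(obj, pindex, TRUE, "pgexam");
	if (m) {
		/* ... examine or modify the hard-busied page ... */
		vm_page_wakeup(m);	/* clear BUSY and wake any waiters */
	}
#endif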
1374 
1375 /*
1376  * Hold a page, preventing reuse.  This is typically only called on pages
1377  * in a known state (either held busy, special, or interlocked in some
1378  * manner).  Holding a page does not ensure that it remains valid, it only
1379  * prevents reuse.  The page must not already be on the FREE queue or in
1380  * any danger of being moved to the FREE queue concurrent with this call.
1381  *
1382  * Other parts of the system can still disassociate the page from its object
1383  * and attempt to free it, or perform read or write I/O on it and/or otherwise
1384  * manipulate the page, but if the page is held the VM system will leave the
1385  * page and its data intact and not cycle it through the FREE queue until
1386  * the last hold has been released.
1387  *
1388  * (see vm_page_wire() if you want to prevent the page from being
1389  *  disassociated from its object too).
1390  */
1391 void
1392 vm_page_hold(vm_page_t m)
1393 {
1394 	atomic_add_int(&m->hold_count, 1);
1395 	KKASSERT(m->queue - m->pc != PQ_FREE);
1396 }
1397 
1398 /*
1399  * The opposite of vm_page_hold().  If the page is on the HOLD queue
1400  * it was freed while held and must be moved back to the FREE queue.
1401  *
1402  * To avoid racing against vm_page_free*() we must re-test conditions
1403  * after obtaining the spin-lock.  The initial test can also race a
1404  * vm_page_free*() that is in the middle of moving a page to PQ_HOLD,
1405  * leaving the page on PQ_HOLD with hold_count == 0.  Rather than
1406  * throw a spin-lock in the critical path, we rely on the pageout
1407  * daemon to clean-up these loose ends.
1408  *
1409  * More critically, the 'easy movement' between queues without busying
1410  * a vm_page is only allowed for PQ_FREE<->PQ_HOLD.
1411  */
1412 void
1413 vm_page_unhold(vm_page_t m)
1414 {
1415 	KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
1416 		("vm_page_unhold: pg %p illegal hold_count (%d) or "
1417 		 "on FREE queue (%d)",
1418 		 m, m->hold_count, m->queue - m->pc));
1419 
1420 	if (atomic_fetchadd_int(&m->hold_count, -1) == 1 &&
1421 	    m->queue - m->pc == PQ_HOLD) {
1422 		vm_page_spin_lock(m);
1423 		if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
1424 			_vm_page_queue_spin_lock(m);
1425 			_vm_page_rem_queue_spinlocked(m);
1426 			_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
1427 			_vm_page_queue_spin_unlock(m);
1428 		}
1429 		vm_page_spin_unlock(m);
1430 	}
1431 }
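/*
 * Usage sketch (not compiled; 'm' is assumed to already be in a known
 * stable state, e.g. busied or otherwise interlocked, per the comment
 * above vm_page_hold()):
 */
#if 0
	vm_page_hold(m);	/* page may be freed but not recycled */
	/* ... transient access that must not lose the page ... */
	vm_page_unhold(m);	/* may move the page from PQ_HOLD back to PQ_FREE */
#endif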
1432 
1433 /*
1434  * Create a fictitious page with the specified physical address and
1435  * memory attribute.  The memory attribute is the only machine-
1436  * dependent aspect of a fictitious page that must be initialized.
1437  */
1438 void
1439 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1440 {
1441 	/*
1442 	 * The page's memattr might have changed since the
1443 	 * previous initialization.  Update the pmap to the
1444 	 * new memattr.
1445 	 */
1446 	if ((m->flags & PG_FICTITIOUS) != 0)
1447 		goto memattr;
1448 	m->phys_addr = paddr;
1449 	m->queue = PQ_NONE;
1450 	/* Fictitious pages don't use "segind". */
1451 	/* Fictitious pages don't use "order" or "pool". */
1452 	m->flags = PG_FICTITIOUS | PG_UNQUEUED;
1453 	m->busy_count = PBUSY_LOCKED;
1454 	m->wire_count = 1;
1455 	spin_init(&m->spin, "fake_page");
1456 	pmap_page_init(m);
1457 memattr:
1458 	pmap_page_set_memattr(m, memattr);
1459 }
1460 
1461 /*
1462  * Inserts the given vm_page into the object and object list.
1463  *
1464  * The pagetables are not updated but will presumably fault the page
1465  * in if necessary, or if a kernel page the caller will at some point
1466  * enter the page into the kernel's pmap.  We are not allowed to block
1467  * here so we *can't* do this anyway.
1468  *
1469  * This routine may not block.
1470  * This routine must be called with the vm_object held.
1471  * This routine must be called with a critical section held.
1472  *
1473  * This routine returns TRUE if the page was inserted into the object
1474  * successfully, and FALSE if the page already exists in the object.
1475  */
1476 int
1477 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1478 {
1479 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
1480 	if (m->object != NULL)
1481 		panic("vm_page_insert: already inserted");
1482 
1483 	atomic_add_int(&object->generation, 1);
1484 
1485 	/*
1486 	 * Associate the VM page with an (object, offset).
1487 	 *
1488 	 * The vm_page spin lock is required for interactions with the pmap.
1489 	 * XXX vm_page_spin_lock() might not be needed for this any more.
1490 	 */
1491 	vm_page_spin_lock(m);
1492 	m->object = object;
1493 	m->pindex = pindex;
1494 	if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
1495 		m->object = NULL;
1496 		m->pindex = 0;
1497 		vm_page_spin_unlock(m);
1498 		return FALSE;
1499 	}
1500 	++object->resident_page_count;
1501 	++mycpu->gd_vmtotal.t_rm;
1502 	vm_page_spin_unlock(m);
1503 
1504 	/*
1505 	 * Since we are inserting a new and possibly dirty page,
1506 	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
1507 	 */
1508 	if ((m->valid & m->dirty) ||
1509 	    (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
1510 		vm_object_set_writeable_dirty(object);
1511 
1512 	/*
1513 	 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
1514 	 */
1515 	swap_pager_page_inserted(m);
1516 	return TRUE;
1517 }
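/*
 * Usage sketch (not compiled; 'obj', 'pindex' and 'm' are placeholders).
 * Per the comment above, the object must be held across the call:
 */
#if 0
	vm_object_hold(obj);			/* exclusive object token */
	if (vm_page_insert(m, obj, pindex) == FALSE) {
		/* a page already exists at (obj, pindex) */
	}
	vm_object_drop(obj);
#endif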
1518 
1519 /*
1520  * Removes the given vm_page_t from the (object,index) table
1521  *
1522  * The page must be BUSY and will remain BUSY on return.
1523  * No other requirements.
1524  *
1525  * NOTE: FreeBSD side effect was to unbusy the page on return.  We leave
1526  *	 it busy.
1527  *
1528  * NOTE: Caller is responsible for any pmap disposition prior to the
1529  *	 rename (as the pmap code will not be able to find the entries
1530  *	 once the object has been disassociated).  The caller may choose
1531  *	 to leave the pmap association intact if this routine is being
1532  *	 called as part of a rename between shadowed objects.
1533  *
1534  * This routine may not block.
1535  */
1536 void
1537 vm_page_remove(vm_page_t m)
1538 {
1539 	vm_object_t object;
1540 
1541 	if (m->object == NULL) {
1542 		return;
1543 	}
1544 
1545 	if ((m->busy_count & PBUSY_LOCKED) == 0)
1546 		panic("vm_page_remove: page not busy");
1547 
1548 	object = m->object;
1549 
1550 	vm_object_hold(object);
1551 
1552 	/*
1553 	 * Remove the page from the object and update the object.
1554 	 *
1555 	 * The vm_page spin lock is required for interactions with the pmap.
1556 	 * XXX vm_page_spin_lock() might not be needed for this any more.
1557 	 */
1558 	vm_page_spin_lock(m);
1559 	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
1560 	--object->resident_page_count;
1561 	--mycpu->gd_vmtotal.t_rm;
1562 	m->object = NULL;
1563 	atomic_add_int(&object->generation, 1);
1564 	vm_page_spin_unlock(m);
1565 
1566 	vm_object_drop(object);
1567 }
1568 
1569 /*
1570  * Calculate the hash position for the vm_page hash heuristic.  Generally
1571  * speaking we want to localize sequential lookups to reduce memory stalls.
1572  *
1573  * Mask by ~3 to offer 4-way set-assoc
1574  * Callers scan VM_PAGE_HASH_SET consecutive entries for 4-way set-assoc.
1575 static __inline
1576 struct vm_page_hash_elm *
1577 vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
1578 {
1579 	size_t hi;
1580 
1581 	hi = iscsi_crc32(&object, sizeof(object)) << 2;
1582 	hi ^= hi >> (23 - 2);
1583 	hi += pindex * VM_PAGE_HASH_SET;
1584 #if 0
1585 	/* mix it up */
1586 	hi = (intptr_t)object ^ object->pg_color ^ pindex;
1587 	hi += object->pg_color * pindex;
1588 	hi = hi ^ (hi >> 20);
1589 #endif
1590 	hi &= vm_page_hash_size - 1;		/* bounds */
1591 
1592 	return (&vm_page_hash[hi]);
1593 }
1594 
1595 /*
1596  * Heuristical page lookup that does not require any locks.  Returns
1597  * a soft-busied page on success, NULL on failure.
1598  *
1599  * Caller must lookup the page the slow way if NULL is returned.
1600  */
1601 vm_page_t
1602 vm_page_hash_get(vm_object_t object, vm_pindex_t pindex)
1603 {
1604 	struct vm_page_hash_elm *mp;
1605 	vm_page_t m;
1606 	int i;
1607 
1608 	if (__predict_false(vm_page_hash == NULL))
1609 		return NULL;
1610 	mp = vm_page_hash_hash(object, pindex);
1611 	for (i = 0; i < VM_PAGE_HASH_SET; ++i, ++mp) {
1612 		if (mp->object != object ||
1613 		    mp->pindex != pindex) {
1614 			continue;
1615 		}
1616 		m = mp->m;
1617 		cpu_ccfence();
1618 		if (m == NULL)
1619 			continue;
1620 		if (m->object != object || m->pindex != pindex)
1621 			continue;
1622 		if (vm_page_sbusy_try(m))
1623 			continue;
1624 		if (m->object == object && m->pindex == pindex) {
1625 			/*
1626 			 * On-match optimization - do not update ticks
1627 			 * unless we have to (reduce cache coherency traffic)
1628 			 */
1629 			if (mp->ticks != ticks)
1630 				mp->ticks = ticks;
1631 			return m;
1632 		}
1633 		vm_page_sbusy_drop(m);
1634 	}
1635 	return NULL;
1636 }
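/*
 * Usage sketch (not compiled; 'obj' and 'pindex' are placeholders): the
 * lockless fast path returns a soft-busied page, and the caller falls
 * back to the locked lookup when it returns NULL.
 */
#if 0
	vm_page_t m;

	m = vm_page_hash_get(obj, pindex);
	if (m) {
		/* ... read-mostly access to the soft-busied page ... */
		vm_page_sbusy_drop(m);
	} else {
		vm_object_hold(obj);
		m = vm_page_lookup(obj, pindex);	/* slow path */
		/* ... */
		vm_object_drop(obj);
	}
#endif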
1637 
1638 /*
1639  * Enter page onto vm_page_hash[].  This is a heuristic, SMP collisions
1640  * are allowed.
1641  */
1642 static __inline
1643 void
1644 vm_page_hash_enter(vm_page_t m)
1645 {
1646 	struct vm_page_hash_elm *mp;
1647 	struct vm_page_hash_elm *best;
1648 	vm_object_t object;
1649 	vm_pindex_t pindex;
1650 	int best_delta;
1651 	int delta;
1652 	int i;
1653 
1654 	/*
1655 	 * Only enter type-stable vm_pages with well-shared objects.
1656 	 */
1657 	if ((m->flags & PG_MAPPEDMULTI) == 0)
1658 		return;
1659 	if (__predict_false(vm_page_hash == NULL ||
1660 			    m < &vm_page_array[0] ||
1661 			    m >= &vm_page_array[vm_page_array_size])) {
1662 		return;
1663 	}
1664 	if (__predict_false(m->object == NULL))
1665 		return;
1666 #if 0
1667 	/*
1668 	 * Disabled at the moment, there are some degenerate conditions
1669 	 * with often-exec'd programs that get ignored.  In particular,
1670 	 * the kernel's elf loader does a vn_rdwr() on the first page of
1671 	 * a binary.
1672 	 */
1673 	if (m->object->ref_count <= 2 || (m->object->flags & OBJ_ONEMAPPING))
1674 		return;
1675 #endif
1676 	if (vm_page_hash_vnode_only && m->object->type != OBJT_VNODE)
1677 		return;
1678 
1679 	/*
1680 	 * Find best entry
1681 	 */
1682 	object = m->object;
1683 	pindex = m->pindex;
1684 
1685 	mp = vm_page_hash_hash(object, pindex);
1686 	best = mp;
1687 	best_delta = ticks - best->ticks;
1688 
1689 	for (i = 0; i < VM_PAGE_HASH_SET; ++i, ++mp) {
1690 		if (mp->m == m &&
1691 		    mp->object == object &&
1692 		    mp->pindex == pindex) {
1693 			/*
1694 			 * On-match optimization - do not update ticks
1695 			 * unless we have to (reduce cache coherency traffic)
1696 			 */
1697 			if (mp->ticks != ticks)
1698 				mp->ticks = ticks;
1699 			return;
1700 		}
1701 
1702 		/*
1703 		 * The best choice is the oldest entry.
1704 		 *
1705 		 * Also check for a field overflow, using -1 instead of 0
1706 		 * to deal with SMP races on accessing the 'ticks' global.
1707 		 */
1708 		delta = ticks - mp->ticks;
1709 		if (delta < -1)
1710 			best = mp;
1711 		if (best_delta < delta)
1712 			best = mp;
1713 	}
1714 
1715 	/*
1716 	 * Load the entry.  Copy a few elements to the hash entry itself
1717 	 * to reduce memory stalls due to memory indirects on lookups.
1718 	 */
1719 	best->m = m;
1720 	best->object = object;
1721 	best->pindex = pindex;
1722 	best->ticks = ticks;
1723 }
1724 
1725 /*
1726  * Locate and return the page at (object, pindex), or NULL if the
1727  * page could not be found.
1728  *
1729  * The caller must hold the vm_object token.
1730  */
1731 vm_page_t
1732 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1733 {
1734 	vm_page_t m;
1735 
1736 	/*
1737 	 * Search the hash table for this object/offset pair
1738 	 */
1739 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1740 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1741 	if (m) {
1742 		KKASSERT(m->object == object && m->pindex == pindex);
1743 		vm_page_hash_enter(m);
1744 	}
1745 	return(m);
1746 }
1747 
1748 vm_page_t
1749 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
1750 					    vm_pindex_t pindex,
1751 					    int also_m_busy, const char *msg
1752 					    VM_PAGE_DEBUG_ARGS)
1753 {
1754 	u_int32_t busy_count;
1755 	vm_page_t m;
1756 
1757 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1758 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1759 	while (m) {
1760 		KKASSERT(m->object == object && m->pindex == pindex);
1761 		busy_count = m->busy_count;
1762 		cpu_ccfence();
1763 		if (busy_count & PBUSY_LOCKED) {
1764 			tsleep_interlock(m, 0);
1765 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1766 					  busy_count | PBUSY_WANTED)) {
1767 				atomic_set_int(&m->flags, PG_REFERENCED);
1768 				tsleep(m, PINTERLOCKED, msg, 0);
1769 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1770 							      pindex);
1771 			}
1772 		} else if (also_m_busy && busy_count) {
1773 			tsleep_interlock(m, 0);
1774 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1775 					  busy_count | PBUSY_WANTED)) {
1776 				atomic_set_int(&m->flags, PG_REFERENCED);
1777 				tsleep(m, PINTERLOCKED, msg, 0);
1778 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1779 							      pindex);
1780 			}
1781 		} else if (atomic_cmpset_int(&m->busy_count, busy_count,
1782 					     busy_count | PBUSY_LOCKED)) {
1783 #ifdef VM_PAGE_DEBUG
1784 			m->busy_func = func;
1785 			m->busy_line = lineno;
1786 #endif
1787 			vm_page_hash_enter(m);
1788 			break;
1789 		}
1790 	}
1791 	return m;
1792 }
1793 
1794 /*
1795  * Attempt to lookup and busy a page.
1796  *
1797  * Returns NULL if the page could not be found.
1798  *
1799  * Returns a vm_page and error == TRUE if the page exists but could not
1800  * be busied.
1801  *
1802  * Returns a vm_page and error == FALSE on success.
1803  */
1804 vm_page_t
1805 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
1806 					   vm_pindex_t pindex,
1807 					   int also_m_busy, int *errorp
1808 					   VM_PAGE_DEBUG_ARGS)
1809 {
1810 	u_int32_t busy_count;
1811 	vm_page_t m;
1812 
1813 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1814 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1815 	*errorp = FALSE;
1816 	while (m) {
1817 		KKASSERT(m->object == object && m->pindex == pindex);
1818 		busy_count = m->busy_count;
1819 		cpu_ccfence();
1820 		if (busy_count & PBUSY_LOCKED) {
1821 			*errorp = TRUE;
1822 			break;
1823 		}
1824 		if (also_m_busy && busy_count) {
1825 			*errorp = TRUE;
1826 			break;
1827 		}
1828 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1829 				      busy_count | PBUSY_LOCKED)) {
1830 #ifdef VM_PAGE_DEBUG
1831 			m->busy_func = func;
1832 			m->busy_line = lineno;
1833 #endif
1834 			vm_page_hash_enter(m);
1835 			break;
1836 		}
1837 	}
1838 	return m;
1839 }
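
/*
 * Example (illustrative sketch only; 'obj' and 'idx' are hypothetical):
 * distinguishing "not resident" from "resident but busy" with the try
 * variant above.  On *errorp == TRUE the page exists but could not be
 * busied, so the caller typically sleeps or retries instead of treating
 * the miss as a hard failure.
 *
 *	vm_page_t m;
 *	int error;
 *
 *	vm_object_hold(obj);
 *	m = vm_page_lookup_busy_try(obj, idx, TRUE, &error);
 *	if (m == NULL) {
 *		...page is not resident...
 *	} else if (error) {
 *		...page exists but is busy, sleep/retry...
 *	} else {
 *		...page is hard-busied by us, use it, then...
 *		vm_page_wakeup(m);
 *	}
 *	vm_object_drop(obj);
 */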
1840 
1841 /*
1842  * Returns a page that is only soft-busied for use by the caller in
1843  * a read-only fashion.  Returns NULL if the page could not be found,
1844  * the soft busy could not be obtained, or the page data is invalid.
1845  *
1846  * XXX Doesn't handle PG_FICTITIOUS pages at the moment, but there is
1847  *     no reason why we couldn't.
1848  */
1849 vm_page_t
1850 vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex,
1851 			 int pgoff, int pgbytes)
1852 {
1853 	vm_page_t m;
1854 
1855 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1856 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1857 	if (m) {
1858 		if ((m->valid != VM_PAGE_BITS_ALL &&
1859 		     !vm_page_is_valid(m, pgoff, pgbytes)) ||
1860 		    (m->flags & PG_FICTITIOUS)) {
1861 			m = NULL;
1862 		} else if (vm_page_sbusy_try(m)) {
1863 			m = NULL;
1864 		} else if ((m->valid != VM_PAGE_BITS_ALL &&
1865 			    !vm_page_is_valid(m, pgoff, pgbytes)) ||
1866 			   (m->flags & PG_FICTITIOUS)) {
1867 			vm_page_sbusy_drop(m);
1868 			m = NULL;
1869 		} else {
1870 			vm_page_hash_enter(m);
1871 		}
1872 	}
1873 	return m;
1874 }
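
/*
 * Example (illustrative sketch only; 'obj', 'idx', 'off', and 'bytes' are
 * hypothetical): read-only access via the soft-busy variant above.  A
 * successful return leaves only a soft-busy on the page, which the
 * caller must drop when done.
 *
 *	vm_page_t m;
 *
 *	vm_object_hold(obj);
 *	m = vm_page_lookup_sbusy_try(obj, idx, off, bytes);
 *	if (m) {
 *		...read the page data (do not modify the page)...
 *		vm_page_sbusy_drop(m);
 *	}
 *	vm_object_drop(obj);
 */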
1875 
1876 /*
1877  * Caller must hold the related vm_object
1878  */
1879 vm_page_t
1880 vm_page_next(vm_page_t m)
1881 {
1882 	vm_page_t next;
1883 
1884 	next = vm_page_rb_tree_RB_NEXT(m);
1885 	if (next && next->pindex != m->pindex + 1)
1886 		next = NULL;
1887 	return (next);
1888 }
1889 
1890 /*
1891  * vm_page_rename()
1892  *
1893  * Move the given vm_page from its current object to the specified
1894  * target object/offset.  The page must be busy and will remain so
1895  * on return.
1896  *
1897  * new_object must be held.
1898  * This routine might block. XXX ?
1899  *
1900  * NOTE: Swap associated with the page must be invalidated by the move.  We
1901  *       have to do this for several reasons:  (1) we aren't freeing the
1902  *       page, (2) we are dirtying the page, (3) the VM system is probably
1903  *       moving the page from object A to B, and will then later move
1904  *       the backing store from A to B and we can't have a conflict.
1905  *
1906  * NOTE: We *always* dirty the page.  It is necessary both for the
1907  *       fact that we moved it, and because we may be invalidating
1908  *	 swap.  If the page is on the cache, we have to deactivate it
1909  *	 or vm_page_dirty() will panic.  Dirty pages are not allowed
1910  *	 on the cache.
1911  *
1912  * NOTE: Caller is responsible for any pmap disposition prior to the
1913  *	 rename (as the pmap code will not be able to find the entries
1914  *	 once the object has been disassociated or changed).  Nominally
1915  *	 the caller is moving a page between shadowed objects and so the
1916  *	 pmap association is retained without having to remove the page
1917  *	 from it.
1918  */
1919 void
1920 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1921 {
1922 	KKASSERT(m->busy_count & PBUSY_LOCKED);
1923 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
1924 	if (m->object) {
1925 		ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
1926 		vm_page_remove(m);
1927 	}
1928 	if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
1929 		panic("vm_page_rename: target exists (%p,%"PRIu64")",
1930 		      new_object, new_pindex);
1931 	}
1932 	if (m->queue - m->pc == PQ_CACHE)
1933 		vm_page_deactivate(m);
1934 	vm_page_dirty(m);
1935 }
1936 
1937 /*
1938  * vm_page_unqueue() without any wakeup.  This routine is used when a page
1939  * is to remain busied by the caller.
1940  *
1941  * This routine may not block.
1942  */
1943 void
1944 vm_page_unqueue_nowakeup(vm_page_t m)
1945 {
1946 	vm_page_and_queue_spin_lock(m);
1947 	(void)_vm_page_rem_queue_spinlocked(m);
1948 	vm_page_spin_unlock(m);
1949 }
1950 
1951 /*
1952  * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedaemon
1953  * if necessary.
1954  *
1955  * This routine may not block.
1956  */
1957 void
1958 vm_page_unqueue(vm_page_t m)
1959 {
1960 	u_short queue;
1961 
1962 	vm_page_and_queue_spin_lock(m);
1963 	queue = _vm_page_rem_queue_spinlocked(m);
1964 	if (queue == PQ_FREE || queue == PQ_CACHE) {
1965 		vm_page_spin_unlock(m);
1966 		pagedaemon_wakeup();
1967 	} else {
1968 		vm_page_spin_unlock(m);
1969 	}
1970 }
1971 
1972 /*
1973  * vm_page_list_find()
1974  *
1975  * Find a page on the specified queue with color optimization.
1976  *
1977  * The page coloring optimization attempts to locate a page that does
1978  * not overload other nearby pages in the object in the cpu's L1 or L2
1979  * caches.  We need this optimization because cpu caches tend to be
1980  * physical caches, while object spaces tend to be virtual.
1981  *
1982  * The page coloring optimization also, very importantly, tries to localize
1983  * memory to cpus and physical sockets.
1984  *
1985  * Each PQ_FREE and PQ_CACHE color queue has its own spinlock and the
1986  * algorithm is adjusted to localize allocations on a per-core basis.
1987  * This is done by 'twisting' the colors.
1988  *
1989  * The page is returned spinlocked and removed from its queue (it will
1990  * be on PQ_NONE), or NULL. The page is not BUSY'd.  The caller
1991  * is responsible for dealing with the busy-page case (usually by
1992  * deactivating the page and looping).
1993  *
1994  * NOTE:  This routine is carefully inlined.  A non-inlined version
1995  *	  is available for outside callers but the only critical path is
1996  *	  from within this source file.
1997  *
1998  * NOTE:  This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
1999  *	  represent stable storage, allowing us to order our locks vm_page
2000  *	  first, then queue.
2001  *
2002  * WARNING! The returned page is not busied and may race other busying
2003  *	  operations; callers must check that the page is in the state they
2004  *	  want after busying.
2005  */
2006 static __inline
2007 vm_page_t
2008 _vm_page_list_find(int basequeue, int index)
2009 {
2010 	struct vpgqueues *pq;
2011 	vm_page_t m;
2012 
2013 	index &= PQ_L2_MASK;
2014 	pq = &vm_page_queues[basequeue + index];
2015 
2016 	/*
2017 	 * Try this cpu's colored queue first.  Test for a page unlocked,
2018 	 * then lock the queue and locate a page.  Note that the lock order
2019 	 * is reversed, but we do not want to dawdle on the page spinlock
2020 	 * anyway as it is held significantly longer than the queue spinlock.
2021 	 */
2022 	if (TAILQ_FIRST(&pq->pl)) {
2023 		spin_lock(&pq->spin);
2024 		TAILQ_FOREACH(m, &pq->pl, pageq) {
2025 			if (spin_trylock(&m->spin) == 0)
2026 				continue;
2027 			KKASSERT(m->queue == basequeue + index);
2028 			pq->lastq = -1;
2029 			return(m);
2030 		}
2031 		spin_unlock(&pq->spin);
2032 	}
2033 
2034 	m = _vm_page_list_find_wide(basequeue, index, &pq->lastq);
2035 
2036 	return(m);
2037 }
2038 
2039 /*
2040  * If we could not find the page in the desired queue try to find it in
2041  * a nearby (NUMA-aware) queue, spreading out as we go.
2042  */
2043 static vm_page_t
2044 _vm_page_list_find_wide(int basequeue, int index, int *lastp)
2045 {
2046 	struct vpgqueues *pq;
2047 	vm_page_t m = NULL;
2048 	int pqmask = set_assoc_mask >> 1;
2049 	int pqi;
2050 	int range;
2051 	int skip_start;
2052 	int skip_next;
2053 	int count;
2054 
2055 	/*
2056 	 * Avoid re-searching empty queues over and over again; skip ahead
2057 	 * to *lastp if appropriate.
2058 	 */
2059 	if (*lastp >= 0)
2060 		index = *lastp;
2061 
2062 	index &= PQ_L2_MASK;
2063 	pq = &vm_page_queues[basequeue];
2064 	count = 0;
2065 	skip_start = -1;
2066 	skip_next = -1;
2067 
2068 	/*
2069 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2070 	 * else fails (PQ_L2_MASK).
2071 	 *
2072 	 * pqmask is a mask, 15, 31, 63, etc.
2073 	 *
2074 	 * Test each queue unlocked first, then lock the queue and locate
2075 	 * a page.  Note that the lock order is reversed, but we do not want
2076 	 * to dawdle on the page spinlock anyway as it is held significantly
2077 	 * longer than the queue spinlock.
2078 	 */
2079 	do {
2080 		pqmask = (pqmask << 1) | 1;
2081 
2082 		pqi = index;
2083 		range = pqmask + 1;
2084 
2085 		while (range > 0) {
2086 			if (pqi >= skip_start && pqi < skip_next) {
2087 				range -= skip_next - pqi;
2088 				pqi = (pqi & ~pqmask) | (skip_next & pqmask);
2089 			}
2090 			if (range > 0 && TAILQ_FIRST(&pq[pqi].pl)) {
2091 				spin_lock(&pq[pqi].spin);
2092 				TAILQ_FOREACH(m, &pq[pqi].pl, pageq) {
2093 					if (spin_trylock(&m->spin) == 0)
2094 						continue;
2095 					KKASSERT(m->queue == basequeue + pqi);
2096 
2097 					/*
2098 					 * If we had to wander too far, set
2099 					 * *lastp to skip past empty queues.
2100 					 */
2101 					if (count >= 8)
2102 						*lastp = pqi & PQ_L2_MASK;
2103 					return(m);
2104 				}
2105 				spin_unlock(&pq[pqi].spin);
2106 			}
2107 			--range;
2108 			++count;
2109 			pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
2110 		}
2111 		skip_start = pqi & ~pqmask;
2112 		skip_next = (pqi | pqmask) + 1;
2113 	} while (pqmask != PQ_L2_MASK);
2114 
2115 	return(m);
2116 }
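
/*
 * Worked example of the widening search above (illustrative only).
 * Suppose set_assoc_mask is 31 and the starting index is 37.  The first
 * pass uses pqmask 31 and walks the aligned set [32,63], starting at 37
 * and wrapping within the set.  If nothing is found, skip_start/skip_next
 * record [32,64) and the next pass doubles pqmask to 63, walking the
 * remaining half [0,31] of the aligned set [0,63] while skipping the
 * half already searched, and so on until pqmask reaches PQ_L2_MASK and
 * the entire queue range has been covered.
 */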
2117 
2118 static __inline
2119 vm_page_t
2120 _vm_page_list_find2(int bq1, int bq2, int index)
2121 {
2122 	struct vpgqueues *pq1;
2123 	struct vpgqueues *pq2;
2124 	vm_page_t m;
2125 
2126 	index &= PQ_L2_MASK;
2127 	pq1 = &vm_page_queues[bq1 + index];
2128 	pq2 = &vm_page_queues[bq2 + index];
2129 
2130 	/*
2131 	 * Try this cpu's colored queue first.  Test for a page unlocked,
2132 	 * then lock the queue and locate a page.  Note that the lock order
2133 	 * is reversed, but we do not want to dawdle on the page spinlock
2134 	 * anyway as it is held significantly longer than the queue spinlock.
2135 	 */
2136 	if (TAILQ_FIRST(&pq1->pl)) {
2137 		spin_lock(&pq1->spin);
2138 		TAILQ_FOREACH(m, &pq1->pl, pageq) {
2139 			if (spin_trylock(&m->spin) == 0)
2140 				continue;
2141 			KKASSERT(m->queue == bq1 + index);
2142 			pq1->lastq = -1;
2143 			pq2->lastq = -1;
2144 			return(m);
2145 		}
2146 		spin_unlock(&pq1->spin);
2147 	}
2148 
2149 	m = _vm_page_list_find2_wide(bq1, bq2, index, &pq1->lastq, &pq2->lastq);
2150 
2151 	return(m);
2152 }
2153 
2154 
2155 /*
2156  * This version checks two queues at the same time, widening its search
2157  * as it progresses, preferring basequeue1 and starting on basequeue2
2158  * after exhausting the first set.  The idea
2159  * is to try to stay localized to the cpu.
2160  */
2161 static vm_page_t
2162 _vm_page_list_find2_wide(int basequeue1, int basequeue2, int index,
2163 			 int *lastp1, int *lastp2)
2164 {
2165 	struct vpgqueues *pq1;
2166 	struct vpgqueues *pq2;
2167 	vm_page_t m = NULL;
2168 	int pqmask1, pqmask2;
2169 	int pqi;
2170 	int range;
2171 	int skip_start1, skip_start2;
2172 	int skip_next1, skip_next2;
2173 	int count1, count2;
2174 
2175 	/*
2176 	 * Avoid re-searching empty queues over and over again; skip ahead
2177 	 * to *lastp1 if appropriate.
2178 	 */
2179 	if (*lastp1 >= 0)
2180 		index = *lastp1;
2181 
2182 	index &= PQ_L2_MASK;
2183 
2184 	pqmask1 = set_assoc_mask >> 1;
2185 	pq1 = &vm_page_queues[basequeue1];
2186 	count1 = 0;
2187 	skip_start1 = -1;
2188 	skip_next1 = -1;
2189 
2190 	pqmask2 = set_assoc_mask >> 1;
2191 	pq2 = &vm_page_queues[basequeue2];
2192 	count2 = 0;
2193 	skip_start2 = -1;
2194 	skip_next2 = -1;
2195 
2196 	/*
2197 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2198 	 * else fails (PQ_L2_MASK).
2199 	 *
2200 	 * pqmask is a mask, 15, 31, 63, etc.
2201 	 *
2202 	 * Test each queue unlocked first, then lock the queue and locate
2203 	 * a page.  Note that the lock order is reversed, but we do not want
2204 	 * to dawdle on the page spinlock anyway as it is held significantly
2205 	 * longer than the queue spinlock.
2206 	 */
2207 	do {
2208 		if (pqmask1 == PQ_L2_MASK)
2209 			goto skip2;
2210 
2211 		pqmask1 = (pqmask1 << 1) | 1;
2212 		pqi = index;
2213 		range = pqmask1 + 1;
2214 
2215 		while (range > 0) {
2216 			if (pqi >= skip_start1 && pqi < skip_next1) {
2217 				range -= skip_next1 - pqi;
2218 				pqi = (pqi & ~pqmask1) | (skip_next1 & pqmask1);
2219 			}
2220 			if (range > 0 && TAILQ_FIRST(&pq1[pqi].pl)) {
2221 				spin_lock(&pq1[pqi].spin);
2222 				TAILQ_FOREACH(m, &pq1[pqi].pl, pageq) {
2223 					if (spin_trylock(&m->spin) == 0)
2224 						continue;
2225 					KKASSERT(m->queue == basequeue1 + pqi);
2226 
2227 					/*
2228 					 * If we had to wander too far, set
2229 					 * *lastp to skip past empty queues.
2230 					 */
2231 					if (count1 >= 8)
2232 						*lastp1 = pqi & PQ_L2_MASK;
2233 					return(m);
2234 				}
2235 				spin_unlock(&pq1[pqi].spin);
2236 			}
2237 			--range;
2238 			++count1;
2239 			pqi = (pqi & ~pqmask1) | ((pqi + 1) & pqmask1);
2240 		}
2241 		skip_start1 = pqi & ~pqmask1;
2242 		skip_next1 = (pqi | pqmask1) + 1;
2243 skip2:
2244 		if (pqmask1 < ((set_assoc_mask << 1) | 1))
2245 			continue;
2246 
2247 		pqmask2 = (pqmask2 << 1) | 1;
2248 		pqi = index;
2249 		range = pqmask2 + 1;
2250 
2251 		while (range > 0) {
2252 			if (pqi >= skip_start2 && pqi < skip_next2) {
2253 				range -= skip_next2 - pqi;
2254 				pqi = (pqi & ~pqmask2) | (skip_next2 & pqmask2);
2255 			}
2256 			if (range > 0 && TAILQ_FIRST(&pq2[pqi].pl)) {
2257 				spin_lock(&pq2[pqi].spin);
2258 				TAILQ_FOREACH(m, &pq2[pqi].pl, pageq) {
2259 					if (spin_trylock(&m->spin) == 0)
2260 						continue;
2261 					KKASSERT(m->queue == basequeue2 + pqi);
2262 
2263 					/*
2264 					 * If we had to wander too far, set
2265 					 * *lastp to skip past empty queues.
2266 					 */
2267 					if (count2 >= 8)
2268 						*lastp2 = pqi & PQ_L2_MASK;
2269 					return(m);
2270 				}
2271 				spin_unlock(&pq2[pqi].spin);
2272 			}
2273 			--range;
2274 			++count2;
2275 			pqi = (pqi & ~pqmask2) | ((pqi + 1) & pqmask2);
2276 		}
2277 		skip_start2 = pqi & ~pqmask2;
2278 		skip_next2 = (pqi | pqmask2) + 1;
2279 	} while (pqmask1 != PQ_L2_MASK && pqmask2 != PQ_L2_MASK);
2280 
2281 	return(m);
2282 }
2283 
2284 /*
2285  * Returns a vm_page candidate for allocation.  The page is not busied so
2286  * it can move around.  The caller must busy the page (and typically
2287  * deactivate it if it cannot be busied!)
2288  *
2289  * Returns a spinlocked vm_page that has been removed from its queue.
2290  * (note that _vm_page_list_find() does not remove the page from its
2291  *  queue).
2292  */
2293 vm_page_t
2294 vm_page_list_find(int basequeue, int index)
2295 {
2296 	vm_page_t m;
2297 
2298 	m = _vm_page_list_find(basequeue, index);
2299 	if (m)
2300 		_vm_page_rem_queue_spinlocked(m);
2301 	return m;
2302 }
2303 
2304 /*
2305  * Find a page on the cache queue with color optimization, remove it
2306  * from the queue, and busy it.  The returned page will not be spinlocked.
2307  *
2308  * Candidates can fail, for example due to being busied by someone else,
2309  * in which case they are deactivated and the scan continues.
2310  *
2311  * This routine may not block.
2312  *
2313  */
2314 static vm_page_t
2315 vm_page_select_cache(u_short pg_color)
2316 {
2317 	vm_page_t m;
2318 
2319 	for (;;) {
2320 		m = _vm_page_list_find(PQ_CACHE, pg_color);
2321 		if (m == NULL)
2322 			break;
2323 		/*
2324 		 * (m) has been spinlocked
2325 		 */
2326 		_vm_page_rem_queue_spinlocked(m);
2327 		if (vm_page_busy_try(m, TRUE)) {
2328 			_vm_page_deactivate_locked(m, 0);
2329 			vm_page_spin_unlock(m);
2330 		} else {
2331 			/*
2332 			 * We successfully busied the page.  This can race
2333 			 * vm_page_lookup() + busy ops so make sure the
2334 			 * page is in the state we want.
2335 			 */
2336 			if ((m->flags & (PG_NEED_COMMIT | PG_MAPPED)) == 0 &&
2337 			    m->hold_count == 0 &&
2338 			    m->wire_count == 0 &&
2339 			    (m->dirty & m->valid) == 0) {
2340 				vm_page_spin_unlock(m);
2341 				KKASSERT((m->flags & PG_UNQUEUED) == 0);
2342 				pagedaemon_wakeup();
2343 				return(m);
2344 			}
2345 
2346 			/*
2347 			 * The page cannot be recycled, deactivate it.
2348 			 */
2349 			_vm_page_deactivate_locked(m, 0);
2350 			if (_vm_page_wakeup(m)) {
2351 				vm_page_spin_unlock(m);
2352 				wakeup(m);
2353 			} else {
2354 				vm_page_spin_unlock(m);
2355 			}
2356 		}
2357 	}
2358 	return (m);
2359 }
2360 
2361 /*
2362  * Find a free page.  We attempt to inline the nominal case and fall back
2363  * to _vm_page_select_free() otherwise.  A busied page is removed from
2364  * the queue and returned.
2365  *
2366  * This routine may not block.
2367  */
2368 static __inline vm_page_t
2369 vm_page_select_free(u_short pg_color)
2370 {
2371 	vm_page_t m;
2372 
2373 	for (;;) {
2374 		m = _vm_page_list_find(PQ_FREE, pg_color);
2375 		if (m == NULL)
2376 			break;
2377 		_vm_page_rem_queue_spinlocked(m);
2378 		if (vm_page_busy_try(m, TRUE)) {
2379 			/*
2380 			 * Various mechanisms such as a pmap_collect can
2381 			 * result in a busy page on the free queue.  We
2382 			 * have to move the page out of the way so we can
2383 			 * retry the allocation.  If the other thread is not
2384 			 * allocating the page then m->valid will remain 0 and
2385 			 * the pageout daemon will free the page later on.
2386 			 *
2387 			 * Since we could not busy the page, however, we
2388 			 * cannot make assumptions as to whether the page
2389 			 * will be allocated by the other thread or not,
2390 			 * so all we can do is deactivate it to move it out
2391 			 * of the way.  In particular, if the other thread
2392 			 * wires the page it may wind up on the inactive
2393 			 * queue and the pageout daemon will have to deal
2394 			 * with that case too.
2395 			 */
2396 			_vm_page_deactivate_locked(m, 0);
2397 			vm_page_spin_unlock(m);
2398 		} else {
2399 			/*
2400 			 * Theoretically if we are able to busy the page
2401 			 * atomically with the queue removal (using the vm_page
2402 			 * lock) nobody else should have been able to mess
2403 			 * with the page before us.
2404 			 *
2405 			 * Assert the page state.  Note that even though
2406 			 * wiring doesn't adjust queues, a page on the free
2407 			 * queue should never be wired at this point.
2408 			 */
2409 			KKASSERT((m->flags & (PG_UNQUEUED |
2410 					      PG_NEED_COMMIT)) == 0);
2411 			KASSERT(m->hold_count == 0,
2412 				("m->hold_count is not zero "
2413 				 "pg %p q=%d flags=%08x hold=%d wire=%d",
2414 				 m, m->queue, m->flags,
2415 				 m->hold_count, m->wire_count));
2416 			KKASSERT(m->wire_count == 0);
2417 			vm_page_spin_unlock(m);
2418 			pagedaemon_wakeup();
2419 
2420 			/* return busied and removed page */
2421 			return(m);
2422 		}
2423 	}
2424 	return(m);
2425 }
2426 
2427 static __inline vm_page_t
2428 vm_page_select_free_or_cache(u_short pg_color, int *fromcachep)
2429 {
2430 	vm_page_t m;
2431 
2432 	*fromcachep = 0;
2433 	for (;;) {
2434 		m = _vm_page_list_find2(PQ_FREE, PQ_CACHE, pg_color);
2435 		if (m == NULL)
2436 			break;
2437 		if (vm_page_busy_try(m, TRUE)) {
2438 			_vm_page_rem_queue_spinlocked(m);
2439 			_vm_page_deactivate_locked(m, 0);
2440 			vm_page_spin_unlock(m);
2441 		} else if (m->queue - m->pc == PQ_FREE) {
2442 			/*
2443 			 * We successfully busied the page, PQ_FREE case
2444 			 */
2445 			_vm_page_rem_queue_spinlocked(m);
2446 			KKASSERT((m->flags & (PG_UNQUEUED |
2447 					      PG_NEED_COMMIT)) == 0);
2448 			KASSERT(m->hold_count == 0,
2449 				("m->hold_count is not zero "
2450 				 "pg %p q=%d flags=%08x hold=%d wire=%d",
2451 				 m, m->queue, m->flags,
2452 				 m->hold_count, m->wire_count));
2453 			KKASSERT(m->wire_count == 0);
2454 			vm_page_spin_unlock(m);
2455 			pagedaemon_wakeup();
2456 
2457 			/* return busied and removed page */
2458 			return(m);
2459 		} else {
2460 			/*
2461 			 * We successfully busied the page, PQ_CACHE case
2462 			 *
2463 			 * This can race vm_page_lookup() + busy ops, so make
2464 			 * sure the page is in the state we want.
2465 			 */
2466 			_vm_page_rem_queue_spinlocked(m);
2467 			if ((m->flags & (PG_NEED_COMMIT | PG_MAPPED)) == 0 &&
2468 			    m->hold_count == 0 &&
2469 			    m->wire_count == 0 &&
2470 			    (m->dirty & m->valid) == 0) {
2471 				vm_page_spin_unlock(m);
2472 				KKASSERT((m->flags & PG_UNQUEUED) == 0);
2473 				pagedaemon_wakeup();
2474 				*fromcachep = 1;
2475 				return(m);
2476 			}
2477 
2478 			/*
2479 			 * The page cannot be recycled, deactivate it.
2480 			 */
2481 			_vm_page_deactivate_locked(m, 0);
2482 			if (_vm_page_wakeup(m)) {
2483 				vm_page_spin_unlock(m);
2484 				wakeup(m);
2485 			} else {
2486 				vm_page_spin_unlock(m);
2487 			}
2488 		}
2489 	}
2490 	return(m);
2491 }
2492 
2493 /*
2494  * vm_page_alloc()
2495  *
2496  * Allocate and return a memory cell associated with this VM object/offset
2497  * pair.  If object is NULL an unassociated page will be allocated.
2498  *
2499  * The returned page will be busied and removed from its queues.  This
2500  * routine can block and may return NULL if a race occurs and the page
2501  * is found to already exist at the specified (object, pindex).
2502  *
2503  *	VM_ALLOC_NORMAL		allow use of cache pages, nominal free drain
2504  *	VM_ALLOC_QUICK		like normal but cannot use cache
2505  *	VM_ALLOC_SYSTEM		greater free drain
2506  *	VM_ALLOC_INTERRUPT	allow free list to be completely drained
2507  *	VM_ALLOC_ZERO		advisory request for pre-zero'd page only
2508  *	VM_ALLOC_FORCE_ZERO	advisory request for pre-zero'd page only
2509  *	VM_ALLOC_NULL_OK	ok to return NULL on insertion collision
2510  *				(see vm_page_grab())
2511  *	VM_ALLOC_USE_GD		ok to use per-gd cache
2512  *
2513  *	VM_ALLOC_CPU(n)		allocate using specified cpu localization
2514  *
2515  * The object must be held if not NULL
2516  * This routine may not block
2517  *
2518  * Additional special handling is required when called from an interrupt
2519  * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
2520  * in this case.
2521  */
2522 vm_page_t
2523 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
2524 {
2525 	globaldata_t gd;
2526 	vm_object_t obj;
2527 	vm_page_t m;
2528 	u_short pg_color;
2529 	int cpuid_local;
2530 	int fromcache;
2531 
2532 #if 0
2533 	/*
2534 	 * Special per-cpu free VM page cache.  The pages are pre-busied
2535 	 * and pre-zeroed for us.
2536 	 */
2537 	if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
2538 		crit_enter_gd(gd);
2539 		if (gd->gd_vmpg_count) {
2540 			m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
2541 			crit_exit_gd(gd);
2542 			goto done;
2543                 }
2544 		crit_exit_gd(gd);
2545         }
2546 #endif
2547 	m = NULL;
2548 
2549 	/*
2550 	 * CPU LOCALIZATION
2551 	 *
2552 	 * CPU localization algorithm.  Break the page queues up by physical
2553 	 * id and core id (note that two cpu threads will have the same core
2554 	 * id, and core_id != gd_cpuid).
2555 	 *
2556 	 * This is nowhere near perfect; for example, the last pindex in a
2557 	 * subgroup will overflow into the next cpu or package.  But this
2558 	 * should get us good page reuse locality in heavy mixed loads.
2559 	 *
2560 	 * (may be executed before the APs are started, so other GDs might
2561 	 *  not exist!)
2562 	 */
2563 	if (page_req & VM_ALLOC_CPU_SPEC)
2564 		cpuid_local = VM_ALLOC_GETCPU(page_req);
2565 	else
2566 		cpuid_local = mycpu->gd_cpuid;
2567 
2568 	pg_color = vm_get_pg_color(cpuid_local, object, pindex);
2569 
2570 	KKASSERT(page_req &
2571 		(VM_ALLOC_NORMAL|VM_ALLOC_QUICK|
2572 		 VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
2573 
2574 	/*
2575 	 * Certain system threads (pageout daemon, buf_daemon's) are
2576 	 * allowed to eat deeper into the free page list.
2577 	 */
2578 	if (curthread->td_flags & TDF_SYSTHREAD)
2579 		page_req |= VM_ALLOC_SYSTEM;
2580 
2581 	/*
2582 	 * To avoid live-locks only compare against v_free_reserved.  The
2583 	 * pageout daemon has extra tests for this.
2584 	 */
2585 loop:
2586 	gd = mycpu;
2587 	if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
2588 	    ((page_req & VM_ALLOC_INTERRUPT) &&
2589 	     gd->gd_vmstats.v_free_count > 0) ||
2590 	    ((page_req & VM_ALLOC_SYSTEM) &&
2591 	     gd->gd_vmstats.v_cache_count == 0 &&
2592 	     gd->gd_vmstats.v_free_count >
2593 	     gd->gd_vmstats.v_interrupt_free_min)
2594 	) {
2595 		/*
2596 		 * The free queue has sufficient free pages to take one out.
2597 		 *
2598 		 * However, if the free queue is strained the scan may widen
2599 		 * to the entire queue and cause a great deal of SMP
2600 		 * contention, so we use a double-queue-scan if we can
2601 		 * to avoid this.
2602 		 */
2603 		if (page_req & VM_ALLOC_NORMAL) {
2604 			m = vm_page_select_free_or_cache(pg_color, &fromcache);
2605 			if (m && fromcache)
2606 				goto found_cache;
2607 		} else {
2608 			m = vm_page_select_free(pg_color);
2609 		}
2610 	} else if (page_req & VM_ALLOC_NORMAL) {
2611 		/*
2612 		 * Allocatable from the cache (non-interrupt only).  On
2613 		 * success, we must free the page and try again, thus
2614 		 * ensuring that vmstats.v_*_free_min counters are replenished.
2615 		 */
2616 #ifdef INVARIANTS
2617 		if (curthread->td_preempted) {
2618 			kprintf("vm_page_alloc(): warning, attempt to allocate"
2619 				" cache page from preempting interrupt\n");
2620 			m = NULL;
2621 		} else {
2622 			m = vm_page_select_cache(pg_color);
2623 		}
2624 #else
2625 		m = vm_page_select_cache(pg_color);
2626 #endif
2627 		/*
2628 		 * On success move the page into the free queue and loop.
2629 		 *
2630 		 * Only do this if we can safely acquire the vm_object lock,
2631 		 * because this is effectively a random page and the caller
2632 		 * might be holding the lock shared; we don't want to
2633 		 * deadlock.
2634 		 */
2635 		if (m != NULL) {
2636 found_cache:
2637 			KASSERT(m->dirty == 0,
2638 				("Found dirty cache page %p", m));
2639 			if ((obj = m->object) != NULL) {
2640 				if (vm_object_hold_try(obj)) {
2641 					if (__predict_false((m->flags & (PG_MAPPED|PG_WRITEABLE)) != 0))
2642 						vm_page_protect(m, VM_PROT_NONE);
2643 					vm_page_free(m);
2644 					/* m->object NULL here */
2645 					vm_object_drop(obj);
2646 				} else {
2647 					vm_page_deactivate(m);
2648 					vm_page_wakeup(m);
2649 				}
2650 			} else {
2651 				if (__predict_false((m->flags & (PG_MAPPED|PG_WRITEABLE)) != 0))
2652 					vm_page_protect(m, VM_PROT_NONE);
2653 				vm_page_free(m);
2654 			}
2655 			goto loop;
2656 		}
2657 
2658 		/*
2659 		 * On failure return NULL
2660 		 */
2661 		atomic_add_int(&vm_pageout_deficit, 1);
2662 		pagedaemon_wakeup();
2663 		return (NULL);
2664 	} else {
2665 		/*
2666 		 * No pages available, wakeup the pageout daemon and give up.
2667 		 */
2668 		atomic_add_int(&vm_pageout_deficit, 1);
2669 		pagedaemon_wakeup();
2670 		return (NULL);
2671 	}
2672 
2673 	/*
2674 	 * v_free_count can race so loop if we don't find the expected
2675 	 * page.
2676 	 */
2677 	if (m == NULL) {
2678 		vmstats_rollup();
2679 		goto loop;
2680 	}
2681 
2682 	/*
2683 	 * Good page found.  The page has already been busied for us and
2684 	 * removed from its queues.
2685 	 */
2686 	KASSERT(m->dirty == 0,
2687 		("vm_page_alloc: free/cache page %p was dirty", m));
2688 	KKASSERT(m->queue == PQ_NONE);
2689 
2690 #if 0
2691 done:
2692 #endif
2693 	/*
2694 	 * Initialize the structure, inheriting some flags but clearing
2695 	 * all the rest.  The page has already been busied for us.
2696 	 */
2697 	vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
2698 
2699 	KKASSERT(m->wire_count == 0);
2700 	KKASSERT((m->busy_count & PBUSY_MASK) == 0);
2701 	m->act_count = 0;
2702 	m->valid = 0;
2703 
2704 	/*
2705 	 * Caller must be holding the object lock (asserted by
2706 	 * vm_page_insert()).
2707 	 *
2708 	 * NOTE: Inserting a page here does not insert it into any pmaps
2709 	 *	 (which could cause us to block allocating memory).
2710 	 *
2711 	 * NOTE: If no object is supplied, an unassociated page is allocated
2712 	 *	 and m->pindex can be used by the caller for any purpose.
2713 	 */
2714 	if (object) {
2715 		if (vm_page_insert(m, object, pindex) == FALSE) {
2716 			vm_page_free(m);
2717 			if ((page_req & VM_ALLOC_NULL_OK) == 0)
2718 				panic("PAGE RACE %p[%ld]/%p",
2719 				      object, (long)pindex, m);
2720 			m = NULL;
2721 		}
2722 	} else {
2723 		m->pindex = pindex;
2724 	}
2725 
2726 	/*
2727 	 * Don't wakeup too often - wakeup the pageout daemon when
2728 	 * we would be nearly out of memory.
2729 	 */
2730 	pagedaemon_wakeup();
2731 
2732 	/*
2733 	 * A BUSY page is returned.
2734 	 */
2735 	return (m);
2736 }
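
/*
 * Example (illustrative sketch only; 'obj' and 'idx' are hypothetical):
 * a simplified allocation pattern.  The returned page is hard-busied,
 * removed from its queues, and has valid == 0.  With VM_ALLOC_NULL_OK a
 * NULL return can mean either low memory or a racing insertion at
 * (obj, idx); real callers typically re-lookup and/or vm_wait() and
 * retry (see vm_page_grab(), which wraps this logic).
 *
 *	vm_page_t m;
 *
 *	vm_object_hold(obj);
 *	m = vm_page_alloc(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_NULL_OK);
 *	if (m) {
 *		...initialize or zero the page, set m->valid, then...
 *		vm_page_wakeup(m);
 *	}
 *	vm_object_drop(obj);
 */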
2737 
2738 /*
2739  * Returns number of pages available in our DMA memory reserve
2740  * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf)
2741  */
2742 vm_size_t
2743 vm_contig_avail_pages(void)
2744 {
2745 	alist_blk_t blk;
2746 	alist_blk_t count;
2747 	alist_blk_t bfree;
2748 	spin_lock(&vm_contig_spin);
2749 	bfree = alist_free_info(&vm_contig_alist, &blk, &count);
2750 	spin_unlock(&vm_contig_spin);
2751 
2752 	return bfree;
2753 }
2754 
2755 /*
2756  * Attempt to allocate contiguous physical memory with the specified
2757  * requirements.
2758  */
2759 vm_page_t
2760 vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
2761 		     unsigned long alignment, unsigned long boundary,
2762 		     unsigned long size, vm_memattr_t memattr)
2763 {
2764 	alist_blk_t blk;
2765 	vm_page_t m;
2766 	vm_pindex_t i;
2767 #if 0
2768 	static vm_pindex_t contig_rover;
2769 #endif
2770 
2771 	alignment >>= PAGE_SHIFT;
2772 	if (alignment == 0)
2773 		alignment = 1;
2774 	boundary >>= PAGE_SHIFT;
2775 	if (boundary == 0)
2776 		boundary = 1;
2777 	size = (size + PAGE_MASK) >> PAGE_SHIFT;
2778 
2779 #if 0
2780 	/*
2781 	 * Disabled temporarily until we find a solution for DRM (a flag
2782 	 * to always use the free space reserve, for performance).
2783 	 */
2784 	if (high == BUS_SPACE_MAXADDR && alignment <= PAGE_SIZE &&
2785 	    boundary <= PAGE_SIZE && size == 1 &&
2786 	    memattr == VM_MEMATTR_DEFAULT) {
2787 		/*
2788 		 * Any page will work, use vm_page_alloc()
2789 		 * (e.g. when used from kmem_alloc_attr())
2790 		 */
2791 		m = vm_page_alloc(NULL, (contig_rover++) & 0x7FFFFFFF,
2792 				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
2793 				  VM_ALLOC_INTERRUPT);
2794 		m->valid = VM_PAGE_BITS_ALL;
2795 		vm_page_wire(m);
2796 		vm_page_wakeup(m);
2797 	} else
2798 #endif
2799 	{
2800 		/*
2801 		 * Use the low-memory dma reserve
2802 		 */
2803 		spin_lock(&vm_contig_spin);
2804 		blk = alist_alloc(&vm_contig_alist, 0, size);
2805 		if (blk == ALIST_BLOCK_NONE) {
2806 			spin_unlock(&vm_contig_spin);
2807 			if (bootverbose) {
2808 				kprintf("vm_page_alloc_contig: %ldk nospace\n",
2809 					(size << PAGE_SHIFT) / 1024);
2810 				print_backtrace(5);
2811 			}
2812 			return(NULL);
2813 		}
2814 		if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
2815 			alist_free(&vm_contig_alist, blk, size);
2816 			spin_unlock(&vm_contig_spin);
2817 			if (bootverbose) {
2818 				kprintf("vm_page_alloc_contig: %ldk high "
2819 					"%016jx failed\n",
2820 					(size << PAGE_SHIFT) / 1024,
2821 					(intmax_t)high);
2822 			}
2823 			return(NULL);
2824 		}
2825 		spin_unlock(&vm_contig_spin);
2826 
2827 		/*
2828 		 * Base vm_page_t of range
2829 		 */
2830 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
2831 	}
2832 	if (vm_contig_verbose) {
2833 		kprintf("vm_page_alloc_contig: %016jx/%ldk "
2834 			"(%016jx-%016jx al=%lu bo=%lu pgs=%lu attr=%d\n",
2835 			(intmax_t)m->phys_addr,
2836 			(size << PAGE_SHIFT) / 1024,
2837 			low, high, alignment, boundary, size, memattr);
2838 	}
2839 	if (memattr != VM_MEMATTR_DEFAULT) {
2840 		for (i = 0; i < size; ++i) {
2841 			KKASSERT(m[i].flags & PG_FICTITIOUS);
2842 			pmap_page_set_memattr(&m[i], memattr);
2843 		}
2844 	}
2845 	return m;
2846 }
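
/*
 * Example (illustrative sketch only; the size and address constraints are
 * hypothetical): allocating a physically contiguous 64KB run below 4GB
 * from the DMA reserve and releasing it again with vm_page_free_contig().
 *
 *	vm_page_t m;
 *
 *	m = vm_page_alloc_contig(0, 0xFFFFFFFFUL, PAGE_SIZE, 0,
 *				 64 * 1024, VM_MEMATTR_DEFAULT);
 *	if (m) {
 *		...use the range starting at VM_PAGE_TO_PHYS(m)...
 *		vm_page_free_contig(m, 64 * 1024);
 *	}
 */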
2847 
2848 /*
2849  * Free contiguously allocated pages.  The pages will be wired but not busy.
2850  * When freeing to the alist we leave them wired and not busy.
2851  */
2852 void
2853 vm_page_free_contig(vm_page_t m, unsigned long size)
2854 {
2855 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
2856 	vm_pindex_t start = pa >> PAGE_SHIFT;
2857 	vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
2858 
2859 	if (vm_contig_verbose) {
2860 		kprintf("vm_page_free_contig:  %016jx/%ldk\n",
2861 			(intmax_t)pa, size / 1024);
2862 	}
2863 	if (pa < vm_low_phys_reserved) {
2864 		/*
2865 		 * Just assert check the first page for convenience.
2866 		 */
2867 		KKASSERT(m->wire_count == 1);
2868 		KKASSERT(m->flags & PG_FICTITIOUS);
2869 		KKASSERT(pa + size <= vm_low_phys_reserved);
2870 		spin_lock(&vm_contig_spin);
2871 		alist_free(&vm_contig_alist, start, pages);
2872 		spin_unlock(&vm_contig_spin);
2873 	} else {
2874 		while (pages) {
2875 			/* XXX FUTURE, maybe (pair with vm_pg_contig_alloc()) */
2876 			/*vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);*/
2877 			vm_page_busy_wait(m, FALSE, "cpgfr");
2878 			vm_page_unwire(m, 0);
2879 			vm_page_free(m);
2880 			--pages;
2881 			++m;
2882 		}
2883 
2884 	}
2885 }
2886 
2887 
2888 /*
2889  * Wait for sufficient free memory for nominal heavy memory use kernel
2890  * operations.
2891  *
2892  * WARNING!  Be sure never to call this in any vm_pageout code path, which
2893  *	     will trivially deadlock the system.
2894  */
2895 void
2896 vm_wait_nominal(void)
2897 {
2898 	while (vm_paging_min())
2899 		vm_wait(0);
2900 }
2901 
2902 /*
2903  * Test if vm_wait_nominal() would block.
2904  */
2905 int
2906 vm_test_nominal(void)
2907 {
2908 	if (vm_paging_min())
2909 		return(1);
2910 	return(0);
2911 }
2912 
2913 /*
2914  * Block until free pages are available for allocation.  This is called in
2915  * various places before memory allocations, typically in the I/O path,
2916  * and occurs before the minimum is reached.
2917  *
2918  * The caller may loop if vm_paging_min() is TRUE (free pages below minimum),
2919  * so we cannot be more generous than that.
2920  */
2921 void
2922 vm_wait(int timo)
2923 {
2924 	/*
2925 	 * never wait forever
2926 	 */
2927 	if (timo == 0)
2928 		timo = hz;
2929 	lwkt_gettoken(&vm_token);
2930 
2931 	if (curthread == pagethread ||
2932 	    curthread == emergpager) {
2933 		/*
2934 		 * The pageout daemon itself needs pages, this is bad.
2935 		 */
2936 		if (vm_paging_min()) {
2937 			vm_pageout_pages_needed = 1;
2938 			tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
2939 		}
2940 	} else {
2941 		/*
2942 		 * Wakeup the pageout daemon if necessary and wait.
2943 		 *
2944 		 * Do not wait indefinitely for the target to be reached,
2945 		 * as load might prevent it from being reached any time soon.
2946 		 * But wait a little to try to slow down page allocations
2947 		 * and to give more important threads (the pagedaemon)
2948 		 * allocation priority.
2949 		 *
2950 		 * The vm_paging_min() test is a safety.
2951 		 *
2952 		 * I/O waits are given a slightly lower priority (higher nice)
2953 		 * than VM waits.
2954 		 */
2955 		int nice;
2956 
2957 		nice = curthread->td_proc ? curthread->td_proc->p_nice : 0;
2958 		/*if (vm_paging_wait() || vm_paging_min())*/
2959 		if (vm_paging_min_nice(nice + 1))
2960 		{
2961 			if (vm_pages_needed <= 1) {
2962 				++vm_pages_needed;
2963 				wakeup(&vm_pages_needed);
2964 			}
2965 			++vm_pages_waiting;	/* SMP race ok */
2966 			tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
2967 		}
2968 	}
2969 	lwkt_reltoken(&vm_token);
2970 }
2971 
2972 /*
2973  * Block until free pages are available for allocation, called in the
2974  * page-fault code.  We must stall indefinitely (except for certain
2975  * conditions) when the free page count becomes severe.
2976  *
2977  * Called only from vm_fault so that processes page faulting can be
2978  * easily tracked.
2979  *
2980  * The process nice value determines the trip point.  This way niced
2981  * processes which are heavy memory users do not completely mess the
2982  * machine up for normal processes.
2983  */
2984 void
2985 vm_wait_pfault(void)
2986 {
2987 	int nice;
2988 
2989 	/*
2990 	 * Wakeup the pageout daemon if necessary and wait.
2991 	 *
2992 	 * Allow VM faults down to the minimum free page count, but only
2993 	 * stall once paging becomes severe.
2994 	 *
2995 	 * Do not wait indefinitely for the target to be reached,
2996 	 * as load might prevent it from being reached any time soon.
2997 	 * But wait a little to try to slow down page allocations
2998 	 * and to give more important threads (the pagedaemon)
2999 	 * allocation priority.
3000 	 */
3001 	nice = curthread->td_proc ? curthread->td_proc->p_nice : 0;
3002 
3003 	if (vm_paging_min_nice(nice)) {
3004 		lwkt_gettoken(&vm_token);
3005 		do {
3006 			thread_t td;
3007 
3008 			if (vm_pages_needed <= 1) {
3009 				++vm_pages_needed;
3010 				wakeup(&vm_pages_needed);
3011 			}
3012 			++vm_pages_waiting;	/* SMP race ok */
3013 			tsleep(&vmstats.v_free_count, 0, "pfault",
3014 				hz / 10 + 1);
3015 
3016 			/*
3017 			 * Do not stay stuck in the loop if the system
3018 			 * is trying to kill the process.
3019 			 */
3020 			td = curthread;
3021 			if (td->td_proc &&
3022 			    (td->td_proc->p_flags & P_LOWMEMKILL))
3023 			{
3024 				break;
3025 			}
3026 		} while (vm_paging_severe());
3027 		lwkt_reltoken(&vm_token);
3028 	}
3029 }
3030 
3031 /*
3032  * Put the specified page on the active list (if appropriate).  Ensure
3033  * that act_count is at least ACT_INIT but do not otherwise mess with it.
3034  *
3035  * The caller should be holding the page busied ? XXX
3036  * This routine may not block.
3037  *
3038  * It is ok if the page is wired (so buffer cache operations don't have
3039  * to mess with the page queues).
3040  */
3041 void
3042 vm_page_activate(vm_page_t m)
3043 {
3044 	u_short oqueue;
3045 
3046 	/*
3047 	 * If already active or inappropriate, just set act_count and
3048 	 * return.  We don't have to spin-lock the page.
3049 	 */
3050 	if (m->queue - m->pc == PQ_ACTIVE ||
3051 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3052 		if (m->act_count < ACT_INIT)
3053 			m->act_count = ACT_INIT;
3054 		return;
3055 	}
3056 
3057 	vm_page_spin_lock(m);
3058 	if (m->queue - m->pc != PQ_ACTIVE &&
3059 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3060 		_vm_page_queue_spin_lock(m);
3061 		oqueue = _vm_page_rem_queue_spinlocked(m);
3062 		/* page is left spinlocked, queue is unlocked */
3063 
3064 		if (oqueue == PQ_CACHE)
3065 			mycpu->gd_cnt.v_reactivated++;
3066 		if (m->act_count < ACT_INIT)
3067 			m->act_count = ACT_INIT;
3068 		_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
3069 		_vm_page_and_queue_spin_unlock(m);
3070 		if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
3071 			pagedaemon_wakeup();
3072 	} else {
3073 		if (m->act_count < ACT_INIT)
3074 			m->act_count = ACT_INIT;
3075 		vm_page_spin_unlock(m);
3076 	}
3077 }
3078 
3079 void
3080 vm_page_soft_activate(vm_page_t m)
3081 {
3082 	if (m->queue - m->pc == PQ_ACTIVE ||
3083 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3084 		if (m->act_count < ACT_INIT)
3085 			m->act_count = ACT_INIT;
3086 	} else {
3087 		vm_page_activate(m);
3088 	}
3089 }
3090 
3091 /*
3092  * Helper routine for vm_page_free_toq() and vm_page_cache().  This
3093  * routine is called when a page has been added to the cache or free
3094  * queues.
3095  *
3096  * This routine may not block.
3097  */
3098 static __inline void
3099 vm_page_free_wakeup(void)
3100 {
3101 	globaldata_t gd = mycpu;
3102 
3103 	/*
3104 	 * If the pageout daemon itself needs pages, then tell it that
3105 	 * there are some free.
3106 	 */
3107 	if (vm_pageout_pages_needed &&
3108 	    gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
3109 	    gd->gd_vmstats.v_pageout_free_min
3110 	) {
3111 		vm_pageout_pages_needed = 0;
3112 		wakeup(&vm_pageout_pages_needed);
3113 	}
3114 
3115 	/*
3116 	 * Wakeup processes that are waiting on memory.
3117 	 *
3118 	 * Generally speaking we want to wakeup stuck processes as soon as
3119 	 * possible.  !vm_page_count_min(0) is the absolute minimum point
3120 	 * where we can do this.  Wait a bit longer to reduce degenerate
3121 	 * re-blocking (vm_page_free_hysteresis).
3122 	 *
3123 	 * The target check is a safety to make sure the min-check
3124 	 * w/hysteresis does not exceed the normal target1.
3125 	 */
3126 	if (vm_pages_waiting) {
3127 		if (!vm_paging_min_dnc(vm_page_free_hysteresis) ||
3128 		    !vm_paging_target1())
3129 		{
3130 			vm_pages_waiting = 0;
3131 			wakeup(&vmstats.v_free_count);
3132 			++mycpu->gd_cnt.v_ppwakeups;
3133 		}
3134 	}
3135 }
3136 
3137 /*
3138  * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
3139  * it from its VM object.
3140  *
3141  * The vm_page must be BUSY on entry.  BUSY will be released on
3142  * return (the page will have been freed).
3143  */
3144 void
3145 vm_page_free_toq(vm_page_t m)
3146 {
3147 	/*
3148 	 * The page must not be mapped when freed, but we may have to call
3149 	 * pmap_mapped_sync() to validate this.
3150 	 */
3151 	mycpu->gd_cnt.v_tfree++;
3152 	if (m->flags & (PG_MAPPED | PG_WRITEABLE))
3153 		pmap_mapped_sync(m);
3154 	KKASSERT((m->flags & PG_MAPPED) == 0);
3155 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3156 
3157 	if ((m->busy_count & PBUSY_MASK) || ((m->queue - m->pc) == PQ_FREE)) {
3158 		kprintf("vm_page_free: pindex(%lu), busy %08x, "
3159 			"hold(%d)\n",
3160 			(u_long)m->pindex, m->busy_count, m->hold_count);
3161 		if ((m->queue - m->pc) == PQ_FREE)
3162 			panic("vm_page_free: freeing free page");
3163 		else
3164 			panic("vm_page_free: freeing busy page");
3165 	}
3166 
3167 	/*
3168 	 * Remove from object, spinlock the page and its queues and
3169 	 * remove from any queue.  No queue spinlock will be held
3170 	 * after this section (because the page was removed from any
3171 	 * queue).
3172 	 */
3173 	vm_page_remove(m);
3174 
3175 	/*
3176 	 * No further management of fictitious pages occurs beyond object
3177 	 * and queue removal.
3178 	 */
3179 	if ((m->flags & PG_FICTITIOUS) != 0) {
3180 		KKASSERT(m->queue == PQ_NONE);
3181 		vm_page_wakeup(m);
3182 		return;
3183 	}
3184 	vm_page_and_queue_spin_lock(m);
3185 	_vm_page_rem_queue_spinlocked(m);
3186 
3187 	m->valid = 0;
3188 	vm_page_undirty(m);
3189 
3190 	if (m->wire_count != 0) {
3191 		if (m->wire_count > 1) {
3192 		    panic(
3193 			"vm_page_free: invalid wire count (%d), pindex: 0x%lx",
3194 			m->wire_count, (long)m->pindex);
3195 		}
3196 		panic("vm_page_free: freeing wired page");
3197 	}
3198 
3199 	if (!MD_PAGE_FREEABLE(m))
3200 		panic("vm_page_free: page %p is still mapped!", m);
3201 
3202 	/*
3203 	 * Clear the PG_NEED_COMMIT and the PG_UNQUEUED flags.  The
3204 	 * page returns to normal operation and will be placed in
3205 	 * the PQ_HOLD or PQ_FREE queue.
3206 	 */
3207 	vm_page_flag_clear(m, PG_NEED_COMMIT | PG_UNQUEUED);
3208 
3209 	if (m->hold_count != 0) {
3210 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
3211 	} else {
3212 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
3213 	}
3214 
3215 	/*
3216 	 * This sequence allows us to clear BUSY while still holding
3217 	 * its spin lock, which reduces contention vs allocators.  We
3218 	 * must not leave the queue locked or _vm_page_wakeup() may
3219 	 * deadlock.
3220 	 */
3221 	_vm_page_queue_spin_unlock(m);
3222 	if (_vm_page_wakeup(m)) {
3223 		vm_page_spin_unlock(m);
3224 		wakeup(m);
3225 	} else {
3226 		vm_page_spin_unlock(m);
3227 	}
3228 	vm_page_free_wakeup();
3229 }
3230 
3231 /*
3232  * Mark this page as wired down by yet another map.  We do not adjust the
3233  * queue the page is on; it will be checked for wiring as-needed.
3234  *
3235  * This function has no effect on fictitious pages.
3236  *
3237  * Caller must be holding the page busy.
3238  */
3239 void
3240 vm_page_wire(vm_page_t m)
3241 {
3242 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3243 	if ((m->flags & PG_FICTITIOUS) == 0) {
3244 		if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
3245 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count, 1);
3246 		}
3247 		KASSERT(m->wire_count != 0,
3248 			("vm_page_wire: wire_count overflow m=%p", m));
3249 	}
3250 }
3251 
3252 /*
3253  * Release one wiring of this page, potentially enabling it to be paged again.
3254  *
3255  * Note that wired pages are no longer unconditionally removed from the
3256  * paging queues, so the page may already be on a queue.  Move the page
3257  * to the desired queue if necessary.
3258  *
3259  * Many pages placed on the inactive queue should actually go
3260  * into the cache, but it is difficult to figure out which.  What
3261  * we do instead, if the inactive target is well met, is to put
3262  * clean pages at the head of the inactive queue instead of the tail.
3263  * This will cause them to be moved to the cache more quickly and
3264  * if not actively re-referenced, freed more quickly.  If we just
3265  * stick these pages at the end of the inactive queue, heavy filesystem
3266  * meta-data accesses can cause an unnecessary paging load on memory bound
3267  * processes.  This optimization causes one-time-use metadata to be
3268  * reused more quickly.
3269  *
3270  * Pages marked PG_NEED_COMMIT are always activated and never placed on
3271  * the inactive queue.  This helps the pageout daemon determine memory
3272  * pressure and act on out-of-memory situations more quickly.
3273  *
3274  * BUT, if we are in a low-memory situation we have no choice but to
3275  * put clean pages on the cache queue.
3276  *
3277  * A number of routines use vm_page_unwire() to guarantee that the page
3278  * will go into either the inactive or active queues, and will NEVER
3279  * be placed in the cache - for example, just after dirtying a page.
3280  * Dirty pages in the cache are not allowed.
3281  *
3282  * PG_FICTITIOUS or PG_UNQUEUED pages are never moved to any queue, and
3283  * the wire_count will not be adjusted in any way for a PG_FICTITIOUS
3284  * page.
3285  *
3286  * This routine may not block.
3287  */
3288 void
3289 vm_page_unwire(vm_page_t m, int activate)
3290 {
3291 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3292 	if (m->flags & PG_FICTITIOUS) {
3293 		/* do nothing */
3294 	} else if ((int)m->wire_count <= 0) {
3295 		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
3296 	} else {
3297 		if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
3298 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count,-1);
3299 			if (m->flags & PG_UNQUEUED) {
3300 				;
3301 			} else if (activate || (m->flags & PG_NEED_COMMIT)) {
3302 				vm_page_activate(m);
3303 			} else {
3304 				vm_page_deactivate(m);
3305 			}
3306 		}
3307 	}
3308 }
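
/*
 * Example (illustrative sketch only; the wait message is arbitrary): the
 * usual wire/unwire pairing.  Both calls require the page to be
 * hard-busied; unwiring with activate == 0 lets the page drift to the
 * inactive queue once the last wiring goes away, subject to the policy
 * described above.
 *
 *	vm_page_busy_wait(m, FALSE, "wirex");
 *	vm_page_wire(m);
 *	vm_page_wakeup(m);
 *
 *	...the page cannot be paged out while wired...
 *
 *	vm_page_busy_wait(m, FALSE, "wirex");
 *	vm_page_unwire(m, 0);
 *	vm_page_wakeup(m);
 */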
3309 
3310 /*
3311  * Move the specified page to the inactive queue.
3312  *
3313  * Normally athead is 0 resulting in LRU operation.  athead is set
3314  * to 1 if we want this page to be 'as if it were placed in the cache',
3315  * except without unmapping it from the process address space.
3316  *
3317  * vm_page's spinlock must be held on entry and will remain held on return.
3318  * This routine may not block.  The caller does not have to hold the page
3319  * busied but should have some sort of interlock on its validity.
3320  *
3321  * It is ok if the page is wired (so buffer cache operations don't have
3322  * to mess with the page queues).
3323  */
3324 static void
3325 _vm_page_deactivate_locked(vm_page_t m, int athead)
3326 {
3327 	u_short oqueue;
3328 
3329 	/*
3330 	 * Ignore if already inactive.
3331 	 */
3332 	if (m->queue - m->pc == PQ_INACTIVE ||
3333 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3334 		return;
3335 	}
3336 
3337 	_vm_page_queue_spin_lock(m);
3338 	oqueue = _vm_page_rem_queue_spinlocked(m);
3339 
3340 	if ((m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3341 		if (oqueue == PQ_CACHE)
3342 			mycpu->gd_cnt.v_reactivated++;
3343 		vm_page_flag_clear(m, PG_WINATCFLS);
3344 		_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
3345 		if (athead == 0) {
3346 			atomic_add_long(
3347 				&vm_page_queues[PQ_INACTIVE + m->pc].adds, 1);
3348 		}
3349 	}
3350 	/* NOTE: PQ_NONE if condition not taken */
3351 	_vm_page_queue_spin_unlock(m);
3352 	/* leaves vm_page spinlocked */
3353 }
3354 
3355 /*
3356  * Attempt to deactivate a page.
3357  *
3358  * No requirements.  We can pre-filter before getting the spinlock.
3359  *
3360  * It is ok if the page is wired (so buffer cache operations don't have
3361  * to mess with the page queues).
3362  */
3363 void
3364 vm_page_deactivate(vm_page_t m)
3365 {
3366 	if (m->queue - m->pc != PQ_INACTIVE &&
3367 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3368 		vm_page_spin_lock(m);
3369 		_vm_page_deactivate_locked(m, 0);
3370 		vm_page_spin_unlock(m);
3371 	}
3372 }
3373 
3374 void
3375 vm_page_deactivate_locked(vm_page_t m)
3376 {
3377 	_vm_page_deactivate_locked(m, 0);
3378 }
3379 
3380 /*
3381  * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy it.
3382  *
3383  * This function returns non-zero if it successfully moved the page to
3384  * PQ_CACHE.
3385  *
3386  * This function unconditionally unbusies the page on return.
3387  */
3388 int
3389 vm_page_try_to_cache(vm_page_t m)
3390 {
3391 	/*
3392 	 * Shortcut if we obviously cannot move the page, or if the
3393 	 * page is already on the cache queue, or it is fictitious.
3394 	 *
3395 	 * Never allow a wired page into the cache.
3396 	 */
3397 	if (m->dirty || m->hold_count || m->wire_count ||
3398 	    m->queue - m->pc == PQ_CACHE ||
3399 	    (m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS))) {
3400 		vm_page_wakeup(m);
3401 		return(0);
3402 	}
3403 
3404 	/*
3405 	 * Page busied by us and no longer spinlocked.  Dirty pages cannot
3406 	 * be moved to the cache, but can be deactivated.  However, users
3407 	 * of this function want to move pages closer to the cache so we
3408 	 * only deactivate it if it is in PQ_ACTIVE.  We do not re-deactivate.
3409 	 */
3410 	vm_page_test_dirty(m);
3411 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3412 		if (m->queue - m->pc == PQ_ACTIVE)
3413 			vm_page_deactivate(m);
3414 		vm_page_wakeup(m);
3415 		return(0);
3416 	}
3417 	vm_page_cache(m);
3418 	return(1);
3419 }
3420 
3421 /*
3422  * Attempt to free the page.  If we cannot free it, we do nothing.
3423  * 1 is returned on success, 0 on failure.
3424  *
3425  * The page can be in any state, including already being on the free
3426  * queue.  Check to see if it really can be freed.  Note that we disallow
3427  * this ad-hoc operation if the page is flagged PG_UNQUEUED.
3428  *
3429  * Caller provides an unlocked/non-busied page.
3430  * No requirements.
3431  */
3432 int
3433 vm_page_try_to_free(vm_page_t m)
3434 {
3435 	if (vm_page_busy_try(m, TRUE))
3436 		return(0);
3437 
3438 	if (m->dirty ||				/* can't free if it is dirty */
3439 	    m->hold_count ||			/* or held (XXX may be wrong) */
3440 	    m->wire_count ||			/* or wired */
3441 	    (m->flags & (PG_UNQUEUED |		/* or unqueued */
3442 			 PG_NEED_COMMIT |	/* or needs a commit */
3443 			 PG_FICTITIOUS)) ||	/* or is fictitious */
3444 	    m->queue - m->pc == PQ_FREE ||	/* already on PQ_FREE */
3445 	    m->queue - m->pc == PQ_HOLD) {	/* already on PQ_HOLD */
3446 		vm_page_wakeup(m);
3447 		return(0);
3448 	}
3449 
3450 	/*
3451 	 * We can probably free the page.
3452 	 *
3453 	 * Page busied by us and no longer spinlocked.  Dirty pages will
3454 	 * not be freed by this function.    We have to re-test the
3455 	 * dirty bit after cleaning out the pmaps.
3456 	 */
3457 	vm_page_test_dirty(m);
3458 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3459 		vm_page_wakeup(m);
3460 		return(0);
3461 	}
3462 	vm_page_protect(m, VM_PROT_NONE);
3463 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3464 		vm_page_wakeup(m);
3465 		return(0);
3466 	}
3467 	vm_page_free(m);
3468 	return(1);
3469 }
3470 
3471 /*
3472  * vm_page_cache
3473  *
3474  * Put the specified page onto the page cache queue (if appropriate).
3475  *
3476  * The page must be busy, and this routine will release the busy and
3477  * possibly even free the page.
3478  */
3479 void
3480 vm_page_cache(vm_page_t m)
3481 {
3482 	/*
3483 	 * Not suitable for the cache
3484 	 */
3485 	if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS)) ||
3486 	    (m->busy_count & PBUSY_MASK) ||
3487 	    m->wire_count || m->hold_count) {
3488 		vm_page_wakeup(m);
3489 		return;
3490 	}
3491 
3492 	/*
3493 	 * Already in the cache (and thus not mapped)
3494 	 */
3495 	if ((m->queue - m->pc) == PQ_CACHE) {
3496 		KKASSERT((m->flags & PG_MAPPED) == 0);
3497 		vm_page_wakeup(m);
3498 		return;
3499 	}
3500 
3501 #if 0
3502 	/*
3503 	 * REMOVED - it is possible for dirty to get set at any time as
3504 	 *	     long as the page is still mapped and writeable.
3505 	 *
3506 	 * Caller is required to test m->dirty, but note that the act of
3507 	 * removing the page from its maps can cause it to become dirty
3508 	 * on an SMP system due to another cpu running in usermode.
3509 	 */
3510 	if (m->dirty) {
3511 		panic("vm_page_cache: caching a dirty page, pindex: %ld",
3512 			(long)m->pindex);
3513 	}
3514 #endif
3515 
3516 	/*
3517 	 * Remove all pmaps and indicate that the page is not
3518 	 * writeable or mapped.  Our vm_page_protect() call may
3519 	 * have blocked (especially w/ VM_PROT_NONE), so recheck
3520 	 * everything.
3521 	 */
3522 	if (m->flags & (PG_MAPPED | PG_WRITEABLE)) {
3523 		vm_page_protect(m, VM_PROT_NONE);
3524 		pmap_mapped_sync(m);
3525 	}
3526 	if ((m->flags & (PG_UNQUEUED | PG_MAPPED)) ||
3527 	    (m->busy_count & PBUSY_MASK) ||
3528 	    m->wire_count || m->hold_count) {
3529 		vm_page_wakeup(m);
3530 	} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3531 		vm_page_deactivate(m);
3532 		vm_page_wakeup(m);
3533 	} else {
3534 		_vm_page_and_queue_spin_lock(m);
3535 		_vm_page_rem_queue_spinlocked(m);
3536 		_vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
3537 		_vm_page_and_queue_spin_unlock(m);
3538 		vm_page_wakeup(m);
3539 		vm_page_free_wakeup();
3540 	}
3541 }
3542 
3543 /*
3544  * vm_page_dontneed()
3545  *
3546  * Cache, deactivate, or do nothing as appropriate.  This routine
3547  * is typically used by madvise() MADV_DONTNEED.
3548  *
3549  * Generally speaking we want to move the page into the cache so
3550  * it gets reused quickly.  However, this can result in a silly syndrome
3551  * due to the page recycling too quickly.  Small objects will not be
3552  * fully cached.  On the other hand, if we move the page to the inactive
3553  * queue we wind up with a problem whereby very large objects
3554  * unnecessarily blow away our inactive and cache queues.
3555  *
3556  * The solution is to move the pages based on a fixed weighting.  We
3557  * either leave them alone, deactivate them, or move them to the cache,
3558  * where moving them to the cache has the highest weighting.
3559  * By forcing some pages into other queues we eventually force the
3560  * system to balance the queues, potentially recovering other unrelated
3561  * space from active.  The idea is to not force this to happen too
3562  * often.
3563  *
3564  * The page must be busied.
3565  */
3566 void
3567 vm_page_dontneed(vm_page_t m)
3568 {
3569 	static int dnweight;
3570 	int dnw;
3571 	int head;
3572 
3573 	dnw = ++dnweight;
3574 
3575 	/*
3576 	 * Occasionally leave the page alone
3577 	 */
3578 	if ((dnw & 0x01F0) == 0 ||
3579 	    m->queue - m->pc == PQ_INACTIVE ||
3580 	    m->queue - m->pc == PQ_CACHE
3581 	) {
3582 		if (m->act_count >= ACT_INIT)
3583 			--m->act_count;
3584 		return;
3585 	}
3586 
3587 	/*
3588 	 * If vm_page_dontneed() is inactivating a page, it must clear
3589 	 * the referenced flag; otherwise the pagedaemon will see references
3590 	 * on the page in the inactive queue and reactivate it. Until the
3591 	 * page can move to the cache queue, madvise's job is not done.
3592 	 */
3593 	vm_page_flag_clear(m, PG_REFERENCED);
3594 	pmap_clear_reference(m);
3595 
3596 	if (m->dirty == 0)
3597 		vm_page_test_dirty(m);
3598 
3599 	if (m->dirty || (dnw & 0x0070) == 0) {
3600 		/*
3601 		 * Deactivate the page 3 times out of 32.
3602 		 */
3603 		head = 0;
3604 	} else {
3605 		/*
3606 		 * Cache the page 28 times out of every 32.  Note that
3607 		 * the page is deactivated instead of cached, but placed
3608 		 * at the head of the queue instead of the tail.
3609 		 */
3610 		head = 1;
3611 	}
3612 	vm_page_spin_lock(m);
3613 	_vm_page_deactivate_locked(m, head);
3614 	vm_page_spin_unlock(m);
3615 }
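
/*
 * Worked sketch of the weighting above (informational only): bits 4-8 of
 * the dnweight counter form 32 equally likely buckets.  (dnw & 0x01F0) is
 * zero for exactly 1 of those 32 buckets (leave the page alone), and
 * (dnw & 0x0070) is zero for 4 of the 32, one of which is the leave-alone
 * bucket, so 3 of 32 calls deactivate the page normally (tail of the
 * inactive queue) and the remaining 28 of 32 deactivate it at the head of
 * the inactive queue so it is reclaimed sooner (the "cache it" case).
 */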
3616 
3617 /*
3618  * These routines manipulate the 'soft busy' count for a page.  A soft busy
3619  * is almost like a hard BUSY except that it allows certain compatible
3620  * operations to occur on the page while it is busy.  For example, a page
3621  * undergoing a write can still be mapped read-only.
3622  *
3623  * We also use soft-busy to quickly pmap_enter shared read-only pages
3624  * without having to hold the page locked.
3625  *
3626  * The soft-busy count can be > 1 in situations where multiple threads
3627  * are pmap_enter()ing the same page simultaneously, or when two buffer
3628  * cache buffers overlap the same page.
3629  *
3630  * The caller must hold the page BUSY when making these two calls.
3631  */
3632 void
3633 vm_page_io_start(vm_page_t m)
3634 {
3635 	uint32_t ocount;
3636 
3637 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
3638 	KKASSERT(ocount & PBUSY_LOCKED);
3639 }
3640 
3641 void
3642 vm_page_io_finish(vm_page_t m)
3643 {
3644 	uint32_t ocount;
3645 
3646 	ocount = atomic_fetchadd_int(&m->busy_count, -1);
3647 	KKASSERT(ocount & PBUSY_MASK);
3648 #if 0
3649 	if (((ocount - 1) & (PBUSY_LOCKED | PBUSY_MASK)) == 0)
3650 		wakeup(m);
3651 #endif
3652 }
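
/*
 * Illustrative sketch (not compiled): the usual soft-busy bracket around
 * an I/O.  vm_page_io_start() must be called while the page is still
 * hard-busied; the soft-busy count then pins the page across the I/O
 * after the hard busy is dropped, until the completion path calls
 * vm_page_io_finish().  The helper names below are hypothetical.
 */
#if 0
static void
example_begin_page_io(vm_page_t m)	/* m is hard-busied by the caller */
{
	vm_page_io_start(m);		/* +1 soft-busy while hard-busied */
	vm_page_wakeup(m);		/* drop the hard busy during the I/O */
	/* ... start the I/O ... */
}

static void
example_finish_page_io(vm_page_t m)	/* called from the completion path */
{
	vm_page_io_finish(m);		/* -1 soft-busy, page may be reused */
}
#endif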
3653 
3654 /*
3655  * Attempt to soft-busy a page.  The page must not be PBUSY_LOCKED.
3656  *
3657  * We can't use fetchadd here because we might race a hard-busy and the
3658  * page freeing code asserts on a non-zero soft-busy count (even if only
3659  * temporary).
3660  *
3661  * Returns 0 on success, non-zero on failure.
3662  */
3663 int
3664 vm_page_sbusy_try(vm_page_t m)
3665 {
3666 	uint32_t ocount;
3667 
3668 	for (;;) {
3669 		ocount = m->busy_count;
3670 		cpu_ccfence();
3671 		if (ocount & PBUSY_LOCKED)
3672 			return 1;
3673 		if (atomic_cmpset_int(&m->busy_count, ocount, ocount + 1))
3674 			break;
3675 	}
3676 	return 0;
3677 #if 0
3678 	if (m->busy_count & PBUSY_LOCKED)
3679 		return 1;
3680 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
3681 	if (ocount & PBUSY_LOCKED) {
3682 		vm_page_sbusy_drop(m);
3683 		return 1;
3684 	}
3685 	return 0;
3686 #endif
3687 }
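
/*
 * Illustrative sketch (not compiled): a lockless fast path using
 * vm_page_sbusy_try().  On success the caller holds a soft-busy reference
 * and must drop it with vm_page_sbusy_drop(); on failure (the page is
 * hard-busied) the caller falls back to a heavier path.  The helper name
 * below is hypothetical.
 */
#if 0
static int
example_fast_ro_map(vm_page_t m)
{
	if (vm_page_sbusy_try(m) != 0)
		return 1;		/* page is hard-busied, fall back */
	/* ... safe to enter the page read-only into a pmap here ... */
	vm_page_sbusy_drop(m);		/* release the soft-busy reference */
	return 0;			/* success */
}
#endif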
3688 
3689 /*
3690  * Indicate that a clean VM page requires a filesystem commit and cannot
3691  * be reused.  Used by tmpfs.
3692  */
3693 void
3694 vm_page_need_commit(vm_page_t m)
3695 {
3696 	vm_page_flag_set(m, PG_NEED_COMMIT);
3697 	vm_object_set_writeable_dirty(m->object);
3698 }
3699 
3700 void
3701 vm_page_clear_commit(vm_page_t m)
3702 {
3703 	vm_page_flag_clear(m, PG_NEED_COMMIT);
3704 }
3705 
3706 /*
3707  * Grab a page, blocking if it is busy and allocating a page if necessary.
3708  * A busy page is returned or NULL.  The page may or may not be valid and
3709  * might not be on a queue (the caller is responsible for the disposition of
3710  * the page).
3711  *
3712  * If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the
3713  * page will be zero'd and marked valid.
3714  *
3715  * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
3716  * valid even if it already exists.
3717  *
3718  * If VM_ALLOC_RETRY is specified this routine will never return NULL.  Also
3719  * note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified.
3720  * VM_ALLOC_NULL_OK is implied when VM_ALLOC_RETRY is specified.
3721  *
3722  * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
3723  * always returned if we had blocked.
3724  *
3725  * This routine may not be called from an interrupt.
3726  *
3727  * No other requirements.
3728  */
3729 vm_page_t
3730 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
3731 {
3732 	vm_page_t m;
3733 	int error;
3734 	int shared = 1;
3735 
3736 	KKASSERT(allocflags &
3737 		(VM_ALLOC_NORMAL|VM_ALLOC_INTERRUPT|VM_ALLOC_SYSTEM));
3738 	vm_object_hold_shared(object);
3739 	for (;;) {
3740 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
3741 		if (error) {
3742 			vm_page_sleep_busy(m, TRUE, "pgrbwt");
3743 			if ((allocflags & VM_ALLOC_RETRY) == 0) {
3744 				m = NULL;
3745 				break;
3746 			}
3747 			/* retry */
3748 		} else if (m == NULL) {
3749 			if (shared) {
3750 				vm_object_upgrade(object);
3751 				shared = 0;
3752 			}
3753 			if (allocflags & VM_ALLOC_RETRY)
3754 				allocflags |= VM_ALLOC_NULL_OK;
3755 			m = vm_page_alloc(object, pindex,
3756 					  allocflags & ~VM_ALLOC_RETRY);
3757 			if (m)
3758 				break;
3759 			vm_wait(0);
3760 			if ((allocflags & VM_ALLOC_RETRY) == 0)
3761 				goto failed;
3762 		} else {
3763 			/* m found */
3764 			break;
3765 		}
3766 	}
3767 
3768 	/*
3769 	 * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
3770 	 *
3771 	 * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
3772 	 * valid even if already valid.
3773 	 *
3774 	 * NOTE!  We have removed all of the PG_ZERO optimizations and also
3775 	 *	  removed the idle zeroing code.  These optimizations actually
3776 	 *	  slow things down on modern cpus because the zerod area is
3777 	 *	  slow things down on modern cpus because the zero'd area is
3778 	 *	  likely uncached, placing a memory-access burden on the
3779 	 *	  accessors taking the fault.
3780 	 *	  By always zeroing the page in-line with the fault, no
3781 	 *	  dynamic ram reads are needed and the caches are hot, ready
3782 	 *	  for userland to access the memory.
3783 	 */
3784 	if (m->valid == 0) {
3785 		if (allocflags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
3786 			pmap_zero_page(VM_PAGE_TO_PHYS(m));
3787 			m->valid = VM_PAGE_BITS_ALL;
3788 		}
3789 	} else if (allocflags & VM_ALLOC_FORCE_ZERO) {
3790 		pmap_zero_page(VM_PAGE_TO_PHYS(m));
3791 		m->valid = VM_PAGE_BITS_ALL;
3792 	}
3793 failed:
3794 	vm_object_drop(object);
3795 	return(m);
3796 }
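
/*
 * Illustrative sketch (not compiled): typical vm_page_grab() usage.  With
 * VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO the call never returns
 * NULL; the returned page is hard-busied and, if it had to be allocated
 * (or was otherwise fully invalid), it is already zero'd and marked
 * valid.  The helper name below is hypothetical.
 */
#if 0
static vm_page_t
example_grab_zeroed(vm_object_t object, vm_pindex_t pindex)
{
	vm_page_t m;

	m = vm_page_grab(object, pindex,
			 VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO);
	/* never NULL with VM_ALLOC_RETRY; page is returned hard-busied */
	return m;			/* caller must vm_page_wakeup(m) */
}
#endif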
3797 
3798 /*
3799  * Mapping function for valid bits or for dirty bits in
3800  * a page.  May not block.
3801  *
3802  * Inputs are required to range within a page.
3803  *
3804  * No requirements.
3805  * Non blocking.
3806  */
3807 int
3808 vm_page_bits(int base, int size)
3809 {
3810 	int first_bit;
3811 	int last_bit;
3812 
3813 	KASSERT(
3814 	    base + size <= PAGE_SIZE,
3815 	    ("vm_page_bits: illegal base/size %d/%d", base, size)
3816 	);
3817 
3818 	if (size == 0)		/* handle degenerate case */
3819 		return(0);
3820 
3821 	first_bit = base >> DEV_BSHIFT;
3822 	last_bit = (base + size - 1) >> DEV_BSHIFT;
3823 
3824 	return ((2 << last_bit) - (1 << first_bit));
3825 }
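
/*
 * Worked example (informational only, assuming the usual DEV_BSIZE of 512
 * and a 4096-byte PAGE_SIZE): vm_page_bits(0, PAGE_SIZE) gives
 * first_bit = 0, last_bit = 7 and returns (2 << 7) - (1 << 0) = 0xff,
 * i.e. VM_PAGE_BITS_ALL.  vm_page_bits(512, 1024) gives first_bit = 1,
 * last_bit = (512 + 1024 - 1) >> 9 = 2 and returns
 * (2 << 2) - (1 << 1) = 0x06, covering blocks 1 and 2 only.
 */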
3826 
3827 /*
3828  * Sets portions of a page valid and clean.  The arguments are expected
3829  * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
3830  * of any partial chunks touched by the range.  The invalid portion of
3831  * such chunks will be zero'd.
3832  *
3833  * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
3834  *	 align base to DEV_BSIZE so as not to mark clean a partially
3835  *	 truncated device block.  Otherwise the dirty page status might be
3836  *	 lost.
3837  *
3838  * This routine may not block.
3839  *
3840  * (base + size) must be less than or equal to PAGE_SIZE.
3841  */
3842 static void
3843 _vm_page_zero_valid(vm_page_t m, int base, int size)
3844 {
3845 	int frag;
3846 	int endoff;
3847 
3848 	if (size == 0)	/* handle degenerate case */
3849 		return;
3850 
3851 	/*
3852 	 * If the base is not DEV_BSIZE aligned and the valid
3853 	 * bit is clear, we have to zero out a portion of the
3854 	 * first block.
3855 	 */
3856 
3857 	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
3858 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
3859 	) {
3860 		pmap_zero_page_area(
3861 		    VM_PAGE_TO_PHYS(m),
3862 		    frag,
3863 		    base - frag
3864 		);
3865 	}
3866 
3867 	/*
3868 	 * If the ending offset is not DEV_BSIZE aligned and the
3869 	 * valid bit is clear, we have to zero out a portion of
3870 	 * the last block.
3871 	 */
3872 
3873 	endoff = base + size;
3874 
3875 	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
3876 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
3877 	) {
3878 		pmap_zero_page_area(
3879 		    VM_PAGE_TO_PHYS(m),
3880 		    endoff,
3881 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
3882 		);
3883 	}
3884 }
3885 
3886 /*
3887  * Set portions of a page valid.  Only the valid bits are set; the
3888  * dirty bits are left alone (use vm_page_set_validclean() if the
3889  * range must also be marked clean).
3890  *
3891  * We set valid bits inclusive of any overlap.  The invalid portion
3892  * of any partially covered DEV_BSIZE chunk is zero'd by
3893  * _vm_page_zero_valid() so stale data is not exposed.
3894  *
3895  * Page must be busied?
3896  * No other requirements.
3897  */
3900 void
3901 vm_page_set_valid(vm_page_t m, int base, int size)
3902 {
3903 	_vm_page_zero_valid(m, base, size);
3904 	m->valid |= vm_page_bits(base, size);
3905 }
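
/*
 * Worked example (informational only, assuming DEV_BSIZE = 512): the
 * hypothetical call vm_page_set_valid(m, 100, 900) covers bytes 100-999.
 * If blocks 0 and 1 were previously invalid, _vm_page_zero_valid() zeroes
 * bytes 0-99 (the head fragment of block 0) and bytes 1000-1023 (the tail
 * fragment of block 1), and vm_page_bits(100, 900) == 0x03, so valid bits
 * 0 and 1 are set inclusive of the partially touched blocks.
 */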
3906 
3907 
3908 /*
3909  * Set valid bits and clear dirty bits.
3910  *
3911  * Page must be busied by caller.
3912  *
3913  * NOTE: This function does not clear the pmap modified bit.
3914  *	 Also note that e.g. NFS may use a byte-granular base
3915  *	 and size.
3916  *
3917  * No other requirements.
3918  */
3919 void
3920 vm_page_set_validclean(vm_page_t m, int base, int size)
3921 {
3922 	int pagebits;
3923 
3924 	_vm_page_zero_valid(m, base, size);
3925 	pagebits = vm_page_bits(base, size);
3926 	m->valid |= pagebits;
3927 	m->dirty &= ~pagebits;
3928 	if (base == 0 && size == PAGE_SIZE) {
3929 		/*pmap_clear_modify(m);*/
3930 		vm_page_flag_clear(m, PG_NOSYNC);
3931 	}
3932 }
3933 
3934 /*
3935  * Set valid & dirty.  Used by buwrite()
3936  *
3937  * Page must be busied by caller.
3938  */
3939 void
3940 vm_page_set_validdirty(vm_page_t m, int base, int size)
3941 {
3942 	int pagebits;
3943 
3944 	pagebits = vm_page_bits(base, size);
3945 	m->valid |= pagebits;
3946 	m->dirty |= pagebits;
3947 	if (m->object)
3948 		vm_object_set_writeable_dirty(m->object);
3949 }
3950 
3951 /*
3952  * Clear dirty bits.
3953  *
3954  * NOTE: This function does not clear the pmap modified bit.
3955  *	 Also note that e.g. NFS may use a byte-granular base
3956  *	 and size.
3957  *
3958  * Page must be busied?
3959  * No other requirements.
3960  */
3961 void
3962 vm_page_clear_dirty(vm_page_t m, int base, int size)
3963 {
3964 	m->dirty &= ~vm_page_bits(base, size);
3965 	if (base == 0 && size == PAGE_SIZE) {
3966 		/*pmap_clear_modify(m);*/
3967 		vm_page_flag_clear(m, PG_NOSYNC);
3968 	}
3969 }
3970 
3971 /*
3972  * Make the page all-dirty.
3973  *
3974  * Also make sure the related object and vnode reflect the fact that the
3975  * object may now contain a dirty page.
3976  *
3977  * Page must be busied?
3978  * No other requirements.
3979  */
3980 void
3981 vm_page_dirty(vm_page_t m)
3982 {
3983 #ifdef INVARIANTS
3984 	int pqtype = m->queue - m->pc;
3985 #endif
3986 	KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
3987 		("vm_page_dirty: page in free/cache queue!"));
3988 	if (m->dirty != VM_PAGE_BITS_ALL) {
3989 		m->dirty = VM_PAGE_BITS_ALL;
3990 		if (m->object)
3991 			vm_object_set_writeable_dirty(m->object);
3992 	}
3993 }
3994 
3995 /*
3996  * Invalidates DEV_BSIZE'd chunks within a page.  Both the
3997  * valid and dirty bits for the affected areas are cleared.
3998  *
3999  * Page must be busied?
4000  * Does not block.
4001  * No other requirements.
4002  */
4003 void
4004 vm_page_set_invalid(vm_page_t m, int base, int size)
4005 {
4006 	int bits;
4007 
4008 	bits = vm_page_bits(base, size);
4009 	m->valid &= ~bits;
4010 	m->dirty &= ~bits;
4011 	atomic_add_int(&m->object->generation, 1);
4012 }
4013 
4014 /*
4015  * The kernel assumes that the invalid portions of a page contain
4016  * garbage, but such pages can be mapped into memory by user code.
4017  * When this occurs, we must zero out the non-valid portions of the
4018  * page so user code sees what it expects.
4019  *
4020  * Pages are most often semi-valid when the end of a file is mapped
4021  * into memory and the file's size is not page aligned.
4022  *
4023  * Page must be busied?
4024  * No other requirements.
4025  */
4026 void
4027 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
4028 {
4029 	int b;
4030 	int i;
4031 
4032 	/*
4033 	 * Scan the valid bits looking for invalid sections that
4034 	 * must be zero'd.  Invalid sub-DEV_BSIZE'd areas (where the
4035 	 * valid bit may be set) have already been zero'd by
4036 	 * vm_page_set_validclean().
4037 	 */
4038 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
4039 		if (i == (PAGE_SIZE / DEV_BSIZE) ||
4040 		    (m->valid & (1 << i))
4041 		) {
4042 			if (i > b) {
4043 				pmap_zero_page_area(
4044 				    VM_PAGE_TO_PHYS(m),
4045 				    b << DEV_BSHIFT,
4046 				    (i - b) << DEV_BSHIFT
4047 				);
4048 			}
4049 			b = i + 1;
4050 		}
4051 	}
4052 
4053 	/*
4054 	 * setvalid is TRUE when we can safely set the zero'd areas
4055 	 * as being valid.  We can do this if there are no cache consistency
4056 	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
4057 	 */
4058 	if (setvalid)
4059 		m->valid = VM_PAGE_BITS_ALL;
4060 }
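
/*
 * Worked example (informational only, assuming DEV_BSIZE = 512 and a
 * 4096-byte page): for a page with only valid bits 0 and 1 set (e.g. the
 * last, partially filled page of a file), the scan above issues a single
 * pmap_zero_page_area() call covering blocks 2-7 (bytes 1024-4095); with
 * setvalid TRUE the whole page is then marked valid.
 */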
4061 
4062 /*
4063  * Is a (partial) page valid?  Note that the case where size == 0
4064  * will return FALSE in the degenerate case where the page is entirely
4065  * invalid, and TRUE otherwise.
4066  *
4067  * Does not block.
4068  * No other requirements.
4069  */
4070 int
4071 vm_page_is_valid(vm_page_t m, int base, int size)
4072 {
4073 	int bits = vm_page_bits(base, size);
4074 
4075 	if (m->valid && ((m->valid & bits) == bits))
4076 		return 1;
4077 	else
4078 		return 0;
4079 }
4080 
4081 /*
4082  * Update dirty bits from pmap/mmu.  May not block.
4083  *
4084  * Caller must hold the page busy
4085  *
4086  * WARNING! Unless the page has been unmapped, this function only
4087  *	    provides a likely dirty status.
4088  */
4089 void
4090 vm_page_test_dirty(vm_page_t m)
4091 {
4092 	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) {
4093 		vm_page_dirty(m);
4094 	}
4095 }
4096 
4097 #include "opt_ddb.h"
4098 #ifdef DDB
4099 #include <ddb/ddb.h>
4100 
4101 DB_SHOW_COMMAND(page, vm_page_print_page_info)
4102 {
4103 	db_printf("vmstats.v_free_count: %ld\n", vmstats.v_free_count);
4104 	db_printf("vmstats.v_cache_count: %ld\n", vmstats.v_cache_count);
4105 	db_printf("vmstats.v_inactive_count: %ld\n", vmstats.v_inactive_count);
4106 	db_printf("vmstats.v_active_count: %ld\n", vmstats.v_active_count);
4107 	db_printf("vmstats.v_wire_count: %ld\n", vmstats.v_wire_count);
4108 	db_printf("vmstats.v_free_reserved: %ld\n", vmstats.v_free_reserved);
4109 	db_printf("vmstats.v_free_min: %ld\n", vmstats.v_free_min);
4110 	db_printf("vmstats.v_free_target: %ld\n", vmstats.v_free_target);
4111 	db_printf("vmstats.v_inactive_target: %ld\n",
4112 		  vmstats.v_inactive_target);
4113 	db_printf("vmstats.v_paging_wait: %ld\n", vmstats.v_paging_wait);
4114 	db_printf("vmstats.v_paging_start: %ld\n", vmstats.v_paging_start);
4115 	db_printf("vmstats.v_paging_target1: %ld\n", vmstats.v_paging_target1);
4116 	db_printf("vmstats.v_paging_target2: %ld\n", vmstats.v_paging_target2);
4117 }
4118 
4119 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
4120 {
4121 	int i;
4122 	db_printf("PQ_FREE:");
4123 	for (i = 0; i < PQ_L2_SIZE; i++) {
4124 		db_printf(" %ld", vm_page_queues[PQ_FREE + i].lcnt);
4125 	}
4126 	db_printf("\n");
4127 
4128 	db_printf("PQ_CACHE:");
4129 	for(i = 0; i < PQ_L2_SIZE; i++) {
4130 		db_printf(" %ld", vm_page_queues[PQ_CACHE + i].lcnt);
4131 	}
4132 	db_printf("\n");
4133 
4134 	db_printf("PQ_ACTIVE:");
4135 	for(i = 0; i < PQ_L2_SIZE; i++) {
4136 		db_printf(" %ld", vm_page_queues[PQ_ACTIVE + i].lcnt);
4137 	}
4138 	db_printf("\n");
4139 
4140 	db_printf("PQ_INACTIVE:");
4141 	for(i = 0; i < PQ_L2_SIZE; i++) {
4142 		db_printf(" %ld", vm_page_queues[PQ_INACTIVE + i].lcnt);
4143 	}
4144 	db_printf("\n");
4145 }
4146 #endif /* DDB */
4147