xref: /dragonfly/sys/vm/vm_page.c (revision 655933d6)
1 /*
2  * Copyright (c) 2003-2019 The DragonFly Project.  All rights reserved.
3  * Copyright (c) 1991 Regents of the University of California.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
37  * $FreeBSD: src/sys/vm/vm_page.c,v 1.147.2.18 2002/03/10 05:03:19 alc Exp $
38  */
39 
40 /*
41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42  * All rights reserved.
43  *
44  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45  *
46  * Permission to use, copy, modify and distribute this software and
47  * its documentation is hereby granted, provided that both the copyright
48  * notice and this permission notice appear in all copies of the
49  * software, derivative works or modified versions, and any portions
50  * thereof, and that both notices appear in supporting documentation.
51  *
52  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55  *
56  * Carnegie Mellon requests users of this software to return to
57  *
58  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59  *  School of Computer Science
60  *  Carnegie Mellon University
61  *  Pittsburgh PA 15213-3890
62  *
63  * any improvements or extensions that they make and grant Carnegie the
64  * rights to redistribute these changes.
65  */
66 /*
67  * Resident memory management module.  The module manipulates 'VM pages'.
68  * A VM page is the core building block for memory management.
69  */
70 
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/malloc.h>
74 #include <sys/proc.h>
75 #include <sys/vmmeter.h>
76 #include <sys/vnode.h>
77 #include <sys/kernel.h>
78 #include <sys/alist.h>
79 #include <sys/sysctl.h>
80 #include <sys/cpu_topology.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <sys/lock.h>
85 #include <vm/vm_kern.h>
86 #include <vm/pmap.h>
87 #include <vm/vm_map.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_pageout.h>
91 #include <vm/vm_pager.h>
92 #include <vm/vm_extern.h>
93 #include <vm/swap_pager.h>
94 
95 #include <machine/inttypes.h>
96 #include <machine/md_var.h>
97 #include <machine/specialreg.h>
98 #include <machine/bus_dma.h>
99 
100 #include <vm/vm_page2.h>
101 #include <sys/spinlock2.h>
102 
103 /*
104  * Cache necessary elements in the hash table itself to avoid indirecting
105  * through random vm_page's when doing a lookup.  The hash table is
106  * heuristical and it is ok for races to mess up any or all fields.
107  */
108 struct vm_page_hash_elm {
109 	vm_page_t	m;
110 	vm_object_t	object;	/* heuristical */
111 	vm_pindex_t	pindex;	/* heuristical */
112 	int		ticks;
113 	int		unused;
114 };
115 
116 #define VM_PAGE_HASH_SET	4		    /* power of 2, set-assoc */
117 #define VM_PAGE_HASH_MAX	(8 * 1024 * 1024)   /* power of 2, max size */
118 
119 /*
120  * SET - Minimum required set associative size, must be a power of 2.  We
121  *	 want this to match or exceed the set-associativeness of the cpu,
122  *	 up to a reasonable limit (we will use 16).
123  */
124 __read_mostly static int set_assoc_mask = 16 - 1;
125 
126 static void vm_page_queue_init(void);
127 static void vm_page_free_wakeup(void);
128 static vm_page_t vm_page_select_cache(u_short pg_color);
129 static vm_page_t _vm_page_list_find_wide(int basequeue, int index, int *lastp);
130 static vm_page_t _vm_page_list_find2_wide(int bq1, int bq2, int index,
131 			int *lastp1, int *lastp);
132 static void _vm_page_deactivate_locked(vm_page_t m, int athead);
133 static void vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes);
134 
135 /*
136  * Array of tailq lists
137  */
138 struct vpgqueues vm_page_queues[PQ_COUNT];
139 
140 static volatile int vm_pages_waiting;
141 static struct alist vm_contig_alist;
142 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
143 static struct spinlock vm_contig_spin = SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
144 
145 __read_mostly static int vm_page_hash_vnode_only;
146 __read_mostly static int vm_page_hash_size;
147 __read_mostly static struct vm_page_hash_elm *vm_page_hash;
148 
149 static u_long vm_dma_reserved = 0;
150 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
151 SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
152 	    "Memory reserved for DMA");
153 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
154 	    &vm_contig_alist.bl_free, 0, "Memory reserved for DMA");
155 
156 SYSCTL_INT(_vm, OID_AUTO, page_hash_vnode_only, CTLFLAG_RW,
157 	    &vm_page_hash_vnode_only, 0, "Only hash vnode pages");
158 #if 0
159 static int vm_page_hash_debug;
160 SYSCTL_INT(_vm, OID_AUTO, page_hash_debug, CTLFLAG_RW,
161 	    &vm_page_hash_debug, 0, "Only hash vnode pages");
162 #endif
163 
164 static int vm_contig_verbose = 0;
165 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
166 
167 RB_GENERATE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare,
168 	     vm_pindex_t, pindex);
169 
170 static void
171 vm_page_queue_init(void)
172 {
173 	int i;
174 
175 	for (i = 0; i < PQ_L2_SIZE; i++)
176 		vm_page_queues[PQ_FREE+i].cnt_offset =
177 			offsetof(struct vmstats, v_free_count);
178 	for (i = 0; i < PQ_L2_SIZE; i++)
179 		vm_page_queues[PQ_CACHE+i].cnt_offset =
180 			offsetof(struct vmstats, v_cache_count);
181 	for (i = 0; i < PQ_L2_SIZE; i++)
182 		vm_page_queues[PQ_INACTIVE+i].cnt_offset =
183 			offsetof(struct vmstats, v_inactive_count);
184 	for (i = 0; i < PQ_L2_SIZE; i++)
185 		vm_page_queues[PQ_ACTIVE+i].cnt_offset =
186 			offsetof(struct vmstats, v_active_count);
187 	for (i = 0; i < PQ_L2_SIZE; i++)
188 		vm_page_queues[PQ_HOLD+i].cnt_offset =
189 			offsetof(struct vmstats, v_active_count);
190 	/* PQ_NONE has no queue */
191 
192 	for (i = 0; i < PQ_COUNT; i++) {
193 		struct vpgqueues *vpq;
194 
195 		vpq = &vm_page_queues[i];
196 		vpq->lastq = -1;
197 		TAILQ_INIT(&vpq->pl);
198 		spin_init(&vpq->spin, "vm_page_queue_init");
199 	}
200 }
201 
202 /*
203  * note: place in initialized data section?  Is this necessary?
204  */
205 vm_pindex_t first_page = 0;
206 vm_pindex_t vm_page_array_size = 0;
207 vm_page_t vm_page_array = NULL;
208 vm_paddr_t vm_low_phys_reserved;
209 
210 /*
211  * (low level boot)
212  *
213  * Sets the page size, perhaps based upon the memory size.
214  * Must be called before any use of page-size dependent functions.
215  */
216 void
217 vm_set_page_size(void)
218 {
219 	if (vmstats.v_page_size == 0)
220 		vmstats.v_page_size = PAGE_SIZE;
221 	if (((vmstats.v_page_size - 1) & vmstats.v_page_size) != 0)
222 		panic("vm_set_page_size: page size not a power of two");
223 }
224 
225 /*
226  * (low level boot)
227  *
228  * Add a new page to the freelist for use by the system.  New pages
229  * are added to both the head and tail of the associated free page
230  * queue in a bottom-up fashion, so both zero'd and non-zero'd page
231  * requests pull 'recent' adds (higher physical addresses) first.
232  *
233  * Beware that the page zeroing daemon will also be running soon after
234  * boot, moving pages from the head to the tail of the PQ_FREE queues.
235  *
236  * Must be called in a critical section.
237  */
238 static void
239 vm_add_new_page(vm_paddr_t pa, int *badcountp)
240 {
241 	struct vpgqueues *vpq;
242 	vm_page_t m;
243 
244 	m = PHYS_TO_VM_PAGE(pa);
245 
246 	/*
247 	 * Make sure it isn't a duplicate (due to BIOS page range overlaps,
248 	 * which we consider bugs... but don't crash).  Note that m->phys_addr
249 	 * is pre-initialized, so use m->queue as a check.
250 	 */
251 	if (m->queue) {
252 		if (*badcountp < 10) {
253 			kprintf("vm_add_new_page: duplicate pa %016jx\n",
254 				(intmax_t)pa);
255 			++*badcountp;
256 		} else if (*badcountp == 10) {
257 			kprintf("vm_add_new_page: duplicate pa (many more)\n");
258 			++*badcountp;
259 		}
260 		return;
261 	}
262 
263 	m->phys_addr = pa;
264 	m->flags = 0;
265 	m->pat_mode = PAT_WRITE_BACK;
266 	m->pc = (pa >> PAGE_SHIFT);
267 
268 	/*
269 	 * Twist for cpu localization in addition to page coloring, so
270 	 * different cpus selecting by m->queue get different page colors.
271 	 */
272 	m->pc ^= ((pa >> PAGE_SHIFT) / PQ_L2_SIZE);
273 	m->pc ^= ((pa >> PAGE_SHIFT) / (PQ_L2_SIZE * PQ_L2_SIZE));
274 	m->pc &= PQ_L2_MASK;
275 
276 	/*
277 	 * Reserve a certain number of contiguous low memory pages for
278 	 * contigmalloc() to use.
279 	 *
280 	 * Even though these pages represent real ram and can be
281 	 * reverse-mapped, we set PG_FICTITIOUS and PG_UNQUEUED
282 	 * because their use is special-cased.
283 	 *
284 	 * WARNING! Once PG_FICTITIOUS is set, vm_page_wire*()
285 	 *	    and vm_page_unwire*() calls have no effect.
286 	 */
287 	if (pa < vm_low_phys_reserved) {
288 		atomic_add_long(&vmstats.v_page_count, 1);
289 		atomic_add_long(&vmstats.v_dma_pages, 1);
290 		m->flags |= PG_FICTITIOUS | PG_UNQUEUED;
291 		m->queue = PQ_NONE;
292 		m->wire_count = 1;
293 		atomic_add_long(&vmstats.v_wire_count, 1);
294 		alist_free(&vm_contig_alist, pa >> PAGE_SHIFT, 1);
295 		return;
296 	}
297 
298 	/*
299 	 * General page
300 	 */
301 	m->queue = m->pc + PQ_FREE;
302 	KKASSERT(m->dirty == 0);
303 
304 	atomic_add_long(&vmstats.v_page_count, 1);
305 	atomic_add_long(&vmstats.v_free_count, 1);
306 	vpq = &vm_page_queues[m->queue];
307 	TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
308 	++vpq->lcnt;
309 }
310 
311 /*
312  * (low level boot)
313  *
314  * Initializes the resident memory module.
315  *
316  * Preallocates memory for critical VM structures and arrays prior to
317  * kernel_map becoming available.
318  *
319  * Memory is allocated from (virtual2_start, virtual2_end) if available,
320  * otherwise memory is allocated from (virtual_start, virtual_end).
321  *
322  * On x86-64 (virtual_start, virtual_end) is only 2GB and may not be
323  * large enough to hold vm_page_array & other structures for machines with
324  * large amounts of ram, so we want to use virtual2* when available.
325  */
326 void
327 vm_page_startup(void)
328 {
329 	vm_offset_t vaddr = virtual2_start ? virtual2_start : virtual_start;
330 	vm_offset_t mapped;
331 	vm_pindex_t npages;
332 	vm_paddr_t page_range;
333 	vm_paddr_t new_end;
334 	int i;
335 	vm_paddr_t pa;
336 	vm_paddr_t last_pa;
337 	vm_paddr_t end;
338 	vm_paddr_t biggestone, biggestsize;
339 	vm_paddr_t total;
340 	vm_page_t m;
341 	int badcount;
342 
343 	total = 0;
344 	badcount = 0;
345 	biggestsize = 0;
346 	biggestone = 0;
347 	vaddr = round_page(vaddr);
348 
349 	/*
350 	 * Make sure ranges are page-aligned.
351 	 */
352 	for (i = 0; phys_avail[i].phys_end; ++i) {
353 		phys_avail[i].phys_beg = round_page64(phys_avail[i].phys_beg);
354 		phys_avail[i].phys_end = trunc_page64(phys_avail[i].phys_end);
355 		if (phys_avail[i].phys_end < phys_avail[i].phys_beg)
356 			phys_avail[i].phys_end = phys_avail[i].phys_beg;
357 	}
358 
359 	/*
360 	 * Locate largest block
361 	 */
362 	for (i = 0; phys_avail[i].phys_end; ++i) {
363 		vm_paddr_t size = phys_avail[i].phys_end -
364 				  phys_avail[i].phys_beg;
365 
366 		if (size > biggestsize) {
367 			biggestone = i;
368 			biggestsize = size;
369 		}
370 		total += size;
371 	}
372 	--i;	/* adjust to last entry for use down below */
373 
374 	end = phys_avail[biggestone].phys_end;
375 	end = trunc_page(end);
376 
377 	/*
378 	 * Initialize the queue headers for the free queue, the active queue
379 	 * and the inactive queue.
380 	 */
381 	vm_page_queue_init();
382 
383 #if !defined(_KERNEL_VIRTUAL)
384 	/*
385 	 * VKERNELs don't support minidumps and as such don't need
386 	 * vm_page_dump
387 	 *
388 	 * Allocate a bitmap to indicate that a random physical page
389 	 * needs to be included in a minidump.
390 	 *
391 	 * The amd64 port needs this to indicate which direct map pages
392 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
393 	 *
394 	 * However, x86 still needs this workspace internally within the
395 	 * minidump code.  In theory, they are not needed on x86, but are
396 	 * included should the sf_buf code decide to use them.
397 	 */
398 	page_range = phys_avail[i].phys_end / PAGE_SIZE;
399 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
400 	end -= vm_page_dump_size;
401 	vm_page_dump = (void *)pmap_map(&vaddr, end, end + vm_page_dump_size,
402 					VM_PROT_READ | VM_PROT_WRITE);
403 	bzero((void *)vm_page_dump, vm_page_dump_size);
404 #endif
405 	/*
406 	 * Compute the number of pages of memory that will be available for
407 	 * use (taking into account the overhead of a page structure per
408 	 * page).
409 	 */
410 	first_page = phys_avail[0].phys_beg / PAGE_SIZE;
411 	page_range = phys_avail[i].phys_end / PAGE_SIZE - first_page;
412 	npages = (total - (page_range * sizeof(struct vm_page))) / PAGE_SIZE;
413 
414 #ifndef _KERNEL_VIRTUAL
415 	/*
416 	 * (only applies to real kernels)
417 	 *
418 	 * Reserve a large amount of low memory for potential 32-bit DMA
419 	 * space allocations.  Once device initialization is complete we
420 	 * release most of it, but keep (vm_dma_reserved) memory reserved
421 	 * for later use.  Typically for X / graphics.  Through trial and
422 	 * error we find that GPUs usually requires ~60-100MB or so.
423 	 *
424 	 * By default, 128M is left in reserve on machines with 2G+ of ram.
425 	 */
426 	vm_low_phys_reserved = (vm_paddr_t)65536 << PAGE_SHIFT;
427 	if (vm_low_phys_reserved > total / 4)
428 		vm_low_phys_reserved = total / 4;
429 	if (vm_dma_reserved == 0) {
430 		vm_dma_reserved = 128 * 1024 * 1024;	/* 128MB */
431 		if (vm_dma_reserved > total / 16)
432 			vm_dma_reserved = total / 16;
433 	}
434 #endif
435 	alist_init(&vm_contig_alist, 65536, vm_contig_ameta,
436 		   ALIST_RECORDS_65536);
437 
438 	/*
439 	 * Initialize the mem entry structures now, and put them in the free
440 	 * queue.
441 	 */
442 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
443 		kprintf("initializing vm_page_array ");
444 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
445 	mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE);
446 	vm_page_array = (vm_page_t)mapped;
447 
448 #if defined(__x86_64__) && !defined(_KERNEL_VIRTUAL)
449 	/*
450 	 * since pmap_map on amd64 returns stuff out of a direct-map region,
451 	 * we have to manually add these pages to the minidump tracking so
452 	 * that they can be dumped, including the vm_page_array.
453 	 */
454 	for (pa = new_end;
455 	     pa < phys_avail[biggestone].phys_end;
456 	     pa += PAGE_SIZE) {
457 		dump_add_page(pa);
458 	}
459 #endif
460 
461 	/*
462 	 * Clear all of the page structures, run basic initialization so
463 	 * PHYS_TO_VM_PAGE() operates properly even on pages not in the
464 	 * map.
465 	 */
466 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
467 	vm_page_array_size = page_range;
468 	if (bootverbose && ctob(physmem) >= 400LL*1024*1024*1024)
469 		kprintf("size = 0x%zx\n", vm_page_array_size);
470 
471 	m = &vm_page_array[0];
472 	pa = ptoa(first_page);
473 	for (i = 0; i < page_range; ++i) {
474 		spin_init(&m->spin, "vm_page");
475 		m->phys_addr = pa;
476 		pa += PAGE_SIZE;
477 		++m;
478 	}
479 
480 	/*
481 	 * Construct the free queue(s) in ascending order (by physical
482 	 * address) so that the first 16MB of physical memory is allocated
483 	 * last rather than first.  On large-memory machines, this avoids
484 	 * the exhaustion of low physical memory before isa_dma_init has run.
485 	 */
486 	vmstats.v_page_count = 0;
487 	vmstats.v_free_count = 0;
488 	for (i = 0; phys_avail[i].phys_end && npages > 0; ++i) {
489 		pa = phys_avail[i].phys_beg;
490 		if (i == biggestone)
491 			last_pa = new_end;
492 		else
493 			last_pa = phys_avail[i].phys_end;
494 		while (pa < last_pa && npages-- > 0) {
495 			vm_add_new_page(pa, &badcount);
496 			pa += PAGE_SIZE;
497 		}
498 	}
499 	if (virtual2_start)
500 		virtual2_start = vaddr;
501 	else
502 		virtual_start = vaddr;
503 	mycpu->gd_vmstats = vmstats;
504 }
505 
506 /*
507  * (called from early boot only)
508  *
509  * Reorganize VM pages based on numa data.  May be called as many times as
510  * necessary.  Will reorganize the vm_page_t page color and related queue(s)
511  * to allow vm_page_alloc() to choose pages based on socket affinity.
512  *
513  * NOTE: This function is only called while we are still in UP mode, so
514  *	 we only need a critical section to protect the queues (which
515  *	 saves a lot of time, there are likely a ton of pages).
516  */
517 void
518 vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid)
519 {
520 	vm_paddr_t scan_beg;
521 	vm_paddr_t scan_end;
522 	vm_paddr_t ran_end;
523 	struct vpgqueues *vpq;
524 	vm_page_t m;
525 	vm_page_t mend;
526 	int socket_mod;
527 	int socket_value;
528 	int i;
529 
530 	/*
531 	 * Check if no physical information, or there was only one socket
532 	 * (so don't waste time doing nothing!).
533 	 */
534 	if (cpu_topology_phys_ids <= 1 ||
535 	    cpu_topology_core_ids == 0) {
536 		return;
537 	}
538 
539 	/*
540 	 * Setup for our iteration.  Note that ACPI may iterate CPU
541 	 * sockets starting at 0 or 1 or some other number.  The
542 	 * cpu_topology code mod's it against the socket count.
543 	 */
544 	ran_end = ran_beg + bytes;
545 
546 	socket_mod = PQ_L2_SIZE / cpu_topology_phys_ids;
547 	socket_value = (physid % cpu_topology_phys_ids) * socket_mod;
548 	mend = &vm_page_array[vm_page_array_size];
549 
550 	crit_enter();
551 
552 	/*
553 	 * Adjust cpu_topology's phys_mem parameter
554 	 */
555 	if (root_cpu_node)
556 		vm_numa_add_topology_mem(root_cpu_node, physid, (long)bytes);
557 
558 	/*
559 	 * Adjust vm_page->pc and requeue all affected pages.  The
560 	 * allocator will then be able to localize memory allocations
561 	 * to some degree.
562 	 */
563 	for (i = 0; phys_avail[i].phys_end; ++i) {
564 		scan_beg = phys_avail[i].phys_beg;
565 		scan_end = phys_avail[i].phys_end;
566 		if (scan_end <= ran_beg)
567 			continue;
568 		if (scan_beg >= ran_end)
569 			continue;
570 		if (scan_beg < ran_beg)
571 			scan_beg = ran_beg;
572 		if (scan_end > ran_end)
573 			scan_end = ran_end;
574 		if (atop(scan_end) > first_page + vm_page_array_size)
575 			scan_end = ptoa(first_page + vm_page_array_size);
576 
577 		m = PHYS_TO_VM_PAGE(scan_beg);
578 		while (scan_beg < scan_end) {
579 			KKASSERT(m < mend);
580 			if (m->queue != PQ_NONE) {
581 				vpq = &vm_page_queues[m->queue];
582 				TAILQ_REMOVE(&vpq->pl, m, pageq);
583 				--vpq->lcnt;
584 				/* queue doesn't change, no need to adj cnt */
585 				m->queue -= m->pc;
586 				m->pc %= socket_mod;
587 				m->pc += socket_value;
588 				m->pc &= PQ_L2_MASK;
589 				m->queue += m->pc;
590 				vpq = &vm_page_queues[m->queue];
591 				TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
592 				++vpq->lcnt;
593 				/* queue doesn't change, no need to adj cnt */
594 			} else {
595 				m->pc %= socket_mod;
596 				m->pc += socket_value;
597 				m->pc &= PQ_L2_MASK;
598 			}
599 			scan_beg += PAGE_SIZE;
600 			++m;
601 		}
602 	}
603 
604 	crit_exit();
605 }
606 
607 /*
608  * (called from early boot only)
609  *
610  * Don't allow the NUMA organization to leave vm_page_queues[] nodes
611  * completely empty for a logical cpu.  Doing so would force allocations
612  * on that cpu to always borrow from a nearby cpu, create unnecessary
613  * contention, and cause vm_page_alloc() to iterate more queues and run more
614  * slowly.
615  *
616  * This situation can occur when memory sticks are not entirely populated,
617  * populated at different densities, or in naturally assymetric systems
618  * such as the 2990WX.  There could very well be many vm_page_queues[]
619  * entries with *NO* pages assigned to them.
620  *
621  * Fixing this up ensures that each logical CPU has roughly the same
622  * sized memory pool, and more importantly ensures that logical CPUs
623  * do not wind up with an empty memory pool.
624  *
625  * At them moment we just iterate the other queues and borrow pages,
626  * moving them into the queues for cpus with severe deficits even though
627  * the memory might not be local to those cpus.  I am not doing this in
628  * a 'smart' way, its effectively UMA style (sorta, since its page-by-page
629  * whereas real UMA typically exchanges address bits 8-10 with high address
630  * bits).  But it works extremely well and gives us fairly good deterministic
631  * results on the cpu cores associated with these secondary nodes.
632  */
633 void
634 vm_numa_organize_finalize(void)
635 {
636 	struct vpgqueues *vpq;
637 	vm_page_t m;
638 	long lcnt_lo;
639 	long lcnt_hi;
640 	int iter;
641 	int i;
642 	int scale_lim;
643 
644 	crit_enter();
645 
646 	/*
647 	 * Machines might not use an exact power of 2 for phys_ids,
648 	 * core_ids, ht_ids, etc.  This can slightly reduce the actual
649 	 * range of indices in vm_page_queues[] that are nominally used.
650 	 */
651 	if (cpu_topology_ht_ids) {
652 		scale_lim = PQ_L2_SIZE / cpu_topology_phys_ids;
653 		scale_lim = scale_lim / cpu_topology_core_ids;
654 		scale_lim = scale_lim / cpu_topology_ht_ids;
655 		scale_lim = scale_lim * cpu_topology_ht_ids;
656 		scale_lim = scale_lim * cpu_topology_core_ids;
657 		scale_lim = scale_lim * cpu_topology_phys_ids;
658 	} else {
659 		scale_lim = PQ_L2_SIZE;
660 	}
661 
662 	/*
663 	 * Calculate an average, set hysteresis for balancing from
664 	 * 10% below the average to the average.
665 	 */
666 	lcnt_hi = 0;
667 	for (i = 0; i < scale_lim; ++i) {
668 		lcnt_hi += vm_page_queues[i].lcnt;
669 	}
670 	lcnt_hi /= scale_lim;
671 	lcnt_lo = lcnt_hi - lcnt_hi / 10;
672 
673 	kprintf("vm_page: avg %ld pages per queue, %d queues\n",
674 		lcnt_hi, scale_lim);
675 
676 	iter = 0;
677 	for (i = 0; i < scale_lim; ++i) {
678 		vpq = &vm_page_queues[PQ_FREE + i];
679 		while (vpq->lcnt < lcnt_lo) {
680 			struct vpgqueues *vptmp;
681 
682 			iter = (iter + 1) & PQ_L2_MASK;
683 			vptmp = &vm_page_queues[PQ_FREE + iter];
684 			if (vptmp->lcnt < lcnt_hi)
685 				continue;
686 			m = TAILQ_FIRST(&vptmp->pl);
687 			KKASSERT(m->queue == PQ_FREE + iter);
688 			TAILQ_REMOVE(&vptmp->pl, m, pageq);
689 			--vptmp->lcnt;
690 			/* queue doesn't change, no need to adj cnt */
691 			m->queue -= m->pc;
692 			m->pc = i;
693 			m->queue += m->pc;
694 			TAILQ_INSERT_HEAD(&vpq->pl, m, pageq);
695 			++vpq->lcnt;
696 		}
697 	}
698 	crit_exit();
699 }
700 
701 static
702 void
703 vm_numa_add_topology_mem(cpu_node_t *cpup, int physid, long bytes)
704 {
705 	int cpuid;
706 	int i;
707 
708 	switch(cpup->type) {
709 	case PACKAGE_LEVEL:
710 		cpup->phys_mem += bytes;
711 		break;
712 	case CHIP_LEVEL:
713 		/*
714 		 * All members should have the same chipid, so we only need
715 		 * to pull out one member.
716 		 */
717 		if (CPUMASK_TESTNZERO(cpup->members)) {
718 			cpuid = BSFCPUMASK(cpup->members);
719 			if (physid ==
720 			    get_chip_ID_from_APICID(CPUID_TO_APICID(cpuid))) {
721 				cpup->phys_mem += bytes;
722 			}
723 		}
724 		break;
725 	case CORE_LEVEL:
726 	case THREAD_LEVEL:
727 		/*
728 		 * Just inherit from the parent node
729 		 */
730 		cpup->phys_mem = cpup->parent_node->phys_mem;
731 		break;
732 	}
733 	for (i = 0; i < MAXCPU && cpup->child_node[i]; ++i)
734 		vm_numa_add_topology_mem(cpup->child_node[i], physid, bytes);
735 }
736 
737 /*
738  * We tended to reserve a ton of memory for contigmalloc().  Now that most
739  * drivers have initialized we want to return most the remaining free
740  * reserve back to the VM page queues so they can be used for normal
741  * allocations.
742  *
743  * We leave vm_dma_reserved bytes worth of free pages in the reserve pool.
744  */
745 static void
746 vm_page_startup_finish(void *dummy __unused)
747 {
748 	alist_blk_t blk;
749 	alist_blk_t rblk;
750 	alist_blk_t count;
751 	alist_blk_t xcount;
752 	alist_blk_t bfree;
753 	vm_page_t m;
754 	struct vm_page_hash_elm *mp;
755 	int mask;
756 
757 	/*
758 	 * Set the set_assoc_mask based on the fitted number of CPUs.
759 	 * This is a mask, so we subject 1.
760 	 *
761 	 * w/PQ_L2_SIZE = 1024, Don't let the associativity drop below 8.
762 	 * So if we have 256 CPUs, two hyper-threads will wind up sharing.
763 	 *
764 	 * The maximum is PQ_L2_SIZE.  However, we limit the starting
765 	 * maximum to 16 (mask = 15) in order to improve the cache locality
766 	 * of related kernel data structures.
767 	 */
768 	mask = PQ_L2_SIZE / ncpus_fit - 1;
769 	if (mask < 7)		/* minimum is 8-way w/256 CPU threads */
770 		mask = 7;
771 	if (mask < 15)
772 		mask = 15;
773 	cpu_ccfence();
774 	set_assoc_mask = mask;
775 
776 	/*
777 	 * Return part of the initial reserve back to the system
778 	 */
779 	spin_lock(&vm_contig_spin);
780 	for (;;) {
781 		bfree = alist_free_info(&vm_contig_alist, &blk, &count);
782 		if (bfree <= vm_dma_reserved / PAGE_SIZE)
783 			break;
784 		if (count == 0)
785 			break;
786 
787 		/*
788 		 * Figure out how much of the initial reserve we have to
789 		 * free in order to reach our target.
790 		 */
791 		bfree -= vm_dma_reserved / PAGE_SIZE;
792 		if (count > bfree) {
793 			blk += count - bfree;
794 			count = bfree;
795 		}
796 
797 		/*
798 		 * Calculate the nearest power of 2 <= count.
799 		 */
800 		for (xcount = 1; xcount <= count; xcount <<= 1)
801 			;
802 		xcount >>= 1;
803 		blk += count - xcount;
804 		count = xcount;
805 
806 		/*
807 		 * Allocate the pages from the alist, then free them to
808 		 * the normal VM page queues.
809 		 *
810 		 * Pages allocated from the alist are wired.  We have to
811 		 * busy, unwire, and free them.  We must also adjust
812 		 * vm_low_phys_reserved before freeing any pages to prevent
813 		 * confusion.
814 		 */
815 		rblk = alist_alloc(&vm_contig_alist, blk, count);
816 		if (rblk != blk) {
817 			kprintf("vm_page_startup_finish: Unable to return "
818 				"dma space @0x%08x/%d -> 0x%08x\n",
819 				blk, count, rblk);
820 			break;
821 		}
822 		atomic_add_long(&vmstats.v_dma_pages, -(long)count);
823 		spin_unlock(&vm_contig_spin);
824 
825 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
826 		vm_low_phys_reserved = VM_PAGE_TO_PHYS(m);
827 		while (count) {
828 			vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);
829 			vm_page_busy_wait(m, FALSE, "cpgfr");
830 			vm_page_unwire(m, 0);
831 			vm_page_free(m);
832 			--count;
833 			++m;
834 		}
835 		spin_lock(&vm_contig_spin);
836 	}
837 	spin_unlock(&vm_contig_spin);
838 
839 	/*
840 	 * Print out how much DMA space drivers have already allocated and
841 	 * how much is left over.
842 	 */
843 	kprintf("DMA space used: %jdk, remaining available: %jdk\n",
844 		(intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) *
845 		(PAGE_SIZE / 1024),
846 		(intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024));
847 
848 	/*
849 	 * Power of 2
850 	 */
851 	vm_page_hash_size = 4096;
852 	while (vm_page_hash_size < (vm_page_array_size / 16))
853 		vm_page_hash_size <<= 1;
854 	if (vm_page_hash_size > VM_PAGE_HASH_MAX)
855 		vm_page_hash_size = VM_PAGE_HASH_MAX;
856 
857 	/*
858 	 * hash table for vm_page_lookup_quick()
859 	 */
860 	mp = (void *)kmem_alloc3(kernel_map,
861 				 (vm_page_hash_size + VM_PAGE_HASH_SET) *
862 				  sizeof(*vm_page_hash),
863 				 VM_SUBSYS_VMPGHASH, KM_CPU(0));
864 	bzero(mp, (vm_page_hash_size + VM_PAGE_HASH_SET) * sizeof(*mp));
865 	cpu_sfence();
866 	vm_page_hash = mp;
867 }
868 SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY,
869 	vm_page_startup_finish, NULL);
870 
871 
872 /*
873  * Scan comparison function for Red-Black tree scans.  An inclusive
874  * (start,end) is expected.  Other fields are not used.
875  */
876 int
877 rb_vm_page_scancmp(struct vm_page *p, void *data)
878 {
879 	struct rb_vm_page_scan_info *info = data;
880 
881 	if (p->pindex < info->start_pindex)
882 		return(-1);
883 	if (p->pindex > info->end_pindex)
884 		return(1);
885 	return(0);
886 }
887 
888 int
889 rb_vm_page_compare(struct vm_page *p1, struct vm_page *p2)
890 {
891 	if (p1->pindex < p2->pindex)
892 		return(-1);
893 	if (p1->pindex > p2->pindex)
894 		return(1);
895 	return(0);
896 }
897 
898 void
899 vm_page_init(vm_page_t m)
900 {
901 	/* do nothing for now.  Called from pmap_page_init() */
902 }
903 
904 /*
905  * Each page queue has its own spin lock, which is fairly optimal for
906  * allocating and freeing pages at least.
907  *
908  * The caller must hold the vm_page_spin_lock() before locking a vm_page's
909  * queue spinlock via this function.  Also note that m->queue cannot change
910  * unless both the page and queue are locked.
911  */
912 static __inline
913 void
914 _vm_page_queue_spin_lock(vm_page_t m)
915 {
916 	u_short queue;
917 
918 	queue = m->queue;
919 	if (queue != PQ_NONE) {
920 		spin_lock(&vm_page_queues[queue].spin);
921 		KKASSERT(queue == m->queue);
922 	}
923 }
924 
925 static __inline
926 void
927 _vm_page_queue_spin_unlock(vm_page_t m)
928 {
929 	u_short queue;
930 
931 	queue = m->queue;
932 	cpu_ccfence();
933 	if (queue != PQ_NONE)
934 		spin_unlock(&vm_page_queues[queue].spin);
935 }
936 
937 static __inline
938 void
939 _vm_page_queues_spin_lock(u_short queue)
940 {
941 	cpu_ccfence();
942 	if (queue != PQ_NONE)
943 		spin_lock(&vm_page_queues[queue].spin);
944 }
945 
946 
947 static __inline
948 void
949 _vm_page_queues_spin_unlock(u_short queue)
950 {
951 	cpu_ccfence();
952 	if (queue != PQ_NONE)
953 		spin_unlock(&vm_page_queues[queue].spin);
954 }
955 
956 void
957 vm_page_queue_spin_lock(vm_page_t m)
958 {
959 	_vm_page_queue_spin_lock(m);
960 }
961 
962 void
963 vm_page_queues_spin_lock(u_short queue)
964 {
965 	_vm_page_queues_spin_lock(queue);
966 }
967 
968 void
969 vm_page_queue_spin_unlock(vm_page_t m)
970 {
971 	_vm_page_queue_spin_unlock(m);
972 }
973 
974 void
975 vm_page_queues_spin_unlock(u_short queue)
976 {
977 	_vm_page_queues_spin_unlock(queue);
978 }
979 
980 /*
981  * This locks the specified vm_page and its queue in the proper order
982  * (page first, then queue).  The queue may change so the caller must
983  * recheck on return.
984  */
985 static __inline
986 void
987 _vm_page_and_queue_spin_lock(vm_page_t m)
988 {
989 	vm_page_spin_lock(m);
990 	_vm_page_queue_spin_lock(m);
991 }
992 
993 static __inline
994 void
995 _vm_page_and_queue_spin_unlock(vm_page_t m)
996 {
997 	_vm_page_queues_spin_unlock(m->queue);
998 	vm_page_spin_unlock(m);
999 }
1000 
1001 void
1002 vm_page_and_queue_spin_unlock(vm_page_t m)
1003 {
1004 	_vm_page_and_queue_spin_unlock(m);
1005 }
1006 
1007 void
1008 vm_page_and_queue_spin_lock(vm_page_t m)
1009 {
1010 	_vm_page_and_queue_spin_lock(m);
1011 }
1012 
1013 /*
1014  * Helper function removes vm_page from its current queue.
1015  * Returns the base queue the page used to be on.
1016  *
1017  * The vm_page and the queue must be spinlocked.
1018  * This function will unlock the queue but leave the page spinlocked.
1019  */
1020 static __inline u_short
1021 _vm_page_rem_queue_spinlocked(vm_page_t m)
1022 {
1023 	struct vpgqueues *pq;
1024 	u_short queue;
1025 	u_short oqueue;
1026 	long *cnt_adj;
1027 	long *cnt_gd;
1028 
1029 	queue = m->queue;
1030 	if (queue != PQ_NONE) {
1031 		pq = &vm_page_queues[queue];
1032 		TAILQ_REMOVE(&pq->pl, m, pageq);
1033 
1034 		/*
1035 		 * Primarily adjust our pcpu stats for rollup, which is
1036 		 * (mycpu->gd_vmstats_adj + offset).  This is normally
1037 		 * synchronized on every hardclock().
1038 		 *
1039 		 * However, in order for the nominal low-memory algorithms
1040 		 * to work properly if the unsynchronized adjustment gets
1041 		 * too negative and might trigger the pageout daemon, we
1042 		 * immediately synchronize with the global structure.
1043 		 *
1044 		 * The idea here is to reduce unnecessary SMP cache mastership
1045 		 * changes in the global vmstats, which can be particularly
1046 		 * bad in multi-socket systems.
1047 		 *
1048 		 * WARNING! In systems with low amounts of memory the
1049 		 *	    vm_paging_needed(-1024 * ncpus) test could
1050 		 *	    wind up testing a value above the paging target,
1051 		 *	    meaning it would almost always return TRUE.  In
1052 		 *	    that situation we synchronize every time the
1053 		 *	    cumulative adjustment falls below -1024.
1054 		 */
1055 		cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1056 				   pq->cnt_offset);
1057 		cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1058 				   pq->cnt_offset);
1059 		atomic_add_long(cnt_adj, -1);
1060 		atomic_add_long(cnt_gd, -1);
1061 
1062 		if (*cnt_adj < -1024 && vm_paging_start(-1024 * ncpus)) {
1063 			u_long copy = atomic_swap_long(cnt_adj, 0);
1064 			cnt_adj = (long *)((char *)&vmstats + pq->cnt_offset);
1065 			atomic_add_long(cnt_adj, copy);
1066 		}
1067 		pq->lcnt--;
1068 		m->queue = PQ_NONE;
1069 		oqueue = queue;
1070 		queue -= m->pc;
1071 		vm_page_queues_spin_unlock(oqueue);	/* intended */
1072 	}
1073 	return queue;
1074 }
1075 
1076 /*
1077  * Helper function places the vm_page on the specified queue.  Generally
1078  * speaking only PQ_FREE pages are placed at the head, to allow them to
1079  * be allocated sooner rather than later on the assumption that they
1080  * are cache-hot.
1081  *
1082  * The vm_page must be spinlocked.
1083  * The vm_page must NOT be FICTITIOUS (that would be a disaster)
1084  * This function will return with both the page and the queue locked.
1085  */
1086 static __inline void
1087 _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
1088 {
1089 	struct vpgqueues *pq;
1090 	u_long *cnt_adj;
1091 	u_long *cnt_gd;
1092 
1093 	KKASSERT(m->queue == PQ_NONE &&
1094 		 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0);
1095 
1096 	if (queue != PQ_NONE) {
1097 		vm_page_queues_spin_lock(queue);
1098 		pq = &vm_page_queues[queue];
1099 		++pq->lcnt;
1100 
1101 		/*
1102 		 * Adjust our pcpu stats.  If a system entity really needs
1103 		 * to incorporate the count it will call vmstats_rollup()
1104 		 * to roll it all up into the global vmstats strufture.
1105 		 */
1106 		cnt_adj = (long *)((char *)&mycpu->gd_vmstats_adj +
1107 				   pq->cnt_offset);
1108 		cnt_gd = (long *)((char *)&mycpu->gd_vmstats +
1109 				   pq->cnt_offset);
1110 		atomic_add_long(cnt_adj, 1);
1111 		atomic_add_long(cnt_gd, 1);
1112 
1113 		/*
1114 		 * PQ_FREE is always handled LIFO style to try to provide
1115 		 * cache-hot pages to programs.
1116 		 */
1117 		m->queue = queue;
1118 		if (queue - m->pc == PQ_FREE) {
1119 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1120 		} else if (athead) {
1121 			TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1122 		} else {
1123 			TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
1124 		}
1125 		/* leave the queue spinlocked */
1126 	}
1127 }
1128 
1129 /*
1130  * Wait until page is no longer BUSY.  If also_m_busy is TRUE we wait
1131  * until the page is no longer BUSY or SBUSY (busy_count field is 0).
1132  *
1133  * Returns TRUE if it had to sleep, FALSE if we did not.  Only one sleep
1134  * call will be made before returning.
1135  *
1136  * This function does NOT busy the page and on return the page is not
1137  * guaranteed to be available.
1138  */
1139 void
1140 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
1141 {
1142 	u_int32_t busy_count;
1143 
1144 	for (;;) {
1145 		busy_count = m->busy_count;
1146 		cpu_ccfence();
1147 
1148 		if ((busy_count & PBUSY_LOCKED) == 0 &&
1149 		    (also_m_busy == 0 || (busy_count & PBUSY_MASK) == 0)) {
1150 			break;
1151 		}
1152 		tsleep_interlock(m, 0);
1153 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1154 				      busy_count | PBUSY_WANTED)) {
1155 			atomic_set_int(&m->flags, PG_REFERENCED);
1156 			tsleep(m, PINTERLOCKED, msg, 0);
1157 			break;
1158 		}
1159 	}
1160 }
1161 
1162 /*
1163  * This calculates and returns a page color given an optional VM object and
1164  * either a pindex or an iterator.  We attempt to return a cpu-localized
1165  * pg_color that is still roughly 16-way set-associative.  The CPU topology
1166  * is used if it was probed.
1167  *
1168  * The caller may use the returned value to index into e.g. PQ_FREE when
1169  * allocating a page in order to nominally obtain pages that are hopefully
1170  * already localized to the requesting cpu.  This function is not able to
1171  * provide any sort of guarantee of this, but does its best to improve
1172  * hardware cache management performance.
1173  *
1174  * WARNING! The caller must mask the returned value with PQ_L2_MASK.
1175  */
1176 u_short
1177 vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
1178 {
1179 	u_short pg_color;
1180 	int object_pg_color;
1181 
1182 	/*
1183 	 * WARNING! cpu_topology_core_ids might not be a power of two.
1184 	 *	    We also shouldn't make assumptions about
1185 	 *	    cpu_topology_phys_ids either.
1186 	 *
1187 	 * WARNING! ncpus might not be known at this time (during early
1188 	 *	    boot), and might be set to 1.
1189 	 *
1190 	 * General format: [phys_id][core_id][cpuid][set-associativity]
1191 	 * (but uses modulo, so not necessarily precise bit masks)
1192 	 */
1193 	object_pg_color = object ? object->pg_color : 0;
1194 
1195 	if (cpu_topology_ht_ids) {
1196 		int phys_id;
1197 		int core_id;
1198 		int ht_id;
1199 		int physcale;
1200 		int grpscale;
1201 		int cpuscale;
1202 
1203 		/*
1204 		 * Translate cpuid to socket, core, and hyperthread id.
1205 		 */
1206 		phys_id = get_cpu_phys_id(cpuid);
1207 		core_id = get_cpu_core_id(cpuid);
1208 		ht_id = get_cpu_ht_id(cpuid);
1209 
1210 		/*
1211 		 * Calculate pg_color for our array index.
1212 		 *
1213 		 * physcale - socket multiplier.
1214 		 * grpscale - core multiplier (cores per socket)
1215 		 * cpu*	    - cpus per core
1216 		 *
1217 		 * WARNING! In early boot, ncpus has not yet been
1218 		 *	    initialized and may be set to (1).
1219 		 *
1220 		 * WARNING! physcale must match the organization that
1221 		 *	    vm_numa_organize() creates to ensure that
1222 		 *	    we properly localize allocations to the
1223 		 *	    requested cpuid.
1224 		 */
1225 		physcale = PQ_L2_SIZE / cpu_topology_phys_ids;
1226 		grpscale = physcale / cpu_topology_core_ids;
1227 		cpuscale = grpscale / cpu_topology_ht_ids;
1228 
1229 		pg_color = phys_id * physcale;
1230 		pg_color += core_id * grpscale;
1231 		pg_color += ht_id * cpuscale;
1232 		pg_color += (pindex + object_pg_color) % cpuscale;
1233 
1234 #if 0
1235 		if (grpsize >= 8) {
1236 			pg_color += (pindex + object_pg_color) % grpsize;
1237 		} else {
1238 			if (grpsize <= 2) {
1239 				grpsize = 8;
1240 			} else {
1241 				/* 3->9, 4->8, 5->10, 6->12, 7->14 */
1242 				grpsize += grpsize;
1243 				if (grpsize < 8)
1244 					grpsize += grpsize;
1245 			}
1246 			pg_color += (pindex + object_pg_color) % grpsize;
1247 		}
1248 #endif
1249 	} else {
1250 		/*
1251 		 * Unknown topology, distribute things evenly.
1252 		 *
1253 		 * WARNING! In early boot, ncpus has not yet been
1254 		 *	    initialized and may be set to (1).
1255 		 */
1256 		int cpuscale;
1257 
1258 		cpuscale = PQ_L2_SIZE / ncpus;
1259 
1260 		pg_color = cpuid * cpuscale;
1261 		pg_color += (pindex + object_pg_color) % cpuscale;
1262 	}
1263 	return (pg_color & PQ_L2_MASK);
1264 }
1265 
1266 /*
1267  * Wait until BUSY can be set, then set it.  If also_m_busy is TRUE we
1268  * also wait for m->busy_count to become 0 before setting PBUSY_LOCKED.
1269  */
1270 void
1271 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
1272 				     int also_m_busy, const char *msg
1273 				     VM_PAGE_DEBUG_ARGS)
1274 {
1275 	u_int32_t busy_count;
1276 
1277 	for (;;) {
1278 		busy_count = m->busy_count;
1279 		cpu_ccfence();
1280 		if (busy_count & PBUSY_LOCKED) {
1281 			tsleep_interlock(m, 0);
1282 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1283 					  busy_count | PBUSY_WANTED)) {
1284 				atomic_set_int(&m->flags, PG_REFERENCED);
1285 				tsleep(m, PINTERLOCKED, msg, 0);
1286 			}
1287 		} else if (also_m_busy && busy_count) {
1288 			tsleep_interlock(m, 0);
1289 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1290 					  busy_count | PBUSY_WANTED)) {
1291 				atomic_set_int(&m->flags, PG_REFERENCED);
1292 				tsleep(m, PINTERLOCKED, msg, 0);
1293 			}
1294 		} else {
1295 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1296 					      busy_count | PBUSY_LOCKED)) {
1297 #ifdef VM_PAGE_DEBUG
1298 				m->busy_func = func;
1299 				m->busy_line = lineno;
1300 #endif
1301 				break;
1302 			}
1303 		}
1304 	}
1305 }
1306 
1307 /*
1308  * Attempt to set BUSY.  If also_m_busy is TRUE we only succeed if
1309  * m->busy_count is also 0.
1310  *
1311  * Returns non-zero on failure.
1312  */
1313 int
1314 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
1315 				    VM_PAGE_DEBUG_ARGS)
1316 {
1317 	u_int32_t busy_count;
1318 
1319 	for (;;) {
1320 		busy_count = m->busy_count;
1321 		cpu_ccfence();
1322 		if (busy_count & PBUSY_LOCKED)
1323 			return TRUE;
1324 		if (also_m_busy && (busy_count & PBUSY_MASK) != 0)
1325 			return TRUE;
1326 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1327 				      busy_count | PBUSY_LOCKED)) {
1328 #ifdef VM_PAGE_DEBUG
1329 				m->busy_func = func;
1330 				m->busy_line = lineno;
1331 #endif
1332 			return FALSE;
1333 		}
1334 	}
1335 }
1336 
1337 /*
1338  * Clear the BUSY flag and return non-zero to indicate to the caller
1339  * that a wakeup() should be performed.
1340  *
1341  * (inline version)
1342  */
1343 static __inline
1344 int
1345 _vm_page_wakeup(vm_page_t m)
1346 {
1347 	u_int32_t busy_count;
1348 
1349 	busy_count = m->busy_count;
1350 	cpu_ccfence();
1351 	for (;;) {
1352 		if (atomic_fcmpset_int(&m->busy_count, &busy_count,
1353 				      busy_count &
1354 				      ~(PBUSY_LOCKED | PBUSY_WANTED))) {
1355 			return((int)(busy_count & PBUSY_WANTED));
1356 		}
1357 	}
1358 	/* not reached */
1359 }
1360 
1361 /*
1362  * Clear the BUSY flag and wakeup anyone waiting for the page.  This
1363  * is typically the last call you make on a page before moving onto
1364  * other things.
1365  */
1366 void
1367 vm_page_wakeup(vm_page_t m)
1368 {
1369         KASSERT(m->busy_count & PBUSY_LOCKED,
1370 		("vm_page_wakeup: page not busy!!!"));
1371 	if (_vm_page_wakeup(m))
1372 		wakeup(m);
1373 }
1374 
1375 /*
1376  * Hold a page, preventing reuse.  This is typically only called on pages
1377  * in a known state (either held busy, special, or interlocked in some
1378  * manner).  Holding a page does not ensure that it remains valid, it only
1379  * prevents reuse.  The page must not already be on the FREE queue or in
1380  * any danger of being moved to the FREE queue concurrent with this call.
1381  *
1382  * Other parts of the system can still disassociate the page from its object
1383  * and attempt to free it, or perform read or write I/O on it and/or otherwise
1384  * manipulate the page, but if the page is held the VM system will leave the
1385  * page and its data intact and not cycle it through the FREE queue until
1386  * the last hold has been released.
1387  *
1388  * (see vm_page_wire() if you want to prevent the page from being
1389  *  disassociated from its object too).
1390  */
1391 void
1392 vm_page_hold(vm_page_t m)
1393 {
1394 	atomic_add_int(&m->hold_count, 1);
1395 	KKASSERT(m->queue - m->pc != PQ_FREE);
1396 }
1397 
1398 /*
1399  * The opposite of vm_page_hold().  If the page is on the HOLD queue
1400  * it was freed while held and must be moved back to the FREE queue.
1401  *
1402  * To avoid racing against vm_page_free*() we must re-test conditions
1403  * after obtaining the spin-lock.  The initial test can also race a
1404  * vm_page_free*() that is in the middle of moving a page to PQ_HOLD,
1405  * leaving the page on PQ_HOLD with hold_count == 0.  Rather than
1406  * throw a spin-lock in the critical path, we rely on the pageout
1407  * daemon to clean-up these loose ends.
1408  *
1409  * More critically, the 'easy movement' between queues without busying
1410  * a vm_page is only allowed for PQ_FREE<->PQ_HOLD.
1411  */
1412 void
1413 vm_page_unhold(vm_page_t m)
1414 {
1415 	KASSERT(m->hold_count > 0 && m->queue - m->pc != PQ_FREE,
1416 		("vm_page_unhold: pg %p illegal hold_count (%d) or "
1417 		 "on FREE queue (%d)",
1418 		 m, m->hold_count, m->queue - m->pc));
1419 
1420 	if (atomic_fetchadd_int(&m->hold_count, -1) == 1 &&
1421 	    m->queue - m->pc == PQ_HOLD) {
1422 		vm_page_spin_lock(m);
1423 		if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
1424 			_vm_page_queue_spin_lock(m);
1425 			_vm_page_rem_queue_spinlocked(m);
1426 			_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
1427 			_vm_page_queue_spin_unlock(m);
1428 		}
1429 		vm_page_spin_unlock(m);
1430 	}
1431 }
1432 
1433 /*
1434  * Create a fictitious page with the specified physical address and
1435  * memory attribute.  The memory attribute is the only the machine-
1436  * dependent aspect of a fictitious page that must be initialized.
1437  */
1438 void
1439 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
1440 {
1441 	/*
1442 	 * The page's memattr might have changed since the
1443 	 * previous initialization.  Update the pmap to the
1444 	 * new memattr.
1445 	 */
1446 	if ((m->flags & PG_FICTITIOUS) != 0)
1447 		goto memattr;
1448 	m->phys_addr = paddr;
1449 	m->queue = PQ_NONE;
1450 	/* Fictitious pages don't use "segind". */
1451 	/* Fictitious pages don't use "order" or "pool". */
1452 	m->flags = PG_FICTITIOUS | PG_UNQUEUED;
1453 	m->busy_count = PBUSY_LOCKED;
1454 	m->wire_count = 1;
1455 	spin_init(&m->spin, "fake_page");
1456 	pmap_page_init(m);
1457 memattr:
1458 	pmap_page_set_memattr(m, memattr);
1459 }
1460 
1461 /*
1462  * Inserts the given vm_page into the object and object list.
1463  *
1464  * The pagetables are not updated but will presumably fault the page
1465  * in if necessary, or if a kernel page the caller will at some point
1466  * enter the page into the kernel's pmap.  We are not allowed to block
1467  * here so we *can't* do this anyway.
1468  *
1469  * This routine may not block.
1470  * This routine must be called with the vm_object held.
1471  * This routine must be called with a critical section held.
1472  *
1473  * This routine returns TRUE if the page was inserted into the object
1474  * successfully, and FALSE if the page already exists in the object.
1475  */
1476 int
1477 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
1478 {
1479 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(object));
1480 	if (m->object != NULL)
1481 		panic("vm_page_insert: already inserted");
1482 
1483 	atomic_add_int(&object->generation, 1);
1484 
1485 	/*
1486 	 * Associate the VM page with an (object, offset).
1487 	 *
1488 	 * The vm_page spin lock is required for interactions with the pmap.
1489 	 * XXX vm_page_spin_lock() might not be needed for this any more.
1490 	 */
1491 	vm_page_spin_lock(m);
1492 	m->object = object;
1493 	m->pindex = pindex;
1494 	if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
1495 		m->object = NULL;
1496 		m->pindex = 0;
1497 		vm_page_spin_unlock(m);
1498 		return FALSE;
1499 	}
1500 	++object->resident_page_count;
1501 	++mycpu->gd_vmtotal.t_rm;
1502 	vm_page_spin_unlock(m);
1503 
1504 	/*
1505 	 * Since we are inserting a new and possibly dirty page,
1506 	 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
1507 	 */
1508 	if ((m->valid & m->dirty) ||
1509 	    (m->flags & (PG_WRITEABLE | PG_NEED_COMMIT)))
1510 		vm_object_set_writeable_dirty(object);
1511 
1512 	/*
1513 	 * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
1514 	 */
1515 	swap_pager_page_inserted(m);
1516 	return TRUE;
1517 }
1518 
1519 /*
1520  * Removes the given vm_page_t from the (object,index) table
1521  *
1522  * The page must be BUSY and will remain BUSY on return.
1523  * No other requirements.
1524  *
1525  * NOTE: FreeBSD side effect was to unbusy the page on return.  We leave
1526  *	 it busy.
1527  *
1528  * NOTE: Caller is responsible for any pmap disposition prior to the
1529  *	 rename (as the pmap code will not be able to find the entries
1530  *	 once the object has been disassociated).  The caller may choose
1531  *	 to leave the pmap association intact if this routine is being
1532  *	 called as part of a rename between shadowed objects.
1533  *
1534  * This routine may not block.
1535  */
1536 void
1537 vm_page_remove(vm_page_t m)
1538 {
1539 	vm_object_t object;
1540 
1541 	if (m->object == NULL) {
1542 		return;
1543 	}
1544 
1545 	if ((m->busy_count & PBUSY_LOCKED) == 0)
1546 		panic("vm_page_remove: page not busy");
1547 
1548 	object = m->object;
1549 
1550 	vm_object_hold(object);
1551 
1552 	/*
1553 	 * Remove the page from the object and update the object.
1554 	 *
1555 	 * The vm_page spin lock is required for interactions with the pmap.
1556 	 * XXX vm_page_spin_lock() might not be needed for this any more.
1557 	 */
1558 	vm_page_spin_lock(m);
1559 	vm_page_rb_tree_RB_REMOVE(&object->rb_memq, m);
1560 	--object->resident_page_count;
1561 	--mycpu->gd_vmtotal.t_rm;
1562 	m->object = NULL;
1563 	atomic_add_int(&object->generation, 1);
1564 	vm_page_spin_unlock(m);
1565 
1566 	vm_object_drop(object);
1567 }
1568 
1569 /*
1570  * Calculate the hash position for the vm_page hash heuristic.  Generally
1571  * speaking we want to localize sequential lookups to reduce memory stalls.
1572  *
1573  * Mask by ~3 to offer 4-way set-assoc
1574  */
1575 static __inline
1576 struct vm_page_hash_elm *
1577 vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
1578 {
1579 	size_t hi;
1580 
1581 	hi = iscsi_crc32(&object, sizeof(object)) << 2;
1582 	hi ^= hi >> (23 - 2);
1583 	hi += pindex * VM_PAGE_HASH_SET;
1584 #if 0
1585 	/* mix it up */
1586 	hi = (intptr_t)object ^ object->pg_color ^ pindex;
1587 	hi += object->pg_color * pindex;
1588 	hi = hi ^ (hi >> 20);
1589 #endif
1590 	hi &= vm_page_hash_size - 1;		/* bounds */
1591 
1592 	return (&vm_page_hash[hi]);
1593 }
1594 
1595 /*
1596  * Heuristical page lookup that does not require any locks.  Returns
1597  * a soft-busied page on success, NULL on failure.
1598  *
1599  * Caller must lookup the page the slow way if NULL is returned.
1600  */
1601 vm_page_t
1602 vm_page_hash_get(vm_object_t object, vm_pindex_t pindex)
1603 {
1604 	struct vm_page_hash_elm *mp;
1605 	vm_page_t m;
1606 	int i;
1607 
1608 	if (__predict_false(vm_page_hash == NULL))
1609 		return NULL;
1610 	mp = vm_page_hash_hash(object, pindex);
1611 	for (i = 0; i < VM_PAGE_HASH_SET; ++i, ++mp) {
1612 		if (mp->object != object ||
1613 		    mp->pindex != pindex) {
1614 			continue;
1615 		}
1616 		m = mp->m;
1617 		cpu_ccfence();
1618 		if (m == NULL)
1619 			continue;
1620 		if (m->object != object || m->pindex != pindex)
1621 			continue;
1622 		if (vm_page_sbusy_try(m))
1623 			continue;
1624 		if (m->object == object && m->pindex == pindex) {
1625 			/*
1626 			 * On-match optimization - do not update ticks
1627 			 * unless we have to (reduce cache coherency traffic)
1628 			 */
1629 			if (mp->ticks != ticks)
1630 				mp->ticks = ticks;
1631 			return m;
1632 		}
1633 		vm_page_sbusy_drop(m);
1634 	}
1635 	return NULL;
1636 }
1637 
1638 /*
1639  * Enter page onto vm_page_hash[].  This is a heuristic, SMP collisions
1640  * are allowed.
1641  */
1642 static __inline
1643 void
1644 vm_page_hash_enter(vm_page_t m)
1645 {
1646 	struct vm_page_hash_elm *mp;
1647 	struct vm_page_hash_elm *best;
1648 	vm_object_t object;
1649 	vm_pindex_t pindex;
1650 	int best_delta;
1651 	int delta;
1652 	int i;
1653 
1654 	/*
1655 	 * Only enter type-stable vm_pages with well-shared objects.
1656 	 */
1657 	if ((m->flags & PG_MAPPEDMULTI) == 0)
1658 		return;
1659 	if (__predict_false(vm_page_hash == NULL ||
1660 			    m < &vm_page_array[0] ||
1661 			    m >= &vm_page_array[vm_page_array_size])) {
1662 		return;
1663 	}
1664 	if (__predict_false(m->object == NULL))
1665 		return;
1666 #if 0
1667 	/*
1668 	 * Disabled at the moment, there are some degenerate conditions
1669 	 * with often-exec'd programs that get ignored.  In particular,
1670 	 * the kernel's elf loader does a vn_rdwr() on the first page of
1671 	 * a binary.
1672 	 */
1673 	if (m->object->ref_count <= 2 || (m->object->flags & OBJ_ONEMAPPING))
1674 		return;
1675 #endif
1676 	if (vm_page_hash_vnode_only && m->object->type != OBJT_VNODE)
1677 		return;
1678 
1679 	/*
1680 	 * Find best entry
1681 	 */
1682 	object = m->object;
1683 	pindex = m->pindex;
1684 
1685 	mp = vm_page_hash_hash(object, pindex);
1686 	best = mp;
1687 	best_delta = ticks - best->ticks;
1688 
1689 	for (i = 0; i < VM_PAGE_HASH_SET; ++i, ++mp) {
1690 		if (mp->m == m &&
1691 		    mp->object == object &&
1692 		    mp->pindex == pindex) {
1693 			/*
1694 			 * On-match optimization - do not update ticks
1695 			 * unless we have to (reduce cache coherency traffic)
1696 			 */
1697 			if (mp->ticks != ticks)
1698 				mp->ticks = ticks;
1699 			return;
1700 		}
1701 
1702 		/*
1703 		 * The best choice is the oldest entry.
1704 		 *
1705 		 * Also check for a field overflow, using -1 instead of 0
1706 		 * to deal with SMP races on accessing the 'ticks' global.
1707 		 */
1708 		delta = ticks - mp->ticks;
1709 		if (delta < -1)
1710 			best = mp;
1711 		if (best_delta < delta)
1712 			best = mp;
1713 	}
1714 
1715 	/*
1716 	 * Load the entry.  Copy a few elements to the hash entry itself
1717 	 * to reduce memory stalls due to memory indirects on lookups.
1718 	 */
1719 	best->m = m;
1720 	best->object = object;
1721 	best->pindex = pindex;
1722 	best->ticks = ticks;
1723 }
1724 
1725 /*
1726  * Locate and return the page at (object, pindex), or NULL if the
1727  * page could not be found.
1728  *
1729  * The caller must hold the vm_object token.
1730  */
1731 vm_page_t
1732 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
1733 {
1734 	vm_page_t m;
1735 
1736 	/*
1737 	 * Search the hash table for this object/offset pair
1738 	 */
1739 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1740 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1741 	if (m) {
1742 		KKASSERT(m->object == object && m->pindex == pindex);
1743 		vm_page_hash_enter(m);
1744 	}
1745 	return(m);
1746 }
1747 
1748 vm_page_t
1749 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
1750 					    vm_pindex_t pindex,
1751 					    int also_m_busy, const char *msg
1752 					    VM_PAGE_DEBUG_ARGS)
1753 {
1754 	u_int32_t busy_count;
1755 	vm_page_t m;
1756 
1757 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1758 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1759 	while (m) {
1760 		KKASSERT(m->object == object && m->pindex == pindex);
1761 		busy_count = m->busy_count;
1762 		cpu_ccfence();
1763 		if (busy_count & PBUSY_LOCKED) {
1764 			tsleep_interlock(m, 0);
1765 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1766 					  busy_count | PBUSY_WANTED)) {
1767 				atomic_set_int(&m->flags, PG_REFERENCED);
1768 				tsleep(m, PINTERLOCKED, msg, 0);
1769 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1770 							      pindex);
1771 			}
1772 		} else if (also_m_busy && busy_count) {
1773 			tsleep_interlock(m, 0);
1774 			if (atomic_cmpset_int(&m->busy_count, busy_count,
1775 					  busy_count | PBUSY_WANTED)) {
1776 				atomic_set_int(&m->flags, PG_REFERENCED);
1777 				tsleep(m, PINTERLOCKED, msg, 0);
1778 				m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
1779 							      pindex);
1780 			}
1781 		} else if (atomic_cmpset_int(&m->busy_count, busy_count,
1782 					     busy_count | PBUSY_LOCKED)) {
1783 #ifdef VM_PAGE_DEBUG
1784 			m->busy_func = func;
1785 			m->busy_line = lineno;
1786 #endif
1787 			vm_page_hash_enter(m);
1788 			break;
1789 		}
1790 	}
1791 	return m;
1792 }
1793 
1794 /*
1795  * Attempt to lookup and busy a page.
1796  *
1797  * Returns NULL if the page could not be found
1798  *
1799  * Returns a vm_page and error == TRUE if the page exists but could not
1800  * be busied.
1801  *
1802  * Returns a vm_page and error == FALSE on success.
1803  */
1804 vm_page_t
1805 VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
1806 					   vm_pindex_t pindex,
1807 					   int also_m_busy, int *errorp
1808 					   VM_PAGE_DEBUG_ARGS)
1809 {
1810 	u_int32_t busy_count;
1811 	vm_page_t m;
1812 
1813 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1814 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1815 	*errorp = FALSE;
1816 	while (m) {
1817 		KKASSERT(m->object == object && m->pindex == pindex);
1818 		busy_count = m->busy_count;
1819 		cpu_ccfence();
1820 		if (busy_count & PBUSY_LOCKED) {
1821 			*errorp = TRUE;
1822 			break;
1823 		}
1824 		if (also_m_busy && busy_count) {
1825 			*errorp = TRUE;
1826 			break;
1827 		}
1828 		if (atomic_cmpset_int(&m->busy_count, busy_count,
1829 				      busy_count | PBUSY_LOCKED)) {
1830 #ifdef VM_PAGE_DEBUG
1831 			m->busy_func = func;
1832 			m->busy_line = lineno;
1833 #endif
1834 			vm_page_hash_enter(m);
1835 			break;
1836 		}
1837 	}
1838 	return m;
1839 }
1840 
1841 /*
1842  * Returns a page that is only soft-busied for use by the caller in
1843  * a read-only fashion.  Returns NULL if the page could not be found,
1844  * the soft busy could not be obtained, or the page data is invalid.
1845  *
1846  * XXX Doesn't handle PG_FICTITIOUS pages at the moment, but there is
1847  *     no reason why we couldn't.
1848  */
1849 vm_page_t
1850 vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex,
1851 			 int pgoff, int pgbytes)
1852 {
1853 	vm_page_t m;
1854 
1855 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1856 	m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
1857 	if (m) {
1858 		if ((m->valid != VM_PAGE_BITS_ALL &&
1859 		     !vm_page_is_valid(m, pgoff, pgbytes)) ||
1860 		    (m->flags & PG_FICTITIOUS)) {
1861 			m = NULL;
1862 		} else if (vm_page_sbusy_try(m)) {
1863 			m = NULL;
1864 		} else if ((m->valid != VM_PAGE_BITS_ALL &&
1865 			    !vm_page_is_valid(m, pgoff, pgbytes)) ||
1866 			   (m->flags & PG_FICTITIOUS)) {
1867 			vm_page_sbusy_drop(m);
1868 			m = NULL;
1869 		} else {
1870 			vm_page_hash_enter(m);
1871 		}
1872 	}
1873 	return m;
1874 }
1875 
1876 /*
1877  * Caller must hold the related vm_object
1878  */
1879 vm_page_t
1880 vm_page_next(vm_page_t m)
1881 {
1882 	vm_page_t next;
1883 
1884 	next = vm_page_rb_tree_RB_NEXT(m);
1885 	if (next && next->pindex != m->pindex + 1)
1886 		next = NULL;
1887 	return (next);
1888 }
1889 
1890 /*
1891  * vm_page_rename()
1892  *
1893  * Move the given vm_page from its current object to the specified
1894  * target object/offset.  The page must be busy and will remain so
1895  * on return.
1896  *
1897  * new_object must be held.
1898  * This routine might block. XXX ?
1899  *
1900  * NOTE: Swap associated with the page must be invalidated by the move.  We
1901  *       have to do this for several reasons:  (1) we aren't freeing the
1902  *       page, (2) we are dirtying the page, (3) the VM system is probably
1903  *       moving the page from object A to B, and will then later move
1904  *       the backing store from A to B and we can't have a conflict.
1905  *
1906  * NOTE: We *always* dirty the page.  It is necessary both for the
1907  *       fact that we moved it, and because we may be invalidating
1908  *	 swap.  If the page is on the cache, we have to deactivate it
1909  *	 or vm_page_dirty() will panic.  Dirty pages are not allowed
1910  *	 on the cache.
1911  *
1912  * NOTE: Caller is responsible for any pmap disposition prior to the
1913  *	 rename (as the pmap code will not be able to find the entries
1914  *	 once the object has been disassociated or changed).  Nominally
1915  *	 the caller is moving a page between shadowed objects and so the
1916  *	 pmap association is retained without having to remove the page
1917  *	 from it.
1918  */
1919 void
1920 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
1921 {
1922 	KKASSERT(m->busy_count & PBUSY_LOCKED);
1923 	ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
1924 	if (m->object) {
1925 		ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
1926 		vm_page_remove(m);
1927 	}
1928 	if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
1929 		panic("vm_page_rename: target exists (%p,%"PRIu64")",
1930 		      new_object, new_pindex);
1931 	}
1932 	if (m->queue - m->pc == PQ_CACHE)
1933 		vm_page_deactivate(m);
1934 	vm_page_dirty(m);
1935 }
1936 
1937 /*
1938  * vm_page_unqueue() without any wakeup.  This routine is used when a page
1939  * is to remain BUSYied by the caller.
1940  *
1941  * This routine may not block.
1942  */
1943 void
1944 vm_page_unqueue_nowakeup(vm_page_t m)
1945 {
1946 	vm_page_and_queue_spin_lock(m);
1947 	(void)_vm_page_rem_queue_spinlocked(m);
1948 	vm_page_spin_unlock(m);
1949 }
1950 
1951 /*
1952  * vm_page_unqueue() - Remove a page from its queue, wakeup the pagedemon
1953  * if necessary.
1954  *
1955  * This routine may not block.
1956  */
1957 void
1958 vm_page_unqueue(vm_page_t m)
1959 {
1960 	u_short queue;
1961 
1962 	vm_page_and_queue_spin_lock(m);
1963 	queue = _vm_page_rem_queue_spinlocked(m);
1964 	if (queue == PQ_FREE || queue == PQ_CACHE) {
1965 		vm_page_spin_unlock(m);
1966 		pagedaemon_wakeup();
1967 	} else {
1968 		vm_page_spin_unlock(m);
1969 	}
1970 }
1971 
1972 /*
1973  * vm_page_list_find()
1974  *
1975  * Find a page on the specified queue with color optimization.
1976  *
1977  * The page coloring optimization attempts to locate a page that does
1978  * not overload other nearby pages in the object in the cpu's L1 or L2
1979  * caches.  We need this optimization because cpu caches tend to be
1980  * physical caches, while object spaces tend to be virtual.
1981  *
1982  * The page coloring optimization also, very importantly, tries to localize
1983  * memory to cpus and physical sockets.
1984  *
1985  * Each PQ_FREE and PQ_CACHE color queue has its own spinlock and the
1986  * algorithm is adjusted to localize allocations on a per-core basis.
1987  * This is done by 'twisting' the colors.
1988  *
1989  * The page is returned spinlocked and removed from its queue (it will
1990  * be on PQ_NONE), or NULL. The page is not BUSY'd.  The caller
1991  * is responsible for dealing with the busy-page case (usually by
1992  * deactivating the page and looping).
1993  *
1994  * NOTE:  This routine is carefully inlined.  A non-inlined version
1995  *	  is available for outside callers but the only critical path is
1996  *	  from within this source file.
1997  *
1998  * NOTE:  This routine assumes that the vm_pages found in PQ_CACHE and PQ_FREE
1999  *	  represent stable storage, allowing us to order our locks vm_page
2000  *	  first, then queue.
2001  *
2002  * WARNING! The returned page is not busied and may race other busying
2003  *	  operations, callers must check that the page is in the state they
2004  *	  want after busying.
2005  */
2006 static __inline
2007 vm_page_t
2008 _vm_page_list_find(int basequeue, int index)
2009 {
2010 	struct vpgqueues *pq;
2011 	vm_page_t m;
2012 
2013 	index &= PQ_L2_MASK;
2014 	pq = &vm_page_queues[basequeue + index];
2015 
2016 	/*
2017 	 * Try this cpu's colored queue first.  Test for a page unlocked,
2018 	 * then lock the queue and locate a page.  Note that the lock order
2019 	 * is reversed, but we do not want to dwadle on the page spinlock
2020 	 * anyway as it is held significantly longer than the queue spinlock.
2021 	 */
2022 	if (TAILQ_FIRST(&pq->pl)) {
2023 		spin_lock(&pq->spin);
2024 		TAILQ_FOREACH(m, &pq->pl, pageq) {
2025 			if (spin_trylock(&m->spin) == 0)
2026 				continue;
2027 			KKASSERT(m->queue == basequeue + index);
2028 			pq->lastq = -1;
2029 			return(m);
2030 		}
2031 		spin_unlock(&pq->spin);
2032 	}
2033 
2034 	m = _vm_page_list_find_wide(basequeue, index, &pq->lastq);
2035 
2036 	return(m);
2037 }
2038 
2039 /*
2040  * If we could not find the page in the desired queue try to find it in
2041  * a nearby (NUMA-aware) queue, spreading out as we go.
2042  */
2043 static vm_page_t
2044 _vm_page_list_find_wide(int basequeue, int index, int *lastp)
2045 {
2046 	struct vpgqueues *pq;
2047 	vm_page_t m = NULL;
2048 	int pqmask = set_assoc_mask >> 1;
2049 	int pqi;
2050 	int range;
2051 	int skip_start;
2052 	int skip_next;
2053 	int count;
2054 
2055 	/*
2056 	 * Avoid re-searching empty queues over and over again skip to
2057 	 * pq->last if appropriate.
2058 	 */
2059 	if (*lastp >= 0)
2060 		index = *lastp;
2061 
2062 	index &= PQ_L2_MASK;
2063 	pq = &vm_page_queues[basequeue];
2064 	count = 0;
2065 	skip_start = -1;
2066 	skip_next = -1;
2067 
2068 	/*
2069 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2070 	 * else fails (PQ_L2_MASK).
2071 	 *
2072 	 * pqmask is a mask, 15, 31, 63, etc.
2073 	 *
2074 	 * Test each queue unlocked first, then lock the queue and locate
2075 	 * a page.  Note that the lock order is reversed, but we do not want
2076 	 * to dwadle on the page spinlock anyway as it is held significantly
2077 	 * longer than the queue spinlock.
2078 	 */
2079 	do {
2080 		pqmask = (pqmask << 1) | 1;
2081 
2082 		pqi = index;
2083 		range = pqmask + 1;
2084 
2085 		while (range > 0) {
2086 			if (pqi >= skip_start && pqi < skip_next) {
2087 				range -= skip_next - pqi;
2088 				pqi = (pqi & ~pqmask) | (skip_next & pqmask);
2089 			}
2090 			if (range > 0 && TAILQ_FIRST(&pq[pqi].pl)) {
2091 				spin_lock(&pq[pqi].spin);
2092 				TAILQ_FOREACH(m, &pq[pqi].pl, pageq) {
2093 					if (spin_trylock(&m->spin) == 0)
2094 						continue;
2095 					KKASSERT(m->queue == basequeue + pqi);
2096 
2097 					/*
2098 					 * If we had to wander too far, set
2099 					 * *lastp to skip past empty queues.
2100 					 */
2101 					if (count >= 8)
2102 						*lastp = pqi & PQ_L2_MASK;
2103 					return(m);
2104 				}
2105 				spin_unlock(&pq[pqi].spin);
2106 			}
2107 			--range;
2108 			++count;
2109 			pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
2110 		}
2111 		skip_start = pqi & ~pqmask;
2112 		skip_next = (pqi | pqmask) + 1;
2113 	} while (pqmask != PQ_L2_MASK);
2114 
2115 	return(m);
2116 }
2117 
2118 static __inline
2119 vm_page_t
2120 _vm_page_list_find2(int bq1, int bq2, int index)
2121 {
2122 	struct vpgqueues *pq1;
2123 	struct vpgqueues *pq2;
2124 	vm_page_t m;
2125 
2126 	index &= PQ_L2_MASK;
2127 	pq1 = &vm_page_queues[bq1 + index];
2128 	pq2 = &vm_page_queues[bq2 + index];
2129 
2130 	/*
2131 	 * Try this cpu's colored queue first.  Test for a page unlocked,
2132 	 * then lock the queue and locate a page.  Note that the lock order
2133 	 * is reversed, but we do not want to dwadle on the page spinlock
2134 	 * anyway as it is held significantly longer than the queue spinlock.
2135 	 */
2136 	if (TAILQ_FIRST(&pq1->pl)) {
2137 		spin_lock(&pq1->spin);
2138 		TAILQ_FOREACH(m, &pq1->pl, pageq) {
2139 			if (spin_trylock(&m->spin) == 0)
2140 				continue;
2141 			KKASSERT(m->queue == bq1 + index);
2142 			pq1->lastq = -1;
2143 			pq2->lastq = -1;
2144 			return(m);
2145 		}
2146 		spin_unlock(&pq1->spin);
2147 	}
2148 
2149 	m = _vm_page_list_find2_wide(bq1, bq2, index, &pq1->lastq, &pq2->lastq);
2150 
2151 	return(m);
2152 }
2153 
2154 
2155 /*
2156  * This version checks two queues at the same time, widening its search
2157  * as we progress.  prefering basequeue1
2158  * and starting on basequeue2 after exhausting the first set.  The idea
2159  * is to try to stay localized to the cpu.
2160  */
2161 static vm_page_t
2162 _vm_page_list_find2_wide(int basequeue1, int basequeue2, int index,
2163 			 int *lastp1, int *lastp2)
2164 {
2165 	struct vpgqueues *pq1;
2166 	struct vpgqueues *pq2;
2167 	vm_page_t m = NULL;
2168 	int pqmask1, pqmask2;
2169 	int pqi;
2170 	int range;
2171 	int skip_start1, skip_start2;
2172 	int skip_next1, skip_next2;
2173 	int count1, count2;
2174 
2175 	/*
2176 	 * Avoid re-searching empty queues over and over again skip to
2177 	 * pq->last if appropriate.
2178 	 */
2179 	if (*lastp1 >= 0)
2180 		index = *lastp1;
2181 
2182 	index &= PQ_L2_MASK;
2183 
2184 	pqmask1 = set_assoc_mask >> 1;
2185 	pq1 = &vm_page_queues[basequeue1];
2186 	count1 = 0;
2187 	skip_start1 = -1;
2188 	skip_next1 = -1;
2189 
2190 	pqmask2 = set_assoc_mask >> 1;
2191 	pq2 = &vm_page_queues[basequeue2];
2192 	count2 = 0;
2193 	skip_start2 = -1;
2194 	skip_next2 = -1;
2195 
2196 	/*
2197 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
2198 	 * else fails (PQ_L2_MASK).
2199 	 *
2200 	 * pqmask is a mask, 15, 31, 63, etc.
2201 	 *
2202 	 * Test each queue unlocked first, then lock the queue and locate
2203 	 * a page.  Note that the lock order is reversed, but we do not want
2204 	 * to dwadle on the page spinlock anyway as it is held significantly
2205 	 * longer than the queue spinlock.
2206 	 */
2207 	do {
2208 		if (pqmask1 == PQ_L2_MASK)
2209 			goto skip2;
2210 
2211 		pqmask1 = (pqmask1 << 1) | 1;
2212 		pqi = index;
2213 		range = pqmask1 + 1;
2214 
2215 		while (range > 0) {
2216 			if (pqi >= skip_start1 && pqi < skip_next1) {
2217 				range -= skip_next1 - pqi;
2218 				pqi = (pqi & ~pqmask1) | (skip_next1 & pqmask1);
2219 			}
2220 			if (range > 0 && TAILQ_FIRST(&pq1[pqi].pl)) {
2221 				spin_lock(&pq1[pqi].spin);
2222 				TAILQ_FOREACH(m, &pq1[pqi].pl, pageq) {
2223 					if (spin_trylock(&m->spin) == 0)
2224 						continue;
2225 					KKASSERT(m->queue == basequeue1 + pqi);
2226 
2227 					/*
2228 					 * If we had to wander too far, set
2229 					 * *lastp to skip past empty queues.
2230 					 */
2231 					if (count1 >= 8)
2232 						*lastp1 = pqi & PQ_L2_MASK;
2233 					return(m);
2234 				}
2235 				spin_unlock(&pq1[pqi].spin);
2236 			}
2237 			--range;
2238 			++count1;
2239 			pqi = (pqi & ~pqmask1) | ((pqi + 1) & pqmask1);
2240 		}
2241 		skip_start1 = pqi & ~pqmask1;
2242 		skip_next1 = (pqi | pqmask1) + 1;
2243 skip2:
2244 		if (pqmask1 < ((set_assoc_mask << 1) | 1))
2245 			continue;
2246 
2247 		pqmask2 = (pqmask2 << 1) | 1;
2248 		pqi = index;
2249 		range = pqmask2 + 1;
2250 
2251 		while (range > 0) {
2252 			if (pqi >= skip_start2 && pqi < skip_next2) {
2253 				range -= skip_next2 - pqi;
2254 				pqi = (pqi & ~pqmask2) | (skip_next2 & pqmask2);
2255 			}
2256 			if (range > 0 && TAILQ_FIRST(&pq2[pqi].pl)) {
2257 				spin_lock(&pq2[pqi].spin);
2258 				TAILQ_FOREACH(m, &pq2[pqi].pl, pageq) {
2259 					if (spin_trylock(&m->spin) == 0)
2260 						continue;
2261 					KKASSERT(m->queue == basequeue2 + pqi);
2262 
2263 					/*
2264 					 * If we had to wander too far, set
2265 					 * *lastp to skip past empty queues.
2266 					 */
2267 					if (count2 >= 8)
2268 						*lastp2 = pqi & PQ_L2_MASK;
2269 					return(m);
2270 				}
2271 				spin_unlock(&pq2[pqi].spin);
2272 			}
2273 			--range;
2274 			++count2;
2275 			pqi = (pqi & ~pqmask2) | ((pqi + 1) & pqmask2);
2276 		}
2277 		skip_start2 = pqi & ~pqmask2;
2278 		skip_next2 = (pqi | pqmask2) + 1;
2279 	} while (pqmask1 != PQ_L2_MASK && pqmask2 != PQ_L2_MASK);
2280 
2281 	return(m);
2282 }
2283 
2284 /*
2285  * Returns a vm_page candidate for allocation.  The page is not busied so
2286  * it can move around.  The caller must busy the page (and typically
2287  * deactivate it if it cannot be busied!)
2288  *
2289  * Returns a spinlocked vm_page that has been removed from its queue.
2290  * (note that _vm_page_list_find() does not remove the page from its
2291  *  queue).
2292  */
2293 vm_page_t
2294 vm_page_list_find(int basequeue, int index)
2295 {
2296 	vm_page_t m;
2297 
2298 	m = _vm_page_list_find(basequeue, index);
2299 	if (m)
2300 		_vm_page_rem_queue_spinlocked(m);
2301 	return m;
2302 }
2303 
2304 /*
2305  * Find a page on the cache queue with color optimization, remove it
2306  * from the queue, and busy it.  The returned page will not be spinlocked.
2307  *
2308  * A candidate failure will be deactivated.  Candidates can fail due to
2309  * being busied by someone else, in which case they will be deactivated.
2310  *
2311  * This routine may not block.
2312  *
2313  */
2314 static vm_page_t
2315 vm_page_select_cache(u_short pg_color)
2316 {
2317 	vm_page_t m;
2318 
2319 	for (;;) {
2320 		m = _vm_page_list_find(PQ_CACHE, pg_color);
2321 		if (m == NULL)
2322 			break;
2323 		/*
2324 		 * (m) has been spinlocked
2325 		 */
2326 		_vm_page_rem_queue_spinlocked(m);
2327 		if (vm_page_busy_try(m, TRUE)) {
2328 			_vm_page_deactivate_locked(m, 0);
2329 			vm_page_spin_unlock(m);
2330 		} else {
2331 			/*
2332 			 * We successfully busied the page.  This can race
2333 			 * vm_page_lookup() + busy ops so make sure the
2334 			 * page is in the state we want.
2335 			 */
2336 			if ((m->flags & (PG_NEED_COMMIT | PG_MAPPED)) == 0 &&
2337 			    m->hold_count == 0 &&
2338 			    m->wire_count == 0 &&
2339 			    (m->dirty & m->valid) == 0) {
2340 				vm_page_spin_unlock(m);
2341 				KKASSERT((m->flags & PG_UNQUEUED) == 0);
2342 				pagedaemon_wakeup();
2343 				return(m);
2344 			}
2345 
2346 			/*
2347 			 * The page cannot be recycled, deactivate it.
2348 			 */
2349 			_vm_page_deactivate_locked(m, 0);
2350 			if (_vm_page_wakeup(m)) {
2351 				vm_page_spin_unlock(m);
2352 				wakeup(m);
2353 			} else {
2354 				vm_page_spin_unlock(m);
2355 			}
2356 		}
2357 	}
2358 	return (m);
2359 }
2360 
2361 /*
2362  * Find a free page.  We attempt to inline the nominal case and fall back
2363  * to _vm_page_select_free() otherwise.  A busied page is removed from
2364  * the queue and returned.
2365  *
2366  * This routine may not block.
2367  */
2368 static __inline vm_page_t
2369 vm_page_select_free(u_short pg_color)
2370 {
2371 	vm_page_t m;
2372 
2373 	for (;;) {
2374 		m = _vm_page_list_find(PQ_FREE, pg_color);
2375 		if (m == NULL)
2376 			break;
2377 		_vm_page_rem_queue_spinlocked(m);
2378 		if (vm_page_busy_try(m, TRUE)) {
2379 			/*
2380 			 * Various mechanisms such as a pmap_collect can
2381 			 * result in a busy page on the free queue.  We
2382 			 * have to move the page out of the way so we can
2383 			 * retry the allocation.  If the other thread is not
2384 			 * allocating the page then m->valid will remain 0 and
2385 			 * the pageout daemon will free the page later on.
2386 			 *
2387 			 * Since we could not busy the page, however, we
2388 			 * cannot make assumptions as to whether the page
2389 			 * will be allocated by the other thread or not,
2390 			 * so all we can do is deactivate it to move it out
2391 			 * of the way.  In particular, if the other thread
2392 			 * wires the page it may wind up on the inactive
2393 			 * queue and the pageout daemon will have to deal
2394 			 * with that case too.
2395 			 */
2396 			_vm_page_deactivate_locked(m, 0);
2397 			vm_page_spin_unlock(m);
2398 		} else {
2399 			/*
2400 			 * Theoretically if we are able to busy the page
2401 			 * atomic with the queue removal (using the vm_page
2402 			 * lock) nobody else should have been able to mess
2403 			 * with the page before us.
2404 			 *
2405 			 * Assert the page state.  Note that even though
2406 			 * wiring doesn't adjust queues, a page on the free
2407 			 * queue should never be wired at this point.
2408 			 */
2409 			KKASSERT((m->flags & (PG_UNQUEUED |
2410 					      PG_NEED_COMMIT)) == 0);
2411 			KASSERT(m->hold_count == 0,
2412 				("m->hold_count is not zero "
2413 				 "pg %p q=%d flags=%08x hold=%d wire=%d",
2414 				 m, m->queue, m->flags,
2415 				 m->hold_count, m->wire_count));
2416 			KKASSERT(m->wire_count == 0);
2417 			vm_page_spin_unlock(m);
2418 			pagedaemon_wakeup();
2419 
2420 			/* return busied and removed page */
2421 			return(m);
2422 		}
2423 	}
2424 	return(m);
2425 }
2426 
2427 static __inline vm_page_t
2428 vm_page_select_free_or_cache(u_short pg_color, int *fromcachep)
2429 {
2430 	vm_page_t m;
2431 
2432 	*fromcachep = 0;
2433 	for (;;) {
2434 		m = _vm_page_list_find2(PQ_FREE, PQ_CACHE, pg_color);
2435 		if (m == NULL)
2436 			break;
2437 		if (vm_page_busy_try(m, TRUE)) {
2438 			_vm_page_rem_queue_spinlocked(m);
2439 			_vm_page_deactivate_locked(m, 0);
2440 			vm_page_spin_unlock(m);
2441 		} else if (m->queue - m->pc == PQ_FREE) {
2442 			/*
2443 			 * We successfully busied the page, PQ_FREE case
2444 			 */
2445 			_vm_page_rem_queue_spinlocked(m);
2446 			KKASSERT((m->flags & (PG_UNQUEUED |
2447 					      PG_NEED_COMMIT)) == 0);
2448 			KASSERT(m->hold_count == 0,
2449 				("m->hold_count is not zero "
2450 				 "pg %p q=%d flags=%08x hold=%d wire=%d",
2451 				 m, m->queue, m->flags,
2452 				 m->hold_count, m->wire_count));
2453 			KKASSERT(m->wire_count == 0);
2454 			vm_page_spin_unlock(m);
2455 			pagedaemon_wakeup();
2456 
2457 			/* return busied and removed page */
2458 			return(m);
2459 		} else {
2460 			/*
2461 			 * We successfully busied the page, PQ_CACHE case
2462 			 *
2463 			 * This can race vm_page_lookup() + busy ops, so make
2464 			 * sure the page is in the state we want.
2465 			 */
2466 			_vm_page_rem_queue_spinlocked(m);
2467 			if ((m->flags & (PG_NEED_COMMIT | PG_MAPPED)) == 0 &&
2468 			    m->hold_count == 0 &&
2469 			    m->wire_count == 0 &&
2470 			    (m->dirty & m->valid) == 0) {
2471 				vm_page_spin_unlock(m);
2472 				KKASSERT((m->flags & PG_UNQUEUED) == 0);
2473 				pagedaemon_wakeup();
2474 				*fromcachep = 1;
2475 				return(m);
2476 			}
2477 
2478 			/*
2479 			 * The page cannot be recycled, deactivate it.
2480 			 */
2481 			_vm_page_deactivate_locked(m, 0);
2482 			if (_vm_page_wakeup(m)) {
2483 				vm_page_spin_unlock(m);
2484 				wakeup(m);
2485 			} else {
2486 				vm_page_spin_unlock(m);
2487 			}
2488 		}
2489 	}
2490 	return(m);
2491 }
2492 
2493 /*
2494  * vm_page_alloc()
2495  *
2496  * Allocate and return a memory cell associated with this VM object/offset
2497  * pair.  If object is NULL an unassociated page will be allocated.
2498  *
2499  * The returned page will be busied and removed from its queues.  This
2500  * routine can block and may return NULL if a race occurs and the page
2501  * is found to already exist at the specified (object, pindex).
2502  *
2503  *	VM_ALLOC_NORMAL		- Allow use of cache pages, nominal free drain
2504  *	VM_ALLOC_QUICK		- Like normal but cannot use cache
2505  *	VM_ALLOC_SYSTEM		- Greater free drain
2506  *	VM_ALLOC_INTERRUPT	- Allow free list to be completely drained
2507  *
2508  *	VM_ALLOC_CPU(n)		- Allocate using specified cpu localization
2509  *
2510  *	VM_ALLOC_ZERO		- Zero the page if we have to allocate it.
2511  *				  (vm_page_grab() and vm_page_alloczwq() ONLY!)
2512  *
2513  *	VM_ALLOC_FORCE_ZERO	- Zero the page unconditionally.
2514  *				  (vm_page_grab() and vm_page_alloczwq() ONLY!)
2515  *
2516  *	VM_ALLOC_NULL_OK	- Return NULL on insertion collision, else
2517  *				  panic on insertion collisions.
2518  *				  (vm_page_grab() and vm_page_alloczwq() ONLY!)
2519  *
2520  * The object must be held if not NULL
2521  *
2522  * This routine may not block
2523  *
2524  * Additional special handling is required when called from an interrupt
2525  * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
2526  * in this case.
2527  */
2528 vm_page_t
2529 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
2530 {
2531 	globaldata_t gd;
2532 	vm_object_t obj;
2533 	vm_page_t m;
2534 	u_short pg_color;
2535 	int cpuid_local;
2536 	int fromcache;
2537 
2538 #if 0
2539 	/*
2540 	 * Special per-cpu free VM page cache.  The pages are pre-busied
2541 	 * and pre-zerod for us.
2542 	 */
2543 	if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
2544 		crit_enter_gd(gd);
2545 		if (gd->gd_vmpg_count) {
2546 			m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
2547 			crit_exit_gd(gd);
2548 			goto done;
2549                 }
2550 		crit_exit_gd(gd);
2551         }
2552 #endif
2553 	m = NULL;
2554 
2555 	/*
2556 	 * CPU LOCALIZATION
2557 	 *
2558 	 * CPU localization algorithm.  Break the page queues up by physical
2559 	 * id and core id (note that two cpu threads will have the same core
2560 	 * id, and core_id != gd_cpuid).
2561 	 *
2562 	 * This is nowhere near perfect, for example the last pindex in a
2563 	 * subgroup will overflow into the next cpu or package.  But this
2564 	 * should get us good page reuse locality in heavy mixed loads.
2565 	 *
2566 	 * (may be executed before the APs are started, so other GDs might
2567 	 *  not exist!)
2568 	 */
2569 	if (page_req & VM_ALLOC_CPU_SPEC)
2570 		cpuid_local = VM_ALLOC_GETCPU(page_req);
2571 	else
2572 		cpuid_local = mycpu->gd_cpuid;
2573 
2574 	pg_color = vm_get_pg_color(cpuid_local, object, pindex);
2575 
2576 	KKASSERT(page_req & (VM_ALLOC_NORMAL | VM_ALLOC_QUICK |
2577 			     VM_ALLOC_INTERRUPT | VM_ALLOC_SYSTEM));
2578 
2579 	/*
2580 	 * Certain system threads (pageout daemon, buf_daemon's) are
2581 	 * allowed to eat deeper into the free page list.
2582 	 */
2583 	if (curthread->td_flags & TDF_SYSTHREAD)
2584 		page_req |= VM_ALLOC_SYSTEM;
2585 
2586 	/*
2587 	 * To avoid live-locks only compare against v_free_reserved.  The
2588 	 * pageout daemon has extra tests for this.
2589 	 */
2590 loop:
2591 	gd = mycpu;
2592 	if (gd->gd_vmstats.v_free_count >= gd->gd_vmstats.v_free_reserved ||
2593 	    ((page_req & VM_ALLOC_INTERRUPT) &&
2594 	     gd->gd_vmstats.v_free_count > 0) ||
2595 	    ((page_req & VM_ALLOC_SYSTEM) &&
2596 	     gd->gd_vmstats.v_cache_count == 0 &&
2597 	     gd->gd_vmstats.v_free_count >
2598 	     gd->gd_vmstats.v_interrupt_free_min)
2599 	) {
2600 		/*
2601 		 * The free queue has sufficient free pages to take one out.
2602 		 *
2603 		 * However, if the free queue is strained the scan may widen
2604 		 * to the entire queue and cause a great deal of SMP
2605 		 * contention, so we use a double-queue-scan if we can
2606 		 * to avoid this.
2607 		 */
2608 		if (page_req & VM_ALLOC_NORMAL) {
2609 			m = vm_page_select_free_or_cache(pg_color, &fromcache);
2610 			if (m && fromcache)
2611 				goto found_cache;
2612 		} else {
2613 			m = vm_page_select_free(pg_color);
2614 		}
2615 	} else if (page_req & VM_ALLOC_NORMAL) {
2616 		/*
2617 		 * Allocatable from the cache (non-interrupt only).  On
2618 		 * success, we must free the page and try again, thus
2619 		 * ensuring that vmstats.v_*_free_min counters are replenished.
2620 		 */
2621 #ifdef INVARIANTS
2622 		if (curthread->td_preempted) {
2623 			kprintf("vm_page_alloc(): warning, attempt to allocate"
2624 				" cache page from preempting interrupt\n");
2625 			m = NULL;
2626 		} else {
2627 			m = vm_page_select_cache(pg_color);
2628 		}
2629 #else
2630 		m = vm_page_select_cache(pg_color);
2631 #endif
2632 		/*
2633 		 * On success move the page into the free queue and loop.
2634 		 *
2635 		 * Only do this if we can safely acquire the vm_object lock,
2636 		 * because this is effectively a random page and the caller
2637 		 * might be holding the lock shared, we don't want to
2638 		 * deadlock.
2639 		 */
2640 		if (m != NULL) {
2641 found_cache:
2642 			KASSERT(m->dirty == 0,
2643 				("Found dirty cache page %p", m));
2644 			if ((obj = m->object) != NULL) {
2645 				if (vm_object_hold_try(obj)) {
2646 					if (__predict_false((m->flags & (PG_MAPPED|PG_WRITEABLE)) != 0))
2647 						vm_page_protect(m, VM_PROT_NONE);
2648 					vm_page_free(m);
2649 					/* m->object NULL here */
2650 					vm_object_drop(obj);
2651 				} else {
2652 					vm_page_deactivate(m);
2653 					vm_page_wakeup(m);
2654 				}
2655 			} else {
2656 				if (__predict_false((m->flags & (PG_MAPPED|PG_WRITEABLE)) != 0))
2657 					vm_page_protect(m, VM_PROT_NONE);
2658 				vm_page_free(m);
2659 			}
2660 			goto loop;
2661 		}
2662 
2663 		/*
2664 		 * On failure return NULL
2665 		 */
2666 		atomic_add_int(&vm_pageout_deficit, 1);
2667 		pagedaemon_wakeup();
2668 		return (NULL);
2669 	} else {
2670 		/*
2671 		 * No pages available, wakeup the pageout daemon and give up.
2672 		 */
2673 		atomic_add_int(&vm_pageout_deficit, 1);
2674 		pagedaemon_wakeup();
2675 		return (NULL);
2676 	}
2677 
2678 	/*
2679 	 * v_free_count can race so loop if we don't find the expected
2680 	 * page.
2681 	 */
2682 	if (m == NULL) {
2683 		vmstats_rollup();
2684 		goto loop;
2685 	}
2686 
2687 	/*
2688 	 * Good page found.  The page has already been busied for us and
2689 	 * removed from its queues.
2690 	 */
2691 	KASSERT(m->dirty == 0,
2692 		("vm_page_alloc: free/cache page %p was dirty", m));
2693 	KKASSERT(m->queue == PQ_NONE);
2694 
2695 #if 0
2696 done:
2697 #endif
2698 	/*
2699 	 * Initialize the structure, inheriting some flags but clearing
2700 	 * all the rest.  The page has already been busied for us.
2701 	 */
2702 	vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
2703 
2704 	KKASSERT(m->wire_count == 0);
2705 	KKASSERT((m->busy_count & PBUSY_MASK) == 0);
2706 	m->act_count = 0;
2707 	m->valid = 0;
2708 
2709 	/*
2710 	 * Caller must be holding the object lock (asserted by
2711 	 * vm_page_insert()).
2712 	 *
2713 	 * NOTE: Inserting a page here does not insert it into any pmaps
2714 	 *	 (which could cause us to block allocating memory).
2715 	 *
2716 	 * NOTE: If no object an unassociated page is allocated, m->pindex
2717 	 *	 can be used by the caller for any purpose.
2718 	 */
2719 	if (object) {
2720 		if (vm_page_insert(m, object, pindex) == FALSE) {
2721 			vm_page_free(m);
2722 			if ((page_req & VM_ALLOC_NULL_OK) == 0)
2723 				panic("PAGE RACE %p[%ld]/%p",
2724 				      object, (long)pindex, m);
2725 			m = NULL;
2726 		}
2727 	} else {
2728 		m->pindex = pindex;
2729 	}
2730 
2731 	/*
2732 	 * Don't wakeup too often - wakeup the pageout daemon when
2733 	 * we would be nearly out of memory.
2734 	 */
2735 	pagedaemon_wakeup();
2736 
2737 	/*
2738 	 * A BUSY page is returned.
2739 	 */
2740 	return (m);
2741 }
2742 
2743 /*
2744  * Returns number of pages available in our DMA memory reserve
2745  * (adjusted with vm.dma_reserved=<value>m in /boot/loader.conf)
2746  */
2747 vm_size_t
2748 vm_contig_avail_pages(void)
2749 {
2750 	alist_blk_t blk;
2751 	alist_blk_t count;
2752 	alist_blk_t bfree;
2753 	spin_lock(&vm_contig_spin);
2754 	bfree = alist_free_info(&vm_contig_alist, &blk, &count);
2755 	spin_unlock(&vm_contig_spin);
2756 
2757 	return bfree;
2758 }
2759 
2760 /*
2761  * Attempt to allocate contiguous physical memory with the specified
2762  * requirements.
2763  */
2764 vm_page_t
2765 vm_page_alloc_contig(vm_paddr_t low, vm_paddr_t high,
2766 		     unsigned long alignment, unsigned long boundary,
2767 		     unsigned long size, vm_memattr_t memattr)
2768 {
2769 	alist_blk_t blk;
2770 	vm_page_t m;
2771 	vm_pindex_t i;
2772 #if 0
2773 	static vm_pindex_t contig_rover;
2774 #endif
2775 
2776 	alignment >>= PAGE_SHIFT;
2777 	if (alignment == 0)
2778 		alignment = 1;
2779 	boundary >>= PAGE_SHIFT;
2780 	if (boundary == 0)
2781 		boundary = 1;
2782 	size = (size + PAGE_MASK) >> PAGE_SHIFT;
2783 
2784 #if 0
2785 	/*
2786 	 * Disabled temporarily until we find a solution for DRM (a flag
2787 	 * to always use the free space reserve, for performance).
2788 	 */
2789 	if (high == BUS_SPACE_MAXADDR && alignment <= PAGE_SIZE &&
2790 	    boundary <= PAGE_SIZE && size == 1 &&
2791 	    memattr == VM_MEMATTR_DEFAULT) {
2792 		/*
2793 		 * Any page will work, use vm_page_alloc()
2794 		 * (e.g. when used from kmem_alloc_attr())
2795 		 */
2796 		m = vm_page_alloc(NULL, (contig_rover++) & 0x7FFFFFFF,
2797 				  VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM |
2798 				  VM_ALLOC_INTERRUPT);
2799 		m->valid = VM_PAGE_BITS_ALL;
2800 		vm_page_wire(m);
2801 		vm_page_wakeup(m);
2802 	} else
2803 #endif
2804 	{
2805 		/*
2806 		 * Use the low-memory dma reserve
2807 		 */
2808 		spin_lock(&vm_contig_spin);
2809 		blk = alist_alloc(&vm_contig_alist, 0, size);
2810 		if (blk == ALIST_BLOCK_NONE) {
2811 			spin_unlock(&vm_contig_spin);
2812 			if (bootverbose) {
2813 				kprintf("vm_page_alloc_contig: %ldk nospace\n",
2814 					(size << PAGE_SHIFT) / 1024);
2815 				print_backtrace(5);
2816 			}
2817 			return(NULL);
2818 		}
2819 		if (high && ((vm_paddr_t)(blk + size) << PAGE_SHIFT) > high) {
2820 			alist_free(&vm_contig_alist, blk, size);
2821 			spin_unlock(&vm_contig_spin);
2822 			if (bootverbose) {
2823 				kprintf("vm_page_alloc_contig: %ldk high "
2824 					"%016jx failed\n",
2825 					(size << PAGE_SHIFT) / 1024,
2826 					(intmax_t)high);
2827 			}
2828 			return(NULL);
2829 		}
2830 		spin_unlock(&vm_contig_spin);
2831 
2832 		/*
2833 		 * Base vm_page_t of range
2834 		 */
2835 		m = PHYS_TO_VM_PAGE((vm_paddr_t)blk << PAGE_SHIFT);
2836 	}
2837 	if (vm_contig_verbose) {
2838 		kprintf("vm_page_alloc_contig: %016jx/%ldk "
2839 			"(%016jx-%016jx al=%lu bo=%lu pgs=%lu attr=%d\n",
2840 			(intmax_t)m->phys_addr,
2841 			(size << PAGE_SHIFT) / 1024,
2842 			low, high, alignment, boundary, size, memattr);
2843 	}
2844 	if (memattr != VM_MEMATTR_DEFAULT) {
2845 		for (i = 0; i < size; ++i) {
2846 			KKASSERT(m[i].flags & PG_FICTITIOUS);
2847 			pmap_page_set_memattr(&m[i], memattr);
2848 		}
2849 	}
2850 	return m;
2851 }
2852 
2853 /*
2854  * Free contiguously allocated pages.  The pages will be wired but not busy.
2855  * When freeing to the alist we leave them wired and not busy.
2856  */
2857 void
2858 vm_page_free_contig(vm_page_t m, unsigned long size)
2859 {
2860 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
2861 	vm_pindex_t start = pa >> PAGE_SHIFT;
2862 	vm_pindex_t pages = (size + PAGE_MASK) >> PAGE_SHIFT;
2863 
2864 	if (vm_contig_verbose) {
2865 		kprintf("vm_page_free_contig:  %016jx/%ldk\n",
2866 			(intmax_t)pa, size / 1024);
2867 	}
2868 	if (pa < vm_low_phys_reserved) {
2869 		/*
2870 		 * Just assert check the first page for convenience.
2871 		 */
2872 		KKASSERT(m->wire_count == 1);
2873 		KKASSERT(m->flags & PG_FICTITIOUS);
2874 		KKASSERT(pa + size <= vm_low_phys_reserved);
2875 		spin_lock(&vm_contig_spin);
2876 		alist_free(&vm_contig_alist, start, pages);
2877 		spin_unlock(&vm_contig_spin);
2878 	} else {
2879 		while (pages) {
2880 			/* XXX FUTURE, maybe (pair with vm_pg_contig_alloc()) */
2881 			/*vm_page_flag_clear(m, PG_FICTITIOUS | PG_UNQUEUED);*/
2882 			vm_page_busy_wait(m, FALSE, "cpgfr");
2883 			vm_page_unwire(m, 0);
2884 			vm_page_free(m);
2885 			--pages;
2886 			++m;
2887 		}
2888 
2889 	}
2890 }
2891 
2892 
2893 /*
2894  * Wait for sufficient free memory for nominal heavy memory use kernel
2895  * operations.
2896  *
2897  * WARNING!  Be sure never to call this in any vm_pageout code path, which
2898  *	     will trivially deadlock the system.
2899  */
2900 void
2901 vm_wait_nominal(void)
2902 {
2903 	while (vm_paging_min())
2904 		vm_wait(0);
2905 }
2906 
2907 /*
2908  * Test if vm_wait_nominal() would block.
2909  */
2910 int
2911 vm_test_nominal(void)
2912 {
2913 	if (vm_paging_min())
2914 		return(1);
2915 	return(0);
2916 }
2917 
2918 /*
2919  * Block until free pages are available for allocation, called in various
2920  * places before memory allocations, and occurs before the minimum is reached.
2921  * Typically in the I/O path.
2922  *
2923  * The caller may loop if vm_paging_min() is TRUE (free pages below minimum),
2924  * so we cannot be more generous then that.
2925  */
2926 void
2927 vm_wait(int timo)
2928 {
2929 	/*
2930 	 * never wait forever
2931 	 */
2932 	if (timo == 0)
2933 		timo = hz;
2934 	lwkt_gettoken(&vm_token);
2935 
2936 	if (curthread == pagethread ||
2937 	    curthread == emergpager) {
2938 		/*
2939 		 * The pageout daemon itself needs pages, this is bad.
2940 		 */
2941 		if (vm_paging_min()) {
2942 			vm_pageout_pages_needed = 1;
2943 			tsleep(&vm_pageout_pages_needed, 0, "VMWait", timo);
2944 		}
2945 	} else {
2946 		/*
2947 		 * Wakeup the pageout daemon if necessary and wait.
2948 		 *
2949 		 * Do not wait indefinitely for the target to be reached,
2950 		 * as load might prevent it from being reached any time soon.
2951 		 * But wait a little to try to slow down page allocations
2952 		 * and to give more important threads (the pagedaemon)
2953 		 * allocation priority.
2954 		 *
2955 		 * The vm_paging_min() test is a safety.
2956 		 *
2957 		 * I/O waits are given a slightly lower priority (higher nice)
2958 		 * than VM waits.
2959 		 */
2960 		int nice;
2961 
2962 		nice = curthread->td_proc ? curthread->td_proc->p_nice : 0;
2963 		/*if (vm_paging_wait() || vm_paging_min())*/
2964 		if (vm_paging_min_nice(nice + 1))
2965 		{
2966 			if (vm_pages_needed <= 1) {
2967 				++vm_pages_needed;
2968 				wakeup(&vm_pages_needed);
2969 			}
2970 			++vm_pages_waiting;	/* SMP race ok */
2971 			tsleep(&vmstats.v_free_count, 0, "vmwait", timo);
2972 		}
2973 	}
2974 	lwkt_reltoken(&vm_token);
2975 }
2976 
2977 /*
2978  * Block until free pages are available for allocation, called in the
2979  * page-fault code.  We must stall indefinitely (except for certain
2980  * conditions) when the free page count becomes severe.
2981  *
2982  * Called only from vm_fault so that processes page faulting can be
2983  * easily tracked.
2984  *
2985  * The process nice value determines the trip point.  This way niced
2986  * processes which are heavy memory users do not completely mess the
2987  * machine up for normal processes.
2988  */
2989 void
2990 vm_wait_pfault(void)
2991 {
2992 	int nice;
2993 
2994 	/*
2995 	 * Wakeup the pageout daemon if necessary and wait.
2996 	 *
2997 	 * Allow VM faults down to the minimum free page count, but only
2998 	 * stall once paging becomes severe.
2999 	 *
3000 	 * Do not wait indefinitely for the target to be reached,
3001 	 * as load might prevent it from being reached any time soon.
3002 	 * But wait a little to try to slow down page allocations
3003 	 * and to give more important threads (the pagedaemon)
3004 	 * allocation priority.
3005 	 */
3006 	nice = curthread->td_proc ? curthread->td_proc->p_nice : 0;
3007 
3008 	if (vm_paging_min_nice(nice)) {
3009 		lwkt_gettoken(&vm_token);
3010 		do {
3011 			thread_t td;
3012 
3013 			if (vm_pages_needed <= 1) {
3014 				++vm_pages_needed;
3015 				wakeup(&vm_pages_needed);
3016 			}
3017 			++vm_pages_waiting;	/* SMP race ok */
3018 			tsleep(&vmstats.v_free_count, 0, "pfault",
3019 				hz / 10 + 1);
3020 
3021 			/*
3022 			 * Do not stay stuck in the loop if the system
3023 			 * is trying to kill the process.
3024 			 */
3025 			td = curthread;
3026 			if (td->td_proc &&
3027 			    (td->td_proc->p_flags & P_LOWMEMKILL))
3028 			{
3029 				break;
3030 			}
3031 		} while (vm_paging_severe());
3032 		lwkt_reltoken(&vm_token);
3033 	}
3034 }
3035 
3036 /*
3037  * Put the specified page on the active list (if appropriate).  Ensure
3038  * that act_count is at least ACT_INIT but do not otherwise mess with it.
3039  *
3040  * The caller should be holding the page busied ? XXX
3041  * This routine may not block.
3042  *
3043  * It is ok if the page is wired (so buffer cache operations don't have
3044  * to mess with the page queues).
3045  */
3046 void
3047 vm_page_activate(vm_page_t m)
3048 {
3049 	u_short oqueue;
3050 
3051 	/*
3052 	 * If already active or inappropriate, just set act_count and
3053 	 * return.  We don't have to spin-lock the page.
3054 	 */
3055 	if (m->queue - m->pc == PQ_ACTIVE ||
3056 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3057 		if (m->act_count < ACT_INIT)
3058 			m->act_count = ACT_INIT;
3059 		return;
3060 	}
3061 
3062 	vm_page_spin_lock(m);
3063 	if (m->queue - m->pc != PQ_ACTIVE &&
3064 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3065 		_vm_page_queue_spin_lock(m);
3066 		oqueue = _vm_page_rem_queue_spinlocked(m);
3067 		/* page is left spinlocked, queue is unlocked */
3068 
3069 		if (oqueue == PQ_CACHE)
3070 			mycpu->gd_cnt.v_reactivated++;
3071 		if (m->act_count < ACT_INIT)
3072 			m->act_count = ACT_INIT;
3073 		_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
3074 		_vm_page_and_queue_spin_unlock(m);
3075 		if (oqueue == PQ_CACHE || oqueue == PQ_FREE)
3076 			pagedaemon_wakeup();
3077 	} else {
3078 		if (m->act_count < ACT_INIT)
3079 			m->act_count = ACT_INIT;
3080 		vm_page_spin_unlock(m);
3081 	}
3082 }
3083 
3084 void
3085 vm_page_soft_activate(vm_page_t m)
3086 {
3087 	if (m->queue - m->pc == PQ_ACTIVE ||
3088 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3089 		if (m->act_count < ACT_INIT)
3090 			m->act_count = ACT_INIT;
3091 	} else {
3092 		vm_page_activate(m);
3093 	}
3094 }
3095 
3096 /*
3097  * Helper routine for vm_page_free_toq() and vm_page_cache().  This
3098  * routine is called when a page has been added to the cache or free
3099  * queues.
3100  *
3101  * This routine may not block.
3102  */
3103 static __inline void
3104 vm_page_free_wakeup(void)
3105 {
3106 	globaldata_t gd = mycpu;
3107 
3108 	/*
3109 	 * If the pageout daemon itself needs pages, then tell it that
3110 	 * there are some free.
3111 	 */
3112 	if (vm_pageout_pages_needed &&
3113 	    gd->gd_vmstats.v_cache_count + gd->gd_vmstats.v_free_count >=
3114 	    gd->gd_vmstats.v_pageout_free_min
3115 	) {
3116 		vm_pageout_pages_needed = 0;
3117 		wakeup(&vm_pageout_pages_needed);
3118 	}
3119 
3120 	/*
3121 	 * Wakeup processes that are waiting on memory.
3122 	 *
3123 	 * Generally speaking we want to wakeup stuck processes as soon as
3124 	 * possible.  !vm_page_count_min(0) is the absolute minimum point
3125 	 * where we can do this.  Wait a bit longer to reduce degenerate
3126 	 * re-blocking (vm_page_free_hysteresis).
3127 	 *
3128 	 * The target check is a safety to make sure the min-check
3129 	 * w/hysteresis does not exceed the normal target1.
3130 	 */
3131 	if (vm_pages_waiting) {
3132 		if (!vm_paging_min_dnc(vm_page_free_hysteresis) ||
3133 		    !vm_paging_target1())
3134 		{
3135 			vm_pages_waiting = 0;
3136 			wakeup(&vmstats.v_free_count);
3137 			++mycpu->gd_cnt.v_ppwakeups;
3138 		}
3139 	}
3140 }
3141 
3142 /*
3143  * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
3144  * it from its VM object.
3145  *
3146  * The vm_page must be BUSY on entry.  BUSY will be released on
3147  * return (the page will have been freed).
3148  */
3149 void
3150 vm_page_free_toq(vm_page_t m)
3151 {
3152 	/*
3153 	 * The page must not be mapped when freed, but we may have to call
3154 	 * pmap_mapped_sync() to validate this.
3155 	 */
3156 	mycpu->gd_cnt.v_tfree++;
3157 	if (m->flags & (PG_MAPPED | PG_WRITEABLE))
3158 		pmap_mapped_sync(m);
3159 	KKASSERT((m->flags & PG_MAPPED) == 0);
3160 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3161 
3162 	if ((m->busy_count & PBUSY_MASK) || ((m->queue - m->pc) == PQ_FREE)) {
3163 		kprintf("vm_page_free: pindex(%lu), busy %08x, "
3164 			"hold(%d)\n",
3165 			(u_long)m->pindex, m->busy_count, m->hold_count);
3166 		if ((m->queue - m->pc) == PQ_FREE)
3167 			panic("vm_page_free: freeing free page");
3168 		else
3169 			panic("vm_page_free: freeing busy page");
3170 	}
3171 
3172 	/*
3173 	 * Remove from object, spinlock the page and its queues and
3174 	 * remove from any queue.  No queue spinlock will be held
3175 	 * after this section (because the page was removed from any
3176 	 * queue).
3177 	 */
3178 	vm_page_remove(m);
3179 
3180 	/*
3181 	 * No further management of fictitious pages occurs beyond object
3182 	 * and queue removal.
3183 	 */
3184 	if ((m->flags & PG_FICTITIOUS) != 0) {
3185 		KKASSERT(m->queue == PQ_NONE);
3186 		vm_page_wakeup(m);
3187 		return;
3188 	}
3189 	vm_page_and_queue_spin_lock(m);
3190 	_vm_page_rem_queue_spinlocked(m);
3191 
3192 	m->valid = 0;
3193 	vm_page_undirty(m);
3194 
3195 	if (m->wire_count != 0) {
3196 		if (m->wire_count > 1) {
3197 		    panic(
3198 			"vm_page_free: invalid wire count (%d), pindex: 0x%lx",
3199 			m->wire_count, (long)m->pindex);
3200 		}
3201 		panic("vm_page_free: freeing wired page");
3202 	}
3203 
3204 	if (!MD_PAGE_FREEABLE(m))
3205 		panic("vm_page_free: page %p is still mapped!", m);
3206 
3207 	/*
3208 	 * Clear the PG_NEED_COMMIT and the PG_UNQUEUED flags.  The
3209 	 * page returns to normal operation and will be placed in
3210 	 * the PQ_HOLD or PQ_FREE queue.
3211 	 */
3212 	vm_page_flag_clear(m, PG_NEED_COMMIT | PG_UNQUEUED);
3213 
3214 	if (m->hold_count != 0) {
3215 		_vm_page_add_queue_spinlocked(m, PQ_HOLD + m->pc, 0);
3216 	} else {
3217 		_vm_page_add_queue_spinlocked(m, PQ_FREE + m->pc, 1);
3218 	}
3219 
3220 	/*
3221 	 * This sequence allows us to clear BUSY while still holding
3222 	 * its spin lock, which reduces contention vs allocators.  We
3223 	 * must not leave the queue locked or _vm_page_wakeup() may
3224 	 * deadlock.
3225 	 */
3226 	_vm_page_queue_spin_unlock(m);
3227 	if (_vm_page_wakeup(m)) {
3228 		vm_page_spin_unlock(m);
3229 		wakeup(m);
3230 	} else {
3231 		vm_page_spin_unlock(m);
3232 	}
3233 	vm_page_free_wakeup();
3234 }
3235 
3236 /*
3237  * Mark this page as wired down by yet another map.  We do not adjust the
3238  * queue the page is on, it will be checked for wiring as-needed.
3239  *
3240  * This function has no effect on fictitious pages.
3241  *
3242  * Caller must be holding the page busy.
3243  */
3244 void
3245 vm_page_wire(vm_page_t m)
3246 {
3247 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3248 	if ((m->flags & PG_FICTITIOUS) == 0) {
3249 		if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
3250 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count, 1);
3251 		}
3252 		KASSERT(m->wire_count != 0,
3253 			("vm_page_wire: wire_count overflow m=%p", m));
3254 	}
3255 }
3256 
3257 /*
3258  * Release one wiring of this page, potentially enabling it to be paged again.
3259  *
3260  * Note that wired pages are no longer unconditionally removed from the
3261  * paging queues, so the page may already be on a queue.  Move the page
3262  * to the desired queue if necessary.
3263  *
3264  * Many pages placed on the inactive queue should actually go
3265  * into the cache, but it is difficult to figure out which.  What
3266  * we do instead, if the inactive target is well met, is to put
3267  * clean pages at the head of the inactive queue instead of the tail.
3268  * This will cause them to be moved to the cache more quickly and
3269  * if not actively re-referenced, freed more quickly.  If we just
3270  * stick these pages at the end of the inactive queue, heavy filesystem
3271  * meta-data accesses can cause an unnecessary paging load on memory bound
3272  * processes.  This optimization causes one-time-use metadata to be
3273  * reused more quickly.
3274  *
3275  * Pages marked PG_NEED_COMMIT are always activated and never placed on
3276  * the inactive queue.  This helps the pageout daemon determine memory
3277  * pressure and act on out-of-memory situations more quickly.
3278  *
3279  * BUT, if we are in a low-memory situation we have no choice but to
3280  * put clean pages on the cache queue.
3281  *
3282  * A number of routines use vm_page_unwire() to guarantee that the page
3283  * will go into either the inactive or active queues, and will NEVER
3284  * be placed in the cache - for example, just after dirtying a page.
3285  * dirty pages in the cache are not allowed.
3286  *
3287  * PG_FICTITIOUS or PG_UNQUEUED pages are never moved to any queue, and
3288  * the wire_count will not be adjusted in any way for a PG_FICTITIOUS
3289  * page.
3290  *
3291  * This routine may not block.
3292  */
3293 void
3294 vm_page_unwire(vm_page_t m, int activate)
3295 {
3296 	KKASSERT(m->busy_count & PBUSY_LOCKED);
3297 	if (m->flags & PG_FICTITIOUS) {
3298 		/* do nothing */
3299 	} else if ((int)m->wire_count <= 0) {
3300 		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
3301 	} else {
3302 		if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
3303 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count,-1);
3304 			if (m->flags & PG_UNQUEUED) {
3305 				;
3306 			} else if (activate || (m->flags & PG_NEED_COMMIT)) {
3307 				vm_page_activate(m);
3308 			} else {
3309 				vm_page_deactivate(m);
3310 			}
3311 		}
3312 	}
3313 }
3314 
3315 /*
3316  * Move the specified page to the inactive queue.
3317  *
3318  * Normally athead is 0 resulting in LRU operation.  athead is set
3319  * to 1 if we want this page to be 'as if it were placed in the cache',
3320  * except without unmapping it from the process address space.
3321  *
3322  * vm_page's spinlock must be held on entry and will remain held on return.
3323  * This routine may not block.  The caller does not have to hold the page
3324  * busied but should have some sort of interlock on its validity.
3325  *
3326  * It is ok if the page is wired (so buffer cache operations don't have
3327  * to mess with the page queues).
3328  */
3329 static void
3330 _vm_page_deactivate_locked(vm_page_t m, int athead)
3331 {
3332 	u_short oqueue;
3333 
3334 	/*
3335 	 * Ignore if already inactive.
3336 	 */
3337 	if (m->queue - m->pc == PQ_INACTIVE ||
3338 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED))) {
3339 		return;
3340 	}
3341 
3342 	_vm_page_queue_spin_lock(m);
3343 	oqueue = _vm_page_rem_queue_spinlocked(m);
3344 
3345 	if ((m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3346 		if (oqueue == PQ_CACHE)
3347 			mycpu->gd_cnt.v_reactivated++;
3348 		vm_page_flag_clear(m, PG_WINATCFLS);
3349 		_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, athead);
3350 		if (athead == 0) {
3351 			atomic_add_long(
3352 				&vm_page_queues[PQ_INACTIVE + m->pc].adds, 1);
3353 		}
3354 	}
3355 	/* NOTE: PQ_NONE if condition not taken */
3356 	_vm_page_queue_spin_unlock(m);
3357 	/* leaves vm_page spinlocked */
3358 }
3359 
3360 /*
3361  * Attempt to deactivate a page.
3362  *
3363  * No requirements.  We can pre-filter before getting the spinlock.
3364  *
3365  * It is ok if the page is wired (so buffer cache operations don't have
3366  * to mess with the page queues).
3367  */
3368 void
3369 vm_page_deactivate(vm_page_t m)
3370 {
3371 	if (m->queue - m->pc != PQ_INACTIVE &&
3372 	    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED)) == 0) {
3373 		vm_page_spin_lock(m);
3374 		_vm_page_deactivate_locked(m, 0);
3375 		vm_page_spin_unlock(m);
3376 	}
3377 }
3378 
3379 void
3380 vm_page_deactivate_locked(vm_page_t m)
3381 {
3382 	_vm_page_deactivate_locked(m, 0);
3383 }
3384 
3385 /*
3386  * Attempt to move a busied page to PQ_CACHE, then unconditionally unbusy it.
3387  *
3388  * This function returns non-zero if it successfully moved the page to
3389  * PQ_CACHE.
3390  *
3391  * This function unconditionally unbusies the page on return.
3392  */
3393 int
3394 vm_page_try_to_cache(vm_page_t m)
3395 {
3396 	/*
3397 	 * Shortcut if we obviously cannot move the page, or if the
3398 	 * page is already on the cache queue, or it is ficitious.
3399 	 *
3400 	 * Never allow a wired page into the cache.
3401 	 */
3402 	if (m->dirty || m->hold_count || m->wire_count ||
3403 	    m->queue - m->pc == PQ_CACHE ||
3404 	    (m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS))) {
3405 		vm_page_wakeup(m);
3406 		return(0);
3407 	}
3408 
3409 	/*
3410 	 * Page busied by us and no longer spinlocked.  Dirty pages cannot
3411 	 * be moved to the cache, but can be deactivated.  However, users
3412 	 * of this function want to move pages closer to the cache so we
3413 	 * only deactivate it if it is in PQ_ACTIVE.  We do not re-deactivate.
3414 	 */
3415 	vm_page_test_dirty(m);
3416 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3417 		if (m->queue - m->pc == PQ_ACTIVE)
3418 			vm_page_deactivate(m);
3419 		vm_page_wakeup(m);
3420 		return(0);
3421 	}
3422 	vm_page_cache(m);
3423 	return(1);
3424 }
3425 
3426 /*
3427  * Attempt to free the page.  If we cannot free it, we do nothing.
3428  * 1 is returned on success, 0 on failure.
3429  *
3430  * The page can be in any state, including already being on the free
3431  * queue.  Check to see if it really can be freed.  Note that we disallow
3432  * this ad-hoc operation if the page is flagged PG_UNQUEUED.
3433  *
3434  * Caller provides an unlocked/non-busied page.
3435  * No requirements.
3436  */
3437 int
3438 vm_page_try_to_free(vm_page_t m)
3439 {
3440 	if (vm_page_busy_try(m, TRUE))
3441 		return(0);
3442 
3443 	if (m->dirty ||				/* can't free if it is dirty */
3444 	    m->hold_count ||			/* or held (XXX may be wrong) */
3445 	    m->wire_count ||			/* or wired */
3446 	    (m->flags & (PG_UNQUEUED |		/* or unqueued */
3447 			 PG_NEED_COMMIT |	/* or needs a commit */
3448 			 PG_FICTITIOUS)) ||	/* or is fictitious */
3449 	    m->queue - m->pc == PQ_FREE ||	/* already on PQ_FREE */
3450 	    m->queue - m->pc == PQ_HOLD) {	/* already on PQ_HOLD */
3451 		vm_page_wakeup(m);
3452 		return(0);
3453 	}
3454 
3455 	/*
3456 	 * We can probably free the page.
3457 	 *
3458 	 * Page busied by us and no longer spinlocked.  Dirty pages will
3459 	 * not be freed by this function.    We have to re-test the
3460 	 * dirty bit after cleaning out the pmaps.
3461 	 */
3462 	vm_page_test_dirty(m);
3463 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3464 		vm_page_wakeup(m);
3465 		return(0);
3466 	}
3467 	vm_page_protect(m, VM_PROT_NONE);
3468 	if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3469 		vm_page_wakeup(m);
3470 		return(0);
3471 	}
3472 	vm_page_free(m);
3473 	return(1);
3474 }
3475 
3476 /*
3477  * vm_page_cache
3478  *
3479  * Put the specified page onto the page cache queue (if appropriate).
3480  *
3481  * The page must be busy, and this routine will release the busy and
3482  * possibly even free the page.
3483  */
3484 void
3485 vm_page_cache(vm_page_t m)
3486 {
3487 	/*
3488 	 * Not suitable for the cache
3489 	 */
3490 	if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_FICTITIOUS)) ||
3491 	    (m->busy_count & PBUSY_MASK) ||
3492 	    m->wire_count || m->hold_count) {
3493 		vm_page_wakeup(m);
3494 		return;
3495 	}
3496 
3497 	/*
3498 	 * Already in the cache (and thus not mapped)
3499 	 */
3500 	if ((m->queue - m->pc) == PQ_CACHE) {
3501 		KKASSERT((m->flags & PG_MAPPED) == 0);
3502 		vm_page_wakeup(m);
3503 		return;
3504 	}
3505 
3506 #if 0
3507 	/*
3508 	 * REMOVED - it is possible for dirty to get set at any time as
3509 	 *	     long as the page is still mapped and writeable.
3510 	 *
3511 	 * Caller is required to test m->dirty, but note that the act of
3512 	 * removing the page from its maps can cause it to become dirty
3513 	 * on an SMP system due to another cpu running in usermode.
3514 	 */
3515 	if (m->dirty) {
3516 		panic("vm_page_cache: caching a dirty page, pindex: %ld",
3517 			(long)m->pindex);
3518 	}
3519 #endif
3520 
3521 	/*
3522 	 * Remove all pmaps and indicate that the page is not
3523 	 * writeable or mapped.  Our vm_page_protect() call may
3524 	 * have blocked (especially w/ VM_PROT_NONE), so recheck
3525 	 * everything.
3526 	 */
3527 	if (m->flags & (PG_MAPPED | PG_WRITEABLE)) {
3528 		vm_page_protect(m, VM_PROT_NONE);
3529 		pmap_mapped_sync(m);
3530 	}
3531 	if ((m->flags & (PG_UNQUEUED | PG_MAPPED)) ||
3532 	    (m->busy_count & PBUSY_MASK) ||
3533 	    m->wire_count || m->hold_count) {
3534 		vm_page_wakeup(m);
3535 	} else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
3536 		vm_page_deactivate(m);
3537 		vm_page_wakeup(m);
3538 	} else {
3539 		_vm_page_and_queue_spin_lock(m);
3540 		_vm_page_rem_queue_spinlocked(m);
3541 		_vm_page_add_queue_spinlocked(m, PQ_CACHE + m->pc, 0);
3542 		_vm_page_and_queue_spin_unlock(m);
3543 		vm_page_wakeup(m);
3544 		vm_page_free_wakeup();
3545 	}
3546 }
3547 
3548 /*
3549  * vm_page_dontneed()
3550  *
3551  * Cache, deactivate, or do nothing as appropriate.  This routine
3552  * is typically used by madvise() MADV_DONTNEED.
3553  *
3554  * Generally speaking we want to move the page into the cache so
3555  * it gets reused quickly.  However, this can result in a silly syndrome
3556  * due to the page recycling too quickly.  Small objects will not be
3557  * fully cached.  On the otherhand, if we move the page to the inactive
3558  * queue we wind up with a problem whereby very large objects
3559  * unnecessarily blow away our inactive and cache queues.
3560  *
3561  * The solution is to move the pages based on a fixed weighting.  We
3562  * either leave them alone, deactivate them, or move them to the cache,
3563  * where moving them to the cache has the highest weighting.
3564  * By forcing some pages into other queues we eventually force the
3565  * system to balance the queues, potentially recovering other unrelated
3566  * space from active.  The idea is to not force this to happen too
3567  * often.
3568  *
3569  * The page must be busied.
3570  */
3571 void
3572 vm_page_dontneed(vm_page_t m)
3573 {
3574 	static int dnweight;
3575 	int dnw;
3576 	int head;
3577 
3578 	dnw = ++dnweight;
3579 
3580 	/*
3581 	 * occassionally leave the page alone
3582 	 */
3583 	if ((dnw & 0x01F0) == 0 ||
3584 	    m->queue - m->pc == PQ_INACTIVE ||
3585 	    m->queue - m->pc == PQ_CACHE
3586 	) {
3587 		if (m->act_count >= ACT_INIT)
3588 			--m->act_count;
3589 		return;
3590 	}
3591 
3592 	/*
3593 	 * If vm_page_dontneed() is inactivating a page, it must clear
3594 	 * the referenced flag; otherwise the pagedaemon will see references
3595 	 * on the page in the inactive queue and reactivate it. Until the
3596 	 * page can move to the cache queue, madvise's job is not done.
3597 	 */
3598 	vm_page_flag_clear(m, PG_REFERENCED);
3599 	pmap_clear_reference(m);
3600 
3601 	if (m->dirty == 0)
3602 		vm_page_test_dirty(m);
3603 
3604 	if (m->dirty || (dnw & 0x0070) == 0) {
3605 		/*
3606 		 * Deactivate the page 3 times out of 32.
3607 		 */
3608 		head = 0;
3609 	} else {
3610 		/*
3611 		 * Cache the page 28 times out of every 32.  Note that
3612 		 * the page is deactivated instead of cached, but placed
3613 		 * at the head of the queue instead of the tail.
3614 		 */
3615 		head = 1;
3616 	}
3617 	vm_page_spin_lock(m);
3618 	_vm_page_deactivate_locked(m, head);
3619 	vm_page_spin_unlock(m);
3620 }
3621 
3622 /*
3623  * These routines manipulate the 'soft busy' count for a page.  A soft busy
3624  * is almost like a hard BUSY except that it allows certain compatible
3625  * operations to occur on the page while it is busy.  For example, a page
3626  * undergoing a write can still be mapped read-only.
3627  *
3628  * We also use soft-busy to quickly pmap_enter shared read-only pages
3629  * without having to hold the page locked.
3630  *
3631  * The soft-busy count can be > 1 in situations where multiple threads
3632  * are pmap_enter()ing the same page simultaneously, or when two buffer
3633  * cache buffers overlap the same page.
3634  *
3635  * The caller must hold the page BUSY when making these two calls.
3636  */
3637 void
3638 vm_page_io_start(vm_page_t m)
3639 {
3640 	uint32_t ocount;
3641 
3642 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
3643 	KKASSERT(ocount & PBUSY_LOCKED);
3644 }
3645 
3646 void
3647 vm_page_io_finish(vm_page_t m)
3648 {
3649 	uint32_t ocount;
3650 
3651 	ocount = atomic_fetchadd_int(&m->busy_count, -1);
3652 	KKASSERT(ocount & PBUSY_MASK);
3653 #if 0
3654 	if (((ocount - 1) & (PBUSY_LOCKED | PBUSY_MASK)) == 0)
3655 		wakeup(m);
3656 #endif
3657 }
3658 
3659 /*
3660  * Attempt to soft-busy a page.  The page must not be PBUSY_LOCKED.
3661  *
3662  * We can't use fetchadd here because we might race a hard-busy and the
3663  * page freeing code asserts on a non-zero soft-busy count (even if only
3664  * temporary).
3665  *
3666  * Returns 0 on success, non-zero on failure.
3667  */
3668 int
3669 vm_page_sbusy_try(vm_page_t m)
3670 {
3671 	uint32_t ocount;
3672 
3673 	for (;;) {
3674 		ocount = m->busy_count;
3675 		cpu_ccfence();
3676 		if (ocount & PBUSY_LOCKED)
3677 			return 1;
3678 		if (atomic_cmpset_int(&m->busy_count, ocount, ocount + 1))
3679 			break;
3680 	}
3681 	return 0;
3682 #if 0
3683 	if (m->busy_count & PBUSY_LOCKED)
3684 		return 1;
3685 	ocount = atomic_fetchadd_int(&m->busy_count, 1);
3686 	if (ocount & PBUSY_LOCKED) {
3687 		vm_page_sbusy_drop(m);
3688 		return 1;
3689 	}
3690 	return 0;
3691 #endif
3692 }
3693 
3694 /*
3695  * Indicate that a clean VM page requires a filesystem commit and cannot
3696  * be reused.  Used by tmpfs.
3697  */
3698 void
3699 vm_page_need_commit(vm_page_t m)
3700 {
3701 	vm_page_flag_set(m, PG_NEED_COMMIT);
3702 	vm_object_set_writeable_dirty(m->object);
3703 }
3704 
3705 void
3706 vm_page_clear_commit(vm_page_t m)
3707 {
3708 	vm_page_flag_clear(m, PG_NEED_COMMIT);
3709 }
3710 
3711 /*
3712  * Allocate a page without an object.  The returned page will be wired and
3713  * NOT busy.  The function will block if no page is available, but only loop
3714  * if VM_ALLOC_RETRY is specified (else returns NULL after blocking).
3715  *
3716  * The pindex can be passed as zero, and is typically passed to help the
3717  * allocator 'color' the page returned.  That is, select pages that are
3718  * cache-friendly if the caller is allocating multiple pages.
3719  *
3720  *	VM_ALLOC_QUICK		- Allocate from free queue only
3721  *	VM_ALLOC_NORMAL		- Allocate from free + cache
3722  *	VM_ALLOC_SYSTEM		- Allocation can use system page reserve
3723  *	VM_ALLOC_INTERRUPT	- Allocation can use emergency page reserve
3724  *
3725  *	VM_ALLOC_CPU(n)		- Allocate using specified cpu localization
3726  *
3727  *	VM_ALLOC_ZERO		- Zero and set page valid.  If not specified,
3728  *				  m->valid will be 0 and the page will contain
3729  *				  prior garbage.
3730  *
3731  *	VM_ALLOC_FORCE_ZERO	- (same as VM_ALLOC_ZERO in this case)
3732  *
3733  *	VM_ALLOC_RETRY		- Retry until a page is available.  If not
3734  *				  specified, NULL can be returned.
3735  *
3736  *	VM_ALLOC_NULL_OK	- Not applicable since there is no object.
3737  */
3738 vm_page_t
3739 vm_page_alloczwq(vm_pindex_t pindex, int flags)
3740 {
3741 	vm_page_t m;
3742 
3743 	KKASSERT(flags & (VM_ALLOC_NORMAL | VM_ALLOC_QUICK |
3744 			  VM_ALLOC_INTERRUPT | VM_ALLOC_SYSTEM));
3745 	for (;;) {
3746 		m = vm_page_alloc(NULL, pindex, flags & ~VM_ALLOC_RETRY);
3747 		if (m)
3748 			break;
3749 		vm_wait(0);
3750 		if ((flags & VM_ALLOC_RETRY) == 0)
3751 			return NULL;
3752 	}
3753 
3754 	if (flags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
3755 		pmap_zero_page(VM_PAGE_TO_PHYS(m));
3756 		m->valid = VM_PAGE_BITS_ALL;
3757 	}
3758 
3759 	vm_page_wire(m);
3760 	vm_page_wakeup(m);
3761 
3762 	return(m);
3763 }
3764 
3765 /*
3766  * Free a page previously allocated via vm_page_alloczwq().
3767  *
3768  * Caller should not busy the page.  This function will busy, unwire,
3769  * and free the page.
3770  */
3771 void
3772 vm_page_freezwq(vm_page_t m)
3773 {
3774 	vm_page_busy_wait(m, FALSE, "pgzwq");
3775 	vm_page_unwire(m, 0);
3776 	vm_page_free(m);
3777 }
3778 
3779 /*
3780  * Grab a page, blocking if it is busy and allocating a page if necessary.
3781  * A busy page is returned or NULL.  The page may or may not be valid and
3782  * might not be on a queue (the caller is responsible for the disposition of
3783  * the page).
3784  *
3785  *	VM_ALLOC_QUICK		- Allocate from free queue only
3786  *	VM_ALLOC_NORMAL		- Allocate from free + cache
3787  *	VM_ALLOC_SYSTEM		- Allocation can use system page reserve
3788  *	VM_ALLOC_INTERRUPT	- Allocation can use emergency page reserve
3789  *
3790  *	VM_ALLOC_CPU(n)		- Allocate using specified cpu localization
3791  *
3792  *	VM_ALLOC_ZERO		- If the page does not exist and must be
3793  *				  allocated, it will be zerod and set valid.
3794  *
3795  *	VM_ALLOC_FORCE_ZERO	- The page will be zerod and set valid whether
3796  *				  it previously existed or had to be allocated.
3797  *
3798  *	VM_ALLOC_RETRY		- Routine waits and loops until it can obtain
3799  *				  the page, never returning NULL.  Also note
3800  *				  that VM_ALLOC_NORMAL must also be specified
3801  *				  if you use VM_ALLOC_RETRY.
3802  *
3803  *				  Also, VM_ALLOC_NULL_OK is implied when
3804  *				  VM_ALLOC_RETRY is specified, but will simply
3805  *				  cause a retry loop and never return NULL.
3806  *
3807  *	VM_ALLOC_NULL_OK	- Prevent panic on insertion collision.  This
3808  *				  flag is implied and need not be set if
3809  *				  VM_ALLOC_RETRY is specified.
3810  *
3811  *				  If VM_ALLOC_RETRY is not specified, the page
3812  *				  can still be pre-existing and will be
3813  *				  returned if so, but concurrent creation of
3814  *				  the same 'new' page can cause one or more
3815  *				  grabs to return NULL.
3816  *
3817  * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
3818  * always returned if we had blocked.
3819  *
3820  * This routine may not be called from an interrupt.
3821  *
3822  * No other requirements.
3823  */
3824 vm_page_t
3825 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int flags)
3826 {
3827 	vm_page_t m;
3828 	int error;
3829 	int shared = 1;
3830 
3831 	KKASSERT(flags & (VM_ALLOC_NORMAL | VM_ALLOC_QUICK |
3832 			  VM_ALLOC_INTERRUPT | VM_ALLOC_SYSTEM));
3833 	vm_object_hold_shared(object);
3834 	for (;;) {
3835 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
3836 		if (error) {
3837 			vm_page_sleep_busy(m, TRUE, "pgrbwt");
3838 			if ((flags & VM_ALLOC_RETRY) == 0) {
3839 				m = NULL;
3840 				break;
3841 			}
3842 			/* retry */
3843 		} else if (m == NULL) {
3844 			if (shared) {
3845 				vm_object_upgrade(object);
3846 				shared = 0;
3847 			}
3848 			if (flags & VM_ALLOC_RETRY)
3849 				flags |= VM_ALLOC_NULL_OK;
3850 			m = vm_page_alloc(object, pindex,
3851 					  flags & ~VM_ALLOC_RETRY);
3852 			if (m)
3853 				break;
3854 			vm_wait(0);
3855 			if ((flags & VM_ALLOC_RETRY) == 0)
3856 				goto failed;
3857 		} else {
3858 			/* m found */
3859 			break;
3860 		}
3861 	}
3862 
3863 	/*
3864 	 * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
3865 	 *
3866 	 * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
3867 	 * valid even if already valid.
3868 	 *
3869 	 * NOTE!  We have removed all of the PG_ZERO optimizations and also
3870 	 *	  removed the idle zeroing code.  These optimizations actually
3871 	 *	  slow things down on modern cpus because the zerod area is
3872 	 *	  likely uncached, placing a memory-access burden on the
3873 	 *	  accesors taking the fault.
3874 	 *
3875 	 *	  By always zeroing the page in-line with the fault, no
3876 	 *	  dynamic ram reads are needed and the caches are hot, ready
3877 	 *	  for userland to access the memory.
3878 	 */
3879 	if (m->valid == 0) {
3880 		if (flags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
3881 			pmap_zero_page(VM_PAGE_TO_PHYS(m));
3882 			m->valid = VM_PAGE_BITS_ALL;
3883 		}
3884 	} else if (flags & VM_ALLOC_FORCE_ZERO) {
3885 		pmap_zero_page(VM_PAGE_TO_PHYS(m));
3886 		m->valid = VM_PAGE_BITS_ALL;
3887 	}
3888 failed:
3889 	vm_object_drop(object);
3890 	return(m);
3891 }
3892 
3893 /*
3894  * Mapping function for valid bits or for dirty bits in
3895  * a page.  May not block.
3896  *
3897  * Inputs are required to range within a page.
3898  *
3899  * No requirements.
3900  * Non blocking.
3901  */
3902 int
3903 vm_page_bits(int base, int size)
3904 {
3905 	int first_bit;
3906 	int last_bit;
3907 
3908 	KASSERT(
3909 	    base + size <= PAGE_SIZE,
3910 	    ("vm_page_bits: illegal base/size %d/%d", base, size)
3911 	);
3912 
3913 	if (size == 0)		/* handle degenerate case */
3914 		return(0);
3915 
3916 	first_bit = base >> DEV_BSHIFT;
3917 	last_bit = (base + size - 1) >> DEV_BSHIFT;
3918 
3919 	return ((2 << last_bit) - (1 << first_bit));
3920 }
3921 
3922 /*
3923  * Sets portions of a page valid and clean.  The arguments are expected
3924  * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
3925  * of any partial chunks touched by the range.  The invalid portion of
3926  * such chunks will be zero'd.
3927  *
3928  * NOTE: When truncating a buffer vnode_pager_setsize() will automatically
3929  *	 align base to DEV_BSIZE so as not to mark clean a partially
3930  *	 truncated device block.  Otherwise the dirty page status might be
3931  *	 lost.
3932  *
3933  * This routine may not block.
3934  *
3935  * (base + size) must be less then or equal to PAGE_SIZE.
3936  */
3937 static void
3938 _vm_page_zero_valid(vm_page_t m, int base, int size)
3939 {
3940 	int frag;
3941 	int endoff;
3942 
3943 	if (size == 0)	/* handle degenerate case */
3944 		return;
3945 
3946 	/*
3947 	 * If the base is not DEV_BSIZE aligned and the valid
3948 	 * bit is clear, we have to zero out a portion of the
3949 	 * first block.
3950 	 */
3951 
3952 	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
3953 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
3954 	) {
3955 		pmap_zero_page_area(
3956 		    VM_PAGE_TO_PHYS(m),
3957 		    frag,
3958 		    base - frag
3959 		);
3960 	}
3961 
3962 	/*
3963 	 * If the ending offset is not DEV_BSIZE aligned and the
3964 	 * valid bit is clear, we have to zero out a portion of
3965 	 * the last block.
3966 	 */
3967 
3968 	endoff = base + size;
3969 
3970 	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
3971 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
3972 	) {
3973 		pmap_zero_page_area(
3974 		    VM_PAGE_TO_PHYS(m),
3975 		    endoff,
3976 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
3977 		);
3978 	}
3979 }
3980 
3981 /*
3982  * Set valid, clear dirty bits.  If validating the entire
3983  * page we can safely clear the pmap modify bit.  We also
3984  * use this opportunity to clear the PG_NOSYNC flag.  If a process
3985  * takes a write fault on a MAP_NOSYNC memory area the flag will
3986  * be set again.
3987  *
3988  * We set valid bits inclusive of any overlap, but we can only
3989  * clear dirty bits for DEV_BSIZE chunks that are fully within
3990  * the range.
3991  *
3992  * Page must be busied?
3993  * No other requirements.
3994  */
3995 void
3996 vm_page_set_valid(vm_page_t m, int base, int size)
3997 {
3998 	_vm_page_zero_valid(m, base, size);
3999 	m->valid |= vm_page_bits(base, size);
4000 }
4001 
4002 
4003 /*
4004  * Set valid bits and clear dirty bits.
4005  *
4006  * Page must be busied by caller.
4007  *
4008  * NOTE: This function does not clear the pmap modified bit.
4009  *	 Also note that e.g. NFS may use a byte-granular base
4010  *	 and size.
4011  *
4012  * No other requirements.
4013  */
4014 void
4015 vm_page_set_validclean(vm_page_t m, int base, int size)
4016 {
4017 	int pagebits;
4018 
4019 	_vm_page_zero_valid(m, base, size);
4020 	pagebits = vm_page_bits(base, size);
4021 	m->valid |= pagebits;
4022 	m->dirty &= ~pagebits;
4023 	if (base == 0 && size == PAGE_SIZE) {
4024 		/*pmap_clear_modify(m);*/
4025 		vm_page_flag_clear(m, PG_NOSYNC);
4026 	}
4027 }
4028 
4029 /*
4030  * Set valid & dirty.  Used by buwrite()
4031  *
4032  * Page must be busied by caller.
4033  */
4034 void
4035 vm_page_set_validdirty(vm_page_t m, int base, int size)
4036 {
4037 	int pagebits;
4038 
4039 	pagebits = vm_page_bits(base, size);
4040 	m->valid |= pagebits;
4041 	m->dirty |= pagebits;
4042 	if (m->object)
4043 	       vm_object_set_writeable_dirty(m->object);
4044 }
4045 
4046 /*
4047  * Clear dirty bits.
4048  *
4049  * NOTE: This function does not clear the pmap modified bit.
4050  *	 Also note that e.g. NFS may use a byte-granular base
4051  *	 and size.
4052  *
4053  * Page must be busied?
4054  * No other requirements.
4055  */
4056 void
4057 vm_page_clear_dirty(vm_page_t m, int base, int size)
4058 {
4059 	m->dirty &= ~vm_page_bits(base, size);
4060 	if (base == 0 && size == PAGE_SIZE) {
4061 		/*pmap_clear_modify(m);*/
4062 		vm_page_flag_clear(m, PG_NOSYNC);
4063 	}
4064 }
4065 
4066 /*
4067  * Make the page all-dirty.
4068  *
4069  * Also make sure the related object and vnode reflect the fact that the
4070  * object may now contain a dirty page.
4071  *
4072  * Page must be busied?
4073  * No other requirements.
4074  */
4075 void
4076 vm_page_dirty(vm_page_t m)
4077 {
4078 #ifdef INVARIANTS
4079         int pqtype = m->queue - m->pc;
4080 #endif
4081         KASSERT(pqtype != PQ_CACHE && pqtype != PQ_FREE,
4082                 ("vm_page_dirty: page in free/cache queue!"));
4083 	if (m->dirty != VM_PAGE_BITS_ALL) {
4084 		m->dirty = VM_PAGE_BITS_ALL;
4085 		if (m->object)
4086 			vm_object_set_writeable_dirty(m->object);
4087 	}
4088 }
4089 
4090 /*
4091  * Invalidates DEV_BSIZE'd chunks within a page.  Both the
4092  * valid and dirty bits for the effected areas are cleared.
4093  *
4094  * Page must be busied?
4095  * Does not block.
4096  * No other requirements.
4097  */
4098 void
4099 vm_page_set_invalid(vm_page_t m, int base, int size)
4100 {
4101 	int bits;
4102 
4103 	bits = vm_page_bits(base, size);
4104 	m->valid &= ~bits;
4105 	m->dirty &= ~bits;
4106 	atomic_add_int(&m->object->generation, 1);
4107 }
4108 
4109 /*
4110  * The kernel assumes that the invalid portions of a page contain
4111  * garbage, but such pages can be mapped into memory by user code.
4112  * When this occurs, we must zero out the non-valid portions of the
4113  * page so user code sees what it expects.
4114  *
4115  * Pages are most often semi-valid when the end of a file is mapped
4116  * into memory and the file's size is not page aligned.
4117  *
4118  * Page must be busied?
4119  * No other requirements.
4120  */
4121 void
4122 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
4123 {
4124 	int b;
4125 	int i;
4126 
4127 	/*
4128 	 * Scan the valid bits looking for invalid sections that
4129 	 * must be zerod.  Invalid sub-DEV_BSIZE'd areas ( where the
4130 	 * valid bit may be set ) have already been zerod by
4131 	 * vm_page_set_validclean().
4132 	 */
4133 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
4134 		if (i == (PAGE_SIZE / DEV_BSIZE) ||
4135 		    (m->valid & (1 << i))
4136 		) {
4137 			if (i > b) {
4138 				pmap_zero_page_area(
4139 				    VM_PAGE_TO_PHYS(m),
4140 				    b << DEV_BSHIFT,
4141 				    (i - b) << DEV_BSHIFT
4142 				);
4143 			}
4144 			b = i + 1;
4145 		}
4146 	}
4147 
4148 	/*
4149 	 * setvalid is TRUE when we can safely set the zero'd areas
4150 	 * as being valid.  We can do this if there are no cache consistency
4151 	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
4152 	 */
4153 	if (setvalid)
4154 		m->valid = VM_PAGE_BITS_ALL;
4155 }
4156 
4157 /*
4158  * Is a (partial) page valid?  Note that the case where size == 0
4159  * will return FALSE in the degenerate case where the page is entirely
4160  * invalid, and TRUE otherwise.
4161  *
4162  * Does not block.
4163  * No other requirements.
4164  */
4165 int
4166 vm_page_is_valid(vm_page_t m, int base, int size)
4167 {
4168 	int bits = vm_page_bits(base, size);
4169 
4170 	if (m->valid && ((m->valid & bits) == bits))
4171 		return 1;
4172 	else
4173 		return 0;
4174 }
4175 
4176 /*
4177  * Update dirty bits from pmap/mmu.  May not block.
4178  *
4179  * Caller must hold the page busy
4180  *
4181  * WARNING! Unless the page has been unmapped, this function only
4182  *	    provides a likely dirty status.
4183  */
4184 void
4185 vm_page_test_dirty(vm_page_t m)
4186 {
4187 	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m)) {
4188 		vm_page_dirty(m);
4189 	}
4190 }
4191 
4192 #include "opt_ddb.h"
4193 #ifdef DDB
4194 #include <ddb/ddb.h>
4195 
4196 DB_SHOW_COMMAND(page, vm_page_print_page_info)
4197 {
4198 	db_printf("vmstats.v_free_count: %ld\n", vmstats.v_free_count);
4199 	db_printf("vmstats.v_cache_count: %ld\n", vmstats.v_cache_count);
4200 	db_printf("vmstats.v_inactive_count: %ld\n", vmstats.v_inactive_count);
4201 	db_printf("vmstats.v_active_count: %ld\n", vmstats.v_active_count);
4202 	db_printf("vmstats.v_wire_count: %ld\n", vmstats.v_wire_count);
4203 	db_printf("vmstats.v_free_reserved: %ld\n", vmstats.v_free_reserved);
4204 	db_printf("vmstats.v_free_min: %ld\n", vmstats.v_free_min);
4205 	db_printf("vmstats.v_free_target: %ld\n", vmstats.v_free_target);
4206 	db_printf("vmstats.v_inactive_target: %ld\n",
4207 		  vmstats.v_inactive_target);
4208 	db_printf("vmstats.v_paging_wait: %ld\n", vmstats.v_paging_wait);
4209 	db_printf("vmstats.v_paging_start: %ld\n", vmstats.v_paging_start);
4210 	db_printf("vmstats.v_paging_target1: %ld\n", vmstats.v_paging_target1);
4211 	db_printf("vmstats.v_paging_target2: %ld\n", vmstats.v_paging_target2);
4212 }
4213 
4214 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
4215 {
4216 	int i;
4217 	db_printf("PQ_FREE:");
4218 	for (i = 0; i < PQ_L2_SIZE; i++) {
4219 		db_printf(" %ld", vm_page_queues[PQ_FREE + i].lcnt);
4220 	}
4221 	db_printf("\n");
4222 
4223 	db_printf("PQ_CACHE:");
4224 	for(i = 0; i < PQ_L2_SIZE; i++) {
4225 		db_printf(" %ld", vm_page_queues[PQ_CACHE + i].lcnt);
4226 	}
4227 	db_printf("\n");
4228 
4229 	db_printf("PQ_ACTIVE:");
4230 	for(i = 0; i < PQ_L2_SIZE; i++) {
4231 		db_printf(" %ld", vm_page_queues[PQ_ACTIVE + i].lcnt);
4232 	}
4233 	db_printf("\n");
4234 
4235 	db_printf("PQ_INACTIVE:");
4236 	for(i = 0; i < PQ_L2_SIZE; i++) {
4237 		db_printf(" %ld", vm_page_queues[PQ_INACTIVE + i].lcnt);
4238 	}
4239 	db_printf("\n");
4240 }
4241 #endif /* DDB */
4242