xref: /dragonfly/sys/vm/vm_pageout.c (revision dc6f5bdf)
1 /*
2  * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Copyright (c) 1991 Regents of the University of California.
35  * All rights reserved.
36  * Copyright (c) 1994 John S. Dyson
37  * All rights reserved.
38  * Copyright (c) 1994 David Greenman
39  * All rights reserved.
40  *
41  * This code is derived from software contributed to Berkeley by
42  * The Mach Operating System project at Carnegie-Mellon University.
43  *
44  * Redistribution and use in source and binary forms, with or without
45  * modification, are permitted provided that the following conditions
46  * are met:
47  * 1. Redistributions of source code must retain the above copyright
48  *    notice, this list of conditions and the following disclaimer.
49  * 2. Redistributions in binary form must reproduce the above copyright
50  *    notice, this list of conditions and the following disclaimer in the
51  *    documentation and/or other materials provided with the distribution.
52  * 3. Neither the name of the University nor the names of its contributors
53  *    may be used to endorse or promote products derived from this software
54  *    without specific prior written permission.
55  *
56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66  * SUCH DAMAGE.
67  *
68  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
69  *
70  *
71  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
72  * All rights reserved.
73  *
74  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
75  *
76  * Permission to use, copy, modify and distribute this software and
77  * its documentation is hereby granted, provided that both the copyright
78  * notice and this permission notice appear in all copies of the
79  * software, derivative works or modified versions, and any portions
80  * thereof, and that both notices appear in supporting documentation.
81  *
82  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
83  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
84  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
85  *
86  * Carnegie Mellon requests users of this software to return to
87  *
88  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
89  *  School of Computer Science
90  *  Carnegie Mellon University
91  *  Pittsburgh PA 15213-3890
92  *
93  * any improvements or extensions that they make and grant Carnegie the
94  * rights to redistribute these changes.
95  */
96 
97 /*
98  * The proverbial page-out daemon, rewritten many times over the decades.
99  */
100 
101 #include "opt_vm.h"
102 #include <sys/param.h>
103 #include <sys/systm.h>
104 #include <sys/kernel.h>
105 #include <sys/proc.h>
106 #include <sys/kthread.h>
107 #include <sys/resourcevar.h>
108 #include <sys/signalvar.h>
109 #include <sys/vnode.h>
110 #include <sys/vmmeter.h>
111 #include <sys/conf.h>
112 #include <sys/sysctl.h>
113 
114 #include <vm/vm.h>
115 #include <vm/vm_param.h>
116 #include <sys/lock.h>
117 #include <vm/vm_object.h>
118 #include <vm/vm_page.h>
119 #include <vm/vm_map.h>
120 #include <vm/vm_pageout.h>
121 #include <vm/vm_pager.h>
122 #include <vm/swap_pager.h>
123 #include <vm/vm_extern.h>
124 
125 #include <sys/spinlock2.h>
126 #include <vm/vm_page2.h>
127 
128 /*
129  * System initialization
130  */
131 
132 /* the kernel process "vm_pageout"*/
133 static int vm_pageout_page(vm_page_t m, long *max_launderp,
134 			   long *vnodes_skippedp, struct vnode **vpfailedp,
135 			   int pass, int vmflush_flags, long *counts);
136 static int vm_pageout_clean_helper (vm_page_t, int);
137 static void vm_pageout_free_page_calc (vm_size_t count);
138 static void vm_pageout_page_free(vm_page_t m);
139 __read_frequently struct thread *emergpager;
140 __read_frequently struct thread *pagethread;
141 static int sequence_emerg_pager;
142 
143 #if !defined(NO_SWAPPING)
144 /* the kernel process "vm_daemon"*/
145 static void vm_daemon (void);
146 static struct	thread *vmthread;
147 
148 static struct kproc_desc vm_kp = {
149 	"vmdaemon",
150 	vm_daemon,
151 	&vmthread
152 };
153 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
154 #endif
155 
156 __read_mostly int vm_pages_needed = 0;	/* pageout daemon tsleep event */
157 __read_mostly int vm_pageout_deficit = 0;/* Estimated number of pages deficit */
158 __read_mostly int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
159 __read_mostly int vm_page_free_hysteresis = 16;
160 __read_mostly static int vm_pagedaemon_time;
161 
162 #if !defined(NO_SWAPPING)
163 static int vm_daemon_needed;
164 #endif
165 __read_mostly static int vm_max_launder = 0;
166 __read_mostly static int vm_emerg_launder = 100;
167 __read_mostly static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
168 __read_mostly static int vm_pageout_full_stats_interval = 0;
169 __read_mostly static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
170 __read_mostly static int defer_swap_pageouts=0;
171 __read_mostly static int disable_swap_pageouts=0;
172 __read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
173 __read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
174 __read_mostly static int vm_pageout_debug;
175 
176 #if defined(NO_SWAPPING)
177 __read_mostly static int vm_swap_enabled=0;
178 #else
179 __read_mostly static int vm_swap_enabled=1;
180 #endif
181 
182 /* 0-disable, 1-passive, 2-active swp, 3-active swp + single-queue dirty pages */
183 __read_mostly int vm_pageout_memuse_mode=2;
184 __read_mostly int vm_pageout_allow_active=1;
185 
186 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
187 	CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");
188 
189 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
190 	CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");
191 
192 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
193 	CTLFLAG_RW, &vm_page_free_hysteresis, 0,
194 	"Free more pages than the minimum required");
195 
196 SYSCTL_INT(_vm, OID_AUTO, max_launder,
197 	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
198 SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
199 	CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");
200 
201 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
202 	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
203 
204 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
205 	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
206 
207 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
208 	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
209 
210 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
211 	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
212 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
213 	CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
214 SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active,
215 	CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active");
216 SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
217 	CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");
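
/*
 * The knobs above are runtime-tunable via sysctl(8).  A minimal, purely
 * illustrative example (the values are arbitrary and not recommendations):
 *
 *	sysctl vm.max_launder=4096
 *	sysctl vm.emerg_launder=200
 *	sysctl vm.pageout_memuse_mode=2
 */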
218 
219 
220 #if defined(NO_SWAPPING)
221 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
222 	CTLFLAG_RD, &vm_swap_enabled, 0, "");
223 #else
224 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
225 	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
226 #endif
227 
228 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
229 	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
230 
231 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
232 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
233 
234 static int pageout_lock_miss;
235 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
236 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
237 
238 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
239 
240 #if !defined(NO_SWAPPING)
241 static void vm_req_vmdaemon (void);
242 #endif
243 static void vm_pageout_page_stats(int q);
244 
245 #define MAXSCAN_DIVIDER		10
246 
247 /*
248  * Calculate approximately how many pages on each queue to try to
249  * clean.  An exact calculation creates an edge condition when the
250  * queues are unbalanced so add significant slop.  The queue scans
251  * will stop early when targets are reached and will start where they
252  * left off on the next pass.
253  *
254  * We need to be generous here because there are all sorts of loading
255  * conditions that can cause edge cases if we try to average over all queues.
256  * In particular, storage subsystems have become so fast that paging
257  * activity can become quite frantic.  Eventually we will probably need
258  * two paging threads, one for dirty pages and one for clean, to deal
259  * with the bandwidth requirements.
260  *
261  * So what we do is calculate a value that can be satisfied nominally by
262  * only having to scan half the queues.
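 *
 * As a purely illustrative example (assuming PQ_L2_SIZE is 64, so that
 * PQ_L2_SIZE/2 is 32): PQAVERAGE(1000) yields (1000 + 63) / 32 + 1 = 34
 * pages per sub-queue, and 34 * 32 = 1088 >= 1000, so scanning roughly
 * half of the sub-queues can nominally satisfy the request.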
263  */
264 static __inline long
265 PQAVERAGE(long n)
266 {
267 	long avg;
268 
269 	if (n >= 0) {
270 		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
271 	} else {
272 		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
273 	}
274 	return avg;
275 }
276 
277 /*
278  * vm_pageout_clean_helper:
279  *
280  * Clean the page and remove it from the laundry.  The page must be busied
281  * by the caller and will be disposed of (put away, flushed) by this routine.
282  */
283 static int
284 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
285 {
286 	vm_object_t object;
287 	vm_page_t mc[BLIST_MAX_ALLOC];
288 	int error;
289 	int ib, is, page_base;
290 	vm_pindex_t pindex = m->pindex;
291 
292 	object = m->object;
293 
294 	/*
295 	 * Don't mess with the page if it's held or special.  Theoretically
296 	 * we can pageout held pages but there is no real need to press our
297 	 * luck, so don't.
298 	 */
299 	if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
300 		vm_page_wakeup(m);
301 		return 0;
302 	}
303 
304 	/*
305 	 * Place page in cluster.  Align cluster for optimal swap space
306 	 * allocation (whether it is swap or not).  This is typically ~16-32
307 	 * pages, which also tends to align the cluster to multiples of the
308 	 * filesystem block size if backed by a filesystem.
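	 *
	 * Purely as an illustration (assuming BLIST_MAX_ALLOC is 32): a
	 * page at pindex 70 occupies slot 70 % 32 == 6, the backward scan
	 * below may fill slots 5..0 (pindices 69..64) and the forward scan
	 * slots 7..31 (pindices 71..95), keeping the cluster aligned to a
	 * 32-page boundary.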
309 	 */
310 	page_base = pindex % BLIST_MAX_ALLOC;
311 	mc[page_base] = m;
312 	ib = page_base - 1;
313 	is = page_base + 1;
314 
315 	/*
316 	 * Scan object for clusterable pages.
317 	 *
318 	 * We can cluster ONLY if: ->> the page is NOT
319 	 * clean, wired, busy, held, or mapped into a
320 	 * buffer, and one of the following:
321 	 * 1) The page is inactive, or a seldom used
322 	 *    active page.
323 	 * -or-
324 	 * 2) we force the issue.
325 	 *
326 	 * During heavy mmap/modification loads the pageout
327 	 * daemon can really fragment the underlying file
328 	 * due to flushing pages out of order and not trying to
329 	 * align the clusters (which leaves sporadic out-of-order
330 	 * holes).  To solve this problem we do the reverse scan
331 	 * first and attempt to align our cluster, then do a
332 	 * forward scan if room remains.
333 	 */
334 	vm_object_hold(object);
335 
336 	while (ib >= 0) {
337 		vm_page_t p;
338 
339 		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
340 					    TRUE, &error);
341 		if (error || p == NULL)
342 			break;
343 		if ((p->queue - p->pc) == PQ_CACHE ||
344 		    (p->flags & PG_UNQUEUED)) {
345 			vm_page_wakeup(p);
346 			break;
347 		}
348 		vm_page_test_dirty(p);
349 		if (((p->dirty & p->valid) == 0 &&
350 		     (p->flags & PG_NEED_COMMIT) == 0) ||
351 		    p->wire_count != 0 ||	/* may be held by buf cache */
352 		    p->hold_count != 0) {	/* may be undergoing I/O */
353 			vm_page_wakeup(p);
354 			break;
355 		}
356 		if (p->queue - p->pc != PQ_INACTIVE) {
357 			if (p->queue - p->pc != PQ_ACTIVE ||
358 			    (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
359 				vm_page_wakeup(p);
360 				break;
361 			}
362 		}
363 
364 		/*
365 		 * Try to maintain page groupings in the cluster.
366 		 */
367 		if (m->flags & PG_WINATCFLS)
368 			vm_page_flag_set(p, PG_WINATCFLS);
369 		else
370 			vm_page_flag_clear(p, PG_WINATCFLS);
371 		p->act_count = m->act_count;
372 
373 		mc[ib] = p;
374 		--ib;
375 	}
376 	++ib;	/* fixup */
377 
378 	while (is < BLIST_MAX_ALLOC &&
379 	       pindex - page_base + is < object->size) {
380 		vm_page_t p;
381 
382 		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
383 					    TRUE, &error);
384 		if (error || p == NULL)
385 			break;
386 		if (((p->queue - p->pc) == PQ_CACHE) ||
387 		    (p->flags & PG_UNQUEUED)) {
388 			vm_page_wakeup(p);
389 			break;
390 		}
391 		vm_page_test_dirty(p);
392 		if (((p->dirty & p->valid) == 0 &&
393 		     (p->flags & PG_NEED_COMMIT) == 0) ||
394 		    p->wire_count != 0 ||	/* may be held by buf cache */
395 		    p->hold_count != 0) {	/* may be undergoing I/O */
396 			vm_page_wakeup(p);
397 			break;
398 		}
399 		if (p->queue - p->pc != PQ_INACTIVE) {
400 			if (p->queue - p->pc != PQ_ACTIVE ||
401 			    (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
402 				vm_page_wakeup(p);
403 				break;
404 			}
405 		}
406 
407 		/*
408 		 * Try to maintain page groupings in the cluster.
409 		 */
410 		if (m->flags & PG_WINATCFLS)
411 			vm_page_flag_set(p, PG_WINATCFLS);
412 		else
413 			vm_page_flag_clear(p, PG_WINATCFLS);
414 		p->act_count = m->act_count;
415 
416 		mc[is] = p;
417 		++is;
418 	}
419 
420 	vm_object_drop(object);
421 
422 	/*
423 	 * we allow reads during pageouts...
424 	 */
425 	return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
426 }
427 
428 /*
429  * vm_pageout_flush() - launder the given pages
430  *
431  *	The given pages are laundered.  Note that we setup for the start of
432  *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
433  *	reference count all in here rather than in the parent.  If we want
434  *	the parent to do more sophisticated things we may have to change
435  *	the ordering.
436  *
437  *	The pages in the array must be busied by the caller and will be
438  *	unbusied by this function.
439  */
440 int
441 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
442 {
443 	vm_object_t object;
444 	int pageout_status[count];
445 	int numpagedout = 0;
446 	int i;
447 
448 	/*
449 	 * Initiate I/O.  Bump the vm_page_t->busy counter.
450 	 */
451 	for (i = 0; i < count; i++) {
452 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
453 			("vm_pageout_flush page %p index %d/%d: partially "
454 			 "invalid page", mc[i], i, count));
455 		vm_page_io_start(mc[i]);
456 	}
457 
458 	/*
459 	 * We must make the pages read-only.  This will also force the
460 	 * modified bit in the related pmaps to be cleared.  The pager
461 	 * cannot clear the bit for us since the I/O completion code
462 	 * typically runs from an interrupt.  The act of making the page
463 	 * read-only handles the case for us.
464 	 *
465 	 * Then we can unbusy the pages; we still hold a reference by virtue
466 	 * of our soft-busy.
467 	 */
468 	for (i = 0; i < count; i++) {
469 		if (vmflush_flags & OBJPC_TRY_TO_CACHE)
470 			vm_page_protect(mc[i], VM_PROT_NONE);
471 		else
472 			vm_page_protect(mc[i], VM_PROT_READ);
473 		vm_page_wakeup(mc[i]);
474 	}
475 
476 	object = mc[0]->object;
477 	vm_object_pip_add(object, count);
478 
479 	vm_pager_put_pages(object, mc, count,
480 			   (vmflush_flags |
481 			    ((object == &kernel_object) ?
482 				OBJPC_SYNC : 0)),
483 			   pageout_status);
484 
485 	for (i = 0; i < count; i++) {
486 		vm_page_t mt = mc[i];
487 
488 		switch (pageout_status[i]) {
489 		case VM_PAGER_OK:
490 			numpagedout++;
491 			break;
492 		case VM_PAGER_PEND:
493 			numpagedout++;
494 			break;
495 		case VM_PAGER_BAD:
496 			/*
497 			 * Page outside of range of object. Right now we
498 			 * essentially lose the changes by pretending it
499 			 * worked.
500 			 */
501 			vm_page_busy_wait(mt, FALSE, "pgbad");
502 			pmap_clear_modify(mt);
503 			vm_page_undirty(mt);
504 			vm_page_wakeup(mt);
505 			break;
506 		case VM_PAGER_ERROR:
507 		case VM_PAGER_FAIL:
508 			/*
509 			 * A page typically cannot be paged out when we
510 			 * have run out of swap.  We leave the page
511 			 * marked inactive and will try to page it out
512 			 * again later.
513 			 *
514 			 * Starvation of the active page list is used to
515 			 * determine when the system is massively memory
516 			 * starved.
517 			 */
518 			break;
519 		case VM_PAGER_AGAIN:
520 			break;
521 		}
522 
523 		/*
524 		 * If not PENDing this was a synchronous operation and we
525 		 * clean up after the I/O.  If it is PENDing the mess is
526 		 * cleaned up asynchronously.
527 		 *
528 		 * Also nominally act on the caller's wishes if the caller
529 		 * wants to try to really clean (cache or free) the page.
530 		 *
531 		 * Also nominally deactivate the page if the system is
532 		 * memory-stressed.
533 		 */
534 		if (pageout_status[i] != VM_PAGER_PEND) {
535 			vm_page_busy_wait(mt, FALSE, "pgouw");
536 			vm_page_io_finish(mt);
537 			if (vmflush_flags & OBJPC_TRY_TO_CACHE) {
538 				vm_page_try_to_cache(mt);
539 			} else if (vm_page_count_severe()) {
540 				vm_page_deactivate(mt);
541 				vm_page_wakeup(mt);
542 			} else {
543 				vm_page_wakeup(mt);
544 			}
545 			vm_object_pip_wakeup(object);
546 		}
547 	}
548 	return numpagedout;
549 }
550 
551 #if !defined(NO_SWAPPING)
552 
553 /*
554  * Callback function, page busied for us.  We must dispose of the busy
555  * condition.  Any related pmap pages may be held but will not be locked.
556  */
557 static
558 int
559 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
560 			vm_page_t p)
561 {
562 	int actcount;
563 	int cleanit = 0;
564 
565 	/*
566 	 * Basic tests - There should never be a marker, and we can stop
567 	 *		 once the RSS is below the required level.
568 	 */
569 	KKASSERT((p->flags & PG_MARKER) == 0);
570 	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
571 		vm_page_wakeup(p);
572 		return(-1);
573 	}
574 
575 	mycpu->gd_cnt.v_pdpages++;
576 
577 	if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
578 		vm_page_wakeup(p);
579 		goto done;
580 	}
581 
582 	++info->actioncount;
583 
584 	/*
585 	 * Check if the page has been referenced recently.  If it has,
586 	 * activate it and skip.
587 	 */
588 	actcount = pmap_ts_referenced(p);
589 	if (actcount) {
590 		vm_page_flag_set(p, PG_REFERENCED);
591 	} else if (p->flags & PG_REFERENCED) {
592 		actcount = 1;
593 	}
594 
595 	if (actcount) {
596 		if (p->queue - p->pc != PQ_ACTIVE) {
597 			vm_page_and_queue_spin_lock(p);
598 			if (p->queue - p->pc != PQ_ACTIVE) {
599 				vm_page_and_queue_spin_unlock(p);
600 				vm_page_activate(p);
601 			} else {
602 				vm_page_and_queue_spin_unlock(p);
603 			}
604 		} else {
605 			p->act_count += actcount;
606 			if (p->act_count > ACT_MAX)
607 				p->act_count = ACT_MAX;
608 		}
609 		vm_page_flag_clear(p, PG_REFERENCED);
610 		vm_page_wakeup(p);
611 		goto done;
612 	}
613 
614 	/*
615 	 * Remove the page from this particular pmap.  Once we do this, our
616 	 * pmap scans will not see it again (unless it gets faulted in), so
617 	 * we must actively dispose of or deal with the page.
618 	 */
619 	pmap_remove_specific(info->pmap, p);
620 
621 	/*
622 	 * If the page is not mapped to another process (i.e. as would be
623 	 * typical if this were a shared page from a library) then deactivate
624 	 * the page and clean it in two passes only.
625 	 *
626 	 * If the page hasn't been referenced since the last check, remove it
627 	 * from the pmap.  If it is no longer mapped, deactivate it
628 	 * immediately, accelerating the normal decline.
629 	 *
630 	 * Once the page has been removed from the pmap the RSS code no
631 	 * longer tracks it so we have to make sure that it is staged for
632 	 * potential flush action.
633 	 *
634 	 * XXX
635 	 */
636 	if ((p->flags & PG_MAPPED) == 0 ||
637 	    (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
638 		if (p->queue - p->pc == PQ_ACTIVE) {
639 			vm_page_deactivate(p);
640 		}
641 		if (p->queue - p->pc == PQ_INACTIVE) {
642 			cleanit = 1;
643 		}
644 	}
645 
646 	/*
647 	 * Ok, try to fully clean the page and any nearby pages such that at
648 	 * least the requested page is freed or moved to the cache queue.
649 	 *
650 	 * We usually do this synchronously to allow us to get the page into
651 	 * the CACHE queue quickly, which will prevent memory exhaustion if
652 	 * a process with a memoryuse limit is running away.  However, the
653 	 * sysadmin may desire to set vm.swap_user_async which relaxes this
654 	 * and improves write performance.
655 	 */
656 	if (cleanit) {
657 		long max_launder = 0x7FFF;
658 		long vnodes_skipped = 0;
659 		long counts[4] = { 0, 0, 0, 0 };
660 		int vmflush_flags;
661 		struct vnode *vpfailed = NULL;
662 
663 		info->offset = va;
664 
665 		if (vm_pageout_memuse_mode >= 2) {
666 			vmflush_flags = OBJPC_TRY_TO_CACHE |
667 					OBJPC_ALLOW_ACTIVE;
668 			if (swap_user_async == 0)
669 				vmflush_flags |= OBJPC_SYNC;
670 			vm_page_flag_set(p, PG_WINATCFLS);
671 			info->cleancount +=
672 				vm_pageout_page(p, &max_launder,
673 						&vnodes_skipped,
674 						&vpfailed, 1, vmflush_flags,
675 						counts);
676 		} else {
677 			vm_page_wakeup(p);
678 			++info->cleancount;
679 		}
680 	} else {
681 		vm_page_wakeup(p);
682 	}
683 
684 	/*
685 	 * Must be at end to avoid SMP races.
686 	 */
687 done:
688 	lwkt_user_yield();
689 	return 0;
690 }
691 
692 /*
693  * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
694  * which is relatively difficult to do.  We try to keep track of where we
695  * left off last time to reduce scan overhead.
696  *
697  * Called when vm_pageout_memuse_mode is >= 1.
698  */
699 void
700 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
701 {
702 	vm_offset_t pgout_offset;
703 	struct pmap_pgscan_info info;
704 	int retries = 3;
705 
706 	pgout_offset = map->pgout_offset;
707 again:
708 #if 0
709 	kprintf("%016jx ", pgout_offset);
710 #endif
711 	if (pgout_offset < VM_MIN_USER_ADDRESS)
712 		pgout_offset = VM_MIN_USER_ADDRESS;
713 	if (pgout_offset >= VM_MAX_USER_ADDRESS)
714 		pgout_offset = 0;
715 	info.pmap = vm_map_pmap(map);
716 	info.limit = limit;
717 	info.beg_addr = pgout_offset;
718 	info.end_addr = VM_MAX_USER_ADDRESS;
719 	info.callback = vm_pageout_mdp_callback;
720 	info.cleancount = 0;
721 	info.actioncount = 0;
722 	info.busycount = 0;
723 
724 	pmap_pgscan(&info);
725 	pgout_offset = info.offset;
726 #if 0
727 	kprintf("%016jx %08lx %08lx\n", pgout_offset,
728 		info.cleancount, info.actioncount);
729 #endif
730 
731 	if (pgout_offset != VM_MAX_USER_ADDRESS &&
732 	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
733 		goto again;
734 	} else if (retries &&
735 		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
736 		--retries;
737 		goto again;
738 	}
739 	map->pgout_offset = pgout_offset;
740 }
741 #endif
742 
743 /*
744  * Called when the pageout scan wants to free a page.  We no longer
745  * try to cycle the vm_object here with a reference & dealloc, which can
746  * cause a non-trivial object collapse in a critical path.
747  *
748  * It is unclear why we cycled the ref_count in the past, perhaps to try
749  * to optimize shadow chain collapses but I don't quite see why it would
750  * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
751  * synchronously and not have to be kick-started.
752  */
753 static void
754 vm_pageout_page_free(vm_page_t m)
755 {
756 	vm_page_protect(m, VM_PROT_NONE);
757 	vm_page_free(m);
758 }
759 
760 /*
761  * vm_pageout_scan does the dirty work for the pageout daemon.
762  */
763 struct vm_pageout_scan_info {
764 	struct proc *bigproc;
765 	vm_offset_t bigsize;
766 };
767 
768 static int vm_pageout_scan_callback(struct proc *p, void *data);
769 
770 /*
771  * Scan inactive queue
772  *
773  * WARNING! Can be called from two pagedaemon threads simultaneously.
774  */
775 static int
776 vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
777 			 long *vnodes_skipped, long *counts)
778 {
779 	vm_page_t m;
780 	struct vm_page marker;
781 	struct vnode *vpfailed;		/* warning, allowed to be stale */
782 	long maxscan;
783 	long delta = 0;
784 	long max_launder;
785 	int isep;
786 	int vmflush_flags;
787 
788 	isep = (curthread == emergpager);
789 	if ((unsigned)pass > 1000)
790 		pass = 1000;
791 
792 	/*
793 	 * This routine is called for each of PQ_L2_SIZE inactive queues.
794 	 * We want the vm_max_launder parameter to apply to the whole
795 	 * queue (i.e. per-whole-queue pass, not per-sub-queue).
796 	 *
797 	 * In each successive full-pass when the page target is not met we
798 	 * allow the per-queue max_launder to increase up to a maximum of
799 	 * vm_max_launder / 16.
800 	 */
801 	if (pass)
802 		max_launder = (long)vm_max_launder * (pass + 1) / PQ_L2_SIZE;
803 	else
804 		max_launder = (long)vm_max_launder / PQ_L2_SIZE;
805 	max_launder /= MAXSCAN_DIVIDER;
806 
807 	if (max_launder <= 1)
808 		max_launder = 1;
809 	if (max_launder >= vm_max_launder / 16)
810 		max_launder = vm_max_launder / 16 + 1;
811 
812 	/*
813 	 * Start scanning the inactive queue for pages we can move to the
814 	 * cache or free.  The scan will stop when the target is reached or
815 	 * we have scanned the entire inactive queue.  Note that m->act_count
816 	 * is not used to form decisions for the inactive queue, only for the
817 	 * active queue.
818 	 *
819 	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
820 	 *	  PAGES.
821 	 */
822 
823 	/*
824 	 * Initialize our marker
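	 *
	 * The marker is a fictitious, busy-locked pseudo-page.  It is
	 * inserted into the queue and moved just past each page as that
	 * page is processed, so the queue spinlock can be dropped while
	 * a page is being worked on and the scan can resume from the
	 * marker afterwards.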
825 	 */
826 	bzero(&marker, sizeof(marker));
827 	marker.flags = PG_FICTITIOUS | PG_MARKER;
828 	marker.busy_count = PBUSY_LOCKED;
829 	marker.queue = PQ_INACTIVE + q;
830 	marker.pc = q;
831 	marker.wire_count = 1;
832 
833 	/*
834 	 * Inactive queue scan.
835 	 *
836 	 * We pick off approximately 1/10 of each queue.  Each queue is
837 	 * effectively organized LRU so scanning the entire queue would
838 	 * improperly pick up pages that might still be in regular use.
839 	 *
840 	 * NOTE: The vm_page must be spinlocked before the queue to avoid
841 	 *	 deadlocks, so it is easiest to simply iterate the loop
842 	 *	 with the queue unlocked at the top.
843 	 */
844 	vpfailed = NULL;
845 
846 	vm_page_queues_spin_lock(PQ_INACTIVE + q);
847 	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
848 	maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;
849 
850 	/*
851 	 * Queue locked at top of loop to avoid stack marker issues.
852 	 */
853 	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
854 	       maxscan-- > 0 && avail_shortage - delta > 0)
855 	{
856 		int count;
857 
858 		KKASSERT(m->queue == PQ_INACTIVE + q);
859 		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
860 			     &marker, pageq);
861 		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
862 				   &marker, pageq);
863 		mycpu->gd_cnt.v_pdpages++;
864 
865 		/*
866 		 * Skip marker pages (atomic against other markers to avoid
867 		 * infinite hop-over scans).
868 		 */
869 		if (m->flags & PG_MARKER)
870 			continue;
871 
872 		/*
873 		 * Try to busy the page.  Don't mess with pages which are
874 		 * already busy or reorder them in the queue.
875 		 */
876 		if (vm_page_busy_try(m, TRUE))
877 			continue;
878 
879 		/*
880 		 * Remaining operations run with the page busy and neither
881 		 * the page or the queue will be spin-locked.
882 		 */
883 		KKASSERT(m->queue == PQ_INACTIVE + q);
884 		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
885 
886 		/*
887 		 * The emergency pager runs when the primary pager gets
888 		 * stuck, which typically means the primary pager deadlocked
889 		 * on a vnode-backed page.  Therefore, the emergency pager
890 		 * must skip any complex objects.
891 		 *
892 		 * We disallow VNODEs unless they are VCHR whos device ops
893 		 * does not flag D_NOEMERGPGR.
894 		 */
895 		if (isep && m->object) {
896 			struct vnode *vp;
897 
898 			switch(m->object->type) {
899 			case OBJT_DEFAULT:
900 			case OBJT_SWAP:
901 				/*
902 				 * Allow anonymous memory and assume that
903 				 * swap devices are not complex, since its
904 				 * kinda worthless if we can't swap out dirty
905 				 * anonymous pages.
906 				 */
907 				break;
908 			case OBJT_VNODE:
909 				/*
910 				 * Allow VCHR device if the D_NOEMERGPGR
911 				 * flag is not set, deny other vnode types
912 				 * as being too complex.
913 				 */
914 				vp = m->object->handle;
915 				if (vp && vp->v_type == VCHR &&
916 				    vp->v_rdev && vp->v_rdev->si_ops &&
917 				    (vp->v_rdev->si_ops->head.flags &
918 				     D_NOEMERGPGR) == 0) {
919 					break;
920 				}
921 				/* Deny - fall through */
922 			default:
923 				/*
924 				 * Deny
925 				 */
926 				vm_page_wakeup(m);
927 				vm_page_queues_spin_lock(PQ_INACTIVE + q);
928 				lwkt_yield();
929 				continue;
930 			}
931 		}
932 
933 		/*
934 		 * Try to pageout the page and perhaps other nearby pages.
935 		 * We want to get the pages into the cache eventually (
936 		 * first or second pass).  Otherwise the pages can wind up
937 		 * just cycling in the inactive queue, getting flushed over
938 		 * and over again.
939 		 *
940 		 * Generally speaking we recycle dirty pages within PQ_INACTIVE
941 		 * twice (double LRU) before paging them out.  If the
942 		 * memuse_mode is >= 3 we run them single-LRU like we do clean
943 		 * pages.
944 		 */
945 		if (vm_pageout_memuse_mode >= 3)
946 			vm_page_flag_set(m, PG_WINATCFLS);
947 
948 		vmflush_flags = 0;
949 		if (vm_pageout_allow_active)
950 			vmflush_flags |= OBJPC_ALLOW_ACTIVE;
951 		if (m->flags & PG_WINATCFLS)
952 			vmflush_flags |= OBJPC_TRY_TO_CACHE;
953 		count = vm_pageout_page(m, &max_launder, vnodes_skipped,
954 					&vpfailed, pass, vmflush_flags, counts);
955 		delta += count;
956 
957 		/*
958 		 * Systems with a ton of memory can wind up with huge
959 		 * deactivation counts.  Because the inactive scan is
960 		 * doing a lot of flushing, the combination can result
961 		 * in excessive paging even in situations where other
962 		 * unrelated threads free up sufficient VM.
963 		 *
964 		 * To deal with this we abort the nominal active->inactive
965 		 * scan before we hit the inactive target when free+cache
966 		 * levels have reached a reasonable target.
967 		 *
968 		 * When deciding to stop early we need to add some slop to
969 		 * the test and we need to return full completion to the caller
970 		 * to prevent the caller from thinking there is something
971 		 * wrong and issuing a low-memory+swap warning or pkill.
972 		 *
973 		 * A deficit forces paging regardless of the state of the
974 		 * VM page queues (used for RSS enforcement).
975 		 */
976 		lwkt_yield();
977 		vm_page_queues_spin_lock(PQ_INACTIVE + q);
978 		if (vm_paging_target() < -vm_max_launder) {
979 			/*
980 			 * Stopping early, return full completion to caller.
981 			 */
982 			if (delta < avail_shortage)
983 				delta = avail_shortage;
984 			break;
985 		}
986 	}
987 
988 	/* page queue still spin-locked */
989 	TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
990 	vm_page_queues_spin_unlock(PQ_INACTIVE + q);
991 
992 	return (delta);
993 }
994 
995 /*
996  * Pageout the specified page, return the total number of pages paged out
997  * (this routine may cluster).
998  *
999  * The page must be busied and soft-busied by the caller and will be disposed
1000  * of by this function.
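 *
 * Within this routine counts[0] accumulates the number of pages laundered
 * via vm_pageout_clean_helper(), counts[1] the number of pages freed or
 * moved to the cache queue, counts[2] the number of dirty pages deferred
 * for another LRU cycle via PG_WINATCFLS, and counts[3] the number of
 * pages reactivated due to recent references.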
1001  */
1002 static int
1003 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
1004 		struct vnode **vpfailedp, int pass, int vmflush_flags,
1005 		long *counts)
1006 {
1007 	vm_object_t object;
1008 	int actcount;
1009 	int count = 0;
1010 
1011 	/*
1012 	 * Wiring no longer removes a page from its queue.  The last unwiring
1013 	 * will requeue the page.  Obviously wired pages cannot be paged out
1014 	 * so unqueue it and return.
1015 	 */
1016 	if (m->wire_count) {
1017 		vm_page_unqueue_nowakeup(m);
1018 		vm_page_wakeup(m);
1019 		return 0;
1020 	}
1021 
1022 	/*
1023 	 * A held page may be undergoing I/O, so skip it.
1024 	 */
1025 	if (m->hold_count) {
1026 		vm_page_and_queue_spin_lock(m);
1027 		if (m->queue - m->pc == PQ_INACTIVE) {
1028 			TAILQ_REMOVE(
1029 				&vm_page_queues[m->queue].pl, m, pageq);
1030 			TAILQ_INSERT_TAIL(
1031 				&vm_page_queues[m->queue].pl, m, pageq);
1032 		}
1033 		vm_page_and_queue_spin_unlock(m);
1034 		vm_page_wakeup(m);
1035 		return 0;
1036 	}
1037 
1038 	if (m->object == NULL || m->object->ref_count == 0) {
1039 		/*
1040 		 * If the object is not being used, we ignore previous
1041 		 * references.
1042 		 */
1043 		vm_page_flag_clear(m, PG_REFERENCED);
1044 		pmap_clear_reference(m);
1045 		/* fall through to end */
1046 	} else if (((m->flags & PG_REFERENCED) == 0) &&
1047 		    (actcount = pmap_ts_referenced(m))) {
1048 		/*
1049 		 * Otherwise, if the page has been referenced while
1050 		 * in the inactive queue, we bump the "activation
1051 		 * count" upwards, making it less likely that the
1052 		 * page will be added back to the inactive queue
1053 		 * prematurely again.  Here we check the page tables
1054 		 * (or emulated bits, if any), since the upper level
1055 		 * VM system does not know anything about existing
1056 		 * references.
1057 		 */
1058 		++counts[3];
1059 		vm_page_activate(m);
1060 		m->act_count += (actcount + ACT_ADVANCE);
1061 		vm_page_wakeup(m);
1062 		return 0;
1063 	}
1064 
1065 	/*
1066 	 * (m) is still busied.
1067 	 *
1068 	 * If the upper level VM system knows about any page
1069 	 * references, we activate the page.  We also set the
1070 	 * "activation count" higher than normal so that we are less
1071 	 * likely to place pages back onto the inactive queue again.
1072 	 */
1073 	if ((m->flags & PG_REFERENCED) != 0) {
1074 		vm_page_flag_clear(m, PG_REFERENCED);
1075 		actcount = pmap_ts_referenced(m);
1076 		vm_page_activate(m);
1077 		m->act_count += (actcount + ACT_ADVANCE + 1);
1078 		vm_page_wakeup(m);
1079 		++counts[3];
1080 		return 0;
1081 	}
1082 
1083 	/*
1084 	 * If the upper level VM system doesn't know anything about
1085 	 * the page being dirty, we have to check for it again.  As
1086 	 * far as the VM code knows, any partially dirty pages are
1087 	 * fully dirty.
1088 	 *
1089 	 * Pages marked PG_WRITEABLE may be mapped into the user
1090 	 * address space of a process running on another cpu.  A
1091 	 * user process (without holding the MP lock) running on
1092 	 * another cpu may be able to touch the page while we are
1093 	 * trying to remove it.  vm_page_cache() will handle this
1094 	 * case for us.
1095 	 */
1096 	if (m->dirty == 0) {
1097 		vm_page_test_dirty(m);
1098 	} else {
1099 		vm_page_dirty(m);
1100 	}
1101 
1102 	if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1103 		/*
1104 		 * Invalid pages can be easily freed
1105 		 */
1106 		vm_pageout_page_free(m);
1107 		mycpu->gd_cnt.v_dfree++;
1108 		++count;
1109 		++counts[1];
1110 	} else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1111 		/*
1112 		 * Clean pages can be placed onto the cache queue.
1113 		 * This effectively frees them.
1114 		 */
1115 		vm_page_cache(m);
1116 		++count;
1117 		++counts[1];
1118 	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
1119 		/*
1120 		 * Dirty pages need to be paged out, but flushing
1121 		 * a page is extremely expensive versus freeing
1122 		 * a clean page.  Rather than artificially limiting
1123 		 * the number of pages we can flush, we instead give
1124 		 * dirty pages extra priority on the inactive queue
1125 		 * by forcing them to be cycled through the queue
1126 		 * twice before being flushed, after which the
1127 		 * (now clean) page will cycle through once more
1128 		 * before being freed.  This significantly extends
1129 		 * the thrash point for a heavily loaded machine.
1130 		 */
1131 		++counts[2];
1132 		vm_page_flag_set(m, PG_WINATCFLS);
1133 		vm_page_and_queue_spin_lock(m);
1134 		if (m->queue - m->pc == PQ_INACTIVE) {
1135 			TAILQ_REMOVE(
1136 				&vm_page_queues[m->queue].pl, m, pageq);
1137 			TAILQ_INSERT_TAIL(
1138 				&vm_page_queues[m->queue].pl, m, pageq);
1139 		}
1140 		vm_page_and_queue_spin_unlock(m);
1141 		vm_page_wakeup(m);
1142 	} else if (*max_launderp > 0) {
1143 		/*
1144 		 * We always want to try to flush some dirty pages if
1145 		 * we encounter them, to keep the system stable.
1146 		 * Normally this number is small, but under extreme
1147 		 * pressure where there are insufficient clean pages
1148 		 * on the inactive queue, we may have to go all out.
1149 		 */
1150 		int swap_pageouts_ok;
1151 		struct vnode *vp = NULL;
1152 
1153 		if ((m->flags & PG_WINATCFLS) == 0)
1154 			vm_page_flag_set(m, PG_WINATCFLS);
1155 		swap_pageouts_ok = 0;
1156 		object = m->object;
1157 		if (object &&
1158 		    (object->type != OBJT_SWAP) &&
1159 		    (object->type != OBJT_DEFAULT)) {
1160 			swap_pageouts_ok = 1;
1161 		} else {
1162 			swap_pageouts_ok = !(defer_swap_pageouts ||
1163 					     disable_swap_pageouts);
1164 			swap_pageouts_ok |= (!disable_swap_pageouts &&
1165 					     defer_swap_pageouts &&
1166 					     vm_page_count_min(0));
1167 		}
1168 
1169 		/*
1170 		 * We don't bother paging objects that are "dead".
1171 		 * Those objects are in a "rundown" state.
1172 		 */
1173 		if (!swap_pageouts_ok ||
1174 		    (object == NULL) ||
1175 		    (object->flags & OBJ_DEAD)) {
1176 			vm_page_and_queue_spin_lock(m);
1177 			if (m->queue - m->pc == PQ_INACTIVE) {
1178 				TAILQ_REMOVE(
1179 				    &vm_page_queues[m->queue].pl,
1180 				    m, pageq);
1181 				TAILQ_INSERT_TAIL(
1182 				    &vm_page_queues[m->queue].pl,
1183 				    m, pageq);
1184 			}
1185 			vm_page_and_queue_spin_unlock(m);
1186 			vm_page_wakeup(m);
1187 			return 0;
1188 		}
1189 
1190 		/*
1191 		 * (m) is still busied.
1192 		 *
1193 		 * The object is already known NOT to be dead.   It
1194 		 * is possible for the vget() to block the whole
1195 		 * pageout daemon, but the new low-memory handling
1196 		 * code should prevent it.
1197 		 *
1198 		 * The previous code skipped locked vnodes and, worse,
1199 		 * reordered pages in the queue.  This results in
1200 		 * completely non-deterministic operation because,
1201 		 * quite often, a vm_fault has initiated an I/O and
1202 		 * is holding a locked vnode at just the point where
1203 		 * the pageout daemon is woken up.
1204 		 *
1205 		 * We can't wait forever for the vnode lock; we might
1206 		 * deadlock due to a vn_read() getting stuck in
1207 		 * vm_wait while holding this vnode.  We skip the
1208 		 * vnode if we can't get it in a reasonable amount
1209 		 * of time.
1210 		 *
1211 		 * vpfailed is used to (try to) avoid the case where
1212 		 * a large number of pages are associated with a
1213 		 * locked vnode, which could cause the pageout daemon
1214 		 * to stall for an excessive amount of time.
1215 		 */
1216 		if (object->type == OBJT_VNODE) {
1217 			int flags;
1218 
1219 			vp = object->handle;
1220 			flags = LK_EXCLUSIVE;
1221 			if (vp == *vpfailedp)
1222 				flags |= LK_NOWAIT;
1223 			else
1224 				flags |= LK_TIMELOCK;
1225 			vm_page_hold(m);
1226 			vm_page_wakeup(m);
1227 
1228 			/*
1229 			 * We have unbusied (m) temporarily so we can
1230 			 * acquire the vp lock without deadlocking.
1231 			 * (m) is held to prevent destruction.
1232 			 */
1233 			if (vget(vp, flags) != 0) {
1234 				*vpfailedp = vp;
1235 				++pageout_lock_miss;
1236 				if (object->flags & OBJ_MIGHTBEDIRTY)
1237 					    ++*vnodes_skippedp;
1238 				vm_page_unhold(m);
1239 				return 0;
1240 			}
1241 
1242 			/*
1243 			 * The page might have been moved to another
1244 			 * queue during potential blocking in vget()
1245 			 * above.  The page might have been freed and
1246 			 * reused for another vnode.  The object might
1247 			 * have been reused for another vnode.
1248 			 */
1249 			if (m->queue - m->pc != PQ_INACTIVE ||
1250 			    m->object != object ||
1251 			    object->handle != vp) {
1252 				if (object->flags & OBJ_MIGHTBEDIRTY)
1253 					++*vnodes_skippedp;
1254 				vput(vp);
1255 				vm_page_unhold(m);
1256 				return 0;
1257 			}
1258 
1259 			/*
1260 			 * The page may have been busied during the
1261 			 * blocking in vget() above; we don't move the
1262 			 * page back onto the end of the queue because
1263 			 * statistics are more correct if we don't.
1264 			 */
1265 			if (vm_page_busy_try(m, TRUE)) {
1266 				vput(vp);
1267 				vm_page_unhold(m);
1268 				return 0;
1269 			}
1270 			vm_page_unhold(m);
1271 
1272 			/*
1273 			 * If it was wired while we didn't own it.
1274 			 */
1275 			if (m->wire_count) {
1276 				vm_page_unqueue_nowakeup(m);
1277 				vput(vp);
1278 				vm_page_wakeup(m);
1279 				return 0;
1280 			}
1281 
1282 			/*
1283 			 * (m) is busied again
1284 			 *
1285 			 * We own the busy bit and remove our hold
1286 			 * bit.  If the page is still held it
1287 			 * might be undergoing I/O, so skip it.
1288 			 */
1289 			if (m->hold_count) {
1290 rebusy_failed:
1291 				vm_page_and_queue_spin_lock(m);
1292 				if (m->queue - m->pc == PQ_INACTIVE) {
1293 					TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
1294 					TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1295 				}
1296 				vm_page_and_queue_spin_unlock(m);
1297 				if (object->flags & OBJ_MIGHTBEDIRTY)
1298 					++*vnodes_skippedp;
1299 				vm_page_wakeup(m);
1300 				vput(vp);
1301 				return 0;
1302 			}
1303 
1304 			/*
1305 			 * Recheck queue, object, and vp now that we have
1306 			 * rebusied the page.
1307 			 */
1308 			if (m->queue - m->pc != PQ_INACTIVE ||
1309 			    m->object != object ||
1310 			    object->handle != vp) {
1311 				kprintf("vm_pageout_page: "
1312 					"rebusy %p failed(A)\n",
1313 					m);
1314 				goto rebusy_failed;
1315 			}
1316 
1317 			/*
1318 			 * Check page validity
1319 			 */
1320 			if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1321 				kprintf("vm_pageout_page: "
1322 					"rebusy %p failed(B)\n",
1323 					m);
1324 				goto rebusy_failed;
1325 			}
1326 			if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1327 				kprintf("vm_pageout_page: "
1328 					"rebusy %p failed(C)\n",
1329 					m);
1330 				goto rebusy_failed;
1331 			}
1332 
1333 			/* (m) is left busied as we fall through */
1334 		}
1335 
1336 		/*
1337 		 * page is busy and not held here.
1338 		 *
1339 		 * If a page is dirty, then it is either being washed
1340 		 * (but not yet cleaned) or it is still in the
1341 		 * laundry.  If it is still in the laundry, then we
1342 		 * start the cleaning operation.
1343 		 *
1344 		 * decrement inactive_shortage on success to account
1345 		 * for the (future) cleaned page.  Otherwise we
1346 		 * could wind up laundering or cleaning too many
1347 		 * pages.
1348 		 *
1349 		 * NOTE: Cleaning the page here does not cause
1350 		 *	 force_deficit to be adjusted, because the
1351 		 *	 page is not being freed or moved to the
1352 		 *	 cache.
1353 		 */
1354 		count = vm_pageout_clean_helper(m, vmflush_flags);
1355 		counts[0] += count;
1356 		*max_launderp -= count;
1357 
1358 		/*
1359 		 * The clean helper consumed our busy; page no longer accessible
1360 		 */
1361 		if (vp != NULL)
1362 			vput(vp);
1363 	} else {
1364 		vm_page_wakeup(m);
1365 	}
1366 	return count;
1367 }
1368 
1369 /*
1370  * Scan active queue
1371  *
1372  * WARNING! Can be called from two pagedaemon threads simultaneously.
1373  */
1374 static int
1375 vm_pageout_scan_active(int pass, int q,
1376 		       long avail_shortage, long inactive_shortage,
1377 		       long *recycle_countp)
1378 {
1379 	struct vm_page marker;
1380 	vm_page_t m;
1381 	int actcount;
1382 	long delta = 0;
1383 	long maxscan;
1384 	int isep;
1385 
1386 	isep = (curthread == emergpager);
1387 
1388 	/*
1389 	 * We want to move pages from the active queue to the inactive
1390 	 * queue to get the inactive queue to the inactive target.  If
1391 	 * we still have a page shortage from above we try to directly free
1392 	 * clean pages instead of moving them.
1393 	 *
1394 	 * If we do still have a shortage we keep track of the number of
1395 	 * pages we free or cache (recycle_count) as a measure of thrashing
1396 	 * between the active and inactive queues.
1397 	 *
1398 	 * If we were able to completely satisfy the free+cache targets
1399 	 * from the inactive pool we limit the number of pages we move
1400 	 * from the active pool to the inactive pool to 2x the pages we
1401 	 * had removed from the inactive pool (with a minimum of 1/5 the
1402 	 * inactive target).  If we were not able to completely satisfy
1403 	 * the free+cache targets we go for the whole target aggressively.
1404 	 *
1405 	 * NOTE: Both variables can end up negative.
1406 	 * NOTE: We are still in a critical section.
1407 	 *
1408 	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
1409 	 *	  PAGES.
1410 	 */
1411 
1412 	bzero(&marker, sizeof(marker));
1413 	marker.flags = PG_FICTITIOUS | PG_MARKER;
1414 	marker.busy_count = PBUSY_LOCKED;
1415 	marker.queue = PQ_ACTIVE + q;
1416 	marker.pc = q;
1417 	marker.wire_count = 1;
1418 
1419 	vm_page_queues_spin_lock(PQ_ACTIVE + q);
1420 	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1421 	maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;
1422 
1423 	/*
1424 	 * Queue locked at top of loop to avoid stack marker issues.
1425 	 */
1426 	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1427 	       maxscan-- > 0 && (avail_shortage - delta > 0 ||
1428 				inactive_shortage > 0))
1429 	{
1430 		KKASSERT(m->queue == PQ_ACTIVE + q);
1431 		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1432 			     &marker, pageq);
1433 		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1434 				   &marker, pageq);
1435 
1436 		/*
1437 		 * Skip marker pages (atomic against other markers to avoid
1438 		 * infinite hop-over scans).
1439 		 */
1440 		if (m->flags & PG_MARKER)
1441 			continue;
1442 
1443 		/*
1444 		 * Try to busy the page.  Don't mess with pages which are
1445 		 * already busy or reorder them in the queue.
1446 		 */
1447 		if (vm_page_busy_try(m, TRUE))
1448 			continue;
1449 
1450 		/*
1451 		 * Remaining operations run with the page busy and neither
1452 		 * the page or the queue will be spin-locked.
1453 		 */
1454 		KKASSERT(m->queue == PQ_ACTIVE + q);
1455 		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1456 
1457 #if 0
1458 		/*
1459 		 * Don't deactivate pages that are held, even if we can
1460 		 * busy them.  (XXX why not?)
1461 		 */
1462 		if (m->hold_count) {
1463 			vm_page_and_queue_spin_lock(m);
1464 			if (m->queue - m->pc == PQ_ACTIVE) {
1465 				TAILQ_REMOVE(
1466 					&vm_page_queues[PQ_ACTIVE + q].pl,
1467 					m, pageq);
1468 				TAILQ_INSERT_TAIL(
1469 					&vm_page_queues[PQ_ACTIVE + q].pl,
1470 					m, pageq);
1471 			}
1472 			vm_page_and_queue_spin_unlock(m);
1473 			vm_page_wakeup(m);
1474 			goto next;
1475 		}
1476 #endif
1477 		/*
1478 		 * We can just remove wired pages from the queue
1479 		 */
1480 		if (m->wire_count) {
1481 			vm_page_unqueue_nowakeup(m);
1482 			vm_page_wakeup(m);
1483 			goto next;
1484 		}
1485 
1486 		/*
1487 		 * The emergency pager ignores vnode-backed pages as these
1488 		 * are the pages that probably bricked the main pager.
1489 		 */
1490 		if (isep && m->object && m->object->type == OBJT_VNODE) {
1491 			vm_page_and_queue_spin_lock(m);
1492 			if (m->queue - m->pc == PQ_ACTIVE) {
1493 				TAILQ_REMOVE(
1494 					&vm_page_queues[PQ_ACTIVE + q].pl,
1495 					m, pageq);
1496 				TAILQ_INSERT_TAIL(
1497 					&vm_page_queues[PQ_ACTIVE + q].pl,
1498 					m, pageq);
1499 			}
1500 			vm_page_and_queue_spin_unlock(m);
1501 			vm_page_wakeup(m);
1502 			goto next;
1503 		}
1504 
1505 		/*
1506 		 * The count for pagedaemon pages is done after checking the
1507 		 * page for eligibility...
1508 		 */
1509 		mycpu->gd_cnt.v_pdpages++;
1510 
1511 		/*
1512 		 * Check to see "how much" the page has been used and clear
1513 		 * the tracking access bits.  If the object has no references
1514 		 * don't bother paying the expense.
1515 		 */
1516 		actcount = 0;
1517 		if (m->object && m->object->ref_count != 0) {
1518 			if (m->flags & PG_REFERENCED)
1519 				++actcount;
1520 			actcount += pmap_ts_referenced(m);
1521 			if (actcount) {
1522 				m->act_count += ACT_ADVANCE + actcount;
1523 				if (m->act_count > ACT_MAX)
1524 					m->act_count = ACT_MAX;
1525 			}
1526 		}
1527 		vm_page_flag_clear(m, PG_REFERENCED);
1528 
1529 		/*
1530 		 * actcount is only valid if the object ref_count is non-zero.
1531 		 * If the page does not have an object, actcount will be zero.
1532 		 */
1533 		if (actcount && m->object->ref_count != 0) {
1534 			vm_page_and_queue_spin_lock(m);
1535 			if (m->queue - m->pc == PQ_ACTIVE) {
1536 				TAILQ_REMOVE(
1537 					&vm_page_queues[PQ_ACTIVE + q].pl,
1538 					m, pageq);
1539 				TAILQ_INSERT_TAIL(
1540 					&vm_page_queues[PQ_ACTIVE + q].pl,
1541 					m, pageq);
1542 			}
1543 			vm_page_and_queue_spin_unlock(m);
1544 			vm_page_wakeup(m);
1545 		} else {
1546 			switch(m->object->type) {
1547 			case OBJT_DEFAULT:
1548 			case OBJT_SWAP:
1549 				m->act_count -= min(m->act_count,
1550 						    vm_anonmem_decline);
1551 				break;
1552 			default:
1553 				m->act_count -= min(m->act_count,
1554 						    vm_filemem_decline);
1555 				break;
1556 			}
1557 			if (vm_pageout_algorithm ||
1558 			    (m->object == NULL) ||
1559 			    (m->object && (m->object->ref_count == 0)) ||
1560 			    m->act_count < pass + 1
1561 			) {
1562 				/*
1563 				 * Deactivate the page.  If we had a
1564 				 * shortage from our inactive scan try to
1565 				 * free (cache) the page instead.
1566 				 *
1567 				 * Don't just blindly cache the page if
1568 				 * we do not have a shortage from the
1569 				 * inactive scan, that could lead to
1570 				 * gigabytes being moved.
1571 				 */
1572 				--inactive_shortage;
1573 				if (avail_shortage - delta > 0 ||
1574 				    (m->object && (m->object->ref_count == 0)))
1575 				{
1576 					if (avail_shortage - delta > 0)
1577 						++*recycle_countp;
1578 					vm_page_protect(m, VM_PROT_NONE);
1579 					if (m->dirty == 0 &&
1580 					    (m->flags & PG_NEED_COMMIT) == 0 &&
1581 					    avail_shortage - delta > 0) {
1582 						vm_page_cache(m);
1583 					} else {
1584 						vm_page_deactivate(m);
1585 						vm_page_wakeup(m);
1586 					}
1587 				} else {
1588 					vm_page_deactivate(m);
1589 					vm_page_wakeup(m);
1590 				}
1591 				++delta;
1592 			} else {
1593 				vm_page_and_queue_spin_lock(m);
1594 				if (m->queue - m->pc == PQ_ACTIVE) {
1595 					TAILQ_REMOVE(
1596 					    &vm_page_queues[PQ_ACTIVE + q].pl,
1597 					    m, pageq);
1598 					TAILQ_INSERT_TAIL(
1599 					    &vm_page_queues[PQ_ACTIVE + q].pl,
1600 					    m, pageq);
1601 				}
1602 				vm_page_and_queue_spin_unlock(m);
1603 				vm_page_wakeup(m);
1604 			}
1605 		}
1606 next:
1607 		lwkt_yield();
1608 		vm_page_queues_spin_lock(PQ_ACTIVE + q);
1609 	}
1610 
1611 	/*
1612 	 * Clean out our local marker.
1613 	 *
1614 	 * Page queue still spin-locked.
1615 	 */
1616 	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1617 	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1618 
1619 	return (delta);
1620 }
1621 
1622 /*
1623  * The number of actually free pages can drop down to v_free_reserved,
1624  * we try to build the free count back above v_free_min.  Note that
1625  * vm_paging_needed() also returns TRUE if v_free_count is not at
1626  * least v_free_min so that is the minimum we must build the free
1627  * count to.
1628  *
1629  * We use a slightly higher target to improve hysteresis,
1630  * ((v_free_target + v_free_min) / 2).  Since v_free_target
1631  * is usually the same as v_cache_min this maintains about
1632  * half the pages in the free queue as are in the cache queue,
1633  * providing pretty good pipelining for pageout operation.
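 *
 * Purely as an illustration (the real values are computed at boot and
 * scale with system memory): with v_free_min = 10000 pages and
 * v_free_target = 30000 pages, the cache-scan loop below keeps freeing
 * cache pages until v_free_count reaches (10000 + 30000) / 2 = 20000.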
1634  *
1635  * The system operator can manipulate vm.v_cache_min and
1636  * vm.v_free_target to tune the pageout daemon.  Be sure
1637  * to keep vm.v_free_min < vm.v_free_target.
1638  *
1639  * Note that the original paging target is to get at least
1640  * (free_min + cache_min) into (free + cache).  The slightly
1641  * higher target will shift additional pages from cache to free
1642  * without affecting the original paging target in order to
1643  * maintain better hysteresis and not have the free count always
1644  * be dead-on v_free_min.
1645  *
1646  * NOTE: we are still in a critical section.
1647  *
1648  * Pages moved from PQ_CACHE to totally free are not counted in the
1649  * pages_freed counter.
1650  *
1651  * WARNING! Can be called from two pagedaemon threads simultaneously.
1652  */
1653 static void
1654 vm_pageout_scan_cache(long avail_shortage, int pass,
1655 		      long vnodes_skipped, long recycle_count)
1656 {
1657 	static int lastkillticks;
1658 	struct vm_pageout_scan_info info;
1659 	vm_page_t m;
1660 	int isep;
1661 
1662 	isep = (curthread == emergpager);
1663 
1664 	while (vmstats.v_free_count <
1665 	       (vmstats.v_free_min + vmstats.v_free_target) / 2) {
1666 		/*
1667 		 * This steals some code from vm/vm_page.c
1668 		 *
1669 		 * Create two rovers and adjust the code to reduce
1670 		 * chances of them winding up at the same index (which
1671 		 * can cause a lot of contention).
1672 		 */
1673 		static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };
1674 
1675 		if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
1676 			goto next_rover;
1677 
1678 		m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
1679 		if (m == NULL)
1680 			break;
1681 		/*
1682 		 * page is returned removed from its queue and spinlocked
1683 		 *
1684 		 * If the busy attempt fails we can still deactivate the page.
1685 		 */
1686 		if (vm_page_busy_try(m, TRUE)) {
1687 			vm_page_deactivate_locked(m);
1688 			vm_page_spin_unlock(m);
1689 			continue;
1690 		}
1691 		vm_page_spin_unlock(m);
1692 		pagedaemon_wakeup();
1693 		lwkt_yield();
1694 
1695 		/*
1696 		 * Remaining operations run with the page busy and neither
1697 		 * the page nor the queue will be spin-locked.
1698 		 */
1699 		if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) ||
1700 		    m->hold_count ||
1701 		    m->wire_count) {
1702 			vm_page_deactivate(m);
1703 			vm_page_wakeup(m);
1704 			continue;
1705 		}
1706 
1707 		/*
1708 		 * Because the page is in the cache, it shouldn't be mapped.
1709 		 */
1710 		pmap_mapped_sync(m);
1711 		KKASSERT((m->flags & PG_MAPPED) == 0);
1712 		KKASSERT(m->dirty == 0);
1713 		vm_pageout_page_free(m);
1714 		mycpu->gd_cnt.v_dfree++;
1715 next_rover:
1716 		if (isep)
1717 			cache_rover[1] -= PQ_PRIME2;
1718 		else
1719 			cache_rover[0] += PQ_PRIME2;
1720 	}
1721 
1722 	/*
1723 	 * If we didn't get enough free pages and we have skipped a vnode
1724 	 * in a writeable object, wake up the sync daemon.  Also kick off
1725 	 * swapout if we did not get enough free pages.
1726 	 */
1727 	if (vm_paging_target() > 0) {
1728 		if (vnodes_skipped && vm_page_count_min(0))
1729 			speedup_syncer(NULL);
1730 #if !defined(NO_SWAPPING)
1731 		if (vm_swap_enabled && vm_page_count_target())
1732 			vm_req_vmdaemon();
1733 #endif
1734 	}
1735 
1736 	/*
1737 	 * Handle catastrophic conditions.  Under good conditions we should
1738 	 * be at the target, well beyond our minimum.  If we could not even
1739 	 * reach our minimum the system is under heavy stress.  But just being
1740 	 * under heavy stress does not trigger process killing.
1741 	 *
1742 	 * We consider ourselves to have run out of memory if the swap pager
1743 	 * is full and avail_shortage is still positive.  The secondary check
1744 	 * ensures that we do not kill processes if the instantaneous
1745 	 * availability is good, even if the pageout daemon pass says it
1746 	 * couldn't get to the target.
1747 	 *
1748 	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
1749 	 *	  SITUATIONS.
1750 	 */
1751 	if (swap_pager_almost_full &&
1752 	    pass > 0 &&
1753 	    isep == 0 &&
1754 	    (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
1755 		kprintf("Warning: system low on memory+swap "
1756 			"shortage %ld for %d ticks!\n",
1757 			avail_shortage, ticks - swap_fail_ticks);
1758 		if (bootverbose)
1759 			kprintf("Metrics: spaf=%d spf=%d pass=%d "
1760 				"avail=%ld target=%ld last=%u\n",
1761 				swap_pager_almost_full,
1762 				swap_pager_full,
1763 				pass,
1764 				avail_shortage,
1765 				vm_paging_target(),
1766 				(unsigned int)(ticks - lastkillticks));
1767 	}
1768 	if (swap_pager_full &&
1769 	    pass > 1 &&
1770 	    isep == 0 &&
1771 	    avail_shortage > 0 &&
1772 	    vm_paging_target() > 0 &&
1773 	    (unsigned int)(ticks - lastkillticks) >= hz) {
1774 		/*
1775 		 * Kill something, maximum rate once per second to give
1776 		 * the process time to free up sufficient memory.
1777 		 */
1778 		lastkillticks = ticks;
1779 		info.bigproc = NULL;
1780 		info.bigsize = 0;
1781 		allproc_scan(vm_pageout_scan_callback, &info, 0);
1782 		if (info.bigproc != NULL) {
1783 			kprintf("Try to kill process %d %s\n",
1784 				info.bigproc->p_pid, info.bigproc->p_comm);
1785 			info.bigproc->p_nice = PRIO_MIN;
1786 			info.bigproc->p_usched->resetpriority(
1787 				FIRST_LWP_IN_PROC(info.bigproc));
1788 			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
1789 			killproc(info.bigproc, "out of swap space");
1790 			wakeup(&vmstats.v_free_count);
1791 			PRELE(info.bigproc);
1792 		}
1793 	}
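	/*
	 * Illustrative summary, not compiled: the killer above fires only
	 * in the primary pagedaemon (isep == 0), only when swap is full,
	 * only after multiple scan passes (pass > 1), only while a shortage
	 * remains, and at most once per second:
	 *
	 *	swap_pager_full && pass > 1 && isep == 0 &&
	 *	avail_shortage > 0 && vm_paging_target() > 0 &&
	 *	(unsigned int)(ticks - lastkillticks) >= hz
	 */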
1794 }
1795 
1796 static int
1797 vm_pageout_scan_callback(struct proc *p, void *data)
1798 {
1799 	struct vm_pageout_scan_info *info = data;
1800 	vm_offset_t size;
1801 
1802 	/*
1803 	 * Never kill system processes or init.  If we have configured swap
1804 	 * then try to avoid killing low-numbered pids.
1805 	 */
1806 	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
1807 	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
1808 		return (0);
1809 	}
1810 
1811 	lwkt_gettoken(&p->p_token);
1812 
1813 	/*
1814 	 * if the process is in a non-running type state,
1815 	 * don't touch it.
1816 	 */
1817 	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
1818 		lwkt_reltoken(&p->p_token);
1819 		return (0);
1820 	}
1821 
1822 	/*
1823 	 * Get the approximate process size.  Note that anonymous pages
1824 	 * with backing swap will be counted twice, but there should not
1825 	 * be too many such pages due to the stress the VM system is
1826 	 * under at this point.
1827 	 */
1828 	size = vmspace_anonymous_count(p->p_vmspace) +
1829 		vmspace_swap_count(p->p_vmspace);
1830 
1831 	/*
1832 	 * If this process is bigger than the biggest one seen so far,
1833 	 * remember it.
1834 	 */
1835 	if (info->bigsize < size) {
1836 		if (info->bigproc)
1837 			PRELE(info->bigproc);
1838 		PHOLD(p);
1839 		info->bigproc = p;
1840 		info->bigsize = size;
1841 	}
1842 	lwkt_reltoken(&p->p_token);
1843 	lwkt_yield();
1844 
1845 	return (0);
1846 }
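/*
 * Illustrative sketch of how vm_pageout_scan_callback() above is driven,
 * simplified from the kill path in vm_pageout_scan_cache():
 *
 *	struct vm_pageout_scan_info info;
 *
 *	info.bigproc = NULL;
 *	info.bigsize = 0;
 *	allproc_scan(vm_pageout_scan_callback, &info, 0);
 *	if (info.bigproc != NULL) {
 *		...kill info.bigproc...
 *		PRELE(info.bigproc);	(drop the hold taken by the callback)
 *	}
 */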
1847 
1848 /*
1849  * This old guy slowly walks PQ_HOLD looking for pages which need to be
1850  * moved back to PQ_FREE.  It is possible for pages to accumulate here
1851  * when vm_page_free() races against vm_page_unhold(), resulting in a
1852  * page being left on a PQ_HOLD queue with hold_count == 0.
1853  *
1854  * It is easier to handle this edge condition here, in non-critical code,
1855  * rather than enforce a spin-lock for every 1->0 transition in
1856  * vm_page_unhold().
1857  *
1858  * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
1859  */
1860 static void
1861 vm_pageout_scan_hold(int q)
1862 {
1863 	vm_page_t m;
1864 
1865 	vm_page_queues_spin_lock(PQ_HOLD + q);
1866 	TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
1867 		if (m->flags & PG_MARKER)
1868 			continue;
1869 
1870 		/*
1871 		 * Process one page and return
1872 		 */
1873 		if (m->hold_count)
1874 			break;
1875 		kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
1876 		vm_page_hold(m);
1877 		vm_page_queues_spin_unlock(PQ_HOLD + q);
1878 		vm_page_unhold(m);	/* reprocess */
1879 		return;
1880 	}
1881 	vm_page_queues_spin_unlock(PQ_HOLD + q);
1882 }
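/*
 * Illustrative note on vm_pageout_scan_hold() above: the fixup simply
 * forces another hold/unhold cycle outside the queue spinlock,
 *
 *	vm_page_hold(m);			(hold_count 0 -> 1)
 *	vm_page_queues_spin_unlock(PQ_HOLD + q);
 *	vm_page_unhold(m);			(1 -> 0, normal requeue path)
 *
 * letting the regular 1->0 transition code move the stranded page back
 * to PQ_FREE.
 */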
1883 
1884 /*
1885  * This routine tries to maintain the pseudo-LRU ordering of the active
1886  * queue so that some statistics accumulation still occurs during long
1887  * periods when there is no paging.  This code helps the situation where
1888  * paging just starts to occur.
1889  */
1890 static void
1891 vm_pageout_page_stats(int q)
1892 {
1893 	static int fullintervalcount = 0;
1894 	struct vm_page marker;
1895 	vm_page_t m;
1896 	long pcount, tpcount;		/* Number of pages to check */
1897 	long page_shortage;
1898 
1899 	page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
1900 			 vmstats.v_free_min) -
1901 			(vmstats.v_free_count + vmstats.v_inactive_count +
1902 			 vmstats.v_cache_count);
1903 
1904 	if (page_shortage <= 0)
1905 		return;
1906 
1907 	pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
1908 	fullintervalcount += vm_pageout_stats_interval;
1909 	if (fullintervalcount < vm_pageout_full_stats_interval) {
1910 		tpcount = (vm_pageout_stats_max * pcount) /
1911 			  vmstats.v_page_count + 1;
1912 		if (pcount > tpcount)
1913 			pcount = tpcount;
1914 	} else {
1915 		fullintervalcount = 0;
1916 	}
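	/*
	 * Worked example with hypothetical numbers: with v_page_count at
	 * 1000000 pages, vm_pageout_stats_max at 20000 and an active
	 * sub-queue length (pcount) of 50000, a partial interval checks at
	 * most
	 *
	 *	tpcount = (20000 * 50000) / 1000000 + 1 = 1001
	 *
	 * pages, while a full interval (fullintervalcount rollover) leaves
	 * pcount unclamped and may scan the whole sub-queue.
	 */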
1917 
1918 	bzero(&marker, sizeof(marker));
1919 	marker.flags = PG_FICTITIOUS | PG_MARKER;
1920 	marker.busy_count = PBUSY_LOCKED;
1921 	marker.queue = PQ_ACTIVE + q;
1922 	marker.pc = q;
1923 	marker.wire_count = 1;
1924 
1925 	vm_page_queues_spin_lock(PQ_ACTIVE + q);
1926 	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1927 
1928 	/*
1929 	 * Queue locked at top of loop to avoid stack marker issues.
1930 	 */
1931 	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1932 	       pcount-- > 0)
1933 	{
1934 		int actcount;
1935 
1936 		KKASSERT(m->queue == PQ_ACTIVE + q);
1937 		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1938 		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1939 				   &marker, pageq);
1940 
1941 		/*
1942 		 * Skip marker pages (atomic against other markers to avoid
1943 		 * infinite hop-over scans).
1944 		 */
1945 		if (m->flags & PG_MARKER)
1946 			continue;
1947 
1948 		/*
1949 		 * Ignore pages we can't busy
1950 		 */
1951 		if (vm_page_busy_try(m, TRUE))
1952 			continue;
1953 
1954 		/*
1955 		 * Remaining operations run with the page busy and neither
1956 		 * the page nor the queue will be spin-locked.
1957 		 */
1958 		KKASSERT(m->queue == PQ_ACTIVE + q);
1959 		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1960 
1961 		/*
1962 		 * We can just remove wired pages from the queue
1963 		 */
1964 		if (m->wire_count) {
1965 			vm_page_unqueue_nowakeup(m);
1966 			vm_page_wakeup(m);
1967 			goto next;
1968 		}
1969 
1970 
1971 		/*
1972 		 * We now have a safely busied page, the page and queue
1973 		 * spinlocks have been released.
1974 		 *
1975 		 * Ignore held and wired pages
1976 		 */
1977 		if (m->hold_count || m->wire_count) {
1978 			vm_page_wakeup(m);
1979 			goto next;
1980 		}
1981 
1982 		/*
1983 		 * Calculate activity
1984 		 */
1985 		actcount = 0;
1986 		if (m->flags & PG_REFERENCED) {
1987 			vm_page_flag_clear(m, PG_REFERENCED);
1988 			actcount += 1;
1989 		}
1990 		actcount += pmap_ts_referenced(m);
1991 
1992 		/*
1993 		 * Update act_count and move page to end of queue.
1994 		 */
1995 		if (actcount) {
1996 			m->act_count += ACT_ADVANCE + actcount;
1997 			if (m->act_count > ACT_MAX)
1998 				m->act_count = ACT_MAX;
1999 			vm_page_and_queue_spin_lock(m);
2000 			if (m->queue - m->pc == PQ_ACTIVE) {
2001 				TAILQ_REMOVE(
2002 					&vm_page_queues[PQ_ACTIVE + q].pl,
2003 					m, pageq);
2004 				TAILQ_INSERT_TAIL(
2005 					&vm_page_queues[PQ_ACTIVE + q].pl,
2006 					m, pageq);
2007 			}
2008 			vm_page_and_queue_spin_unlock(m);
2009 			vm_page_wakeup(m);
2010 			goto next;
2011 		}
2012 
2013 		if (m->act_count == 0) {
2014 			/*
2015 			 * We turn off page access, so that we have
2016 			 * more accurate RSS stats.  We don't do this
2017 			 * in the normal page deactivation when the
2018 			 * system is under VM load, because the
2019 			 * cost of the large number of page protect
2020 			 * operations would be higher than the value
2021 			 * of doing the operation.
2022 			 *
2023 			 * We use the marker to save our place so
2024 			 * we can release the spin lock.  Both (m)
2025 			 * and (next) will be invalid.
2026 			 */
2027 			vm_page_protect(m, VM_PROT_NONE);
2028 			vm_page_deactivate(m);
2029 		} else {
2030 			m->act_count -= min(m->act_count, ACT_DECLINE);
2031 			vm_page_and_queue_spin_lock(m);
2032 			if (m->queue - m->pc == PQ_ACTIVE) {
2033 				TAILQ_REMOVE(
2034 					&vm_page_queues[PQ_ACTIVE + q].pl,
2035 					m, pageq);
2036 				TAILQ_INSERT_TAIL(
2037 					&vm_page_queues[PQ_ACTIVE + q].pl,
2038 					m, pageq);
2039 			}
2040 			vm_page_and_queue_spin_unlock(m);
2041 		}
2042 		vm_page_wakeup(m);
2043 next:
2044 		vm_page_queues_spin_lock(PQ_ACTIVE + q);
2045 	}
2046 
2047 	/*
2048 	 * Remove our local marker
2049 	 *
2050 	 * Page queue still spin-locked.
2051 	 */
2052 	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
2053 	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
2054 }
2055 
2056 static void
2057 vm_pageout_free_page_calc(vm_size_t count)
2058 {
2059 	/*
2060 	 * v_free_min		normal allocations
2061 	 * v_free_reserved	system allocations
2062 	 * v_pageout_free_min	allocations by pageout daemon
2063 	 * v_interrupt_free_min	low-level allocations (e.g., swap structures)
2064 	 *
2065 	 * v_free_min is used to generate several other baselines, and they
2066 	 * can get pretty silly on systems with a lot of memory.
2067 	 */
2068 	vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
2069 	vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
2070 	vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
2071 	vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
2072 	vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
2073 }
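/*
 * Worked example for vm_pageout_free_page_calc() above, using a
 * hypothetical v_page_count of 2000000 pages (about 8GB with 4KiB pages):
 *
 *	v_free_min           = 64 + 2000000 / 200 = 10064
 *	v_free_reserved      = 10064 * 4 / 8 + 7  = 5039
 *	v_free_severe        = 10064 * 4 / 8 + 0  = 5032
 *	v_pageout_free_min   = 10064 * 2 / 8 + 7  = 2523
 *	v_interrupt_free_min = 10064 * 1 / 8 + 7  = 1265
 */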
2074 
2075 
2076 /*
2077  * vm_pageout is the high level pageout daemon.  TWO kernel threads run
2078  * this daemon, the primary pageout daemon and the emergency pageout daemon.
2079  *
2080  * The emergency pageout daemon takes over when the primary pageout daemon
2081  * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
2082  * avoiding the many low-memory deadlocks which can occur when paging out
2083  * to VFS's.
2084  */
2085 static void
2086 vm_pageout_thread(void)
2087 {
2088 	int pass;
2089 	int q;
2090 	int q1iterator = 0;
2091 	int q2iterator = 0;
2092 	int q3iterator = 0;
2093 	int isep;
2094 
2095 	curthread->td_flags |= TDF_SYSTHREAD;
2096 
2097 	/*
2098 	 * We only need to do the setup once.
2099 	 */
2100 	isep = 0;
2101 	if (curthread == emergpager) {
2102 		isep = 1;
2103 		goto skip_setup;
2104 	}
2105 
2106 	/*
2107 	 * Initialize vm_max_launder per pageout pass to be a small
2108 	 * fraction (1/256) of total physical memory, plus a little slop.
2109 	 */
2110 	if (vm_max_launder == 0)
2111 		vm_max_launder = physmem / 256 + 16;
2112 
2113 	/*
2114 	 * Initialize some paging parameters.
2115 	 */
2116 	vm_pageout_free_page_calc(vmstats.v_page_count);
2117 
2118 	/*
2119 	 * v_free_target and v_cache_min control pageout hysteresis.  Note
2120 	 * that these are more a measure of the VM cache queue hysteresis
2121 	 * then the VM free queue.  Specifically, v_free_target is the
2122 	 * than of the VM free queue.  Specifically, v_free_target is the
2123 	 *
2124 	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
2125 	 * low water mark, while v_free_min is the stop.  v_cache_min must
2126 	 * be big enough to handle memory needs while the pageout daemon
2127 	 * is signalled and run to free more pages.
2128 	 */
2129 	vmstats.v_free_target = 4 * vmstats.v_free_min +
2130 				vmstats.v_free_reserved;
2131 
2132 	/*
2133 	 * NOTE: With the new buffer cache b_act_count we want the default
2134 	 *	 inactive target to be a percentage of available memory.
2135 	 *
2136 	 *	 The inactive target essentially determines the minimum
2137 	 *	 number of 'temporary' pages capable of caching one-time-use
2138 	 *	 files when the VM system is otherwise full of pages
2139 	 *	 belonging to multi-time-use files or active program data.
2140 	 *
2141 	 * NOTE: The inactive target is aggressively pursued only if the
2142 	 *	 inactive queue becomes too small.  If the inactive queue
2143 	 *	 is large enough to satisfy page movement to free+cache
2144 	 *	 then it is repopulated more slowly from the active queue.
2145 	 *	 This allows a general inactive_target default to be set.
2146 	 *
2147 	 *	 There is an issue here for processes which sit mostly idle
2148 	 *	 'overnight', such as sshd, tcsh, and X.  Any movement from
2149 	 *	 the active queue will eventually cause such pages to be
2150 	 *	 recycled, causing a lot of paging in the morning.  To reduce
2151 	 *	 the incidence of this, pages cycled out of the
2152 	 *	 buffer cache are moved directly to the inactive queue if
2153 	 *	 they were only used once or twice.
2154 	 *
2155 	 *	 The vfs.vm_cycle_point sysctl can be used to adjust this.
2156 	 *	 Increasing the value (up to 64) increases the number of
2157 	 *	 buffer recyclements which go directly to the inactive queue.
2158 	 */
2159 	if (vmstats.v_free_count > 2048) {
2160 		vmstats.v_cache_min = vmstats.v_free_target;
2161 		vmstats.v_cache_max = 2 * vmstats.v_cache_min;
2162 	} else {
2163 		vmstats.v_cache_min = 0;
2164 		vmstats.v_cache_max = 0;
2165 	}
2166 	vmstats.v_inactive_target = vmstats.v_free_count / 4;
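	/*
	 * Continuing the hypothetical example from above (v_free_min 10064,
	 * v_free_reserved 5039), the setup so far would yield
	 *
	 *	v_free_target = 4 * 10064 + 5039 = 45295
	 *	v_cache_min   = 45295, v_cache_max = 90590
	 *	              (assuming v_free_count > 2048)
	 *	v_inactive_target = v_free_count / 4
	 */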
2167 
2168 	/* XXX does not really belong here */
2169 	if (vm_page_max_wired == 0)
2170 		vm_page_max_wired = vmstats.v_free_count / 3;
2171 
2172 	if (vm_pageout_stats_max == 0)
2173 		vm_pageout_stats_max = vmstats.v_free_target;
2174 
2175 	/*
2176 	 * Set interval in seconds for stats scan.
2177 	 */
2178 	if (vm_pageout_stats_interval == 0)
2179 		vm_pageout_stats_interval = 5;
2180 	if (vm_pageout_full_stats_interval == 0)
2181 		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
2182 
2183 
2184 	/*
2185 	 * Set maximum free per pass
2186 	 */
2187 	if (vm_pageout_stats_free_max == 0)
2188 		vm_pageout_stats_free_max = 5;
2189 
2190 	swap_pager_swap_init();
2191 	pass = 0;
2192 
2193 	atomic_swap_int(&sequence_emerg_pager, 1);
2194 	wakeup(&sequence_emerg_pager);
2195 
2196 skip_setup:
2197 	/*
2198 	 * Sequence emergency pager startup
2199 	 */
2200 	if (isep) {
2201 		while (sequence_emerg_pager == 0)
2202 			tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
2203 	}
2204 
2205 	/*
2206 	 * The pageout daemon is never done, so loop forever.
2207 	 *
2208 	 * WARNING!  This code is being executed by two kernel threads
2209 	 *	     potentially simultaneously.
2210 	 */
2211 	while (TRUE) {
2212 		int error;
2213 		long avail_shortage;
2214 		long inactive_shortage;
2215 		long vnodes_skipped = 0;
2216 		long recycle_count = 0;
2217 		long tmp;
2218 
2219 		/*
2220 		 * Wait for an action request.  If we timeout check to
2221 		 * see if paging is needed (in case the normal wakeup
2222 		 * code raced us).
2223 		 */
2224 		if (isep) {
2225 			/*
2226 			 * Emergency pagedaemon monitors the primary
2227 			 * pagedaemon while vm_pages_needed != 0.
2228 			 *
2229 			 * The emergency pagedaemon only runs if VM paging
2230 			 * is needed and the primary pagedaemon has not
2231 			 * updated vm_pagedaemon_time for more than 2 seconds.
2232 			 */
2233 			if (vm_pages_needed)
2234 				tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
2235 			else
2236 				tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
2237 			if (vm_pages_needed == 0) {
2238 				pass = 0;
2239 				continue;
2240 			}
2241 			if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
2242 				pass = 0;
2243 				continue;
2244 			}
2245 		} else {
2246 			/*
2247 			 * Primary pagedaemon
2248 			 *
2249 			 * NOTE: We unconditionally cleanup PQ_HOLD even
2250 			 *	 when there is no work to do.
2251 			 */
2252 			vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
2253 			++q3iterator;
2254 
2255 			if (vm_pages_needed == 0) {
2256 				error = tsleep(&vm_pages_needed,
2257 					       0, "psleep",
2258 					       vm_pageout_stats_interval * hz);
2259 				if (error &&
2260 				    vm_paging_needed(0) == 0 &&
2261 				    vm_pages_needed == 0) {
2262 					for (q = 0; q < PQ_L2_SIZE; ++q)
2263 						vm_pageout_page_stats(q);
2264 					continue;
2265 				}
2266 				vm_pagedaemon_time = ticks;
2267 				vm_pages_needed = 1;
2268 
2269 				/*
2270 				 * Wake the emergency pagedaemon up so it
2271 				 * can monitor us.  It will automatically
2272 				 * go back into a long sleep when
2273 				 * vm_pages_needed returns to 0.
2274 				 */
2275 				wakeup(&vm_pagedaemon_time);
2276 			}
2277 		}
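		/*
		 * Illustrative summary, not compiled: the emergency pager
		 * only proceeds past the block above when
		 *
		 *	vm_pages_needed != 0 &&
		 *	(int)(ticks - vm_pagedaemon_time) >= hz * 2
		 *
		 * i.e. paging is needed and the primary pagedaemon has not
		 * updated vm_pagedaemon_time for roughly two seconds.
		 */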
2278 
2279 		mycpu->gd_cnt.v_pdwakeups++;
2280 
2281 		/*
2282 		 * Scan for INACTIVE->CLEAN/PAGEOUT
2283 		 *
2284 		 * This routine tries to avoid thrashing the system with
2285 		 * unnecessary activity.
2286 		 *
2287 		 * Calculate our target for the number of free+cache pages we
2288 		 * want to get to.  This is higher than the number that causes
2289 		 * allocations to stall (severe) in order to provide hysteresis,
2290 		 * and if we don't make it all the way but get to the minimum
2291 		 * we're happy.  Goose it a bit if there are multiple requests
2292 		 * for memory.
2293 		 *
2294 		 * Don't reduce avail_shortage inside the loop or the
2295 		 * PQAVERAGE() calculation will break.
2296 		 *
2297 		 * NOTE! deficit is differentiated from avail_shortage as
2298 		 *	 REQUIRING at least (deficit) pages to be cleaned,
2299 		 *	 even if the page queues are in good shape.  This
2300 		 *	 is used primarily for handling per-process
2301 		 *	 RLIMIT_RSS and may also see small values when
2302 		 *	 processes block due to low memory.
2303 		 */
2304 		vmstats_rollup();
2305 		if (isep == 0)
2306 			vm_pagedaemon_time = ticks;
2307 		avail_shortage = vm_paging_target() + vm_pageout_deficit;
2308 		vm_pageout_deficit = 0;
2309 
2310 		if (avail_shortage > 0) {
2311 			long delta = 0;
2312 			long counts[4] = { 0, 0, 0, 0 };
2313 			int qq;
2314 
2315 			if (vm_pageout_debug) {
2316 				kprintf("scan_inactive pass %d isep=%d\t",
2317 					pass / MAXSCAN_DIVIDER, isep);
2318 			}
2319 
2320 			qq = q1iterator;
2321 			for (q = 0; q < PQ_L2_SIZE; ++q) {
2322 				delta += vm_pageout_scan_inactive(
2323 					    pass / MAXSCAN_DIVIDER,
2324 					    qq & PQ_L2_MASK,
2325 					    PQAVERAGE(avail_shortage),
2326 					    &vnodes_skipped, counts);
2327 				if (isep)
2328 					--qq;
2329 				else
2330 					++qq;
2331 				if (avail_shortage - delta <= 0)
2332 					break;
2333 
2334 				/*
2335 				 * It is possible for avail_shortage to be
2336 				 * very large.  If a large program exits or
2337 				 * frees a ton of memory all at once, we do
2338 				 * not have to continue deactivations.
2339 				 *
2340 				 * (We will still run the active->inactive
2341 				 * target, however).
2342 				 */
2343 				if (!vm_page_count_target() &&
2344 				    !vm_page_count_min(
2345 						vm_page_free_hysteresis)) {
2346 					avail_shortage = 0;
2347 					break;
2348 				}
2349 			}
2350 			if (vm_pageout_debug) {
2351 				kprintf("flushed %ld cleaned %ld "
2352 					"lru2 %ld react %ld "
2353 					"delta %ld\n",
2354 					counts[0], counts[1],
2355 					counts[2], counts[3],
2356 					delta);
2357 			}
2358 			avail_shortage -= delta;
2359 			q1iterator = qq;
2360 		}
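		/*
		 * Illustrative note: the loop above distributes the work
		 * round-robin across the PQ_L2_SIZE inactive sub-queues,
		 * roughly
		 *
		 *	for (q = 0; q < PQ_L2_SIZE; ++q)
		 *		scan(qq & PQ_L2_MASK,
		 *		     PQAVERAGE(avail_shortage));
		 *
		 * (scan() standing in for vm_pageout_scan_inactive()), with
		 * the primary pagedaemon stepping the sub-queue index
		 * forward and the emergency pager stepping it backward so
		 * the two threads tend not to collide, and with an early
		 * break once the accumulated delta covers the shortage.
		 */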
2361 
2362 		/*
2363 		 * Figure out how many active pages we must deactivate.  If
2364 		 * we were able to reach our target with just the inactive
2365 		 * scan above we limit the number of active pages we
2366 		 * deactivate to reduce unnecessary work.
2367 		 */
2368 		vmstats_rollup();
2369 		if (isep == 0)
2370 			vm_pagedaemon_time = ticks;
2371 		inactive_shortage = vmstats.v_inactive_target -
2372 				    vmstats.v_inactive_count;
2373 
2374 		/*
2375 		 * If we were unable to free sufficient inactive pages to
2376 		 * satisfy the free/cache queue requirements then simply
2377 		 * reaching the inactive target may not be good enough.
2378 		 * Try to deactivate pages in excess of the target based
2379 		 * on the shortfall.
2380 		 *
2381 		 * However to prevent thrashing the VM system do not
2382 		 * deactivate more than an additional 1/10 of the inactive
2383 		 * target's worth of active pages.
2384 		 */
2385 		if (avail_shortage > 0) {
2386 			tmp = avail_shortage * 2;
2387 			if (tmp > vmstats.v_inactive_target / 10)
2388 				tmp = vmstats.v_inactive_target / 10;
2389 			inactive_shortage += tmp;
2390 		}
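		/*
		 * Worked example with hypothetical numbers: if avail_shortage
		 * is 1000 pages and v_inactive_target is 50000 pages, the
		 * extra deactivation added here is
		 *
		 *	min(1000 * 2, 50000 / 10) = 2000 pages
		 *
		 * so inactive_shortage grows by 2000 rather than without
		 * bound.
		 */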
2391 
2392 		/*
2393 		 * Only trigger a pmap cleanup on inactive shortage.
2394 		 */
2395 		if (isep == 0 && inactive_shortage > 0) {
2396 			pmap_collect();
2397 		}
2398 
2399 		/*
2400 		 * Scan for ACTIVE->INACTIVE
2401 		 *
2402 		 * Only trigger on inactive shortage.  Triggering on
2403 		 * avail_shortage can starve the active queue with
2404 		 * unnecessary active->inactive transitions and destroy
2405 		 * performance.
2406 		 *
2407 		 * If this is the emergency pager, always try to move
2408 		 * a few pages from active to inactive because the inactive
2409 		 * queue might have enough pages, but not enough anonymous
2410 		 * pages.
2411 		 */
2412 		if (isep && inactive_shortage < vm_emerg_launder)
2413 			inactive_shortage = vm_emerg_launder;
2414 
2415 		if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
2416 			long delta = 0;
2417 			int qq;
2418 
2419 			qq = q2iterator;
2420 			for (q = 0; q < PQ_L2_SIZE; ++q) {
2421 				delta += vm_pageout_scan_active(
2422 						pass / MAXSCAN_DIVIDER,
2423 						qq & PQ_L2_MASK,
2424 						PQAVERAGE(avail_shortage),
2425 						PQAVERAGE(inactive_shortage),
2426 						&recycle_count);
2427 				if (isep)
2428 					--qq;
2429 				else
2430 					++qq;
2431 				if (inactive_shortage - delta <= 0 &&
2432 				    avail_shortage - delta <= 0) {
2433 					break;
2434 				}
2435 
2436 				/*
2437 				 * inactive_shortage can be a very large
2438 				 * number.  This is intended to break out
2439 				 * early if our inactive_target has been
2440 				 * reached due to other system activity.
2441 				 */
2442 				if (vmstats.v_inactive_count >
2443 				    vmstats.v_inactive_target) {
2444 					inactive_shortage = 0;
2445 					break;
2446 				}
2447 			}
2448 			inactive_shortage -= delta;
2449 			avail_shortage -= delta;
2450 			q2iterator = qq;
2451 		}
2452 
2453 		/*
2454 		 * Scan for CACHE->FREE
2455 		 *
2456 		 * Finally free enough cache pages to meet our free page
2457 		 * requirement and take more drastic measures if we are
2458 		 * still in trouble.
2459 		 */
2460 		vmstats_rollup();
2461 		if (isep == 0)
2462 			vm_pagedaemon_time = ticks;
2463 		vm_pageout_scan_cache(avail_shortage, pass / MAXSCAN_DIVIDER,
2464 				      vnodes_skipped, recycle_count);
2465 
2466 		/*
2467 		 * This is a bit sophisticated because, if we were able to retire
2468 		 * the shortage we calculated, we do not necessarily want to keep
2469 		 * forcing paging until our targets are reached.
2470 		 */
2471 		if (avail_shortage > 0) {
2472 			/*
2473 			 * If we did not retire enough pages continue the
2474 			 * pageout operation until we are able to.  It
2475 			 * takes MAXSCAN_DIVIDER passes to cover the entire
2476 			 * inactive list.
2477 			 */
2478 			++pass;
2479 
2480 			if (pass / MAXSCAN_DIVIDER < 10 &&
2481 			    vm_pages_needed > 1) {
2482 				/*
2483 				 * Normal operation, additional processes
2484 				 * have already kicked us.  Retry immediately
2485 				 * unless swap space is completely full in
2486 				 * which case delay a bit.
2487 				 */
2488 				if (swap_pager_full) {
2489 					tsleep(&vm_pages_needed, 0, "pdelay",
2490 						hz / 5);
2491 				} /* else immediate retry */
2492 			} else if (pass / MAXSCAN_DIVIDER < 10) {
2493 				/*
2494 				 * Do a short sleep for the first 10 passes,
2495 				 * allow the sleep to be woken up by resetting
2496 				 * vm_pages_needed to 1 (NOTE: we are still
2497 				 * active paging!).
2498 				 * actively paging!).
2499 				if (isep == 0)
2500 					vm_pages_needed = 1;
2501 				tsleep(&vm_pages_needed, 0, "pdelay", 2);
2502 			} else if (swap_pager_full == 0) {
2503 				/*
2504 				 * We've taken too many passes, force a
2505 				 * longer delay.
2506 				 */
2507 				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2508 			} else {
2509 				/*
2510 				 * Running out of memory, catastrophic
2511 				 * back-off to one-second intervals.
2512 				 */
2513 				tsleep(&vm_pages_needed, 0, "pdelay", hz);
2514 			}
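			/*
			 * Summary of the back-off ladder above (pass counts
			 * are in units of MAXSCAN_DIVIDER sub-passes):
			 *
			 *	< 10 passes, other waiters:  retry at once,
			 *				     hz/5 if swap full
			 *	< 10 passes otherwise:       sleep 2 ticks
			 *	>= 10 passes, swap ok:       sleep hz/10
			 *	>= 10 passes, swap full:     sleep hz
			 */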
2515 		} else if (vm_pages_needed) {
2516 			/*
2517 			 * We retired our calculated shortage but we may have
2518 			 * to continue paging if threads drain memory too far
2519 			 * below our target.
2520 			 *
2521 			 * Similar to vm_page_free_wakeup() in vm_page.c.
2522 			 */
2523 			pass = 0;
2524 			if (!vm_paging_needed(0)) {
2525 				/* still more than half-way to our target */
2526 				vm_pages_needed = 0;
2527 				wakeup(&vmstats.v_free_count);
2528 			} else
2529 			if (!vm_page_count_min(vm_page_free_hysteresis)) {
2530 				/*
2531 				 * Continue operations with wakeup
2532 				 * (set variable to avoid overflow)
2533 				 */
2534 				vm_pages_needed = 2;
2535 				wakeup(&vmstats.v_free_count);
2536 			} else {
2537 				/*
2538 				 * No wakeup() needed, continue operations.
2539 				 * (set variable to avoid overflow)
2540 				 */
2541 				vm_pages_needed = 2;
2542 			}
2543 		} else {
2544 			/*
2545 			 * Turn paging back on immediately if we are under
2546 			 * minimum.
2547 			 */
2548 			pass = 0;
2549 		}
2550 	}
2551 }
2552 
2553 static struct kproc_desc pg1_kp = {
2554 	"pagedaemon",
2555 	vm_pageout_thread,
2556 	&pagethread
2557 };
2558 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);
2559 
2560 static struct kproc_desc pg2_kp = {
2561 	"emergpager",
2562 	vm_pageout_thread,
2563 	&emergpager
2564 };
2565 SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);
2566 
2567 
2568 /*
2569  * Called after allocating a page out of the cache or free queue
2570  * to possibly wake the pagedaemon up to replenish our supply.
2571  *
2572  * We try to generate some hysteresis by waking the pagedaemon up
2573  * when our free+cache pages go below the free_min+cache_min level.
2574  * The pagedaemon tries to get the count back up to at least the
2575  * minimum, and through to the target level if possible.
2576  *
2577  * If the pagedaemon is already active bump vm_pages_needed as a hint
2578  * that there are even more requests pending.
2579  *
2580  * SMP races ok?
2581  * No requirements.
2582  */
2583 void
2584 pagedaemon_wakeup(void)
2585 {
2586 	if (vm_paging_needed(0) && curthread != pagethread) {
2587 		if (vm_pages_needed <= 1) {
2588 			vm_pages_needed = 1;		/* SMP race ok */
2589 			wakeup(&vm_pages_needed);	/* tickle pageout */
2590 		} else if (vm_page_count_min(0)) {
2591 			++vm_pages_needed;		/* SMP race ok */
2592 			/* a wakeup() would be wasted here */
2593 		}
2594 	}
2595 }
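/*
 * Illustrative, hypothetical call site for pagedaemon_wakeup(), e.g. in a
 * page allocation path after dipping into the free or cache queues:
 *
 *	m = ...allocate a page from PQ_FREE/PQ_CACHE...;
 *	pagedaemon_wakeup();	(nudge the daemon if we dropped below min)
 */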
2596 
2597 #if !defined(NO_SWAPPING)
2598 
2599 /*
2600  * SMP races ok?
2601  * No requirements.
2602  */
2603 static void
2604 vm_req_vmdaemon(void)
2605 {
2606 	static int lastrun = 0;
2607 
2608 	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2609 		wakeup(&vm_daemon_needed);
2610 		lastrun = ticks;
2611 	}
2612 }
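/*
 * Note on the rate limit above: the (ticks < lastrun) test handles tick
 * counter wrap.  The unsigned-delta form used by the kill path in
 * vm_pageout_scan_cache() is a common wrap-safe alternative:
 *
 *	if ((unsigned int)(ticks - lastrun) >= hz) { ... }
 */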
2613 
2614 static int vm_daemon_callback(struct proc *p, void *data __unused);
2615 
2616 /*
2617  * No requirements.
2618  *
2619  * Scan processes for exceeding their rlimits and deactivate pages
2620  * when the RSS limit is exceeded.
2621  */
2622 static void
2623 vm_daemon(void)
2624 {
2625 	while (TRUE) {
2626 		tsleep(&vm_daemon_needed, 0, "psleep", 0);
2627 		allproc_scan(vm_daemon_callback, NULL, 0);
2628 	}
2629 }
2630 
2631 static int
2632 vm_daemon_callback(struct proc *p, void *data __unused)
2633 {
2634 	struct vmspace *vm;
2635 	vm_pindex_t limit, size;
2636 
2637 	/*
2638 	 * If this is a system process or the process is exiting,
2639 	 * skip it.
2640 	 */
2641 	lwkt_gettoken(&p->p_token);
2642 
2643 	if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
2644 		lwkt_reltoken(&p->p_token);
2645 		return (0);
2646 	}
2647 
2648 	/*
2649 	 * if the process is in a non-running type state,
2650 	 * don't touch it.
2651 	 */
2652 	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
2653 		lwkt_reltoken(&p->p_token);
2654 		return (0);
2655 	}
2656 
2657 	/*
2658 	 * get a limit
2659 	 */
2660 	limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
2661 			        p->p_rlimit[RLIMIT_RSS].rlim_max));
2662 
2663 	vm = p->p_vmspace;
2664 	vmspace_hold(vm);
2665 	size = pmap_resident_tlnw_count(&vm->vm_pmap);
2666 	if (limit >= 0 && size > 4096 &&
2667 	    size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
2668 		vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
2669 	}
2670 	vmspace_drop(vm);
2671 
2672 	lwkt_reltoken(&p->p_token);
2673 
2674 	return (0);
2675 }
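/*
 * Worked example for the check above, with hypothetical numbers and 4KiB
 * pages: an RLIMIT_RSS of 512MB gives limit = OFF_TO_IDX(512MB) = 131072
 * pages.  With a resident count of 200000 pages, size - 4096 = 195904 is
 * >= 131072, so (when vm_pageout_memuse_mode >= 1) the map is asked to
 * deactivate pages back down toward the limit.
 */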
2676 
2677 #endif
2678