xref: /dragonfly/sys/vm/vm_pageout.c (revision ed183f8c)
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * The Mach Operating System project at Carnegie-Mellon University.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
37  *
38  *
39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40  * All rights reserved.
41  *
42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43  *
44  * Permission to use, copy, modify and distribute this software and
45  * its documentation is hereby granted, provided that both the copyright
46  * notice and this permission notice appear in all copies of the
47  * software, derivative works or modified versions, and any portions
48  * thereof, and that both notices appear in supporting documentation.
49  *
50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53  *
54  * Carnegie Mellon requests users of this software to return to
55  *
56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57  *  School of Computer Science
58  *  Carnegie Mellon University
59  *  Pittsburgh PA 15213-3890
60  *
61  * any improvements or extensions that they make and grant Carnegie the
62  * rights to redistribute these changes.
63  *
64  * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
65  */
66 
67 /*
68  *	The proverbial page-out daemon.
69  */
70 
71 #include "opt_vm.h"
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/kernel.h>
75 #include <sys/proc.h>
76 #include <sys/kthread.h>
77 #include <sys/resourcevar.h>
78 #include <sys/signalvar.h>
79 #include <sys/vnode.h>
80 #include <sys/vmmeter.h>
81 #include <sys/conf.h>
82 #include <sys/sysctl.h>
83 
84 #include <vm/vm.h>
85 #include <vm/vm_param.h>
86 #include <sys/lock.h>
87 #include <vm/vm_object.h>
88 #include <vm/vm_page.h>
89 #include <vm/vm_map.h>
90 #include <vm/vm_pageout.h>
91 #include <vm/vm_pager.h>
92 #include <vm/swap_pager.h>
93 #include <vm/vm_extern.h>
94 
95 #include <sys/spinlock2.h>
96 #include <vm/vm_page2.h>
97 
98 /*
99  * System initialization
100  */
101 
102 /* the kernel process "vm_pageout"*/
103 static int vm_pageout_page(vm_page_t m, long *max_launderp,
104 			   long *vnodes_skippedp, struct vnode **vpfailedp,
105 			   int pass, int vmflush_flags);
106 static int vm_pageout_clean_helper (vm_page_t, int);
107 static void vm_pageout_free_page_calc (vm_size_t count);
108 static void vm_pageout_page_free(vm_page_t m);
109 struct thread *emergpager;
110 struct thread *pagethread;
111 static int sequence_emerg_pager;
112 
113 #if !defined(NO_SWAPPING)
114 /* the kernel process "vm_daemon"*/
115 static void vm_daemon (void);
116 static struct	thread *vmthread;
117 
118 static struct kproc_desc vm_kp = {
119 	"vmdaemon",
120 	vm_daemon,
121 	&vmthread
122 };
123 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
124 #endif
125 
126 int vm_pages_needed = 0;	/* Event on which pageout daemon sleeps */
127 int vm_pageout_deficit = 0;	/* Estimated number of pages deficit */
128 int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
129 int vm_page_free_hysteresis = 16;
130 static int vm_pagedaemon_time;
131 
132 #if !defined(NO_SWAPPING)
133 static int vm_pageout_req_swapout;
134 static int vm_daemon_needed;
135 #endif
136 __read_mostly static int vm_max_launder = 4096;
137 __read_mostly static int vm_emerg_launder = 100;
138 __read_mostly static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
139 __read_mostly static int vm_pageout_full_stats_interval = 0;
140 __read_mostly static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
141 __read_mostly static int defer_swap_pageouts=0;
142 __read_mostly static int disable_swap_pageouts=0;
143 __read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
144 __read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
145 __read_mostly static int vm_pageout_debug;
146 
147 #if defined(NO_SWAPPING)
148 __read_mostly static int vm_swap_enabled=0;
149 __read_mostly static int vm_swap_idle_enabled=0;
150 #else
151 __read_mostly static int vm_swap_enabled=1;
152 __read_mostly static int vm_swap_idle_enabled=0;
153 #endif
154 
155 /* 0-disable, 1-passive, 2-active swap */
156 __read_mostly int vm_pageout_memuse_mode=1;
157 
158 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
159 	CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");
160 
161 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
162 	CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");
163 
164 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
165 	CTLFLAG_RW, &vm_page_free_hysteresis, 0,
166 	"Free more pages than the minimum required");
167 
168 SYSCTL_INT(_vm, OID_AUTO, max_launder,
169 	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
170 SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
171 	CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");
172 
173 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
174 	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
175 
176 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
177 	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
178 
179 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
180 	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
181 
182 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
183 	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
184 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
185 	CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
186 SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
187 	CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");
188 
189 
190 #if defined(NO_SWAPPING)
191 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
192 	CTLFLAG_RD, &vm_swap_enabled, 0, "");
193 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
194 	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
195 #else
196 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
197 	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
198 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
199 	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
200 #endif
201 
202 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
203 	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
204 
205 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
206 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
207 
208 static int pageout_lock_miss;
209 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
210 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
211 
212 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
213 
214 #if !defined(NO_SWAPPING)
215 static void vm_req_vmdaemon (void);
216 #endif
217 static void vm_pageout_page_stats(int q);
218 
219 /*
220  * Calculate approximately how many pages on each queue to try to
221  * clean.  An exact calculation creates an edge condition when the
222  * queues are unbalanced so add significant slop.  The queue scans
223  * will stop early when targets are reached and will start where they
224  * left off on the next pass.
225  *
226  * We need to be generous here because there are all sorts of loading
227  * conditions that can cause edge cases if we try to average over all queues.
228  * In particular, storage subsystems have become so fast that paging
229  * activity can become quite frantic.  Eventually we will probably need
230  * two paging threads, one for dirty pages and one for clean, to deal
231  * with the bandwidth requirements.
232  *
233  * So what we do is calculate a value that can be satisfied nominally by
234  * only having to scan half the queues.
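 *
 * Illustrative only (the PQ_L2_SIZE value below is assumed for the
 * example, not taken from the build): with PQ_L2_SIZE = 256 and a
 * shortage of n = 1000 pages, PQAVERAGE(1000) = (1000 + 255) / 128 + 1
 * = 10, so covering roughly half of the 256 queues at ~10 pages each
 * already satisfies the shortage, with slop to spare when the queues
 * are unbalanced.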
235  */
236 static __inline long
237 PQAVERAGE(long n)
238 {
239 	long avg;
240 
241 	if (n >= 0) {
242 		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
243 	} else {
244 		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
245 	}
246 	return avg;
247 }
248 
249 /*
250  * vm_pageout_clean_helper:
251  *
252  * Clean the page and remove it from the laundry.  The page must be busied
253  * by the caller and will be disposed of (put away, flushed) by this routine.
254  */
255 static int
256 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
257 {
258 	vm_object_t object;
259 	vm_page_t mc[BLIST_MAX_ALLOC];
260 	int error;
261 	int ib, is, page_base;
262 	vm_pindex_t pindex = m->pindex;
263 
264 	object = m->object;
265 
266 	/*
267 	 * Don't mess with the page if it's held or special.  Theoretically
268 	 * we can page out held pages but there is no real need to press our
269 	 * luck, so don't.
270 	 */
271 	if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
272 		vm_page_wakeup(m);
273 		return 0;
274 	}
275 
276 	/*
277 	 * Place page in cluster.  Align cluster for optimal swap space
278 	 * allocation (whether it is swap or not).  This is typically ~16-32
279 	 * pages, which also tends to align the cluster to multiples of the
280 	 * filesystem block size if backed by a filesystem.
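	 *
	 * Worked example (values are illustrative only; BLIST_MAX_ALLOC is
	 * assumed to be 32 here): a pindex of 1000 gives page_base =
	 * 1000 % 32 = 8, so the cluster spans pindex 992..1023 with the
	 * requested page in slot mc[8]; ib scans backwards from slot 7 and
	 * is scans forwards from slot 9.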
281 	 */
282 	page_base = pindex % BLIST_MAX_ALLOC;
283 	mc[page_base] = m;
284 	ib = page_base - 1;
285 	is = page_base + 1;
286 
287 	/*
288 	 * Scan object for clusterable pages.
289 	 *
290 	 * We can cluster ONLY if: ->> the page is NOT
291 	 * clean, wired, busy, held, or mapped into a
292 	 * buffer, and one of the following:
293 	 * 1) The page is inactive, or a seldom used
294 	 *    active page.
295 	 * -or-
296 	 * 2) we force the issue.
297 	 *
298 	 * During heavy mmap/modification loads the pageout
299 	 * daemon can really fragment the underlying file
300 	 * due to flushing pages out of order and not trying to
301 	 * align the clusters (which leaves sporadic out-of-order
302 	 * holes).  To solve this problem we do the reverse scan
303 	 * first and attempt to align our cluster, then do a
304 	 * forward scan if room remains.
305 	 */
306 	vm_object_hold(object);
307 
308 	while (ib >= 0) {
309 		vm_page_t p;
310 
311 		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
312 					    TRUE, &error);
313 		if (error || p == NULL)
314 			break;
315 		if ((p->queue - p->pc) == PQ_CACHE ||
316 		    (p->flags & PG_UNQUEUED)) {
317 			vm_page_wakeup(p);
318 			break;
319 		}
320 		vm_page_test_dirty(p);
321 		if (((p->dirty & p->valid) == 0 &&
322 		     (p->flags & PG_NEED_COMMIT) == 0) ||
323 		    p->wire_count != 0 ||	/* may be held by buf cache */
324 		    p->hold_count != 0) {	/* may be undergoing I/O */
325 			vm_page_wakeup(p);
326 			break;
327 		}
328 		if (p->queue - p->pc != PQ_INACTIVE) {
329 			if (p->queue - p->pc != PQ_ACTIVE ||
330 			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
331 				vm_page_wakeup(p);
332 				break;
333 			}
334 		}
335 
336 		/*
337 		 * Try to maintain page groupings in the cluster.
338 		 */
339 		if (m->flags & PG_WINATCFLS)
340 			vm_page_flag_set(p, PG_WINATCFLS);
341 		else
342 			vm_page_flag_clear(p, PG_WINATCFLS);
343 		p->act_count = m->act_count;
344 
345 		mc[ib] = p;
346 		--ib;
347 	}
348 	++ib;	/* fixup */
349 
350 	while (is < BLIST_MAX_ALLOC &&
351 	       pindex - page_base + is < object->size) {
352 		vm_page_t p;
353 
354 		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
355 					    TRUE, &error);
356 		if (error || p == NULL)
357 			break;
358 		if (((p->queue - p->pc) == PQ_CACHE) ||
359 		    (p->flags & PG_UNQUEUED)) {
360 			vm_page_wakeup(p);
361 			break;
362 		}
363 		vm_page_test_dirty(p);
364 		if (((p->dirty & p->valid) == 0 &&
365 		     (p->flags & PG_NEED_COMMIT) == 0) ||
366 		    p->wire_count != 0 ||	/* may be held by buf cache */
367 		    p->hold_count != 0) {	/* may be undergoing I/O */
368 			vm_page_wakeup(p);
369 			break;
370 		}
371 		if (p->queue - p->pc != PQ_INACTIVE) {
372 			if (p->queue - p->pc != PQ_ACTIVE ||
373 			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
374 				vm_page_wakeup(p);
375 				break;
376 			}
377 		}
378 
379 		/*
380 		 * Try to maintain page groupings in the cluster.
381 		 */
382 		if (m->flags & PG_WINATCFLS)
383 			vm_page_flag_set(p, PG_WINATCFLS);
384 		else
385 			vm_page_flag_clear(p, PG_WINATCFLS);
386 		p->act_count = m->act_count;
387 
388 		mc[is] = p;
389 		++is;
390 	}
391 
392 	vm_object_drop(object);
393 
394 	/*
395 	 * we allow reads during pageouts...
396 	 */
397 	return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
398 }
399 
400 /*
401  * vm_pageout_flush() - launder the given pages
402  *
403  *	The given pages are laundered.  Note that we set up for the start of
404  *	I/O (i.e. busy the page), mark it read-only, and bump the object
405  *	reference count all in here rather than in the parent.  If we want
406  *	the parent to do more sophisticated things we may have to change
407  *	the ordering.
408  *
409  *	The pages in the array must be busied by the caller and will be
410  *	unbusied by this function.
411  */
412 int
413 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
414 {
415 	vm_object_t object;
416 	int pageout_status[count];
417 	int numpagedout = 0;
418 	int i;
419 	int dodebug;
420 
421 	if (vm_pageout_debug > 0) {
422 		--vm_pageout_debug;
423 		dodebug = 1;
424 	} else {
425 		dodebug = 0;
426 	}
427 
428 	/*
429 	 * Initiate I/O.  Bump the vm_page_t->busy counter.
430 	 */
431 	for (i = 0; i < count; i++) {
432 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
433 			("vm_pageout_flush page %p index %d/%d: partially "
434 			 "invalid page", mc[i], i, count));
435 		vm_page_io_start(mc[i]);
436 	}
437 
438 	/*
439 	 * We must make the pages read-only.  This will also force the
440 	 * modified bit in the related pmaps to be cleared.  The pager
441 	 * cannot clear the bit for us since the I/O completion code
442 	 * typically runs from an interrupt.  The act of making the page
443 	 * read-only handles the case for us.
444 	 *
445 	 * Then we can unbusy the pages; we still hold a reference by virtue
446 	 * of our soft-busy.
447 	 */
448 	if (dodebug)
449 		kprintf("pageout: ");
450 	for (i = 0; i < count; i++) {
451 		if (vmflush_flags & VM_PAGER_TRY_TO_CACHE)
452 			vm_page_protect(mc[i], VM_PROT_NONE);
453 		else
454 			vm_page_protect(mc[i], VM_PROT_READ);
455 		vm_page_wakeup(mc[i]);
456 		if (dodebug)
457 			kprintf(" %p", mc[i]);
458 	}
459 	if (dodebug)
460 		kprintf("\n");
461 
462 	object = mc[0]->object;
463 	vm_object_pip_add(object, count);
464 
465 	vm_pager_put_pages(object, mc, count,
466 			   (vmflush_flags |
467 			    ((object == &kernel_object) ?
468 				VM_PAGER_PUT_SYNC : 0)),
469 			   pageout_status);
470 
471 	if (dodebug)
472 		kprintf("result: ");
473 	for (i = 0; i < count; i++) {
474 		vm_page_t mt = mc[i];
475 
476 		if (dodebug)
477 			kprintf("  S%d", pageout_status[i]);
478 
479 		switch (pageout_status[i]) {
480 		case VM_PAGER_OK:
481 			numpagedout++;
482 			break;
483 		case VM_PAGER_PEND:
484 			numpagedout++;
485 			break;
486 		case VM_PAGER_BAD:
487 			/*
488 			 * Page outside of range of object. Right now we
489 			 * essentially lose the changes by pretending it
490 			 * worked.
491 			 */
492 			vm_page_busy_wait(mt, FALSE, "pgbad");
493 			pmap_clear_modify(mt);
494 			vm_page_undirty(mt);
495 			vm_page_wakeup(mt);
496 			break;
497 		case VM_PAGER_ERROR:
498 		case VM_PAGER_FAIL:
499 			/*
500 			 * A page typically cannot be paged out when we
501 			 * have run out of swap.  We leave the page
502 			 * marked inactive and will try to page it out
503 			 * again later.
504 			 *
505 			 * Starvation of the active page list is used to
506 			 * determine when the system is massively memory
507 			 * starved.
508 			 */
509 			break;
510 		case VM_PAGER_AGAIN:
511 			break;
512 		}
513 
514 		/*
515 		 * If not PENDing this was a synchronous operation and we
516 		 * clean up after the I/O.  If it is PENDing the mess is
517 		 * cleaned up asynchronously.
518 		 *
519 		 * Also nominally act on the caller's wishes if the caller
520 		 * wants to try to really clean (cache or free) the page.
521 		 *
522 		 * Also nominally deactivate the page if the system is
523 		 * memory-stressed.
524 		 */
525 		if (pageout_status[i] != VM_PAGER_PEND) {
526 			vm_page_busy_wait(mt, FALSE, "pgouw");
527 			vm_page_io_finish(mt);
528 			if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) {
529 				vm_page_try_to_cache(mt);
530 				if (dodebug)
531 				kprintf("A[pq_cache=%d]",
532 					 ((mt->queue - mt->pc) == PQ_CACHE));
533 			} else if (vm_page_count_severe()) {
534 				vm_page_deactivate(mt);
535 				vm_page_wakeup(mt);
536 				if (dodebug)
537 				kprintf("B");
538 			} else {
539 				vm_page_wakeup(mt);
540 				if (dodebug)
541 				kprintf("C");
542 			}
543 			vm_object_pip_wakeup(object);
544 		}
545 	}
546 	if (dodebug)
547 		kprintf("\n");
548 	return numpagedout;
549 }
550 
551 #if !defined(NO_SWAPPING)
552 
553 /*
554  * Callback function, page busied for us.  We must dispose of the busy
555  * condition.  Any related pmap pages may be held but will not be locked.
556  */
557 static
558 int
559 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
560 			vm_page_t p)
561 {
562 	int actcount;
563 	int cleanit = 0;
564 
565 	/*
566 	 * Basic tests - There should never be a marker, and we can stop
567 	 *		 once the RSS is below the required level.
568 	 */
569 	KKASSERT((p->flags & PG_MARKER) == 0);
570 	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
571 		vm_page_wakeup(p);
572 		return(-1);
573 	}
574 
575 	mycpu->gd_cnt.v_pdpages++;
576 
577 	if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
578 		vm_page_wakeup(p);
579 		goto done;
580 	}
581 
582 	++info->actioncount;
583 
584 	/*
585 	 * Check if the page has been referenced recently.  If it has,
586 	 * activate it and skip.
587 	 */
588 	actcount = pmap_ts_referenced(p);
589 	if (actcount) {
590 		vm_page_flag_set(p, PG_REFERENCED);
591 	} else if (p->flags & PG_REFERENCED) {
592 		actcount = 1;
593 	}
594 
595 	if (actcount) {
596 		if (p->queue - p->pc != PQ_ACTIVE) {
597 			vm_page_and_queue_spin_lock(p);
598 			if (p->queue - p->pc != PQ_ACTIVE) {
599 				vm_page_and_queue_spin_unlock(p);
600 				vm_page_activate(p);
601 			} else {
602 				vm_page_and_queue_spin_unlock(p);
603 			}
604 		} else {
605 			p->act_count += actcount;
606 			if (p->act_count > ACT_MAX)
607 				p->act_count = ACT_MAX;
608 		}
609 		vm_page_flag_clear(p, PG_REFERENCED);
610 		vm_page_wakeup(p);
611 		goto done;
612 	}
613 
614 	/*
615 	 * Remove the page from this particular pmap.  Once we do this, our
616 	 * pmap scans will not see it again (unless it gets faulted in), so
617 	 * we must actively dispose of or deal with the page.
618 	 */
619 	pmap_remove_specific(info->pmap, p);
620 
621 	/*
622 	 * If the page is not mapped to another process (i.e. as would be
623 	 * typical if this were a shared page from a library) then deactivate
624 	 * the page and clean it in two passes only.
625 	 *
626 	 * If the page hasn't been referenced since the last check, remove it
627 	 * from the pmap.  If it is no longer mapped, deactivate it
628 	 * immediately, accelerating the normal decline.
629 	 *
630 	 * Once the page has been removed from the pmap the RSS code no
631 	 * longer tracks it so we have to make sure that it is staged for
632 	 * potential flush action.
633 	 */
634 	if ((p->flags & PG_MAPPED) == 0 ||
635 	    (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
636 		if (p->queue - p->pc == PQ_ACTIVE) {
637 			vm_page_deactivate(p);
638 		}
639 		if (p->queue - p->pc == PQ_INACTIVE) {
640 			cleanit = 1;
641 		}
642 	}
643 
644 	/*
645 	 * Ok, try to fully clean the page and any nearby pages such that at
646 	 * least the requested page is freed or moved to the cache queue.
647 	 *
648 	 * We usually do this synchronously to allow us to get the page into
649 	 * the CACHE queue quickly, which will prevent memory exhaustion if
650 	 * a process with a memoryuse limit is running away.  However, the
651 	 * sysadmin may desire to set vm.swap_user_async which relaxes this
652 	 * and improves write performance.
653 	 */
654 	if (cleanit) {
655 		long max_launder = 0x7FFF;
656 		long vnodes_skipped = 0;
657 		int vmflush_flags;
658 		struct vnode *vpfailed = NULL;
659 
660 		info->offset = va;
661 
662 		if (vm_pageout_memuse_mode >= 2) {
663 			vmflush_flags = VM_PAGER_TRY_TO_CACHE |
664 					VM_PAGER_ALLOW_ACTIVE;
665 			if (swap_user_async == 0)
666 				vmflush_flags |= VM_PAGER_PUT_SYNC;
667 			vm_page_flag_set(p, PG_WINATCFLS);
668 			info->cleancount +=
669 				vm_pageout_page(p, &max_launder,
670 						&vnodes_skipped,
671 						&vpfailed, 1, vmflush_flags);
672 		} else {
673 			vm_page_wakeup(p);
674 			++info->cleancount;
675 		}
676 	} else {
677 		vm_page_wakeup(p);
678 	}
679 
680 	/*
681 	 * Must be at end to avoid SMP races.
682 	 */
683 done:
684 	lwkt_user_yield();
685 	return 0;
686 }
687 
688 /*
689  * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
690  * which is relatively difficult to do.  We try to keep track of where we
691  * left off last time to reduce scan overhead.
692  *
693  * Called when vm_pageout_memuse_mode is >= 1.
694  */
695 void
696 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
697 {
698 	vm_offset_t pgout_offset;
699 	struct pmap_pgscan_info info;
700 	int retries = 3;
701 
702 	pgout_offset = map->pgout_offset;
703 again:
704 #if 0
705 	kprintf("%016jx ", pgout_offset);
706 #endif
707 	if (pgout_offset < VM_MIN_USER_ADDRESS)
708 		pgout_offset = VM_MIN_USER_ADDRESS;
709 	if (pgout_offset >= VM_MAX_USER_ADDRESS)
710 		pgout_offset = 0;
711 	info.pmap = vm_map_pmap(map);
712 	info.limit = limit;
713 	info.beg_addr = pgout_offset;
714 	info.end_addr = VM_MAX_USER_ADDRESS;
715 	info.callback = vm_pageout_mdp_callback;
716 	info.cleancount = 0;
717 	info.actioncount = 0;
718 	info.busycount = 0;
719 
720 	pmap_pgscan(&info);
721 	pgout_offset = info.offset;
722 #if 0
723 	kprintf("%016jx %08lx %08lx\n", pgout_offset,
724 		info.cleancount, info.actioncount);
725 #endif
726 
727 	if (pgout_offset != VM_MAX_USER_ADDRESS &&
728 	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
729 		goto again;
730 	} else if (retries &&
731 		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
732 		--retries;
733 		goto again;
734 	}
735 	map->pgout_offset = pgout_offset;
736 }
737 #endif
738 
739 /*
740  * Called when the pageout scan wants to free a page.  We no longer
741  * try to cycle the vm_object here with a reference & dealloc, which can
742  * cause a non-trivial object collapse in a critical path.
743  *
744  * It is unclear why we cycled the ref_count in the past, perhaps to try
745  * to optimize shadow chain collapses but I don't quite see why it would
746  * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
747  * synchronously and not have to be kicked-start.
748  */
749 static void
750 vm_pageout_page_free(vm_page_t m)
751 {
752 	vm_page_protect(m, VM_PROT_NONE);
753 	vm_page_free(m);
754 }
755 
756 /*
757  * vm_pageout_scan does the dirty work for the pageout daemon.
758  */
759 struct vm_pageout_scan_info {
760 	struct proc *bigproc;
761 	vm_offset_t bigsize;
762 };
763 
764 static int vm_pageout_scan_callback(struct proc *p, void *data);
765 
766 /*
767  * Scan inactive queue
768  *
769  * WARNING! Can be called from two pagedaemon threads simultaneously.
770  */
771 static int
772 vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
773 			 long *vnodes_skipped)
774 {
775 	vm_page_t m;
776 	struct vm_page marker;
777 	struct vnode *vpfailed;		/* warning, allowed to be stale */
778 	long maxscan;
779 	long delta = 0;
780 	long max_launder;
781 	int isep;
782 	int vmflush_flags;
783 
784 	isep = (curthread == emergpager);
785 
786 	/*
787 	 * Start scanning the inactive queue for pages we can move to the
788 	 * cache or free.  The scan will stop when the target is reached or
789 	 * we have scanned the entire inactive queue.  Note that m->act_count
790 	 * is not used to form decisions for the inactive queue, only for the
791 	 * active queue.
792 	 *
793 	 * max_launder limits the number of dirty pages we flush per scan.
794 	 * For most systems a smaller value (16 or 32) is more robust under
795 	 * extreme memory and disk pressure because any unnecessary writes
796 	 * to disk can result in extreme performance degradation.  However,
797 	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
798 	 * used) will die horribly with limited laundering.  If the pageout
799 	 * daemon cannot clean enough pages in the first pass, we let it go
800 	 * all out in succeeding passes.
801 	 *
802 	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
803 	 *	  PAGES.
804 	 */
805 	if ((max_launder = vm_max_launder) <= 1)
806 		max_launder = 1;
807 	if (pass)
808 		max_launder = 10000;
809 
810 	/*
811 	 * Initialize our marker
812 	 */
813 	bzero(&marker, sizeof(marker));
814 	marker.flags = PG_FICTITIOUS | PG_MARKER;
815 	marker.busy_count = PBUSY_LOCKED;
816 	marker.queue = PQ_INACTIVE + q;
817 	marker.pc = q;
818 	marker.wire_count = 1;
819 
820 	/*
821 	 * Inactive queue scan.
822 	 *
823 	 * NOTE: The vm_page must be spinlocked before the queue to avoid
824 	 *	 deadlocks, so it is easiest to simply iterate the loop
825 	 *	 with the queue unlocked at the top.
826 	 */
827 	vpfailed = NULL;
828 
829 	vm_page_queues_spin_lock(PQ_INACTIVE + q);
830 	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
831 	maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt;
832 
833 	/*
834 	 * Queue locked at top of loop to avoid stack marker issues.
835 	 */
836 	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
837 	       maxscan-- > 0 && avail_shortage - delta > 0)
838 	{
839 		int count;
840 
841 		KKASSERT(m->queue == PQ_INACTIVE + q);
842 		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
843 			     &marker, pageq);
844 		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
845 				   &marker, pageq);
846 		mycpu->gd_cnt.v_pdpages++;
847 
848 		/*
849 		 * Skip marker pages (atomic against other markers to avoid
850 		 * infinite hop-over scans).
851 		 */
852 		if (m->flags & PG_MARKER)
853 			continue;
854 
855 		/*
856 		 * Try to busy the page.  Don't mess with pages which are
857 		 * already busy or reorder them in the queue.
858 		 */
859 		if (vm_page_busy_try(m, TRUE))
860 			continue;
861 
862 		/*
863 		 * Remaining operations run with the page busy and neither
864 		 * the page nor the queue will be spin-locked.
865 		 */
866 		KKASSERT(m->queue == PQ_INACTIVE + q);
867 		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
868 
869 		/*
870 		 * The emergency pager runs when the primary pager gets
871 		 * stuck, which typically means the primary pager deadlocked
872 		 * on a vnode-backed page.  Therefore, the emergency pager
873 		 * must skip any complex objects.
874 		 *
875 		 * We disallow VNODEs unless they are VCHR whose device ops
876 		 * do not flag D_NOEMERGPGR.
877 		 */
878 		if (isep && m->object) {
879 			struct vnode *vp;
880 
881 			switch(m->object->type) {
882 			case OBJT_DEFAULT:
883 			case OBJT_SWAP:
884 				/*
885 				 * Allow anonymous memory and assume that
886 				 * swap devices are not complex, since it's
887 				 * kinda worthless if we can't swap out dirty
888 				 * anonymous pages.
889 				 */
890 				break;
891 			case OBJT_VNODE:
892 				/*
893 				 * Allow VCHR device if the D_NOEMERGPGR
894 				 * flag is not set, deny other vnode types
895 				 * as being too complex.
896 				 */
897 				vp = m->object->handle;
898 				if (vp && vp->v_type == VCHR &&
899 				    vp->v_rdev && vp->v_rdev->si_ops &&
900 				    (vp->v_rdev->si_ops->head.flags &
901 				     D_NOEMERGPGR) == 0) {
902 					break;
903 				}
904 				/* Deny - fall through */
905 			default:
906 				/*
907 				 * Deny
908 				 */
909 				vm_page_wakeup(m);
910 				vm_page_queues_spin_lock(PQ_INACTIVE + q);
911 				lwkt_yield();
912 				continue;
913 			}
914 		}
915 
916 		/*
917 		 * Try to pageout the page and perhaps other nearby pages.
918 		 * We want to get the pages into the cache on the second
919 		 * pass.  Otherwise the pages can wind up just cycling in
920 		 * the inactive queue, getting flushed over and over again.
921 		 */
922 		if (m->flags & PG_WINATCFLS)
923 			vmflush_flags = VM_PAGER_TRY_TO_CACHE;
924 		else
925 			vmflush_flags = 0;
926 		count = vm_pageout_page(m, &max_launder, vnodes_skipped,
927 					&vpfailed, pass, vmflush_flags);
928 		delta += count;
929 
930 		/*
931 		 * Systems with a ton of memory can wind up with huge
932 		 * deactivation counts.  Because the inactive scan is
933 		 * doing a lot of flushing, the combination can result
934 		 * in excessive paging even in situations where other
935 		 * unrelated threads free up sufficient VM.
936 		 *
937 		 * To deal with this we abort the nominal active->inactive
938 		 * scan before we hit the inactive target when free+cache
939 		 * levels have reached a reasonable target.
940 		 *
941 		 * When deciding to stop early we need to add some slop to
942 		 * the test and we need to return full completion to the caller
943 		 * to prevent the caller from thinking there is something
944 		 * wrong and issuing a low-memory+swap warning or pkill.
945 		 *
946 		 * A deficit forces paging regardless of the state of the
947 		 * VM page queues (used for RSS enforcement).
948 		 */
949 		lwkt_yield();
950 		vm_page_queues_spin_lock(PQ_INACTIVE + q);
951 		if (vm_paging_target() < -vm_max_launder) {
952 			/*
953 			 * Stopping early, return full completion to caller.
954 			 */
955 			if (delta < avail_shortage)
956 				delta = avail_shortage;
957 			break;
958 		}
959 	}
960 
961 	/* page queue still spin-locked */
962 	TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
963 	vm_page_queues_spin_unlock(PQ_INACTIVE + q);
964 
965 	return (delta);
966 }
967 
968 /*
969  * Page out the specified page and return the total number of pages paged
970  * out (this routine may cluster).
971  *
972  * The page must be busied and soft-busied by the caller and will be disposed
973  * of by this function.
974  */
975 static int
976 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
977 		struct vnode **vpfailedp, int pass, int vmflush_flags)
978 {
979 	vm_object_t object;
980 	int actcount;
981 	int count = 0;
982 
983 	/*
984 	 * Wiring no longer removes a page from its queue.  The last unwiring
985 	 * will requeue the page.  Obviously wired pages cannot be paged out
986 	 * so unqueue it and return.
987 	 */
988 	if (m->wire_count) {
989 		vm_page_unqueue_nowakeup(m);
990 		vm_page_wakeup(m);
991 		return 0;
992 	}
993 
994 	/*
995 	 * A held page may be undergoing I/O, so skip it.
996 	 */
997 	if (m->hold_count) {
998 		vm_page_and_queue_spin_lock(m);
999 		if (m->queue - m->pc == PQ_INACTIVE) {
1000 			TAILQ_REMOVE(
1001 				&vm_page_queues[m->queue].pl, m, pageq);
1002 			TAILQ_INSERT_TAIL(
1003 				&vm_page_queues[m->queue].pl, m, pageq);
1004 		}
1005 		vm_page_and_queue_spin_unlock(m);
1006 		vm_page_wakeup(m);
1007 		return 0;
1008 	}
1009 
1010 	if (m->object == NULL || m->object->ref_count == 0) {
1011 		/*
1012 		 * If the object is not being used, we ignore previous
1013 		 * references.
1014 		 */
1015 		vm_page_flag_clear(m, PG_REFERENCED);
1016 		pmap_clear_reference(m);
1017 		/* fall through to end */
1018 	} else if (((m->flags & PG_REFERENCED) == 0) &&
1019 		    (actcount = pmap_ts_referenced(m))) {
1020 		/*
1021 		 * Otherwise, if the page has been referenced while
1022 		 * in the inactive queue, we bump the "activation
1023 		 * count" upwards, making it less likely that the
1024 		 * page will be added back to the inactive queue
1025 		 * prematurely again.  Here we check the page tables
1026 		 * (or emulated bits, if any), given that the upper level
1027 		 * VM system knows nothing about existing
1028 		 * references.
1029 		 */
1030 		vm_page_activate(m);
1031 		m->act_count += (actcount + ACT_ADVANCE);
1032 		vm_page_wakeup(m);
1033 		return 0;
1034 	}
1035 
1036 	/*
1037 	 * (m) is still busied.
1038 	 *
1039 	 * If the upper level VM system knows about any page
1040 	 * references, we activate the page.  We also set the
1041 	 * "activation count" higher than normal so that we will be less
1042 	 * likely to place pages back onto the inactive queue again.
1043 	 */
1044 	if ((m->flags & PG_REFERENCED) != 0) {
1045 		vm_page_flag_clear(m, PG_REFERENCED);
1046 		actcount = pmap_ts_referenced(m);
1047 		vm_page_activate(m);
1048 		m->act_count += (actcount + ACT_ADVANCE + 1);
1049 		vm_page_wakeup(m);
1050 		return 0;
1051 	}
1052 
1053 	/*
1054 	 * If the upper level VM system doesn't know anything about
1055 	 * the page being dirty, we have to check for it again.  As
1056 	 * far as the VM code knows, any partially dirty pages are
1057 	 * fully dirty.
1058 	 *
1059 	 * Pages marked PG_WRITEABLE may be mapped into the user
1060 	 * address space of a process running on another cpu.  A
1061 	 * user process (without holding the MP lock) running on
1062 	 * another cpu may be able to touch the page while we are
1063 	 * trying to remove it.  vm_page_cache() will handle this
1064 	 * case for us.
1065 	 */
1066 	if (m->dirty == 0) {
1067 		vm_page_test_dirty(m);
1068 	} else {
1069 		vm_page_dirty(m);
1070 	}
1071 
1072 	if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1073 		/*
1074 		 * Invalid pages can be easily freed
1075 		 */
1076 		vm_pageout_page_free(m);
1077 		mycpu->gd_cnt.v_dfree++;
1078 		++count;
1079 	} else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
1080 		/*
1081 		 * Clean pages can be placed onto the cache queue.
1082 		 * This effectively frees them.
1083 		 */
1084 		vm_page_cache(m);
1085 		++count;
1086 	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
1087 		/*
1088 		 * Dirty pages need to be paged out, but flushing
1089 		 * a page is extremely expensive verses freeing
1090 		 * a page is extremely expensive versus freeing
1091 		 * a clean page.  Rather than artificially limiting
1092 		 * dirty pages extra priority on the inactive queue
1093 		 * by forcing them to be cycled through the queue
1094 		 * twice before being flushed, after which the
1095 		 * (now clean) page will cycle through once more
1096 		 * before being freed.  This significantly extends
1097 		 * the thrash point for a heavily loaded machine.
1098 		 */
1099 		vm_page_flag_set(m, PG_WINATCFLS);
1100 		vm_page_and_queue_spin_lock(m);
1101 		if (m->queue - m->pc == PQ_INACTIVE) {
1102 			TAILQ_REMOVE(
1103 				&vm_page_queues[m->queue].pl, m, pageq);
1104 			TAILQ_INSERT_TAIL(
1105 				&vm_page_queues[m->queue].pl, m, pageq);
1106 		}
1107 		vm_page_and_queue_spin_unlock(m);
1108 		vm_page_wakeup(m);
1109 	} else if (*max_launderp > 0) {
1110 		/*
1111 		 * We always want to try to flush some dirty pages if
1112 		 * we encounter them, to keep the system stable.
1113 		 * Normally this number is small, but under extreme
1114 		 * pressure where there are insufficient clean pages
1115 		 * on the inactive queue, we may have to go all out.
1116 		 */
1117 		int swap_pageouts_ok;
1118 		struct vnode *vp = NULL;
1119 
1120 		swap_pageouts_ok = 0;
1121 		object = m->object;
1122 		if (object &&
1123 		    (object->type != OBJT_SWAP) &&
1124 		    (object->type != OBJT_DEFAULT)) {
1125 			swap_pageouts_ok = 1;
1126 		} else {
1127 			swap_pageouts_ok = !(defer_swap_pageouts ||
1128 					     disable_swap_pageouts);
1129 			swap_pageouts_ok |= (!disable_swap_pageouts &&
1130 					     defer_swap_pageouts &&
1131 					     vm_page_count_min(0));
1132 		}
1133 
1134 		/*
1135 		 * We don't bother paging objects that are "dead".
1136 		 * Those objects are in a "rundown" state.
1137 		 */
1138 		if (!swap_pageouts_ok ||
1139 		    (object == NULL) ||
1140 		    (object->flags & OBJ_DEAD)) {
1141 			vm_page_and_queue_spin_lock(m);
1142 			if (m->queue - m->pc == PQ_INACTIVE) {
1143 				TAILQ_REMOVE(
1144 				    &vm_page_queues[m->queue].pl,
1145 				    m, pageq);
1146 				TAILQ_INSERT_TAIL(
1147 				    &vm_page_queues[m->queue].pl,
1148 				    m, pageq);
1149 			}
1150 			vm_page_and_queue_spin_unlock(m);
1151 			vm_page_wakeup(m);
1152 			return 0;
1153 		}
1154 
1155 		/*
1156 		 * (m) is still busied.
1157 		 *
1158 		 * The object is already known NOT to be dead.   It
1159 		 * is possible for the vget() to block the whole
1160 		 * pageout daemon, but the new low-memory handling
1161 		 * code should prevent it.
1162 		 *
1163 		 * The previous code skipped locked vnodes and, worse,
1164 		 * reordered pages in the queue.  This results in
1165 		 * completely non-deterministic operation because,
1166 		 * quite often, a vm_fault has initiated an I/O and
1167 		 * is holding a locked vnode at just the point where
1168 		 * the pageout daemon is woken up.
1169 		 *
1170 		 * We can't wait forever for the vnode lock; we might
1171 		 * deadlock due to a vn_read() getting stuck in
1172 		 * vm_wait while holding this vnode.  We skip the
1173 		 * vnode if we can't get it in a reasonable amount
1174 		 * of time.
1175 		 *
1176 		 * vpfailed is used to (try to) avoid the case where
1177 		 * a large number of pages are associated with a
1178 		 * locked vnode, which could cause the pageout daemon
1179 		 * to stall for an excessive amount of time.
1180 		 */
1181 		if (object->type == OBJT_VNODE) {
1182 			int flags;
1183 
1184 			vp = object->handle;
1185 			flags = LK_EXCLUSIVE;
1186 			if (vp == *vpfailedp)
1187 				flags |= LK_NOWAIT;
1188 			else
1189 				flags |= LK_TIMELOCK;
1190 			vm_page_hold(m);
1191 			vm_page_wakeup(m);
1192 
1193 			/*
1194 			 * We have unbusied (m) temporarily so we can
1195 			 * acquire the vp lock without deadlocking.
1196 			 * (m) is held to prevent destruction.
1197 			 */
1198 			if (vget(vp, flags) != 0) {
1199 				*vpfailedp = vp;
1200 				++pageout_lock_miss;
1201 				if (object->flags & OBJ_MIGHTBEDIRTY)
1202 					    ++*vnodes_skippedp;
1203 				vm_page_unhold(m);
1204 				return 0;
1205 			}
1206 
1207 			/*
1208 			 * The page might have been moved to another
1209 			 * queue during potential blocking in vget()
1210 			 * above.  The page might have been freed and
1211 			 * reused for another vnode.  The object might
1212 			 * have been reused for another vnode.
1213 			 */
1214 			if (m->queue - m->pc != PQ_INACTIVE ||
1215 			    m->object != object ||
1216 			    object->handle != vp) {
1217 				if (object->flags & OBJ_MIGHTBEDIRTY)
1218 					++*vnodes_skippedp;
1219 				vput(vp);
1220 				vm_page_unhold(m);
1221 				return 0;
1222 			}
1223 
1224 			/*
1225 			 * The page may have been busied during the
1226 			 * blocking in vget() above.  We don't move the
1227 			 * page back onto the end of the queue, which
1228 			 * keeps the statistics more correct.
1229 			 */
1230 			if (vm_page_busy_try(m, TRUE)) {
1231 				vput(vp);
1232 				vm_page_unhold(m);
1233 				return 0;
1234 			}
1235 			vm_page_unhold(m);
1236 
1237 			/*
1238 			 * If it was wired while we didn't own it.
1239 			 */
1240 			if (m->wire_count) {
1241 				vm_page_unqueue_nowakeup(m);
1242 				vput(vp);
1243 				vm_page_wakeup(m);
1244 				return 0;
1245 			}
1246 
1247 			/*
1248 			 * (m) is busied again
1249 			 *
1250 			 * We own the busy bit and remove our hold
1251 			 * bit.  If the page is still held it
1252 			 * might be undergoing I/O, so skip it.
1253 			 */
1254 			if (m->hold_count) {
1255 				vm_page_and_queue_spin_lock(m);
1256 				if (m->queue - m->pc == PQ_INACTIVE) {
1257 					TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
1258 					TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1259 				}
1260 				vm_page_and_queue_spin_unlock(m);
1261 				if (object->flags & OBJ_MIGHTBEDIRTY)
1262 					++*vnodes_skippedp;
1263 				vm_page_wakeup(m);
1264 				vput(vp);
1265 				return 0;
1266 			}
1267 			/* (m) is left busied as we fall through */
1268 		}
1269 
1270 		/*
1271 		 * page is busy and not held here.
1272 		 *
1273 		 * If a page is dirty, then it is either being washed
1274 		 * (but not yet cleaned) or it is still in the
1275 		 * laundry.  If it is still in the laundry, then we
1276 		 * start the cleaning operation.
1277 		 *
1278 		 * decrement inactive_shortage on success to account
1279 		 * for the (future) cleaned page.  Otherwise we
1280 		 * could wind up laundering or cleaning too many
1281 		 * pages.
1282 		 *
1283 		 * NOTE: Cleaning the page here does not cause
1284 		 *	 force_deficit to be adjusted, because the
1285 		 *	 page is not being freed or moved to the
1286 		 *	 cache.
1287 		 */
1288 		count = vm_pageout_clean_helper(m, vmflush_flags);
1289 		*max_launderp -= count;
1290 
1291 		/*
1292 		 * Clean ate busy, page no longer accessible
1293 		 */
1294 		if (vp != NULL)
1295 			vput(vp);
1296 	} else {
1297 		vm_page_wakeup(m);
1298 	}
1299 	return count;
1300 }
1301 
1302 /*
1303  * Scan active queue
1304  *
1305  * WARNING! Can be called from two pagedaemon threads simultaneously.
1306  */
1307 static int
1308 vm_pageout_scan_active(int pass, int q,
1309 		       long avail_shortage, long inactive_shortage,
1310 		       long *recycle_countp)
1311 {
1312 	struct vm_page marker;
1313 	vm_page_t m;
1314 	int actcount;
1315 	long delta = 0;
1316 	long maxscan;
1317 	int isep;
1318 
1319 	isep = (curthread == emergpager);
1320 
1321 	/*
1322 	 * We want to move pages from the active queue to the inactive
1323 	 * queue to get the inactive queue to the inactive target.  If
1324 	 * we still have a page shortage from above we try to directly free
1325 	 * clean pages instead of moving them.
1326 	 *
1327 	 * If we do still have a shortage we keep track of the number of
1328 	 * pages we free or cache (recycle_count) as a measure of thrashing
1329 	 * between the active and inactive queues.
1330 	 *
1331 	 * If we were able to completely satisfy the free+cache targets
1332 	 * from the inactive pool we limit the number of pages we move
1333 	 * from the active pool to the inactive pool to 2x the pages we
1334 	 * had removed from the inactive pool (with a minimum of 1/5 the
1335 	 * inactive target).  If we were not able to completely satisfy
1336 	 * the free+cache targets we go for the whole target aggressively.
1337 	 *
1338 	 * NOTE: Both variables can end up negative.
1339 	 * NOTE: We are still in a critical section.
1340 	 *
1341 	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
1342 	 *	  PAGES.
1343 	 */
1344 
1345 	bzero(&marker, sizeof(marker));
1346 	marker.flags = PG_FICTITIOUS | PG_MARKER;
1347 	marker.busy_count = PBUSY_LOCKED;
1348 	marker.queue = PQ_ACTIVE + q;
1349 	marker.pc = q;
1350 	marker.wire_count = 1;
1351 
1352 	vm_page_queues_spin_lock(PQ_ACTIVE + q);
1353 	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1354 	maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt;
1355 
1356 	/*
1357 	 * Queue locked at top of loop to avoid stack marker issues.
1358 	 */
1359 	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1360 	       maxscan-- > 0 && (avail_shortage - delta > 0 ||
1361 				inactive_shortage > 0))
1362 	{
1363 		KKASSERT(m->queue == PQ_ACTIVE + q);
1364 		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1365 			     &marker, pageq);
1366 		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1367 				   &marker, pageq);
1368 
1369 		/*
1370 		 * Skip marker pages (atomic against other markers to avoid
1371 		 * infinite hop-over scans).
1372 		 */
1373 		if (m->flags & PG_MARKER)
1374 			continue;
1375 
1376 		/*
1377 		 * Try to busy the page.  Don't mess with pages which are
1378 		 * already busy or reorder them in the queue.
1379 		 */
1380 		if (vm_page_busy_try(m, TRUE))
1381 			continue;
1382 
1383 		/*
1384 		 * Remaining operations run with the page busy and neither
1385 		 * the page nor the queue will be spin-locked.
1386 		 */
1387 		KKASSERT(m->queue == PQ_ACTIVE + q);
1388 		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1389 
1390 #if 0
1391 		/*
1392 		 * Don't deactivate pages that are held, even if we can
1393 		 * busy them.  (XXX why not?)
1394 		 */
1395 		if (m->hold_count) {
1396 			vm_page_and_queue_spin_lock(m);
1397 			if (m->queue - m->pc == PQ_ACTIVE) {
1398 				TAILQ_REMOVE(
1399 					&vm_page_queues[PQ_ACTIVE + q].pl,
1400 					m, pageq);
1401 				TAILQ_INSERT_TAIL(
1402 					&vm_page_queues[PQ_ACTIVE + q].pl,
1403 					m, pageq);
1404 			}
1405 			vm_page_and_queue_spin_unlock(m);
1406 			vm_page_wakeup(m);
1407 			goto next;
1408 		}
1409 #endif
1410 		/*
1411 		 * We can just remove wired pages from the queue
1412 		 */
1413 		if (m->wire_count) {
1414 			vm_page_unqueue_nowakeup(m);
1415 			vm_page_wakeup(m);
1416 			goto next;
1417 		}
1418 
1419 		/*
1420 		 * The emergency pager ignores vnode-backed pages as these
1421 		 * are the pages that probably bricked the main pager.
1422 		 */
1423 		if (isep && m->object && m->object->type == OBJT_VNODE) {
1424 			vm_page_and_queue_spin_lock(m);
1425 			if (m->queue - m->pc == PQ_ACTIVE) {
1426 				TAILQ_REMOVE(
1427 					&vm_page_queues[PQ_ACTIVE + q].pl,
1428 					m, pageq);
1429 				TAILQ_INSERT_TAIL(
1430 					&vm_page_queues[PQ_ACTIVE + q].pl,
1431 					m, pageq);
1432 			}
1433 			vm_page_and_queue_spin_unlock(m);
1434 			vm_page_wakeup(m);
1435 			goto next;
1436 		}
1437 
1438 		/*
1439 		 * The count for pagedaemon pages is done after checking the
1440 		 * page for eligibility...
1441 		 */
1442 		mycpu->gd_cnt.v_pdpages++;
1443 
1444 		/*
1445 		 * Check to see "how much" the page has been used and clear
1446 		 * the tracking access bits.  If the object has no references
1447 		 * don't bother paying the expense.
1448 		 */
1449 		actcount = 0;
1450 		if (m->object && m->object->ref_count != 0) {
1451 			if (m->flags & PG_REFERENCED)
1452 				++actcount;
1453 			actcount += pmap_ts_referenced(m);
1454 			if (actcount) {
1455 				m->act_count += ACT_ADVANCE + actcount;
1456 				if (m->act_count > ACT_MAX)
1457 					m->act_count = ACT_MAX;
1458 			}
1459 		}
1460 		vm_page_flag_clear(m, PG_REFERENCED);
1461 
1462 		/*
1463 		 * actcount is only valid if the object ref_count is non-zero.
1464 		 * If the page does not have an object, actcount will be zero.
1465 		 */
1466 		if (actcount && m->object->ref_count != 0) {
1467 			vm_page_and_queue_spin_lock(m);
1468 			if (m->queue - m->pc == PQ_ACTIVE) {
1469 				TAILQ_REMOVE(
1470 					&vm_page_queues[PQ_ACTIVE + q].pl,
1471 					m, pageq);
1472 				TAILQ_INSERT_TAIL(
1473 					&vm_page_queues[PQ_ACTIVE + q].pl,
1474 					m, pageq);
1475 			}
1476 			vm_page_and_queue_spin_unlock(m);
1477 			vm_page_wakeup(m);
1478 		} else {
1479 			switch(m->object->type) {
1480 			case OBJT_DEFAULT:
1481 			case OBJT_SWAP:
1482 				m->act_count -= min(m->act_count,
1483 						    vm_anonmem_decline);
1484 				break;
1485 			default:
1486 				m->act_count -= min(m->act_count,
1487 						    vm_filemem_decline);
1488 				break;
1489 			}
1490 			if (vm_pageout_algorithm ||
1491 			    (m->object == NULL) ||
1492 			    (m->object && (m->object->ref_count == 0)) ||
1493 			    m->act_count < pass + 1
1494 			) {
1495 				/*
1496 				 * Deactivate the page.  If we had a
1497 				 * shortage from our inactive scan try to
1498 				 * free (cache) the page instead.
1499 				 *
1500 				 * Don't just blindly cache the page if
1501 				 * we do not have a shortage from the
1502 				 * inactive scan, that could lead to
1503 				 * gigabytes being moved.
1504 				 */
1505 				--inactive_shortage;
1506 				if (avail_shortage - delta > 0 ||
1507 				    (m->object && (m->object->ref_count == 0)))
1508 				{
1509 					if (avail_shortage - delta > 0)
1510 						++*recycle_countp;
1511 					vm_page_protect(m, VM_PROT_NONE);
1512 					if (m->dirty == 0 &&
1513 					    (m->flags & PG_NEED_COMMIT) == 0 &&
1514 					    avail_shortage - delta > 0) {
1515 						vm_page_cache(m);
1516 					} else {
1517 						vm_page_deactivate(m);
1518 						vm_page_wakeup(m);
1519 					}
1520 				} else {
1521 					vm_page_deactivate(m);
1522 					vm_page_wakeup(m);
1523 				}
1524 				++delta;
1525 			} else {
1526 				vm_page_and_queue_spin_lock(m);
1527 				if (m->queue - m->pc == PQ_ACTIVE) {
1528 					TAILQ_REMOVE(
1529 					    &vm_page_queues[PQ_ACTIVE + q].pl,
1530 					    m, pageq);
1531 					TAILQ_INSERT_TAIL(
1532 					    &vm_page_queues[PQ_ACTIVE + q].pl,
1533 					    m, pageq);
1534 				}
1535 				vm_page_and_queue_spin_unlock(m);
1536 				vm_page_wakeup(m);
1537 			}
1538 		}
1539 next:
1540 		lwkt_yield();
1541 		vm_page_queues_spin_lock(PQ_ACTIVE + q);
1542 	}
1543 
1544 	/*
1545 	 * Clean out our local marker.
1546 	 *
1547 	 * Page queue still spin-locked.
1548 	 */
1549 	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1550 	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1551 
1552 	return (delta);
1553 }
1554 
1555 /*
1556  * The number of actually free pages can drop down to v_free_reserved,
1557  * we try to build the free count back above v_free_min.  Note that
1558  * vm_paging_needed() also returns TRUE if v_free_count is not at
1559  * least v_free_min so that is the minimum we must build the free
1560  * count to.
1561  *
1562  * We use a slightly higher target to improve hysteresis,
1563  * ((v_free_target + v_free_min) / 2).  Since v_free_target
1564  * is usually the same as v_cache_min this maintains about
1565  * half the pages in the free queue as are in the cache queue,
1566  * providing pretty good pipelining for pageout operation.
1567  *
1568  * The system operator can manipulate vm.v_cache_min and
1569  * vm.v_free_target to tune the pageout demon.  Be sure
1570  * to keep vm.v_free_min < vm.v_free_target.
1571  *
1572  * Note that the original paging target is to get at least
1573  * (free_min + cache_min) into (free + cache).  The slightly
1574  * higher target will shift additional pages from cache to free
1575  * without affecting the original paging target in order to
1576  * maintain better hysteresis and not have the free count always
1577  * be dead-on v_free_min.
1578  *
1579  * NOTE: we are still in a critical section.
1580  *
1581  * Pages moved from PQ_CACHE to totally free are not counted in the
1582  * pages_freed counter.
1583  *
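 * Worked example (numbers are illustrative, not system defaults): if
 * v_free_min were 10000 pages and v_free_target were 30000 pages, the
 * loop below would replenish the free queue from PQ_CACHE until
 * v_free_count reaches (10000 + 30000) / 2 = 20000 pages, i.e. halfway
 * between the minimum and the free target.
 *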
1584  * WARNING! Can be called from two pagedaemon threads simultaneously.
1585  */
1586 static void
1587 vm_pageout_scan_cache(long avail_shortage, int pass,
1588 		      long vnodes_skipped, long recycle_count)
1589 {
1590 	static int lastkillticks;
1591 	struct vm_pageout_scan_info info;
1592 	vm_page_t m;
1593 	int isep;
1594 
1595 	isep = (curthread == emergpager);
1596 
1597 	while (vmstats.v_free_count <
1598 	       (vmstats.v_free_min + vmstats.v_free_target) / 2) {
1599 		/*
1600 		 * This steals some code from vm/vm_page.c
1601 		 *
1602 		 * Create two rovers and adjust the code to reduce
1603 		 * chances of them winding up at the same index (which
1604 		 * can cause a lot of contention).
1605 		 */
1606 		static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };
1607 
1608 		if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
1609 			goto next_rover;
1610 
1611 		m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
1612 		if (m == NULL)
1613 			break;
1614 
1615 		/*
1616 		 * If the busy attempt fails we can still deactivate the page.
1617 		 */
1618 		/* page is returned removed from its queue and spinlocked */
1619 		if (vm_page_busy_try(m, TRUE)) {
1620 			vm_page_deactivate_locked(m);
1621 			vm_page_spin_unlock(m);
1622 			continue;
1623 		}
1624 		vm_page_spin_unlock(m);
1625 		pagedaemon_wakeup();
1626 		lwkt_yield();
1627 
1628 		/*
1629 		 * Remaining operations run with the page busy and neither
1630 		 * the page nor the queue will be spin-locked.
1631 		 */
1632 		if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) ||
1633 		    m->hold_count ||
1634 		    m->wire_count) {
1635 			vm_page_deactivate(m);
1636 			vm_page_wakeup(m);
1637 			continue;
1638 		}
1639 		pmap_mapped_sync(m);
1640 		KKASSERT((m->flags & PG_MAPPED) == 0);
1641 		KKASSERT(m->dirty == 0);
1642 		vm_pageout_page_free(m);
1643 		mycpu->gd_cnt.v_dfree++;
1644 next_rover:
1645 		if (isep)
1646 			cache_rover[1] -= PQ_PRIME2;
1647 		else
1648 			cache_rover[0] += PQ_PRIME2;
1649 	}
1650 
1651 #if !defined(NO_SWAPPING)
1652 	/*
1653 	 * Idle process swapout -- run once per second.
1654 	 */
1655 	if (vm_swap_idle_enabled) {
1656 		static time_t lsec;
1657 		if (time_uptime != lsec) {
1658 			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
1659 			vm_req_vmdaemon();
1660 			lsec = time_uptime;
1661 		}
1662 	}
1663 #endif
1664 
1665 	/*
1666 	 * If we didn't get enough free pages, and we have skipped a vnode
1667  * in a writeable object, wake up the sync daemon.  And kick swapout
1668 	 * if we did not get enough free pages.
1669 	 */
1670 	if (vm_paging_target() > 0) {
1671 		if (vnodes_skipped && vm_page_count_min(0))
1672 			speedup_syncer(NULL);
1673 #if !defined(NO_SWAPPING)
1674 		if (vm_swap_enabled && vm_page_count_target()) {
1675 			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
1676 			vm_req_vmdaemon();
1677 		}
1678 #endif
1679 	}
1680 
1681 	/*
1682 	 * Handle catastrophic conditions.  Under good conditions we should
1683 	 * be at the target, well beyond our minimum.  If we could not even
1684 	 * reach our minimum the system is under heavy stress.  But just being
1685 	 * under heavy stress does not trigger process killing.
1686 	 *
1687 	 * We consider ourselves to have run out of memory if the swap pager
1688 	 * is full and avail_shortage is still positive.  The secondary check
1689 	 * ensures that we do not kill processes if the instantanious
1690 	 * availability is good, even if the pageout demon pass says it
1691 	 * couldn't get to the target.
1692 	 *
1693 	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
1694 	 *	  SITUATIONS.
1695 	 */
1696 	if (swap_pager_almost_full &&
1697 	    pass > 0 &&
1698 	    isep == 0 &&
1699 	    (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
1700 		kprintf("Warning: system low on memory+swap "
1701 			"shortage %ld for %d ticks!\n",
1702 			avail_shortage, ticks - swap_fail_ticks);
1703 		if (bootverbose)
1704 		kprintf("Metrics: spaf=%d spf=%d pass=%d "
1705 			"avail=%ld target=%ld last=%u\n",
1706 			swap_pager_almost_full,
1707 			swap_pager_full,
1708 			pass,
1709 			avail_shortage,
1710 			vm_paging_target(),
1711 			(unsigned int)(ticks - lastkillticks));
1712 	}
1713 	if (swap_pager_full &&
1714 	    pass > 1 &&
1715 	    isep == 0 &&
1716 	    avail_shortage > 0 &&
1717 	    vm_paging_target() > 0 &&
1718 	    (unsigned int)(ticks - lastkillticks) >= hz) {
1719 		/*
1720 		 * Kill something, maximum rate once per second to give
1721 		 * the process time to free up sufficient memory.
1722 		 */
1723 		lastkillticks = ticks;
1724 		info.bigproc = NULL;
1725 		info.bigsize = 0;
1726 		allproc_scan(vm_pageout_scan_callback, &info, 0);
1727 		if (info.bigproc != NULL) {
1728 			kprintf("Try to kill process %d %s\n",
1729 				info.bigproc->p_pid, info.bigproc->p_comm);
1730 			info.bigproc->p_nice = PRIO_MIN;
1731 			info.bigproc->p_usched->resetpriority(
1732 				FIRST_LWP_IN_PROC(info.bigproc));
1733 			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
1734 			killproc(info.bigproc, "out of swap space");
1735 			wakeup(&vmstats.v_free_count);
1736 			PRELE(info.bigproc);
1737 		}
1738 	}
1739 }
1740 
1741 static int
1742 vm_pageout_scan_callback(struct proc *p, void *data)
1743 {
1744 	struct vm_pageout_scan_info *info = data;
1745 	vm_offset_t size;
1746 
1747 	/*
1748 	 * Never kill system processes or init.  If we have configured swap
1749 	 * then try to avoid killing low-numbered pids.
1750 	 */
1751 	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
1752 	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
1753 		return (0);
1754 	}
1755 
1756 	lwkt_gettoken(&p->p_token);
1757 
1758 	/*
1759 	 * If the process is in a non-running state,
1760 	 * don't touch it.
1761 	 */
1762 	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
1763 		lwkt_reltoken(&p->p_token);
1764 		return (0);
1765 	}
1766 
1767 	/*
1768 	 * Get the approximate process size.  Note that anonymous pages
1769 	 * with backing swap will be counted twice, but there should not
1770 	 * be too many such pages due to the stress the VM system is
1771 	 * under at this point.
1772 	 */
1773 	size = vmspace_anonymous_count(p->p_vmspace) +
1774 		vmspace_swap_count(p->p_vmspace);
1775 
1776 	/*
1777 	 * If this process is bigger than the biggest one so far,
1778 	 * remember it.
1779 	 */
1780 	if (info->bigsize < size) {
1781 		if (info->bigproc)
1782 			PRELE(info->bigproc);
1783 		PHOLD(p);
1784 		info->bigproc = p;
1785 		info->bigsize = size;
1786 	}
1787 	lwkt_reltoken(&p->p_token);
1788 	lwkt_yield();
1789 
1790 	return (0);
1791 }
1792 
1793 /*
1794  * This old guy slowly walks PQ_HOLD looking for pages which need to be
1795  * moved back to PQ_FREE.  It is possible for pages to accumulate here
1796  * when vm_page_free() races against vm_page_unhold(), resulting in a
1797  * page being left on a PQ_HOLD queue with hold_count == 0.
1798  *
1799  * It is easier to handle this edge condition here, in non-critical code,
1800  * rather than enforce a spin-lock for every 1->0 transition in
1801  * vm_page_unhold().
1802  *
1803  * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
1804  */
1805 static void
1806 vm_pageout_scan_hold(int q)
1807 {
1808 	vm_page_t m;
1809 
1810 	vm_page_queues_spin_lock(PQ_HOLD + q);
1811 	TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
1812 		if (m->flags & PG_MARKER)
1813 			continue;
1814 
1815 		/*
1816 		 * Process one page and return
1817 		 */
1818 		if (m->hold_count)
1819 			break;
1820 		kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
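		/*
		 * Hold the page so it cannot be freed out from under us
		 * while we drop the queue spinlock, then let the normal
		 * vm_page_unhold() 1->0 transition reprocess the page and
		 * (presumably) move it back off PQ_HOLD.
		 */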
1821 		vm_page_hold(m);
1822 		vm_page_queues_spin_unlock(PQ_HOLD + q);
1823 		vm_page_unhold(m);	/* reprocess */
1824 		return;
1825 	}
1826 	vm_page_queues_spin_unlock(PQ_HOLD + q);
1827 }
1828 
1829 /*
1830  * This routine tries to maintain the pseudo-LRU active queue so that
1831  * some statistics accumulation still occurs during long periods when
1832  * there is no paging.  This helps the situation where paging is just
1833  * starting to occur.
1834  */
1835 static void
1836 vm_pageout_page_stats(int q)
1837 {
1838 	static int fullintervalcount = 0;
1839 	struct vm_page marker;
1840 	vm_page_t m;
1841 	long pcount, tpcount;		/* Number of pages to check */
1842 	long page_shortage;
1843 
1844 	page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
1845 			 vmstats.v_free_min) -
1846 			(vmstats.v_free_count + vmstats.v_inactive_count +
1847 			 vmstats.v_cache_count);
1848 
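	/*
	 * Illustrative arithmetic only (hypothetical numbers): with an
	 * inactive target of 10000 pages, cache_max of 20000 and free_min
	 * of 5000, versus 3000 free + 8000 inactive + 15000 cache pages,
	 * page_shortage = 35000 - 26000 = 9000 and the stats scan runs.
	 */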
1849 	if (page_shortage <= 0)
1850 		return;
1851 
1852 	pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
1853 	fullintervalcount += vm_pageout_stats_interval;
1854 	if (fullintervalcount < vm_pageout_full_stats_interval) {
1855 		tpcount = (vm_pageout_stats_max * pcount) /
1856 			  vmstats.v_page_count + 1;
1857 		if (pcount > tpcount)
1858 			pcount = tpcount;
1859 	} else {
1860 		fullintervalcount = 0;
1861 	}
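	/*
	 * Example (hypothetical numbers): with vm_pageout_stats_max at
	 * 20000, an active queue length (pcount) of 50000 and 1000000
	 * managed pages, tpcount = 20000 * 50000 / 1000000 + 1 = 1001,
	 * so only ~1000 active pages are sampled on a partial interval.
	 * A full interval scans the entire queue.
	 */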
1862 
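	/*
	 * Set up a local dummy marker to hold our place in the queue.
	 * PG_FICTITIOUS | PG_MARKER identify it as an entry that other
	 * scans skip, letting us drop the queue spinlock below without
	 * losing our scan position.
	 */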
1863 	bzero(&marker, sizeof(marker));
1864 	marker.flags = PG_FICTITIOUS | PG_MARKER;
1865 	marker.busy_count = PBUSY_LOCKED;
1866 	marker.queue = PQ_ACTIVE + q;
1867 	marker.pc = q;
1868 	marker.wire_count = 1;
1869 
1870 	vm_page_queues_spin_lock(PQ_ACTIVE + q);
1871 	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1872 
1873 	/*
1874 	 * Queue locked at top of loop to avoid stack marker issues.
1875 	 */
1876 	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1877 	       pcount-- > 0)
1878 	{
1879 		int actcount;
1880 
1881 		KKASSERT(m->queue == PQ_ACTIVE + q);
1882 		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1883 		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1884 				   &marker, pageq);
1885 
1886 		/*
1887 		 * Skip marker pages (atomic against other markers to avoid
1888 		 * infinite hop-over scans).
1889 		 */
1890 		if (m->flags & PG_MARKER)
1891 			continue;
1892 
1893 		/*
1894 		 * Ignore pages we can't busy
1895 		 */
1896 		if (vm_page_busy_try(m, TRUE))
1897 			continue;
1898 
1899 		/*
1900 		 * Remaining operations run with the page busy and neither
1901 		 * the page nor the queue will be spin-locked.
1902 		 */
1903 		KKASSERT(m->queue == PQ_ACTIVE + q);
1904 		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1905 
1906 		/*
1907 		 * We can just remove wired pages from the queue
1908 		 */
1909 		if (m->wire_count) {
1910 			vm_page_unqueue_nowakeup(m);
1911 			vm_page_wakeup(m);
1912 			goto next;
1913 		}
1914 
1916 		/*
1917 		 * We now have a safely busied page; the page and queue
1918 		 * spinlocks have been released.
1919 		 *
1920 		 * Ignore held and wired pages
1921 		 */
1922 		if (m->hold_count || m->wire_count) {
1923 			vm_page_wakeup(m);
1924 			goto next;
1925 		}
1926 
1927 		/*
1928 		 * Calculate activity
1929 		 */
1930 		actcount = 0;
1931 		if (m->flags & PG_REFERENCED) {
1932 			vm_page_flag_clear(m, PG_REFERENCED);
1933 			actcount += 1;
1934 		}
1935 		actcount += pmap_ts_referenced(m);
1936 
1937 		/*
1938 		 * Update act_count and move page to end of queue.
1939 		 */
1940 		if (actcount) {
1941 			m->act_count += ACT_ADVANCE + actcount;
1942 			if (m->act_count > ACT_MAX)
1943 				m->act_count = ACT_MAX;
1944 			vm_page_and_queue_spin_lock(m);
1945 			if (m->queue - m->pc == PQ_ACTIVE) {
1946 				TAILQ_REMOVE(
1947 					&vm_page_queues[PQ_ACTIVE + q].pl,
1948 					m, pageq);
1949 				TAILQ_INSERT_TAIL(
1950 					&vm_page_queues[PQ_ACTIVE + q].pl,
1951 					m, pageq);
1952 			}
1953 			vm_page_and_queue_spin_unlock(m);
1954 			vm_page_wakeup(m);
1955 			goto next;
1956 		}
1957 
1958 		if (m->act_count == 0) {
1959 			/*
1960 			 * We turn off page access, so that we have
1961 			 * more accurate RSS stats.  We don't do this
1962 			 * in the normal page deactivation when the
1963 			 * system is loaded VM wise, because the
1964 			 * cost of the large number of page protect
1965 			 * operations would be higher than the value
1966 			 * of doing the operation.
1967 			 *
1968 			 * We use the marker to save our place so
1969 			 * we can release the spin lock.  both (m)
1970 			 * and (next) will be invalid.
1971 			 */
1972 			vm_page_protect(m, VM_PROT_NONE);
1973 			vm_page_deactivate(m);
1974 		} else {
1975 			m->act_count -= min(m->act_count, ACT_DECLINE);
1976 			vm_page_and_queue_spin_lock(m);
1977 			if (m->queue - m->pc == PQ_ACTIVE) {
1978 				TAILQ_REMOVE(
1979 					&vm_page_queues[PQ_ACTIVE + q].pl,
1980 					m, pageq);
1981 				TAILQ_INSERT_TAIL(
1982 					&vm_page_queues[PQ_ACTIVE + q].pl,
1983 					m, pageq);
1984 			}
1985 			vm_page_and_queue_spin_unlock(m);
1986 		}
1987 		vm_page_wakeup(m);
1988 next:
1989 		vm_page_queues_spin_lock(PQ_ACTIVE + q);
1990 	}
1991 
1992 	/*
1993 	 * Remove our local marker
1994 	 *
1995 	 * Page queue still spin-locked.
1996 	 */
1997 	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1998 	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1999 }
2000 
2001 static void
2002 vm_pageout_free_page_calc(vm_size_t count)
2003 {
2004 	/*
2005 	 * v_free_min		normal allocations
2006 	 * v_free_reserved	system allocations
2007 	 * v_pageout_free_min	allocations by pageout daemon
2008 	 * v_interrupt_free_min	low level allocations (e.g. swap structures)
2009 	 *
2010 	 * v_free_min is used to generate several other baselines, and they
2011 	 * can get pretty silly on systems with a lot of memory.
2012 	 */
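	/*
	 * Illustrative arithmetic only, assuming roughly 4GiB of managed
	 * memory (~1048576 4KiB pages): v_free_min = 64 + 1048576 / 200
	 * ~= 5306 pages (~20MiB), v_free_reserved ~= 2660, v_free_severe
	 * ~= 2653, v_pageout_free_min ~= 1333, v_interrupt_free_min ~= 670.
	 */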
2013 	vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
2014 	vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
2015 	vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
2016 	vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
2017 	vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
2018 }
2019 
2020 
2021 /*
2022  * vm_pageout is the high level pageout daemon.  TWO kernel threads run
2023  * this daemon, the primary pageout daemon and the emergency pageout daemon.
2024  *
2025  * The emergency pageout daemon takes over when the primary pageout daemon
2026  * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
2027  * avoiding the many low-memory deadlocks which can occur when paging out
2028  * to VFS's.
2029  */
2030 static void
2031 vm_pageout_thread(void)
2032 {
2033 	int pass;
2034 	int q;
2035 	int q1iterator = 0;
2036 	int q2iterator = 0;
2037 	int q3iterator = 0;
2038 	int isep;
2039 
2040 	curthread->td_flags |= TDF_SYSTHREAD;
2041 
2042 	/*
2043 	 * We only need to set up once.
2044 	 */
2045 	isep = 0;
2046 	if (curthread == emergpager) {
2047 		isep = 1;
2048 		goto skip_setup;
2049 	}
2050 
2051 	/*
2052 	 * Initialize some paging parameters.
2053 	 */
2054 	vm_pageout_free_page_calc(vmstats.v_page_count);
2055 
2056 	/*
2057 	 * v_free_target and v_cache_min control pageout hysteresis.  Note
2058 	 * that these are more a measure of the VM cache queue hysteresis
2059 	 * than the VM free queue.  Specifically, v_free_target is the
2060 	 * high water mark (free+cache pages).
2061 	 *
2062 	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
2063 	 * low water mark, while v_free_min is the stop.  v_cache_min must
2064 	 * be big enough to handle memory needs while the pageout daemon
2065 	 * is signalled and run to free more pages.
2066 	 */
2067 	vmstats.v_free_target = 4 * vmstats.v_free_min +
2068 				vmstats.v_free_reserved;
2069 
2070 	/*
2071 	 * NOTE: With the new buffer cache b_act_count we want the default
2072 	 *	 inactive target to be a percentage of available memory.
2073 	 *
2074 	 *	 The inactive target essentially determines the minimum
2075 	 *	 number of 'temporary' pages capable of caching one-time-use
2076 	 *	 files when the VM system is otherwise full of pages
2077 	 *	 belonging to multi-time-use files or active program data.
2078 	 *
2079 	 * NOTE: The inactive target is aggressively pursued only if the
2080 	 *	 inactive queue becomes too small.  If the inactive queue
2081 	 *	 is large enough to satisfy page movement to free+cache
2082 	 *	 then it is repopulated more slowly from the active queue.
2083 	 *	 This allows a general inactive_target default to be set.
2084 	 *
2085 	 *	 There is an issue here for processes which sit mostly idle
2086 	 *	 'overnight', such as sshd, tcsh, and X.  Any movement from
2087 	 *	 the active queue will eventually cause such pages to
2088 	 *	 recycle, causing a lot of paging in the morning.  To
2089 	 *	 reduce the incidence of this, pages cycled out of the
2090 	 *	 buffer cache are moved directly to the inactive queue if
2091 	 *	 they were only used once or twice.
2092 	 *
2093 	 *	 The vfs.vm_cycle_point sysctl can be used to adjust this.
2094 	 *	 Increasing the value (up to 64) increases the number of
2095 	 *	 buffer recyclements which go directly to the inactive queue.
2096 	 */
2097 	if (vmstats.v_free_count > 2048) {
2098 		vmstats.v_cache_min = vmstats.v_free_target;
2099 		vmstats.v_cache_max = 2 * vmstats.v_cache_min;
2100 	} else {
2101 		vmstats.v_cache_min = 0;
2102 		vmstats.v_cache_max = 0;
2103 	}
2104 	vmstats.v_inactive_target = vmstats.v_free_count / 4;
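	/*
	 * Continuing the illustrative 4GiB example from
	 * vm_pageout_free_page_calc(): v_free_target = 4 * 5306 + 2660
	 * ~= 23884 pages (~93MiB), v_cache_min ~= 23884, v_cache_max
	 * ~= 47768, and v_inactive_target is a quarter of whatever is
	 * free at this point during boot.
	 */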
2105 
2106 	/* XXX does not really belong here */
2107 	if (vm_page_max_wired == 0)
2108 		vm_page_max_wired = vmstats.v_free_count / 3;
2109 
2110 	if (vm_pageout_stats_max == 0)
2111 		vm_pageout_stats_max = vmstats.v_free_target;
2112 
2113 	/*
2114 	 * Set interval in seconds for stats scan.
2115 	 */
2116 	if (vm_pageout_stats_interval == 0)
2117 		vm_pageout_stats_interval = 5;
2118 	if (vm_pageout_full_stats_interval == 0)
2119 		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
2120 
2122 	/*
2123 	 * Set maximum free per pass
2124 	 */
2125 	if (vm_pageout_stats_free_max == 0)
2126 		vm_pageout_stats_free_max = 5;
2127 
2128 	swap_pager_swap_init();
2129 	pass = 0;
2130 
2131 	atomic_swap_int(&sequence_emerg_pager, 1);
2132 	wakeup(&sequence_emerg_pager);
2133 
2134 skip_setup:
2135 	/*
2136 	 * Sequence emergency pager startup
2137 	 */
2138 	if (isep) {
2139 		while (sequence_emerg_pager == 0)
2140 			tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
2141 	}
2142 
2143 	/*
2144 	 * The pageout daemon is never done, so loop forever.
2145 	 *
2146 	 * WARNING!  This code is being executed by two kernel threads
2147 	 *	     potentially simultaneously.
2148 	 */
2149 	while (TRUE) {
2150 		int error;
2151 		long avail_shortage;
2152 		long inactive_shortage;
2153 		long vnodes_skipped = 0;
2154 		long recycle_count = 0;
2155 		long tmp;
2156 
2157 		/*
2158 		 * Wait for an action request.  If we time out, check to
2159 		 * see if paging is needed (in case the normal wakeup
2160 		 * code raced us).
2161 		 */
2162 		if (isep) {
2163 			/*
2164 			 * Emergency pagedaemon monitors the primary
2165 			 * pagedaemon while vm_pages_needed != 0.
2166 			 *
2167 			 * The emergency pagedaemon only runs if VM paging
2168 			 * is needed and the primary pagedaemon has not
2169 			 * updated vm_pagedaemon_time for more than 2 seconds.
2170 			 */
2171 			if (vm_pages_needed)
2172 				tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
2173 			else
2174 				tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
2175 			if (vm_pages_needed == 0) {
2176 				pass = 0;
2177 				continue;
2178 			}
2179 			if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
2180 				pass = 0;
2181 				continue;
2182 			}
2183 		} else {
2184 			/*
2185 			 * Primary pagedaemon
2186 			 *
2187 			 * NOTE: We unconditionally clean up PQ_HOLD even
2188 			 *	 when there is no work to do.
2189 			 */
2190 			vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
2191 			++q3iterator;
2192 
2193 			if (vm_pages_needed == 0) {
2194 				error = tsleep(&vm_pages_needed,
2195 					       0, "psleep",
2196 					       vm_pageout_stats_interval * hz);
2197 				if (error &&
2198 				    vm_paging_needed(0) == 0 &&
2199 				    vm_pages_needed == 0) {
2200 					for (q = 0; q < PQ_L2_SIZE; ++q)
2201 						vm_pageout_page_stats(q);
2202 					continue;
2203 				}
2204 				vm_pagedaemon_time = ticks;
2205 				vm_pages_needed = 1;
2206 
2207 				/*
2208 				 * Wake the emergency pagedaemon up so it
2209 				 * can monitor us.  It will automatically
2210 				 * go back into a long sleep when
2211 				 * vm_pages_needed returns to 0.
2212 				 */
2213 				wakeup(&vm_pagedaemon_time);
2214 			}
2215 		}
2216 
2217 		mycpu->gd_cnt.v_pdwakeups++;
2218 
2219 		/*
2220 		 * Scan for INACTIVE->CLEAN/PAGEOUT
2221 		 *
2222 		 * This routine tries to avoid thrashing the system with
2223 		 * unnecessary activity.
2224 		 *
2225 		 * Calculate our target for the number of free+cache pages we
2226 		 * want to get to.  This is higher than the number that causes
2227 		 * allocations to stall (severe) in order to provide hysteresis,
2228 		 * and if we don't make it all the way but get to the minimum
2229 		 * we're happy.  Goose it a bit if there are multiple requests
2230 		 * for memory.
2231 		 *
2232 		 * Don't reduce avail_shortage inside the loop or the
2233 		 * PQAVERAGE() calculation will break.
2234 		 *
2235 		 * NOTE! deficit is differentiated from avail_shortage as
2236 		 *	 REQUIRING at least (deficit) pages to be cleaned,
2237 		 *	 even if the page queues are in good shape.  This
2238 		 *	 is used primarily for handling per-process
2239 		 *	 RLIMIT_RSS and may also see small values when
2240 		 *	 processes block due to low memory.
2241 		 */
2242 		vmstats_rollup();
2243 		if (isep == 0)
2244 			vm_pagedaemon_time = ticks;
2245 		avail_shortage = vm_paging_target() + vm_pageout_deficit;
2246 		vm_pageout_deficit = 0;
2247 
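		/*
		 * The shortage is spread over the PQ_L2_SIZE inactive
		 * sub-queues below.  PQAVERAGE() is intended to hand each
		 * per-queue scan an approximately equal slice of
		 * avail_shortage (e.g. if PQ_L2_SIZE were 64, a 6400 page
		 * shortage would ask each scan for roughly 100 pages),
		 * which is why avail_shortage itself must not be reduced
		 * inside the loop.
		 */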
2248 		if (avail_shortage > 0) {
2249 			long delta = 0;
2250 			int qq;
2251 
2252 			qq = q1iterator;
2253 			for (q = 0; q < PQ_L2_SIZE; ++q) {
2254 				delta += vm_pageout_scan_inactive(
2255 					    pass,
2256 					    qq & PQ_L2_MASK,
2257 					    PQAVERAGE(avail_shortage),
2258 					    &vnodes_skipped);
2259 				if (isep)
2260 					--qq;
2261 				else
2262 					++qq;
2263 				if (avail_shortage - delta <= 0)
2264 					break;
2265 			}
2266 			avail_shortage -= delta;
2267 			q1iterator = qq;
2268 		}
2269 
2270 		/*
2271 		 * Figure out how many active pages we must deactivate.  If
2272 		 * we were able to reach our target with just the inactive
2273 		 * scan above we limit the number of active pages we
2274 		 * deactivate to reduce unnecessary work.
2275 		 */
2276 		vmstats_rollup();
2277 		if (isep == 0)
2278 			vm_pagedaemon_time = ticks;
2279 		inactive_shortage = vmstats.v_inactive_target -
2280 				    vmstats.v_inactive_count;
2281 
2282 		/*
2283 		 * If we were unable to free sufficient inactive pages to
2284 		 * satisfy the free/cache queue requirements then simply
2285 		 * reaching the inactive target may not be good enough.
2286 		 * Try to deactivate pages in excess of the target based
2287 		 * on the shortfall.
2288 		 *
2289 		 * However, to prevent thrashing the VM system, do not
2290 		 * deactivate more than an additional 1/10 of the inactive
2291 		 * target's worth of active pages.
2292 		 */
2293 		if (avail_shortage > 0) {
2294 			tmp = avail_shortage * 2;
2295 			if (tmp > vmstats.v_inactive_target / 10)
2296 				tmp = vmstats.v_inactive_target / 10;
2297 			inactive_shortage += tmp;
2298 		}
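		/*
		 * Example (hypothetical numbers): with avail_shortage at
		 * 500 and v_inactive_target at 100000, tmp = 1000, well
		 * under the 10000 page cap, so the active scan below is
		 * asked to deactivate up to 1000 extra pages.
		 */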
2299 
2300 		/*
2301 		 * Only trigger a pmap cleanup on inactive shortage.
2302 		 */
2303 		if (isep == 0 && inactive_shortage > 0) {
2304 			pmap_collect();
2305 		}
2306 
2307 		/*
2308 		 * Scan for ACTIVE->INACTIVE
2309 		 *
2310 		 * Only trigger on inactive shortage.  Triggering on
2311 		 * avail_shortage can starve the active queue with
2312 		 * unnecessary active->inactive transitions and destroy
2313 		 * performance.
2314 		 *
2315 		 * If this is the emergency pager, always try to move
2316 		 * a few pages from active to inactive because the inactive
2317 		 * queue might have enough pages, but not enough anonymous
2318 		 * pages.
2319 		 */
2320 		if (isep && inactive_shortage < vm_emerg_launder)
2321 			inactive_shortage = vm_emerg_launder;
2322 
2323 		if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
2324 			long delta = 0;
2325 			int qq;
2326 
2327 			qq = q2iterator;
2328 			for (q = 0; q < PQ_L2_SIZE; ++q) {
2329 				delta += vm_pageout_scan_active(
2330 						pass,
2331 						qq & PQ_L2_MASK,
2332 						PQAVERAGE(avail_shortage),
2333 						PQAVERAGE(inactive_shortage),
2334 						&recycle_count);
2335 				if (isep)
2336 					--qq;
2337 				else
2338 					++qq;
2339 				if (inactive_shortage - delta <= 0 &&
2340 				    avail_shortage - delta <= 0) {
2341 					break;
2342 				}
2343 			}
2344 			inactive_shortage -= delta;
2345 			avail_shortage -= delta;
2346 			q2iterator = qq;
2347 		}
2348 
2349 		/*
2350 		 * Scan for CACHE->FREE
2351 		 *
2352 		 * Finally free enough cache pages to meet our free page
2353 		 * requirement and take more drastic measures if we are
2354 		 * still in trouble.
2355 		 */
2356 		vmstats_rollup();
2357 		if (isep == 0)
2358 			vm_pagedaemon_time = ticks;
2359 		vm_pageout_scan_cache(avail_shortage, pass,
2360 				      vnodes_skipped, recycle_count);
2361 
2362 		/*
2363 		 * Wait for more work.
2364 		 */
2365 		if (avail_shortage > 0) {
2366 			++pass;
2367 			if (pass < 10 && vm_pages_needed > 1) {
2368 				/*
2369 				 * Normal operation, additional processes
2370 				 * have already kicked us.  Retry immediately
2371 				 * unless swap space is completely full in
2372 				 * which case delay a bit.
2373 				 */
2374 				if (swap_pager_full) {
2375 					tsleep(&vm_pages_needed, 0, "pdelay",
2376 						hz / 5);
2377 				} /* else immediate retry */
2378 			} else if (pass < 10) {
2379 				/*
2380 				 * Normal operation, fewer processes.  Delay
2381 				 * a bit but allow wakeups.  vm_pages_needed
2382 				 * is only adjusted against the primary
2383 				 * pagedaemon here.
2384 				 */
2385 				if (isep == 0)
2386 					vm_pages_needed = 0;
2387 				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2388 				if (isep == 0)
2389 					vm_pages_needed = 1;
2390 			} else if (swap_pager_full == 0) {
2391 				/*
2392 				 * We've taken too many passes, forced delay.
2393 				 */
2394 				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2395 			} else {
2396 				/*
2397 				 * Running out of memory, catastrophic
2398 				 * back-off to one-second intervals.
2399 				 */
2400 				tsleep(&vm_pages_needed, 0, "pdelay", hz);
2401 			}
2402 		} else if (vm_pages_needed) {
2403 			/*
2404 			 * Interlocked wakeup of waiters (non-optional).
2405 			 *
2406 			 * Similar to vm_page_free_wakeup() in vm_page.c,
2407 			 * wake anyone sleeping on v_free_count if the counts allow.
2408 			 */
2409 			pass = 0;
2410 			if (!vm_page_count_min(vm_page_free_hysteresis) ||
2411 			    !vm_page_count_target()) {
2412 				vm_pages_needed = 0;
2413 				wakeup(&vmstats.v_free_count);
2414 			}
2415 		} else {
2416 			pass = 0;
2417 		}
2418 	}
2419 }
2420 
2421 static struct kproc_desc pg1_kp = {
2422 	"pagedaemon",
2423 	vm_pageout_thread,
2424 	&pagethread
2425 };
2426 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);
2427 
2428 static struct kproc_desc pg2_kp = {
2429 	"emergpager",
2430 	vm_pageout_thread,
2431 	&emergpager
2432 };
2433 SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);
2434 
2435 
2436 /*
2437  * Called after allocating a page out of the cache or free queue
2438  * to possibly wake the pagedaemon up to replenish our supply.
2439  *
2440  * We try to generate some hysteresis by waking the pagedaemon up
2441  * when our free+cache pages go below the free_min+cache_min level.
2442  * The pagedaemon tries to get the count back up to at least the
2443  * minimum, and through to the target level if possible.
2444  *
2445  * If the pagedaemon is already active bump vm_pages_needed as a hint
2446  * that there are even more requests pending.
2447  *
2448  * SMP races ok?
2449  * No requirements.
2450  */
2451 void
2452 pagedaemon_wakeup(void)
2453 {
2454 	if (vm_paging_needed(0) && curthread != pagethread) {
2455 		if (vm_pages_needed == 0) {
2456 			vm_pages_needed = 1;	/* SMP race ok */
2457 			wakeup(&vm_pages_needed);
2458 		} else if (vm_page_count_min(0)) {
2459 			++vm_pages_needed;	/* SMP race ok */
2460 		}
2461 	}
2462 }
2463 
2464 #if !defined(NO_SWAPPING)
2465 
2466 /*
2467  * SMP races ok?
2468  * No requirements.
2469  */
2470 static void
2471 vm_req_vmdaemon(void)
2472 {
2473 	static int lastrun = 0;
2474 
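	/*
	 * Rate-limit wakeups of the vm daemon to roughly once per second.
	 * The (ticks < lastrun) test handles the tick counter wrapping.
	 */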
2475 	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2476 		wakeup(&vm_daemon_needed);
2477 		lastrun = ticks;
2478 	}
2479 }
2480 
2481 static int vm_daemon_callback(struct proc *p, void *data __unused);
2482 
2483 /*
2484  * No requirements.
2485  */
2486 static void
2487 vm_daemon(void)
2488 {
2489 	int req_swapout;
2490 
2491 	while (TRUE) {
2492 		tsleep(&vm_daemon_needed, 0, "psleep", 0);
2493 		req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);
2494 
2495 		/*
2496 		 * forced swapouts
2497 		 */
2498 		if (req_swapout)
2499 			swapout_procs(vm_pageout_req_swapout);
2500 
2501 		/*
2502 		 * Scan the processes and deactivate pages for any that
2503 		 * exceed their rlimits or are swapped out
2504 		 */
2505 		allproc_scan(vm_daemon_callback, NULL, 0);
2506 	}
2507 }
2508 
2509 static int
2510 vm_daemon_callback(struct proc *p, void *data __unused)
2511 {
2512 	struct vmspace *vm;
2513 	vm_pindex_t limit, size;
2514 
2515 	/*
2516 	 * If this is a system process or the process is exiting,
2517 	 * skip it.
2518 	 */
2519 	lwkt_gettoken(&p->p_token);
2520 
2521 	if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
2522 		lwkt_reltoken(&p->p_token);
2523 		return (0);
2524 	}
2525 
2526 	/*
2527 	 * if the process is in a non-running type state,
2528 	 * don't touch it.
2529 	 */
2530 	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
2531 		lwkt_reltoken(&p->p_token);
2532 		return (0);
2533 	}
2534 
2535 	/*
2536 	 * get a limit
2537 	 */
2538 	limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
2539 			        p->p_rlimit[RLIMIT_RSS].rlim_max));
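	/*
	 * e.g. (hypothetical numbers) an RSS rlimit of 64MB converts to
	 * a limit of 16384 pages with 4KB pages.
	 */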
2540 
2541 	/*
2542 	 * let processes that are swapped out really be
2543 	 * swapped out.  Set the limit to nothing to get as
2544 	 * many pages out to swap as possible.
2545 	 */
2546 	if (p->p_flags & P_SWAPPEDOUT)
2547 		limit = 0;
2548 
2549 	vm = p->p_vmspace;
2550 	vmspace_hold(vm);
2551 	size = pmap_resident_tlnw_count(&vm->vm_pmap);
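	/*
	 * Only deactivate pages when memuse mode is enabled and the
	 * resident count exceeds the limit by at least 4096 pages
	 * (16MB of slop with 4KB pages).
	 */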
2552 	if (limit >= 0 && size > 4096 &&
2553 	    size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
2554 		vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
2555 	}
2556 	vmspace_drop(vm);
2557 
2558 	lwkt_reltoken(&p->p_token);
2559 
2560 	return (0);
2561 }
2562 
2563 #endif
2564