xref: /dragonfly/sys/vm/vm_pageout.c (revision dcd37f7d)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 1991 Regents of the University of California.
5  * All rights reserved.
6  * Copyright (c) 1994 John S. Dyson
7  * All rights reserved.
8  * Copyright (c) 1994 David Greenman
9  * All rights reserved.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * The Mach Operating System project at Carnegie-Mellon University.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
43  *
44  *
45  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
46  * All rights reserved.
47  *
48  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
49  *
50  * Permission to use, copy, modify and distribute this software and
51  * its documentation is hereby granted, provided that both the copyright
52  * notice and this permission notice appear in all copies of the
53  * software, derivative works or modified versions, and any portions
54  * thereof, and that both notices appear in supporting documentation.
55  *
56  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
57  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
58  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
59  *
60  * Carnegie Mellon requests users of this software to return to
61  *
62  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
63  *  School of Computer Science
64  *  Carnegie Mellon University
65  *  Pittsburgh PA 15213-3890
66  *
67  * any improvements or extensions that they make and grant Carnegie the
68  * rights to redistribute these changes.
69  *
70  * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
71  * $DragonFly: src/sys/vm/vm_pageout.c,v 1.36 2008/07/01 02:02:56 dillon Exp $
72  */
73 
74 /*
75  *	The proverbial page-out daemon.
76  */
77 
78 #include "opt_vm.h"
79 #include <sys/param.h>
80 #include <sys/systm.h>
81 #include <sys/kernel.h>
82 #include <sys/proc.h>
83 #include <sys/kthread.h>
84 #include <sys/resourcevar.h>
85 #include <sys/signalvar.h>
86 #include <sys/vnode.h>
87 #include <sys/vmmeter.h>
88 #include <sys/sysctl.h>
89 
90 #include <vm/vm.h>
91 #include <vm/vm_param.h>
92 #include <sys/lock.h>
93 #include <vm/vm_object.h>
94 #include <vm/vm_page.h>
95 #include <vm/vm_map.h>
96 #include <vm/vm_pageout.h>
97 #include <vm/vm_pager.h>
98 #include <vm/swap_pager.h>
99 #include <vm/vm_extern.h>
100 
101 #include <sys/thread2.h>
102 #include <vm/vm_page2.h>
103 
104 /*
105  * System initialization
106  */
107 
108 /* the kernel process "vm_pageout"*/
109 static void vm_pageout (void);
110 static int vm_pageout_clean (vm_page_t);
111 static int vm_pageout_scan (int pass);
112 static int vm_pageout_free_page_calc (vm_size_t count);
113 struct thread *pagethread;
114 
115 static struct kproc_desc page_kp = {
116 	"pagedaemon",
117 	vm_pageout,
118 	&pagethread
119 };
120 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)
121 
122 #if !defined(NO_SWAPPING)
123 /* the kernel process "vm_daemon"*/
124 static void vm_daemon (void);
125 static struct	thread *vmthread;
126 
127 static struct kproc_desc vm_kp = {
128 	"vmdaemon",
129 	vm_daemon,
130 	&vmthread
131 };
132 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
133 #endif
134 
135 
136 int vm_pages_needed=0;		/* Event on which pageout daemon sleeps */
137 int vm_pageout_deficit=0;	/* Estimated number of pages deficit */
138 int vm_pageout_pages_needed=0;	/* flag saying that the pageout daemon needs pages */
139 
140 #if !defined(NO_SWAPPING)
141 static int vm_pageout_req_swapout;	/* XXX */
142 static int vm_daemon_needed;
143 #endif
144 static int vm_max_launder = 32;
145 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
146 static int vm_pageout_full_stats_interval = 0;
147 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
148 static int defer_swap_pageouts=0;
149 static int disable_swap_pageouts=0;
150 
151 #if defined(NO_SWAPPING)
152 static int vm_swap_enabled=0;
153 static int vm_swap_idle_enabled=0;
154 #else
155 static int vm_swap_enabled=1;
156 static int vm_swap_idle_enabled=0;
157 #endif
158 
159 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
160 	CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
161 
162 SYSCTL_INT(_vm, OID_AUTO, max_launder,
163 	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
164 
165 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
166 	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
167 
168 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
169 	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
170 
171 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
172 	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
173 
174 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
175 	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
176 
177 #if defined(NO_SWAPPING)
178 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
179 	CTLFLAG_RD, &vm_swap_enabled, 0, "");
180 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
181 	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
182 #else
183 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
184 	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
185 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
186 	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
187 #endif
188 
189 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
190 	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
191 
192 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
193 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
194 
195 static int pageout_lock_miss;
196 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
197 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
198 
199 int vm_load;
200 SYSCTL_INT(_vm, OID_AUTO, vm_load,
201 	CTLFLAG_RD, &vm_load, 0, "load on the VM system");
202 int vm_load_enable = 1;
203 SYSCTL_INT(_vm, OID_AUTO, vm_load_enable,
204 	CTLFLAG_RW, &vm_load_enable, 0, "enable vm_load rate limiting");
205 #ifdef INVARIANTS
206 int vm_load_debug;
207 SYSCTL_INT(_vm, OID_AUTO, vm_load_debug,
208 	CTLFLAG_RW, &vm_load_debug, 0, "debug vm_load");
209 #endif
210 
211 #define VM_PAGEOUT_PAGE_COUNT 16
212 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
213 
214 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
215 
216 #if !defined(NO_SWAPPING)
217 typedef void freeer_fcn_t (vm_map_t, vm_object_t, vm_pindex_t, int);
218 static void vm_pageout_map_deactivate_pages (vm_map_t, vm_pindex_t);
219 static freeer_fcn_t vm_pageout_object_deactivate_pages;
220 static void vm_req_vmdaemon (void);
221 #endif
222 static void vm_pageout_page_stats(void);
223 
224 /*
225  * Update vm_load to slow down faulting processes.
226  *
227  * SMP races ok.
228  * No requirements.
229  */
230 void
231 vm_fault_ratecheck(void)
232 {
233 	if (vm_pages_needed) {
234 		if (vm_load < 1000)
235 			++vm_load;
236 	} else {
237 		if (vm_load > 0)
238 			--vm_load;
239 	}
240 }
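
/*
 * Editorial note (not part of the original source): vm_load ramps toward
 * 1000 while the pageout daemon has outstanding work and decays back
 * toward 0 otherwise.  Per the sysctl descriptions above, the fault path
 * may consult this value (when vm.vm_load_enable is set) to rate-limit
 * faulting processes; the exact throttling policy lives in the fault
 * code, not here.
 */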
241 
242 /*
243  * vm_pageout_clean:
244  *
245  * Clean the page and remove it from the laundry.  The page must not be
246  * busy when this is called.
247  *
248  * We set the busy bit to cause potential page faults on this page to
249  * block.  Note the careful timing, however: the busy bit isn't set until
250  * late, and until then we cannot do anything that will mess with the page.
251  *
252  * The caller must hold vm_token.
253  */
254 static int
255 vm_pageout_clean(vm_page_t m)
256 {
257 	vm_object_t object;
258 	vm_page_t mc[2*vm_pageout_page_count];
259 	int pageout_count;
260 	int ib, is, page_base;
261 	vm_pindex_t pindex = m->pindex;
262 
263 	object = m->object;
264 
265 	/*
266 	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
267 	 * with the new swapper, but we could have serious problems paging
268 	 * out other object types if there is insufficient memory.
269 	 *
270 	 * Unfortunately, checking free memory here is far too late, so the
271 	 * check has been moved up a procedural level.
272 	 */
273 
274 	/*
275 	 * Don't mess with the page if it's busy, held, or special
276 	 */
277 	if ((m->hold_count != 0) ||
278 	    ((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) {
279 		return 0;
280 	}
281 
282 	mc[vm_pageout_page_count] = m;
283 	pageout_count = 1;
284 	page_base = vm_pageout_page_count;
285 	ib = 1;
286 	is = 1;
287 
288 	/*
289 	 * Scan object for clusterable pages.
290 	 *
291 	 * We can cluster ONLY if the page is NOT clean, wired,
292 	 * busy, held, or mapped into a buffer, and one of the
293 	 * following holds:
294 	 * 1) The page is inactive, or a seldom-used
295 	 *    active page.
296 	 * -or-
297 	 * 2) We force the issue.
298 	 *
299 	 * During heavy mmap/modification loads the pageout
300 	 * daemon can really fragment the underlying file
301 	 * due to flushing pages out of order and not trying to
302 	 * align the clusters (which leaves sporadic out-of-order
303 	 * holes).  To solve this problem we do the reverse scan
304 	 * first and attempt to align our cluster, then do a
305 	 * forward scan if room remains.
306 	 */
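	/*
	 * Illustrative note (editorial, derived from the code below): mc[]
	 * has 2*vm_pageout_page_count slots and the starting page is placed
	 * at the middle slot (page_base).  The reverse scan fills
	 * mc[page_base-1], mc[page_base-2], ... (decrementing page_base),
	 * while the forward scan appends at mc[page_base+pageout_count].
	 * The flush at the end therefore always hands the pager a
	 * contiguous, ascending-pindex window starting at &mc[page_base].
	 */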
307 
308 more:
309 	while (ib && pageout_count < vm_pageout_page_count) {
310 		vm_page_t p;
311 
312 		if (ib > pindex) {
313 			ib = 0;
314 			break;
315 		}
316 
317 		if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
318 			ib = 0;
319 			break;
320 		}
321 		if (((p->queue - p->pc) == PQ_CACHE) ||
322 		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
323 			ib = 0;
324 			break;
325 		}
326 		vm_page_test_dirty(p);
327 		if ((p->dirty & p->valid) == 0 ||
328 		    p->queue != PQ_INACTIVE ||
329 		    p->wire_count != 0 ||	/* may be held by buf cache */
330 		    p->hold_count != 0) {	/* may be undergoing I/O */
331 			ib = 0;
332 			break;
333 		}
334 		mc[--page_base] = p;
335 		++pageout_count;
336 		++ib;
337 		/*
338 		 * alignment boundary, stop here and switch directions.  Do
339 		 * not clear ib.
340 		 */
341 		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
342 			break;
343 	}
344 
345 	while (pageout_count < vm_pageout_page_count &&
346 	    pindex + is < object->size) {
347 		vm_page_t p;
348 
349 		if ((p = vm_page_lookup(object, pindex + is)) == NULL)
350 			break;
351 		if (((p->queue - p->pc) == PQ_CACHE) ||
352 		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
353 			break;
354 		}
355 		vm_page_test_dirty(p);
356 		if ((p->dirty & p->valid) == 0 ||
357 		    p->queue != PQ_INACTIVE ||
358 		    p->wire_count != 0 ||	/* may be held by buf cache */
359 		    p->hold_count != 0) {	/* may be undergoing I/O */
360 			break;
361 		}
362 		mc[page_base + pageout_count] = p;
363 		++pageout_count;
364 		++is;
365 	}
366 
367 	/*
368 	 * If we exhausted our forward scan, continue with the reverse scan
369 	 * when possible, even past a page boundary.  This catches boundary
370 	 * conditions.
371 	 */
372 	if (ib && pageout_count < vm_pageout_page_count)
373 		goto more;
374 
375 	/*
376 	 * we allow reads during pageouts...
377 	 */
378 	return vm_pageout_flush(&mc[page_base], pageout_count, 0);
379 }
380 
381 /*
382  * vm_pageout_flush() - launder the given pages
383  *
384  *	The given pages are laundered.  Note that we set up for the start of
385  *	I/O (i.e. busy the page), mark it read-only, and bump the object
386  *	reference count all in here rather than in the parent.  If we want
387  *	the parent to do more sophisticated things we may have to change
388  *	the ordering.
389  *
390  * The caller must hold vm_token.
391  */
392 int
393 vm_pageout_flush(vm_page_t *mc, int count, int flags)
394 {
395 	vm_object_t object;
396 	int pageout_status[count];
397 	int numpagedout = 0;
398 	int i;
399 
400 	ASSERT_LWKT_TOKEN_HELD(&vm_token);
401 
402 	/*
403 	 * Initiate I/O.  Bump the vm_page_t->busy counter.
404 	 */
405 	for (i = 0; i < count; i++) {
406 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially invalid page", mc[i], i, count));
407 		vm_page_io_start(mc[i]);
408 	}
409 
410 	/*
411 	 * We must make the pages read-only.  This will also force the
412 	 * modified bit in the related pmaps to be cleared.  The pager
413 	 * cannot clear the bit for us since the I/O completion code
414 	 * typically runs from an interrupt.  The act of making the page
415 	 * read-only handles the case for us.
416 	 */
417 	for (i = 0; i < count; i++) {
418 		vm_page_protect(mc[i], VM_PROT_READ);
419 	}
420 
421 	object = mc[0]->object;
422 	vm_object_pip_add(object, count);
423 
424 	vm_pager_put_pages(object, mc, count,
425 	    (flags | ((object == &kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
426 	    pageout_status);
427 
428 	for (i = 0; i < count; i++) {
429 		vm_page_t mt = mc[i];
430 
431 		switch (pageout_status[i]) {
432 		case VM_PAGER_OK:
433 			numpagedout++;
434 			break;
435 		case VM_PAGER_PEND:
436 			numpagedout++;
437 			break;
438 		case VM_PAGER_BAD:
439 			/*
440 			 * Page outside of range of object. Right now we
441 			 * essentially lose the changes by pretending it
442 			 * worked.
443 			 */
444 			pmap_clear_modify(mt);
445 			vm_page_undirty(mt);
446 			break;
447 		case VM_PAGER_ERROR:
448 		case VM_PAGER_FAIL:
449 			/*
450 			 * A page typically cannot be paged out when we
451 			 * have run out of swap.  We leave the page
452 			 * marked inactive and will try to page it out
453 			 * again later.
454 			 *
455 			 * Starvation of the active page list is used to
456 			 * determine when the system is massively memory
457 			 * starved.
458 			 */
459 			break;
460 		case VM_PAGER_AGAIN:
461 			break;
462 		}
463 
464 		/*
465 		 * If the operation is still going, leave the page busy to
466 		 * block all other accesses. Also, leave the paging in
467 		 * progress indicator set so that we don't attempt an object
468 		 * collapse.
469 		 *
470 		 * For any pages which have completed synchronously,
471 		 * deactivate the page if we are under a severe deficit.
472 		 * Do not try to enter them into the cache, though, they
473 		 * might still be read-heavy.
474 		 */
475 		if (pageout_status[i] != VM_PAGER_PEND) {
476 			vm_object_pip_wakeup(object);
477 			vm_page_io_finish(mt);
478 			if (vm_page_count_severe())
479 				vm_page_deactivate(mt);
480 #if 0
481 			if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
482 				vm_page_protect(mt, VM_PROT_READ);
483 #endif
484 		}
485 	}
486 	return numpagedout;
487 }
488 
489 #if !defined(NO_SWAPPING)
490 /*
491  *	vm_pageout_object_deactivate_pages
492  *
493  *	Deactivate enough pages to satisfy the inactive target
494  *	requirements or, if vm_page_proc_limit is set, then
495  *	deactivate all of the pages in the object and its
496  *	backing_objects.
497  *
498  * The map must be locked.
499  * The caller must hold vm_token.
500  */
501 static int vm_pageout_object_deactivate_pages_callback(vm_page_t, void *);
502 
503 static void
504 vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object,
505 				   vm_pindex_t desired, int map_remove_only)
506 {
507 	struct rb_vm_page_scan_info info;
508 	int remove_mode;
509 
510 	if (object->type == OBJT_DEVICE || object->type == OBJT_PHYS)
511 		return;
512 
513 	while (object) {
514 		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
515 			return;
516 		if (object->paging_in_progress)
517 			return;
518 
519 		remove_mode = map_remove_only;
520 		if (object->shadow_count > 1)
521 			remove_mode = 1;
522 
523 		/*
524 		 * Scan the object's entire memory queue.  spl protection is
525 		 * required to avoid an interrupt unbusy/free race against
526 		 * our busy check.
527 		 */
528 		crit_enter();
529 		info.limit = remove_mode;
530 		info.map = map;
531 		info.desired = desired;
532 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
533 				vm_pageout_object_deactivate_pages_callback,
534 				&info
535 		);
536 		crit_exit();
537 		object = object->backing_object;
538 	}
539 }
540 
541 /*
542  * The caller must hold vm_token.
543  */
544 static int
545 vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data)
546 {
547 	struct rb_vm_page_scan_info *info = data;
548 	int actcount;
549 
550 	if (pmap_resident_count(vm_map_pmap(info->map)) <= info->desired) {
551 		return(-1);
552 	}
553 	mycpu->gd_cnt.v_pdpages++;
554 	if (p->wire_count != 0 || p->hold_count != 0 || p->busy != 0 ||
555 	    (p->flags & (PG_BUSY|PG_UNMANAGED)) ||
556 	    !pmap_page_exists_quick(vm_map_pmap(info->map), p)) {
557 		return(0);
558 	}
559 
560 	actcount = pmap_ts_referenced(p);
561 	if (actcount) {
562 		vm_page_flag_set(p, PG_REFERENCED);
563 	} else if (p->flags & PG_REFERENCED) {
564 		actcount = 1;
565 	}
566 
567 	if ((p->queue != PQ_ACTIVE) &&
568 		(p->flags & PG_REFERENCED)) {
569 		vm_page_activate(p);
570 		p->act_count += actcount;
571 		vm_page_flag_clear(p, PG_REFERENCED);
572 	} else if (p->queue == PQ_ACTIVE) {
573 		if ((p->flags & PG_REFERENCED) == 0) {
574 			p->act_count -= min(p->act_count, ACT_DECLINE);
575 			if (!info->limit && (vm_pageout_algorithm || (p->act_count == 0))) {
576 				vm_page_busy(p);
577 				vm_page_protect(p, VM_PROT_NONE);
578 				vm_page_wakeup(p);
579 				vm_page_deactivate(p);
580 			} else {
581 				TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
582 				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
583 			}
584 		} else {
585 			vm_page_activate(p);
586 			vm_page_flag_clear(p, PG_REFERENCED);
587 			if (p->act_count < (ACT_MAX - ACT_ADVANCE))
588 				p->act_count += ACT_ADVANCE;
589 			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
590 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
591 		}
592 	} else if (p->queue == PQ_INACTIVE) {
593 		vm_page_busy(p);
594 		vm_page_protect(p, VM_PROT_NONE);
595 		vm_page_wakeup(p);
596 	}
597 	return(0);
598 }
599 
600 /*
601  * Deactivate some number of pages in a map; try to do it fairly, but
602  * that is really hard to do.
603  *
604  * The caller must hold vm_token.
605  */
606 static void
607 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t desired)
608 {
609 	vm_map_entry_t tmpe;
610 	vm_object_t obj, bigobj;
611 	int nothingwired;
612 
613 	if (lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT)) {
614 		return;
615 	}
616 
617 	bigobj = NULL;
618 	nothingwired = TRUE;
619 
620 	/*
621 	 * first, search out the biggest object, and try to free pages from
622 	 * that.
623 	 */
624 	tmpe = map->header.next;
625 	while (tmpe != &map->header) {
626 		switch(tmpe->maptype) {
627 		case VM_MAPTYPE_NORMAL:
628 		case VM_MAPTYPE_VPAGETABLE:
629 			obj = tmpe->object.vm_object;
630 			if ((obj != NULL) && (obj->shadow_count <= 1) &&
631 				((bigobj == NULL) ||
632 				 (bigobj->resident_page_count < obj->resident_page_count))) {
633 				bigobj = obj;
634 			}
635 			break;
636 		default:
637 			break;
638 		}
639 		if (tmpe->wired_count > 0)
640 			nothingwired = FALSE;
641 		tmpe = tmpe->next;
642 	}
643 
644 	if (bigobj)
645 		vm_pageout_object_deactivate_pages(map, bigobj, desired, 0);
646 
647 	/*
648 	 * Next, hunt around for other pages to deactivate.  We actually
649 	 * do this search sort of wrong -- .text first is not the best idea.
650 	 */
651 	tmpe = map->header.next;
652 	while (tmpe != &map->header) {
653 		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
654 			break;
655 		switch(tmpe->maptype) {
656 		case VM_MAPTYPE_NORMAL:
657 		case VM_MAPTYPE_VPAGETABLE:
658 			obj = tmpe->object.vm_object;
659 			if (obj)
660 				vm_pageout_object_deactivate_pages(map, obj, desired, 0);
661 			break;
662 		default:
663 			break;
664 		}
665 		tmpe = tmpe->next;
666 	};
667 
668 	/*
669 	 * Remove all mappings if a process is swapped out; this will free page
670 	 * table pages.
671 	 */
672 	if (desired == 0 && nothingwired)
673 		pmap_remove(vm_map_pmap(map),
674 			    VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
675 	vm_map_unlock(map);
676 }
677 #endif
678 
679 /*
680  * Don't try to be fancy - being fancy can lead to vnode deadlocks.   We
681  * only do it for OBJT_DEFAULT and OBJT_SWAP objects which we know can
682  * be trivially freed.
683  *
684  * The caller must hold vm_token.
685  */
686 static void
687 vm_pageout_page_free(vm_page_t m)
688 {
689 	vm_object_t object = m->object;
690 	int type = object->type;
691 
692 	if (type == OBJT_SWAP || type == OBJT_DEFAULT)
693 		vm_object_reference(object);
694 	vm_page_busy(m);
695 	vm_page_protect(m, VM_PROT_NONE);
696 	vm_page_free(m);
697 	if (type == OBJT_SWAP || type == OBJT_DEFAULT)
698 		vm_object_deallocate(object);
699 }
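
/*
 * Editorial note: the temporary vm_object_reference() /
 * vm_object_deallocate() bracket above appears intended to keep an
 * OBJT_DEFAULT or OBJT_SWAP object alive while its (possibly last)
 * resident page is freed; other object types are deliberately left
 * alone, per the comment about avoiding vnode deadlocks.
 */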
700 
701 /*
702  * vm_pageout_scan does the dirty work for the pageout daemon.
703  */
704 struct vm_pageout_scan_info {
705 	struct proc *bigproc;
706 	vm_offset_t bigsize;
707 };
708 
709 static int vm_pageout_scan_callback(struct proc *p, void *data);
710 
711 /*
712  * The caller must hold vm_token.
713  */
714 static int
715 vm_pageout_scan(int pass)
716 {
717 	struct vm_pageout_scan_info info;
718 	vm_page_t m, next;
719 	struct vm_page marker;
720 	struct vnode *vpfailed;		/* warning, allowed to be stale */
721 	int maxscan, pcount;
722 	int recycle_count;
723 	int inactive_shortage, active_shortage;
724 	int inactive_original_shortage;
725 	vm_object_t object;
726 	int actcount;
727 	int vnodes_skipped = 0;
728 	int maxlaunder;
729 
730 	/*
731 	 * Do whatever cleanup that the pmap code can.
732 	 */
733 	pmap_collect();
734 
735 	/*
736 	 * Calculate our target for the number of free+cache pages we
737 	 * want to get to.  This is higher than the number that causes
738 	 * allocations to stall (severe) in order to provide hysteresis,
739 	 * and if we don't make it all the way but get to the minimum
740 	 * we're happy.
741 	 */
742 	inactive_shortage = vm_paging_target() + vm_pageout_deficit;
743 	inactive_original_shortage = inactive_shortage;
744 	vm_pageout_deficit = 0;
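
	/*
	 * Editorial note: vm_paging_target() (see vm_page2.h) is roughly
	 * (v_free_target + v_cache_min) - (v_free_count + v_cache_count),
	 * i.e. how many pages must be freed or cached to reach the
	 * hysteresis high-water mark, and vm_pageout_deficit adds pages
	 * that allocators recently failed to obtain.  Consult vm_page2.h
	 * for the authoritative definition.
	 */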
745 
746 	/*
747 	 * Initialize our marker
748 	 */
749 	bzero(&marker, sizeof(marker));
750 	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
751 	marker.queue = PQ_INACTIVE;
752 	marker.wire_count = 1;
753 
754 	/*
755 	 * Start scanning the inactive queue for pages we can move to the
756 	 * cache or free.  The scan will stop when the target is reached or
757 	 * we have scanned the entire inactive queue.  Note that m->act_count
758 	 * is not used to form decisions for the inactive queue, only for the
759 	 * active queue.
760 	 *
761 	 * maxlaunder limits the number of dirty pages we flush per scan.
762 	 * For most systems a smaller value (16 or 32) is more robust under
763 	 * extreme memory and disk pressure because any unnecessary writes
764 	 * to disk can result in extreme performance degradation.  However,
765 	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
766 	 * used) will die horribly with limited laundering.  If the pageout
767 	 * daemon cannot clean enough pages in the first pass, we let it go
768 	 * all out in succeeding passes.
769 	 */
770 	if ((maxlaunder = vm_max_launder) <= 1)
771 		maxlaunder = 1;
772 	if (pass)
773 		maxlaunder = 10000;
774 
775 	/*
776 	 * We will generally be in a critical section throughout the
777 	 * scan, but we can release it temporarily when we are sitting on a
778 	 * non-busy page without fear.  This is required to prevent an
779 	 * interrupt from unbusying or freeing a page prior to our busy
780 	 * check, leaving us on the wrong queue or checking the wrong
781 	 * page.
782 	 */
783 	crit_enter();
784 rescan0:
785 	vpfailed = NULL;
786 	maxscan = vmstats.v_inactive_count;
787 	for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
788 	     m != NULL && maxscan-- > 0 && inactive_shortage > 0;
789 	     m = next
790 	 ) {
791 		mycpu->gd_cnt.v_pdpages++;
792 
793 		/*
794 		 * Give interrupts a chance
795 		 */
796 		crit_exit();
797 		crit_enter();
798 
799 		/*
800 		 * It's easier for some of the conditions below to just loop
801 		 * and catch queue changes here rather than check everywhere
802 		 * else.
803 		 */
804 		if (m->queue != PQ_INACTIVE)
805 			goto rescan0;
806 		next = TAILQ_NEXT(m, pageq);
807 
808 		/*
809 		 * skip marker pages
810 		 */
811 		if (m->flags & PG_MARKER)
812 			continue;
813 
814 		/*
815 		 * A held page may be undergoing I/O, so skip it.
816 		 */
817 		if (m->hold_count) {
818 			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
819 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
820 			++vm_swapcache_inactive_heuristic;
821 			continue;
822 		}
823 
824 		/*
825 		 * Don't mess with busy pages; keep them at the front of the
826 		 * queue, as they are most likely being paged out.
827 		 */
828 		if (m->busy || (m->flags & PG_BUSY)) {
829 			continue;
830 		}
831 
832 		if (m->object->ref_count == 0) {
833 			/*
834 			 * If the object is not being used, we ignore previous
835 			 * references.
836 			 */
837 			vm_page_flag_clear(m, PG_REFERENCED);
838 			pmap_clear_reference(m);
839 
840 		} else if (((m->flags & PG_REFERENCED) == 0) &&
841 			    (actcount = pmap_ts_referenced(m))) {
842 			/*
843 			 * Otherwise, if the page has been referenced while
844 			 * in the inactive queue, we bump the "activation
845 			 * count" upwards, making it less likely that the
846 			 * page will be added back to the inactive queue
847 			 * prematurely again.  Here we check the page tables
848 			 * (or emulated bits, if any), given the upper level
849 			 * VM system not knowing anything about existing
850 			 * references.
851 			 */
852 			vm_page_activate(m);
853 			m->act_count += (actcount + ACT_ADVANCE);
854 			continue;
855 		}
856 
857 		/*
858 		 * If the upper level VM system knows about any page
859 		 * references, we activate the page.  We also set the
860 		 * "activation count" higher than normal so that we will be
861 		 * less likely to place pages back onto the inactive queue again.
862 		 */
863 		if ((m->flags & PG_REFERENCED) != 0) {
864 			vm_page_flag_clear(m, PG_REFERENCED);
865 			actcount = pmap_ts_referenced(m);
866 			vm_page_activate(m);
867 			m->act_count += (actcount + ACT_ADVANCE + 1);
868 			continue;
869 		}
870 
871 		/*
872 		 * If the upper level VM system doesn't know anything about
873 		 * the page being dirty, we have to check for it again.  As
874 		 * far as the VM code knows, any partially dirty pages are
875 		 * fully dirty.
876 		 *
877 		 * Pages marked PG_WRITEABLE may be mapped into the user
878 		 * address space of a process running on another cpu.  A
879 		 * user process (without holding the MP lock) running on
880 		 * another cpu may be able to touch the page while we are
881 		 * trying to remove it.  vm_page_cache() will handle this
882 		 * case for us.
883 		 */
884 		if (m->dirty == 0) {
885 			vm_page_test_dirty(m);
886 		} else {
887 			vm_page_dirty(m);
888 		}
889 
890 		if (m->valid == 0) {
891 			/*
892 			 * Invalid pages can be easily freed
893 			 */
894 			vm_pageout_page_free(m);
895 			mycpu->gd_cnt.v_dfree++;
896 			--inactive_shortage;
897 		} else if (m->dirty == 0) {
898 			/*
899 			 * Clean pages can be placed onto the cache queue.
900 			 * This effectively frees them.
901 			 */
902 			vm_page_cache(m);
903 			--inactive_shortage;
904 		} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
905 			/*
906 			 * Dirty pages need to be paged out, but flushing
907 			 * a page is extremely expensive versus freeing
908 			 * a clean page.  Rather than artificially limiting
909 			 * the number of pages we can flush, we instead give
910 			 * dirty pages extra priority on the inactive queue
911 			 * by forcing them to be cycled through the queue
912 			 * twice before being flushed, after which the
913 			 * (now clean) page will cycle through once more
914 			 * before being freed.  This significantly extends
915 			 * the thrash point for a heavily loaded machine.
916 			 */
917 			vm_page_flag_set(m, PG_WINATCFLS);
918 			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
919 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
920 			++vm_swapcache_inactive_heuristic;
921 		} else if (maxlaunder > 0) {
922 			/*
923 			 * We always want to try to flush some dirty pages if
924 			 * we encounter them, to keep the system stable.
925 			 * Normally this number is small, but under extreme
926 			 * pressure where there are insufficient clean pages
927 			 * on the inactive queue, we may have to go all out.
928 			 */
929 			int swap_pageouts_ok;
930 			struct vnode *vp = NULL;
931 
932 			object = m->object;
933 
934 			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
935 				swap_pageouts_ok = 1;
936 			} else {
937 				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
938 				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
939 				vm_page_count_min(0));
940 
941 			}
942 
943 			/*
944 			 * We don't bother paging objects that are "dead".
945 			 * Those objects are in a "rundown" state.
946 			 */
947 			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
948 				TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
949 				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
950 				++vm_swapcache_inactive_heuristic;
951 				continue;
952 			}
953 
954 			/*
955 			 * The object is already known NOT to be dead.   It
956 			 * is possible for the vget() to block the whole
957 			 * pageout daemon, but the new low-memory handling
958 			 * code should prevent it.
959 			 *
960 			 * The previous code skipped locked vnodes and, worse,
961 			 * reordered pages in the queue.  This results in
962 			 * completely non-deterministic operation because,
963 			 * quite often, a vm_fault has initiated an I/O and
964 			 * is holding a locked vnode at just the point where
965 			 * the pageout daemon is woken up.
966 			 *
967 			 * We can't wait forever for the vnode lock, we might
968 			 * deadlock due to a vn_read() getting stuck in
969 			 * vm_wait while holding this vnode.  We skip the
970 			 * vnode if we can't get it in a reasonable amount
971 			 * of time.
972 			 *
973 			 * vpfailed is used to (try to) avoid the case where
974 			 * a large number of pages are associated with a
975 			 * locked vnode, which could cause the pageout daemon
976 			 * to stall for an excessive amount of time.
977 			 */
978 			if (object->type == OBJT_VNODE) {
979 				int flags;
980 
981 				vp = object->handle;
982 				flags = LK_EXCLUSIVE | LK_NOOBJ;
983 				if (vp == vpfailed)
984 					flags |= LK_NOWAIT;
985 				else
986 					flags |= LK_TIMELOCK;
987 				if (vget(vp, flags) != 0) {
988 					vpfailed = vp;
989 					++pageout_lock_miss;
990 					if (object->flags & OBJ_MIGHTBEDIRTY)
991 						    vnodes_skipped++;
992 					continue;
993 				}
994 
995 				/*
996 				 * The page might have been moved to another
997 				 * queue during potential blocking in vget()
998 				 * above.  The page might have been freed and
999 				 * reused for another vnode.  The object might
1000 				 * have been reused for another vnode.
1001 				 */
1002 				if (m->queue != PQ_INACTIVE ||
1003 				    m->object != object ||
1004 				    object->handle != vp) {
1005 					if (object->flags & OBJ_MIGHTBEDIRTY)
1006 						vnodes_skipped++;
1007 					vput(vp);
1008 					continue;
1009 				}
1010 
1011 				/*
1012 				 * The page may have been busied during the
1013 				 * blocking in vget() above.  We don't move the
1014 				 * page back onto the end of the queue; the
1015 				 * statistics are more correct if we don't.
1016 				 */
1017 				if (m->busy || (m->flags & PG_BUSY)) {
1018 					vput(vp);
1019 					continue;
1020 				}
1021 
1022 				/*
1023 				 * If the page has become held it might
1024 				 * be undergoing I/O, so skip it
1025 				 */
1026 				if (m->hold_count) {
1027 					TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1028 					TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1029 					++vm_swapcache_inactive_heuristic;
1030 					if (object->flags & OBJ_MIGHTBEDIRTY)
1031 						vnodes_skipped++;
1032 					vput(vp);
1033 					continue;
1034 				}
1035 			}
1036 
1037 			/*
1038 			 * If a page is dirty, then it is either being washed
1039 			 * (but not yet cleaned) or it is still in the
1040 			 * laundry.  If it is still in the laundry, then we
1041 			 * start the cleaning operation.
1042 			 *
1043 			 * This operation may cluster, invalidating the 'next'
1044 			 * pointer.  To prevent an inordinate number of
1045 			 * restarts we use our marker to remember our place.
1046 			 *
1047 			 * decrement inactive_shortage on success to account
1048 			 * for the (future) cleaned page.  Otherwise we
1049 			 * could wind up laundering or cleaning too many
1050 			 * pages.
1051 			 */
1052 			TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
1053 			if (vm_pageout_clean(m) != 0) {
1054 				--inactive_shortage;
1055 				--maxlaunder;
1056 			}
1057 			next = TAILQ_NEXT(&marker, pageq);
1058 			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
1059 			if (vp != NULL)
1060 				vput(vp);
1061 		}
1062 	}
1063 
1064 	/*
1065 	 * We want to move pages from the active queue to the inactive
1066 	 * queue to get the inactive queue to the inactive target.  If
1067 	 * we still have a page shortage from above we try to directly free
1068 	 * clean pages instead of moving them.
1069 	 *
1070 	 * If we do still have a shortage we keep track of the number of
1071 	 * pages we free or cache (recycle_count) as a measure of thrashing
1072 	 * between the active and inactive queues.
1073 	 *
1074 	 * If we were able to completely satisfy the free+cache targets
1075 	 * from the inactive pool we limit the number of pages we move
1076 	 * from the active pool to the inactive pool to 2x the pages we
1077 	 * had removed from the inactive pool (with a minimum of 1/5 the
1078 	 * inactive target).  If we were not able to completely satisfy
1079 	 * the free+cache targets we go for the whole target aggressively.
1080 	 *
1081 	 * NOTE: Both variables can end up negative.
1082 	 * NOTE: We are still in a critical section.
1083 	 */
1084 	active_shortage = vmstats.v_inactive_target - vmstats.v_inactive_count;
1085 	if (inactive_original_shortage < vmstats.v_inactive_target / 10)
1086 		inactive_original_shortage = vmstats.v_inactive_target / 10;
1087 	if (inactive_shortage <= 0 &&
1088 	    active_shortage > inactive_original_shortage * 2) {
1089 		active_shortage = inactive_original_shortage * 2;
1090 	}
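
	/*
	 * Illustration (editorial, hypothetical numbers): with
	 * v_inactive_target at 10000 pages and an original inactive
	 * shortage of 300, the shortage is first raised to the 1000-page
	 * floor (target/10).  If the inactive scan fully met its goal,
	 * active_shortage is then clamped to 2000 (2x the floor, i.e.
	 * target/5), bounding how many active pages this pass deactivates.
	 */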
1091 
1092 	pcount = vmstats.v_active_count;
1093 	recycle_count = 0;
1094 	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
1095 
1096 	while ((m != NULL) && (pcount-- > 0) &&
1097 	       (inactive_shortage > 0 || active_shortage > 0)
1098 	) {
1099 		/*
1100 		 * Give interrupts a chance.
1101 		 */
1102 		crit_exit();
1103 		crit_enter();
1104 
1105 		/*
1106 		 * If the page was ripped out from under us, just stop.
1107 		 */
1108 		if (m->queue != PQ_ACTIVE)
1109 			break;
1110 		next = TAILQ_NEXT(m, pageq);
1111 
1112 		/*
1113 		 * Don't deactivate pages that are busy.
1114 		 */
1115 		if ((m->busy != 0) ||
1116 		    (m->flags & PG_BUSY) ||
1117 		    (m->hold_count != 0)) {
1118 			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1119 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1120 			m = next;
1121 			continue;
1122 		}
1123 
1124 		/*
1125 		 * The count for pagedaemon pages is done after checking the
1126 		 * page for eligibility...
1127 		 */
1128 		mycpu->gd_cnt.v_pdpages++;
1129 
1130 		/*
1131 		 * Check to see "how much" the page has been used and clear
1132 		 * the tracking access bits.  If the object has no references
1133 		 * don't bother paying the expense.
1134 		 */
1135 		actcount = 0;
1136 		if (m->object->ref_count != 0) {
1137 			if (m->flags & PG_REFERENCED)
1138 				++actcount;
1139 			actcount += pmap_ts_referenced(m);
1140 			if (actcount) {
1141 				m->act_count += ACT_ADVANCE + actcount;
1142 				if (m->act_count > ACT_MAX)
1143 					m->act_count = ACT_MAX;
1144 			}
1145 		}
1146 		vm_page_flag_clear(m, PG_REFERENCED);
1147 
1148 		/*
1149 		 * actcount is only valid if the object ref_count is non-zero.
1150 		 */
1151 		if (actcount && m->object->ref_count != 0) {
1152 			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1153 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1154 		} else {
1155 			m->act_count -= min(m->act_count, ACT_DECLINE);
1156 			if (vm_pageout_algorithm ||
1157 			    m->object->ref_count == 0 ||
1158 			    m->act_count < pass + 1
1159 			) {
1160 				/*
1161 				 * Deactivate the page.  If we had a
1162 				 * shortage from our inactive scan try to
1163 				 * free (cache) the page instead.
1164 				 *
1165 				 * Don't just blindly cache the page if
1166 				 * we do not have a shortage from the
1167 				 * inactive scan, that could lead to
1168 				 * gigabytes being moved.
1169 				 */
1170 				--active_shortage;
1171 				if (inactive_shortage > 0 ||
1172 				    m->object->ref_count == 0) {
1173 					if (inactive_shortage > 0)
1174 						++recycle_count;
1175 					vm_page_busy(m);
1176 					vm_page_protect(m, VM_PROT_NONE);
1177 					vm_page_wakeup(m);
1178 					if (m->dirty == 0 &&
1179 					    inactive_shortage > 0) {
1180 						--inactive_shortage;
1181 						vm_page_cache(m);
1182 					} else {
1183 						vm_page_deactivate(m);
1184 					}
1185 				} else {
1186 					vm_page_deactivate(m);
1187 				}
1188 			} else {
1189 				TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1190 				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1191 			}
1192 		}
1193 		m = next;
1194 	}
1195 
1196 	/*
1197 	 * We try to maintain some *really* free pages; this allows interrupt
1198 	 * code to be guaranteed space.  Since both cache and free queues
1199 	 * are considered basically 'free', moving pages from cache to free
1200 	 * does not affect other calculations.
1201 	 *
1202 	 * NOTE: we are still in a critical section.
1203 	 *
1204 	 * Pages moved from PQ_CACHE to totally free are not counted in the
1205 	 * pages_freed counter.
1206 	 */
1207 	while (vmstats.v_free_count < vmstats.v_free_reserved) {
1208 		static int cache_rover = 0;
1209 		m = vm_page_list_find(PQ_CACHE, cache_rover, FALSE);
1210 		if (m == NULL)
1211 			break;
1212 		if ((m->flags & (PG_BUSY|PG_UNMANAGED)) ||
1213 		    m->busy ||
1214 		    m->hold_count ||
1215 		    m->wire_count) {
1216 #ifdef INVARIANTS
1217 			kprintf("Warning: busy page %p found in cache\n", m);
1218 #endif
1219 			vm_page_deactivate(m);
1220 			continue;
1221 		}
1222 		KKASSERT((m->flags & PG_MAPPED) == 0);
1223 		KKASSERT(m->dirty == 0);
1224 		cache_rover = (cache_rover + PQ_PRIME2) & PQ_L2_MASK;
1225 		vm_pageout_page_free(m);
1226 		mycpu->gd_cnt.v_dfree++;
1227 	}
1228 
1229 	crit_exit();
1230 
1231 #if !defined(NO_SWAPPING)
1232 	/*
1233 	 * Idle process swapout -- run once per second.
1234 	 */
1235 	if (vm_swap_idle_enabled) {
1236 		static long lsec;
1237 		if (time_second != lsec) {
1238 			vm_pageout_req_swapout |= VM_SWAP_IDLE;
1239 			vm_req_vmdaemon();
1240 			lsec = time_second;
1241 		}
1242 	}
1243 #endif
1244 
1245 	/*
1246 	 * If we didn't get enough free pages, and we have skipped a vnode
1247 	 * in a writeable object, wakeup the sync daemon.  And kick swapout
1248 	 * if we did not get enough free pages.
1249 	 */
1250 	if (vm_paging_target() > 0) {
1251 		if (vnodes_skipped && vm_page_count_min(0))
1252 			speedup_syncer();
1253 #if !defined(NO_SWAPPING)
1254 		if (vm_swap_enabled && vm_page_count_target()) {
1255 			vm_req_vmdaemon();
1256 			vm_pageout_req_swapout |= VM_SWAP_NORMAL;
1257 		}
1258 #endif
1259 	}
1260 
1261 	/*
1262 	 * Handle catastrophic conditions.  Under good conditions we should
1263 	 * be at the target, well beyond our minimum.  If we could not even
1264 	 * reach our minimum the system is under heavy stress.
1265 	 *
1266 	 * Determine whether we have run out of memory.  This occurs when
1267 	 * swap_pager_full is TRUE and the only pages left in the page
1268 	 * queues are dirty.  We will still likely have page shortages.
1269 	 *
1270 	 * - swap_pager_full is set if insufficient swap was
1271 	 *   available to satisfy a requested pageout.
1272 	 *
1273 	 * - the inactive queue is bloated (4 x size of active queue),
1274 	 *   meaning it is unable to get rid of dirty pages.
1275 	 *
1276 	 * - vm_page_count_min() without counting pages recycled from the
1277 	 *   active queue (recycle_count) means we could not recover
1278 	 *   enough pages to meet bare minimum needs.  This test only
1279 	 *   works if the inactive queue is bloated.
1280 	 *
1281 	 * - due to a positive inactive_shortage we shifted the remaining
1282 	 *   dirty pages from the active queue to the inactive queue
1283 	 *   trying to find clean ones to free.
1284 	 */
1285 	if (swap_pager_full && vm_page_count_min(recycle_count))
1286 		kprintf("Warning: system low on memory+swap!\n");
1287 	if (swap_pager_full && vm_page_count_min(recycle_count) &&
1288 	    vmstats.v_inactive_count > vmstats.v_active_count * 4 &&
1289 	    inactive_shortage > 0) {
1290 		/*
1291 		 * Kill something.
1292 		 */
1293 		info.bigproc = NULL;
1294 		info.bigsize = 0;
1295 		allproc_scan(vm_pageout_scan_callback, &info);
1296 		if (info.bigproc != NULL) {
1297 			killproc(info.bigproc, "out of swap space");
1298 			info.bigproc->p_nice = PRIO_MIN;
1299 			info.bigproc->p_usched->resetpriority(
1300 				FIRST_LWP_IN_PROC(info.bigproc));
1301 			wakeup(&vmstats.v_free_count);
1302 			PRELE(info.bigproc);
1303 		}
1304 	}
1305 	return(inactive_shortage);
1306 }
1307 
1308 /*
1309  * The caller must hold vm_token and proc_token.
1310  */
1311 static int
1312 vm_pageout_scan_callback(struct proc *p, void *data)
1313 {
1314 	struct vm_pageout_scan_info *info = data;
1315 	vm_offset_t size;
1316 
1317 	/*
1318 	 * Never kill system processes or init.  If we have configured swap
1319 	 * then try to avoid killing low-numbered pids.
1320 	 */
1321 	if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
1322 	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
1323 		return (0);
1324 	}
1325 
1326 	/*
1327 	 * if the process is in a non-running type state,
1328 	 * don't touch it.
1329 	 */
1330 	if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
1331 		return (0);
1332 
1333 	/*
1334 	 * Get the approximate process size.  Note that anonymous pages
1335 	 * with backing swap will be counted twice, but there should not
1336 	 * be too many such pages due to the stress the VM system is
1337 	 * under at this point.
1338 	 */
1339 	size = vmspace_anonymous_count(p->p_vmspace) +
1340 		vmspace_swap_count(p->p_vmspace);
1341 
1342 	/*
1343 	 * If this process is bigger than the biggest one seen
1344 	 * so far, remember it.
1345 	 */
1346 	if (info->bigsize < size) {
1347 		if (info->bigproc)
1348 			PRELE(info->bigproc);
1349 		PHOLD(p);
1350 		info->bigproc = p;
1351 		info->bigsize = size;
1352 	}
1353 	return(0);
1354 }
1355 
1356 /*
1357  * This routine tries to maintain the pseudo-LRU active queue, so
1358  * that some statistic accumulation still occurs during long periods
1359  * of time in which there is no paging.  This code helps the
1360  * situation where paging just starts to occur.
1361  *
1362  * The caller must hold vm_token.
1363  */
1364 static void
1365 vm_pageout_page_stats(void)
1366 {
1367 	vm_page_t m,next;
1368 	int pcount,tpcount;		/* Number of pages to check */
1369 	static int fullintervalcount = 0;
1370 	int page_shortage;
1371 
1372 	page_shortage =
1373 	    (vmstats.v_inactive_target + vmstats.v_cache_max + vmstats.v_free_min) -
1374 	    (vmstats.v_free_count + vmstats.v_inactive_count + vmstats.v_cache_count);
1375 
1376 	if (page_shortage <= 0)
1377 		return;
1378 
1379 	crit_enter();
1380 
1381 	pcount = vmstats.v_active_count;
1382 	fullintervalcount += vm_pageout_stats_interval;
1383 	if (fullintervalcount < vm_pageout_full_stats_interval) {
1384 		tpcount = (vm_pageout_stats_max * vmstats.v_active_count) / vmstats.v_page_count;
1385 		if (pcount > tpcount)
1386 			pcount = tpcount;
1387 	} else {
1388 		fullintervalcount = 0;
1389 	}
1390 
1391 	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
1392 	while ((m != NULL) && (pcount-- > 0)) {
1393 		int actcount;
1394 
1395 		if (m->queue != PQ_ACTIVE) {
1396 			break;
1397 		}
1398 
1399 		next = TAILQ_NEXT(m, pageq);
1400 		/*
1401 		 * Don't deactivate pages that are busy.
1402 		 */
1403 		if ((m->busy != 0) ||
1404 		    (m->flags & PG_BUSY) ||
1405 		    (m->hold_count != 0)) {
1406 			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1407 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1408 			m = next;
1409 			continue;
1410 		}
1411 
1412 		actcount = 0;
1413 		if (m->flags & PG_REFERENCED) {
1414 			vm_page_flag_clear(m, PG_REFERENCED);
1415 			actcount += 1;
1416 		}
1417 
1418 		actcount += pmap_ts_referenced(m);
1419 		if (actcount) {
1420 			m->act_count += ACT_ADVANCE + actcount;
1421 			if (m->act_count > ACT_MAX)
1422 				m->act_count = ACT_MAX;
1423 			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1424 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1425 		} else {
1426 			if (m->act_count == 0) {
1427 				/*
1428 				 * We turn off page access, so that we have
1429 				 * more accurate RSS stats.  We don't do this
1430 				 * in the normal page deactivation when the
1431 				 * system is loaded VM wise, because the
1432 				 * cost of the large number of page protect
1433 				 * operations would be higher than the value
1434 				 * of doing the operation.
1435 				 */
1436 				vm_page_busy(m);
1437 				vm_page_protect(m, VM_PROT_NONE);
1438 				vm_page_wakeup(m);
1439 				vm_page_deactivate(m);
1440 			} else {
1441 				m->act_count -= min(m->act_count, ACT_DECLINE);
1442 				TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1443 				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1444 			}
1445 		}
1446 
1447 		m = next;
1448 	}
1449 	crit_exit();
1450 }
1451 
1452 /*
1453  * The caller must hold vm_token.
1454  */
1455 static int
1456 vm_pageout_free_page_calc(vm_size_t count)
1457 {
1458 	if (count < vmstats.v_page_count)
1459 		 return 0;
1460 	/*
1461 	 * free_reserved needs to include enough for the largest swap pager
1462 	 * structures plus enough for any pv_entry structs when paging.
1463 	 */
1464 	if (vmstats.v_page_count > 1024)
1465 		vmstats.v_free_min = 4 + (vmstats.v_page_count - 1024) / 200;
1466 	else
1467 		vmstats.v_free_min = 4;
1468 	vmstats.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
1469 		vmstats.v_interrupt_free_min;
1470 	vmstats.v_free_reserved = vm_pageout_page_count +
1471 		vmstats.v_pageout_free_min + (count / 768) + PQ_L2_SIZE;
1472 	vmstats.v_free_severe = vmstats.v_free_min / 2;
1473 	vmstats.v_free_min += vmstats.v_free_reserved;
1474 	vmstats.v_free_severe += vmstats.v_free_reserved;
1475 	return 1;
1476 }
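
/*
 * Worked example (editorial, hypothetical numbers): on a machine with
 * 262144 pages (1GB at 4KB pages) the base v_free_min computes to
 * 4 + (262144 - 1024) / 200, about 1309 pages.  v_free_reserved then adds
 * vm_pageout_page_count, v_pageout_free_min (two MAXBSIZE buffers worth of
 * pages plus v_interrupt_free_min), count/768 and PQ_L2_SIZE, and is folded
 * into v_free_min and v_free_severe.  The net effect is that on the order
 * of 1% of physical memory stays reserved for the pageout path and
 * interrupt-time allocations; exact figures depend on MAXBSIZE, PAGE_SIZE
 * and PQ_L2_SIZE for the platform.
 */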
1477 
1478 
1479 /*
1480  * vm_pageout is the high level pageout daemon.
1481  *
1482  * No requirements.
1483  */
1484 static void
1485 vm_pageout(void)
1486 {
1487 	int pass;
1488 	int inactive_shortage;
1489 
1490 	/*
1491 	 * Permanently hold vm_token.
1492 	 */
1493 	lwkt_gettoken(&vm_token);
1494 
1495 	/*
1496 	 * Initialize some paging parameters.
1497 	 */
1498 	curthread->td_flags |= TDF_SYSTHREAD;
1499 
1500 	vmstats.v_interrupt_free_min = 2;
1501 	if (vmstats.v_page_count < 2000)
1502 		vm_pageout_page_count = 8;
1503 
1504 	vm_pageout_free_page_calc(vmstats.v_page_count);
1505 
1506 	/*
1507 	 * v_free_target and v_cache_min control pageout hysteresis.  Note
1508 	 * that these are more a measure of the VM cache queue hysteresis
1509 	 * than the VM free queue.  Specifically, v_free_target is the
1510 	 * high water mark (free+cache pages).
1511 	 *
1512 	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
1513 	 * low water mark, while v_free_min is the stop.  v_cache_min must
1514 	 * be big enough to handle memory needs while the pageout daemon
1515 	 * is signalled and run to free more pages.
1516 	 */
1517 	if (vmstats.v_free_count > 6144)
1518 		vmstats.v_free_target = 4 * vmstats.v_free_min + vmstats.v_free_reserved;
1519 	else
1520 		vmstats.v_free_target = 2 * vmstats.v_free_min + vmstats.v_free_reserved;
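
	/*
	 * Editorial illustration, continuing the hypothetical 1GB example
	 * after vm_pageout_free_page_calc() (and using the same assumed
	 * platform constants): with v_free_min around 1950 pages and
	 * v_free_reserved around 650, v_free_target comes out to roughly
	 * 4 * 1950 + 650 ~= 8450 pages, i.e. about 33MB of free+cache
	 * headroom before the daemon considers its job done.
	 */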
1521 
1522 	/*
1523 	 * NOTE: With the new buffer cache b_act_count we want the default
1524 	 *	 inactive target to be a percentage of available memory.
1525 	 *
1526 	 *	 The inactive target essentially determines the minimum
1527 	 *	 number of 'temporary' pages capable of caching one-time-use
1528 	 *	 files when the VM system is otherwise full of pages
1529 	 *	 belonging to multi-time-use files or active program data.
1530 	 *
1531 	 * NOTE: The inactive target is aggressively pursued only if the
1532 	 *	 inactive queue becomes too small.  If the inactive queue
1533 	 *	 is large enough to satisfy page movement to free+cache
1534 	 *	 then it is repopulated more slowly from the active queue.
1535 	 *	 This allows a general inactive_target default to be set.
1536 	 *
1537 	 *	 There is an issue here for processes which sit mostly idle
1538 	 *	 'overnight', such as sshd, tcsh, and X.  Any movement from
1539 	 *	 the active queue will eventually cause such pages to be
1540 	 *	 recycled, causing a lot of paging in the morning.
1541 	 *	 To reduce the incidence of this, pages cycled out of the
1542 	 *	 buffer cache are moved directly to the inactive queue if
1543 	 *	 they were only used once or twice.
1544 	 *
1545 	 *	 The vfs.vm_cycle_point sysctl can be used to adjust this.
1546 	 *	 Increasing the value (up to 64) increases the number of
1547 	 *	 buffer recyclements which go directly to the inactive queue.
1548 	 */
1549 	if (vmstats.v_free_count > 2048) {
1550 		vmstats.v_cache_min = vmstats.v_free_target;
1551 		vmstats.v_cache_max = 2 * vmstats.v_cache_min;
1552 	} else {
1553 		vmstats.v_cache_min = 0;
1554 		vmstats.v_cache_max = 0;
1555 	}
1556 	vmstats.v_inactive_target = vmstats.v_free_count / 4;
1557 
1558 	/* XXX does not really belong here */
1559 	if (vm_page_max_wired == 0)
1560 		vm_page_max_wired = vmstats.v_free_count / 3;
1561 
1562 	if (vm_pageout_stats_max == 0)
1563 		vm_pageout_stats_max = vmstats.v_free_target;
1564 
1565 	/*
1566 	 * Set interval in seconds for stats scan.
1567 	 */
1568 	if (vm_pageout_stats_interval == 0)
1569 		vm_pageout_stats_interval = 5;
1570 	if (vm_pageout_full_stats_interval == 0)
1571 		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
1572 
1573 
1574 	/*
1575 	 * Set maximum free per pass
1576 	 */
1577 	if (vm_pageout_stats_free_max == 0)
1578 		vm_pageout_stats_free_max = 5;
1579 
1580 	swap_pager_swap_init();
1581 	pass = 0;
1582 
1583 	/*
1584 	 * The pageout daemon is never done, so loop forever.
1585 	 */
1586 	while (TRUE) {
1587 		int error;
1588 
1589 		/*
1590 		 * Wait for an action request
1591 		 */
1592 		crit_enter();
1593 		if (vm_pages_needed == 0) {
1594 			error = tsleep(&vm_pages_needed,
1595 				       0, "psleep",
1596 				       vm_pageout_stats_interval * hz);
1597 			if (error && vm_pages_needed == 0) {
1598 				vm_pageout_page_stats();
1599 				continue;
1600 			}
1601 			vm_pages_needed = 1;
1602 		}
1603 		crit_exit();
1604 
1605 		/*
1606 		 * If we have enough free memory, wakeup waiters.
1607 		 * (This is optional here)
1608 		 */
1609 		crit_enter();
1610 		if (!vm_page_count_min(0))
1611 			wakeup(&vmstats.v_free_count);
1612 		mycpu->gd_cnt.v_pdwakeups++;
1613 		crit_exit();
1614 
1615 		/*
1616 		 * Scan for pageout.  Try to avoid thrashing the system
1617 		 * with activity.
1618 		 */
1619 		inactive_shortage = vm_pageout_scan(pass);
1620 		if (inactive_shortage > 0) {
1621 			++pass;
1622 			if (swap_pager_full) {
1623 				/*
1624 				 * Running out of memory, catastrophic back-off
1625 				 * to one-second intervals.
1626 				 */
1627 				tsleep(&vm_pages_needed, 0, "pdelay", hz);
1628 			} else if (pass < 10 && vm_pages_needed > 1) {
1629 				/*
1630 				 * Normal operation, additional processes
1631 				 * have already kicked us.  Retry immediately.
1632 				 */
1633 			} else if (pass < 10) {
1634 				/*
1635 				 * Normal operation, fewer processes.  Delay
1636 				 * a bit but allow wakeups.
1637 				 */
1638 				vm_pages_needed = 0;
1639 				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
1640 				vm_pages_needed = 1;
1641 			} else {
1642 				/*
1643 				 * We've taken too many passes, forced delay.
1644 				 */
1645 				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
1646 			}
1647 		} else {
1648 			/*
1649 			 * Interlocked wakeup of waiters (non-optional)
1650 			 */
1651 			pass = 0;
1652 			if (vm_pages_needed && !vm_page_count_min(0)) {
1653 				wakeup(&vmstats.v_free_count);
1654 				vm_pages_needed = 0;
1655 			}
1656 		}
1657 	}
1658 }
1659 
1660 /*
1661  * Called after allocating a page out of the cache or free queue
1662  * to possibly wake the pagedaemon up to replenish our supply.
1663  *
1664  * We try to generate some hysteresis by waking the pagedaemon up
1665  * when our free+cache pages go below the severe level.  The pagedaemon
1666  * tries to get the count back up to at least the minimum, and through
1667  * to the target level if possible.
1668  *
1669  * If the pagedaemon is already active bump vm_pages_needed as a hint
1670  * that there are even more requests pending.
1671  *
1672  * SMP races ok?
1673  * No requirements.
1674  */
1675 void
1676 pagedaemon_wakeup(void)
1677 {
1678 	if (vm_page_count_severe() && curthread != pagethread) {
1679 		if (vm_pages_needed == 0) {
1680 			vm_pages_needed = 1;
1681 			wakeup(&vm_pages_needed);
1682 		} else if (vm_page_count_min(0)) {
1683 			++vm_pages_needed;
1684 		}
1685 	}
1686 }
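
/*
 * Illustrative (editorial) call pattern -- the real call sites live in the
 * page allocation path (vm_page.c) and are not reproduced verbatim here:
 *
 *	m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
 *	...
 *	pagedaemon_wakeup();	(hints the daemon when free+cache is low)
 */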
1687 
1688 #if !defined(NO_SWAPPING)
1689 
1690 /*
1691  * SMP races ok?
1692  * No requirements.
1693  */
1694 static void
1695 vm_req_vmdaemon(void)
1696 {
1697 	static int lastrun = 0;
1698 
1699 	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
1700 		wakeup(&vm_daemon_needed);
1701 		lastrun = ticks;
1702 	}
1703 }
1704 
1705 static int vm_daemon_callback(struct proc *p, void *data __unused);
1706 
1707 /*
1708  * No requirements.
1709  */
1710 static void
1711 vm_daemon(void)
1712 {
1713 	/*
1714 	 * Permanently hold vm_token.
1715 	 */
1716 	lwkt_gettoken(&vm_token);
1717 
1718 	while (TRUE) {
1719 		tsleep(&vm_daemon_needed, 0, "psleep", 0);
1720 		if (vm_pageout_req_swapout) {
1721 			swapout_procs(vm_pageout_req_swapout);
1722 			vm_pageout_req_swapout = 0;
1723 		}
1724 		/*
1725 		 * scan the processes for exceeding their rlimits or if
1726 		 * process is swapped out -- deactivate pages
1727 		 */
1728 		allproc_scan(vm_daemon_callback, NULL);
1729 	}
1730 }
1731 
1732 /*
1733  * Caller must hold vm_token and proc_token.
1734  */
1735 static int
1736 vm_daemon_callback(struct proc *p, void *data __unused)
1737 {
1738 	vm_pindex_t limit, size;
1739 
1740 	/*
1741 	 * if this is a system process or if we have already
1742 	 * looked at this process, skip it.
1743 	 */
1744 	if (p->p_flag & (P_SYSTEM | P_WEXIT))
1745 		return (0);
1746 
1747 	/*
1748 	 * if the process is in a non-running type state,
1749 	 * don't touch it.
1750 	 */
1751 	if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
1752 		return (0);
1753 
1754 	/*
1755 	 * get a limit
1756 	 */
1757 	limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
1758 			        p->p_rlimit[RLIMIT_RSS].rlim_max));
1759 
1760 	/*
1761 	 * let processes that are swapped out really be
1762 	 * swapped out.  Set the limit to nothing to get as
1763 	 * many pages out to swap as possible.
1764 	 */
1765 	if (p->p_flag & P_SWAPPEDOUT)
1766 		limit = 0;
1767 
1768 	size = vmspace_resident_count(p->p_vmspace);
1769 	if (limit >= 0 && size >= limit) {
1770 		vm_pageout_map_deactivate_pages(
1771 		    &p->p_vmspace->vm_map, limit);
1772 	}
1773 	return (0);
1774 }
1775 
1776 #endif
1777