xref: /dragonfly/sys/vm/vm_pageout.c (revision 984263bc)
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * The Mach Operating System project at Carnegie-Mellon University.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
41  *
42  *
43  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
44  * All rights reserved.
45  *
46  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
47  *
48  * Permission to use, copy, modify and distribute this software and
49  * its documentation is hereby granted, provided that both the copyright
50  * notice and this permission notice appear in all copies of the
51  * software, derivative works or modified versions, and any portions
52  * thereof, and that both notices appear in supporting documentation.
53  *
54  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
55  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
56  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
57  *
58  * Carnegie Mellon requests users of this software to return to
59  *
60  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
61  *  School of Computer Science
62  *  Carnegie Mellon University
63  *  Pittsburgh PA 15213-3890
64  *
65  * any improvements or extensions that they make and grant Carnegie the
66  * rights to redistribute these changes.
67  *
68  * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
69  */
70 
71 /*
72  *	The proverbial page-out daemon.
73  */
74 
75 #include "opt_vm.h"
76 #include <sys/param.h>
77 #include <sys/systm.h>
78 #include <sys/kernel.h>
79 #include <sys/proc.h>
80 #include <sys/kthread.h>
81 #include <sys/resourcevar.h>
82 #include <sys/signalvar.h>
83 #include <sys/vnode.h>
84 #include <sys/vmmeter.h>
85 #include <sys/sysctl.h>
86 
87 #include <vm/vm.h>
88 #include <vm/vm_param.h>
89 #include <sys/lock.h>
90 #include <vm/vm_object.h>
91 #include <vm/vm_page.h>
92 #include <vm/vm_map.h>
93 #include <vm/vm_pageout.h>
94 #include <vm/vm_pager.h>
95 #include <vm/swap_pager.h>
96 #include <vm/vm_extern.h>
97 
98 /*
99  * System initialization
100  */
101 
102 /* the kernel process "vm_pageout" */
103 static void vm_pageout __P((void));
104 static int vm_pageout_clean __P((vm_page_t));
105 static void vm_pageout_scan __P((int pass));
106 static int vm_pageout_free_page_calc __P((vm_size_t count));
107 struct proc *pageproc;
108 
109 static struct kproc_desc page_kp = {
110 	"pagedaemon",
111 	vm_pageout,
112 	&pageproc
113 };
114 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)
115 
116 #if !defined(NO_SWAPPING)
117 /* the kernel process "vm_daemon" */
118 static void vm_daemon __P((void));
119 static struct	proc *vmproc;
120 
121 static struct kproc_desc vm_kp = {
122 	"vmdaemon",
123 	vm_daemon,
124 	&vmproc
125 };
126 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
127 #endif
128 
129 
130 int vm_pages_needed=0;		/* Event on which pageout daemon sleeps */
131 int vm_pageout_deficit=0;	/* Estimated number of pages deficit */
132 int vm_pageout_pages_needed=0;	/* flag saying that the pageout daemon needs pages */
133 
134 #if !defined(NO_SWAPPING)
135 static int vm_pageout_req_swapout;	/* XXX */
136 static int vm_daemon_needed;
137 #endif
138 extern int vm_swap_size;
139 static int vm_max_launder = 32;
140 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
141 static int vm_pageout_full_stats_interval = 0;
142 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
143 static int defer_swap_pageouts=0;
144 static int disable_swap_pageouts=0;
145 
146 #if defined(NO_SWAPPING)
147 static int vm_swap_enabled=0;
148 static int vm_swap_idle_enabled=0;
149 #else
150 static int vm_swap_enabled=1;
151 static int vm_swap_idle_enabled=0;
152 #endif
153 
154 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
155 	CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
156 
157 SYSCTL_INT(_vm, OID_AUTO, max_launder,
158 	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
159 
160 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
161 	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
162 
163 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
164 	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
165 
166 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
167 	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
168 
169 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
170 	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
171 
172 #if defined(NO_SWAPPING)
173 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
174 	CTLFLAG_RD, &vm_swap_enabled, 0, "");
175 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
176 	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
177 #else
178 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
179 	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
180 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
181 	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
182 #endif
183 
184 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
185 	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
186 
187 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
188 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
189 
190 static int pageout_lock_miss;
191 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
192 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
193 
194 #define VM_PAGEOUT_PAGE_COUNT 16
195 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
196 
197 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
198 
199 #if !defined(NO_SWAPPING)
200 typedef void freeer_fcn_t __P((vm_map_t, vm_object_t, vm_pindex_t, int));
201 static void vm_pageout_map_deactivate_pages __P((vm_map_t, vm_pindex_t));
202 static freeer_fcn_t vm_pageout_object_deactivate_pages;
203 static void vm_req_vmdaemon __P((void));
204 #endif
205 static void vm_pageout_page_stats(void);
206 
207 /*
208  * vm_pageout_clean:
209  *
210  * Clean the page and remove it from the laundry.
211  *
212  * We set the busy bit to cause potential page faults on this page to
213  * block.  Note the careful timing, however: the busy bit isn't set until
214  * late, and we cannot do anything that will mess with the page.
215  */
216 
217 static int
218 vm_pageout_clean(m)
219 	vm_page_t m;
220 {
221 	register vm_object_t object;
222 	vm_page_t mc[2*vm_pageout_page_count];
223 	int pageout_count;
224 	int ib, is, page_base;
225 	vm_pindex_t pindex = m->pindex;
226 
227 	object = m->object;
228 
229 	/*
230 	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
231 	 * with the new swapper, but we could have serious problems paging
232 	 * out other object types if there is insufficient memory.
233 	 *
234 	 * Unfortunately, checking free memory here is far too late, so the
235 	 * check has been moved up a procedural level.
236 	 */
237 
238 	/*
239 	 * Don't mess with the page if it's busy, held, or special
240 	 */
241 	if ((m->hold_count != 0) ||
242 	    ((m->busy != 0) || (m->flags & (PG_BUSY|PG_UNMANAGED)))) {
243 		return 0;
244 	}
245 
246 	mc[vm_pageout_page_count] = m;
247 	pageout_count = 1;
248 	page_base = vm_pageout_page_count;
249 	ib = 1;
250 	is = 1;
251 
252 	/*
253 	 * Scan object for clusterable pages.
254 	 *
255 	 * We can cluster ONLY if: ->> the page is NOT
256 	 * clean, wired, busy, held, or mapped into a
257 	 * buffer, and one of the following:
258 	 * 1) The page is inactive, or a seldom used
259 	 *    active page.
260 	 * -or-
261 	 * 2) we force the issue.
262 	 *
263 	 * During heavy mmap/modification loads the pageout
264 	 * daemon can really fragment the underlying file
265 	 * due to flushing pages out of order and not trying to
266 	 * align the clusters (which leaves sporadic out-of-order
267 	 * holes).  To solve this problem we do the reverse scan
268 	 * first and attempt to align our cluster, then do a
269 	 * forward scan if room remains.
270 	 */
271 
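	/*
	 * Illustrative note (editorial sketch, not part of the original
	 * source, assuming every neighboring page qualifies): with
	 * vm_pageout_page_count == 16 and m->pindex == 37, the reverse scan
	 * below collects pages 36, 35, 34, 33 and 32, then stops because
	 * (pindex - (ib - 1)) == 32 is a multiple of 16.  The forward scan
	 * can then fill indices 38..47, yielding a flush cluster aligned to
	 * the 16-page boundary at index 32.
	 */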
272 more:
273 	while (ib && pageout_count < vm_pageout_page_count) {
274 		vm_page_t p;
275 
276 		if (ib > pindex) {
277 			ib = 0;
278 			break;
279 		}
280 
281 		if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
282 			ib = 0;
283 			break;
284 		}
285 		if (((p->queue - p->pc) == PQ_CACHE) ||
286 		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
287 			ib = 0;
288 			break;
289 		}
290 		vm_page_test_dirty(p);
291 		if ((p->dirty & p->valid) == 0 ||
292 		    p->queue != PQ_INACTIVE ||
293 		    p->wire_count != 0 ||	/* may be held by buf cache */
294 		    p->hold_count != 0) {	/* may be undergoing I/O */
295 			ib = 0;
296 			break;
297 		}
298 		mc[--page_base] = p;
299 		++pageout_count;
300 		++ib;
301 		/*
302 		 * alignment boundary, stop here and switch directions.  Do
303 		 * not clear ib.
304 		 */
305 		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
306 			break;
307 	}
308 
309 	while (pageout_count < vm_pageout_page_count &&
310 	    pindex + is < object->size) {
311 		vm_page_t p;
312 
313 		if ((p = vm_page_lookup(object, pindex + is)) == NULL)
314 			break;
315 		if (((p->queue - p->pc) == PQ_CACHE) ||
316 		    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
317 			break;
318 		}
319 		vm_page_test_dirty(p);
320 		if ((p->dirty & p->valid) == 0 ||
321 		    p->queue != PQ_INACTIVE ||
322 		    p->wire_count != 0 ||	/* may be held by buf cache */
323 		    p->hold_count != 0) {	/* may be undergoing I/O */
324 			break;
325 		}
326 		mc[page_base + pageout_count] = p;
327 		++pageout_count;
328 		++is;
329 	}
330 
331 	/*
332 	 * If we exhausted our forward scan, continue with the reverse scan
333 	 * when possible, even past a page boundary.  This catches boundary
334 	 * conditions.
335 	 */
336 	if (ib && pageout_count < vm_pageout_page_count)
337 		goto more;
338 
339 	/*
340 	 * we allow reads during pageouts...
341 	 */
342 	return vm_pageout_flush(&mc[page_base], pageout_count, 0);
343 }
344 
345 /*
346  * vm_pageout_flush() - launder the given pages
347  *
348  *	The given pages are laundered.  Note that we set up for the start of
349  *	I/O (i.e. busy the page), mark it read-only, and bump the object's
350  *	paging-in-progress count all in here rather than in the parent.  If
351  *	we want the parent to do more sophisticated things we may have to
352  *	change the ordering.
353  */
354 
355 int
356 vm_pageout_flush(mc, count, flags)
357 	vm_page_t *mc;
358 	int count;
359 	int flags;
360 {
361 	register vm_object_t object;
362 	int pageout_status[count];
363 	int numpagedout = 0;
364 	int i;
365 
366 	/*
367 	 * Initiate I/O.  Bump the vm_page_t->busy counter and
368 	 * mark the pages read-only.
369 	 *
370 	 * We do not have to fixup the clean/dirty bits here... we can
371 	 * allow the pager to do it after the I/O completes.
372 	 */
373 
374 	for (i = 0; i < count; i++) {
375 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush page %p index %d/%d: partially invalid page", mc[i], i, count));
376 		vm_page_io_start(mc[i]);
377 		vm_page_protect(mc[i], VM_PROT_READ);
378 	}
379 
380 	object = mc[0]->object;
381 	vm_object_pip_add(object, count);
382 
383 	vm_pager_put_pages(object, mc, count,
384 	    (flags | ((object == kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
385 	    pageout_status);
386 
387 	for (i = 0; i < count; i++) {
388 		vm_page_t mt = mc[i];
389 
390 		switch (pageout_status[i]) {
391 		case VM_PAGER_OK:
392 			numpagedout++;
393 			break;
394 		case VM_PAGER_PEND:
395 			numpagedout++;
396 			break;
397 		case VM_PAGER_BAD:
398 			/*
399 			 * Page outside of range of object. Right now we
400 			 * essentially lose the changes by pretending it
401 			 * worked.
402 			 */
403 			pmap_clear_modify(mt);
404 			vm_page_undirty(mt);
405 			break;
406 		case VM_PAGER_ERROR:
407 		case VM_PAGER_FAIL:
408 			/*
409 			 * If the page couldn't be paged out, then reactivate
410 			 * the page so it doesn't clog the inactive list.  (We
411 			 * will try paging it out again later.)
412 			 */
413 			vm_page_activate(mt);
414 			break;
415 		case VM_PAGER_AGAIN:
416 			break;
417 		}
418 
419 		/*
420 		 * If the operation is still going, leave the page busy to
421 		 * block all other accesses. Also, leave the paging in
422 		 * progress indicator set so that we don't attempt an object
423 		 * collapse.
424 		 */
425 		if (pageout_status[i] != VM_PAGER_PEND) {
426 			vm_object_pip_wakeup(object);
427 			vm_page_io_finish(mt);
428 			if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
429 				vm_page_protect(mt, VM_PROT_READ);
430 		}
431 	}
432 	return numpagedout;
433 }
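/*
 * Editorial usage note (a sketch, not part of the original source): the
 * return value counts pages whose pageout either completed (VM_PAGER_OK) or
 * is still in flight (VM_PAGER_PEND).  A minimal caller, mirroring
 * vm_pageout_clean() above, looks like:
 *
 *	int laundered;
 *
 *	laundered = vm_pageout_flush(&mc[page_base], pageout_count, 0);
 *	if (laundered != 0)
 *		;	(caller-side bookkeeping, e.g. reducing its shortage)
 */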
434 
435 #if !defined(NO_SWAPPING)
436 /*
437  *	vm_pageout_object_deactivate_pages
438  *
439  *	Deactivate enough pages to satisfy the inactive target
440  *	requirements, or, if vm_page_proc_limit is set, deactivate
441  *	all of the pages in the object and its
442  *	backing_objects.
443  *
444  *	The object and map must be locked.
445  */
446 static void
447 vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only)
448 	vm_map_t map;
449 	vm_object_t object;
450 	vm_pindex_t desired;
451 	int map_remove_only;
452 {
453 	register vm_page_t p, next;
454 	int rcount;
455 	int remove_mode;
456 	int s;
457 
458 	if (object->type == OBJT_DEVICE || object->type == OBJT_PHYS)
459 		return;
460 
461 	while (object) {
462 		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
463 			return;
464 		if (object->paging_in_progress)
465 			return;
466 
467 		remove_mode = map_remove_only;
468 		if (object->shadow_count > 1)
469 			remove_mode = 1;
470 		/*
471 		 * scan the object's entire memory queue
472 		 */
473 		rcount = object->resident_page_count;
474 		p = TAILQ_FIRST(&object->memq);
475 		while (p && (rcount-- > 0)) {
476 			int actcount;
477 			if (pmap_resident_count(vm_map_pmap(map)) <= desired)
478 				return;
479 			next = TAILQ_NEXT(p, listq);
480 			cnt.v_pdpages++;
481 			if (p->wire_count != 0 ||
482 			    p->hold_count != 0 ||
483 			    p->busy != 0 ||
484 			    (p->flags & (PG_BUSY|PG_UNMANAGED)) ||
485 			    !pmap_page_exists_quick(vm_map_pmap(map), p)) {
486 				p = next;
487 				continue;
488 			}
489 
490 			actcount = pmap_ts_referenced(p);
491 			if (actcount) {
492 				vm_page_flag_set(p, PG_REFERENCED);
493 			} else if (p->flags & PG_REFERENCED) {
494 				actcount = 1;
495 			}
496 
497 			if ((p->queue != PQ_ACTIVE) &&
498 				(p->flags & PG_REFERENCED)) {
499 				vm_page_activate(p);
500 				p->act_count += actcount;
501 				vm_page_flag_clear(p, PG_REFERENCED);
502 			} else if (p->queue == PQ_ACTIVE) {
503 				if ((p->flags & PG_REFERENCED) == 0) {
504 					p->act_count -= min(p->act_count, ACT_DECLINE);
505 					if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
506 						vm_page_protect(p, VM_PROT_NONE);
507 						vm_page_deactivate(p);
508 					} else {
509 						s = splvm();
510 						TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
511 						TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
512 						splx(s);
513 					}
514 				} else {
515 					vm_page_activate(p);
516 					vm_page_flag_clear(p, PG_REFERENCED);
517 					if (p->act_count < (ACT_MAX - ACT_ADVANCE))
518 						p->act_count += ACT_ADVANCE;
519 					s = splvm();
520 					TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
521 					TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, p, pageq);
522 					splx(s);
523 				}
524 			} else if (p->queue == PQ_INACTIVE) {
525 				vm_page_protect(p, VM_PROT_NONE);
526 			}
527 			p = next;
528 		}
529 		object = object->backing_object;
530 	}
531 	return;
532 }
533 
534 /*
535  * Deactivate some number of pages in a map; try to do it fairly, but
536  * that is really hard to do.
537  */
538 static void
539 vm_pageout_map_deactivate_pages(map, desired)
540 	vm_map_t map;
541 	vm_pindex_t desired;
542 {
543 	vm_map_entry_t tmpe;
544 	vm_object_t obj, bigobj;
545 	int nothingwired;
546 
547 	if (lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT, (void *)0, curproc)) {
548 		return;
549 	}
550 
551 	bigobj = NULL;
552 	nothingwired = TRUE;
553 
554 	/*
555 	 * first, search out the biggest object, and try to free pages from
556 	 * that.
557 	 */
558 	tmpe = map->header.next;
559 	while (tmpe != &map->header) {
560 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
561 			obj = tmpe->object.vm_object;
562 			if ((obj != NULL) && (obj->shadow_count <= 1) &&
563 				((bigobj == NULL) ||
564 				 (bigobj->resident_page_count < obj->resident_page_count))) {
565 				bigobj = obj;
566 			}
567 		}
568 		if (tmpe->wired_count > 0)
569 			nothingwired = FALSE;
570 		tmpe = tmpe->next;
571 	}
572 
573 	if (bigobj)
574 		vm_pageout_object_deactivate_pages(map, bigobj, desired, 0);
575 
576 	/*
577 	 * Next, hunt around for other pages to deactivate.  We actually
578 	 * do this search sort of wrong -- .text first is not the best idea.
579 	 */
580 	tmpe = map->header.next;
581 	while (tmpe != &map->header) {
582 		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
583 			break;
584 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
585 			obj = tmpe->object.vm_object;
586 			if (obj)
587 				vm_pageout_object_deactivate_pages(map, obj, desired, 0);
588 		}
589 		tmpe = tmpe->next;
590 	}
591 
592 	/*
593 	 * Remove all mappings if a process is swapped out; this will free
594 	 * page table pages.
595 	 */
596 	if (desired == 0 && nothingwired)
597 		pmap_remove(vm_map_pmap(map),
598 			VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
599 	vm_map_unlock(map);
600 	return;
601 }
602 #endif
603 
604 /*
605  * Don't try to be fancy - being fancy can lead to VOP_LOCK's and therefore
606  * to vnode deadlocks.  We only do it for OBJT_DEFAULT and OBJT_SWAP objects
607  * which we know can be trivially freed.
608  */
609 
610 void
611 vm_pageout_page_free(vm_page_t m) {
612 	vm_object_t object = m->object;
613 	int type = object->type;
614 
615 	if (type == OBJT_SWAP || type == OBJT_DEFAULT)
616 		vm_object_reference(object);
617 	vm_page_busy(m);
618 	vm_page_protect(m, VM_PROT_NONE);
619 	vm_page_free(m);
620 	if (type == OBJT_SWAP || type == OBJT_DEFAULT)
621 		vm_object_deallocate(object);
622 }
623 
624 /*
625  *	vm_pageout_scan does the dirty work for the pageout daemon.
626  */
627 static void
628 vm_pageout_scan(int pass)
629 {
630 	vm_page_t m, next;
631 	struct vm_page marker;
632 	int page_shortage, maxscan, pcount;
633 	int addl_page_shortage, addl_page_shortage_init;
634 	struct proc *p, *bigproc;
635 	vm_offset_t size, bigsize;
636 	vm_object_t object;
637 	int actcount;
638 	int vnodes_skipped = 0;
639 	int maxlaunder;
640 	int s;
641 
642 	/*
643 	 * Do whatever cleanup that the pmap code can.
644 	 */
645 	pmap_collect();
646 
647 	addl_page_shortage_init = vm_pageout_deficit;
648 	vm_pageout_deficit = 0;
649 
650 	/*
651 	 * Calculate the number of pages we want to either free or move
652 	 * to the cache.
653 	 */
654 	page_shortage = vm_paging_target() + addl_page_shortage_init;
655 
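	/*
	 * Editorial sketch (assumption about the helper elsewhere in the VM
	 * headers, not guaranteed to match vm_page.h exactly):
	 * vm_paging_target() is roughly
	 *
	 *	(v_free_target + v_cache_min) - (v_free_count + v_cache_count)
	 *
	 * i.e. how far the free+cache pool is below its target, so adding
	 * addl_page_shortage_init (the saved vm_pageout_deficit) asks this
	 * scan to free that many extra pages on top of the normal target.
	 */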
656 	/*
657 	 * Initialize our marker
658 	 */
659 	bzero(&marker, sizeof(marker));
660 	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
661 	marker.queue = PQ_INACTIVE;
662 	marker.wire_count = 1;
663 
664 	/*
665 	 * Start scanning the inactive queue for pages we can move to the
666 	 * cache or free.  The scan will stop when the target is reached or
667 	 * we have scanned the entire inactive queue.  Note that m->act_count
668 	 * is not used to form decisions for the inactive queue, only for the
669 	 * active queue.
670 	 *
671 	 * maxlaunder limits the number of dirty pages we flush per scan.
672 	 * For most systems a smaller value (16 or 32) is more robust under
673 	 * extreme memory and disk pressure because any unnecessary writes
674 	 * to disk can result in extreme performance degradation.  However,
675 	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
676 	 * used) will die horribly with limited laundering.  If the pageout
677 	 * daemon cannot clean enough pages in the first pass, we let it go
678 	 * all out in succeeding passes.
679 	 */
680 	if ((maxlaunder = vm_max_launder) <= 1)
681 		maxlaunder = 1;
682 	if (pass)
683 		maxlaunder = 10000;
684 
685 rescan0:
686 	addl_page_shortage = addl_page_shortage_init;
687 	maxscan = cnt.v_inactive_count;
688 	for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
689 	     m != NULL && maxscan-- > 0 && page_shortage > 0;
690 	     m = next) {
691 
692 		cnt.v_pdpages++;
693 
694 		if (m->queue != PQ_INACTIVE) {
695 			goto rescan0;
696 		}
697 
698 		next = TAILQ_NEXT(m, pageq);
699 
700 		/*
701 		 * skip marker pages
702 		 */
703 		if (m->flags & PG_MARKER)
704 			continue;
705 
706 		/*
707 		 * A held page may be undergoing I/O, so skip it.
708 		 */
709 		if (m->hold_count) {
710 			s = splvm();
711 			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
712 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
713 			splx(s);
714 			addl_page_shortage++;
715 			continue;
716 		}
717 		/*
718 		 * Don't mess with busy pages; keep them at the front of the
719 		 * queue, as they are most likely being paged out.
720 		 */
721 		if (m->busy || (m->flags & PG_BUSY)) {
722 			addl_page_shortage++;
723 			continue;
724 		}
725 
726 		/*
727 		 * If the object is not being used, we ignore previous
728 		 * references.
729 		 */
730 		if (m->object->ref_count == 0) {
731 			vm_page_flag_clear(m, PG_REFERENCED);
732 			pmap_clear_reference(m);
733 
734 		/*
735 		 * Otherwise, if the page has been referenced while in the
736 		 * inactive queue, we bump the "activation count" upwards,
737 		 * making it less likely that the page will be added back to
738 		 * the inactive queue prematurely again.  Here we check the
739 		 * page tables (or emulated bits, if any), since the upper
740 		 * level VM system does not know anything about existing
741 		 * references.
742 		 */
743 		} else if (((m->flags & PG_REFERENCED) == 0) &&
744 			(actcount = pmap_ts_referenced(m))) {
745 			vm_page_activate(m);
746 			m->act_count += (actcount + ACT_ADVANCE);
747 			continue;
748 		}
749 
750 		/*
751 		 * If the upper level VM system knows about any page
752 		 * references, we activate the page.  We also set the
753 		 * "activation count" higher than normal so that we are less
754 		 * likely to place the page back onto the inactive queue again.
755 		 */
756 		if ((m->flags & PG_REFERENCED) != 0) {
757 			vm_page_flag_clear(m, PG_REFERENCED);
758 			actcount = pmap_ts_referenced(m);
759 			vm_page_activate(m);
760 			m->act_count += (actcount + ACT_ADVANCE + 1);
761 			continue;
762 		}
763 
764 		/*
765 		 * If the upper level VM system doesn't know anything about
766 		 * the page being dirty, we have to check for it again.  As
767 		 * far as the VM code knows, any partially dirty pages are
768 		 * fully dirty.
769 		 */
770 		if (m->dirty == 0) {
771 			vm_page_test_dirty(m);
772 		} else {
773 			vm_page_dirty(m);
774 		}
775 
776 		/*
777 		 * Invalid pages can be easily freed
778 		 */
779 		if (m->valid == 0) {
780 			vm_pageout_page_free(m);
781 			cnt.v_dfree++;
782 			--page_shortage;
783 
784 		/*
785 		 * Clean pages can be placed onto the cache queue.  This
786 		 * effectively frees them.
787 		 */
788 		} else if (m->dirty == 0) {
789 			/*
790 			 * Clean pages can be immediately freed to the cache.
791 			 */
792 			vm_page_cache(m);
793 			--page_shortage;
794 		} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
795 			/*
796 			 * Dirty pages need to be paged out, but flushing
797 			 * a page is extremely expensive versus freeing
798 			 * a clean page.  Rather than artificially limiting
799 			 * the number of pages we can flush, we instead give
800 			 * dirty pages extra priority on the inactive queue
801 			 * by forcing them to be cycled through the queue
802 			 * twice before being flushed, after which the
803 			 * (now clean) page will cycle through once more
804 			 * before being freed.  This significantly extends
805 			 * the thrash point for a heavily loaded machine.
806 			 */
807 			s = splvm();
808 			vm_page_flag_set(m, PG_WINATCFLS);
809 			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
810 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
811 			splx(s);
812 		} else if (maxlaunder > 0) {
813 			/*
814 			 * We always want to try to flush some dirty pages if
815 			 * we encounter them, to keep the system stable.
816 			 * Normally this number is small, but under extreme
817 			 * pressure where there are insufficient clean pages
818 			 * on the inactive queue, we may have to go all out.
819 			 */
820 			int swap_pageouts_ok;
821 			struct vnode *vp = NULL;
822 
823 			object = m->object;
824 
825 			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
826 				swap_pageouts_ok = 1;
827 			} else {
828 				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
829 				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
830 				    vm_page_count_min());
831 
832 			}
833 
834 			/*
835 			 * We don't bother paging objects that are "dead".
836 			 * Those objects are in a "rundown" state.
837 			 */
838 			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
839 				s = splvm();
840 				TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
841 				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
842 				splx(s);
843 				continue;
844 			}
845 
846 			/*
847 			 * The object is already known NOT to be dead.   It
848 			 * is possible for the vget() to block the whole
849 			 * pageout daemon, but the new low-memory handling
850 			 * code should prevent it.
851 			 *
852 			 * The previous code skipped locked vnodes and, worse,
853 			 * reordered pages in the queue.  This results in
854 			 * completely non-deterministic operation because,
855 			 * quite often, a vm_fault has initiated an I/O and
856 			 * is holding a locked vnode at just the point where
857 			 * the pageout daemon is woken up.
858 			 *
859 			 * We can't wait forever for the vnode lock, we might
860 			 * deadlock due to a vn_read() getting stuck in
861 			 * vm_wait while holding this vnode.  We skip the
862 			 * vnode if we can't get it in a reasonable amount
863 			 * of time.
864 			 */
865 
866 			if (object->type == OBJT_VNODE) {
867 				vp = object->handle;
868 
869 				if (vget(vp, LK_EXCLUSIVE|LK_NOOBJ|LK_TIMELOCK, curproc)) {
870 					++pageout_lock_miss;
871 					if (object->flags & OBJ_MIGHTBEDIRTY)
872 						    vnodes_skipped++;
873 					continue;
874 				}
875 
876 				/*
877 				 * The page might have been moved to another
878 				 * queue during potential blocking in vget()
879 				 * above.  The page might have been freed and
880 				 * reused for another vnode.  The object might
881 				 * have been reused for another vnode.
882 				 */
883 				if (m->queue != PQ_INACTIVE ||
884 				    m->object != object ||
885 				    object->handle != vp) {
886 					if (object->flags & OBJ_MIGHTBEDIRTY)
887 						vnodes_skipped++;
888 					vput(vp);
889 					continue;
890 				}
891 
892 				/*
893 				 * The page may have been busied during the
894 				 * blocking in vget().  We don't move the
895 				 * page back onto the end of the queue; the
896 				 * statistics are more correct if we don't.
897 				 */
898 				if (m->busy || (m->flags & PG_BUSY)) {
899 					vput(vp);
900 					continue;
901 				}
902 
903 				/*
904 				 * If the page has become held it might
905 				 * be undergoing I/O, so skip it
906 				 */
907 				if (m->hold_count) {
908 					s = splvm();
909 					TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
910 					TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
911 					splx(s);
912 					if (object->flags & OBJ_MIGHTBEDIRTY)
913 						vnodes_skipped++;
914 					vput(vp);
915 					continue;
916 				}
917 			}
918 
919 			/*
920 			 * If a page is dirty, then it is either being washed
921 			 * (but not yet cleaned) or it is still in the
922 			 * laundry.  If it is still in the laundry, then we
923 			 * start the cleaning operation.
924 			 *
925 			 * This operation may cluster, invalidating the 'next'
926 			 * pointer.  To prevent an inordinate number of
927 			 * restarts we use our marker to remember our place.
928 			 *
929 			 * decrement page_shortage on success to account for
930 			 * the (future) cleaned page.  Otherwise we could wind
931 			 * up laundering or cleaning too many pages.
932 			 */
933 			s = splvm();
934 			TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq);
935 			splx(s);
936 			if (vm_pageout_clean(m) != 0) {
937 				--page_shortage;
938 				--maxlaunder;
939 			}
940 			s = splvm();
941 			next = TAILQ_NEXT(&marker, pageq);
942 			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq);
943 			splx(s);
944 			if (vp != NULL)
945 				vput(vp);
946 		}
947 	}
948 
949 	/*
950 	 * Compute the number of pages we want to try to move from the
951 	 * active queue to the inactive queue.
952 	 */
953 	page_shortage = vm_paging_target() +
954 	    cnt.v_inactive_target - cnt.v_inactive_count;
955 	page_shortage += addl_page_shortage;
956 
957 	/*
958 	 * Scan the active queue for things we can deactivate. We nominally
959 	 * track the per-page activity counter and use it to locate
960 	 * deactivation candidates.
961 	 */
962 
963 	pcount = cnt.v_active_count;
964 	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
965 
966 	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
967 
968 		/*
969 		 * This is a consistency check, and should likely be a panic
970 		 * or warning.
971 		 */
972 		if (m->queue != PQ_ACTIVE) {
973 			break;
974 		}
975 
976 		next = TAILQ_NEXT(m, pageq);
977 		/*
978 		 * Don't deactivate pages that are busy.
979 		 */
980 		if ((m->busy != 0) ||
981 		    (m->flags & PG_BUSY) ||
982 		    (m->hold_count != 0)) {
983 			s = splvm();
984 			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
985 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
986 			splx(s);
987 			m = next;
988 			continue;
989 		}
990 
991 		/*
992 		 * The count for pagedaemon pages is done after checking the
993 		 * page for eligibility...
994 		 */
995 		cnt.v_pdpages++;
996 
997 		/*
998 		 * Check to see "how much" the page has been used.
999 		 */
1000 		actcount = 0;
1001 		if (m->object->ref_count != 0) {
1002 			if (m->flags & PG_REFERENCED) {
1003 				actcount += 1;
1004 			}
1005 			actcount += pmap_ts_referenced(m);
1006 			if (actcount) {
1007 				m->act_count += ACT_ADVANCE + actcount;
1008 				if (m->act_count > ACT_MAX)
1009 					m->act_count = ACT_MAX;
1010 			}
1011 		}
1012 
1013 		/*
1014 		 * Since we have "tested" this bit, we need to clear it now.
1015 		 */
1016 		vm_page_flag_clear(m, PG_REFERENCED);
1017 
1018 		/*
1019 		 * Only if an object is currently being used, do we use the
1020 		 * page activation count stats.
1021 		 */
1022 		if (actcount && (m->object->ref_count != 0)) {
1023 			s = splvm();
1024 			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1025 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1026 			splx(s);
1027 		} else {
1028 			m->act_count -= min(m->act_count, ACT_DECLINE);
1029 			if (vm_pageout_algorithm ||
1030 			    m->object->ref_count == 0 ||
1031 			    m->act_count == 0) {
1032 				page_shortage--;
1033 				if (m->object->ref_count == 0) {
1034 					vm_page_protect(m, VM_PROT_NONE);
1035 					if (m->dirty == 0)
1036 						vm_page_cache(m);
1037 					else
1038 						vm_page_deactivate(m);
1039 				} else {
1040 					vm_page_deactivate(m);
1041 				}
1042 			} else {
1043 				s = splvm();
1044 				TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1045 				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1046 				splx(s);
1047 			}
1048 		}
1049 		m = next;
1050 	}
1051 
1052 	s = splvm();
1053 
1054 	/*
1055 	 * We try to maintain some *really* free pages; this allows interrupt
1056 	 * code to be guaranteed space.  Since both cache and free queues
1057 	 * are considered basically 'free', moving pages from cache to free
1058 	 * does not affect other calculations.
1059 	 */
1060 
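	/*
	 * Editorial note (an assumption about intent, not stated in the
	 * original source): cache_rover below steps through the PQ_CACHE
	 * queues with the prime stride PQ_PRIME2 masked by PQ_L2_MASK, so
	 * pages are taken from all cache "colors" roughly evenly rather than
	 * draining a single queue.
	 */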
1061 	while (cnt.v_free_count < cnt.v_free_reserved) {
1062 		static int cache_rover = 0;
1063 		m = vm_page_list_find(PQ_CACHE, cache_rover, FALSE);
1064 		if (!m)
1065 			break;
1066 		if ((m->flags & (PG_BUSY|PG_UNMANAGED)) ||
1067 		    m->busy ||
1068 		    m->hold_count ||
1069 		    m->wire_count) {
1070 #ifdef INVARIANTS
1071 			printf("Warning: busy page %p found in cache\n", m);
1072 #endif
1073 			vm_page_deactivate(m);
1074 			continue;
1075 		}
1076 		cache_rover = (cache_rover + PQ_PRIME2) & PQ_L2_MASK;
1077 		vm_pageout_page_free(m);
1078 		cnt.v_dfree++;
1079 	}
1080 	splx(s);
1081 
1082 #if !defined(NO_SWAPPING)
1083 	/*
1084 	 * Idle process swapout -- run once per second.
1085 	 */
1086 	if (vm_swap_idle_enabled) {
1087 		static long lsec;
1088 		if (time_second != lsec) {
1089 			vm_pageout_req_swapout |= VM_SWAP_IDLE;
1090 			vm_req_vmdaemon();
1091 			lsec = time_second;
1092 		}
1093 	}
1094 #endif
1095 
1096 	/*
1097 	 * If we didn't get enough free pages and we have skipped a vnode
1098 	 * in a writeable object, wakeup the sync daemon.  Also kick off
1099 	 * swapout if we did not get enough free pages.
1100 	 */
1101 	if (vm_paging_target() > 0) {
1102 		if (vnodes_skipped && vm_page_count_min())
1103 			(void) speedup_syncer();
1104 #if !defined(NO_SWAPPING)
1105 		if (vm_swap_enabled && vm_page_count_target()) {
1106 			vm_req_vmdaemon();
1107 			vm_pageout_req_swapout |= VM_SWAP_NORMAL;
1108 		}
1109 #endif
1110 	}
1111 
1112 	/*
1113 	 * If we are out of swap and were not able to reach our paging
1114 	 * target, kill the largest process.
1115 	 */
1116 	if ((vm_swap_size < 64 && vm_page_count_min()) ||
1117 	    (swap_pager_full && vm_paging_target() > 0)) {
1118 #if 0
1119 	if ((vm_swap_size < 64 || swap_pager_full) && vm_page_count_min()) {
1120 #endif
1121 		bigproc = NULL;
1122 		bigsize = 0;
1123 		for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
1124 			/*
1125 			 * if this is a system process, skip it
1126 			 */
1127 			if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
1128 			    ((p->p_pid < 48) && (vm_swap_size != 0))) {
1129 				continue;
1130 			}
1131 			/*
1132 			 * if the process is in a non-running type state,
1133 			 * don't touch it.
1134 			 */
1135 			if (p->p_stat != SRUN && p->p_stat != SSLEEP) {
1136 				continue;
1137 			}
1138 			/*
1139 			 * get the process size
1140 			 */
1141 			size = vmspace_resident_count(p->p_vmspace) +
1142 				vmspace_swap_count(p->p_vmspace);
1143 			/*
1144 			 * if this process is bigger than the biggest one,
1145 			 * remember it.
1146 			 */
1147 			if (size > bigsize) {
1148 				bigproc = p;
1149 				bigsize = size;
1150 			}
1151 		}
1152 		if (bigproc != NULL) {
1153 			killproc(bigproc, "out of swap space");
1154 			bigproc->p_estcpu = 0;
1155 			bigproc->p_nice = PRIO_MIN;
1156 			resetpriority(bigproc);
1157 			wakeup(&cnt.v_free_count);
1158 		}
1159 	}
1160 }
1161 
1162 /*
1163  * This routine tries to maintain the pseudo-LRU active queue,
1164  * so that during long periods of time when there is no paging,
1165  * some statistics accumulation still occurs.  This code
1166  * helps the situation where paging just starts to occur.
1167  */
1168 static void
1169 vm_pageout_page_stats()
1170 {
1171 	int s;
1172 	vm_page_t m,next;
1173 	int pcount,tpcount;		/* Number of pages to check */
1174 	static int fullintervalcount = 0;
1175 	int page_shortage;
1176 	int s0;
1177 
1178 	page_shortage =
1179 	    (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
1180 	    (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
1181 
1182 	if (page_shortage <= 0)
1183 		return;
1184 
1185 	s0 = splvm();
1186 
1187 	pcount = cnt.v_active_count;
1188 	fullintervalcount += vm_pageout_stats_interval;
1189 	if (fullintervalcount < vm_pageout_full_stats_interval) {
1190 		tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count;
1191 		if (pcount > tpcount)
1192 			pcount = tpcount;
1193 	} else {
1194 		fullintervalcount = 0;
1195 	}
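	/*
	 * Editorial example (hypothetical numbers, not from the original
	 * source): if vm_pageout_stats_max is 8192, v_active_count is 20000
	 * and v_page_count is 65536, then
	 *
	 *	tpcount = (8192 * 20000) / 65536 = 2500
	 *
	 * so a partial run checks only 2500 of the 20000 active pages; a
	 * full-length scan happens once fullintervalcount reaches
	 * vm_pageout_full_stats_interval.
	 */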
1196 
1197 	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
1198 	while ((m != NULL) && (pcount-- > 0)) {
1199 		int actcount;
1200 
1201 		if (m->queue != PQ_ACTIVE) {
1202 			break;
1203 		}
1204 
1205 		next = TAILQ_NEXT(m, pageq);
1206 		/*
1207 		 * Don't deactivate pages that are busy.
1208 		 */
1209 		if ((m->busy != 0) ||
1210 		    (m->flags & PG_BUSY) ||
1211 		    (m->hold_count != 0)) {
1212 			s = splvm();
1213 			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1214 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1215 			splx(s);
1216 			m = next;
1217 			continue;
1218 		}
1219 
1220 		actcount = 0;
1221 		if (m->flags & PG_REFERENCED) {
1222 			vm_page_flag_clear(m, PG_REFERENCED);
1223 			actcount += 1;
1224 		}
1225 
1226 		actcount += pmap_ts_referenced(m);
1227 		if (actcount) {
1228 			m->act_count += ACT_ADVANCE + actcount;
1229 			if (m->act_count > ACT_MAX)
1230 				m->act_count = ACT_MAX;
1231 			s = splvm();
1232 			TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1233 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1234 			splx(s);
1235 		} else {
1236 			if (m->act_count == 0) {
1237 				/*
1238 				 * We turn off page access, so that we have
1239 				 * more accurate RSS stats.  We don't do this
1240 				 * in the normal page deactivation when the
1241 				 * system is loaded VM wise, because the
1242 				 * cost of the large number of page protect
1243 				 * operations would be higher than the value
1244 				 * of doing the operation.
1245 				 */
1246 				vm_page_protect(m, VM_PROT_NONE);
1247 				vm_page_deactivate(m);
1248 			} else {
1249 				m->act_count -= min(m->act_count, ACT_DECLINE);
1250 				s = splvm();
1251 				TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1252 				TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1253 				splx(s);
1254 			}
1255 		}
1256 
1257 		m = next;
1258 	}
1259 	splx(s0);
1260 }
1261 
1262 static int
1263 vm_pageout_free_page_calc(count)
1264 vm_size_t count;
1265 {
1266 	if (count < cnt.v_page_count)
1267 		 return 0;
1268 	/*
1269 	 * free_reserved needs to include enough for the largest swap pager
1270 	 * structures plus enough for any pv_entry structs when paging.
1271 	 */
1272 	if (cnt.v_page_count > 1024)
1273 		cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
1274 	else
1275 		cnt.v_free_min = 4;
1276 	cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
1277 		cnt.v_interrupt_free_min;
1278 	cnt.v_free_reserved = vm_pageout_page_count +
1279 		cnt.v_pageout_free_min + (count / 768) + PQ_L2_SIZE;
1280 	cnt.v_free_severe = cnt.v_free_min / 2;
1281 	cnt.v_free_min += cnt.v_free_reserved;
1282 	cnt.v_free_severe += cnt.v_free_reserved;
1283 	return 1;
1284 }
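/*
 * Editorial worked example (hypothetical 65536-page machine with 4KB pages,
 * assuming a 64KB MAXBSIZE; the exact constants are configuration dependent):
 *
 *	v_free_min          = 4 + (65536 - 1024) / 200      = 326
 *	v_pageout_free_min  = (2 * 65536) / 4096 + 2         = 34
 *	v_free_reserved     = 16 + 34 + 65536 / 768 + PQ_L2_SIZE
 *	                    = 135 + PQ_L2_SIZE
 *
 * v_free_severe starts at v_free_min / 2, and both v_free_min and
 * v_free_severe are then bumped by v_free_reserved.
 */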
1285 
1286 
1287 /*
1288  *	vm_pageout is the high level pageout daemon.
1289  */
1290 static void
1291 vm_pageout()
1292 {
1293 	int pass;
1294 
1295 	/*
1296 	 * Initialize some paging parameters.
1297 	 */
1298 
1299 	cnt.v_interrupt_free_min = 2;
1300 	if (cnt.v_page_count < 2000)
1301 		vm_pageout_page_count = 8;
1302 
1303 	vm_pageout_free_page_calc(cnt.v_page_count);
1304 	/*
1305 	 * v_free_target and v_cache_min control pageout hysteresis.  Note
1306 	 * that these are more a measure of the VM cache queue hysteresis
1307 	 * than the VM free queue.  Specifically, v_free_target is the
1308 	 * high water mark (free+cache pages).
1309 	 *
1310 	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
1311 	 * low water mark, while v_free_min is the stop.  v_cache_min must
1312 	 * be big enough to handle memory needs while the pageout daemon
1313 	 * is signalled and run to free more pages.
1314 	 */
1315 	if (cnt.v_free_count > 6144)
1316 		cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
1317 	else
1318 		cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
1319 
1320 	if (cnt.v_free_count > 2048) {
1321 		cnt.v_cache_min = cnt.v_free_target;
1322 		cnt.v_cache_max = 2 * cnt.v_cache_min;
1323 		cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
1324 	} else {
1325 		cnt.v_cache_min = 0;
1326 		cnt.v_cache_max = 0;
1327 		cnt.v_inactive_target = cnt.v_free_count / 4;
1328 	}
1329 	if (cnt.v_inactive_target > cnt.v_free_count / 3)
1330 		cnt.v_inactive_target = cnt.v_free_count / 3;
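	/*
	 * Editorial note (a sketch of the resulting ordering, not a new
	 * invariant asserted by the original code): after this setup we
	 * generally have
	 *
	 *	v_free_reserved < v_free_min < v_free_target
	 *
	 * with v_cache_min, v_cache_max and v_inactive_target sized relative
	 * to v_free_target on machines with more than 2048 free pages.
	 */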
1331 
1332 	/* XXX does not really belong here */
1333 	if (vm_page_max_wired == 0)
1334 		vm_page_max_wired = cnt.v_free_count / 3;
1335 
1336 	if (vm_pageout_stats_max == 0)
1337 		vm_pageout_stats_max = cnt.v_free_target;
1338 
1339 	/*
1340 	 * Set interval in seconds for stats scan.
1341 	 */
1342 	if (vm_pageout_stats_interval == 0)
1343 		vm_pageout_stats_interval = 5;
1344 	if (vm_pageout_full_stats_interval == 0)
1345 		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
1346 
1347 
1348 	/*
1349 	 * Set maximum free per pass
1350 	 */
1351 	if (vm_pageout_stats_free_max == 0)
1352 		vm_pageout_stats_free_max = 5;
1353 
1354 	swap_pager_swap_init();
1355 	pass = 0;
1356 	/*
1357 	 * The pageout daemon is never done, so loop forever.
1358 	 */
1359 	while (TRUE) {
1360 		int error;
1361 		int s = splvm();
1362 
1363 		/*
1364 		 * If we have enough free memory, wakeup waiters.  Do
1365 		 * not clear vm_pages_needed until we reach our target,
1366 		 * otherwise we may be woken up over and over again and
1367 		 * waste a lot of cpu.
1368 		 */
1369 		if (vm_pages_needed && !vm_page_count_min()) {
1370 			if (vm_paging_needed() <= 0)
1371 				vm_pages_needed = 0;
1372 			wakeup(&cnt.v_free_count);
1373 		}
1374 		if (vm_pages_needed) {
1375 			/*
1376 			 * Still not done, take a second pass without waiting
1377 			 * (unlimited dirty cleaning), otherwise sleep a bit
1378 			 * and try again.
1379 			 */
1380 			++pass;
1381 			if (pass > 1)
1382 				tsleep(&vm_pages_needed, PVM, "psleep", hz/2);
1383 		} else {
1384 			/*
1385 			 * Good enough, sleep & handle stats.  Prime the pass
1386 			 * for the next run.
1387 			 */
1388 			if (pass > 1)
1389 				pass = 1;
1390 			else
1391 				pass = 0;
1392 			error = tsleep(&vm_pages_needed,
1393 				PVM, "psleep", vm_pageout_stats_interval * hz);
1394 			if (error && !vm_pages_needed) {
1395 				splx(s);
1396 				pass = 0;
1397 				vm_pageout_page_stats();
1398 				continue;
1399 			}
1400 		}
1401 
1402 		if (vm_pages_needed)
1403 			cnt.v_pdwakeups++;
1404 		splx(s);
1405 		vm_pageout_scan(pass);
1406 		vm_pageout_deficit = 0;
1407 	}
1408 }
1409 
1410 void
1411 pagedaemon_wakeup()
1412 {
1413 	if (!vm_pages_needed && curproc != pageproc) {
1414 		vm_pages_needed++;
1415 		wakeup(&vm_pages_needed);
1416 	}
1417 }
1418 
1419 #if !defined(NO_SWAPPING)
1420 static void
1421 vm_req_vmdaemon()
1422 {
1423 	static int lastrun = 0;
1424 
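	/*
	 * Editorial note: the test below wakes the vm_daemon at most about
	 * once per second; the (ticks < lastrun) clause re-arms the wakeup
	 * if the ticks counter has wrapped around.
	 */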
1425 	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
1426 		wakeup(&vm_daemon_needed);
1427 		lastrun = ticks;
1428 	}
1429 }
1430 
1431 static void
1432 vm_daemon()
1433 {
1434 	struct proc *p;
1435 
1436 	while (TRUE) {
1437 		tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0);
1438 		if (vm_pageout_req_swapout) {
1439 			swapout_procs(vm_pageout_req_swapout);
1440 			vm_pageout_req_swapout = 0;
1441 		}
1442 		/*
1443 		 * scan the processes: if a process exceeds its rlimits or
1444 		 * is swapped out, deactivate its pages
1445 		 */
1446 
1447 		for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
1448 			vm_pindex_t limit, size;
1449 
1450 			/*
1451 			 * if this is a system process or if we have already
1452 			 * looked at this process, skip it.
1453 			 */
1454 			if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
1455 				continue;
1456 			}
1457 			/*
1458 			 * if the process is in a non-running type state,
1459 			 * don't touch it.
1460 			 */
1461 			if (p->p_stat != SRUN && p->p_stat != SSLEEP) {
1462 				continue;
1463 			}
1464 			/*
1465 			 * get a limit
1466 			 */
1467 			limit = OFF_TO_IDX(
1468 			    qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
1469 				p->p_rlimit[RLIMIT_RSS].rlim_max));
1470 
1471 			/*
1472 			 * let processes that are swapped out really be
1473 			 * swapped out: set the limit to nothing (this will
1474 			 * force a swap-out.)
1475 			 */
1476 			if ((p->p_flag & P_INMEM) == 0)
1477 				limit = 0;	/* XXX */
1478 
1479 			size = vmspace_resident_count(p->p_vmspace);
1480 			if (limit >= 0 && size >= limit) {
1481 				vm_pageout_map_deactivate_pages(
1482 				    &p->p_vmspace->vm_map, limit);
1483 			}
1484 		}
1485 	}
1486 }
1487 #endif
1488