xref: /dragonfly/sys/vm/vm_pageout.c (revision 38b5d46c)
1 /*
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * The Mach Operating System project at Carnegie-Mellon University.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
37  *
38  *
39  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
40  * All rights reserved.
41  *
42  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
43  *
44  * Permission to use, copy, modify and distribute this software and
45  * its documentation is hereby granted, provided that both the copyright
46  * notice and this permission notice appear in all copies of the
47  * software, derivative works or modified versions, and any portions
48  * thereof, and that both notices appear in supporting documentation.
49  *
50  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
51  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
52  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
53  *
54  * Carnegie Mellon requests users of this software to return to
55  *
56  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
57  *  School of Computer Science
58  *  Carnegie Mellon University
59  *  Pittsburgh PA 15213-3890
60  *
61  * any improvements or extensions that they make and grant Carnegie the
62  * rights to redistribute these changes.
63  *
64  * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
65  */
66 
67 /*
68  *	The proverbial page-out daemon.
69  */
70 
71 #include "opt_vm.h"
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/kernel.h>
75 #include <sys/proc.h>
76 #include <sys/kthread.h>
77 #include <sys/resourcevar.h>
78 #include <sys/signalvar.h>
79 #include <sys/vnode.h>
80 #include <sys/vmmeter.h>
81 #include <sys/sysctl.h>
82 
83 #include <vm/vm.h>
84 #include <vm/vm_param.h>
85 #include <sys/lock.h>
86 #include <vm/vm_object.h>
87 #include <vm/vm_page.h>
88 #include <vm/vm_map.h>
89 #include <vm/vm_pageout.h>
90 #include <vm/vm_pager.h>
91 #include <vm/swap_pager.h>
92 #include <vm/vm_extern.h>
93 
94 #include <sys/thread2.h>
95 #include <sys/spinlock2.h>
96 #include <vm/vm_page2.h>
97 
98 /*
99  * System initialization
100  */
101 
102 /* the kernel process "vm_pageout" */
103 static int vm_pageout_page(vm_page_t m, int *max_launderp,
104 			   int *vnodes_skippedp, struct vnode **vpfailedp,
105 			   int pass, int vmflush_flags);
106 static int vm_pageout_clean_helper (vm_page_t, int);
107 static int vm_pageout_free_page_calc (vm_size_t count);
108 static void vm_pageout_page_free(vm_page_t m);
109 struct thread *pagethread;
110 
111 #if !defined(NO_SWAPPING)
112 /* the kernel process "vm_daemon" */
113 static void vm_daemon (void);
114 static struct	thread *vmthread;
115 
116 static struct kproc_desc vm_kp = {
117 	"vmdaemon",
118 	vm_daemon,
119 	&vmthread
120 };
121 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
122 #endif
123 
124 int vm_pages_needed = 0;	/* Event on which pageout daemon sleeps */
125 int vm_pageout_deficit = 0;	/* Estimated number of pages deficit */
126 int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
127 int vm_page_free_hysteresis = 16;
128 
129 #if !defined(NO_SWAPPING)
130 static int vm_pageout_req_swapout;
131 static int vm_daemon_needed;
132 #endif
133 static int vm_max_launder = 4096;
134 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
135 static int vm_pageout_full_stats_interval = 0;
136 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
137 static int defer_swap_pageouts=0;
138 static int disable_swap_pageouts=0;
139 static u_int vm_anonmem_decline = ACT_DECLINE;
140 static u_int vm_filemem_decline = ACT_DECLINE * 2;
141 
142 #if defined(NO_SWAPPING)
143 static int vm_swap_enabled=0;
144 static int vm_swap_idle_enabled=0;
145 #else
146 static int vm_swap_enabled=1;
147 static int vm_swap_idle_enabled=0;
148 #endif
149 int vm_pageout_memuse_mode=1;	/* 0-disable, 1-passive, 2-active swp*/
150 
151 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
152 	CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");
153 
154 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
155 	CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");
156 
157 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
158 	CTLFLAG_RW, &vm_page_free_hysteresis, 0,
159 	"Free more pages than the minimum required");
160 
161 SYSCTL_INT(_vm, OID_AUTO, max_launder,
162 	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
163 
164 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
165 	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
166 
167 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
168 	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
169 
170 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
171 	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
172 
173 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
174 	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
175 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
176 	CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
177 
178 #if defined(NO_SWAPPING)
179 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
180 	CTLFLAG_RD, &vm_swap_enabled, 0, "");
181 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
182 	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
183 #else
184 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
185 	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
186 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
187 	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
188 #endif
189 
190 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
191 	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
192 
193 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
194 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
195 
196 static int pageout_lock_miss;
197 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
198 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
199 
200 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
201 
202 #if !defined(NO_SWAPPING)
203 static void vm_req_vmdaemon (void);
204 #endif
205 static void vm_pageout_page_stats(int q);
206 
207 /*
208  * Calculate approximately how many pages on each queue to try to
209  * clean.  An exact calculation creates an edge condition when the
210  * queues are unbalanced so add significant slop.  The queue scans
211  * will stop early when targets are reached and will start where they
212  * left off on the next pass.
213  *
214  * We need to be generous here because there are all sorts of loading
215  * conditions that can cause edge cases if we try to average over all queues.
216  * In particular, storage subsystems have become so fast that paging
217  * activity can become quite frantic.  Eventually we will probably need
218  * two paging threads, one for dirty pages and one for clean, to deal
219  * with the bandwidth requirements.
220  *
221  * So what we do is calculate a value that can be satisfied nominally by
222  * only having to scan half the queues.
223  */
224 static __inline int
225 PQAVERAGE(int n)
226 {
227 	int avg;
228 
229 	if (n >= 0) {
230 		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
231 	} else {
232 		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
233 	}
234 	return avg;
235 }
236 
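/*
 * Worked example, assuming purely for illustration that PQ_L2_SIZE is 256:
 * a shortage of n = 1000 pages gives
 * PQAVERAGE(1000) = (1000 + 255) / 128 + 1 = 10 pages per queue.
 * All 256 queues together could satisfy ~2560 pages, so hitting the
 * per-queue target on roughly half the queues (128 * 10 = 1280) already
 * covers the original shortage of 1000, which is exactly the slop the
 * comment above describes.
 */
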
237 /*
238  * vm_pageout_clean_helper:
239  *
240  * Clean the page and remove it from the laundry.  The page must be busied
241  * by the caller and is disposed of by this function.
242  *
243  * Keeping the page busy causes potential page faults on this page to
244  * block.  Note the careful timing, however: the busy bit isn't set until
245  * late and we cannot do anything that will mess with the page until then.
246  */
247 static int
248 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
249 {
250 	vm_object_t object;
251 	vm_page_t mc[BLIST_MAX_ALLOC];
252 	int error;
253 	int ib, is, page_base;
254 	vm_pindex_t pindex = m->pindex;
255 
256 	object = m->object;
257 
258 	/*
259 	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
260 	 * with the new swapper, but we could have serious problems paging
261 	 * out other object types if there is insufficient memory.
262 	 *
263 	 * Unfortunately, checking free memory here is far too late, so the
264 	 * check has been moved up a procedural level.
265 	 */
266 
267 	/*
268 	 * Don't mess with the page if it's busy, held, or special
269 	 *
270 	 * XXX do we really need to check hold_count here?  hold_count
271 	 * isn't supposed to mess with vm_page ops except prevent the
272 	 * page from being reused.
273 	 */
274 	if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) {
275 		vm_page_wakeup(m);
276 		return 0;
277 	}
278 
279 	/*
280 	 * Place page in cluster.  Align cluster for optimal swap space
281 	 * allocation (whether it is swap or not).  This is typically ~16-32
282 	 * pages, which also tends to align the cluster to multiples of the
283 	 * filesystem block size if backed by a filesystem.
284 	 */
285 	page_base = pindex % BLIST_MAX_ALLOC;
286 	mc[page_base] = m;
287 	ib = page_base - 1;
288 	is = page_base + 1;
289 
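	/*
	 * Worked example of the alignment above, assuming for illustration
	 * that BLIST_MAX_ALLOC is 32: for pindex = 100, page_base = 100 % 32
	 * = 4, so m lands in mc[4].  The reverse scan (ib) can then pull in
	 * pindex 99..96 (mc[3]..mc[0]) and the forward scan (is) pindex
	 * 101..127 (mc[5]..mc[31]), so the cluster window covers pindex
	 * 96..127, a naturally aligned 32-page block.
	 */
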
290 	/*
291 	 * Scan object for clusterable pages.
292 	 *
293 	 * We can cluster ONLY if the page is NOT
294 	 * clean, wired, busy, held, or mapped into a
295 	 * buffer, and one of the following holds:
296 	 * 1) The page is inactive, or a seldom-used
297 	 *    active page.
298 	 * -or-
299 	 * 2) We force the issue.
300 	 *
301 	 * During heavy mmap/modification loads the pageout
302 	 * daemon can really fragment the underlying file
303 	 * due to flushing pages out of order and not trying to
304 	 * align the clusters (which leaves sporadic out-of-order
305 	 * holes).  To solve this problem we do the reverse scan
306 	 * first and attempt to align our cluster, then do a
307 	 * forward scan if room remains.
308 	 */
309 	vm_object_hold(object);
310 
311 	while (ib >= 0) {
312 		vm_page_t p;
313 
314 		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
315 					    TRUE, &error);
316 		if (error || p == NULL)
317 			break;
318 		if ((p->queue - p->pc) == PQ_CACHE ||
319 		    (p->flags & PG_UNMANAGED)) {
320 			vm_page_wakeup(p);
321 			break;
322 		}
323 		vm_page_test_dirty(p);
324 		if (((p->dirty & p->valid) == 0 &&
325 		     (p->flags & PG_NEED_COMMIT) == 0) ||
326 		    p->wire_count != 0 ||	/* may be held by buf cache */
327 		    p->hold_count != 0) {	/* may be undergoing I/O */
328 			vm_page_wakeup(p);
329 			break;
330 		}
331 		if (p->queue - p->pc != PQ_INACTIVE) {
332 			if (p->queue - p->pc != PQ_ACTIVE ||
333 			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
334 				vm_page_wakeup(p);
335 				break;
336 			}
337 		}
338 
339 		/*
340 		 * Try to maintain page groupings in the cluster.
341 		 */
342 		if (m->flags & PG_WINATCFLS)
343 			vm_page_flag_set(p, PG_WINATCFLS);
344 		else
345 			vm_page_flag_clear(p, PG_WINATCFLS);
346 		p->act_count = m->act_count;
347 
348 		mc[ib] = p;
349 		--ib;
350 	}
351 	++ib;	/* fixup */
352 
353 	while (is < BLIST_MAX_ALLOC &&
354 	       pindex - page_base + is < object->size) {
355 		vm_page_t p;
356 
357 		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
358 					    TRUE, &error);
359 		if (error || p == NULL)
360 			break;
361 		if (((p->queue - p->pc) == PQ_CACHE) ||
362 		    (p->flags & PG_UNMANAGED)) {
363 			vm_page_wakeup(p);
364 			break;
365 		}
366 		vm_page_test_dirty(p);
367 		if (((p->dirty & p->valid) == 0 &&
368 		     (p->flags & PG_NEED_COMMIT) == 0) ||
369 		    p->wire_count != 0 ||	/* may be held by buf cache */
370 		    p->hold_count != 0) {	/* may be undergoing I/O */
371 			vm_page_wakeup(p);
372 			break;
373 		}
374 		if (p->queue - p->pc != PQ_INACTIVE) {
375 			if (p->queue - p->pc != PQ_ACTIVE ||
376 			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
377 				vm_page_wakeup(p);
378 				break;
379 			}
380 		}
381 
382 		/*
383 		 * Try to maintain page groupings in the cluster.
384 		 */
385 		if (m->flags & PG_WINATCFLS)
386 			vm_page_flag_set(p, PG_WINATCFLS);
387 		else
388 			vm_page_flag_clear(p, PG_WINATCFLS);
389 		p->act_count = m->act_count;
390 
391 		mc[is] = p;
392 		++is;
393 	}
394 
395 	vm_object_drop(object);
396 
397 	/*
398 	 * we allow reads during pageouts...
399 	 */
400 	return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
401 }
402 
403 /*
404  * vm_pageout_flush() - launder the given pages
405  *
406  *	The given pages are laundered.  Note that we set up for the start of
407  *	I/O (i.e. busy the page), mark it read-only, and bump the object's
408  *	paging-in-progress count all in here rather than in the parent.  If
409  *	we want the parent to do more sophisticated things we may have to change
410  *	the ordering.
411  *
412  *	The pages in the array must be busied by the caller and will be
413  *	unbusied by this function.
414  */
415 int
416 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
417 {
418 	vm_object_t object;
419 	int pageout_status[count];
420 	int numpagedout = 0;
421 	int i;
422 
423 	/*
424 	 * Initiate I/O.  Bump the vm_page_t->busy counter.
425 	 */
426 	for (i = 0; i < count; i++) {
427 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
428 			("vm_pageout_flush page %p index %d/%d: partially "
429 			 "invalid page", mc[i], i, count));
430 		vm_page_io_start(mc[i]);
431 	}
432 
433 	/*
434 	 * We must make the pages read-only.  This will also force the
435 	 * modified bit in the related pmaps to be cleared.  The pager
436 	 * cannot clear the bit for us since the I/O completion code
437 	 * typically runs from an interrupt.  The act of making the page
438 	 * read-only handles the case for us.
439 	 *
440 	 * Then we can unbusy the pages, we still hold a reference by virtue
441 	 * of our soft-busy.
442 	 */
443 	for (i = 0; i < count; i++) {
444 		if (vmflush_flags & VM_PAGER_TRY_TO_CACHE)
445 			vm_page_protect(mc[i], VM_PROT_NONE);
446 		else
447 			vm_page_protect(mc[i], VM_PROT_READ);
448 		vm_page_wakeup(mc[i]);
449 	}
450 
451 	object = mc[0]->object;
452 	vm_object_pip_add(object, count);
453 
454 	vm_pager_put_pages(object, mc, count,
455 	    (vmflush_flags |
456 	     ((object == &kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
457 	    pageout_status);
458 
459 	for (i = 0; i < count; i++) {
460 		vm_page_t mt = mc[i];
461 
462 		switch (pageout_status[i]) {
463 		case VM_PAGER_OK:
464 			numpagedout++;
465 			break;
466 		case VM_PAGER_PEND:
467 			numpagedout++;
468 			break;
469 		case VM_PAGER_BAD:
470 			/*
471 			 * Page outside of range of object. Right now we
472 			 * essentially lose the changes by pretending it
473 			 * worked.
474 			 */
475 			vm_page_busy_wait(mt, FALSE, "pgbad");
476 			pmap_clear_modify(mt);
477 			vm_page_undirty(mt);
478 			vm_page_wakeup(mt);
479 			break;
480 		case VM_PAGER_ERROR:
481 		case VM_PAGER_FAIL:
482 			/*
483 			 * A page typically cannot be paged out when we
484 			 * have run out of swap.  We leave the page
485 			 * marked inactive and will try to page it out
486 			 * again later.
487 			 *
488 			 * Starvation of the active page list is used to
489 			 * determine when the system is massively memory
490 			 * starved.
491 			 */
492 			break;
493 		case VM_PAGER_AGAIN:
494 			break;
495 		}
496 
497 		/*
498 		 * If not PENDing this was a synchronous operation and we
499 		 * clean up after the I/O.  If it is PENDing the mess is
500 		 * cleaned up asynchronously.
501 		 *
502 		 * Also nominally act on the caller's wishes if the caller
503 		 * wants to try to really clean (cache or free) the page.
504 		 *
505 		 * Also nominally deactivate the page if the system is
506 		 * memory-stressed.
507 		 */
508 		if (pageout_status[i] != VM_PAGER_PEND) {
509 			vm_page_busy_wait(mt, FALSE, "pgouw");
510 			vm_page_io_finish(mt);
511 			if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) {
512 				vm_page_try_to_cache(mt);
513 			} else if (vm_page_count_severe()) {
514 				vm_page_deactivate(mt);
515 				vm_page_wakeup(mt);
516 			} else {
517 				vm_page_wakeup(mt);
518 			}
519 			vm_object_pip_wakeup(object);
520 		}
521 	}
522 	return numpagedout;
523 }
524 
525 #if !defined(NO_SWAPPING)
526 
527 /*
528  * Callback function, page busied for us.  We must dispose of the busy
529  * condition.  Any related pmap pages may be held but will not be locked.
530  */
531 static
532 int
533 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
534 			vm_page_t p)
535 {
536 	int actcount;
537 	int cleanit = 0;
538 
539 	/*
540 	 * Basic tests - There should never be a marker, and we can stop
541 	 *		 once the RSS is below the required level.
542 	 */
543 	KKASSERT((p->flags & PG_MARKER) == 0);
544 	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
545 		vm_page_wakeup(p);
546 		return(-1);
547 	}
548 
549 	mycpu->gd_cnt.v_pdpages++;
550 
551 	if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) {
552 		vm_page_wakeup(p);
553 		goto done;
554 	}
555 
556 	++info->actioncount;
557 
558 	/*
559 	 * Check if the page has been referenced recently.  If it has,
560 	 * activate it and skip.
561 	 */
562 	actcount = pmap_ts_referenced(p);
563 	if (actcount) {
564 		vm_page_flag_set(p, PG_REFERENCED);
565 	} else if (p->flags & PG_REFERENCED) {
566 		actcount = 1;
567 	}
568 
569 	if (actcount) {
570 		if (p->queue - p->pc != PQ_ACTIVE) {
571 			vm_page_and_queue_spin_lock(p);
572 			if (p->queue - p->pc != PQ_ACTIVE) {
573 				vm_page_and_queue_spin_unlock(p);
574 				vm_page_activate(p);
575 			} else {
576 				vm_page_and_queue_spin_unlock(p);
577 			}
578 		} else {
579 			p->act_count += actcount;
580 			if (p->act_count > ACT_MAX)
581 				p->act_count = ACT_MAX;
582 		}
583 		vm_page_flag_clear(p, PG_REFERENCED);
584 		vm_page_wakeup(p);
585 		goto done;
586 	}
587 
588 	/*
589 	 * Remove the page from this particular pmap.  Once we do this, our
590 	 * pmap scans will not see it again (unless it gets faulted in), so
591 	 * we must actively dispose of or deal with the page.
592 	 */
593 	pmap_remove_specific(info->pmap, p);
594 
595 	/*
596 	 * If the page is not mapped to another process (i.e. as would be
597 	 * typical if this were a shared page from a library) then deactivate
598 	 * the page and clean it in two passes only.
599 	 *
600 	 * If the page hasn't been referenced since the last check, remove it
601 	 * from the pmap.  If it is no longer mapped, deactivate it
602 	 * immediately, accelerating the normal decline.
603 	 *
604 	 * Once the page has been removed from the pmap the RSS code no
605 	 * longer tracks it so we have to make sure that it is staged for
606 	 * potential flush action.
607 	 */
608 	if ((p->flags & PG_MAPPED) == 0) {
609 		if (p->queue - p->pc == PQ_ACTIVE) {
610 			vm_page_deactivate(p);
611 		}
612 		if (p->queue - p->pc == PQ_INACTIVE) {
613 			cleanit = 1;
614 		}
615 	}
616 
617 	/*
618 	 * Ok, try to fully clean the page and any nearby pages such that at
619 	 * least the requested page is freed or moved to the cache queue.
620 	 *
621 	 * We usually do this synchronously to allow us to get the page into
622 	 * the CACHE queue quickly, which will prevent memory exhaustion if
623 	 * a process with a memoryuse limit is running away.  However, the
624 	 * sysadmin may desire to set vm.swap_user_async which relaxes this
625 	 * and improves write performance.
626 	 */
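	/*
	 * For example (assuming the vm.swap_user_async sysctl mentioned
	 * above is available on this system), a sysadmin who prefers write
	 * throughput over strict RSS enforcement could run:
	 *
	 *	sysctl vm.swap_user_async=1
	 *
	 * which leaves VM_PAGER_PUT_SYNC clear below and makes the flush
	 * asynchronous.
	 */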
627 	if (cleanit) {
628 		int max_launder = 0x7FFF;
629 		int vnodes_skipped = 0;
630 		int vmflush_flags;
631 		struct vnode *vpfailed = NULL;
632 
633 		info->offset = va;
634 
635 		if (vm_pageout_memuse_mode >= 2) {
636 			vmflush_flags = VM_PAGER_TRY_TO_CACHE |
637 					VM_PAGER_ALLOW_ACTIVE;
638 			if (swap_user_async == 0)
639 				vmflush_flags |= VM_PAGER_PUT_SYNC;
640 			vm_page_flag_set(p, PG_WINATCFLS);
641 			info->cleancount +=
642 				vm_pageout_page(p, &max_launder,
643 						&vnodes_skipped,
644 						&vpfailed, 1, vmflush_flags);
645 		} else {
646 			vm_page_wakeup(p);
647 			++info->cleancount;
648 		}
649 	} else {
650 		vm_page_wakeup(p);
651 	}
652 done:
653 	lwkt_user_yield();
654 	return 0;
655 }
656 
657 /*
658  * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
659  * which is relatively difficult to do.  We try to keep track of where we
660  * left off last time to reduce scan overhead.
661  *
662  * Called when vm_pageout_memuse_mode is >= 1.
663  */
664 void
665 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
666 {
667 	vm_offset_t pgout_offset;
668 	struct pmap_pgscan_info info;
669 	int retries = 3;
670 
671 	pgout_offset = map->pgout_offset;
672 again:
673 #if 0
674 	kprintf("%016jx ", pgout_offset);
675 #endif
676 	if (pgout_offset < VM_MIN_USER_ADDRESS)
677 		pgout_offset = VM_MIN_USER_ADDRESS;
678 	if (pgout_offset >= VM_MAX_USER_ADDRESS)
679 		pgout_offset = 0;
680 	info.pmap = vm_map_pmap(map);
681 	info.limit = limit;
682 	info.beg_addr = pgout_offset;
683 	info.end_addr = VM_MAX_USER_ADDRESS;
684 	info.callback = vm_pageout_mdp_callback;
685 	info.cleancount = 0;
686 	info.actioncount = 0;
687 	info.busycount = 0;
688 
689 	pmap_pgscan(&info);
690 	pgout_offset = info.offset;
691 #if 0
692 	kprintf("%016jx %08lx %08lx\n", pgout_offset,
693 		info.cleancount, info.actioncount);
694 #endif
695 
696 	if (pgout_offset != VM_MAX_USER_ADDRESS &&
697 	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
698 		goto again;
699 	} else if (retries &&
700 		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
701 		--retries;
702 		goto again;
703 	}
704 	map->pgout_offset = pgout_offset;
705 }
706 #endif
707 
708 /*
709  * Called when the pageout scan wants to free a page.  We no longer
710  * try to cycle the vm_object here with a reference & dealloc, which can
711  * cause a non-trivial object collapse in a critical path.
712  *
713  * It is unclear why we cycled the ref_count in the past, perhaps to try
714  * to optimize shadow chain collapses but I don't quite see why it would
715  * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
716  * synchronously and not have to be kick-started.
717  */
718 static void
719 vm_pageout_page_free(vm_page_t m)
720 {
721 	vm_page_protect(m, VM_PROT_NONE);
722 	vm_page_free(m);
723 }
724 
725 /*
726  * vm_pageout_scan does the dirty work for the pageout daemon.
727  */
728 struct vm_pageout_scan_info {
729 	struct proc *bigproc;
730 	vm_offset_t bigsize;
731 };
732 
733 static int vm_pageout_scan_callback(struct proc *p, void *data);
734 
735 static int
736 vm_pageout_scan_inactive(int pass, int q, int avail_shortage,
737 			 int *vnodes_skipped)
738 {
739 	vm_page_t m;
740 	struct vm_page marker;
741 	struct vnode *vpfailed;		/* warning, allowed to be stale */
742 	int maxscan;
743 	int delta = 0;
744 	int max_launder;
745 
746 	/*
747 	 * Start scanning the inactive queue for pages we can move to the
748 	 * cache or free.  The scan will stop when the target is reached or
749 	 * we have scanned the entire inactive queue.  Note that m->act_count
750 	 * is not used to form decisions for the inactive queue, only for the
751 	 * active queue.
752 	 *
753 	 * max_launder limits the number of dirty pages we flush per scan.
754 	 * For most systems a smaller value (16 or 32) is more robust under
755 	 * extreme memory and disk pressure because any unnecessary writes
756 	 * to disk can result in extreme performance degradation.  However,
757 	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
758 	 * used) will die horribly with limited laundering.  If the pageout
759 	 * daemon cannot clean enough pages in the first pass, we let it go
760 	 * all out in succeeding passes.
761 	 */
762 	if ((max_launder = vm_max_launder) <= 1)
763 		max_launder = 1;
764 	if (pass)
765 		max_launder = 10000;
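
	/*
	 * Tuning note: vm_max_launder is exported above as the
	 * vm.max_launder sysctl.  As a hypothetical example, an
	 * administrator seeing excessive pageout writes under memory
	 * pressure might lower it (e.g. "sysctl vm.max_launder=32"),
	 * trading slower laundering on the first pass for less disk
	 * traffic; later passes still go all out as set just above.
	 */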
766 
767 	/*
768 	 * Initialize our marker
769 	 */
770 	bzero(&marker, sizeof(marker));
771 	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
772 	marker.queue = PQ_INACTIVE + q;
773 	marker.pc = q;
774 	marker.wire_count = 1;
775 
776 	/*
777 	 * Inactive queue scan.
778 	 *
779 	 * NOTE: The vm_page must be spinlocked before the queue to avoid
780 	 *	 deadlocks, so it is easiest to simply iterate the loop
781 	 *	 with the queue unlocked at the top.
782 	 */
783 	vpfailed = NULL;
784 
785 	vm_page_queues_spin_lock(PQ_INACTIVE + q);
786 	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
787 	maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt;
788 
789 	/*
790 	 * Queue locked at top of loop to avoid stack marker issues.
791 	 */
792 	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
793 	       maxscan-- > 0 && avail_shortage - delta > 0)
794 	{
795 		int count;
796 
797 		KKASSERT(m->queue == PQ_INACTIVE + q);
798 		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
799 			     &marker, pageq);
800 		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
801 				   &marker, pageq);
802 		mycpu->gd_cnt.v_pdpages++;
803 
804 		/*
805 		 * Skip marker pages (atomic against other markers to avoid
806 		 * infinite hop-over scans).
807 		 */
808 		if (m->flags & PG_MARKER)
809 			continue;
810 
811 		/*
812 		 * Try to busy the page.  Don't mess with pages which are
813 		 * already busy or reorder them in the queue.
814 		 */
815 		if (vm_page_busy_try(m, TRUE))
816 			continue;
817 
818 		/*
819 		 * Remaining operations run with the page busy and neither
820 		 * the page nor the queue will be spin-locked.
821 		 */
822 		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
823 		KKASSERT(m->queue == PQ_INACTIVE + q);
824 
825 		count = vm_pageout_page(m, &max_launder, vnodes_skipped,
826 					&vpfailed, pass, 0);
827 		delta += count;
828 
829 		/*
830 		 * Systems with a ton of memory can wind up with huge
831 		 * deactivation counts.  Because the inactive scan is
832 		 * doing a lot of flushing, the combination can result
833 		 * in excessive paging even in situations where other
834 		 * unrelated threads free up sufficient VM.
835 		 *
836 		 * To deal with this we abort the nominal active->inactive
837 		 * scan before we hit the inactive target when free+cache
838 		 * levels have reached a reasonable target.
839 		 *
840 		 * When deciding to stop early we need to add some slop to
841 		 * the test and we need to return full completion to the caller
842 		 * to prevent the caller from thinking there is something
843 		 * wrong and issuing a low-memory+swap warning or pkill.
844 		 *
845 		 * A deficit forces paging regardless of the state of the
846 		 * VM page queues (used for RSS enforcement).
847 		 */
848 		lwkt_yield();
849 		vm_page_queues_spin_lock(PQ_INACTIVE + q);
850 		if (vm_paging_target() < -vm_max_launder) {
851 			/*
852 			 * Stopping early, return full completion to caller.
853 			 */
854 			if (delta < avail_shortage)
855 				delta = avail_shortage;
856 			break;
857 		}
858 	}
859 
860 	/* page queue still spin-locked */
861 	TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
862 	vm_page_queues_spin_unlock(PQ_INACTIVE + q);
863 
864 	return (delta);
865 }
866 
867 /*
868  * Pageout the specified page, return the total number of pages paged out
869  * (this routine may cluster).
870  *
871  * The page must be busied and soft-busied by the caller and will be disposed
872  * of by this function.
873  */
874 static int
875 vm_pageout_page(vm_page_t m, int *max_launderp, int *vnodes_skippedp,
876 		struct vnode **vpfailedp, int pass, int vmflush_flags)
877 {
878 	vm_object_t object;
879 	int actcount;
880 	int count = 0;
881 
882 	/*
883 	 * It is possible for a page to be busied ad-hoc (e.g. the
884 	 * pmap_collect() code) and wired and race against the
885 	 * allocation of a new page.  vm_page_alloc() may be forced
886 	 * to deactivate the wired page in which case it winds up
887 	 * on the inactive queue and must be handled here.  We
888 	 * correct the problem simply by unqueuing the page.
889 	 */
890 	if (m->wire_count) {
891 		vm_page_unqueue_nowakeup(m);
892 		vm_page_wakeup(m);
893 		kprintf("WARNING: pagedaemon: wired page on "
894 			"inactive queue %p\n", m);
895 		return 0;
896 	}
897 
898 	/*
899 	 * A held page may be undergoing I/O, so skip it.
900 	 */
901 	if (m->hold_count) {
902 		vm_page_and_queue_spin_lock(m);
903 		if (m->queue - m->pc == PQ_INACTIVE) {
904 			TAILQ_REMOVE(
905 				&vm_page_queues[m->queue].pl, m, pageq);
906 			TAILQ_INSERT_TAIL(
907 				&vm_page_queues[m->queue].pl, m, pageq);
908 			++vm_swapcache_inactive_heuristic;
909 		}
910 		vm_page_and_queue_spin_unlock(m);
911 		vm_page_wakeup(m);
912 		return 0;
913 	}
914 
915 	if (m->object == NULL || m->object->ref_count == 0) {
916 		/*
917 		 * If the object is not being used, we ignore previous
918 		 * references.
919 		 */
920 		vm_page_flag_clear(m, PG_REFERENCED);
921 		pmap_clear_reference(m);
922 		/* fall through to end */
923 	} else if (((m->flags & PG_REFERENCED) == 0) &&
924 		    (actcount = pmap_ts_referenced(m))) {
925 		/*
926 		 * Otherwise, if the page has been referenced while
927 		 * in the inactive queue, we bump the "activation
928 		 * count" upwards, making it less likely that the
929 		 * page will be added back to the inactive queue
930 		 * prematurely again.  Here we check the page tables
931 		 * (or emulated bits, if any), given that the upper level
932 		 * VM system does not know anything about existing
933 		 * references.
934 		 */
935 		vm_page_activate(m);
936 		m->act_count += (actcount + ACT_ADVANCE);
937 		vm_page_wakeup(m);
938 		return 0;
939 	}
940 
941 	/*
942 	 * (m) is still busied.
943 	 *
944 	 * If the upper level VM system knows about any page
945 	 * references, we activate the page.  We also set the
946 	 * "activation count" higher than normal so that we are less
947 	 * likely to place pages back onto the inactive queue again.
948 	 */
949 	if ((m->flags & PG_REFERENCED) != 0) {
950 		vm_page_flag_clear(m, PG_REFERENCED);
951 		actcount = pmap_ts_referenced(m);
952 		vm_page_activate(m);
953 		m->act_count += (actcount + ACT_ADVANCE + 1);
954 		vm_page_wakeup(m);
955 		return 0;
956 	}
957 
958 	/*
959 	 * If the upper level VM system doesn't know anything about
960 	 * the page being dirty, we have to check for it again.  As
961 	 * far as the VM code knows, any partially dirty pages are
962 	 * fully dirty.
963 	 *
964 	 * Pages marked PG_WRITEABLE may be mapped into the user
965 	 * address space of a process running on another cpu.  A
966 	 * user process (without holding the MP lock) running on
967 	 * another cpu may be able to touch the page while we are
968 	 * trying to remove it.  vm_page_cache() will handle this
969 	 * case for us.
970 	 */
971 	if (m->dirty == 0) {
972 		vm_page_test_dirty(m);
973 	} else {
974 		vm_page_dirty(m);
975 	}
976 
977 	if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
978 		/*
979 		 * Invalid pages can be easily freed
980 		 */
981 		vm_pageout_page_free(m);
982 		mycpu->gd_cnt.v_dfree++;
983 		++count;
984 	} else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
985 		/*
986 		 * Clean pages can be placed onto the cache queue.
987 		 * This effectively frees them.
988 		 */
989 		vm_page_cache(m);
990 		++count;
991 	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
992 		/*
993 		 * Dirty pages need to be paged out, but flushing
994 		 * a page is extremely expensive versus freeing
995 		 * a clean page.  Rather than artificially limiting
996 		 * the number of pages we can flush, we instead give
997 		 * dirty pages extra priority on the inactive queue
998 		 * by forcing them to be cycled through the queue
999 		 * twice before being flushed, after which the
1000 		 * (now clean) page will cycle through once more
1001 		 * before being freed.  This significantly extends
1002 		 * the thrash point for a heavily loaded machine.
1003 		 */
1004 		vm_page_flag_set(m, PG_WINATCFLS);
1005 		vm_page_and_queue_spin_lock(m);
1006 		if (m->queue - m->pc == PQ_INACTIVE) {
1007 			TAILQ_REMOVE(
1008 				&vm_page_queues[m->queue].pl, m, pageq);
1009 			TAILQ_INSERT_TAIL(
1010 				&vm_page_queues[m->queue].pl, m, pageq);
1011 			++vm_swapcache_inactive_heuristic;
1012 		}
1013 		vm_page_and_queue_spin_unlock(m);
1014 		vm_page_wakeup(m);
1015 	} else if (*max_launderp > 0) {
1016 		/*
1017 		 * We always want to try to flush some dirty pages if
1018 		 * we encounter them, to keep the system stable.
1019 		 * Normally this number is small, but under extreme
1020 		 * pressure where there are insufficient clean pages
1021 		 * on the inactive queue, we may have to go all out.
1022 		 */
1023 		int swap_pageouts_ok;
1024 		struct vnode *vp = NULL;
1025 
1026 		swap_pageouts_ok = 0;
1027 		object = m->object;
1028 		if (object &&
1029 		    (object->type != OBJT_SWAP) &&
1030 		    (object->type != OBJT_DEFAULT)) {
1031 			swap_pageouts_ok = 1;
1032 		} else {
1033 			swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
1034 			swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
1035 			vm_page_count_min(0));
1036 		}
1037 
1038 		/*
1039 		 * We don't bother paging objects that are "dead".
1040 		 * Those objects are in a "rundown" state.
1041 		 */
1042 		if (!swap_pageouts_ok ||
1043 		    (object == NULL) ||
1044 		    (object->flags & OBJ_DEAD)) {
1045 			vm_page_and_queue_spin_lock(m);
1046 			if (m->queue - m->pc == PQ_INACTIVE) {
1047 				TAILQ_REMOVE(
1048 				    &vm_page_queues[m->queue].pl,
1049 				    m, pageq);
1050 				TAILQ_INSERT_TAIL(
1051 				    &vm_page_queues[m->queue].pl,
1052 				    m, pageq);
1053 				++vm_swapcache_inactive_heuristic;
1054 			}
1055 			vm_page_and_queue_spin_unlock(m);
1056 			vm_page_wakeup(m);
1057 			return 0;
1058 		}
1059 
1060 		/*
1061 		 * (m) is still busied.
1062 		 *
1063 		 * The object is already known NOT to be dead.   It
1064 		 * is possible for the vget() to block the whole
1065 		 * pageout daemon, but the new low-memory handling
1066 		 * code should prevent it.
1067 		 *
1068 		 * The previous code skipped locked vnodes and, worse,
1069 		 * reordered pages in the queue.  This results in
1070 		 * completely non-deterministic operation because,
1071 		 * quite often, a vm_fault has initiated an I/O and
1072 		 * is holding a locked vnode at just the point where
1073 		 * the pageout daemon is woken up.
1074 		 *
1075 		 * We can't wait forever for the vnode lock, we might
1076 		 * deadlock due to a vn_read() getting stuck in
1077 		 * vm_wait while holding this vnode.  We skip the
1078 		 * vnode if we can't get it in a reasonable amount
1079 		 * of time.
1080 		 *
1081 		 * vpfailed is used to (try to) avoid the case where
1082 		 * a large number of pages are associated with a
1083 		 * locked vnode, which could cause the pageout daemon
1084 		 * to stall for an excessive amount of time.
1085 		 */
1086 		if (object->type == OBJT_VNODE) {
1087 			int flags;
1088 
1089 			vp = object->handle;
1090 			flags = LK_EXCLUSIVE;
1091 			if (vp == *vpfailedp)
1092 				flags |= LK_NOWAIT;
1093 			else
1094 				flags |= LK_TIMELOCK;
1095 			vm_page_hold(m);
1096 			vm_page_wakeup(m);
1097 
1098 			/*
1099 			 * We have unbusied (m) temporarily so we can
1100 			 * acquire the vp lock without deadlocking.
1101 			 * (m) is held to prevent destruction.
1102 			 */
1103 			if (vget(vp, flags) != 0) {
1104 				*vpfailedp = vp;
1105 				++pageout_lock_miss;
1106 				if (object->flags & OBJ_MIGHTBEDIRTY)
1107 					    ++*vnodes_skippedp;
1108 				vm_page_unhold(m);
1109 				return 0;
1110 			}
1111 
1112 			/*
1113 			 * The page might have been moved to another
1114 			 * queue during potential blocking in vget()
1115 			 * above.  The page might have been freed and
1116 			 * reused for another vnode.  The object might
1117 			 * have been reused for another vnode.
1118 			 */
1119 			if (m->queue - m->pc != PQ_INACTIVE ||
1120 			    m->object != object ||
1121 			    object->handle != vp) {
1122 				if (object->flags & OBJ_MIGHTBEDIRTY)
1123 					++*vnodes_skippedp;
1124 				vput(vp);
1125 				vm_page_unhold(m);
1126 				return 0;
1127 			}
1128 
1129 			/*
1130 			 * The page may have been busied while we were
1131 			 * blocked in vget() above.  We don't move the
1132 			 * page back onto the end of the queue, so that
1133 			 * the statistics remain more accurate.
1134 			 */
1135 			if (vm_page_busy_try(m, TRUE)) {
1136 				vput(vp);
1137 				vm_page_unhold(m);
1138 				return 0;
1139 			}
1140 			vm_page_unhold(m);
1141 
1142 			/*
1143 			 * (m) is busied again
1144 			 *
1145 			 * We own the busy bit and remove our hold
1146 			 * bit.  If the page is still held it
1147 			 * might be undergoing I/O, so skip it.
1148 			 */
1149 			if (m->hold_count) {
1150 				vm_page_and_queue_spin_lock(m);
1151 				if (m->queue - m->pc == PQ_INACTIVE) {
1152 					TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
1153 					TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1154 					++vm_swapcache_inactive_heuristic;
1155 				}
1156 				vm_page_and_queue_spin_unlock(m);
1157 				if (object->flags & OBJ_MIGHTBEDIRTY)
1158 					++*vnodes_skippedp;
1159 				vm_page_wakeup(m);
1160 				vput(vp);
1161 				return 0;
1162 			}
1163 			/* (m) is left busied as we fall through */
1164 		}
1165 
1166 		/*
1167 		 * page is busy and not held here.
1168 		 *
1169 		 * If a page is dirty, then it is either being washed
1170 		 * (but not yet cleaned) or it is still in the
1171 		 * laundry.  If it is still in the laundry, then we
1172 		 * start the cleaning operation.
1173 		 *
1174 		 * decrement inactive_shortage on success to account
1175 		 * for the (future) cleaned page.  Otherwise we
1176 		 * could wind up laundering or cleaning too many
1177 		 * pages.
1178 		 *
1179 		 * NOTE: Cleaning the page here does not cause
1180 		 *	 force_deficit to be adjusted, because the
1181 		 *	 page is not being freed or moved to the
1182 		 *	 cache.
1183 		 */
1184 		count = vm_pageout_clean_helper(m, vmflush_flags);
1185 		*max_launderp -= count;
1186 
1187 		/*
1188 		 * Clean ate busy, page no longer accessible
1189 		 */
1190 		if (vp != NULL)
1191 			vput(vp);
1192 	} else {
1193 		vm_page_wakeup(m);
1194 	}
1195 	return count;
1196 }
1197 
1198 static int
1199 vm_pageout_scan_active(int pass, int q,
1200 		       int avail_shortage, int inactive_shortage,
1201 		       int *recycle_countp)
1202 {
1203 	struct vm_page marker;
1204 	vm_page_t m;
1205 	int actcount;
1206 	int delta = 0;
1207 	int maxscan;
1208 
1209 	/*
1210 	 * We want to move pages from the active queue to the inactive
1211 	 * queue to get the inactive queue to the inactive target.  If
1212 	 * we still have a page shortage from above we try to directly free
1213 	 * clean pages instead of moving them.
1214 	 *
1215 	 * If we do still have a shortage we keep track of the number of
1216 	 * pages we free or cache (recycle_count) as a measure of thrashing
1217 	 * between the active and inactive queues.
1218 	 *
1219 	 * If we were able to completely satisfy the free+cache targets
1220 	 * from the inactive pool we limit the number of pages we move
1221 	 * from the active pool to the inactive pool to 2x the pages we
1222 	 * had removed from the inactive pool (with a minimum of 1/5 the
1223 	 * inactive target).  If we were not able to completely satisfy
1224 	 * the free+cache targets we go for the whole target aggressively.
1225 	 *
1226 	 * NOTE: Both variables can end up negative.
1227 	 * NOTE: We are still in a critical section.
1228 	 */
1229 
1230 	bzero(&marker, sizeof(marker));
1231 	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
1232 	marker.queue = PQ_ACTIVE + q;
1233 	marker.pc = q;
1234 	marker.wire_count = 1;
1235 
1236 	vm_page_queues_spin_lock(PQ_ACTIVE + q);
1237 	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1238 	maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt;
1239 
1240 	/*
1241 	 * Queue locked at top of loop to avoid stack marker issues.
1242 	 */
1243 	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1244 	       maxscan-- > 0 && (avail_shortage - delta > 0 ||
1245 				inactive_shortage > 0))
1246 	{
1247 		KKASSERT(m->queue == PQ_ACTIVE + q);
1248 		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
1249 			     &marker, pageq);
1250 		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1251 				   &marker, pageq);
1252 
1253 		/*
1254 		 * Skip marker pages (atomic against other markers to avoid
1255 		 * infinite hop-over scans).
1256 		 */
1257 		if (m->flags & PG_MARKER)
1258 			continue;
1259 
1260 		/*
1261 		 * Try to busy the page.  Don't mess with pages which are
1262 		 * already busy or reorder them in the queue.
1263 		 */
1264 		if (vm_page_busy_try(m, TRUE))
1265 			continue;
1266 
1267 		/*
1268 		 * Remaining operations run with the page busy and neither
1269 		 * the page nor the queue will be spin-locked.
1270 		 */
1271 		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1272 		KKASSERT(m->queue == PQ_ACTIVE + q);
1273 
1274 		/*
1275 		 * Don't deactivate pages that are held, even if we can
1276 		 * busy them.  (XXX why not?)
1277 		 */
1278 		if (m->hold_count != 0) {
1279 			vm_page_and_queue_spin_lock(m);
1280 			if (m->queue - m->pc == PQ_ACTIVE) {
1281 				TAILQ_REMOVE(
1282 					&vm_page_queues[PQ_ACTIVE + q].pl,
1283 					m, pageq);
1284 				TAILQ_INSERT_TAIL(
1285 					&vm_page_queues[PQ_ACTIVE + q].pl,
1286 					m, pageq);
1287 			}
1288 			vm_page_and_queue_spin_unlock(m);
1289 			vm_page_wakeup(m);
1290 			goto next;
1291 		}
1292 
1293 		/*
1294 		 * The count for pagedaemon pages is done after checking the
1295 		 * page for eligibility...
1296 		 */
1297 		mycpu->gd_cnt.v_pdpages++;
1298 
1299 		/*
1300 		 * Check to see "how much" the page has been used and clear
1301 		 * the tracking access bits.  If the object has no references
1302 		 * don't bother paying the expense.
1303 		 */
1304 		actcount = 0;
1305 		if (m->object && m->object->ref_count != 0) {
1306 			if (m->flags & PG_REFERENCED)
1307 				++actcount;
1308 			actcount += pmap_ts_referenced(m);
1309 			if (actcount) {
1310 				m->act_count += ACT_ADVANCE + actcount;
1311 				if (m->act_count > ACT_MAX)
1312 					m->act_count = ACT_MAX;
1313 			}
1314 		}
1315 		vm_page_flag_clear(m, PG_REFERENCED);
1316 
1317 		/*
1318 		 * actcount is only valid if the object ref_count is non-zero.
1319 		 * If the page does not have an object, actcount will be zero.
1320 		 */
1321 		if (actcount && m->object->ref_count != 0) {
1322 			vm_page_and_queue_spin_lock(m);
1323 			if (m->queue - m->pc == PQ_ACTIVE) {
1324 				TAILQ_REMOVE(
1325 					&vm_page_queues[PQ_ACTIVE + q].pl,
1326 					m, pageq);
1327 				TAILQ_INSERT_TAIL(
1328 					&vm_page_queues[PQ_ACTIVE + q].pl,
1329 					m, pageq);
1330 			}
1331 			vm_page_and_queue_spin_unlock(m);
1332 			vm_page_wakeup(m);
1333 		} else {
1334 			switch(m->object->type) {
1335 			case OBJT_DEFAULT:
1336 			case OBJT_SWAP:
1337 				m->act_count -= min(m->act_count,
1338 						    vm_anonmem_decline);
1339 				break;
1340 			default:
1341 				m->act_count -= min(m->act_count,
1342 						    vm_filemem_decline);
1343 				break;
1344 			}
1345 			if (vm_pageout_algorithm ||
1346 			    (m->object == NULL) ||
1347 			    (m->object && (m->object->ref_count == 0)) ||
1348 			    m->act_count < pass + 1
1349 			) {
1350 				/*
1351 				 * Deactivate the page.  If we had a
1352 				 * shortage from our inactive scan try to
1353 				 * free (cache) the page instead.
1354 				 *
1355 				 * Don't just blindly cache the page if
1356 				 * we do not have a shortage from the
1357 				 * inactive scan, that could lead to
1358 				 * gigabytes being moved.
1359 				 */
1360 				--inactive_shortage;
1361 				if (avail_shortage - delta > 0 ||
1362 				    (m->object && (m->object->ref_count == 0)))
1363 				{
1364 					if (avail_shortage - delta > 0)
1365 						++*recycle_countp;
1366 					vm_page_protect(m, VM_PROT_NONE);
1367 					if (m->dirty == 0 &&
1368 					    (m->flags & PG_NEED_COMMIT) == 0 &&
1369 					    avail_shortage - delta > 0) {
1370 						vm_page_cache(m);
1371 					} else {
1372 						vm_page_deactivate(m);
1373 						vm_page_wakeup(m);
1374 					}
1375 				} else {
1376 					vm_page_deactivate(m);
1377 					vm_page_wakeup(m);
1378 				}
1379 				++delta;
1380 			} else {
1381 				vm_page_and_queue_spin_lock(m);
1382 				if (m->queue - m->pc == PQ_ACTIVE) {
1383 					TAILQ_REMOVE(
1384 					    &vm_page_queues[PQ_ACTIVE + q].pl,
1385 					    m, pageq);
1386 					TAILQ_INSERT_TAIL(
1387 					    &vm_page_queues[PQ_ACTIVE + q].pl,
1388 					    m, pageq);
1389 				}
1390 				vm_page_and_queue_spin_unlock(m);
1391 				vm_page_wakeup(m);
1392 			}
1393 		}
1394 next:
1395 		lwkt_yield();
1396 		vm_page_queues_spin_lock(PQ_ACTIVE + q);
1397 	}
1398 
1399 	/*
1400 	 * Clean out our local marker.
1401 	 *
1402 	 * Page queue still spin-locked.
1403 	 */
1404 	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1405 	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1406 
1407 	return (delta);
1408 }
1409 
1410 /*
1411  * The number of actually free pages can drop down to v_free_reserved;
1412  * we try to build the free count back up above v_free_min.  Note that
1413  * vm_paging_needed() also returns TRUE if v_free_count is not at
1414  * least v_free_min so that is the minimum we must build the free
1415  * count to.
1416  *
1417  * We use a slightly higher target to improve hysteresis,
1418  * ((v_free_target + v_free_min) / 2).  Since v_free_target
1419  * is usually the same as v_cache_min this maintains about
1420  * half the pages in the free queue as are in the cache queue,
1421  * providing pretty good pipelining for pageout operation.
1422  *
1423  * The system operator can manipulate vm.v_cache_min and
1424  * vm.v_free_target to tune the pageout daemon.  Be sure
1425  * to keep vm.v_free_min < vm.v_free_target.
1426  *
1427  * Note that the original paging target is to get at least
1428  * (free_min + cache_min) into (free + cache).  The slightly
1429  * higher target will shift additional pages from cache to free
1430  * without affecting the original paging target in order to
1431  * maintain better hysteresis and not have the free count always
1432  * be dead-on v_free_min.
1433  *
1434  * NOTE: we are still in a critical section.
1435  *
1436  * Pages moved from PQ_CACHE to totally free are not counted in the
1437  * pages_freed counter.
1438  */
1439 static void
1440 vm_pageout_scan_cache(int avail_shortage, int pass,
1441 		      int vnodes_skipped, int recycle_count)
1442 {
1443 	static int lastkillticks;
1444 	struct vm_pageout_scan_info info;
1445 	vm_page_t m;
1446 
1447 	while (vmstats.v_free_count <
1448 	       (vmstats.v_free_min + vmstats.v_free_target) / 2) {
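		/*
		 * Worked example with purely illustrative numbers: if
		 * v_free_min were 3000 pages and v_free_target 12000 pages,
		 * this loop keeps pulling pages from PQ_CACHE onto the free
		 * list until v_free_count reaches (3000 + 12000) / 2 = 7500
		 * pages, i.e. roughly halfway between the minimum and the
		 * target, which is the hysteresis described above.
		 */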
1449 		/*
1450 		 * This steals some code from vm/vm_page.c
1451 		 */
1452 		static int cache_rover = 0;
1453 
1454 		m = vm_page_list_find(PQ_CACHE,
1455 				      cache_rover & PQ_L2_MASK, FALSE);
1456 		if (m == NULL)
1457 			break;
1458 		/* page is returned removed from its queue and spinlocked */
1459 		if (vm_page_busy_try(m, TRUE)) {
1460 			vm_page_deactivate_locked(m);
1461 			vm_page_spin_unlock(m);
1462 			continue;
1463 		}
1464 		vm_page_spin_unlock(m);
1465 		pagedaemon_wakeup();
1466 		lwkt_yield();
1467 
1468 		/*
1469 		 * Remaining operations run with the page busy and neither
1470 		 * the page or the queue will be spin-locked.
1471 		 */
1472 		if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
1473 		    m->hold_count ||
1474 		    m->wire_count) {
1475 			vm_page_deactivate(m);
1476 			vm_page_wakeup(m);
1477 			continue;
1478 		}
1479 		KKASSERT((m->flags & PG_MAPPED) == 0);
1480 		KKASSERT(m->dirty == 0);
1481 		cache_rover += PQ_PRIME2;
1482 		vm_pageout_page_free(m);
1483 		mycpu->gd_cnt.v_dfree++;
1484 	}
1485 
1486 #if !defined(NO_SWAPPING)
1487 	/*
1488 	 * Idle process swapout -- run once per second.
1489 	 */
1490 	if (vm_swap_idle_enabled) {
1491 		static time_t lsec;
1492 		if (time_uptime != lsec) {
1493 			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
1494 			vm_req_vmdaemon();
1495 			lsec = time_uptime;
1496 		}
1497 	}
1498 #endif
1499 
1500 	/*
1501 	 * If we didn't get enough free pages, and we have skipped a vnode
1502 	 * in a writeable object, wakeup the sync daemon.  And kick swapout
1503 	 * if we did not get enough free pages.
1504 	 */
1505 	if (vm_paging_target() > 0) {
1506 		if (vnodes_skipped && vm_page_count_min(0))
1507 			speedup_syncer(NULL);
1508 #if !defined(NO_SWAPPING)
1509 		if (vm_swap_enabled && vm_page_count_target()) {
1510 			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
1511 			vm_req_vmdaemon();
1512 		}
1513 #endif
1514 	}
1515 
1516 	/*
1517 	 * Handle catastrophic conditions.  Under good conditions we should
1518 	 * be at the target, well beyond our minimum.  If we could not even
1519 	 * reach our minimum the system is under heavy stress.  But just being
1520 	 * under heavy stress does not trigger process killing.
1521 	 *
1522 	 * We consider ourselves to have run out of memory if the swap pager
1523 	 * is full and avail_shortage is still positive.  The secondary check
1524 	 * ensures that we do not kill processes if the instantaneous
1525 	 * availability is good, even if the pageout daemon pass says it
1526 	 * couldn't get to the target.
1527 	 */
1528 	if (swap_pager_almost_full &&
1529 	    pass > 0 &&
1530 	    (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
1531 		kprintf("Warning: system low on memory+swap "
1532 			"shortage %d for %d ticks!\n",
1533 			avail_shortage, ticks - swap_fail_ticks);
1534 	}
1535 	if (swap_pager_full &&
1536 	    pass > 1 &&
1537 	    avail_shortage > 0 &&
1538 	    vm_paging_target() > 0 &&
1539 	    (unsigned int)(ticks - lastkillticks) >= hz) {
1540 		/*
1541 		 * Kill something, maximum rate once per second to give
1542 		 * the process time to free up sufficient memory.
1543 		 */
1544 		lastkillticks = ticks;
1545 		info.bigproc = NULL;
1546 		info.bigsize = 0;
1547 		allproc_scan(vm_pageout_scan_callback, &info);
1548 		if (info.bigproc != NULL) {
1549 			info.bigproc->p_nice = PRIO_MIN;
1550 			info.bigproc->p_usched->resetpriority(
1551 				FIRST_LWP_IN_PROC(info.bigproc));
1552 			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
1553 			killproc(info.bigproc, "out of swap space");
1554 			wakeup(&vmstats.v_free_count);
1555 			PRELE(info.bigproc);
1556 		}
1557 	}
1558 }
1559 
1560 static int
1561 vm_pageout_scan_callback(struct proc *p, void *data)
1562 {
1563 	struct vm_pageout_scan_info *info = data;
1564 	vm_offset_t size;
1565 
1566 	/*
1567 	 * Never kill system processes or init.  If we have configured swap
1568 	 * then try to avoid killing low-numbered pids.
1569 	 */
1570 	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
1571 	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
1572 		return (0);
1573 	}
1574 
1575 	lwkt_gettoken(&p->p_token);
1576 
1577 	/*
1578 	 * if the process is in a non-running type state,
1579 	 * don't touch it.
1580 	 */
1581 	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
1582 		lwkt_reltoken(&p->p_token);
1583 		return (0);
1584 	}
1585 
1586 	/*
1587 	 * Get the approximate process size.  Note that anonymous pages
1588 	 * with backing swap will be counted twice, but there should not
1589 	 * be too many such pages due to the stress the VM system is
1590 	 * under at this point.
1591 	 */
1592 	size = vmspace_anonymous_count(p->p_vmspace) +
1593 		vmspace_swap_count(p->p_vmspace);
1594 
1595 	/*
1596 	 * If this process is bigger than the biggest one so far,
1597 	 * remember it.
1598 	 */
1599 	if (info->bigsize < size) {
1600 		if (info->bigproc)
1601 			PRELE(info->bigproc);
1602 		PHOLD(p);
1603 		info->bigproc = p;
1604 		info->bigsize = size;
1605 	}
1606 	lwkt_reltoken(&p->p_token);
1607 	lwkt_yield();
1608 
1609 	return(0);
1610 }
1611 
1612 /*
1613  * This routine tries to maintain the pseudo-LRU active queue,
1614  * so that some statistics accumulation still occurs during long
1615  * periods of time when there is no paging.  This code helps the
1616  * situation when paging just starts to occur.
1617  */
1618 static void
1619 vm_pageout_page_stats(int q)
1620 {
1621 	static int fullintervalcount = 0;
1622 	struct vm_page marker;
1623 	vm_page_t m;
1624 	int pcount, tpcount;		/* Number of pages to check */
1625 	int page_shortage;
1626 
1627 	page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
1628 			 vmstats.v_free_min) -
1629 			(vmstats.v_free_count + vmstats.v_inactive_count +
1630 			 vmstats.v_cache_count);
1631 
1632 	if (page_shortage <= 0)
1633 		return;
1634 
1635 	pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
1636 	fullintervalcount += vm_pageout_stats_interval;
1637 	if (fullintervalcount < vm_pageout_full_stats_interval) {
1638 		tpcount = (vm_pageout_stats_max * pcount) /
1639 			  vmstats.v_page_count + 1;
1640 		if (pcount > tpcount)
1641 			pcount = tpcount;
1642 	} else {
1643 		fullintervalcount = 0;
1644 	}
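
	/*
	 * Illustration with made-up numbers: if vm_pageout_stats_max were
	 * 2000, the machine had 1,000,000 pages total and 300,000 on this
	 * active queue, a partial pass would scan only
	 * tpcount = 2000 * 300000 / 1000000 + 1 = 601 pages; the full scan
	 * of the whole queue happens only once fullintervalcount reaches
	 * vm_pageout_full_stats_interval.
	 */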
1645 
1646 	bzero(&marker, sizeof(marker));
1647 	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
1648 	marker.queue = PQ_ACTIVE + q;
1649 	marker.pc = q;
1650 	marker.wire_count = 1;
1651 
1652 	vm_page_queues_spin_lock(PQ_ACTIVE + q);
1653 	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1654 
1655 	/*
1656 	 * Queue locked at top of loop to avoid stack marker issues.
1657 	 */
1658 	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
1659 	       pcount-- > 0)
1660 	{
1661 		int actcount;
1662 
1663 		KKASSERT(m->queue == PQ_ACTIVE + q);
1664 		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1665 		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
1666 				   &marker, pageq);
1667 
1668 		/*
1669 		 * Skip marker pages (atomic against other markers to avoid
1670 		 * infinite hop-over scans).
1671 		 */
1672 		if (m->flags & PG_MARKER)
1673 			continue;
1674 
1675 		/*
1676 		 * Ignore pages we can't busy
1677 		 */
1678 		if (vm_page_busy_try(m, TRUE))
1679 			continue;
1680 
1681 		/*
1682 		 * Remaining operations run with the page busy and neither
1683 		 * the page nor the queue will be spin-locked.
1684 		 */
1685 		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1686 		KKASSERT(m->queue == PQ_ACTIVE + q);
1687 
1688 		/*
1689 		 * We now have a safely busied page, the page and queue
1690 		 * spinlocks have been released.
1691 		 *
1692 		 * Ignore held pages
1693 		 */
1694 		if (m->hold_count) {
1695 			vm_page_wakeup(m);
1696 			goto next;
1697 		}
1698 
1699 		/*
1700 		 * Calculate activity
1701 		 */
1702 		actcount = 0;
1703 		if (m->flags & PG_REFERENCED) {
1704 			vm_page_flag_clear(m, PG_REFERENCED);
1705 			actcount += 1;
1706 		}
1707 		actcount += pmap_ts_referenced(m);
1708 
1709 		/*
1710 		 * Update act_count and move page to end of queue.
1711 		 */
1712 		if (actcount) {
1713 			m->act_count += ACT_ADVANCE + actcount;
1714 			if (m->act_count > ACT_MAX)
1715 				m->act_count = ACT_MAX;
1716 			vm_page_and_queue_spin_lock(m);
1717 			if (m->queue - m->pc == PQ_ACTIVE) {
1718 				TAILQ_REMOVE(
1719 					&vm_page_queues[PQ_ACTIVE + q].pl,
1720 					m, pageq);
1721 				TAILQ_INSERT_TAIL(
1722 					&vm_page_queues[PQ_ACTIVE + q].pl,
1723 					m, pageq);
1724 			}
1725 			vm_page_and_queue_spin_unlock(m);
1726 			vm_page_wakeup(m);
1727 			goto next;
1728 		}
1729 
1730 		if (m->act_count == 0) {
1731 			/*
1732 			 * We turn off page access, so that we have
1733 			 * more accurate RSS stats.  We don't do this
1734 			 * in the normal page deactivation when the
1735 			 * system is loaded VM-wise, because the
1736 			 * cost of the large number of page protect
1737 			 * operations would be higher than the value
1738 			 * of doing the operation.
1739 			 *
1740 			 * We use the marker to save our place so
1741 			 * we can release the spin lock.  both (m)
1742 			 * we can release the spin lock.  Both (m)
1743 			 */
1744 			vm_page_protect(m, VM_PROT_NONE);
1745 			vm_page_deactivate(m);
1746 		} else {
1747 			m->act_count -= min(m->act_count, ACT_DECLINE);
1748 			vm_page_and_queue_spin_lock(m);
1749 			if (m->queue - m->pc == PQ_ACTIVE) {
1750 				TAILQ_REMOVE(
1751 					&vm_page_queues[PQ_ACTIVE + q].pl,
1752 					m, pageq);
1753 				TAILQ_INSERT_TAIL(
1754 					&vm_page_queues[PQ_ACTIVE + q].pl,
1755 					m, pageq);
1756 			}
1757 			vm_page_and_queue_spin_unlock(m);
1758 		}
1759 		vm_page_wakeup(m);
1760 next:
1761 		vm_page_queues_spin_lock(PQ_ACTIVE + q);
1762 	}
1763 
1764 	/*
1765 	 * Remove our local marker
1766 	 *
1767 	 * Page queue still spin-locked.
1768 	 */
1769 	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
1770 	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
1771 }
1772 
1773 static int
1774 vm_pageout_free_page_calc(vm_size_t count)
1775 {
1776 	if (count < vmstats.v_page_count)
1777 		return 0;
1778 	/*
1779 	 * free_reserved needs to include enough for the largest swap pager
1780 	 * structures plus enough for any pv_entry structs when paging.
1781 	 *
1782 	 * v_free_min		normal allocations
1783 	 * v_free_reserved	system allocations
1784 	 * v_pageout_free_min	allocations by pageout daemon
1785 	 * v_interrupt_free_min	low-level allocations (e.g. swap structures)
1786 	 */
1787 	if (vmstats.v_page_count > 1024)
1788 		vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200;
1789 	else
1790 		vmstats.v_free_min = 64;
1791 	vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
1792 	vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
1793 	vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
1794 	vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
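	/*
	 * Illustrative figures only, assuming roughly 1,048,576 pages
	 * (about 4GB of RAM with 4KB pages):
	 *
	 *	v_free_min		= 64 + (1048576 - 1024) / 200 = 5301
	 *	v_free_reserved		= 5301 * 4 / 8 + 7 = 2657
	 *	v_free_severe		= 5301 * 4 / 8     = 2650
	 *	v_pageout_free_min	= 5301 * 2 / 8 + 7 = 1332
	 *	v_interrupt_free_min	= 5301 * 1 / 8 + 7 = 669
	 */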
1795 
1796 	return 1;
1797 }
1798 
1799 
1800 /*
1801  * vm_pageout is the high level pageout daemon.
1802  *
1803  * No requirements.
1804  */
1805 static void
1806 vm_pageout_thread(void)
1807 {
1808 	int pass;
1809 	int q;
1810 	int q1iterator = 0;
1811 	int q2iterator = 0;
1812 
1813 	/*
1814 	 * Initialize some paging parameters.
1815 	 */
1816 	curthread->td_flags |= TDF_SYSTHREAD;
1817 
1818 	vm_pageout_free_page_calc(vmstats.v_page_count);
1819 
1820 	/*
1821 	 * v_free_target and v_cache_min control pageout hysteresis.  Note
1822 	 * that these are more a measure of the VM cache queue hysteresis
1823 	 * then the VM free queue.  Specifically, v_free_target is the
1824 	 * than the VM free queue.  Specifically, v_free_target is the
1825 	 *
1826 	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
1827 	 * low water mark, while v_free_min is the stop.  v_cache_min must
1828 	 * be big enough to handle memory needs while the pageout daemon
1829 	 * is signalled and run to free more pages.
1830 	 */
1831 	if (vmstats.v_free_count > 6144)
1832 		vmstats.v_free_target = 4 * vmstats.v_free_min + vmstats.v_free_reserved;
1833 	else
1834 		vmstats.v_free_target = 2 * vmstats.v_free_min + vmstats.v_free_reserved;
1835 
1836 	/*
1837 	 * NOTE: With the new buffer cache b_act_count we want the default
1838 	 *	 inactive target to be a percentage of available memory.
1839 	 *
1840 	 *	 The inactive target essentially determines the minimum
1841 	 *	 number of 'temporary' pages capable of caching one-time-use
1842 	 *	 files when the VM system is otherwise full of pages
1843 	 *	 belonging to multi-time-use files or active program data.
1844 	 *
1845 	 * NOTE: The inactive target is aggressively pursued only if the
1846 	 *	 inactive queue becomes too small.  If the inactive queue
1847 	 *	 is large enough to satisfy page movement to free+cache
1848 	 *	 then it is repopulated more slowly from the active queue.
1849 	 *	 This allows a general inactive_target default to be set.
1850 	 *
1851 	 *	 There is an issue here for processes which sit mostly idle
1852 	 *	 'overnight', such as sshd, tcsh, and X.  Any movement from
1853 	 *	 the active queue will eventually cause such pages to
1854 	 *	 recycle, eventually causing a lot of paging in the morning.
1855 	 *	 To reduce the incidence of this, pages cycled out of the
1856 	 *	 buffer cache are moved directly to the inactive queue if
1857 	 *	 they were only used once or twice.
1858 	 *
1859 	 *	 The vfs.vm_cycle_point sysctl can be used to adjust this.
1860 	 *	 Increasing the value (up to 64) increases the number of
1861 	 *	 buffer recyclements which go directly to the inactive queue.
1862 	 */
1863 	if (vmstats.v_free_count > 2048) {
1864 		vmstats.v_cache_min = vmstats.v_free_target;
1865 		vmstats.v_cache_max = 2 * vmstats.v_cache_min;
1866 	} else {
1867 		vmstats.v_cache_min = 0;
1868 		vmstats.v_cache_max = 0;
1869 	}
1870 	vmstats.v_inactive_target = vmstats.v_free_count / 4;
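	/*
	 * Continuing the illustrative 4GB example from
	 * vm_pageout_free_page_calc() (real values are machine
	 * dependent): with v_free_min = 5301 and v_free_reserved = 2657,
	 * v_free_target becomes 4 * 5301 + 2657 = 23861 pages (~93MB),
	 * so v_cache_min = 23861, v_cache_max = 47722, and with roughly
	 * 1,000,000 free pages at boot v_inactive_target is about 250000.
	 */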
1871 
1872 	/* XXX does not really belong here */
1873 	if (vm_page_max_wired == 0)
1874 		vm_page_max_wired = vmstats.v_free_count / 3;
1875 
1876 	if (vm_pageout_stats_max == 0)
1877 		vm_pageout_stats_max = vmstats.v_free_target;
1878 
1879 	/*
1880 	 * Set interval in seconds for stats scan.
1881 	 */
1882 	if (vm_pageout_stats_interval == 0)
1883 		vm_pageout_stats_interval = 5;
1884 	if (vm_pageout_full_stats_interval == 0)
1885 		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
1886 
1887 
1888 	/*
1889 	 * Set maximum free per pass
1890 	 */
1891 	if (vm_pageout_stats_free_max == 0)
1892 		vm_pageout_stats_free_max = 5;
1893 
1894 	swap_pager_swap_init();
1895 	pass = 0;
1896 
1897 	/*
1898 	 * The pageout daemon is never done, so loop forever.
1899 	 */
1900 	while (TRUE) {
1901 		int error;
1902 		int avail_shortage;
1903 		int inactive_shortage;
1904 		int vnodes_skipped = 0;
1905 		int recycle_count = 0;
1906 		int tmp;
1907 
1908 		/*
1909 		 * Wait for an action request.  If we timeout check to
1910 		 * see if paging is needed (in case the normal wakeup
1911 		 * code raced us).
1912 		 */
1913 		if (vm_pages_needed == 0) {
1914 			error = tsleep(&vm_pages_needed,
1915 				       0, "psleep",
1916 				       vm_pageout_stats_interval * hz);
1917 			if (error &&
1918 			    vm_paging_needed() == 0 &&
1919 			    vm_pages_needed == 0) {
1920 				for (q = 0; q < PQ_L2_SIZE; ++q)
1921 					vm_pageout_page_stats(q);
1922 				continue;
1923 			}
1924 			vm_pages_needed = 1;
1925 		}
1926 
1927 		mycpu->gd_cnt.v_pdwakeups++;
1928 
1929 		/*
1930 		 * Scan for INACTIVE->CLEAN/PAGEOUT
1931 		 *
1932 		 * This routine tries to avoid thrashing the system with
1933 		 * unnecessary activity.
1934 		 *
1935 		 * Calculate our target for the number of free+cache pages we
1936 		 * want to get to.  This is higher than the number that causes
1937 		 * allocations to stall (severe) in order to provide hysteresis,
1938 		 * and if we don't make it all the way but get to the minimum
1939 		 * we're happy.  Goose it a bit if there are multiple requests
1940 		 * for memory.
1941 		 *
1942 		 * Don't reduce avail_shortage inside the loop or the
1943 		 * PQAVERAGE() calculation will break.
1944 		 *
1945 		 * NOTE! deficit is differentiated from avail_shortage as
1946 		 *	 REQUIRING at least (deficit) pages to be cleaned,
1947 		 *	 even if the page queues are in good shape.  This
1948 		 *	 is used primarily for handling per-process
1949 		 *	 RLIMIT_RSS and may also see small values when
1950 		 *	 processes block due to low memory.
1951 		 */
1952 		avail_shortage = vm_paging_target() + vm_pageout_deficit;
1953 		vm_pageout_deficit = 0;
1954 
1955 		if (avail_shortage > 0) {
1956 			int delta = 0;
1957 
1958 			for (q = 0; q < PQ_L2_SIZE; ++q) {
1959 				delta += vm_pageout_scan_inactive(
1960 					    pass,
1961 					    (q + q1iterator) & PQ_L2_MASK,
1962 					    PQAVERAGE(avail_shortage),
1963 					    &vnodes_skipped);
1964 				if (avail_shortage - delta <= 0)
1965 					break;
1966 			}
1967 			avail_shortage -= delta;
1968 			q1iterator = q + 1;
1969 		}
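
		/*
		 * The scan above asks each of the PQ_L2_SIZE inactive
		 * queues for roughly an even share of the shortage
		 * (PQAVERAGE(avail_shortage)) and stops early once enough
		 * pages have been moved.  Bumping q1iterator rotates the
		 * starting queue so successive passes do not always hit
		 * the same queues first.
		 */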
1970 
1971 		/*
1972 		 * Figure out how many active pages we must deactivate.  If
1973 		 * we were able to reach our target with just the inactive
1974 		 * scan above we limit the number of active pages we
1975 		 * deactivate to reduce unnecessary work.
1976 		 */
1977 		inactive_shortage = vmstats.v_inactive_target -
1978 				    vmstats.v_inactive_count;
1979 
1980 		/*
1981 		 * If we were unable to free sufficient inactive pages to
1982 		 * satisfy the free/cache queue requirements then simply
1983 		 * reaching the inactive target may not be good enough.
1984 		 * Try to deactivate pages in excess of the target based
1985 		 * on the shortfall.
1986 		 *
1987 		 * However to prevent thrashing the VM system do not
1988 		 * deactivate more than an additional 1/10 the inactive
1989 		 * target's worth of active pages.
1990 		 */
1991 		if (avail_shortage > 0) {
1992 			tmp = avail_shortage * 2;
1993 			if (tmp > vmstats.v_inactive_target / 10)
1994 				tmp = vmstats.v_inactive_target / 10;
1995 			inactive_shortage += tmp;
1996 		}
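
		/*
		 * Illustrative figures only: if the inactive scan left an
		 * avail_shortage of 1000 pages and v_inactive_target is
		 * 250000, the boost above is min(1000 * 2, 250000 / 10) =
		 * 2000 extra pages added to inactive_shortage, well under
		 * the 1/10 anti-thrash cap of 25000.
		 */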
1997 
1998 		/*
1999 		 * Only trigger a pmap cleanup on inactive shortage.
2000 		 */
2001 		if (inactive_shortage > 0) {
2002 			pmap_collect();
2003 		}
2004 
2005 		/*
2006 		 * Scan for ACTIVE->INACTIVE
2007 		 *
2008 		 * Only trigger on inactive shortage.  Triggering on
2009 		 * avail_shortage can starve the active queue with
2010 		 * unnecessary active->inactive transitions and destroy
2011 		 * performance.
2012 		 */
2013 		if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
2014 			int delta = 0;
2015 
2016 			for (q = 0; q < PQ_L2_SIZE; ++q) {
2017 				delta += vm_pageout_scan_active(
2018 						pass,
2019 						(q + q2iterator) & PQ_L2_MASK,
2020 						PQAVERAGE(avail_shortage),
2021 						PQAVERAGE(inactive_shortage),
2022 						&recycle_count);
2023 				if (inactive_shortage - delta <= 0 &&
2024 				    avail_shortage - delta <= 0) {
2025 					break;
2026 				}
2027 			}
2028 			inactive_shortage -= delta;
2029 			avail_shortage -= delta;
2030 			q2iterator = q + 1;
2031 		}
2032 
2033 		/*
2034 		 * Scan for CACHE->FREE
2035 		 *
2036 		 * Finally free enough cache pages to meet our free page
2037 		 * requirement and take more drastic measures if we are
2038 		 * still in trouble.
2039 		 */
2040 		vm_pageout_scan_cache(avail_shortage, pass,
2041 				      vnodes_skipped, recycle_count);
2042 
2043 		/*
2044 		 * Wait for more work.
2045 		 */
2046 		if (avail_shortage > 0) {
2047 			++pass;
2048 			if (pass < 10 && vm_pages_needed > 1) {
2049 				/*
2050 				 * Normal operation, additional processes
2051 				 * have already kicked us.  Retry immediately
2052 				 * unless swap space is completely full, in
2053 				 * which case delay a bit.
2054 				 */
2055 				if (swap_pager_full) {
2056 					tsleep(&vm_pages_needed, 0, "pdelay",
2057 						hz / 5);
2058 				} /* else immediate retry */
2059 			} else if (pass < 10) {
2060 				/*
2061 				 * Normal operation, fewer processes.  Delay
2062 				 * a bit but allow wakeups.
2063 				 */
2064 				vm_pages_needed = 0;
2065 				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2066 				vm_pages_needed = 1;
2067 			} else if (swap_pager_full == 0) {
2068 				/*
2069 				 * We've taken too many passes, forced delay.
2070 				 */
2071 				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
2072 			} else {
2073 				/*
2074 				 * Running out of memory, catastrophic
2075 				 * back-off to one-second intervals.
2076 				 */
2077 				tsleep(&vm_pages_needed, 0, "pdelay", hz);
2078 			}
2079 		} else if (vm_pages_needed) {
2080 			/*
2081 			 * Interlocked wakeup of waiters (non-optional).
2082 			 *
2083 			 * Similar to vm_page_free_wakeup() in vm_page.c,
2084 			 * wake up any threads waiting for free pages.
2085 			 */
2086 			pass = 0;
2087 			if (!vm_page_count_min(vm_page_free_hysteresis) ||
2088 			    !vm_page_count_target()) {
2089 				vm_pages_needed = 0;
2090 				wakeup(&vmstats.v_free_count);
2091 			}
2092 		} else {
2093 			pass = 0;
2094 		}
2095 	}
2096 }
2097 
2098 static struct kproc_desc page_kp = {
2099 	"pagedaemon",
2100 	vm_pageout_thread,
2101 	&pagethread
2102 };
2103 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp);
2104 
2105 
2106 /*
2107  * Called after allocating a page out of the cache or free queue
2108  * to possibly wake the pagedaemon up to replenish our supply.
2109  *
2110  * We try to generate some hysteresis by waking the pagedaemon up
2111  * when our free+cache pages go below the free_min+cache_min level.
2112  * The pagedaemon tries to get the count back up to at least the
2113  * minimum, and through to the target level if possible.
2114  *
2115  * If the pagedaemon is already active bump vm_pages_needed as a hint
2116  * that there are even more requests pending.
2117  *
2118  * SMP races ok?
2119  * No requirements.
2120  */
2121 void
2122 pagedaemon_wakeup(void)
2123 {
2124 	if (vm_paging_needed() && curthread != pagethread) {
2125 		if (vm_pages_needed == 0) {
2126 			vm_pages_needed = 1;	/* SMP race ok */
2127 			wakeup(&vm_pages_needed);
2128 		} else if (vm_page_count_min(0)) {
2129 			++vm_pages_needed;	/* SMP race ok */
2130 		}
2131 	}
2132 }
2133 
2134 #if !defined(NO_SWAPPING)
2135 
2136 /*
2137  * SMP races ok?
2138  * No requirements.
2139  */
2140 static void
2141 vm_req_vmdaemon(void)
2142 {
2143 	static int lastrun = 0;
2144 
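	/*
	 * Rate-limit wakeups of the vmdaemon to about one per second
	 * (hz ticks).  The (ticks < lastrun) test re-arms the limiter
	 * if the tick counter has wrapped.
	 */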
2145 	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
2146 		wakeup(&vm_daemon_needed);
2147 		lastrun = ticks;
2148 	}
2149 }
2150 
2151 static int vm_daemon_callback(struct proc *p, void *data __unused);
2152 
2153 /*
2154  * No requirements.
2155  */
2156 static void
2157 vm_daemon(void)
2158 {
2159 	int req_swapout;
2160 
2161 	while (TRUE) {
2162 		tsleep(&vm_daemon_needed, 0, "psleep", 0);
2163 		req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);
2164 
2165 		/*
2166 		 * forced swapouts
2167 		 */
2168 		if (req_swapout)
2169 			swapout_procs(req_swapout);
2170 
2171 		/*
2172 		 * scan the processes for those exceeding their rlimits or
2173 		 * those that are swapped out -- deactivate their pages
2174 		 */
2175 		allproc_scan(vm_daemon_callback, NULL);
2176 	}
2177 }
2178 
2179 static int
2180 vm_daemon_callback(struct proc *p, void *data __unused)
2181 {
2182 	struct vmspace *vm;
2183 	vm_pindex_t limit, size;
2184 
2185 	/*
2186 	 * if this is a system process or if we have already
2187 	 * looked at this process, skip it.
2188 	 */
2189 	lwkt_gettoken(&p->p_token);
2190 
2191 	if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
2192 		lwkt_reltoken(&p->p_token);
2193 		return (0);
2194 	}
2195 
2196 	/*
2197 	 * if the process is in a non-running type state,
2198 	 * don't touch it.
2199 	 */
2200 	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
2201 		lwkt_reltoken(&p->p_token);
2202 		return (0);
2203 	}
2204 
2205 	/*
2206 	 * get a limit
2207 	 */
2208 	limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
2209 			        p->p_rlimit[RLIMIT_RSS].rlim_max));
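
	/*
	 * The limit is expressed in pages.  Illustrative figure only:
	 * a 64MB RLIMIT_RSS with 4KB pages converts to a limit of
	 * 16384 resident pages via OFF_TO_IDX().
	 */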
2210 
2211 	/*
2212 	 * let processes that are swapped out really be
2213 	 * swapped out.  Set the limit to nothing to get as
2214 	 * many pages out to swap as possible.
2215 	 */
2216 	if (p->p_flags & P_SWAPPEDOUT)
2217 		limit = 0;
2218 
2219 	vm = p->p_vmspace;
2220 	vmspace_hold(vm);
2221 	size = pmap_resident_tlnw_count(&vm->vm_pmap);
2222 	if (limit >= 0 && size > 4096 &&
2223 	    size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
2224 		vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
2225 	}
2226 	vmspace_drop(vm);
2227 
2228 	lwkt_reltoken(&p->p_token);
2229 
2230 	return (0);
2231 }
2232 
2233 #endif
2234