xref: /illumos-gate/usr/src/uts/common/os/vm_pageout.c (revision 338664df)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5ad23a2dbSjohansen  * Common Development and Distribution License (the "License").
6ad23a2dbSjohansen  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
21727737b4SJoshua M. Clulow 
22727737b4SJoshua M. Clulow /*
2371cf2822SAndy Fiddaman  * Copyright 2018 Joyent, Inc.
2471cf2822SAndy Fiddaman  * Copyright 2023 Oxide Computer Company
25d12ea28fSAndy Fiddaman  * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
26727737b4SJoshua M. Clulow  */
27727737b4SJoshua M. Clulow 
287c478bd9Sstevel@tonic-gate /*
29d3d50737SRafael Vanoni  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
307c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
317c478bd9Sstevel@tonic-gate  */
327c478bd9Sstevel@tonic-gate 
337c478bd9Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
347c478bd9Sstevel@tonic-gate /* All Rights Reserved */
357c478bd9Sstevel@tonic-gate 
367c478bd9Sstevel@tonic-gate /*
377c478bd9Sstevel@tonic-gate  * University Copyright- Copyright (c) 1982, 1986, 1988
387c478bd9Sstevel@tonic-gate  * The Regents of the University of California
397c478bd9Sstevel@tonic-gate  * All Rights Reserved
407c478bd9Sstevel@tonic-gate  *
417c478bd9Sstevel@tonic-gate  * University Acknowledgment- Portions of this document are derived from
427c478bd9Sstevel@tonic-gate  * software developed by the University of California, Berkeley, and its
437c478bd9Sstevel@tonic-gate  * contributors.
447c478bd9Sstevel@tonic-gate  */
457c478bd9Sstevel@tonic-gate 
467c478bd9Sstevel@tonic-gate #include <sys/types.h>
477c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
487c478bd9Sstevel@tonic-gate #include <sys/param.h>
497c478bd9Sstevel@tonic-gate #include <sys/buf.h>
507c478bd9Sstevel@tonic-gate #include <sys/uio.h>
517c478bd9Sstevel@tonic-gate #include <sys/proc.h>
527c478bd9Sstevel@tonic-gate #include <sys/systm.h>
537c478bd9Sstevel@tonic-gate #include <sys/mman.h>
547c478bd9Sstevel@tonic-gate #include <sys/cred.h>
557c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
567c478bd9Sstevel@tonic-gate #include <sys/vm.h>
577c478bd9Sstevel@tonic-gate #include <sys/vmparam.h>
587c478bd9Sstevel@tonic-gate #include <sys/vtrace.h>
597c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
607c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
617c478bd9Sstevel@tonic-gate #include <sys/user.h>
627c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
637c478bd9Sstevel@tonic-gate #include <sys/debug.h>
647c478bd9Sstevel@tonic-gate #include <sys/callb.h>
657c478bd9Sstevel@tonic-gate #include <sys/mem_cage.h>
667c478bd9Sstevel@tonic-gate #include <sys/time.h>
67727737b4SJoshua M. Clulow #include <sys/stdbool.h>
687c478bd9Sstevel@tonic-gate 
697c478bd9Sstevel@tonic-gate #include <vm/hat.h>
707c478bd9Sstevel@tonic-gate #include <vm/as.h>
717c478bd9Sstevel@tonic-gate #include <vm/seg.h>
727c478bd9Sstevel@tonic-gate #include <vm/page.h>
737c478bd9Sstevel@tonic-gate #include <vm/pvn.h>
747c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
757c478bd9Sstevel@tonic-gate 
762d9166aeSJoshua M. Clulow /*
772d9166aeSJoshua M. Clulow  * FREE MEMORY MANAGEMENT
782d9166aeSJoshua M. Clulow  *
792d9166aeSJoshua M. Clulow  * Management of the pool of free pages is a tricky business.  There are
802d9166aeSJoshua M. Clulow  * several critical threshold values which constrain our allocation of new
812d9166aeSJoshua M. Clulow  * pages and inform the rate of paging out of memory to swap.  These threshold
822d9166aeSJoshua M. Clulow  * values, and the behaviour they induce, are described below in descending
832d9166aeSJoshua M. Clulow  * order of size -- and thus increasing order of severity!
842d9166aeSJoshua M. Clulow  *
852d9166aeSJoshua M. Clulow  *   +---------------------------------------------------- physmem (all memory)
862d9166aeSJoshua M. Clulow  *   |
872d9166aeSJoshua M. Clulow  *   | Ordinarily there are no particular constraints placed on page
882d9166aeSJoshua M. Clulow  *   v allocation.  The page scanner is not running and page_create_va()
892d9166aeSJoshua M. Clulow  *   | will effectively grant all page requests (whether from the kernel
902d9166aeSJoshua M. Clulow  *   | or from user processes) without artificial delay.
912d9166aeSJoshua M. Clulow  *   |
922d9166aeSJoshua M. Clulow  *   +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB)
932d9166aeSJoshua M. Clulow  *   |
942d9166aeSJoshua M. Clulow  *   | When we have less than "lotsfree" pages, pageout_scanner() is
952d9166aeSJoshua M. Clulow  *   v signalled by schedpaging() to begin looking for pages that can
962d9166aeSJoshua M. Clulow  *   | be evicted to disk to bring us back above lotsfree.  At this
972d9166aeSJoshua M. Clulow  *   | stage there is still no constraint on allocation of free pages.
982d9166aeSJoshua M. Clulow  *   |
992d9166aeSJoshua M. Clulow  *   | For small systems, we set a lower bound of 16MB for lotsfree;
1002d9166aeSJoshua M. Clulow  *   v this is the natural value for a system with 1GB memory.  This is
1012d9166aeSJoshua M. Clulow  *   | to ensure that the pageout reserve pool contains at least 4MB
1022d9166aeSJoshua M. Clulow  *   | for use by ZFS.
1032d9166aeSJoshua M. Clulow  *   |
1042d9166aeSJoshua M. Clulow  *   | For systems with a large amount of memory, we constrain lotsfree
1052d9166aeSJoshua M. Clulow  *   | to be at most 2GB (with a pageout reserve of around 0.5GB), as
1062d9166aeSJoshua M. Clulow  *   v at some point the required slack relates more closely to the
1072d9166aeSJoshua M. Clulow  *   | rate at which paging can occur than to the total amount of memory.
1082d9166aeSJoshua M. Clulow  *   |
1092d9166aeSJoshua M. Clulow  *   +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB)
1102d9166aeSJoshua M. Clulow  *   |
1112d9166aeSJoshua M. Clulow  *   | When we drop below desfree, a number of kernel facilities will
1122d9166aeSJoshua M. Clulow  *   v wait before allocating more memory, under the assumption that
1132d9166aeSJoshua M. Clulow  *   | pageout or reaping will make progress and free up some memory.
1142d9166aeSJoshua M. Clulow  *   | This behaviour is not especially coordinated; look for comparisons
1152d9166aeSJoshua M. Clulow  *   | of desfree and freemem.
1162d9166aeSJoshua M. Clulow  *   |
1172d9166aeSJoshua M. Clulow  *   | In addition to various attempts at advisory caution, clock()
1182d9166aeSJoshua M. Clulow  *   | will wake up the thread that is ordinarily parked in sched().
1192d9166aeSJoshua M. Clulow  *   | This routine is responsible for the heavy-handed swapping out
1202d9166aeSJoshua M. Clulow  *   v of entire processes in an attempt to arrest the slide of free
1212d9166aeSJoshua M. Clulow  *   | memory.  See comments in sched.c for more details.
1222d9166aeSJoshua M. Clulow  *   |
1232d9166aeSJoshua M. Clulow  *   +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB)
1242d9166aeSJoshua M. Clulow  *   |
1252d9166aeSJoshua M. Clulow  *   | These two separate tunables have, by default, the same value.
1262d9166aeSJoshua M. Clulow  *   v Various parts of the kernel use minfree to signal the need for
1272d9166aeSJoshua M. Clulow  *   | more aggressive reclamation of memory, and sched() is more
1282d9166aeSJoshua M. Clulow  *   | aggressive at swapping processes out.
1292d9166aeSJoshua M. Clulow  *   |
1302d9166aeSJoshua M. Clulow  *   | If free memory falls below throttlefree, page_create_va() will
1312d9166aeSJoshua M. Clulow  *   | use page_create_throttle() to begin holding most requests for
1322d9166aeSJoshua M. Clulow  *   | new pages while pageout and reaping free up memory.  Sleeping
1332d9166aeSJoshua M. Clulow  *   v allocations (e.g., KM_SLEEP) are held here while we wait for
1342d9166aeSJoshua M. Clulow  *   | more memory.  Non-sleeping allocations are generally allowed to
1352d9166aeSJoshua M. Clulow  *   | proceed, unless their priority is explicitly lowered with
136ca783257SDan McDonald  *   | KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI).).
1372d9166aeSJoshua M. Clulow  *   |
1382d9166aeSJoshua M. Clulow  *   +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB)
1392d9166aeSJoshua M. Clulow  *   |
1402d9166aeSJoshua M. Clulow  *   | When we hit throttlefree, the situation is already dire.  The
1412d9166aeSJoshua M. Clulow  *   v system is generally paging out memory and swapping out entire
1422d9166aeSJoshua M. Clulow  *   | processes in order to free up memory for continued operation.
1432d9166aeSJoshua M. Clulow  *   |
1442d9166aeSJoshua M. Clulow  *   | Unfortunately, evicting memory to disk generally requires short
1452d9166aeSJoshua M. Clulow  *   | term use of additional memory; e.g., allocation of buffers for
1462d9166aeSJoshua M. Clulow  *   | storage drivers, updating maps of free and used blocks, etc.
1472d9166aeSJoshua M. Clulow  *   | As such, pageout_reserve is the number of pages that we keep in
1482d9166aeSJoshua M. Clulow  *   | special reserve for use by pageout() and sched() and by any
1492d9166aeSJoshua M. Clulow  *   v other parts of the kernel that need to be working for those to
1502d9166aeSJoshua M. Clulow  *   | make forward progress such as the ZFS I/O pipeline.
1512d9166aeSJoshua M. Clulow  *   |
1522d9166aeSJoshua M. Clulow  *   | When we are below pageout_reserve, we fail or hold any allocation
1532d9166aeSJoshua M. Clulow  *   | that has not explicitly requested access to the reserve pool.
1542d9166aeSJoshua M. Clulow  *   | Access to the reserve is generally granted via the KM_PUSHPAGE
1552d9166aeSJoshua M. Clulow  *   | flag, or by marking a thread T_PUSHPAGE such that all allocations
1562d9166aeSJoshua M. Clulow  *   | can implicitly tap the reserve.  For more details, see the
1572d9166aeSJoshua M. Clulow  *   v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE
1582d9166aeSJoshua M. Clulow  *   | and VM_PUSHPAGE allocation flags, and page_create_throttle().
1592d9166aeSJoshua M. Clulow  *   |
1602d9166aeSJoshua M. Clulow  *   +---------------------------------------------------------- no free memory
1612d9166aeSJoshua M. Clulow  *   |
1622d9166aeSJoshua M. Clulow  *   | If we have arrived here, things are very bad indeed.  It is
1632d9166aeSJoshua M. Clulow  *   v surprisingly difficult to tell if this condition is even fatal,
1642d9166aeSJoshua M. Clulow  *   | as enough memory may have been granted to pageout() and to the
1652d9166aeSJoshua M. Clulow  *   | ZFS I/O pipeline that requests for eviction that have already been
1662d9166aeSJoshua M. Clulow  *   | made will complete and free up memory some time soon.
1672d9166aeSJoshua M. Clulow  *   |
1682d9166aeSJoshua M. Clulow  *   | If free memory does not materialise, the system generally remains
1692d9166aeSJoshua M. Clulow  *   | deadlocked.  The pageout_deadman() below is run once per second
1702d9166aeSJoshua M. Clulow  *   | from clock(), seeking to limit the amount of time a single request
1712d9166aeSJoshua M. Clulow  *   v to page out can be blocked before the system panics to get a crash
1722d9166aeSJoshua M. Clulow  *   | dump and return to service.
1732d9166aeSJoshua M. Clulow  *   |
1742d9166aeSJoshua M. Clulow  *   +-------------------------------------------------------------------------
1752d9166aeSJoshua M. Clulow  */
1767c478bd9Sstevel@tonic-gate 
1777c478bd9Sstevel@tonic-gate /*
1787c478bd9Sstevel@tonic-gate  * The following parameters control operation of the page replacement
1792d9166aeSJoshua M. Clulow  * algorithm.  They are initialized to 0, and then computed at boot time based
1802d9166aeSJoshua M. Clulow  * on the size of the system; see setupclock().  If they are patched non-zero
1812d9166aeSJoshua M. Clulow  * in a loaded vmunix they are left alone and may thus be changed per system
1822d9166aeSJoshua M. Clulow  * using "mdb -kw" on the loaded system.
1837c478bd9Sstevel@tonic-gate  */
1847c478bd9Sstevel@tonic-gate pgcnt_t		slowscan = 0;
1857c478bd9Sstevel@tonic-gate pgcnt_t		fastscan = 0;
1867c478bd9Sstevel@tonic-gate 
1877c478bd9Sstevel@tonic-gate static pgcnt_t	handspreadpages = 0;
1882d9166aeSJoshua M. Clulow 
1892d9166aeSJoshua M. Clulow /*
1902d9166aeSJoshua M. Clulow  * looppages:
1912d9166aeSJoshua M. Clulow  *     Cached copy of the total number of pages in the system (total_pages).
1922d9166aeSJoshua M. Clulow  *
1932d9166aeSJoshua M. Clulow  * loopfraction:
1942d9166aeSJoshua M. Clulow  *     Divisor used to relate fastscan to looppages in setupclock().
1952d9166aeSJoshua M. Clulow  */
1962d9166aeSJoshua M. Clulow static uint_t	loopfraction = 2;
1977c478bd9Sstevel@tonic-gate static pgcnt_t	looppages;
1982d9166aeSJoshua M. Clulow 
1992d9166aeSJoshua M. Clulow static uint_t	min_percent_cpu = 4;
2002d9166aeSJoshua M. Clulow static uint_t	max_percent_cpu = 80;
2017c478bd9Sstevel@tonic-gate static pgcnt_t	maxfastscan = 0;
2027c478bd9Sstevel@tonic-gate static pgcnt_t	maxslowscan = 100;
2037c478bd9Sstevel@tonic-gate 
2042d9166aeSJoshua M. Clulow #define		MEGABYTES		(1024ULL * 1024ULL)
2052d9166aeSJoshua M. Clulow 
2062d9166aeSJoshua M. Clulow /*
2072d9166aeSJoshua M. Clulow  * pageout_threshold_style:
2082d9166aeSJoshua M. Clulow  *     set to 1 to use the previous default threshold size calculation;
2092d9166aeSJoshua M. Clulow  *     i.e., each threshold is half of the next largest value.
2102d9166aeSJoshua M. Clulow  */
2112d9166aeSJoshua M. Clulow uint_t		pageout_threshold_style = 0;
2122d9166aeSJoshua M. Clulow 
2132d9166aeSJoshua M. Clulow /*
2142d9166aeSJoshua M. Clulow  * The operator may override these tunables to request a different minimum or
2152d9166aeSJoshua M. Clulow  * maximum lotsfree value, or to change the divisor we use for automatic
2162d9166aeSJoshua M. Clulow  * sizing.
2172d9166aeSJoshua M. Clulow  *
2182d9166aeSJoshua M. Clulow  * By default, we make lotsfree 1/64th of the total memory in the machine.  The
2192d9166aeSJoshua M. Clulow  * minimum and maximum are specified in bytes, rather than pages; a zero value
2202d9166aeSJoshua M. Clulow  * means the default values (below) are used.
2212d9166aeSJoshua M. Clulow  */
2222d9166aeSJoshua M. Clulow uint_t		lotsfree_fraction = 64;
2232d9166aeSJoshua M. Clulow pgcnt_t		lotsfree_min = 0;
2242d9166aeSJoshua M. Clulow pgcnt_t		lotsfree_max = 0;
2252d9166aeSJoshua M. Clulow 
2262d9166aeSJoshua M. Clulow #define		LOTSFREE_MIN_DEFAULT	(16 * MEGABYTES)
2272d9166aeSJoshua M. Clulow #define		LOTSFREE_MAX_DEFAULT	(2048 * MEGABYTES)
2282d9166aeSJoshua M. Clulow 
2292d9166aeSJoshua M. Clulow /*
2302d9166aeSJoshua M. Clulow  * If these tunables are set to non-zero values in /etc/system, and provided
2312d9166aeSJoshua M. Clulow  * the value is not larger than the threshold above, the specified value will
2322d9166aeSJoshua M. Clulow  * be used directly without any additional calculation or adjustment.  The boot
2332d9166aeSJoshua M. Clulow  * time value of these overrides is preserved in the "clockinit" struct.  More
2342d9166aeSJoshua M. Clulow  * detail is available in the comment at the top of the file.
2352d9166aeSJoshua M. Clulow  */
2367c478bd9Sstevel@tonic-gate pgcnt_t		maxpgio = 0;
2377c478bd9Sstevel@tonic-gate pgcnt_t		minfree = 0;
2387c478bd9Sstevel@tonic-gate pgcnt_t		desfree = 0;
2397c478bd9Sstevel@tonic-gate pgcnt_t		lotsfree = 0;
2407c478bd9Sstevel@tonic-gate pgcnt_t		needfree = 0;
2417c478bd9Sstevel@tonic-gate pgcnt_t		throttlefree = 0;
2427c478bd9Sstevel@tonic-gate pgcnt_t		pageout_reserve = 0;
2437c478bd9Sstevel@tonic-gate 
2447c478bd9Sstevel@tonic-gate pgcnt_t		deficit;
2457c478bd9Sstevel@tonic-gate pgcnt_t		nscan;
2467c478bd9Sstevel@tonic-gate pgcnt_t		desscan;
2477c478bd9Sstevel@tonic-gate 
/* kstats */
uint64_t	low_mem_scan;

/* The maximum supported number of pageout_scanner() threads */
#define	MAX_PSCAN_THREADS	16
25371cf2822SAndy Fiddaman 
2547c478bd9Sstevel@tonic-gate /*
2552d9166aeSJoshua M. Clulow  * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the
2562d9166aeSJoshua M. Clulow  * number of nanoseconds in each wakeup cycle that gives the equivalent of some
2572d9166aeSJoshua M. Clulow  * underlying %CPU duty cycle.
2587c478bd9Sstevel@tonic-gate  *
2592d9166aeSJoshua M. Clulow  * min_pageout_nsec:
2602d9166aeSJoshua M. Clulow  *     nanoseconds/wakeup equivalent of min_percent_cpu.
2617c478bd9Sstevel@tonic-gate  *
2622d9166aeSJoshua M. Clulow  * max_pageout_nsec:
2632d9166aeSJoshua M. Clulow  *     nanoseconds/wakeup equivalent of max_percent_cpu.
2647c478bd9Sstevel@tonic-gate  *
2652d9166aeSJoshua M. Clulow  * pageout_nsec:
2662d9166aeSJoshua M. Clulow  *     Number of nanoseconds budgeted for each wakeup cycle.
2677c478bd9Sstevel@tonic-gate  *     Computed each time around by schedpaging().
2682d9166aeSJoshua M. Clulow  *     Varies between min_pageout_nsec and max_pageout_nsec,
2697c478bd9Sstevel@tonic-gate  *     depending on memory pressure.
2707c478bd9Sstevel@tonic-gate  */
2712d9166aeSJoshua M. Clulow static hrtime_t	min_pageout_nsec;
2722d9166aeSJoshua M. Clulow static hrtime_t	max_pageout_nsec;
2732d9166aeSJoshua M. Clulow static hrtime_t	pageout_nsec;
2747c478bd9Sstevel@tonic-gate 
27571cf2822SAndy Fiddaman static bool	reset_hands[MAX_PSCAN_THREADS];
2767c478bd9Sstevel@tonic-gate 
2777c478bd9Sstevel@tonic-gate #define	PAGES_POLL_MASK	1023
2787c478bd9Sstevel@tonic-gate 
2797c478bd9Sstevel@tonic-gate /*
28071cf2822SAndy Fiddaman  * Pageout scheduling.
28171cf2822SAndy Fiddaman  *
28271cf2822SAndy Fiddaman  * Schedpaging controls the rate at which the page out daemon runs by
28371cf2822SAndy Fiddaman  * setting the global variables nscan and desscan SCHEDPAGING_HZ
28471cf2822SAndy Fiddaman  * times a second.  Nscan records the number of pages pageout has examined
28571cf2822SAndy Fiddaman  * in its current pass; schedpaging() resets this value to zero each time
28671cf2822SAndy Fiddaman  * it runs.  Desscan records the number of pages pageout should examine
28771cf2822SAndy Fiddaman  * in its next pass; schedpaging() sets this value based on the amount of
28871cf2822SAndy Fiddaman  * currently available memory.
28971cf2822SAndy Fiddaman  */
29071cf2822SAndy Fiddaman #define	SCHEDPAGING_HZ	4
29171cf2822SAndy Fiddaman 
29271cf2822SAndy Fiddaman /*
29371cf2822SAndy Fiddaman  * despagescanners:
29471cf2822SAndy Fiddaman  *	The desired number of page scanner threads. For testing purposes, this
29571cf2822SAndy Fiddaman  *	value can be set in /etc/system or tuned directly with mdb(1). The
29671cf2822SAndy Fiddaman  *	system will bring the actual number of threads into line with the
29771cf2822SAndy Fiddaman  *	desired number. If set to an invalid value, the system will correct the
29871cf2822SAndy Fiddaman  *	setting.
29971cf2822SAndy Fiddaman  */
30071cf2822SAndy Fiddaman uint_t despagescanners = 0;
30171cf2822SAndy Fiddaman 
30271cf2822SAndy Fiddaman /*
3037c478bd9Sstevel@tonic-gate  * pageout_sample_lim:
3042d9166aeSJoshua M. Clulow  *     The limit on the number of samples needed to establish a value for new
3052d9166aeSJoshua M. Clulow  *     pageout parameters: fastscan, slowscan, pageout_new_spread, and
3062d9166aeSJoshua M. Clulow  *     handspreadpages.
3077c478bd9Sstevel@tonic-gate  *
3087c478bd9Sstevel@tonic-gate  * pageout_sample_cnt:
3092d9166aeSJoshua M. Clulow  *     Current sample number.  Once the sample gets large enough, set new
3102d9166aeSJoshua M. Clulow  *     values for handspreadpages, pageout_new_spread, fastscan and slowscan.
3117c478bd9Sstevel@tonic-gate  *
3127c478bd9Sstevel@tonic-gate  * pageout_sample_pages:
3137c478bd9Sstevel@tonic-gate  *     The accumulated number of pages scanned during sampling.
3147c478bd9Sstevel@tonic-gate  *
3152d9166aeSJoshua M. Clulow  * pageout_sample_etime:
3162d9166aeSJoshua M. Clulow  *     The accumulated nanoseconds for the sample.
3177c478bd9Sstevel@tonic-gate  *
31871cf2822SAndy Fiddaman  * pageout_sampling:
31971cf2822SAndy Fiddaman  *     True while sampling is still in progress.
32071cf2822SAndy Fiddaman  *
3217c478bd9Sstevel@tonic-gate  * pageout_rate:
3227c478bd9Sstevel@tonic-gate  *     Rate in pages/nanosecond, computed at the end of sampling.
3237c478bd9Sstevel@tonic-gate  *
3247c478bd9Sstevel@tonic-gate  * pageout_new_spread:
3252d9166aeSJoshua M. Clulow  *     Initially zero while the system scan rate is measured by
3262d9166aeSJoshua M. Clulow  *     pageout_scanner(), which then sets this value once per system boot after
3272d9166aeSJoshua M. Clulow  *     enough samples have been recorded (pageout_sample_cnt).  Once set, this
3282d9166aeSJoshua M. Clulow  *     new value is used for fastscan and handspreadpages.
3297c478bd9Sstevel@tonic-gate  */
3307c478bd9Sstevel@tonic-gate typedef hrtime_t hrrate_t;
3317c478bd9Sstevel@tonic-gate 
3327c478bd9Sstevel@tonic-gate static uint64_t	pageout_sample_lim = 4;
3337c478bd9Sstevel@tonic-gate static uint64_t	pageout_sample_cnt = 0;
3347c478bd9Sstevel@tonic-gate static pgcnt_t	pageout_sample_pages = 0;
33571cf2822SAndy Fiddaman static hrtime_t	pageout_sample_etime = 0;
33671cf2822SAndy Fiddaman static bool	pageout_sampling = true;
3377c478bd9Sstevel@tonic-gate static hrrate_t	pageout_rate = 0;
3387c478bd9Sstevel@tonic-gate static pgcnt_t	pageout_new_spread = 0;
3397c478bd9Sstevel@tonic-gate 
34071cf2822SAndy Fiddaman /* The current number of page scanner threads */
34171cf2822SAndy Fiddaman static uint_t n_page_scanners = 1;
34271cf2822SAndy Fiddaman /* The number of page scanner threads that are actively scanning. */
34371cf2822SAndy Fiddaman static uint_t pageouts_running;
3447c478bd9Sstevel@tonic-gate 
/*
 * Record number of times a pageout_scanner() wakeup cycle finished because it
 * timed out (exceeded its CPU budget), rather than because it visited
 * its budgeted number of pages.
 */
uint64_t	pageout_timeouts = 0;

#ifdef VM_STATS
/* Counters compiled in only when VM_STATS is defined. */
static struct pageoutvmstats_str {
	ulong_t	checkpage[3];
} pageoutvmstats;
#endif /* VM_STATS */
3577c478bd9Sstevel@tonic-gate 
3587c478bd9Sstevel@tonic-gate /*
3597c478bd9Sstevel@tonic-gate  * Threads waiting for free memory use this condition variable and lock until
3607c478bd9Sstevel@tonic-gate  * memory becomes available.
3617c478bd9Sstevel@tonic-gate  */
3627c478bd9Sstevel@tonic-gate kmutex_t	memavail_lock;
3637c478bd9Sstevel@tonic-gate kcondvar_t	memavail_cv;
3647c478bd9Sstevel@tonic-gate 
3652d9166aeSJoshua M. Clulow typedef enum pageout_hand {
3662d9166aeSJoshua M. Clulow 	POH_FRONT = 1,
3672d9166aeSJoshua M. Clulow 	POH_BACK,
3682d9166aeSJoshua M. Clulow } pageout_hand_t;
3697c478bd9Sstevel@tonic-gate 
3702d9166aeSJoshua M. Clulow typedef enum {
3712d9166aeSJoshua M. Clulow 	CKP_INELIGIBLE,
3722d9166aeSJoshua M. Clulow 	CKP_NOT_FREED,
3732d9166aeSJoshua M. Clulow 	CKP_FREED,
3742d9166aeSJoshua M. Clulow } checkpage_result_t;
3752d9166aeSJoshua M. Clulow 
3762d9166aeSJoshua M. Clulow static checkpage_result_t checkpage(page_t *, pageout_hand_t);
3772d9166aeSJoshua M. Clulow 
3782d9166aeSJoshua M. Clulow static struct clockinit {
3792d9166aeSJoshua M. Clulow 	bool ci_init;
3802d9166aeSJoshua M. Clulow 	pgcnt_t ci_lotsfree_min;
3812d9166aeSJoshua M. Clulow 	pgcnt_t ci_lotsfree_max;
3822d9166aeSJoshua M. Clulow 	pgcnt_t ci_lotsfree;
3832d9166aeSJoshua M. Clulow 	pgcnt_t ci_desfree;
3842d9166aeSJoshua M. Clulow 	pgcnt_t ci_minfree;
3852d9166aeSJoshua M. Clulow 	pgcnt_t ci_throttlefree;
3862d9166aeSJoshua M. Clulow 	pgcnt_t ci_pageout_reserve;
3872d9166aeSJoshua M. Clulow 	pgcnt_t ci_maxpgio;
3882d9166aeSJoshua M. Clulow 	pgcnt_t ci_maxfastscan;
3892d9166aeSJoshua M. Clulow 	pgcnt_t ci_fastscan;
3902d9166aeSJoshua M. Clulow 	pgcnt_t ci_slowscan;
3912d9166aeSJoshua M. Clulow 	pgcnt_t ci_handspreadpages;
39271cf2822SAndy Fiddaman 	uint_t  ci_despagescanners;
3932d9166aeSJoshua M. Clulow } clockinit = { .ci_init = false };
3942d9166aeSJoshua M. Clulow 
39571cf2822SAndy Fiddaman static inline pgcnt_t
clamp(pgcnt_t value,pgcnt_t minimum,pgcnt_t maximum)3962d9166aeSJoshua M. Clulow clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum)
3977c478bd9Sstevel@tonic-gate {
39871cf2822SAndy Fiddaman 	if (value < minimum)
3992d9166aeSJoshua M. Clulow 		return (minimum);
40071cf2822SAndy Fiddaman 	else if (value > maximum)
4012d9166aeSJoshua M. Clulow 		return (maximum);
40271cf2822SAndy Fiddaman 	else
4032d9166aeSJoshua M. Clulow 		return (value);
4042d9166aeSJoshua M. Clulow }
4057c478bd9Sstevel@tonic-gate 
4062d9166aeSJoshua M. Clulow static pgcnt_t
tune(pgcnt_t initval,pgcnt_t initval_ceiling,pgcnt_t defval)4072d9166aeSJoshua M. Clulow tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval)
4082d9166aeSJoshua M. Clulow {
40971cf2822SAndy Fiddaman 	if (initval == 0 || initval >= initval_ceiling)
4102d9166aeSJoshua M. Clulow 		return (defval);
41171cf2822SAndy Fiddaman 	else
4122d9166aeSJoshua M. Clulow 		return (initval);
4132d9166aeSJoshua M. Clulow }
41471cf2822SAndy Fiddaman 
41571cf2822SAndy Fiddaman /*
41671cf2822SAndy Fiddaman  * On large memory systems, multiple instances of the page scanner are run,
41771cf2822SAndy Fiddaman  * each responsible for a separate region of memory. This speeds up page
41871cf2822SAndy Fiddaman  * invalidation under low memory conditions.
41971cf2822SAndy Fiddaman  *
42071cf2822SAndy Fiddaman  * For testing purposes, despagescanners can be set in /etc/system or via
42171cf2822SAndy Fiddaman  * mdb(1) and it will be used as a guide for how many page scanners to create;
42271cf2822SAndy Fiddaman  * the value will be adjusted if it is not sensible. Otherwise, the number of
42371cf2822SAndy Fiddaman  * page scanners is determined dynamically based on handspreadpages.
42471cf2822SAndy Fiddaman  */
42571cf2822SAndy Fiddaman static void
recalc_pagescanners(void)42671cf2822SAndy Fiddaman recalc_pagescanners(void)
42771cf2822SAndy Fiddaman {
42871cf2822SAndy Fiddaman 	uint_t des;
42971cf2822SAndy Fiddaman 
43071cf2822SAndy Fiddaman 	/* If the initial calibration has not been done, take no action. */
43171cf2822SAndy Fiddaman 	if (pageout_new_spread == 0)
43271cf2822SAndy Fiddaman 		return;
43371cf2822SAndy Fiddaman 
43471cf2822SAndy Fiddaman 	/*
43571cf2822SAndy Fiddaman 	 * If `clockinit.ci_despagescanners` is non-zero, then a value for
43671cf2822SAndy Fiddaman 	 * `despagescanners` was set during initial boot. In this case, if
43771cf2822SAndy Fiddaman 	 * `despagescanners` has been reset to 0 then we want to revert to
43871cf2822SAndy Fiddaman 	 * that initial boot value.
43971cf2822SAndy Fiddaman 	 */
44071cf2822SAndy Fiddaman 	if (despagescanners == 0)
44171cf2822SAndy Fiddaman 		despagescanners = clockinit.ci_despagescanners;
44271cf2822SAndy Fiddaman 
44371cf2822SAndy Fiddaman 	if (despagescanners != 0) {
44471cf2822SAndy Fiddaman 		/*
44571cf2822SAndy Fiddaman 		 * We have a desired number of page scanners, either from
44671cf2822SAndy Fiddaman 		 * /etc/system or set via mdb. Try and use it (it will be
44771cf2822SAndy Fiddaman 		 * adjusted below if necessary).
44871cf2822SAndy Fiddaman 		 */
44971cf2822SAndy Fiddaman 		des = despagescanners;
45071cf2822SAndy Fiddaman 	} else {
45171cf2822SAndy Fiddaman 		/*
45271cf2822SAndy Fiddaman 		 * Calculate the number of desired scanners based on the
45371cf2822SAndy Fiddaman 		 * system's memory size.
45471cf2822SAndy Fiddaman 		 *
45571cf2822SAndy Fiddaman 		 * A 64GiB region size is used as the basis for calculating how
45671cf2822SAndy Fiddaman 		 * many scanner threads should be created. For systems with up
45771cf2822SAndy Fiddaman 		 * to 64GiB of RAM, a single thread is used; for very large
45871cf2822SAndy Fiddaman 		 * memory systems the threads are limited to MAX_PSCAN_THREADS.
45971cf2822SAndy Fiddaman 		 */
46071cf2822SAndy Fiddaman 		des = (looppages - 1) / btop(64ULL << 30) + 1;
46171cf2822SAndy Fiddaman 	}
46271cf2822SAndy Fiddaman 
46371cf2822SAndy Fiddaman 	/*
46471cf2822SAndy Fiddaman 	 * Clamp the number of scanners so that we have no more than
46571cf2822SAndy Fiddaman 	 * MAX_PSCAN_THREADS and so that each scanner covers at least 10% more
46671cf2822SAndy Fiddaman 	 * than handspreadpages.
46771cf2822SAndy Fiddaman 	 */
46871cf2822SAndy Fiddaman 	pgcnt_t min_scanner_pages = handspreadpages + handspreadpages / 10;
46971cf2822SAndy Fiddaman 	pgcnt_t max_scanners = looppages / min_scanner_pages;
47071cf2822SAndy Fiddaman 	despagescanners = clamp(des, 1,
47171cf2822SAndy Fiddaman 	    clamp(max_scanners, 1, MAX_PSCAN_THREADS));
4727c478bd9Sstevel@tonic-gate }
4737c478bd9Sstevel@tonic-gate 
4747c478bd9Sstevel@tonic-gate /*
4752d9166aeSJoshua M. Clulow  * Set up the paging constants for the clock algorithm used by
4762d9166aeSJoshua M. Clulow  * pageout_scanner(), and by the virtual memory system overall.  See the
4772d9166aeSJoshua M. Clulow  * comments at the top of this file for more information about the threshold
4782d9166aeSJoshua M. Clulow  * values and system responses to memory pressure.
4792d9166aeSJoshua M. Clulow  *
4802d9166aeSJoshua M. Clulow  * This routine is called once by main() at startup, after the initial size of
4812d9166aeSJoshua M. Clulow  * physical memory is determined.  It may be called again later if memory is
4822d9166aeSJoshua M. Clulow  * added to or removed from the system, or if new measurements of the page scan
4832d9166aeSJoshua M. Clulow  * rate become available.
4847c478bd9Sstevel@tonic-gate  */
4852d9166aeSJoshua M. Clulow void
setupclock(void)4862d9166aeSJoshua M. Clulow setupclock(void)
4872d9166aeSJoshua M. Clulow {
4882d9166aeSJoshua M. Clulow 	bool half = (pageout_threshold_style == 1);
4892d9166aeSJoshua M. Clulow 	bool recalc = true;
4902d9166aeSJoshua M. Clulow 
4912d9166aeSJoshua M. Clulow 	looppages = total_pages;
4927c478bd9Sstevel@tonic-gate 
4937c478bd9Sstevel@tonic-gate 	/*
4942d9166aeSJoshua M. Clulow 	 * The operator may have provided specific values for some of the
4952d9166aeSJoshua M. Clulow 	 * tunables via /etc/system.  On our first call, we preserve those
4962d9166aeSJoshua M. Clulow 	 * values so that they can be used for subsequent recalculations.
4972d9166aeSJoshua M. Clulow 	 *
4982d9166aeSJoshua M. Clulow 	 * A value of zero for any tunable means we will use the default
4992d9166aeSJoshua M. Clulow 	 * sizing.
5007c478bd9Sstevel@tonic-gate 	 */
5012d9166aeSJoshua M. Clulow 	if (!clockinit.ci_init) {
5022d9166aeSJoshua M. Clulow 		clockinit.ci_init = true;
5032d9166aeSJoshua M. Clulow 
5042d9166aeSJoshua M. Clulow 		clockinit.ci_lotsfree_min = lotsfree_min;
5052d9166aeSJoshua M. Clulow 		clockinit.ci_lotsfree_max = lotsfree_max;
5062d9166aeSJoshua M. Clulow 		clockinit.ci_lotsfree = lotsfree;
5072d9166aeSJoshua M. Clulow 		clockinit.ci_desfree = desfree;
5082d9166aeSJoshua M. Clulow 		clockinit.ci_minfree = minfree;
5092d9166aeSJoshua M. Clulow 		clockinit.ci_throttlefree = throttlefree;
5102d9166aeSJoshua M. Clulow 		clockinit.ci_pageout_reserve = pageout_reserve;
5112d9166aeSJoshua M. Clulow 		clockinit.ci_maxpgio = maxpgio;
5122d9166aeSJoshua M. Clulow 		clockinit.ci_maxfastscan = maxfastscan;
5132d9166aeSJoshua M. Clulow 		clockinit.ci_fastscan = fastscan;
5142d9166aeSJoshua M. Clulow 		clockinit.ci_slowscan = slowscan;
5152d9166aeSJoshua M. Clulow 		clockinit.ci_handspreadpages = handspreadpages;
51671cf2822SAndy Fiddaman 		clockinit.ci_despagescanners = despagescanners;
5177c478bd9Sstevel@tonic-gate 
5187c478bd9Sstevel@tonic-gate 		/*
5192d9166aeSJoshua M. Clulow 		 * The first call does not trigger a recalculation, only
5202d9166aeSJoshua M. Clulow 		 * subsequent calls.
5217c478bd9Sstevel@tonic-gate 		 */
5222d9166aeSJoshua M. Clulow 		recalc = false;
5232d9166aeSJoshua M. Clulow 	}
5247c478bd9Sstevel@tonic-gate 
5257c478bd9Sstevel@tonic-gate 	/*
5262d9166aeSJoshua M. Clulow 	 * Configure paging threshold values.  For more details on what each
5272d9166aeSJoshua M. Clulow 	 * threshold signifies, see the comments at the top of this file.
5287c478bd9Sstevel@tonic-gate 	 */
5292d9166aeSJoshua M. Clulow 	lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages,
5302d9166aeSJoshua M. Clulow 	    btop(LOTSFREE_MAX_DEFAULT));
5312d9166aeSJoshua M. Clulow 	lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max,
5322d9166aeSJoshua M. Clulow 	    btop(LOTSFREE_MIN_DEFAULT));
5337c478bd9Sstevel@tonic-gate 
5342d9166aeSJoshua M. Clulow 	lotsfree = tune(clockinit.ci_lotsfree, looppages,
5352d9166aeSJoshua M. Clulow 	    clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max));
5367c478bd9Sstevel@tonic-gate 
5372d9166aeSJoshua M. Clulow 	desfree = tune(clockinit.ci_desfree, lotsfree,
5382d9166aeSJoshua M. Clulow 	    lotsfree / 2);
5392d9166aeSJoshua M. Clulow 
5402d9166aeSJoshua M. Clulow 	minfree = tune(clockinit.ci_minfree, desfree,
5412d9166aeSJoshua M. Clulow 	    half ? desfree / 2 : 3 * desfree / 4);
5422d9166aeSJoshua M. Clulow 
5432d9166aeSJoshua M. Clulow 	throttlefree = tune(clockinit.ci_throttlefree, desfree,
5442d9166aeSJoshua M. Clulow 	    minfree);
5452d9166aeSJoshua M. Clulow 
5462d9166aeSJoshua M. Clulow 	pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree,
5472d9166aeSJoshua M. Clulow 	    half ? throttlefree / 2 : 3 * throttlefree / 4);
5487c478bd9Sstevel@tonic-gate 
5497c478bd9Sstevel@tonic-gate 	/*
5507c478bd9Sstevel@tonic-gate 	 * Maxpgio thresholds how much paging is acceptable.
5517c478bd9Sstevel@tonic-gate 	 * This figures that 2/3 busy on an arm is all that is
5527c478bd9Sstevel@tonic-gate 	 * tolerable for paging.  We assume one operation per disk rev.
5537c478bd9Sstevel@tonic-gate 	 *
5547c478bd9Sstevel@tonic-gate 	 * XXX - Does not account for multiple swap devices.
5557c478bd9Sstevel@tonic-gate 	 */
5562d9166aeSJoshua M. Clulow 	if (clockinit.ci_maxpgio == 0) {
5577c478bd9Sstevel@tonic-gate 		maxpgio = (DISKRPM * 2) / 3;
5582d9166aeSJoshua M. Clulow 	} else {
5592d9166aeSJoshua M. Clulow 		maxpgio = clockinit.ci_maxpgio;
5602d9166aeSJoshua M. Clulow 	}
5617c478bd9Sstevel@tonic-gate 
5627c478bd9Sstevel@tonic-gate 	/*
5637c478bd9Sstevel@tonic-gate 	 * The clock scan rate varies between fastscan and slowscan
5647c478bd9Sstevel@tonic-gate 	 * based on the amount of free memory available.  Fastscan
5657c478bd9Sstevel@tonic-gate 	 * rate should be set based on the number pages that can be
5667c478bd9Sstevel@tonic-gate 	 * scanned per sec using ~10% of processor time.  Since this
5677c478bd9Sstevel@tonic-gate 	 * value depends on the processor, MMU, Mhz etc., it is
5687c478bd9Sstevel@tonic-gate 	 * difficult to determine it in a generic manner for all
5697c478bd9Sstevel@tonic-gate 	 * architectures.
5707c478bd9Sstevel@tonic-gate 	 *
5717c478bd9Sstevel@tonic-gate 	 * Instead of trying to determine the number of pages scanned
5727c478bd9Sstevel@tonic-gate 	 * per sec for every processor, fastscan is set to be the smaller
5737c478bd9Sstevel@tonic-gate 	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
5747c478bd9Sstevel@tonic-gate 	 * time is limited to ~4% of processor time.
5757c478bd9Sstevel@tonic-gate 	 *
5767c478bd9Sstevel@tonic-gate 	 * Setting fastscan to be 1/2 of memory allows pageout to scan
5777c478bd9Sstevel@tonic-gate 	 * all of memory in ~2 secs.  This implies that user pages not
5787c478bd9Sstevel@tonic-gate 	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
5797c478bd9Sstevel@tonic-gate 	 * can be reclaimed when free memory is very low.  Stealing pages
5807c478bd9Sstevel@tonic-gate 	 * not accessed within 1 sec seems reasonable and ensures that
5817c478bd9Sstevel@tonic-gate 	 * active user processes don't thrash.
5827c478bd9Sstevel@tonic-gate 	 *
5837c478bd9Sstevel@tonic-gate 	 * Smaller values of fastscan result in scanning fewer pages
5847c478bd9Sstevel@tonic-gate 	 * every second and consequently pageout may not be able to free
5857c478bd9Sstevel@tonic-gate 	 * sufficient memory to maintain the minimum threshold.  Larger
5867c478bd9Sstevel@tonic-gate 	 * values of fastscan result in scanning a lot more pages which
5877c478bd9Sstevel@tonic-gate 	 * could lead to thrashing and higher CPU usage.
5887c478bd9Sstevel@tonic-gate 	 *
5897c478bd9Sstevel@tonic-gate 	 * Fastscan needs to be limited to a maximum value and should not
5907c478bd9Sstevel@tonic-gate 	 * scale with memory to prevent pageout from consuming too much
5917c478bd9Sstevel@tonic-gate 	 * time for scanning on slow CPU's and avoid thrashing, as a
5927c478bd9Sstevel@tonic-gate 	 * result of scanning too many pages, on faster CPU's.
5937c478bd9Sstevel@tonic-gate 	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
5947c478bd9Sstevel@tonic-gate 	 * (the upper bound for fastscan) based on the average number
5957c478bd9Sstevel@tonic-gate 	 * of pages that can potentially be scanned in ~1 sec (using ~4%
5967c478bd9Sstevel@tonic-gate 	 * of the CPU) on some of the following machines that currently
5977c478bd9Sstevel@tonic-gate 	 * run Solaris 2.x:
5987c478bd9Sstevel@tonic-gate 	 *
5997c478bd9Sstevel@tonic-gate 	 *			average memory scanned in ~1 sec
6007c478bd9Sstevel@tonic-gate 	 *
6017c478bd9Sstevel@tonic-gate 	 *	25 Mhz SS1+:		23 Meg
6027c478bd9Sstevel@tonic-gate 	 *	LX:			37 Meg
6037c478bd9Sstevel@tonic-gate 	 *	50 Mhz SC2000:		68 Meg
6047c478bd9Sstevel@tonic-gate 	 *
6057c478bd9Sstevel@tonic-gate 	 *	40 Mhz 486:		26 Meg
6067c478bd9Sstevel@tonic-gate 	 *	66 Mhz 486:		42 Meg
6077c478bd9Sstevel@tonic-gate 	 *
6087c478bd9Sstevel@tonic-gate 	 * When free memory falls just below lotsfree, the scan rate
6097c478bd9Sstevel@tonic-gate 	 * goes from 0 to slowscan (i.e., pageout starts running).  This
6107c478bd9Sstevel@tonic-gate 	 * transition needs to be smooth and is achieved by ensuring that
6117c478bd9Sstevel@tonic-gate 	 * pageout scans a small number of pages to satisfy the transient
6127c478bd9Sstevel@tonic-gate 	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
6137c478bd9Sstevel@tonic-gate 	 * wakeup) since scanning that many pages has no noticible impact
6147c478bd9Sstevel@tonic-gate 	 * on system performance.
6157c478bd9Sstevel@tonic-gate 	 *
6167c478bd9Sstevel@tonic-gate 	 * In addition to setting fastscan and slowscan, pageout is
6177c478bd9Sstevel@tonic-gate 	 * limited to using ~4% of the CPU.  This results in increasing
6187c478bd9Sstevel@tonic-gate 	 * the time taken to scan all of memory, which in turn means that
6197c478bd9Sstevel@tonic-gate 	 * user processes have a better opportunity of preventing their
6207c478bd9Sstevel@tonic-gate 	 * pages from being stolen.  This has a positive effect on
6217c478bd9Sstevel@tonic-gate 	 * interactive and overall system performance when memory demand
6227c478bd9Sstevel@tonic-gate 	 * is high.
6237c478bd9Sstevel@tonic-gate 	 *
6247c478bd9Sstevel@tonic-gate 	 * Thus, the rate at which pages are scanned for replacement will
6257c478bd9Sstevel@tonic-gate 	 * vary linearly between slowscan and the number of pages that
6267c478bd9Sstevel@tonic-gate 	 * can be scanned using ~4% of processor time instead of varying
6277c478bd9Sstevel@tonic-gate 	 * linearly between slowscan and fastscan.
6287c478bd9Sstevel@tonic-gate 	 *
6297c478bd9Sstevel@tonic-gate 	 * Also, the processor time used by pageout will vary from ~1%
6307c478bd9Sstevel@tonic-gate 	 * at slowscan to ~4% at fastscan instead of varying between
6317c478bd9Sstevel@tonic-gate 	 * ~1% at slowscan and ~10% at fastscan.
6327c478bd9Sstevel@tonic-gate 	 *
6337c478bd9Sstevel@tonic-gate 	 * The values chosen for the various VM parameters (fastscan,
6347c478bd9Sstevel@tonic-gate 	 * handspreadpages, etc) are not universally true for all machines,
6357c478bd9Sstevel@tonic-gate 	 * but appear to be a good rule of thumb for the machines we've
6367c478bd9Sstevel@tonic-gate 	 * tested.  They have the following ranges:
6377c478bd9Sstevel@tonic-gate 	 *
6387c478bd9Sstevel@tonic-gate 	 *	cpu speed:	20 to 70 Mhz
6397c478bd9Sstevel@tonic-gate 	 *	page size:	4K to 8K
6407c478bd9Sstevel@tonic-gate 	 *	memory size:	16M to 5G
6417c478bd9Sstevel@tonic-gate 	 *	page scan rate:	4000 - 17400 4K pages per sec
6427c478bd9Sstevel@tonic-gate 	 *
6437c478bd9Sstevel@tonic-gate 	 * The values need to be re-examined for machines which don't
6447c478bd9Sstevel@tonic-gate 	 * fall into the various ranges (e.g., slower or faster CPUs,
6457c478bd9Sstevel@tonic-gate 	 * smaller or larger pagesizes etc) shown above.
6467c478bd9Sstevel@tonic-gate 	 *
6477c478bd9Sstevel@tonic-gate 	 * On an MP machine, pageout is often unable to maintain the
6487c478bd9Sstevel@tonic-gate 	 * minimum paging thresholds under heavy load.  This is due to
6497c478bd9Sstevel@tonic-gate 	 * the fact that user processes running on other CPU's can be
6507c478bd9Sstevel@tonic-gate 	 * dirtying memory at a much faster pace than pageout can find
6517c478bd9Sstevel@tonic-gate 	 * pages to free.  The memory demands could be met by enabling
6527c478bd9Sstevel@tonic-gate 	 * more than one CPU to run the clock algorithm in such a manner
6537c478bd9Sstevel@tonic-gate 	 * that the various clock hands don't overlap.  This also makes
6547c478bd9Sstevel@tonic-gate 	 * it more difficult to determine the values for fastscan, slowscan
6557c478bd9Sstevel@tonic-gate 	 * and handspreadpages.
6567c478bd9Sstevel@tonic-gate 	 *
6577c478bd9Sstevel@tonic-gate 	 * The swapper is currently used to free up memory when pageout
6587c478bd9Sstevel@tonic-gate 	 * is unable to meet memory demands by swapping out processes.
6597c478bd9Sstevel@tonic-gate 	 * In addition to freeing up memory, swapping also reduces the
6607c478bd9Sstevel@tonic-gate 	 * demand for memory by preventing user processes from running
6617c478bd9Sstevel@tonic-gate 	 * and thereby consuming memory.
6627c478bd9Sstevel@tonic-gate 	 */
6632d9166aeSJoshua M. Clulow 	if (clockinit.ci_maxfastscan == 0) {
6642d9166aeSJoshua M. Clulow 		if (pageout_new_spread != 0) {
6657c478bd9Sstevel@tonic-gate 			maxfastscan = pageout_new_spread;
6667c478bd9Sstevel@tonic-gate 		} else {
6672d9166aeSJoshua M. Clulow 			maxfastscan = MAXHANDSPREADPAGES;
6687c478bd9Sstevel@tonic-gate 		}
6692d9166aeSJoshua M. Clulow 	} else {
6702d9166aeSJoshua M. Clulow 		maxfastscan = clockinit.ci_maxfastscan;
6712d9166aeSJoshua M. Clulow 	}
6722d9166aeSJoshua M. Clulow 
6732d9166aeSJoshua M. Clulow 	if (clockinit.ci_fastscan == 0) {
6747c478bd9Sstevel@tonic-gate 		fastscan = MIN(looppages / loopfraction, maxfastscan);
6752d9166aeSJoshua M. Clulow 	} else {
6762d9166aeSJoshua M. Clulow 		fastscan = clockinit.ci_fastscan;
6772d9166aeSJoshua M. Clulow 	}
6782d9166aeSJoshua M. Clulow 
6792d9166aeSJoshua M. Clulow 	if (fastscan > looppages / loopfraction) {
6807c478bd9Sstevel@tonic-gate 		fastscan = looppages / loopfraction;
6812d9166aeSJoshua M. Clulow 	}
6827c478bd9Sstevel@tonic-gate 
6837c478bd9Sstevel@tonic-gate 	/*
6847c478bd9Sstevel@tonic-gate 	 * Set slow scan time to 1/10 the fast scan time, but
6857c478bd9Sstevel@tonic-gate 	 * not to exceed maxslowscan.
6867c478bd9Sstevel@tonic-gate 	 */
6872d9166aeSJoshua M. Clulow 	if (clockinit.ci_slowscan == 0) {
6887c478bd9Sstevel@tonic-gate 		slowscan = MIN(fastscan / 10, maxslowscan);
6892d9166aeSJoshua M. Clulow 	} else {
6902d9166aeSJoshua M. Clulow 		slowscan = clockinit.ci_slowscan;
6912d9166aeSJoshua M. Clulow 	}
6922d9166aeSJoshua M. Clulow 
6932d9166aeSJoshua M. Clulow 	if (slowscan > fastscan / 2) {
6947c478bd9Sstevel@tonic-gate 		slowscan = fastscan / 2;
6952d9166aeSJoshua M. Clulow 	}
6967c478bd9Sstevel@tonic-gate 
6977c478bd9Sstevel@tonic-gate 	/*
69871cf2822SAndy Fiddaman 	 * Handspreadpages is the distance (in pages) between front and back
6997c478bd9Sstevel@tonic-gate 	 * pageout daemon hands.  The amount of time to reclaim a page
7007c478bd9Sstevel@tonic-gate 	 * once pageout examines it increases with this distance and
7017c478bd9Sstevel@tonic-gate 	 * decreases as the scan rate rises. It must be < the amount
7027c478bd9Sstevel@tonic-gate 	 * of pageable memory.
7037c478bd9Sstevel@tonic-gate 	 *
7047c478bd9Sstevel@tonic-gate 	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
7057c478bd9Sstevel@tonic-gate 	 * to be "fastscan" results in the front hand being a few secs
7067c478bd9Sstevel@tonic-gate 	 * (varies based on the processor speed) ahead of the back hand
7077c478bd9Sstevel@tonic-gate 	 * at fastscan rates.  This distance can be further reduced, if
7087c478bd9Sstevel@tonic-gate 	 * necessary, by increasing the processor time used by pageout
7097c478bd9Sstevel@tonic-gate 	 * to be more than ~4% and preferrably not more than ~10%.
7107c478bd9Sstevel@tonic-gate 	 *
7117c478bd9Sstevel@tonic-gate 	 * As a result, user processes have a much better chance of
7127c478bd9Sstevel@tonic-gate 	 * referencing their pages before the back hand examines them.
7137c478bd9Sstevel@tonic-gate 	 * This also significantly lowers the number of reclaims from
7147c478bd9Sstevel@tonic-gate 	 * the freelist since pageout does not end up freeing pages which
7157c478bd9Sstevel@tonic-gate 	 * may be referenced a sec later.
7167c478bd9Sstevel@tonic-gate 	 */
7172d9166aeSJoshua M. Clulow 	if (clockinit.ci_handspreadpages == 0) {
7187c478bd9Sstevel@tonic-gate 		handspreadpages = fastscan;
7192d9166aeSJoshua M. Clulow 	} else {
7202d9166aeSJoshua M. Clulow 		handspreadpages = clockinit.ci_handspreadpages;
7212d9166aeSJoshua M. Clulow 	}
7227c478bd9Sstevel@tonic-gate 
7237c478bd9Sstevel@tonic-gate 	/*
7247c478bd9Sstevel@tonic-gate 	 * Make sure that back hand follows front hand by at least
7252d9166aeSJoshua M. Clulow 	 * 1/SCHEDPAGING_HZ seconds.  Without this test, it is possible for the
7262d9166aeSJoshua M. Clulow 	 * back hand to look at a page during the same wakeup of the pageout
7272d9166aeSJoshua M. Clulow 	 * daemon in which the front hand cleared its ref bit.
7287c478bd9Sstevel@tonic-gate 	 */
7292d9166aeSJoshua M. Clulow 	if (handspreadpages >= looppages) {
7307c478bd9Sstevel@tonic-gate 		handspreadpages = looppages - 1;
7312d9166aeSJoshua M. Clulow 	}
7327c478bd9Sstevel@tonic-gate 
7337c478bd9Sstevel@tonic-gate 	/*
73471cf2822SAndy Fiddaman 	 * Establish the minimum and maximum length of time to be spent
73571cf2822SAndy Fiddaman 	 * scanning pages per wakeup, limiting the scanner duty cycle. The
73671cf2822SAndy Fiddaman 	 * input percentage values (0-100) must be converted to a fraction of
73771cf2822SAndy Fiddaman 	 * the number of nanoseconds in a second of wall time, then further
73871cf2822SAndy Fiddaman 	 * scaled down by the number of scanner wakeups in a second.
7397c478bd9Sstevel@tonic-gate 	 */
74071cf2822SAndy Fiddaman 	min_pageout_nsec = MAX(1,
74171cf2822SAndy Fiddaman 	    NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ);
74271cf2822SAndy Fiddaman 	max_pageout_nsec = MAX(min_pageout_nsec,
74371cf2822SAndy Fiddaman 	    NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ);
7447c478bd9Sstevel@tonic-gate 
7457c478bd9Sstevel@tonic-gate 	/*
74671cf2822SAndy Fiddaman 	 * If not called for recalculation, return and skip the remaining
74771cf2822SAndy Fiddaman 	 * steps.
7487c478bd9Sstevel@tonic-gate 	 */
74971cf2822SAndy Fiddaman 	if (!recalc)
75071cf2822SAndy Fiddaman 		return;
7517c478bd9Sstevel@tonic-gate 
75271cf2822SAndy Fiddaman 	/*
75371cf2822SAndy Fiddaman 	 * Set a flag to re-evaluate the clock hand positions.
75471cf2822SAndy Fiddaman 	 */
75571cf2822SAndy Fiddaman 	for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++)
75671cf2822SAndy Fiddaman 		reset_hands[i] = true;
75771cf2822SAndy Fiddaman 
75871cf2822SAndy Fiddaman 	recalc_pagescanners();
75971cf2822SAndy Fiddaman }
76071cf2822SAndy Fiddaman 
76171cf2822SAndy Fiddaman static kmutex_t	pageout_mutex;
7627c478bd9Sstevel@tonic-gate 
7637c478bd9Sstevel@tonic-gate /*
7647c478bd9Sstevel@tonic-gate  * Pool of available async pageout putpage requests.
7657c478bd9Sstevel@tonic-gate  */
7667c478bd9Sstevel@tonic-gate static struct async_reqs *push_req;
7677c478bd9Sstevel@tonic-gate static struct async_reqs *req_freelist;	/* available req structs */
7687c478bd9Sstevel@tonic-gate static struct async_reqs *push_list;	/* pending reqs */
7697c478bd9Sstevel@tonic-gate static kmutex_t push_lock;		/* protects req pool */
7707c478bd9Sstevel@tonic-gate static kcondvar_t push_cv;
7717c478bd9Sstevel@tonic-gate 
772727737b4SJoshua M. Clulow /*
773727737b4SJoshua M. Clulow  * If pageout() is stuck on a single push for this many seconds,
774727737b4SJoshua M. Clulow  * pageout_deadman() will assume the system has hit a memory deadlock.  If set
775727737b4SJoshua M. Clulow  * to 0, the deadman will have no effect.
776727737b4SJoshua M. Clulow  *
777727737b4SJoshua M. Clulow  * Note that we are only looking for stalls in the calls that pageout() makes
778727737b4SJoshua M. Clulow  * to VOP_PUTPAGE().  These calls are merely asynchronous requests for paging
779727737b4SJoshua M. Clulow  * I/O, which should not take long unless the underlying strategy call blocks
780727737b4SJoshua M. Clulow  * indefinitely for memory.  The actual I/O request happens (or fails) later.
781727737b4SJoshua M. Clulow  */
782727737b4SJoshua M. Clulow uint_t pageout_deadman_seconds = 90;
783727737b4SJoshua M. Clulow 
784727737b4SJoshua M. Clulow static uint_t pageout_stucktime = 0;
785727737b4SJoshua M. Clulow static bool pageout_pushing = false;
786727737b4SJoshua M. Clulow static uint64_t pageout_pushcount = 0;
787727737b4SJoshua M. Clulow static uint64_t pageout_pushcount_seen = 0;
788727737b4SJoshua M. Clulow 
78971cf2822SAndy Fiddaman int async_list_size = 8192;
7907c478bd9Sstevel@tonic-gate 
79171cf2822SAndy Fiddaman static void pageout_scanner(void *);
7927c478bd9Sstevel@tonic-gate 
7937c478bd9Sstevel@tonic-gate /*
7947c478bd9Sstevel@tonic-gate  * If a page is being shared more than "po_share" times
7957c478bd9Sstevel@tonic-gate  * then leave it alone- don't page it out.
7967c478bd9Sstevel@tonic-gate  */
7977c478bd9Sstevel@tonic-gate #define	MIN_PO_SHARE	(8)
7987c478bd9Sstevel@tonic-gate #define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
7997c478bd9Sstevel@tonic-gate ulong_t	po_share = MIN_PO_SHARE;
8007c478bd9Sstevel@tonic-gate 
8017c478bd9Sstevel@tonic-gate /*
8027c478bd9Sstevel@tonic-gate  * Schedule rate for paging.
8037c478bd9Sstevel@tonic-gate  * Rate is linear interpolation between
8047c478bd9Sstevel@tonic-gate  * slowscan with lotsfree and fastscan when out of memory.
8057c478bd9Sstevel@tonic-gate  */
8067c478bd9Sstevel@tonic-gate static void
schedpaging(void * arg)8077c478bd9Sstevel@tonic-gate schedpaging(void *arg)
8087c478bd9Sstevel@tonic-gate {
8097c478bd9Sstevel@tonic-gate 	spgcnt_t vavail;
8107c478bd9Sstevel@tonic-gate 
8117c478bd9Sstevel@tonic-gate 	if (freemem < lotsfree + needfree + kmem_reapahead)
8127c478bd9Sstevel@tonic-gate 		kmem_reap();
8137c478bd9Sstevel@tonic-gate 
814a98e9dbfSaguzovsk 	if (freemem < lotsfree + needfree)
8157c478bd9Sstevel@tonic-gate 		seg_preap();
8167c478bd9Sstevel@tonic-gate 
8177c478bd9Sstevel@tonic-gate 	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
8187c478bd9Sstevel@tonic-gate 		kcage_cageout_wakeup();
8197c478bd9Sstevel@tonic-gate 
8207c478bd9Sstevel@tonic-gate 	if (mutex_tryenter(&pageout_mutex)) {
82171cf2822SAndy Fiddaman 		if (pageouts_running != 0)
82271cf2822SAndy Fiddaman 			goto out;
82371cf2822SAndy Fiddaman 
82471cf2822SAndy Fiddaman 		/* No pageout scanner threads running. */
8257c478bd9Sstevel@tonic-gate 		nscan = 0;
8267c478bd9Sstevel@tonic-gate 		vavail = freemem - deficit;
82706cfbf35Sjimp 		if (pageout_new_spread != 0)
82806cfbf35Sjimp 			vavail -= needfree;
82971cf2822SAndy Fiddaman 		/* Note that vavail is signed so don't use clamp() here */
8307c478bd9Sstevel@tonic-gate 		if (vavail < 0)
8317c478bd9Sstevel@tonic-gate 			vavail = 0;
8327c478bd9Sstevel@tonic-gate 		if (vavail > lotsfree)
8337c478bd9Sstevel@tonic-gate 			vavail = lotsfree;
8347c478bd9Sstevel@tonic-gate 
8352d9166aeSJoshua M. Clulow 		if (needfree > 0 && pageout_new_spread == 0) {
83606cfbf35Sjimp 			/*
83706cfbf35Sjimp 			 * If we've not yet collected enough samples to
83806cfbf35Sjimp 			 * calculate a spread, use the old logic of kicking
83906cfbf35Sjimp 			 * into high gear anytime needfree is non-zero.
84006cfbf35Sjimp 			 */
8412d9166aeSJoshua M. Clulow 			desscan = fastscan / SCHEDPAGING_HZ;
8427c478bd9Sstevel@tonic-gate 		} else {
84306cfbf35Sjimp 			/*
84406cfbf35Sjimp 			 * Once we've calculated a spread based on system
84506cfbf35Sjimp 			 * memory and usage, just treat needfree as another
84606cfbf35Sjimp 			 * form of deficit.
84706cfbf35Sjimp 			 */
8487c478bd9Sstevel@tonic-gate 			spgcnt_t faststmp, slowstmp, result;
8497c478bd9Sstevel@tonic-gate 
8507c478bd9Sstevel@tonic-gate 			slowstmp = slowscan * vavail;
8517c478bd9Sstevel@tonic-gate 			faststmp = fastscan * (lotsfree - vavail);
8527c478bd9Sstevel@tonic-gate 			result = (slowstmp + faststmp) /
8532d9166aeSJoshua M. Clulow 			    nz(lotsfree) / SCHEDPAGING_HZ;
8547c478bd9Sstevel@tonic-gate 			desscan = (pgcnt_t)result;
8557c478bd9Sstevel@tonic-gate 		}
8567c478bd9Sstevel@tonic-gate 
8572d9166aeSJoshua M. Clulow 		pageout_nsec = min_pageout_nsec + (lotsfree - vavail) *
8582d9166aeSJoshua M. Clulow 		    (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree);
8597c478bd9Sstevel@tonic-gate 
86071cf2822SAndy Fiddaman 		DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t,
86171cf2822SAndy Fiddaman 		    pageout_nsec);
86271cf2822SAndy Fiddaman 
86371cf2822SAndy Fiddaman 		if (pageout_new_spread != 0 && despagescanners != 0 &&
86471cf2822SAndy Fiddaman 		    despagescanners != n_page_scanners) {
8652d9166aeSJoshua M. Clulow 			/*
86671cf2822SAndy Fiddaman 			 * We have finished the pagescan initialisation and the
86771cf2822SAndy Fiddaman 			 * desired number of page scanners has changed, either
86871cf2822SAndy Fiddaman 			 * because sampling just finished, because of a memory
86971cf2822SAndy Fiddaman 			 * DR, or because despagescanners has been modified on
87071cf2822SAndy Fiddaman 			 * the fly (e.g. via mdb(1)).
8712d9166aeSJoshua M. Clulow 			 */
87271cf2822SAndy Fiddaman 			uint_t curr_nscan = n_page_scanners;
87371cf2822SAndy Fiddaman 			uint_t i;
87471cf2822SAndy Fiddaman 
87571cf2822SAndy Fiddaman 			/* Re-validate despagescanners */
87671cf2822SAndy Fiddaman 			recalc_pagescanners();
87771cf2822SAndy Fiddaman 
87871cf2822SAndy Fiddaman 			n_page_scanners = despagescanners;
87971cf2822SAndy Fiddaman 
88071cf2822SAndy Fiddaman 			for (i = 0; i < MAX_PSCAN_THREADS; i++)
88171cf2822SAndy Fiddaman 				reset_hands[i] = true;
88271cf2822SAndy Fiddaman 
88371cf2822SAndy Fiddaman 			/* If we need more scanners, start them now. */
88471cf2822SAndy Fiddaman 			for (i = curr_nscan; i < n_page_scanners; i++) {
88571cf2822SAndy Fiddaman 				(void) lwp_kernel_create(proc_pageout,
88671cf2822SAndy Fiddaman 				    pageout_scanner, (void *)(uintptr_t)i,
88771cf2822SAndy Fiddaman 				    TS_RUN, curthread->t_pri);
88871cf2822SAndy Fiddaman 			}
88971cf2822SAndy Fiddaman 
89071cf2822SAndy Fiddaman 			/*
89171cf2822SAndy Fiddaman 			 * If the number of scanners has decreased, trigger a
89271cf2822SAndy Fiddaman 			 * wakeup so that the excess threads will terminate.
89371cf2822SAndy Fiddaman 			 */
89471cf2822SAndy Fiddaman 			if (n_page_scanners < curr_nscan) {
89571cf2822SAndy Fiddaman 				WAKE_PAGEOUT_SCANNER(reducing);
89671cf2822SAndy Fiddaman 			}
89771cf2822SAndy Fiddaman 		}
89871cf2822SAndy Fiddaman 
89971cf2822SAndy Fiddaman 		if (pageout_sampling) {
90071cf2822SAndy Fiddaman 			/*
90171cf2822SAndy Fiddaman 			 * We still need to measure the rate at which the
90271cf2822SAndy Fiddaman 			 * system is able to scan pages of memory. Each of
90371cf2822SAndy Fiddaman 			 * these initial samples is a scan of as much system
90471cf2822SAndy Fiddaman 			 * memory as practical, regardless of whether or not we
90571cf2822SAndy Fiddaman 			 * are experiencing memory pressure.
90671cf2822SAndy Fiddaman 			 */
90771cf2822SAndy Fiddaman 			desscan = total_pages;
90871cf2822SAndy Fiddaman 			pageout_nsec = max_pageout_nsec;
90971cf2822SAndy Fiddaman 
91071cf2822SAndy Fiddaman 			WAKE_PAGEOUT_SCANNER(sampling);
91171cf2822SAndy Fiddaman 		} else if (freemem < lotsfree + needfree) {
91271cf2822SAndy Fiddaman 			/*
91371cf2822SAndy Fiddaman 			 * We need more memory.
91471cf2822SAndy Fiddaman 			 */
915*338664dfSAndy Fiddaman 			low_mem_scan++;
91671cf2822SAndy Fiddaman 			WAKE_PAGEOUT_SCANNER(lowmem);
9177c478bd9Sstevel@tonic-gate 		} else {
9187c478bd9Sstevel@tonic-gate 			/*
9197c478bd9Sstevel@tonic-gate 			 * There are enough free pages, no need to
92071cf2822SAndy Fiddaman 			 * kick the scanner threads.  And next time
9217c478bd9Sstevel@tonic-gate 			 * around, keep more of the `highly shared'
9227c478bd9Sstevel@tonic-gate 			 * pages.
9237c478bd9Sstevel@tonic-gate 			 */
9247c478bd9Sstevel@tonic-gate 			cv_signal_pageout();
92571cf2822SAndy Fiddaman 			if (po_share > MIN_PO_SHARE)
9267c478bd9Sstevel@tonic-gate 				po_share >>= 1;
9277c478bd9Sstevel@tonic-gate 		}
92871cf2822SAndy Fiddaman out:
9297c478bd9Sstevel@tonic-gate 		mutex_exit(&pageout_mutex);
9307c478bd9Sstevel@tonic-gate 	}
9317c478bd9Sstevel@tonic-gate 
9327c478bd9Sstevel@tonic-gate 	/*
9337c478bd9Sstevel@tonic-gate 	 * Signal threads waiting for available memory.
9347c478bd9Sstevel@tonic-gate 	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
93571cf2822SAndy Fiddaman 	 * in this case it is not needed - the waiters will be woken up during
9367c478bd9Sstevel@tonic-gate 	 * the next invocation of this function.
9377c478bd9Sstevel@tonic-gate 	 */
9387c478bd9Sstevel@tonic-gate 	if (kmem_avail() > 0)
9397c478bd9Sstevel@tonic-gate 		cv_broadcast(&memavail_cv);
9407c478bd9Sstevel@tonic-gate 
9412d9166aeSJoshua M. Clulow 	(void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ);
9427c478bd9Sstevel@tonic-gate }
9437c478bd9Sstevel@tonic-gate 
9447c478bd9Sstevel@tonic-gate pgcnt_t		pushes;
9457c478bd9Sstevel@tonic-gate ulong_t		push_list_size;		/* # of requests on pageout queue */
9467c478bd9Sstevel@tonic-gate 
9472d9166aeSJoshua M. Clulow /*
9482d9166aeSJoshua M. Clulow  * Paging out should always be enabled.  This tunable exists to hold pageout
9492d9166aeSJoshua M. Clulow  * for debugging purposes.  If set to 0, pageout_scanner() will go back to
9502d9166aeSJoshua M. Clulow  * sleep each time it is woken by schedpaging().
9512d9166aeSJoshua M. Clulow  */
9522d9166aeSJoshua M. Clulow uint_t dopageout = 1;
9537c478bd9Sstevel@tonic-gate 
9547c478bd9Sstevel@tonic-gate /*
9557c478bd9Sstevel@tonic-gate  * The page out daemon, which runs as process 2.
9567c478bd9Sstevel@tonic-gate  *
95771cf2822SAndy Fiddaman  * The daemon treats physical memory as a circular array of pages and scans
95871cf2822SAndy Fiddaman  * the pages using a 'two-handed clock' algorithm. The front hand moves
95971cf2822SAndy Fiddaman  * through the pages, clearing the reference bit. The back hand travels a
96071cf2822SAndy Fiddaman  * distance (handspreadpages) behind the front hand, freeing the pages that
96171cf2822SAndy Fiddaman  * have not been referenced in the time since the front hand passed. If
96271cf2822SAndy Fiddaman  * modified, they are first written to their backing store before being
96371cf2822SAndy Fiddaman  * freed.
9647c478bd9Sstevel@tonic-gate  *
96571cf2822SAndy Fiddaman  * In order to make page invalidation more responsive on machines with
96671cf2822SAndy Fiddaman  * larger memory, multiple pageout_scanner threads may be created. In this
96771cf2822SAndy Fiddaman  * case, each thread is given a segment of the memory "clock face" so that
96871cf2822SAndy Fiddaman  * memory can be reclaimed more quickly. As long as there are at least lotsfree
96971cf2822SAndy Fiddaman  * pages, then pageout_scanner threads are not run.
9707c478bd9Sstevel@tonic-gate  *
 * There are multiple threads that act on behalf of the pageout process. A
 * set of threads scan pages (pageout_scanner) and free them up if they
 * don't require any VOP_PUTPAGE operation. If a page must be written back
97471cf2822SAndy Fiddaman  * to its backing store, the request is put on a list and the other
97571cf2822SAndy Fiddaman  * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE
97671cf2822SAndy Fiddaman  * requests from the list, and processes them. Some filesystems may require
97771cf2822SAndy Fiddaman  * resources for the VOP_PUTPAGE operations (like memory) and hence can
97871cf2822SAndy Fiddaman  * block the pageout thread, but the scanner thread can still operate.
97971cf2822SAndy Fiddaman  * There is still no guarantee that memory deadlocks cannot occur.
9807c478bd9Sstevel@tonic-gate  */
9817c478bd9Sstevel@tonic-gate void
pageout()9827c478bd9Sstevel@tonic-gate pageout()
9837c478bd9Sstevel@tonic-gate {
9847c478bd9Sstevel@tonic-gate 	struct async_reqs *arg;
9857c478bd9Sstevel@tonic-gate 	pri_t pageout_pri;
9867c478bd9Sstevel@tonic-gate 	int i;
9877c478bd9Sstevel@tonic-gate 	pgcnt_t max_pushes;
9887c478bd9Sstevel@tonic-gate 	callb_cpr_t cprinfo;
9897c478bd9Sstevel@tonic-gate 
9907c478bd9Sstevel@tonic-gate 	proc_pageout = ttoproc(curthread);
9917c478bd9Sstevel@tonic-gate 	proc_pageout->p_cstime = 0;
9927c478bd9Sstevel@tonic-gate 	proc_pageout->p_stime =  0;
9937c478bd9Sstevel@tonic-gate 	proc_pageout->p_cutime =  0;
9947c478bd9Sstevel@tonic-gate 	proc_pageout->p_utime = 0;
995ae115bc7Smrj 	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
996ae115bc7Smrj 	bcopy("pageout", PTOU(curproc)->u_comm, 7);
9977c478bd9Sstevel@tonic-gate 
9987c478bd9Sstevel@tonic-gate 	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
9997c478bd9Sstevel@tonic-gate 	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
10007c478bd9Sstevel@tonic-gate 
10017c478bd9Sstevel@tonic-gate 	/*
100271cf2822SAndy Fiddaman 	 * Allocate and initialize the async request structures for pageout.
10037c478bd9Sstevel@tonic-gate 	 */
10047c478bd9Sstevel@tonic-gate 	push_req = (struct async_reqs *)
10057c478bd9Sstevel@tonic-gate 	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
10067c478bd9Sstevel@tonic-gate 
10077c478bd9Sstevel@tonic-gate 	req_freelist = push_req;
10082d9166aeSJoshua M. Clulow 	for (i = 0; i < async_list_size - 1; i++) {
10097c478bd9Sstevel@tonic-gate 		push_req[i].a_next = &push_req[i + 1];
10102d9166aeSJoshua M. Clulow 	}
10117c478bd9Sstevel@tonic-gate 
10127c478bd9Sstevel@tonic-gate 	pageout_pri = curthread->t_pri;
101335a5a358SJonathan Adams 
101471cf2822SAndy Fiddaman 	/* Create the first pageout scanner thread. */
101571cf2822SAndy Fiddaman 	(void) lwp_kernel_create(proc_pageout, pageout_scanner,
101671cf2822SAndy Fiddaman 	    (void *)0,	/* this is instance 0, not NULL */
101771cf2822SAndy Fiddaman 	    TS_RUN, pageout_pri - 1);
10187c478bd9Sstevel@tonic-gate 
10197c478bd9Sstevel@tonic-gate 	/*
102071cf2822SAndy Fiddaman 	 * kick off the pageout scheduler.
10217c478bd9Sstevel@tonic-gate 	 */
10227c478bd9Sstevel@tonic-gate 	schedpaging(NULL);
10237c478bd9Sstevel@tonic-gate 
10247c478bd9Sstevel@tonic-gate 	/*
10257c478bd9Sstevel@tonic-gate 	 * Create kernel cage thread.
10267c478bd9Sstevel@tonic-gate 	 * The kernel cage thread is started under the pageout process
10277c478bd9Sstevel@tonic-gate 	 * to take advantage of the less restricted page allocation
10287c478bd9Sstevel@tonic-gate 	 * in page_create_throttle().
10297c478bd9Sstevel@tonic-gate 	 */
10307c478bd9Sstevel@tonic-gate 	kcage_cageout_init();
10317c478bd9Sstevel@tonic-gate 
10327c478bd9Sstevel@tonic-gate 	/*
10337c478bd9Sstevel@tonic-gate 	 * Limit pushes to avoid saturating pageout devices.
10347c478bd9Sstevel@tonic-gate 	 */
10352d9166aeSJoshua M. Clulow 	max_pushes = maxpgio / SCHEDPAGING_HZ;
10367c478bd9Sstevel@tonic-gate 	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
10377c478bd9Sstevel@tonic-gate 
10387c478bd9Sstevel@tonic-gate 	for (;;) {
10397c478bd9Sstevel@tonic-gate 		mutex_enter(&push_lock);
10407c478bd9Sstevel@tonic-gate 
10417c478bd9Sstevel@tonic-gate 		while ((arg = push_list) == NULL || pushes > max_pushes) {
10427c478bd9Sstevel@tonic-gate 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
10437c478bd9Sstevel@tonic-gate 			cv_wait(&push_cv, &push_lock);
10447c478bd9Sstevel@tonic-gate 			pushes = 0;
10457c478bd9Sstevel@tonic-gate 			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
10467c478bd9Sstevel@tonic-gate 		}
10477c478bd9Sstevel@tonic-gate 		push_list = arg->a_next;
10487c478bd9Sstevel@tonic-gate 		arg->a_next = NULL;
1049727737b4SJoshua M. Clulow 		pageout_pushing = true;
10507c478bd9Sstevel@tonic-gate 		mutex_exit(&push_lock);
10517c478bd9Sstevel@tonic-gate 
105271cf2822SAndy Fiddaman 		DTRACE_PROBE(pageout__push);
105371cf2822SAndy Fiddaman 
10547c478bd9Sstevel@tonic-gate 		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
105506cfbf35Sjimp 		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
10567c478bd9Sstevel@tonic-gate 			pushes++;
10577c478bd9Sstevel@tonic-gate 		}
10587c478bd9Sstevel@tonic-gate 
10597c478bd9Sstevel@tonic-gate 		/* vp held by checkpage() */
10607c478bd9Sstevel@tonic-gate 		VN_RELE(arg->a_vp);
10617c478bd9Sstevel@tonic-gate 
10627c478bd9Sstevel@tonic-gate 		mutex_enter(&push_lock);
1063727737b4SJoshua M. Clulow 		pageout_pushing = false;
1064727737b4SJoshua M. Clulow 		pageout_pushcount++;
10657c478bd9Sstevel@tonic-gate 		arg->a_next = req_freelist;	/* back on freelist */
10667c478bd9Sstevel@tonic-gate 		req_freelist = arg;
10677c478bd9Sstevel@tonic-gate 		push_list_size--;
10687c478bd9Sstevel@tonic-gate 		mutex_exit(&push_lock);
10697c478bd9Sstevel@tonic-gate 	}
10707c478bd9Sstevel@tonic-gate }
10717c478bd9Sstevel@tonic-gate 
107271cf2822SAndy Fiddaman static void
pageout_sample_add(pgcnt_t count,hrtime_t elapsed)107371cf2822SAndy Fiddaman pageout_sample_add(pgcnt_t count, hrtime_t elapsed)
107471cf2822SAndy Fiddaman {
107571cf2822SAndy Fiddaman 	VERIFY(pageout_sampling);
107671cf2822SAndy Fiddaman 
107771cf2822SAndy Fiddaman 	/*
107871cf2822SAndy Fiddaman 	 * The global variables used below are only modified during initial
107971cf2822SAndy Fiddaman 	 * scanning when there is a single page scanner thread running.
108071cf2822SAndy Fiddaman 	 */
108171cf2822SAndy Fiddaman 	pageout_sample_pages += count;
108271cf2822SAndy Fiddaman 	pageout_sample_etime += elapsed;
108371cf2822SAndy Fiddaman 	pageout_sample_cnt++;
108471cf2822SAndy Fiddaman 
108571cf2822SAndy Fiddaman 	if (pageout_sample_cnt >= pageout_sample_lim) {
108671cf2822SAndy Fiddaman 		/*
108771cf2822SAndy Fiddaman 		 * We have enough samples, set the spread.
108871cf2822SAndy Fiddaman 		 */
108971cf2822SAndy Fiddaman 		pageout_sampling = false;
109071cf2822SAndy Fiddaman 		pageout_rate = (hrrate_t)pageout_sample_pages *
109171cf2822SAndy Fiddaman 		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
109271cf2822SAndy Fiddaman 		pageout_new_spread = pageout_rate / 10;
109371cf2822SAndy Fiddaman 	}
109471cf2822SAndy Fiddaman }
109571cf2822SAndy Fiddaman 
109671cf2822SAndy Fiddaman static inline page_t *
wrapping_page_next(page_t * cur,page_t * start,page_t * end)109771cf2822SAndy Fiddaman wrapping_page_next(page_t *cur, page_t *start, page_t *end)
109871cf2822SAndy Fiddaman {
109971cf2822SAndy Fiddaman 	if (cur == end)
110071cf2822SAndy Fiddaman 		return (start);
110171cf2822SAndy Fiddaman 	return (page_nextn(cur, 1));
110271cf2822SAndy Fiddaman }
110371cf2822SAndy Fiddaman 
11047c478bd9Sstevel@tonic-gate /*
11057c478bd9Sstevel@tonic-gate  * Kernel thread that scans pages looking for ones to free
11067c478bd9Sstevel@tonic-gate  */
11077c478bd9Sstevel@tonic-gate static void
pageout_scanner(void * a)110871cf2822SAndy Fiddaman pageout_scanner(void *a)
11097c478bd9Sstevel@tonic-gate {
111071cf2822SAndy Fiddaman 	page_t *fhand, *bhand, *fhandstart;
111171cf2822SAndy Fiddaman 	page_t *regionstart, *regionend;
11122d9166aeSJoshua M. Clulow 	uint_t laps;
11137c478bd9Sstevel@tonic-gate 	callb_cpr_t cprinfo;
111471cf2822SAndy Fiddaman 	pgcnt_t	nscan_cnt;
11157c478bd9Sstevel@tonic-gate 	pgcnt_t	pcount;
111671cf2822SAndy Fiddaman 	hrtime_t sample_start, sample_end;
111771cf2822SAndy Fiddaman 	uint_t inst = (uint_t)(uintptr_t)a;
111871cf2822SAndy Fiddaman 
111971cf2822SAndy Fiddaman 	VERIFY3U(inst, <, MAX_PSCAN_THREADS);
11207c478bd9Sstevel@tonic-gate 
11217c478bd9Sstevel@tonic-gate 	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
11227c478bd9Sstevel@tonic-gate 	mutex_enter(&pageout_mutex);
11237c478bd9Sstevel@tonic-gate 
11247c478bd9Sstevel@tonic-gate 	/*
11257c478bd9Sstevel@tonic-gate 	 * The restart case does not attempt to point the hands at roughly
11267c478bd9Sstevel@tonic-gate 	 * the right point on the assumption that after one circuit things
11272d9166aeSJoshua M. Clulow 	 * will have settled down, and restarts shouldn't be that often.
11287c478bd9Sstevel@tonic-gate 	 */
112971cf2822SAndy Fiddaman 	reset_hands[inst] = true;
11307c478bd9Sstevel@tonic-gate 
113171cf2822SAndy Fiddaman 	pageouts_running++;
113271cf2822SAndy Fiddaman 	mutex_exit(&pageout_mutex);
11337c478bd9Sstevel@tonic-gate 
11347c478bd9Sstevel@tonic-gate loop:
11357c478bd9Sstevel@tonic-gate 	cv_signal_pageout();
11367c478bd9Sstevel@tonic-gate 
113771cf2822SAndy Fiddaman 	mutex_enter(&pageout_mutex);
113871cf2822SAndy Fiddaman 	pageouts_running--;
11397c478bd9Sstevel@tonic-gate 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
11407c478bd9Sstevel@tonic-gate 	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
11417c478bd9Sstevel@tonic-gate 	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
114271cf2822SAndy Fiddaman 	pageouts_running++;
114371cf2822SAndy Fiddaman 	mutex_exit(&pageout_mutex);
11447c478bd9Sstevel@tonic-gate 
11452d9166aeSJoshua M. Clulow 	/*
114671cf2822SAndy Fiddaman 	 * Check if pageout has been disabled for debugging purposes.
11472d9166aeSJoshua M. Clulow 	 */
114871cf2822SAndy Fiddaman 	if (dopageout == 0)
11497c478bd9Sstevel@tonic-gate 		goto loop;
115071cf2822SAndy Fiddaman 
115171cf2822SAndy Fiddaman 	/*
115271cf2822SAndy Fiddaman 	 * One may reset the clock hands and scanned region for debugging
115371cf2822SAndy Fiddaman 	 * purposes. Hands will also be reset on first thread startup, if
115471cf2822SAndy Fiddaman 	 * the number of scanning threads (n_page_scanners) changes, or if
115571cf2822SAndy Fiddaman 	 * memory is added to, or removed from, the system.
115671cf2822SAndy Fiddaman 	 */
115771cf2822SAndy Fiddaman 	if (reset_hands[inst]) {
115871cf2822SAndy Fiddaman 		page_t *first;
115971cf2822SAndy Fiddaman 
116071cf2822SAndy Fiddaman 		reset_hands[inst] = false;
116171cf2822SAndy Fiddaman 
116271cf2822SAndy Fiddaman 		if (inst >= n_page_scanners) {
116371cf2822SAndy Fiddaman 			/*
116471cf2822SAndy Fiddaman 			 * The desired number of page scanners has been
116571cf2822SAndy Fiddaman 			 * reduced and this instance is no longer wanted.
116671cf2822SAndy Fiddaman 			 * Exit the lwp.
116771cf2822SAndy Fiddaman 			 */
116871cf2822SAndy Fiddaman 			VERIFY3U(inst, !=, 0);
116971cf2822SAndy Fiddaman 			DTRACE_PROBE1(pageout__exit, uint_t, inst);
117071cf2822SAndy Fiddaman 			mutex_enter(&pageout_mutex);
117171cf2822SAndy Fiddaman 			pageouts_running--;
117271cf2822SAndy Fiddaman 			mutex_exit(&pageout_mutex);
117371cf2822SAndy Fiddaman 			mutex_enter(&curproc->p_lock);
117471cf2822SAndy Fiddaman 			lwp_exit();
117571cf2822SAndy Fiddaman 			/* NOTREACHED */
117671cf2822SAndy Fiddaman 		}
117771cf2822SAndy Fiddaman 
117871cf2822SAndy Fiddaman 		first = page_first();
117971cf2822SAndy Fiddaman 
118071cf2822SAndy Fiddaman 		/*
118171cf2822SAndy Fiddaman 		 * Each scanner thread gets its own sector of the memory
118271cf2822SAndy Fiddaman 		 * clock face.
118371cf2822SAndy Fiddaman 		 */
118471cf2822SAndy Fiddaman 		pgcnt_t span, offset;
118571cf2822SAndy Fiddaman 
118671cf2822SAndy Fiddaman 		span = looppages / n_page_scanners;
118771cf2822SAndy Fiddaman 		VERIFY3U(span, >, handspreadpages);
118871cf2822SAndy Fiddaman 
118971cf2822SAndy Fiddaman 		offset = inst * span;
119071cf2822SAndy Fiddaman 		regionstart = page_nextn(first, offset);
119171cf2822SAndy Fiddaman 		if (inst == n_page_scanners - 1) {
119271cf2822SAndy Fiddaman 			/* The last instance goes up to the last page */
119371cf2822SAndy Fiddaman 			regionend = page_nextn(first, looppages - 1);
119471cf2822SAndy Fiddaman 		} else {
119571cf2822SAndy Fiddaman 			regionend = page_nextn(regionstart, span - 1);
119671cf2822SAndy Fiddaman 		}
119771cf2822SAndy Fiddaman 
119871cf2822SAndy Fiddaman 		bhand = regionstart;
119971cf2822SAndy Fiddaman 		fhand = page_nextn(bhand, handspreadpages);
120071cf2822SAndy Fiddaman 
120171cf2822SAndy Fiddaman 		DTRACE_PROBE4(pageout__reset, uint_t, inst,
120271cf2822SAndy Fiddaman 		    pgcnt_t, regionstart, pgcnt_t, regionend,
120371cf2822SAndy Fiddaman 		    pgcnt_t, fhand);
12042d9166aeSJoshua M. Clulow 	}
12057c478bd9Sstevel@tonic-gate 
12062d9166aeSJoshua M. Clulow 	/*
120771cf2822SAndy Fiddaman 	 * This CPU kstat is only incremented here and we're on this CPU, so no
120871cf2822SAndy Fiddaman 	 * lock.
12092d9166aeSJoshua M. Clulow 	 */
12107c478bd9Sstevel@tonic-gate 	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
12117c478bd9Sstevel@tonic-gate 
12122d9166aeSJoshua M. Clulow 	/*
12132d9166aeSJoshua M. Clulow 	 * Keep track of the number of times we have scanned all the way around
121471cf2822SAndy Fiddaman 	 * the loop on this wakeup.
12152d9166aeSJoshua M. Clulow 	 */
12162d9166aeSJoshua M. Clulow 	laps = 0;
12177c478bd9Sstevel@tonic-gate 
12182d9166aeSJoshua M. Clulow 	/*
12192d9166aeSJoshua M. Clulow 	 * Track the number of pages visited during this scan so that we can
12202d9166aeSJoshua M. Clulow 	 * periodically measure our duty cycle.
12212d9166aeSJoshua M. Clulow 	 */
122271cf2822SAndy Fiddaman 	nscan_cnt = 0;
12237c478bd9Sstevel@tonic-gate 	pcount = 0;
12242d9166aeSJoshua M. Clulow 
122571cf2822SAndy Fiddaman 	DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan,
122671cf2822SAndy Fiddaman 	    hrtime_t, pageout_nsec, page_t *, bhand, page_t *, fhand);
122771cf2822SAndy Fiddaman 
12282d9166aeSJoshua M. Clulow 	/*
122971cf2822SAndy Fiddaman 	 * Record the initial position of the front hand for this cycle so
123071cf2822SAndy Fiddaman 	 * that we can detect when the hand wraps around.
12312d9166aeSJoshua M. Clulow 	 */
123271cf2822SAndy Fiddaman 	fhandstart = fhand;
12332d9166aeSJoshua M. Clulow 
12347c478bd9Sstevel@tonic-gate 	sample_start = gethrtime();
12357c478bd9Sstevel@tonic-gate 
12367c478bd9Sstevel@tonic-gate 	/*
12377c478bd9Sstevel@tonic-gate 	 * Scan the appropriate number of pages for a single duty cycle.
12387c478bd9Sstevel@tonic-gate 	 */
123971cf2822SAndy Fiddaman 	while (nscan_cnt < desscan) {
12402d9166aeSJoshua M. Clulow 		checkpage_result_t rvfront, rvback;
12417c478bd9Sstevel@tonic-gate 
124271cf2822SAndy Fiddaman 		if (!pageout_sampling && freemem >= lotsfree + needfree) {
12432d9166aeSJoshua M. Clulow 			/*
12442d9166aeSJoshua M. Clulow 			 * We are not sampling and enough memory has become
12452d9166aeSJoshua M. Clulow 			 * available that scanning is no longer required.
12462d9166aeSJoshua M. Clulow 			 */
124771cf2822SAndy Fiddaman 			DTRACE_PROBE1(pageout__memfree, uint_t, inst);
12482d9166aeSJoshua M. Clulow 			break;
12492d9166aeSJoshua M. Clulow 		}
12507c478bd9Sstevel@tonic-gate 
125171cf2822SAndy Fiddaman 		DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount);
125271cf2822SAndy Fiddaman 
12537c478bd9Sstevel@tonic-gate 		/*
12542d9166aeSJoshua M. Clulow 		 * Periodically check to see if we have exceeded the CPU duty
12552d9166aeSJoshua M. Clulow 		 * cycle for a single wakeup.
12567c478bd9Sstevel@tonic-gate 		 */
12577c478bd9Sstevel@tonic-gate 		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
125871cf2822SAndy Fiddaman 			hrtime_t pageout_cycle_nsec;
125971cf2822SAndy Fiddaman 
12602d9166aeSJoshua M. Clulow 			pageout_cycle_nsec = gethrtime() - sample_start;
12612d9166aeSJoshua M. Clulow 			if (pageout_cycle_nsec >= pageout_nsec) {
126271cf2822SAndy Fiddaman 				atomic_inc_64(&pageout_timeouts);
126371cf2822SAndy Fiddaman 				DTRACE_PROBE1(pageout__timeout, uint_t, inst);
12647c478bd9Sstevel@tonic-gate 				break;
12657c478bd9Sstevel@tonic-gate 			}
12667c478bd9Sstevel@tonic-gate 		}
12677c478bd9Sstevel@tonic-gate 
12687c478bd9Sstevel@tonic-gate 		/*
12697c478bd9Sstevel@tonic-gate 		 * If checkpage manages to add a page to the free list,
12707c478bd9Sstevel@tonic-gate 		 * we give ourselves another couple of trips around the loop.
12717c478bd9Sstevel@tonic-gate 		 */
127271cf2822SAndy Fiddaman 		if ((rvfront = checkpage(fhand, POH_FRONT)) == CKP_FREED) {
12732d9166aeSJoshua M. Clulow 			laps = 0;
12742d9166aeSJoshua M. Clulow 		}
127571cf2822SAndy Fiddaman 		if ((rvback = checkpage(bhand, POH_BACK)) == CKP_FREED) {
12762d9166aeSJoshua M. Clulow 			laps = 0;
12772d9166aeSJoshua M. Clulow 		}
12787c478bd9Sstevel@tonic-gate 
12797c478bd9Sstevel@tonic-gate 		++pcount;
12807c478bd9Sstevel@tonic-gate 
12817c478bd9Sstevel@tonic-gate 		/*
128271cf2822SAndy Fiddaman 		 * This CPU kstat is only incremented here and we're on this
128371cf2822SAndy Fiddaman 		 * CPU, so no lock.
12847c478bd9Sstevel@tonic-gate 		 */
12857c478bd9Sstevel@tonic-gate 		CPU_STATS_ADDQ(CPU, vm, scan, 1);
12867c478bd9Sstevel@tonic-gate 
12877c478bd9Sstevel@tonic-gate 		/*
12887c478bd9Sstevel@tonic-gate 		 * Don't include ineligible pages in the number scanned.
12897c478bd9Sstevel@tonic-gate 		 */
129071cf2822SAndy Fiddaman 		if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE)
129171cf2822SAndy Fiddaman 			nscan_cnt++;
12927c478bd9Sstevel@tonic-gate 
12937c478bd9Sstevel@tonic-gate 		/*
129471cf2822SAndy Fiddaman 		 * Tick
12957c478bd9Sstevel@tonic-gate 		 */
129671cf2822SAndy Fiddaman 		bhand = wrapping_page_next(bhand, regionstart, regionend);
129771cf2822SAndy Fiddaman 		fhand = wrapping_page_next(fhand, regionstart, regionend);
12987c478bd9Sstevel@tonic-gate 
12997c478bd9Sstevel@tonic-gate 		/*
130071cf2822SAndy Fiddaman 		 * The front hand has wrapped around during this wakeup.
130171cf2822SAndy Fiddaman 		 */
130271cf2822SAndy Fiddaman 		if (fhand == fhandstart) {
130371cf2822SAndy Fiddaman 			laps++;
130471cf2822SAndy Fiddaman 			DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst,
130571cf2822SAndy Fiddaman 			    uint_t, laps);
130671cf2822SAndy Fiddaman 
130771cf2822SAndy Fiddaman 			/*
130871cf2822SAndy Fiddaman 			 * This CPU kstat is only incremented here and we're
130971cf2822SAndy Fiddaman 			 * on this CPU, so no lock.
13107c478bd9Sstevel@tonic-gate 			 */
13117c478bd9Sstevel@tonic-gate 			CPU_STATS_ADDQ(CPU, vm, rev, 1);
13122d9166aeSJoshua M. Clulow 
13132d9166aeSJoshua M. Clulow 			if (laps > 1) {
13147c478bd9Sstevel@tonic-gate 				/*
13157c478bd9Sstevel@tonic-gate 				 * Extremely unlikely, but it happens.
13167c478bd9Sstevel@tonic-gate 				 * We went around the loop at least once
13177c478bd9Sstevel@tonic-gate 				 * and didn't get far enough.
13187c478bd9Sstevel@tonic-gate 				 * If we are still skipping `highly shared'
13197c478bd9Sstevel@tonic-gate 				 * pages, skip fewer of them.  Otherwise,
13207c478bd9Sstevel@tonic-gate 				 * give up till the next clock tick.
13217c478bd9Sstevel@tonic-gate 				 */
13227c478bd9Sstevel@tonic-gate 				if (po_share < MAX_PO_SHARE) {
13237c478bd9Sstevel@tonic-gate 					po_share <<= 1;
13247c478bd9Sstevel@tonic-gate 				} else {
13257c478bd9Sstevel@tonic-gate 					break;
13267c478bd9Sstevel@tonic-gate 				}
13277c478bd9Sstevel@tonic-gate 			}
13287c478bd9Sstevel@tonic-gate 		}
13297c478bd9Sstevel@tonic-gate 	}
13307c478bd9Sstevel@tonic-gate 
13317c478bd9Sstevel@tonic-gate 	sample_end = gethrtime();
133271cf2822SAndy Fiddaman 	atomic_add_long(&nscan, nscan_cnt);
13337c478bd9Sstevel@tonic-gate 
133471cf2822SAndy Fiddaman 	DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps,
133571cf2822SAndy Fiddaman 	    pgcnt_t, nscan_cnt, pgcnt_t, pcount)
13367c478bd9Sstevel@tonic-gate 
13372d9166aeSJoshua M. Clulow 	/*
133871cf2822SAndy Fiddaman 	 * Continue accumulating samples until we have enough to get a
133971cf2822SAndy Fiddaman 	 * reasonable value for average scan rate.
13402d9166aeSJoshua M. Clulow 	 */
134171cf2822SAndy Fiddaman 	if (pageout_sampling) {
134271cf2822SAndy Fiddaman 		VERIFY3U(inst, ==, 0);
134371cf2822SAndy Fiddaman 		pageout_sample_add(pcount, sample_end - sample_start);
1344d12ea28fSAndy Fiddaman 		/*
134571cf2822SAndy Fiddaman 		 * If, after the sample just added, we have finished sampling,
134671cf2822SAndy Fiddaman 		 * set up the paging constants.
1347d12ea28fSAndy Fiddaman 		 */
134871cf2822SAndy Fiddaman 		if (!pageout_sampling)
13492d9166aeSJoshua M. Clulow 			setupclock();
13502d9166aeSJoshua M. Clulow 	}
13517c478bd9Sstevel@tonic-gate 
13527c478bd9Sstevel@tonic-gate 	goto loop;
13537c478bd9Sstevel@tonic-gate }
13547c478bd9Sstevel@tonic-gate 
13557c478bd9Sstevel@tonic-gate /*
1356727737b4SJoshua M. Clulow  * The pageout deadman is run once per second by clock().
1357727737b4SJoshua M. Clulow  */
1358727737b4SJoshua M. Clulow void
pageout_deadman(void)1359727737b4SJoshua M. Clulow pageout_deadman(void)
1360727737b4SJoshua M. Clulow {
1361727737b4SJoshua M. Clulow 	if (panicstr != NULL) {
1362727737b4SJoshua M. Clulow 		/*
1363727737b4SJoshua M. Clulow 		 * There is no pageout after panic.
1364727737b4SJoshua M. Clulow 		 */
1365727737b4SJoshua M. Clulow 		return;
1366727737b4SJoshua M. Clulow 	}
1367727737b4SJoshua M. Clulow 
1368727737b4SJoshua M. Clulow 	if (pageout_deadman_seconds == 0) {
1369727737b4SJoshua M. Clulow 		/*
1370727737b4SJoshua M. Clulow 		 * The deadman is not enabled.
1371727737b4SJoshua M. Clulow 		 */
1372727737b4SJoshua M. Clulow 		return;
1373727737b4SJoshua M. Clulow 	}
1374727737b4SJoshua M. Clulow 
1375727737b4SJoshua M. Clulow 	if (!pageout_pushing) {
1376727737b4SJoshua M. Clulow 		goto reset;
1377727737b4SJoshua M. Clulow 	}
1378727737b4SJoshua M. Clulow 
1379727737b4SJoshua M. Clulow 	/*
1380727737b4SJoshua M. Clulow 	 * We are pushing a page.  Check to see if it is the same call we saw
1381727737b4SJoshua M. Clulow 	 * last time we looked:
1382727737b4SJoshua M. Clulow 	 */
1383727737b4SJoshua M. Clulow 	if (pageout_pushcount != pageout_pushcount_seen) {
1384727737b4SJoshua M. Clulow 		/*
1385727737b4SJoshua M. Clulow 		 * It is a different call from the last check, so we are not
1386727737b4SJoshua M. Clulow 		 * stuck.
1387727737b4SJoshua M. Clulow 		 */
1388727737b4SJoshua M. Clulow 		goto reset;
1389727737b4SJoshua M. Clulow 	}
1390727737b4SJoshua M. Clulow 
1391727737b4SJoshua M. Clulow 	if (++pageout_stucktime >= pageout_deadman_seconds) {
1392727737b4SJoshua M. Clulow 		panic("pageout_deadman: stuck pushing the same page for %d "
1393727737b4SJoshua M. Clulow 		    "seconds (freemem is %lu)", pageout_deadman_seconds,
1394727737b4SJoshua M. Clulow 		    freemem);
1395727737b4SJoshua M. Clulow 	}
1396727737b4SJoshua M. Clulow 
1397727737b4SJoshua M. Clulow 	return;
1398727737b4SJoshua M. Clulow 
1399727737b4SJoshua M. Clulow reset:
1400727737b4SJoshua M. Clulow 	/*
1401727737b4SJoshua M. Clulow 	 * Reset our tracking state to reflect that we are not stuck:
1402727737b4SJoshua M. Clulow 	 */
1403727737b4SJoshua M. Clulow 	pageout_stucktime = 0;
1404727737b4SJoshua M. Clulow 	pageout_pushcount_seen = pageout_pushcount;
1405727737b4SJoshua M. Clulow }
1406727737b4SJoshua M. Clulow 
1407727737b4SJoshua M. Clulow /*
14087c478bd9Sstevel@tonic-gate  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
14097c478bd9Sstevel@tonic-gate  * system (u., page table) or free, then leave it alone.  Otherwise,
14107c478bd9Sstevel@tonic-gate  * if we are running the front hand, turn off the page's reference bit.
14117c478bd9Sstevel@tonic-gate  * If the proc is over maxrss, we take it.  If running the back hand,
14127c478bd9Sstevel@tonic-gate  * check whether the page has been reclaimed.  If not, free the page,
14137c478bd9Sstevel@tonic-gate  * pushing it to disk first if necessary.
14147c478bd9Sstevel@tonic-gate  *
14157c478bd9Sstevel@tonic-gate  * Return values:
14162d9166aeSJoshua M. Clulow  *	CKP_INELIGIBLE if the page is not a candidate at all,
14172d9166aeSJoshua M. Clulow  *	CKP_NOT_FREED  if the page was not freed, or
14182d9166aeSJoshua M. Clulow  *	CKP_FREED      if we freed it.
14197c478bd9Sstevel@tonic-gate  */
14202d9166aeSJoshua M. Clulow static checkpage_result_t
checkpage(page_t * pp,pageout_hand_t whichhand)142171cf2822SAndy Fiddaman checkpage(page_t *pp, pageout_hand_t whichhand)
14227c478bd9Sstevel@tonic-gate {
14237c478bd9Sstevel@tonic-gate 	int ppattr;
14247c478bd9Sstevel@tonic-gate 	int isfs = 0;
14257c478bd9Sstevel@tonic-gate 	int isexec = 0;
14267c478bd9Sstevel@tonic-gate 	int pagesync_flag;
14277c478bd9Sstevel@tonic-gate 
14287c478bd9Sstevel@tonic-gate 	/*
14297c478bd9Sstevel@tonic-gate 	 * Skip pages:
14307c478bd9Sstevel@tonic-gate 	 *	- associated with the kernel vnode since
14317c478bd9Sstevel@tonic-gate 	 *	    they are always "exclusively" locked.
14327c478bd9Sstevel@tonic-gate 	 *	- that are free
14337c478bd9Sstevel@tonic-gate 	 *	- that are shared more than po_share'd times
14347c478bd9Sstevel@tonic-gate 	 *	- its already locked
14357c478bd9Sstevel@tonic-gate 	 *
14367c478bd9Sstevel@tonic-gate 	 * NOTE:  These optimizations assume that reads are atomic.
14377c478bd9Sstevel@tonic-gate 	 */
1438a98e9dbfSaguzovsk 
1439a98e9dbfSaguzovsk 	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
1440a98e9dbfSaguzovsk 	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
1441a98e9dbfSaguzovsk 	    hat_page_checkshare(pp, po_share)) {
14422d9166aeSJoshua M. Clulow 		return (CKP_INELIGIBLE);
14437c478bd9Sstevel@tonic-gate 	}
14447c478bd9Sstevel@tonic-gate 
14457c478bd9Sstevel@tonic-gate 	if (!page_trylock(pp, SE_EXCL)) {
14467c478bd9Sstevel@tonic-gate 		/*
14477c478bd9Sstevel@tonic-gate 		 * Skip the page if we can't acquire the "exclusive" lock.
14487c478bd9Sstevel@tonic-gate 		 */
14492d9166aeSJoshua M. Clulow 		return (CKP_INELIGIBLE);
14507c478bd9Sstevel@tonic-gate 	} else if (PP_ISFREE(pp)) {
14517c478bd9Sstevel@tonic-gate 		/*
14527c478bd9Sstevel@tonic-gate 		 * It became free between the above check and our actually
14532d9166aeSJoshua M. Clulow 		 * locking the page.  Oh well, there will be other pages.
14547c478bd9Sstevel@tonic-gate 		 */
14557c478bd9Sstevel@tonic-gate 		page_unlock(pp);
14562d9166aeSJoshua M. Clulow 		return (CKP_INELIGIBLE);
14577c478bd9Sstevel@tonic-gate 	}
14587c478bd9Sstevel@tonic-gate 
14597c478bd9Sstevel@tonic-gate 	/*
14607c478bd9Sstevel@tonic-gate 	 * Reject pages that cannot be freed. The page_struct_lock
14617c478bd9Sstevel@tonic-gate 	 * need not be acquired to examine these
14627c478bd9Sstevel@tonic-gate 	 * fields since the page has an "exclusive" lock.
14637c478bd9Sstevel@tonic-gate 	 */
14647c478bd9Sstevel@tonic-gate 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
14657c478bd9Sstevel@tonic-gate 		page_unlock(pp);
14662d9166aeSJoshua M. Clulow 		return (CKP_INELIGIBLE);
14677c478bd9Sstevel@tonic-gate 	}
14687c478bd9Sstevel@tonic-gate 
14697c478bd9Sstevel@tonic-gate 	/*
14707c478bd9Sstevel@tonic-gate 	 * Maintain statistics for what we are freeing
14717c478bd9Sstevel@tonic-gate 	 */
14727c478bd9Sstevel@tonic-gate 	if (pp->p_vnode != NULL) {
14737c478bd9Sstevel@tonic-gate 		if (pp->p_vnode->v_flag & VVMEXEC)
14747c478bd9Sstevel@tonic-gate 			isexec = 1;
14757c478bd9Sstevel@tonic-gate 
14767c478bd9Sstevel@tonic-gate 		if (!IS_SWAPFSVP(pp->p_vnode))
14777c478bd9Sstevel@tonic-gate 			isfs = 1;
14787c478bd9Sstevel@tonic-gate 	}
14797c478bd9Sstevel@tonic-gate 
14807c478bd9Sstevel@tonic-gate 	/*
14817c478bd9Sstevel@tonic-gate 	 * Turn off REF and MOD bits with the front hand.
14827c478bd9Sstevel@tonic-gate 	 * The back hand examines the REF bit and always considers
14837c478bd9Sstevel@tonic-gate 	 * SHARED pages as referenced.
14847c478bd9Sstevel@tonic-gate 	 */
14852d9166aeSJoshua M. Clulow 	if (whichhand == POH_FRONT) {
14867c478bd9Sstevel@tonic-gate 		pagesync_flag = HAT_SYNC_ZERORM;
14872d9166aeSJoshua M. Clulow 	} else {
14887c478bd9Sstevel@tonic-gate 		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
14897c478bd9Sstevel@tonic-gate 		    HAT_SYNC_STOPON_SHARED;
14902d9166aeSJoshua M. Clulow 	}
14917c478bd9Sstevel@tonic-gate 
14927c478bd9Sstevel@tonic-gate 	ppattr = hat_pagesync(pp, pagesync_flag);
14937c478bd9Sstevel@tonic-gate 
14947c478bd9Sstevel@tonic-gate recheck:
14957c478bd9Sstevel@tonic-gate 	/*
14967c478bd9Sstevel@tonic-gate 	 * If page is referenced; make unreferenced but reclaimable.
14977c478bd9Sstevel@tonic-gate 	 * If this page is not referenced, then it must be reclaimable
14987c478bd9Sstevel@tonic-gate 	 * and we can add it to the free list.
14997c478bd9Sstevel@tonic-gate 	 */
15007c478bd9Sstevel@tonic-gate 	if (ppattr & P_REF) {
15012d9166aeSJoshua M. Clulow 		DTRACE_PROBE2(pageout__isref, page_t *, pp,
15022d9166aeSJoshua M. Clulow 		    pageout_hand_t, whichhand);
15032d9166aeSJoshua M. Clulow 
15042d9166aeSJoshua M. Clulow 		if (whichhand == POH_FRONT) {
15057c478bd9Sstevel@tonic-gate 			/*
15067c478bd9Sstevel@tonic-gate 			 * Checking of rss or madvise flags needed here...
15077c478bd9Sstevel@tonic-gate 			 *
15087c478bd9Sstevel@tonic-gate 			 * If not "well-behaved", fall through into the code
15097c478bd9Sstevel@tonic-gate 			 * for not referenced.
15107c478bd9Sstevel@tonic-gate 			 */
15117c478bd9Sstevel@tonic-gate 			hat_clrref(pp);
15127c478bd9Sstevel@tonic-gate 		}
15132d9166aeSJoshua M. Clulow 
15147c478bd9Sstevel@tonic-gate 		/*
15157c478bd9Sstevel@tonic-gate 		 * Somebody referenced the page since the front
15167c478bd9Sstevel@tonic-gate 		 * hand went by, so it's not a candidate for
15177c478bd9Sstevel@tonic-gate 		 * freeing up.
15187c478bd9Sstevel@tonic-gate 		 */
15197c478bd9Sstevel@tonic-gate 		page_unlock(pp);
15202d9166aeSJoshua M. Clulow 		return (CKP_NOT_FREED);
15217c478bd9Sstevel@tonic-gate 	}
15227c478bd9Sstevel@tonic-gate 
15237c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(pageoutvmstats.checkpage[0]);
15247c478bd9Sstevel@tonic-gate 
15257c478bd9Sstevel@tonic-gate 	/*
15267c478bd9Sstevel@tonic-gate 	 * If large page, attempt to demote it. If successfully demoted,
15277c478bd9Sstevel@tonic-gate 	 * retry the checkpage.
15287c478bd9Sstevel@tonic-gate 	 */
15297c478bd9Sstevel@tonic-gate 	if (pp->p_szc != 0) {
15307c478bd9Sstevel@tonic-gate 		if (!page_try_demote_pages(pp)) {
15317c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
15327c478bd9Sstevel@tonic-gate 			page_unlock(pp);
15332d9166aeSJoshua M. Clulow 			return (CKP_INELIGIBLE);
15347c478bd9Sstevel@tonic-gate 		}
15352d9166aeSJoshua M. Clulow 
15367c478bd9Sstevel@tonic-gate 		ASSERT(pp->p_szc == 0);
15377c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(pageoutvmstats.checkpage[2]);
15382d9166aeSJoshua M. Clulow 
15397c478bd9Sstevel@tonic-gate 		/*
15402d9166aeSJoshua M. Clulow 		 * Since page_try_demote_pages() could have unloaded some
15417c478bd9Sstevel@tonic-gate 		 * mappings it makes sense to reload ppattr.
15427c478bd9Sstevel@tonic-gate 		 */
15437c478bd9Sstevel@tonic-gate 		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
15447c478bd9Sstevel@tonic-gate 	}
15457c478bd9Sstevel@tonic-gate 
15467c478bd9Sstevel@tonic-gate 	/*
15472d9166aeSJoshua M. Clulow 	 * If the page is currently dirty, we have to arrange to have it
15482d9166aeSJoshua M. Clulow 	 * cleaned before it can be freed.
15497c478bd9Sstevel@tonic-gate 	 *
15507c478bd9Sstevel@tonic-gate 	 * XXX - ASSERT(pp->p_vnode != NULL);
15517c478bd9Sstevel@tonic-gate 	 */
15522d9166aeSJoshua M. Clulow 	if ((ppattr & P_MOD) && pp->p_vnode != NULL) {
15537c478bd9Sstevel@tonic-gate 		struct vnode *vp = pp->p_vnode;
15547c478bd9Sstevel@tonic-gate 		u_offset_t offset = pp->p_offset;
15557c478bd9Sstevel@tonic-gate 
15567c478bd9Sstevel@tonic-gate 		/*
15577c478bd9Sstevel@tonic-gate 		 * XXX - Test for process being swapped out or about to exit?
15587c478bd9Sstevel@tonic-gate 		 * [Can't get back to process(es) using the page.]
15597c478bd9Sstevel@tonic-gate 		 */
15607c478bd9Sstevel@tonic-gate 
15617c478bd9Sstevel@tonic-gate 		/*
15627c478bd9Sstevel@tonic-gate 		 * Hold the vnode before releasing the page lock to
15637c478bd9Sstevel@tonic-gate 		 * prevent it from being freed and re-used by some
15647c478bd9Sstevel@tonic-gate 		 * other thread.
15657c478bd9Sstevel@tonic-gate 		 */
15667c478bd9Sstevel@tonic-gate 		VN_HOLD(vp);
15677c478bd9Sstevel@tonic-gate 		page_unlock(pp);
15687c478bd9Sstevel@tonic-gate 
15697c478bd9Sstevel@tonic-gate 		/*
15702d9166aeSJoshua M. Clulow 		 * Queue I/O request for the pageout thread.
15717c478bd9Sstevel@tonic-gate 		 */
15727c478bd9Sstevel@tonic-gate 		if (!queue_io_request(vp, offset)) {
15737c478bd9Sstevel@tonic-gate 			VN_RELE(vp);
15742d9166aeSJoshua M. Clulow 			return (CKP_NOT_FREED);
15757c478bd9Sstevel@tonic-gate 		}
15762d9166aeSJoshua M. Clulow 		return (CKP_FREED);
15777c478bd9Sstevel@tonic-gate 	}
15787c478bd9Sstevel@tonic-gate 
15797c478bd9Sstevel@tonic-gate 	/*
15802d9166aeSJoshua M. Clulow 	 * Now we unload all the translations and put the page back on to the
15812d9166aeSJoshua M. Clulow 	 * free list.  If the page was used (referenced or modified) after the
15822d9166aeSJoshua M. Clulow 	 * pagesync but before it was unloaded we catch it and handle the page
15832d9166aeSJoshua M. Clulow 	 * properly.
15847c478bd9Sstevel@tonic-gate 	 */
15852d9166aeSJoshua M. Clulow 	DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand);
15867c478bd9Sstevel@tonic-gate 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
15877c478bd9Sstevel@tonic-gate 	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
15882d9166aeSJoshua M. Clulow 	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) {
15897c478bd9Sstevel@tonic-gate 		goto recheck;
15902d9166aeSJoshua M. Clulow 	}
15917c478bd9Sstevel@tonic-gate 
15927c478bd9Sstevel@tonic-gate 	VN_DISPOSE(pp, B_FREE, 0, kcred);
15937c478bd9Sstevel@tonic-gate 
15947c478bd9Sstevel@tonic-gate 	CPU_STATS_ADD_K(vm, dfree, 1);
15957c478bd9Sstevel@tonic-gate 
15967c478bd9Sstevel@tonic-gate 	if (isfs) {
15977c478bd9Sstevel@tonic-gate 		if (isexec) {
15987c478bd9Sstevel@tonic-gate 			CPU_STATS_ADD_K(vm, execfree, 1);
15997c478bd9Sstevel@tonic-gate 		} else {
16007c478bd9Sstevel@tonic-gate 			CPU_STATS_ADD_K(vm, fsfree, 1);
16017c478bd9Sstevel@tonic-gate 		}
16027c478bd9Sstevel@tonic-gate 	} else {
16037c478bd9Sstevel@tonic-gate 		CPU_STATS_ADD_K(vm, anonfree, 1);
16047c478bd9Sstevel@tonic-gate 	}
16057c478bd9Sstevel@tonic-gate 
16062d9166aeSJoshua M. Clulow 	return (CKP_FREED);
16077c478bd9Sstevel@tonic-gate }
16087c478bd9Sstevel@tonic-gate 
16097c478bd9Sstevel@tonic-gate /*
16107c478bd9Sstevel@tonic-gate  * Queue async i/o request from pageout_scanner and segment swapout
16117c478bd9Sstevel@tonic-gate  * routines on one common list.  This ensures that pageout devices (swap)
16127c478bd9Sstevel@tonic-gate  * are not saturated by pageout_scanner or swapout requests.
16137c478bd9Sstevel@tonic-gate  * The pageout thread empties this list by initiating i/o operations.
16147c478bd9Sstevel@tonic-gate  */
16157c478bd9Sstevel@tonic-gate int
queue_io_request(vnode_t * vp,u_offset_t off)16167c478bd9Sstevel@tonic-gate queue_io_request(vnode_t *vp, u_offset_t off)
16177c478bd9Sstevel@tonic-gate {
16187c478bd9Sstevel@tonic-gate 	struct async_reqs *arg;
16197c478bd9Sstevel@tonic-gate 
16207c478bd9Sstevel@tonic-gate 	/*
16217c478bd9Sstevel@tonic-gate 	 * If we cannot allocate an async request struct,
16227c478bd9Sstevel@tonic-gate 	 * skip this page.
16237c478bd9Sstevel@tonic-gate 	 */
16247c478bd9Sstevel@tonic-gate 	mutex_enter(&push_lock);
16257c478bd9Sstevel@tonic-gate 	if ((arg = req_freelist) == NULL) {
16267c478bd9Sstevel@tonic-gate 		mutex_exit(&push_lock);
16277c478bd9Sstevel@tonic-gate 		return (0);
16287c478bd9Sstevel@tonic-gate 	}
16297c478bd9Sstevel@tonic-gate 	req_freelist = arg->a_next;		/* adjust freelist */
16307c478bd9Sstevel@tonic-gate 	push_list_size++;
16317c478bd9Sstevel@tonic-gate 
16327c478bd9Sstevel@tonic-gate 	arg->a_vp = vp;
16337c478bd9Sstevel@tonic-gate 	arg->a_off = off;
16347c478bd9Sstevel@tonic-gate 	arg->a_len = PAGESIZE;
16357c478bd9Sstevel@tonic-gate 	arg->a_flags = B_ASYNC | B_FREE;
16367c478bd9Sstevel@tonic-gate 	arg->a_cred = kcred;		/* always held */
16377c478bd9Sstevel@tonic-gate 
16387c478bd9Sstevel@tonic-gate 	/*
16397c478bd9Sstevel@tonic-gate 	 * Add to list of pending write requests.
16407c478bd9Sstevel@tonic-gate 	 */
16417c478bd9Sstevel@tonic-gate 	arg->a_next = push_list;
16427c478bd9Sstevel@tonic-gate 	push_list = arg;
16437c478bd9Sstevel@tonic-gate 
16447c478bd9Sstevel@tonic-gate 	if (req_freelist == NULL) {
16457c478bd9Sstevel@tonic-gate 		/*
16467c478bd9Sstevel@tonic-gate 		 * No free async requests left. The lock is held so we
16477c478bd9Sstevel@tonic-gate 		 * might as well signal the pusher thread now.
16487c478bd9Sstevel@tonic-gate 		 */
16497c478bd9Sstevel@tonic-gate 		cv_signal(&push_cv);
16507c478bd9Sstevel@tonic-gate 	}
16517c478bd9Sstevel@tonic-gate 	mutex_exit(&push_lock);
16527c478bd9Sstevel@tonic-gate 	return (1);
16537c478bd9Sstevel@tonic-gate }
16547c478bd9Sstevel@tonic-gate 
16557c478bd9Sstevel@tonic-gate /*
16567c478bd9Sstevel@tonic-gate  * Wake up pageout to initiate i/o if push_list is not empty.
16577c478bd9Sstevel@tonic-gate  */
16587c478bd9Sstevel@tonic-gate void
cv_signal_pageout()16597c478bd9Sstevel@tonic-gate cv_signal_pageout()
16607c478bd9Sstevel@tonic-gate {
16617c478bd9Sstevel@tonic-gate 	if (push_list != NULL) {
16627c478bd9Sstevel@tonic-gate 		mutex_enter(&push_lock);
16637c478bd9Sstevel@tonic-gate 		cv_signal(&push_cv);
16647c478bd9Sstevel@tonic-gate 		mutex_exit(&push_lock);
16657c478bd9Sstevel@tonic-gate 	}
16667c478bd9Sstevel@tonic-gate }
1667