1ac04195bSKonstantin Belousov /*- 2796df753SPedro F. Giffuni * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) 3796df753SPedro F. Giffuni * 4ac04195bSKonstantin Belousov * Copyright (c) 1991 Regents of the University of California. 5ac04195bSKonstantin Belousov * All rights reserved. 6ac04195bSKonstantin Belousov * Copyright (c) 1994 John S. Dyson 7ac04195bSKonstantin Belousov * All rights reserved. 8ac04195bSKonstantin Belousov * Copyright (c) 1994 David Greenman 9ac04195bSKonstantin Belousov * All rights reserved. 10ac04195bSKonstantin Belousov * Copyright (c) 2005 Yahoo! Technologies Norway AS 11ac04195bSKonstantin Belousov * All rights reserved. 12ac04195bSKonstantin Belousov * 13ac04195bSKonstantin Belousov * This code is derived from software contributed to Berkeley by 14ac04195bSKonstantin Belousov * The Mach Operating System project at Carnegie-Mellon University. 15ac04195bSKonstantin Belousov * 16ac04195bSKonstantin Belousov * Redistribution and use in source and binary forms, with or without 17ac04195bSKonstantin Belousov * modification, are permitted provided that the following conditions 18ac04195bSKonstantin Belousov * are met: 19ac04195bSKonstantin Belousov * 1. Redistributions of source code must retain the above copyright 20ac04195bSKonstantin Belousov * notice, this list of conditions and the following disclaimer. 21ac04195bSKonstantin Belousov * 2. Redistributions in binary form must reproduce the above copyright 22ac04195bSKonstantin Belousov * notice, this list of conditions and the following disclaimer in the 23ac04195bSKonstantin Belousov * documentation and/or other materials provided with the distribution. 24ac04195bSKonstantin Belousov * 3. 
All advertising materials mentioning features or use of this software 25ac04195bSKonstantin Belousov * must display the following acknowledgement: 26ac04195bSKonstantin Belousov * This product includes software developed by the University of 27ac04195bSKonstantin Belousov * California, Berkeley and its contributors. 28ac04195bSKonstantin Belousov * 4. Neither the name of the University nor the names of its contributors 29ac04195bSKonstantin Belousov * may be used to endorse or promote products derived from this software 30ac04195bSKonstantin Belousov * without specific prior written permission. 31ac04195bSKonstantin Belousov * 32ac04195bSKonstantin Belousov * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 33ac04195bSKonstantin Belousov * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 34ac04195bSKonstantin Belousov * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 35ac04195bSKonstantin Belousov * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 36ac04195bSKonstantin Belousov * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 37ac04195bSKonstantin Belousov * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 38ac04195bSKonstantin Belousov * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39ac04195bSKonstantin Belousov * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 40ac04195bSKonstantin Belousov * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 41ac04195bSKonstantin Belousov * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 42ac04195bSKonstantin Belousov * SUCH DAMAGE. 43ac04195bSKonstantin Belousov * 44ac04195bSKonstantin Belousov * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 45ac04195bSKonstantin Belousov * 46ac04195bSKonstantin Belousov * 47ac04195bSKonstantin Belousov * Copyright (c) 1987, 1990 Carnegie-Mellon University. 
48ac04195bSKonstantin Belousov * All rights reserved. 49ac04195bSKonstantin Belousov * 50ac04195bSKonstantin Belousov * Authors: Avadis Tevanian, Jr., Michael Wayne Young 51ac04195bSKonstantin Belousov * 52ac04195bSKonstantin Belousov * Permission to use, copy, modify and distribute this software and 53ac04195bSKonstantin Belousov * its documentation is hereby granted, provided that both the copyright 54ac04195bSKonstantin Belousov * notice and this permission notice appear in all copies of the 55ac04195bSKonstantin Belousov * software, derivative works or modified versions, and any portions 56ac04195bSKonstantin Belousov * thereof, and that both notices appear in supporting documentation. 57ac04195bSKonstantin Belousov * 58ac04195bSKonstantin Belousov * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 59ac04195bSKonstantin Belousov * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 60ac04195bSKonstantin Belousov * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 61ac04195bSKonstantin Belousov * 62ac04195bSKonstantin Belousov * Carnegie Mellon requests users of this software to return to 63ac04195bSKonstantin Belousov * 64ac04195bSKonstantin Belousov * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 65ac04195bSKonstantin Belousov * School of Computer Science 66ac04195bSKonstantin Belousov * Carnegie Mellon University 67ac04195bSKonstantin Belousov * Pittsburgh PA 15213-3890 68ac04195bSKonstantin Belousov * 69ac04195bSKonstantin Belousov * any improvements or extensions that they make and grant Carnegie the 70ac04195bSKonstantin Belousov * rights to redistribute these changes. 
71ac04195bSKonstantin Belousov */ 72ac04195bSKonstantin Belousov 73ac04195bSKonstantin Belousov #include <sys/cdefs.h> 74ac04195bSKonstantin Belousov __FBSDID("$FreeBSD$"); 75ac04195bSKonstantin Belousov 76ac04195bSKonstantin Belousov #include "opt_kstack_pages.h" 77ac04195bSKonstantin Belousov #include "opt_kstack_max_pages.h" 78ac04195bSKonstantin Belousov #include "opt_vm.h" 79ac04195bSKonstantin Belousov 80ac04195bSKonstantin Belousov #include <sys/param.h> 81ac04195bSKonstantin Belousov #include <sys/systm.h> 82ac04195bSKonstantin Belousov #include <sys/limits.h> 83ac04195bSKonstantin Belousov #include <sys/kernel.h> 84ac04195bSKonstantin Belousov #include <sys/eventhandler.h> 85ac04195bSKonstantin Belousov #include <sys/lock.h> 86ac04195bSKonstantin Belousov #include <sys/mutex.h> 87ac04195bSKonstantin Belousov #include <sys/proc.h> 88ac04195bSKonstantin Belousov #include <sys/_kstack_cache.h> 89ac04195bSKonstantin Belousov #include <sys/kthread.h> 90ac04195bSKonstantin Belousov #include <sys/ktr.h> 91ac04195bSKonstantin Belousov #include <sys/mount.h> 92ac04195bSKonstantin Belousov #include <sys/racct.h> 93ac04195bSKonstantin Belousov #include <sys/resourcevar.h> 94ac04195bSKonstantin Belousov #include <sys/sched.h> 95ac04195bSKonstantin Belousov #include <sys/sdt.h> 96ac04195bSKonstantin Belousov #include <sys/signalvar.h> 97ac04195bSKonstantin Belousov #include <sys/smp.h> 98ac04195bSKonstantin Belousov #include <sys/time.h> 99ac04195bSKonstantin Belousov #include <sys/vnode.h> 100ac04195bSKonstantin Belousov #include <sys/vmmeter.h> 101ac04195bSKonstantin Belousov #include <sys/rwlock.h> 102ac04195bSKonstantin Belousov #include <sys/sx.h> 103ac04195bSKonstantin Belousov #include <sys/sysctl.h> 104ac04195bSKonstantin Belousov 105ac04195bSKonstantin Belousov #include <vm/vm.h> 106ac04195bSKonstantin Belousov #include <vm/vm_param.h> 107ac04195bSKonstantin Belousov #include <vm/vm_object.h> 108ac04195bSKonstantin Belousov #include <vm/vm_page.h> 
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

/* the kernel process "vm_daemon" */
static void vm_daemon(void);
static struct proc *vmproc;

/* Descriptor used by SYSINIT below to spawn the vm_daemon kernel process. */
static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmproc
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);

/* Tunables controlling whether whole-process swapout is performed at all. */
static int vm_swap_enabled = 1;
static int vm_swap_idle_enabled = 0;

/*
 * NOTE(review): this OID uses the fixed constant VM_SWAPPING_ENABLED while
 * all sibling knobs use OID_AUTO -- presumably kept for ABI compatibility;
 * confirm before changing.
 */
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW,
    &vm_swap_enabled, 0,
    "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW,
    &vm_swap_idle_enabled, 0,
    "Allow swapout on idle criteria");

/*
 * Swap_idle_threshold1 is the guaranteed swapped in time for a process
 * (seconds).  Consumed by the swapout policy code (not visible in this
 * chunk; see swapout_procs()).
 */
static int swap_idle_threshold1 = 2;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
    &swap_idle_threshold1, 0,
    "Guaranteed swapped in time for a process");

/*
 * Swap_idle_threshold2 is the time that a process can be idle before
 * it will be swapped out, if idle swapping is enabled.
 */
static int swap_idle_threshold2 = 10;
SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
    &swap_idle_threshold2, 0,
    "Time before a process will be swapped out");

/* Pending request flags (VM_SWAP_NORMAL/VM_SWAP_IDLE), guarded by
 * vm_daemon_mtx; latched by vm_req_vmdaemon() and drained by vm_daemon(). */
static int vm_pageout_req_swapout;	/* XXX */
/* Sleep/wakeup channel for the vm_daemon loop. */
static int vm_daemon_needed;
static struct mtx vm_daemon_mtx;
/* Allow for use by vm_pageout before vm_daemon is initialized. */
MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);

/* Forward declarations for file-local helpers defined below (some of them
 * outside the visible portion of this file). */
static void swapclear(struct proc *);
static int swapout(struct proc *);
static void vm_swapout_map_deactivate_pages(vm_map_t, long);
static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long);
static void swapout_procs(int action);
static void vm_req_vmdaemon(int req);
static void vm_thread_swapin(struct thread *td);
static void vm_thread_swapout(struct thread *td);

/*
 * vm_swapout_object_deactivate_pages
 *
 *	Deactivate enough pages to satisfy the inactive target
 *	requirements.
 *
 *	The object and map must be locked.
 */
static void
vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
    long desired)
{
	vm_object_t backing_object, object;
	vm_page_t p;
	int act_delta, remove_mode;

	VM_OBJECT_ASSERT_LOCKED(first_object);
	/* Fictitious objects have no pageable pages; nothing to do. */
	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
		return;
	/*
	 * Walk the shadow chain starting at first_object, deactivating
	 * pages until the pmap's resident count drops to "desired".
	 * Locks are handed over hand-over-hand: the backing object is
	 * locked before the current one is released (see loop bottom).
	 */
	for (object = first_object;; object = backing_object) {
		if (pmap_resident_count(pmap) <= desired)
			goto unlock_return;
		VM_OBJECT_ASSERT_LOCKED(object);
		/* Skip unmanaged objects and ones with paging in flight. */
		if ((object->flags & OBJ_UNMANAGED) != 0 ||
		    object->paging_in_progress != 0)
			goto unlock_return;

		/*
		 * With multiple shadows the pages may be shared; avoid
		 * pmap_remove_all/deactivate in that case (remove_mode).
		 */
		remove_mode = 0;
		if (object->shadow_count > 1)
			remove_mode = 1;
		/*
		 * Scan the object's entire memory queue.
		 */
		TAILQ_FOREACH(p, &object->memq, listq) {
			if (pmap_resident_count(pmap) <= desired)
				goto unlock_return;
			if (vm_page_busied(p))
				continue;
			VM_CNT_INC(v_pdpages);
			vm_page_lock(p);
			/* Leave wired/held pages and pages not mapped by
			 * this pmap alone. */
			if (p->wire_count != 0 || p->hold_count != 0 ||
			    !pmap_page_exists_quick(pmap, p)) {
				vm_page_unlock(p);
				continue;
			}
			/* Gather hardware + software reference info. */
			act_delta = pmap_ts_referenced(p);
			if ((p->aflags & PGA_REFERENCED) != 0) {
				if (act_delta == 0)
					act_delta = 1;
				vm_page_aflag_clear(p, PGA_REFERENCED);
			}
			if (!vm_page_active(p) && act_delta != 0) {
				/* Referenced but not active: reactivate. */
				vm_page_activate(p);
				p->act_count += act_delta;
			} else if (vm_page_active(p)) {
				if (act_delta == 0) {
					/* Unreferenced: age it, and when it
					 * bottoms out, unmap and deactivate
					 * (unless shared -- remove_mode). */
					p->act_count -= min(p->act_count,
					    ACT_DECLINE);
					if (!remove_mode && p->act_count == 0) {
						pmap_remove_all(p);
						vm_page_deactivate(p);
					} else
						vm_page_requeue(p);
				} else {
					/* Referenced: boost activity. */
					vm_page_activate(p);
					if (p->act_count < ACT_MAX -
					    ACT_ADVANCE)
						p->act_count += ACT_ADVANCE;
					vm_page_requeue(p);
				}
			} else if (vm_page_inactive(p))
				pmap_remove_all(p);
			vm_page_unlock(p);
		}
		if ((backing_object = object->backing_object) == NULL)
			goto unlock_return;
		/* Lock the next object before dropping the current one. */
		VM_OBJECT_RLOCK(backing_object);
		if (object != first_object)
			VM_OBJECT_RUNLOCK(object);
	}
unlock_return:
	/* first_object's lock is owned by the caller; drop only ours. */
	if (object != first_object)
		VM_OBJECT_RUNLOCK(object);
}

/*
 * deactivate some number of pages in a map, try to do it fairly, but
 * that is really hard to do.
 */
static void
vm_swapout_map_deactivate_pages(vm_map_t map, long desired)
{
	vm_map_entry_t tmpe;
	vm_object_t obj, bigobj;
	int nothingwired;

	/* Best effort only: do not block on a contended map lock. */
	if (!vm_map_trylock_read(map))
		return;

	bigobj = NULL;
	nothingwired = TRUE;

	/*
	 * first, search out the biggest object, and try to free pages from
	 * that.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			/* Trylock: skip objects we cannot lock cheaply. */
			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
				if (obj->shadow_count <= 1 &&
				    (bigobj == NULL ||
				     bigobj->resident_page_count <
				     obj->resident_page_count)) {
					if (bigobj != NULL)
						VM_OBJECT_RUNLOCK(bigobj);
					/* Keep the new candidate locked. */
					bigobj = obj;
				} else
					VM_OBJECT_RUNLOCK(obj);
			}
		}
		if (tmpe->wired_count > 0)
			nothingwired = FALSE;
		tmpe = tmpe->next;
	}

	if (bigobj != NULL) {
		vm_swapout_object_deactivate_pages(map->pmap, bigobj, desired);
		VM_OBJECT_RUNLOCK(bigobj);
	}
	/*
	 * Next, hunt around for other pages to deactivate.  We actually
	 * do this search sort of wrong -- .text first is not the best idea.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
			break;
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL) {
				VM_OBJECT_RLOCK(obj);
				vm_swapout_object_deactivate_pages(map->pmap,
				    obj, desired);
				VM_OBJECT_RUNLOCK(obj);
			}
		}
		tmpe = tmpe->next;
	}

	/*
	 * Remove all mappings if a process is swapped out, this will free page
	 * table pages.
	 */
	if (desired == 0 && nothingwired) {
		pmap_remove(vm_map_pmap(map), vm_map_min(map),
		    vm_map_max(map));
	}

	vm_map_unlock_read(map);
}

/*
 * Swap out requests
 */
#define VM_SWAP_NORMAL 1
#define VM_SWAP_IDLE 2

/*
 * Request a normal (non-idle) swapout pass from the vm_daemon, if the
 * vm.swap_enabled knob allows it.  Called from the pageout code.
 */
void
vm_swapout_run(void)
{

	if (vm_swap_enabled)
		vm_req_vmdaemon(VM_SWAP_NORMAL);
}

/*
 * Idle process swapout -- run once per second when pagedaemons are
 * reclaiming pages.
350ac04195bSKonstantin Belousov */ 351ac04195bSKonstantin Belousov void 352ac04195bSKonstantin Belousov vm_swapout_run_idle(void) 353ac04195bSKonstantin Belousov { 354ac04195bSKonstantin Belousov static long lsec; 355ac04195bSKonstantin Belousov 356ac04195bSKonstantin Belousov if (!vm_swap_idle_enabled || time_second == lsec) 357ac04195bSKonstantin Belousov return; 358ac04195bSKonstantin Belousov vm_req_vmdaemon(VM_SWAP_IDLE); 359ac04195bSKonstantin Belousov lsec = time_second; 360ac04195bSKonstantin Belousov } 361ac04195bSKonstantin Belousov 362ac04195bSKonstantin Belousov static void 363ac04195bSKonstantin Belousov vm_req_vmdaemon(int req) 364ac04195bSKonstantin Belousov { 365ac04195bSKonstantin Belousov static int lastrun = 0; 366ac04195bSKonstantin Belousov 367ac04195bSKonstantin Belousov mtx_lock(&vm_daemon_mtx); 368ac04195bSKonstantin Belousov vm_pageout_req_swapout |= req; 369ac04195bSKonstantin Belousov if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 370ac04195bSKonstantin Belousov wakeup(&vm_daemon_needed); 371ac04195bSKonstantin Belousov lastrun = ticks; 372ac04195bSKonstantin Belousov } 373ac04195bSKonstantin Belousov mtx_unlock(&vm_daemon_mtx); 374ac04195bSKonstantin Belousov } 375ac04195bSKonstantin Belousov 376ac04195bSKonstantin Belousov static void 377ac04195bSKonstantin Belousov vm_daemon(void) 378ac04195bSKonstantin Belousov { 379ac04195bSKonstantin Belousov struct rlimit rsslim; 380ac04195bSKonstantin Belousov struct proc *p; 381ac04195bSKonstantin Belousov struct thread *td; 382ac04195bSKonstantin Belousov struct vmspace *vm; 383ac04195bSKonstantin Belousov int breakout, swapout_flags, tryagain, attempts; 384ac04195bSKonstantin Belousov #ifdef RACCT 385ac04195bSKonstantin Belousov uint64_t rsize, ravailable; 386ac04195bSKonstantin Belousov #endif 387ac04195bSKonstantin Belousov 388ac04195bSKonstantin Belousov while (TRUE) { 389ac04195bSKonstantin Belousov mtx_lock(&vm_daemon_mtx); 390ac04195bSKonstantin Belousov msleep(&vm_daemon_needed, 
&vm_daemon_mtx, PPAUSE, "psleep", 391ac04195bSKonstantin Belousov #ifdef RACCT 392ac04195bSKonstantin Belousov racct_enable ? hz : 0 393ac04195bSKonstantin Belousov #else 394ac04195bSKonstantin Belousov 0 395ac04195bSKonstantin Belousov #endif 396ac04195bSKonstantin Belousov ); 397ac04195bSKonstantin Belousov swapout_flags = vm_pageout_req_swapout; 398ac04195bSKonstantin Belousov vm_pageout_req_swapout = 0; 399ac04195bSKonstantin Belousov mtx_unlock(&vm_daemon_mtx); 400ac04195bSKonstantin Belousov if (swapout_flags) 401ac04195bSKonstantin Belousov swapout_procs(swapout_flags); 402ac04195bSKonstantin Belousov 403ac04195bSKonstantin Belousov /* 404ac04195bSKonstantin Belousov * scan the processes for exceeding their rlimits or if 405ac04195bSKonstantin Belousov * process is swapped out -- deactivate pages 406ac04195bSKonstantin Belousov */ 407ac04195bSKonstantin Belousov tryagain = 0; 408ac04195bSKonstantin Belousov attempts = 0; 409ac04195bSKonstantin Belousov again: 410ac04195bSKonstantin Belousov attempts++; 411ac04195bSKonstantin Belousov sx_slock(&allproc_lock); 412ac04195bSKonstantin Belousov FOREACH_PROC_IN_SYSTEM(p) { 413ac04195bSKonstantin Belousov vm_pindex_t limit, size; 414ac04195bSKonstantin Belousov 415ac04195bSKonstantin Belousov /* 416ac04195bSKonstantin Belousov * if this is a system process or if we have already 417ac04195bSKonstantin Belousov * looked at this process, skip it. 418ac04195bSKonstantin Belousov */ 419ac04195bSKonstantin Belousov PROC_LOCK(p); 420ac04195bSKonstantin Belousov if (p->p_state != PRS_NORMAL || 421ac04195bSKonstantin Belousov p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { 422ac04195bSKonstantin Belousov PROC_UNLOCK(p); 423ac04195bSKonstantin Belousov continue; 424ac04195bSKonstantin Belousov } 425ac04195bSKonstantin Belousov /* 426ac04195bSKonstantin Belousov * if the process is in a non-running type state, 427ac04195bSKonstantin Belousov * don't touch it. 
428ac04195bSKonstantin Belousov */ 429ac04195bSKonstantin Belousov breakout = 0; 430ac04195bSKonstantin Belousov FOREACH_THREAD_IN_PROC(p, td) { 431ac04195bSKonstantin Belousov thread_lock(td); 432ac04195bSKonstantin Belousov if (!TD_ON_RUNQ(td) && 433ac04195bSKonstantin Belousov !TD_IS_RUNNING(td) && 434ac04195bSKonstantin Belousov !TD_IS_SLEEPING(td) && 435ac04195bSKonstantin Belousov !TD_IS_SUSPENDED(td)) { 436ac04195bSKonstantin Belousov thread_unlock(td); 437ac04195bSKonstantin Belousov breakout = 1; 438ac04195bSKonstantin Belousov break; 439ac04195bSKonstantin Belousov } 440ac04195bSKonstantin Belousov thread_unlock(td); 441ac04195bSKonstantin Belousov } 442ac04195bSKonstantin Belousov if (breakout) { 443ac04195bSKonstantin Belousov PROC_UNLOCK(p); 444ac04195bSKonstantin Belousov continue; 445ac04195bSKonstantin Belousov } 446ac04195bSKonstantin Belousov /* 447ac04195bSKonstantin Belousov * get a limit 448ac04195bSKonstantin Belousov */ 449ac04195bSKonstantin Belousov lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); 450ac04195bSKonstantin Belousov limit = OFF_TO_IDX( 451ac04195bSKonstantin Belousov qmin(rsslim.rlim_cur, rsslim.rlim_max)); 452ac04195bSKonstantin Belousov 453ac04195bSKonstantin Belousov /* 454ac04195bSKonstantin Belousov * let processes that are swapped out really be 455ac04195bSKonstantin Belousov * swapped out set the limit to nothing (will force a 456ac04195bSKonstantin Belousov * swap-out.) 
457ac04195bSKonstantin Belousov */ 458ac04195bSKonstantin Belousov if ((p->p_flag & P_INMEM) == 0) 459ac04195bSKonstantin Belousov limit = 0; /* XXX */ 460ac04195bSKonstantin Belousov vm = vmspace_acquire_ref(p); 461ac04195bSKonstantin Belousov _PHOLD_LITE(p); 462ac04195bSKonstantin Belousov PROC_UNLOCK(p); 463ac04195bSKonstantin Belousov if (vm == NULL) { 464ac04195bSKonstantin Belousov PRELE(p); 465ac04195bSKonstantin Belousov continue; 466ac04195bSKonstantin Belousov } 467ac04195bSKonstantin Belousov sx_sunlock(&allproc_lock); 468ac04195bSKonstantin Belousov 469ac04195bSKonstantin Belousov size = vmspace_resident_count(vm); 470ac04195bSKonstantin Belousov if (size >= limit) { 471ac04195bSKonstantin Belousov vm_swapout_map_deactivate_pages( 472ac04195bSKonstantin Belousov &vm->vm_map, limit); 473ac04195bSKonstantin Belousov size = vmspace_resident_count(vm); 474ac04195bSKonstantin Belousov } 475ac04195bSKonstantin Belousov #ifdef RACCT 476ac04195bSKonstantin Belousov if (racct_enable) { 477ac04195bSKonstantin Belousov rsize = IDX_TO_OFF(size); 478ac04195bSKonstantin Belousov PROC_LOCK(p); 479ac04195bSKonstantin Belousov if (p->p_state == PRS_NORMAL) 480ac04195bSKonstantin Belousov racct_set(p, RACCT_RSS, rsize); 481ac04195bSKonstantin Belousov ravailable = racct_get_available(p, RACCT_RSS); 482ac04195bSKonstantin Belousov PROC_UNLOCK(p); 483ac04195bSKonstantin Belousov if (rsize > ravailable) { 484ac04195bSKonstantin Belousov /* 485ac04195bSKonstantin Belousov * Don't be overly aggressive; this 486ac04195bSKonstantin Belousov * might be an innocent process, 487ac04195bSKonstantin Belousov * and the limit could've been exceeded 488ac04195bSKonstantin Belousov * by some memory hog. Don't try 489ac04195bSKonstantin Belousov * to deactivate more than 1/4th 490ac04195bSKonstantin Belousov * of process' resident set size. 
491ac04195bSKonstantin Belousov */ 492ac04195bSKonstantin Belousov if (attempts <= 8) { 493ac04195bSKonstantin Belousov if (ravailable < rsize - 494ac04195bSKonstantin Belousov (rsize / 4)) { 495ac04195bSKonstantin Belousov ravailable = rsize - 496ac04195bSKonstantin Belousov (rsize / 4); 497ac04195bSKonstantin Belousov } 498ac04195bSKonstantin Belousov } 499ac04195bSKonstantin Belousov vm_swapout_map_deactivate_pages( 500ac04195bSKonstantin Belousov &vm->vm_map, 501ac04195bSKonstantin Belousov OFF_TO_IDX(ravailable)); 502ac04195bSKonstantin Belousov /* Update RSS usage after paging out. */ 503ac04195bSKonstantin Belousov size = vmspace_resident_count(vm); 504ac04195bSKonstantin Belousov rsize = IDX_TO_OFF(size); 505ac04195bSKonstantin Belousov PROC_LOCK(p); 506ac04195bSKonstantin Belousov if (p->p_state == PRS_NORMAL) 507ac04195bSKonstantin Belousov racct_set(p, RACCT_RSS, rsize); 508ac04195bSKonstantin Belousov PROC_UNLOCK(p); 509ac04195bSKonstantin Belousov if (rsize > ravailable) 510ac04195bSKonstantin Belousov tryagain = 1; 511ac04195bSKonstantin Belousov } 512ac04195bSKonstantin Belousov } 513ac04195bSKonstantin Belousov #endif 514ac04195bSKonstantin Belousov vmspace_free(vm); 515ac04195bSKonstantin Belousov sx_slock(&allproc_lock); 516ac04195bSKonstantin Belousov PRELE(p); 517ac04195bSKonstantin Belousov } 518ac04195bSKonstantin Belousov sx_sunlock(&allproc_lock); 519ac04195bSKonstantin Belousov if (tryagain != 0 && attempts <= 10) 520ac04195bSKonstantin Belousov goto again; 521ac04195bSKonstantin Belousov } 522ac04195bSKonstantin Belousov } 523ac04195bSKonstantin Belousov 524ac04195bSKonstantin Belousov /* 525ac04195bSKonstantin Belousov * Allow a thread's kernel stack to be paged out. 
526ac04195bSKonstantin Belousov */ 527ac04195bSKonstantin Belousov static void 528ac04195bSKonstantin Belousov vm_thread_swapout(struct thread *td) 529ac04195bSKonstantin Belousov { 530ac04195bSKonstantin Belousov vm_object_t ksobj; 531ac04195bSKonstantin Belousov vm_page_t m; 532ac04195bSKonstantin Belousov int i, pages; 533ac04195bSKonstantin Belousov 534ac04195bSKonstantin Belousov cpu_thread_swapout(td); 535ac04195bSKonstantin Belousov pages = td->td_kstack_pages; 536ac04195bSKonstantin Belousov ksobj = td->td_kstack_obj; 537ac04195bSKonstantin Belousov pmap_qremove(td->td_kstack, pages); 538ac04195bSKonstantin Belousov VM_OBJECT_WLOCK(ksobj); 539ac04195bSKonstantin Belousov for (i = 0; i < pages; i++) { 540ac04195bSKonstantin Belousov m = vm_page_lookup(ksobj, i); 541ac04195bSKonstantin Belousov if (m == NULL) 542ac04195bSKonstantin Belousov panic("vm_thread_swapout: kstack already missing?"); 543ac04195bSKonstantin Belousov vm_page_dirty(m); 544ac04195bSKonstantin Belousov vm_page_lock(m); 545ac04195bSKonstantin Belousov vm_page_unwire(m, PQ_INACTIVE); 546ac04195bSKonstantin Belousov vm_page_unlock(m); 547ac04195bSKonstantin Belousov } 548ac04195bSKonstantin Belousov VM_OBJECT_WUNLOCK(ksobj); 549ac04195bSKonstantin Belousov } 550ac04195bSKonstantin Belousov 551ac04195bSKonstantin Belousov /* 552ac04195bSKonstantin Belousov * Bring the kernel stack for a specified thread back in. 
 */
static void
vm_thread_swapin(struct thread *td)
{
	vm_object_t ksobj;
	vm_page_t ma[KSTACK_MAX_PAGES];
	int a, count, i, j, pages, rv;

	pages = td->td_kstack_pages;
	ksobj = td->td_kstack_obj;
	VM_OBJECT_WLOCK(ksobj);
	/*
	 * (Re)acquire and wire all stack pages, exclusive-busied.  Pages
	 * that survived in memory come back fully valid; pages that were
	 * paged out come back invalid and are read from the pager below.
	 */
	(void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED, ma,
	    pages);
	for (i = 0; i < pages;) {
		vm_page_assert_xbusied(ma[i]);
		if (ma[i]->valid == VM_PAGE_BITS_ALL) {
			/* Already resident and valid; nothing to page in. */
			vm_page_xunbusy(ma[i]);
			i++;
			continue;
		}
		vm_object_pip_add(ksobj, 1);
		/* Find the end of this run of invalid pages, to batch the read. */
		for (j = i + 1; j < pages; j++)
			if (ma[j]->valid == VM_PAGE_BITS_ALL)
				break;
		/*
		 * 'a' receives the pager's count of pages readable after
		 * ma[i] in one request; clamp the batch to min(a + 1, run
		 * length).  A kstack page must exist in swap here.
		 */
		rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a);
		KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i]));
		count = min(a + 1, j - i);
		rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL);
		KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d",
		    __func__, td->td_proc->p_pid));
		vm_object_pip_wakeup(ksobj);
		for (j = i; j < i + count; j++)
			vm_page_xunbusy(ma[j]);
		i += count;
	}
	VM_OBJECT_WUNLOCK(ksobj);
	/* Re-establish the KVA mappings and let the MD code restore state. */
	pmap_qenter(td->td_kstack, ma, pages);
	cpu_thread_swapin(td);
}

/*
 * Bring a swapped-out process back into memory: swap in the kernel
 * stack of every thread and mark the process resident again.
 * Called and returns with the process lock held; the lock is dropped
 * internally while the stacks are paged in.
 */
void
faultin(struct proc *p)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	/*
	 * If another process is swapping in this process,
	 * just wait until it finishes.
	 */
	if (p->p_flag & P_SWAPPINGIN) {
		while (p->p_flag & P_SWAPPINGIN)
			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
		return;
	}
	if ((p->p_flag & P_INMEM) == 0) {
		/*
		 * Don't let another thread swap process p out while we are
		 * busy swapping it in.
		 */
		++p->p_lock;
		p->p_flag |= P_SWAPPINGIN;
		PROC_UNLOCK(p);

		/*
		 * We hold no lock here because the list of threads
		 * can not change while all threads in the process are
		 * swapped out.
		 */
		FOREACH_THREAD_IN_PROC(p, td)
			vm_thread_swapin(td);
		PROC_LOCK(p);
		swapclear(p);
		p->p_swtick = ticks;

		/* Wake any thread sleeping in the P_SWAPPINGIN wait above. */
		wakeup(&p->p_flag);

		/* Allow other threads to swap p out now. */
		--p->p_lock;
	}
}

/*
 * This swapin algorithm attempts to swap-in processes only if there
 * is enough space for them.  Of course, if a process waits for a long
 * time, it will be swapped in anyway.
 */
void
swapper(void)
{
	struct proc *p, *pp;
	struct thread *td;
	int ppri, pri, slptime, swtime;

loop:
	if (vm_page_count_min()) {
		/* Too little free memory to bring anything in; wait for pages. */
		VM_WAIT;
		goto loop;
	}

	/*
	 * Scan all swapped-out processes and pick the one with the
	 * highest swap-in priority (pp, ppri).
	 */
	pp = NULL;
	ppri = INT_MIN;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		if (p->p_state == PRS_NEW ||
		    p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
			PROC_UNLOCK(p);
			continue;
		}
		swtime = (ticks - p->p_swtick) / hz;
		FOREACH_THREAD_IN_PROC(p, td) {
			/*
			 * An otherwise runnable thread of a process
			 * swapped out has only the TDI_SWAPPED bit set.
			 */
			thread_lock(td);
			if (td->td_inhibitors == TDI_SWAPPED) {
				/*
				 * Priority grows with time swapped out and
				 * slept; nice penalizes it unless an explicit
				 * swap-in was requested for the thread.
				 */
				slptime = (ticks - td->td_slptick) / hz;
				pri = swtime + slptime;
				if ((td->td_flags & TDF_SWAPINREQ) == 0)
					pri -= p->p_nice * 8;
				/*
				 * if this thread is higher priority
				 * and there is enough space, then select
				 * this process instead of the previous
				 * selection.
				 */
				if (pri > ppri) {
					pp = p;
					ppri = pri;
				}
			}
			thread_unlock(td);
		}
		PROC_UNLOCK(p);
	}
	sx_sunlock(&allproc_lock);

	/*
	 * Nothing to do, back to sleep.
	 */
	if ((p = pp) == NULL) {
		tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
		goto loop;
	}
	PROC_LOCK(p);

	/*
	 * Another process may be bringing or may have already
	 * brought this process in while we traverse all threads.
	 * Or, this process may even be being swapped out again.
	 */
	if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) {
		PROC_UNLOCK(p);
		goto loop;
	}

	/*
	 * We would like to bring someone in.
	 */
	faultin(p);
	PROC_UNLOCK(p);
	goto loop;
}

/*
 * First, if any processes have been sleeping or stopped for at least
 * "swap_idle_threshold1" seconds, they are swapped out.  If, however,
 * no such processes exist, then the longest-sleeping or stopped
 * process is swapped out.  Finally, and only as a last resort, if
 * there are no sleeping or stopped processes, the longest-resident
 * process is swapped out.
 */
static void
swapout_procs(int action)
{
	struct proc *p;
	struct thread *td;
	struct vmspace *vm;
	int minslptime, slptime;
	bool didswap;

	minslptime = 100000;
	didswap = false;
retry:
	/* Restarted from the top after every successful swapout (goto retry). */
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		/*
		 * Watch out for a process in
		 * creation.  It may have no
		 * address space or lock yet.
		 */
		if (p->p_state == PRS_NEW) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * An aio daemon switches its
		 * address space while running.
		 * Perform a quick check whether
		 * a process has P_SYSTEM.
		 * Filter out exiting processes.
		 */
		if ((p->p_flag & (P_SYSTEM | P_WEXIT)) != 0) {
			PROC_UNLOCK(p);
			continue;
		}
		/*
		 * Hold the process so it cannot exit while we drop both the
		 * proc lock and allproc_lock below; released at nextproc2.
		 */
		_PHOLD_LITE(p);
		PROC_UNLOCK(p);
		sx_sunlock(&allproc_lock);

		/*
		 * Do not swapout a process that
		 * is waiting for VM data
		 * structures as there is a possible
		 * deadlock.  Test this first as
		 * this may block.
		 *
		 * Lock the map until swapout
		 * finishes, or a thread of this
		 * process may attempt to alter
		 * the map.
		 */
		vm = vmspace_acquire_ref(p);
		if (vm == NULL)
			goto nextproc2;
		if (!vm_map_trylock(&vm->vm_map))
			goto nextproc1;

		PROC_LOCK(p);
		/*
		 * p_lock != 1 means someone besides us holds the process;
		 * swapping it out would not be safe.  Likewise skip traced,
		 * single-threading, or system processes.
		 */
		if (p->p_lock != 1 || (p->p_flag & (P_STOPPED_SINGLE |
		    P_TRACED | P_SYSTEM)) != 0)
			goto nextproc;

		/*
		 * Only aiod changes vmspace.  However, it will be
		 * skipped because of the if statement above checking
		 * for P_SYSTEM.
		 */
		if ((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) !=
		    P_INMEM)
			goto nextproc;

		switch (p->p_state) {
		default:
			/*
			 * Don't swap out processes in any sort
			 * of 'special' state.
			 */
			break;

		case PRS_NORMAL:
			/*
			 * do not swapout a realtime process
			 * Check all the thread groups..
			 */
			FOREACH_THREAD_IN_PROC(p, td) {
				thread_lock(td);
				if (PRI_IS_REALTIME(td->td_pri_class)) {
					thread_unlock(td);
					goto nextproc;
				}
				slptime = (ticks - td->td_slptick) / hz;
				/*
				 * Guarantee swap_idle_threshold1
				 * time in memory.
				 */
				if (slptime < swap_idle_threshold1) {
					thread_unlock(td);
					goto nextproc;
				}

				/*
				 * Do not swapout a process if it is
				 * waiting on a critical event of some
				 * kind or there is a thread whose
				 * pageable memory may be accessed.
				 *
				 * This could be refined to support
				 * swapping out a thread.
				 */
				if (!thread_safetoswapout(td)) {
					thread_unlock(td);
					goto nextproc;
				}
				/*
				 * If the system is under memory stress,
				 * or if we are swapping
				 * idle processes >= swap_idle_threshold2,
				 * then swap the process out.
				 */
				if ((action & VM_SWAP_NORMAL) == 0 &&
				    ((action & VM_SWAP_IDLE) == 0 ||
				    slptime < swap_idle_threshold2)) {
					thread_unlock(td);
					goto nextproc;
				}

				if (minslptime > slptime)
					minslptime = slptime;
				thread_unlock(td);
			}

			/*
			 * If the pageout daemon didn't free enough pages,
			 * or if this process is idle and the system is
			 * configured to swap proactively, swap it out.
			 */
			if ((action & VM_SWAP_NORMAL) != 0 ||
			    ((action & VM_SWAP_IDLE) != 0 &&
			    minslptime > swap_idle_threshold2)) {
				/*
				 * Drop our hold before swapout() so that
				 * no residual hold keeps the process from
				 * being swapped out; then rescan from the
				 * top, as the process list may have changed.
				 */
				_PRELE(p);
				if (swapout(p) == 0)
					didswap = true;
				PROC_UNLOCK(p);
				vm_map_unlock(&vm->vm_map);
				vmspace_free(vm);
				goto retry;
			}
		}
nextproc:
		PROC_UNLOCK(p);
		vm_map_unlock(&vm->vm_map);
nextproc1:
		vmspace_free(vm);
nextproc2:
		sx_slock(&allproc_lock);
		PRELE(p);
	}
	sx_sunlock(&allproc_lock);
	/*
	 * If we swapped something out, and another process needed memory,
	 * then wakeup the sched process.
	 */
	if (didswap)
		wakeup(&proc0);
}

/*
 * Mark process p resident again: set TDF_INMEM on every thread, clear
 * the swapped-out inhibitor, and make runnable any thread that is
 * otherwise able to run.  Called with the process lock held.
 */
static void
swapclear(struct proc *p)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		td->td_flags |= TDF_INMEM;
		td->td_flags &= ~TDF_SWAPINREQ;
		TD_CLR_SWAPPED(td);
		if (TD_CAN_RUN(td))
			if (setrunnable(td)) {
#ifdef INVARIANTS
				/*
				 * XXX: We just cleared TDI_SWAPPED
				 * above and set TDF_INMEM, so this
				 * should never happen.
				 */
				panic("not waking up swapper");
#endif
			}
		thread_unlock(td);
	}
	p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT);
	p->p_flag |= P_INMEM;
}

/*
 * Swap out process p: mark the process and every thread swapped out,
 * then release the kernel stack pages of each thread.  Returns 0 on
 * success, or EBUSY (after undoing the marks via swapclear()) if some
 * thread became unsafe to swap out.  Called and returns with the
 * process lock held; the lock is dropped while the stacks are released.
 */
static int
swapout(struct proc *p)
{
	struct thread *td;

	PROC_LOCK_ASSERT(p, MA_OWNED);

	/*
	 * The states of this process and its threads may have changed
	 * by now.  Assuming that there is only one pageout daemon thread,
	 * this process should still be in memory.
	 */
	KASSERT((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) ==
	    P_INMEM, ("swapout: lost a swapout race?"));

	/*
	 * Remember the resident count.
	 */
	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);

	/*
	 * Check and mark all threads before we proceed.
	 */
	p->p_flag &= ~P_INMEM;
	p->p_flag |= P_SWAPPINGOUT;
	FOREACH_THREAD_IN_PROC(p, td) {
		thread_lock(td);
		if (!thread_safetoswapout(td)) {
			thread_unlock(td);
			/* Undo the partial marking and report failure. */
			swapclear(p);
			return (EBUSY);
		}
		td->td_flags &= ~TDF_INMEM;
		TD_SET_SWAPPED(td);
		thread_unlock(td);
	}
	/* Account the swapout in the first thread's rusage. */
	td = FIRST_THREAD_IN_PROC(p);
	++td->td_ru.ru_nswap;
	PROC_UNLOCK(p);

	/*
	 * This list is stable because all threads are now prevented from
	 * running.  The list is only modified in the context of a running
	 * thread in this process.
	 */
	FOREACH_THREAD_IN_PROC(p, td)
		vm_thread_swapout(td);

	PROC_LOCK(p);
	p->p_flag &= ~P_SWAPPINGOUT;
	p->p_swtick = ticks;
	return (0);
}