/*
 * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * The proverbial page-out daemon, rewritten many times over the decades.
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/vmmeter.h>
#include <sys/conf.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/*
 * Persistent markers held by pageout daemon (array)
 */
struct markers {
	struct vm_page	hold;
	struct vm_page	stat;
	struct vm_page	pact;
};

/*
 * System initialization
 */

/* the kernel process "vm_pageout" */
static int vm_pageout_page(vm_page_t m, long *max_launderp,
			   long *vnodes_skippedp, struct vnode **vpfailedp,
			   int pass, int vmflush_flags, long *counts);
static int vm_pageout_clean_helper (vm_page_t, int);
static void vm_pageout_free_page_calc (vm_size_t count);
static void vm_pageout_page_free(vm_page_t m);
__read_frequently struct thread *emergpager;
__read_frequently struct thread *pagethread;
static int sequence_emerg_pager;

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon" */
static void vm_daemon (void);
static struct thread *vmthread;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmthread
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif

__read_mostly int vm_pages_needed = 0;	/* pageout daemon tsleep event */
__read_mostly int vm_pageout_deficit = 0; /* Estimated number of pages deficit */
__read_mostly int vm_pageout_pages_needed = 0; /* pageout daemon needs pages */
__read_mostly int vm_page_free_hysteresis = 16;
__read_mostly static time_t vm_pagedaemon_uptime;

#if !defined(NO_SWAPPING)
static int vm_daemon_needed;
#endif
__read_mostly static int vm_queue_idle_perc = 20;
__read_mostly static int vm_max_launder = 0;
__read_mostly static int vm_emerg_launder = 100;
__read_mostly static int vm_pageout_stats_actcmp = 0;
__read_mostly static int vm_pageout_stats_inamin = 16;
__read_mostly static int vm_pageout_stats_inalim = 4096;
__read_mostly static int vm_pageout_stats_scan = 0;
__read_mostly static int vm_pageout_stats_ticks = 0;
__read_mostly static int vm_pageout_algorithm = 0;
__read_mostly static int defer_swap_pageouts = 0;
__read_mostly static int disable_swap_pageouts = 0;
__read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
__read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
__read_mostly static int vm_pageout_debug;
__read_mostly static long vm_pageout_stats_rsecs = 300;

#if defined(NO_SWAPPING)
__read_mostly static int vm_swap_enabled = 0;
#else
__read_mostly static int vm_swap_enabled = 1;
#endif

/* 0-disable, 1-passive, 2-active swp, 3-active swp + single-queue dirty pages */
__read_mostly int vm_pageout_memuse_mode = 2;
__read_mostly int vm_pageout_allow_active = 1;

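/*
 * Illustration (not part of the original source): the tunables above are
 * exported as vm.* sysctls below, so they can be inspected or adjusted at
 * runtime, e.g.
 *
 *	sysctl vm.pageout_memuse_mode=2
 *	sysctl vm.max_launder
 */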
SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
	CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");

SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
	CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");

SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
	CTLFLAG_RW, &vm_page_free_hysteresis, 0,
	"Free more pages than the minimum required");

SYSCTL_INT(_vm, OID_AUTO, queue_idle_perc,
	CTLFLAG_RW, &vm_queue_idle_perc, 0, "page stats stop point, percent");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
	CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_actcmp,
	CTLFLAG_RW, &vm_pageout_stats_actcmp, 0,
	"Current dynamic act_count comparator");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_inamin,
	CTLFLAG_RW, &vm_pageout_stats_inamin, 0,
	"min out of lim tests must match");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_inalim,
	CTLFLAG_RW, &vm_pageout_stats_inalim, 0,
	"min out of lim tests must match");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_ticks,
	CTLFLAG_RW, &vm_pageout_stats_ticks, 0,
	"Interval for partial stats scan");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_scan,
	CTLFLAG_RW, &vm_pageout_stats_scan, 0,
	"hold/ACT scan count per interval");
SYSCTL_LONG(_vm, OID_AUTO, pageout_stats_rsecs,
	CTLFLAG_RW, &vm_pageout_stats_rsecs, 0,
	"min out of lim tests must match");

SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
	CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active,
	CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active");
SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
	CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");


#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */

static MALLOC_DEFINE(M_PAGEOUT, "pageout", "Pageout structures");

#if !defined(NO_SWAPPING)
static void vm_req_vmdaemon (void);
#endif

#define MAXSCAN_DIVIDER		10

#define VM_CACHE_SCAN_MIN	16
#define VM_CACHE_SCAN_NOM	(VM_CACHE_SCAN_MIN * 4)

/*
 * Calculate approximately how many pages on each queue to try to
 * clean.  An exact calculation creates an edge condition when the
 * queues are unbalanced so add significant slop.  The queue scans
 * will stop early when targets are reached and will start where they
 * left off on the next pass.
 *
 * We need to be generous here because there are all sorts of loading
 * conditions that can cause edge cases if we try to average over all
 * queues.  In particular, storage subsystems have become so fast that
 * paging activity can become quite frantic.
 * Eventually we will probably need two paging threads, one for dirty
 * pages and one for clean, to deal with the bandwidth requirements.
 *
 * So what we do is calculate a value that can be satisfied nominally by
 * only having to scan half the queues.
 */
static __inline long
PQAVERAGE(long n)
{
	long avg;

	if (n >= 0) {
		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
	} else {
		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
	}
	return avg;
}
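/*
 * Illustrative numbers (not from the original source): assuming, say,
 * PQ_L2_SIZE == 64, a shortage of n = 1000 pages gives
 *
 *	PQAVERAGE(1000) = (1000 + 63) / 32 + 1 = 34
 *
 * so each of the 64 sub-queues is asked for ~34 pages and the target is
 * nominally covered after scanning only about half of them (32 * 34 >= 1000).
 */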

/*
 * vm_pageout_clean_helper:
 *
 * Clean the page and remove it from the laundry.  The page must be busied
 * by the caller and will be disposed of (put away, flushed) by this routine.
 */
static int
vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
{
	vm_object_t object;
	vm_page_t mc[BLIST_MAX_ALLOC];
	int error;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	object = m->object;

	/*
	 * Don't mess with the page if it's held or special.  Theoretically
	 * we can pageout held pages but there is no real need to press our
	 * luck, so don't.
	 */
	if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * Place page in cluster.  Align cluster for optimal swap space
	 * allocation (whether it is swap or not).  This is typically ~16-32
	 * pages, which also tends to align the cluster to multiples of the
	 * filesystem block size if backed by a filesystem.
	 */
	page_base = pindex % BLIST_MAX_ALLOC;
	mc[page_base] = m;
	ib = page_base - 1;
	is = page_base + 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic
	 * out-of-order holes).  To solve this problem we do
	 * the reverse scan first and attempt to align our
	 * cluster, then do a forward scan if room remains.
	 */
	vm_object_hold(object);

	while (ib >= 0) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if ((p->queue - p->pc) == PQ_CACHE ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[ib] = p;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pindex - page_base + is < object->size) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[is] = p;
		++is;
	}

	vm_object_drop(object);

	/*
	 * We allow reads during pageouts...
	 */
	return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 * The given pages are laundered.  Note that we setup for the start of
 * I/O (i.e. busy the page), mark it read-only, and bump the object
 * reference count all in here rather than in the parent.  If we want
 * the parent to do more sophisticated things we may have to change
 * the ordering.
 *
 * The pages in the array must be busied by the caller and will be
 * unbusied by this function.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
{
	vm_object_t object;
	int pageout_status[count];
	int numpagedout = 0;
	int i;

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
			("vm_pageout_flush page %p index %d/%d: partially "
			 "invalid page", mc[i], i, count));
		vm_page_io_start(mc[i]);
	}

	/*
	 * We must make the pages read-only.  This will also force the
	 * modified bit in the related pmaps to be cleared.  The pager
	 * cannot clear the bit for us since the I/O completion code
	 * typically runs from an interrupt.  The act of making the page
	 * read-only handles the case for us.
	 *
	 * Then we can unbusy the pages, we still hold a reference by virtue
	 * of our soft-busy.
	 */
	for (i = 0; i < count; i++) {
		if (vmflush_flags & OBJPC_TRY_TO_CACHE)
			vm_page_protect(mc[i], VM_PROT_NONE);
		else
			vm_page_protect(mc[i], VM_PROT_READ);
		vm_page_wakeup(mc[i]);
	}

	object = mc[0]->object;
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count,
			   (vmflush_flags |
			    ((object == &kernel_object) ?
				OBJPC_SYNC : 0)),
			   pageout_status);

	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			numpagedout++;
			break;
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object.
			 * Right now we essentially lose the changes by
			 * pretending it worked.
			 */
			vm_page_busy_wait(mt, FALSE, "pgbad");
			pmap_clear_modify(mt);
			vm_page_undirty(mt);
			vm_page_wakeup(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * A page typically cannot be paged out when we
			 * have run out of swap.  We leave the page
			 * marked inactive and will try to page it out
			 * again later.
			 *
			 * Starvation of the active page list is used to
			 * determine when the system is massively memory
			 * starved.
			 */
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If not PENDing this was a synchronous operation and we
		 * clean up after the I/O.  If it is PENDing the mess is
		 * cleaned up asynchronously.
		 *
		 * Also nominally act on the caller's wishes if the caller
		 * wants to try to really clean (cache or free) the page.
		 *
		 * Also nominally deactivate the page if the system is
		 * memory-stressed.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_page_busy_wait(mt, FALSE, "pgouw");
			vm_page_io_finish(mt);
			if (vmflush_flags & OBJPC_TRY_TO_CACHE) {
				vm_page_try_to_cache(mt);
			} else if (vm_paging_severe()) {
				vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vm_page_wakeup(mt);
			}
			vm_object_pip_wakeup(object);
		}
	}
	return numpagedout;
}

#if !defined(NO_SWAPPING)

/*
 * Callback function, page busied for us.  We must dispose of the busy
 * condition.  Any related pmap pages may be held but will not be locked.
 */
static
int
vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
			vm_page_t p)
{
	int actcount;
	int cleanit = 0;

	/*
	 * Basic tests - There should never be a marker, and we can stop
	 *		 once the RSS is below the required level.
	 */
	KKASSERT((p->flags & PG_MARKER) == 0);
	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
		vm_page_wakeup(p);
		return(-1);
	}

	mycpu->gd_cnt.v_pdpages++;

	if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
		vm_page_wakeup(p);
		goto done;
	}

	++info->actioncount;

	/*
	 * Check if the page has been referenced recently.  If it has,
	 * activate it and skip.
	 */
	actcount = pmap_ts_referenced(p);
	if (actcount) {
		vm_page_flag_set(p, PG_REFERENCED);
	} else if (p->flags & PG_REFERENCED) {
		actcount = 1;
	}

	if (actcount) {
		if (p->queue - p->pc != PQ_ACTIVE) {
			vm_page_and_queue_spin_lock(p);
			if (p->queue - p->pc != PQ_ACTIVE) {
				vm_page_and_queue_spin_unlock(p);
				vm_page_activate(p);
			} else {
				vm_page_and_queue_spin_unlock(p);
			}
		} else {
			p->act_count += actcount;
			if (p->act_count > ACT_MAX)
				p->act_count = ACT_MAX;
		}
		vm_page_flag_clear(p, PG_REFERENCED);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Remove the page from this particular pmap.  Once we do this, our
	 * pmap scans will not see it again (unless it gets faulted in), so
	 * we must actively dispose of or deal with the page.
	 */
	pmap_remove_specific(info->pmap, p);

	/*
	 * If the page is not mapped to another process (i.e. as would be
	 * typical if this were a shared page from a library) then deactivate
	 * the page and clean it in two passes only.
	 *
	 * If the page hasn't been referenced since the last check, remove it
	 * from the pmap.
	 * If it is no longer mapped, deactivate it immediately,
	 * accelerating the normal decline.
	 *
	 * Once the page has been removed from the pmap the RSS code no
	 * longer tracks it so we have to make sure that it is staged for
	 * potential flush action.
	 *
	 * XXX
	 */
	if ((p->flags & PG_MAPPED) == 0 ||
	    (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
		if (p->queue - p->pc == PQ_ACTIVE) {
			vm_page_deactivate(p);
		}
		if (p->queue - p->pc == PQ_INACTIVE) {
			cleanit = 1;
		}
	}

	/*
	 * Ok, try to fully clean the page and any nearby pages such that at
	 * least the requested page is freed or moved to the cache queue.
	 *
	 * We usually do this synchronously to allow us to get the page into
	 * the CACHE queue quickly, which will prevent memory exhaustion if
	 * a process with a memoryuse limit is running away.  However, the
	 * sysadmin may desire to set vm.swap_user_async which relaxes this
	 * and improves write performance.
	 */
	if (cleanit) {
		long max_launder = 0x7FFF;
		long vnodes_skipped = 0;
		long counts[4] = { 0, 0, 0, 0 };
		int vmflush_flags;
		struct vnode *vpfailed = NULL;

		info->offset = va;

		if (vm_pageout_memuse_mode >= 2) {
			vmflush_flags = OBJPC_TRY_TO_CACHE |
					OBJPC_ALLOW_ACTIVE;
			if (swap_user_async == 0)
				vmflush_flags |= OBJPC_SYNC;
			vm_page_flag_set(p, PG_WINATCFLS);
			info->cleancount +=
				vm_pageout_page(p, &max_launder,
						&vnodes_skipped,
						&vpfailed, 1, vmflush_flags,
						counts);
		} else {
			vm_page_wakeup(p);
			++info->cleancount;
		}
	} else {
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races.
	 */
done:
	lwkt_user_yield();
	return 0;
}

/*
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits.
 * This is relatively difficult to do.  We try to keep track of where we
 * left off last time to reduce scan overhead.
 *
 * Called when vm_pageout_memuse_mode is >= 1.
 */
void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
{
	vm_offset_t pgout_offset;
	struct pmap_pgscan_info info;
	int retries = 3;

	pgout_offset = map->pgout_offset;
again:
#if 0
	kprintf("%016jx ", pgout_offset);
#endif
	if (pgout_offset < VM_MIN_USER_ADDRESS)
		pgout_offset = VM_MIN_USER_ADDRESS;
	if (pgout_offset >= VM_MAX_USER_ADDRESS)
		pgout_offset = 0;
	info.pmap = vm_map_pmap(map);
	info.limit = limit;
	info.beg_addr = pgout_offset;
	info.end_addr = VM_MAX_USER_ADDRESS;
	info.callback = vm_pageout_mdp_callback;
	info.cleancount = 0;
	info.actioncount = 0;
	info.busycount = 0;

	pmap_pgscan(&info);
	pgout_offset = info.offset;
#if 0
	kprintf("%016jx %08lx %08lx\n", pgout_offset,
		info.cleancount, info.actioncount);
#endif

	if (pgout_offset != VM_MAX_USER_ADDRESS &&
	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		goto again;
	} else if (retries &&
		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		--retries;
		goto again;
	}
	map->pgout_offset = pgout_offset;
}
#endif

/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses, but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
	struct proc *bigproc;
	vm_offset_t bigsize;
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

/*
 * Scan inactive queue for pages we can cache or free.
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
			 long *vnodes_skipped, long *counts)
{
	vm_page_t m;
	struct vm_page marker;
	struct vnode *vpfailed;		/* warning, allowed to be stale */
	long maxscan;
	long delta = 0;
	long max_launder;
	int isep;
	int vmflush_flags;

	isep = (curthread == emergpager);
	if ((unsigned)pass > 1000)
		pass = 1000;

	/*
	 * This routine is called for each of PQ_L2_SIZE inactive queues.
	 * We want the vm_max_launder parameter to apply to the whole
	 * queue (i.e. per-whole-queue pass, not per-sub-queue).
	 *
	 * In each successive full-pass when the page target is not met we
	 * allow the per-queue max_launder to increase up to a maximum of
	 * vm_max_launder / 16.
	 */
	if (pass)
		max_launder = (long)vm_max_launder * (pass + 1) / PQ_L2_SIZE;
	else
		max_launder = (long)vm_max_launder / PQ_L2_SIZE;
	max_launder /= MAXSCAN_DIVIDER;

	if (max_launder <= 1)
		max_launder = 1;
	if (max_launder >= vm_max_launder / 16)
		max_launder = vm_max_launder / 16 + 1;

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	  PAGES.
	 */

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_INACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	/*
	 * Inactive queue scan.
	 *
	 * We pick off approximately 1/10 of each queue.  Each queue is
	 * effectively organized LRU so scanning the entire queue would
	 * improperly pick up pages that might still be in regular use.
	 *
	 * NOTE: The vm_page must be spinlocked before the queue to avoid
	 *	 deadlocks, so it is easiest to simply iterate the loop
	 *	 with the queue unlocked at the top.
	 */
	vpfailed = NULL;

	vm_page_queues_spin_lock(PQ_INACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && avail_shortage - delta > 0)
	{
		int count;

		KKASSERT(m->queue == PQ_INACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
				   &marker, pageq);
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_INACTIVE + q);
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);

		/*
		 * The emergency pager runs when the primary pager gets
		 * stuck, which typically means the primary pager deadlocked
		 * on a vnode-backed page.  Therefore, the emergency pager
		 * must skip any complex objects.
		 *
		 * We disallow VNODEs unless they are VCHR whose device ops
		 * do not flag D_NOEMERGPGR.
		 */
		if (isep && m->object) {
			struct vnode *vp;

			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				/*
				 * Allow anonymous memory and assume that
				 * swap devices are not complex, since it's
				 * kind of worthless if we can't swap out
				 * dirty anonymous pages.
				 */
				break;
			case OBJT_VNODE:
				/*
				 * Allow VCHR device if the D_NOEMERGPGR
				 * flag is not set, deny other vnode types
				 * as being too complex.
				 */
				vp = m->object->handle;
				if (vp && vp->v_type == VCHR &&
				    vp->v_rdev && vp->v_rdev->si_ops &&
				    (vp->v_rdev->si_ops->head.flags &
				     D_NOEMERGPGR) == 0) {
					break;
				}
				/* Deny - fall through */
			default:
				/*
				 * Deny
				 */
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(PQ_INACTIVE + q);
				lwkt_yield();
				continue;
			}
		}

		/*
		 * Try to pageout the page and perhaps other nearby pages.
		 * We want to get the pages into the cache eventually (
		 * first or second pass).  Otherwise the pages can wind up
		 * just cycling in the inactive queue, getting flushed over
		 * and over again.
		 *
		 * Generally speaking we recycle dirty pages within PQ_INACTIVE
		 * twice (double LRU) before paging them out.  If the
		 * memuse_mode is >= 3 we run them single-LRU like we do clean
		 * pages.
		 */
		if (vm_pageout_memuse_mode >= 3)
			vm_page_flag_set(m, PG_WINATCFLS);

		vmflush_flags = 0;
		if (vm_pageout_allow_active)
			vmflush_flags |= OBJPC_ALLOW_ACTIVE;
		if (m->flags & PG_WINATCFLS)
			vmflush_flags |= OBJPC_TRY_TO_CACHE;
		count = vm_pageout_page(m, &max_launder, vnodes_skipped,
					&vpfailed, pass, vmflush_flags, counts);
		delta += count;

		/*
		 * Systems with a ton of memory can wind up with huge
		 * deactivation counts.  Because the inactive scan is
		 * doing a lot of flushing, the combination can result
		 * in excessive paging even in situations where other
		 * unrelated threads free up sufficient VM.
		 *
		 * To deal with this we abort the nominal active->inactive
		 * scan before we hit the inactive target when free+cache
		 * levels have reached a reasonable target.
		 *
		 * When deciding to stop early we need to add some slop to
		 * the test and we need to return full completion to the caller
		 * to prevent the caller from thinking there is something
		 * wrong and issuing a low-memory+swap warning or pkill.
		 *
		 * A deficit forces paging regardless of the state of the
		 * VM page queues (used for RSS enforcement).
		 */
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_INACTIVE + q);

		/* if (vm_paging_target() < -vm_max_launder) */
		if (vm_paging_target2()) {
			/*
			 * Stopping early, return full completion to caller.
			 */
			if (delta < avail_shortage)
				delta = avail_shortage;
			break;
		}
	}

	/* page queue still spin-locked */
	TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_INACTIVE + q);

	return (delta);
}

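/*
 * Illustrative numbers (not from the original source): with
 * MAXSCAN_DIVIDER == 10, a sub-queue holding 50000 inactive pages is
 * visited at most 50000 / 10 + 1 = 5001 pages per pass, so roughly ten
 * passes are needed to cover the whole sub-queue even if no target is
 * reached earlier.
 */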
/*
 * Pageout the specified page, return the total number of pages paged out
 * (this routine may cluster).
 *
 * The page must be busied and soft-busied by the caller and will be disposed
 * of by this function.
 */
static int
vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
		struct vnode **vpfailedp, int pass, int vmflush_flags,
		long *counts)
{
	vm_object_t object;
	int actcount;
	int count = 0;

	/*
	 * Wiring no longer removes a page from its queue.  The last unwiring
	 * will requeue the page.  Obviously wired pages cannot be paged out
	 * so unqueue it and return.
	 */
	if (m->wire_count) {
		vm_page_unqueue_nowakeup(m);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * A held page may be undergoing I/O, so skip it.
	 */
	if (m->hold_count) {
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
		return 0;
	}

	if (m->object == NULL || m->object->ref_count == 0) {
		/*
		 * If the object is not being used, we ignore previous
		 * references.
		 */
		vm_page_flag_clear(m, PG_REFERENCED);
		pmap_clear_reference(m);
		/* fall through to end */
	} else if (((m->flags & PG_REFERENCED) == 0) &&
		   (actcount = pmap_ts_referenced(m))) {
		/*
		 * Otherwise, if the page has been referenced while
		 * in the inactive queue, we bump the "activation
		 * count" upwards, making it less likely that the
		 * page will be added back to the inactive queue
		 * prematurely again.  Here we check the page tables
		 * (or emulated bits, if any), given the upper level
		 * VM system not knowing anything about existing
		 * references.
		 */
		++counts[3];
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * (m) is still busied.
	 *
	 * If the upper level VM system knows about any page
	 * references, we activate the page.  We also set the
	 * "activation count" higher than normal so that we will be
	 * less likely to place pages back onto the inactive queue again.
	 */
	if ((m->flags & PG_REFERENCED) != 0) {
		vm_page_flag_clear(m, PG_REFERENCED);
		actcount = pmap_ts_referenced(m);
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE + 1);
		vm_page_wakeup(m);
		++counts[3];
		return 0;
	}

	/*
	 * If the upper level VM system doesn't know anything about
	 * the page being dirty, we have to check for it again.  As
	 * far as the VM code knows, any partially dirty pages are
	 * fully dirty.
	 *
	 * Pages marked PG_WRITEABLE may be mapped into the user
	 * address space of a process running on another cpu.  A
	 * user process (without holding the MP lock) running on
	 * another cpu may be able to touch the page while we are
	 * trying to remove it.  vm_page_cache() will handle this
	 * case for us.
	 */
	if (m->dirty == 0) {
		vm_page_test_dirty(m);
	} else {
		vm_page_dirty(m);
	}

	if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Invalid pages can be easily freed
		 */
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
		++count;
		++counts[1];
	} else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Clean pages can be placed onto the cache queue.
		 * This effectively frees them.
		 */
		vm_page_cache(m);
		++count;
		++counts[1];
	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
		/*
		 * Dirty pages need to be paged out, but flushing
		 * a page is extremely expensive versus freeing
		 * a clean page.  Rather than artificially limiting
		 * the number of pages we can flush, we instead give
		 * dirty pages extra priority on the inactive queue
		 * by forcing them to be cycled through the queue
		 * twice before being flushed, after which the
		 * (now clean) page will cycle through once more
		 * before being freed.  This significantly extends
		 * the thrash point for a heavily loaded machine.
		 */
		++counts[2];
		vm_page_flag_set(m, PG_WINATCFLS);
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
	} else if (*max_launderp > 0) {
		/*
		 * We always want to try to flush some dirty pages if
		 * we encounter them, to keep the system stable.
		 * Normally this number is small, but under extreme
		 * pressure where there are insufficient clean pages
		 * on the inactive queue, we may have to go all out.
		 */
		int swap_pageouts_ok;
		struct vnode *vp = NULL;

		if ((m->flags & PG_WINATCFLS) == 0)
			vm_page_flag_set(m, PG_WINATCFLS);
		swap_pageouts_ok = 0;
		object = m->object;
		if (object &&
		    (object->type != OBJT_SWAP) &&
		    (object->type != OBJT_DEFAULT)) {
			swap_pageouts_ok = 1;
		} else {
			swap_pageouts_ok = !(defer_swap_pageouts ||
					     disable_swap_pageouts);
			swap_pageouts_ok |= (!disable_swap_pageouts &&
					     defer_swap_pageouts &&
					     vm_paging_min());
		}

		/*
		 * We don't bother paging objects that are "dead".
		 * Those objects are in a "rundown" state.
		 */
		if (!swap_pageouts_ok ||
		    (object == NULL) ||
		    (object->flags & OBJ_DEAD)) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_INACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[m->queue].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[m->queue].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			return 0;
		}

		/*
		 * (m) is still busied.
		 *
		 * The object is already known NOT to be dead.  It
		 * is possible for the vget() to block the whole
		 * pageout daemon, but the new low-memory handling
		 * code should prevent it.
		 *
		 * The previous code skipped locked vnodes and, worse,
		 * reordered pages in the queue.  This results in
		 * completely non-deterministic operation because,
		 * quite often, a vm_fault has initiated an I/O and
		 * is holding a locked vnode at just the point where
		 * the pageout daemon is woken up.
		 *
		 * We can't wait forever for the vnode lock, we might
		 * deadlock due to a vn_read() getting stuck in
		 * vm_wait while holding this vnode.  We skip the
		 * vnode if we can't get it in a reasonable amount
		 * of time.
		 *
		 * vpfailed is used to (try to) avoid the case where
		 * a large number of pages are associated with a
		 * locked vnode, which could cause the pageout daemon
		 * to stall for an excessive amount of time.
		 */
		if (object->type == OBJT_VNODE) {
			int flags;

			vp = object->handle;
			flags = LK_EXCLUSIVE;
			if (vp == *vpfailedp)
				flags |= LK_NOWAIT;
			else
				flags |= LK_TIMELOCK;
			vm_page_hold(m);
			vm_page_wakeup(m);

			/*
			 * We have unbusied (m) temporarily so we can
			 * acquire the vp lock without deadlocking.
			 * (m) is held to prevent destruction.
			 */
			if (vget(vp, flags) != 0) {
				*vpfailedp = vp;
				++pageout_lock_miss;
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page might have been moved to another
			 * queue during potential blocking in vget()
			 * above.  The page might have been freed and
			 * reused for another vnode.  The object might
			 * have been reused for another vnode.
			 */
			if (m->queue - m->pc != PQ_INACTIVE ||
			    m->object != object ||
			    object->handle != vp) {
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page may have been busied during the
			 * blocking in vput();  We don't move the
			 * page back onto the end of the queue so that
			 * statistics are more correct if we don't.
			 */
			if (vm_page_busy_try(m, TRUE)) {
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}
			vm_page_unhold(m);

			/*
			 * If it was wired while we didn't own it.
			 */
			if (m->wire_count) {
				vm_page_unqueue_nowakeup(m);
				vput(vp);
				vm_page_wakeup(m);
				return 0;
			}

			/*
			 * (m) is busied again
			 *
			 * We own the busy bit and remove our hold
			 * bit.  If the page is still held it
			 * might be undergoing I/O, so skip it.
			 */
			if (m->hold_count) {
rebusy_failed:
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_INACTIVE) {
					TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
					TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
				}
				vm_page_and_queue_spin_unlock(m);
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_wakeup(m);
				vput(vp);
				return 0;
			}

			/*
			 * Recheck queue, object, and vp now that we have
			 * rebusied the page.
			 */
			if (m->queue - m->pc != PQ_INACTIVE ||
			    m->object != object ||
			    object->handle != vp) {
				kprintf("vm_pageout_page: "
					"rebusy %p failed(A)\n",
					m);
				goto rebusy_failed;
			}

			/*
			 * Check page validity
			 */
			if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
				kprintf("vm_pageout_page: "
					"rebusy %p failed(B)\n",
					m);
				goto rebusy_failed;
			}
			if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
				kprintf("vm_pageout_page: "
					"rebusy %p failed(C)\n",
					m);
				goto rebusy_failed;
			}

			/* (m) is left busied as we fall through */
		}

		/*
		 * page is busy and not held here.
		 *
		 * If a page is dirty, then it is either being washed
		 * (but not yet cleaned) or it is still in the
		 * laundry.  If it is still in the laundry, then we
		 * start the cleaning operation.
		 *
		 * decrement inactive_shortage on success to account
		 * for the (future) cleaned page.  Otherwise we
		 * could wind up laundering or cleaning too many
		 * pages.
		 *
		 * NOTE: Cleaning the page here does not cause
		 *	 force_deficit to be adjusted, because the
		 *	 page is not being freed or moved to the
		 *	 cache.
		 */
		count = vm_pageout_clean_helper(m, vmflush_flags);
		counts[0] += count;
		*max_launderp -= count;

		/*
		 * Clean ate busy, page no longer accessible
		 */
		if (vp != NULL)
			vput(vp);
	} else {
		vm_page_wakeup(m);
	}
	return count;
}

/*
 * Scan active queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_active(int pass, int q,
		       long avail_shortage, long inactive_shortage,
		       struct vm_page *marker,
		       long *recycle_countp)
{
	vm_page_t m;
	int actcount;
	long delta = 0;
	long maxscan;
	int isep;

	isep = (curthread == emergpager);

	/*
	 * We want to move pages from the active queue to the inactive
	 * queue to get the inactive queue to the inactive target.  If
	 * we still have a page shortage from above we try to directly free
	 * clean pages instead of moving them.
	 *
	 * If we do still have a shortage we keep track of the number of
	 * pages we free or cache (recycle_count) as a measure of thrashing
	 * between the active and inactive queues.
	 *
	 * If we were able to completely satisfy the free+cache targets
	 * from the inactive pool we limit the number of pages we move
	 * from the active pool to the inactive pool to 2x the pages we
	 * had removed from the inactive pool (with a minimum of 1/5 the
	 * inactive target).  If we were not able to completely satisfy
	 * the free+cache targets we go for the whole target aggressively.
	 *
	 * NOTE: Both variables can end up negative.
	 * NOTE: We are still in a critical section.
	 *
	 * NOTE!
	 *	 THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	 PAGES.
	 */

	vm_page_queues_spin_lock(PQ_ACTIVE + q);
	maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
	       maxscan-- > 0 && (avail_shortage - delta > 0 ||
	       inactive_shortage > 0))
	{
		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
			     marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
				   marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_ACTIVE + q);
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);

#if 0
		/*
		 * Don't deactivate pages that are held, even if we can
		 * busy them.  (XXX why not?)
		 */
		if (m->hold_count) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}
#endif
		/*
		 * We can just remove wired pages from the queue
		 */
		if (m->wire_count) {
			vm_page_unqueue_nowakeup(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * The emergency pager ignores vnode-backed pages as these
		 * are the pages that probably bricked the main pager.
		 */
		if (isep && m->object && m->object->type == OBJT_VNODE) {
#if 0
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
#endif
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Check to see "how much" the page has been used and clear
		 * the tracking access bits.  If the object has no references
		 * don't bother paying the expense.
		 */
		actcount = 0;
		if (m->object && m->object->ref_count != 0) {
			if (m->flags & PG_REFERENCED)
				++actcount;
			actcount += pmap_ts_referenced(m);
			if (actcount) {
				m->act_count += ACT_ADVANCE + actcount;
				if (m->act_count > ACT_MAX)
					m->act_count = ACT_MAX;
			}
		}
		vm_page_flag_clear(m, PG_REFERENCED);

		/*
		 * actcount is only valid if the object ref_count is non-zero.
		 * If the page does not have an object, actcount will be zero.
		 */
		if (actcount && m->object->ref_count != 0) {
#if 0
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
#endif
			vm_page_wakeup(m);
		} else {
			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				m->act_count -= min(m->act_count,
						    vm_anonmem_decline);
				break;
			default:
				m->act_count -= min(m->act_count,
						    vm_filemem_decline);
				break;
			}
			if (vm_pageout_algorithm ||
			    (m->object == NULL) ||
			    (m->object && (m->object->ref_count == 0)) ||
			    m->act_count < pass + 1
			) {
				/*
				 * Deactivate the page.  If we had a
				 * shortage from our inactive scan try to
				 * free (cache) the page instead.
				 *
				 * Don't just blindly cache the page if
				 * we do not have a shortage from the
				 * inactive scan, that could lead to
				 * gigabytes being moved.
				 */
				--inactive_shortage;
				if (avail_shortage - delta > 0 ||
				    (m->object && (m->object->ref_count == 0)))
				{
					if (avail_shortage - delta > 0)
						++*recycle_countp;
					vm_page_protect(m, VM_PROT_NONE);
					if (m->dirty == 0 &&
					    (m->flags & PG_NEED_COMMIT) == 0 &&
					    avail_shortage - delta > 0) {
						vm_page_cache(m);
					} else {
						vm_page_deactivate(m);
						vm_page_wakeup(m);
					}
				} else {
					vm_page_deactivate(m);
					vm_page_wakeup(m);
				}
				++delta;
			} else {
				/*
				 * Do nothing
				 */
#if 0
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_ACTIVE) {
					TAILQ_REMOVE(
						&vm_page_queues[PQ_ACTIVE + q].pl,
						m, pageq);
					TAILQ_INSERT_TAIL(
						&vm_page_queues[PQ_ACTIVE + q].pl,
						m, pageq);
				}
				vm_page_and_queue_spin_unlock(m);
#endif
				vm_page_wakeup(m);
			}
		}
next:
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Clean out our local marker.
	 *
	 * Page queue still spin-locked.
	 */
	if (m == NULL) {
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
			     marker, pageq);
		TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
				  marker, pageq);
	}
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);

	return (delta);
}

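/*
 * Illustrative note (not from the original source): vm_filemem_decline
 * defaults to twice vm_anonmem_decline (ACT_DECLINE * 2 vs ACT_DECLINE),
 * so an unreferenced file-backed page loses act_count roughly twice as
 * fast as an unreferenced anonymous page and is deactivated in about half
 * as many active-queue passes.
 */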
/*
 * The number of actually free pages can drop down to v_free_reserved,
 * we try to build the free count back above v_free_min, to v_free_target.
 *
 * Cache pages are already counted as being free-ish.
 *
 * NOTE: we are still in a critical section.
 *
 * Pages moved from PQ_CACHE to totally free are not counted in the
 * pages_freed counter.
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static void
vm_pageout_scan_cache(long avail_shortage, int pass,
		      long vnodes_skipped, long recycle_count)
{
	static int lastkillticks;
	struct vm_pageout_scan_info info;
	vm_page_t m;
	int isep;

	isep = (curthread == emergpager);

	/*
	 * Test conditions also include a safety against v_free_min in
	 * case the sysop messes up the sysctls.
	 *
	 * Also include a test to avoid degenerate scans.
	 */
	while ((vmstats.v_free_count < vmstats.v_free_target ||
		vmstats.v_free_count < vmstats.v_free_min) &&
	       vmstats.v_cache_count > VM_CACHE_SCAN_MIN)
	{
		/*
		 * This steals some code from vm/vm_page.c
		 *
		 * Create two rovers and adjust the code to reduce
		 * chances of them winding up at the same index (which
		 * can cause a lot of contention).
		 */
		static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };

		if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
			goto next_rover;

		m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
		if (m == NULL)
			break;

		/*
		 * page is returned removed from its queue and spinlocked
		 *
		 * If the busy attempt fails we can still deactivate the page.
		 */
		if (vm_page_busy_try(m, TRUE)) {
			vm_page_deactivate_locked(m);
			vm_page_spin_unlock(m);
			continue;
		}
		vm_page_spin_unlock(m);
		pagedaemon_wakeup();
		lwkt_yield();

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) ||
		    m->hold_count ||
		    m->wire_count) {
			vm_page_deactivate(m);
			vm_page_wakeup(m);
			continue;
		}

		/*
		 * Because the page is in the cache, it shouldn't be mapped.
		 */
		pmap_mapped_sync(m);
		KKASSERT((m->flags & PG_MAPPED) == 0);
		KKASSERT(m->dirty == 0);
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
next_rover:
		if (isep)
			cache_rover[1] -= PQ_PRIME2;
		else
			cache_rover[0] += PQ_PRIME2;
	}

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target1()) {
		if (vnodes_skipped && vm_paging_min())
			speedup_syncer(NULL);
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_paging_target1())
			vm_req_vmdaemon();
#endif
	}

	/*
	 * Handle catastrophic conditions.  Under good conditions we should
	 * be at the target, well beyond our minimum.  If we could not even
	 * reach our minimum the system is under heavy stress.  But just being
	 * under heavy stress does not trigger process killing.
	 *
	 * We consider ourselves to have run out of memory if the swap pager
	 * is full and avail_shortage is still positive.  The secondary check
	 * ensures that we do not kill processes if the instantaneous
	 * availability is good, even if the pageout daemon pass says it
	 * couldn't get to the target.
	 *
	 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
	 *	 SITUATIONS.
	 */
	if (swap_pager_almost_full &&
	    pass > 0 &&
	    isep == 0 &&
	    (vm_paging_min_dnc(recycle_count) || avail_shortage > 0)) {
		kprintf("Warning: system low on memory+swap "
			"shortage %ld for %d ticks!\n",
			avail_shortage, ticks - swap_fail_ticks);
		if (bootverbose) {
			kprintf("Metrics: spaf=%d spf=%d pass=%d "
				"availshrt=%ld tgt=%d/%d inacshrt=%ld "
				"last=%u\n",
				swap_pager_almost_full,
				swap_pager_full,
				pass,
				avail_shortage,
				vm_paging_target1(),
				vm_paging_target2(),
				vm_paging_target2_count(),
				(unsigned int)(ticks - lastkillticks));
		}
	}
	if (swap_pager_full &&
	    pass > 1 &&
	    isep == 0 &&
	    avail_shortage > 0 &&
	    vm_paging_target1() &&
	    (unsigned int)(ticks - lastkillticks) >= hz) {
		/*
		 * Kill something, maximum rate once per second to give
		 * the process time to free up sufficient memory.
		 */
		lastkillticks = ticks;
		info.bigproc = NULL;
		info.bigsize = 0;
		allproc_scan(vm_pageout_scan_callback, &info, 0);
		if (info.bigproc != NULL) {
			kprintf("Try to kill process %d %s\n",
				info.bigproc->p_pid, info.bigproc->p_comm);
			info.bigproc->p_nice = PRIO_MIN;
			info.bigproc->p_usched->resetpriority(
				FIRST_LWP_IN_PROC(info.bigproc));
			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
			killproc(info.bigproc, "out of swap space");
			wakeup(&vmstats.v_free_count);
			PRELE(info.bigproc);
		}
	}
}

static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
	struct vm_pageout_scan_info *info = data;
	vm_offset_t size;

	/*
	 * Never kill system processes or init.  If we have configured swap
	 * then try to avoid killing low-numbered pids.
	 */
	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
		return (0);
	}

	lwkt_gettoken(&p->p_token);

	/*
	 * If the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get the approximate process size.  Note that anonymous pages
	 * with backing swap will be counted twice, but there should not
	 * be too many such pages due to the stress the VM system is
	 * under at this point.
	 */
	size = vmspace_anonymous_count(p->p_vmspace) +
	       vmspace_swap_count(p->p_vmspace);

	/*
	 * If this process is bigger than the biggest one,
	 * remember it.
	 */
	if (info->bigsize < size) {
		if (info->bigproc)
			PRELE(info->bigproc);
		PHOLD(p);
		info->bigproc = p;
		info->bigsize = size;
	}
	lwkt_reltoken(&p->p_token);
	lwkt_yield();

	return(0);
}

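/*
 * Illustrative example (not from the original source): the callback above
 * scores each eligible process by its anonymous + swapped-out footprint, so
 * a process with 2GB of anonymous memory and 1GB already swapped out scores
 * 3GB and is preferred as a kill victim over a 2.5GB candidate, even though
 * part of its score is not resident.
 */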
1891 */ 1892 static void 1893 vm_pageout_scan_hold(int q, struct vm_page *marker) 1894 { 1895 vm_page_t m; 1896 long pcount; 1897 1898 pcount = vm_page_queues[PQ_HOLD + q].lcnt; 1899 if (pcount > vm_pageout_stats_scan) 1900 pcount = vm_pageout_stats_scan; 1901 1902 vm_page_queues_spin_lock(PQ_HOLD + q); 1903 while ((m = TAILQ_NEXT(marker, pageq)) != NULL && 1904 pcount-- > 0) 1905 { 1906 KKASSERT(m->queue == PQ_HOLD + q); 1907 TAILQ_REMOVE(&vm_page_queues[PQ_HOLD + q].pl, marker, pageq); 1908 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_HOLD + q].pl, m, 1909 marker, pageq); 1910 1911 if (m->flags & PG_MARKER) 1912 continue; 1913 1914 /* 1915 * Process one page and return 1916 */ 1917 if (m->hold_count) 1918 break; 1919 kprintf("DEBUG: pageout HOLD->FREE %p\n", m); 1920 vm_page_hold(m); 1921 vm_page_queues_spin_unlock(PQ_HOLD + q); 1922 vm_page_unhold(m); /* reprocess */ 1923 vm_page_queues_spin_lock(PQ_HOLD + q); 1924 } 1925 1926 /* 1927 * If queue exhausted move the marker back to the head. 1928 */ 1929 if (m == NULL) { 1930 TAILQ_REMOVE(&vm_page_queues[PQ_HOLD + q].pl, 1931 marker, pageq); 1932 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_HOLD + q].pl, 1933 marker, pageq); 1934 } 1935 1936 vm_page_queues_spin_unlock(PQ_HOLD + q); 1937 } 1938 1939 /* 1940 * This code maintains the m->act for active pages. The scan occurs only 1941 * as long as the pageout daemon is not running or the inactive target has 1942 * not been reached. 1943 * 1944 * The restrictions prevent an idle machine from degrading all VM pages 1945 * m->act to 0 or nearly 0, which makes the field useless. For example, if 1946 * a workstation user goes to bed. 1947 */ 1948 static void 1949 vm_pageout_page_stats(int q, struct vm_page *marker, long *counterp) 1950 { 1951 struct vpgqueues *pq = &vm_page_queues[PQ_ACTIVE + q]; 1952 vm_page_t m; 1953 long pcount; /* Number of pages to check */ 1954 1955 /* 1956 * No point scanning the active queue if it is smaller than 1957 * 1/2 usable memory. This most typically occurs at system 1958 * startup or if a huge amount of memory has just been freed. 1959 */ 1960 if (vmstats.v_active_count < vmstats.v_free_count + 1961 vmstats.v_cache_count + 1962 vmstats.v_inactive_count) 1963 { 1964 return; 1965 } 1966 1967 /* 1968 * Generally do not scan if the pageout daemon is not running 1969 * or the inactive target has been reached. However, we override 1970 * this and scan anyway for N seconds after the pageout daemon last 1971 * ran. 1972 * 1973 * This last bit is designed to give the system a little time to 1974 * stage more pages for potential deactivation. In this situation, 1975 * if the inactive target has been met, we just update m->act_count 1976 * and do not otherwise mess with the page. But we don't want it 1977 * to run forever because that would cause m->act to become unusable 1978 * if the machine were to become idle. 1979 */ 1980 if (vm_pages_needed == 0 && !vm_paging_inactive()) { 1981 if (time_uptime - vm_pagedaemon_uptime > vm_pageout_stats_rsecs) 1982 return; 1983 } 1984 1985 if (vm_pageout_debug) { 1986 static time_t save_time; 1987 if (save_time != time_uptime) { 1988 save_time = time_uptime; 1989 kprintf("DEACTIVATE Q=%4d N=%ld\n", 1990 q, vm_paging_inactive_count()); 1991 } 1992 } 1993 1994 /* 1995 * Limited scan to reduce cpu glitches, just in case the 1996 * pmap_ts_referenced() burns a lot of CPU. 
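 *
 * Added note (illustrative): vm_pageout_stats_scan is the per-call
 * page budget.  With the defaults computed in vm_pageout_thread()
 * it works out to roughly v_free_count / PQ_L2_SIZE / 16, never
 * less than 16, so a single call touches at most that many pages
 * of this one queue before returning.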
1997 */ 1998 pcount = pq->lcnt; 1999 if (pcount > vm_pageout_stats_scan) 2000 pcount = vm_pageout_stats_scan; 2001 2002 vm_page_queues_spin_lock(PQ_ACTIVE + q); 2003 2004 /* 2005 * Queue locked at top of loop to avoid stack marker issues. 2006 */ 2007 while ((m = TAILQ_NEXT(marker, pageq)) != NULL && 2008 pcount-- > 0) 2009 { 2010 int actcount; 2011 2012 KKASSERT(m->queue == PQ_ACTIVE + q); 2013 TAILQ_REMOVE(&pq->pl, marker, pageq); 2014 TAILQ_INSERT_AFTER(&pq->pl, m, marker, pageq); 2015 2016 /* 2017 * Skip marker pages (atomic against other markers to avoid 2018 * infinite hop-over scans). 2019 */ 2020 if (m->flags & PG_MARKER) 2021 continue; 2022 2023 ++counterp[0]; 2024 2025 /* 2026 * Ignore pages we can't busy 2027 */ 2028 if (vm_page_busy_try(m, TRUE)) { 2029 continue; 2030 } 2031 2032 /* 2033 * Remaining operations run with the page busy and neither 2034 * the page nor the queue will be spin-locked. 2035 */ 2036 KKASSERT(m->queue == PQ_ACTIVE + q); 2037 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 2038 2039 /* 2040 * We can just remove wired pages from the queue 2041 */ 2042 if (m->wire_count) { 2043 vm_page_unqueue_nowakeup(m); 2044 vm_page_wakeup(m); 2045 goto next; 2046 } 2047 2048 2049 /* 2050 * We now have a safely busied page, the page and queue 2051 * spinlocks have been released. 2052 * 2053 * Ignore held and wired pages 2054 */ 2055 if (m->hold_count || m->wire_count) { 2056 vm_page_wakeup(m); 2057 goto next; 2058 } 2059 2060 /* 2061 * Calculate activity 2062 */ 2063 actcount = 0; 2064 if (m->flags & PG_REFERENCED) { 2065 vm_page_flag_clear(m, PG_REFERENCED); 2066 actcount += 1; 2067 } 2068 actcount += pmap_ts_referenced(m); 2069 2070 /* 2071 * Update act_count and move page to end of queue. 2072 */ 2073 if (actcount) { 2074 m->act_count += ACT_ADVANCE + actcount; 2075 if (m->act_count > ACT_MAX) 2076 m->act_count = ACT_MAX; 2077 #if 0 2078 vm_page_and_queue_spin_lock(m); 2079 if (m->queue - m->pc == PQ_ACTIVE) { 2080 TAILQ_REMOVE(&pq->pl, m, pageq); 2081 TAILQ_INSERT_TAIL(&pq->pl, m, pageq); 2082 } 2083 vm_page_and_queue_spin_unlock(m); 2084 #endif 2085 vm_page_wakeup(m); 2086 goto next; 2087 } 2088 2089 if (m->act_count == 0) { 2090 /* 2091 * If the deactivation target has not been reached 2092 * we try to deactivate the page. 2093 * 2094 * If the deactivation target has been reached it 2095 * is a complete waste of time (both now and later) 2096 * to try to deactivate more pages. 2097 */ 2098 if (vm_paging_inactive()) { 2099 vm_page_protect(m, VM_PROT_NONE); 2100 vm_page_deactivate(m); 2101 } 2102 ++counterp[1]; 2103 } else { 2104 m->act_count -= min(m->act_count, ACT_DECLINE); 2105 #if 0 2106 vm_page_and_queue_spin_lock(m); 2107 if (m->queue - m->pc == PQ_ACTIVE) { 2108 TAILQ_REMOVE(&pq->pl, m, pageq); 2109 TAILQ_INSERT_TAIL(&pq->pl, m, pageq); 2110 } 2111 vm_page_and_queue_spin_unlock(m); 2112 #endif 2113 2114 if (m->act_count < vm_pageout_stats_actcmp) { 2115 if (vm_paging_inactive()) { 2116 vm_page_protect(m, VM_PROT_NONE); 2117 vm_page_deactivate(m); 2118 } 2119 ++counterp[1]; 2120 } 2121 } 2122 vm_page_wakeup(m); 2123 next: 2124 vm_page_queues_spin_lock(PQ_ACTIVE + q); 2125 } 2126 2127 /* 2128 * If the queue has been exhausted move the marker back to the head. 2129 */ 2130 if (m == NULL) { 2131 TAILQ_REMOVE(&pq->pl, marker, pageq); 2132 TAILQ_INSERT_HEAD(&pq->pl, marker, pageq); 2133 } 2134 2135 /* 2136 * Remove our local marker 2137 * 2138 * Page queue still spin-locked.
2139 */ 2140 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 2141 2142 /* 2143 * After roughly every (inalim) pages, determine whether we are 2144 * making appropriate progress. If we are, reduce the comparison 2145 * point for act_count; if we are not, increase it. 2146 * 2147 * This allows us to handle heavier loads and also balances the 2148 * code, particularly at startup. 2149 */ 2150 if (counterp[0] > vm_pageout_stats_inalim) { 2151 if (counterp[1] < vm_pageout_stats_inamin) { 2152 if (vm_pageout_stats_actcmp < ACT_MAX * 3 / 4) 2153 ++vm_pageout_stats_actcmp; 2154 } else { 2155 if (vm_pageout_stats_actcmp > 0) 2156 --vm_pageout_stats_actcmp; 2157 } 2158 counterp[0] = 0; 2159 counterp[1] = 0; 2160 } 2161 } 2162 2163 static void 2164 vm_pageout_free_page_calc(vm_size_t count) 2165 { 2166 /* 2167 * v_free_min normal allocations 2168 * v_free_reserved system allocations 2169 * v_pageout_free_min allocations by pageout daemon 2170 * v_interrupt_free_min low level allocations (e.g. swap structures) 2171 * 2172 * v_free_min is used to generate several other baselines, and they 2173 * can get pretty silly on systems with a lot of memory. 2174 */ 2175 vmstats.v_free_min = 64 + vmstats.v_page_count / 200; 2176 vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7; 2177 vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0; 2178 vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7; 2179 vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7; 2180 } 2181 2182 2183 /* 2184 * vm_pageout is the high level pageout daemon. TWO kernel threads run 2185 * this daemon, the primary pageout daemon and the emergency pageout daemon. 2186 * 2187 * The emergency pageout daemon takes over when the primary pageout daemon 2188 * deadlocks. The emergency pageout daemon ONLY pages out to swap, thus 2189 * avoiding the many low-memory deadlocks which can occur when paging out 2190 * to VFS's. 2191 */ 2192 static void 2193 vm_pageout_thread(void) 2194 { 2195 int pass; 2196 int q; 2197 int q1iterator = 0; 2198 int q2iterator = 0; 2199 int q3iterator = 0; 2200 int isep; 2201 enum { PAGING_IDLE, PAGING_TARGET1, PAGING_TARGET2 } state; 2202 struct markers *markers; 2203 long scounter[2] = { 0, 0 }; 2204 2205 curthread->td_flags |= TDF_SYSTHREAD; 2206 state = PAGING_IDLE; 2207 2208 /* 2209 * Allocate contiguous markers for hold, stats (active), and 2210 * paging active queue scan. These scans occur incrementally.
2211 */ 2212 markers = kmalloc(sizeof(*markers) * PQ_L2_SIZE, 2213 M_PAGEOUT, M_WAITOK | M_ZERO); 2214 2215 for (q = 0; q < PQ_L2_SIZE; ++q) { 2216 struct markers *mark = &markers[q]; 2217 2218 mark->hold.flags = PG_FICTITIOUS | PG_MARKER; 2219 mark->hold.busy_count = PBUSY_LOCKED; 2220 mark->hold.queue = PQ_HOLD + q; 2221 mark->hold.pc = PQ_HOLD + q; 2222 mark->hold.wire_count = 1; 2223 vm_page_queues_spin_lock(PQ_HOLD + q); 2224 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_HOLD + q].pl, 2225 &mark->hold, pageq); 2226 vm_page_queues_spin_unlock(PQ_HOLD + q); 2227 2228 mark->stat.flags = PG_FICTITIOUS | PG_MARKER; 2229 mark->stat.busy_count = PBUSY_LOCKED; 2230 mark->stat.queue = PQ_ACTIVE + q; 2231 mark->stat.pc = PQ_ACTIVE + q; 2232 mark->stat.wire_count = 1; 2233 vm_page_queues_spin_lock(PQ_ACTIVE + q); 2234 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, 2235 &mark->stat, pageq); 2236 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 2237 2238 mark->pact.flags = PG_FICTITIOUS | PG_MARKER; 2239 mark->pact.busy_count = PBUSY_LOCKED; 2240 mark->pact.queue = PQ_ACTIVE + q; 2241 mark->pact.pc = PQ_ACTIVE + q; 2242 mark->pact.wire_count = 1; 2243 vm_page_queues_spin_lock(PQ_ACTIVE + q); 2244 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, 2245 &mark->pact, pageq); 2246 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 2247 } 2248 2249 /* 2250 * We only need to set up once. 2251 */ 2252 isep = 0; 2253 if (curthread == emergpager) { 2254 isep = 1; 2255 goto skip_setup; 2256 } 2257 2258 /* 2259 * Initialize vm_max_launder per pageout pass to be 1/16 2260 * of total physical memory, plus a little slop. 2261 */ 2262 if (vm_max_launder == 0) 2263 vm_max_launder = physmem / 256 + 16; 2264 2265 /* 2266 * Initialize some paging parameters. 2267 */ 2268 vm_pageout_free_page_calc(vmstats.v_page_count); 2269 2270 /* 2271 * Basic pageout daemon paging operation settings 2272 */ 2273 vmstats.v_free_target = vmstats.v_free_min * 2; 2274 2275 vmstats.v_paging_wait = vmstats.v_free_min * 2; 2276 vmstats.v_paging_start = vmstats.v_free_min * 3; 2277 vmstats.v_paging_target1 = vmstats.v_free_min * 4; 2278 vmstats.v_paging_target2 = vmstats.v_free_min * 5; 2279 2280 /* 2281 * NOTE: With the new buffer cache b_act_count we want the default 2282 * inactive target to be a percentage of available memory. 2283 * 2284 * The inactive target essentially determines the minimum 2285 * number of 'temporary' pages capable of caching one-time-use 2286 * files when the VM system is otherwise full of pages 2287 * belonging to multi-time-use files or active program data. 2288 * 2289 * NOTE: The inactive target is aggressively pursued only if the 2290 * inactive queue becomes too small. If the inactive queue 2291 * is large enough to satisfy page movement to free+cache 2292 * then it is repopulated more slowly from the active queue. 2293 * This allows a general inactive_target default to be set. 2294 * 2295 * There is an issue here for processes which sit mostly idle 2296 * 'overnight', such as sshd, tcsh, and X. Any movement from 2297 * the active queue will eventually cause such pages to 2298 * recycle, causing a lot of paging in the morning. 2299 * To reduce the incidence of this, pages cycled out of the 2300 * buffer cache are moved directly to the inactive queue if 2301 * they were only used once or twice. 2302 * 2303 * The vfs.vm_cycle_point sysctl can be used to adjust this. 2304 * Increasing the value (up to 64) increases the number of 2305 * buffer recyclements which go directly to the inactive queue.
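 *
 * NOTE: (added commentary, illustrative arithmetic only) The paging
 * thresholds configured above form a simple ladder on top of
 * v_free_min: v_free_target and v_paging_wait sit at 2x,
 * v_paging_start at 3x, v_paging_target1 at 4x, and
 * v_paging_target2 at 5x.  For example, with v_free_min at
 * 5000 pages, target2 works out to 25000 pages (roughly 100MB
 * with 4K pages).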
2306 * 2307 * NOTE: There is no 'cache target'. The combined (free + cache) target 2308 * is handled by the v_paging_* targets above. 2309 */ 2310 vmstats.v_inactive_target = vmstats.v_free_count / 16; 2311 //vmstats.v_inactive_target = vmstats.v_free_min * 4; 2312 2313 /* XXX does not really belong here */ 2314 if (vm_page_max_wired == 0) 2315 vm_page_max_wired = vmstats.v_free_count / 3; 2316 2317 /* 2318 * page stats operation. 2319 * 2320 * scan - needs to be large enough for decent turn-around but 2321 * not so large that it eats a ton of CPU. Pages per run. 2322 * 2323 * ticks - interval per run in ticks. 2324 * 2325 * run - number of seconds after the pagedaemon has run that 2326 * we continue to collect page stats, after which we stop. 2327 * 2328 * Calculated for 50% coverage. 2329 * 2330 */ 2331 if (vm_pageout_stats_scan == 0) { 2332 vm_pageout_stats_scan = vmstats.v_free_count / PQ_L2_SIZE / 16; 2333 if (vm_pageout_stats_scan < 16) 2334 vm_pageout_stats_scan = 16; 2335 } 2336 2337 if (vm_pageout_stats_ticks == 0) 2338 vm_pageout_stats_ticks = hz / 10; 2339 2340 vm_pagedaemon_uptime = time_uptime; 2341 2342 swap_pager_swap_init(); 2343 pass = 0; 2344 2345 atomic_swap_int(&sequence_emerg_pager, 1); 2346 wakeup(&sequence_emerg_pager); 2347 2348 skip_setup: 2349 /* 2350 * Sequence emergency pager startup 2351 */ 2352 if (isep) { 2353 while (sequence_emerg_pager == 0) 2354 tsleep(&sequence_emerg_pager, 0, "pstartup", hz); 2355 } 2356 2357 /* 2358 * The pageout daemon is never done, so loop forever. 2359 * 2360 * WARNING! This code is being executed by two kernel threads 2361 * potentially simultaneously. 2362 */ 2363 while (TRUE) { 2364 int error; 2365 long avail_shortage; 2366 long inactive_shortage; 2367 long vnodes_skipped = 0; 2368 long recycle_count = 0; 2369 long tmp; 2370 2371 /* 2372 * Wait for an action request. If we time out, check to 2373 * see if paging is needed (in case the normal wakeup 2374 * code raced us). 2375 */ 2376 if (isep) { 2377 /* 2378 * Emergency pagedaemon monitors the primary 2379 * pagedaemon while vm_pages_needed != 0. 2380 * 2381 * The emergency pagedaemon only runs if VM paging 2382 * is needed and the primary pagedaemon has not 2383 * updated vm_pagedaemon_uptime for more than 2 seconds. 2384 */ 2385 if (vm_pages_needed) 2386 tsleep(&vm_pagedaemon_uptime, 0, "psleep", hz); 2387 else 2388 tsleep(&vm_pagedaemon_uptime, 0, "psleep", hz*10); 2389 if (vm_pages_needed == 0) { 2390 pass = 0; 2391 continue; 2392 } 2393 if ((int)(time_uptime - vm_pagedaemon_uptime) < 2) { 2394 pass = 0; 2395 continue; 2396 } 2397 } else { 2398 /* 2399 * Primary pagedaemon 2400 * 2401 * Do an unconditional partial scan to deal with 2402 * PQ_HOLD races and to maintain active stats on 2403 * pages that are in PQ_ACTIVE. 2404 */ 2405 vm_pageout_scan_hold(q3iterator & PQ_L2_MASK, 2406 &markers[q3iterator & PQ_L2_MASK].hold); 2407 vm_pageout_page_stats(q3iterator & PQ_L2_MASK, 2408 &markers[q3iterator & PQ_L2_MASK].stat, 2409 scounter); 2410 ++q3iterator; 2411 2412 /* 2413 * Primary idle sleep loop, check condition after 2414 * sleep. 2415 * 2416 * NOTE: State will not be IDLE if vm_pages_needed 2417 * is non-zero.
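 *
 * Rough sketch of the state progression (added commentary, not
 * authoritative): an idle timeout or an external wakeup moves the
 * primary daemon from PAGING_IDLE to PAGING_TARGET1; once target1
 * is satisfied the bottom of the main loop may demote the state to
 * PAGING_TARGET2 so the daemon works less aggressively, and when
 * both targets are met vm_pages_needed drops back to 0 and the
 * daemon effectively idles again.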
2418 */ 2419 if (vm_pages_needed == 0) { 2420 error = tsleep(&vm_pages_needed, 2421 0, "psleep", 2422 vm_pageout_stats_ticks); 2423 if (error && 2424 vm_paging_start(0) == 0 && 2425 vm_pages_needed == 0) 2426 { 2427 continue; 2428 } 2429 vm_pagedaemon_uptime = time_uptime; 2430 vm_pages_needed = 1; 2431 state = PAGING_TARGET1; 2432 2433 /* 2434 * Wake the emergency pagedaemon up so it 2435 * can monitor us. It will automatically 2436 * go back into a long sleep when 2437 * vm_pages_needed returns to 0. 2438 */ 2439 wakeup(&vm_pagedaemon_uptime); 2440 } 2441 } 2442 2443 mycpu->gd_cnt.v_pdwakeups++; 2444 2445 /* 2446 * Scan for INACTIVE->CLEAN/PAGEOUT 2447 * 2448 * This routine tries to avoid thrashing the system with 2449 * unnecessary activity. 2450 * 2451 * Calculate our target for the number of free+cache pages we 2452 * want to get to. This is higher than the number that causes 2453 * allocations to stall (severe) in order to provide hysteresis, 2454 * and if we don't make it all the way but get to the minimum 2455 * we're happy. Goose it a bit if there are multiple requests 2456 * for memory. 2457 * 2458 * Don't reduce avail_shortage inside the loop or the 2459 * PQAVERAGE() calculation will break. 2460 * 2461 * NOTE! deficit is differentiated from avail_shortage as 2462 * REQUIRING at least (deficit) pages to be cleaned, 2463 * even if the page queues are in good shape. This 2464 * is used primarily for handling per-process 2465 * RLIMIT_RSS and may also see small values when 2466 * processes block due to low memory. 2467 */ 2468 vmstats_rollup(); 2469 if (isep == 0) 2470 vm_pagedaemon_uptime = time_uptime; 2471 2472 if (state == PAGING_TARGET1) { 2473 avail_shortage = vm_paging_target1_count() + 2474 vm_pageout_deficit; 2475 } else { 2476 avail_shortage = vm_paging_target2_count() + 2477 vm_pageout_deficit; 2478 } 2479 vm_pageout_deficit = 0; 2480 2481 if (avail_shortage > 0) { 2482 long delta = 0; 2483 long counts[4] = { 0, 0, 0, 0 }; 2484 long use = avail_shortage; 2485 int qq; 2486 2487 if (vm_pageout_debug) { 2488 kprintf("scan_inactive pass %d isep=%d\t", 2489 pass / MAXSCAN_DIVIDER, isep); 2490 } 2491 2492 /* 2493 * Once target1 is achieved we move on to target2, 2494 * but page out more lazily in smaller batches. 2495 */ 2496 if (state == PAGING_TARGET2 && 2497 use > vmstats.v_inactive_target / 10) 2498 { 2499 use = vmstats.v_inactive_target / 10 + 1; 2500 } 2501 2502 qq = q1iterator; 2503 for (q = 0; q < PQ_L2_SIZE; ++q) { 2504 delta += vm_pageout_scan_inactive( 2505 pass / MAXSCAN_DIVIDER, 2506 qq & PQ_L2_MASK, 2507 PQAVERAGE(use), 2508 &vnodes_skipped, counts); 2509 if (isep) 2510 --qq; 2511 else 2512 ++qq; 2513 if (avail_shortage - delta <= 0) 2514 break; 2515 2516 /* 2517 * It is possible for avail_shortage to be 2518 * very large. If a large program exits or 2519 * frees a ton of memory all at once, we do 2520 * not have to continue deactivations. 2521 * 2522 * (We will still run the active->inactive 2523 * target, however). 2524 */ 2525 if (!vm_paging_target2() && 2526 !vm_paging_min_dnc(vm_page_free_hysteresis)) { 2527 avail_shortage = 0; 2528 break; 2529 } 2530 } 2531 if (vm_pageout_debug) { 2532 kprintf("flushed %ld cleaned %ld " 2533 "lru2 %ld react %ld " 2534 "delta %ld\n", 2535 counts[0], counts[1], 2536 counts[2], counts[3], 2537 delta); 2538 } 2539 avail_shortage -= delta; 2540 q1iterator = qq; 2541 } 2542 2543 /* 2544 * Figure out how many active pages we must deactivate.
If 2545 * we were able to reach our target with just the inactive 2546 * scan above we limit the number of active pages we 2547 * deactivate to reduce unnecessary work. 2548 * 2549 * When calculating inactive_shortage notice that we are 2550 * departing from what vm_paging_inactive_count() does. 2551 * During paging, the free + cache queues are assumed to 2552 * be under stress, so only a pure inactive target is 2553 * calculated without taking into account v_free_min, 2554 * v_free_count, or v_cache_count. 2555 */ 2556 vmstats_rollup(); 2557 if (isep == 0) 2558 vm_pagedaemon_uptime = time_uptime; 2559 inactive_shortage = vmstats.v_inactive_target - 2560 vmstats.v_inactive_count; 2561 2562 /* 2563 * If we were unable to free sufficient inactive pages to 2564 * satisfy the free/cache queue requirements then simply 2565 * reaching the inactive target may not be good enough. 2566 * Try to deactivate pages in excess of the target based 2567 * on the shortfall. 2568 * 2569 * However to prevent thrashing the VM system do not 2570 * deactivate more than an additional 1/10 the inactive 2571 * target's worth of active pages. 2572 */ 2573 if (avail_shortage > 0) { 2574 tmp = avail_shortage * 2; 2575 if (tmp > vmstats.v_inactive_target / 10) 2576 tmp = vmstats.v_inactive_target / 10; 2577 inactive_shortage += tmp; 2578 } 2579 2580 /* 2581 * Only trigger a pmap cleanup on inactive shortage. 2582 */ 2583 if (isep == 0 && inactive_shortage > 0) { 2584 pmap_collect(); 2585 } 2586 2587 /* 2588 * Scan for ACTIVE->INACTIVE 2589 * 2590 * Only trigger on inactive shortage. Triggering on 2591 * avail_shortage can starve the active queue with 2592 * unnecessary active->inactive transitions and destroy 2593 * performance. 2594 * 2595 * If this is the emergency pager, always try to move 2596 * a few pages from active to inactive because the inactive 2597 * queue might have enough pages, but not enough anonymous 2598 * pages. 2599 */ 2600 if (isep && inactive_shortage < vm_emerg_launder) 2601 inactive_shortage = vm_emerg_launder; 2602 2603 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) { 2604 long delta = 0; 2605 int qq; 2606 2607 qq = q2iterator; 2608 for (q = 0; q < PQ_L2_SIZE; ++q) { 2609 delta += vm_pageout_scan_active( 2610 pass / MAXSCAN_DIVIDER, 2611 qq & PQ_L2_MASK, 2612 PQAVERAGE(avail_shortage), 2613 PQAVERAGE(inactive_shortage), 2614 &markers[qq & PQ_L2_MASK].pact, 2615 &recycle_count); 2616 if (isep) 2617 --qq; 2618 else 2619 ++qq; 2620 if (inactive_shortage - delta <= 0 && 2621 avail_shortage - delta <= 0) { 2622 break; 2623 } 2624 2625 /* 2626 * inactive_shortage can be a very large 2627 * number. This is intended to break out 2628 * early if our inactive_target has been 2629 * reached due to other system activity. 2630 */ 2631 if (vmstats.v_inactive_count > 2632 vmstats.v_inactive_target) 2633 { 2634 inactive_shortage = 0; 2635 break; 2636 } 2637 } 2638 inactive_shortage -= delta; 2639 avail_shortage -= delta; 2640 q2iterator = qq; 2641 } 2642 2643 /* 2644 * Scan for CACHE->FREE 2645 * 2646 * Finally free enough cache pages to meet our free page 2647 * requirement and take more drastic measures if we are 2648 * still in trouble. 
2649 */ 2650 vmstats_rollup(); 2651 if (isep == 0) 2652 vm_pagedaemon_uptime = time_uptime; 2653 vm_pageout_scan_cache(avail_shortage, pass / MAXSCAN_DIVIDER, 2654 vnodes_skipped, recycle_count); 2655 2656 /* 2657 * This is a bit sophisticated because we do not necessarily 2658 * want to force paging until our targets are reached if we 2659 * were able to successfully retire the shortage we calculated. 2660 */ 2661 if (avail_shortage > 0) { 2662 /* 2663 * If we did not retire enough pages, continue the 2664 * pageout operation until we are able to. It 2665 * takes MAXSCAN_DIVIDER passes to cover the entire 2666 * inactive list. 2667 */ 2668 ++pass; 2669 2670 if (pass / MAXSCAN_DIVIDER < 10 && 2671 vm_pages_needed > 1) { 2672 /* 2673 * Normal operation, additional processes 2674 * have already kicked us. Retry immediately 2675 * unless swap space is completely full in 2676 * which case delay a bit. 2677 */ 2678 if (swap_pager_full) { 2679 tsleep(&vm_pages_needed, 0, "pdelay", 2680 hz / 5); 2681 } /* else immediate retry */ 2682 } else if (pass / MAXSCAN_DIVIDER < 10) { 2683 /* 2684 * Do a short sleep for the first 10 passes, 2685 * allow the sleep to be woken up by resetting 2686 * vm_pages_needed to 1 (NOTE: we are still 2687 * actively paging!). 2688 */ 2689 if (isep == 0) 2690 vm_pages_needed = 1; 2691 tsleep(&vm_pages_needed, 0, "pdelay", 2); 2692 } else if (swap_pager_full == 0) { 2693 /* 2694 * We've taken too many passes, force a 2695 * longer delay. 2696 */ 2697 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2698 } else { 2699 /* 2700 * Running out of memory, catastrophic 2701 * back-off to one-second intervals. 2702 */ 2703 tsleep(&vm_pages_needed, 0, "pdelay", hz); 2704 } 2705 } else { 2706 /* 2707 * Reset pass 2708 */ 2709 pass = 0; 2710 2711 if (vm_paging_start(0) || 2712 vm_paging_min_dnc(vm_page_free_hysteresis)) 2713 { 2714 /* 2715 * Pages sufficiently exhausted to start 2716 * page-daemon in TARGET1 mode 2717 */ 2718 state = PAGING_TARGET1; 2719 vm_pages_needed = 2; 2720 2721 /* 2722 * We can wake up waiters if we are above 2723 * the wait point. 2724 */ 2725 if (!vm_paging_wait()) 2726 wakeup(&vmstats.v_free_count); 2727 } else if (vm_pages_needed) { 2728 /* 2729 * Continue paging until TARGET2 reached, 2730 * but waiters can be woken up. 2731 * 2732 * The PAGING_TARGET2 state tells the 2733 * pagedaemon to work a little less hard. 2734 */ 2735 if (vm_paging_target1()) { 2736 state = PAGING_TARGET1; 2737 vm_pages_needed = 2; 2738 } else if (vm_paging_target2()) { 2739 state = PAGING_TARGET2; 2740 vm_pages_needed = 2; 2741 } else { 2742 vm_pages_needed = 0; 2743 } 2744 wakeup(&vmstats.v_free_count); 2745 } /* else nothing to do here */ 2746 } 2747 } 2748 } 2749 2750 static struct kproc_desc pg1_kp = { 2751 "pagedaemon", 2752 vm_pageout_thread, 2753 &pagethread 2754 }; 2755 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp); 2756 2757 static struct kproc_desc pg2_kp = { 2758 "emergpager", 2759 vm_pageout_thread, 2760 &emergpager 2761 }; 2762 SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp); 2763 2764 2765 /* 2766 * Called after allocating a page out of the cache or free queue 2767 * to possibly wake the pagedaemon up to replenish our supply. 2768 * 2769 * We try to generate some hysteresis by waking the pagedaemon up 2770 * when our free+cache pages go below the free_min+cache_min level. 2771 * The pagedaemon tries to get the count back up to at least the 2772 * minimum, and through to the target level if possible.
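 *
 * Example (added commentary, illustrative): an allocation that drags
 * free+cache below the start point makes vm_paging_start(0) true, so
 * the first caller sets vm_pages_needed and tickles the daemon;
 * later callers only bump the count further once the paging minimum
 * has also been breached, which avoids a storm of redundant wakeups.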
2773 * 2774 * If the pagedaemon is already active bump vm_pages_needed as a hint 2775 * that there are even more requests pending. 2776 * 2777 * SMP races ok? 2778 * No requirements. 2779 */ 2780 void 2781 pagedaemon_wakeup(void) 2782 { 2783 if (vm_paging_start(0) && curthread != pagethread) { 2784 if (vm_pages_needed <= 1) { 2785 vm_pages_needed = 1; /* SMP race ok */ 2786 wakeup(&vm_pages_needed); /* tickle pageout */ 2787 } else if (vm_paging_min()) { 2788 ++vm_pages_needed; /* SMP race ok */ 2789 /* a wakeup() would be wasted here */ 2790 } 2791 } 2792 } 2793 2794 #if !defined(NO_SWAPPING) 2795 2796 /* 2797 * SMP races ok? 2798 * No requirements. 2799 */ 2800 static void 2801 vm_req_vmdaemon(void) 2802 { 2803 static int lastrun = 0; 2804 2805 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 2806 wakeup(&vm_daemon_needed); 2807 lastrun = ticks; 2808 } 2809 } 2810 2811 static int vm_daemon_callback(struct proc *p, void *data __unused); 2812 2813 /* 2814 * No requirements. 2815 * 2816 * Scan processes for exceeding their rlimits, deactivate pages 2817 * when RSS is exceeded. 2818 */ 2819 static void 2820 vm_daemon(void) 2821 { 2822 while (TRUE) { 2823 tsleep(&vm_daemon_needed, 0, "psleep", 0); 2824 allproc_scan(vm_daemon_callback, NULL, 0); 2825 } 2826 } 2827 2828 static int 2829 vm_daemon_callback(struct proc *p, void *data __unused) 2830 { 2831 struct vmspace *vm; 2832 vm_pindex_t limit, size; 2833 2834 /* 2835 * if this is a system process or if we have already 2836 * looked at this process, skip it. 2837 */ 2838 lwkt_gettoken(&p->p_token); 2839 2840 if (p->p_flags & (P_SYSTEM | P_WEXIT)) { 2841 lwkt_reltoken(&p->p_token); 2842 return (0); 2843 } 2844 2845 /* 2846 * if the process is in a non-running type state, 2847 * don't touch it. 2848 */ 2849 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 2850 lwkt_reltoken(&p->p_token); 2851 return (0); 2852 } 2853 2854 /* 2855 * get a limit 2856 */ 2857 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, 2858 p->p_rlimit[RLIMIT_RSS].rlim_max)); 2859 2860 vm = p->p_vmspace; 2861 vmspace_hold(vm); 2862 size = pmap_resident_tlnw_count(&vm->vm_pmap); 2863 if (limit >= 0 && size > 4096 && 2864 size - 4096 >= limit && vm_pageout_memuse_mode >= 1) { 2865 vm_pageout_map_deactivate_pages(&vm->vm_map, limit); 2866 } 2867 vmspace_drop(vm); 2868 2869 lwkt_reltoken(&p->p_token); 2870 2871 return (0); 2872 } 2873 2874 #endif 2875