1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * The Mach Operating System project at Carnegie-Mellon University. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 37 * 38 * 39 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 40 * All rights reserved. 41 * 42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 43 * 44 * Permission to use, copy, modify and distribute this software and 45 * its documentation is hereby granted, provided that both the copyright 46 * notice and this permission notice appear in all copies of the 47 * software, derivative works or modified versions, and any portions 48 * thereof, and that both notices appear in supporting documentation. 49 * 50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 51 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 53 * 54 * Carnegie Mellon requests users of this software to return to 55 * 56 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 57 * School of Computer Science 58 * Carnegie Mellon University 59 * Pittsburgh PA 15213-3890 60 * 61 * any improvements or extensions that they make and grant Carnegie the 62 * rights to redistribute these changes. 63 * 64 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $ 65 */ 66 67 /* 68 * The proverbial page-out daemon. 69 */ 70 71 #include "opt_vm.h" 72 #include <sys/param.h> 73 #include <sys/systm.h> 74 #include <sys/kernel.h> 75 #include <sys/proc.h> 76 #include <sys/kthread.h> 77 #include <sys/resourcevar.h> 78 #include <sys/signalvar.h> 79 #include <sys/vnode.h> 80 #include <sys/vmmeter.h> 81 #include <sys/conf.h> 82 #include <sys/sysctl.h> 83 84 #include <vm/vm.h> 85 #include <vm/vm_param.h> 86 #include <sys/lock.h> 87 #include <vm/vm_object.h> 88 #include <vm/vm_page.h> 89 #include <vm/vm_map.h> 90 #include <vm/vm_pageout.h> 91 #include <vm/vm_pager.h> 92 #include <vm/swap_pager.h> 93 #include <vm/vm_extern.h> 94 95 #include <sys/spinlock2.h> 96 #include <vm/vm_page2.h> 97 98 /* 99 * System initialization 100 */ 101 102 /* the kernel process "vm_pageout"*/ 103 static int vm_pageout_page(vm_page_t m, long *max_launderp, 104 long *vnodes_skippedp, struct vnode **vpfailedp, 105 int pass, int vmflush_flags); 106 static int vm_pageout_clean_helper (vm_page_t, int); 107 static int vm_pageout_free_page_calc (vm_size_t count); 108 static void vm_pageout_page_free(vm_page_t m) ; 109 struct thread *emergpager; 110 struct thread *pagethread; 111 static int sequence_emerg_pager; 112 113 #if !defined(NO_SWAPPING) 114 /* the kernel process "vm_daemon"*/ 115 static void vm_daemon (void); 116 static struct thread *vmthread; 117 118 static struct kproc_desc vm_kp = { 119 "vmdaemon", 120 vm_daemon, 121 &vmthread 122 }; 123 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); 124 #endif 125 126 int vm_pages_needed = 0; /* Event on which pageout daemon sleeps */ 127 int vm_pageout_deficit = 0; /* Estimated number of pages deficit */ 128 int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */ 129 int vm_page_free_hysteresis = 16; 130 static int vm_pagedaemon_time; 131 132 #if !defined(NO_SWAPPING) 133 static int vm_pageout_req_swapout; 134 static int vm_daemon_needed; 135 #endif 136 static int vm_max_launder = 4096; 137 static int vm_emerg_launder = 100; 138 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; 139 static int vm_pageout_full_stats_interval = 0; 140 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; 141 static int defer_swap_pageouts=0; 142 static int disable_swap_pageouts=0; 143 static u_int vm_anonmem_decline = ACT_DECLINE; 144 static u_int vm_filemem_decline = ACT_DECLINE * 2; 145 146 #if defined(NO_SWAPPING) 147 static int vm_swap_enabled=0; 148 static int vm_swap_idle_enabled=0; 149 #else 150 static int vm_swap_enabled=1; 151 static int vm_swap_idle_enabled=0; 152 #endif 153 int vm_pageout_memuse_mode=1; /* 0-disable, 1-passive, 2-active swp*/ 154 155 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline, 156 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory"); 157 158 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline, 159 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache"); 160 161 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis, 162 CTLFLAG_RW, &vm_page_free_hysteresis, 0, 163 "Free more pages than the minimum required"); 164 165 SYSCTL_INT(_vm, OID_AUTO, max_launder, 166 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 167 SYSCTL_INT(_vm, OID_AUTO, emerg_launder, 168 CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum"); 169 170 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, 171 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 172 173 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, 174 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); 175 176 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, 177 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); 178 179 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, 180 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); 181 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode, 182 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode"); 183 184 #if defined(NO_SWAPPING) 185 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 186 CTLFLAG_RD, &vm_swap_enabled, 0, ""); 187 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 188 CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); 189 #else 190 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 191 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 192 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 193 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 194 #endif 195 196 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 197 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 198 199 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 200 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 201 202 static int pageout_lock_miss; 203 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 204 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 205 206 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 207 208 #if !defined(NO_SWAPPING) 209 static void vm_req_vmdaemon (void); 210 #endif 211 static void vm_pageout_page_stats(int q); 212 213 /* 214 * Calculate approximately how many pages on each queue to try to 215 * clean. An exact calculation creates an edge condition when the 216 * queues are unbalanced so add significant slop. The queue scans 217 * will stop early when targets are reached and will start where they 218 * left off on the next pass. 219 * 220 * We need to be generous here because there are all sorts of loading 221 * conditions that can cause edge cases if try to average over all queues. 222 * In particular, storage subsystems have become so fast that paging 223 * activity can become quite frantic. Eventually we will probably need 224 * two paging threads, one for dirty pages and one for clean, to deal 225 * with the bandwidth requirements. 226 227 * So what we do is calculate a value that can be satisfied nominally by 228 * only having to scan half the queues. 229 */ 230 static __inline long 231 PQAVERAGE(long n) 232 { 233 long avg; 234 235 if (n >= 0) { 236 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1); 237 } else { 238 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1); 239 } 240 return avg; 241 } 242 243 /* 244 * vm_pageout_clean_helper: 245 * 246 * Clean the page and remove it from the laundry. The page must be busied 247 * by the caller and will be disposed of (put away, flushed) by this routine. 248 */ 249 static int 250 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags) 251 { 252 vm_object_t object; 253 vm_page_t mc[BLIST_MAX_ALLOC]; 254 int error; 255 int ib, is, page_base; 256 vm_pindex_t pindex = m->pindex; 257 258 object = m->object; 259 260 /* 261 * Don't mess with the page if it's held or special. Theoretically 262 * we can pageout held pages but there is no real need to press our 263 * luck, so don't. 264 */ 265 if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) { 266 vm_page_wakeup(m); 267 return 0; 268 } 269 270 /* 271 * Place page in cluster. Align cluster for optimal swap space 272 * allocation (whether it is swap or not). This is typically ~16-32 273 * pages, which also tends to align the cluster to multiples of the 274 * filesystem block size if backed by a filesystem. 275 */ 276 page_base = pindex % BLIST_MAX_ALLOC; 277 mc[page_base] = m; 278 ib = page_base - 1; 279 is = page_base + 1; 280 281 /* 282 * Scan object for clusterable pages. 283 * 284 * We can cluster ONLY if: ->> the page is NOT 285 * clean, wired, busy, held, or mapped into a 286 * buffer, and one of the following: 287 * 1) The page is inactive, or a seldom used 288 * active page. 289 * -or- 290 * 2) we force the issue. 291 * 292 * During heavy mmap/modification loads the pageout 293 * daemon can really fragment the underlying file 294 * due to flushing pages out of order and not trying 295 * align the clusters (which leave sporatic out-of-order 296 * holes). To solve this problem we do the reverse scan 297 * first and attempt to align our cluster, then do a 298 * forward scan if room remains. 299 */ 300 vm_object_hold(object); 301 302 while (ib >= 0) { 303 vm_page_t p; 304 305 p = vm_page_lookup_busy_try(object, pindex - page_base + ib, 306 TRUE, &error); 307 if (error || p == NULL) 308 break; 309 if ((p->queue - p->pc) == PQ_CACHE || 310 (p->flags & PG_UNMANAGED)) { 311 vm_page_wakeup(p); 312 break; 313 } 314 vm_page_test_dirty(p); 315 if (((p->dirty & p->valid) == 0 && 316 (p->flags & PG_NEED_COMMIT) == 0) || 317 p->wire_count != 0 || /* may be held by buf cache */ 318 p->hold_count != 0) { /* may be undergoing I/O */ 319 vm_page_wakeup(p); 320 break; 321 } 322 if (p->queue - p->pc != PQ_INACTIVE) { 323 if (p->queue - p->pc != PQ_ACTIVE || 324 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 325 vm_page_wakeup(p); 326 break; 327 } 328 } 329 330 /* 331 * Try to maintain page groupings in the cluster. 332 */ 333 if (m->flags & PG_WINATCFLS) 334 vm_page_flag_set(p, PG_WINATCFLS); 335 else 336 vm_page_flag_clear(p, PG_WINATCFLS); 337 p->act_count = m->act_count; 338 339 mc[ib] = p; 340 --ib; 341 } 342 ++ib; /* fixup */ 343 344 while (is < BLIST_MAX_ALLOC && 345 pindex - page_base + is < object->size) { 346 vm_page_t p; 347 348 p = vm_page_lookup_busy_try(object, pindex - page_base + is, 349 TRUE, &error); 350 if (error || p == NULL) 351 break; 352 if (((p->queue - p->pc) == PQ_CACHE) || 353 (p->flags & PG_UNMANAGED)) { 354 vm_page_wakeup(p); 355 break; 356 } 357 vm_page_test_dirty(p); 358 if (((p->dirty & p->valid) == 0 && 359 (p->flags & PG_NEED_COMMIT) == 0) || 360 p->wire_count != 0 || /* may be held by buf cache */ 361 p->hold_count != 0) { /* may be undergoing I/O */ 362 vm_page_wakeup(p); 363 break; 364 } 365 if (p->queue - p->pc != PQ_INACTIVE) { 366 if (p->queue - p->pc != PQ_ACTIVE || 367 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 368 vm_page_wakeup(p); 369 break; 370 } 371 } 372 373 /* 374 * Try to maintain page groupings in the cluster. 375 */ 376 if (m->flags & PG_WINATCFLS) 377 vm_page_flag_set(p, PG_WINATCFLS); 378 else 379 vm_page_flag_clear(p, PG_WINATCFLS); 380 p->act_count = m->act_count; 381 382 mc[is] = p; 383 ++is; 384 } 385 386 vm_object_drop(object); 387 388 /* 389 * we allow reads during pageouts... 390 */ 391 return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags); 392 } 393 394 /* 395 * vm_pageout_flush() - launder the given pages 396 * 397 * The given pages are laundered. Note that we setup for the start of 398 * I/O ( i.e. busy the page ), mark it read-only, and bump the object 399 * reference count all in here rather then in the parent. If we want 400 * the parent to do more sophisticated things we may have to change 401 * the ordering. 402 * 403 * The pages in the array must be busied by the caller and will be 404 * unbusied by this function. 405 */ 406 int 407 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags) 408 { 409 vm_object_t object; 410 int pageout_status[count]; 411 int numpagedout = 0; 412 int i; 413 414 /* 415 * Initiate I/O. Bump the vm_page_t->busy counter. 416 */ 417 for (i = 0; i < count; i++) { 418 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 419 ("vm_pageout_flush page %p index %d/%d: partially " 420 "invalid page", mc[i], i, count)); 421 vm_page_io_start(mc[i]); 422 } 423 424 /* 425 * We must make the pages read-only. This will also force the 426 * modified bit in the related pmaps to be cleared. The pager 427 * cannot clear the bit for us since the I/O completion code 428 * typically runs from an interrupt. The act of making the page 429 * read-only handles the case for us. 430 * 431 * Then we can unbusy the pages, we still hold a reference by virtue 432 * of our soft-busy. 433 */ 434 for (i = 0; i < count; i++) { 435 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) 436 vm_page_protect(mc[i], VM_PROT_NONE); 437 else 438 vm_page_protect(mc[i], VM_PROT_READ); 439 vm_page_wakeup(mc[i]); 440 } 441 442 object = mc[0]->object; 443 vm_object_pip_add(object, count); 444 445 vm_pager_put_pages(object, mc, count, 446 (vmflush_flags | 447 ((object == &kernel_object) ? 448 VM_PAGER_PUT_SYNC : 0)), 449 pageout_status); 450 451 for (i = 0; i < count; i++) { 452 vm_page_t mt = mc[i]; 453 454 switch (pageout_status[i]) { 455 case VM_PAGER_OK: 456 numpagedout++; 457 break; 458 case VM_PAGER_PEND: 459 numpagedout++; 460 break; 461 case VM_PAGER_BAD: 462 /* 463 * Page outside of range of object. Right now we 464 * essentially lose the changes by pretending it 465 * worked. 466 */ 467 vm_page_busy_wait(mt, FALSE, "pgbad"); 468 pmap_clear_modify(mt); 469 vm_page_undirty(mt); 470 vm_page_wakeup(mt); 471 break; 472 case VM_PAGER_ERROR: 473 case VM_PAGER_FAIL: 474 /* 475 * A page typically cannot be paged out when we 476 * have run out of swap. We leave the page 477 * marked inactive and will try to page it out 478 * again later. 479 * 480 * Starvation of the active page list is used to 481 * determine when the system is massively memory 482 * starved. 483 */ 484 break; 485 case VM_PAGER_AGAIN: 486 break; 487 } 488 489 /* 490 * If not PENDing this was a synchronous operation and we 491 * clean up after the I/O. If it is PENDing the mess is 492 * cleaned up asynchronously. 493 * 494 * Also nominally act on the caller's wishes if the caller 495 * wants to try to really clean (cache or free) the page. 496 * 497 * Also nominally deactivate the page if the system is 498 * memory-stressed. 499 */ 500 if (pageout_status[i] != VM_PAGER_PEND) { 501 vm_page_busy_wait(mt, FALSE, "pgouw"); 502 vm_page_io_finish(mt); 503 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) { 504 vm_page_try_to_cache(mt); 505 } else if (vm_page_count_severe()) { 506 vm_page_deactivate(mt); 507 vm_page_wakeup(mt); 508 } else { 509 vm_page_wakeup(mt); 510 } 511 vm_object_pip_wakeup(object); 512 } 513 } 514 return numpagedout; 515 } 516 517 #if !defined(NO_SWAPPING) 518 519 /* 520 * Callback function, page busied for us. We must dispose of the busy 521 * condition. Any related pmap pages may be held but will not be locked. 522 */ 523 static 524 int 525 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va, 526 vm_page_t p) 527 { 528 int actcount; 529 int cleanit = 0; 530 531 /* 532 * Basic tests - There should never be a marker, and we can stop 533 * once the RSS is below the required level. 534 */ 535 KKASSERT((p->flags & PG_MARKER) == 0); 536 if (pmap_resident_tlnw_count(info->pmap) <= info->limit) { 537 vm_page_wakeup(p); 538 return(-1); 539 } 540 541 mycpu->gd_cnt.v_pdpages++; 542 543 if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) { 544 vm_page_wakeup(p); 545 goto done; 546 } 547 548 ++info->actioncount; 549 550 /* 551 * Check if the page has been referened recently. If it has, 552 * activate it and skip. 553 */ 554 actcount = pmap_ts_referenced(p); 555 if (actcount) { 556 vm_page_flag_set(p, PG_REFERENCED); 557 } else if (p->flags & PG_REFERENCED) { 558 actcount = 1; 559 } 560 561 if (actcount) { 562 if (p->queue - p->pc != PQ_ACTIVE) { 563 vm_page_and_queue_spin_lock(p); 564 if (p->queue - p->pc != PQ_ACTIVE) { 565 vm_page_and_queue_spin_unlock(p); 566 vm_page_activate(p); 567 } else { 568 vm_page_and_queue_spin_unlock(p); 569 } 570 } else { 571 p->act_count += actcount; 572 if (p->act_count > ACT_MAX) 573 p->act_count = ACT_MAX; 574 } 575 vm_page_flag_clear(p, PG_REFERENCED); 576 vm_page_wakeup(p); 577 goto done; 578 } 579 580 /* 581 * Remove the page from this particular pmap. Once we do this, our 582 * pmap scans will not see it again (unless it gets faulted in), so 583 * we must actively dispose of or deal with the page. 584 */ 585 pmap_remove_specific(info->pmap, p); 586 587 /* 588 * If the page is not mapped to another process (i.e. as would be 589 * typical if this were a shared page from a library) then deactivate 590 * the page and clean it in two passes only. 591 * 592 * If the page hasn't been referenced since the last check, remove it 593 * from the pmap. If it is no longer mapped, deactivate it 594 * immediately, accelerating the normal decline. 595 * 596 * Once the page has been removed from the pmap the RSS code no 597 * longer tracks it so we have to make sure that it is staged for 598 * potential flush action. 599 */ 600 if ((p->flags & PG_MAPPED) == 0 || 601 (pmap_mapped_sync(p) & PG_MAPPED) == 0) { 602 if (p->queue - p->pc == PQ_ACTIVE) { 603 vm_page_deactivate(p); 604 } 605 if (p->queue - p->pc == PQ_INACTIVE) { 606 cleanit = 1; 607 } 608 } 609 610 /* 611 * Ok, try to fully clean the page and any nearby pages such that at 612 * least the requested page is freed or moved to the cache queue. 613 * 614 * We usually do this synchronously to allow us to get the page into 615 * the CACHE queue quickly, which will prevent memory exhaustion if 616 * a process with a memoryuse limit is running away. However, the 617 * sysadmin may desire to set vm.swap_user_async which relaxes this 618 * and improves write performance. 619 */ 620 if (cleanit) { 621 long max_launder = 0x7FFF; 622 long vnodes_skipped = 0; 623 int vmflush_flags; 624 struct vnode *vpfailed = NULL; 625 626 info->offset = va; 627 628 if (vm_pageout_memuse_mode >= 2) { 629 vmflush_flags = VM_PAGER_TRY_TO_CACHE | 630 VM_PAGER_ALLOW_ACTIVE; 631 if (swap_user_async == 0) 632 vmflush_flags |= VM_PAGER_PUT_SYNC; 633 vm_page_flag_set(p, PG_WINATCFLS); 634 info->cleancount += 635 vm_pageout_page(p, &max_launder, 636 &vnodes_skipped, 637 &vpfailed, 1, vmflush_flags); 638 } else { 639 vm_page_wakeup(p); 640 ++info->cleancount; 641 } 642 } else { 643 vm_page_wakeup(p); 644 } 645 646 /* 647 * Must be at end to avoid SMP races. 648 */ 649 done: 650 lwkt_user_yield(); 651 return 0; 652 } 653 654 /* 655 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits. 656 * that is relatively difficult to do. We try to keep track of where we 657 * left off last time to reduce scan overhead. 658 * 659 * Called when vm_pageout_memuse_mode is >= 1. 660 */ 661 void 662 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit) 663 { 664 vm_offset_t pgout_offset; 665 struct pmap_pgscan_info info; 666 int retries = 3; 667 668 pgout_offset = map->pgout_offset; 669 again: 670 #if 0 671 kprintf("%016jx ", pgout_offset); 672 #endif 673 if (pgout_offset < VM_MIN_USER_ADDRESS) 674 pgout_offset = VM_MIN_USER_ADDRESS; 675 if (pgout_offset >= VM_MAX_USER_ADDRESS) 676 pgout_offset = 0; 677 info.pmap = vm_map_pmap(map); 678 info.limit = limit; 679 info.beg_addr = pgout_offset; 680 info.end_addr = VM_MAX_USER_ADDRESS; 681 info.callback = vm_pageout_mdp_callback; 682 info.cleancount = 0; 683 info.actioncount = 0; 684 info.busycount = 0; 685 686 pmap_pgscan(&info); 687 pgout_offset = info.offset; 688 #if 0 689 kprintf("%016jx %08lx %08lx\n", pgout_offset, 690 info.cleancount, info.actioncount); 691 #endif 692 693 if (pgout_offset != VM_MAX_USER_ADDRESS && 694 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 695 goto again; 696 } else if (retries && 697 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 698 --retries; 699 goto again; 700 } 701 map->pgout_offset = pgout_offset; 702 } 703 #endif 704 705 /* 706 * Called when the pageout scan wants to free a page. We no longer 707 * try to cycle the vm_object here with a reference & dealloc, which can 708 * cause a non-trivial object collapse in a critical path. 709 * 710 * It is unclear why we cycled the ref_count in the past, perhaps to try 711 * to optimize shadow chain collapses but I don't quite see why it would 712 * be necessary. An OBJ_DEAD object should terminate any and all vm_pages 713 * synchronously and not have to be kicked-start. 714 */ 715 static void 716 vm_pageout_page_free(vm_page_t m) 717 { 718 vm_page_protect(m, VM_PROT_NONE); 719 vm_page_free(m); 720 } 721 722 /* 723 * vm_pageout_scan does the dirty work for the pageout daemon. 724 */ 725 struct vm_pageout_scan_info { 726 struct proc *bigproc; 727 vm_offset_t bigsize; 728 }; 729 730 static int vm_pageout_scan_callback(struct proc *p, void *data); 731 732 /* 733 * Scan inactive queue 734 * 735 * WARNING! Can be called from two pagedaemon threads simultaneously. 736 */ 737 static int 738 vm_pageout_scan_inactive(int pass, int q, long avail_shortage, 739 long *vnodes_skipped) 740 { 741 vm_page_t m; 742 struct vm_page marker; 743 struct vnode *vpfailed; /* warning, allowed to be stale */ 744 long maxscan; 745 long delta = 0; 746 long max_launder; 747 int isep; 748 749 isep = (curthread == emergpager); 750 751 /* 752 * Start scanning the inactive queue for pages we can move to the 753 * cache or free. The scan will stop when the target is reached or 754 * we have scanned the entire inactive queue. Note that m->act_count 755 * is not used to form decisions for the inactive queue, only for the 756 * active queue. 757 * 758 * max_launder limits the number of dirty pages we flush per scan. 759 * For most systems a smaller value (16 or 32) is more robust under 760 * extreme memory and disk pressure because any unnecessary writes 761 * to disk can result in extreme performance degredation. However, 762 * systems with excessive dirty pages (especially when MAP_NOSYNC is 763 * used) will die horribly with limited laundering. If the pageout 764 * daemon cannot clean enough pages in the first pass, we let it go 765 * all out in succeeding passes. 766 * 767 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 768 * PAGES. 769 */ 770 if ((max_launder = vm_max_launder) <= 1) 771 max_launder = 1; 772 if (pass) 773 max_launder = 10000; 774 775 /* 776 * Initialize our marker 777 */ 778 bzero(&marker, sizeof(marker)); 779 marker.flags = PG_FICTITIOUS | PG_MARKER; 780 marker.busy_count = PBUSY_LOCKED; 781 marker.queue = PQ_INACTIVE + q; 782 marker.pc = q; 783 marker.wire_count = 1; 784 785 /* 786 * Inactive queue scan. 787 * 788 * NOTE: The vm_page must be spinlocked before the queue to avoid 789 * deadlocks, so it is easiest to simply iterate the loop 790 * with the queue unlocked at the top. 791 */ 792 vpfailed = NULL; 793 794 vm_page_queues_spin_lock(PQ_INACTIVE + q); 795 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 796 maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt; 797 798 /* 799 * Queue locked at top of loop to avoid stack marker issues. 800 */ 801 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 802 maxscan-- > 0 && avail_shortage - delta > 0) 803 { 804 int count; 805 806 KKASSERT(m->queue == PQ_INACTIVE + q); 807 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, 808 &marker, pageq); 809 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m, 810 &marker, pageq); 811 mycpu->gd_cnt.v_pdpages++; 812 813 /* 814 * Skip marker pages (atomic against other markers to avoid 815 * infinite hop-over scans). 816 */ 817 if (m->flags & PG_MARKER) 818 continue; 819 820 /* 821 * Try to busy the page. Don't mess with pages which are 822 * already busy or reorder them in the queue. 823 */ 824 if (vm_page_busy_try(m, TRUE)) 825 continue; 826 827 /* 828 * Remaining operations run with the page busy and neither 829 * the page or the queue will be spin-locked. 830 */ 831 KKASSERT(m->queue == PQ_INACTIVE + q); 832 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 833 834 /* 835 * The emergency pager runs when the primary pager gets 836 * stuck, which typically means the primary pager deadlocked 837 * on a vnode-backed page. Therefore, the emergency pager 838 * must skip any complex objects. 839 * 840 * We disallow VNODEs unless they are VCHR whos device ops 841 * does not flag D_NOEMERGPGR. 842 */ 843 if (isep && m->object) { 844 struct vnode *vp; 845 846 switch(m->object->type) { 847 case OBJT_DEFAULT: 848 case OBJT_SWAP: 849 /* 850 * Allow anonymous memory and assume that 851 * swap devices are not complex, since its 852 * kinda worthless if we can't swap out dirty 853 * anonymous pages. 854 */ 855 break; 856 case OBJT_VNODE: 857 /* 858 * Allow VCHR device if the D_NOEMERGPGR 859 * flag is not set, deny other vnode types 860 * as being too complex. 861 */ 862 vp = m->object->handle; 863 if (vp && vp->v_type == VCHR && 864 vp->v_rdev && vp->v_rdev->si_ops && 865 (vp->v_rdev->si_ops->head.flags & 866 D_NOEMERGPGR) == 0) { 867 break; 868 } 869 /* Deny - fall through */ 870 default: 871 /* 872 * Deny 873 */ 874 vm_page_wakeup(m); 875 vm_page_queues_spin_lock(PQ_INACTIVE + q); 876 lwkt_yield(); 877 continue; 878 } 879 } 880 881 /* 882 * Try to pageout the page and perhaps other nearby pages. 883 */ 884 count = vm_pageout_page(m, &max_launder, vnodes_skipped, 885 &vpfailed, pass, 0); 886 delta += count; 887 888 /* 889 * Systems with a ton of memory can wind up with huge 890 * deactivation counts. Because the inactive scan is 891 * doing a lot of flushing, the combination can result 892 * in excessive paging even in situations where other 893 * unrelated threads free up sufficient VM. 894 * 895 * To deal with this we abort the nominal active->inactive 896 * scan before we hit the inactive target when free+cache 897 * levels have reached a reasonable target. 898 * 899 * When deciding to stop early we need to add some slop to 900 * the test and we need to return full completion to the caller 901 * to prevent the caller from thinking there is something 902 * wrong and issuing a low-memory+swap warning or pkill. 903 * 904 * A deficit forces paging regardless of the state of the 905 * VM page queues (used for RSS enforcement). 906 */ 907 lwkt_yield(); 908 vm_page_queues_spin_lock(PQ_INACTIVE + q); 909 if (vm_paging_target() < -vm_max_launder) { 910 /* 911 * Stopping early, return full completion to caller. 912 */ 913 if (delta < avail_shortage) 914 delta = avail_shortage; 915 break; 916 } 917 } 918 919 /* page queue still spin-locked */ 920 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 921 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 922 923 return (delta); 924 } 925 926 /* 927 * Pageout the specified page, return the total number of pages paged out 928 * (this routine may cluster). 929 * 930 * The page must be busied and soft-busied by the caller and will be disposed 931 * of by this function. 932 */ 933 static int 934 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp, 935 struct vnode **vpfailedp, int pass, int vmflush_flags) 936 { 937 vm_object_t object; 938 int actcount; 939 int count = 0; 940 941 /* 942 * Wiring no longer removes a page from its queue. The last unwiring 943 * will requeue the page. Obviously wired pages cannot be paged out 944 * so unqueue it and return. 945 */ 946 if (m->wire_count) { 947 vm_page_unqueue_nowakeup(m); 948 vm_page_wakeup(m); 949 return 0; 950 } 951 952 /* 953 * A held page may be undergoing I/O, so skip it. 954 */ 955 if (m->hold_count) { 956 vm_page_and_queue_spin_lock(m); 957 if (m->queue - m->pc == PQ_INACTIVE) { 958 TAILQ_REMOVE( 959 &vm_page_queues[m->queue].pl, m, pageq); 960 TAILQ_INSERT_TAIL( 961 &vm_page_queues[m->queue].pl, m, pageq); 962 } 963 vm_page_and_queue_spin_unlock(m); 964 vm_page_wakeup(m); 965 return 0; 966 } 967 968 if (m->object == NULL || m->object->ref_count == 0) { 969 /* 970 * If the object is not being used, we ignore previous 971 * references. 972 */ 973 vm_page_flag_clear(m, PG_REFERENCED); 974 pmap_clear_reference(m); 975 /* fall through to end */ 976 } else if (((m->flags & PG_REFERENCED) == 0) && 977 (actcount = pmap_ts_referenced(m))) { 978 /* 979 * Otherwise, if the page has been referenced while 980 * in the inactive queue, we bump the "activation 981 * count" upwards, making it less likely that the 982 * page will be added back to the inactive queue 983 * prematurely again. Here we check the page tables 984 * (or emulated bits, if any), given the upper level 985 * VM system not knowing anything about existing 986 * references. 987 */ 988 vm_page_activate(m); 989 m->act_count += (actcount + ACT_ADVANCE); 990 vm_page_wakeup(m); 991 return 0; 992 } 993 994 /* 995 * (m) is still busied. 996 * 997 * If the upper level VM system knows about any page 998 * references, we activate the page. We also set the 999 * "activation count" higher than normal so that we will less 1000 * likely place pages back onto the inactive queue again. 1001 */ 1002 if ((m->flags & PG_REFERENCED) != 0) { 1003 vm_page_flag_clear(m, PG_REFERENCED); 1004 actcount = pmap_ts_referenced(m); 1005 vm_page_activate(m); 1006 m->act_count += (actcount + ACT_ADVANCE + 1); 1007 vm_page_wakeup(m); 1008 return 0; 1009 } 1010 1011 /* 1012 * If the upper level VM system doesn't know anything about 1013 * the page being dirty, we have to check for it again. As 1014 * far as the VM code knows, any partially dirty pages are 1015 * fully dirty. 1016 * 1017 * Pages marked PG_WRITEABLE may be mapped into the user 1018 * address space of a process running on another cpu. A 1019 * user process (without holding the MP lock) running on 1020 * another cpu may be able to touch the page while we are 1021 * trying to remove it. vm_page_cache() will handle this 1022 * case for us. 1023 */ 1024 if (m->dirty == 0) { 1025 vm_page_test_dirty(m); 1026 } else { 1027 vm_page_dirty(m); 1028 } 1029 1030 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1031 /* 1032 * Invalid pages can be easily freed 1033 */ 1034 vm_pageout_page_free(m); 1035 mycpu->gd_cnt.v_dfree++; 1036 ++count; 1037 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1038 /* 1039 * Clean pages can be placed onto the cache queue. 1040 * This effectively frees them. 1041 */ 1042 vm_page_cache(m); 1043 ++count; 1044 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { 1045 /* 1046 * Dirty pages need to be paged out, but flushing 1047 * a page is extremely expensive verses freeing 1048 * a clean page. Rather then artificially limiting 1049 * the number of pages we can flush, we instead give 1050 * dirty pages extra priority on the inactive queue 1051 * by forcing them to be cycled through the queue 1052 * twice before being flushed, after which the 1053 * (now clean) page will cycle through once more 1054 * before being freed. This significantly extends 1055 * the thrash point for a heavily loaded machine. 1056 */ 1057 vm_page_flag_set(m, PG_WINATCFLS); 1058 vm_page_and_queue_spin_lock(m); 1059 if (m->queue - m->pc == PQ_INACTIVE) { 1060 TAILQ_REMOVE( 1061 &vm_page_queues[m->queue].pl, m, pageq); 1062 TAILQ_INSERT_TAIL( 1063 &vm_page_queues[m->queue].pl, m, pageq); 1064 } 1065 vm_page_and_queue_spin_unlock(m); 1066 vm_page_wakeup(m); 1067 } else if (*max_launderp > 0) { 1068 /* 1069 * We always want to try to flush some dirty pages if 1070 * we encounter them, to keep the system stable. 1071 * Normally this number is small, but under extreme 1072 * pressure where there are insufficient clean pages 1073 * on the inactive queue, we may have to go all out. 1074 */ 1075 int swap_pageouts_ok; 1076 struct vnode *vp = NULL; 1077 1078 swap_pageouts_ok = 0; 1079 object = m->object; 1080 if (object && 1081 (object->type != OBJT_SWAP) && 1082 (object->type != OBJT_DEFAULT)) { 1083 swap_pageouts_ok = 1; 1084 } else { 1085 swap_pageouts_ok = !(defer_swap_pageouts || 1086 disable_swap_pageouts); 1087 swap_pageouts_ok |= (!disable_swap_pageouts && 1088 defer_swap_pageouts && 1089 vm_page_count_min(0)); 1090 } 1091 1092 /* 1093 * We don't bother paging objects that are "dead". 1094 * Those objects are in a "rundown" state. 1095 */ 1096 if (!swap_pageouts_ok || 1097 (object == NULL) || 1098 (object->flags & OBJ_DEAD)) { 1099 vm_page_and_queue_spin_lock(m); 1100 if (m->queue - m->pc == PQ_INACTIVE) { 1101 TAILQ_REMOVE( 1102 &vm_page_queues[m->queue].pl, 1103 m, pageq); 1104 TAILQ_INSERT_TAIL( 1105 &vm_page_queues[m->queue].pl, 1106 m, pageq); 1107 } 1108 vm_page_and_queue_spin_unlock(m); 1109 vm_page_wakeup(m); 1110 return 0; 1111 } 1112 1113 /* 1114 * (m) is still busied. 1115 * 1116 * The object is already known NOT to be dead. It 1117 * is possible for the vget() to block the whole 1118 * pageout daemon, but the new low-memory handling 1119 * code should prevent it. 1120 * 1121 * The previous code skipped locked vnodes and, worse, 1122 * reordered pages in the queue. This results in 1123 * completely non-deterministic operation because, 1124 * quite often, a vm_fault has initiated an I/O and 1125 * is holding a locked vnode at just the point where 1126 * the pageout daemon is woken up. 1127 * 1128 * We can't wait forever for the vnode lock, we might 1129 * deadlock due to a vn_read() getting stuck in 1130 * vm_wait while holding this vnode. We skip the 1131 * vnode if we can't get it in a reasonable amount 1132 * of time. 1133 * 1134 * vpfailed is used to (try to) avoid the case where 1135 * a large number of pages are associated with a 1136 * locked vnode, which could cause the pageout daemon 1137 * to stall for an excessive amount of time. 1138 */ 1139 if (object->type == OBJT_VNODE) { 1140 int flags; 1141 1142 vp = object->handle; 1143 flags = LK_EXCLUSIVE; 1144 if (vp == *vpfailedp) 1145 flags |= LK_NOWAIT; 1146 else 1147 flags |= LK_TIMELOCK; 1148 vm_page_hold(m); 1149 vm_page_wakeup(m); 1150 1151 /* 1152 * We have unbusied (m) temporarily so we can 1153 * acquire the vp lock without deadlocking. 1154 * (m) is held to prevent destruction. 1155 */ 1156 if (vget(vp, flags) != 0) { 1157 *vpfailedp = vp; 1158 ++pageout_lock_miss; 1159 if (object->flags & OBJ_MIGHTBEDIRTY) 1160 ++*vnodes_skippedp; 1161 vm_page_unhold(m); 1162 return 0; 1163 } 1164 1165 /* 1166 * The page might have been moved to another 1167 * queue during potential blocking in vget() 1168 * above. The page might have been freed and 1169 * reused for another vnode. The object might 1170 * have been reused for another vnode. 1171 */ 1172 if (m->queue - m->pc != PQ_INACTIVE || 1173 m->object != object || 1174 object->handle != vp) { 1175 if (object->flags & OBJ_MIGHTBEDIRTY) 1176 ++*vnodes_skippedp; 1177 vput(vp); 1178 vm_page_unhold(m); 1179 return 0; 1180 } 1181 1182 /* 1183 * The page may have been busied during the 1184 * blocking in vput(); We don't move the 1185 * page back onto the end of the queue so that 1186 * statistics are more correct if we don't. 1187 */ 1188 if (vm_page_busy_try(m, TRUE)) { 1189 vput(vp); 1190 vm_page_unhold(m); 1191 return 0; 1192 } 1193 vm_page_unhold(m); 1194 1195 /* 1196 * If it was wired while we didn't own it. 1197 */ 1198 if (m->wire_count) { 1199 vm_page_unqueue_nowakeup(m); 1200 vput(vp); 1201 vm_page_wakeup(m); 1202 return 0; 1203 } 1204 1205 /* 1206 * (m) is busied again 1207 * 1208 * We own the busy bit and remove our hold 1209 * bit. If the page is still held it 1210 * might be undergoing I/O, so skip it. 1211 */ 1212 if (m->hold_count) { 1213 vm_page_and_queue_spin_lock(m); 1214 if (m->queue - m->pc == PQ_INACTIVE) { 1215 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 1216 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq); 1217 } 1218 vm_page_and_queue_spin_unlock(m); 1219 if (object->flags & OBJ_MIGHTBEDIRTY) 1220 ++*vnodes_skippedp; 1221 vm_page_wakeup(m); 1222 vput(vp); 1223 return 0; 1224 } 1225 /* (m) is left busied as we fall through */ 1226 } 1227 1228 /* 1229 * page is busy and not held here. 1230 * 1231 * If a page is dirty, then it is either being washed 1232 * (but not yet cleaned) or it is still in the 1233 * laundry. If it is still in the laundry, then we 1234 * start the cleaning operation. 1235 * 1236 * decrement inactive_shortage on success to account 1237 * for the (future) cleaned page. Otherwise we 1238 * could wind up laundering or cleaning too many 1239 * pages. 1240 * 1241 * NOTE: Cleaning the page here does not cause 1242 * force_deficit to be adjusted, because the 1243 * page is not being freed or moved to the 1244 * cache. 1245 */ 1246 count = vm_pageout_clean_helper(m, vmflush_flags); 1247 *max_launderp -= count; 1248 1249 /* 1250 * Clean ate busy, page no longer accessible 1251 */ 1252 if (vp != NULL) 1253 vput(vp); 1254 } else { 1255 vm_page_wakeup(m); 1256 } 1257 return count; 1258 } 1259 1260 /* 1261 * Scan active queue 1262 * 1263 * WARNING! Can be called from two pagedaemon threads simultaneously. 1264 */ 1265 static int 1266 vm_pageout_scan_active(int pass, int q, 1267 long avail_shortage, long inactive_shortage, 1268 long *recycle_countp) 1269 { 1270 struct vm_page marker; 1271 vm_page_t m; 1272 int actcount; 1273 long delta = 0; 1274 long maxscan; 1275 int isep; 1276 1277 isep = (curthread == emergpager); 1278 1279 /* 1280 * We want to move pages from the active queue to the inactive 1281 * queue to get the inactive queue to the inactive target. If 1282 * we still have a page shortage from above we try to directly free 1283 * clean pages instead of moving them. 1284 * 1285 * If we do still have a shortage we keep track of the number of 1286 * pages we free or cache (recycle_count) as a measure of thrashing 1287 * between the active and inactive queues. 1288 * 1289 * If we were able to completely satisfy the free+cache targets 1290 * from the inactive pool we limit the number of pages we move 1291 * from the active pool to the inactive pool to 2x the pages we 1292 * had removed from the inactive pool (with a minimum of 1/5 the 1293 * inactive target). If we were not able to completely satisfy 1294 * the free+cache targets we go for the whole target aggressively. 1295 * 1296 * NOTE: Both variables can end up negative. 1297 * NOTE: We are still in a critical section. 1298 * 1299 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 1300 * PAGES. 1301 */ 1302 1303 bzero(&marker, sizeof(marker)); 1304 marker.flags = PG_FICTITIOUS | PG_MARKER; 1305 marker.busy_count = PBUSY_LOCKED; 1306 marker.queue = PQ_ACTIVE + q; 1307 marker.pc = q; 1308 marker.wire_count = 1; 1309 1310 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1311 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1312 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt; 1313 1314 /* 1315 * Queue locked at top of loop to avoid stack marker issues. 1316 */ 1317 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1318 maxscan-- > 0 && (avail_shortage - delta > 0 || 1319 inactive_shortage > 0)) 1320 { 1321 KKASSERT(m->queue == PQ_ACTIVE + q); 1322 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1323 &marker, pageq); 1324 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1325 &marker, pageq); 1326 1327 /* 1328 * Skip marker pages (atomic against other markers to avoid 1329 * infinite hop-over scans). 1330 */ 1331 if (m->flags & PG_MARKER) 1332 continue; 1333 1334 /* 1335 * Try to busy the page. Don't mess with pages which are 1336 * already busy or reorder them in the queue. 1337 */ 1338 if (vm_page_busy_try(m, TRUE)) 1339 continue; 1340 1341 /* 1342 * Remaining operations run with the page busy and neither 1343 * the page or the queue will be spin-locked. 1344 */ 1345 KKASSERT(m->queue == PQ_ACTIVE + q); 1346 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1347 1348 #if 0 1349 /* 1350 * Don't deactivate pages that are held, even if we can 1351 * busy them. (XXX why not?) 1352 */ 1353 if (m->hold_count) { 1354 vm_page_and_queue_spin_lock(m); 1355 if (m->queue - m->pc == PQ_ACTIVE) { 1356 TAILQ_REMOVE( 1357 &vm_page_queues[PQ_ACTIVE + q].pl, 1358 m, pageq); 1359 TAILQ_INSERT_TAIL( 1360 &vm_page_queues[PQ_ACTIVE + q].pl, 1361 m, pageq); 1362 } 1363 vm_page_and_queue_spin_unlock(m); 1364 vm_page_wakeup(m); 1365 goto next; 1366 } 1367 #endif 1368 /* 1369 * We can just remove wired pages from the queue 1370 */ 1371 if (m->wire_count) { 1372 vm_page_unqueue_nowakeup(m); 1373 vm_page_wakeup(m); 1374 goto next; 1375 } 1376 1377 /* 1378 * The emergency pager ignores vnode-backed pages as these 1379 * are the pages that probably bricked the main pager. 1380 */ 1381 if (isep && m->object && m->object->type == OBJT_VNODE) { 1382 vm_page_and_queue_spin_lock(m); 1383 if (m->queue - m->pc == PQ_ACTIVE) { 1384 TAILQ_REMOVE( 1385 &vm_page_queues[PQ_ACTIVE + q].pl, 1386 m, pageq); 1387 TAILQ_INSERT_TAIL( 1388 &vm_page_queues[PQ_ACTIVE + q].pl, 1389 m, pageq); 1390 } 1391 vm_page_and_queue_spin_unlock(m); 1392 vm_page_wakeup(m); 1393 goto next; 1394 } 1395 1396 /* 1397 * The count for pagedaemon pages is done after checking the 1398 * page for eligibility... 1399 */ 1400 mycpu->gd_cnt.v_pdpages++; 1401 1402 /* 1403 * Check to see "how much" the page has been used and clear 1404 * the tracking access bits. If the object has no references 1405 * don't bother paying the expense. 1406 */ 1407 actcount = 0; 1408 if (m->object && m->object->ref_count != 0) { 1409 if (m->flags & PG_REFERENCED) 1410 ++actcount; 1411 actcount += pmap_ts_referenced(m); 1412 if (actcount) { 1413 m->act_count += ACT_ADVANCE + actcount; 1414 if (m->act_count > ACT_MAX) 1415 m->act_count = ACT_MAX; 1416 } 1417 } 1418 vm_page_flag_clear(m, PG_REFERENCED); 1419 1420 /* 1421 * actcount is only valid if the object ref_count is non-zero. 1422 * If the page does not have an object, actcount will be zero. 1423 */ 1424 if (actcount && m->object->ref_count != 0) { 1425 vm_page_and_queue_spin_lock(m); 1426 if (m->queue - m->pc == PQ_ACTIVE) { 1427 TAILQ_REMOVE( 1428 &vm_page_queues[PQ_ACTIVE + q].pl, 1429 m, pageq); 1430 TAILQ_INSERT_TAIL( 1431 &vm_page_queues[PQ_ACTIVE + q].pl, 1432 m, pageq); 1433 } 1434 vm_page_and_queue_spin_unlock(m); 1435 vm_page_wakeup(m); 1436 } else { 1437 switch(m->object->type) { 1438 case OBJT_DEFAULT: 1439 case OBJT_SWAP: 1440 m->act_count -= min(m->act_count, 1441 vm_anonmem_decline); 1442 break; 1443 default: 1444 m->act_count -= min(m->act_count, 1445 vm_filemem_decline); 1446 break; 1447 } 1448 if (vm_pageout_algorithm || 1449 (m->object == NULL) || 1450 (m->object && (m->object->ref_count == 0)) || 1451 m->act_count < pass + 1 1452 ) { 1453 /* 1454 * Deactivate the page. If we had a 1455 * shortage from our inactive scan try to 1456 * free (cache) the page instead. 1457 * 1458 * Don't just blindly cache the page if 1459 * we do not have a shortage from the 1460 * inactive scan, that could lead to 1461 * gigabytes being moved. 1462 */ 1463 --inactive_shortage; 1464 if (avail_shortage - delta > 0 || 1465 (m->object && (m->object->ref_count == 0))) 1466 { 1467 if (avail_shortage - delta > 0) 1468 ++*recycle_countp; 1469 vm_page_protect(m, VM_PROT_NONE); 1470 if (m->dirty == 0 && 1471 (m->flags & PG_NEED_COMMIT) == 0 && 1472 avail_shortage - delta > 0) { 1473 vm_page_cache(m); 1474 } else { 1475 vm_page_deactivate(m); 1476 vm_page_wakeup(m); 1477 } 1478 } else { 1479 vm_page_deactivate(m); 1480 vm_page_wakeup(m); 1481 } 1482 ++delta; 1483 } else { 1484 vm_page_and_queue_spin_lock(m); 1485 if (m->queue - m->pc == PQ_ACTIVE) { 1486 TAILQ_REMOVE( 1487 &vm_page_queues[PQ_ACTIVE + q].pl, 1488 m, pageq); 1489 TAILQ_INSERT_TAIL( 1490 &vm_page_queues[PQ_ACTIVE + q].pl, 1491 m, pageq); 1492 } 1493 vm_page_and_queue_spin_unlock(m); 1494 vm_page_wakeup(m); 1495 } 1496 } 1497 next: 1498 lwkt_yield(); 1499 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1500 } 1501 1502 /* 1503 * Clean out our local marker. 1504 * 1505 * Page queue still spin-locked. 1506 */ 1507 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1508 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1509 1510 return (delta); 1511 } 1512 1513 /* 1514 * The number of actually free pages can drop down to v_free_reserved, 1515 * we try to build the free count back above v_free_min. Note that 1516 * vm_paging_needed() also returns TRUE if v_free_count is not at 1517 * least v_free_min so that is the minimum we must build the free 1518 * count to. 1519 * 1520 * We use a slightly higher target to improve hysteresis, 1521 * ((v_free_target + v_free_min) / 2). Since v_free_target 1522 * is usually the same as v_cache_min this maintains about 1523 * half the pages in the free queue as are in the cache queue, 1524 * providing pretty good pipelining for pageout operation. 1525 * 1526 * The system operator can manipulate vm.v_cache_min and 1527 * vm.v_free_target to tune the pageout demon. Be sure 1528 * to keep vm.v_free_min < vm.v_free_target. 1529 * 1530 * Note that the original paging target is to get at least 1531 * (free_min + cache_min) into (free + cache). The slightly 1532 * higher target will shift additional pages from cache to free 1533 * without effecting the original paging target in order to 1534 * maintain better hysteresis and not have the free count always 1535 * be dead-on v_free_min. 1536 * 1537 * NOTE: we are still in a critical section. 1538 * 1539 * Pages moved from PQ_CACHE to totally free are not counted in the 1540 * pages_freed counter. 1541 * 1542 * WARNING! Can be called from two pagedaemon threads simultaneously. 1543 */ 1544 static void 1545 vm_pageout_scan_cache(long avail_shortage, int pass, 1546 long vnodes_skipped, long recycle_count) 1547 { 1548 static int lastkillticks; 1549 struct vm_pageout_scan_info info; 1550 vm_page_t m; 1551 int isep; 1552 1553 isep = (curthread == emergpager); 1554 1555 while (vmstats.v_free_count < 1556 (vmstats.v_free_min + vmstats.v_free_target) / 2) { 1557 /* 1558 * This steals some code from vm/vm_page.c 1559 * 1560 * Create two rovers and adjust the code to reduce 1561 * chances of them winding up at the same index (which 1562 * can cause a lot of contention). 1563 */ 1564 static int cache_rover[2] = { 0, PQ_L2_MASK / 2 }; 1565 1566 if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0) 1567 goto next_rover; 1568 1569 m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK); 1570 if (m == NULL) 1571 break; 1572 1573 /* 1574 * If the busy attempt fails we can still deactivate the page. 1575 */ 1576 /* page is returned removed from its queue and spinlocked */ 1577 if (vm_page_busy_try(m, TRUE)) { 1578 vm_page_deactivate_locked(m); 1579 vm_page_spin_unlock(m); 1580 continue; 1581 } 1582 vm_page_spin_unlock(m); 1583 pagedaemon_wakeup(); 1584 lwkt_yield(); 1585 1586 /* 1587 * Remaining operations run with the page busy and neither 1588 * the page or the queue will be spin-locked. 1589 */ 1590 if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) || 1591 m->hold_count || 1592 m->wire_count) { 1593 vm_page_deactivate(m); 1594 vm_page_wakeup(m); 1595 continue; 1596 } 1597 pmap_mapped_sync(m); 1598 KKASSERT((m->flags & PG_MAPPED) == 0); 1599 KKASSERT(m->dirty == 0); 1600 vm_pageout_page_free(m); 1601 mycpu->gd_cnt.v_dfree++; 1602 next_rover: 1603 if (isep) 1604 cache_rover[1] -= PQ_PRIME2; 1605 else 1606 cache_rover[0] += PQ_PRIME2; 1607 } 1608 1609 #if !defined(NO_SWAPPING) 1610 /* 1611 * Idle process swapout -- run once per second. 1612 */ 1613 if (vm_swap_idle_enabled) { 1614 static time_t lsec; 1615 if (time_uptime != lsec) { 1616 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE); 1617 vm_req_vmdaemon(); 1618 lsec = time_uptime; 1619 } 1620 } 1621 #endif 1622 1623 /* 1624 * If we didn't get enough free pages, and we have skipped a vnode 1625 * in a writeable object, wakeup the sync daemon. And kick swapout 1626 * if we did not get enough free pages. 1627 */ 1628 if (vm_paging_target() > 0) { 1629 if (vnodes_skipped && vm_page_count_min(0)) 1630 speedup_syncer(NULL); 1631 #if !defined(NO_SWAPPING) 1632 if (vm_swap_enabled && vm_page_count_target()) { 1633 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL); 1634 vm_req_vmdaemon(); 1635 } 1636 #endif 1637 } 1638 1639 /* 1640 * Handle catastrophic conditions. Under good conditions we should 1641 * be at the target, well beyond our minimum. If we could not even 1642 * reach our minimum the system is under heavy stress. But just being 1643 * under heavy stress does not trigger process killing. 1644 * 1645 * We consider ourselves to have run out of memory if the swap pager 1646 * is full and avail_shortage is still positive. The secondary check 1647 * ensures that we do not kill processes if the instantanious 1648 * availability is good, even if the pageout demon pass says it 1649 * couldn't get to the target. 1650 * 1651 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL 1652 * SITUATIONS. 1653 */ 1654 if (swap_pager_almost_full && 1655 pass > 0 && 1656 isep == 0 && 1657 (vm_page_count_min(recycle_count) || avail_shortage > 0)) { 1658 kprintf("Warning: system low on memory+swap " 1659 "shortage %ld for %d ticks!\n", 1660 avail_shortage, ticks - swap_fail_ticks); 1661 if (bootverbose) 1662 kprintf("Metrics: spaf=%d spf=%d pass=%d " 1663 "avail=%ld target=%ld last=%u\n", 1664 swap_pager_almost_full, 1665 swap_pager_full, 1666 pass, 1667 avail_shortage, 1668 vm_paging_target(), 1669 (unsigned int)(ticks - lastkillticks)); 1670 } 1671 if (swap_pager_full && 1672 pass > 1 && 1673 isep == 0 && 1674 avail_shortage > 0 && 1675 vm_paging_target() > 0 && 1676 (unsigned int)(ticks - lastkillticks) >= hz) { 1677 /* 1678 * Kill something, maximum rate once per second to give 1679 * the process time to free up sufficient memory. 1680 */ 1681 lastkillticks = ticks; 1682 info.bigproc = NULL; 1683 info.bigsize = 0; 1684 allproc_scan(vm_pageout_scan_callback, &info, 0); 1685 if (info.bigproc != NULL) { 1686 kprintf("Try to kill process %d %s\n", 1687 info.bigproc->p_pid, info.bigproc->p_comm); 1688 info.bigproc->p_nice = PRIO_MIN; 1689 info.bigproc->p_usched->resetpriority( 1690 FIRST_LWP_IN_PROC(info.bigproc)); 1691 atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL); 1692 killproc(info.bigproc, "out of swap space"); 1693 wakeup(&vmstats.v_free_count); 1694 PRELE(info.bigproc); 1695 } 1696 } 1697 } 1698 1699 static int 1700 vm_pageout_scan_callback(struct proc *p, void *data) 1701 { 1702 struct vm_pageout_scan_info *info = data; 1703 vm_offset_t size; 1704 1705 /* 1706 * Never kill system processes or init. If we have configured swap 1707 * then try to avoid killing low-numbered pids. 1708 */ 1709 if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) || 1710 ((p->p_pid < 48) && (vm_swap_size != 0))) { 1711 return (0); 1712 } 1713 1714 lwkt_gettoken(&p->p_token); 1715 1716 /* 1717 * if the process is in a non-running type state, 1718 * don't touch it. 1719 */ 1720 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 1721 lwkt_reltoken(&p->p_token); 1722 return (0); 1723 } 1724 1725 /* 1726 * Get the approximate process size. Note that anonymous pages 1727 * with backing swap will be counted twice, but there should not 1728 * be too many such pages due to the stress the VM system is 1729 * under at this point. 1730 */ 1731 size = vmspace_anonymous_count(p->p_vmspace) + 1732 vmspace_swap_count(p->p_vmspace); 1733 1734 /* 1735 * If the this process is bigger than the biggest one 1736 * remember it. 1737 */ 1738 if (info->bigsize < size) { 1739 if (info->bigproc) 1740 PRELE(info->bigproc); 1741 PHOLD(p); 1742 info->bigproc = p; 1743 info->bigsize = size; 1744 } 1745 lwkt_reltoken(&p->p_token); 1746 lwkt_yield(); 1747 1748 return(0); 1749 } 1750 1751 /* 1752 * This old guy slowly walks PQ_HOLD looking for pages which need to be 1753 * moved back to PQ_FREE. It is possible for pages to accumulate here 1754 * when vm_page_free() races against vm_page_unhold(), resulting in a 1755 * page being left on a PQ_HOLD queue with hold_count == 0. 1756 * 1757 * It is easier to handle this edge condition here, in non-critical code, 1758 * rather than enforce a spin-lock for every 1->0 transition in 1759 * vm_page_unhold(). 1760 * 1761 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue. 1762 */ 1763 static void 1764 vm_pageout_scan_hold(int q) 1765 { 1766 vm_page_t m; 1767 1768 vm_page_queues_spin_lock(PQ_HOLD + q); 1769 TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) { 1770 if (m->flags & PG_MARKER) 1771 continue; 1772 1773 /* 1774 * Process one page and return 1775 */ 1776 if (m->hold_count) 1777 break; 1778 kprintf("DEBUG: pageout HOLD->FREE %p\n", m); 1779 vm_page_hold(m); 1780 vm_page_queues_spin_unlock(PQ_HOLD + q); 1781 vm_page_unhold(m); /* reprocess */ 1782 return; 1783 } 1784 vm_page_queues_spin_unlock(PQ_HOLD + q); 1785 } 1786 1787 /* 1788 * This routine tries to maintain the pseudo LRU active queue, 1789 * so that during long periods of time where there is no paging, 1790 * that some statistic accumulation still occurs. This code 1791 * helps the situation where paging just starts to occur. 1792 */ 1793 static void 1794 vm_pageout_page_stats(int q) 1795 { 1796 static int fullintervalcount = 0; 1797 struct vm_page marker; 1798 vm_page_t m; 1799 long pcount, tpcount; /* Number of pages to check */ 1800 long page_shortage; 1801 1802 page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max + 1803 vmstats.v_free_min) - 1804 (vmstats.v_free_count + vmstats.v_inactive_count + 1805 vmstats.v_cache_count); 1806 1807 if (page_shortage <= 0) 1808 return; 1809 1810 pcount = vm_page_queues[PQ_ACTIVE + q].lcnt; 1811 fullintervalcount += vm_pageout_stats_interval; 1812 if (fullintervalcount < vm_pageout_full_stats_interval) { 1813 tpcount = (vm_pageout_stats_max * pcount) / 1814 vmstats.v_page_count + 1; 1815 if (pcount > tpcount) 1816 pcount = tpcount; 1817 } else { 1818 fullintervalcount = 0; 1819 } 1820 1821 bzero(&marker, sizeof(marker)); 1822 marker.flags = PG_FICTITIOUS | PG_MARKER; 1823 marker.busy_count = PBUSY_LOCKED; 1824 marker.queue = PQ_ACTIVE + q; 1825 marker.pc = q; 1826 marker.wire_count = 1; 1827 1828 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1829 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1830 1831 /* 1832 * Queue locked at top of loop to avoid stack marker issues. 1833 */ 1834 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1835 pcount-- > 0) 1836 { 1837 int actcount; 1838 1839 KKASSERT(m->queue == PQ_ACTIVE + q); 1840 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1841 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1842 &marker, pageq); 1843 1844 /* 1845 * Skip marker pages (atomic against other markers to avoid 1846 * infinite hop-over scans). 1847 */ 1848 if (m->flags & PG_MARKER) 1849 continue; 1850 1851 /* 1852 * Ignore pages we can't busy 1853 */ 1854 if (vm_page_busy_try(m, TRUE)) 1855 continue; 1856 1857 /* 1858 * Remaining operations run with the page busy and neither 1859 * the page or the queue will be spin-locked. 1860 */ 1861 KKASSERT(m->queue == PQ_ACTIVE + q); 1862 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1863 1864 /* 1865 * We can just remove wired pages from the queue 1866 */ 1867 if (m->wire_count) { 1868 vm_page_unqueue_nowakeup(m); 1869 vm_page_wakeup(m); 1870 goto next; 1871 } 1872 1873 1874 /* 1875 * We now have a safely busied page, the page and queue 1876 * spinlocks have been released. 1877 * 1878 * Ignore held and wired pages 1879 */ 1880 if (m->hold_count || m->wire_count) { 1881 vm_page_wakeup(m); 1882 goto next; 1883 } 1884 1885 /* 1886 * Calculate activity 1887 */ 1888 actcount = 0; 1889 if (m->flags & PG_REFERENCED) { 1890 vm_page_flag_clear(m, PG_REFERENCED); 1891 actcount += 1; 1892 } 1893 actcount += pmap_ts_referenced(m); 1894 1895 /* 1896 * Update act_count and move page to end of queue. 1897 */ 1898 if (actcount) { 1899 m->act_count += ACT_ADVANCE + actcount; 1900 if (m->act_count > ACT_MAX) 1901 m->act_count = ACT_MAX; 1902 vm_page_and_queue_spin_lock(m); 1903 if (m->queue - m->pc == PQ_ACTIVE) { 1904 TAILQ_REMOVE( 1905 &vm_page_queues[PQ_ACTIVE + q].pl, 1906 m, pageq); 1907 TAILQ_INSERT_TAIL( 1908 &vm_page_queues[PQ_ACTIVE + q].pl, 1909 m, pageq); 1910 } 1911 vm_page_and_queue_spin_unlock(m); 1912 vm_page_wakeup(m); 1913 goto next; 1914 } 1915 1916 if (m->act_count == 0) { 1917 /* 1918 * We turn off page access, so that we have 1919 * more accurate RSS stats. We don't do this 1920 * in the normal page deactivation when the 1921 * system is loaded VM wise, because the 1922 * cost of the large number of page protect 1923 * operations would be higher than the value 1924 * of doing the operation. 1925 * 1926 * We use the marker to save our place so 1927 * we can release the spin lock. both (m) 1928 * and (next) will be invalid. 1929 */ 1930 vm_page_protect(m, VM_PROT_NONE); 1931 vm_page_deactivate(m); 1932 } else { 1933 m->act_count -= min(m->act_count, ACT_DECLINE); 1934 vm_page_and_queue_spin_lock(m); 1935 if (m->queue - m->pc == PQ_ACTIVE) { 1936 TAILQ_REMOVE( 1937 &vm_page_queues[PQ_ACTIVE + q].pl, 1938 m, pageq); 1939 TAILQ_INSERT_TAIL( 1940 &vm_page_queues[PQ_ACTIVE + q].pl, 1941 m, pageq); 1942 } 1943 vm_page_and_queue_spin_unlock(m); 1944 } 1945 vm_page_wakeup(m); 1946 next: 1947 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1948 } 1949 1950 /* 1951 * Remove our local marker 1952 * 1953 * Page queue still spin-locked. 1954 */ 1955 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1956 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1957 } 1958 1959 static int 1960 vm_pageout_free_page_calc(vm_size_t count) 1961 { 1962 if (count < vmstats.v_page_count) 1963 return 0; 1964 /* 1965 * v_free_min normal allocations 1966 * v_free_reserved system allocations 1967 * v_pageout_free_min allocations by pageout daemon 1968 * v_interrupt_free_min low level allocations (e.g swap structures) 1969 */ 1970 if (vmstats.v_page_count > 1024) 1971 vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200; 1972 else 1973 vmstats.v_free_min = 64; 1974 1975 /* 1976 * Make sure the vmmeter slop can't blow out our global minimums. 1977 * 1978 * However, to accomodate weird configurations (vkernels with many 1979 * cpus and little memory, or artifically reduced hw.physmem), do 1980 * not allow v_free_min to exceed 1/20 of ram or the pageout demon 1981 * will go out of control. 1982 */ 1983 if (vmstats.v_free_min < VMMETER_SLOP_COUNT * ncpus * 10) 1984 vmstats.v_free_min = VMMETER_SLOP_COUNT * ncpus * 10; 1985 if (vmstats.v_free_min > vmstats.v_page_count / 20) 1986 vmstats.v_free_min = vmstats.v_page_count / 20; 1987 1988 vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7; 1989 vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0; 1990 vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7; 1991 vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7; 1992 1993 return 1; 1994 } 1995 1996 1997 /* 1998 * vm_pageout is the high level pageout daemon. TWO kernel threads run 1999 * this daemon, the primary pageout daemon and the emergency pageout daemon. 2000 * 2001 * The emergency pageout daemon takes over when the primary pageout daemon 2002 * deadlocks. The emergency pageout daemon ONLY pages out to swap, thus 2003 * avoiding the many low-memory deadlocks which can occur when paging out 2004 * to VFS's. 2005 */ 2006 static void 2007 vm_pageout_thread(void) 2008 { 2009 int pass; 2010 int q; 2011 int q1iterator = 0; 2012 int q2iterator = 0; 2013 int q3iterator = 0; 2014 int isep; 2015 2016 curthread->td_flags |= TDF_SYSTHREAD; 2017 2018 /* 2019 * We only need to setup once. 2020 */ 2021 isep = 0; 2022 if (curthread == emergpager) { 2023 isep = 1; 2024 goto skip_setup; 2025 } 2026 2027 /* 2028 * Initialize some paging parameters. 2029 */ 2030 vm_pageout_free_page_calc(vmstats.v_page_count); 2031 2032 /* 2033 * v_free_target and v_cache_min control pageout hysteresis. Note 2034 * that these are more a measure of the VM cache queue hysteresis 2035 * then the VM free queue. Specifically, v_free_target is the 2036 * high water mark (free+cache pages). 2037 * 2038 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the 2039 * low water mark, while v_free_min is the stop. v_cache_min must 2040 * be big enough to handle memory needs while the pageout daemon 2041 * is signalled and run to free more pages. 2042 */ 2043 if (vmstats.v_free_count > 6144) 2044 vmstats.v_free_target = 4 * vmstats.v_free_min + 2045 vmstats.v_free_reserved; 2046 else 2047 vmstats.v_free_target = 2 * vmstats.v_free_min + 2048 vmstats.v_free_reserved; 2049 2050 /* 2051 * NOTE: With the new buffer cache b_act_count we want the default 2052 * inactive target to be a percentage of available memory. 2053 * 2054 * The inactive target essentially determines the minimum 2055 * number of 'temporary' pages capable of caching one-time-use 2056 * files when the VM system is otherwise full of pages 2057 * belonging to multi-time-use files or active program data. 2058 * 2059 * NOTE: The inactive target is aggressively persued only if the 2060 * inactive queue becomes too small. If the inactive queue 2061 * is large enough to satisfy page movement to free+cache 2062 * then it is repopulated more slowly from the active queue. 2063 * This allows a general inactive_target default to be set. 2064 * 2065 * There is an issue here for processes which sit mostly idle 2066 * 'overnight', such as sshd, tcsh, and X. Any movement from 2067 * the active queue will eventually cause such pages to 2068 * recycle eventually causing a lot of paging in the morning. 2069 * To reduce the incidence of this pages cycled out of the 2070 * buffer cache are moved directly to the inactive queue if 2071 * they were only used once or twice. 2072 * 2073 * The vfs.vm_cycle_point sysctl can be used to adjust this. 2074 * Increasing the value (up to 64) increases the number of 2075 * buffer recyclements which go directly to the inactive queue. 2076 */ 2077 if (vmstats.v_free_count > 2048) { 2078 vmstats.v_cache_min = vmstats.v_free_target; 2079 vmstats.v_cache_max = 2 * vmstats.v_cache_min; 2080 } else { 2081 vmstats.v_cache_min = 0; 2082 vmstats.v_cache_max = 0; 2083 } 2084 vmstats.v_inactive_target = vmstats.v_free_count / 4; 2085 2086 /* XXX does not really belong here */ 2087 if (vm_page_max_wired == 0) 2088 vm_page_max_wired = vmstats.v_free_count / 3; 2089 2090 if (vm_pageout_stats_max == 0) 2091 vm_pageout_stats_max = vmstats.v_free_target; 2092 2093 /* 2094 * Set interval in seconds for stats scan. 2095 */ 2096 if (vm_pageout_stats_interval == 0) 2097 vm_pageout_stats_interval = 5; 2098 if (vm_pageout_full_stats_interval == 0) 2099 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; 2100 2101 2102 /* 2103 * Set maximum free per pass 2104 */ 2105 if (vm_pageout_stats_free_max == 0) 2106 vm_pageout_stats_free_max = 5; 2107 2108 swap_pager_swap_init(); 2109 pass = 0; 2110 2111 atomic_swap_int(&sequence_emerg_pager, 1); 2112 wakeup(&sequence_emerg_pager); 2113 2114 skip_setup: 2115 /* 2116 * Sequence emergency pager startup 2117 */ 2118 if (isep) { 2119 while (sequence_emerg_pager == 0) 2120 tsleep(&sequence_emerg_pager, 0, "pstartup", hz); 2121 } 2122 2123 /* 2124 * The pageout daemon is never done, so loop forever. 2125 * 2126 * WARNING! This code is being executed by two kernel threads 2127 * potentially simultaneously. 2128 */ 2129 while (TRUE) { 2130 int error; 2131 long avail_shortage; 2132 long inactive_shortage; 2133 long vnodes_skipped = 0; 2134 long recycle_count = 0; 2135 long tmp; 2136 2137 /* 2138 * Wait for an action request. If we timeout check to 2139 * see if paging is needed (in case the normal wakeup 2140 * code raced us). 2141 */ 2142 if (isep) { 2143 /* 2144 * Emergency pagedaemon monitors the primary 2145 * pagedaemon while vm_pages_needed != 0. 2146 * 2147 * The emergency pagedaemon only runs if VM paging 2148 * is needed and the primary pagedaemon has not 2149 * updated vm_pagedaemon_time for more than 2 seconds. 2150 */ 2151 if (vm_pages_needed) 2152 tsleep(&vm_pagedaemon_time, 0, "psleep", hz); 2153 else 2154 tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10); 2155 if (vm_pages_needed == 0) { 2156 pass = 0; 2157 continue; 2158 } 2159 if ((int)(ticks - vm_pagedaemon_time) < hz * 2) { 2160 pass = 0; 2161 continue; 2162 } 2163 } else { 2164 /* 2165 * Primary pagedaemon 2166 * 2167 * NOTE: We unconditionally cleanup PQ_HOLD even 2168 * when there is no work to do. 2169 */ 2170 vm_pageout_scan_hold(q3iterator & PQ_L2_MASK); 2171 ++q3iterator; 2172 2173 if (vm_pages_needed == 0) { 2174 error = tsleep(&vm_pages_needed, 2175 0, "psleep", 2176 vm_pageout_stats_interval * hz); 2177 if (error && 2178 vm_paging_needed() == 0 && 2179 vm_pages_needed == 0) { 2180 for (q = 0; q < PQ_L2_SIZE; ++q) 2181 vm_pageout_page_stats(q); 2182 continue; 2183 } 2184 vm_pagedaemon_time = ticks; 2185 vm_pages_needed = 1; 2186 2187 /* 2188 * Wake the emergency pagedaemon up so it 2189 * can monitor us. It will automatically 2190 * go back into a long sleep when 2191 * vm_pages_needed returns to 0. 2192 */ 2193 wakeup(&vm_pagedaemon_time); 2194 } 2195 } 2196 2197 mycpu->gd_cnt.v_pdwakeups++; 2198 2199 /* 2200 * Scan for INACTIVE->CLEAN/PAGEOUT 2201 * 2202 * This routine tries to avoid thrashing the system with 2203 * unnecessary activity. 2204 * 2205 * Calculate our target for the number of free+cache pages we 2206 * want to get to. This is higher then the number that causes 2207 * allocations to stall (severe) in order to provide hysteresis, 2208 * and if we don't make it all the way but get to the minimum 2209 * we're happy. Goose it a bit if there are multiple requests 2210 * for memory. 2211 * 2212 * Don't reduce avail_shortage inside the loop or the 2213 * PQAVERAGE() calculation will break. 2214 * 2215 * NOTE! deficit is differentiated from avail_shortage as 2216 * REQUIRING at least (deficit) pages to be cleaned, 2217 * even if the page queues are in good shape. This 2218 * is used primarily for handling per-process 2219 * RLIMIT_RSS and may also see small values when 2220 * processes block due to low memory. 2221 */ 2222 vmstats_rollup(); 2223 if (isep == 0) 2224 vm_pagedaemon_time = ticks; 2225 avail_shortage = vm_paging_target() + vm_pageout_deficit; 2226 vm_pageout_deficit = 0; 2227 2228 if (avail_shortage > 0) { 2229 long delta = 0; 2230 int qq; 2231 2232 qq = q1iterator; 2233 for (q = 0; q < PQ_L2_SIZE; ++q) { 2234 delta += vm_pageout_scan_inactive( 2235 pass, 2236 qq & PQ_L2_MASK, 2237 PQAVERAGE(avail_shortage), 2238 &vnodes_skipped); 2239 if (isep) 2240 --qq; 2241 else 2242 ++qq; 2243 if (avail_shortage - delta <= 0) 2244 break; 2245 } 2246 avail_shortage -= delta; 2247 q1iterator = qq; 2248 } 2249 2250 /* 2251 * Figure out how many active pages we must deactivate. If 2252 * we were able to reach our target with just the inactive 2253 * scan above we limit the number of active pages we 2254 * deactivate to reduce unnecessary work. 2255 */ 2256 vmstats_rollup(); 2257 if (isep == 0) 2258 vm_pagedaemon_time = ticks; 2259 inactive_shortage = vmstats.v_inactive_target - 2260 vmstats.v_inactive_count; 2261 2262 /* 2263 * If we were unable to free sufficient inactive pages to 2264 * satisfy the free/cache queue requirements then simply 2265 * reaching the inactive target may not be good enough. 2266 * Try to deactivate pages in excess of the target based 2267 * on the shortfall. 2268 * 2269 * However to prevent thrashing the VM system do not 2270 * deactivate more than an additional 1/10 the inactive 2271 * target's worth of active pages. 2272 */ 2273 if (avail_shortage > 0) { 2274 tmp = avail_shortage * 2; 2275 if (tmp > vmstats.v_inactive_target / 10) 2276 tmp = vmstats.v_inactive_target / 10; 2277 inactive_shortage += tmp; 2278 } 2279 2280 /* 2281 * Only trigger a pmap cleanup on inactive shortage. 2282 */ 2283 if (isep == 0 && inactive_shortage > 0) { 2284 pmap_collect(); 2285 } 2286 2287 /* 2288 * Scan for ACTIVE->INACTIVE 2289 * 2290 * Only trigger on inactive shortage. Triggering on 2291 * avail_shortage can starve the active queue with 2292 * unnecessary active->inactive transitions and destroy 2293 * performance. 2294 * 2295 * If this is the emergency pager, always try to move 2296 * a few pages from active to inactive because the inactive 2297 * queue might have enough pages, but not enough anonymous 2298 * pages. 2299 */ 2300 if (isep && inactive_shortage < vm_emerg_launder) 2301 inactive_shortage = vm_emerg_launder; 2302 2303 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) { 2304 long delta = 0; 2305 int qq; 2306 2307 qq = q2iterator; 2308 for (q = 0; q < PQ_L2_SIZE; ++q) { 2309 delta += vm_pageout_scan_active( 2310 pass, 2311 qq & PQ_L2_MASK, 2312 PQAVERAGE(avail_shortage), 2313 PQAVERAGE(inactive_shortage), 2314 &recycle_count); 2315 if (isep) 2316 --qq; 2317 else 2318 ++qq; 2319 if (inactive_shortage - delta <= 0 && 2320 avail_shortage - delta <= 0) { 2321 break; 2322 } 2323 } 2324 inactive_shortage -= delta; 2325 avail_shortage -= delta; 2326 q2iterator = qq; 2327 } 2328 2329 /* 2330 * Scan for CACHE->FREE 2331 * 2332 * Finally free enough cache pages to meet our free page 2333 * requirement and take more drastic measures if we are 2334 * still in trouble. 2335 */ 2336 vmstats_rollup(); 2337 if (isep == 0) 2338 vm_pagedaemon_time = ticks; 2339 vm_pageout_scan_cache(avail_shortage, pass, 2340 vnodes_skipped, recycle_count); 2341 2342 /* 2343 * Wait for more work. 2344 */ 2345 if (avail_shortage > 0) { 2346 ++pass; 2347 if (pass < 10 && vm_pages_needed > 1) { 2348 /* 2349 * Normal operation, additional processes 2350 * have already kicked us. Retry immediately 2351 * unless swap space is completely full in 2352 * which case delay a bit. 2353 */ 2354 if (swap_pager_full) { 2355 tsleep(&vm_pages_needed, 0, "pdelay", 2356 hz / 5); 2357 } /* else immediate retry */ 2358 } else if (pass < 10) { 2359 /* 2360 * Normal operation, fewer processes. Delay 2361 * a bit but allow wakeups. vm_pages_needed 2362 * is only adjusted against the primary 2363 * pagedaemon here. 2364 */ 2365 if (isep == 0) 2366 vm_pages_needed = 0; 2367 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2368 if (isep == 0) 2369 vm_pages_needed = 1; 2370 } else if (swap_pager_full == 0) { 2371 /* 2372 * We've taken too many passes, forced delay. 2373 */ 2374 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2375 } else { 2376 /* 2377 * Running out of memory, catastrophic 2378 * back-off to one-second intervals. 2379 */ 2380 tsleep(&vm_pages_needed, 0, "pdelay", hz); 2381 } 2382 } else if (vm_pages_needed) { 2383 /* 2384 * Interlocked wakeup of waiters (non-optional). 2385 * 2386 * Similar to vm_page_free_wakeup() in vm_page.c, 2387 * wake 2388 */ 2389 pass = 0; 2390 if (!vm_page_count_min(vm_page_free_hysteresis) || 2391 !vm_page_count_target()) { 2392 vm_pages_needed = 0; 2393 wakeup(&vmstats.v_free_count); 2394 } 2395 } else { 2396 pass = 0; 2397 } 2398 } 2399 } 2400 2401 static struct kproc_desc pg1_kp = { 2402 "pagedaemon", 2403 vm_pageout_thread, 2404 &pagethread 2405 }; 2406 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp); 2407 2408 static struct kproc_desc pg2_kp = { 2409 "emergpager", 2410 vm_pageout_thread, 2411 &emergpager 2412 }; 2413 SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp); 2414 2415 2416 /* 2417 * Called after allocating a page out of the cache or free queue 2418 * to possibly wake the pagedaemon up to replentish our supply. 2419 * 2420 * We try to generate some hysteresis by waking the pagedaemon up 2421 * when our free+cache pages go below the free_min+cache_min level. 2422 * The pagedaemon tries to get the count back up to at least the 2423 * minimum, and through to the target level if possible. 2424 * 2425 * If the pagedaemon is already active bump vm_pages_needed as a hint 2426 * that there are even more requests pending. 2427 * 2428 * SMP races ok? 2429 * No requirements. 2430 */ 2431 void 2432 pagedaemon_wakeup(void) 2433 { 2434 if (vm_paging_needed() && curthread != pagethread) { 2435 if (vm_pages_needed == 0) { 2436 vm_pages_needed = 1; /* SMP race ok */ 2437 wakeup(&vm_pages_needed); 2438 } else if (vm_page_count_min(0)) { 2439 ++vm_pages_needed; /* SMP race ok */ 2440 } 2441 } 2442 } 2443 2444 #if !defined(NO_SWAPPING) 2445 2446 /* 2447 * SMP races ok? 2448 * No requirements. 2449 */ 2450 static void 2451 vm_req_vmdaemon(void) 2452 { 2453 static int lastrun = 0; 2454 2455 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 2456 wakeup(&vm_daemon_needed); 2457 lastrun = ticks; 2458 } 2459 } 2460 2461 static int vm_daemon_callback(struct proc *p, void *data __unused); 2462 2463 /* 2464 * No requirements. 2465 */ 2466 static void 2467 vm_daemon(void) 2468 { 2469 int req_swapout; 2470 2471 while (TRUE) { 2472 tsleep(&vm_daemon_needed, 0, "psleep", 0); 2473 req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0); 2474 2475 /* 2476 * forced swapouts 2477 */ 2478 if (req_swapout) 2479 swapout_procs(vm_pageout_req_swapout); 2480 2481 /* 2482 * scan the processes for exceeding their rlimits or if 2483 * process is swapped out -- deactivate pages 2484 */ 2485 allproc_scan(vm_daemon_callback, NULL, 0); 2486 } 2487 } 2488 2489 static int 2490 vm_daemon_callback(struct proc *p, void *data __unused) 2491 { 2492 struct vmspace *vm; 2493 vm_pindex_t limit, size; 2494 2495 /* 2496 * if this is a system process or if we have already 2497 * looked at this process, skip it. 2498 */ 2499 lwkt_gettoken(&p->p_token); 2500 2501 if (p->p_flags & (P_SYSTEM | P_WEXIT)) { 2502 lwkt_reltoken(&p->p_token); 2503 return (0); 2504 } 2505 2506 /* 2507 * if the process is in a non-running type state, 2508 * don't touch it. 2509 */ 2510 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 2511 lwkt_reltoken(&p->p_token); 2512 return (0); 2513 } 2514 2515 /* 2516 * get a limit 2517 */ 2518 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, 2519 p->p_rlimit[RLIMIT_RSS].rlim_max)); 2520 2521 /* 2522 * let processes that are swapped out really be 2523 * swapped out. Set the limit to nothing to get as 2524 * many pages out to swap as possible. 2525 */ 2526 if (p->p_flags & P_SWAPPEDOUT) 2527 limit = 0; 2528 2529 vm = p->p_vmspace; 2530 vmspace_hold(vm); 2531 size = pmap_resident_tlnw_count(&vm->vm_pmap); 2532 if (limit >= 0 && size > 4096 && 2533 size - 4096 >= limit && vm_pageout_memuse_mode >= 1) { 2534 vm_pageout_map_deactivate_pages(&vm->vm_map, limit); 2535 } 2536 vmspace_drop(vm); 2537 2538 lwkt_reltoken(&p->p_token); 2539 2540 return (0); 2541 } 2542 2543 #endif 2544