/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
 */

/*
 * The proverbial page-out daemon.
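 *
 * Broadly, each pass of the daemon runs three scans, described in more
 * detail below: an inactive-queue scan that frees, caches, or launders
 * pages (vm_pageout_scan_inactive), an active-queue scan that deactivates
 * pages to refill the inactive queue (vm_pageout_scan_active), and a
 * cache-queue scan that moves pages to the free list and handles
 * low-memory emergencies (vm_pageout_scan_cache).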
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout" */
static int vm_pageout_page(vm_page_t m, int *max_launderp,
			   int *vnodes_skippedp, struct vnode **vpfailedp,
			   int pass, int vmflush_flags);
static int vm_pageout_clean_helper (vm_page_t, int);
static int vm_pageout_free_page_calc (vm_size_t count);
static void vm_pageout_page_free(vm_page_t m);
struct thread *pagethread;

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon" */
static void vm_daemon (void);
static struct thread *vmthread;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmthread
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif

int vm_pages_needed = 0;	/* Event on which pageout daemon sleeps */
int vm_pageout_deficit = 0;	/* Estimated number of pages deficit */
int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
int vm_page_free_hysteresis = 16;

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;
static int vm_daemon_needed;
#endif
static int vm_max_launder = 4096;
static int vm_pageout_stats_max = 0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
static int vm_pageout_stats_free_max = 0, vm_pageout_algorithm = 0;
static int defer_swap_pageouts = 0;
static int disable_swap_pageouts = 0;
static u_int vm_anonmem_decline = ACT_DECLINE;
static u_int vm_filemem_decline = ACT_DECLINE * 2;

#if defined(NO_SWAPPING)
static int vm_swap_enabled = 0;
static int vm_swap_idle_enabled = 0;
#else
static int vm_swap_enabled = 1;
static int vm_swap_idle_enabled = 0;
#endif
int vm_pageout_memuse_mode = 1;	/* 0-disable, 1-passive, 2-active swp */

SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
	CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");

SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
	CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");

SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
	CTLFLAG_RW, &vm_page_free_hysteresis, 0,
	"Free more pages than the minimum required");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
	CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */

#if !defined(NO_SWAPPING)
static void vm_req_vmdaemon (void);
#endif
static void vm_pageout_page_stats(int q);

/*
 * Calculate approximately how many pages on each queue to try to
 * clean.  An exact calculation creates an edge condition when the
 * queues are unbalanced so add significant slop.  The queue scans
 * will stop early when targets are reached and will start where they
 * left off on the next pass.
 *
 * We need to be generous here because there are all sorts of loading
 * conditions that can cause edge cases if we try to average over all
 * queues.  In particular, storage subsystems have become so fast that
 * paging activity can become quite frantic.  Eventually we will probably
 * need two paging threads, one for dirty pages and one for clean, to deal
 * with the bandwidth requirements.
 *
 * So what we do is calculate a value that can be satisfied nominally by
 * only having to scan half the queues.
 */
static __inline int
PQAVERAGE(int n)
{
	int avg;

	if (n >= 0) {
		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
	} else {
		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
	}
	return avg;
}

/*
 * vm_pageout_clean_helper:
 *
 * Clean the page and remove it from the laundry.  The page must be busied
 * by the caller and will be disposed of (put away, flushed) by this routine.
 */
static int
vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
{
	vm_object_t object;
	vm_page_t mc[BLIST_MAX_ALLOC];
	int error;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	object = m->object;

	/*
	 * Don't mess with the page if it's held or special.
	 *
	 * XXX do we really need to check hold_count here?  hold_count
	 * isn't supposed to mess with vm_page ops except prevent the
	 * page from being reused.
	 */
	if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) {
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * Place page in cluster.  Align cluster for optimal swap space
	 * allocation (whether it is swap or not).  This is typically ~16-32
	 * pages, which also tends to align the cluster to multiples of the
	 * filesystem block size if backed by a filesystem.
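	 *
	 * For example (assuming BLIST_MAX_ALLOC were 32): a page at
	 * pindex 69 yields page_base 5, so it is stored in mc[5] and the
	 * scans below try to fill mc[4..0] from pindex 68..64 and
	 * mc[6..31] from pindex 70..95, keeping the flush aligned to a
	 * 32-page boundary within the object.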
	 */
	page_base = pindex % BLIST_MAX_ALLOC;
	mc[page_base] = m;
	ib = page_base - 1;
	is = page_base + 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 */
	vm_object_hold(object);

	while (ib >= 0) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if ((p->queue - p->pc) == PQ_CACHE ||
		    (p->flags & PG_UNMANAGED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[ib] = p;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pindex - page_base + is < object->size) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & PG_UNMANAGED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[is] = p;
		++is;
	}

	vm_object_drop(object);

	/*
	 * we allow reads during pageouts...
	 */
	return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 * The given pages are laundered.  Note that we setup for the start of
 * I/O ( i.e. busy the page ), mark it read-only, and bump the object
 * reference count all in here rather than in the parent.  If we want
 * the parent to do more sophisticated things we may have to change
 * the ordering.
 *
 * The pages in the array must be busied by the caller and will be
 * unbusied by this function.
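 *
 * The return value is the number of pages successfully queued or written
 * (VM_PAGER_OK and VM_PAGER_PEND).  Pages which come back PEND are cleaned
 * up asynchronously by the I/O completion code; everything else is cleaned
 * up here once the synchronous putpages call returns.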
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
{
	vm_object_t object;
	int pageout_status[count];
	int numpagedout = 0;
	int i;

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
			("vm_pageout_flush page %p index %d/%d: partially "
			 "invalid page", mc[i], i, count));
		vm_page_io_start(mc[i]);
	}

	/*
	 * We must make the pages read-only.  This will also force the
	 * modified bit in the related pmaps to be cleared.  The pager
	 * cannot clear the bit for us since the I/O completion code
	 * typically runs from an interrupt.  The act of making the page
	 * read-only handles the case for us.
	 *
	 * Then we can unbusy the pages, we still hold a reference by virtue
	 * of our soft-busy.
	 */
	for (i = 0; i < count; i++) {
		if (vmflush_flags & VM_PAGER_TRY_TO_CACHE)
			vm_page_protect(mc[i], VM_PROT_NONE);
		else
			vm_page_protect(mc[i], VM_PROT_READ);
		vm_page_wakeup(mc[i]);
	}

	object = mc[0]->object;
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count,
			   (vmflush_flags |
			    ((object == &kernel_object) ?
				VM_PAGER_PUT_SYNC : 0)),
			   pageout_status);

	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			numpagedout++;
			break;
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object.  Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			vm_page_busy_wait(mt, FALSE, "pgbad");
			pmap_clear_modify(mt);
			vm_page_undirty(mt);
			vm_page_wakeup(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * A page typically cannot be paged out when we
			 * have run out of swap.  We leave the page
			 * marked inactive and will try to page it out
			 * again later.
			 *
			 * Starvation of the active page list is used to
			 * determine when the system is massively memory
			 * starved.
			 */
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If not PENDing this was a synchronous operation and we
		 * clean up after the I/O.  If it is PENDing the mess is
		 * cleaned up asynchronously.
		 *
		 * Also nominally act on the caller's wishes if the caller
		 * wants to try to really clean (cache or free) the page.
		 *
		 * Also nominally deactivate the page if the system is
		 * memory-stressed.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_page_busy_wait(mt, FALSE, "pgouw");
			vm_page_io_finish(mt);
			if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) {
				vm_page_try_to_cache(mt);
			} else if (vm_page_count_severe()) {
				vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vm_page_wakeup(mt);
			}
			vm_object_pip_wakeup(object);
		}
	}
	return numpagedout;
}

#if !defined(NO_SWAPPING)

/*
 * Callback function, page busied for us.  We must dispose of the busy
 * condition.  Any related pmap pages may be held but will not be locked.
 */
static
int
vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
			vm_page_t p)
{
	int actcount;
	int cleanit = 0;

	/*
	 * Basic tests - There should never be a marker, and we can stop
	 * once the RSS is below the required level.
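	 *
	 * Every page handed to us arrives busied; each path below must
	 * either wake the page up or hand it to vm_pageout_page(), which
	 * disposes of it for us.  Once the RSS is low enough we return -1
	 * to cut the remainder of the scan short, otherwise we return 0
	 * to continue scanning.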
	 */
	KKASSERT((p->flags & PG_MARKER) == 0);
	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
		vm_page_wakeup(p);
		return(-1);
	}

	mycpu->gd_cnt.v_pdpages++;

	if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) {
		vm_page_wakeup(p);
		goto done;
	}

	++info->actioncount;

	/*
	 * Check if the page has been referenced recently.  If it has,
	 * activate it and skip.
	 */
	actcount = pmap_ts_referenced(p);
	if (actcount) {
		vm_page_flag_set(p, PG_REFERENCED);
	} else if (p->flags & PG_REFERENCED) {
		actcount = 1;
	}

	if (actcount) {
		if (p->queue - p->pc != PQ_ACTIVE) {
			vm_page_and_queue_spin_lock(p);
			if (p->queue - p->pc != PQ_ACTIVE) {
				vm_page_and_queue_spin_unlock(p);
				vm_page_activate(p);
			} else {
				vm_page_and_queue_spin_unlock(p);
			}
		} else {
			p->act_count += actcount;
			if (p->act_count > ACT_MAX)
				p->act_count = ACT_MAX;
		}
		vm_page_flag_clear(p, PG_REFERENCED);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Remove the page from this particular pmap.  Once we do this, our
	 * pmap scans will not see it again (unless it gets faulted in), so
	 * we must actively dispose of or deal with the page.
	 */
	pmap_remove_specific(info->pmap, p);

	/*
	 * If the page is not mapped to another process (i.e. as would be
	 * typical if this were a shared page from a library) then deactivate
	 * the page and clean it in two passes only.
	 *
	 * If the page hasn't been referenced since the last check, remove it
	 * from the pmap.  If it is no longer mapped, deactivate it
	 * immediately, accelerating the normal decline.
	 *
	 * Once the page has been removed from the pmap the RSS code no
	 * longer tracks it so we have to make sure that it is staged for
	 * potential flush action.
	 */
	if ((p->flags & PG_MAPPED) == 0) {
		if (p->queue - p->pc == PQ_ACTIVE) {
			vm_page_deactivate(p);
		}
		if (p->queue - p->pc == PQ_INACTIVE) {
			cleanit = 1;
		}
	}

	/*
	 * Ok, try to fully clean the page and any nearby pages such that at
	 * least the requested page is freed or moved to the cache queue.
	 *
	 * We usually do this synchronously to allow us to get the page into
	 * the CACHE queue quickly, which will prevent memory exhaustion if
	 * a process with a memoryuse limit is running away.  However, the
	 * sysadmin may desire to set vm.swap_user_async which relaxes this
	 * and improves write performance.
	 */
	if (cleanit) {
		int max_launder = 0x7FFF;
		int vnodes_skipped = 0;
		int vmflush_flags;
		struct vnode *vpfailed = NULL;

		info->offset = va;

		if (vm_pageout_memuse_mode >= 2) {
			vmflush_flags = VM_PAGER_TRY_TO_CACHE |
					VM_PAGER_ALLOW_ACTIVE;
			if (swap_user_async == 0)
				vmflush_flags |= VM_PAGER_PUT_SYNC;
			vm_page_flag_set(p, PG_WINATCFLS);
			info->cleancount +=
				vm_pageout_page(p, &max_launder,
						&vnodes_skipped,
						&vpfailed, 1, vmflush_flags);
		} else {
			vm_page_wakeup(p);
			++info->cleancount;
		}
	} else {
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races.
	 */
done:
	lwkt_user_yield();
	return 0;
}

/*
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
 * which is relatively difficult to do.  We try to keep track of where we
 * left off last time to reduce scan overhead.
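 *
 * The scan position is saved in map->pgout_offset so successive calls
 * resume where the previous one left off, wrapping back to the start of
 * the user address space when the end is reached.  If the resident count
 * is still over the limit we retry a bounded number of times before
 * giving up for this pass.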
 *
 * Called when vm_pageout_memuse_mode is >= 1.
 */
void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
{
	vm_offset_t pgout_offset;
	struct pmap_pgscan_info info;
	int retries = 3;

	pgout_offset = map->pgout_offset;
again:
#if 0
	kprintf("%016jx ", pgout_offset);
#endif
	if (pgout_offset < VM_MIN_USER_ADDRESS)
		pgout_offset = VM_MIN_USER_ADDRESS;
	if (pgout_offset >= VM_MAX_USER_ADDRESS)
		pgout_offset = 0;
	info.pmap = vm_map_pmap(map);
	info.limit = limit;
	info.beg_addr = pgout_offset;
	info.end_addr = VM_MAX_USER_ADDRESS;
	info.callback = vm_pageout_mdp_callback;
	info.cleancount = 0;
	info.actioncount = 0;
	info.busycount = 0;

	pmap_pgscan(&info);
	pgout_offset = info.offset;
#if 0
	kprintf("%016jx %08lx %08lx\n", pgout_offset,
		info.cleancount, info.actioncount);
#endif

	if (pgout_offset != VM_MAX_USER_ADDRESS &&
	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		goto again;
	} else if (retries &&
		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		--retries;
		goto again;
	}
	map->pgout_offset = pgout_offset;
}
#endif

/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
	struct proc *bigproc;
	vm_offset_t bigsize;
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

static int
vm_pageout_scan_inactive(int pass, int q, int avail_shortage,
			 int *vnodes_skipped)
{
	vm_page_t m;
	struct vm_page marker;
	struct vnode *vpfailed;		/* warning, allowed to be stale */
	int maxscan;
	int delta = 0;
	int max_launder;

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * max_launder limits the number of dirty pages we flush per scan.
	 * For most systems a smaller value (16 or 32) is more robust under
	 * extreme memory and disk pressure because any unnecessary writes
	 * to disk can result in extreme performance degradation.  However,
	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
	 * used) will die horribly with limited laundering.  If the pageout
	 * daemon cannot clean enough pages in the first pass, we let it go
	 * all out in succeeding passes.
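	 *
	 * In practice this means pass 0 uses the vm.max_launder tunable
	 * (clamped to at least 1), while any later pass effectively removes
	 * the limit by raising it to 10000.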
	 */
	if ((max_launder = vm_max_launder) <= 1)
		max_launder = 1;
	if (pass)
		max_launder = 10000;

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
	marker.queue = PQ_INACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	/*
	 * Inactive queue scan.
	 *
	 * NOTE: The vm_page must be spinlocked before the queue to avoid
	 *	 deadlocks, so it is easiest to simply iterate the loop
	 *	 with the queue unlocked at the top.
	 */
	vpfailed = NULL;

	vm_page_queues_spin_lock(PQ_INACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && avail_shortage - delta > 0)
	{
		int count;

		KKASSERT(m->queue == PQ_INACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
				   &marker, pageq);
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
		KKASSERT(m->queue == PQ_INACTIVE + q);

		count = vm_pageout_page(m, &max_launder, vnodes_skipped,
					&vpfailed, pass, 0);
		delta += count;

		/*
		 * Systems with a ton of memory can wind up with huge
		 * deactivation counts.  Because the inactive scan is
		 * doing a lot of flushing, the combination can result
		 * in excessive paging even in situations where other
		 * unrelated threads free up sufficient VM.
		 *
		 * To deal with this we abort the nominal active->inactive
		 * scan before we hit the inactive target when free+cache
		 * levels have reached a reasonable target.
		 *
		 * When deciding to stop early we need to add some slop to
		 * the test and we need to return full completion to the caller
		 * to prevent the caller from thinking there is something
		 * wrong and issuing a low-memory+swap warning or pkill.
		 *
		 * A deficit forces paging regardless of the state of the
		 * VM page queues (used for RSS enforcement).
		 */
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_INACTIVE + q);
		if (vm_paging_target() < -vm_max_launder) {
			/*
			 * Stopping early, return full completion to caller.
			 */
			if (delta < avail_shortage)
				delta = avail_shortage;
			break;
		}
	}

	/* page queue still spin-locked */
	TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_INACTIVE + q);

	return (delta);
}

/*
 * Pageout the specified page, return the total number of pages paged out
 * (this routine may cluster).
 *
 * The page must be busied and soft-busied by the caller and will be disposed
 * of by this function.
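 *
 * The disposition is decided roughly in this order: wired pages are simply
 * unqueued, held pages are requeued and skipped, referenced pages are
 * re-activated, invalid pages are freed, clean pages are moved to the
 * cache queue, and dirty pages are either given a second trip through the
 * inactive queue (PG_WINATCFLS) or laundered via vm_pageout_clean_helper(),
 * subject to the *max_launderp budget.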
 */
static int
vm_pageout_page(vm_page_t m, int *max_launderp, int *vnodes_skippedp,
		struct vnode **vpfailedp, int pass, int vmflush_flags)
{
	vm_object_t object;
	int actcount;
	int count = 0;

	/*
	 * It is possible for a page to be busied ad-hoc (e.g. the
	 * pmap_collect() code) and wired and race against the
	 * allocation of a new page.  vm_page_alloc() may be forced
	 * to deactivate the wired page in which case it winds up
	 * on the inactive queue and must be handled here.  We
	 * correct the problem simply by unqueuing the page.
	 */
	if (m->wire_count) {
		vm_page_unqueue_nowakeup(m);
		vm_page_wakeup(m);
		kprintf("WARNING: pagedaemon: wired page on "
			"inactive queue %p\n", m);
		return 0;
	}

	/*
	 * A held page may be undergoing I/O, so skip it.
	 */
	if (m->hold_count) {
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl,
					  m, pageq);
			++vm_swapcache_inactive_heuristic;
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
		return 0;
	}

	if (m->object == NULL || m->object->ref_count == 0) {
		/*
		 * If the object is not being used, we ignore previous
		 * references.
		 */
		vm_page_flag_clear(m, PG_REFERENCED);
		pmap_clear_reference(m);
		/* fall through to end */
	} else if (((m->flags & PG_REFERENCED) == 0) &&
		   (actcount = pmap_ts_referenced(m))) {
		/*
		 * Otherwise, if the page has been referenced while
		 * in the inactive queue, we bump the "activation
		 * count" upwards, making it less likely that the
		 * page will be added back to the inactive queue
		 * prematurely again.  Here we check the page tables
		 * (or emulated bits, if any), since the upper level
		 * VM system does not know anything about existing
		 * references.
		 */
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * (m) is still busied.
	 *
	 * If the upper level VM system knows about any page
	 * references, we activate the page.  We also set the
	 * "activation count" higher than normal so that we will be
	 * less likely to place the page back onto the inactive queue.
	 */
	if ((m->flags & PG_REFERENCED) != 0) {
		vm_page_flag_clear(m, PG_REFERENCED);
		actcount = pmap_ts_referenced(m);
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE + 1);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * If the upper level VM system doesn't know anything about
	 * the page being dirty, we have to check for it again.  As
	 * far as the VM code knows, any partially dirty pages are
	 * fully dirty.
	 *
	 * Pages marked PG_WRITEABLE may be mapped into the user
	 * address space of a process running on another cpu.  A
	 * user process (without holding the MP lock) running on
	 * another cpu may be able to touch the page while we are
	 * trying to remove it.  vm_page_cache() will handle this
	 * case for us.
	 */
	if (m->dirty == 0) {
		vm_page_test_dirty(m);
	} else {
		vm_page_dirty(m);
	}

	if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Invalid pages can be easily freed
		 */
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
		++count;
	} else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Clean pages can be placed onto the cache queue.
		 * This effectively frees them.
		 */
		vm_page_cache(m);
		++count;
	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
		/*
		 * Dirty pages need to be paged out, but flushing
		 * a page is extremely expensive versus freeing
		 * a clean page.  Rather than artificially limiting
		 * the number of pages we can flush, we instead give
		 * dirty pages extra priority on the inactive queue
		 * by forcing them to be cycled through the queue
		 * twice before being flushed, after which the
		 * (now clean) page will cycle through once more
		 * before being freed.  This significantly extends
		 * the thrash point for a heavily loaded machine.
		 */
		vm_page_flag_set(m, PG_WINATCFLS);
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl,
					  m, pageq);
			++vm_swapcache_inactive_heuristic;
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
	} else if (*max_launderp > 0) {
		/*
		 * We always want to try to flush some dirty pages if
		 * we encounter them, to keep the system stable.
		 * Normally this number is small, but under extreme
		 * pressure where there are insufficient clean pages
		 * on the inactive queue, we may have to go all out.
		 */
		int swap_pageouts_ok;
		struct vnode *vp = NULL;

		swap_pageouts_ok = 0;
		object = m->object;
		if (object &&
		    (object->type != OBJT_SWAP) &&
		    (object->type != OBJT_DEFAULT)) {
			swap_pageouts_ok = 1;
		} else {
			swap_pageouts_ok = !(defer_swap_pageouts ||
					     disable_swap_pageouts);
			swap_pageouts_ok |= (!disable_swap_pageouts &&
					     defer_swap_pageouts &&
					     vm_page_count_min(0));
		}

		/*
		 * We don't bother paging objects that are "dead".
		 * Those objects are in a "rundown" state.
		 */
		if (!swap_pageouts_ok ||
		    (object == NULL) ||
		    (object->flags & OBJ_DEAD)) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_INACTIVE) {
				TAILQ_REMOVE(&vm_page_queues[m->queue].pl,
					     m, pageq);
				TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl,
						  m, pageq);
				++vm_swapcache_inactive_heuristic;
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			return 0;
		}

		/*
		 * (m) is still busied.
		 *
		 * The object is already known NOT to be dead.  It
		 * is possible for the vget() to block the whole
		 * pageout daemon, but the new low-memory handling
		 * code should prevent it.
		 *
		 * The previous code skipped locked vnodes and, worse,
		 * reordered pages in the queue.  This results in
		 * completely non-deterministic operation because,
		 * quite often, a vm_fault has initiated an I/O and
		 * is holding a locked vnode at just the point where
		 * the pageout daemon is woken up.
		 *
		 * We can't wait forever for the vnode lock, we might
		 * deadlock due to a vn_read() getting stuck in
		 * vm_wait while holding this vnode.
		 * We skip the vnode if we can't get it in a reasonable
		 * amount of time.
		 *
		 * vpfailed is used to (try to) avoid the case where
		 * a large number of pages are associated with a
		 * locked vnode, which could cause the pageout daemon
		 * to stall for an excessive amount of time.
		 */
		if (object->type == OBJT_VNODE) {
			int flags;

			vp = object->handle;
			flags = LK_EXCLUSIVE;
			if (vp == *vpfailedp)
				flags |= LK_NOWAIT;
			else
				flags |= LK_TIMELOCK;
			vm_page_hold(m);
			vm_page_wakeup(m);

			/*
			 * We have unbusied (m) temporarily so we can
			 * acquire the vp lock without deadlocking.
			 * (m) is held to prevent destruction.
			 */
			if (vget(vp, flags) != 0) {
				*vpfailedp = vp;
				++pageout_lock_miss;
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page might have been moved to another
			 * queue during potential blocking in vget()
			 * above.  The page might have been freed and
			 * reused for another vnode.  The object might
			 * have been reused for another vnode.
			 */
			if (m->queue - m->pc != PQ_INACTIVE ||
			    m->object != object ||
			    object->handle != vp) {
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page may have been busied during the
			 * blocking in vget().  We don't move the
			 * page back onto the end of the queue so that
			 * statistics are more correct if we don't.
			 */
			if (vm_page_busy_try(m, TRUE)) {
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}
			vm_page_unhold(m);

			/*
			 * (m) is busied again
			 *
			 * We own the busy bit and remove our hold
			 * bit.  If the page is still held it
			 * might be undergoing I/O, so skip it.
			 */
			if (m->hold_count) {
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_INACTIVE) {
					TAILQ_REMOVE(
					    &vm_page_queues[m->queue].pl,
					    m, pageq);
					TAILQ_INSERT_TAIL(
					    &vm_page_queues[m->queue].pl,
					    m, pageq);
					++vm_swapcache_inactive_heuristic;
				}
				vm_page_and_queue_spin_unlock(m);
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_wakeup(m);
				vput(vp);
				return 0;
			}
			/* (m) is left busied as we fall through */
		}

		/*
		 * page is busy and not held here.
		 *
		 * If a page is dirty, then it is either being washed
		 * (but not yet cleaned) or it is still in the
		 * laundry.  If it is still in the laundry, then we
		 * start the cleaning operation.
		 *
		 * decrement inactive_shortage on success to account
		 * for the (future) cleaned page.  Otherwise we
		 * could wind up laundering or cleaning too many
		 * pages.
		 *
		 * NOTE: Cleaning the page here does not cause
		 *	 force_deficit to be adjusted, because the
		 *	 page is not being freed or moved to the
		 *	 cache.
		 */
		count = vm_pageout_clean_helper(m, vmflush_flags);
		*max_launderp -= count;

		/*
		 * Clean ate busy, page no longer accessible
		 */
		if (vp != NULL)
			vput(vp);
	} else {
		vm_page_wakeup(m);
	}
	return count;
}

static int
vm_pageout_scan_active(int pass, int q,
		       int avail_shortage, int inactive_shortage,
		       int *recycle_countp)
{
	struct vm_page marker;
	vm_page_t m;
	int actcount;
	int delta = 0;
	int maxscan;

	/*
	 * We want to move pages from the active queue to the inactive
	 * queue to get the inactive queue to the inactive target.  If
	 * we still have a page shortage from above we try to directly free
	 * clean pages instead of moving them.
	 *
	 * If we do still have a shortage we keep track of the number of
	 * pages we free or cache (recycle_count) as a measure of thrashing
	 * between the active and inactive queues.
	 *
	 * If we were able to completely satisfy the free+cache targets
	 * from the inactive pool we limit the number of pages we move
	 * from the active pool to the inactive pool to 2x the pages we
	 * had removed from the inactive pool (with a minimum of 1/5 the
	 * inactive target).  If we were not able to completely satisfy
	 * the free+cache targets we go for the whole target aggressively.
	 *
	 * NOTE: Both variables can end up negative.
	 * NOTE: We are still in a critical section.
	 */

	bzero(&marker, sizeof(marker));
	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
	marker.queue = PQ_ACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	vm_page_queues_spin_lock(PQ_ACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && (avail_shortage - delta > 0 ||
	       inactive_shortage > 0))
	{
		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
				   &marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
		KKASSERT(m->queue == PQ_ACTIVE + q);

		/*
		 * Don't deactivate pages that are held, even if we can
		 * busy them.  (XXX why not?)
		 */
		if (m->hold_count != 0) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Check to see "how much" the page has been used and clear
		 * the tracking access bits.  If the object has no references
		 * don't bother paying the expense.
		 */
		actcount = 0;
		if (m->object && m->object->ref_count != 0) {
			if (m->flags & PG_REFERENCED)
				++actcount;
			actcount += pmap_ts_referenced(m);
			if (actcount) {
				m->act_count += ACT_ADVANCE + actcount;
				if (m->act_count > ACT_MAX)
					m->act_count = ACT_MAX;
			}
		}
		vm_page_flag_clear(m, PG_REFERENCED);

		/*
		 * actcount is only valid if the object ref_count is non-zero.
		 * If the page does not have an object, actcount will be zero.
		 */
		if (actcount && m->object->ref_count != 0) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
		} else {
			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				m->act_count -= min(m->act_count,
						    vm_anonmem_decline);
				break;
			default:
				m->act_count -= min(m->act_count,
						    vm_filemem_decline);
				break;
			}
			if (vm_pageout_algorithm ||
			    (m->object == NULL) ||
			    (m->object && (m->object->ref_count == 0)) ||
			    m->act_count < pass + 1
			) {
				/*
				 * Deactivate the page.  If we had a
				 * shortage from our inactive scan try to
				 * free (cache) the page instead.
				 *
				 * Don't just blindly cache the page if
				 * we do not have a shortage from the
				 * inactive scan, that could lead to
				 * gigabytes being moved.
				 */
				--inactive_shortage;
				if (avail_shortage - delta > 0 ||
				    (m->object && (m->object->ref_count == 0)))
				{
					if (avail_shortage - delta > 0)
						++*recycle_countp;
					vm_page_protect(m, VM_PROT_NONE);
					if (m->dirty == 0 &&
					    (m->flags & PG_NEED_COMMIT) == 0 &&
					    avail_shortage - delta > 0) {
						vm_page_cache(m);
					} else {
						vm_page_deactivate(m);
						vm_page_wakeup(m);
					}
				} else {
					vm_page_deactivate(m);
					vm_page_wakeup(m);
				}
				++delta;
			} else {
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_ACTIVE) {
					TAILQ_REMOVE(
					    &vm_page_queues[PQ_ACTIVE + q].pl,
					    m, pageq);
					TAILQ_INSERT_TAIL(
					    &vm_page_queues[PQ_ACTIVE + q].pl,
					    m, pageq);
				}
				vm_page_and_queue_spin_unlock(m);
				vm_page_wakeup(m);
			}
		}
next:
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Clean out our local marker.
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);

	return (delta);
}

/*
 * The number of actually free pages can drop down to v_free_reserved;
 * we try to build the free count back above v_free_min.  Note that
 * vm_paging_needed() also returns TRUE if v_free_count is not at
 * least v_free_min so that is the minimum we must build the free
 * count to.
 *
 * We use a slightly higher target to improve hysteresis,
 * ((v_free_target + v_free_min) / 2).
 * Since v_free_target is usually the same as v_cache_min this maintains
 * about half the pages in the free queue as are in the cache queue,
 * providing pretty good pipelining for pageout operation.
 *
 * The system operator can manipulate vm.v_cache_min and
 * vm.v_free_target to tune the pageout daemon.  Be sure
 * to keep vm.v_free_min < vm.v_free_target.
 *
 * Note that the original paging target is to get at least
 * (free_min + cache_min) into (free + cache).  The slightly
 * higher target will shift additional pages from cache to free
 * without affecting the original paging target in order to
 * maintain better hysteresis and not have the free count always
 * be dead-on v_free_min.
 *
 * NOTE: we are still in a critical section.
 *
 * Pages moved from PQ_CACHE to totally free are not counted in the
 * pages_freed counter.
 */
static void
vm_pageout_scan_cache(int avail_shortage, int pass,
		      int vnodes_skipped, int recycle_count)
{
	static int lastkillticks;
	struct vm_pageout_scan_info info;
	vm_page_t m;

	while (vmstats.v_free_count <
	       (vmstats.v_free_min + vmstats.v_free_target) / 2) {
		/*
		 * This steals some code from vm/vm_page.c
		 */
		static int cache_rover = 0;

		m = vm_page_list_find(PQ_CACHE, cache_rover & PQ_L2_MASK);
		if (m == NULL)
			break;
		/* page is returned removed from its queue and spinlocked */
		if (vm_page_busy_try(m, TRUE)) {
			vm_page_deactivate_locked(m);
			vm_page_spin_unlock(m);
			continue;
		}
		vm_page_spin_unlock(m);
		pagedaemon_wakeup();
		lwkt_yield();

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
		    m->hold_count ||
		    m->wire_count) {
			vm_page_deactivate(m);
			vm_page_wakeup(m);
			continue;
		}
		KKASSERT((m->flags & PG_MAPPED) == 0);
		KKASSERT(m->dirty == 0);
		cache_rover += PQ_PRIME2;
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
	}

#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static time_t lsec;
		if (time_uptime != lsec) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
			vm_req_vmdaemon();
			lsec = time_uptime;
		}
	}
#endif

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target() > 0) {
		if (vnodes_skipped && vm_page_count_min(0))
			speedup_syncer(NULL);
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_page_count_target()) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
			vm_req_vmdaemon();
		}
#endif
	}

	/*
	 * Handle catastrophic conditions.  Under good conditions we should
	 * be at the target, well beyond our minimum.  If we could not even
	 * reach our minimum the system is under heavy stress.  But just being
	 * under heavy stress does not trigger process killing.
	 *
	 * We consider ourselves to have run out of memory if the swap pager
	 * is full and avail_shortage is still positive.
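	 *
	 * When we do decide to kill, we pick the largest non-system process
	 * (see vm_pageout_scan_callback()) and rate-limit the kills to at
	 * most one per second via lastkillticks so the victim has a chance
	 * to actually release its memory.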
	 * The secondary check ensures that we do not kill processes if the
	 * instantaneous availability is good, even if the pageout daemon
	 * pass says it couldn't get to the target.
	 */
	if (swap_pager_almost_full &&
	    pass > 0 &&
	    (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
		kprintf("Warning: system low on memory+swap "
			"shortage %d for %d ticks!\n",
			avail_shortage, ticks - swap_fail_ticks);
	}
	if (swap_pager_full &&
	    pass > 1 &&
	    avail_shortage > 0 &&
	    vm_paging_target() > 0 &&
	    (unsigned int)(ticks - lastkillticks) >= hz) {
		/*
		 * Kill something, maximum rate once per second to give
		 * the process time to free up sufficient memory.
		 */
		lastkillticks = ticks;
		info.bigproc = NULL;
		info.bigsize = 0;
		allproc_scan(vm_pageout_scan_callback, &info);
		if (info.bigproc != NULL) {
			info.bigproc->p_nice = PRIO_MIN;
			info.bigproc->p_usched->resetpriority(
				FIRST_LWP_IN_PROC(info.bigproc));
			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
			killproc(info.bigproc, "out of swap space");
			wakeup(&vmstats.v_free_count);
			PRELE(info.bigproc);
		}
	}
}

static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
	struct vm_pageout_scan_info *info = data;
	vm_offset_t size;

	/*
	 * Never kill system processes or init.  If we have configured swap
	 * then try to avoid killing low-numbered pids.
	 */
	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
		return (0);
	}

	lwkt_gettoken(&p->p_token);

	/*
	 * If the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get the approximate process size.  Note that anonymous pages
	 * with backing swap will be counted twice, but there should not
	 * be too many such pages due to the stress the VM system is
	 * under at this point.
	 */
	size = vmspace_anonymous_count(p->p_vmspace) +
	       vmspace_swap_count(p->p_vmspace);

	/*
	 * If this process is bigger than the biggest one seen so far,
	 * remember it.
	 */
	if (info->bigsize < size) {
		if (info->bigproc)
			PRELE(info->bigproc);
		PHOLD(p);
		info->bigproc = p;
		info->bigsize = size;
	}
	lwkt_reltoken(&p->p_token);
	lwkt_yield();

	return(0);
}

/*
 * This routine tries to maintain the pseudo-LRU active queue so that
 * some statistics accumulation still occurs during long periods in
 * which there is no paging.  This code helps the situation where
 * paging just starts to occur.
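 *
 * The scan samples a portion of the active queue (bounded by
 * vm_pageout_stats_max except during the periodic full scan), clears the
 * hardware reference bits, and either bumps or decays act_count.  Pages
 * whose act_count has decayed to zero are unmapped and deactivated so
 * that RSS figures stay reasonably accurate even when the pageout daemon
 * is otherwise idle.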
 */
static void
vm_pageout_page_stats(int q)
{
	static int fullintervalcount = 0;
	struct vm_page marker;
	vm_page_t m;
	int pcount, tpcount;		/* Number of pages to check */
	int page_shortage;

	page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
			 vmstats.v_free_min) -
			(vmstats.v_free_count + vmstats.v_inactive_count +
			 vmstats.v_cache_count);

	if (page_shortage <= 0)
		return;

	pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
	fullintervalcount += vm_pageout_stats_interval;
	if (fullintervalcount < vm_pageout_full_stats_interval) {
		tpcount = (vm_pageout_stats_max * pcount) /
			  vmstats.v_page_count + 1;
		if (pcount > tpcount)
			pcount = tpcount;
	} else {
		fullintervalcount = 0;
	}

	bzero(&marker, sizeof(marker));
	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
	marker.queue = PQ_ACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	vm_page_queues_spin_lock(PQ_ACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       pcount-- > 0)
	{
		int actcount;

		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
				   &marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Ignore pages we can't busy
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
		KKASSERT(m->queue == PQ_ACTIVE + q);

		/*
		 * We now have a safely busied page, the page and queue
		 * spinlocks have been released.
		 *
		 * Ignore held pages
		 */
		if (m->hold_count) {
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * Calculate activity
		 */
		actcount = 0;
		if (m->flags & PG_REFERENCED) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount += 1;
		}
		actcount += pmap_ts_referenced(m);

		/*
		 * Update act_count and move page to end of queue.
		 */
		if (actcount) {
			m->act_count += ACT_ADVANCE + actcount;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}

		if (m->act_count == 0) {
			/*
			 * We turn off page access, so that we have
			 * more accurate RSS stats.  We don't do this
			 * in the normal page deactivation when the
			 * system is loaded VM wise, because the
			 * cost of the large number of page protect
			 * operations would be higher than the value
			 * of doing the operation.
			 *
			 * We use the marker to save our place so
			 * we can release the spin lock.  Both (m)
			 * and (next) will be invalid.
			 */
			vm_page_protect(m, VM_PROT_NONE);
			vm_page_deactivate(m);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
		}
		vm_page_wakeup(m);
next:
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Remove our local marker
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

static int
vm_pageout_free_page_calc(vm_size_t count)
{
	if (count < vmstats.v_page_count)
		return 0;
	/*
	 * free_reserved needs to include enough for the largest swap pager
	 * structures plus enough for any pv_entry structs when paging.
	 *
	 * v_free_min		normal allocations
	 * v_free_reserved	system allocations
	 * v_pageout_free_min	allocations by pageout daemon
	 * v_interrupt_free_min	low level allocations (e.g swap structures)
	 */
	if (vmstats.v_page_count > 1024)
		vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200;
	else
		vmstats.v_free_min = 64;

	/*
	 * Make sure the vmmeter slop can't blow out our global minimums.
	 *
	 * However, to accommodate weird configurations (vkernels with many
	 * cpus and little memory, or artificially reduced hw.physmem), do
	 * not allow v_free_min to exceed 1/20 of ram or the pageout daemon
	 * will go out of control.
	 */
	if (vmstats.v_free_min < VMMETER_SLOP_COUNT * ncpus * 10)
		vmstats.v_free_min = VMMETER_SLOP_COUNT * ncpus * 10;
	if (vmstats.v_free_min > vmstats.v_page_count / 20)
		vmstats.v_free_min = vmstats.v_page_count / 20;

	vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
	vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
	vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
	vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;

	return 1;
}


/*
 * vm_pageout is the high level pageout daemon.
 *
 * No requirements.
 */
static void
vm_pageout_thread(void)
{
	int pass;
	int q;
	int q1iterator = 0;
	int q2iterator = 0;

	/*
	 * Initialize some paging parameters.
	 */
	curthread->td_flags |= TDF_SYSTHREAD;

	vm_pageout_free_page_calc(vmstats.v_page_count);

	/*
	 * v_free_target and v_cache_min control pageout hysteresis.  Note
	 * that these are more a measure of the VM cache queue hysteresis
	 * than the VM free queue.  Specifically, v_free_target is the
	 * high water mark (free+cache pages).
	 *
	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
	 * low water mark, while v_free_min is the stop.  v_cache_min must
	 * be big enough to handle memory needs while the pageout daemon
	 * is signalled and run to free more pages.
	 */
	if (vmstats.v_free_count > 6144)
		vmstats.v_free_target = 4 * vmstats.v_free_min +
					vmstats.v_free_reserved;
	else
		vmstats.v_free_target = 2 * vmstats.v_free_min +
					vmstats.v_free_reserved;

	/*
	 * NOTE: With the new buffer cache b_act_count we want the default
	 *	 inactive target to be a percentage of available memory.
	 *
	 *	 The inactive target essentially determines the minimum
	 *	 number of 'temporary' pages capable of caching one-time-use
	 *	 files when the VM system is otherwise full of pages
	 *	 belonging to multi-time-use files or active program data.
	 *
	 * NOTE: The inactive target is aggressively pursued only if the
	 *	 inactive queue becomes too small.  If the inactive queue
	 *	 is large enough to satisfy page movement to free+cache
	 *	 then it is repopulated more slowly from the active queue.
	 *	 This allows a general inactive_target default to be set.
	 *
	 *	 There is an issue here for processes which sit mostly idle
	 *	 'overnight', such as sshd, tcsh, and X.  Any movement from
	 *	 the active queue will eventually cause such pages to
	 *	 recycle, eventually causing a lot of paging in the morning.
	 *	 To reduce the incidence of this, pages cycled out of the
	 *	 buffer cache are moved directly to the inactive queue if
	 *	 they were only used once or twice.
	 *
	 *	 The vfs.vm_cycle_point sysctl can be used to adjust this.
	 *	 Increasing the value (up to 64) increases the number of
	 *	 buffer recyclements which go directly to the inactive queue.
	 */
	if (vmstats.v_free_count > 2048) {
		vmstats.v_cache_min = vmstats.v_free_target;
		vmstats.v_cache_max = 2 * vmstats.v_cache_min;
	} else {
		vmstats.v_cache_min = 0;
		vmstats.v_cache_max = 0;
	}
	vmstats.v_inactive_target = vmstats.v_free_count / 4;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = vmstats.v_free_count / 3;

	if (vm_pageout_stats_max == 0)
		vm_pageout_stats_max = vmstats.v_free_target;

	/*
	 * Set interval in seconds for stats scan.
	 */
	if (vm_pageout_stats_interval == 0)
		vm_pageout_stats_interval = 5;
	if (vm_pageout_full_stats_interval == 0)
		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;


	/*
	 * Set maximum free per pass
	 */
	if (vm_pageout_stats_free_max == 0)
		vm_pageout_stats_free_max = 5;

	swap_pager_swap_init();
	pass = 0;

	/*
	 * The pageout daemon is never done, so loop forever.
	 */
	while (TRUE) {
		int error;
		int avail_shortage;
		int inactive_shortage;
		int vnodes_skipped = 0;
		int recycle_count = 0;
		int tmp;

		/*
		 * Wait for an action request.  If we timeout, check to
		 * see if paging is needed (in case the normal wakeup
		 * code raced us).
		 */
		if (vm_pages_needed == 0) {
			error = tsleep(&vm_pages_needed,
				       0, "psleep",
				       vm_pageout_stats_interval * hz);
			if (error &&
			    vm_paging_needed() == 0 &&
			    vm_pages_needed == 0) {
				for (q = 0; q < PQ_L2_SIZE; ++q)
					vm_pageout_page_stats(q);
				continue;
			}
			vm_pages_needed = 1;
		}

		mycpu->gd_cnt.v_pdwakeups++;

		/*
		 * Scan for INACTIVE->CLEAN/PAGEOUT
		 *
		 * This routine tries to avoid thrashing the system with
		 * unnecessary activity.
		 *
		 * Calculate our target for the number of free+cache pages we
		 * want to get to.  This is higher than the number that causes
		 * allocations to stall (severe) in order to provide hysteresis,
		 * and if we don't make it all the way but get to the minimum
		 * we're happy.  Goose it a bit if there are multiple requests
		 * for memory.
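		 *
		 * (avail_shortage below is essentially vm_paging_target(),
		 * i.e. roughly how far the current free+cache page counts
		 * fall short of the free/cache targets, plus any explicitly
		 * requested deficit; when it is <= 0 the queues are already
		 * in good shape and the inactive scan is skipped.)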
		 *
		 * Don't reduce avail_shortage inside the loop or the
		 * PQAVERAGE() calculation will break.
		 *
		 * NOTE! deficit is differentiated from avail_shortage as
		 *	 REQUIRING at least (deficit) pages to be cleaned,
		 *	 even if the page queues are in good shape.  This
		 *	 is used primarily for handling per-process
		 *	 RLIMIT_RSS and may also see small values when
		 *	 processes block due to low memory.
		 */
		vmstats_rollup();
		avail_shortage = vm_paging_target() + vm_pageout_deficit;
		vm_pageout_deficit = 0;

		if (avail_shortage > 0) {
			int delta = 0;

			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_inactive(
						pass,
						(q + q1iterator) & PQ_L2_MASK,
						PQAVERAGE(avail_shortage),
						&vnodes_skipped);
				if (avail_shortage - delta <= 0)
					break;
			}
			avail_shortage -= delta;
			q1iterator = q + 1;
		}

		/*
		 * Figure out how many active pages we must deactivate.  If
		 * we were able to reach our target with just the inactive
		 * scan above we limit the number of active pages we
		 * deactivate to reduce unnecessary work.
		 */
		vmstats_rollup();
		inactive_shortage = vmstats.v_inactive_target -
				    vmstats.v_inactive_count;

		/*
		 * If we were unable to free sufficient inactive pages to
		 * satisfy the free/cache queue requirements then simply
		 * reaching the inactive target may not be good enough.
		 * Try to deactivate pages in excess of the target based
		 * on the shortfall.
		 *
		 * However, to prevent thrashing the VM system, do not
		 * deactivate more than an additional 1/10 of the inactive
		 * target's worth of active pages.
		 */
		if (avail_shortage > 0) {
			tmp = avail_shortage * 2;
			if (tmp > vmstats.v_inactive_target / 10)
				tmp = vmstats.v_inactive_target / 10;
			inactive_shortage += tmp;
		}

		/*
		 * Only trigger a pmap cleanup on inactive shortage.
		 */
		if (inactive_shortage > 0) {
			pmap_collect();
		}

		/*
		 * Scan for ACTIVE->INACTIVE
		 *
		 * Only trigger on inactive shortage.  Triggering on
		 * avail_shortage can starve the active queue with
		 * unnecessary active->inactive transitions and destroy
		 * performance.
		 */
		if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
			int delta = 0;

			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_active(
						pass,
						(q + q2iterator) & PQ_L2_MASK,
						PQAVERAGE(avail_shortage),
						PQAVERAGE(inactive_shortage),
						&recycle_count);
				if (inactive_shortage - delta <= 0 &&
				    avail_shortage - delta <= 0) {
					break;
				}
			}
			inactive_shortage -= delta;
			avail_shortage -= delta;
			q2iterator = q + 1;
		}

		/*
		 * Scan for CACHE->FREE
		 *
		 * Finally free enough cache pages to meet our free page
		 * requirement and take more drastic measures if we are
		 * still in trouble.
		 */
		vmstats_rollup();
		vm_pageout_scan_cache(avail_shortage, pass,
				      vnodes_skipped, recycle_count);

		/*
		 * Wait for more work.
		 */
		if (avail_shortage > 0) {
			++pass;
			if (pass < 10 && vm_pages_needed > 1) {
				/*
				 * Normal operation, additional processes
				 * have already kicked us.  Retry immediately
				 * unless swap space is completely full in
				 * which case delay a bit.
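				 *
				 * (The delays below are expressed in ticks:
				 * hz/5 is roughly 200ms, hz/10 roughly
				 * 100ms, and a full hz about one second.)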
				 */
				if (swap_pager_full) {
					tsleep(&vm_pages_needed, 0, "pdelay",
					       hz / 5);
				} /* else immediate retry */
			} else if (pass < 10) {
				/*
				 * Normal operation, fewer processes.  Delay
				 * a bit but allow wakeups.
				 */
				vm_pages_needed = 0;
				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
				vm_pages_needed = 1;
			} else if (swap_pager_full == 0) {
				/*
				 * We've taken too many passes, forced delay.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
			} else {
				/*
				 * Running out of memory, catastrophic
				 * back-off to one-second intervals.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz);
			}
		} else if (vm_pages_needed) {
			/*
			 * Interlocked wakeup of waiters (non-optional).
			 *
			 * Similar to vm_page_free_wakeup() in vm_page.c,
			 * wake any threads sleeping on v_free_count once
			 * the free/cache counts are no longer critically
			 * low.
			 */
			pass = 0;
			if (!vm_page_count_min(vm_page_free_hysteresis) ||
			    !vm_page_count_target()) {
				vm_pages_needed = 0;
				wakeup(&vmstats.v_free_count);
			}
		} else {
			pass = 0;
		}
	}
}

static struct kproc_desc page_kp = {
	"pagedaemon",
	vm_pageout_thread,
	&pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp);


/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active, bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
	if (vm_paging_needed() && curthread != pagethread) {
		if (vm_pages_needed == 0) {
			vm_pages_needed = 1;	/* SMP race ok */
			wakeup(&vm_pages_needed);
		} else if (vm_page_count_min(0)) {
			++vm_pages_needed;	/* SMP race ok */
		}
	}
}

#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
 */
static void
vm_req_vmdaemon(void)
{
	static int lastrun = 0;

	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);
		lastrun = ticks;
	}
}

static int vm_daemon_callback(struct proc *p, void *data __unused);

/*
 * No requirements.
 */
static void
vm_daemon(void)
{
	int req_swapout;

	while (TRUE) {
		tsleep(&vm_daemon_needed, 0, "psleep", 0);
		req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);

		/*
		 * forced swapouts
		 */
		if (req_swapout)
			swapout_procs(vm_pageout_req_swapout);

		/*
		 * scan the processes for exceeding their rlimits or if
		 * process is swapped out -- deactivate pages
		 */
		allproc_scan(vm_daemon_callback, NULL);
	}
}

static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
	struct vmspace *vm;
	vm_pindex_t limit, size;

	/*
	 * if this is a system process or if we have already
	 * looked at this process, skip it.
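	 *
	 * (We hold the per-process token across the checks below so that
	 * the flags, run state, and RLIMIT_RSS values we inspect remain
	 * stable while this callback examines the process.)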
	 */
	lwkt_gettoken(&p->p_token);

	if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * if the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * get a limit
	 */
	limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
				p->p_rlimit[RLIMIT_RSS].rlim_max));

	/*
	 * let processes that are swapped out really be
	 * swapped out.  Set the limit to nothing to get as
	 * many pages out to swap as possible.
	 */
	if (p->p_flags & P_SWAPPEDOUT)
		limit = 0;

	vm = p->p_vmspace;
	vmspace_hold(vm);
	size = pmap_resident_tlnw_count(&vm->vm_pmap);
	if (limit >= 0 && size > 4096 &&
	    size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
		vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
	}
	vmspace_drop(vm);

	lwkt_reltoken(&p->p_token);

	return (0);
}

#endif