1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * The Mach Operating System project at Carnegie-Mellon University. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 37 * 38 * 39 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 40 * All rights reserved. 41 * 42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 43 * 44 * Permission to use, copy, modify and distribute this software and 45 * its documentation is hereby granted, provided that both the copyright 46 * notice and this permission notice appear in all copies of the 47 * software, derivative works or modified versions, and any portions 48 * thereof, and that both notices appear in supporting documentation. 49 * 50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 51 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 53 * 54 * Carnegie Mellon requests users of this software to return to 55 * 56 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 57 * School of Computer Science 58 * Carnegie Mellon University 59 * Pittsburgh PA 15213-3890 60 * 61 * any improvements or extensions that they make and grant Carnegie the 62 * rights to redistribute these changes. 63 * 64 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $ 65 */ 66 67 /* 68 * The proverbial page-out daemon. 69 */ 70 71 #include "opt_vm.h" 72 #include <sys/param.h> 73 #include <sys/systm.h> 74 #include <sys/kernel.h> 75 #include <sys/proc.h> 76 #include <sys/kthread.h> 77 #include <sys/resourcevar.h> 78 #include <sys/signalvar.h> 79 #include <sys/vnode.h> 80 #include <sys/vmmeter.h> 81 #include <sys/sysctl.h> 82 83 #include <vm/vm.h> 84 #include <vm/vm_param.h> 85 #include <sys/lock.h> 86 #include <vm/vm_object.h> 87 #include <vm/vm_page.h> 88 #include <vm/vm_map.h> 89 #include <vm/vm_pageout.h> 90 #include <vm/vm_pager.h> 91 #include <vm/swap_pager.h> 92 #include <vm/vm_extern.h> 93 94 #include <sys/thread2.h> 95 #include <sys/spinlock2.h> 96 #include <vm/vm_page2.h> 97 98 /* 99 * System initialization 100 */ 101 102 /* the kernel process "vm_pageout"*/ 103 static int vm_pageout_page(vm_page_t m, int *max_launderp, 104 int *vnodes_skippedp, struct vnode **vpfailedp, 105 int pass, int vmflush_flags); 106 static int vm_pageout_clean_helper (vm_page_t, int); 107 static int vm_pageout_free_page_calc (vm_size_t count); 108 static void vm_pageout_page_free(vm_page_t m) ; 109 struct thread *pagethread; 110 111 #if !defined(NO_SWAPPING) 112 /* the kernel process "vm_daemon"*/ 113 static void vm_daemon (void); 114 static struct thread *vmthread; 115 116 static struct kproc_desc vm_kp = { 117 "vmdaemon", 118 vm_daemon, 119 &vmthread 120 }; 121 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); 122 #endif 123 124 int vm_pages_needed = 0; /* Event on which pageout daemon sleeps */ 125 int vm_pageout_deficit = 0; /* Estimated number of pages deficit */ 126 int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */ 127 int vm_page_free_hysteresis = 16; 128 129 #if !defined(NO_SWAPPING) 130 static int vm_pageout_req_swapout; 131 static int vm_daemon_needed; 132 #endif 133 static int vm_max_launder = 4096; 134 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; 135 static int vm_pageout_full_stats_interval = 0; 136 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; 137 static int defer_swap_pageouts=0; 138 static int disable_swap_pageouts=0; 139 static u_int vm_anonmem_decline = ACT_DECLINE; 140 static u_int vm_filemem_decline = ACT_DECLINE * 2; 141 142 #if defined(NO_SWAPPING) 143 static int vm_swap_enabled=0; 144 static int vm_swap_idle_enabled=0; 145 #else 146 static int vm_swap_enabled=1; 147 static int vm_swap_idle_enabled=0; 148 #endif 149 int vm_pageout_memuse_mode=1; /* 0-disable, 1-passive, 2-active swp*/ 150 151 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline, 152 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory"); 153 154 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline, 155 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache"); 156 157 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis, 158 CTLFLAG_RW, &vm_page_free_hysteresis, 0, 159 "Free more pages than the minimum required"); 160 161 SYSCTL_INT(_vm, OID_AUTO, max_launder, 162 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 163 164 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, 165 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 166 167 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, 168 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); 169 170 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, 171 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); 172 173 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, 174 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); 175 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode, 176 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode"); 177 178 #if defined(NO_SWAPPING) 179 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 180 CTLFLAG_RD, &vm_swap_enabled, 0, ""); 181 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 182 CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); 183 #else 184 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 185 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 186 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 187 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 188 #endif 189 190 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 191 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 192 193 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 194 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 195 196 static int pageout_lock_miss; 197 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 198 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 199 200 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 201 202 #if !defined(NO_SWAPPING) 203 static void vm_req_vmdaemon (void); 204 #endif 205 static void vm_pageout_page_stats(int q); 206 207 /* 208 * Calculate approximately how many pages on each queue to try to 209 * clean. An exact calculation creates an edge condition when the 210 * queues are unbalanced so add significant slop. The queue scans 211 * will stop early when targets are reached and will start where they 212 * left off on the next pass. 213 * 214 * We need to be generous here because there are all sorts of loading 215 * conditions that can cause edge cases if try to average over all queues. 216 * In particular, storage subsystems have become so fast that paging 217 * activity can become quite frantic. Eventually we will probably need 218 * two paging threads, one for dirty pages and one for clean, to deal 219 * with the bandwidth requirements. 220 221 * So what we do is calculate a value that can be satisfied nominally by 222 * only having to scan half the queues. 223 */ 224 static __inline int 225 PQAVERAGE(int n) 226 { 227 int avg; 228 229 if (n >= 0) { 230 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1); 231 } else { 232 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1); 233 } 234 return avg; 235 } 236 237 /* 238 * vm_pageout_clean_helper: 239 * 240 * Clean the page and remove it from the laundry. The page must be busied 241 * by the caller and will be disposed of (put away, flushed) by this routine. 242 */ 243 static int 244 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags) 245 { 246 vm_object_t object; 247 vm_page_t mc[BLIST_MAX_ALLOC]; 248 int error; 249 int ib, is, page_base; 250 vm_pindex_t pindex = m->pindex; 251 252 object = m->object; 253 254 /* 255 * Don't mess with the page if it's held or special. 256 * 257 * XXX do we really need to check hold_count here? hold_count 258 * isn't supposed to mess with vm_page ops except prevent the 259 * page from being reused. 260 */ 261 if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) { 262 vm_page_wakeup(m); 263 return 0; 264 } 265 266 /* 267 * Place page in cluster. Align cluster for optimal swap space 268 * allocation (whether it is swap or not). This is typically ~16-32 269 * pages, which also tends to align the cluster to multiples of the 270 * filesystem block size if backed by a filesystem. 271 */ 272 page_base = pindex % BLIST_MAX_ALLOC; 273 mc[page_base] = m; 274 ib = page_base - 1; 275 is = page_base + 1; 276 277 /* 278 * Scan object for clusterable pages. 279 * 280 * We can cluster ONLY if: ->> the page is NOT 281 * clean, wired, busy, held, or mapped into a 282 * buffer, and one of the following: 283 * 1) The page is inactive, or a seldom used 284 * active page. 285 * -or- 286 * 2) we force the issue. 287 * 288 * During heavy mmap/modification loads the pageout 289 * daemon can really fragment the underlying file 290 * due to flushing pages out of order and not trying 291 * align the clusters (which leave sporatic out-of-order 292 * holes). To solve this problem we do the reverse scan 293 * first and attempt to align our cluster, then do a 294 * forward scan if room remains. 295 */ 296 vm_object_hold(object); 297 298 while (ib >= 0) { 299 vm_page_t p; 300 301 p = vm_page_lookup_busy_try(object, pindex - page_base + ib, 302 TRUE, &error); 303 if (error || p == NULL) 304 break; 305 if ((p->queue - p->pc) == PQ_CACHE || 306 (p->flags & PG_UNMANAGED)) { 307 vm_page_wakeup(p); 308 break; 309 } 310 vm_page_test_dirty(p); 311 if (((p->dirty & p->valid) == 0 && 312 (p->flags & PG_NEED_COMMIT) == 0) || 313 p->wire_count != 0 || /* may be held by buf cache */ 314 p->hold_count != 0) { /* may be undergoing I/O */ 315 vm_page_wakeup(p); 316 break; 317 } 318 if (p->queue - p->pc != PQ_INACTIVE) { 319 if (p->queue - p->pc != PQ_ACTIVE || 320 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 321 vm_page_wakeup(p); 322 break; 323 } 324 } 325 326 /* 327 * Try to maintain page groupings in the cluster. 328 */ 329 if (m->flags & PG_WINATCFLS) 330 vm_page_flag_set(p, PG_WINATCFLS); 331 else 332 vm_page_flag_clear(p, PG_WINATCFLS); 333 p->act_count = m->act_count; 334 335 mc[ib] = p; 336 --ib; 337 } 338 ++ib; /* fixup */ 339 340 while (is < BLIST_MAX_ALLOC && 341 pindex - page_base + is < object->size) { 342 vm_page_t p; 343 344 p = vm_page_lookup_busy_try(object, pindex - page_base + is, 345 TRUE, &error); 346 if (error || p == NULL) 347 break; 348 if (((p->queue - p->pc) == PQ_CACHE) || 349 (p->flags & PG_UNMANAGED)) { 350 vm_page_wakeup(p); 351 break; 352 } 353 vm_page_test_dirty(p); 354 if (((p->dirty & p->valid) == 0 && 355 (p->flags & PG_NEED_COMMIT) == 0) || 356 p->wire_count != 0 || /* may be held by buf cache */ 357 p->hold_count != 0) { /* may be undergoing I/O */ 358 vm_page_wakeup(p); 359 break; 360 } 361 if (p->queue - p->pc != PQ_INACTIVE) { 362 if (p->queue - p->pc != PQ_ACTIVE || 363 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 364 vm_page_wakeup(p); 365 break; 366 } 367 } 368 369 /* 370 * Try to maintain page groupings in the cluster. 371 */ 372 if (m->flags & PG_WINATCFLS) 373 vm_page_flag_set(p, PG_WINATCFLS); 374 else 375 vm_page_flag_clear(p, PG_WINATCFLS); 376 p->act_count = m->act_count; 377 378 mc[is] = p; 379 ++is; 380 } 381 382 vm_object_drop(object); 383 384 /* 385 * we allow reads during pageouts... 386 */ 387 return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags); 388 } 389 390 /* 391 * vm_pageout_flush() - launder the given pages 392 * 393 * The given pages are laundered. Note that we setup for the start of 394 * I/O ( i.e. busy the page ), mark it read-only, and bump the object 395 * reference count all in here rather then in the parent. If we want 396 * the parent to do more sophisticated things we may have to change 397 * the ordering. 398 * 399 * The pages in the array must be busied by the caller and will be 400 * unbusied by this function. 401 */ 402 int 403 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags) 404 { 405 vm_object_t object; 406 int pageout_status[count]; 407 int numpagedout = 0; 408 int i; 409 410 /* 411 * Initiate I/O. Bump the vm_page_t->busy counter. 412 */ 413 for (i = 0; i < count; i++) { 414 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 415 ("vm_pageout_flush page %p index %d/%d: partially " 416 "invalid page", mc[i], i, count)); 417 vm_page_io_start(mc[i]); 418 } 419 420 /* 421 * We must make the pages read-only. This will also force the 422 * modified bit in the related pmaps to be cleared. The pager 423 * cannot clear the bit for us since the I/O completion code 424 * typically runs from an interrupt. The act of making the page 425 * read-only handles the case for us. 426 * 427 * Then we can unbusy the pages, we still hold a reference by virtue 428 * of our soft-busy. 429 */ 430 for (i = 0; i < count; i++) { 431 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) 432 vm_page_protect(mc[i], VM_PROT_NONE); 433 else 434 vm_page_protect(mc[i], VM_PROT_READ); 435 vm_page_wakeup(mc[i]); 436 } 437 438 object = mc[0]->object; 439 vm_object_pip_add(object, count); 440 441 vm_pager_put_pages(object, mc, count, 442 (vmflush_flags | 443 ((object == &kernel_object) ? 444 VM_PAGER_PUT_SYNC : 0)), 445 pageout_status); 446 447 for (i = 0; i < count; i++) { 448 vm_page_t mt = mc[i]; 449 450 switch (pageout_status[i]) { 451 case VM_PAGER_OK: 452 numpagedout++; 453 break; 454 case VM_PAGER_PEND: 455 numpagedout++; 456 break; 457 case VM_PAGER_BAD: 458 /* 459 * Page outside of range of object. Right now we 460 * essentially lose the changes by pretending it 461 * worked. 462 */ 463 vm_page_busy_wait(mt, FALSE, "pgbad"); 464 pmap_clear_modify(mt); 465 vm_page_undirty(mt); 466 vm_page_wakeup(mt); 467 break; 468 case VM_PAGER_ERROR: 469 case VM_PAGER_FAIL: 470 /* 471 * A page typically cannot be paged out when we 472 * have run out of swap. We leave the page 473 * marked inactive and will try to page it out 474 * again later. 475 * 476 * Starvation of the active page list is used to 477 * determine when the system is massively memory 478 * starved. 479 */ 480 break; 481 case VM_PAGER_AGAIN: 482 break; 483 } 484 485 /* 486 * If not PENDing this was a synchronous operation and we 487 * clean up after the I/O. If it is PENDing the mess is 488 * cleaned up asynchronously. 489 * 490 * Also nominally act on the caller's wishes if the caller 491 * wants to try to really clean (cache or free) the page. 492 * 493 * Also nominally deactivate the page if the system is 494 * memory-stressed. 495 */ 496 if (pageout_status[i] != VM_PAGER_PEND) { 497 vm_page_busy_wait(mt, FALSE, "pgouw"); 498 vm_page_io_finish(mt); 499 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) { 500 vm_page_try_to_cache(mt); 501 } else if (vm_page_count_severe()) { 502 vm_page_deactivate(mt); 503 vm_page_wakeup(mt); 504 } else { 505 vm_page_wakeup(mt); 506 } 507 vm_object_pip_wakeup(object); 508 } 509 } 510 return numpagedout; 511 } 512 513 #if !defined(NO_SWAPPING) 514 515 /* 516 * Callback function, page busied for us. We must dispose of the busy 517 * condition. Any related pmap pages may be held but will not be locked. 518 */ 519 static 520 int 521 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va, 522 vm_page_t p) 523 { 524 int actcount; 525 int cleanit = 0; 526 527 /* 528 * Basic tests - There should never be a marker, and we can stop 529 * once the RSS is below the required level. 530 */ 531 KKASSERT((p->flags & PG_MARKER) == 0); 532 if (pmap_resident_tlnw_count(info->pmap) <= info->limit) { 533 vm_page_wakeup(p); 534 return(-1); 535 } 536 537 mycpu->gd_cnt.v_pdpages++; 538 539 if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) { 540 vm_page_wakeup(p); 541 goto done; 542 } 543 544 ++info->actioncount; 545 546 /* 547 * Check if the page has been referened recently. If it has, 548 * activate it and skip. 549 */ 550 actcount = pmap_ts_referenced(p); 551 if (actcount) { 552 vm_page_flag_set(p, PG_REFERENCED); 553 } else if (p->flags & PG_REFERENCED) { 554 actcount = 1; 555 } 556 557 if (actcount) { 558 if (p->queue - p->pc != PQ_ACTIVE) { 559 vm_page_and_queue_spin_lock(p); 560 if (p->queue - p->pc != PQ_ACTIVE) { 561 vm_page_and_queue_spin_unlock(p); 562 vm_page_activate(p); 563 } else { 564 vm_page_and_queue_spin_unlock(p); 565 } 566 } else { 567 p->act_count += actcount; 568 if (p->act_count > ACT_MAX) 569 p->act_count = ACT_MAX; 570 } 571 vm_page_flag_clear(p, PG_REFERENCED); 572 vm_page_wakeup(p); 573 goto done; 574 } 575 576 /* 577 * Remove the page from this particular pmap. Once we do this, our 578 * pmap scans will not see it again (unless it gets faulted in), so 579 * we must actively dispose of or deal with the page. 580 */ 581 pmap_remove_specific(info->pmap, p); 582 583 /* 584 * If the page is not mapped to another process (i.e. as would be 585 * typical if this were a shared page from a library) then deactivate 586 * the page and clean it in two passes only. 587 * 588 * If the page hasn't been referenced since the last check, remove it 589 * from the pmap. If it is no longer mapped, deactivate it 590 * immediately, accelerating the normal decline. 591 * 592 * Once the page has been removed from the pmap the RSS code no 593 * longer tracks it so we have to make sure that it is staged for 594 * potential flush action. 595 */ 596 if ((p->flags & PG_MAPPED) == 0) { 597 if (p->queue - p->pc == PQ_ACTIVE) { 598 vm_page_deactivate(p); 599 } 600 if (p->queue - p->pc == PQ_INACTIVE) { 601 cleanit = 1; 602 } 603 } 604 605 /* 606 * Ok, try to fully clean the page and any nearby pages such that at 607 * least the requested page is freed or moved to the cache queue. 608 * 609 * We usually do this synchronously to allow us to get the page into 610 * the CACHE queue quickly, which will prevent memory exhaustion if 611 * a process with a memoryuse limit is running away. However, the 612 * sysadmin may desire to set vm.swap_user_async which relaxes this 613 * and improves write performance. 614 */ 615 if (cleanit) { 616 int max_launder = 0x7FFF; 617 int vnodes_skipped = 0; 618 int vmflush_flags; 619 struct vnode *vpfailed = NULL; 620 621 info->offset = va; 622 623 if (vm_pageout_memuse_mode >= 2) { 624 vmflush_flags = VM_PAGER_TRY_TO_CACHE | 625 VM_PAGER_ALLOW_ACTIVE; 626 if (swap_user_async == 0) 627 vmflush_flags |= VM_PAGER_PUT_SYNC; 628 vm_page_flag_set(p, PG_WINATCFLS); 629 info->cleancount += 630 vm_pageout_page(p, &max_launder, 631 &vnodes_skipped, 632 &vpfailed, 1, vmflush_flags); 633 } else { 634 vm_page_wakeup(p); 635 ++info->cleancount; 636 } 637 } else { 638 vm_page_wakeup(p); 639 } 640 641 /* 642 * Must be at end to avoid SMP races. 643 */ 644 done: 645 lwkt_user_yield(); 646 return 0; 647 } 648 649 /* 650 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits. 651 * that is relatively difficult to do. We try to keep track of where we 652 * left off last time to reduce scan overhead. 653 * 654 * Called when vm_pageout_memuse_mode is >= 1. 655 */ 656 void 657 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit) 658 { 659 vm_offset_t pgout_offset; 660 struct pmap_pgscan_info info; 661 int retries = 3; 662 663 pgout_offset = map->pgout_offset; 664 again: 665 #if 0 666 kprintf("%016jx ", pgout_offset); 667 #endif 668 if (pgout_offset < VM_MIN_USER_ADDRESS) 669 pgout_offset = VM_MIN_USER_ADDRESS; 670 if (pgout_offset >= VM_MAX_USER_ADDRESS) 671 pgout_offset = 0; 672 info.pmap = vm_map_pmap(map); 673 info.limit = limit; 674 info.beg_addr = pgout_offset; 675 info.end_addr = VM_MAX_USER_ADDRESS; 676 info.callback = vm_pageout_mdp_callback; 677 info.cleancount = 0; 678 info.actioncount = 0; 679 info.busycount = 0; 680 681 pmap_pgscan(&info); 682 pgout_offset = info.offset; 683 #if 0 684 kprintf("%016jx %08lx %08lx\n", pgout_offset, 685 info.cleancount, info.actioncount); 686 #endif 687 688 if (pgout_offset != VM_MAX_USER_ADDRESS && 689 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 690 goto again; 691 } else if (retries && 692 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 693 --retries; 694 goto again; 695 } 696 map->pgout_offset = pgout_offset; 697 } 698 #endif 699 700 /* 701 * Called when the pageout scan wants to free a page. We no longer 702 * try to cycle the vm_object here with a reference & dealloc, which can 703 * cause a non-trivial object collapse in a critical path. 704 * 705 * It is unclear why we cycled the ref_count in the past, perhaps to try 706 * to optimize shadow chain collapses but I don't quite see why it would 707 * be necessary. An OBJ_DEAD object should terminate any and all vm_pages 708 * synchronously and not have to be kicked-start. 709 */ 710 static void 711 vm_pageout_page_free(vm_page_t m) 712 { 713 vm_page_protect(m, VM_PROT_NONE); 714 vm_page_free(m); 715 } 716 717 /* 718 * vm_pageout_scan does the dirty work for the pageout daemon. 719 */ 720 struct vm_pageout_scan_info { 721 struct proc *bigproc; 722 vm_offset_t bigsize; 723 }; 724 725 static int vm_pageout_scan_callback(struct proc *p, void *data); 726 727 static int 728 vm_pageout_scan_inactive(int pass, int q, int avail_shortage, 729 int *vnodes_skipped) 730 { 731 vm_page_t m; 732 struct vm_page marker; 733 struct vnode *vpfailed; /* warning, allowed to be stale */ 734 int maxscan; 735 int delta = 0; 736 int max_launder; 737 738 /* 739 * Start scanning the inactive queue for pages we can move to the 740 * cache or free. The scan will stop when the target is reached or 741 * we have scanned the entire inactive queue. Note that m->act_count 742 * is not used to form decisions for the inactive queue, only for the 743 * active queue. 744 * 745 * max_launder limits the number of dirty pages we flush per scan. 746 * For most systems a smaller value (16 or 32) is more robust under 747 * extreme memory and disk pressure because any unnecessary writes 748 * to disk can result in extreme performance degredation. However, 749 * systems with excessive dirty pages (especially when MAP_NOSYNC is 750 * used) will die horribly with limited laundering. If the pageout 751 * daemon cannot clean enough pages in the first pass, we let it go 752 * all out in succeeding passes. 753 */ 754 if ((max_launder = vm_max_launder) <= 1) 755 max_launder = 1; 756 if (pass) 757 max_launder = 10000; 758 759 /* 760 * Initialize our marker 761 */ 762 bzero(&marker, sizeof(marker)); 763 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 764 marker.queue = PQ_INACTIVE + q; 765 marker.pc = q; 766 marker.wire_count = 1; 767 768 /* 769 * Inactive queue scan. 770 * 771 * NOTE: The vm_page must be spinlocked before the queue to avoid 772 * deadlocks, so it is easiest to simply iterate the loop 773 * with the queue unlocked at the top. 774 */ 775 vpfailed = NULL; 776 777 vm_page_queues_spin_lock(PQ_INACTIVE + q); 778 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 779 maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt; 780 781 /* 782 * Queue locked at top of loop to avoid stack marker issues. 783 */ 784 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 785 maxscan-- > 0 && avail_shortage - delta > 0) 786 { 787 int count; 788 789 KKASSERT(m->queue == PQ_INACTIVE + q); 790 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, 791 &marker, pageq); 792 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m, 793 &marker, pageq); 794 mycpu->gd_cnt.v_pdpages++; 795 796 /* 797 * Skip marker pages (atomic against other markers to avoid 798 * infinite hop-over scans). 799 */ 800 if (m->flags & PG_MARKER) 801 continue; 802 803 /* 804 * Try to busy the page. Don't mess with pages which are 805 * already busy or reorder them in the queue. 806 */ 807 if (vm_page_busy_try(m, TRUE)) 808 continue; 809 810 /* 811 * Remaining operations run with the page busy and neither 812 * the page or the queue will be spin-locked. 813 */ 814 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 815 KKASSERT(m->queue == PQ_INACTIVE + q); 816 817 count = vm_pageout_page(m, &max_launder, vnodes_skipped, 818 &vpfailed, pass, 0); 819 delta += count; 820 821 /* 822 * Systems with a ton of memory can wind up with huge 823 * deactivation counts. Because the inactive scan is 824 * doing a lot of flushing, the combination can result 825 * in excessive paging even in situations where other 826 * unrelated threads free up sufficient VM. 827 * 828 * To deal with this we abort the nominal active->inactive 829 * scan before we hit the inactive target when free+cache 830 * levels have reached a reasonable target. 831 * 832 * When deciding to stop early we need to add some slop to 833 * the test and we need to return full completion to the caller 834 * to prevent the caller from thinking there is something 835 * wrong and issuing a low-memory+swap warning or pkill. 836 * 837 * A deficit forces paging regardless of the state of the 838 * VM page queues (used for RSS enforcement). 839 */ 840 lwkt_yield(); 841 vm_page_queues_spin_lock(PQ_INACTIVE + q); 842 if (vm_paging_target() < -vm_max_launder) { 843 /* 844 * Stopping early, return full completion to caller. 845 */ 846 if (delta < avail_shortage) 847 delta = avail_shortage; 848 break; 849 } 850 } 851 852 /* page queue still spin-locked */ 853 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 854 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 855 856 return (delta); 857 } 858 859 /* 860 * Pageout the specified page, return the total number of pages paged out 861 * (this routine may cluster). 862 * 863 * The page must be busied and soft-busied by the caller and will be disposed 864 * of by this function. 865 */ 866 static int 867 vm_pageout_page(vm_page_t m, int *max_launderp, int *vnodes_skippedp, 868 struct vnode **vpfailedp, int pass, int vmflush_flags) 869 { 870 vm_object_t object; 871 int actcount; 872 int count = 0; 873 874 /* 875 * It is possible for a page to be busied ad-hoc (e.g. the 876 * pmap_collect() code) and wired and race against the 877 * allocation of a new page. vm_page_alloc() may be forced 878 * to deactivate the wired page in which case it winds up 879 * on the inactive queue and must be handled here. We 880 * correct the problem simply by unqueuing the page. 881 */ 882 if (m->wire_count) { 883 vm_page_unqueue_nowakeup(m); 884 vm_page_wakeup(m); 885 kprintf("WARNING: pagedaemon: wired page on " 886 "inactive queue %p\n", m); 887 return 0; 888 } 889 890 /* 891 * A held page may be undergoing I/O, so skip it. 892 */ 893 if (m->hold_count) { 894 vm_page_and_queue_spin_lock(m); 895 if (m->queue - m->pc == PQ_INACTIVE) { 896 TAILQ_REMOVE( 897 &vm_page_queues[m->queue].pl, m, pageq); 898 TAILQ_INSERT_TAIL( 899 &vm_page_queues[m->queue].pl, m, pageq); 900 ++vm_swapcache_inactive_heuristic; 901 } 902 vm_page_and_queue_spin_unlock(m); 903 vm_page_wakeup(m); 904 return 0; 905 } 906 907 if (m->object == NULL || m->object->ref_count == 0) { 908 /* 909 * If the object is not being used, we ignore previous 910 * references. 911 */ 912 vm_page_flag_clear(m, PG_REFERENCED); 913 pmap_clear_reference(m); 914 /* fall through to end */ 915 } else if (((m->flags & PG_REFERENCED) == 0) && 916 (actcount = pmap_ts_referenced(m))) { 917 /* 918 * Otherwise, if the page has been referenced while 919 * in the inactive queue, we bump the "activation 920 * count" upwards, making it less likely that the 921 * page will be added back to the inactive queue 922 * prematurely again. Here we check the page tables 923 * (or emulated bits, if any), given the upper level 924 * VM system not knowing anything about existing 925 * references. 926 */ 927 vm_page_activate(m); 928 m->act_count += (actcount + ACT_ADVANCE); 929 vm_page_wakeup(m); 930 return 0; 931 } 932 933 /* 934 * (m) is still busied. 935 * 936 * If the upper level VM system knows about any page 937 * references, we activate the page. We also set the 938 * "activation count" higher than normal so that we will less 939 * likely place pages back onto the inactive queue again. 940 */ 941 if ((m->flags & PG_REFERENCED) != 0) { 942 vm_page_flag_clear(m, PG_REFERENCED); 943 actcount = pmap_ts_referenced(m); 944 vm_page_activate(m); 945 m->act_count += (actcount + ACT_ADVANCE + 1); 946 vm_page_wakeup(m); 947 return 0; 948 } 949 950 /* 951 * If the upper level VM system doesn't know anything about 952 * the page being dirty, we have to check for it again. As 953 * far as the VM code knows, any partially dirty pages are 954 * fully dirty. 955 * 956 * Pages marked PG_WRITEABLE may be mapped into the user 957 * address space of a process running on another cpu. A 958 * user process (without holding the MP lock) running on 959 * another cpu may be able to touch the page while we are 960 * trying to remove it. vm_page_cache() will handle this 961 * case for us. 962 */ 963 if (m->dirty == 0) { 964 vm_page_test_dirty(m); 965 } else { 966 vm_page_dirty(m); 967 } 968 969 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 970 /* 971 * Invalid pages can be easily freed 972 */ 973 vm_pageout_page_free(m); 974 mycpu->gd_cnt.v_dfree++; 975 ++count; 976 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 977 /* 978 * Clean pages can be placed onto the cache queue. 979 * This effectively frees them. 980 */ 981 vm_page_cache(m); 982 ++count; 983 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { 984 /* 985 * Dirty pages need to be paged out, but flushing 986 * a page is extremely expensive verses freeing 987 * a clean page. Rather then artificially limiting 988 * the number of pages we can flush, we instead give 989 * dirty pages extra priority on the inactive queue 990 * by forcing them to be cycled through the queue 991 * twice before being flushed, after which the 992 * (now clean) page will cycle through once more 993 * before being freed. This significantly extends 994 * the thrash point for a heavily loaded machine. 995 */ 996 vm_page_flag_set(m, PG_WINATCFLS); 997 vm_page_and_queue_spin_lock(m); 998 if (m->queue - m->pc == PQ_INACTIVE) { 999 TAILQ_REMOVE( 1000 &vm_page_queues[m->queue].pl, m, pageq); 1001 TAILQ_INSERT_TAIL( 1002 &vm_page_queues[m->queue].pl, m, pageq); 1003 ++vm_swapcache_inactive_heuristic; 1004 } 1005 vm_page_and_queue_spin_unlock(m); 1006 vm_page_wakeup(m); 1007 } else if (*max_launderp > 0) { 1008 /* 1009 * We always want to try to flush some dirty pages if 1010 * we encounter them, to keep the system stable. 1011 * Normally this number is small, but under extreme 1012 * pressure where there are insufficient clean pages 1013 * on the inactive queue, we may have to go all out. 1014 */ 1015 int swap_pageouts_ok; 1016 struct vnode *vp = NULL; 1017 1018 swap_pageouts_ok = 0; 1019 object = m->object; 1020 if (object && 1021 (object->type != OBJT_SWAP) && 1022 (object->type != OBJT_DEFAULT)) { 1023 swap_pageouts_ok = 1; 1024 } else { 1025 swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts); 1026 swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts && 1027 vm_page_count_min(0)); 1028 } 1029 1030 /* 1031 * We don't bother paging objects that are "dead". 1032 * Those objects are in a "rundown" state. 1033 */ 1034 if (!swap_pageouts_ok || 1035 (object == NULL) || 1036 (object->flags & OBJ_DEAD)) { 1037 vm_page_and_queue_spin_lock(m); 1038 if (m->queue - m->pc == PQ_INACTIVE) { 1039 TAILQ_REMOVE( 1040 &vm_page_queues[m->queue].pl, 1041 m, pageq); 1042 TAILQ_INSERT_TAIL( 1043 &vm_page_queues[m->queue].pl, 1044 m, pageq); 1045 ++vm_swapcache_inactive_heuristic; 1046 } 1047 vm_page_and_queue_spin_unlock(m); 1048 vm_page_wakeup(m); 1049 return 0; 1050 } 1051 1052 /* 1053 * (m) is still busied. 1054 * 1055 * The object is already known NOT to be dead. It 1056 * is possible for the vget() to block the whole 1057 * pageout daemon, but the new low-memory handling 1058 * code should prevent it. 1059 * 1060 * The previous code skipped locked vnodes and, worse, 1061 * reordered pages in the queue. This results in 1062 * completely non-deterministic operation because, 1063 * quite often, a vm_fault has initiated an I/O and 1064 * is holding a locked vnode at just the point where 1065 * the pageout daemon is woken up. 1066 * 1067 * We can't wait forever for the vnode lock, we might 1068 * deadlock due to a vn_read() getting stuck in 1069 * vm_wait while holding this vnode. We skip the 1070 * vnode if we can't get it in a reasonable amount 1071 * of time. 1072 * 1073 * vpfailed is used to (try to) avoid the case where 1074 * a large number of pages are associated with a 1075 * locked vnode, which could cause the pageout daemon 1076 * to stall for an excessive amount of time. 1077 */ 1078 if (object->type == OBJT_VNODE) { 1079 int flags; 1080 1081 vp = object->handle; 1082 flags = LK_EXCLUSIVE; 1083 if (vp == *vpfailedp) 1084 flags |= LK_NOWAIT; 1085 else 1086 flags |= LK_TIMELOCK; 1087 vm_page_hold(m); 1088 vm_page_wakeup(m); 1089 1090 /* 1091 * We have unbusied (m) temporarily so we can 1092 * acquire the vp lock without deadlocking. 1093 * (m) is held to prevent destruction. 1094 */ 1095 if (vget(vp, flags) != 0) { 1096 *vpfailedp = vp; 1097 ++pageout_lock_miss; 1098 if (object->flags & OBJ_MIGHTBEDIRTY) 1099 ++*vnodes_skippedp; 1100 vm_page_unhold(m); 1101 return 0; 1102 } 1103 1104 /* 1105 * The page might have been moved to another 1106 * queue during potential blocking in vget() 1107 * above. The page might have been freed and 1108 * reused for another vnode. The object might 1109 * have been reused for another vnode. 1110 */ 1111 if (m->queue - m->pc != PQ_INACTIVE || 1112 m->object != object || 1113 object->handle != vp) { 1114 if (object->flags & OBJ_MIGHTBEDIRTY) 1115 ++*vnodes_skippedp; 1116 vput(vp); 1117 vm_page_unhold(m); 1118 return 0; 1119 } 1120 1121 /* 1122 * The page may have been busied during the 1123 * blocking in vput(); We don't move the 1124 * page back onto the end of the queue so that 1125 * statistics are more correct if we don't. 1126 */ 1127 if (vm_page_busy_try(m, TRUE)) { 1128 vput(vp); 1129 vm_page_unhold(m); 1130 return 0; 1131 } 1132 vm_page_unhold(m); 1133 1134 /* 1135 * (m) is busied again 1136 * 1137 * We own the busy bit and remove our hold 1138 * bit. If the page is still held it 1139 * might be undergoing I/O, so skip it. 1140 */ 1141 if (m->hold_count) { 1142 vm_page_and_queue_spin_lock(m); 1143 if (m->queue - m->pc == PQ_INACTIVE) { 1144 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 1145 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq); 1146 ++vm_swapcache_inactive_heuristic; 1147 } 1148 vm_page_and_queue_spin_unlock(m); 1149 if (object->flags & OBJ_MIGHTBEDIRTY) 1150 ++*vnodes_skippedp; 1151 vm_page_wakeup(m); 1152 vput(vp); 1153 return 0; 1154 } 1155 /* (m) is left busied as we fall through */ 1156 } 1157 1158 /* 1159 * page is busy and not held here. 1160 * 1161 * If a page is dirty, then it is either being washed 1162 * (but not yet cleaned) or it is still in the 1163 * laundry. If it is still in the laundry, then we 1164 * start the cleaning operation. 1165 * 1166 * decrement inactive_shortage on success to account 1167 * for the (future) cleaned page. Otherwise we 1168 * could wind up laundering or cleaning too many 1169 * pages. 1170 * 1171 * NOTE: Cleaning the page here does not cause 1172 * force_deficit to be adjusted, because the 1173 * page is not being freed or moved to the 1174 * cache. 1175 */ 1176 count = vm_pageout_clean_helper(m, vmflush_flags); 1177 *max_launderp -= count; 1178 1179 /* 1180 * Clean ate busy, page no longer accessible 1181 */ 1182 if (vp != NULL) 1183 vput(vp); 1184 } else { 1185 vm_page_wakeup(m); 1186 } 1187 return count; 1188 } 1189 1190 static int 1191 vm_pageout_scan_active(int pass, int q, 1192 int avail_shortage, int inactive_shortage, 1193 int *recycle_countp) 1194 { 1195 struct vm_page marker; 1196 vm_page_t m; 1197 int actcount; 1198 int delta = 0; 1199 int maxscan; 1200 1201 /* 1202 * We want to move pages from the active queue to the inactive 1203 * queue to get the inactive queue to the inactive target. If 1204 * we still have a page shortage from above we try to directly free 1205 * clean pages instead of moving them. 1206 * 1207 * If we do still have a shortage we keep track of the number of 1208 * pages we free or cache (recycle_count) as a measure of thrashing 1209 * between the active and inactive queues. 1210 * 1211 * If we were able to completely satisfy the free+cache targets 1212 * from the inactive pool we limit the number of pages we move 1213 * from the active pool to the inactive pool to 2x the pages we 1214 * had removed from the inactive pool (with a minimum of 1/5 the 1215 * inactive target). If we were not able to completely satisfy 1216 * the free+cache targets we go for the whole target aggressively. 1217 * 1218 * NOTE: Both variables can end up negative. 1219 * NOTE: We are still in a critical section. 1220 */ 1221 1222 bzero(&marker, sizeof(marker)); 1223 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 1224 marker.queue = PQ_ACTIVE + q; 1225 marker.pc = q; 1226 marker.wire_count = 1; 1227 1228 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1229 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1230 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt; 1231 1232 /* 1233 * Queue locked at top of loop to avoid stack marker issues. 1234 */ 1235 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1236 maxscan-- > 0 && (avail_shortage - delta > 0 || 1237 inactive_shortage > 0)) 1238 { 1239 KKASSERT(m->queue == PQ_ACTIVE + q); 1240 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1241 &marker, pageq); 1242 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1243 &marker, pageq); 1244 1245 /* 1246 * Skip marker pages (atomic against other markers to avoid 1247 * infinite hop-over scans). 1248 */ 1249 if (m->flags & PG_MARKER) 1250 continue; 1251 1252 /* 1253 * Try to busy the page. Don't mess with pages which are 1254 * already busy or reorder them in the queue. 1255 */ 1256 if (vm_page_busy_try(m, TRUE)) 1257 continue; 1258 1259 /* 1260 * Remaining operations run with the page busy and neither 1261 * the page or the queue will be spin-locked. 1262 */ 1263 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1264 KKASSERT(m->queue == PQ_ACTIVE + q); 1265 1266 /* 1267 * Don't deactivate pages that are held, even if we can 1268 * busy them. (XXX why not?) 1269 */ 1270 if (m->hold_count != 0) { 1271 vm_page_and_queue_spin_lock(m); 1272 if (m->queue - m->pc == PQ_ACTIVE) { 1273 TAILQ_REMOVE( 1274 &vm_page_queues[PQ_ACTIVE + q].pl, 1275 m, pageq); 1276 TAILQ_INSERT_TAIL( 1277 &vm_page_queues[PQ_ACTIVE + q].pl, 1278 m, pageq); 1279 } 1280 vm_page_and_queue_spin_unlock(m); 1281 vm_page_wakeup(m); 1282 goto next; 1283 } 1284 1285 /* 1286 * The count for pagedaemon pages is done after checking the 1287 * page for eligibility... 1288 */ 1289 mycpu->gd_cnt.v_pdpages++; 1290 1291 /* 1292 * Check to see "how much" the page has been used and clear 1293 * the tracking access bits. If the object has no references 1294 * don't bother paying the expense. 1295 */ 1296 actcount = 0; 1297 if (m->object && m->object->ref_count != 0) { 1298 if (m->flags & PG_REFERENCED) 1299 ++actcount; 1300 actcount += pmap_ts_referenced(m); 1301 if (actcount) { 1302 m->act_count += ACT_ADVANCE + actcount; 1303 if (m->act_count > ACT_MAX) 1304 m->act_count = ACT_MAX; 1305 } 1306 } 1307 vm_page_flag_clear(m, PG_REFERENCED); 1308 1309 /* 1310 * actcount is only valid if the object ref_count is non-zero. 1311 * If the page does not have an object, actcount will be zero. 1312 */ 1313 if (actcount && m->object->ref_count != 0) { 1314 vm_page_and_queue_spin_lock(m); 1315 if (m->queue - m->pc == PQ_ACTIVE) { 1316 TAILQ_REMOVE( 1317 &vm_page_queues[PQ_ACTIVE + q].pl, 1318 m, pageq); 1319 TAILQ_INSERT_TAIL( 1320 &vm_page_queues[PQ_ACTIVE + q].pl, 1321 m, pageq); 1322 } 1323 vm_page_and_queue_spin_unlock(m); 1324 vm_page_wakeup(m); 1325 } else { 1326 switch(m->object->type) { 1327 case OBJT_DEFAULT: 1328 case OBJT_SWAP: 1329 m->act_count -= min(m->act_count, 1330 vm_anonmem_decline); 1331 break; 1332 default: 1333 m->act_count -= min(m->act_count, 1334 vm_filemem_decline); 1335 break; 1336 } 1337 if (vm_pageout_algorithm || 1338 (m->object == NULL) || 1339 (m->object && (m->object->ref_count == 0)) || 1340 m->act_count < pass + 1 1341 ) { 1342 /* 1343 * Deactivate the page. If we had a 1344 * shortage from our inactive scan try to 1345 * free (cache) the page instead. 1346 * 1347 * Don't just blindly cache the page if 1348 * we do not have a shortage from the 1349 * inactive scan, that could lead to 1350 * gigabytes being moved. 1351 */ 1352 --inactive_shortage; 1353 if (avail_shortage - delta > 0 || 1354 (m->object && (m->object->ref_count == 0))) 1355 { 1356 if (avail_shortage - delta > 0) 1357 ++*recycle_countp; 1358 vm_page_protect(m, VM_PROT_NONE); 1359 if (m->dirty == 0 && 1360 (m->flags & PG_NEED_COMMIT) == 0 && 1361 avail_shortage - delta > 0) { 1362 vm_page_cache(m); 1363 } else { 1364 vm_page_deactivate(m); 1365 vm_page_wakeup(m); 1366 } 1367 } else { 1368 vm_page_deactivate(m); 1369 vm_page_wakeup(m); 1370 } 1371 ++delta; 1372 } else { 1373 vm_page_and_queue_spin_lock(m); 1374 if (m->queue - m->pc == PQ_ACTIVE) { 1375 TAILQ_REMOVE( 1376 &vm_page_queues[PQ_ACTIVE + q].pl, 1377 m, pageq); 1378 TAILQ_INSERT_TAIL( 1379 &vm_page_queues[PQ_ACTIVE + q].pl, 1380 m, pageq); 1381 } 1382 vm_page_and_queue_spin_unlock(m); 1383 vm_page_wakeup(m); 1384 } 1385 } 1386 next: 1387 lwkt_yield(); 1388 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1389 } 1390 1391 /* 1392 * Clean out our local marker. 1393 * 1394 * Page queue still spin-locked. 1395 */ 1396 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1397 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1398 1399 return (delta); 1400 } 1401 1402 /* 1403 * The number of actually free pages can drop down to v_free_reserved, 1404 * we try to build the free count back above v_free_min. Note that 1405 * vm_paging_needed() also returns TRUE if v_free_count is not at 1406 * least v_free_min so that is the minimum we must build the free 1407 * count to. 1408 * 1409 * We use a slightly higher target to improve hysteresis, 1410 * ((v_free_target + v_free_min) / 2). Since v_free_target 1411 * is usually the same as v_cache_min this maintains about 1412 * half the pages in the free queue as are in the cache queue, 1413 * providing pretty good pipelining for pageout operation. 1414 * 1415 * The system operator can manipulate vm.v_cache_min and 1416 * vm.v_free_target to tune the pageout demon. Be sure 1417 * to keep vm.v_free_min < vm.v_free_target. 1418 * 1419 * Note that the original paging target is to get at least 1420 * (free_min + cache_min) into (free + cache). The slightly 1421 * higher target will shift additional pages from cache to free 1422 * without effecting the original paging target in order to 1423 * maintain better hysteresis and not have the free count always 1424 * be dead-on v_free_min. 1425 * 1426 * NOTE: we are still in a critical section. 1427 * 1428 * Pages moved from PQ_CACHE to totally free are not counted in the 1429 * pages_freed counter. 1430 */ 1431 static void 1432 vm_pageout_scan_cache(int avail_shortage, int pass, 1433 int vnodes_skipped, int recycle_count) 1434 { 1435 static int lastkillticks; 1436 struct vm_pageout_scan_info info; 1437 vm_page_t m; 1438 1439 while (vmstats.v_free_count < 1440 (vmstats.v_free_min + vmstats.v_free_target) / 2) { 1441 /* 1442 * This steals some code from vm/vm_page.c 1443 */ 1444 static int cache_rover = 0; 1445 1446 m = vm_page_list_find(PQ_CACHE, cache_rover & PQ_L2_MASK); 1447 if (m == NULL) 1448 break; 1449 /* page is returned removed from its queue and spinlocked */ 1450 if (vm_page_busy_try(m, TRUE)) { 1451 vm_page_deactivate_locked(m); 1452 vm_page_spin_unlock(m); 1453 continue; 1454 } 1455 vm_page_spin_unlock(m); 1456 pagedaemon_wakeup(); 1457 lwkt_yield(); 1458 1459 /* 1460 * Remaining operations run with the page busy and neither 1461 * the page or the queue will be spin-locked. 1462 */ 1463 if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) || 1464 m->hold_count || 1465 m->wire_count) { 1466 vm_page_deactivate(m); 1467 vm_page_wakeup(m); 1468 continue; 1469 } 1470 KKASSERT((m->flags & PG_MAPPED) == 0); 1471 KKASSERT(m->dirty == 0); 1472 cache_rover += PQ_PRIME2; 1473 vm_pageout_page_free(m); 1474 mycpu->gd_cnt.v_dfree++; 1475 } 1476 1477 #if !defined(NO_SWAPPING) 1478 /* 1479 * Idle process swapout -- run once per second. 1480 */ 1481 if (vm_swap_idle_enabled) { 1482 static time_t lsec; 1483 if (time_uptime != lsec) { 1484 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE); 1485 vm_req_vmdaemon(); 1486 lsec = time_uptime; 1487 } 1488 } 1489 #endif 1490 1491 /* 1492 * If we didn't get enough free pages, and we have skipped a vnode 1493 * in a writeable object, wakeup the sync daemon. And kick swapout 1494 * if we did not get enough free pages. 1495 */ 1496 if (vm_paging_target() > 0) { 1497 if (vnodes_skipped && vm_page_count_min(0)) 1498 speedup_syncer(NULL); 1499 #if !defined(NO_SWAPPING) 1500 if (vm_swap_enabled && vm_page_count_target()) { 1501 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL); 1502 vm_req_vmdaemon(); 1503 } 1504 #endif 1505 } 1506 1507 /* 1508 * Handle catastrophic conditions. Under good conditions we should 1509 * be at the target, well beyond our minimum. If we could not even 1510 * reach our minimum the system is under heavy stress. But just being 1511 * under heavy stress does not trigger process killing. 1512 * 1513 * We consider ourselves to have run out of memory if the swap pager 1514 * is full and avail_shortage is still positive. The secondary check 1515 * ensures that we do not kill processes if the instantanious 1516 * availability is good, even if the pageout demon pass says it 1517 * couldn't get to the target. 1518 */ 1519 if (swap_pager_almost_full && 1520 pass > 0 && 1521 (vm_page_count_min(recycle_count) || avail_shortage > 0)) { 1522 kprintf("Warning: system low on memory+swap " 1523 "shortage %d for %d ticks!\n", 1524 avail_shortage, ticks - swap_fail_ticks); 1525 if (bootverbose) 1526 kprintf("Metrics: spaf=%d spf=%d pass=%d avail=%d target=%d last=%u\n", 1527 swap_pager_almost_full, 1528 swap_pager_full, 1529 pass, 1530 avail_shortage, 1531 vm_paging_target(), 1532 (unsigned int)(ticks - lastkillticks)); 1533 } 1534 if (swap_pager_full && 1535 pass > 1 && 1536 avail_shortage > 0 && 1537 vm_paging_target() > 0 && 1538 (unsigned int)(ticks - lastkillticks) >= hz) { 1539 /* 1540 * Kill something, maximum rate once per second to give 1541 * the process time to free up sufficient memory. 1542 */ 1543 lastkillticks = ticks; 1544 info.bigproc = NULL; 1545 info.bigsize = 0; 1546 allproc_scan(vm_pageout_scan_callback, &info); 1547 if (info.bigproc != NULL) { 1548 kprintf("Try to kill process %d %s\n", 1549 info.bigproc->p_pid, info.bigproc->p_comm); 1550 info.bigproc->p_nice = PRIO_MIN; 1551 info.bigproc->p_usched->resetpriority( 1552 FIRST_LWP_IN_PROC(info.bigproc)); 1553 atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL); 1554 killproc(info.bigproc, "out of swap space"); 1555 wakeup(&vmstats.v_free_count); 1556 PRELE(info.bigproc); 1557 } 1558 } 1559 } 1560 1561 static int 1562 vm_pageout_scan_callback(struct proc *p, void *data) 1563 { 1564 struct vm_pageout_scan_info *info = data; 1565 vm_offset_t size; 1566 1567 /* 1568 * Never kill system processes or init. If we have configured swap 1569 * then try to avoid killing low-numbered pids. 1570 */ 1571 if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) || 1572 ((p->p_pid < 48) && (vm_swap_size != 0))) { 1573 return (0); 1574 } 1575 1576 lwkt_gettoken(&p->p_token); 1577 1578 /* 1579 * if the process is in a non-running type state, 1580 * don't touch it. 1581 */ 1582 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 1583 lwkt_reltoken(&p->p_token); 1584 return (0); 1585 } 1586 1587 /* 1588 * Get the approximate process size. Note that anonymous pages 1589 * with backing swap will be counted twice, but there should not 1590 * be too many such pages due to the stress the VM system is 1591 * under at this point. 1592 */ 1593 size = vmspace_anonymous_count(p->p_vmspace) + 1594 vmspace_swap_count(p->p_vmspace); 1595 1596 /* 1597 * If the this process is bigger than the biggest one 1598 * remember it. 1599 */ 1600 if (info->bigsize < size) { 1601 if (info->bigproc) 1602 PRELE(info->bigproc); 1603 PHOLD(p); 1604 info->bigproc = p; 1605 info->bigsize = size; 1606 } 1607 lwkt_reltoken(&p->p_token); 1608 lwkt_yield(); 1609 1610 return(0); 1611 } 1612 1613 /* 1614 * This routine tries to maintain the pseudo LRU active queue, 1615 * so that during long periods of time where there is no paging, 1616 * that some statistic accumulation still occurs. This code 1617 * helps the situation where paging just starts to occur. 1618 */ 1619 static void 1620 vm_pageout_page_stats(int q) 1621 { 1622 static int fullintervalcount = 0; 1623 struct vm_page marker; 1624 vm_page_t m; 1625 int pcount, tpcount; /* Number of pages to check */ 1626 int page_shortage; 1627 1628 page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max + 1629 vmstats.v_free_min) - 1630 (vmstats.v_free_count + vmstats.v_inactive_count + 1631 vmstats.v_cache_count); 1632 1633 if (page_shortage <= 0) 1634 return; 1635 1636 pcount = vm_page_queues[PQ_ACTIVE + q].lcnt; 1637 fullintervalcount += vm_pageout_stats_interval; 1638 if (fullintervalcount < vm_pageout_full_stats_interval) { 1639 tpcount = (vm_pageout_stats_max * pcount) / 1640 vmstats.v_page_count + 1; 1641 if (pcount > tpcount) 1642 pcount = tpcount; 1643 } else { 1644 fullintervalcount = 0; 1645 } 1646 1647 bzero(&marker, sizeof(marker)); 1648 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 1649 marker.queue = PQ_ACTIVE + q; 1650 marker.pc = q; 1651 marker.wire_count = 1; 1652 1653 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1654 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1655 1656 /* 1657 * Queue locked at top of loop to avoid stack marker issues. 1658 */ 1659 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1660 pcount-- > 0) 1661 { 1662 int actcount; 1663 1664 KKASSERT(m->queue == PQ_ACTIVE + q); 1665 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1666 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1667 &marker, pageq); 1668 1669 /* 1670 * Skip marker pages (atomic against other markers to avoid 1671 * infinite hop-over scans). 1672 */ 1673 if (m->flags & PG_MARKER) 1674 continue; 1675 1676 /* 1677 * Ignore pages we can't busy 1678 */ 1679 if (vm_page_busy_try(m, TRUE)) 1680 continue; 1681 1682 /* 1683 * Remaining operations run with the page busy and neither 1684 * the page or the queue will be spin-locked. 1685 */ 1686 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1687 KKASSERT(m->queue == PQ_ACTIVE + q); 1688 1689 /* 1690 * We now have a safely busied page, the page and queue 1691 * spinlocks have been released. 1692 * 1693 * Ignore held pages 1694 */ 1695 if (m->hold_count) { 1696 vm_page_wakeup(m); 1697 goto next; 1698 } 1699 1700 /* 1701 * Calculate activity 1702 */ 1703 actcount = 0; 1704 if (m->flags & PG_REFERENCED) { 1705 vm_page_flag_clear(m, PG_REFERENCED); 1706 actcount += 1; 1707 } 1708 actcount += pmap_ts_referenced(m); 1709 1710 /* 1711 * Update act_count and move page to end of queue. 1712 */ 1713 if (actcount) { 1714 m->act_count += ACT_ADVANCE + actcount; 1715 if (m->act_count > ACT_MAX) 1716 m->act_count = ACT_MAX; 1717 vm_page_and_queue_spin_lock(m); 1718 if (m->queue - m->pc == PQ_ACTIVE) { 1719 TAILQ_REMOVE( 1720 &vm_page_queues[PQ_ACTIVE + q].pl, 1721 m, pageq); 1722 TAILQ_INSERT_TAIL( 1723 &vm_page_queues[PQ_ACTIVE + q].pl, 1724 m, pageq); 1725 } 1726 vm_page_and_queue_spin_unlock(m); 1727 vm_page_wakeup(m); 1728 goto next; 1729 } 1730 1731 if (m->act_count == 0) { 1732 /* 1733 * We turn off page access, so that we have 1734 * more accurate RSS stats. We don't do this 1735 * in the normal page deactivation when the 1736 * system is loaded VM wise, because the 1737 * cost of the large number of page protect 1738 * operations would be higher than the value 1739 * of doing the operation. 1740 * 1741 * We use the marker to save our place so 1742 * we can release the spin lock. both (m) 1743 * and (next) will be invalid. 1744 */ 1745 vm_page_protect(m, VM_PROT_NONE); 1746 vm_page_deactivate(m); 1747 } else { 1748 m->act_count -= min(m->act_count, ACT_DECLINE); 1749 vm_page_and_queue_spin_lock(m); 1750 if (m->queue - m->pc == PQ_ACTIVE) { 1751 TAILQ_REMOVE( 1752 &vm_page_queues[PQ_ACTIVE + q].pl, 1753 m, pageq); 1754 TAILQ_INSERT_TAIL( 1755 &vm_page_queues[PQ_ACTIVE + q].pl, 1756 m, pageq); 1757 } 1758 vm_page_and_queue_spin_unlock(m); 1759 } 1760 vm_page_wakeup(m); 1761 next: 1762 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1763 } 1764 1765 /* 1766 * Remove our local marker 1767 * 1768 * Page queue still spin-locked. 1769 */ 1770 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1771 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1772 } 1773 1774 static int 1775 vm_pageout_free_page_calc(vm_size_t count) 1776 { 1777 if (count < vmstats.v_page_count) 1778 return 0; 1779 /* 1780 * free_reserved needs to include enough for the largest swap pager 1781 * structures plus enough for any pv_entry structs when paging. 1782 * 1783 * v_free_min normal allocations 1784 * v_free_reserved system allocations 1785 * v_pageout_free_min allocations by pageout daemon 1786 * v_interrupt_free_min low level allocations (e.g swap structures) 1787 */ 1788 if (vmstats.v_page_count > 1024) 1789 vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200; 1790 else 1791 vmstats.v_free_min = 64; 1792 1793 /* 1794 * Make sure the vmmeter slop can't blow out our global minimums. 1795 * 1796 * However, to accomodate weird configurations (vkernels with many 1797 * cpus and little memory, or artifically reduced hw.physmem), do 1798 * not allow v_free_min to exceed 1/20 of ram or the pageout demon 1799 * will go out of control. 1800 */ 1801 if (vmstats.v_free_min < VMMETER_SLOP_COUNT * ncpus * 10) 1802 vmstats.v_free_min = VMMETER_SLOP_COUNT * ncpus * 10; 1803 if (vmstats.v_free_min > vmstats.v_page_count / 20) 1804 vmstats.v_free_min = vmstats.v_page_count / 20; 1805 1806 vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7; 1807 vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0; 1808 vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7; 1809 vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7; 1810 1811 return 1; 1812 } 1813 1814 1815 /* 1816 * vm_pageout is the high level pageout daemon. 1817 * 1818 * No requirements. 1819 */ 1820 static void 1821 vm_pageout_thread(void) 1822 { 1823 int pass; 1824 int q; 1825 int q1iterator = 0; 1826 int q2iterator = 0; 1827 1828 /* 1829 * Initialize some paging parameters. 1830 */ 1831 curthread->td_flags |= TDF_SYSTHREAD; 1832 1833 vm_pageout_free_page_calc(vmstats.v_page_count); 1834 1835 /* 1836 * v_free_target and v_cache_min control pageout hysteresis. Note 1837 * that these are more a measure of the VM cache queue hysteresis 1838 * then the VM free queue. Specifically, v_free_target is the 1839 * high water mark (free+cache pages). 1840 * 1841 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the 1842 * low water mark, while v_free_min is the stop. v_cache_min must 1843 * be big enough to handle memory needs while the pageout daemon 1844 * is signalled and run to free more pages. 1845 */ 1846 if (vmstats.v_free_count > 6144) 1847 vmstats.v_free_target = 4 * vmstats.v_free_min + 1848 vmstats.v_free_reserved; 1849 else 1850 vmstats.v_free_target = 2 * vmstats.v_free_min + 1851 vmstats.v_free_reserved; 1852 1853 /* 1854 * NOTE: With the new buffer cache b_act_count we want the default 1855 * inactive target to be a percentage of available memory. 1856 * 1857 * The inactive target essentially determines the minimum 1858 * number of 'temporary' pages capable of caching one-time-use 1859 * files when the VM system is otherwise full of pages 1860 * belonging to multi-time-use files or active program data. 1861 * 1862 * NOTE: The inactive target is aggressively persued only if the 1863 * inactive queue becomes too small. If the inactive queue 1864 * is large enough to satisfy page movement to free+cache 1865 * then it is repopulated more slowly from the active queue. 1866 * This allows a general inactive_target default to be set. 1867 * 1868 * There is an issue here for processes which sit mostly idle 1869 * 'overnight', such as sshd, tcsh, and X. Any movement from 1870 * the active queue will eventually cause such pages to 1871 * recycle eventually causing a lot of paging in the morning. 1872 * To reduce the incidence of this pages cycled out of the 1873 * buffer cache are moved directly to the inactive queue if 1874 * they were only used once or twice. 1875 * 1876 * The vfs.vm_cycle_point sysctl can be used to adjust this. 1877 * Increasing the value (up to 64) increases the number of 1878 * buffer recyclements which go directly to the inactive queue. 1879 */ 1880 if (vmstats.v_free_count > 2048) { 1881 vmstats.v_cache_min = vmstats.v_free_target; 1882 vmstats.v_cache_max = 2 * vmstats.v_cache_min; 1883 } else { 1884 vmstats.v_cache_min = 0; 1885 vmstats.v_cache_max = 0; 1886 } 1887 vmstats.v_inactive_target = vmstats.v_free_count / 4; 1888 1889 /* XXX does not really belong here */ 1890 if (vm_page_max_wired == 0) 1891 vm_page_max_wired = vmstats.v_free_count / 3; 1892 1893 if (vm_pageout_stats_max == 0) 1894 vm_pageout_stats_max = vmstats.v_free_target; 1895 1896 /* 1897 * Set interval in seconds for stats scan. 1898 */ 1899 if (vm_pageout_stats_interval == 0) 1900 vm_pageout_stats_interval = 5; 1901 if (vm_pageout_full_stats_interval == 0) 1902 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; 1903 1904 1905 /* 1906 * Set maximum free per pass 1907 */ 1908 if (vm_pageout_stats_free_max == 0) 1909 vm_pageout_stats_free_max = 5; 1910 1911 swap_pager_swap_init(); 1912 pass = 0; 1913 1914 /* 1915 * The pageout daemon is never done, so loop forever. 1916 */ 1917 while (TRUE) { 1918 int error; 1919 int avail_shortage; 1920 int inactive_shortage; 1921 int vnodes_skipped = 0; 1922 int recycle_count = 0; 1923 int tmp; 1924 1925 /* 1926 * Wait for an action request. If we timeout check to 1927 * see if paging is needed (in case the normal wakeup 1928 * code raced us). 1929 */ 1930 if (vm_pages_needed == 0) { 1931 error = tsleep(&vm_pages_needed, 1932 0, "psleep", 1933 vm_pageout_stats_interval * hz); 1934 if (error && 1935 vm_paging_needed() == 0 && 1936 vm_pages_needed == 0) { 1937 for (q = 0; q < PQ_L2_SIZE; ++q) 1938 vm_pageout_page_stats(q); 1939 continue; 1940 } 1941 vm_pages_needed = 1; 1942 } 1943 1944 mycpu->gd_cnt.v_pdwakeups++; 1945 1946 /* 1947 * Scan for INACTIVE->CLEAN/PAGEOUT 1948 * 1949 * This routine tries to avoid thrashing the system with 1950 * unnecessary activity. 1951 * 1952 * Calculate our target for the number of free+cache pages we 1953 * want to get to. This is higher then the number that causes 1954 * allocations to stall (severe) in order to provide hysteresis, 1955 * and if we don't make it all the way but get to the minimum 1956 * we're happy. Goose it a bit if there are multiple requests 1957 * for memory. 1958 * 1959 * Don't reduce avail_shortage inside the loop or the 1960 * PQAVERAGE() calculation will break. 1961 * 1962 * NOTE! deficit is differentiated from avail_shortage as 1963 * REQUIRING at least (deficit) pages to be cleaned, 1964 * even if the page queues are in good shape. This 1965 * is used primarily for handling per-process 1966 * RLIMIT_RSS and may also see small values when 1967 * processes block due to low memory. 1968 */ 1969 vmstats_rollup(); 1970 avail_shortage = vm_paging_target() + vm_pageout_deficit; 1971 vm_pageout_deficit = 0; 1972 1973 if (avail_shortage > 0) { 1974 int delta = 0; 1975 1976 for (q = 0; q < PQ_L2_SIZE; ++q) { 1977 delta += vm_pageout_scan_inactive( 1978 pass, 1979 (q + q1iterator) & PQ_L2_MASK, 1980 PQAVERAGE(avail_shortage), 1981 &vnodes_skipped); 1982 if (avail_shortage - delta <= 0) 1983 break; 1984 } 1985 avail_shortage -= delta; 1986 q1iterator = q + 1; 1987 } 1988 1989 /* 1990 * Figure out how many active pages we must deactivate. If 1991 * we were able to reach our target with just the inactive 1992 * scan above we limit the number of active pages we 1993 * deactivate to reduce unnecessary work. 1994 */ 1995 vmstats_rollup(); 1996 inactive_shortage = vmstats.v_inactive_target - 1997 vmstats.v_inactive_count; 1998 1999 /* 2000 * If we were unable to free sufficient inactive pages to 2001 * satisfy the free/cache queue requirements then simply 2002 * reaching the inactive target may not be good enough. 2003 * Try to deactivate pages in excess of the target based 2004 * on the shortfall. 2005 * 2006 * However to prevent thrashing the VM system do not 2007 * deactivate more than an additional 1/10 the inactive 2008 * target's worth of active pages. 2009 */ 2010 if (avail_shortage > 0) { 2011 tmp = avail_shortage * 2; 2012 if (tmp > vmstats.v_inactive_target / 10) 2013 tmp = vmstats.v_inactive_target / 10; 2014 inactive_shortage += tmp; 2015 } 2016 2017 /* 2018 * Only trigger a pmap cleanup on inactive shortage. 2019 */ 2020 if (inactive_shortage > 0) { 2021 pmap_collect(); 2022 } 2023 2024 /* 2025 * Scan for ACTIVE->INACTIVE 2026 * 2027 * Only trigger on inactive shortage. Triggering on 2028 * avail_shortage can starve the active queue with 2029 * unnecessary active->inactive transitions and destroy 2030 * performance. 2031 */ 2032 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) { 2033 int delta = 0; 2034 2035 for (q = 0; q < PQ_L2_SIZE; ++q) { 2036 delta += vm_pageout_scan_active( 2037 pass, 2038 (q + q2iterator) & PQ_L2_MASK, 2039 PQAVERAGE(avail_shortage), 2040 PQAVERAGE(inactive_shortage), 2041 &recycle_count); 2042 if (inactive_shortage - delta <= 0 && 2043 avail_shortage - delta <= 0) { 2044 break; 2045 } 2046 } 2047 inactive_shortage -= delta; 2048 avail_shortage -= delta; 2049 q2iterator = q + 1; 2050 } 2051 2052 /* 2053 * Scan for CACHE->FREE 2054 * 2055 * Finally free enough cache pages to meet our free page 2056 * requirement and take more drastic measures if we are 2057 * still in trouble. 2058 */ 2059 vmstats_rollup(); 2060 vm_pageout_scan_cache(avail_shortage, pass, 2061 vnodes_skipped, recycle_count); 2062 2063 /* 2064 * Wait for more work. 2065 */ 2066 if (avail_shortage > 0) { 2067 ++pass; 2068 if (pass < 10 && vm_pages_needed > 1) { 2069 /* 2070 * Normal operation, additional processes 2071 * have already kicked us. Retry immediately 2072 * unless swap space is completely full in 2073 * which case delay a bit. 2074 */ 2075 if (swap_pager_full) { 2076 tsleep(&vm_pages_needed, 0, "pdelay", 2077 hz / 5); 2078 } /* else immediate retry */ 2079 } else if (pass < 10) { 2080 /* 2081 * Normal operation, fewer processes. Delay 2082 * a bit but allow wakeups. 2083 */ 2084 vm_pages_needed = 0; 2085 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2086 vm_pages_needed = 1; 2087 } else if (swap_pager_full == 0) { 2088 /* 2089 * We've taken too many passes, forced delay. 2090 */ 2091 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2092 } else { 2093 /* 2094 * Running out of memory, catastrophic 2095 * back-off to one-second intervals. 2096 */ 2097 tsleep(&vm_pages_needed, 0, "pdelay", hz); 2098 } 2099 } else if (vm_pages_needed) { 2100 /* 2101 * Interlocked wakeup of waiters (non-optional). 2102 * 2103 * Similar to vm_page_free_wakeup() in vm_page.c, 2104 * wake 2105 */ 2106 pass = 0; 2107 if (!vm_page_count_min(vm_page_free_hysteresis) || 2108 !vm_page_count_target()) { 2109 vm_pages_needed = 0; 2110 wakeup(&vmstats.v_free_count); 2111 } 2112 } else { 2113 pass = 0; 2114 } 2115 } 2116 } 2117 2118 static struct kproc_desc page_kp = { 2119 "pagedaemon", 2120 vm_pageout_thread, 2121 &pagethread 2122 }; 2123 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp); 2124 2125 2126 /* 2127 * Called after allocating a page out of the cache or free queue 2128 * to possibly wake the pagedaemon up to replentish our supply. 2129 * 2130 * We try to generate some hysteresis by waking the pagedaemon up 2131 * when our free+cache pages go below the free_min+cache_min level. 2132 * The pagedaemon tries to get the count back up to at least the 2133 * minimum, and through to the target level if possible. 2134 * 2135 * If the pagedaemon is already active bump vm_pages_needed as a hint 2136 * that there are even more requests pending. 2137 * 2138 * SMP races ok? 2139 * No requirements. 2140 */ 2141 void 2142 pagedaemon_wakeup(void) 2143 { 2144 if (vm_paging_needed() && curthread != pagethread) { 2145 if (vm_pages_needed == 0) { 2146 vm_pages_needed = 1; /* SMP race ok */ 2147 wakeup(&vm_pages_needed); 2148 } else if (vm_page_count_min(0)) { 2149 ++vm_pages_needed; /* SMP race ok */ 2150 } 2151 } 2152 } 2153 2154 #if !defined(NO_SWAPPING) 2155 2156 /* 2157 * SMP races ok? 2158 * No requirements. 2159 */ 2160 static void 2161 vm_req_vmdaemon(void) 2162 { 2163 static int lastrun = 0; 2164 2165 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 2166 wakeup(&vm_daemon_needed); 2167 lastrun = ticks; 2168 } 2169 } 2170 2171 static int vm_daemon_callback(struct proc *p, void *data __unused); 2172 2173 /* 2174 * No requirements. 2175 */ 2176 static void 2177 vm_daemon(void) 2178 { 2179 int req_swapout; 2180 2181 while (TRUE) { 2182 tsleep(&vm_daemon_needed, 0, "psleep", 0); 2183 req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0); 2184 2185 /* 2186 * forced swapouts 2187 */ 2188 if (req_swapout) 2189 swapout_procs(vm_pageout_req_swapout); 2190 2191 /* 2192 * scan the processes for exceeding their rlimits or if 2193 * process is swapped out -- deactivate pages 2194 */ 2195 allproc_scan(vm_daemon_callback, NULL); 2196 } 2197 } 2198 2199 static int 2200 vm_daemon_callback(struct proc *p, void *data __unused) 2201 { 2202 struct vmspace *vm; 2203 vm_pindex_t limit, size; 2204 2205 /* 2206 * if this is a system process or if we have already 2207 * looked at this process, skip it. 2208 */ 2209 lwkt_gettoken(&p->p_token); 2210 2211 if (p->p_flags & (P_SYSTEM | P_WEXIT)) { 2212 lwkt_reltoken(&p->p_token); 2213 return (0); 2214 } 2215 2216 /* 2217 * if the process is in a non-running type state, 2218 * don't touch it. 2219 */ 2220 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 2221 lwkt_reltoken(&p->p_token); 2222 return (0); 2223 } 2224 2225 /* 2226 * get a limit 2227 */ 2228 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, 2229 p->p_rlimit[RLIMIT_RSS].rlim_max)); 2230 2231 /* 2232 * let processes that are swapped out really be 2233 * swapped out. Set the limit to nothing to get as 2234 * many pages out to swap as possible. 2235 */ 2236 if (p->p_flags & P_SWAPPEDOUT) 2237 limit = 0; 2238 2239 vm = p->p_vmspace; 2240 vmspace_hold(vm); 2241 size = pmap_resident_tlnw_count(&vm->vm_pmap); 2242 if (limit >= 0 && size > 4096 && 2243 size - 4096 >= limit && vm_pageout_memuse_mode >= 1) { 2244 vm_pageout_map_deactivate_pages(&vm->vm_map, limit); 2245 } 2246 vmspace_drop(vm); 2247 2248 lwkt_reltoken(&p->p_token); 2249 2250 return (0); 2251 } 2252 2253 #endif 2254