1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * The Mach Operating System project at Carnegie-Mellon University. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 37 * 38 * 39 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 40 * All rights reserved. 41 * 42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 43 * 44 * Permission to use, copy, modify and distribute this software and 45 * its documentation is hereby granted, provided that both the copyright 46 * notice and this permission notice appear in all copies of the 47 * software, derivative works or modified versions, and any portions 48 * thereof, and that both notices appear in supporting documentation. 49 * 50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 51 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 53 * 54 * Carnegie Mellon requests users of this software to return to 55 * 56 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 57 * School of Computer Science 58 * Carnegie Mellon University 59 * Pittsburgh PA 15213-3890 60 * 61 * any improvements or extensions that they make and grant Carnegie the 62 * rights to redistribute these changes. 63 * 64 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $ 65 */ 66 67 /* 68 * The proverbial page-out daemon. 69 */ 70 71 #include "opt_vm.h" 72 #include <sys/param.h> 73 #include <sys/systm.h> 74 #include <sys/kernel.h> 75 #include <sys/proc.h> 76 #include <sys/kthread.h> 77 #include <sys/resourcevar.h> 78 #include <sys/signalvar.h> 79 #include <sys/vnode.h> 80 #include <sys/vmmeter.h> 81 #include <sys/sysctl.h> 82 83 #include <vm/vm.h> 84 #include <vm/vm_param.h> 85 #include <sys/lock.h> 86 #include <vm/vm_object.h> 87 #include <vm/vm_page.h> 88 #include <vm/vm_map.h> 89 #include <vm/vm_pageout.h> 90 #include <vm/vm_pager.h> 91 #include <vm/swap_pager.h> 92 #include <vm/vm_extern.h> 93 94 #include <sys/thread2.h> 95 #include <sys/spinlock2.h> 96 #include <vm/vm_page2.h> 97 98 /* 99 * System initialization 100 */ 101 102 /* the kernel process "vm_pageout"*/ 103 static int vm_pageout_page(vm_page_t m, int *max_launderp, 104 int *vnodes_skippedp, struct vnode **vpfailedp, 105 int pass, int vmflush_flags); 106 static int vm_pageout_clean_helper (vm_page_t, int); 107 static int vm_pageout_free_page_calc (vm_size_t count); 108 static void vm_pageout_page_free(vm_page_t m) ; 109 struct thread *pagethread; 110 111 #if !defined(NO_SWAPPING) 112 /* the kernel process "vm_daemon"*/ 113 static void vm_daemon (void); 114 static struct thread *vmthread; 115 116 static struct kproc_desc vm_kp = { 117 "vmdaemon", 118 vm_daemon, 119 &vmthread 120 }; 121 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); 122 #endif 123 124 int vm_pages_needed = 0; /* Event on which pageout daemon sleeps */ 125 int vm_pageout_deficit = 0; /* Estimated number of pages deficit */ 126 int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */ 127 int vm_page_free_hysteresis = 16; 128 129 #if !defined(NO_SWAPPING) 130 static int vm_pageout_req_swapout; 131 static int vm_daemon_needed; 132 #endif 133 static int vm_max_launder = 4096; 134 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; 135 static int vm_pageout_full_stats_interval = 0; 136 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; 137 static int defer_swap_pageouts=0; 138 static int disable_swap_pageouts=0; 139 static u_int vm_anonmem_decline = ACT_DECLINE; 140 static u_int vm_filemem_decline = ACT_DECLINE * 2; 141 142 #if defined(NO_SWAPPING) 143 static int vm_swap_enabled=0; 144 static int vm_swap_idle_enabled=0; 145 #else 146 static int vm_swap_enabled=1; 147 static int vm_swap_idle_enabled=0; 148 #endif 149 int vm_pageout_memuse_mode=1; /* 0-disable, 1-passive, 2-active swp*/ 150 151 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline, 152 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory"); 153 154 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline, 155 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache"); 156 157 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis, 158 CTLFLAG_RW, &vm_page_free_hysteresis, 0, 159 "Free more pages than the minimum required"); 160 161 SYSCTL_INT(_vm, OID_AUTO, max_launder, 162 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 163 164 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, 165 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 166 167 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, 168 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); 169 170 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, 171 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); 172 173 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, 174 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); 175 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode, 176 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode"); 177 178 #if defined(NO_SWAPPING) 179 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 180 CTLFLAG_RD, &vm_swap_enabled, 0, ""); 181 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 182 CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); 183 #else 184 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 185 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 186 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 187 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 188 #endif 189 190 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 191 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 192 193 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 194 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 195 196 static int pageout_lock_miss; 197 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 198 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 199 200 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 201 202 #if !defined(NO_SWAPPING) 203 static void vm_req_vmdaemon (void); 204 #endif 205 static void vm_pageout_page_stats(int q); 206 207 /* 208 * Calculate approximately how many pages on each queue to try to 209 * clean. An exact calculation creates an edge condition when the 210 * queues are unbalanced so add significant slop. The queue scans 211 * will stop early when targets are reached and will start where they 212 * left off on the next pass. 213 * 214 * We need to be generous here because there are all sorts of loading 215 * conditions that can cause edge cases if try to average over all queues. 216 * In particular, storage subsystems have become so fast that paging 217 * activity can become quite frantic. Eventually we will probably need 218 * two paging threads, one for dirty pages and one for clean, to deal 219 * with the bandwidth requirements. 220 221 * So what we do is calculate a value that can be satisfied nominally by 222 * only having to scan half the queues. 223 */ 224 static __inline int 225 PQAVERAGE(int n) 226 { 227 int avg; 228 229 if (n >= 0) { 230 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1); 231 } else { 232 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1); 233 } 234 return avg; 235 } 236 237 /* 238 * vm_pageout_clean_helper: 239 * 240 * Clean the page and remove it from the laundry. The page must not be 241 * busy on-call. 242 * 243 * We set the busy bit to cause potential page faults on this page to 244 * block. Note the careful timing, however, the busy bit isn't set till 245 * late and we cannot do anything that will mess with the page. 246 */ 247 static int 248 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags) 249 { 250 vm_object_t object; 251 vm_page_t mc[BLIST_MAX_ALLOC]; 252 int error; 253 int ib, is, page_base; 254 vm_pindex_t pindex = m->pindex; 255 256 object = m->object; 257 258 /* 259 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP 260 * with the new swapper, but we could have serious problems paging 261 * out other object types if there is insufficient memory. 262 * 263 * Unfortunately, checking free memory here is far too late, so the 264 * check has been moved up a procedural level. 265 */ 266 267 /* 268 * Don't mess with the page if it's busy, held, or special 269 * 270 * XXX do we really need to check hold_count here? hold_count 271 * isn't supposed to mess with vm_page ops except prevent the 272 * page from being reused. 273 */ 274 if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) { 275 vm_page_wakeup(m); 276 return 0; 277 } 278 279 /* 280 * Place page in cluster. Align cluster for optimal swap space 281 * allocation (whether it is swap or not). This is typically ~16-32 282 * pages, which also tends to align the cluster to multiples of the 283 * filesystem block size if backed by a filesystem. 284 */ 285 page_base = pindex % BLIST_MAX_ALLOC; 286 mc[page_base] = m; 287 ib = page_base - 1; 288 is = page_base + 1; 289 290 /* 291 * Scan object for clusterable pages. 292 * 293 * We can cluster ONLY if: ->> the page is NOT 294 * clean, wired, busy, held, or mapped into a 295 * buffer, and one of the following: 296 * 1) The page is inactive, or a seldom used 297 * active page. 298 * -or- 299 * 2) we force the issue. 300 * 301 * During heavy mmap/modification loads the pageout 302 * daemon can really fragment the underlying file 303 * due to flushing pages out of order and not trying 304 * align the clusters (which leave sporatic out-of-order 305 * holes). To solve this problem we do the reverse scan 306 * first and attempt to align our cluster, then do a 307 * forward scan if room remains. 308 */ 309 vm_object_hold(object); 310 311 while (ib >= 0) { 312 vm_page_t p; 313 314 p = vm_page_lookup_busy_try(object, pindex - page_base + ib, 315 TRUE, &error); 316 if (error || p == NULL) 317 break; 318 if ((p->queue - p->pc) == PQ_CACHE || 319 (p->flags & PG_UNMANAGED)) { 320 vm_page_wakeup(p); 321 break; 322 } 323 vm_page_test_dirty(p); 324 if (((p->dirty & p->valid) == 0 && 325 (p->flags & PG_NEED_COMMIT) == 0) || 326 p->wire_count != 0 || /* may be held by buf cache */ 327 p->hold_count != 0) { /* may be undergoing I/O */ 328 vm_page_wakeup(p); 329 break; 330 } 331 if (p->queue - p->pc != PQ_INACTIVE) { 332 if (p->queue - p->pc != PQ_ACTIVE || 333 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 334 vm_page_wakeup(p); 335 break; 336 } 337 } 338 339 /* 340 * Try to maintain page groupings in the cluster. 341 */ 342 if (m->flags & PG_WINATCFLS) 343 vm_page_flag_set(p, PG_WINATCFLS); 344 else 345 vm_page_flag_clear(p, PG_WINATCFLS); 346 p->act_count = m->act_count; 347 348 mc[ib] = p; 349 --ib; 350 } 351 ++ib; /* fixup */ 352 353 while (is < BLIST_MAX_ALLOC && 354 pindex - page_base + is < object->size) { 355 vm_page_t p; 356 357 p = vm_page_lookup_busy_try(object, pindex - page_base + is, 358 TRUE, &error); 359 if (error || p == NULL) 360 break; 361 if (((p->queue - p->pc) == PQ_CACHE) || 362 (p->flags & PG_UNMANAGED)) { 363 vm_page_wakeup(p); 364 break; 365 } 366 vm_page_test_dirty(p); 367 if (((p->dirty & p->valid) == 0 && 368 (p->flags & PG_NEED_COMMIT) == 0) || 369 p->wire_count != 0 || /* may be held by buf cache */ 370 p->hold_count != 0) { /* may be undergoing I/O */ 371 vm_page_wakeup(p); 372 break; 373 } 374 if (p->queue - p->pc != PQ_INACTIVE) { 375 if (p->queue - p->pc != PQ_ACTIVE || 376 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 377 vm_page_wakeup(p); 378 break; 379 } 380 } 381 382 /* 383 * Try to maintain page groupings in the cluster. 384 */ 385 if (m->flags & PG_WINATCFLS) 386 vm_page_flag_set(p, PG_WINATCFLS); 387 else 388 vm_page_flag_clear(p, PG_WINATCFLS); 389 p->act_count = m->act_count; 390 391 mc[is] = p; 392 ++is; 393 } 394 395 vm_object_drop(object); 396 397 /* 398 * we allow reads during pageouts... 399 */ 400 return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags); 401 } 402 403 /* 404 * vm_pageout_flush() - launder the given pages 405 * 406 * The given pages are laundered. Note that we setup for the start of 407 * I/O ( i.e. busy the page ), mark it read-only, and bump the object 408 * reference count all in here rather then in the parent. If we want 409 * the parent to do more sophisticated things we may have to change 410 * the ordering. 411 * 412 * The pages in the array must be busied by the caller and will be 413 * unbusied by this function. 414 */ 415 int 416 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags) 417 { 418 vm_object_t object; 419 int pageout_status[count]; 420 int numpagedout = 0; 421 int i; 422 423 /* 424 * Initiate I/O. Bump the vm_page_t->busy counter. 425 */ 426 for (i = 0; i < count; i++) { 427 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 428 ("vm_pageout_flush page %p index %d/%d: partially " 429 "invalid page", mc[i], i, count)); 430 vm_page_io_start(mc[i]); 431 } 432 433 /* 434 * We must make the pages read-only. This will also force the 435 * modified bit in the related pmaps to be cleared. The pager 436 * cannot clear the bit for us since the I/O completion code 437 * typically runs from an interrupt. The act of making the page 438 * read-only handles the case for us. 439 * 440 * Then we can unbusy the pages, we still hold a reference by virtue 441 * of our soft-busy. 442 */ 443 for (i = 0; i < count; i++) { 444 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) 445 vm_page_protect(mc[i], VM_PROT_NONE); 446 else 447 vm_page_protect(mc[i], VM_PROT_READ); 448 vm_page_wakeup(mc[i]); 449 } 450 451 object = mc[0]->object; 452 vm_object_pip_add(object, count); 453 454 vm_pager_put_pages(object, mc, count, 455 (vmflush_flags | 456 ((object == &kernel_object) ? VM_PAGER_PUT_SYNC : 0)), 457 pageout_status); 458 459 for (i = 0; i < count; i++) { 460 vm_page_t mt = mc[i]; 461 462 switch (pageout_status[i]) { 463 case VM_PAGER_OK: 464 numpagedout++; 465 break; 466 case VM_PAGER_PEND: 467 numpagedout++; 468 break; 469 case VM_PAGER_BAD: 470 /* 471 * Page outside of range of object. Right now we 472 * essentially lose the changes by pretending it 473 * worked. 474 */ 475 vm_page_busy_wait(mt, FALSE, "pgbad"); 476 pmap_clear_modify(mt); 477 vm_page_undirty(mt); 478 vm_page_wakeup(mt); 479 break; 480 case VM_PAGER_ERROR: 481 case VM_PAGER_FAIL: 482 /* 483 * A page typically cannot be paged out when we 484 * have run out of swap. We leave the page 485 * marked inactive and will try to page it out 486 * again later. 487 * 488 * Starvation of the active page list is used to 489 * determine when the system is massively memory 490 * starved. 491 */ 492 break; 493 case VM_PAGER_AGAIN: 494 break; 495 } 496 497 /* 498 * If not PENDing this was a synchronous operation and we 499 * clean up after the I/O. If it is PENDing the mess is 500 * cleaned up asynchronously. 501 * 502 * Also nominally act on the caller's wishes if the caller 503 * wants to try to really clean (cache or free) the page. 504 * 505 * Also nominally deactivate the page if the system is 506 * memory-stressed. 507 */ 508 if (pageout_status[i] != VM_PAGER_PEND) { 509 vm_page_busy_wait(mt, FALSE, "pgouw"); 510 vm_page_io_finish(mt); 511 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) { 512 vm_page_try_to_cache(mt); 513 } else if (vm_page_count_severe()) { 514 vm_page_deactivate(mt); 515 vm_page_wakeup(mt); 516 } else { 517 vm_page_wakeup(mt); 518 } 519 vm_object_pip_wakeup(object); 520 } 521 } 522 return numpagedout; 523 } 524 525 #if !defined(NO_SWAPPING) 526 527 /* 528 * Callback function, page busied for us. We must dispose of the busy 529 * condition. Any related pmap pages may be held but will not be locked. 530 */ 531 static 532 int 533 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va, 534 vm_page_t p) 535 { 536 int actcount; 537 int cleanit = 0; 538 539 /* 540 * Basic tests - There should never be a marker, and we can stop 541 * once the RSS is below the required level. 542 */ 543 KKASSERT((p->flags & PG_MARKER) == 0); 544 if (pmap_resident_tlnw_count(info->pmap) <= info->limit) { 545 vm_page_wakeup(p); 546 return(-1); 547 } 548 549 mycpu->gd_cnt.v_pdpages++; 550 551 if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) { 552 vm_page_wakeup(p); 553 goto done; 554 } 555 556 ++info->actioncount; 557 558 /* 559 * Check if the page has been referened recently. If it has, 560 * activate it and skip. 561 */ 562 actcount = pmap_ts_referenced(p); 563 if (actcount) { 564 vm_page_flag_set(p, PG_REFERENCED); 565 } else if (p->flags & PG_REFERENCED) { 566 actcount = 1; 567 } 568 569 if (actcount) { 570 if (p->queue - p->pc != PQ_ACTIVE) { 571 vm_page_and_queue_spin_lock(p); 572 if (p->queue - p->pc != PQ_ACTIVE) { 573 vm_page_and_queue_spin_unlock(p); 574 vm_page_activate(p); 575 } else { 576 vm_page_and_queue_spin_unlock(p); 577 } 578 } else { 579 p->act_count += actcount; 580 if (p->act_count > ACT_MAX) 581 p->act_count = ACT_MAX; 582 } 583 vm_page_flag_clear(p, PG_REFERENCED); 584 vm_page_wakeup(p); 585 goto done; 586 } 587 588 /* 589 * Remove the page from this particular pmap. Once we do this, our 590 * pmap scans will not see it again (unless it gets faulted in), so 591 * we must actively dispose of or deal with the page. 592 */ 593 pmap_remove_specific(info->pmap, p); 594 595 /* 596 * If the page is not mapped to another process (i.e. as would be 597 * typical if this were a shared page from a library) then deactivate 598 * the page and clean it in two passes only. 599 * 600 * If the page hasn't been referenced since the last check, remove it 601 * from the pmap. If it is no longer mapped, deactivate it 602 * immediately, accelerating the normal decline. 603 * 604 * Once the page has been removed from the pmap the RSS code no 605 * longer tracks it so we have to make sure that it is staged for 606 * potential flush action. 607 */ 608 if ((p->flags & PG_MAPPED) == 0) { 609 if (p->queue - p->pc == PQ_ACTIVE) { 610 vm_page_deactivate(p); 611 } 612 if (p->queue - p->pc == PQ_INACTIVE) { 613 cleanit = 1; 614 } 615 } 616 617 /* 618 * Ok, try to fully clean the page and any nearby pages such that at 619 * least the requested page is freed or moved to the cache queue. 620 * 621 * We usually do this synchronously to allow us to get the page into 622 * the CACHE queue quickly, which will prevent memory exhaustion if 623 * a process with a memoryuse limit is running away. However, the 624 * sysadmin may desire to set vm.swap_user_async which relaxes this 625 * and improves write performance. 626 */ 627 if (cleanit) { 628 int max_launder = 0x7FFF; 629 int vnodes_skipped = 0; 630 int vmflush_flags; 631 struct vnode *vpfailed = NULL; 632 633 info->offset = va; 634 635 if (vm_pageout_memuse_mode >= 2) { 636 vmflush_flags = VM_PAGER_TRY_TO_CACHE | 637 VM_PAGER_ALLOW_ACTIVE; 638 if (swap_user_async == 0) 639 vmflush_flags |= VM_PAGER_PUT_SYNC; 640 vm_page_flag_set(p, PG_WINATCFLS); 641 info->cleancount += 642 vm_pageout_page(p, &max_launder, 643 &vnodes_skipped, 644 &vpfailed, 1, vmflush_flags); 645 } else { 646 vm_page_wakeup(p); 647 ++info->cleancount; 648 } 649 } else { 650 vm_page_wakeup(p); 651 } 652 done: 653 lwkt_user_yield(); 654 return 0; 655 } 656 657 /* 658 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits. 659 * that is relatively difficult to do. We try to keep track of where we 660 * left off last time to reduce scan overhead. 661 * 662 * Called when vm_pageout_memuse_mode is >= 1. 663 */ 664 void 665 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit) 666 { 667 vm_offset_t pgout_offset; 668 struct pmap_pgscan_info info; 669 int retries = 3; 670 671 pgout_offset = map->pgout_offset; 672 again: 673 #if 0 674 kprintf("%016jx ", pgout_offset); 675 #endif 676 if (pgout_offset < VM_MIN_USER_ADDRESS) 677 pgout_offset = VM_MIN_USER_ADDRESS; 678 if (pgout_offset >= VM_MAX_USER_ADDRESS) 679 pgout_offset = 0; 680 info.pmap = vm_map_pmap(map); 681 info.limit = limit; 682 info.beg_addr = pgout_offset; 683 info.end_addr = VM_MAX_USER_ADDRESS; 684 info.callback = vm_pageout_mdp_callback; 685 info.cleancount = 0; 686 info.actioncount = 0; 687 info.busycount = 0; 688 689 pmap_pgscan(&info); 690 pgout_offset = info.offset; 691 #if 0 692 kprintf("%016jx %08lx %08lx\n", pgout_offset, 693 info.cleancount, info.actioncount); 694 #endif 695 696 if (pgout_offset != VM_MAX_USER_ADDRESS && 697 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 698 goto again; 699 } else if (retries && 700 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 701 --retries; 702 goto again; 703 } 704 map->pgout_offset = pgout_offset; 705 } 706 #endif 707 708 /* 709 * Called when the pageout scan wants to free a page. We no longer 710 * try to cycle the vm_object here with a reference & dealloc, which can 711 * cause a non-trivial object collapse in a critical path. 712 * 713 * It is unclear why we cycled the ref_count in the past, perhaps to try 714 * to optimize shadow chain collapses but I don't quite see why it would 715 * be necessary. An OBJ_DEAD object should terminate any and all vm_pages 716 * synchronously and not have to be kicked-start. 717 */ 718 static void 719 vm_pageout_page_free(vm_page_t m) 720 { 721 vm_page_protect(m, VM_PROT_NONE); 722 vm_page_free(m); 723 } 724 725 /* 726 * vm_pageout_scan does the dirty work for the pageout daemon. 727 */ 728 struct vm_pageout_scan_info { 729 struct proc *bigproc; 730 vm_offset_t bigsize; 731 }; 732 733 static int vm_pageout_scan_callback(struct proc *p, void *data); 734 735 static int 736 vm_pageout_scan_inactive(int pass, int q, int avail_shortage, 737 int *vnodes_skipped) 738 { 739 vm_page_t m; 740 struct vm_page marker; 741 struct vnode *vpfailed; /* warning, allowed to be stale */ 742 int maxscan; 743 int delta = 0; 744 int max_launder; 745 746 /* 747 * Start scanning the inactive queue for pages we can move to the 748 * cache or free. The scan will stop when the target is reached or 749 * we have scanned the entire inactive queue. Note that m->act_count 750 * is not used to form decisions for the inactive queue, only for the 751 * active queue. 752 * 753 * max_launder limits the number of dirty pages we flush per scan. 754 * For most systems a smaller value (16 or 32) is more robust under 755 * extreme memory and disk pressure because any unnecessary writes 756 * to disk can result in extreme performance degredation. However, 757 * systems with excessive dirty pages (especially when MAP_NOSYNC is 758 * used) will die horribly with limited laundering. If the pageout 759 * daemon cannot clean enough pages in the first pass, we let it go 760 * all out in succeeding passes. 761 */ 762 if ((max_launder = vm_max_launder) <= 1) 763 max_launder = 1; 764 if (pass) 765 max_launder = 10000; 766 767 /* 768 * Initialize our marker 769 */ 770 bzero(&marker, sizeof(marker)); 771 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 772 marker.queue = PQ_INACTIVE + q; 773 marker.pc = q; 774 marker.wire_count = 1; 775 776 /* 777 * Inactive queue scan. 778 * 779 * NOTE: The vm_page must be spinlocked before the queue to avoid 780 * deadlocks, so it is easiest to simply iterate the loop 781 * with the queue unlocked at the top. 782 */ 783 vpfailed = NULL; 784 785 vm_page_queues_spin_lock(PQ_INACTIVE + q); 786 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 787 maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt; 788 789 /* 790 * Queue locked at top of loop to avoid stack marker issues. 791 */ 792 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 793 maxscan-- > 0 && avail_shortage - delta > 0) 794 { 795 int count; 796 797 KKASSERT(m->queue == PQ_INACTIVE + q); 798 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, 799 &marker, pageq); 800 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m, 801 &marker, pageq); 802 mycpu->gd_cnt.v_pdpages++; 803 804 /* 805 * Skip marker pages (atomic against other markers to avoid 806 * infinite hop-over scans). 807 */ 808 if (m->flags & PG_MARKER) 809 continue; 810 811 /* 812 * Try to busy the page. Don't mess with pages which are 813 * already busy or reorder them in the queue. 814 */ 815 if (vm_page_busy_try(m, TRUE)) 816 continue; 817 818 /* 819 * Remaining operations run with the page busy and neither 820 * the page or the queue will be spin-locked. 821 */ 822 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 823 KKASSERT(m->queue == PQ_INACTIVE + q); 824 825 count = vm_pageout_page(m, &max_launder, vnodes_skipped, 826 &vpfailed, pass, 0); 827 delta += count; 828 829 /* 830 * Systems with a ton of memory can wind up with huge 831 * deactivation counts. Because the inactive scan is 832 * doing a lot of flushing, the combination can result 833 * in excessive paging even in situations where other 834 * unrelated threads free up sufficient VM. 835 * 836 * To deal with this we abort the nominal active->inactive 837 * scan before we hit the inactive target when free+cache 838 * levels have reached a reasonable target. 839 * 840 * When deciding to stop early we need to add some slop to 841 * the test and we need to return full completion to the caller 842 * to prevent the caller from thinking there is something 843 * wrong and issuing a low-memory+swap warning or pkill. 844 * 845 * A deficit forces paging regardless of the state of the 846 * VM page queues (used for RSS enforcement). 847 */ 848 lwkt_yield(); 849 vm_page_queues_spin_lock(PQ_INACTIVE + q); 850 if (vm_paging_target() < -vm_max_launder) { 851 /* 852 * Stopping early, return full completion to caller. 853 */ 854 if (delta < avail_shortage) 855 delta = avail_shortage; 856 break; 857 } 858 } 859 860 /* page queue still spin-locked */ 861 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 862 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 863 864 return (delta); 865 } 866 867 /* 868 * Pageout the specified page, return the total number of pages paged out 869 * (this routine may cluster). 870 * 871 * The page must be busied and soft-busied by the caller and will be disposed 872 * of by this function. 873 */ 874 static int 875 vm_pageout_page(vm_page_t m, int *max_launderp, int *vnodes_skippedp, 876 struct vnode **vpfailedp, int pass, int vmflush_flags) 877 { 878 vm_object_t object; 879 int actcount; 880 int count = 0; 881 882 /* 883 * It is possible for a page to be busied ad-hoc (e.g. the 884 * pmap_collect() code) and wired and race against the 885 * allocation of a new page. vm_page_alloc() may be forced 886 * to deactivate the wired page in which case it winds up 887 * on the inactive queue and must be handled here. We 888 * correct the problem simply by unqueuing the page. 889 */ 890 if (m->wire_count) { 891 vm_page_unqueue_nowakeup(m); 892 vm_page_wakeup(m); 893 kprintf("WARNING: pagedaemon: wired page on " 894 "inactive queue %p\n", m); 895 return 0; 896 } 897 898 /* 899 * A held page may be undergoing I/O, so skip it. 900 */ 901 if (m->hold_count) { 902 vm_page_and_queue_spin_lock(m); 903 if (m->queue - m->pc == PQ_INACTIVE) { 904 TAILQ_REMOVE( 905 &vm_page_queues[m->queue].pl, m, pageq); 906 TAILQ_INSERT_TAIL( 907 &vm_page_queues[m->queue].pl, m, pageq); 908 ++vm_swapcache_inactive_heuristic; 909 } 910 vm_page_and_queue_spin_unlock(m); 911 vm_page_wakeup(m); 912 return 0; 913 } 914 915 if (m->object == NULL || m->object->ref_count == 0) { 916 /* 917 * If the object is not being used, we ignore previous 918 * references. 919 */ 920 vm_page_flag_clear(m, PG_REFERENCED); 921 pmap_clear_reference(m); 922 /* fall through to end */ 923 } else if (((m->flags & PG_REFERENCED) == 0) && 924 (actcount = pmap_ts_referenced(m))) { 925 /* 926 * Otherwise, if the page has been referenced while 927 * in the inactive queue, we bump the "activation 928 * count" upwards, making it less likely that the 929 * page will be added back to the inactive queue 930 * prematurely again. Here we check the page tables 931 * (or emulated bits, if any), given the upper level 932 * VM system not knowing anything about existing 933 * references. 934 */ 935 vm_page_activate(m); 936 m->act_count += (actcount + ACT_ADVANCE); 937 vm_page_wakeup(m); 938 return 0; 939 } 940 941 /* 942 * (m) is still busied. 943 * 944 * If the upper level VM system knows about any page 945 * references, we activate the page. We also set the 946 * "activation count" higher than normal so that we will less 947 * likely place pages back onto the inactive queue again. 948 */ 949 if ((m->flags & PG_REFERENCED) != 0) { 950 vm_page_flag_clear(m, PG_REFERENCED); 951 actcount = pmap_ts_referenced(m); 952 vm_page_activate(m); 953 m->act_count += (actcount + ACT_ADVANCE + 1); 954 vm_page_wakeup(m); 955 return 0; 956 } 957 958 /* 959 * If the upper level VM system doesn't know anything about 960 * the page being dirty, we have to check for it again. As 961 * far as the VM code knows, any partially dirty pages are 962 * fully dirty. 963 * 964 * Pages marked PG_WRITEABLE may be mapped into the user 965 * address space of a process running on another cpu. A 966 * user process (without holding the MP lock) running on 967 * another cpu may be able to touch the page while we are 968 * trying to remove it. vm_page_cache() will handle this 969 * case for us. 970 */ 971 if (m->dirty == 0) { 972 vm_page_test_dirty(m); 973 } else { 974 vm_page_dirty(m); 975 } 976 977 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 978 /* 979 * Invalid pages can be easily freed 980 */ 981 vm_pageout_page_free(m); 982 mycpu->gd_cnt.v_dfree++; 983 ++count; 984 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 985 /* 986 * Clean pages can be placed onto the cache queue. 987 * This effectively frees them. 988 */ 989 vm_page_cache(m); 990 ++count; 991 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { 992 /* 993 * Dirty pages need to be paged out, but flushing 994 * a page is extremely expensive verses freeing 995 * a clean page. Rather then artificially limiting 996 * the number of pages we can flush, we instead give 997 * dirty pages extra priority on the inactive queue 998 * by forcing them to be cycled through the queue 999 * twice before being flushed, after which the 1000 * (now clean) page will cycle through once more 1001 * before being freed. This significantly extends 1002 * the thrash point for a heavily loaded machine. 1003 */ 1004 vm_page_flag_set(m, PG_WINATCFLS); 1005 vm_page_and_queue_spin_lock(m); 1006 if (m->queue - m->pc == PQ_INACTIVE) { 1007 TAILQ_REMOVE( 1008 &vm_page_queues[m->queue].pl, m, pageq); 1009 TAILQ_INSERT_TAIL( 1010 &vm_page_queues[m->queue].pl, m, pageq); 1011 ++vm_swapcache_inactive_heuristic; 1012 } 1013 vm_page_and_queue_spin_unlock(m); 1014 vm_page_wakeup(m); 1015 } else if (*max_launderp > 0) { 1016 /* 1017 * We always want to try to flush some dirty pages if 1018 * we encounter them, to keep the system stable. 1019 * Normally this number is small, but under extreme 1020 * pressure where there are insufficient clean pages 1021 * on the inactive queue, we may have to go all out. 1022 */ 1023 int swap_pageouts_ok; 1024 struct vnode *vp = NULL; 1025 1026 swap_pageouts_ok = 0; 1027 object = m->object; 1028 if (object && 1029 (object->type != OBJT_SWAP) && 1030 (object->type != OBJT_DEFAULT)) { 1031 swap_pageouts_ok = 1; 1032 } else { 1033 swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts); 1034 swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts && 1035 vm_page_count_min(0)); 1036 } 1037 1038 /* 1039 * We don't bother paging objects that are "dead". 1040 * Those objects are in a "rundown" state. 1041 */ 1042 if (!swap_pageouts_ok || 1043 (object == NULL) || 1044 (object->flags & OBJ_DEAD)) { 1045 vm_page_and_queue_spin_lock(m); 1046 if (m->queue - m->pc == PQ_INACTIVE) { 1047 TAILQ_REMOVE( 1048 &vm_page_queues[m->queue].pl, 1049 m, pageq); 1050 TAILQ_INSERT_TAIL( 1051 &vm_page_queues[m->queue].pl, 1052 m, pageq); 1053 ++vm_swapcache_inactive_heuristic; 1054 } 1055 vm_page_and_queue_spin_unlock(m); 1056 vm_page_wakeup(m); 1057 return 0; 1058 } 1059 1060 /* 1061 * (m) is still busied. 1062 * 1063 * The object is already known NOT to be dead. It 1064 * is possible for the vget() to block the whole 1065 * pageout daemon, but the new low-memory handling 1066 * code should prevent it. 1067 * 1068 * The previous code skipped locked vnodes and, worse, 1069 * reordered pages in the queue. This results in 1070 * completely non-deterministic operation because, 1071 * quite often, a vm_fault has initiated an I/O and 1072 * is holding a locked vnode at just the point where 1073 * the pageout daemon is woken up. 1074 * 1075 * We can't wait forever for the vnode lock, we might 1076 * deadlock due to a vn_read() getting stuck in 1077 * vm_wait while holding this vnode. We skip the 1078 * vnode if we can't get it in a reasonable amount 1079 * of time. 1080 * 1081 * vpfailed is used to (try to) avoid the case where 1082 * a large number of pages are associated with a 1083 * locked vnode, which could cause the pageout daemon 1084 * to stall for an excessive amount of time. 1085 */ 1086 if (object->type == OBJT_VNODE) { 1087 int flags; 1088 1089 vp = object->handle; 1090 flags = LK_EXCLUSIVE; 1091 if (vp == *vpfailedp) 1092 flags |= LK_NOWAIT; 1093 else 1094 flags |= LK_TIMELOCK; 1095 vm_page_hold(m); 1096 vm_page_wakeup(m); 1097 1098 /* 1099 * We have unbusied (m) temporarily so we can 1100 * acquire the vp lock without deadlocking. 1101 * (m) is held to prevent destruction. 1102 */ 1103 if (vget(vp, flags) != 0) { 1104 *vpfailedp = vp; 1105 ++pageout_lock_miss; 1106 if (object->flags & OBJ_MIGHTBEDIRTY) 1107 ++*vnodes_skippedp; 1108 vm_page_unhold(m); 1109 return 0; 1110 } 1111 1112 /* 1113 * The page might have been moved to another 1114 * queue during potential blocking in vget() 1115 * above. The page might have been freed and 1116 * reused for another vnode. The object might 1117 * have been reused for another vnode. 1118 */ 1119 if (m->queue - m->pc != PQ_INACTIVE || 1120 m->object != object || 1121 object->handle != vp) { 1122 if (object->flags & OBJ_MIGHTBEDIRTY) 1123 ++*vnodes_skippedp; 1124 vput(vp); 1125 vm_page_unhold(m); 1126 return 0; 1127 } 1128 1129 /* 1130 * The page may have been busied during the 1131 * blocking in vput(); We don't move the 1132 * page back onto the end of the queue so that 1133 * statistics are more correct if we don't. 1134 */ 1135 if (vm_page_busy_try(m, TRUE)) { 1136 vput(vp); 1137 vm_page_unhold(m); 1138 return 0; 1139 } 1140 vm_page_unhold(m); 1141 1142 /* 1143 * (m) is busied again 1144 * 1145 * We own the busy bit and remove our hold 1146 * bit. If the page is still held it 1147 * might be undergoing I/O, so skip it. 1148 */ 1149 if (m->hold_count) { 1150 vm_page_and_queue_spin_lock(m); 1151 if (m->queue - m->pc == PQ_INACTIVE) { 1152 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 1153 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq); 1154 ++vm_swapcache_inactive_heuristic; 1155 } 1156 vm_page_and_queue_spin_unlock(m); 1157 if (object->flags & OBJ_MIGHTBEDIRTY) 1158 ++*vnodes_skippedp; 1159 vm_page_wakeup(m); 1160 vput(vp); 1161 return 0; 1162 } 1163 /* (m) is left busied as we fall through */ 1164 } 1165 1166 /* 1167 * page is busy and not held here. 1168 * 1169 * If a page is dirty, then it is either being washed 1170 * (but not yet cleaned) or it is still in the 1171 * laundry. If it is still in the laundry, then we 1172 * start the cleaning operation. 1173 * 1174 * decrement inactive_shortage on success to account 1175 * for the (future) cleaned page. Otherwise we 1176 * could wind up laundering or cleaning too many 1177 * pages. 1178 * 1179 * NOTE: Cleaning the page here does not cause 1180 * force_deficit to be adjusted, because the 1181 * page is not being freed or moved to the 1182 * cache. 1183 */ 1184 count = vm_pageout_clean_helper(m, vmflush_flags); 1185 *max_launderp -= count; 1186 1187 /* 1188 * Clean ate busy, page no longer accessible 1189 */ 1190 if (vp != NULL) 1191 vput(vp); 1192 } else { 1193 vm_page_wakeup(m); 1194 } 1195 return count; 1196 } 1197 1198 static int 1199 vm_pageout_scan_active(int pass, int q, 1200 int avail_shortage, int inactive_shortage, 1201 int *recycle_countp) 1202 { 1203 struct vm_page marker; 1204 vm_page_t m; 1205 int actcount; 1206 int delta = 0; 1207 int maxscan; 1208 1209 /* 1210 * We want to move pages from the active queue to the inactive 1211 * queue to get the inactive queue to the inactive target. If 1212 * we still have a page shortage from above we try to directly free 1213 * clean pages instead of moving them. 1214 * 1215 * If we do still have a shortage we keep track of the number of 1216 * pages we free or cache (recycle_count) as a measure of thrashing 1217 * between the active and inactive queues. 1218 * 1219 * If we were able to completely satisfy the free+cache targets 1220 * from the inactive pool we limit the number of pages we move 1221 * from the active pool to the inactive pool to 2x the pages we 1222 * had removed from the inactive pool (with a minimum of 1/5 the 1223 * inactive target). If we were not able to completely satisfy 1224 * the free+cache targets we go for the whole target aggressively. 1225 * 1226 * NOTE: Both variables can end up negative. 1227 * NOTE: We are still in a critical section. 1228 */ 1229 1230 bzero(&marker, sizeof(marker)); 1231 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 1232 marker.queue = PQ_ACTIVE + q; 1233 marker.pc = q; 1234 marker.wire_count = 1; 1235 1236 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1237 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1238 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt; 1239 1240 /* 1241 * Queue locked at top of loop to avoid stack marker issues. 1242 */ 1243 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1244 maxscan-- > 0 && (avail_shortage - delta > 0 || 1245 inactive_shortage > 0)) 1246 { 1247 KKASSERT(m->queue == PQ_ACTIVE + q); 1248 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1249 &marker, pageq); 1250 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1251 &marker, pageq); 1252 1253 /* 1254 * Skip marker pages (atomic against other markers to avoid 1255 * infinite hop-over scans). 1256 */ 1257 if (m->flags & PG_MARKER) 1258 continue; 1259 1260 /* 1261 * Try to busy the page. Don't mess with pages which are 1262 * already busy or reorder them in the queue. 1263 */ 1264 if (vm_page_busy_try(m, TRUE)) 1265 continue; 1266 1267 /* 1268 * Remaining operations run with the page busy and neither 1269 * the page or the queue will be spin-locked. 1270 */ 1271 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1272 KKASSERT(m->queue == PQ_ACTIVE + q); 1273 1274 /* 1275 * Don't deactivate pages that are held, even if we can 1276 * busy them. (XXX why not?) 1277 */ 1278 if (m->hold_count != 0) { 1279 vm_page_and_queue_spin_lock(m); 1280 if (m->queue - m->pc == PQ_ACTIVE) { 1281 TAILQ_REMOVE( 1282 &vm_page_queues[PQ_ACTIVE + q].pl, 1283 m, pageq); 1284 TAILQ_INSERT_TAIL( 1285 &vm_page_queues[PQ_ACTIVE + q].pl, 1286 m, pageq); 1287 } 1288 vm_page_and_queue_spin_unlock(m); 1289 vm_page_wakeup(m); 1290 goto next; 1291 } 1292 1293 /* 1294 * The count for pagedaemon pages is done after checking the 1295 * page for eligibility... 1296 */ 1297 mycpu->gd_cnt.v_pdpages++; 1298 1299 /* 1300 * Check to see "how much" the page has been used and clear 1301 * the tracking access bits. If the object has no references 1302 * don't bother paying the expense. 1303 */ 1304 actcount = 0; 1305 if (m->object && m->object->ref_count != 0) { 1306 if (m->flags & PG_REFERENCED) 1307 ++actcount; 1308 actcount += pmap_ts_referenced(m); 1309 if (actcount) { 1310 m->act_count += ACT_ADVANCE + actcount; 1311 if (m->act_count > ACT_MAX) 1312 m->act_count = ACT_MAX; 1313 } 1314 } 1315 vm_page_flag_clear(m, PG_REFERENCED); 1316 1317 /* 1318 * actcount is only valid if the object ref_count is non-zero. 1319 * If the page does not have an object, actcount will be zero. 1320 */ 1321 if (actcount && m->object->ref_count != 0) { 1322 vm_page_and_queue_spin_lock(m); 1323 if (m->queue - m->pc == PQ_ACTIVE) { 1324 TAILQ_REMOVE( 1325 &vm_page_queues[PQ_ACTIVE + q].pl, 1326 m, pageq); 1327 TAILQ_INSERT_TAIL( 1328 &vm_page_queues[PQ_ACTIVE + q].pl, 1329 m, pageq); 1330 } 1331 vm_page_and_queue_spin_unlock(m); 1332 vm_page_wakeup(m); 1333 } else { 1334 switch(m->object->type) { 1335 case OBJT_DEFAULT: 1336 case OBJT_SWAP: 1337 m->act_count -= min(m->act_count, 1338 vm_anonmem_decline); 1339 break; 1340 default: 1341 m->act_count -= min(m->act_count, 1342 vm_filemem_decline); 1343 break; 1344 } 1345 if (vm_pageout_algorithm || 1346 (m->object == NULL) || 1347 (m->object && (m->object->ref_count == 0)) || 1348 m->act_count < pass + 1 1349 ) { 1350 /* 1351 * Deactivate the page. If we had a 1352 * shortage from our inactive scan try to 1353 * free (cache) the page instead. 1354 * 1355 * Don't just blindly cache the page if 1356 * we do not have a shortage from the 1357 * inactive scan, that could lead to 1358 * gigabytes being moved. 1359 */ 1360 --inactive_shortage; 1361 if (avail_shortage - delta > 0 || 1362 (m->object && (m->object->ref_count == 0))) 1363 { 1364 if (avail_shortage - delta > 0) 1365 ++*recycle_countp; 1366 vm_page_protect(m, VM_PROT_NONE); 1367 if (m->dirty == 0 && 1368 (m->flags & PG_NEED_COMMIT) == 0 && 1369 avail_shortage - delta > 0) { 1370 vm_page_cache(m); 1371 } else { 1372 vm_page_deactivate(m); 1373 vm_page_wakeup(m); 1374 } 1375 } else { 1376 vm_page_deactivate(m); 1377 vm_page_wakeup(m); 1378 } 1379 ++delta; 1380 } else { 1381 vm_page_and_queue_spin_lock(m); 1382 if (m->queue - m->pc == PQ_ACTIVE) { 1383 TAILQ_REMOVE( 1384 &vm_page_queues[PQ_ACTIVE + q].pl, 1385 m, pageq); 1386 TAILQ_INSERT_TAIL( 1387 &vm_page_queues[PQ_ACTIVE + q].pl, 1388 m, pageq); 1389 } 1390 vm_page_and_queue_spin_unlock(m); 1391 vm_page_wakeup(m); 1392 } 1393 } 1394 next: 1395 lwkt_yield(); 1396 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1397 } 1398 1399 /* 1400 * Clean out our local marker. 1401 * 1402 * Page queue still spin-locked. 1403 */ 1404 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1405 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1406 1407 return (delta); 1408 } 1409 1410 /* 1411 * The number of actually free pages can drop down to v_free_reserved, 1412 * we try to build the free count back above v_free_min. Note that 1413 * vm_paging_needed() also returns TRUE if v_free_count is not at 1414 * least v_free_min so that is the minimum we must build the free 1415 * count to. 1416 * 1417 * We use a slightly higher target to improve hysteresis, 1418 * ((v_free_target + v_free_min) / 2). Since v_free_target 1419 * is usually the same as v_cache_min this maintains about 1420 * half the pages in the free queue as are in the cache queue, 1421 * providing pretty good pipelining for pageout operation. 1422 * 1423 * The system operator can manipulate vm.v_cache_min and 1424 * vm.v_free_target to tune the pageout demon. Be sure 1425 * to keep vm.v_free_min < vm.v_free_target. 1426 * 1427 * Note that the original paging target is to get at least 1428 * (free_min + cache_min) into (free + cache). The slightly 1429 * higher target will shift additional pages from cache to free 1430 * without effecting the original paging target in order to 1431 * maintain better hysteresis and not have the free count always 1432 * be dead-on v_free_min. 1433 * 1434 * NOTE: we are still in a critical section. 1435 * 1436 * Pages moved from PQ_CACHE to totally free are not counted in the 1437 * pages_freed counter. 1438 */ 1439 static void 1440 vm_pageout_scan_cache(int avail_shortage, int pass, 1441 int vnodes_skipped, int recycle_count) 1442 { 1443 static int lastkillticks; 1444 struct vm_pageout_scan_info info; 1445 vm_page_t m; 1446 1447 while (vmstats.v_free_count < 1448 (vmstats.v_free_min + vmstats.v_free_target) / 2) { 1449 /* 1450 * This steals some code from vm/vm_page.c 1451 */ 1452 static int cache_rover = 0; 1453 1454 m = vm_page_list_find(PQ_CACHE, 1455 cache_rover & PQ_L2_MASK, FALSE); 1456 if (m == NULL) 1457 break; 1458 /* page is returned removed from its queue and spinlocked */ 1459 if (vm_page_busy_try(m, TRUE)) { 1460 vm_page_deactivate_locked(m); 1461 vm_page_spin_unlock(m); 1462 continue; 1463 } 1464 vm_page_spin_unlock(m); 1465 pagedaemon_wakeup(); 1466 lwkt_yield(); 1467 1468 /* 1469 * Remaining operations run with the page busy and neither 1470 * the page or the queue will be spin-locked. 1471 */ 1472 if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) || 1473 m->hold_count || 1474 m->wire_count) { 1475 vm_page_deactivate(m); 1476 vm_page_wakeup(m); 1477 continue; 1478 } 1479 KKASSERT((m->flags & PG_MAPPED) == 0); 1480 KKASSERT(m->dirty == 0); 1481 cache_rover += PQ_PRIME2; 1482 vm_pageout_page_free(m); 1483 mycpu->gd_cnt.v_dfree++; 1484 } 1485 1486 #if !defined(NO_SWAPPING) 1487 /* 1488 * Idle process swapout -- run once per second. 1489 */ 1490 if (vm_swap_idle_enabled) { 1491 static time_t lsec; 1492 if (time_uptime != lsec) { 1493 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE); 1494 vm_req_vmdaemon(); 1495 lsec = time_uptime; 1496 } 1497 } 1498 #endif 1499 1500 /* 1501 * If we didn't get enough free pages, and we have skipped a vnode 1502 * in a writeable object, wakeup the sync daemon. And kick swapout 1503 * if we did not get enough free pages. 1504 */ 1505 if (vm_paging_target() > 0) { 1506 if (vnodes_skipped && vm_page_count_min(0)) 1507 speedup_syncer(NULL); 1508 #if !defined(NO_SWAPPING) 1509 if (vm_swap_enabled && vm_page_count_target()) { 1510 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL); 1511 vm_req_vmdaemon(); 1512 } 1513 #endif 1514 } 1515 1516 /* 1517 * Handle catastrophic conditions. Under good conditions we should 1518 * be at the target, well beyond our minimum. If we could not even 1519 * reach our minimum the system is under heavy stress. But just being 1520 * under heavy stress does not trigger process killing. 1521 * 1522 * We consider ourselves to have run out of memory if the swap pager 1523 * is full and avail_shortage is still positive. The secondary check 1524 * ensures that we do not kill processes if the instantanious 1525 * availability is good, even if the pageout demon pass says it 1526 * couldn't get to the target. 1527 */ 1528 if (swap_pager_almost_full && 1529 pass > 0 && 1530 (vm_page_count_min(recycle_count) || avail_shortage > 0)) { 1531 kprintf("Warning: system low on memory+swap " 1532 "shortage %d for %d ticks!\n", 1533 avail_shortage, ticks - swap_fail_ticks); 1534 } 1535 if (swap_pager_full && 1536 pass > 1 && 1537 avail_shortage > 0 && 1538 vm_paging_target() > 0 && 1539 (unsigned int)(ticks - lastkillticks) >= hz) { 1540 /* 1541 * Kill something, maximum rate once per second to give 1542 * the process time to free up sufficient memory. 1543 */ 1544 lastkillticks = ticks; 1545 info.bigproc = NULL; 1546 info.bigsize = 0; 1547 allproc_scan(vm_pageout_scan_callback, &info); 1548 if (info.bigproc != NULL) { 1549 info.bigproc->p_nice = PRIO_MIN; 1550 info.bigproc->p_usched->resetpriority( 1551 FIRST_LWP_IN_PROC(info.bigproc)); 1552 atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL); 1553 killproc(info.bigproc, "out of swap space"); 1554 wakeup(&vmstats.v_free_count); 1555 PRELE(info.bigproc); 1556 } 1557 } 1558 } 1559 1560 static int 1561 vm_pageout_scan_callback(struct proc *p, void *data) 1562 { 1563 struct vm_pageout_scan_info *info = data; 1564 vm_offset_t size; 1565 1566 /* 1567 * Never kill system processes or init. If we have configured swap 1568 * then try to avoid killing low-numbered pids. 1569 */ 1570 if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) || 1571 ((p->p_pid < 48) && (vm_swap_size != 0))) { 1572 return (0); 1573 } 1574 1575 lwkt_gettoken(&p->p_token); 1576 1577 /* 1578 * if the process is in a non-running type state, 1579 * don't touch it. 1580 */ 1581 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 1582 lwkt_reltoken(&p->p_token); 1583 return (0); 1584 } 1585 1586 /* 1587 * Get the approximate process size. Note that anonymous pages 1588 * with backing swap will be counted twice, but there should not 1589 * be too many such pages due to the stress the VM system is 1590 * under at this point. 1591 */ 1592 size = vmspace_anonymous_count(p->p_vmspace) + 1593 vmspace_swap_count(p->p_vmspace); 1594 1595 /* 1596 * If the this process is bigger than the biggest one 1597 * remember it. 1598 */ 1599 if (info->bigsize < size) { 1600 if (info->bigproc) 1601 PRELE(info->bigproc); 1602 PHOLD(p); 1603 info->bigproc = p; 1604 info->bigsize = size; 1605 } 1606 lwkt_reltoken(&p->p_token); 1607 lwkt_yield(); 1608 1609 return(0); 1610 } 1611 1612 /* 1613 * This routine tries to maintain the pseudo LRU active queue, 1614 * so that during long periods of time where there is no paging, 1615 * that some statistic accumulation still occurs. This code 1616 * helps the situation where paging just starts to occur. 1617 */ 1618 static void 1619 vm_pageout_page_stats(int q) 1620 { 1621 static int fullintervalcount = 0; 1622 struct vm_page marker; 1623 vm_page_t m; 1624 int pcount, tpcount; /* Number of pages to check */ 1625 int page_shortage; 1626 1627 page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max + 1628 vmstats.v_free_min) - 1629 (vmstats.v_free_count + vmstats.v_inactive_count + 1630 vmstats.v_cache_count); 1631 1632 if (page_shortage <= 0) 1633 return; 1634 1635 pcount = vm_page_queues[PQ_ACTIVE + q].lcnt; 1636 fullintervalcount += vm_pageout_stats_interval; 1637 if (fullintervalcount < vm_pageout_full_stats_interval) { 1638 tpcount = (vm_pageout_stats_max * pcount) / 1639 vmstats.v_page_count + 1; 1640 if (pcount > tpcount) 1641 pcount = tpcount; 1642 } else { 1643 fullintervalcount = 0; 1644 } 1645 1646 bzero(&marker, sizeof(marker)); 1647 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 1648 marker.queue = PQ_ACTIVE + q; 1649 marker.pc = q; 1650 marker.wire_count = 1; 1651 1652 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1653 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1654 1655 /* 1656 * Queue locked at top of loop to avoid stack marker issues. 1657 */ 1658 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1659 pcount-- > 0) 1660 { 1661 int actcount; 1662 1663 KKASSERT(m->queue == PQ_ACTIVE + q); 1664 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1665 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1666 &marker, pageq); 1667 1668 /* 1669 * Skip marker pages (atomic against other markers to avoid 1670 * infinite hop-over scans). 1671 */ 1672 if (m->flags & PG_MARKER) 1673 continue; 1674 1675 /* 1676 * Ignore pages we can't busy 1677 */ 1678 if (vm_page_busy_try(m, TRUE)) 1679 continue; 1680 1681 /* 1682 * Remaining operations run with the page busy and neither 1683 * the page or the queue will be spin-locked. 1684 */ 1685 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1686 KKASSERT(m->queue == PQ_ACTIVE + q); 1687 1688 /* 1689 * We now have a safely busied page, the page and queue 1690 * spinlocks have been released. 1691 * 1692 * Ignore held pages 1693 */ 1694 if (m->hold_count) { 1695 vm_page_wakeup(m); 1696 goto next; 1697 } 1698 1699 /* 1700 * Calculate activity 1701 */ 1702 actcount = 0; 1703 if (m->flags & PG_REFERENCED) { 1704 vm_page_flag_clear(m, PG_REFERENCED); 1705 actcount += 1; 1706 } 1707 actcount += pmap_ts_referenced(m); 1708 1709 /* 1710 * Update act_count and move page to end of queue. 1711 */ 1712 if (actcount) { 1713 m->act_count += ACT_ADVANCE + actcount; 1714 if (m->act_count > ACT_MAX) 1715 m->act_count = ACT_MAX; 1716 vm_page_and_queue_spin_lock(m); 1717 if (m->queue - m->pc == PQ_ACTIVE) { 1718 TAILQ_REMOVE( 1719 &vm_page_queues[PQ_ACTIVE + q].pl, 1720 m, pageq); 1721 TAILQ_INSERT_TAIL( 1722 &vm_page_queues[PQ_ACTIVE + q].pl, 1723 m, pageq); 1724 } 1725 vm_page_and_queue_spin_unlock(m); 1726 vm_page_wakeup(m); 1727 goto next; 1728 } 1729 1730 if (m->act_count == 0) { 1731 /* 1732 * We turn off page access, so that we have 1733 * more accurate RSS stats. We don't do this 1734 * in the normal page deactivation when the 1735 * system is loaded VM wise, because the 1736 * cost of the large number of page protect 1737 * operations would be higher than the value 1738 * of doing the operation. 1739 * 1740 * We use the marker to save our place so 1741 * we can release the spin lock. both (m) 1742 * and (next) will be invalid. 1743 */ 1744 vm_page_protect(m, VM_PROT_NONE); 1745 vm_page_deactivate(m); 1746 } else { 1747 m->act_count -= min(m->act_count, ACT_DECLINE); 1748 vm_page_and_queue_spin_lock(m); 1749 if (m->queue - m->pc == PQ_ACTIVE) { 1750 TAILQ_REMOVE( 1751 &vm_page_queues[PQ_ACTIVE + q].pl, 1752 m, pageq); 1753 TAILQ_INSERT_TAIL( 1754 &vm_page_queues[PQ_ACTIVE + q].pl, 1755 m, pageq); 1756 } 1757 vm_page_and_queue_spin_unlock(m); 1758 } 1759 vm_page_wakeup(m); 1760 next: 1761 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1762 } 1763 1764 /* 1765 * Remove our local marker 1766 * 1767 * Page queue still spin-locked. 1768 */ 1769 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1770 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1771 } 1772 1773 static int 1774 vm_pageout_free_page_calc(vm_size_t count) 1775 { 1776 if (count < vmstats.v_page_count) 1777 return 0; 1778 /* 1779 * free_reserved needs to include enough for the largest swap pager 1780 * structures plus enough for any pv_entry structs when paging. 1781 * 1782 * v_free_min normal allocations 1783 * v_free_reserved system allocations 1784 * v_pageout_free_min allocations by pageout daemon 1785 * v_interrupt_free_min low level allocations (e.g swap structures) 1786 */ 1787 if (vmstats.v_page_count > 1024) 1788 vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200; 1789 else 1790 vmstats.v_free_min = 64; 1791 vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7; 1792 vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0; 1793 vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7; 1794 vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7; 1795 1796 return 1; 1797 } 1798 1799 1800 /* 1801 * vm_pageout is the high level pageout daemon. 1802 * 1803 * No requirements. 1804 */ 1805 static void 1806 vm_pageout_thread(void) 1807 { 1808 int pass; 1809 int q; 1810 int q1iterator = 0; 1811 int q2iterator = 0; 1812 1813 /* 1814 * Initialize some paging parameters. 1815 */ 1816 curthread->td_flags |= TDF_SYSTHREAD; 1817 1818 vm_pageout_free_page_calc(vmstats.v_page_count); 1819 1820 /* 1821 * v_free_target and v_cache_min control pageout hysteresis. Note 1822 * that these are more a measure of the VM cache queue hysteresis 1823 * then the VM free queue. Specifically, v_free_target is the 1824 * high water mark (free+cache pages). 1825 * 1826 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the 1827 * low water mark, while v_free_min is the stop. v_cache_min must 1828 * be big enough to handle memory needs while the pageout daemon 1829 * is signalled and run to free more pages. 1830 */ 1831 if (vmstats.v_free_count > 6144) 1832 vmstats.v_free_target = 4 * vmstats.v_free_min + vmstats.v_free_reserved; 1833 else 1834 vmstats.v_free_target = 2 * vmstats.v_free_min + vmstats.v_free_reserved; 1835 1836 /* 1837 * NOTE: With the new buffer cache b_act_count we want the default 1838 * inactive target to be a percentage of available memory. 1839 * 1840 * The inactive target essentially determines the minimum 1841 * number of 'temporary' pages capable of caching one-time-use 1842 * files when the VM system is otherwise full of pages 1843 * belonging to multi-time-use files or active program data. 1844 * 1845 * NOTE: The inactive target is aggressively persued only if the 1846 * inactive queue becomes too small. If the inactive queue 1847 * is large enough to satisfy page movement to free+cache 1848 * then it is repopulated more slowly from the active queue. 1849 * This allows a general inactive_target default to be set. 1850 * 1851 * There is an issue here for processes which sit mostly idle 1852 * 'overnight', such as sshd, tcsh, and X. Any movement from 1853 * the active queue will eventually cause such pages to 1854 * recycle eventually causing a lot of paging in the morning. 1855 * To reduce the incidence of this pages cycled out of the 1856 * buffer cache are moved directly to the inactive queue if 1857 * they were only used once or twice. 1858 * 1859 * The vfs.vm_cycle_point sysctl can be used to adjust this. 1860 * Increasing the value (up to 64) increases the number of 1861 * buffer recyclements which go directly to the inactive queue. 1862 */ 1863 if (vmstats.v_free_count > 2048) { 1864 vmstats.v_cache_min = vmstats.v_free_target; 1865 vmstats.v_cache_max = 2 * vmstats.v_cache_min; 1866 } else { 1867 vmstats.v_cache_min = 0; 1868 vmstats.v_cache_max = 0; 1869 } 1870 vmstats.v_inactive_target = vmstats.v_free_count / 4; 1871 1872 /* XXX does not really belong here */ 1873 if (vm_page_max_wired == 0) 1874 vm_page_max_wired = vmstats.v_free_count / 3; 1875 1876 if (vm_pageout_stats_max == 0) 1877 vm_pageout_stats_max = vmstats.v_free_target; 1878 1879 /* 1880 * Set interval in seconds for stats scan. 1881 */ 1882 if (vm_pageout_stats_interval == 0) 1883 vm_pageout_stats_interval = 5; 1884 if (vm_pageout_full_stats_interval == 0) 1885 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; 1886 1887 1888 /* 1889 * Set maximum free per pass 1890 */ 1891 if (vm_pageout_stats_free_max == 0) 1892 vm_pageout_stats_free_max = 5; 1893 1894 swap_pager_swap_init(); 1895 pass = 0; 1896 1897 /* 1898 * The pageout daemon is never done, so loop forever. 1899 */ 1900 while (TRUE) { 1901 int error; 1902 int avail_shortage; 1903 int inactive_shortage; 1904 int vnodes_skipped = 0; 1905 int recycle_count = 0; 1906 int tmp; 1907 1908 /* 1909 * Wait for an action request. If we timeout check to 1910 * see if paging is needed (in case the normal wakeup 1911 * code raced us). 1912 */ 1913 if (vm_pages_needed == 0) { 1914 error = tsleep(&vm_pages_needed, 1915 0, "psleep", 1916 vm_pageout_stats_interval * hz); 1917 if (error && 1918 vm_paging_needed() == 0 && 1919 vm_pages_needed == 0) { 1920 for (q = 0; q < PQ_L2_SIZE; ++q) 1921 vm_pageout_page_stats(q); 1922 continue; 1923 } 1924 vm_pages_needed = 1; 1925 } 1926 1927 mycpu->gd_cnt.v_pdwakeups++; 1928 1929 /* 1930 * Scan for INACTIVE->CLEAN/PAGEOUT 1931 * 1932 * This routine tries to avoid thrashing the system with 1933 * unnecessary activity. 1934 * 1935 * Calculate our target for the number of free+cache pages we 1936 * want to get to. This is higher then the number that causes 1937 * allocations to stall (severe) in order to provide hysteresis, 1938 * and if we don't make it all the way but get to the minimum 1939 * we're happy. Goose it a bit if there are multiple requests 1940 * for memory. 1941 * 1942 * Don't reduce avail_shortage inside the loop or the 1943 * PQAVERAGE() calculation will break. 1944 * 1945 * NOTE! deficit is differentiated from avail_shortage as 1946 * REQUIRING at least (deficit) pages to be cleaned, 1947 * even if the page queues are in good shape. This 1948 * is used primarily for handling per-process 1949 * RLIMIT_RSS and may also see small values when 1950 * processes block due to low memory. 1951 */ 1952 avail_shortage = vm_paging_target() + vm_pageout_deficit; 1953 vm_pageout_deficit = 0; 1954 1955 if (avail_shortage > 0) { 1956 int delta = 0; 1957 1958 for (q = 0; q < PQ_L2_SIZE; ++q) { 1959 delta += vm_pageout_scan_inactive( 1960 pass, 1961 (q + q1iterator) & PQ_L2_MASK, 1962 PQAVERAGE(avail_shortage), 1963 &vnodes_skipped); 1964 if (avail_shortage - delta <= 0) 1965 break; 1966 } 1967 avail_shortage -= delta; 1968 q1iterator = q + 1; 1969 } 1970 1971 /* 1972 * Figure out how many active pages we must deactivate. If 1973 * we were able to reach our target with just the inactive 1974 * scan above we limit the number of active pages we 1975 * deactivate to reduce unnecessary work. 1976 */ 1977 inactive_shortage = vmstats.v_inactive_target - 1978 vmstats.v_inactive_count; 1979 1980 /* 1981 * If we were unable to free sufficient inactive pages to 1982 * satisfy the free/cache queue requirements then simply 1983 * reaching the inactive target may not be good enough. 1984 * Try to deactivate pages in excess of the target based 1985 * on the shortfall. 1986 * 1987 * However to prevent thrashing the VM system do not 1988 * deactivate more than an additional 1/10 the inactive 1989 * target's worth of active pages. 1990 */ 1991 if (avail_shortage > 0) { 1992 tmp = avail_shortage * 2; 1993 if (tmp > vmstats.v_inactive_target / 10) 1994 tmp = vmstats.v_inactive_target / 10; 1995 inactive_shortage += tmp; 1996 } 1997 1998 /* 1999 * Only trigger a pmap cleanup on inactive shortage. 2000 */ 2001 if (inactive_shortage > 0) { 2002 pmap_collect(); 2003 } 2004 2005 /* 2006 * Scan for ACTIVE->INACTIVE 2007 * 2008 * Only trigger on inactive shortage. Triggering on 2009 * avail_shortage can starve the active queue with 2010 * unnecessary active->inactive transitions and destroy 2011 * performance. 2012 */ 2013 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) { 2014 int delta = 0; 2015 2016 for (q = 0; q < PQ_L2_SIZE; ++q) { 2017 delta += vm_pageout_scan_active( 2018 pass, 2019 (q + q2iterator) & PQ_L2_MASK, 2020 PQAVERAGE(avail_shortage), 2021 PQAVERAGE(inactive_shortage), 2022 &recycle_count); 2023 if (inactive_shortage - delta <= 0 && 2024 avail_shortage - delta <= 0) { 2025 break; 2026 } 2027 } 2028 inactive_shortage -= delta; 2029 avail_shortage -= delta; 2030 q2iterator = q + 1; 2031 } 2032 2033 /* 2034 * Scan for CACHE->FREE 2035 * 2036 * Finally free enough cache pages to meet our free page 2037 * requirement and take more drastic measures if we are 2038 * still in trouble. 2039 */ 2040 vm_pageout_scan_cache(avail_shortage, pass, 2041 vnodes_skipped, recycle_count); 2042 2043 /* 2044 * Wait for more work. 2045 */ 2046 if (avail_shortage > 0) { 2047 ++pass; 2048 if (pass < 10 && vm_pages_needed > 1) { 2049 /* 2050 * Normal operation, additional processes 2051 * have already kicked us. Retry immediately 2052 * unless swap space is completely full in 2053 * which case delay a bit. 2054 */ 2055 if (swap_pager_full) { 2056 tsleep(&vm_pages_needed, 0, "pdelay", 2057 hz / 5); 2058 } /* else immediate retry */ 2059 } else if (pass < 10) { 2060 /* 2061 * Normal operation, fewer processes. Delay 2062 * a bit but allow wakeups. 2063 */ 2064 vm_pages_needed = 0; 2065 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2066 vm_pages_needed = 1; 2067 } else if (swap_pager_full == 0) { 2068 /* 2069 * We've taken too many passes, forced delay. 2070 */ 2071 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2072 } else { 2073 /* 2074 * Running out of memory, catastrophic 2075 * back-off to one-second intervals. 2076 */ 2077 tsleep(&vm_pages_needed, 0, "pdelay", hz); 2078 } 2079 } else if (vm_pages_needed) { 2080 /* 2081 * Interlocked wakeup of waiters (non-optional). 2082 * 2083 * Similar to vm_page_free_wakeup() in vm_page.c, 2084 * wake 2085 */ 2086 pass = 0; 2087 if (!vm_page_count_min(vm_page_free_hysteresis) || 2088 !vm_page_count_target()) { 2089 vm_pages_needed = 0; 2090 wakeup(&vmstats.v_free_count); 2091 } 2092 } else { 2093 pass = 0; 2094 } 2095 } 2096 } 2097 2098 static struct kproc_desc page_kp = { 2099 "pagedaemon", 2100 vm_pageout_thread, 2101 &pagethread 2102 }; 2103 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp); 2104 2105 2106 /* 2107 * Called after allocating a page out of the cache or free queue 2108 * to possibly wake the pagedaemon up to replentish our supply. 2109 * 2110 * We try to generate some hysteresis by waking the pagedaemon up 2111 * when our free+cache pages go below the free_min+cache_min level. 2112 * The pagedaemon tries to get the count back up to at least the 2113 * minimum, and through to the target level if possible. 2114 * 2115 * If the pagedaemon is already active bump vm_pages_needed as a hint 2116 * that there are even more requests pending. 2117 * 2118 * SMP races ok? 2119 * No requirements. 2120 */ 2121 void 2122 pagedaemon_wakeup(void) 2123 { 2124 if (vm_paging_needed() && curthread != pagethread) { 2125 if (vm_pages_needed == 0) { 2126 vm_pages_needed = 1; /* SMP race ok */ 2127 wakeup(&vm_pages_needed); 2128 } else if (vm_page_count_min(0)) { 2129 ++vm_pages_needed; /* SMP race ok */ 2130 } 2131 } 2132 } 2133 2134 #if !defined(NO_SWAPPING) 2135 2136 /* 2137 * SMP races ok? 2138 * No requirements. 2139 */ 2140 static void 2141 vm_req_vmdaemon(void) 2142 { 2143 static int lastrun = 0; 2144 2145 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 2146 wakeup(&vm_daemon_needed); 2147 lastrun = ticks; 2148 } 2149 } 2150 2151 static int vm_daemon_callback(struct proc *p, void *data __unused); 2152 2153 /* 2154 * No requirements. 2155 */ 2156 static void 2157 vm_daemon(void) 2158 { 2159 int req_swapout; 2160 2161 while (TRUE) { 2162 tsleep(&vm_daemon_needed, 0, "psleep", 0); 2163 req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0); 2164 2165 /* 2166 * forced swapouts 2167 */ 2168 if (req_swapout) 2169 swapout_procs(vm_pageout_req_swapout); 2170 2171 /* 2172 * scan the processes for exceeding their rlimits or if 2173 * process is swapped out -- deactivate pages 2174 */ 2175 allproc_scan(vm_daemon_callback, NULL); 2176 } 2177 } 2178 2179 static int 2180 vm_daemon_callback(struct proc *p, void *data __unused) 2181 { 2182 struct vmspace *vm; 2183 vm_pindex_t limit, size; 2184 2185 /* 2186 * if this is a system process or if we have already 2187 * looked at this process, skip it. 2188 */ 2189 lwkt_gettoken(&p->p_token); 2190 2191 if (p->p_flags & (P_SYSTEM | P_WEXIT)) { 2192 lwkt_reltoken(&p->p_token); 2193 return (0); 2194 } 2195 2196 /* 2197 * if the process is in a non-running type state, 2198 * don't touch it. 2199 */ 2200 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 2201 lwkt_reltoken(&p->p_token); 2202 return (0); 2203 } 2204 2205 /* 2206 * get a limit 2207 */ 2208 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, 2209 p->p_rlimit[RLIMIT_RSS].rlim_max)); 2210 2211 /* 2212 * let processes that are swapped out really be 2213 * swapped out. Set the limit to nothing to get as 2214 * many pages out to swap as possible. 2215 */ 2216 if (p->p_flags & P_SWAPPEDOUT) 2217 limit = 0; 2218 2219 vm = p->p_vmspace; 2220 vmspace_hold(vm); 2221 size = pmap_resident_tlnw_count(&vm->vm_pmap); 2222 if (limit >= 0 && size > 4096 && 2223 size - 4096 >= limit && vm_pageout_memuse_mode >= 1) { 2224 vm_pageout_map_deactivate_pages(&vm->vm_map, limit); 2225 } 2226 vmspace_drop(vm); 2227 2228 lwkt_reltoken(&p->p_token); 2229 2230 return (0); 2231 } 2232 2233 #endif 2234