1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * The Mach Operating System project at Carnegie-Mellon University. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 37 * 38 * 39 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 40 * All rights reserved. 41 * 42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 43 * 44 * Permission to use, copy, modify and distribute this software and 45 * its documentation is hereby granted, provided that both the copyright 46 * notice and this permission notice appear in all copies of the 47 * software, derivative works or modified versions, and any portions 48 * thereof, and that both notices appear in supporting documentation. 49 * 50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 51 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 53 * 54 * Carnegie Mellon requests users of this software to return to 55 * 56 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 57 * School of Computer Science 58 * Carnegie Mellon University 59 * Pittsburgh PA 15213-3890 60 * 61 * any improvements or extensions that they make and grant Carnegie the 62 * rights to redistribute these changes. 63 * 64 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $ 65 */ 66 67 /* 68 * The proverbial page-out daemon. 69 */ 70 71 #include "opt_vm.h" 72 #include <sys/param.h> 73 #include <sys/systm.h> 74 #include <sys/kernel.h> 75 #include <sys/proc.h> 76 #include <sys/kthread.h> 77 #include <sys/resourcevar.h> 78 #include <sys/signalvar.h> 79 #include <sys/vnode.h> 80 #include <sys/vmmeter.h> 81 #include <sys/conf.h> 82 #include <sys/sysctl.h> 83 84 #include <vm/vm.h> 85 #include <vm/vm_param.h> 86 #include <sys/lock.h> 87 #include <vm/vm_object.h> 88 #include <vm/vm_page.h> 89 #include <vm/vm_map.h> 90 #include <vm/vm_pageout.h> 91 #include <vm/vm_pager.h> 92 #include <vm/swap_pager.h> 93 #include <vm/vm_extern.h> 94 95 #include <sys/thread2.h> 96 #include <sys/spinlock2.h> 97 #include <vm/vm_page2.h> 98 99 /* 100 * System initialization 101 */ 102 103 /* the kernel process "vm_pageout"*/ 104 static int vm_pageout_page(vm_page_t m, int *max_launderp, 105 int *vnodes_skippedp, struct vnode **vpfailedp, 106 int pass, int vmflush_flags); 107 static int vm_pageout_clean_helper (vm_page_t, int); 108 static int vm_pageout_free_page_calc (vm_size_t count); 109 static void vm_pageout_page_free(vm_page_t m) ; 110 struct thread *emergpager; 111 struct thread *pagethread; 112 static int sequence_emerg_pager; 113 114 #if !defined(NO_SWAPPING) 115 /* the kernel process "vm_daemon"*/ 116 static void vm_daemon (void); 117 static struct thread *vmthread; 118 119 static struct kproc_desc vm_kp = { 120 "vmdaemon", 121 vm_daemon, 122 &vmthread 123 }; 124 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); 125 #endif 126 127 int vm_pages_needed = 0; /* Event on which pageout daemon sleeps */ 128 int vm_pageout_deficit = 0; /* Estimated number of pages deficit */ 129 int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */ 130 int vm_page_free_hysteresis = 16; 131 static int vm_pagedaemon_time; 132 133 #if !defined(NO_SWAPPING) 134 static int vm_pageout_req_swapout; 135 static int vm_daemon_needed; 136 #endif 137 static int vm_max_launder = 4096; 138 static int vm_emerg_launder = 100; 139 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; 140 static int vm_pageout_full_stats_interval = 0; 141 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; 142 static int defer_swap_pageouts=0; 143 static int disable_swap_pageouts=0; 144 static u_int vm_anonmem_decline = ACT_DECLINE; 145 static u_int vm_filemem_decline = ACT_DECLINE * 2; 146 147 #if defined(NO_SWAPPING) 148 static int vm_swap_enabled=0; 149 static int vm_swap_idle_enabled=0; 150 #else 151 static int vm_swap_enabled=1; 152 static int vm_swap_idle_enabled=0; 153 #endif 154 int vm_pageout_memuse_mode=1; /* 0-disable, 1-passive, 2-active swp*/ 155 156 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline, 157 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory"); 158 159 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline, 160 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache"); 161 162 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis, 163 CTLFLAG_RW, &vm_page_free_hysteresis, 0, 164 "Free more pages than the minimum required"); 165 166 SYSCTL_INT(_vm, OID_AUTO, max_launder, 167 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 168 SYSCTL_INT(_vm, OID_AUTO, emerg_launder, 169 CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum"); 170 171 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, 172 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 173 174 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, 175 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); 176 177 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, 178 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); 179 180 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, 181 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); 182 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode, 183 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode"); 184 185 #if defined(NO_SWAPPING) 186 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 187 CTLFLAG_RD, &vm_swap_enabled, 0, ""); 188 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 189 CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); 190 #else 191 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 192 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 193 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 194 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 195 #endif 196 197 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 198 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 199 200 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 201 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 202 203 static int pageout_lock_miss; 204 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 205 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 206 207 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 208 209 #if !defined(NO_SWAPPING) 210 static void vm_req_vmdaemon (void); 211 #endif 212 static void vm_pageout_page_stats(int q); 213 214 /* 215 * Calculate approximately how many pages on each queue to try to 216 * clean. An exact calculation creates an edge condition when the 217 * queues are unbalanced so add significant slop. The queue scans 218 * will stop early when targets are reached and will start where they 219 * left off on the next pass. 220 * 221 * We need to be generous here because there are all sorts of loading 222 * conditions that can cause edge cases if try to average over all queues. 223 * In particular, storage subsystems have become so fast that paging 224 * activity can become quite frantic. Eventually we will probably need 225 * two paging threads, one for dirty pages and one for clean, to deal 226 * with the bandwidth requirements. 227 228 * So what we do is calculate a value that can be satisfied nominally by 229 * only having to scan half the queues. 230 */ 231 static __inline int 232 PQAVERAGE(int n) 233 { 234 int avg; 235 236 if (n >= 0) { 237 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1); 238 } else { 239 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1); 240 } 241 return avg; 242 } 243 244 /* 245 * vm_pageout_clean_helper: 246 * 247 * Clean the page and remove it from the laundry. The page must be busied 248 * by the caller and will be disposed of (put away, flushed) by this routine. 249 */ 250 static int 251 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags) 252 { 253 vm_object_t object; 254 vm_page_t mc[BLIST_MAX_ALLOC]; 255 int error; 256 int ib, is, page_base; 257 vm_pindex_t pindex = m->pindex; 258 259 object = m->object; 260 261 /* 262 * Don't mess with the page if it's held or special. 263 * 264 * XXX do we really need to check hold_count here? hold_count 265 * isn't supposed to mess with vm_page ops except prevent the 266 * page from being reused. 267 */ 268 if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) { 269 vm_page_wakeup(m); 270 return 0; 271 } 272 273 /* 274 * Place page in cluster. Align cluster for optimal swap space 275 * allocation (whether it is swap or not). This is typically ~16-32 276 * pages, which also tends to align the cluster to multiples of the 277 * filesystem block size if backed by a filesystem. 278 */ 279 page_base = pindex % BLIST_MAX_ALLOC; 280 mc[page_base] = m; 281 ib = page_base - 1; 282 is = page_base + 1; 283 284 /* 285 * Scan object for clusterable pages. 286 * 287 * We can cluster ONLY if: ->> the page is NOT 288 * clean, wired, busy, held, or mapped into a 289 * buffer, and one of the following: 290 * 1) The page is inactive, or a seldom used 291 * active page. 292 * -or- 293 * 2) we force the issue. 294 * 295 * During heavy mmap/modification loads the pageout 296 * daemon can really fragment the underlying file 297 * due to flushing pages out of order and not trying 298 * align the clusters (which leave sporatic out-of-order 299 * holes). To solve this problem we do the reverse scan 300 * first and attempt to align our cluster, then do a 301 * forward scan if room remains. 302 */ 303 vm_object_hold(object); 304 305 while (ib >= 0) { 306 vm_page_t p; 307 308 p = vm_page_lookup_busy_try(object, pindex - page_base + ib, 309 TRUE, &error); 310 if (error || p == NULL) 311 break; 312 if ((p->queue - p->pc) == PQ_CACHE || 313 (p->flags & PG_UNMANAGED)) { 314 vm_page_wakeup(p); 315 break; 316 } 317 vm_page_test_dirty(p); 318 if (((p->dirty & p->valid) == 0 && 319 (p->flags & PG_NEED_COMMIT) == 0) || 320 p->wire_count != 0 || /* may be held by buf cache */ 321 p->hold_count != 0) { /* may be undergoing I/O */ 322 vm_page_wakeup(p); 323 break; 324 } 325 if (p->queue - p->pc != PQ_INACTIVE) { 326 if (p->queue - p->pc != PQ_ACTIVE || 327 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 328 vm_page_wakeup(p); 329 break; 330 } 331 } 332 333 /* 334 * Try to maintain page groupings in the cluster. 335 */ 336 if (m->flags & PG_WINATCFLS) 337 vm_page_flag_set(p, PG_WINATCFLS); 338 else 339 vm_page_flag_clear(p, PG_WINATCFLS); 340 p->act_count = m->act_count; 341 342 mc[ib] = p; 343 --ib; 344 } 345 ++ib; /* fixup */ 346 347 while (is < BLIST_MAX_ALLOC && 348 pindex - page_base + is < object->size) { 349 vm_page_t p; 350 351 p = vm_page_lookup_busy_try(object, pindex - page_base + is, 352 TRUE, &error); 353 if (error || p == NULL) 354 break; 355 if (((p->queue - p->pc) == PQ_CACHE) || 356 (p->flags & PG_UNMANAGED)) { 357 vm_page_wakeup(p); 358 break; 359 } 360 vm_page_test_dirty(p); 361 if (((p->dirty & p->valid) == 0 && 362 (p->flags & PG_NEED_COMMIT) == 0) || 363 p->wire_count != 0 || /* may be held by buf cache */ 364 p->hold_count != 0) { /* may be undergoing I/O */ 365 vm_page_wakeup(p); 366 break; 367 } 368 if (p->queue - p->pc != PQ_INACTIVE) { 369 if (p->queue - p->pc != PQ_ACTIVE || 370 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 371 vm_page_wakeup(p); 372 break; 373 } 374 } 375 376 /* 377 * Try to maintain page groupings in the cluster. 378 */ 379 if (m->flags & PG_WINATCFLS) 380 vm_page_flag_set(p, PG_WINATCFLS); 381 else 382 vm_page_flag_clear(p, PG_WINATCFLS); 383 p->act_count = m->act_count; 384 385 mc[is] = p; 386 ++is; 387 } 388 389 vm_object_drop(object); 390 391 /* 392 * we allow reads during pageouts... 393 */ 394 return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags); 395 } 396 397 /* 398 * vm_pageout_flush() - launder the given pages 399 * 400 * The given pages are laundered. Note that we setup for the start of 401 * I/O ( i.e. busy the page ), mark it read-only, and bump the object 402 * reference count all in here rather then in the parent. If we want 403 * the parent to do more sophisticated things we may have to change 404 * the ordering. 405 * 406 * The pages in the array must be busied by the caller and will be 407 * unbusied by this function. 408 */ 409 int 410 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags) 411 { 412 vm_object_t object; 413 int pageout_status[count]; 414 int numpagedout = 0; 415 int i; 416 417 /* 418 * Initiate I/O. Bump the vm_page_t->busy counter. 419 */ 420 for (i = 0; i < count; i++) { 421 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 422 ("vm_pageout_flush page %p index %d/%d: partially " 423 "invalid page", mc[i], i, count)); 424 vm_page_io_start(mc[i]); 425 } 426 427 /* 428 * We must make the pages read-only. This will also force the 429 * modified bit in the related pmaps to be cleared. The pager 430 * cannot clear the bit for us since the I/O completion code 431 * typically runs from an interrupt. The act of making the page 432 * read-only handles the case for us. 433 * 434 * Then we can unbusy the pages, we still hold a reference by virtue 435 * of our soft-busy. 436 */ 437 for (i = 0; i < count; i++) { 438 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) 439 vm_page_protect(mc[i], VM_PROT_NONE); 440 else 441 vm_page_protect(mc[i], VM_PROT_READ); 442 vm_page_wakeup(mc[i]); 443 } 444 445 object = mc[0]->object; 446 vm_object_pip_add(object, count); 447 448 vm_pager_put_pages(object, mc, count, 449 (vmflush_flags | 450 ((object == &kernel_object) ? 451 VM_PAGER_PUT_SYNC : 0)), 452 pageout_status); 453 454 for (i = 0; i < count; i++) { 455 vm_page_t mt = mc[i]; 456 457 switch (pageout_status[i]) { 458 case VM_PAGER_OK: 459 numpagedout++; 460 break; 461 case VM_PAGER_PEND: 462 numpagedout++; 463 break; 464 case VM_PAGER_BAD: 465 /* 466 * Page outside of range of object. Right now we 467 * essentially lose the changes by pretending it 468 * worked. 469 */ 470 vm_page_busy_wait(mt, FALSE, "pgbad"); 471 pmap_clear_modify(mt); 472 vm_page_undirty(mt); 473 vm_page_wakeup(mt); 474 break; 475 case VM_PAGER_ERROR: 476 case VM_PAGER_FAIL: 477 /* 478 * A page typically cannot be paged out when we 479 * have run out of swap. We leave the page 480 * marked inactive and will try to page it out 481 * again later. 482 * 483 * Starvation of the active page list is used to 484 * determine when the system is massively memory 485 * starved. 486 */ 487 break; 488 case VM_PAGER_AGAIN: 489 break; 490 } 491 492 /* 493 * If not PENDing this was a synchronous operation and we 494 * clean up after the I/O. If it is PENDing the mess is 495 * cleaned up asynchronously. 496 * 497 * Also nominally act on the caller's wishes if the caller 498 * wants to try to really clean (cache or free) the page. 499 * 500 * Also nominally deactivate the page if the system is 501 * memory-stressed. 502 */ 503 if (pageout_status[i] != VM_PAGER_PEND) { 504 vm_page_busy_wait(mt, FALSE, "pgouw"); 505 vm_page_io_finish(mt); 506 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) { 507 vm_page_try_to_cache(mt); 508 } else if (vm_page_count_severe()) { 509 vm_page_deactivate(mt); 510 vm_page_wakeup(mt); 511 } else { 512 vm_page_wakeup(mt); 513 } 514 vm_object_pip_wakeup(object); 515 } 516 } 517 return numpagedout; 518 } 519 520 #if !defined(NO_SWAPPING) 521 522 /* 523 * Callback function, page busied for us. We must dispose of the busy 524 * condition. Any related pmap pages may be held but will not be locked. 525 */ 526 static 527 int 528 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va, 529 vm_page_t p) 530 { 531 int actcount; 532 int cleanit = 0; 533 534 /* 535 * Basic tests - There should never be a marker, and we can stop 536 * once the RSS is below the required level. 537 */ 538 KKASSERT((p->flags & PG_MARKER) == 0); 539 if (pmap_resident_tlnw_count(info->pmap) <= info->limit) { 540 vm_page_wakeup(p); 541 return(-1); 542 } 543 544 mycpu->gd_cnt.v_pdpages++; 545 546 if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) { 547 vm_page_wakeup(p); 548 goto done; 549 } 550 551 ++info->actioncount; 552 553 /* 554 * Check if the page has been referened recently. If it has, 555 * activate it and skip. 556 */ 557 actcount = pmap_ts_referenced(p); 558 if (actcount) { 559 vm_page_flag_set(p, PG_REFERENCED); 560 } else if (p->flags & PG_REFERENCED) { 561 actcount = 1; 562 } 563 564 if (actcount) { 565 if (p->queue - p->pc != PQ_ACTIVE) { 566 vm_page_and_queue_spin_lock(p); 567 if (p->queue - p->pc != PQ_ACTIVE) { 568 vm_page_and_queue_spin_unlock(p); 569 vm_page_activate(p); 570 } else { 571 vm_page_and_queue_spin_unlock(p); 572 } 573 } else { 574 p->act_count += actcount; 575 if (p->act_count > ACT_MAX) 576 p->act_count = ACT_MAX; 577 } 578 vm_page_flag_clear(p, PG_REFERENCED); 579 vm_page_wakeup(p); 580 goto done; 581 } 582 583 /* 584 * Remove the page from this particular pmap. Once we do this, our 585 * pmap scans will not see it again (unless it gets faulted in), so 586 * we must actively dispose of or deal with the page. 587 */ 588 pmap_remove_specific(info->pmap, p); 589 590 /* 591 * If the page is not mapped to another process (i.e. as would be 592 * typical if this were a shared page from a library) then deactivate 593 * the page and clean it in two passes only. 594 * 595 * If the page hasn't been referenced since the last check, remove it 596 * from the pmap. If it is no longer mapped, deactivate it 597 * immediately, accelerating the normal decline. 598 * 599 * Once the page has been removed from the pmap the RSS code no 600 * longer tracks it so we have to make sure that it is staged for 601 * potential flush action. 602 */ 603 if ((p->flags & PG_MAPPED) == 0) { 604 if (p->queue - p->pc == PQ_ACTIVE) { 605 vm_page_deactivate(p); 606 } 607 if (p->queue - p->pc == PQ_INACTIVE) { 608 cleanit = 1; 609 } 610 } 611 612 /* 613 * Ok, try to fully clean the page and any nearby pages such that at 614 * least the requested page is freed or moved to the cache queue. 615 * 616 * We usually do this synchronously to allow us to get the page into 617 * the CACHE queue quickly, which will prevent memory exhaustion if 618 * a process with a memoryuse limit is running away. However, the 619 * sysadmin may desire to set vm.swap_user_async which relaxes this 620 * and improves write performance. 621 */ 622 if (cleanit) { 623 int max_launder = 0x7FFF; 624 int vnodes_skipped = 0; 625 int vmflush_flags; 626 struct vnode *vpfailed = NULL; 627 628 info->offset = va; 629 630 if (vm_pageout_memuse_mode >= 2) { 631 vmflush_flags = VM_PAGER_TRY_TO_CACHE | 632 VM_PAGER_ALLOW_ACTIVE; 633 if (swap_user_async == 0) 634 vmflush_flags |= VM_PAGER_PUT_SYNC; 635 vm_page_flag_set(p, PG_WINATCFLS); 636 info->cleancount += 637 vm_pageout_page(p, &max_launder, 638 &vnodes_skipped, 639 &vpfailed, 1, vmflush_flags); 640 } else { 641 vm_page_wakeup(p); 642 ++info->cleancount; 643 } 644 } else { 645 vm_page_wakeup(p); 646 } 647 648 /* 649 * Must be at end to avoid SMP races. 650 */ 651 done: 652 lwkt_user_yield(); 653 return 0; 654 } 655 656 /* 657 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits. 658 * that is relatively difficult to do. We try to keep track of where we 659 * left off last time to reduce scan overhead. 660 * 661 * Called when vm_pageout_memuse_mode is >= 1. 662 */ 663 void 664 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit) 665 { 666 vm_offset_t pgout_offset; 667 struct pmap_pgscan_info info; 668 int retries = 3; 669 670 pgout_offset = map->pgout_offset; 671 again: 672 #if 0 673 kprintf("%016jx ", pgout_offset); 674 #endif 675 if (pgout_offset < VM_MIN_USER_ADDRESS) 676 pgout_offset = VM_MIN_USER_ADDRESS; 677 if (pgout_offset >= VM_MAX_USER_ADDRESS) 678 pgout_offset = 0; 679 info.pmap = vm_map_pmap(map); 680 info.limit = limit; 681 info.beg_addr = pgout_offset; 682 info.end_addr = VM_MAX_USER_ADDRESS; 683 info.callback = vm_pageout_mdp_callback; 684 info.cleancount = 0; 685 info.actioncount = 0; 686 info.busycount = 0; 687 688 pmap_pgscan(&info); 689 pgout_offset = info.offset; 690 #if 0 691 kprintf("%016jx %08lx %08lx\n", pgout_offset, 692 info.cleancount, info.actioncount); 693 #endif 694 695 if (pgout_offset != VM_MAX_USER_ADDRESS && 696 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 697 goto again; 698 } else if (retries && 699 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 700 --retries; 701 goto again; 702 } 703 map->pgout_offset = pgout_offset; 704 } 705 #endif 706 707 /* 708 * Called when the pageout scan wants to free a page. We no longer 709 * try to cycle the vm_object here with a reference & dealloc, which can 710 * cause a non-trivial object collapse in a critical path. 711 * 712 * It is unclear why we cycled the ref_count in the past, perhaps to try 713 * to optimize shadow chain collapses but I don't quite see why it would 714 * be necessary. An OBJ_DEAD object should terminate any and all vm_pages 715 * synchronously and not have to be kicked-start. 716 */ 717 static void 718 vm_pageout_page_free(vm_page_t m) 719 { 720 vm_page_protect(m, VM_PROT_NONE); 721 vm_page_free(m); 722 } 723 724 /* 725 * vm_pageout_scan does the dirty work for the pageout daemon. 726 */ 727 struct vm_pageout_scan_info { 728 struct proc *bigproc; 729 vm_offset_t bigsize; 730 }; 731 732 static int vm_pageout_scan_callback(struct proc *p, void *data); 733 734 /* 735 * Scan inactive queue 736 * 737 * WARNING! Can be called from two pagedaemon threads simultaneously. 738 */ 739 static int 740 vm_pageout_scan_inactive(int pass, int q, int avail_shortage, 741 int *vnodes_skipped) 742 { 743 vm_page_t m; 744 struct vm_page marker; 745 struct vnode *vpfailed; /* warning, allowed to be stale */ 746 int maxscan; 747 int delta = 0; 748 int max_launder; 749 int isep; 750 751 isep = (curthread == emergpager); 752 753 /* 754 * Start scanning the inactive queue for pages we can move to the 755 * cache or free. The scan will stop when the target is reached or 756 * we have scanned the entire inactive queue. Note that m->act_count 757 * is not used to form decisions for the inactive queue, only for the 758 * active queue. 759 * 760 * max_launder limits the number of dirty pages we flush per scan. 761 * For most systems a smaller value (16 or 32) is more robust under 762 * extreme memory and disk pressure because any unnecessary writes 763 * to disk can result in extreme performance degredation. However, 764 * systems with excessive dirty pages (especially when MAP_NOSYNC is 765 * used) will die horribly with limited laundering. If the pageout 766 * daemon cannot clean enough pages in the first pass, we let it go 767 * all out in succeeding passes. 768 * 769 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 770 * PAGES. 771 */ 772 if ((max_launder = vm_max_launder) <= 1) 773 max_launder = 1; 774 if (pass) 775 max_launder = 10000; 776 777 /* 778 * Initialize our marker 779 */ 780 bzero(&marker, sizeof(marker)); 781 marker.flags = PG_FICTITIOUS | PG_MARKER; 782 marker.busy_count = PBUSY_LOCKED; 783 marker.queue = PQ_INACTIVE + q; 784 marker.pc = q; 785 marker.wire_count = 1; 786 787 /* 788 * Inactive queue scan. 789 * 790 * NOTE: The vm_page must be spinlocked before the queue to avoid 791 * deadlocks, so it is easiest to simply iterate the loop 792 * with the queue unlocked at the top. 793 */ 794 vpfailed = NULL; 795 796 vm_page_queues_spin_lock(PQ_INACTIVE + q); 797 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 798 maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt; 799 800 /* 801 * Queue locked at top of loop to avoid stack marker issues. 802 */ 803 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 804 maxscan-- > 0 && avail_shortage - delta > 0) 805 { 806 int count; 807 808 KKASSERT(m->queue == PQ_INACTIVE + q); 809 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, 810 &marker, pageq); 811 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m, 812 &marker, pageq); 813 mycpu->gd_cnt.v_pdpages++; 814 815 /* 816 * Skip marker pages (atomic against other markers to avoid 817 * infinite hop-over scans). 818 */ 819 if (m->flags & PG_MARKER) 820 continue; 821 822 /* 823 * Try to busy the page. Don't mess with pages which are 824 * already busy or reorder them in the queue. 825 */ 826 if (vm_page_busy_try(m, TRUE)) 827 continue; 828 829 /* 830 * Remaining operations run with the page busy and neither 831 * the page or the queue will be spin-locked. 832 */ 833 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 834 KKASSERT(m->queue == PQ_INACTIVE + q); 835 836 /* 837 * The emergency pager runs when the primary pager gets 838 * stuck, which typically means the primary pager deadlocked 839 * on a vnode-backed page. Therefore, the emergency pager 840 * must skip any complex objects. 841 * 842 * We disallow VNODEs unless they are VCHR whos device ops 843 * does not flag D_NOEMERGPGR. 844 */ 845 if (isep && m->object) { 846 struct vnode *vp; 847 848 switch(m->object->type) { 849 case OBJT_DEFAULT: 850 case OBJT_SWAP: 851 /* 852 * Allow anonymous memory and assume that 853 * swap devices are not complex, since its 854 * kinda worthless if we can't swap out dirty 855 * anonymous pages. 856 */ 857 break; 858 case OBJT_VNODE: 859 /* 860 * Allow VCHR device if the D_NOEMERGPGR 861 * flag is not set, deny other vnode types 862 * as being too complex. 863 */ 864 vp = m->object->handle; 865 if (vp && vp->v_type == VCHR && 866 vp->v_rdev && vp->v_rdev->si_ops && 867 (vp->v_rdev->si_ops->head.flags & 868 D_NOEMERGPGR) == 0) { 869 break; 870 } 871 /* Deny - fall through */ 872 default: 873 /* 874 * Deny 875 */ 876 vm_page_wakeup(m); 877 vm_page_queues_spin_lock(PQ_INACTIVE + q); 878 lwkt_yield(); 879 continue; 880 } 881 } 882 883 /* 884 * Try to pageout the page and perhaps other nearby pages. 885 */ 886 count = vm_pageout_page(m, &max_launder, vnodes_skipped, 887 &vpfailed, pass, 0); 888 delta += count; 889 890 /* 891 * Systems with a ton of memory can wind up with huge 892 * deactivation counts. Because the inactive scan is 893 * doing a lot of flushing, the combination can result 894 * in excessive paging even in situations where other 895 * unrelated threads free up sufficient VM. 896 * 897 * To deal with this we abort the nominal active->inactive 898 * scan before we hit the inactive target when free+cache 899 * levels have reached a reasonable target. 900 * 901 * When deciding to stop early we need to add some slop to 902 * the test and we need to return full completion to the caller 903 * to prevent the caller from thinking there is something 904 * wrong and issuing a low-memory+swap warning or pkill. 905 * 906 * A deficit forces paging regardless of the state of the 907 * VM page queues (used for RSS enforcement). 908 */ 909 lwkt_yield(); 910 vm_page_queues_spin_lock(PQ_INACTIVE + q); 911 if (vm_paging_target() < -vm_max_launder) { 912 /* 913 * Stopping early, return full completion to caller. 914 */ 915 if (delta < avail_shortage) 916 delta = avail_shortage; 917 break; 918 } 919 } 920 921 /* page queue still spin-locked */ 922 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 923 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 924 925 return (delta); 926 } 927 928 /* 929 * Pageout the specified page, return the total number of pages paged out 930 * (this routine may cluster). 931 * 932 * The page must be busied and soft-busied by the caller and will be disposed 933 * of by this function. 934 */ 935 static int 936 vm_pageout_page(vm_page_t m, int *max_launderp, int *vnodes_skippedp, 937 struct vnode **vpfailedp, int pass, int vmflush_flags) 938 { 939 vm_object_t object; 940 int actcount; 941 int count = 0; 942 943 /* 944 * It is possible for a page to be busied ad-hoc (e.g. the 945 * pmap_collect() code) and wired and race against the 946 * allocation of a new page. vm_page_alloc() may be forced 947 * to deactivate the wired page in which case it winds up 948 * on the inactive queue and must be handled here. We 949 * correct the problem simply by unqueuing the page. 950 */ 951 if (m->wire_count) { 952 vm_page_unqueue_nowakeup(m); 953 vm_page_wakeup(m); 954 kprintf("WARNING: pagedaemon: wired page on " 955 "inactive queue %p\n", m); 956 return 0; 957 } 958 959 /* 960 * A held page may be undergoing I/O, so skip it. 961 */ 962 if (m->hold_count) { 963 vm_page_and_queue_spin_lock(m); 964 if (m->queue - m->pc == PQ_INACTIVE) { 965 TAILQ_REMOVE( 966 &vm_page_queues[m->queue].pl, m, pageq); 967 TAILQ_INSERT_TAIL( 968 &vm_page_queues[m->queue].pl, m, pageq); 969 ++vm_swapcache_inactive_heuristic; 970 } 971 vm_page_and_queue_spin_unlock(m); 972 vm_page_wakeup(m); 973 return 0; 974 } 975 976 if (m->object == NULL || m->object->ref_count == 0) { 977 /* 978 * If the object is not being used, we ignore previous 979 * references. 980 */ 981 vm_page_flag_clear(m, PG_REFERENCED); 982 pmap_clear_reference(m); 983 /* fall through to end */ 984 } else if (((m->flags & PG_REFERENCED) == 0) && 985 (actcount = pmap_ts_referenced(m))) { 986 /* 987 * Otherwise, if the page has been referenced while 988 * in the inactive queue, we bump the "activation 989 * count" upwards, making it less likely that the 990 * page will be added back to the inactive queue 991 * prematurely again. Here we check the page tables 992 * (or emulated bits, if any), given the upper level 993 * VM system not knowing anything about existing 994 * references. 995 */ 996 vm_page_activate(m); 997 m->act_count += (actcount + ACT_ADVANCE); 998 vm_page_wakeup(m); 999 return 0; 1000 } 1001 1002 /* 1003 * (m) is still busied. 1004 * 1005 * If the upper level VM system knows about any page 1006 * references, we activate the page. We also set the 1007 * "activation count" higher than normal so that we will less 1008 * likely place pages back onto the inactive queue again. 1009 */ 1010 if ((m->flags & PG_REFERENCED) != 0) { 1011 vm_page_flag_clear(m, PG_REFERENCED); 1012 actcount = pmap_ts_referenced(m); 1013 vm_page_activate(m); 1014 m->act_count += (actcount + ACT_ADVANCE + 1); 1015 vm_page_wakeup(m); 1016 return 0; 1017 } 1018 1019 /* 1020 * If the upper level VM system doesn't know anything about 1021 * the page being dirty, we have to check for it again. As 1022 * far as the VM code knows, any partially dirty pages are 1023 * fully dirty. 1024 * 1025 * Pages marked PG_WRITEABLE may be mapped into the user 1026 * address space of a process running on another cpu. A 1027 * user process (without holding the MP lock) running on 1028 * another cpu may be able to touch the page while we are 1029 * trying to remove it. vm_page_cache() will handle this 1030 * case for us. 1031 */ 1032 if (m->dirty == 0) { 1033 vm_page_test_dirty(m); 1034 } else { 1035 vm_page_dirty(m); 1036 } 1037 1038 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1039 /* 1040 * Invalid pages can be easily freed 1041 */ 1042 vm_pageout_page_free(m); 1043 mycpu->gd_cnt.v_dfree++; 1044 ++count; 1045 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1046 /* 1047 * Clean pages can be placed onto the cache queue. 1048 * This effectively frees them. 1049 */ 1050 vm_page_cache(m); 1051 ++count; 1052 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { 1053 /* 1054 * Dirty pages need to be paged out, but flushing 1055 * a page is extremely expensive verses freeing 1056 * a clean page. Rather then artificially limiting 1057 * the number of pages we can flush, we instead give 1058 * dirty pages extra priority on the inactive queue 1059 * by forcing them to be cycled through the queue 1060 * twice before being flushed, after which the 1061 * (now clean) page will cycle through once more 1062 * before being freed. This significantly extends 1063 * the thrash point for a heavily loaded machine. 1064 */ 1065 vm_page_flag_set(m, PG_WINATCFLS); 1066 vm_page_and_queue_spin_lock(m); 1067 if (m->queue - m->pc == PQ_INACTIVE) { 1068 TAILQ_REMOVE( 1069 &vm_page_queues[m->queue].pl, m, pageq); 1070 TAILQ_INSERT_TAIL( 1071 &vm_page_queues[m->queue].pl, m, pageq); 1072 ++vm_swapcache_inactive_heuristic; 1073 } 1074 vm_page_and_queue_spin_unlock(m); 1075 vm_page_wakeup(m); 1076 } else if (*max_launderp > 0) { 1077 /* 1078 * We always want to try to flush some dirty pages if 1079 * we encounter them, to keep the system stable. 1080 * Normally this number is small, but under extreme 1081 * pressure where there are insufficient clean pages 1082 * on the inactive queue, we may have to go all out. 1083 */ 1084 int swap_pageouts_ok; 1085 struct vnode *vp = NULL; 1086 1087 swap_pageouts_ok = 0; 1088 object = m->object; 1089 if (object && 1090 (object->type != OBJT_SWAP) && 1091 (object->type != OBJT_DEFAULT)) { 1092 swap_pageouts_ok = 1; 1093 } else { 1094 swap_pageouts_ok = !(defer_swap_pageouts || 1095 disable_swap_pageouts); 1096 swap_pageouts_ok |= (!disable_swap_pageouts && 1097 defer_swap_pageouts && 1098 vm_page_count_min(0)); 1099 } 1100 1101 /* 1102 * We don't bother paging objects that are "dead". 1103 * Those objects are in a "rundown" state. 1104 */ 1105 if (!swap_pageouts_ok || 1106 (object == NULL) || 1107 (object->flags & OBJ_DEAD)) { 1108 vm_page_and_queue_spin_lock(m); 1109 if (m->queue - m->pc == PQ_INACTIVE) { 1110 TAILQ_REMOVE( 1111 &vm_page_queues[m->queue].pl, 1112 m, pageq); 1113 TAILQ_INSERT_TAIL( 1114 &vm_page_queues[m->queue].pl, 1115 m, pageq); 1116 ++vm_swapcache_inactive_heuristic; 1117 } 1118 vm_page_and_queue_spin_unlock(m); 1119 vm_page_wakeup(m); 1120 return 0; 1121 } 1122 1123 /* 1124 * (m) is still busied. 1125 * 1126 * The object is already known NOT to be dead. It 1127 * is possible for the vget() to block the whole 1128 * pageout daemon, but the new low-memory handling 1129 * code should prevent it. 1130 * 1131 * The previous code skipped locked vnodes and, worse, 1132 * reordered pages in the queue. This results in 1133 * completely non-deterministic operation because, 1134 * quite often, a vm_fault has initiated an I/O and 1135 * is holding a locked vnode at just the point where 1136 * the pageout daemon is woken up. 1137 * 1138 * We can't wait forever for the vnode lock, we might 1139 * deadlock due to a vn_read() getting stuck in 1140 * vm_wait while holding this vnode. We skip the 1141 * vnode if we can't get it in a reasonable amount 1142 * of time. 1143 * 1144 * vpfailed is used to (try to) avoid the case where 1145 * a large number of pages are associated with a 1146 * locked vnode, which could cause the pageout daemon 1147 * to stall for an excessive amount of time. 1148 */ 1149 if (object->type == OBJT_VNODE) { 1150 int flags; 1151 1152 vp = object->handle; 1153 flags = LK_EXCLUSIVE; 1154 if (vp == *vpfailedp) 1155 flags |= LK_NOWAIT; 1156 else 1157 flags |= LK_TIMELOCK; 1158 vm_page_hold(m); 1159 vm_page_wakeup(m); 1160 1161 /* 1162 * We have unbusied (m) temporarily so we can 1163 * acquire the vp lock without deadlocking. 1164 * (m) is held to prevent destruction. 1165 */ 1166 if (vget(vp, flags) != 0) { 1167 *vpfailedp = vp; 1168 ++pageout_lock_miss; 1169 if (object->flags & OBJ_MIGHTBEDIRTY) 1170 ++*vnodes_skippedp; 1171 vm_page_unhold(m); 1172 return 0; 1173 } 1174 1175 /* 1176 * The page might have been moved to another 1177 * queue during potential blocking in vget() 1178 * above. The page might have been freed and 1179 * reused for another vnode. The object might 1180 * have been reused for another vnode. 1181 */ 1182 if (m->queue - m->pc != PQ_INACTIVE || 1183 m->object != object || 1184 object->handle != vp) { 1185 if (object->flags & OBJ_MIGHTBEDIRTY) 1186 ++*vnodes_skippedp; 1187 vput(vp); 1188 vm_page_unhold(m); 1189 return 0; 1190 } 1191 1192 /* 1193 * The page may have been busied during the 1194 * blocking in vput(); We don't move the 1195 * page back onto the end of the queue so that 1196 * statistics are more correct if we don't. 1197 */ 1198 if (vm_page_busy_try(m, TRUE)) { 1199 vput(vp); 1200 vm_page_unhold(m); 1201 return 0; 1202 } 1203 vm_page_unhold(m); 1204 1205 /* 1206 * (m) is busied again 1207 * 1208 * We own the busy bit and remove our hold 1209 * bit. If the page is still held it 1210 * might be undergoing I/O, so skip it. 1211 */ 1212 if (m->hold_count) { 1213 vm_page_and_queue_spin_lock(m); 1214 if (m->queue - m->pc == PQ_INACTIVE) { 1215 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 1216 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq); 1217 ++vm_swapcache_inactive_heuristic; 1218 } 1219 vm_page_and_queue_spin_unlock(m); 1220 if (object->flags & OBJ_MIGHTBEDIRTY) 1221 ++*vnodes_skippedp; 1222 vm_page_wakeup(m); 1223 vput(vp); 1224 return 0; 1225 } 1226 /* (m) is left busied as we fall through */ 1227 } 1228 1229 /* 1230 * page is busy and not held here. 1231 * 1232 * If a page is dirty, then it is either being washed 1233 * (but not yet cleaned) or it is still in the 1234 * laundry. If it is still in the laundry, then we 1235 * start the cleaning operation. 1236 * 1237 * decrement inactive_shortage on success to account 1238 * for the (future) cleaned page. Otherwise we 1239 * could wind up laundering or cleaning too many 1240 * pages. 1241 * 1242 * NOTE: Cleaning the page here does not cause 1243 * force_deficit to be adjusted, because the 1244 * page is not being freed or moved to the 1245 * cache. 1246 */ 1247 count = vm_pageout_clean_helper(m, vmflush_flags); 1248 *max_launderp -= count; 1249 1250 /* 1251 * Clean ate busy, page no longer accessible 1252 */ 1253 if (vp != NULL) 1254 vput(vp); 1255 } else { 1256 vm_page_wakeup(m); 1257 } 1258 return count; 1259 } 1260 1261 /* 1262 * Scan active queue 1263 * 1264 * WARNING! Can be called from two pagedaemon threads simultaneously. 1265 */ 1266 static int 1267 vm_pageout_scan_active(int pass, int q, 1268 int avail_shortage, int inactive_shortage, 1269 int *recycle_countp) 1270 { 1271 struct vm_page marker; 1272 vm_page_t m; 1273 int actcount; 1274 int delta = 0; 1275 int maxscan; 1276 int isep; 1277 1278 isep = (curthread == emergpager); 1279 1280 /* 1281 * We want to move pages from the active queue to the inactive 1282 * queue to get the inactive queue to the inactive target. If 1283 * we still have a page shortage from above we try to directly free 1284 * clean pages instead of moving them. 1285 * 1286 * If we do still have a shortage we keep track of the number of 1287 * pages we free or cache (recycle_count) as a measure of thrashing 1288 * between the active and inactive queues. 1289 * 1290 * If we were able to completely satisfy the free+cache targets 1291 * from the inactive pool we limit the number of pages we move 1292 * from the active pool to the inactive pool to 2x the pages we 1293 * had removed from the inactive pool (with a minimum of 1/5 the 1294 * inactive target). If we were not able to completely satisfy 1295 * the free+cache targets we go for the whole target aggressively. 1296 * 1297 * NOTE: Both variables can end up negative. 1298 * NOTE: We are still in a critical section. 1299 * 1300 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 1301 * PAGES. 1302 */ 1303 1304 bzero(&marker, sizeof(marker)); 1305 marker.flags = PG_FICTITIOUS | PG_MARKER; 1306 marker.busy_count = PBUSY_LOCKED; 1307 marker.queue = PQ_ACTIVE + q; 1308 marker.pc = q; 1309 marker.wire_count = 1; 1310 1311 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1312 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1313 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt; 1314 1315 /* 1316 * Queue locked at top of loop to avoid stack marker issues. 1317 */ 1318 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1319 maxscan-- > 0 && (avail_shortage - delta > 0 || 1320 inactive_shortage > 0)) 1321 { 1322 KKASSERT(m->queue == PQ_ACTIVE + q); 1323 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1324 &marker, pageq); 1325 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1326 &marker, pageq); 1327 1328 /* 1329 * Skip marker pages (atomic against other markers to avoid 1330 * infinite hop-over scans). 1331 */ 1332 if (m->flags & PG_MARKER) 1333 continue; 1334 1335 /* 1336 * Try to busy the page. Don't mess with pages which are 1337 * already busy or reorder them in the queue. 1338 */ 1339 if (vm_page_busy_try(m, TRUE)) 1340 continue; 1341 1342 /* 1343 * Remaining operations run with the page busy and neither 1344 * the page or the queue will be spin-locked. 1345 */ 1346 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1347 KKASSERT(m->queue == PQ_ACTIVE + q); 1348 1349 /* 1350 * Don't deactivate pages that are held, even if we can 1351 * busy them. (XXX why not?) 1352 */ 1353 if (m->hold_count != 0) { 1354 vm_page_and_queue_spin_lock(m); 1355 if (m->queue - m->pc == PQ_ACTIVE) { 1356 TAILQ_REMOVE( 1357 &vm_page_queues[PQ_ACTIVE + q].pl, 1358 m, pageq); 1359 TAILQ_INSERT_TAIL( 1360 &vm_page_queues[PQ_ACTIVE + q].pl, 1361 m, pageq); 1362 } 1363 vm_page_and_queue_spin_unlock(m); 1364 vm_page_wakeup(m); 1365 goto next; 1366 } 1367 1368 /* 1369 * The emergency pager ignores vnode-backed pages as these 1370 * are the pages that probably bricked the main pager. 1371 */ 1372 if (isep && m->object && m->object->type == OBJT_VNODE) { 1373 vm_page_and_queue_spin_lock(m); 1374 if (m->queue - m->pc == PQ_ACTIVE) { 1375 TAILQ_REMOVE( 1376 &vm_page_queues[PQ_ACTIVE + q].pl, 1377 m, pageq); 1378 TAILQ_INSERT_TAIL( 1379 &vm_page_queues[PQ_ACTIVE + q].pl, 1380 m, pageq); 1381 } 1382 vm_page_and_queue_spin_unlock(m); 1383 vm_page_wakeup(m); 1384 goto next; 1385 } 1386 1387 /* 1388 * The count for pagedaemon pages is done after checking the 1389 * page for eligibility... 1390 */ 1391 mycpu->gd_cnt.v_pdpages++; 1392 1393 /* 1394 * Check to see "how much" the page has been used and clear 1395 * the tracking access bits. If the object has no references 1396 * don't bother paying the expense. 1397 */ 1398 actcount = 0; 1399 if (m->object && m->object->ref_count != 0) { 1400 if (m->flags & PG_REFERENCED) 1401 ++actcount; 1402 actcount += pmap_ts_referenced(m); 1403 if (actcount) { 1404 m->act_count += ACT_ADVANCE + actcount; 1405 if (m->act_count > ACT_MAX) 1406 m->act_count = ACT_MAX; 1407 } 1408 } 1409 vm_page_flag_clear(m, PG_REFERENCED); 1410 1411 /* 1412 * actcount is only valid if the object ref_count is non-zero. 1413 * If the page does not have an object, actcount will be zero. 1414 */ 1415 if (actcount && m->object->ref_count != 0) { 1416 vm_page_and_queue_spin_lock(m); 1417 if (m->queue - m->pc == PQ_ACTIVE) { 1418 TAILQ_REMOVE( 1419 &vm_page_queues[PQ_ACTIVE + q].pl, 1420 m, pageq); 1421 TAILQ_INSERT_TAIL( 1422 &vm_page_queues[PQ_ACTIVE + q].pl, 1423 m, pageq); 1424 } 1425 vm_page_and_queue_spin_unlock(m); 1426 vm_page_wakeup(m); 1427 } else { 1428 switch(m->object->type) { 1429 case OBJT_DEFAULT: 1430 case OBJT_SWAP: 1431 m->act_count -= min(m->act_count, 1432 vm_anonmem_decline); 1433 break; 1434 default: 1435 m->act_count -= min(m->act_count, 1436 vm_filemem_decline); 1437 break; 1438 } 1439 if (vm_pageout_algorithm || 1440 (m->object == NULL) || 1441 (m->object && (m->object->ref_count == 0)) || 1442 m->act_count < pass + 1 1443 ) { 1444 /* 1445 * Deactivate the page. If we had a 1446 * shortage from our inactive scan try to 1447 * free (cache) the page instead. 1448 * 1449 * Don't just blindly cache the page if 1450 * we do not have a shortage from the 1451 * inactive scan, that could lead to 1452 * gigabytes being moved. 1453 */ 1454 --inactive_shortage; 1455 if (avail_shortage - delta > 0 || 1456 (m->object && (m->object->ref_count == 0))) 1457 { 1458 if (avail_shortage - delta > 0) 1459 ++*recycle_countp; 1460 vm_page_protect(m, VM_PROT_NONE); 1461 if (m->dirty == 0 && 1462 (m->flags & PG_NEED_COMMIT) == 0 && 1463 avail_shortage - delta > 0) { 1464 vm_page_cache(m); 1465 } else { 1466 vm_page_deactivate(m); 1467 vm_page_wakeup(m); 1468 } 1469 } else { 1470 vm_page_deactivate(m); 1471 vm_page_wakeup(m); 1472 } 1473 ++delta; 1474 } else { 1475 vm_page_and_queue_spin_lock(m); 1476 if (m->queue - m->pc == PQ_ACTIVE) { 1477 TAILQ_REMOVE( 1478 &vm_page_queues[PQ_ACTIVE + q].pl, 1479 m, pageq); 1480 TAILQ_INSERT_TAIL( 1481 &vm_page_queues[PQ_ACTIVE + q].pl, 1482 m, pageq); 1483 } 1484 vm_page_and_queue_spin_unlock(m); 1485 vm_page_wakeup(m); 1486 } 1487 } 1488 next: 1489 lwkt_yield(); 1490 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1491 } 1492 1493 /* 1494 * Clean out our local marker. 1495 * 1496 * Page queue still spin-locked. 1497 */ 1498 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1499 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1500 1501 return (delta); 1502 } 1503 1504 /* 1505 * The number of actually free pages can drop down to v_free_reserved, 1506 * we try to build the free count back above v_free_min. Note that 1507 * vm_paging_needed() also returns TRUE if v_free_count is not at 1508 * least v_free_min so that is the minimum we must build the free 1509 * count to. 1510 * 1511 * We use a slightly higher target to improve hysteresis, 1512 * ((v_free_target + v_free_min) / 2). Since v_free_target 1513 * is usually the same as v_cache_min this maintains about 1514 * half the pages in the free queue as are in the cache queue, 1515 * providing pretty good pipelining for pageout operation. 1516 * 1517 * The system operator can manipulate vm.v_cache_min and 1518 * vm.v_free_target to tune the pageout demon. Be sure 1519 * to keep vm.v_free_min < vm.v_free_target. 1520 * 1521 * Note that the original paging target is to get at least 1522 * (free_min + cache_min) into (free + cache). The slightly 1523 * higher target will shift additional pages from cache to free 1524 * without effecting the original paging target in order to 1525 * maintain better hysteresis and not have the free count always 1526 * be dead-on v_free_min. 1527 * 1528 * NOTE: we are still in a critical section. 1529 * 1530 * Pages moved from PQ_CACHE to totally free are not counted in the 1531 * pages_freed counter. 1532 * 1533 * WARNING! Can be called from two pagedaemon threads simultaneously. 1534 */ 1535 static void 1536 vm_pageout_scan_cache(int avail_shortage, int pass, 1537 int vnodes_skipped, int recycle_count) 1538 { 1539 static int lastkillticks; 1540 struct vm_pageout_scan_info info; 1541 vm_page_t m; 1542 int isep; 1543 1544 isep = (curthread == emergpager); 1545 1546 while (vmstats.v_free_count < 1547 (vmstats.v_free_min + vmstats.v_free_target) / 2) { 1548 /* 1549 * This steals some code from vm/vm_page.c 1550 * 1551 * Create two rovers and adjust the code to reduce 1552 * chances of them winding up at the same index (which 1553 * can cause a lot of contention). 1554 */ 1555 static int cache_rover[2] = { 0, PQ_L2_MASK / 2 }; 1556 1557 if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0) 1558 goto next_rover; 1559 1560 m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK); 1561 if (m == NULL) 1562 break; 1563 /* page is returned removed from its queue and spinlocked */ 1564 if (vm_page_busy_try(m, TRUE)) { 1565 vm_page_deactivate_locked(m); 1566 vm_page_spin_unlock(m); 1567 continue; 1568 } 1569 vm_page_spin_unlock(m); 1570 pagedaemon_wakeup(); 1571 lwkt_yield(); 1572 1573 /* 1574 * Remaining operations run with the page busy and neither 1575 * the page or the queue will be spin-locked. 1576 */ 1577 if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) || 1578 m->hold_count || 1579 m->wire_count) { 1580 vm_page_deactivate(m); 1581 vm_page_wakeup(m); 1582 continue; 1583 } 1584 KKASSERT((m->flags & PG_MAPPED) == 0); 1585 KKASSERT(m->dirty == 0); 1586 vm_pageout_page_free(m); 1587 mycpu->gd_cnt.v_dfree++; 1588 next_rover: 1589 if (isep) 1590 cache_rover[1] -= PQ_PRIME2; 1591 else 1592 cache_rover[0] += PQ_PRIME2; 1593 } 1594 1595 #if !defined(NO_SWAPPING) 1596 /* 1597 * Idle process swapout -- run once per second. 1598 */ 1599 if (vm_swap_idle_enabled) { 1600 static time_t lsec; 1601 if (time_uptime != lsec) { 1602 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE); 1603 vm_req_vmdaemon(); 1604 lsec = time_uptime; 1605 } 1606 } 1607 #endif 1608 1609 /* 1610 * If we didn't get enough free pages, and we have skipped a vnode 1611 * in a writeable object, wakeup the sync daemon. And kick swapout 1612 * if we did not get enough free pages. 1613 */ 1614 if (vm_paging_target() > 0) { 1615 if (vnodes_skipped && vm_page_count_min(0)) 1616 speedup_syncer(NULL); 1617 #if !defined(NO_SWAPPING) 1618 if (vm_swap_enabled && vm_page_count_target()) { 1619 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL); 1620 vm_req_vmdaemon(); 1621 } 1622 #endif 1623 } 1624 1625 /* 1626 * Handle catastrophic conditions. Under good conditions we should 1627 * be at the target, well beyond our minimum. If we could not even 1628 * reach our minimum the system is under heavy stress. But just being 1629 * under heavy stress does not trigger process killing. 1630 * 1631 * We consider ourselves to have run out of memory if the swap pager 1632 * is full and avail_shortage is still positive. The secondary check 1633 * ensures that we do not kill processes if the instantanious 1634 * availability is good, even if the pageout demon pass says it 1635 * couldn't get to the target. 1636 * 1637 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL 1638 * SITUATIONS. 1639 */ 1640 if (swap_pager_almost_full && 1641 pass > 0 && 1642 isep == 0 && 1643 (vm_page_count_min(recycle_count) || avail_shortage > 0)) { 1644 kprintf("Warning: system low on memory+swap " 1645 "shortage %d for %d ticks!\n", 1646 avail_shortage, ticks - swap_fail_ticks); 1647 if (bootverbose) 1648 kprintf("Metrics: spaf=%d spf=%d pass=%d avail=%d target=%d last=%u\n", 1649 swap_pager_almost_full, 1650 swap_pager_full, 1651 pass, 1652 avail_shortage, 1653 vm_paging_target(), 1654 (unsigned int)(ticks - lastkillticks)); 1655 } 1656 if (swap_pager_full && 1657 pass > 1 && 1658 isep == 0 && 1659 avail_shortage > 0 && 1660 vm_paging_target() > 0 && 1661 (unsigned int)(ticks - lastkillticks) >= hz) { 1662 /* 1663 * Kill something, maximum rate once per second to give 1664 * the process time to free up sufficient memory. 1665 */ 1666 lastkillticks = ticks; 1667 info.bigproc = NULL; 1668 info.bigsize = 0; 1669 allproc_scan(vm_pageout_scan_callback, &info, 0); 1670 if (info.bigproc != NULL) { 1671 kprintf("Try to kill process %d %s\n", 1672 info.bigproc->p_pid, info.bigproc->p_comm); 1673 info.bigproc->p_nice = PRIO_MIN; 1674 info.bigproc->p_usched->resetpriority( 1675 FIRST_LWP_IN_PROC(info.bigproc)); 1676 atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL); 1677 killproc(info.bigproc, "out of swap space"); 1678 wakeup(&vmstats.v_free_count); 1679 PRELE(info.bigproc); 1680 } 1681 } 1682 } 1683 1684 static int 1685 vm_pageout_scan_callback(struct proc *p, void *data) 1686 { 1687 struct vm_pageout_scan_info *info = data; 1688 vm_offset_t size; 1689 1690 /* 1691 * Never kill system processes or init. If we have configured swap 1692 * then try to avoid killing low-numbered pids. 1693 */ 1694 if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) || 1695 ((p->p_pid < 48) && (vm_swap_size != 0))) { 1696 return (0); 1697 } 1698 1699 lwkt_gettoken(&p->p_token); 1700 1701 /* 1702 * if the process is in a non-running type state, 1703 * don't touch it. 1704 */ 1705 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 1706 lwkt_reltoken(&p->p_token); 1707 return (0); 1708 } 1709 1710 /* 1711 * Get the approximate process size. Note that anonymous pages 1712 * with backing swap will be counted twice, but there should not 1713 * be too many such pages due to the stress the VM system is 1714 * under at this point. 1715 */ 1716 size = vmspace_anonymous_count(p->p_vmspace) + 1717 vmspace_swap_count(p->p_vmspace); 1718 1719 /* 1720 * If the this process is bigger than the biggest one 1721 * remember it. 1722 */ 1723 if (info->bigsize < size) { 1724 if (info->bigproc) 1725 PRELE(info->bigproc); 1726 PHOLD(p); 1727 info->bigproc = p; 1728 info->bigsize = size; 1729 } 1730 lwkt_reltoken(&p->p_token); 1731 lwkt_yield(); 1732 1733 return(0); 1734 } 1735 1736 /* 1737 * This routine tries to maintain the pseudo LRU active queue, 1738 * so that during long periods of time where there is no paging, 1739 * that some statistic accumulation still occurs. This code 1740 * helps the situation where paging just starts to occur. 1741 */ 1742 static void 1743 vm_pageout_page_stats(int q) 1744 { 1745 static int fullintervalcount = 0; 1746 struct vm_page marker; 1747 vm_page_t m; 1748 int pcount, tpcount; /* Number of pages to check */ 1749 int page_shortage; 1750 1751 page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max + 1752 vmstats.v_free_min) - 1753 (vmstats.v_free_count + vmstats.v_inactive_count + 1754 vmstats.v_cache_count); 1755 1756 if (page_shortage <= 0) 1757 return; 1758 1759 pcount = vm_page_queues[PQ_ACTIVE + q].lcnt; 1760 fullintervalcount += vm_pageout_stats_interval; 1761 if (fullintervalcount < vm_pageout_full_stats_interval) { 1762 tpcount = (vm_pageout_stats_max * pcount) / 1763 vmstats.v_page_count + 1; 1764 if (pcount > tpcount) 1765 pcount = tpcount; 1766 } else { 1767 fullintervalcount = 0; 1768 } 1769 1770 bzero(&marker, sizeof(marker)); 1771 marker.flags = PG_FICTITIOUS | PG_MARKER; 1772 marker.busy_count = PBUSY_LOCKED; 1773 marker.queue = PQ_ACTIVE + q; 1774 marker.pc = q; 1775 marker.wire_count = 1; 1776 1777 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1778 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1779 1780 /* 1781 * Queue locked at top of loop to avoid stack marker issues. 1782 */ 1783 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1784 pcount-- > 0) 1785 { 1786 int actcount; 1787 1788 KKASSERT(m->queue == PQ_ACTIVE + q); 1789 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1790 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1791 &marker, pageq); 1792 1793 /* 1794 * Skip marker pages (atomic against other markers to avoid 1795 * infinite hop-over scans). 1796 */ 1797 if (m->flags & PG_MARKER) 1798 continue; 1799 1800 /* 1801 * Ignore pages we can't busy 1802 */ 1803 if (vm_page_busy_try(m, TRUE)) 1804 continue; 1805 1806 /* 1807 * Remaining operations run with the page busy and neither 1808 * the page or the queue will be spin-locked. 1809 */ 1810 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1811 KKASSERT(m->queue == PQ_ACTIVE + q); 1812 1813 /* 1814 * We now have a safely busied page, the page and queue 1815 * spinlocks have been released. 1816 * 1817 * Ignore held pages 1818 */ 1819 if (m->hold_count) { 1820 vm_page_wakeup(m); 1821 goto next; 1822 } 1823 1824 /* 1825 * Calculate activity 1826 */ 1827 actcount = 0; 1828 if (m->flags & PG_REFERENCED) { 1829 vm_page_flag_clear(m, PG_REFERENCED); 1830 actcount += 1; 1831 } 1832 actcount += pmap_ts_referenced(m); 1833 1834 /* 1835 * Update act_count and move page to end of queue. 1836 */ 1837 if (actcount) { 1838 m->act_count += ACT_ADVANCE + actcount; 1839 if (m->act_count > ACT_MAX) 1840 m->act_count = ACT_MAX; 1841 vm_page_and_queue_spin_lock(m); 1842 if (m->queue - m->pc == PQ_ACTIVE) { 1843 TAILQ_REMOVE( 1844 &vm_page_queues[PQ_ACTIVE + q].pl, 1845 m, pageq); 1846 TAILQ_INSERT_TAIL( 1847 &vm_page_queues[PQ_ACTIVE + q].pl, 1848 m, pageq); 1849 } 1850 vm_page_and_queue_spin_unlock(m); 1851 vm_page_wakeup(m); 1852 goto next; 1853 } 1854 1855 if (m->act_count == 0) { 1856 /* 1857 * We turn off page access, so that we have 1858 * more accurate RSS stats. We don't do this 1859 * in the normal page deactivation when the 1860 * system is loaded VM wise, because the 1861 * cost of the large number of page protect 1862 * operations would be higher than the value 1863 * of doing the operation. 1864 * 1865 * We use the marker to save our place so 1866 * we can release the spin lock. both (m) 1867 * and (next) will be invalid. 1868 */ 1869 vm_page_protect(m, VM_PROT_NONE); 1870 vm_page_deactivate(m); 1871 } else { 1872 m->act_count -= min(m->act_count, ACT_DECLINE); 1873 vm_page_and_queue_spin_lock(m); 1874 if (m->queue - m->pc == PQ_ACTIVE) { 1875 TAILQ_REMOVE( 1876 &vm_page_queues[PQ_ACTIVE + q].pl, 1877 m, pageq); 1878 TAILQ_INSERT_TAIL( 1879 &vm_page_queues[PQ_ACTIVE + q].pl, 1880 m, pageq); 1881 } 1882 vm_page_and_queue_spin_unlock(m); 1883 } 1884 vm_page_wakeup(m); 1885 next: 1886 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1887 } 1888 1889 /* 1890 * Remove our local marker 1891 * 1892 * Page queue still spin-locked. 1893 */ 1894 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1895 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1896 } 1897 1898 static int 1899 vm_pageout_free_page_calc(vm_size_t count) 1900 { 1901 if (count < vmstats.v_page_count) 1902 return 0; 1903 /* 1904 * free_reserved needs to include enough for the largest swap pager 1905 * structures plus enough for any pv_entry structs when paging. 1906 * 1907 * v_free_min normal allocations 1908 * v_free_reserved system allocations 1909 * v_pageout_free_min allocations by pageout daemon 1910 * v_interrupt_free_min low level allocations (e.g swap structures) 1911 */ 1912 if (vmstats.v_page_count > 1024) 1913 vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200; 1914 else 1915 vmstats.v_free_min = 64; 1916 1917 /* 1918 * Make sure the vmmeter slop can't blow out our global minimums. 1919 * 1920 * However, to accomodate weird configurations (vkernels with many 1921 * cpus and little memory, or artifically reduced hw.physmem), do 1922 * not allow v_free_min to exceed 1/20 of ram or the pageout demon 1923 * will go out of control. 1924 */ 1925 if (vmstats.v_free_min < VMMETER_SLOP_COUNT * ncpus * 10) 1926 vmstats.v_free_min = VMMETER_SLOP_COUNT * ncpus * 10; 1927 if (vmstats.v_free_min > vmstats.v_page_count / 20) 1928 vmstats.v_free_min = vmstats.v_page_count / 20; 1929 1930 vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7; 1931 vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0; 1932 vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7; 1933 vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7; 1934 1935 return 1; 1936 } 1937 1938 1939 /* 1940 * vm_pageout is the high level pageout daemon. TWO kernel threads run 1941 * this daemon, the primary pageout daemon and the emergency pageout daemon. 1942 * 1943 * The emergency pageout daemon takes over when the primary pageout daemon 1944 * deadlocks. The emergency pageout daemon ONLY pages out to swap, thus 1945 * avoiding the many low-memory deadlocks which can occur when paging out 1946 * to VFS's. 1947 */ 1948 static void 1949 vm_pageout_thread(void) 1950 { 1951 int pass; 1952 int q; 1953 int q1iterator = 0; 1954 int q2iterator = 0; 1955 int isep; 1956 1957 curthread->td_flags |= TDF_SYSTHREAD; 1958 1959 /* 1960 * We only need to setup once. 1961 */ 1962 isep = 0; 1963 if (curthread == emergpager) { 1964 isep = 1; 1965 goto skip_setup; 1966 } 1967 1968 /* 1969 * Initialize some paging parameters. 1970 */ 1971 vm_pageout_free_page_calc(vmstats.v_page_count); 1972 1973 /* 1974 * v_free_target and v_cache_min control pageout hysteresis. Note 1975 * that these are more a measure of the VM cache queue hysteresis 1976 * then the VM free queue. Specifically, v_free_target is the 1977 * high water mark (free+cache pages). 1978 * 1979 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the 1980 * low water mark, while v_free_min is the stop. v_cache_min must 1981 * be big enough to handle memory needs while the pageout daemon 1982 * is signalled and run to free more pages. 1983 */ 1984 if (vmstats.v_free_count > 6144) 1985 vmstats.v_free_target = 4 * vmstats.v_free_min + 1986 vmstats.v_free_reserved; 1987 else 1988 vmstats.v_free_target = 2 * vmstats.v_free_min + 1989 vmstats.v_free_reserved; 1990 1991 /* 1992 * NOTE: With the new buffer cache b_act_count we want the default 1993 * inactive target to be a percentage of available memory. 1994 * 1995 * The inactive target essentially determines the minimum 1996 * number of 'temporary' pages capable of caching one-time-use 1997 * files when the VM system is otherwise full of pages 1998 * belonging to multi-time-use files or active program data. 1999 * 2000 * NOTE: The inactive target is aggressively persued only if the 2001 * inactive queue becomes too small. If the inactive queue 2002 * is large enough to satisfy page movement to free+cache 2003 * then it is repopulated more slowly from the active queue. 2004 * This allows a general inactive_target default to be set. 2005 * 2006 * There is an issue here for processes which sit mostly idle 2007 * 'overnight', such as sshd, tcsh, and X. Any movement from 2008 * the active queue will eventually cause such pages to 2009 * recycle eventually causing a lot of paging in the morning. 2010 * To reduce the incidence of this pages cycled out of the 2011 * buffer cache are moved directly to the inactive queue if 2012 * they were only used once or twice. 2013 * 2014 * The vfs.vm_cycle_point sysctl can be used to adjust this. 2015 * Increasing the value (up to 64) increases the number of 2016 * buffer recyclements which go directly to the inactive queue. 2017 */ 2018 if (vmstats.v_free_count > 2048) { 2019 vmstats.v_cache_min = vmstats.v_free_target; 2020 vmstats.v_cache_max = 2 * vmstats.v_cache_min; 2021 } else { 2022 vmstats.v_cache_min = 0; 2023 vmstats.v_cache_max = 0; 2024 } 2025 vmstats.v_inactive_target = vmstats.v_free_count / 4; 2026 2027 /* XXX does not really belong here */ 2028 if (vm_page_max_wired == 0) 2029 vm_page_max_wired = vmstats.v_free_count / 3; 2030 2031 if (vm_pageout_stats_max == 0) 2032 vm_pageout_stats_max = vmstats.v_free_target; 2033 2034 /* 2035 * Set interval in seconds for stats scan. 2036 */ 2037 if (vm_pageout_stats_interval == 0) 2038 vm_pageout_stats_interval = 5; 2039 if (vm_pageout_full_stats_interval == 0) 2040 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; 2041 2042 2043 /* 2044 * Set maximum free per pass 2045 */ 2046 if (vm_pageout_stats_free_max == 0) 2047 vm_pageout_stats_free_max = 5; 2048 2049 swap_pager_swap_init(); 2050 pass = 0; 2051 2052 atomic_swap_int(&sequence_emerg_pager, 1); 2053 wakeup(&sequence_emerg_pager); 2054 2055 skip_setup: 2056 /* 2057 * Sequence emergency pager startup 2058 */ 2059 if (isep) { 2060 while (sequence_emerg_pager == 0) 2061 tsleep(&sequence_emerg_pager, 0, "pstartup", hz); 2062 } 2063 2064 /* 2065 * The pageout daemon is never done, so loop forever. 2066 * 2067 * WARNING! This code is being executed by two kernel threads 2068 * potentially simultaneously. 2069 */ 2070 while (TRUE) { 2071 int error; 2072 int avail_shortage; 2073 int inactive_shortage; 2074 int vnodes_skipped = 0; 2075 int recycle_count = 0; 2076 int tmp; 2077 2078 /* 2079 * Wait for an action request. If we timeout check to 2080 * see if paging is needed (in case the normal wakeup 2081 * code raced us). 2082 */ 2083 if (isep) { 2084 /* 2085 * Emergency pagedaemon monitors the primary 2086 * pagedaemon while vm_pages_needed != 0. 2087 * 2088 * The emergency pagedaemon only runs if VM paging 2089 * is needed and the primary pagedaemon has not 2090 * updated vm_pagedaemon_time for more than 2 seconds. 2091 */ 2092 if (vm_pages_needed) 2093 tsleep(&vm_pagedaemon_time, 0, "psleep", hz); 2094 else 2095 tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10); 2096 if (vm_pages_needed == 0) { 2097 pass = 0; 2098 continue; 2099 } 2100 if ((int)(ticks - vm_pagedaemon_time) < hz * 2) { 2101 pass = 0; 2102 continue; 2103 } 2104 } else { 2105 /* 2106 * Primary pagedaemon 2107 */ 2108 if (vm_pages_needed == 0) { 2109 error = tsleep(&vm_pages_needed, 2110 0, "psleep", 2111 vm_pageout_stats_interval * hz); 2112 if (error && 2113 vm_paging_needed() == 0 && 2114 vm_pages_needed == 0) { 2115 for (q = 0; q < PQ_L2_SIZE; ++q) 2116 vm_pageout_page_stats(q); 2117 continue; 2118 } 2119 vm_pagedaemon_time = ticks; 2120 vm_pages_needed = 1; 2121 2122 /* 2123 * Wake the emergency pagedaemon up so it 2124 * can monitor us. It will automatically 2125 * go back into a long sleep when 2126 * vm_pages_needed returns to 0. 2127 */ 2128 wakeup(&vm_pagedaemon_time); 2129 } 2130 } 2131 2132 mycpu->gd_cnt.v_pdwakeups++; 2133 2134 /* 2135 * Scan for INACTIVE->CLEAN/PAGEOUT 2136 * 2137 * This routine tries to avoid thrashing the system with 2138 * unnecessary activity. 2139 * 2140 * Calculate our target for the number of free+cache pages we 2141 * want to get to. This is higher then the number that causes 2142 * allocations to stall (severe) in order to provide hysteresis, 2143 * and if we don't make it all the way but get to the minimum 2144 * we're happy. Goose it a bit if there are multiple requests 2145 * for memory. 2146 * 2147 * Don't reduce avail_shortage inside the loop or the 2148 * PQAVERAGE() calculation will break. 2149 * 2150 * NOTE! deficit is differentiated from avail_shortage as 2151 * REQUIRING at least (deficit) pages to be cleaned, 2152 * even if the page queues are in good shape. This 2153 * is used primarily for handling per-process 2154 * RLIMIT_RSS and may also see small values when 2155 * processes block due to low memory. 2156 */ 2157 vmstats_rollup(); 2158 if (isep == 0) 2159 vm_pagedaemon_time = ticks; 2160 avail_shortage = vm_paging_target() + vm_pageout_deficit; 2161 vm_pageout_deficit = 0; 2162 2163 if (avail_shortage > 0) { 2164 int delta = 0; 2165 int qq; 2166 2167 qq = q1iterator; 2168 for (q = 0; q < PQ_L2_SIZE; ++q) { 2169 delta += vm_pageout_scan_inactive( 2170 pass, 2171 qq & PQ_L2_MASK, 2172 PQAVERAGE(avail_shortage), 2173 &vnodes_skipped); 2174 if (isep) 2175 --qq; 2176 else 2177 ++qq; 2178 if (avail_shortage - delta <= 0) 2179 break; 2180 } 2181 avail_shortage -= delta; 2182 q1iterator = qq; 2183 } 2184 2185 /* 2186 * Figure out how many active pages we must deactivate. If 2187 * we were able to reach our target with just the inactive 2188 * scan above we limit the number of active pages we 2189 * deactivate to reduce unnecessary work. 2190 */ 2191 vmstats_rollup(); 2192 if (isep == 0) 2193 vm_pagedaemon_time = ticks; 2194 inactive_shortage = vmstats.v_inactive_target - 2195 vmstats.v_inactive_count; 2196 2197 /* 2198 * If we were unable to free sufficient inactive pages to 2199 * satisfy the free/cache queue requirements then simply 2200 * reaching the inactive target may not be good enough. 2201 * Try to deactivate pages in excess of the target based 2202 * on the shortfall. 2203 * 2204 * However to prevent thrashing the VM system do not 2205 * deactivate more than an additional 1/10 the inactive 2206 * target's worth of active pages. 2207 */ 2208 if (avail_shortage > 0) { 2209 tmp = avail_shortage * 2; 2210 if (tmp > vmstats.v_inactive_target / 10) 2211 tmp = vmstats.v_inactive_target / 10; 2212 inactive_shortage += tmp; 2213 } 2214 2215 /* 2216 * Only trigger a pmap cleanup on inactive shortage. 2217 */ 2218 if (isep == 0 && inactive_shortage > 0) { 2219 pmap_collect(); 2220 } 2221 2222 /* 2223 * Scan for ACTIVE->INACTIVE 2224 * 2225 * Only trigger on inactive shortage. Triggering on 2226 * avail_shortage can starve the active queue with 2227 * unnecessary active->inactive transitions and destroy 2228 * performance. 2229 * 2230 * If this is the emergency pager, always try to move 2231 * a few pages from active to inactive because the inactive 2232 * queue might have enough pages, but not enough anonymous 2233 * pages. 2234 */ 2235 if (isep && inactive_shortage < vm_emerg_launder) 2236 inactive_shortage = vm_emerg_launder; 2237 2238 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) { 2239 int delta = 0; 2240 int qq; 2241 2242 qq = q2iterator; 2243 for (q = 0; q < PQ_L2_SIZE; ++q) { 2244 delta += vm_pageout_scan_active( 2245 pass, 2246 qq & PQ_L2_MASK, 2247 PQAVERAGE(avail_shortage), 2248 PQAVERAGE(inactive_shortage), 2249 &recycle_count); 2250 if (isep) 2251 --qq; 2252 else 2253 ++qq; 2254 if (inactive_shortage - delta <= 0 && 2255 avail_shortage - delta <= 0) { 2256 break; 2257 } 2258 } 2259 inactive_shortage -= delta; 2260 avail_shortage -= delta; 2261 q2iterator = qq; 2262 } 2263 2264 /* 2265 * Scan for CACHE->FREE 2266 * 2267 * Finally free enough cache pages to meet our free page 2268 * requirement and take more drastic measures if we are 2269 * still in trouble. 2270 */ 2271 vmstats_rollup(); 2272 if (isep == 0) 2273 vm_pagedaemon_time = ticks; 2274 vm_pageout_scan_cache(avail_shortage, pass, 2275 vnodes_skipped, recycle_count); 2276 2277 /* 2278 * Wait for more work. 2279 */ 2280 if (avail_shortage > 0) { 2281 ++pass; 2282 if (pass < 10 && vm_pages_needed > 1) { 2283 /* 2284 * Normal operation, additional processes 2285 * have already kicked us. Retry immediately 2286 * unless swap space is completely full in 2287 * which case delay a bit. 2288 */ 2289 if (swap_pager_full) { 2290 tsleep(&vm_pages_needed, 0, "pdelay", 2291 hz / 5); 2292 } /* else immediate retry */ 2293 } else if (pass < 10) { 2294 /* 2295 * Normal operation, fewer processes. Delay 2296 * a bit but allow wakeups. vm_pages_needed 2297 * is only adjusted against the primary 2298 * pagedaemon here. 2299 */ 2300 if (isep == 0) 2301 vm_pages_needed = 0; 2302 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2303 if (isep == 0) 2304 vm_pages_needed = 1; 2305 } else if (swap_pager_full == 0) { 2306 /* 2307 * We've taken too many passes, forced delay. 2308 */ 2309 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2310 } else { 2311 /* 2312 * Running out of memory, catastrophic 2313 * back-off to one-second intervals. 2314 */ 2315 tsleep(&vm_pages_needed, 0, "pdelay", hz); 2316 } 2317 } else if (vm_pages_needed) { 2318 /* 2319 * Interlocked wakeup of waiters (non-optional). 2320 * 2321 * Similar to vm_page_free_wakeup() in vm_page.c, 2322 * wake 2323 */ 2324 pass = 0; 2325 if (!vm_page_count_min(vm_page_free_hysteresis) || 2326 !vm_page_count_target()) { 2327 vm_pages_needed = 0; 2328 wakeup(&vmstats.v_free_count); 2329 } 2330 } else { 2331 pass = 0; 2332 } 2333 } 2334 } 2335 2336 static struct kproc_desc pg1_kp = { 2337 "pagedaemon", 2338 vm_pageout_thread, 2339 &pagethread 2340 }; 2341 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp); 2342 2343 static struct kproc_desc pg2_kp = { 2344 "emergpager", 2345 vm_pageout_thread, 2346 &emergpager 2347 }; 2348 SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp); 2349 2350 2351 /* 2352 * Called after allocating a page out of the cache or free queue 2353 * to possibly wake the pagedaemon up to replentish our supply. 2354 * 2355 * We try to generate some hysteresis by waking the pagedaemon up 2356 * when our free+cache pages go below the free_min+cache_min level. 2357 * The pagedaemon tries to get the count back up to at least the 2358 * minimum, and through to the target level if possible. 2359 * 2360 * If the pagedaemon is already active bump vm_pages_needed as a hint 2361 * that there are even more requests pending. 2362 * 2363 * SMP races ok? 2364 * No requirements. 2365 */ 2366 void 2367 pagedaemon_wakeup(void) 2368 { 2369 if (vm_paging_needed() && curthread != pagethread) { 2370 if (vm_pages_needed == 0) { 2371 vm_pages_needed = 1; /* SMP race ok */ 2372 wakeup(&vm_pages_needed); 2373 } else if (vm_page_count_min(0)) { 2374 ++vm_pages_needed; /* SMP race ok */ 2375 } 2376 } 2377 } 2378 2379 #if !defined(NO_SWAPPING) 2380 2381 /* 2382 * SMP races ok? 2383 * No requirements. 2384 */ 2385 static void 2386 vm_req_vmdaemon(void) 2387 { 2388 static int lastrun = 0; 2389 2390 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 2391 wakeup(&vm_daemon_needed); 2392 lastrun = ticks; 2393 } 2394 } 2395 2396 static int vm_daemon_callback(struct proc *p, void *data __unused); 2397 2398 /* 2399 * No requirements. 2400 */ 2401 static void 2402 vm_daemon(void) 2403 { 2404 int req_swapout; 2405 2406 while (TRUE) { 2407 tsleep(&vm_daemon_needed, 0, "psleep", 0); 2408 req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0); 2409 2410 /* 2411 * forced swapouts 2412 */ 2413 if (req_swapout) 2414 swapout_procs(vm_pageout_req_swapout); 2415 2416 /* 2417 * scan the processes for exceeding their rlimits or if 2418 * process is swapped out -- deactivate pages 2419 */ 2420 allproc_scan(vm_daemon_callback, NULL, 0); 2421 } 2422 } 2423 2424 static int 2425 vm_daemon_callback(struct proc *p, void *data __unused) 2426 { 2427 struct vmspace *vm; 2428 vm_pindex_t limit, size; 2429 2430 /* 2431 * if this is a system process or if we have already 2432 * looked at this process, skip it. 2433 */ 2434 lwkt_gettoken(&p->p_token); 2435 2436 if (p->p_flags & (P_SYSTEM | P_WEXIT)) { 2437 lwkt_reltoken(&p->p_token); 2438 return (0); 2439 } 2440 2441 /* 2442 * if the process is in a non-running type state, 2443 * don't touch it. 2444 */ 2445 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 2446 lwkt_reltoken(&p->p_token); 2447 return (0); 2448 } 2449 2450 /* 2451 * get a limit 2452 */ 2453 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, 2454 p->p_rlimit[RLIMIT_RSS].rlim_max)); 2455 2456 /* 2457 * let processes that are swapped out really be 2458 * swapped out. Set the limit to nothing to get as 2459 * many pages out to swap as possible. 2460 */ 2461 if (p->p_flags & P_SWAPPEDOUT) 2462 limit = 0; 2463 2464 vm = p->p_vmspace; 2465 vmspace_hold(vm); 2466 size = pmap_resident_tlnw_count(&vm->vm_pmap); 2467 if (limit >= 0 && size > 4096 && 2468 size - 4096 >= limit && vm_pageout_memuse_mode >= 1) { 2469 vm_pageout_map_deactivate_pages(&vm->vm_map, limit); 2470 } 2471 vmspace_drop(vm); 2472 2473 lwkt_reltoken(&p->p_token); 2474 2475 return (0); 2476 } 2477 2478 #endif 2479