1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * The Mach Operating System project at Carnegie-Mellon University. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 37 * 38 * 39 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 40 * All rights reserved. 41 * 42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 43 * 44 * Permission to use, copy, modify and distribute this software and 45 * its documentation is hereby granted, provided that both the copyright 46 * notice and this permission notice appear in all copies of the 47 * software, derivative works or modified versions, and any portions 48 * thereof, and that both notices appear in supporting documentation. 49 * 50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 51 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 53 * 54 * Carnegie Mellon requests users of this software to return to 55 * 56 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 57 * School of Computer Science 58 * Carnegie Mellon University 59 * Pittsburgh PA 15213-3890 60 * 61 * any improvements or extensions that they make and grant Carnegie the 62 * rights to redistribute these changes. 63 * 64 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $ 65 */ 66 67 /* 68 * The proverbial page-out daemon. 
69 */ 70 71 #include "opt_vm.h" 72 #include <sys/param.h> 73 #include <sys/systm.h> 74 #include <sys/kernel.h> 75 #include <sys/proc.h> 76 #include <sys/kthread.h> 77 #include <sys/resourcevar.h> 78 #include <sys/signalvar.h> 79 #include <sys/vnode.h> 80 #include <sys/vmmeter.h> 81 #include <sys/conf.h> 82 #include <sys/sysctl.h> 83 84 #include <vm/vm.h> 85 #include <vm/vm_param.h> 86 #include <sys/lock.h> 87 #include <vm/vm_object.h> 88 #include <vm/vm_page.h> 89 #include <vm/vm_map.h> 90 #include <vm/vm_pageout.h> 91 #include <vm/vm_pager.h> 92 #include <vm/swap_pager.h> 93 #include <vm/vm_extern.h> 94 95 #include <sys/spinlock2.h> 96 #include <vm/vm_page2.h> 97 98 /* 99 * System initialization 100 */ 101 102 /* the kernel process "vm_pageout"*/ 103 static int vm_pageout_page(vm_page_t m, long *max_launderp, 104 long *vnodes_skippedp, struct vnode **vpfailedp, 105 int pass, int vmflush_flags); 106 static int vm_pageout_clean_helper (vm_page_t, int); 107 static int vm_pageout_free_page_calc (vm_size_t count); 108 static void vm_pageout_page_free(vm_page_t m) ; 109 struct thread *emergpager; 110 struct thread *pagethread; 111 static int sequence_emerg_pager; 112 113 #if !defined(NO_SWAPPING) 114 /* the kernel process "vm_daemon"*/ 115 static void vm_daemon (void); 116 static struct thread *vmthread; 117 118 static struct kproc_desc vm_kp = { 119 "vmdaemon", 120 vm_daemon, 121 &vmthread 122 }; 123 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); 124 #endif 125 126 int vm_pages_needed = 0; /* Event on which pageout daemon sleeps */ 127 int vm_pageout_deficit = 0; /* Estimated number of pages deficit */ 128 int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */ 129 int vm_page_free_hysteresis = 16; 130 static int vm_pagedaemon_time; 131 132 #if !defined(NO_SWAPPING) 133 static int vm_pageout_req_swapout; 134 static int vm_daemon_needed; 135 #endif 136 static int vm_max_launder = 4096; 137 static int vm_emerg_launder = 100; 138 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; 139 static int vm_pageout_full_stats_interval = 0; 140 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; 141 static int defer_swap_pageouts=0; 142 static int disable_swap_pageouts=0; 143 static u_int vm_anonmem_decline = ACT_DECLINE; 144 static u_int vm_filemem_decline = ACT_DECLINE * 2; 145 146 #if defined(NO_SWAPPING) 147 static int vm_swap_enabled=0; 148 static int vm_swap_idle_enabled=0; 149 #else 150 static int vm_swap_enabled=1; 151 static int vm_swap_idle_enabled=0; 152 #endif 153 int vm_pageout_memuse_mode=1; /* 0-disable, 1-passive, 2-active swp*/ 154 155 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline, 156 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory"); 157 158 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline, 159 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache"); 160 161 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis, 162 CTLFLAG_RW, &vm_page_free_hysteresis, 0, 163 "Free more pages than the minimum required"); 164 165 SYSCTL_INT(_vm, OID_AUTO, max_launder, 166 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 167 SYSCTL_INT(_vm, OID_AUTO, emerg_launder, 168 CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum"); 169 170 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, 171 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 172 173 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, 174 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full 
stats scan"); 175 176 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, 177 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); 178 179 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, 180 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); 181 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode, 182 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode"); 183 184 #if defined(NO_SWAPPING) 185 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 186 CTLFLAG_RD, &vm_swap_enabled, 0, ""); 187 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 188 CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); 189 #else 190 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 191 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 192 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 193 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 194 #endif 195 196 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 197 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 198 199 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 200 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 201 202 static int pageout_lock_miss; 203 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 204 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 205 206 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 207 208 #if !defined(NO_SWAPPING) 209 static void vm_req_vmdaemon (void); 210 #endif 211 static void vm_pageout_page_stats(int q); 212 213 /* 214 * Calculate approximately how many pages on each queue to try to 215 * clean. An exact calculation creates an edge condition when the 216 * queues are unbalanced so add significant slop. The queue scans 217 * will stop early when targets are reached and will start where they 218 * left off on the next pass. 219 * 220 * We need to be generous here because there are all sorts of loading 221 * conditions that can cause edge cases if try to average over all queues. 222 * In particular, storage subsystems have become so fast that paging 223 * activity can become quite frantic. Eventually we will probably need 224 * two paging threads, one for dirty pages and one for clean, to deal 225 * with the bandwidth requirements. 226 227 * So what we do is calculate a value that can be satisfied nominally by 228 * only having to scan half the queues. 229 */ 230 static __inline long 231 PQAVERAGE(long n) 232 { 233 long avg; 234 235 if (n >= 0) { 236 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1); 237 } else { 238 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1); 239 } 240 return avg; 241 } 242 243 /* 244 * vm_pageout_clean_helper: 245 * 246 * Clean the page and remove it from the laundry. The page must be busied 247 * by the caller and will be disposed of (put away, flushed) by this routine. 248 */ 249 static int 250 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags) 251 { 252 vm_object_t object; 253 vm_page_t mc[BLIST_MAX_ALLOC]; 254 int error; 255 int ib, is, page_base; 256 vm_pindex_t pindex = m->pindex; 257 258 object = m->object; 259 260 /* 261 * Don't mess with the page if it's held or special. Theoretically 262 * we can pageout held pages but there is no real need to press our 263 * luck, so don't. 264 */ 265 if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) { 266 vm_page_wakeup(m); 267 return 0; 268 } 269 270 /* 271 * Place page in cluster. Align cluster for optimal swap space 272 * allocation (whether it is swap or not). 
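	 * For example (illustrative numbers only, assuming a cluster
	 * window of BLIST_MAX_ALLOC = 32 pages):
	 *
	 *	pindex    = 1000003
	 *	page_base = pindex % BLIST_MAX_ALLOC = 3
	 *	mc[3]     = m
	 *	ib        = 2	(reverse scan may pick up pindex-1..pindex-3)
	 *	is        = 4	(forward scan may pick up pindex+1..pindex+28)
	 *
	 * keeping the flush cluster aligned to a 32-page boundary.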
This is typically ~16-32 273 * pages, which also tends to align the cluster to multiples of the 274 * filesystem block size if backed by a filesystem. 275 */ 276 page_base = pindex % BLIST_MAX_ALLOC; 277 mc[page_base] = m; 278 ib = page_base - 1; 279 is = page_base + 1; 280 281 /* 282 * Scan object for clusterable pages. 283 * 284 * We can cluster ONLY if: ->> the page is NOT 285 * clean, wired, busy, held, or mapped into a 286 * buffer, and one of the following: 287 * 1) The page is inactive, or a seldom used 288 * active page. 289 * -or- 290 * 2) we force the issue. 291 * 292 * During heavy mmap/modification loads the pageout 293 * daemon can really fragment the underlying file 294 * due to flushing pages out of order and not trying 295 * align the clusters (which leave sporatic out-of-order 296 * holes). To solve this problem we do the reverse scan 297 * first and attempt to align our cluster, then do a 298 * forward scan if room remains. 299 */ 300 vm_object_hold(object); 301 302 while (ib >= 0) { 303 vm_page_t p; 304 305 p = vm_page_lookup_busy_try(object, pindex - page_base + ib, 306 TRUE, &error); 307 if (error || p == NULL) 308 break; 309 if ((p->queue - p->pc) == PQ_CACHE || 310 (p->flags & PG_UNMANAGED)) { 311 vm_page_wakeup(p); 312 break; 313 } 314 vm_page_test_dirty(p); 315 if (((p->dirty & p->valid) == 0 && 316 (p->flags & PG_NEED_COMMIT) == 0) || 317 p->wire_count != 0 || /* may be held by buf cache */ 318 p->hold_count != 0) { /* may be undergoing I/O */ 319 vm_page_wakeup(p); 320 break; 321 } 322 if (p->queue - p->pc != PQ_INACTIVE) { 323 if (p->queue - p->pc != PQ_ACTIVE || 324 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 325 vm_page_wakeup(p); 326 break; 327 } 328 } 329 330 /* 331 * Try to maintain page groupings in the cluster. 332 */ 333 if (m->flags & PG_WINATCFLS) 334 vm_page_flag_set(p, PG_WINATCFLS); 335 else 336 vm_page_flag_clear(p, PG_WINATCFLS); 337 p->act_count = m->act_count; 338 339 mc[ib] = p; 340 --ib; 341 } 342 ++ib; /* fixup */ 343 344 while (is < BLIST_MAX_ALLOC && 345 pindex - page_base + is < object->size) { 346 vm_page_t p; 347 348 p = vm_page_lookup_busy_try(object, pindex - page_base + is, 349 TRUE, &error); 350 if (error || p == NULL) 351 break; 352 if (((p->queue - p->pc) == PQ_CACHE) || 353 (p->flags & PG_UNMANAGED)) { 354 vm_page_wakeup(p); 355 break; 356 } 357 vm_page_test_dirty(p); 358 if (((p->dirty & p->valid) == 0 && 359 (p->flags & PG_NEED_COMMIT) == 0) || 360 p->wire_count != 0 || /* may be held by buf cache */ 361 p->hold_count != 0) { /* may be undergoing I/O */ 362 vm_page_wakeup(p); 363 break; 364 } 365 if (p->queue - p->pc != PQ_INACTIVE) { 366 if (p->queue - p->pc != PQ_ACTIVE || 367 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 368 vm_page_wakeup(p); 369 break; 370 } 371 } 372 373 /* 374 * Try to maintain page groupings in the cluster. 375 */ 376 if (m->flags & PG_WINATCFLS) 377 vm_page_flag_set(p, PG_WINATCFLS); 378 else 379 vm_page_flag_clear(p, PG_WINATCFLS); 380 p->act_count = m->act_count; 381 382 mc[is] = p; 383 ++is; 384 } 385 386 vm_object_drop(object); 387 388 /* 389 * we allow reads during pageouts... 390 */ 391 return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags); 392 } 393 394 /* 395 * vm_pageout_flush() - launder the given pages 396 * 397 * The given pages are laundered. Note that we setup for the start of 398 * I/O ( i.e. busy the page ), mark it read-only, and bump the object 399 * reference count all in here rather then in the parent. 
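 *
 * Illustrative calling sketch (an assumption for clarity, not code taken
 * from this file): the caller busies every page it places in mc[] and then
 *
 *	n_out = vm_pageout_flush(mc, count, vmflush_flags);
 *
 * On return each page in mc[] has been unbusied here and n_out counts the
 * pages whose pageout completed or is pending.
 *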
If we want 400 * the parent to do more sophisticated things we may have to change 401 * the ordering. 402 * 403 * The pages in the array must be busied by the caller and will be 404 * unbusied by this function. 405 */ 406 int 407 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags) 408 { 409 vm_object_t object; 410 int pageout_status[count]; 411 int numpagedout = 0; 412 int i; 413 414 /* 415 * Initiate I/O. Bump the vm_page_t->busy counter. 416 */ 417 for (i = 0; i < count; i++) { 418 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 419 ("vm_pageout_flush page %p index %d/%d: partially " 420 "invalid page", mc[i], i, count)); 421 vm_page_io_start(mc[i]); 422 } 423 424 /* 425 * We must make the pages read-only. This will also force the 426 * modified bit in the related pmaps to be cleared. The pager 427 * cannot clear the bit for us since the I/O completion code 428 * typically runs from an interrupt. The act of making the page 429 * read-only handles the case for us. 430 * 431 * Then we can unbusy the pages, we still hold a reference by virtue 432 * of our soft-busy. 433 */ 434 for (i = 0; i < count; i++) { 435 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) 436 vm_page_protect(mc[i], VM_PROT_NONE); 437 else 438 vm_page_protect(mc[i], VM_PROT_READ); 439 vm_page_wakeup(mc[i]); 440 } 441 442 object = mc[0]->object; 443 vm_object_pip_add(object, count); 444 445 vm_pager_put_pages(object, mc, count, 446 (vmflush_flags | 447 ((object == &kernel_object) ? 448 VM_PAGER_PUT_SYNC : 0)), 449 pageout_status); 450 451 for (i = 0; i < count; i++) { 452 vm_page_t mt = mc[i]; 453 454 switch (pageout_status[i]) { 455 case VM_PAGER_OK: 456 numpagedout++; 457 break; 458 case VM_PAGER_PEND: 459 numpagedout++; 460 break; 461 case VM_PAGER_BAD: 462 /* 463 * Page outside of range of object. Right now we 464 * essentially lose the changes by pretending it 465 * worked. 466 */ 467 vm_page_busy_wait(mt, FALSE, "pgbad"); 468 pmap_clear_modify(mt); 469 vm_page_undirty(mt); 470 vm_page_wakeup(mt); 471 break; 472 case VM_PAGER_ERROR: 473 case VM_PAGER_FAIL: 474 /* 475 * A page typically cannot be paged out when we 476 * have run out of swap. We leave the page 477 * marked inactive and will try to page it out 478 * again later. 479 * 480 * Starvation of the active page list is used to 481 * determine when the system is massively memory 482 * starved. 483 */ 484 break; 485 case VM_PAGER_AGAIN: 486 break; 487 } 488 489 /* 490 * If not PENDing this was a synchronous operation and we 491 * clean up after the I/O. If it is PENDing the mess is 492 * cleaned up asynchronously. 493 * 494 * Also nominally act on the caller's wishes if the caller 495 * wants to try to really clean (cache or free) the page. 496 * 497 * Also nominally deactivate the page if the system is 498 * memory-stressed. 499 */ 500 if (pageout_status[i] != VM_PAGER_PEND) { 501 vm_page_busy_wait(mt, FALSE, "pgouw"); 502 vm_page_io_finish(mt); 503 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) { 504 vm_page_try_to_cache(mt); 505 } else if (vm_page_count_severe()) { 506 vm_page_deactivate(mt); 507 vm_page_wakeup(mt); 508 } else { 509 vm_page_wakeup(mt); 510 } 511 vm_object_pip_wakeup(object); 512 } 513 } 514 return numpagedout; 515 } 516 517 #if !defined(NO_SWAPPING) 518 519 /* 520 * Callback function, page busied for us. We must dispose of the busy 521 * condition. Any related pmap pages may be held but will not be locked. 
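 *
 * Minimal shape of such a callback (an illustrative sketch, not code from
 * this file; example_cb is a hypothetical name):
 *
 *	static int
 *	example_cb(struct pmap_pgscan_info *info, vm_offset_t va, vm_page_t p)
 *	{
 *		if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
 *			vm_page_wakeup(p);	(dispose of the busy)
 *			return -1;		(stop the pmap_pgscan)
 *		}
 *		...examine or launder p, then wake it up...
 *		vm_page_wakeup(p);
 *		return 0;			(continue the scan)
 *	}
 *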
522 */ 523 static 524 int 525 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va, 526 vm_page_t p) 527 { 528 int actcount; 529 int cleanit = 0; 530 531 /* 532 * Basic tests - There should never be a marker, and we can stop 533 * once the RSS is below the required level. 534 */ 535 KKASSERT((p->flags & PG_MARKER) == 0); 536 if (pmap_resident_tlnw_count(info->pmap) <= info->limit) { 537 vm_page_wakeup(p); 538 return(-1); 539 } 540 541 mycpu->gd_cnt.v_pdpages++; 542 543 if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) { 544 vm_page_wakeup(p); 545 goto done; 546 } 547 548 ++info->actioncount; 549 550 /* 551 * Check if the page has been referened recently. If it has, 552 * activate it and skip. 553 */ 554 actcount = pmap_ts_referenced(p); 555 if (actcount) { 556 vm_page_flag_set(p, PG_REFERENCED); 557 } else if (p->flags & PG_REFERENCED) { 558 actcount = 1; 559 } 560 561 if (actcount) { 562 if (p->queue - p->pc != PQ_ACTIVE) { 563 vm_page_and_queue_spin_lock(p); 564 if (p->queue - p->pc != PQ_ACTIVE) { 565 vm_page_and_queue_spin_unlock(p); 566 vm_page_activate(p); 567 } else { 568 vm_page_and_queue_spin_unlock(p); 569 } 570 } else { 571 p->act_count += actcount; 572 if (p->act_count > ACT_MAX) 573 p->act_count = ACT_MAX; 574 } 575 vm_page_flag_clear(p, PG_REFERENCED); 576 vm_page_wakeup(p); 577 goto done; 578 } 579 580 /* 581 * Remove the page from this particular pmap. Once we do this, our 582 * pmap scans will not see it again (unless it gets faulted in), so 583 * we must actively dispose of or deal with the page. 584 */ 585 pmap_remove_specific(info->pmap, p); 586 587 /* 588 * If the page is not mapped to another process (i.e. as would be 589 * typical if this were a shared page from a library) then deactivate 590 * the page and clean it in two passes only. 591 * 592 * If the page hasn't been referenced since the last check, remove it 593 * from the pmap. If it is no longer mapped, deactivate it 594 * immediately, accelerating the normal decline. 595 * 596 * Once the page has been removed from the pmap the RSS code no 597 * longer tracks it so we have to make sure that it is staged for 598 * potential flush action. 599 */ 600 if ((p->flags & PG_MAPPED) == 0) { 601 if (p->queue - p->pc == PQ_ACTIVE) { 602 vm_page_deactivate(p); 603 } 604 if (p->queue - p->pc == PQ_INACTIVE) { 605 cleanit = 1; 606 } 607 } 608 609 /* 610 * Ok, try to fully clean the page and any nearby pages such that at 611 * least the requested page is freed or moved to the cache queue. 612 * 613 * We usually do this synchronously to allow us to get the page into 614 * the CACHE queue quickly, which will prevent memory exhaustion if 615 * a process with a memoryuse limit is running away. However, the 616 * sysadmin may desire to set vm.swap_user_async which relaxes this 617 * and improves write performance. 618 */ 619 if (cleanit) { 620 long max_launder = 0x7FFF; 621 long vnodes_skipped = 0; 622 int vmflush_flags; 623 struct vnode *vpfailed = NULL; 624 625 info->offset = va; 626 627 if (vm_pageout_memuse_mode >= 2) { 628 vmflush_flags = VM_PAGER_TRY_TO_CACHE | 629 VM_PAGER_ALLOW_ACTIVE; 630 if (swap_user_async == 0) 631 vmflush_flags |= VM_PAGER_PUT_SYNC; 632 vm_page_flag_set(p, PG_WINATCFLS); 633 info->cleancount += 634 vm_pageout_page(p, &max_launder, 635 &vnodes_skipped, 636 &vpfailed, 1, vmflush_flags); 637 } else { 638 vm_page_wakeup(p); 639 ++info->cleancount; 640 } 641 } else { 642 vm_page_wakeup(p); 643 } 644 645 /* 646 * Must be at end to avoid SMP races. 
 */
done:
	lwkt_user_yield();
	return 0;
}

/*
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
 * which is relatively difficult to do.  We try to keep track of where we
 * left off last time to reduce scan overhead.
 *
 * Called when vm_pageout_memuse_mode is >= 1.
 */
void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
{
	vm_offset_t pgout_offset;
	struct pmap_pgscan_info info;
	int retries = 3;

	pgout_offset = map->pgout_offset;
again:
#if 0
	kprintf("%016jx ", pgout_offset);
#endif
	if (pgout_offset < VM_MIN_USER_ADDRESS)
		pgout_offset = VM_MIN_USER_ADDRESS;
	if (pgout_offset >= VM_MAX_USER_ADDRESS)
		pgout_offset = 0;
	info.pmap = vm_map_pmap(map);
	info.limit = limit;
	info.beg_addr = pgout_offset;
	info.end_addr = VM_MAX_USER_ADDRESS;
	info.callback = vm_pageout_mdp_callback;
	info.cleancount = 0;
	info.actioncount = 0;
	info.busycount = 0;

	pmap_pgscan(&info);
	pgout_offset = info.offset;
#if 0
	kprintf("%016jx %08lx %08lx\n", pgout_offset,
		info.cleancount, info.actioncount);
#endif

	if (pgout_offset != VM_MAX_USER_ADDRESS &&
	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		goto again;
	} else if (retries &&
		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		--retries;
		goto again;
	}
	map->pgout_offset = pgout_offset;
}
#endif

/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
	struct proc *bigproc;
	vm_offset_t bigsize;
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

/*
 * Scan inactive queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
			 long *vnodes_skipped)
{
	vm_page_t m;
	struct vm_page marker;
	struct vnode *vpfailed;		/* warning, allowed to be stale */
	long maxscan;
	long delta = 0;
	long max_launder;
	int isep;

	isep = (curthread == emergpager);

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * max_launder limits the number of dirty pages we flush per scan.
	 * For most systems a smaller value (16 or 32) is more robust under
	 * extreme memory and disk pressure because any unnecessary writes
	 * to disk can result in extreme performance degradation.
However, 761 * systems with excessive dirty pages (especially when MAP_NOSYNC is 762 * used) will die horribly with limited laundering. If the pageout 763 * daemon cannot clean enough pages in the first pass, we let it go 764 * all out in succeeding passes. 765 * 766 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 767 * PAGES. 768 */ 769 if ((max_launder = vm_max_launder) <= 1) 770 max_launder = 1; 771 if (pass) 772 max_launder = 10000; 773 774 /* 775 * Initialize our marker 776 */ 777 bzero(&marker, sizeof(marker)); 778 marker.flags = PG_FICTITIOUS | PG_MARKER; 779 marker.busy_count = PBUSY_LOCKED; 780 marker.queue = PQ_INACTIVE + q; 781 marker.pc = q; 782 marker.wire_count = 1; 783 784 /* 785 * Inactive queue scan. 786 * 787 * NOTE: The vm_page must be spinlocked before the queue to avoid 788 * deadlocks, so it is easiest to simply iterate the loop 789 * with the queue unlocked at the top. 790 */ 791 vpfailed = NULL; 792 793 vm_page_queues_spin_lock(PQ_INACTIVE + q); 794 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 795 maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt; 796 797 /* 798 * Queue locked at top of loop to avoid stack marker issues. 799 */ 800 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 801 maxscan-- > 0 && avail_shortage - delta > 0) 802 { 803 int count; 804 805 KKASSERT(m->queue == PQ_INACTIVE + q); 806 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, 807 &marker, pageq); 808 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m, 809 &marker, pageq); 810 mycpu->gd_cnt.v_pdpages++; 811 812 /* 813 * Skip marker pages (atomic against other markers to avoid 814 * infinite hop-over scans). 815 */ 816 if (m->flags & PG_MARKER) 817 continue; 818 819 /* 820 * Try to busy the page. Don't mess with pages which are 821 * already busy or reorder them in the queue. 822 */ 823 if (vm_page_busy_try(m, TRUE)) 824 continue; 825 826 /* 827 * Remaining operations run with the page busy and neither 828 * the page or the queue will be spin-locked. 829 */ 830 KKASSERT(m->queue == PQ_INACTIVE + q); 831 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 832 833 /* 834 * The emergency pager runs when the primary pager gets 835 * stuck, which typically means the primary pager deadlocked 836 * on a vnode-backed page. Therefore, the emergency pager 837 * must skip any complex objects. 838 * 839 * We disallow VNODEs unless they are VCHR whos device ops 840 * does not flag D_NOEMERGPGR. 841 */ 842 if (isep && m->object) { 843 struct vnode *vp; 844 845 switch(m->object->type) { 846 case OBJT_DEFAULT: 847 case OBJT_SWAP: 848 /* 849 * Allow anonymous memory and assume that 850 * swap devices are not complex, since its 851 * kinda worthless if we can't swap out dirty 852 * anonymous pages. 853 */ 854 break; 855 case OBJT_VNODE: 856 /* 857 * Allow VCHR device if the D_NOEMERGPGR 858 * flag is not set, deny other vnode types 859 * as being too complex. 860 */ 861 vp = m->object->handle; 862 if (vp && vp->v_type == VCHR && 863 vp->v_rdev && vp->v_rdev->si_ops && 864 (vp->v_rdev->si_ops->head.flags & 865 D_NOEMERGPGR) == 0) { 866 break; 867 } 868 /* Deny - fall through */ 869 default: 870 /* 871 * Deny 872 */ 873 vm_page_wakeup(m); 874 vm_page_queues_spin_lock(PQ_INACTIVE + q); 875 lwkt_yield(); 876 continue; 877 } 878 } 879 880 /* 881 * Try to pageout the page and perhaps other nearby pages. 
882 */ 883 count = vm_pageout_page(m, &max_launder, vnodes_skipped, 884 &vpfailed, pass, 0); 885 delta += count; 886 887 /* 888 * Systems with a ton of memory can wind up with huge 889 * deactivation counts. Because the inactive scan is 890 * doing a lot of flushing, the combination can result 891 * in excessive paging even in situations where other 892 * unrelated threads free up sufficient VM. 893 * 894 * To deal with this we abort the nominal active->inactive 895 * scan before we hit the inactive target when free+cache 896 * levels have reached a reasonable target. 897 * 898 * When deciding to stop early we need to add some slop to 899 * the test and we need to return full completion to the caller 900 * to prevent the caller from thinking there is something 901 * wrong and issuing a low-memory+swap warning or pkill. 902 * 903 * A deficit forces paging regardless of the state of the 904 * VM page queues (used for RSS enforcement). 905 */ 906 lwkt_yield(); 907 vm_page_queues_spin_lock(PQ_INACTIVE + q); 908 if (vm_paging_target() < -vm_max_launder) { 909 /* 910 * Stopping early, return full completion to caller. 911 */ 912 if (delta < avail_shortage) 913 delta = avail_shortage; 914 break; 915 } 916 } 917 918 /* page queue still spin-locked */ 919 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 920 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 921 922 return (delta); 923 } 924 925 /* 926 * Pageout the specified page, return the total number of pages paged out 927 * (this routine may cluster). 928 * 929 * The page must be busied and soft-busied by the caller and will be disposed 930 * of by this function. 931 */ 932 static int 933 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp, 934 struct vnode **vpfailedp, int pass, int vmflush_flags) 935 { 936 vm_object_t object; 937 int actcount; 938 int count = 0; 939 940 /* 941 * Wiring no longer removes a page from its queue. The last unwiring 942 * will requeue the page. Obviously wired pages cannot be paged out 943 * so unqueue it and return. 944 */ 945 if (m->wire_count) { 946 vm_page_unqueue_nowakeup(m); 947 vm_page_wakeup(m); 948 return 0; 949 } 950 951 /* 952 * A held page may be undergoing I/O, so skip it. 953 */ 954 if (m->hold_count) { 955 vm_page_and_queue_spin_lock(m); 956 if (m->queue - m->pc == PQ_INACTIVE) { 957 TAILQ_REMOVE( 958 &vm_page_queues[m->queue].pl, m, pageq); 959 TAILQ_INSERT_TAIL( 960 &vm_page_queues[m->queue].pl, m, pageq); 961 } 962 vm_page_and_queue_spin_unlock(m); 963 vm_page_wakeup(m); 964 return 0; 965 } 966 967 if (m->object == NULL || m->object->ref_count == 0) { 968 /* 969 * If the object is not being used, we ignore previous 970 * references. 971 */ 972 vm_page_flag_clear(m, PG_REFERENCED); 973 pmap_clear_reference(m); 974 /* fall through to end */ 975 } else if (((m->flags & PG_REFERENCED) == 0) && 976 (actcount = pmap_ts_referenced(m))) { 977 /* 978 * Otherwise, if the page has been referenced while 979 * in the inactive queue, we bump the "activation 980 * count" upwards, making it less likely that the 981 * page will be added back to the inactive queue 982 * prematurely again. Here we check the page tables 983 * (or emulated bits, if any), given the upper level 984 * VM system not knowing anything about existing 985 * references. 986 */ 987 vm_page_activate(m); 988 m->act_count += (actcount + ACT_ADVANCE); 989 vm_page_wakeup(m); 990 return 0; 991 } 992 993 /* 994 * (m) is still busied. 
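	 *
	 * The activation bump applied just below works out as follows
	 * (illustrative arithmetic, assuming the stock ACT_ADVANCE value
	 * of 3): a page with PG_REFERENCED set and two referenced ptes
	 * receives
	 *
	 *	act_count += pmap_ts_referenced(m) + ACT_ADVANCE + 1
	 *		   = 2 + 3 + 1 = 6
	 *
	 * additional ticks of activity before it can drift back here.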
	 *
	 * If the upper level VM system knows about any page
	 * references, we activate the page.  We also set the
	 * "activation count" higher than normal so that we will be
	 * less likely to place pages back onto the inactive queue
	 * again.
	 */
	if ((m->flags & PG_REFERENCED) != 0) {
		vm_page_flag_clear(m, PG_REFERENCED);
		actcount = pmap_ts_referenced(m);
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE + 1);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * If the upper level VM system doesn't know anything about
	 * the page being dirty, we have to check for it again.  As
	 * far as the VM code knows, any partially dirty pages are
	 * fully dirty.
	 *
	 * Pages marked PG_WRITEABLE may be mapped into the user
	 * address space of a process running on another cpu.  A
	 * user process (without holding the MP lock) running on
	 * another cpu may be able to touch the page while we are
	 * trying to remove it.  vm_page_cache() will handle this
	 * case for us.
	 */
	if (m->dirty == 0) {
		vm_page_test_dirty(m);
	} else {
		vm_page_dirty(m);
	}

	if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Invalid pages can be easily freed
		 */
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
		++count;
	} else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Clean pages can be placed onto the cache queue.
		 * This effectively frees them.
		 */
		vm_page_cache(m);
		++count;
	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
		/*
		 * Dirty pages need to be paged out, but flushing
		 * a page is extremely expensive versus freeing
		 * a clean page.  Rather than artificially limiting
		 * the number of pages we can flush, we instead give
		 * dirty pages extra priority on the inactive queue
		 * by forcing them to be cycled through the queue
		 * twice before being flushed, after which the
		 * (now clean) page will cycle through once more
		 * before being freed.  This significantly extends
		 * the thrash point for a heavily loaded machine.
		 */
		vm_page_flag_set(m, PG_WINATCFLS);
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
	} else if (*max_launderp > 0) {
		/*
		 * We always want to try to flush some dirty pages if
		 * we encounter them, to keep the system stable.
		 * Normally this number is small, but under extreme
		 * pressure where there are insufficient clean pages
		 * on the inactive queue, we may have to go all out.
		 */
		int swap_pageouts_ok;
		struct vnode *vp = NULL;

		swap_pageouts_ok = 0;
		object = m->object;
		if (object &&
		    (object->type != OBJT_SWAP) &&
		    (object->type != OBJT_DEFAULT)) {
			swap_pageouts_ok = 1;
		} else {
			swap_pageouts_ok = !(defer_swap_pageouts ||
					     disable_swap_pageouts);
			swap_pageouts_ok |= (!disable_swap_pageouts &&
					     defer_swap_pageouts &&
					     vm_page_count_min(0));
		}

		/*
		 * We don't bother paging objects that are "dead".
		 * Those objects are in a "rundown" state.
1094 */ 1095 if (!swap_pageouts_ok || 1096 (object == NULL) || 1097 (object->flags & OBJ_DEAD)) { 1098 vm_page_and_queue_spin_lock(m); 1099 if (m->queue - m->pc == PQ_INACTIVE) { 1100 TAILQ_REMOVE( 1101 &vm_page_queues[m->queue].pl, 1102 m, pageq); 1103 TAILQ_INSERT_TAIL( 1104 &vm_page_queues[m->queue].pl, 1105 m, pageq); 1106 } 1107 vm_page_and_queue_spin_unlock(m); 1108 vm_page_wakeup(m); 1109 return 0; 1110 } 1111 1112 /* 1113 * (m) is still busied. 1114 * 1115 * The object is already known NOT to be dead. It 1116 * is possible for the vget() to block the whole 1117 * pageout daemon, but the new low-memory handling 1118 * code should prevent it. 1119 * 1120 * The previous code skipped locked vnodes and, worse, 1121 * reordered pages in the queue. This results in 1122 * completely non-deterministic operation because, 1123 * quite often, a vm_fault has initiated an I/O and 1124 * is holding a locked vnode at just the point where 1125 * the pageout daemon is woken up. 1126 * 1127 * We can't wait forever for the vnode lock, we might 1128 * deadlock due to a vn_read() getting stuck in 1129 * vm_wait while holding this vnode. We skip the 1130 * vnode if we can't get it in a reasonable amount 1131 * of time. 1132 * 1133 * vpfailed is used to (try to) avoid the case where 1134 * a large number of pages are associated with a 1135 * locked vnode, which could cause the pageout daemon 1136 * to stall for an excessive amount of time. 1137 */ 1138 if (object->type == OBJT_VNODE) { 1139 int flags; 1140 1141 vp = object->handle; 1142 flags = LK_EXCLUSIVE; 1143 if (vp == *vpfailedp) 1144 flags |= LK_NOWAIT; 1145 else 1146 flags |= LK_TIMELOCK; 1147 vm_page_hold(m); 1148 vm_page_wakeup(m); 1149 1150 /* 1151 * We have unbusied (m) temporarily so we can 1152 * acquire the vp lock without deadlocking. 1153 * (m) is held to prevent destruction. 1154 */ 1155 if (vget(vp, flags) != 0) { 1156 *vpfailedp = vp; 1157 ++pageout_lock_miss; 1158 if (object->flags & OBJ_MIGHTBEDIRTY) 1159 ++*vnodes_skippedp; 1160 vm_page_unhold(m); 1161 return 0; 1162 } 1163 1164 /* 1165 * The page might have been moved to another 1166 * queue during potential blocking in vget() 1167 * above. The page might have been freed and 1168 * reused for another vnode. The object might 1169 * have been reused for another vnode. 1170 */ 1171 if (m->queue - m->pc != PQ_INACTIVE || 1172 m->object != object || 1173 object->handle != vp) { 1174 if (object->flags & OBJ_MIGHTBEDIRTY) 1175 ++*vnodes_skippedp; 1176 vput(vp); 1177 vm_page_unhold(m); 1178 return 0; 1179 } 1180 1181 /* 1182 * The page may have been busied during the 1183 * blocking in vput(); We don't move the 1184 * page back onto the end of the queue so that 1185 * statistics are more correct if we don't. 1186 */ 1187 if (vm_page_busy_try(m, TRUE)) { 1188 vput(vp); 1189 vm_page_unhold(m); 1190 return 0; 1191 } 1192 vm_page_unhold(m); 1193 1194 /* 1195 * If it was wired while we didn't own it. 1196 */ 1197 if (m->wire_count) { 1198 vm_page_unqueue_nowakeup(m); 1199 vput(vp); 1200 vm_page_wakeup(m); 1201 return 0; 1202 } 1203 1204 /* 1205 * (m) is busied again 1206 * 1207 * We own the busy bit and remove our hold 1208 * bit. If the page is still held it 1209 * might be undergoing I/O, so skip it. 
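			 *
			 * In short, the sequence used above for a
			 * vnode-backed page is (illustrative recap only):
			 *
			 *	vm_page_hold(m);	keep m from being freed
			 *	vm_page_wakeup(m);	drop busy so vget can block
			 *	vget(vp, flags);	LK_NOWAIT if vp failed before
			 *	revalidate queue/object, vm_page_busy_try(m)
			 *	vm_page_unhold(m);	drop the hold, busy reacquired
			 *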
1210 */ 1211 if (m->hold_count) { 1212 vm_page_and_queue_spin_lock(m); 1213 if (m->queue - m->pc == PQ_INACTIVE) { 1214 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 1215 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq); 1216 } 1217 vm_page_and_queue_spin_unlock(m); 1218 if (object->flags & OBJ_MIGHTBEDIRTY) 1219 ++*vnodes_skippedp; 1220 vm_page_wakeup(m); 1221 vput(vp); 1222 return 0; 1223 } 1224 /* (m) is left busied as we fall through */ 1225 } 1226 1227 /* 1228 * page is busy and not held here. 1229 * 1230 * If a page is dirty, then it is either being washed 1231 * (but not yet cleaned) or it is still in the 1232 * laundry. If it is still in the laundry, then we 1233 * start the cleaning operation. 1234 * 1235 * decrement inactive_shortage on success to account 1236 * for the (future) cleaned page. Otherwise we 1237 * could wind up laundering or cleaning too many 1238 * pages. 1239 * 1240 * NOTE: Cleaning the page here does not cause 1241 * force_deficit to be adjusted, because the 1242 * page is not being freed or moved to the 1243 * cache. 1244 */ 1245 count = vm_pageout_clean_helper(m, vmflush_flags); 1246 *max_launderp -= count; 1247 1248 /* 1249 * Clean ate busy, page no longer accessible 1250 */ 1251 if (vp != NULL) 1252 vput(vp); 1253 } else { 1254 vm_page_wakeup(m); 1255 } 1256 return count; 1257 } 1258 1259 /* 1260 * Scan active queue 1261 * 1262 * WARNING! Can be called from two pagedaemon threads simultaneously. 1263 */ 1264 static int 1265 vm_pageout_scan_active(int pass, int q, 1266 long avail_shortage, long inactive_shortage, 1267 long *recycle_countp) 1268 { 1269 struct vm_page marker; 1270 vm_page_t m; 1271 int actcount; 1272 long delta = 0; 1273 long maxscan; 1274 int isep; 1275 1276 isep = (curthread == emergpager); 1277 1278 /* 1279 * We want to move pages from the active queue to the inactive 1280 * queue to get the inactive queue to the inactive target. If 1281 * we still have a page shortage from above we try to directly free 1282 * clean pages instead of moving them. 1283 * 1284 * If we do still have a shortage we keep track of the number of 1285 * pages we free or cache (recycle_count) as a measure of thrashing 1286 * between the active and inactive queues. 1287 * 1288 * If we were able to completely satisfy the free+cache targets 1289 * from the inactive pool we limit the number of pages we move 1290 * from the active pool to the inactive pool to 2x the pages we 1291 * had removed from the inactive pool (with a minimum of 1/5 the 1292 * inactive target). If we were not able to completely satisfy 1293 * the free+cache targets we go for the whole target aggressively. 1294 * 1295 * NOTE: Both variables can end up negative. 1296 * NOTE: We are still in a critical section. 1297 * 1298 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 1299 * PAGES. 1300 */ 1301 1302 bzero(&marker, sizeof(marker)); 1303 marker.flags = PG_FICTITIOUS | PG_MARKER; 1304 marker.busy_count = PBUSY_LOCKED; 1305 marker.queue = PQ_ACTIVE + q; 1306 marker.pc = q; 1307 marker.wire_count = 1; 1308 1309 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1310 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1311 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt; 1312 1313 /* 1314 * Queue locked at top of loop to avoid stack marker issues. 
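	 *
	 * The scan below uses the stack-local marker as a cursor; the
	 * pattern, common to all of the queue scans in this file, is
	 * roughly (illustrative outline only):
	 *
	 *	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL) {
	 *		move the marker to just after m;
	 *		if (m->flags & PG_MARKER)
	 *			continue;	(some other thread's marker)
	 *		if (vm_page_busy_try(m, TRUE))
	 *			continue;	(skip contended pages)
	 *		unlock the queue and process m;
	 *		relock the queue and loop;
	 *	}
	 *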
1315 */ 1316 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1317 maxscan-- > 0 && (avail_shortage - delta > 0 || 1318 inactive_shortage > 0)) 1319 { 1320 KKASSERT(m->queue == PQ_ACTIVE + q); 1321 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1322 &marker, pageq); 1323 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1324 &marker, pageq); 1325 1326 /* 1327 * Skip marker pages (atomic against other markers to avoid 1328 * infinite hop-over scans). 1329 */ 1330 if (m->flags & PG_MARKER) 1331 continue; 1332 1333 /* 1334 * Try to busy the page. Don't mess with pages which are 1335 * already busy or reorder them in the queue. 1336 */ 1337 if (vm_page_busy_try(m, TRUE)) 1338 continue; 1339 1340 /* 1341 * Remaining operations run with the page busy and neither 1342 * the page or the queue will be spin-locked. 1343 */ 1344 KKASSERT(m->queue == PQ_ACTIVE + q); 1345 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1346 1347 #if 0 1348 /* 1349 * Don't deactivate pages that are held, even if we can 1350 * busy them. (XXX why not?) 1351 */ 1352 if (m->hold_count) { 1353 vm_page_and_queue_spin_lock(m); 1354 if (m->queue - m->pc == PQ_ACTIVE) { 1355 TAILQ_REMOVE( 1356 &vm_page_queues[PQ_ACTIVE + q].pl, 1357 m, pageq); 1358 TAILQ_INSERT_TAIL( 1359 &vm_page_queues[PQ_ACTIVE + q].pl, 1360 m, pageq); 1361 } 1362 vm_page_and_queue_spin_unlock(m); 1363 vm_page_wakeup(m); 1364 goto next; 1365 } 1366 #endif 1367 /* 1368 * We can just remove wired pages from the queue 1369 */ 1370 if (m->wire_count) { 1371 vm_page_unqueue_nowakeup(m); 1372 vm_page_wakeup(m); 1373 goto next; 1374 } 1375 1376 /* 1377 * The emergency pager ignores vnode-backed pages as these 1378 * are the pages that probably bricked the main pager. 1379 */ 1380 if (isep && m->object && m->object->type == OBJT_VNODE) { 1381 vm_page_and_queue_spin_lock(m); 1382 if (m->queue - m->pc == PQ_ACTIVE) { 1383 TAILQ_REMOVE( 1384 &vm_page_queues[PQ_ACTIVE + q].pl, 1385 m, pageq); 1386 TAILQ_INSERT_TAIL( 1387 &vm_page_queues[PQ_ACTIVE + q].pl, 1388 m, pageq); 1389 } 1390 vm_page_and_queue_spin_unlock(m); 1391 vm_page_wakeup(m); 1392 goto next; 1393 } 1394 1395 /* 1396 * The count for pagedaemon pages is done after checking the 1397 * page for eligibility... 1398 */ 1399 mycpu->gd_cnt.v_pdpages++; 1400 1401 /* 1402 * Check to see "how much" the page has been used and clear 1403 * the tracking access bits. If the object has no references 1404 * don't bother paying the expense. 1405 */ 1406 actcount = 0; 1407 if (m->object && m->object->ref_count != 0) { 1408 if (m->flags & PG_REFERENCED) 1409 ++actcount; 1410 actcount += pmap_ts_referenced(m); 1411 if (actcount) { 1412 m->act_count += ACT_ADVANCE + actcount; 1413 if (m->act_count > ACT_MAX) 1414 m->act_count = ACT_MAX; 1415 } 1416 } 1417 vm_page_flag_clear(m, PG_REFERENCED); 1418 1419 /* 1420 * actcount is only valid if the object ref_count is non-zero. 1421 * If the page does not have an object, actcount will be zero. 
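		 *
		 * Illustrative decay arithmetic for the requeue/deactivate
		 * decision below (assuming the stock ACT_DECLINE value of 1):
		 * an idle file-backed page with act_count 6 loses
		 * vm_filemem_decline = 2 per scan, so with pass 0 it is
		 * requeued twice and deactivated on the third scan; anonymous
		 * memory decays at half that rate via vm_anonmem_decline.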
1422 */ 1423 if (actcount && m->object->ref_count != 0) { 1424 vm_page_and_queue_spin_lock(m); 1425 if (m->queue - m->pc == PQ_ACTIVE) { 1426 TAILQ_REMOVE( 1427 &vm_page_queues[PQ_ACTIVE + q].pl, 1428 m, pageq); 1429 TAILQ_INSERT_TAIL( 1430 &vm_page_queues[PQ_ACTIVE + q].pl, 1431 m, pageq); 1432 } 1433 vm_page_and_queue_spin_unlock(m); 1434 vm_page_wakeup(m); 1435 } else { 1436 switch(m->object->type) { 1437 case OBJT_DEFAULT: 1438 case OBJT_SWAP: 1439 m->act_count -= min(m->act_count, 1440 vm_anonmem_decline); 1441 break; 1442 default: 1443 m->act_count -= min(m->act_count, 1444 vm_filemem_decline); 1445 break; 1446 } 1447 if (vm_pageout_algorithm || 1448 (m->object == NULL) || 1449 (m->object && (m->object->ref_count == 0)) || 1450 m->act_count < pass + 1 1451 ) { 1452 /* 1453 * Deactivate the page. If we had a 1454 * shortage from our inactive scan try to 1455 * free (cache) the page instead. 1456 * 1457 * Don't just blindly cache the page if 1458 * we do not have a shortage from the 1459 * inactive scan, that could lead to 1460 * gigabytes being moved. 1461 */ 1462 --inactive_shortage; 1463 if (avail_shortage - delta > 0 || 1464 (m->object && (m->object->ref_count == 0))) 1465 { 1466 if (avail_shortage - delta > 0) 1467 ++*recycle_countp; 1468 vm_page_protect(m, VM_PROT_NONE); 1469 if (m->dirty == 0 && 1470 (m->flags & PG_NEED_COMMIT) == 0 && 1471 avail_shortage - delta > 0) { 1472 vm_page_cache(m); 1473 } else { 1474 vm_page_deactivate(m); 1475 vm_page_wakeup(m); 1476 } 1477 } else { 1478 vm_page_deactivate(m); 1479 vm_page_wakeup(m); 1480 } 1481 ++delta; 1482 } else { 1483 vm_page_and_queue_spin_lock(m); 1484 if (m->queue - m->pc == PQ_ACTIVE) { 1485 TAILQ_REMOVE( 1486 &vm_page_queues[PQ_ACTIVE + q].pl, 1487 m, pageq); 1488 TAILQ_INSERT_TAIL( 1489 &vm_page_queues[PQ_ACTIVE + q].pl, 1490 m, pageq); 1491 } 1492 vm_page_and_queue_spin_unlock(m); 1493 vm_page_wakeup(m); 1494 } 1495 } 1496 next: 1497 lwkt_yield(); 1498 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1499 } 1500 1501 /* 1502 * Clean out our local marker. 1503 * 1504 * Page queue still spin-locked. 1505 */ 1506 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1507 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1508 1509 return (delta); 1510 } 1511 1512 /* 1513 * The number of actually free pages can drop down to v_free_reserved, 1514 * we try to build the free count back above v_free_min. Note that 1515 * vm_paging_needed() also returns TRUE if v_free_count is not at 1516 * least v_free_min so that is the minimum we must build the free 1517 * count to. 1518 * 1519 * We use a slightly higher target to improve hysteresis, 1520 * ((v_free_target + v_free_min) / 2). Since v_free_target 1521 * is usually the same as v_cache_min this maintains about 1522 * half the pages in the free queue as are in the cache queue, 1523 * providing pretty good pipelining for pageout operation. 1524 * 1525 * The system operator can manipulate vm.v_cache_min and 1526 * vm.v_free_target to tune the pageout demon. Be sure 1527 * to keep vm.v_free_min < vm.v_free_target. 1528 * 1529 * Note that the original paging target is to get at least 1530 * (free_min + cache_min) into (free + cache). The slightly 1531 * higher target will shift additional pages from cache to free 1532 * without effecting the original paging target in order to 1533 * maintain better hysteresis and not have the free count always 1534 * be dead-on v_free_min. 1535 * 1536 * NOTE: we are still in a critical section. 
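 *
 * Worked example with made-up numbers: if v_free_min were 2000 pages and
 * v_free_target 8000, the loop below would keep moving pages from PQ_CACHE
 * to the free queues until
 *
 *	v_free_count >= (v_free_min + v_free_target) / 2 = 5000
 *
 * i.e. roughly half way between the two thresholds.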
1537 * 1538 * Pages moved from PQ_CACHE to totally free are not counted in the 1539 * pages_freed counter. 1540 * 1541 * WARNING! Can be called from two pagedaemon threads simultaneously. 1542 */ 1543 static void 1544 vm_pageout_scan_cache(long avail_shortage, int pass, 1545 long vnodes_skipped, long recycle_count) 1546 { 1547 static int lastkillticks; 1548 struct vm_pageout_scan_info info; 1549 vm_page_t m; 1550 int isep; 1551 1552 isep = (curthread == emergpager); 1553 1554 while (vmstats.v_free_count < 1555 (vmstats.v_free_min + vmstats.v_free_target) / 2) { 1556 /* 1557 * This steals some code from vm/vm_page.c 1558 * 1559 * Create two rovers and adjust the code to reduce 1560 * chances of them winding up at the same index (which 1561 * can cause a lot of contention). 1562 */ 1563 static int cache_rover[2] = { 0, PQ_L2_MASK / 2 }; 1564 1565 if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0) 1566 goto next_rover; 1567 1568 m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK); 1569 if (m == NULL) 1570 break; 1571 1572 /* 1573 * If the busy attempt fails we can still deactivate the page. 1574 */ 1575 /* page is returned removed from its queue and spinlocked */ 1576 if (vm_page_busy_try(m, TRUE)) { 1577 vm_page_deactivate_locked(m); 1578 vm_page_spin_unlock(m); 1579 continue; 1580 } 1581 vm_page_spin_unlock(m); 1582 pagedaemon_wakeup(); 1583 lwkt_yield(); 1584 1585 /* 1586 * Remaining operations run with the page busy and neither 1587 * the page or the queue will be spin-locked. 1588 */ 1589 if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) || 1590 m->hold_count || 1591 m->wire_count) { 1592 vm_page_deactivate(m); 1593 vm_page_wakeup(m); 1594 continue; 1595 } 1596 KKASSERT((m->flags & PG_MAPPED) == 0); 1597 KKASSERT(m->dirty == 0); 1598 vm_pageout_page_free(m); 1599 mycpu->gd_cnt.v_dfree++; 1600 next_rover: 1601 if (isep) 1602 cache_rover[1] -= PQ_PRIME2; 1603 else 1604 cache_rover[0] += PQ_PRIME2; 1605 } 1606 1607 #if !defined(NO_SWAPPING) 1608 /* 1609 * Idle process swapout -- run once per second. 1610 */ 1611 if (vm_swap_idle_enabled) { 1612 static time_t lsec; 1613 if (time_uptime != lsec) { 1614 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE); 1615 vm_req_vmdaemon(); 1616 lsec = time_uptime; 1617 } 1618 } 1619 #endif 1620 1621 /* 1622 * If we didn't get enough free pages, and we have skipped a vnode 1623 * in a writeable object, wakeup the sync daemon. And kick swapout 1624 * if we did not get enough free pages. 1625 */ 1626 if (vm_paging_target() > 0) { 1627 if (vnodes_skipped && vm_page_count_min(0)) 1628 speedup_syncer(NULL); 1629 #if !defined(NO_SWAPPING) 1630 if (vm_swap_enabled && vm_page_count_target()) { 1631 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL); 1632 vm_req_vmdaemon(); 1633 } 1634 #endif 1635 } 1636 1637 /* 1638 * Handle catastrophic conditions. Under good conditions we should 1639 * be at the target, well beyond our minimum. If we could not even 1640 * reach our minimum the system is under heavy stress. But just being 1641 * under heavy stress does not trigger process killing. 1642 * 1643 * We consider ourselves to have run out of memory if the swap pager 1644 * is full and avail_shortage is still positive. The secondary check 1645 * ensures that we do not kill processes if the instantanious 1646 * availability is good, even if the pageout demon pass says it 1647 * couldn't get to the target. 1648 * 1649 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL 1650 * SITUATIONS. 
1651 */ 1652 if (swap_pager_almost_full && 1653 pass > 0 && 1654 isep == 0 && 1655 (vm_page_count_min(recycle_count) || avail_shortage > 0)) { 1656 kprintf("Warning: system low on memory+swap " 1657 "shortage %ld for %d ticks!\n", 1658 avail_shortage, ticks - swap_fail_ticks); 1659 if (bootverbose) 1660 kprintf("Metrics: spaf=%d spf=%d pass=%d " 1661 "avail=%ld target=%ld last=%u\n", 1662 swap_pager_almost_full, 1663 swap_pager_full, 1664 pass, 1665 avail_shortage, 1666 vm_paging_target(), 1667 (unsigned int)(ticks - lastkillticks)); 1668 } 1669 if (swap_pager_full && 1670 pass > 1 && 1671 isep == 0 && 1672 avail_shortage > 0 && 1673 vm_paging_target() > 0 && 1674 (unsigned int)(ticks - lastkillticks) >= hz) { 1675 /* 1676 * Kill something, maximum rate once per second to give 1677 * the process time to free up sufficient memory. 1678 */ 1679 lastkillticks = ticks; 1680 info.bigproc = NULL; 1681 info.bigsize = 0; 1682 allproc_scan(vm_pageout_scan_callback, &info, 0); 1683 if (info.bigproc != NULL) { 1684 kprintf("Try to kill process %d %s\n", 1685 info.bigproc->p_pid, info.bigproc->p_comm); 1686 info.bigproc->p_nice = PRIO_MIN; 1687 info.bigproc->p_usched->resetpriority( 1688 FIRST_LWP_IN_PROC(info.bigproc)); 1689 atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL); 1690 killproc(info.bigproc, "out of swap space"); 1691 wakeup(&vmstats.v_free_count); 1692 PRELE(info.bigproc); 1693 } 1694 } 1695 } 1696 1697 static int 1698 vm_pageout_scan_callback(struct proc *p, void *data) 1699 { 1700 struct vm_pageout_scan_info *info = data; 1701 vm_offset_t size; 1702 1703 /* 1704 * Never kill system processes or init. If we have configured swap 1705 * then try to avoid killing low-numbered pids. 1706 */ 1707 if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) || 1708 ((p->p_pid < 48) && (vm_swap_size != 0))) { 1709 return (0); 1710 } 1711 1712 lwkt_gettoken(&p->p_token); 1713 1714 /* 1715 * if the process is in a non-running type state, 1716 * don't touch it. 1717 */ 1718 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 1719 lwkt_reltoken(&p->p_token); 1720 return (0); 1721 } 1722 1723 /* 1724 * Get the approximate process size. Note that anonymous pages 1725 * with backing swap will be counted twice, but there should not 1726 * be too many such pages due to the stress the VM system is 1727 * under at this point. 1728 */ 1729 size = vmspace_anonymous_count(p->p_vmspace) + 1730 vmspace_swap_count(p->p_vmspace); 1731 1732 /* 1733 * If the this process is bigger than the biggest one 1734 * remember it. 1735 */ 1736 if (info->bigsize < size) { 1737 if (info->bigproc) 1738 PRELE(info->bigproc); 1739 PHOLD(p); 1740 info->bigproc = p; 1741 info->bigsize = size; 1742 } 1743 lwkt_reltoken(&p->p_token); 1744 lwkt_yield(); 1745 1746 return(0); 1747 } 1748 1749 /* 1750 * This old guy slowly walks PQ_HOLD looking for pages which need to be 1751 * moved back to PQ_FREE. It is possible for pages to accumulate here 1752 * when vm_page_free() races against vm_page_unhold(), resulting in a 1753 * page being left on a PQ_HOLD queue with hold_count == 0. 1754 * 1755 * It is easier to handle this edge condition here, in non-critical code, 1756 * rather than enforce a spin-lock for every 1->0 transition in 1757 * vm_page_unhold(). 1758 * 1759 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue. 
1760 */ 1761 static void 1762 vm_pageout_scan_hold(int q) 1763 { 1764 vm_page_t m; 1765 1766 vm_page_queues_spin_lock(PQ_HOLD + q); 1767 TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) { 1768 if (m->flags & PG_MARKER) 1769 continue; 1770 1771 /* 1772 * Process one page and return 1773 */ 1774 if (m->hold_count) 1775 break; 1776 kprintf("DEBUG: pageout HOLD->FREE %p\n", m); 1777 vm_page_hold(m); 1778 vm_page_queues_spin_unlock(PQ_HOLD + q); 1779 vm_page_unhold(m); /* reprocess */ 1780 return; 1781 } 1782 vm_page_queues_spin_unlock(PQ_HOLD + q); 1783 } 1784 1785 /* 1786 * This routine tries to maintain the pseudo LRU active queue, 1787 * so that during long periods of time where there is no paging, 1788 * that some statistic accumulation still occurs. This code 1789 * helps the situation where paging just starts to occur. 1790 */ 1791 static void 1792 vm_pageout_page_stats(int q) 1793 { 1794 static int fullintervalcount = 0; 1795 struct vm_page marker; 1796 vm_page_t m; 1797 long pcount, tpcount; /* Number of pages to check */ 1798 long page_shortage; 1799 1800 page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max + 1801 vmstats.v_free_min) - 1802 (vmstats.v_free_count + vmstats.v_inactive_count + 1803 vmstats.v_cache_count); 1804 1805 if (page_shortage <= 0) 1806 return; 1807 1808 pcount = vm_page_queues[PQ_ACTIVE + q].lcnt; 1809 fullintervalcount += vm_pageout_stats_interval; 1810 if (fullintervalcount < vm_pageout_full_stats_interval) { 1811 tpcount = (vm_pageout_stats_max * pcount) / 1812 vmstats.v_page_count + 1; 1813 if (pcount > tpcount) 1814 pcount = tpcount; 1815 } else { 1816 fullintervalcount = 0; 1817 } 1818 1819 bzero(&marker, sizeof(marker)); 1820 marker.flags = PG_FICTITIOUS | PG_MARKER; 1821 marker.busy_count = PBUSY_LOCKED; 1822 marker.queue = PQ_ACTIVE + q; 1823 marker.pc = q; 1824 marker.wire_count = 1; 1825 1826 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1827 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1828 1829 /* 1830 * Queue locked at top of loop to avoid stack marker issues. 1831 */ 1832 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1833 pcount-- > 0) 1834 { 1835 int actcount; 1836 1837 KKASSERT(m->queue == PQ_ACTIVE + q); 1838 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1839 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1840 &marker, pageq); 1841 1842 /* 1843 * Skip marker pages (atomic against other markers to avoid 1844 * infinite hop-over scans). 1845 */ 1846 if (m->flags & PG_MARKER) 1847 continue; 1848 1849 /* 1850 * Ignore pages we can't busy 1851 */ 1852 if (vm_page_busy_try(m, TRUE)) 1853 continue; 1854 1855 /* 1856 * Remaining operations run with the page busy and neither 1857 * the page or the queue will be spin-locked. 1858 */ 1859 KKASSERT(m->queue == PQ_ACTIVE + q); 1860 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1861 1862 /* 1863 * We can just remove wired pages from the queue 1864 */ 1865 if (m->wire_count) { 1866 vm_page_unqueue_nowakeup(m); 1867 vm_page_wakeup(m); 1868 goto next; 1869 } 1870 1871 1872 /* 1873 * We now have a safely busied page, the page and queue 1874 * spinlocks have been released. 
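		 *
		 * Illustrative lifecycle for the act_count handling further
		 * below (assuming the stock ACT_DECLINE value of 1): a page
		 * sitting at act_count 5 that is never referenced again
		 * loses one tick per stats pass and, once it reaches zero,
		 * is protected and deactivated on the following pass; a
		 * referenced page is instead bumped by ACT_ADVANCE (capped
		 * at ACT_MAX) and requeued to the tail.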
		 *
		 * Ignore held and wired pages
		 */
		if (m->hold_count || m->wire_count) {
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * Calculate activity
		 */
		actcount = 0;
		if (m->flags & PG_REFERENCED) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount += 1;
		}
		actcount += pmap_ts_referenced(m);

		/*
		 * Update act_count and move page to end of queue.
		 */
		if (actcount) {
			m->act_count += ACT_ADVANCE + actcount;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}

		if (m->act_count == 0) {
			/*
			 * We turn off page access, so that we have
			 * more accurate RSS stats.  We don't do this
			 * in the normal page deactivation when the
			 * system is loaded VM wise, because the
			 * cost of the large number of page protect
			 * operations would be higher than the value
			 * of doing the operation.
			 *
			 * We use the marker to save our place so
			 * we can release the spin lock.  Both (m)
			 * and (next) will be invalid.
			 */
			vm_page_protect(m, VM_PROT_NONE);
			vm_page_deactivate(m);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
		}
		vm_page_wakeup(m);
next:
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Remove our local marker
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

static int
vm_pageout_free_page_calc(vm_size_t count)
{
	if (count < vmstats.v_page_count)
		return 0;
	/*
	 * free_reserved needs to include enough for the largest swap pager
	 * structures plus enough for any pv_entry structs when paging.
	 *
	 * v_free_min		normal allocations
	 * v_free_reserved	system allocations
	 * v_pageout_free_min	allocations by pageout daemon
	 * v_interrupt_free_min	low level allocations (e.g. swap structures)
	 */
	if (vmstats.v_page_count > 1024)
		vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200;
	else
		vmstats.v_free_min = 64;

	/*
	 * Make sure the vmmeter slop can't blow out our global minimums.
	 *
	 * However, to accommodate weird configurations (vkernels with many
	 * cpus and little memory, or artificially reduced hw.physmem), do
	 * not allow v_free_min to exceed 1/20 of ram or the pageout daemon
	 * will go out of control.
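	 *
	 * As a rough illustrative example (not taken from the source): with
	 * 4KB pages and about 1GB of RAM, v_page_count is roughly 262144,
	 * so the formula above gives v_free_min = 64 + (262144 - 1024) / 200,
	 * i.e. about 1369 pages (~5.3MB), well under the 1/20-of-RAM clamp
	 * applied below.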
	 */
	if (vmstats.v_free_min < VMMETER_SLOP_COUNT * ncpus * 10)
		vmstats.v_free_min = VMMETER_SLOP_COUNT * ncpus * 10;
	if (vmstats.v_free_min > vmstats.v_page_count / 20)
		vmstats.v_free_min = vmstats.v_page_count / 20;

	vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
	vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
	vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
	vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;

	return 1;
}

/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFS's.
 */
static void
vm_pageout_thread(void)
{
	int pass;
	int q;
	int q1iterator = 0;
	int q2iterator = 0;
	int q3iterator = 0;
	int isep;

	curthread->td_flags |= TDF_SYSTHREAD;

	/*
	 * We only need to setup once.
	 */
	isep = 0;
	if (curthread == emergpager) {
		isep = 1;
		goto skip_setup;
	}

	/*
	 * Initialize some paging parameters.
	 */
	vm_pageout_free_page_calc(vmstats.v_page_count);

	/*
	 * v_free_target and v_cache_min control pageout hysteresis.  Note
	 * that these are more a measure of the VM cache queue hysteresis
	 * than the VM free queue.  Specifically, v_free_target is the
	 * high water mark (free+cache pages).
	 *
	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
	 * low water mark, while v_free_min is the stop.  v_cache_min must
	 * be big enough to handle memory needs while the pageout daemon
	 * is signalled and run to free more pages.
	 */
	if (vmstats.v_free_count > 6144)
		vmstats.v_free_target = 4 * vmstats.v_free_min +
					vmstats.v_free_reserved;
	else
		vmstats.v_free_target = 2 * vmstats.v_free_min +
					vmstats.v_free_reserved;

	/*
	 * NOTE: With the new buffer cache b_act_count we want the default
	 *	 inactive target to be a percentage of available memory.
	 *
	 *	 The inactive target essentially determines the minimum
	 *	 number of 'temporary' pages capable of caching one-time-use
	 *	 files when the VM system is otherwise full of pages
	 *	 belonging to multi-time-use files or active program data.
	 *
	 * NOTE: The inactive target is aggressively pursued only if the
	 *	 inactive queue becomes too small.  If the inactive queue
	 *	 is large enough to satisfy page movement to free+cache
	 *	 then it is repopulated more slowly from the active queue.
	 *	 This allows a general inactive_target default to be set.
	 *
	 *	 There is an issue here for processes which sit mostly idle
	 *	 'overnight', such as sshd, tcsh, and X.  Any movement from
	 *	 the active queue will eventually cause such pages to be
	 *	 recycled, causing a lot of paging in the morning.  To reduce
	 *	 the incidence of this, pages cycled out of the buffer cache
	 *	 are moved directly to the inactive queue if they were only
	 *	 used once or twice.
	 *
	 *	 The vfs.vm_cycle_point sysctl can be used to adjust this.
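	 *	 (For example, something like "sysctl vfs.vm_cycle_point=48"
	 *	 at runtime; the value shown is purely illustrative.)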
	 *	 Increasing the value (up to 64) increases the number of
	 *	 buffer recyclements which go directly to the inactive queue.
	 */
	if (vmstats.v_free_count > 2048) {
		vmstats.v_cache_min = vmstats.v_free_target;
		vmstats.v_cache_max = 2 * vmstats.v_cache_min;
	} else {
		vmstats.v_cache_min = 0;
		vmstats.v_cache_max = 0;
	}
	vmstats.v_inactive_target = vmstats.v_free_count / 4;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = vmstats.v_free_count / 3;

	if (vm_pageout_stats_max == 0)
		vm_pageout_stats_max = vmstats.v_free_target;

	/*
	 * Set interval in seconds for stats scan.
	 */
	if (vm_pageout_stats_interval == 0)
		vm_pageout_stats_interval = 5;
	if (vm_pageout_full_stats_interval == 0)
		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;

	/*
	 * Set maximum free per pass
	 */
	if (vm_pageout_stats_free_max == 0)
		vm_pageout_stats_free_max = 5;

	swap_pager_swap_init();
	pass = 0;

	atomic_swap_int(&sequence_emerg_pager, 1);
	wakeup(&sequence_emerg_pager);

skip_setup:
	/*
	 * Sequence emergency pager startup
	 */
	if (isep) {
		while (sequence_emerg_pager == 0)
			tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
	}

	/*
	 * The pageout daemon is never done, so loop forever.
	 *
	 * WARNING! This code is being executed by two kernel threads
	 *	    potentially simultaneously.
	 */
	while (TRUE) {
		int error;
		long avail_shortage;
		long inactive_shortage;
		long vnodes_skipped = 0;
		long recycle_count = 0;
		long tmp;

		/*
		 * Wait for an action request.  If we time out, check to
		 * see if paging is needed (in case the normal wakeup
		 * code raced us).
		 */
		if (isep) {
			/*
			 * Emergency pagedaemon monitors the primary
			 * pagedaemon while vm_pages_needed != 0.
			 *
			 * The emergency pagedaemon only runs if VM paging
			 * is needed and the primary pagedaemon has not
			 * updated vm_pagedaemon_time for more than 2 seconds.
			 */
			if (vm_pages_needed)
				tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
			else
				tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
			if (vm_pages_needed == 0) {
				pass = 0;
				continue;
			}
			if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
				pass = 0;
				continue;
			}
		} else {
			/*
			 * Primary pagedaemon
			 *
			 * NOTE: We unconditionally clean up PQ_HOLD even
			 *	 when there is no work to do.
			 */
			vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
			++q3iterator;

			if (vm_pages_needed == 0) {
				error = tsleep(&vm_pages_needed,
					       0, "psleep",
					       vm_pageout_stats_interval * hz);
				if (error &&
				    vm_paging_needed() == 0 &&
				    vm_pages_needed == 0) {
					for (q = 0; q < PQ_L2_SIZE; ++q)
						vm_pageout_page_stats(q);
					continue;
				}
				vm_pagedaemon_time = ticks;
				vm_pages_needed = 1;

				/*
				 * Wake the emergency pagedaemon up so it
				 * can monitor us.  It will automatically
				 * go back into a long sleep when
				 * vm_pages_needed returns to 0.
				 */
				wakeup(&vm_pagedaemon_time);
			}
		}

		mycpu->gd_cnt.v_pdwakeups++;

		/*
		 * Scan for INACTIVE->CLEAN/PAGEOUT
		 *
		 * This routine tries to avoid thrashing the system with
		 * unnecessary activity.
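		 *
		 * (Note that the per-queue scan loops below advance their
		 * queue iterator in opposite directions for the primary and
		 * emergency pagers, presumably so the two threads tend not
		 * to collide on the same sub-queues.)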
		 *
		 * Calculate our target for the number of free+cache pages we
		 * want to get to.  This is higher than the number that causes
		 * allocations to stall (severe) in order to provide
		 * hysteresis, and if we don't make it all the way but get to
		 * the minimum we're happy.  Goose it a bit if there are
		 * multiple requests for memory.
		 *
		 * Don't reduce avail_shortage inside the loop or the
		 * PQAVERAGE() calculation will break.
		 *
		 * NOTE! deficit is differentiated from avail_shortage as
		 *	 REQUIRING at least (deficit) pages to be cleaned,
		 *	 even if the page queues are in good shape.  This
		 *	 is used primarily for handling per-process
		 *	 RLIMIT_RSS and may also see small values when
		 *	 processes block due to low memory.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		avail_shortage = vm_paging_target() + vm_pageout_deficit;
		vm_pageout_deficit = 0;

		if (avail_shortage > 0) {
			long delta = 0;
			int qq;

			qq = q1iterator;
			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_inactive(
						pass,
						qq & PQ_L2_MASK,
						PQAVERAGE(avail_shortage),
						&vnodes_skipped);
				if (isep)
					--qq;
				else
					++qq;
				if (avail_shortage - delta <= 0)
					break;
			}
			avail_shortage -= delta;
			q1iterator = qq;
		}

		/*
		 * Figure out how many active pages we must deactivate.  If
		 * we were able to reach our target with just the inactive
		 * scan above we limit the number of active pages we
		 * deactivate to reduce unnecessary work.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		inactive_shortage = vmstats.v_inactive_target -
				    vmstats.v_inactive_count;

		/*
		 * If we were unable to free sufficient inactive pages to
		 * satisfy the free/cache queue requirements then simply
		 * reaching the inactive target may not be good enough.
		 * Try to deactivate pages in excess of the target based
		 * on the shortfall.
		 *
		 * However, to prevent thrashing the VM system, do not
		 * deactivate more than an additional 1/10 the inactive
		 * target's worth of active pages.
		 */
		if (avail_shortage > 0) {
			tmp = avail_shortage * 2;
			if (tmp > vmstats.v_inactive_target / 10)
				tmp = vmstats.v_inactive_target / 10;
			inactive_shortage += tmp;
		}

		/*
		 * Only trigger a pmap cleanup on inactive shortage.
		 */
		if (isep == 0 && inactive_shortage > 0) {
			pmap_collect();
		}

		/*
		 * Scan for ACTIVE->INACTIVE
		 *
		 * Only trigger on inactive shortage.  Triggering on
		 * avail_shortage can starve the active queue with
		 * unnecessary active->inactive transitions and destroy
		 * performance.
		 *
		 * If this is the emergency pager, always try to move
		 * a few pages from active to inactive because the inactive
		 * queue might have enough pages, but not enough anonymous
		 * pages.
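		 *
		 * The floor applied just below comes from the
		 * vm_emerg_launder tunable, so the emergency pager always
		 * requests at least that many deactivations per pass.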
		 */
		if (isep && inactive_shortage < vm_emerg_launder)
			inactive_shortage = vm_emerg_launder;

		if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
			long delta = 0;
			int qq;

			qq = q2iterator;
			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_active(
						pass,
						qq & PQ_L2_MASK,
						PQAVERAGE(avail_shortage),
						PQAVERAGE(inactive_shortage),
						&recycle_count);
				if (isep)
					--qq;
				else
					++qq;
				if (inactive_shortage - delta <= 0 &&
				    avail_shortage - delta <= 0) {
					break;
				}
			}
			inactive_shortage -= delta;
			avail_shortage -= delta;
			q2iterator = qq;
		}

		/*
		 * Scan for CACHE->FREE
		 *
		 * Finally free enough cache pages to meet our free page
		 * requirement and take more drastic measures if we are
		 * still in trouble.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		vm_pageout_scan_cache(avail_shortage, pass,
				      vnodes_skipped, recycle_count);

		/*
		 * Wait for more work.
		 */
		if (avail_shortage > 0) {
			++pass;
			if (pass < 10 && vm_pages_needed > 1) {
				/*
				 * Normal operation, additional processes
				 * have already kicked us.  Retry immediately
				 * unless swap space is completely full in
				 * which case delay a bit.
				 */
				if (swap_pager_full) {
					tsleep(&vm_pages_needed, 0, "pdelay",
					       hz / 5);
				} /* else immediate retry */
			} else if (pass < 10) {
				/*
				 * Normal operation, fewer processes.  Delay
				 * a bit but allow wakeups.  vm_pages_needed
				 * is only adjusted against the primary
				 * pagedaemon here.
				 */
				if (isep == 0)
					vm_pages_needed = 0;
				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
				if (isep == 0)
					vm_pages_needed = 1;
			} else if (swap_pager_full == 0) {
				/*
				 * We've taken too many passes, forced delay.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
			} else {
				/*
				 * Running out of memory, catastrophic
				 * back-off to one-second intervals.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz);
			}
		} else if (vm_pages_needed) {
			/*
			 * Interlocked wakeup of waiters (non-optional).
			 *
			 * Similar to vm_page_free_wakeup() in vm_page.c,
			 * wake up any waiters on v_free_count once the
			 * page counts are no longer critical.
			 */
			pass = 0;
			if (!vm_page_count_min(vm_page_free_hysteresis) ||
			    !vm_page_count_target()) {
				vm_pages_needed = 0;
				wakeup(&vmstats.v_free_count);
			}
		} else {
			pass = 0;
		}
	}
}

static struct kproc_desc pg1_kp = {
	"pagedaemon",
	vm_pageout_thread,
	&pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

static struct kproc_desc pg2_kp = {
	"emergpager",
	vm_pageout_thread,
	&emergpager
};
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);

/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
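 *	(vm_pages_needed is only a heuristic hint here, so a lost update
 *	 due to a race is harmless; see the "SMP race ok" notes in the
 *	 body below.)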
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
	if (vm_paging_needed() && curthread != pagethread) {
		if (vm_pages_needed == 0) {
			vm_pages_needed = 1;	/* SMP race ok */
			wakeup(&vm_pages_needed);
		} else if (vm_page_count_min(0)) {
			++vm_pages_needed;	/* SMP race ok */
		}
	}
}

#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
 */
static void
vm_req_vmdaemon(void)
{
	static int lastrun = 0;

	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);
		lastrun = ticks;
	}
}

static int vm_daemon_callback(struct proc *p, void *data __unused);

/*
 * No requirements.
 */
static void
vm_daemon(void)
{
	int req_swapout;

	while (TRUE) {
		tsleep(&vm_daemon_needed, 0, "psleep", 0);
		req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);

		/*
		 * Forced swapouts
		 */
		if (req_swapout)
			swapout_procs(vm_pageout_req_swapout);

		/*
		 * Scan the processes for those exceeding their rlimits or
		 * that are swapped out -- deactivate pages
		 */
		allproc_scan(vm_daemon_callback, NULL, 0);
	}
}

static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
	struct vmspace *vm;
	vm_pindex_t limit, size;

	/*
	 * If this is a system process or the process is exiting, skip it.
	 */
	lwkt_gettoken(&p->p_token);

	if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * If the process is in a non-running state, don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get a limit
	 */
	limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
				p->p_rlimit[RLIMIT_RSS].rlim_max));

	/*
	 * Let processes that are swapped out really be
	 * swapped out.  Set the limit to nothing to get as
	 * many pages out to swap as possible.
	 */
	if (p->p_flags & P_SWAPPEDOUT)
		limit = 0;

	vm = p->p_vmspace;
	vmspace_hold(vm);
	size = pmap_resident_tlnw_count(&vm->vm_pmap);
	if (limit >= 0 && size > 4096 &&
	    size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
		vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
	}
	vmspace_drop(vm);

	lwkt_reltoken(&p->p_token);

	return (0);
}

#endif