1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * The Mach Operating System project at Carnegie-Mellon University. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
35 * 36 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 37 * 38 * 39 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 40 * All rights reserved. 41 * 42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 43 * 44 * Permission to use, copy, modify and distribute this software and 45 * its documentation is hereby granted, provided that both the copyright 46 * notice and this permission notice appear in all copies of the 47 * software, derivative works or modified versions, and any portions 48 * thereof, and that both notices appear in supporting documentation. 49 * 50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 51 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 53 * 54 * Carnegie Mellon requests users of this software to return to 55 * 56 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 57 * School of Computer Science 58 * Carnegie Mellon University 59 * Pittsburgh PA 15213-3890 60 * 61 * any improvements or extensions that they make and grant Carnegie the 62 * rights to redistribute these changes. 63 * 64 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $ 65 */ 66 67 /* 68 * The proverbial page-out daemon. 
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/conf.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout"*/
static int vm_pageout_page(vm_page_t m, long *max_launderp,
			   long *vnodes_skippedp, struct vnode **vpfailedp,
			   int pass, int vmflush_flags);
static int vm_pageout_clean_helper (vm_page_t, int);
static void vm_pageout_free_page_calc (vm_size_t count);
static void vm_pageout_page_free(vm_page_t m) ;

/* Primary pagedaemon thread and the emergency (deadlock recovery) pager */
struct thread *emergpager;
struct thread *pagethread;
static int sequence_emerg_pager;

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon"*/
static void vm_daemon (void);
static struct thread *vmthread;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmthread
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif

int vm_pages_needed = 0;	/* Event on which pageout daemon sleeps */
int vm_pageout_deficit = 0;	/* Estimated number of pages deficit */
int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
int vm_page_free_hysteresis = 16;
static int vm_pagedaemon_time;

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;	/* pageout requested a swapout pass */
static int vm_daemon_needed;
#endif
static int vm_max_launder = 4096;	/* dirty-flush limit per scan pass */
static int vm_emerg_launder = 100;	/* minimum for the emergency pager */

static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; 139 static int vm_pageout_full_stats_interval = 0; 140 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; 141 static int defer_swap_pageouts=0; 142 static int disable_swap_pageouts=0; 143 static u_int vm_anonmem_decline = ACT_DECLINE; 144 static u_int vm_filemem_decline = ACT_DECLINE * 2; 145 146 #if defined(NO_SWAPPING) 147 static int vm_swap_enabled=0; 148 static int vm_swap_idle_enabled=0; 149 #else 150 static int vm_swap_enabled=1; 151 static int vm_swap_idle_enabled=0; 152 #endif 153 int vm_pageout_memuse_mode=1; /* 0-disable, 1-passive, 2-active swp*/ 154 155 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline, 156 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory"); 157 158 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline, 159 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache"); 160 161 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis, 162 CTLFLAG_RW, &vm_page_free_hysteresis, 0, 163 "Free more pages than the minimum required"); 164 165 SYSCTL_INT(_vm, OID_AUTO, max_launder, 166 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 167 SYSCTL_INT(_vm, OID_AUTO, emerg_launder, 168 CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum"); 169 170 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, 171 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 172 173 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, 174 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); 175 176 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, 177 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); 178 179 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, 180 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); 181 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode, 182 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode"); 183 184 #if defined(NO_SWAPPING) 185 
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 186 CTLFLAG_RD, &vm_swap_enabled, 0, ""); 187 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 188 CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); 189 #else 190 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 191 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 192 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 193 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 194 #endif 195 196 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 197 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 198 199 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 200 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 201 202 static int pageout_lock_miss; 203 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 204 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 205 206 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 207 208 #if !defined(NO_SWAPPING) 209 static void vm_req_vmdaemon (void); 210 #endif 211 static void vm_pageout_page_stats(int q); 212 213 /* 214 * Calculate approximately how many pages on each queue to try to 215 * clean. An exact calculation creates an edge condition when the 216 * queues are unbalanced so add significant slop. The queue scans 217 * will stop early when targets are reached and will start where they 218 * left off on the next pass. 219 * 220 * We need to be generous here because there are all sorts of loading 221 * conditions that can cause edge cases if try to average over all queues. 222 * In particular, storage subsystems have become so fast that paging 223 * activity can become quite frantic. Eventually we will probably need 224 * two paging threads, one for dirty pages and one for clean, to deal 225 * with the bandwidth requirements. 226 227 * So what we do is calculate a value that can be satisfied nominally by 228 * only having to scan half the queues. 
 */
static __inline long
PQAVERAGE(long n)
{
	long avg;

	/*
	 * Divide the (possibly negative) shortage across half the
	 * PQ_L2_SIZE queues, rounding away from zero so each queue
	 * always gets a non-zero share of the work.
	 */
	if (n >= 0) {
		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
	} else {
		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
	}
	return avg;
}

/*
 * vm_pageout_clean_helper:
 *
 * Clean the page and remove it from the laundry.  The page must be busied
 * by the caller and will be disposed of (put away, flushed) by this routine.
 */
static int
vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
{
	vm_object_t object;
	vm_page_t mc[BLIST_MAX_ALLOC];	/* cluster, aligned to pindex slots */
	int error;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	object = m->object;

	/*
	 * Don't mess with the page if it's held or special.  Theoretically
	 * we can pageout held pages but there is no real need to press our
	 * luck, so don't.
	 */
	if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * Place page in cluster.  Align cluster for optimal swap space
	 * allocation (whether it is swap or not).  This is typically ~16-32
	 * pages, which also tends to align the cluster to multiples of the
	 * filesystem block size if backed by a filesystem.
	 */
	page_base = pindex % BLIST_MAX_ALLOC;
	mc[page_base] = m;
	ib = page_base - 1;
	is = page_base + 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * align the clusters (which leave sporatic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 */
	vm_object_hold(object);

	/* Reverse scan: extend the cluster toward lower pindices */
	while (ib >= 0) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if ((p->queue - p->pc) == PQ_CACHE ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[ib] = p;
		--ib;
	}
	++ib;	/* fixup */

	/* Forward scan: extend the cluster toward higher pindices */
	while (is < BLIST_MAX_ALLOC &&
	       pindex - page_base + is < object->size) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[is] = p;
		++is;
	}

	vm_object_drop(object);

	/*
	 * we allow reads during pageouts...
	 */
	return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 *	The given pages are laundered.  Note that we setup for the start of
 *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
 *	reference count all in here rather then in the parent.  If we want
 *	the parent to do more sophisticated things we may have to change
 *	the ordering.
 *
 *	The pages in the array must be busied by the caller and will be
 *	unbusied by this function.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
{
	vm_object_t object;
	int pageout_status[count];	/* per-page pager result */
	int numpagedout = 0;
	int i;

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
			("vm_pageout_flush page %p index %d/%d: partially "
			 "invalid page", mc[i], i, count));
		vm_page_io_start(mc[i]);
	}

	/*
	 * We must make the pages read-only.  This will also force the
	 * modified bit in the related pmaps to be cleared.  The pager
	 * cannot clear the bit for us since the I/O completion code
	 * typically runs from an interrupt.  The act of making the page
	 * read-only handles the case for us.
	 *
	 * Then we can unbusy the pages, we still hold a reference by virtue
	 * of our soft-busy.
	 */
	for (i = 0; i < count; i++) {
		if (vmflush_flags & VM_PAGER_TRY_TO_CACHE)
			vm_page_protect(mc[i], VM_PROT_NONE);
		else
			vm_page_protect(mc[i], VM_PROT_READ);
		vm_page_wakeup(mc[i]);
	}

	object = mc[0]->object;
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count,
			   (vmflush_flags |
			    ((object == &kernel_object) ?
				VM_PAGER_PUT_SYNC : 0)),
			   pageout_status);

	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			numpagedout++;
			break;
		case VM_PAGER_PEND:
			/* async I/O still in flight; counts as paged out */
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object.  Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			vm_page_busy_wait(mt, FALSE, "pgbad");
			pmap_clear_modify(mt);
			vm_page_undirty(mt);
			vm_page_wakeup(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * A page typically cannot be paged out when we
			 * have run out of swap.  We leave the page
			 * marked inactive and will try to page it out
			 * again later.
			 *
			 * Starvation of the active page list is used to
			 * determine when the system is massively memory
			 * starved.
			 */
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If not PENDing this was a synchronous operation and we
		 * clean up after the I/O.  If it is PENDing the mess is
		 * cleaned up asynchronously.
		 *
		 * Also nominally act on the caller's wishes if the caller
		 * wants to try to really clean (cache or free) the page.
		 *
		 * Also nominally deactivate the page if the system is
		 * memory-stressed.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_page_busy_wait(mt, FALSE, "pgouw");
			vm_page_io_finish(mt);
			if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) {
				vm_page_try_to_cache(mt);
			} else if (vm_page_count_severe()) {
				vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vm_page_wakeup(mt);
			}
			vm_object_pip_wakeup(object);
		}
	}
	return numpagedout;
}

#if !defined(NO_SWAPPING)

/*
 * Callback function, page busied for us.  We must dispose of the busy
 * condition.  Any related pmap pages may be held but will not be locked.
 */
static
int
vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
			vm_page_t p)
{
	int actcount;
	int cleanit = 0;

	/*
	 * Basic tests - There should never be a marker, and we can stop
	 *		 once the RSS is below the required level.
	 */
	KKASSERT((p->flags & PG_MARKER) == 0);
	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
		vm_page_wakeup(p);
		return(-1);
	}

	mycpu->gd_cnt.v_pdpages++;

	/* Wired, held, or unqueued pages cannot be paged out */
	if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
		vm_page_wakeup(p);
		goto done;
	}

	++info->actioncount;

	/*
	 * Check if the page has been referened recently.  If it has,
	 * activate it and skip.
	 */
	actcount = pmap_ts_referenced(p);
	if (actcount) {
		vm_page_flag_set(p, PG_REFERENCED);
	} else if (p->flags & PG_REFERENCED) {
		actcount = 1;
	}

	if (actcount) {
		if (p->queue - p->pc != PQ_ACTIVE) {
			/*
			 * Re-check the queue under the spinlock; another
			 * thread may have requeued the page meanwhile.
			 */
			vm_page_and_queue_spin_lock(p);
			if (p->queue - p->pc != PQ_ACTIVE) {
				vm_page_and_queue_spin_unlock(p);
				vm_page_activate(p);
			} else {
				vm_page_and_queue_spin_unlock(p);
			}
		} else {
			p->act_count += actcount;
			if (p->act_count > ACT_MAX)
				p->act_count = ACT_MAX;
		}
		vm_page_flag_clear(p, PG_REFERENCED);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Remove the page from this particular pmap.  Once we do this, our
	 * pmap scans will not see it again (unless it gets faulted in), so
	 * we must actively dispose of or deal with the page.
	 */
	pmap_remove_specific(info->pmap, p);

	/*
	 * If the page is not mapped to another process (i.e. as would be
	 * typical if this were a shared page from a library) then deactivate
	 * the page and clean it in two passes only.
	 *
	 * If the page hasn't been referenced since the last check, remove it
	 * from the pmap.  If it is no longer mapped, deactivate it
	 * immediately, accelerating the normal decline.
	 *
	 * Once the page has been removed from the pmap the RSS code no
	 * longer tracks it so we have to make sure that it is staged for
	 * potential flush action.
	 */
	if ((p->flags & PG_MAPPED) == 0 ||
	    (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
		if (p->queue - p->pc == PQ_ACTIVE) {
			vm_page_deactivate(p);
		}
		if (p->queue - p->pc == PQ_INACTIVE) {
			cleanit = 1;
		}
	}

	/*
	 * Ok, try to fully clean the page and any nearby pages such that at
	 * least the requested page is freed or moved to the cache queue.
	 *
	 * We usually do this synchronously to allow us to get the page into
	 * the CACHE queue quickly, which will prevent memory exhaustion if
	 * a process with a memoryuse limit is running away.  However, the
	 * sysadmin may desire to set vm.swap_user_async which relaxes this
	 * and improves write performance.
	 */
	if (cleanit) {
		long max_launder = 0x7FFF;
		long vnodes_skipped = 0;
		int vmflush_flags;
		struct vnode *vpfailed = NULL;

		info->offset = va;

		if (vm_pageout_memuse_mode >= 2) {
			vmflush_flags = VM_PAGER_TRY_TO_CACHE |
					VM_PAGER_ALLOW_ACTIVE;
			if (swap_user_async == 0)
				vmflush_flags |= VM_PAGER_PUT_SYNC;
			vm_page_flag_set(p, PG_WINATCFLS);
			info->cleancount +=
				vm_pageout_page(p, &max_launder,
						&vnodes_skipped,
						&vpfailed, 1, vmflush_flags);
		} else {
			vm_page_wakeup(p);
			++info->cleancount;
		}
	} else {
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races.
	 */
done:
	lwkt_user_yield();
	return 0;
}

/*
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits;
 * this is relatively difficult to do.  We try to keep track of where we
 * left off last time to reduce scan overhead.
 *
 * Called when vm_pageout_memuse_mode is >= 1.
 */
void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
{
	vm_offset_t pgout_offset;
	struct pmap_pgscan_info info;
	int retries = 3;

	/* Resume the scan from where the previous invocation left off */
	pgout_offset = map->pgout_offset;
again:
#if 0
	kprintf("%016jx ", pgout_offset);
#endif
	if (pgout_offset < VM_MIN_USER_ADDRESS)
		pgout_offset = VM_MIN_USER_ADDRESS;
	if (pgout_offset >= VM_MAX_USER_ADDRESS)
		pgout_offset = 0;
	info.pmap = vm_map_pmap(map);
	info.limit = limit;
	info.beg_addr = pgout_offset;
	info.end_addr = VM_MAX_USER_ADDRESS;
	info.callback = vm_pageout_mdp_callback;
	info.cleancount = 0;
	info.actioncount = 0;
	info.busycount = 0;

	pmap_pgscan(&info);
	pgout_offset = info.offset;
#if 0
	kprintf("%016jx %08lx %08lx\n", pgout_offset,
		info.cleancount, info.actioncount);
#endif

	/*
	 * Keep scanning while the RSS remains over the limit.  A scan
	 * that stopped short of the address-space end restarts without
	 * consuming a retry; full passes are bounded by 'retries'.
	 */
	if (pgout_offset != VM_MAX_USER_ADDRESS &&
	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		goto again;
	} else if (retries &&
		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		--retries;
		goto again;
	}
	map->pgout_offset = pgout_offset;
}
#endif

/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kicked-start.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
	struct proc *bigproc;	/* largest candidate process found */
	vm_offset_t bigsize;	/* its resident size */
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

/*
 * Scan inactive queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
			 long *vnodes_skipped)
{
	vm_page_t m;
	struct vm_page marker;
	struct vnode *vpfailed;		/* warning, allowed to be stale */
	long maxscan;
	long delta = 0;
	long max_launder;
	int isep;

	/* True when running as the emergency (deadlock recovery) pager */
	isep = (curthread == emergpager);

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * max_launder limits the number of dirty pages we flush per scan.
	 * For most systems a smaller value (16 or 32) is more robust under
	 * extreme memory and disk pressure because any unnecessary writes
	 * to disk can result in extreme performance degredation.  However,
	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
	 * used) will die horribly with limited laundering.  If the pageout
	 * daemon cannot clean enough pages in the first pass, we let it go
	 * all out in succeeding passes.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	  PAGES.
	 */
	if ((max_launder = vm_max_launder) <= 1)
		max_launder = 1;
	if (pass)
		max_launder = 10000;

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_INACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	/*
	 * Inactive queue scan.
	 *
	 * NOTE: The vm_page must be spinlocked before the queue to avoid
	 *	 deadlocks, so it is easiest to simply iterate the loop
	 *	 with the queue unlocked at the top.
	 */
	vpfailed = NULL;

	vm_page_queues_spin_lock(PQ_INACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && avail_shortage - delta > 0)
	{
		int count;

		KKASSERT(m->queue == PQ_INACTIVE + q);
		/* Advance the marker past the page we are about to process */
		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
				   &marker, pageq);
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_INACTIVE + q);
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);

		/*
		 * The emergency pager runs when the primary pager gets
		 * stuck, which typically means the primary pager deadlocked
		 * on a vnode-backed page.  Therefore, the emergency pager
		 * must skip any complex objects.
		 *
		 * We disallow VNODEs unless they are VCHR whos device ops
		 * does not flag D_NOEMERGPGR.
		 */
		if (isep && m->object) {
			struct vnode *vp;

			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				/*
				 * Allow anonymous memory and assume that
				 * swap devices are not complex, since its
				 * kinda worthless if we can't swap out dirty
				 * anonymous pages.
				 */
				break;
			case OBJT_VNODE:
				/*
				 * Allow VCHR device if the D_NOEMERGPGR
				 * flag is not set, deny other vnode types
				 * as being too complex.
				 */
				vp = m->object->handle;
				if (vp && vp->v_type == VCHR &&
				    vp->v_rdev && vp->v_rdev->si_ops &&
				    (vp->v_rdev->si_ops->head.flags &
				     D_NOEMERGPGR) == 0) {
					break;
				}
				/* Deny - fall through */
			default:
				/*
				 * Deny
				 */
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(PQ_INACTIVE + q);
				lwkt_yield();
				continue;
			}
		}

		/*
		 * Try to pageout the page and perhaps other nearby pages.
		 */
		count = vm_pageout_page(m, &max_launder, vnodes_skipped,
					&vpfailed, pass, 0);
		delta += count;

		/*
		 * Systems with a ton of memory can wind up with huge
		 * deactivation counts.  Because the inactive scan is
		 * doing a lot of flushing, the combination can result
		 * in excessive paging even in situations where other
		 * unrelated threads free up sufficient VM.
		 *
		 * To deal with this we abort the nominal active->inactive
		 * scan before we hit the inactive target when free+cache
		 * levels have reached a reasonable target.
		 *
		 * When deciding to stop early we need to add some slop to
		 * the test and we need to return full completion to the caller
		 * to prevent the caller from thinking there is something
		 * wrong and issuing a low-memory+swap warning or pkill.
		 *
		 * A deficit forces paging regardless of the state of the
		 * VM page queues (used for RSS enforcement).
		 */
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_INACTIVE + q);
		if (vm_paging_target() < -vm_max_launder) {
			/*
			 * Stopping early, return full completion to caller.
			 */
			if (delta < avail_shortage)
				delta = avail_shortage;
			break;
		}
	}

	/* page queue still spin-locked */
	TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_INACTIVE + q);

	return (delta);
}

/*
 * Pageout the specified page, return the total number of pages paged out
 * (this routine may cluster).
 *
 * The page must be busied and soft-busied by the caller and will be disposed
 * of by this function.
 */
static int
vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
		struct vnode **vpfailedp, int pass, int vmflush_flags)
{
	vm_object_t object;
	int actcount;
	int count = 0;

	/*
	 * Wiring no longer removes a page from its queue.  The last unwiring
	 * will requeue the page.  Obviously wired pages cannot be paged out
	 * so unqueue it and return.
	 */
	if (m->wire_count) {
		vm_page_unqueue_nowakeup(m);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * A held page may be undergoing I/O, so skip it.
	 */
	if (m->hold_count) {
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
		return 0;
	}

	if (m->object == NULL || m->object->ref_count == 0) {
		/*
		 * If the object is not being used, we ignore previous
		 * references.
972 */ 973 vm_page_flag_clear(m, PG_REFERENCED); 974 pmap_clear_reference(m); 975 /* fall through to end */ 976 } else if (((m->flags & PG_REFERENCED) == 0) && 977 (actcount = pmap_ts_referenced(m))) { 978 /* 979 * Otherwise, if the page has been referenced while 980 * in the inactive queue, we bump the "activation 981 * count" upwards, making it less likely that the 982 * page will be added back to the inactive queue 983 * prematurely again. Here we check the page tables 984 * (or emulated bits, if any), given the upper level 985 * VM system not knowing anything about existing 986 * references. 987 */ 988 vm_page_activate(m); 989 m->act_count += (actcount + ACT_ADVANCE); 990 vm_page_wakeup(m); 991 return 0; 992 } 993 994 /* 995 * (m) is still busied. 996 * 997 * If the upper level VM system knows about any page 998 * references, we activate the page. We also set the 999 * "activation count" higher than normal so that we will less 1000 * likely place pages back onto the inactive queue again. 1001 */ 1002 if ((m->flags & PG_REFERENCED) != 0) { 1003 vm_page_flag_clear(m, PG_REFERENCED); 1004 actcount = pmap_ts_referenced(m); 1005 vm_page_activate(m); 1006 m->act_count += (actcount + ACT_ADVANCE + 1); 1007 vm_page_wakeup(m); 1008 return 0; 1009 } 1010 1011 /* 1012 * If the upper level VM system doesn't know anything about 1013 * the page being dirty, we have to check for it again. As 1014 * far as the VM code knows, any partially dirty pages are 1015 * fully dirty. 1016 * 1017 * Pages marked PG_WRITEABLE may be mapped into the user 1018 * address space of a process running on another cpu. A 1019 * user process (without holding the MP lock) running on 1020 * another cpu may be able to touch the page while we are 1021 * trying to remove it. vm_page_cache() will handle this 1022 * case for us. 
1023 */ 1024 if (m->dirty == 0) { 1025 vm_page_test_dirty(m); 1026 } else { 1027 vm_page_dirty(m); 1028 } 1029 1030 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1031 /* 1032 * Invalid pages can be easily freed 1033 */ 1034 vm_pageout_page_free(m); 1035 mycpu->gd_cnt.v_dfree++; 1036 ++count; 1037 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1038 /* 1039 * Clean pages can be placed onto the cache queue. 1040 * This effectively frees them. 1041 */ 1042 vm_page_cache(m); 1043 ++count; 1044 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { 1045 /* 1046 * Dirty pages need to be paged out, but flushing 1047 * a page is extremely expensive verses freeing 1048 * a clean page. Rather then artificially limiting 1049 * the number of pages we can flush, we instead give 1050 * dirty pages extra priority on the inactive queue 1051 * by forcing them to be cycled through the queue 1052 * twice before being flushed, after which the 1053 * (now clean) page will cycle through once more 1054 * before being freed. This significantly extends 1055 * the thrash point for a heavily loaded machine. 1056 */ 1057 vm_page_flag_set(m, PG_WINATCFLS); 1058 vm_page_and_queue_spin_lock(m); 1059 if (m->queue - m->pc == PQ_INACTIVE) { 1060 TAILQ_REMOVE( 1061 &vm_page_queues[m->queue].pl, m, pageq); 1062 TAILQ_INSERT_TAIL( 1063 &vm_page_queues[m->queue].pl, m, pageq); 1064 } 1065 vm_page_and_queue_spin_unlock(m); 1066 vm_page_wakeup(m); 1067 } else if (*max_launderp > 0) { 1068 /* 1069 * We always want to try to flush some dirty pages if 1070 * we encounter them, to keep the system stable. 1071 * Normally this number is small, but under extreme 1072 * pressure where there are insufficient clean pages 1073 * on the inactive queue, we may have to go all out. 
1074 */ 1075 int swap_pageouts_ok; 1076 struct vnode *vp = NULL; 1077 1078 swap_pageouts_ok = 0; 1079 object = m->object; 1080 if (object && 1081 (object->type != OBJT_SWAP) && 1082 (object->type != OBJT_DEFAULT)) { 1083 swap_pageouts_ok = 1; 1084 } else { 1085 swap_pageouts_ok = !(defer_swap_pageouts || 1086 disable_swap_pageouts); 1087 swap_pageouts_ok |= (!disable_swap_pageouts && 1088 defer_swap_pageouts && 1089 vm_page_count_min(0)); 1090 } 1091 1092 /* 1093 * We don't bother paging objects that are "dead". 1094 * Those objects are in a "rundown" state. 1095 */ 1096 if (!swap_pageouts_ok || 1097 (object == NULL) || 1098 (object->flags & OBJ_DEAD)) { 1099 vm_page_and_queue_spin_lock(m); 1100 if (m->queue - m->pc == PQ_INACTIVE) { 1101 TAILQ_REMOVE( 1102 &vm_page_queues[m->queue].pl, 1103 m, pageq); 1104 TAILQ_INSERT_TAIL( 1105 &vm_page_queues[m->queue].pl, 1106 m, pageq); 1107 } 1108 vm_page_and_queue_spin_unlock(m); 1109 vm_page_wakeup(m); 1110 return 0; 1111 } 1112 1113 /* 1114 * (m) is still busied. 1115 * 1116 * The object is already known NOT to be dead. It 1117 * is possible for the vget() to block the whole 1118 * pageout daemon, but the new low-memory handling 1119 * code should prevent it. 1120 * 1121 * The previous code skipped locked vnodes and, worse, 1122 * reordered pages in the queue. This results in 1123 * completely non-deterministic operation because, 1124 * quite often, a vm_fault has initiated an I/O and 1125 * is holding a locked vnode at just the point where 1126 * the pageout daemon is woken up. 1127 * 1128 * We can't wait forever for the vnode lock, we might 1129 * deadlock due to a vn_read() getting stuck in 1130 * vm_wait while holding this vnode. We skip the 1131 * vnode if we can't get it in a reasonable amount 1132 * of time. 
1133 * 1134 * vpfailed is used to (try to) avoid the case where 1135 * a large number of pages are associated with a 1136 * locked vnode, which could cause the pageout daemon 1137 * to stall for an excessive amount of time. 1138 */ 1139 if (object->type == OBJT_VNODE) { 1140 int flags; 1141 1142 vp = object->handle; 1143 flags = LK_EXCLUSIVE; 1144 if (vp == *vpfailedp) 1145 flags |= LK_NOWAIT; 1146 else 1147 flags |= LK_TIMELOCK; 1148 vm_page_hold(m); 1149 vm_page_wakeup(m); 1150 1151 /* 1152 * We have unbusied (m) temporarily so we can 1153 * acquire the vp lock without deadlocking. 1154 * (m) is held to prevent destruction. 1155 */ 1156 if (vget(vp, flags) != 0) { 1157 *vpfailedp = vp; 1158 ++pageout_lock_miss; 1159 if (object->flags & OBJ_MIGHTBEDIRTY) 1160 ++*vnodes_skippedp; 1161 vm_page_unhold(m); 1162 return 0; 1163 } 1164 1165 /* 1166 * The page might have been moved to another 1167 * queue during potential blocking in vget() 1168 * above. The page might have been freed and 1169 * reused for another vnode. The object might 1170 * have been reused for another vnode. 1171 */ 1172 if (m->queue - m->pc != PQ_INACTIVE || 1173 m->object != object || 1174 object->handle != vp) { 1175 if (object->flags & OBJ_MIGHTBEDIRTY) 1176 ++*vnodes_skippedp; 1177 vput(vp); 1178 vm_page_unhold(m); 1179 return 0; 1180 } 1181 1182 /* 1183 * The page may have been busied during the 1184 * blocking in vput(); We don't move the 1185 * page back onto the end of the queue so that 1186 * statistics are more correct if we don't. 1187 */ 1188 if (vm_page_busy_try(m, TRUE)) { 1189 vput(vp); 1190 vm_page_unhold(m); 1191 return 0; 1192 } 1193 vm_page_unhold(m); 1194 1195 /* 1196 * If it was wired while we didn't own it. 1197 */ 1198 if (m->wire_count) { 1199 vm_page_unqueue_nowakeup(m); 1200 vput(vp); 1201 vm_page_wakeup(m); 1202 return 0; 1203 } 1204 1205 /* 1206 * (m) is busied again 1207 * 1208 * We own the busy bit and remove our hold 1209 * bit. 
If the page is still held it 1210 * might be undergoing I/O, so skip it. 1211 */ 1212 if (m->hold_count) { 1213 vm_page_and_queue_spin_lock(m); 1214 if (m->queue - m->pc == PQ_INACTIVE) { 1215 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 1216 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq); 1217 } 1218 vm_page_and_queue_spin_unlock(m); 1219 if (object->flags & OBJ_MIGHTBEDIRTY) 1220 ++*vnodes_skippedp; 1221 vm_page_wakeup(m); 1222 vput(vp); 1223 return 0; 1224 } 1225 /* (m) is left busied as we fall through */ 1226 } 1227 1228 /* 1229 * page is busy and not held here. 1230 * 1231 * If a page is dirty, then it is either being washed 1232 * (but not yet cleaned) or it is still in the 1233 * laundry. If it is still in the laundry, then we 1234 * start the cleaning operation. 1235 * 1236 * decrement inactive_shortage on success to account 1237 * for the (future) cleaned page. Otherwise we 1238 * could wind up laundering or cleaning too many 1239 * pages. 1240 * 1241 * NOTE: Cleaning the page here does not cause 1242 * force_deficit to be adjusted, because the 1243 * page is not being freed or moved to the 1244 * cache. 1245 */ 1246 count = vm_pageout_clean_helper(m, vmflush_flags); 1247 *max_launderp -= count; 1248 1249 /* 1250 * Clean ate busy, page no longer accessible 1251 */ 1252 if (vp != NULL) 1253 vput(vp); 1254 } else { 1255 vm_page_wakeup(m); 1256 } 1257 return count; 1258 } 1259 1260 /* 1261 * Scan active queue 1262 * 1263 * WARNING! Can be called from two pagedaemon threads simultaneously. 
1264 */ 1265 static int 1266 vm_pageout_scan_active(int pass, int q, 1267 long avail_shortage, long inactive_shortage, 1268 long *recycle_countp) 1269 { 1270 struct vm_page marker; 1271 vm_page_t m; 1272 int actcount; 1273 long delta = 0; 1274 long maxscan; 1275 int isep; 1276 1277 isep = (curthread == emergpager); 1278 1279 /* 1280 * We want to move pages from the active queue to the inactive 1281 * queue to get the inactive queue to the inactive target. If 1282 * we still have a page shortage from above we try to directly free 1283 * clean pages instead of moving them. 1284 * 1285 * If we do still have a shortage we keep track of the number of 1286 * pages we free or cache (recycle_count) as a measure of thrashing 1287 * between the active and inactive queues. 1288 * 1289 * If we were able to completely satisfy the free+cache targets 1290 * from the inactive pool we limit the number of pages we move 1291 * from the active pool to the inactive pool to 2x the pages we 1292 * had removed from the inactive pool (with a minimum of 1/5 the 1293 * inactive target). If we were not able to completely satisfy 1294 * the free+cache targets we go for the whole target aggressively. 1295 * 1296 * NOTE: Both variables can end up negative. 1297 * NOTE: We are still in a critical section. 1298 * 1299 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 1300 * PAGES. 1301 */ 1302 1303 bzero(&marker, sizeof(marker)); 1304 marker.flags = PG_FICTITIOUS | PG_MARKER; 1305 marker.busy_count = PBUSY_LOCKED; 1306 marker.queue = PQ_ACTIVE + q; 1307 marker.pc = q; 1308 marker.wire_count = 1; 1309 1310 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1311 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1312 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt; 1313 1314 /* 1315 * Queue locked at top of loop to avoid stack marker issues. 
1316 */ 1317 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1318 maxscan-- > 0 && (avail_shortage - delta > 0 || 1319 inactive_shortage > 0)) 1320 { 1321 KKASSERT(m->queue == PQ_ACTIVE + q); 1322 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1323 &marker, pageq); 1324 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1325 &marker, pageq); 1326 1327 /* 1328 * Skip marker pages (atomic against other markers to avoid 1329 * infinite hop-over scans). 1330 */ 1331 if (m->flags & PG_MARKER) 1332 continue; 1333 1334 /* 1335 * Try to busy the page. Don't mess with pages which are 1336 * already busy or reorder them in the queue. 1337 */ 1338 if (vm_page_busy_try(m, TRUE)) 1339 continue; 1340 1341 /* 1342 * Remaining operations run with the page busy and neither 1343 * the page or the queue will be spin-locked. 1344 */ 1345 KKASSERT(m->queue == PQ_ACTIVE + q); 1346 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1347 1348 #if 0 1349 /* 1350 * Don't deactivate pages that are held, even if we can 1351 * busy them. (XXX why not?) 1352 */ 1353 if (m->hold_count) { 1354 vm_page_and_queue_spin_lock(m); 1355 if (m->queue - m->pc == PQ_ACTIVE) { 1356 TAILQ_REMOVE( 1357 &vm_page_queues[PQ_ACTIVE + q].pl, 1358 m, pageq); 1359 TAILQ_INSERT_TAIL( 1360 &vm_page_queues[PQ_ACTIVE + q].pl, 1361 m, pageq); 1362 } 1363 vm_page_and_queue_spin_unlock(m); 1364 vm_page_wakeup(m); 1365 goto next; 1366 } 1367 #endif 1368 /* 1369 * We can just remove wired pages from the queue 1370 */ 1371 if (m->wire_count) { 1372 vm_page_unqueue_nowakeup(m); 1373 vm_page_wakeup(m); 1374 goto next; 1375 } 1376 1377 /* 1378 * The emergency pager ignores vnode-backed pages as these 1379 * are the pages that probably bricked the main pager. 
1380 */ 1381 if (isep && m->object && m->object->type == OBJT_VNODE) { 1382 vm_page_and_queue_spin_lock(m); 1383 if (m->queue - m->pc == PQ_ACTIVE) { 1384 TAILQ_REMOVE( 1385 &vm_page_queues[PQ_ACTIVE + q].pl, 1386 m, pageq); 1387 TAILQ_INSERT_TAIL( 1388 &vm_page_queues[PQ_ACTIVE + q].pl, 1389 m, pageq); 1390 } 1391 vm_page_and_queue_spin_unlock(m); 1392 vm_page_wakeup(m); 1393 goto next; 1394 } 1395 1396 /* 1397 * The count for pagedaemon pages is done after checking the 1398 * page for eligibility... 1399 */ 1400 mycpu->gd_cnt.v_pdpages++; 1401 1402 /* 1403 * Check to see "how much" the page has been used and clear 1404 * the tracking access bits. If the object has no references 1405 * don't bother paying the expense. 1406 */ 1407 actcount = 0; 1408 if (m->object && m->object->ref_count != 0) { 1409 if (m->flags & PG_REFERENCED) 1410 ++actcount; 1411 actcount += pmap_ts_referenced(m); 1412 if (actcount) { 1413 m->act_count += ACT_ADVANCE + actcount; 1414 if (m->act_count > ACT_MAX) 1415 m->act_count = ACT_MAX; 1416 } 1417 } 1418 vm_page_flag_clear(m, PG_REFERENCED); 1419 1420 /* 1421 * actcount is only valid if the object ref_count is non-zero. 1422 * If the page does not have an object, actcount will be zero. 
1423 */ 1424 if (actcount && m->object->ref_count != 0) { 1425 vm_page_and_queue_spin_lock(m); 1426 if (m->queue - m->pc == PQ_ACTIVE) { 1427 TAILQ_REMOVE( 1428 &vm_page_queues[PQ_ACTIVE + q].pl, 1429 m, pageq); 1430 TAILQ_INSERT_TAIL( 1431 &vm_page_queues[PQ_ACTIVE + q].pl, 1432 m, pageq); 1433 } 1434 vm_page_and_queue_spin_unlock(m); 1435 vm_page_wakeup(m); 1436 } else { 1437 switch(m->object->type) { 1438 case OBJT_DEFAULT: 1439 case OBJT_SWAP: 1440 m->act_count -= min(m->act_count, 1441 vm_anonmem_decline); 1442 break; 1443 default: 1444 m->act_count -= min(m->act_count, 1445 vm_filemem_decline); 1446 break; 1447 } 1448 if (vm_pageout_algorithm || 1449 (m->object == NULL) || 1450 (m->object && (m->object->ref_count == 0)) || 1451 m->act_count < pass + 1 1452 ) { 1453 /* 1454 * Deactivate the page. If we had a 1455 * shortage from our inactive scan try to 1456 * free (cache) the page instead. 1457 * 1458 * Don't just blindly cache the page if 1459 * we do not have a shortage from the 1460 * inactive scan, that could lead to 1461 * gigabytes being moved. 
1462 */ 1463 --inactive_shortage; 1464 if (avail_shortage - delta > 0 || 1465 (m->object && (m->object->ref_count == 0))) 1466 { 1467 if (avail_shortage - delta > 0) 1468 ++*recycle_countp; 1469 vm_page_protect(m, VM_PROT_NONE); 1470 if (m->dirty == 0 && 1471 (m->flags & PG_NEED_COMMIT) == 0 && 1472 avail_shortage - delta > 0) { 1473 vm_page_cache(m); 1474 } else { 1475 vm_page_deactivate(m); 1476 vm_page_wakeup(m); 1477 } 1478 } else { 1479 vm_page_deactivate(m); 1480 vm_page_wakeup(m); 1481 } 1482 ++delta; 1483 } else { 1484 vm_page_and_queue_spin_lock(m); 1485 if (m->queue - m->pc == PQ_ACTIVE) { 1486 TAILQ_REMOVE( 1487 &vm_page_queues[PQ_ACTIVE + q].pl, 1488 m, pageq); 1489 TAILQ_INSERT_TAIL( 1490 &vm_page_queues[PQ_ACTIVE + q].pl, 1491 m, pageq); 1492 } 1493 vm_page_and_queue_spin_unlock(m); 1494 vm_page_wakeup(m); 1495 } 1496 } 1497 next: 1498 lwkt_yield(); 1499 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1500 } 1501 1502 /* 1503 * Clean out our local marker. 1504 * 1505 * Page queue still spin-locked. 1506 */ 1507 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1508 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1509 1510 return (delta); 1511 } 1512 1513 /* 1514 * The number of actually free pages can drop down to v_free_reserved, 1515 * we try to build the free count back above v_free_min. Note that 1516 * vm_paging_needed() also returns TRUE if v_free_count is not at 1517 * least v_free_min so that is the minimum we must build the free 1518 * count to. 1519 * 1520 * We use a slightly higher target to improve hysteresis, 1521 * ((v_free_target + v_free_min) / 2). Since v_free_target 1522 * is usually the same as v_cache_min this maintains about 1523 * half the pages in the free queue as are in the cache queue, 1524 * providing pretty good pipelining for pageout operation. 1525 * 1526 * The system operator can manipulate vm.v_cache_min and 1527 * vm.v_free_target to tune the pageout demon. Be sure 1528 * to keep vm.v_free_min < vm.v_free_target. 
 *
 * Note that the original paging target is to get at least
 * (free_min + cache_min) into (free + cache).  The slightly
 * higher target will shift additional pages from cache to free
 * without effecting the original paging target in order to
 * maintain better hysteresis and not have the free count always
 * be dead-on v_free_min.
 *
 * NOTE: we are still in a critical section.
 *
 * Pages moved from PQ_CACHE to totally free are not counted in the
 * pages_freed counter.
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static void
vm_pageout_scan_cache(long avail_shortage, int pass,
		      long vnodes_skipped, long recycle_count)
{
	static int lastkillticks;
	struct vm_pageout_scan_info info;
	vm_page_t m;
	int isep;

	isep = (curthread == emergpager);

	while (vmstats.v_free_count <
	       (vmstats.v_free_min + vmstats.v_free_target) / 2) {
		/*
		 * This steals some code from vm/vm_page.c
		 *
		 * Create two rovers and adjust the code to reduce
		 * chances of them winding up at the same index (which
		 * can cause a lot of contention).
		 *
		 * The primary pagedaemon uses rover [0] (advancing) and
		 * the emergency pagedaemon uses rover [1] (retreating),
		 * see next_rover below.
		 */
		static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };

		/* if the rovers collide, just advance ours and retry */
		if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
			goto next_rover;

		m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
		if (m == NULL)
			break;

		/*
		 * If the busy attempt fails we can still deactivate the page.
		 */
		/* page is returned removed from its queue and spinlocked */
		if (vm_page_busy_try(m, TRUE)) {
			vm_page_deactivate_locked(m);
			vm_page_spin_unlock(m);
			continue;
		}
		vm_page_spin_unlock(m);
		pagedaemon_wakeup();
		lwkt_yield();

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) ||
		    m->hold_count ||
		    m->wire_count) {
			vm_page_deactivate(m);
			vm_page_wakeup(m);
			continue;
		}
		pmap_mapped_sync(m);
		KKASSERT((m->flags & PG_MAPPED) == 0);
		KKASSERT(m->dirty == 0);
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
next_rover:
		if (isep)
			cache_rover[1] -= PQ_PRIME2;
		else
			cache_rover[0] += PQ_PRIME2;
	}

#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static time_t lsec;
		if (time_uptime != lsec) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
			vm_req_vmdaemon();
			lsec = time_uptime;
		}
	}
#endif

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target() > 0) {
		if (vnodes_skipped && vm_page_count_min(0))
			speedup_syncer(NULL);
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_page_count_target()) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
			vm_req_vmdaemon();
		}
#endif
	}

	/*
	 * Handle catastrophic conditions.  Under good conditions we should
	 * be at the target, well beyond our minimum.  If we could not even
	 * reach our minimum the system is under heavy stress.  But just being
	 * under heavy stress does not trigger process killing.
	 *
	 * We consider ourselves to have run out of memory if the swap pager
	 * is full and avail_shortage is still positive.  The secondary check
	 * ensures that we do not kill processes if the instantanious
	 * availability is good, even if the pageout demon pass says it
	 * couldn't get to the target.
	 *
	 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
	 *	 SITUATIONS.
	 */
	if (swap_pager_almost_full &&
	    pass > 0 &&
	    isep == 0 &&
	    (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
		kprintf("Warning: system low on memory+swap "
			"shortage %ld for %d ticks!\n",
			avail_shortage, ticks - swap_fail_ticks);
		if (bootverbose)
			kprintf("Metrics: spaf=%d spf=%d pass=%d "
				"avail=%ld target=%ld last=%u\n",
				swap_pager_almost_full,
				swap_pager_full,
				pass,
				avail_shortage,
				vm_paging_target(),
				(unsigned int)(ticks - lastkillticks));
	}
	if (swap_pager_full &&
	    pass > 1 &&
	    isep == 0 &&
	    avail_shortage > 0 &&
	    vm_paging_target() > 0 &&
	    (unsigned int)(ticks - lastkillticks) >= hz) {
		/*
		 * Kill something, maximum rate once per second to give
		 * the process time to free up sufficient memory.
		 */
		lastkillticks = ticks;
		info.bigproc = NULL;
		info.bigsize = 0;
		allproc_scan(vm_pageout_scan_callback, &info, 0);
		if (info.bigproc != NULL) {
			kprintf("Try to kill process %d %s\n",
				info.bigproc->p_pid, info.bigproc->p_comm);
			info.bigproc->p_nice = PRIO_MIN;
			info.bigproc->p_usched->resetpriority(
				FIRST_LWP_IN_PROC(info.bigproc));
			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
			killproc(info.bigproc, "out of swap space");
			wakeup(&vmstats.v_free_count);
			PRELE(info.bigproc);
		}
	}
}

/*
 * allproc_scan() callback used by the low-memory killer above.  Tracks
 * the largest eligible process in *data (a vm_pageout_scan_info).  Always
 * returns 0 so the scan visits every process.
 */
static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
	struct vm_pageout_scan_info *info = data;
	vm_offset_t size;

	/*
	 * Never kill system processes or init.  If we have configured swap
	 * then try to avoid killing low-numbered pids.
	 */
	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
		return (0);
	}

	lwkt_gettoken(&p->p_token);

	/*
	 * if the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get the approximate process size.  Note that anonymous pages
	 * with backing swap will be counted twice, but there should not
	 * be too many such pages due to the stress the VM system is
	 * under at this point.
	 */
	size = vmspace_anonymous_count(p->p_vmspace) +
		vmspace_swap_count(p->p_vmspace);

	/*
	 * If the this process is bigger than the biggest one
	 * remember it.  The previous candidate's hold is released.
	 */
	if (info->bigsize < size) {
		if (info->bigproc)
			PRELE(info->bigproc);
		PHOLD(p);
		info->bigproc = p;
		info->bigsize = size;
	}
	lwkt_reltoken(&p->p_token);
	lwkt_yield();

	return(0);
}

/*
 * This old guy slowly walks PQ_HOLD looking for pages which need to be
 * moved back to PQ_FREE.  It is possible for pages to accumulate here
 * when vm_page_free() races against vm_page_unhold(), resulting in a
 * page being left on a PQ_HOLD queue with hold_count == 0.
 *
 * It is easier to handle this edge condition here, in non-critical code,
 * rather than enforce a spin-lock for every 1->0 transition in
 * vm_page_unhold().
 *
 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
 */
static void
vm_pageout_scan_hold(int q)
{
	vm_page_t m;

	vm_page_queues_spin_lock(PQ_HOLD + q);
	TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Process one page and return
		 *
		 * (the queue is sorted such that a page with a
		 * non-zero hold_count terminates the walk)
		 */
		if (m->hold_count)
			break;
		kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
		vm_page_hold(m);
		vm_page_queues_spin_unlock(PQ_HOLD + q);
		vm_page_unhold(m);	/* reprocess */
		return;
	}
	vm_page_queues_spin_unlock(PQ_HOLD + q);
}

/*
 * This routine tries to maintain the pseudo LRU active queue,
 * so that during long periods of time where there is no paging,
 * that some statistic accumulation still occurs.  This code
 * helps the situation where paging just starts to occur.
 */
static void
vm_pageout_page_stats(int q)
{
	static int fullintervalcount = 0;
	struct vm_page marker;
	vm_page_t m;
	long pcount, tpcount;		/* Number of pages to check */
	long page_shortage;

	page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
			 vmstats.v_free_min) -
			(vmstats.v_free_count + vmstats.v_inactive_count +
			 vmstats.v_cache_count);

	/* nothing to do unless there is some pressure */
	if (page_shortage <= 0)
		return;

	/*
	 * On normal (partial) intervals only scan a fraction of the
	 * queue; every vm_pageout_full_stats_interval seconds scan
	 * the whole thing.
	 */
	pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
	fullintervalcount += vm_pageout_stats_interval;
	if (fullintervalcount < vm_pageout_full_stats_interval) {
		tpcount = (vm_pageout_stats_max * pcount) /
			  vmstats.v_page_count + 1;
		if (pcount > tpcount)
			pcount = tpcount;
	} else {
		fullintervalcount = 0;
	}

	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_ACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	vm_page_queues_spin_lock(PQ_ACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       pcount-- > 0)
	{
		int actcount;

		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
				   &marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Ignore pages we can't busy
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_ACTIVE + q);
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);

		/*
		 * We can just remove wired pages from the queue
		 */
		if (m->wire_count) {
			vm_page_unqueue_nowakeup(m);
			vm_page_wakeup(m);
			goto next;
		}


		/*
		 * We now have a safely busied page, the page and queue
		 * spinlocks have been released.
		 *
		 * Ignore held and wired pages
		 */
		if (m->hold_count || m->wire_count) {
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * Calculate activity
		 */
		actcount = 0;
		if (m->flags & PG_REFERENCED) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount += 1;
		}
		actcount += pmap_ts_referenced(m);

		/*
		 * Update act_count and move page to end of queue.
		 */
		if (actcount) {
			m->act_count += ACT_ADVANCE + actcount;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}

		if (m->act_count == 0) {
			/*
			 * We turn off page access, so that we have
			 * more accurate RSS stats.  We don't do this
			 * in the normal page deactivation when the
			 * system is loaded VM wise, because the
			 * cost of the large number of page protect
			 * operations would be higher than the value
			 * of doing the operation.
			 *
			 * We use the marker to save our place so
			 * we can release the spin lock.  both (m)
			 * and (next) will be invalid.
			 */
			vm_page_protect(m, VM_PROT_NONE);
			vm_page_deactivate(m);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
		}
		vm_page_wakeup(m);
next:
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Remove our local marker
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

/*
 * Compute the free-page watermarks from v_page_count.
 *
 * NOTE(review): the 'count' argument is not referenced by the body;
 *		 the calculation uses vmstats.v_page_count directly.
 */
static void
vm_pageout_free_page_calc(vm_size_t count)
{
	/*
	 * v_free_min		normal allocations
	 * v_free_reserved	system allocations
	 * v_pageout_free_min	allocations by pageout daemon
	 * v_interrupt_free_min	low level allocations (e.g swap structures)
	 *
	 * v_free_min is used to generate several other baselines, and they
	 * can get pretty silly on systems with a lot of memory.
	 */
	vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
	vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
	vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
	vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
	vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
}


/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFS's.
 */
static void
vm_pageout_thread(void)
{
	int pass;
	int q;
	int q1iterator = 0;
	int q2iterator = 0;
	int q3iterator = 0;
	int isep;

	curthread->td_flags |= TDF_SYSTHREAD;

	/*
	 * We only need to setup once.
	 */
	isep = 0;
	if (curthread == emergpager) {
		isep = 1;
		goto skip_setup;
	}

	/*
	 * Initialize some paging parameters.
	 */
	vm_pageout_free_page_calc(vmstats.v_page_count);

	/*
	 * v_free_target and v_cache_min control pageout hysteresis.
Note 2016 * that these are more a measure of the VM cache queue hysteresis 2017 * then the VM free queue. Specifically, v_free_target is the 2018 * high water mark (free+cache pages). 2019 * 2020 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the 2021 * low water mark, while v_free_min is the stop. v_cache_min must 2022 * be big enough to handle memory needs while the pageout daemon 2023 * is signalled and run to free more pages. 2024 */ 2025 vmstats.v_free_target = 4 * vmstats.v_free_min + 2026 vmstats.v_free_reserved; 2027 2028 /* 2029 * NOTE: With the new buffer cache b_act_count we want the default 2030 * inactive target to be a percentage of available memory. 2031 * 2032 * The inactive target essentially determines the minimum 2033 * number of 'temporary' pages capable of caching one-time-use 2034 * files when the VM system is otherwise full of pages 2035 * belonging to multi-time-use files or active program data. 2036 * 2037 * NOTE: The inactive target is aggressively persued only if the 2038 * inactive queue becomes too small. If the inactive queue 2039 * is large enough to satisfy page movement to free+cache 2040 * then it is repopulated more slowly from the active queue. 2041 * This allows a general inactive_target default to be set. 2042 * 2043 * There is an issue here for processes which sit mostly idle 2044 * 'overnight', such as sshd, tcsh, and X. Any movement from 2045 * the active queue will eventually cause such pages to 2046 * recycle eventually causing a lot of paging in the morning. 2047 * To reduce the incidence of this pages cycled out of the 2048 * buffer cache are moved directly to the inactive queue if 2049 * they were only used once or twice. 2050 * 2051 * The vfs.vm_cycle_point sysctl can be used to adjust this. 2052 * Increasing the value (up to 64) increases the number of 2053 * buffer recyclements which go directly to the inactive queue. 
2054 */ 2055 if (vmstats.v_free_count > 2048) { 2056 vmstats.v_cache_min = vmstats.v_free_target; 2057 vmstats.v_cache_max = 2 * vmstats.v_cache_min; 2058 } else { 2059 vmstats.v_cache_min = 0; 2060 vmstats.v_cache_max = 0; 2061 } 2062 vmstats.v_inactive_target = vmstats.v_free_count / 4; 2063 2064 /* XXX does not really belong here */ 2065 if (vm_page_max_wired == 0) 2066 vm_page_max_wired = vmstats.v_free_count / 3; 2067 2068 if (vm_pageout_stats_max == 0) 2069 vm_pageout_stats_max = vmstats.v_free_target; 2070 2071 /* 2072 * Set interval in seconds for stats scan. 2073 */ 2074 if (vm_pageout_stats_interval == 0) 2075 vm_pageout_stats_interval = 5; 2076 if (vm_pageout_full_stats_interval == 0) 2077 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; 2078 2079 2080 /* 2081 * Set maximum free per pass 2082 */ 2083 if (vm_pageout_stats_free_max == 0) 2084 vm_pageout_stats_free_max = 5; 2085 2086 swap_pager_swap_init(); 2087 pass = 0; 2088 2089 atomic_swap_int(&sequence_emerg_pager, 1); 2090 wakeup(&sequence_emerg_pager); 2091 2092 skip_setup: 2093 /* 2094 * Sequence emergency pager startup 2095 */ 2096 if (isep) { 2097 while (sequence_emerg_pager == 0) 2098 tsleep(&sequence_emerg_pager, 0, "pstartup", hz); 2099 } 2100 2101 /* 2102 * The pageout daemon is never done, so loop forever. 2103 * 2104 * WARNING! This code is being executed by two kernel threads 2105 * potentially simultaneously. 2106 */ 2107 while (TRUE) { 2108 int error; 2109 long avail_shortage; 2110 long inactive_shortage; 2111 long vnodes_skipped = 0; 2112 long recycle_count = 0; 2113 long tmp; 2114 2115 /* 2116 * Wait for an action request. If we timeout check to 2117 * see if paging is needed (in case the normal wakeup 2118 * code raced us). 2119 */ 2120 if (isep) { 2121 /* 2122 * Emergency pagedaemon monitors the primary 2123 * pagedaemon while vm_pages_needed != 0. 
2124 * 2125 * The emergency pagedaemon only runs if VM paging 2126 * is needed and the primary pagedaemon has not 2127 * updated vm_pagedaemon_time for more than 2 seconds. 2128 */ 2129 if (vm_pages_needed) 2130 tsleep(&vm_pagedaemon_time, 0, "psleep", hz); 2131 else 2132 tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10); 2133 if (vm_pages_needed == 0) { 2134 pass = 0; 2135 continue; 2136 } 2137 if ((int)(ticks - vm_pagedaemon_time) < hz * 2) { 2138 pass = 0; 2139 continue; 2140 } 2141 } else { 2142 /* 2143 * Primary pagedaemon 2144 * 2145 * NOTE: We unconditionally cleanup PQ_HOLD even 2146 * when there is no work to do. 2147 */ 2148 vm_pageout_scan_hold(q3iterator & PQ_L2_MASK); 2149 ++q3iterator; 2150 2151 if (vm_pages_needed == 0) { 2152 error = tsleep(&vm_pages_needed, 2153 0, "psleep", 2154 vm_pageout_stats_interval * hz); 2155 if (error && 2156 vm_paging_needed(0) == 0 && 2157 vm_pages_needed == 0) { 2158 for (q = 0; q < PQ_L2_SIZE; ++q) 2159 vm_pageout_page_stats(q); 2160 continue; 2161 } 2162 vm_pagedaemon_time = ticks; 2163 vm_pages_needed = 1; 2164 2165 /* 2166 * Wake the emergency pagedaemon up so it 2167 * can monitor us. It will automatically 2168 * go back into a long sleep when 2169 * vm_pages_needed returns to 0. 2170 */ 2171 wakeup(&vm_pagedaemon_time); 2172 } 2173 } 2174 2175 mycpu->gd_cnt.v_pdwakeups++; 2176 2177 /* 2178 * Scan for INACTIVE->CLEAN/PAGEOUT 2179 * 2180 * This routine tries to avoid thrashing the system with 2181 * unnecessary activity. 2182 * 2183 * Calculate our target for the number of free+cache pages we 2184 * want to get to. This is higher then the number that causes 2185 * allocations to stall (severe) in order to provide hysteresis, 2186 * and if we don't make it all the way but get to the minimum 2187 * we're happy. Goose it a bit if there are multiple requests 2188 * for memory. 2189 * 2190 * Don't reduce avail_shortage inside the loop or the 2191 * PQAVERAGE() calculation will break. 2192 * 2193 * NOTE! 
deficit is differentiated from avail_shortage as 2194 * REQUIRING at least (deficit) pages to be cleaned, 2195 * even if the page queues are in good shape. This 2196 * is used primarily for handling per-process 2197 * RLIMIT_RSS and may also see small values when 2198 * processes block due to low memory. 2199 */ 2200 vmstats_rollup(); 2201 if (isep == 0) 2202 vm_pagedaemon_time = ticks; 2203 avail_shortage = vm_paging_target() + vm_pageout_deficit; 2204 vm_pageout_deficit = 0; 2205 2206 if (avail_shortage > 0) { 2207 long delta = 0; 2208 int qq; 2209 2210 qq = q1iterator; 2211 for (q = 0; q < PQ_L2_SIZE; ++q) { 2212 delta += vm_pageout_scan_inactive( 2213 pass, 2214 qq & PQ_L2_MASK, 2215 PQAVERAGE(avail_shortage), 2216 &vnodes_skipped); 2217 if (isep) 2218 --qq; 2219 else 2220 ++qq; 2221 if (avail_shortage - delta <= 0) 2222 break; 2223 } 2224 avail_shortage -= delta; 2225 q1iterator = qq; 2226 } 2227 2228 /* 2229 * Figure out how many active pages we must deactivate. If 2230 * we were able to reach our target with just the inactive 2231 * scan above we limit the number of active pages we 2232 * deactivate to reduce unnecessary work. 2233 */ 2234 vmstats_rollup(); 2235 if (isep == 0) 2236 vm_pagedaemon_time = ticks; 2237 inactive_shortage = vmstats.v_inactive_target - 2238 vmstats.v_inactive_count; 2239 2240 /* 2241 * If we were unable to free sufficient inactive pages to 2242 * satisfy the free/cache queue requirements then simply 2243 * reaching the inactive target may not be good enough. 2244 * Try to deactivate pages in excess of the target based 2245 * on the shortfall. 2246 * 2247 * However to prevent thrashing the VM system do not 2248 * deactivate more than an additional 1/10 the inactive 2249 * target's worth of active pages. 
2250 */ 2251 if (avail_shortage > 0) { 2252 tmp = avail_shortage * 2; 2253 if (tmp > vmstats.v_inactive_target / 10) 2254 tmp = vmstats.v_inactive_target / 10; 2255 inactive_shortage += tmp; 2256 } 2257 2258 /* 2259 * Only trigger a pmap cleanup on inactive shortage. 2260 */ 2261 if (isep == 0 && inactive_shortage > 0) { 2262 pmap_collect(); 2263 } 2264 2265 /* 2266 * Scan for ACTIVE->INACTIVE 2267 * 2268 * Only trigger on inactive shortage. Triggering on 2269 * avail_shortage can starve the active queue with 2270 * unnecessary active->inactive transitions and destroy 2271 * performance. 2272 * 2273 * If this is the emergency pager, always try to move 2274 * a few pages from active to inactive because the inactive 2275 * queue might have enough pages, but not enough anonymous 2276 * pages. 2277 */ 2278 if (isep && inactive_shortage < vm_emerg_launder) 2279 inactive_shortage = vm_emerg_launder; 2280 2281 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) { 2282 long delta = 0; 2283 int qq; 2284 2285 qq = q2iterator; 2286 for (q = 0; q < PQ_L2_SIZE; ++q) { 2287 delta += vm_pageout_scan_active( 2288 pass, 2289 qq & PQ_L2_MASK, 2290 PQAVERAGE(avail_shortage), 2291 PQAVERAGE(inactive_shortage), 2292 &recycle_count); 2293 if (isep) 2294 --qq; 2295 else 2296 ++qq; 2297 if (inactive_shortage - delta <= 0 && 2298 avail_shortage - delta <= 0) { 2299 break; 2300 } 2301 } 2302 inactive_shortage -= delta; 2303 avail_shortage -= delta; 2304 q2iterator = qq; 2305 } 2306 2307 /* 2308 * Scan for CACHE->FREE 2309 * 2310 * Finally free enough cache pages to meet our free page 2311 * requirement and take more drastic measures if we are 2312 * still in trouble. 2313 */ 2314 vmstats_rollup(); 2315 if (isep == 0) 2316 vm_pagedaemon_time = ticks; 2317 vm_pageout_scan_cache(avail_shortage, pass, 2318 vnodes_skipped, recycle_count); 2319 2320 /* 2321 * Wait for more work. 
2322 */ 2323 if (avail_shortage > 0) { 2324 ++pass; 2325 if (pass < 10 && vm_pages_needed > 1) { 2326 /* 2327 * Normal operation, additional processes 2328 * have already kicked us. Retry immediately 2329 * unless swap space is completely full in 2330 * which case delay a bit. 2331 */ 2332 if (swap_pager_full) { 2333 tsleep(&vm_pages_needed, 0, "pdelay", 2334 hz / 5); 2335 } /* else immediate retry */ 2336 } else if (pass < 10) { 2337 /* 2338 * Normal operation, fewer processes. Delay 2339 * a bit but allow wakeups. vm_pages_needed 2340 * is only adjusted against the primary 2341 * pagedaemon here. 2342 */ 2343 if (isep == 0) 2344 vm_pages_needed = 0; 2345 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2346 if (isep == 0) 2347 vm_pages_needed = 1; 2348 } else if (swap_pager_full == 0) { 2349 /* 2350 * We've taken too many passes, forced delay. 2351 */ 2352 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2353 } else { 2354 /* 2355 * Running out of memory, catastrophic 2356 * back-off to one-second intervals. 2357 */ 2358 tsleep(&vm_pages_needed, 0, "pdelay", hz); 2359 } 2360 } else if (vm_pages_needed) { 2361 /* 2362 * Interlocked wakeup of waiters (non-optional). 
			 *
			 * Similar to vm_page_free_wakeup() in vm_page.c,
			 * wake up any waiters on v_free_count once the
			 * page counts have recovered past the minimum or
			 * target thresholds.
			 */
			pass = 0;
			if (!vm_page_count_min(vm_page_free_hysteresis) ||
			    !vm_page_count_target()) {
				vm_pages_needed = 0;
				wakeup(&vmstats.v_free_count);
			}
		} else {
			pass = 0;
		}
	}
}

/*
 * Primary pageout daemon kernel thread, started at boot via SYSINIT.
 */
static struct kproc_desc pg1_kp = {
	"pagedaemon",
	vm_pageout_thread,
	&pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

/*
 * Emergency pageout daemon kernel thread.  Runs the same thread function
 * as the primary daemon.
 */
static struct kproc_desc pg2_kp = {
	"emergpager",
	vm_pageout_thread,
	&emergpager
};
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);


/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
	/*
	 * Never self-wakeup: the pagedaemon thread itself allocates pages
	 * and must not block waiting on its own progress.
	 */
	if (vm_paging_needed(0) && curthread != pagethread) {
		if (vm_pages_needed == 0) {
			vm_pages_needed = 1;	/* SMP race ok */
			wakeup(&vm_pages_needed);
		} else if (vm_page_count_min(0)) {
			++vm_pages_needed;	/* SMP race ok */
		}
	}
}

#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
2427 */ 2428 static void 2429 vm_req_vmdaemon(void) 2430 { 2431 static int lastrun = 0; 2432 2433 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 2434 wakeup(&vm_daemon_needed); 2435 lastrun = ticks; 2436 } 2437 } 2438 2439 static int vm_daemon_callback(struct proc *p, void *data __unused); 2440 2441 /* 2442 * No requirements. 2443 */ 2444 static void 2445 vm_daemon(void) 2446 { 2447 int req_swapout; 2448 2449 while (TRUE) { 2450 tsleep(&vm_daemon_needed, 0, "psleep", 0); 2451 req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0); 2452 2453 /* 2454 * forced swapouts 2455 */ 2456 if (req_swapout) 2457 swapout_procs(vm_pageout_req_swapout); 2458 2459 /* 2460 * scan the processes for exceeding their rlimits or if 2461 * process is swapped out -- deactivate pages 2462 */ 2463 allproc_scan(vm_daemon_callback, NULL, 0); 2464 } 2465 } 2466 2467 static int 2468 vm_daemon_callback(struct proc *p, void *data __unused) 2469 { 2470 struct vmspace *vm; 2471 vm_pindex_t limit, size; 2472 2473 /* 2474 * if this is a system process or if we have already 2475 * looked at this process, skip it. 2476 */ 2477 lwkt_gettoken(&p->p_token); 2478 2479 if (p->p_flags & (P_SYSTEM | P_WEXIT)) { 2480 lwkt_reltoken(&p->p_token); 2481 return (0); 2482 } 2483 2484 /* 2485 * if the process is in a non-running type state, 2486 * don't touch it. 2487 */ 2488 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 2489 lwkt_reltoken(&p->p_token); 2490 return (0); 2491 } 2492 2493 /* 2494 * get a limit 2495 */ 2496 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, 2497 p->p_rlimit[RLIMIT_RSS].rlim_max)); 2498 2499 /* 2500 * let processes that are swapped out really be 2501 * swapped out. Set the limit to nothing to get as 2502 * many pages out to swap as possible. 
2503 */ 2504 if (p->p_flags & P_SWAPPEDOUT) 2505 limit = 0; 2506 2507 vm = p->p_vmspace; 2508 vmspace_hold(vm); 2509 size = pmap_resident_tlnw_count(&vm->vm_pmap); 2510 if (limit >= 0 && size > 4096 && 2511 size - 4096 >= limit && vm_pageout_memuse_mode >= 1) { 2512 vm_pageout_map_deactivate_pages(&vm->vm_map, limit); 2513 } 2514 vmspace_drop(vm); 2515 2516 lwkt_reltoken(&p->p_token); 2517 2518 return (0); 2519 } 2520 2521 #endif 2522