1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * The Mach Operating System project at Carnegie-Mellon University. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 37 * 38 * 39 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 40 * All rights reserved. 41 * 42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 43 * 44 * Permission to use, copy, modify and distribute this software and 45 * its documentation is hereby granted, provided that both the copyright 46 * notice and this permission notice appear in all copies of the 47 * software, derivative works or modified versions, and any portions 48 * thereof, and that both notices appear in supporting documentation. 49 * 50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 51 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 53 * 54 * Carnegie Mellon requests users of this software to return to 55 * 56 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 57 * School of Computer Science 58 * Carnegie Mellon University 59 * Pittsburgh PA 15213-3890 60 * 61 * any improvements or extensions that they make and grant Carnegie the 62 * rights to redistribute these changes. 63 * 64 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $ 65 */ 66 67 /* 68 * The proverbial page-out daemon. 
69 */ 70 71 #include "opt_vm.h" 72 #include <sys/param.h> 73 #include <sys/systm.h> 74 #include <sys/kernel.h> 75 #include <sys/proc.h> 76 #include <sys/kthread.h> 77 #include <sys/resourcevar.h> 78 #include <sys/signalvar.h> 79 #include <sys/vnode.h> 80 #include <sys/vmmeter.h> 81 #include <sys/conf.h> 82 #include <sys/sysctl.h> 83 84 #include <vm/vm.h> 85 #include <vm/vm_param.h> 86 #include <sys/lock.h> 87 #include <vm/vm_object.h> 88 #include <vm/vm_page.h> 89 #include <vm/vm_map.h> 90 #include <vm/vm_pageout.h> 91 #include <vm/vm_pager.h> 92 #include <vm/swap_pager.h> 93 #include <vm/vm_extern.h> 94 95 #include <sys/spinlock2.h> 96 #include <vm/vm_page2.h> 97 98 /* 99 * System initialization 100 */ 101 102 /* the kernel process "vm_pageout"*/ 103 static int vm_pageout_page(vm_page_t m, long *max_launderp, 104 long *vnodes_skippedp, struct vnode **vpfailedp, 105 int pass, int vmflush_flags); 106 static int vm_pageout_clean_helper (vm_page_t, int); 107 static void vm_pageout_free_page_calc (vm_size_t count); 108 static void vm_pageout_page_free(vm_page_t m) ; 109 struct thread *emergpager; 110 struct thread *pagethread; 111 static int sequence_emerg_pager; 112 113 #if !defined(NO_SWAPPING) 114 /* the kernel process "vm_daemon"*/ 115 static void vm_daemon (void); 116 static struct thread *vmthread; 117 118 static struct kproc_desc vm_kp = { 119 "vmdaemon", 120 vm_daemon, 121 &vmthread 122 }; 123 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); 124 #endif 125 126 int vm_pages_needed = 0; /* Event on which pageout daemon sleeps */ 127 int vm_pageout_deficit = 0; /* Estimated number of pages deficit */ 128 int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */ 129 int vm_page_free_hysteresis = 16; 130 static int vm_pagedaemon_time; 131 132 #if !defined(NO_SWAPPING) 133 static int vm_pageout_req_swapout; 134 static int vm_daemon_needed; 135 #endif 136 __read_mostly static int vm_max_launder = 4096; 137 __read_mostly static int vm_emerg_launder = 100; 138 __read_mostly static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; 139 __read_mostly static int vm_pageout_full_stats_interval = 0; 140 __read_mostly static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; 141 __read_mostly static int defer_swap_pageouts=0; 142 __read_mostly static int disable_swap_pageouts=0; 143 __read_mostly static u_int vm_anonmem_decline = ACT_DECLINE; 144 __read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2; 145 __read_mostly static int vm_pageout_debug; 146 147 #if defined(NO_SWAPPING) 148 __read_mostly static int vm_swap_enabled=0; 149 __read_mostly static int vm_swap_idle_enabled=0; 150 #else 151 __read_mostly static int vm_swap_enabled=1; 152 __read_mostly static int vm_swap_idle_enabled=0; 153 #endif 154 155 /* 0-disable, 1-passive, 2-active swp*/ 156 __read_mostly int vm_pageout_memuse_mode=1; 157 158 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline, 159 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory"); 160 161 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline, 162 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache"); 163 164 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis, 165 CTLFLAG_RW, &vm_page_free_hysteresis, 0, 166 "Free more pages than the minimum required"); 167 168 SYSCTL_INT(_vm, OID_AUTO, max_launder, 169 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 170 SYSCTL_INT(_vm, OID_AUTO, emerg_launder, 171 CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum"); 172 173 
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, 174 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 175 176 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, 177 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); 178 179 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, 180 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); 181 182 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, 183 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); 184 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode, 185 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode"); 186 SYSCTL_INT(_vm, OID_AUTO, pageout_debug, 187 CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)"); 188 189 190 #if defined(NO_SWAPPING) 191 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 192 CTLFLAG_RD, &vm_swap_enabled, 0, ""); 193 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 194 CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); 195 #else 196 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 197 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 198 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 199 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 200 #endif 201 202 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 203 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 204 205 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 206 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 207 208 static int pageout_lock_miss; 209 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 210 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 211 212 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 213 214 #if !defined(NO_SWAPPING) 215 static void vm_req_vmdaemon (void); 216 #endif 217 static void vm_pageout_page_stats(int q); 218 219 /* 220 * Calculate approximately how many pages on each queue to try to 221 * clean. An exact calculation creates an edge condition when the 222 * queues are unbalanced so add significant slop. The queue scans 223 * will stop early when targets are reached and will start where they 224 * left off on the next pass. 225 * 226 * We need to be generous here because there are all sorts of loading 227 * conditions that can cause edge cases if try to average over all queues. 228 * In particular, storage subsystems have become so fast that paging 229 * activity can become quite frantic. Eventually we will probably need 230 * two paging threads, one for dirty pages and one for clean, to deal 231 * with the bandwidth requirements. 232 233 * So what we do is calculate a value that can be satisfied nominally by 234 * only having to scan half the queues. 235 */ 236 static __inline long 237 PQAVERAGE(long n) 238 { 239 long avg; 240 241 if (n >= 0) { 242 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1); 243 } else { 244 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1); 245 } 246 return avg; 247 } 248 249 /* 250 * vm_pageout_clean_helper: 251 * 252 * Clean the page and remove it from the laundry. The page must be busied 253 * by the caller and will be disposed of (put away, flushed) by this routine. 
254 */ 255 static int 256 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags) 257 { 258 vm_object_t object; 259 vm_page_t mc[BLIST_MAX_ALLOC]; 260 int error; 261 int ib, is, page_base; 262 vm_pindex_t pindex = m->pindex; 263 264 object = m->object; 265 266 /* 267 * Don't mess with the page if it's held or special. Theoretically 268 * we can pageout held pages but there is no real need to press our 269 * luck, so don't. 270 */ 271 if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) { 272 vm_page_wakeup(m); 273 return 0; 274 } 275 276 /* 277 * Place page in cluster. Align cluster for optimal swap space 278 * allocation (whether it is swap or not). This is typically ~16-32 279 * pages, which also tends to align the cluster to multiples of the 280 * filesystem block size if backed by a filesystem. 281 */ 282 page_base = pindex % BLIST_MAX_ALLOC; 283 mc[page_base] = m; 284 ib = page_base - 1; 285 is = page_base + 1; 286 287 /* 288 * Scan object for clusterable pages. 289 * 290 * We can cluster ONLY if: ->> the page is NOT 291 * clean, wired, busy, held, or mapped into a 292 * buffer, and one of the following: 293 * 1) The page is inactive, or a seldom used 294 * active page. 295 * -or- 296 * 2) we force the issue. 297 * 298 * During heavy mmap/modification loads the pageout 299 * daemon can really fragment the underlying file 300 * due to flushing pages out of order and not trying 301 * align the clusters (which leave sporatic out-of-order 302 * holes). To solve this problem we do the reverse scan 303 * first and attempt to align our cluster, then do a 304 * forward scan if room remains. 305 */ 306 vm_object_hold(object); 307 308 while (ib >= 0) { 309 vm_page_t p; 310 311 p = vm_page_lookup_busy_try(object, pindex - page_base + ib, 312 TRUE, &error); 313 if (error || p == NULL) 314 break; 315 if ((p->queue - p->pc) == PQ_CACHE || 316 (p->flags & PG_UNQUEUED)) { 317 vm_page_wakeup(p); 318 break; 319 } 320 vm_page_test_dirty(p); 321 if (((p->dirty & p->valid) == 0 && 322 (p->flags & PG_NEED_COMMIT) == 0) || 323 p->wire_count != 0 || /* may be held by buf cache */ 324 p->hold_count != 0) { /* may be undergoing I/O */ 325 vm_page_wakeup(p); 326 break; 327 } 328 if (p->queue - p->pc != PQ_INACTIVE) { 329 if (p->queue - p->pc != PQ_ACTIVE || 330 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 331 vm_page_wakeup(p); 332 break; 333 } 334 } 335 336 /* 337 * Try to maintain page groupings in the cluster. 338 */ 339 if (m->flags & PG_WINATCFLS) 340 vm_page_flag_set(p, PG_WINATCFLS); 341 else 342 vm_page_flag_clear(p, PG_WINATCFLS); 343 p->act_count = m->act_count; 344 345 mc[ib] = p; 346 --ib; 347 } 348 ++ib; /* fixup */ 349 350 while (is < BLIST_MAX_ALLOC && 351 pindex - page_base + is < object->size) { 352 vm_page_t p; 353 354 p = vm_page_lookup_busy_try(object, pindex - page_base + is, 355 TRUE, &error); 356 if (error || p == NULL) 357 break; 358 if (((p->queue - p->pc) == PQ_CACHE) || 359 (p->flags & PG_UNQUEUED)) { 360 vm_page_wakeup(p); 361 break; 362 } 363 vm_page_test_dirty(p); 364 if (((p->dirty & p->valid) == 0 && 365 (p->flags & PG_NEED_COMMIT) == 0) || 366 p->wire_count != 0 || /* may be held by buf cache */ 367 p->hold_count != 0) { /* may be undergoing I/O */ 368 vm_page_wakeup(p); 369 break; 370 } 371 if (p->queue - p->pc != PQ_INACTIVE) { 372 if (p->queue - p->pc != PQ_ACTIVE || 373 (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) { 374 vm_page_wakeup(p); 375 break; 376 } 377 } 378 379 /* 380 * Try to maintain page groupings in the cluster. 
381 */ 382 if (m->flags & PG_WINATCFLS) 383 vm_page_flag_set(p, PG_WINATCFLS); 384 else 385 vm_page_flag_clear(p, PG_WINATCFLS); 386 p->act_count = m->act_count; 387 388 mc[is] = p; 389 ++is; 390 } 391 392 vm_object_drop(object); 393 394 /* 395 * we allow reads during pageouts... 396 */ 397 return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags); 398 } 399 400 /* 401 * vm_pageout_flush() - launder the given pages 402 * 403 * The given pages are laundered. Note that we setup for the start of 404 * I/O ( i.e. busy the page ), mark it read-only, and bump the object 405 * reference count all in here rather then in the parent. If we want 406 * the parent to do more sophisticated things we may have to change 407 * the ordering. 408 * 409 * The pages in the array must be busied by the caller and will be 410 * unbusied by this function. 411 */ 412 int 413 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags) 414 { 415 vm_object_t object; 416 int pageout_status[count]; 417 int numpagedout = 0; 418 int i; 419 int dodebug; 420 421 if (vm_pageout_debug > 0) { 422 --vm_pageout_debug; 423 dodebug = 1; 424 } else { 425 dodebug = 0; 426 } 427 428 /* 429 * Initiate I/O. Bump the vm_page_t->busy counter. 430 */ 431 for (i = 0; i < count; i++) { 432 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 433 ("vm_pageout_flush page %p index %d/%d: partially " 434 "invalid page", mc[i], i, count)); 435 vm_page_io_start(mc[i]); 436 } 437 438 /* 439 * We must make the pages read-only. This will also force the 440 * modified bit in the related pmaps to be cleared. The pager 441 * cannot clear the bit for us since the I/O completion code 442 * typically runs from an interrupt. The act of making the page 443 * read-only handles the case for us. 444 * 445 * Then we can unbusy the pages, we still hold a reference by virtue 446 * of our soft-busy. 447 */ 448 if (dodebug) 449 kprintf("pageout: "); 450 for (i = 0; i < count; i++) { 451 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) 452 vm_page_protect(mc[i], VM_PROT_NONE); 453 else 454 vm_page_protect(mc[i], VM_PROT_READ); 455 vm_page_wakeup(mc[i]); 456 if (dodebug) 457 kprintf(" %p", mc[i]); 458 } 459 if (dodebug) 460 kprintf("\n"); 461 462 object = mc[0]->object; 463 vm_object_pip_add(object, count); 464 465 vm_pager_put_pages(object, mc, count, 466 (vmflush_flags | 467 ((object == &kernel_object) ? 468 VM_PAGER_PUT_SYNC : 0)), 469 pageout_status); 470 471 if (dodebug) 472 kprintf("result: "); 473 for (i = 0; i < count; i++) { 474 vm_page_t mt = mc[i]; 475 476 if (dodebug) 477 kprintf(" S%d", pageout_status[i]); 478 479 switch (pageout_status[i]) { 480 case VM_PAGER_OK: 481 numpagedout++; 482 break; 483 case VM_PAGER_PEND: 484 numpagedout++; 485 break; 486 case VM_PAGER_BAD: 487 /* 488 * Page outside of range of object. Right now we 489 * essentially lose the changes by pretending it 490 * worked. 491 */ 492 vm_page_busy_wait(mt, FALSE, "pgbad"); 493 pmap_clear_modify(mt); 494 vm_page_undirty(mt); 495 vm_page_wakeup(mt); 496 break; 497 case VM_PAGER_ERROR: 498 case VM_PAGER_FAIL: 499 /* 500 * A page typically cannot be paged out when we 501 * have run out of swap. We leave the page 502 * marked inactive and will try to page it out 503 * again later. 504 * 505 * Starvation of the active page list is used to 506 * determine when the system is massively memory 507 * starved. 508 */ 509 break; 510 case VM_PAGER_AGAIN: 511 break; 512 } 513 514 /* 515 * If not PENDing this was a synchronous operation and we 516 * clean up after the I/O. 
If it is PENDing the mess is 517 * cleaned up asynchronously. 518 * 519 * Also nominally act on the caller's wishes if the caller 520 * wants to try to really clean (cache or free) the page. 521 * 522 * Also nominally deactivate the page if the system is 523 * memory-stressed. 524 */ 525 if (pageout_status[i] != VM_PAGER_PEND) { 526 vm_page_busy_wait(mt, FALSE, "pgouw"); 527 vm_page_io_finish(mt); 528 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) { 529 vm_page_try_to_cache(mt); 530 if (dodebug) 531 kprintf("A[pq_cache=%d]", 532 ((mt->queue - mt->pc) == PQ_CACHE)); 533 } else if (vm_page_count_severe()) { 534 vm_page_deactivate(mt); 535 vm_page_wakeup(mt); 536 if (dodebug) 537 kprintf("B"); 538 } else { 539 vm_page_wakeup(mt); 540 if (dodebug) 541 kprintf("C"); 542 } 543 vm_object_pip_wakeup(object); 544 } 545 } 546 if (dodebug) 547 kprintf("\n"); 548 return numpagedout; 549 } 550 551 #if !defined(NO_SWAPPING) 552 553 /* 554 * Callback function, page busied for us. We must dispose of the busy 555 * condition. Any related pmap pages may be held but will not be locked. 556 */ 557 static 558 int 559 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va, 560 vm_page_t p) 561 { 562 int actcount; 563 int cleanit = 0; 564 565 /* 566 * Basic tests - There should never be a marker, and we can stop 567 * once the RSS is below the required level. 568 */ 569 KKASSERT((p->flags & PG_MARKER) == 0); 570 if (pmap_resident_tlnw_count(info->pmap) <= info->limit) { 571 vm_page_wakeup(p); 572 return(-1); 573 } 574 575 mycpu->gd_cnt.v_pdpages++; 576 577 if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) { 578 vm_page_wakeup(p); 579 goto done; 580 } 581 582 ++info->actioncount; 583 584 /* 585 * Check if the page has been referened recently. If it has, 586 * activate it and skip. 587 */ 588 actcount = pmap_ts_referenced(p); 589 if (actcount) { 590 vm_page_flag_set(p, PG_REFERENCED); 591 } else if (p->flags & PG_REFERENCED) { 592 actcount = 1; 593 } 594 595 if (actcount) { 596 if (p->queue - p->pc != PQ_ACTIVE) { 597 vm_page_and_queue_spin_lock(p); 598 if (p->queue - p->pc != PQ_ACTIVE) { 599 vm_page_and_queue_spin_unlock(p); 600 vm_page_activate(p); 601 } else { 602 vm_page_and_queue_spin_unlock(p); 603 } 604 } else { 605 p->act_count += actcount; 606 if (p->act_count > ACT_MAX) 607 p->act_count = ACT_MAX; 608 } 609 vm_page_flag_clear(p, PG_REFERENCED); 610 vm_page_wakeup(p); 611 goto done; 612 } 613 614 /* 615 * Remove the page from this particular pmap. Once we do this, our 616 * pmap scans will not see it again (unless it gets faulted in), so 617 * we must actively dispose of or deal with the page. 618 */ 619 pmap_remove_specific(info->pmap, p); 620 621 /* 622 * If the page is not mapped to another process (i.e. as would be 623 * typical if this were a shared page from a library) then deactivate 624 * the page and clean it in two passes only. 625 * 626 * If the page hasn't been referenced since the last check, remove it 627 * from the pmap. If it is no longer mapped, deactivate it 628 * immediately, accelerating the normal decline. 629 * 630 * Once the page has been removed from the pmap the RSS code no 631 * longer tracks it so we have to make sure that it is staged for 632 * potential flush action. 
633 */ 634 if ((p->flags & PG_MAPPED) == 0 || 635 (pmap_mapped_sync(p) & PG_MAPPED) == 0) { 636 if (p->queue - p->pc == PQ_ACTIVE) { 637 vm_page_deactivate(p); 638 } 639 if (p->queue - p->pc == PQ_INACTIVE) { 640 cleanit = 1; 641 } 642 } 643 644 /* 645 * Ok, try to fully clean the page and any nearby pages such that at 646 * least the requested page is freed or moved to the cache queue. 647 * 648 * We usually do this synchronously to allow us to get the page into 649 * the CACHE queue quickly, which will prevent memory exhaustion if 650 * a process with a memoryuse limit is running away. However, the 651 * sysadmin may desire to set vm.swap_user_async which relaxes this 652 * and improves write performance. 653 */ 654 if (cleanit) { 655 long max_launder = 0x7FFF; 656 long vnodes_skipped = 0; 657 int vmflush_flags; 658 struct vnode *vpfailed = NULL; 659 660 info->offset = va; 661 662 if (vm_pageout_memuse_mode >= 2) { 663 vmflush_flags = VM_PAGER_TRY_TO_CACHE | 664 VM_PAGER_ALLOW_ACTIVE; 665 if (swap_user_async == 0) 666 vmflush_flags |= VM_PAGER_PUT_SYNC; 667 vm_page_flag_set(p, PG_WINATCFLS); 668 info->cleancount += 669 vm_pageout_page(p, &max_launder, 670 &vnodes_skipped, 671 &vpfailed, 1, vmflush_flags); 672 } else { 673 vm_page_wakeup(p); 674 ++info->cleancount; 675 } 676 } else { 677 vm_page_wakeup(p); 678 } 679 680 /* 681 * Must be at end to avoid SMP races. 682 */ 683 done: 684 lwkt_user_yield(); 685 return 0; 686 } 687 688 /* 689 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits. 690 * that is relatively difficult to do. We try to keep track of where we 691 * left off last time to reduce scan overhead. 692 * 693 * Called when vm_pageout_memuse_mode is >= 1. 694 */ 695 void 696 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit) 697 { 698 vm_offset_t pgout_offset; 699 struct pmap_pgscan_info info; 700 int retries = 3; 701 702 pgout_offset = map->pgout_offset; 703 again: 704 #if 0 705 kprintf("%016jx ", pgout_offset); 706 #endif 707 if (pgout_offset < VM_MIN_USER_ADDRESS) 708 pgout_offset = VM_MIN_USER_ADDRESS; 709 if (pgout_offset >= VM_MAX_USER_ADDRESS) 710 pgout_offset = 0; 711 info.pmap = vm_map_pmap(map); 712 info.limit = limit; 713 info.beg_addr = pgout_offset; 714 info.end_addr = VM_MAX_USER_ADDRESS; 715 info.callback = vm_pageout_mdp_callback; 716 info.cleancount = 0; 717 info.actioncount = 0; 718 info.busycount = 0; 719 720 pmap_pgscan(&info); 721 pgout_offset = info.offset; 722 #if 0 723 kprintf("%016jx %08lx %08lx\n", pgout_offset, 724 info.cleancount, info.actioncount); 725 #endif 726 727 if (pgout_offset != VM_MAX_USER_ADDRESS && 728 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 729 goto again; 730 } else if (retries && 731 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 732 --retries; 733 goto again; 734 } 735 map->pgout_offset = pgout_offset; 736 } 737 #endif 738 739 /* 740 * Called when the pageout scan wants to free a page. We no longer 741 * try to cycle the vm_object here with a reference & dealloc, which can 742 * cause a non-trivial object collapse in a critical path. 743 * 744 * It is unclear why we cycled the ref_count in the past, perhaps to try 745 * to optimize shadow chain collapses but I don't quite see why it would 746 * be necessary. An OBJ_DEAD object should terminate any and all vm_pages 747 * synchronously and not have to be kicked-start. 
748 */ 749 static void 750 vm_pageout_page_free(vm_page_t m) 751 { 752 vm_page_protect(m, VM_PROT_NONE); 753 vm_page_free(m); 754 } 755 756 /* 757 * vm_pageout_scan does the dirty work for the pageout daemon. 758 */ 759 struct vm_pageout_scan_info { 760 struct proc *bigproc; 761 vm_offset_t bigsize; 762 }; 763 764 static int vm_pageout_scan_callback(struct proc *p, void *data); 765 766 /* 767 * Scan inactive queue 768 * 769 * WARNING! Can be called from two pagedaemon threads simultaneously. 770 */ 771 static int 772 vm_pageout_scan_inactive(int pass, int q, long avail_shortage, 773 long *vnodes_skipped) 774 { 775 vm_page_t m; 776 struct vm_page marker; 777 struct vnode *vpfailed; /* warning, allowed to be stale */ 778 long maxscan; 779 long delta = 0; 780 long max_launder; 781 int isep; 782 int vmflush_flags; 783 784 isep = (curthread == emergpager); 785 786 /* 787 * Start scanning the inactive queue for pages we can move to the 788 * cache or free. The scan will stop when the target is reached or 789 * we have scanned the entire inactive queue. Note that m->act_count 790 * is not used to form decisions for the inactive queue, only for the 791 * active queue. 792 * 793 * max_launder limits the number of dirty pages we flush per scan. 794 * For most systems a smaller value (16 or 32) is more robust under 795 * extreme memory and disk pressure because any unnecessary writes 796 * to disk can result in extreme performance degredation. However, 797 * systems with excessive dirty pages (especially when MAP_NOSYNC is 798 * used) will die horribly with limited laundering. If the pageout 799 * daemon cannot clean enough pages in the first pass, we let it go 800 * all out in succeeding passes. 801 * 802 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 803 * PAGES. 804 */ 805 if ((max_launder = vm_max_launder) <= 1) 806 max_launder = 1; 807 if (pass) 808 max_launder = 10000; 809 810 /* 811 * Initialize our marker 812 */ 813 bzero(&marker, sizeof(marker)); 814 marker.flags = PG_FICTITIOUS | PG_MARKER; 815 marker.busy_count = PBUSY_LOCKED; 816 marker.queue = PQ_INACTIVE + q; 817 marker.pc = q; 818 marker.wire_count = 1; 819 820 /* 821 * Inactive queue scan. 822 * 823 * NOTE: The vm_page must be spinlocked before the queue to avoid 824 * deadlocks, so it is easiest to simply iterate the loop 825 * with the queue unlocked at the top. 826 */ 827 vpfailed = NULL; 828 829 vm_page_queues_spin_lock(PQ_INACTIVE + q); 830 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 831 maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt; 832 833 /* 834 * Queue locked at top of loop to avoid stack marker issues. 835 */ 836 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 837 maxscan-- > 0 && avail_shortage - delta > 0) 838 { 839 int count; 840 841 KKASSERT(m->queue == PQ_INACTIVE + q); 842 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, 843 &marker, pageq); 844 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m, 845 &marker, pageq); 846 mycpu->gd_cnt.v_pdpages++; 847 848 /* 849 * Skip marker pages (atomic against other markers to avoid 850 * infinite hop-over scans). 851 */ 852 if (m->flags & PG_MARKER) 853 continue; 854 855 /* 856 * Try to busy the page. Don't mess with pages which are 857 * already busy or reorder them in the queue. 858 */ 859 if (vm_page_busy_try(m, TRUE)) 860 continue; 861 862 /* 863 * Remaining operations run with the page busy and neither 864 * the page or the queue will be spin-locked. 
865 */ 866 KKASSERT(m->queue == PQ_INACTIVE + q); 867 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 868 869 /* 870 * The emergency pager runs when the primary pager gets 871 * stuck, which typically means the primary pager deadlocked 872 * on a vnode-backed page. Therefore, the emergency pager 873 * must skip any complex objects. 874 * 875 * We disallow VNODEs unless they are VCHR whos device ops 876 * does not flag D_NOEMERGPGR. 877 */ 878 if (isep && m->object) { 879 struct vnode *vp; 880 881 switch(m->object->type) { 882 case OBJT_DEFAULT: 883 case OBJT_SWAP: 884 /* 885 * Allow anonymous memory and assume that 886 * swap devices are not complex, since its 887 * kinda worthless if we can't swap out dirty 888 * anonymous pages. 889 */ 890 break; 891 case OBJT_VNODE: 892 /* 893 * Allow VCHR device if the D_NOEMERGPGR 894 * flag is not set, deny other vnode types 895 * as being too complex. 896 */ 897 vp = m->object->handle; 898 if (vp && vp->v_type == VCHR && 899 vp->v_rdev && vp->v_rdev->si_ops && 900 (vp->v_rdev->si_ops->head.flags & 901 D_NOEMERGPGR) == 0) { 902 break; 903 } 904 /* Deny - fall through */ 905 default: 906 /* 907 * Deny 908 */ 909 vm_page_wakeup(m); 910 vm_page_queues_spin_lock(PQ_INACTIVE + q); 911 lwkt_yield(); 912 continue; 913 } 914 } 915 916 /* 917 * Try to pageout the page and perhaps other nearby pages. 918 * We want to get the pages into the cache on the second 919 * pass. Otherwise the pages can wind up just cycling in 920 * the inactive queue, getting flushed over and over again. 921 */ 922 if (m->flags & PG_WINATCFLS) 923 vmflush_flags = VM_PAGER_TRY_TO_CACHE; 924 else 925 vmflush_flags = 0; 926 count = vm_pageout_page(m, &max_launder, vnodes_skipped, 927 &vpfailed, pass, vmflush_flags); 928 delta += count; 929 930 /* 931 * Systems with a ton of memory can wind up with huge 932 * deactivation counts. Because the inactive scan is 933 * doing a lot of flushing, the combination can result 934 * in excessive paging even in situations where other 935 * unrelated threads free up sufficient VM. 936 * 937 * To deal with this we abort the nominal active->inactive 938 * scan before we hit the inactive target when free+cache 939 * levels have reached a reasonable target. 940 * 941 * When deciding to stop early we need to add some slop to 942 * the test and we need to return full completion to the caller 943 * to prevent the caller from thinking there is something 944 * wrong and issuing a low-memory+swap warning or pkill. 945 * 946 * A deficit forces paging regardless of the state of the 947 * VM page queues (used for RSS enforcement). 948 */ 949 lwkt_yield(); 950 vm_page_queues_spin_lock(PQ_INACTIVE + q); 951 if (vm_paging_target() < -vm_max_launder) { 952 /* 953 * Stopping early, return full completion to caller. 954 */ 955 if (delta < avail_shortage) 956 delta = avail_shortage; 957 break; 958 } 959 } 960 961 /* page queue still spin-locked */ 962 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 963 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 964 965 return (delta); 966 } 967 968 /* 969 * Pageout the specified page, return the total number of pages paged out 970 * (this routine may cluster). 971 * 972 * The page must be busied and soft-busied by the caller and will be disposed 973 * of by this function. 
974 */ 975 static int 976 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp, 977 struct vnode **vpfailedp, int pass, int vmflush_flags) 978 { 979 vm_object_t object; 980 int actcount; 981 int count = 0; 982 983 /* 984 * Wiring no longer removes a page from its queue. The last unwiring 985 * will requeue the page. Obviously wired pages cannot be paged out 986 * so unqueue it and return. 987 */ 988 if (m->wire_count) { 989 vm_page_unqueue_nowakeup(m); 990 vm_page_wakeup(m); 991 return 0; 992 } 993 994 /* 995 * A held page may be undergoing I/O, so skip it. 996 */ 997 if (m->hold_count) { 998 vm_page_and_queue_spin_lock(m); 999 if (m->queue - m->pc == PQ_INACTIVE) { 1000 TAILQ_REMOVE( 1001 &vm_page_queues[m->queue].pl, m, pageq); 1002 TAILQ_INSERT_TAIL( 1003 &vm_page_queues[m->queue].pl, m, pageq); 1004 } 1005 vm_page_and_queue_spin_unlock(m); 1006 vm_page_wakeup(m); 1007 return 0; 1008 } 1009 1010 if (m->object == NULL || m->object->ref_count == 0) { 1011 /* 1012 * If the object is not being used, we ignore previous 1013 * references. 1014 */ 1015 vm_page_flag_clear(m, PG_REFERENCED); 1016 pmap_clear_reference(m); 1017 /* fall through to end */ 1018 } else if (((m->flags & PG_REFERENCED) == 0) && 1019 (actcount = pmap_ts_referenced(m))) { 1020 /* 1021 * Otherwise, if the page has been referenced while 1022 * in the inactive queue, we bump the "activation 1023 * count" upwards, making it less likely that the 1024 * page will be added back to the inactive queue 1025 * prematurely again. Here we check the page tables 1026 * (or emulated bits, if any), given the upper level 1027 * VM system not knowing anything about existing 1028 * references. 1029 */ 1030 vm_page_activate(m); 1031 m->act_count += (actcount + ACT_ADVANCE); 1032 vm_page_wakeup(m); 1033 return 0; 1034 } 1035 1036 /* 1037 * (m) is still busied. 1038 * 1039 * If the upper level VM system knows about any page 1040 * references, we activate the page. We also set the 1041 * "activation count" higher than normal so that we will less 1042 * likely place pages back onto the inactive queue again. 1043 */ 1044 if ((m->flags & PG_REFERENCED) != 0) { 1045 vm_page_flag_clear(m, PG_REFERENCED); 1046 actcount = pmap_ts_referenced(m); 1047 vm_page_activate(m); 1048 m->act_count += (actcount + ACT_ADVANCE + 1); 1049 vm_page_wakeup(m); 1050 return 0; 1051 } 1052 1053 /* 1054 * If the upper level VM system doesn't know anything about 1055 * the page being dirty, we have to check for it again. As 1056 * far as the VM code knows, any partially dirty pages are 1057 * fully dirty. 1058 * 1059 * Pages marked PG_WRITEABLE may be mapped into the user 1060 * address space of a process running on another cpu. A 1061 * user process (without holding the MP lock) running on 1062 * another cpu may be able to touch the page while we are 1063 * trying to remove it. vm_page_cache() will handle this 1064 * case for us. 1065 */ 1066 if (m->dirty == 0) { 1067 vm_page_test_dirty(m); 1068 } else { 1069 vm_page_dirty(m); 1070 } 1071 1072 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1073 /* 1074 * Invalid pages can be easily freed 1075 */ 1076 vm_pageout_page_free(m); 1077 mycpu->gd_cnt.v_dfree++; 1078 ++count; 1079 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1080 /* 1081 * Clean pages can be placed onto the cache queue. 1082 * This effectively frees them. 
1083 */ 1084 vm_page_cache(m); 1085 ++count; 1086 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { 1087 /* 1088 * Dirty pages need to be paged out, but flushing 1089 * a page is extremely expensive verses freeing 1090 * a clean page. Rather then artificially limiting 1091 * the number of pages we can flush, we instead give 1092 * dirty pages extra priority on the inactive queue 1093 * by forcing them to be cycled through the queue 1094 * twice before being flushed, after which the 1095 * (now clean) page will cycle through once more 1096 * before being freed. This significantly extends 1097 * the thrash point for a heavily loaded machine. 1098 */ 1099 vm_page_flag_set(m, PG_WINATCFLS); 1100 vm_page_and_queue_spin_lock(m); 1101 if (m->queue - m->pc == PQ_INACTIVE) { 1102 TAILQ_REMOVE( 1103 &vm_page_queues[m->queue].pl, m, pageq); 1104 TAILQ_INSERT_TAIL( 1105 &vm_page_queues[m->queue].pl, m, pageq); 1106 } 1107 vm_page_and_queue_spin_unlock(m); 1108 vm_page_wakeup(m); 1109 } else if (*max_launderp > 0) { 1110 /* 1111 * We always want to try to flush some dirty pages if 1112 * we encounter them, to keep the system stable. 1113 * Normally this number is small, but under extreme 1114 * pressure where there are insufficient clean pages 1115 * on the inactive queue, we may have to go all out. 1116 */ 1117 int swap_pageouts_ok; 1118 struct vnode *vp = NULL; 1119 1120 swap_pageouts_ok = 0; 1121 object = m->object; 1122 if (object && 1123 (object->type != OBJT_SWAP) && 1124 (object->type != OBJT_DEFAULT)) { 1125 swap_pageouts_ok = 1; 1126 } else { 1127 swap_pageouts_ok = !(defer_swap_pageouts || 1128 disable_swap_pageouts); 1129 swap_pageouts_ok |= (!disable_swap_pageouts && 1130 defer_swap_pageouts && 1131 vm_page_count_min(0)); 1132 } 1133 1134 /* 1135 * We don't bother paging objects that are "dead". 1136 * Those objects are in a "rundown" state. 1137 */ 1138 if (!swap_pageouts_ok || 1139 (object == NULL) || 1140 (object->flags & OBJ_DEAD)) { 1141 vm_page_and_queue_spin_lock(m); 1142 if (m->queue - m->pc == PQ_INACTIVE) { 1143 TAILQ_REMOVE( 1144 &vm_page_queues[m->queue].pl, 1145 m, pageq); 1146 TAILQ_INSERT_TAIL( 1147 &vm_page_queues[m->queue].pl, 1148 m, pageq); 1149 } 1150 vm_page_and_queue_spin_unlock(m); 1151 vm_page_wakeup(m); 1152 return 0; 1153 } 1154 1155 /* 1156 * (m) is still busied. 1157 * 1158 * The object is already known NOT to be dead. It 1159 * is possible for the vget() to block the whole 1160 * pageout daemon, but the new low-memory handling 1161 * code should prevent it. 1162 * 1163 * The previous code skipped locked vnodes and, worse, 1164 * reordered pages in the queue. This results in 1165 * completely non-deterministic operation because, 1166 * quite often, a vm_fault has initiated an I/O and 1167 * is holding a locked vnode at just the point where 1168 * the pageout daemon is woken up. 1169 * 1170 * We can't wait forever for the vnode lock, we might 1171 * deadlock due to a vn_read() getting stuck in 1172 * vm_wait while holding this vnode. We skip the 1173 * vnode if we can't get it in a reasonable amount 1174 * of time. 1175 * 1176 * vpfailed is used to (try to) avoid the case where 1177 * a large number of pages are associated with a 1178 * locked vnode, which could cause the pageout daemon 1179 * to stall for an excessive amount of time. 
1180 */ 1181 if (object->type == OBJT_VNODE) { 1182 int flags; 1183 1184 vp = object->handle; 1185 flags = LK_EXCLUSIVE; 1186 if (vp == *vpfailedp) 1187 flags |= LK_NOWAIT; 1188 else 1189 flags |= LK_TIMELOCK; 1190 vm_page_hold(m); 1191 vm_page_wakeup(m); 1192 1193 /* 1194 * We have unbusied (m) temporarily so we can 1195 * acquire the vp lock without deadlocking. 1196 * (m) is held to prevent destruction. 1197 */ 1198 if (vget(vp, flags) != 0) { 1199 *vpfailedp = vp; 1200 ++pageout_lock_miss; 1201 if (object->flags & OBJ_MIGHTBEDIRTY) 1202 ++*vnodes_skippedp; 1203 vm_page_unhold(m); 1204 return 0; 1205 } 1206 1207 /* 1208 * The page might have been moved to another 1209 * queue during potential blocking in vget() 1210 * above. The page might have been freed and 1211 * reused for another vnode. The object might 1212 * have been reused for another vnode. 1213 */ 1214 if (m->queue - m->pc != PQ_INACTIVE || 1215 m->object != object || 1216 object->handle != vp) { 1217 if (object->flags & OBJ_MIGHTBEDIRTY) 1218 ++*vnodes_skippedp; 1219 vput(vp); 1220 vm_page_unhold(m); 1221 return 0; 1222 } 1223 1224 /* 1225 * The page may have been busied during the 1226 * blocking in vput(); We don't move the 1227 * page back onto the end of the queue so that 1228 * statistics are more correct if we don't. 1229 */ 1230 if (vm_page_busy_try(m, TRUE)) { 1231 vput(vp); 1232 vm_page_unhold(m); 1233 return 0; 1234 } 1235 vm_page_unhold(m); 1236 1237 /* 1238 * If it was wired while we didn't own it. 1239 */ 1240 if (m->wire_count) { 1241 vm_page_unqueue_nowakeup(m); 1242 vput(vp); 1243 vm_page_wakeup(m); 1244 return 0; 1245 } 1246 1247 /* 1248 * (m) is busied again 1249 * 1250 * We own the busy bit and remove our hold 1251 * bit. If the page is still held it 1252 * might be undergoing I/O, so skip it. 1253 */ 1254 if (m->hold_count) { 1255 vm_page_and_queue_spin_lock(m); 1256 if (m->queue - m->pc == PQ_INACTIVE) { 1257 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 1258 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq); 1259 } 1260 vm_page_and_queue_spin_unlock(m); 1261 if (object->flags & OBJ_MIGHTBEDIRTY) 1262 ++*vnodes_skippedp; 1263 vm_page_wakeup(m); 1264 vput(vp); 1265 return 0; 1266 } 1267 /* (m) is left busied as we fall through */ 1268 } 1269 1270 /* 1271 * page is busy and not held here. 1272 * 1273 * If a page is dirty, then it is either being washed 1274 * (but not yet cleaned) or it is still in the 1275 * laundry. If it is still in the laundry, then we 1276 * start the cleaning operation. 1277 * 1278 * decrement inactive_shortage on success to account 1279 * for the (future) cleaned page. Otherwise we 1280 * could wind up laundering or cleaning too many 1281 * pages. 1282 * 1283 * NOTE: Cleaning the page here does not cause 1284 * force_deficit to be adjusted, because the 1285 * page is not being freed or moved to the 1286 * cache. 1287 */ 1288 count = vm_pageout_clean_helper(m, vmflush_flags); 1289 *max_launderp -= count; 1290 1291 /* 1292 * Clean ate busy, page no longer accessible 1293 */ 1294 if (vp != NULL) 1295 vput(vp); 1296 } else { 1297 vm_page_wakeup(m); 1298 } 1299 return count; 1300 } 1301 1302 /* 1303 * Scan active queue 1304 * 1305 * WARNING! Can be called from two pagedaemon threads simultaneously. 
1306 */ 1307 static int 1308 vm_pageout_scan_active(int pass, int q, 1309 long avail_shortage, long inactive_shortage, 1310 long *recycle_countp) 1311 { 1312 struct vm_page marker; 1313 vm_page_t m; 1314 int actcount; 1315 long delta = 0; 1316 long maxscan; 1317 int isep; 1318 1319 isep = (curthread == emergpager); 1320 1321 /* 1322 * We want to move pages from the active queue to the inactive 1323 * queue to get the inactive queue to the inactive target. If 1324 * we still have a page shortage from above we try to directly free 1325 * clean pages instead of moving them. 1326 * 1327 * If we do still have a shortage we keep track of the number of 1328 * pages we free or cache (recycle_count) as a measure of thrashing 1329 * between the active and inactive queues. 1330 * 1331 * If we were able to completely satisfy the free+cache targets 1332 * from the inactive pool we limit the number of pages we move 1333 * from the active pool to the inactive pool to 2x the pages we 1334 * had removed from the inactive pool (with a minimum of 1/5 the 1335 * inactive target). If we were not able to completely satisfy 1336 * the free+cache targets we go for the whole target aggressively. 1337 * 1338 * NOTE: Both variables can end up negative. 1339 * NOTE: We are still in a critical section. 1340 * 1341 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 1342 * PAGES. 1343 */ 1344 1345 bzero(&marker, sizeof(marker)); 1346 marker.flags = PG_FICTITIOUS | PG_MARKER; 1347 marker.busy_count = PBUSY_LOCKED; 1348 marker.queue = PQ_ACTIVE + q; 1349 marker.pc = q; 1350 marker.wire_count = 1; 1351 1352 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1353 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1354 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt; 1355 1356 /* 1357 * Queue locked at top of loop to avoid stack marker issues. 1358 */ 1359 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1360 maxscan-- > 0 && (avail_shortage - delta > 0 || 1361 inactive_shortage > 0)) 1362 { 1363 KKASSERT(m->queue == PQ_ACTIVE + q); 1364 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1365 &marker, pageq); 1366 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1367 &marker, pageq); 1368 1369 /* 1370 * Skip marker pages (atomic against other markers to avoid 1371 * infinite hop-over scans). 1372 */ 1373 if (m->flags & PG_MARKER) 1374 continue; 1375 1376 /* 1377 * Try to busy the page. Don't mess with pages which are 1378 * already busy or reorder them in the queue. 1379 */ 1380 if (vm_page_busy_try(m, TRUE)) 1381 continue; 1382 1383 /* 1384 * Remaining operations run with the page busy and neither 1385 * the page or the queue will be spin-locked. 1386 */ 1387 KKASSERT(m->queue == PQ_ACTIVE + q); 1388 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1389 1390 #if 0 1391 /* 1392 * Don't deactivate pages that are held, even if we can 1393 * busy them. (XXX why not?) 
1394 */ 1395 if (m->hold_count) { 1396 vm_page_and_queue_spin_lock(m); 1397 if (m->queue - m->pc == PQ_ACTIVE) { 1398 TAILQ_REMOVE( 1399 &vm_page_queues[PQ_ACTIVE + q].pl, 1400 m, pageq); 1401 TAILQ_INSERT_TAIL( 1402 &vm_page_queues[PQ_ACTIVE + q].pl, 1403 m, pageq); 1404 } 1405 vm_page_and_queue_spin_unlock(m); 1406 vm_page_wakeup(m); 1407 goto next; 1408 } 1409 #endif 1410 /* 1411 * We can just remove wired pages from the queue 1412 */ 1413 if (m->wire_count) { 1414 vm_page_unqueue_nowakeup(m); 1415 vm_page_wakeup(m); 1416 goto next; 1417 } 1418 1419 /* 1420 * The emergency pager ignores vnode-backed pages as these 1421 * are the pages that probably bricked the main pager. 1422 */ 1423 if (isep && m->object && m->object->type == OBJT_VNODE) { 1424 vm_page_and_queue_spin_lock(m); 1425 if (m->queue - m->pc == PQ_ACTIVE) { 1426 TAILQ_REMOVE( 1427 &vm_page_queues[PQ_ACTIVE + q].pl, 1428 m, pageq); 1429 TAILQ_INSERT_TAIL( 1430 &vm_page_queues[PQ_ACTIVE + q].pl, 1431 m, pageq); 1432 } 1433 vm_page_and_queue_spin_unlock(m); 1434 vm_page_wakeup(m); 1435 goto next; 1436 } 1437 1438 /* 1439 * The count for pagedaemon pages is done after checking the 1440 * page for eligibility... 1441 */ 1442 mycpu->gd_cnt.v_pdpages++; 1443 1444 /* 1445 * Check to see "how much" the page has been used and clear 1446 * the tracking access bits. If the object has no references 1447 * don't bother paying the expense. 1448 */ 1449 actcount = 0; 1450 if (m->object && m->object->ref_count != 0) { 1451 if (m->flags & PG_REFERENCED) 1452 ++actcount; 1453 actcount += pmap_ts_referenced(m); 1454 if (actcount) { 1455 m->act_count += ACT_ADVANCE + actcount; 1456 if (m->act_count > ACT_MAX) 1457 m->act_count = ACT_MAX; 1458 } 1459 } 1460 vm_page_flag_clear(m, PG_REFERENCED); 1461 1462 /* 1463 * actcount is only valid if the object ref_count is non-zero. 1464 * If the page does not have an object, actcount will be zero. 1465 */ 1466 if (actcount && m->object->ref_count != 0) { 1467 vm_page_and_queue_spin_lock(m); 1468 if (m->queue - m->pc == PQ_ACTIVE) { 1469 TAILQ_REMOVE( 1470 &vm_page_queues[PQ_ACTIVE + q].pl, 1471 m, pageq); 1472 TAILQ_INSERT_TAIL( 1473 &vm_page_queues[PQ_ACTIVE + q].pl, 1474 m, pageq); 1475 } 1476 vm_page_and_queue_spin_unlock(m); 1477 vm_page_wakeup(m); 1478 } else { 1479 switch(m->object->type) { 1480 case OBJT_DEFAULT: 1481 case OBJT_SWAP: 1482 m->act_count -= min(m->act_count, 1483 vm_anonmem_decline); 1484 break; 1485 default: 1486 m->act_count -= min(m->act_count, 1487 vm_filemem_decline); 1488 break; 1489 } 1490 if (vm_pageout_algorithm || 1491 (m->object == NULL) || 1492 (m->object && (m->object->ref_count == 0)) || 1493 m->act_count < pass + 1 1494 ) { 1495 /* 1496 * Deactivate the page. If we had a 1497 * shortage from our inactive scan try to 1498 * free (cache) the page instead. 1499 * 1500 * Don't just blindly cache the page if 1501 * we do not have a shortage from the 1502 * inactive scan, that could lead to 1503 * gigabytes being moved. 
1504 */ 1505 --inactive_shortage; 1506 if (avail_shortage - delta > 0 || 1507 (m->object && (m->object->ref_count == 0))) 1508 { 1509 if (avail_shortage - delta > 0) 1510 ++*recycle_countp; 1511 vm_page_protect(m, VM_PROT_NONE); 1512 if (m->dirty == 0 && 1513 (m->flags & PG_NEED_COMMIT) == 0 && 1514 avail_shortage - delta > 0) { 1515 vm_page_cache(m); 1516 } else { 1517 vm_page_deactivate(m); 1518 vm_page_wakeup(m); 1519 } 1520 } else { 1521 vm_page_deactivate(m); 1522 vm_page_wakeup(m); 1523 } 1524 ++delta; 1525 } else { 1526 vm_page_and_queue_spin_lock(m); 1527 if (m->queue - m->pc == PQ_ACTIVE) { 1528 TAILQ_REMOVE( 1529 &vm_page_queues[PQ_ACTIVE + q].pl, 1530 m, pageq); 1531 TAILQ_INSERT_TAIL( 1532 &vm_page_queues[PQ_ACTIVE + q].pl, 1533 m, pageq); 1534 } 1535 vm_page_and_queue_spin_unlock(m); 1536 vm_page_wakeup(m); 1537 } 1538 } 1539 next: 1540 lwkt_yield(); 1541 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1542 } 1543 1544 /* 1545 * Clean out our local marker. 1546 * 1547 * Page queue still spin-locked. 1548 */ 1549 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1550 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1551 1552 return (delta); 1553 } 1554 1555 /* 1556 * The number of actually free pages can drop down to v_free_reserved, 1557 * we try to build the free count back above v_free_min. Note that 1558 * vm_paging_needed() also returns TRUE if v_free_count is not at 1559 * least v_free_min so that is the minimum we must build the free 1560 * count to. 1561 * 1562 * We use a slightly higher target to improve hysteresis, 1563 * ((v_free_target + v_free_min) / 2). Since v_free_target 1564 * is usually the same as v_cache_min this maintains about 1565 * half the pages in the free queue as are in the cache queue, 1566 * providing pretty good pipelining for pageout operation. 1567 * 1568 * The system operator can manipulate vm.v_cache_min and 1569 * vm.v_free_target to tune the pageout demon. Be sure 1570 * to keep vm.v_free_min < vm.v_free_target. 1571 * 1572 * Note that the original paging target is to get at least 1573 * (free_min + cache_min) into (free + cache). The slightly 1574 * higher target will shift additional pages from cache to free 1575 * without effecting the original paging target in order to 1576 * maintain better hysteresis and not have the free count always 1577 * be dead-on v_free_min. 1578 * 1579 * NOTE: we are still in a critical section. 1580 * 1581 * Pages moved from PQ_CACHE to totally free are not counted in the 1582 * pages_freed counter. 1583 * 1584 * WARNING! Can be called from two pagedaemon threads simultaneously. 1585 */ 1586 static void 1587 vm_pageout_scan_cache(long avail_shortage, int pass, 1588 long vnodes_skipped, long recycle_count) 1589 { 1590 static int lastkillticks; 1591 struct vm_pageout_scan_info info; 1592 vm_page_t m; 1593 int isep; 1594 1595 isep = (curthread == emergpager); 1596 1597 while (vmstats.v_free_count < 1598 (vmstats.v_free_min + vmstats.v_free_target) / 2) { 1599 /* 1600 * This steals some code from vm/vm_page.c 1601 * 1602 * Create two rovers and adjust the code to reduce 1603 * chances of them winding up at the same index (which 1604 * can cause a lot of contention). 1605 */ 1606 static int cache_rover[2] = { 0, PQ_L2_MASK / 2 }; 1607 1608 if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0) 1609 goto next_rover; 1610 1611 m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK); 1612 if (m == NULL) 1613 break; 1614 1615 /* 1616 * If the busy attempt fails we can still deactivate the page. 
1617 */ 1618 /* page is returned removed from its queue and spinlocked */ 1619 if (vm_page_busy_try(m, TRUE)) { 1620 vm_page_deactivate_locked(m); 1621 vm_page_spin_unlock(m); 1622 continue; 1623 } 1624 vm_page_spin_unlock(m); 1625 pagedaemon_wakeup(); 1626 lwkt_yield(); 1627 1628 /* 1629 * Remaining operations run with the page busy and neither 1630 * the page or the queue will be spin-locked. 1631 */ 1632 if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) || 1633 m->hold_count || 1634 m->wire_count) { 1635 vm_page_deactivate(m); 1636 vm_page_wakeup(m); 1637 continue; 1638 } 1639 pmap_mapped_sync(m); 1640 KKASSERT((m->flags & PG_MAPPED) == 0); 1641 KKASSERT(m->dirty == 0); 1642 vm_pageout_page_free(m); 1643 mycpu->gd_cnt.v_dfree++; 1644 next_rover: 1645 if (isep) 1646 cache_rover[1] -= PQ_PRIME2; 1647 else 1648 cache_rover[0] += PQ_PRIME2; 1649 } 1650 1651 #if !defined(NO_SWAPPING) 1652 /* 1653 * Idle process swapout -- run once per second. 1654 */ 1655 if (vm_swap_idle_enabled) { 1656 static time_t lsec; 1657 if (time_uptime != lsec) { 1658 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE); 1659 vm_req_vmdaemon(); 1660 lsec = time_uptime; 1661 } 1662 } 1663 #endif 1664 1665 /* 1666 * If we didn't get enough free pages, and we have skipped a vnode 1667 * in a writeable object, wakeup the sync daemon. And kick swapout 1668 * if we did not get enough free pages. 1669 */ 1670 if (vm_paging_target() > 0) { 1671 if (vnodes_skipped && vm_page_count_min(0)) 1672 speedup_syncer(NULL); 1673 #if !defined(NO_SWAPPING) 1674 if (vm_swap_enabled && vm_page_count_target()) { 1675 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL); 1676 vm_req_vmdaemon(); 1677 } 1678 #endif 1679 } 1680 1681 /* 1682 * Handle catastrophic conditions. Under good conditions we should 1683 * be at the target, well beyond our minimum. If we could not even 1684 * reach our minimum the system is under heavy stress. But just being 1685 * under heavy stress does not trigger process killing. 1686 * 1687 * We consider ourselves to have run out of memory if the swap pager 1688 * is full and avail_shortage is still positive. The secondary check 1689 * ensures that we do not kill processes if the instantanious 1690 * availability is good, even if the pageout demon pass says it 1691 * couldn't get to the target. 1692 * 1693 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL 1694 * SITUATIONS. 1695 */ 1696 if (swap_pager_almost_full && 1697 pass > 0 && 1698 isep == 0 && 1699 (vm_page_count_min(recycle_count) || avail_shortage > 0)) { 1700 kprintf("Warning: system low on memory+swap " 1701 "shortage %ld for %d ticks!\n", 1702 avail_shortage, ticks - swap_fail_ticks); 1703 if (bootverbose) 1704 kprintf("Metrics: spaf=%d spf=%d pass=%d " 1705 "avail=%ld target=%ld last=%u\n", 1706 swap_pager_almost_full, 1707 swap_pager_full, 1708 pass, 1709 avail_shortage, 1710 vm_paging_target(), 1711 (unsigned int)(ticks - lastkillticks)); 1712 } 1713 if (swap_pager_full && 1714 pass > 1 && 1715 isep == 0 && 1716 avail_shortage > 0 && 1717 vm_paging_target() > 0 && 1718 (unsigned int)(ticks - lastkillticks) >= hz) { 1719 /* 1720 * Kill something, maximum rate once per second to give 1721 * the process time to free up sufficient memory. 
1722 */ 1723 lastkillticks = ticks; 1724 info.bigproc = NULL; 1725 info.bigsize = 0; 1726 allproc_scan(vm_pageout_scan_callback, &info, 0); 1727 if (info.bigproc != NULL) { 1728 kprintf("Try to kill process %d %s\n", 1729 info.bigproc->p_pid, info.bigproc->p_comm); 1730 info.bigproc->p_nice = PRIO_MIN; 1731 info.bigproc->p_usched->resetpriority( 1732 FIRST_LWP_IN_PROC(info.bigproc)); 1733 atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL); 1734 killproc(info.bigproc, "out of swap space"); 1735 wakeup(&vmstats.v_free_count); 1736 PRELE(info.bigproc); 1737 } 1738 } 1739 } 1740 1741 static int 1742 vm_pageout_scan_callback(struct proc *p, void *data) 1743 { 1744 struct vm_pageout_scan_info *info = data; 1745 vm_offset_t size; 1746 1747 /* 1748 * Never kill system processes or init. If we have configured swap 1749 * then try to avoid killing low-numbered pids. 1750 */ 1751 if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) || 1752 ((p->p_pid < 48) && (vm_swap_size != 0))) { 1753 return (0); 1754 } 1755 1756 lwkt_gettoken(&p->p_token); 1757 1758 /* 1759 * if the process is in a non-running type state, 1760 * don't touch it. 1761 */ 1762 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 1763 lwkt_reltoken(&p->p_token); 1764 return (0); 1765 } 1766 1767 /* 1768 * Get the approximate process size. Note that anonymous pages 1769 * with backing swap will be counted twice, but there should not 1770 * be too many such pages due to the stress the VM system is 1771 * under at this point. 1772 */ 1773 size = vmspace_anonymous_count(p->p_vmspace) + 1774 vmspace_swap_count(p->p_vmspace); 1775 1776 /* 1777 * If the this process is bigger than the biggest one 1778 * remember it. 1779 */ 1780 if (info->bigsize < size) { 1781 if (info->bigproc) 1782 PRELE(info->bigproc); 1783 PHOLD(p); 1784 info->bigproc = p; 1785 info->bigsize = size; 1786 } 1787 lwkt_reltoken(&p->p_token); 1788 lwkt_yield(); 1789 1790 return(0); 1791 } 1792 1793 /* 1794 * This old guy slowly walks PQ_HOLD looking for pages which need to be 1795 * moved back to PQ_FREE. It is possible for pages to accumulate here 1796 * when vm_page_free() races against vm_page_unhold(), resulting in a 1797 * page being left on a PQ_HOLD queue with hold_count == 0. 1798 * 1799 * It is easier to handle this edge condition here, in non-critical code, 1800 * rather than enforce a spin-lock for every 1->0 transition in 1801 * vm_page_unhold(). 1802 * 1803 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue. 1804 */ 1805 static void 1806 vm_pageout_scan_hold(int q) 1807 { 1808 vm_page_t m; 1809 1810 vm_page_queues_spin_lock(PQ_HOLD + q); 1811 TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) { 1812 if (m->flags & PG_MARKER) 1813 continue; 1814 1815 /* 1816 * Process one page and return 1817 */ 1818 if (m->hold_count) 1819 break; 1820 kprintf("DEBUG: pageout HOLD->FREE %p\n", m); 1821 vm_page_hold(m); 1822 vm_page_queues_spin_unlock(PQ_HOLD + q); 1823 vm_page_unhold(m); /* reprocess */ 1824 return; 1825 } 1826 vm_page_queues_spin_unlock(PQ_HOLD + q); 1827 } 1828 1829 /* 1830 * This routine tries to maintain the pseudo LRU active queue, 1831 * so that during long periods of time where there is no paging, 1832 * that some statistic accumulation still occurs. This code 1833 * helps the situation where paging just starts to occur. 
1834 */ 1835 static void 1836 vm_pageout_page_stats(int q) 1837 { 1838 static int fullintervalcount = 0; 1839 struct vm_page marker; 1840 vm_page_t m; 1841 long pcount, tpcount; /* Number of pages to check */ 1842 long page_shortage; 1843 1844 page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max + 1845 vmstats.v_free_min) - 1846 (vmstats.v_free_count + vmstats.v_inactive_count + 1847 vmstats.v_cache_count); 1848 1849 if (page_shortage <= 0) 1850 return; 1851 1852 pcount = vm_page_queues[PQ_ACTIVE + q].lcnt; 1853 fullintervalcount += vm_pageout_stats_interval; 1854 if (fullintervalcount < vm_pageout_full_stats_interval) { 1855 tpcount = (vm_pageout_stats_max * pcount) / 1856 vmstats.v_page_count + 1; 1857 if (pcount > tpcount) 1858 pcount = tpcount; 1859 } else { 1860 fullintervalcount = 0; 1861 } 1862 1863 bzero(&marker, sizeof(marker)); 1864 marker.flags = PG_FICTITIOUS | PG_MARKER; 1865 marker.busy_count = PBUSY_LOCKED; 1866 marker.queue = PQ_ACTIVE + q; 1867 marker.pc = q; 1868 marker.wire_count = 1; 1869 1870 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1871 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1872 1873 /* 1874 * Queue locked at top of loop to avoid stack marker issues. 1875 */ 1876 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1877 pcount-- > 0) 1878 { 1879 int actcount; 1880 1881 KKASSERT(m->queue == PQ_ACTIVE + q); 1882 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1883 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1884 &marker, pageq); 1885 1886 /* 1887 * Skip marker pages (atomic against other markers to avoid 1888 * infinite hop-over scans). 1889 */ 1890 if (m->flags & PG_MARKER) 1891 continue; 1892 1893 /* 1894 * Ignore pages we can't busy 1895 */ 1896 if (vm_page_busy_try(m, TRUE)) 1897 continue; 1898 1899 /* 1900 * Remaining operations run with the page busy and neither 1901 * the page or the queue will be spin-locked. 1902 */ 1903 KKASSERT(m->queue == PQ_ACTIVE + q); 1904 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1905 1906 /* 1907 * We can just remove wired pages from the queue 1908 */ 1909 if (m->wire_count) { 1910 vm_page_unqueue_nowakeup(m); 1911 vm_page_wakeup(m); 1912 goto next; 1913 } 1914 1915 1916 /* 1917 * We now have a safely busied page, the page and queue 1918 * spinlocks have been released. 1919 * 1920 * Ignore held and wired pages 1921 */ 1922 if (m->hold_count || m->wire_count) { 1923 vm_page_wakeup(m); 1924 goto next; 1925 } 1926 1927 /* 1928 * Calculate activity 1929 */ 1930 actcount = 0; 1931 if (m->flags & PG_REFERENCED) { 1932 vm_page_flag_clear(m, PG_REFERENCED); 1933 actcount += 1; 1934 } 1935 actcount += pmap_ts_referenced(m); 1936 1937 /* 1938 * Update act_count and move page to end of queue. 1939 */ 1940 if (actcount) { 1941 m->act_count += ACT_ADVANCE + actcount; 1942 if (m->act_count > ACT_MAX) 1943 m->act_count = ACT_MAX; 1944 vm_page_and_queue_spin_lock(m); 1945 if (m->queue - m->pc == PQ_ACTIVE) { 1946 TAILQ_REMOVE( 1947 &vm_page_queues[PQ_ACTIVE + q].pl, 1948 m, pageq); 1949 TAILQ_INSERT_TAIL( 1950 &vm_page_queues[PQ_ACTIVE + q].pl, 1951 m, pageq); 1952 } 1953 vm_page_and_queue_spin_unlock(m); 1954 vm_page_wakeup(m); 1955 goto next; 1956 } 1957 1958 if (m->act_count == 0) { 1959 /* 1960 * We turn off page access, so that we have 1961 * more accurate RSS stats. 
		if (m->act_count == 0) {
			/*
			 * We turn off page access, so that we have
			 * more accurate RSS stats.  We don't do this
			 * in the normal page deactivation when the
			 * system is loaded VM wise, because the cost
			 * of the large number of page protect
			 * operations would be higher than the value
			 * of doing the operation.
			 *
			 * We use the marker to save our place so
			 * we can release the spin lock.  Both (m)
			 * and (next) will be invalid.
			 */
			vm_page_protect(m, VM_PROT_NONE);
			vm_page_deactivate(m);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
		}
		vm_page_wakeup(m);
next:
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Remove our local marker
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

static void
vm_pageout_free_page_calc(vm_size_t count)
{
	/*
	 * v_free_min		normal allocations
	 * v_free_reserved	system allocations
	 * v_pageout_free_min	allocations by pageout daemon
	 * v_interrupt_free_min	low level allocations (e.g. swap structures)
	 *
	 * v_free_min is used to generate several other baselines, and they
	 * can get pretty silly on systems with a lot of memory.
	 */
	vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
	vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
	vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
	vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
	vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
}

/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFS's.
 */
static void
vm_pageout_thread(void)
{
	int pass;
	int q;
	int q1iterator = 0;
	int q2iterator = 0;
	int q3iterator = 0;
	int isep;

	curthread->td_flags |= TDF_SYSTHREAD;

	/*
	 * We only need to setup once.
	 */
	isep = 0;
	if (curthread == emergpager) {
		isep = 1;
		goto skip_setup;
	}

	/*
	 * Initialize some paging parameters.
	 */
	vm_pageout_free_page_calc(vmstats.v_page_count);

	/*
	 * v_free_target and v_cache_min control pageout hysteresis.  Note
	 * that these are more a measure of the VM cache queue hysteresis
	 * than the VM free queue.  Specifically, v_free_target is the
	 * high water mark (free+cache pages).
	 *
	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
	 * low water mark, while v_free_min is the stop.  v_cache_min must
	 * be big enough to handle memory needs while the pageout daemon
	 * is signalled and run to free more pages.
	 */
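	/*
	 * Rough illustration (assuming 4KB pages, numbers are examples
	 * only): on a machine with v_page_count = 1048576 (~4GB), the
	 * formulas in vm_pageout_free_page_calc() above give roughly
	 * v_free_min ~= 5306 and v_free_reserved ~= 2660, and the
	 * v_free_target computed below comes to about 4*5306 + 2660
	 * ~= 23884 pages, i.e. on the order of 93MB of free+cache.
	 */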
	vmstats.v_free_target = 4 * vmstats.v_free_min +
				vmstats.v_free_reserved;

	/*
	 * NOTE: With the new buffer cache b_act_count we want the default
	 * inactive target to be a percentage of available memory.
	 *
	 * The inactive target essentially determines the minimum
	 * number of 'temporary' pages capable of caching one-time-use
	 * files when the VM system is otherwise full of pages
	 * belonging to multi-time-use files or active program data.
	 *
	 * NOTE: The inactive target is aggressively pursued only if the
	 * inactive queue becomes too small.  If the inactive queue
	 * is large enough to satisfy page movement to free+cache
	 * then it is repopulated more slowly from the active queue.
	 * This allows a general inactive_target default to be set.
	 *
	 * There is an issue here for processes which sit mostly idle
	 * 'overnight', such as sshd, tcsh, and X.  Any movement from
	 * the active queue will eventually cause such pages to recycle,
	 * eventually causing a lot of paging in the morning.  To reduce
	 * the incidence of this, pages cycled out of the buffer cache
	 * are moved directly to the inactive queue if they were only
	 * used once or twice.
	 *
	 * The vfs.vm_cycle_point sysctl can be used to adjust this.
	 * Increasing the value (up to 64) increases the number of
	 * buffer recyclements which go directly to the inactive queue.
	 */
	if (vmstats.v_free_count > 2048) {
		vmstats.v_cache_min = vmstats.v_free_target;
		vmstats.v_cache_max = 2 * vmstats.v_cache_min;
	} else {
		vmstats.v_cache_min = 0;
		vmstats.v_cache_max = 0;
	}
	vmstats.v_inactive_target = vmstats.v_free_count / 4;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = vmstats.v_free_count / 3;

	if (vm_pageout_stats_max == 0)
		vm_pageout_stats_max = vmstats.v_free_target;

	/*
	 * Set interval in seconds for stats scan.
	 */
	if (vm_pageout_stats_interval == 0)
		vm_pageout_stats_interval = 5;
	if (vm_pageout_full_stats_interval == 0)
		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;

	/*
	 * Set maximum free per pass
	 */
	if (vm_pageout_stats_free_max == 0)
		vm_pageout_stats_free_max = 5;

	swap_pager_swap_init();
	pass = 0;

	atomic_swap_int(&sequence_emerg_pager, 1);
	wakeup(&sequence_emerg_pager);

skip_setup:
	/*
	 * Sequence emergency pager startup
	 */
	if (isep) {
		while (sequence_emerg_pager == 0)
			tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
	}

	/*
	 * The pageout daemon is never done, so loop forever.
	 *
	 * WARNING! This code is being executed by two kernel threads
	 * potentially simultaneously.
	 */
	while (TRUE) {
		int error;
		long avail_shortage;
		long inactive_shortage;
		long vnodes_skipped = 0;
		long recycle_count = 0;
		long tmp;

		/*
		 * Wait for an action request.  If we timeout check to
		 * see if paging is needed (in case the normal wakeup
		 * code raced us).
		 */
		if (isep) {
			/*
			 * Emergency pagedaemon monitors the primary
			 * pagedaemon while vm_pages_needed != 0.
			 *
			 * The emergency pagedaemon only runs if VM paging
			 * is needed and the primary pagedaemon has not
			 * updated vm_pagedaemon_time for more than 2 seconds.
			 */
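			/*
			 * In other words, vm_pagedaemon_time is a heartbeat:
			 * the primary daemon refreshes it each loop, and the
			 * check just below, (int)(ticks - vm_pagedaemon_time)
			 * against hz * 2, only engages the emergency daemon
			 * when that heartbeat has gone stale for ~2 seconds
			 * (the cast keeps the comparison wrap-safe).
			 */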
			if (vm_pages_needed)
				tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
			else
				tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
			if (vm_pages_needed == 0) {
				pass = 0;
				continue;
			}
			if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
				pass = 0;
				continue;
			}
		} else {
			/*
			 * Primary pagedaemon
			 *
			 * NOTE: We unconditionally cleanup PQ_HOLD even
			 * when there is no work to do.
			 */
			vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
			++q3iterator;

			if (vm_pages_needed == 0) {
				error = tsleep(&vm_pages_needed,
					       0, "psleep",
					       vm_pageout_stats_interval * hz);
				if (error &&
				    vm_paging_needed(0) == 0 &&
				    vm_pages_needed == 0) {
					for (q = 0; q < PQ_L2_SIZE; ++q)
						vm_pageout_page_stats(q);
					continue;
				}
				vm_pagedaemon_time = ticks;
				vm_pages_needed = 1;

				/*
				 * Wake the emergency pagedaemon up so it
				 * can monitor us.  It will automatically
				 * go back into a long sleep when
				 * vm_pages_needed returns to 0.
				 */
				wakeup(&vm_pagedaemon_time);
			}
		}

		mycpu->gd_cnt.v_pdwakeups++;

		/*
		 * Scan for INACTIVE->CLEAN/PAGEOUT
		 *
		 * This routine tries to avoid thrashing the system with
		 * unnecessary activity.
		 *
		 * Calculate our target for the number of free+cache pages we
		 * want to get to.  This is higher than the number that causes
		 * allocations to stall (severe) in order to provide hysteresis,
		 * and if we don't make it all the way but get to the minimum
		 * we're happy.  Goose it a bit if there are multiple requests
		 * for memory.
		 *
		 * Don't reduce avail_shortage inside the loop or the
		 * PQAVERAGE() calculation will break.
		 *
		 * NOTE! deficit is differentiated from avail_shortage as
		 * REQUIRING at least (deficit) pages to be cleaned,
		 * even if the page queues are in good shape.  This
		 * is used primarily for handling per-process
		 * RLIMIT_RSS and may also see small values when
		 * processes block due to low memory.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		avail_shortage = vm_paging_target() + vm_pageout_deficit;
		vm_pageout_deficit = 0;

		if (avail_shortage > 0) {
			long delta = 0;
			int qq;

			qq = q1iterator;
			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_inactive(
					pass,
					qq & PQ_L2_MASK,
					PQAVERAGE(avail_shortage),
					&vnodes_skipped);
				if (isep)
					--qq;
				else
					++qq;
				if (avail_shortage - delta <= 0)
					break;
			}
			avail_shortage -= delta;
			q1iterator = qq;
		}

		/*
		 * Figure out how many active pages we must deactivate.  If
		 * we were able to reach our target with just the inactive
		 * scan above we limit the number of active pages we
		 * deactivate to reduce unnecessary work.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		inactive_shortage = vmstats.v_inactive_target -
				    vmstats.v_inactive_count;
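
		/*
		 * For example (illustrative numbers only): with
		 * avail_shortage = 500 the boost below would want
		 * tmp = 1000, but if v_inactive_target were 8000 the
		 * cap limits the boost to 800 extra deactivations,
		 * keeping a transient shortage from flushing the
		 * entire active queue.
		 */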
2288 * 2289 * However to prevent thrashing the VM system do not 2290 * deactivate more than an additional 1/10 the inactive 2291 * target's worth of active pages. 2292 */ 2293 if (avail_shortage > 0) { 2294 tmp = avail_shortage * 2; 2295 if (tmp > vmstats.v_inactive_target / 10) 2296 tmp = vmstats.v_inactive_target / 10; 2297 inactive_shortage += tmp; 2298 } 2299 2300 /* 2301 * Only trigger a pmap cleanup on inactive shortage. 2302 */ 2303 if (isep == 0 && inactive_shortage > 0) { 2304 pmap_collect(); 2305 } 2306 2307 /* 2308 * Scan for ACTIVE->INACTIVE 2309 * 2310 * Only trigger on inactive shortage. Triggering on 2311 * avail_shortage can starve the active queue with 2312 * unnecessary active->inactive transitions and destroy 2313 * performance. 2314 * 2315 * If this is the emergency pager, always try to move 2316 * a few pages from active to inactive because the inactive 2317 * queue might have enough pages, but not enough anonymous 2318 * pages. 2319 */ 2320 if (isep && inactive_shortage < vm_emerg_launder) 2321 inactive_shortage = vm_emerg_launder; 2322 2323 if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) { 2324 long delta = 0; 2325 int qq; 2326 2327 qq = q2iterator; 2328 for (q = 0; q < PQ_L2_SIZE; ++q) { 2329 delta += vm_pageout_scan_active( 2330 pass, 2331 qq & PQ_L2_MASK, 2332 PQAVERAGE(avail_shortage), 2333 PQAVERAGE(inactive_shortage), 2334 &recycle_count); 2335 if (isep) 2336 --qq; 2337 else 2338 ++qq; 2339 if (inactive_shortage - delta <= 0 && 2340 avail_shortage - delta <= 0) { 2341 break; 2342 } 2343 } 2344 inactive_shortage -= delta; 2345 avail_shortage -= delta; 2346 q2iterator = qq; 2347 } 2348 2349 /* 2350 * Scan for CACHE->FREE 2351 * 2352 * Finally free enough cache pages to meet our free page 2353 * requirement and take more drastic measures if we are 2354 * still in trouble. 2355 */ 2356 vmstats_rollup(); 2357 if (isep == 0) 2358 vm_pagedaemon_time = ticks; 2359 vm_pageout_scan_cache(avail_shortage, pass, 2360 vnodes_skipped, recycle_count); 2361 2362 /* 2363 * Wait for more work. 2364 */ 2365 if (avail_shortage > 0) { 2366 ++pass; 2367 if (pass < 10 && vm_pages_needed > 1) { 2368 /* 2369 * Normal operation, additional processes 2370 * have already kicked us. Retry immediately 2371 * unless swap space is completely full in 2372 * which case delay a bit. 2373 */ 2374 if (swap_pager_full) { 2375 tsleep(&vm_pages_needed, 0, "pdelay", 2376 hz / 5); 2377 } /* else immediate retry */ 2378 } else if (pass < 10) { 2379 /* 2380 * Normal operation, fewer processes. Delay 2381 * a bit but allow wakeups. vm_pages_needed 2382 * is only adjusted against the primary 2383 * pagedaemon here. 2384 */ 2385 if (isep == 0) 2386 vm_pages_needed = 0; 2387 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2388 if (isep == 0) 2389 vm_pages_needed = 1; 2390 } else if (swap_pager_full == 0) { 2391 /* 2392 * We've taken too many passes, forced delay. 2393 */ 2394 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 2395 } else { 2396 /* 2397 * Running out of memory, catastrophic 2398 * back-off to one-second intervals. 2399 */ 2400 tsleep(&vm_pages_needed, 0, "pdelay", hz); 2401 } 2402 } else if (vm_pages_needed) { 2403 /* 2404 * Interlocked wakeup of waiters (non-optional). 
2405 * 2406 * Similar to vm_page_free_wakeup() in vm_page.c, 2407 * wake 2408 */ 2409 pass = 0; 2410 if (!vm_page_count_min(vm_page_free_hysteresis) || 2411 !vm_page_count_target()) { 2412 vm_pages_needed = 0; 2413 wakeup(&vmstats.v_free_count); 2414 } 2415 } else { 2416 pass = 0; 2417 } 2418 } 2419 } 2420 2421 static struct kproc_desc pg1_kp = { 2422 "pagedaemon", 2423 vm_pageout_thread, 2424 &pagethread 2425 }; 2426 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp); 2427 2428 static struct kproc_desc pg2_kp = { 2429 "emergpager", 2430 vm_pageout_thread, 2431 &emergpager 2432 }; 2433 SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp); 2434 2435 2436 /* 2437 * Called after allocating a page out of the cache or free queue 2438 * to possibly wake the pagedaemon up to replentish our supply. 2439 * 2440 * We try to generate some hysteresis by waking the pagedaemon up 2441 * when our free+cache pages go below the free_min+cache_min level. 2442 * The pagedaemon tries to get the count back up to at least the 2443 * minimum, and through to the target level if possible. 2444 * 2445 * If the pagedaemon is already active bump vm_pages_needed as a hint 2446 * that there are even more requests pending. 2447 * 2448 * SMP races ok? 2449 * No requirements. 2450 */ 2451 void 2452 pagedaemon_wakeup(void) 2453 { 2454 if (vm_paging_needed(0) && curthread != pagethread) { 2455 if (vm_pages_needed == 0) { 2456 vm_pages_needed = 1; /* SMP race ok */ 2457 wakeup(&vm_pages_needed); 2458 } else if (vm_page_count_min(0)) { 2459 ++vm_pages_needed; /* SMP race ok */ 2460 } 2461 } 2462 } 2463 2464 #if !defined(NO_SWAPPING) 2465 2466 /* 2467 * SMP races ok? 2468 * No requirements. 2469 */ 2470 static void 2471 vm_req_vmdaemon(void) 2472 { 2473 static int lastrun = 0; 2474 2475 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 2476 wakeup(&vm_daemon_needed); 2477 lastrun = ticks; 2478 } 2479 } 2480 2481 static int vm_daemon_callback(struct proc *p, void *data __unused); 2482 2483 /* 2484 * No requirements. 2485 */ 2486 static void 2487 vm_daemon(void) 2488 { 2489 int req_swapout; 2490 2491 while (TRUE) { 2492 tsleep(&vm_daemon_needed, 0, "psleep", 0); 2493 req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0); 2494 2495 /* 2496 * forced swapouts 2497 */ 2498 if (req_swapout) 2499 swapout_procs(vm_pageout_req_swapout); 2500 2501 /* 2502 * scan the processes for exceeding their rlimits or if 2503 * process is swapped out -- deactivate pages 2504 */ 2505 allproc_scan(vm_daemon_callback, NULL, 0); 2506 } 2507 } 2508 2509 static int 2510 vm_daemon_callback(struct proc *p, void *data __unused) 2511 { 2512 struct vmspace *vm; 2513 vm_pindex_t limit, size; 2514 2515 /* 2516 * if this is a system process or if we have already 2517 * looked at this process, skip it. 2518 */ 2519 lwkt_gettoken(&p->p_token); 2520 2521 if (p->p_flags & (P_SYSTEM | P_WEXIT)) { 2522 lwkt_reltoken(&p->p_token); 2523 return (0); 2524 } 2525 2526 /* 2527 * if the process is in a non-running type state, 2528 * don't touch it. 2529 */ 2530 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 2531 lwkt_reltoken(&p->p_token); 2532 return (0); 2533 } 2534 2535 /* 2536 * get a limit 2537 */ 2538 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, 2539 p->p_rlimit[RLIMIT_RSS].rlim_max)); 2540 2541 /* 2542 * let processes that are swapped out really be 2543 * swapped out. Set the limit to nothing to get as 2544 * many pages out to swap as possible. 
2545 */ 2546 if (p->p_flags & P_SWAPPEDOUT) 2547 limit = 0; 2548 2549 vm = p->p_vmspace; 2550 vmspace_hold(vm); 2551 size = pmap_resident_tlnw_count(&vm->vm_pmap); 2552 if (limit >= 0 && size > 4096 && 2553 size - 4096 >= limit && vm_pageout_memuse_mode >= 1) { 2554 vm_pageout_map_deactivate_pages(&vm->vm_map, limit); 2555 } 2556 vmspace_drop(vm); 2557 2558 lwkt_reltoken(&p->p_token); 2559 2560 return (0); 2561 } 2562 2563 #endif 2564