1 /* 2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1991 Regents of the University of California. 35 * All rights reserved. 36 * Copyright (c) 1994 John S. Dyson 37 * All rights reserved. 38 * Copyright (c) 1994 David Greenman 39 * All rights reserved. 40 * 41 * This code is derived from software contributed to Berkeley by 42 * The Mach Operating System project at Carnegie-Mellon University. 43 * 44 * Redistribution and use in source and binary forms, with or without 45 * modification, are permitted provided that the following conditions 46 * are met: 47 * 1. Redistributions of source code must retain the above copyright 48 * notice, this list of conditions and the following disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 3. Neither the name of the University nor the names of its contributors 53 * may be used to endorse or promote products derived from this software 54 * without specific prior written permission. 55 * 56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 59 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 66 * SUCH DAMAGE. 67 * 68 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 69 * 70 * 71 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 72 * All rights reserved. 73 * 74 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 75 * 76 * Permission to use, copy, modify and distribute this software and 77 * its documentation is hereby granted, provided that both the copyright 78 * notice and this permission notice appear in all copies of the 79 * software, derivative works or modified versions, and any portions 80 * thereof, and that both notices appear in supporting documentation. 81 * 82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 85 * 86 * Carnegie Mellon requests users of this software to return to 87 * 88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 89 * School of Computer Science 90 * Carnegie Mellon University 91 * Pittsburgh PA 15213-3890 92 * 93 * any improvements or extensions that they make and grant Carnegie the 94 * rights to redistribute these changes. 95 */ 96 97 /* 98 * The proverbial page-out daemon, rewritten many times over the decades. 
99 */ 100 101 #include "opt_vm.h" 102 #include <sys/param.h> 103 #include <sys/systm.h> 104 #include <sys/kernel.h> 105 #include <sys/proc.h> 106 #include <sys/kthread.h> 107 #include <sys/resourcevar.h> 108 #include <sys/signalvar.h> 109 #include <sys/vnode.h> 110 #include <sys/malloc.h> 111 #include <sys/vmmeter.h> 112 #include <sys/conf.h> 113 #include <sys/sysctl.h> 114 115 #include <vm/vm.h> 116 #include <vm/vm_param.h> 117 #include <sys/lock.h> 118 #include <vm/vm_object.h> 119 #include <vm/vm_page.h> 120 #include <vm/vm_map.h> 121 #include <vm/vm_pageout.h> 122 #include <vm/vm_pager.h> 123 #include <vm/swap_pager.h> 124 #include <vm/vm_extern.h> 125 126 #include <sys/spinlock2.h> 127 #include <vm/vm_page2.h> 128 129 /* 130 * Persistent markers held by pageout daemon (array) 131 */ 132 struct markers { 133 struct vm_page hold; 134 struct vm_page stat; 135 struct vm_page pact; 136 }; 137 138 /* 139 * System initialization 140 */ 141 142 /* the kernel process "vm_pageout"*/ 143 static int vm_pageout_page(vm_page_t m, long *max_launderp, 144 long *vnodes_skippedp, struct vnode **vpfailedp, 145 int pass, int vmflush_flags, long *counts); 146 static int vm_pageout_clean_helper (vm_page_t, int); 147 static void vm_pageout_free_page_calc (vm_size_t count); 148 static void vm_pageout_page_free(vm_page_t m) ; 149 __read_frequently struct thread *emergpager; 150 __read_frequently struct thread *pagethread; 151 static int sequence_emerg_pager; 152 153 #if !defined(NO_SWAPPING) 154 /* the kernel process "vm_daemon"*/ 155 static void vm_daemon (void); 156 static struct thread *vmthread; 157 158 static struct kproc_desc vm_kp = { 159 "vmdaemon", 160 vm_daemon, 161 &vmthread 162 }; 163 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); 164 #endif 165 166 __read_mostly int vm_pages_needed = 0; /* pageout daemon tsleep event */ 167 __read_mostly int vm_pageout_deficit = 0;/* Estimated number of pages deficit */ 168 __read_mostly int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */ 169 __read_mostly int vm_page_free_hysteresis = 16; 170 __read_mostly static time_t vm_pagedaemon_uptime; 171 172 #if !defined(NO_SWAPPING) 173 static int vm_daemon_needed; 174 #endif 175 __read_mostly static int vm_queue_idle_perc = 20; 176 __read_mostly static int vm_max_launder = 0; 177 __read_mostly static int vm_emerg_launder = 100; 178 __read_mostly static int vm_pageout_stats_actcmp = 0; 179 __read_mostly static int vm_pageout_stats_inamin = 16; 180 __read_mostly static int vm_pageout_stats_inalim = 4096; 181 __read_mostly static int vm_pageout_stats_scan = 0; 182 __read_mostly static int vm_pageout_stats_ticks = 0; 183 __read_mostly static int vm_pageout_algorithm = 0; 184 __read_mostly static int defer_swap_pageouts = 0; 185 __read_mostly static int disable_swap_pageouts = 0; 186 __read_mostly static u_int vm_anonmem_decline = ACT_DECLINE; 187 __read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2; 188 __read_mostly static int vm_pageout_debug; 189 __read_mostly static long vm_pageout_stats_rsecs = 300; 190 191 #if defined(NO_SWAPPING) 192 __read_mostly static int vm_swap_enabled=0; 193 #else 194 __read_mostly static int vm_swap_enabled=1; 195 #endif 196 197 /* 0-disable, 1-passive, 2-active swp, 3-acive swp + single-queue dirty pages*/ 198 __read_mostly int vm_pageout_memuse_mode=2; 199 __read_mostly int vm_pageout_allow_active=1; 200 201 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline, 202 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory"); 203 
204 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline, 205 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache"); 206 207 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis, 208 CTLFLAG_RW, &vm_page_free_hysteresis, 0, 209 "Free more pages than the minimum required"); 210 211 SYSCTL_INT(_vm, OID_AUTO, queue_idle_perc, 212 CTLFLAG_RW, &vm_queue_idle_perc, 0, "page stats stop point, percent"); 213 214 SYSCTL_INT(_vm, OID_AUTO, max_launder, 215 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 216 SYSCTL_INT(_vm, OID_AUTO, emerg_launder, 217 CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum"); 218 219 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_actcmp, 220 CTLFLAG_RW, &vm_pageout_stats_actcmp, 0, 221 "Current dynamic act_count comparator"); 222 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_inamin, 223 CTLFLAG_RW, &vm_pageout_stats_inamin, 0, 224 "min out of lim tests must match"); 225 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_inalim, 226 CTLFLAG_RW, &vm_pageout_stats_inalim, 0, 227 "min out of lim tests must match"); 228 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_ticks, 229 CTLFLAG_RW, &vm_pageout_stats_ticks, 0, 230 "Interval for partial stats scan"); 231 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_scan, 232 CTLFLAG_RW, &vm_pageout_stats_scan, 0, 233 "hold/ACT scan count per interval"); 234 SYSCTL_LONG(_vm, OID_AUTO, pageout_stats_rsecs, 235 CTLFLAG_RW, &vm_pageout_stats_rsecs, 0, 236 "min out of lim tests must match"); 237 238 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode, 239 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode"); 240 SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active, 241 CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active"); 242 SYSCTL_INT(_vm, OID_AUTO, pageout_debug, 243 CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)"); 244 245 246 #if defined(NO_SWAPPING) 247 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 248 CTLFLAG_RD, &vm_swap_enabled, 0, ""); 249 #else 250 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 251 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 252 #endif 253 254 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 255 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 256 257 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 258 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 259 260 static int pageout_lock_miss; 261 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 262 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 263 264 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 265 266 static MALLOC_DEFINE(M_PAGEOUT, "pageout", "Pageout structures"); 267 268 #if !defined(NO_SWAPPING) 269 static void vm_req_vmdaemon (void); 270 #endif 271 272 #define MAXSCAN_DIVIDER 10 273 274 #define VM_CACHE_SCAN_MIN 16 275 #define VM_CACHE_SCAN_NOM (VM_CACHE_SCAN_MIN * 4) 276 277 /* 278 * Calculate approximately how many pages on each queue to try to 279 * clean. An exact calculation creates an edge condition when the 280 * queues are unbalanced so add significant slop. The queue scans 281 * will stop early when targets are reached and will start where they 282 * left off on the next pass. 283 * 284 * We need to be generous here because there are all sorts of loading 285 * conditions that can cause edge cases if try to average over all queues. 286 * In particular, storage subsystems have become so fast that paging 287 * activity can become quite frantic. 
Eventually we will probably need 288 * two paging threads, one for dirty pages and one for clean, to deal 289 * with the bandwidth requirements. 290 291 * So what we do is calculate a value that can be satisfied nominally by 292 * only having to scan half the queues. 293 */ 294 static __inline long 295 PQAVERAGE(long n) 296 { 297 long avg; 298 299 if (n >= 0) { 300 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1); 301 } else { 302 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1); 303 } 304 return avg; 305 } 306 307 /* 308 * vm_pageout_clean_helper: 309 * 310 * Clean the page and remove it from the laundry. The page must be busied 311 * by the caller and will be disposed of (put away, flushed) by this routine. 312 */ 313 static int 314 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags) 315 { 316 vm_object_t object; 317 vm_page_t mc[BLIST_MAX_ALLOC]; 318 int error; 319 int ib, is, page_base; 320 vm_pindex_t pindex = m->pindex; 321 322 object = m->object; 323 324 /* 325 * Don't mess with the page if it's held or special. Theoretically 326 * we can pageout held pages but there is no real need to press our 327 * luck, so don't. 328 */ 329 if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) { 330 vm_page_wakeup(m); 331 return 0; 332 } 333 334 /* 335 * Place page in cluster. Align cluster for optimal swap space 336 * allocation (whether it is swap or not). This is typically ~16-32 337 * pages, which also tends to align the cluster to multiples of the 338 * filesystem block size if backed by a filesystem. 339 */ 340 page_base = pindex % BLIST_MAX_ALLOC; 341 mc[page_base] = m; 342 ib = page_base - 1; 343 is = page_base + 1; 344 345 /* 346 * Scan object for clusterable pages. 347 * 348 * We can cluster ONLY if: ->> the page is NOT 349 * clean, wired, busy, held, or mapped into a 350 * buffer, and one of the following: 351 * 1) The page is inactive, or a seldom used 352 * active page. 353 * -or- 354 * 2) we force the issue. 355 * 356 * During heavy mmap/modification loads the pageout 357 * daemon can really fragment the underlying file 358 * due to flushing pages out of order and not trying 359 * align the clusters (which leave sporatic out-of-order 360 * holes). To solve this problem we do the reverse scan 361 * first and attempt to align our cluster, then do a 362 * forward scan if room remains. 363 */ 364 vm_object_hold(object); 365 366 while (ib >= 0) { 367 vm_page_t p; 368 369 p = vm_page_lookup_busy_try(object, pindex - page_base + ib, 370 TRUE, &error); 371 if (error || p == NULL) 372 break; 373 if ((p->queue - p->pc) == PQ_CACHE || 374 (p->flags & PG_UNQUEUED)) { 375 vm_page_wakeup(p); 376 break; 377 } 378 vm_page_test_dirty(p); 379 if (((p->dirty & p->valid) == 0 && 380 (p->flags & PG_NEED_COMMIT) == 0) || 381 p->wire_count != 0 || /* may be held by buf cache */ 382 p->hold_count != 0) { /* may be undergoing I/O */ 383 vm_page_wakeup(p); 384 break; 385 } 386 if (p->queue - p->pc != PQ_INACTIVE) { 387 if (p->queue - p->pc != PQ_ACTIVE || 388 (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) { 389 vm_page_wakeup(p); 390 break; 391 } 392 } 393 394 /* 395 * Try to maintain page groupings in the cluster. 
396 */ 397 if (m->flags & PG_WINATCFLS) 398 vm_page_flag_set(p, PG_WINATCFLS); 399 else 400 vm_page_flag_clear(p, PG_WINATCFLS); 401 p->act_count = m->act_count; 402 403 mc[ib] = p; 404 --ib; 405 } 406 ++ib; /* fixup */ 407 408 while (is < BLIST_MAX_ALLOC && 409 pindex - page_base + is < object->size) { 410 vm_page_t p; 411 412 p = vm_page_lookup_busy_try(object, pindex - page_base + is, 413 TRUE, &error); 414 if (error || p == NULL) 415 break; 416 if (((p->queue - p->pc) == PQ_CACHE) || 417 (p->flags & PG_UNQUEUED)) { 418 vm_page_wakeup(p); 419 break; 420 } 421 vm_page_test_dirty(p); 422 if (((p->dirty & p->valid) == 0 && 423 (p->flags & PG_NEED_COMMIT) == 0) || 424 p->wire_count != 0 || /* may be held by buf cache */ 425 p->hold_count != 0) { /* may be undergoing I/O */ 426 vm_page_wakeup(p); 427 break; 428 } 429 if (p->queue - p->pc != PQ_INACTIVE) { 430 if (p->queue - p->pc != PQ_ACTIVE || 431 (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) { 432 vm_page_wakeup(p); 433 break; 434 } 435 } 436 437 /* 438 * Try to maintain page groupings in the cluster. 439 */ 440 if (m->flags & PG_WINATCFLS) 441 vm_page_flag_set(p, PG_WINATCFLS); 442 else 443 vm_page_flag_clear(p, PG_WINATCFLS); 444 p->act_count = m->act_count; 445 446 mc[is] = p; 447 ++is; 448 } 449 450 vm_object_drop(object); 451 452 /* 453 * we allow reads during pageouts... 454 */ 455 return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags); 456 } 457 458 /* 459 * vm_pageout_flush() - launder the given pages 460 * 461 * The given pages are laundered. Note that we setup for the start of 462 * I/O ( i.e. busy the page ), mark it read-only, and bump the object 463 * reference count all in here rather then in the parent. If we want 464 * the parent to do more sophisticated things we may have to change 465 * the ordering. 466 * 467 * The pages in the array must be busied by the caller and will be 468 * unbusied by this function. 469 */ 470 int 471 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags) 472 { 473 vm_object_t object; 474 int pageout_status[count]; 475 int numpagedout = 0; 476 int i; 477 478 /* 479 * Initiate I/O. Bump the vm_page_t->busy counter. 480 */ 481 for (i = 0; i < count; i++) { 482 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 483 ("vm_pageout_flush page %p index %d/%d: partially " 484 "invalid page", mc[i], i, count)); 485 vm_page_io_start(mc[i]); 486 } 487 488 /* 489 * We must make the pages read-only. This will also force the 490 * modified bit in the related pmaps to be cleared. The pager 491 * cannot clear the bit for us since the I/O completion code 492 * typically runs from an interrupt. The act of making the page 493 * read-only handles the case for us. 494 * 495 * Then we can unbusy the pages, we still hold a reference by virtue 496 * of our soft-busy. 497 */ 498 for (i = 0; i < count; i++) { 499 if (vmflush_flags & OBJPC_TRY_TO_CACHE) 500 vm_page_protect(mc[i], VM_PROT_NONE); 501 else 502 vm_page_protect(mc[i], VM_PROT_READ); 503 vm_page_wakeup(mc[i]); 504 } 505 506 object = mc[0]->object; 507 vm_object_pip_add(object, count); 508 509 vm_pager_put_pages(object, mc, count, 510 (vmflush_flags | 511 ((object == &kernel_object) ? 512 OBJPC_SYNC : 0)), 513 pageout_status); 514 515 for (i = 0; i < count; i++) { 516 vm_page_t mt = mc[i]; 517 518 switch (pageout_status[i]) { 519 case VM_PAGER_OK: 520 numpagedout++; 521 break; 522 case VM_PAGER_PEND: 523 numpagedout++; 524 break; 525 case VM_PAGER_BAD: 526 /* 527 * Page outside of range of object. 
Right now we 528 * essentially lose the changes by pretending it 529 * worked. 530 */ 531 vm_page_busy_wait(mt, FALSE, "pgbad"); 532 pmap_clear_modify(mt); 533 vm_page_undirty(mt); 534 vm_page_wakeup(mt); 535 break; 536 case VM_PAGER_ERROR: 537 case VM_PAGER_FAIL: 538 /* 539 * A page typically cannot be paged out when we 540 * have run out of swap. We leave the page 541 * marked inactive and will try to page it out 542 * again later. 543 * 544 * Starvation of the active page list is used to 545 * determine when the system is massively memory 546 * starved. 547 */ 548 break; 549 case VM_PAGER_AGAIN: 550 break; 551 } 552 553 /* 554 * If not PENDing this was a synchronous operation and we 555 * clean up after the I/O. If it is PENDing the mess is 556 * cleaned up asynchronously. 557 * 558 * Also nominally act on the caller's wishes if the caller 559 * wants to try to really clean (cache or free) the page. 560 * 561 * Also nominally deactivate the page if the system is 562 * memory-stressed. 563 */ 564 if (pageout_status[i] != VM_PAGER_PEND) { 565 vm_page_busy_wait(mt, FALSE, "pgouw"); 566 vm_page_io_finish(mt); 567 if (vmflush_flags & OBJPC_TRY_TO_CACHE) { 568 vm_page_try_to_cache(mt); 569 } else if (vm_paging_severe()) { 570 vm_page_deactivate(mt); 571 vm_page_wakeup(mt); 572 } else { 573 vm_page_wakeup(mt); 574 } 575 vm_object_pip_wakeup(object); 576 } 577 } 578 return numpagedout; 579 } 580 581 #if !defined(NO_SWAPPING) 582 583 /* 584 * Callback function, page busied for us. We must dispose of the busy 585 * condition. Any related pmap pages may be held but will not be locked. 586 */ 587 static 588 int 589 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va, 590 vm_page_t p) 591 { 592 int actcount; 593 int cleanit = 0; 594 595 /* 596 * Basic tests - There should never be a marker, and we can stop 597 * once the RSS is below the required level. 598 */ 599 KKASSERT((p->flags & PG_MARKER) == 0); 600 if (pmap_resident_tlnw_count(info->pmap) <= info->limit) { 601 vm_page_wakeup(p); 602 return(-1); 603 } 604 605 mycpu->gd_cnt.v_pdpages++; 606 607 if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) { 608 vm_page_wakeup(p); 609 goto done; 610 } 611 612 ++info->actioncount; 613 614 /* 615 * Check if the page has been referened recently. If it has, 616 * activate it and skip. 617 */ 618 actcount = pmap_ts_referenced(p); 619 if (actcount) { 620 vm_page_flag_set(p, PG_REFERENCED); 621 } else if (p->flags & PG_REFERENCED) { 622 actcount = 1; 623 } 624 625 if (actcount) { 626 if (p->queue - p->pc != PQ_ACTIVE) { 627 vm_page_and_queue_spin_lock(p); 628 if (p->queue - p->pc != PQ_ACTIVE) { 629 vm_page_and_queue_spin_unlock(p); 630 vm_page_activate(p); 631 } else { 632 vm_page_and_queue_spin_unlock(p); 633 } 634 } else { 635 p->act_count += actcount; 636 if (p->act_count > ACT_MAX) 637 p->act_count = ACT_MAX; 638 } 639 vm_page_flag_clear(p, PG_REFERENCED); 640 vm_page_wakeup(p); 641 goto done; 642 } 643 644 /* 645 * Remove the page from this particular pmap. Once we do this, our 646 * pmap scans will not see it again (unless it gets faulted in), so 647 * we must actively dispose of or deal with the page. 648 */ 649 pmap_remove_specific(info->pmap, p); 650 651 /* 652 * If the page is not mapped to another process (i.e. as would be 653 * typical if this were a shared page from a library) then deactivate 654 * the page and clean it in two passes only. 655 * 656 * If the page hasn't been referenced since the last check, remove it 657 * from the pmap. 
If it is no longer mapped, deactivate it 658 * immediately, accelerating the normal decline. 659 * 660 * Once the page has been removed from the pmap the RSS code no 661 * longer tracks it so we have to make sure that it is staged for 662 * potential flush action. 663 * 664 * XXX 665 */ 666 if ((p->flags & PG_MAPPED) == 0 || 667 (pmap_mapped_sync(p) & PG_MAPPED) == 0) { 668 if (p->queue - p->pc == PQ_ACTIVE) { 669 vm_page_deactivate(p); 670 } 671 if (p->queue - p->pc == PQ_INACTIVE) { 672 cleanit = 1; 673 } 674 } 675 676 /* 677 * Ok, try to fully clean the page and any nearby pages such that at 678 * least the requested page is freed or moved to the cache queue. 679 * 680 * We usually do this synchronously to allow us to get the page into 681 * the CACHE queue quickly, which will prevent memory exhaustion if 682 * a process with a memoryuse limit is running away. However, the 683 * sysadmin may desire to set vm.swap_user_async which relaxes this 684 * and improves write performance. 685 */ 686 if (cleanit) { 687 long max_launder = 0x7FFF; 688 long vnodes_skipped = 0; 689 long counts[4] = { 0, 0, 0, 0 }; 690 int vmflush_flags; 691 struct vnode *vpfailed = NULL; 692 693 info->offset = va; 694 695 if (vm_pageout_memuse_mode >= 2) { 696 vmflush_flags = OBJPC_TRY_TO_CACHE | 697 OBJPC_ALLOW_ACTIVE; 698 if (swap_user_async == 0) 699 vmflush_flags |= OBJPC_SYNC; 700 vm_page_flag_set(p, PG_WINATCFLS); 701 info->cleancount += 702 vm_pageout_page(p, &max_launder, 703 &vnodes_skipped, 704 &vpfailed, 1, vmflush_flags, 705 counts); 706 } else { 707 vm_page_wakeup(p); 708 ++info->cleancount; 709 } 710 } else { 711 vm_page_wakeup(p); 712 } 713 714 /* 715 * Must be at end to avoid SMP races. 716 */ 717 done: 718 lwkt_user_yield(); 719 return 0; 720 } 721 722 /* 723 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits. 724 * that is relatively difficult to do. We try to keep track of where we 725 * left off last time to reduce scan overhead. 726 * 727 * Called when vm_pageout_memuse_mode is >= 1. 728 */ 729 void 730 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit) 731 { 732 vm_offset_t pgout_offset; 733 struct pmap_pgscan_info info; 734 int retries = 3; 735 736 pgout_offset = map->pgout_offset; 737 again: 738 #if 0 739 kprintf("%016jx ", pgout_offset); 740 #endif 741 if (pgout_offset < VM_MIN_USER_ADDRESS) 742 pgout_offset = VM_MIN_USER_ADDRESS; 743 if (pgout_offset >= VM_MAX_USER_ADDRESS) 744 pgout_offset = 0; 745 info.pmap = vm_map_pmap(map); 746 info.limit = limit; 747 info.beg_addr = pgout_offset; 748 info.end_addr = VM_MAX_USER_ADDRESS; 749 info.callback = vm_pageout_mdp_callback; 750 info.cleancount = 0; 751 info.actioncount = 0; 752 info.busycount = 0; 753 754 pmap_pgscan(&info); 755 pgout_offset = info.offset; 756 #if 0 757 kprintf("%016jx %08lx %08lx\n", pgout_offset, 758 info.cleancount, info.actioncount); 759 #endif 760 761 if (pgout_offset != VM_MAX_USER_ADDRESS && 762 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 763 goto again; 764 } else if (retries && 765 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 766 --retries; 767 goto again; 768 } 769 map->pgout_offset = pgout_offset; 770 } 771 #endif 772 773 /* 774 * Called when the pageout scan wants to free a page. We no longer 775 * try to cycle the vm_object here with a reference & dealloc, which can 776 * cause a non-trivial object collapse in a critical path. 
777 * 778 * It is unclear why we cycled the ref_count in the past, perhaps to try 779 * to optimize shadow chain collapses but I don't quite see why it would 780 * be necessary. An OBJ_DEAD object should terminate any and all vm_pages 781 * synchronously and not have to be kicked-start. 782 */ 783 static void 784 vm_pageout_page_free(vm_page_t m) 785 { 786 vm_page_protect(m, VM_PROT_NONE); 787 vm_page_free(m); 788 } 789 790 /* 791 * vm_pageout_scan does the dirty work for the pageout daemon. 792 */ 793 struct vm_pageout_scan_info { 794 struct proc *bigproc; 795 vm_offset_t bigsize; 796 }; 797 798 static int vm_pageout_scan_callback(struct proc *p, void *data); 799 800 /* 801 * Scan inactive queue for pages we can cache or free. 802 * 803 * WARNING! Can be called from two pagedaemon threads simultaneously. 804 */ 805 static int 806 vm_pageout_scan_inactive(int pass, int q, long avail_shortage, 807 long *vnodes_skipped, long *counts) 808 { 809 vm_page_t m; 810 struct vm_page marker; 811 struct vnode *vpfailed; /* warning, allowed to be stale */ 812 long maxscan; 813 long delta = 0; 814 long max_launder; 815 int isep; 816 int vmflush_flags; 817 818 isep = (curthread == emergpager); 819 820 /* 821 * This routine is called for each of PQ_L2_SIZE inactive queues. 822 * We want the vm_max_launder parameter to apply to the whole 823 * queue (i.e. per-whole-queue pass, not per-sub-queue). 824 * 825 * In each successive full-pass when the page target is not met we 826 * allow the per-queue max_launder to increase up to a maximum of 827 * vm_max_launder / 16. 828 */ 829 max_launder = (long)vm_max_launder / PQ_L2_SIZE; 830 if (pass) 831 max_launder *= 2; 832 max_launder = (max_launder + MAXSCAN_DIVIDER - 1) / MAXSCAN_DIVIDER; 833 834 if (max_launder <= 1) 835 max_launder = 1; 836 if (max_launder >= vm_max_launder / 16) 837 max_launder = vm_max_launder / 16 + 1; 838 839 /* 840 * Start scanning the inactive queue for pages we can move to the 841 * cache or free. The scan will stop when the target is reached or 842 * we have scanned the entire inactive queue. Note that m->act_count 843 * is not used to form decisions for the inactive queue, only for the 844 * active queue. 845 * 846 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 847 * PAGES. 848 */ 849 850 /* 851 * Initialize our marker 852 */ 853 bzero(&marker, sizeof(marker)); 854 marker.flags = PG_FICTITIOUS | PG_MARKER; 855 marker.busy_count = PBUSY_LOCKED; 856 marker.queue = PQ_INACTIVE + q; 857 marker.pc = q; 858 marker.wire_count = 1; 859 860 /* 861 * Inactive queue scan. 862 * 863 * We pick off approximately 1/10 of each queue. Each queue is 864 * effectively organized LRU so scanning the entire queue would 865 * improperly pick up pages that might still be in regular use. 866 * 867 * NOTE: The vm_page must be spinlocked before the queue to avoid 868 * deadlocks, so it is easiest to simply iterate the loop 869 * with the queue unlocked at the top. 870 */ 871 vpfailed = NULL; 872 873 vm_page_queues_spin_lock(PQ_INACTIVE + q); 874 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 875 maxscan = (vm_page_queues[PQ_INACTIVE + q].lcnt + MAXSCAN_DIVIDER - 1) / 876 MAXSCAN_DIVIDER + 1; 877 878 /* 879 * Queue locked at top of loop to avoid stack marker issues. 
880 */ 881 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 882 maxscan-- > 0 && avail_shortage - delta > 0) 883 { 884 int count; 885 886 KKASSERT(m->queue == PQ_INACTIVE + q); 887 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, 888 &marker, pageq); 889 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m, 890 &marker, pageq); 891 mycpu->gd_cnt.v_pdpages++; 892 893 /* 894 * Skip marker pages (atomic against other markers to avoid 895 * infinite hop-over scans). 896 */ 897 if (m->flags & PG_MARKER) 898 continue; 899 900 /* 901 * Try to busy the page. Don't mess with pages which are 902 * already busy or reorder them in the queue. 903 */ 904 if (vm_page_busy_try(m, TRUE)) 905 continue; 906 907 /* 908 * Remaining operations run with the page busy and neither 909 * the page or the queue will be spin-locked. 910 */ 911 KKASSERT(m->queue == PQ_INACTIVE + q); 912 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 913 914 /* 915 * The emergency pager runs when the primary pager gets 916 * stuck, which typically means the primary pager deadlocked 917 * on a vnode-backed page. Therefore, the emergency pager 918 * must skip any complex objects. 919 * 920 * We disallow VNODEs unless they are VCHR whos device ops 921 * does not flag D_NOEMERGPGR. 922 */ 923 if (isep && m->object) { 924 struct vnode *vp; 925 926 switch(m->object->type) { 927 case OBJT_DEFAULT: 928 case OBJT_SWAP: 929 /* 930 * Allow anonymous memory and assume that 931 * swap devices are not complex, since its 932 * kinda worthless if we can't swap out dirty 933 * anonymous pages. 934 */ 935 break; 936 case OBJT_VNODE: 937 /* 938 * Allow VCHR device if the D_NOEMERGPGR 939 * flag is not set, deny other vnode types 940 * as being too complex. 941 */ 942 vp = m->object->handle; 943 if (vp && vp->v_type == VCHR && 944 vp->v_rdev && vp->v_rdev->si_ops && 945 (vp->v_rdev->si_ops->head.flags & 946 D_NOEMERGPGR) == 0) { 947 break; 948 } 949 /* Deny - fall through */ 950 default: 951 /* 952 * Deny 953 */ 954 vm_page_wakeup(m); 955 vm_page_queues_spin_lock(PQ_INACTIVE + q); 956 lwkt_yield(); 957 continue; 958 } 959 } 960 961 /* 962 * Try to pageout the page and perhaps other nearby pages. 963 * We want to get the pages into the cache eventually ( 964 * first or second pass). Otherwise the pages can wind up 965 * just cycling in the inactive queue, getting flushed over 966 * and over again. 967 * 968 * Generally speaking we recycle dirty pages within PQ_INACTIVE 969 * twice (double LRU) before paging them out. If the 970 * memuse_mode is >= 3 we run them single-LRU like we do clean 971 * pages. 972 */ 973 if (vm_pageout_memuse_mode >= 3) 974 vm_page_flag_set(m, PG_WINATCFLS); 975 976 vmflush_flags = 0; 977 if (vm_pageout_allow_active) 978 vmflush_flags |= OBJPC_ALLOW_ACTIVE; 979 if (m->flags & PG_WINATCFLS) 980 vmflush_flags |= OBJPC_TRY_TO_CACHE; 981 count = vm_pageout_page(m, &max_launder, vnodes_skipped, 982 &vpfailed, pass, vmflush_flags, counts); 983 delta += count; 984 985 /* 986 * Systems with a ton of memory can wind up with huge 987 * deactivation counts. Because the inactive scan is 988 * doing a lot of flushing, the combination can result 989 * in excessive paging even in situations where other 990 * unrelated threads free up sufficient VM. 991 * 992 * To deal with this we abort the nominal active->inactive 993 * scan before we hit the inactive target when free+cache 994 * levels have reached a reasonable target. 
995 * 996 * When deciding to stop early we need to add some slop to 997 * the test and we need to return full completion to the caller 998 * to prevent the caller from thinking there is something 999 * wrong and issuing a low-memory+swap warning or pkill. 1000 * 1001 * A deficit forces paging regardless of the state of the 1002 * VM page queues (used for RSS enforcement). 1003 */ 1004 lwkt_yield(); 1005 vm_page_queues_spin_lock(PQ_INACTIVE + q); 1006 1007 /* if (vm_paging_target() < -vm_max_launder) */ 1008 if (!vm_paging_target2()) { 1009 /* 1010 * Stopping early, return full completion to caller. 1011 */ 1012 if (delta < avail_shortage) 1013 delta = avail_shortage; 1014 break; 1015 } 1016 } 1017 1018 /* page queue still spin-locked */ 1019 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 1020 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 1021 1022 return (delta); 1023 } 1024 1025 /* 1026 * Pageout the specified page, return the total number of pages paged out 1027 * (this routine may cluster). 1028 * 1029 * The page must be busied and soft-busied by the caller and will be disposed 1030 * of by this function. 1031 */ 1032 static int 1033 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp, 1034 struct vnode **vpfailedp, int pass, int vmflush_flags, 1035 long *counts) 1036 { 1037 vm_object_t object; 1038 int actcount; 1039 int count = 0; 1040 1041 /* 1042 * Wiring no longer removes a page from its queue. The last unwiring 1043 * will requeue the page. Obviously wired pages cannot be paged out 1044 * so unqueue it and return. 1045 */ 1046 if (m->wire_count) { 1047 vm_page_unqueue_nowakeup(m); 1048 vm_page_wakeup(m); 1049 return 0; 1050 } 1051 1052 /* 1053 * A held page may be undergoing I/O, so skip it. 1054 */ 1055 if (m->hold_count) { 1056 vm_page_and_queue_spin_lock(m); 1057 if (m->queue - m->pc == PQ_INACTIVE) { 1058 TAILQ_REMOVE( 1059 &vm_page_queues[m->queue].pl, m, pageq); 1060 TAILQ_INSERT_TAIL( 1061 &vm_page_queues[m->queue].pl, m, pageq); 1062 } 1063 vm_page_and_queue_spin_unlock(m); 1064 vm_page_wakeup(m); 1065 return 0; 1066 } 1067 1068 if (m->object == NULL || m->object->ref_count == 0) { 1069 /* 1070 * If the object is not being used, we ignore previous 1071 * references. 1072 */ 1073 vm_page_flag_clear(m, PG_REFERENCED); 1074 pmap_clear_reference(m); 1075 /* fall through to end */ 1076 } else if (((m->flags & PG_REFERENCED) == 0) && 1077 (actcount = pmap_ts_referenced(m))) { 1078 /* 1079 * Otherwise, if the page has been referenced while 1080 * in the inactive queue, we bump the "activation 1081 * count" upwards, making it less likely that the 1082 * page will be added back to the inactive queue 1083 * prematurely again. Here we check the page tables 1084 * (or emulated bits, if any), given the upper level 1085 * VM system not knowing anything about existing 1086 * references. 1087 */ 1088 ++counts[3]; 1089 vm_page_activate(m); 1090 m->act_count += (actcount + ACT_ADVANCE); 1091 vm_page_wakeup(m); 1092 return 0; 1093 } 1094 1095 /* 1096 * (m) is still busied. 1097 * 1098 * If the upper level VM system knows about any page 1099 * references, we activate the page. We also set the 1100 * "activation count" higher than normal so that we will less 1101 * likely place pages back onto the inactive queue again. 
1102 */ 1103 if ((m->flags & PG_REFERENCED) != 0) { 1104 vm_page_flag_clear(m, PG_REFERENCED); 1105 actcount = pmap_ts_referenced(m); 1106 vm_page_activate(m); 1107 m->act_count += (actcount + ACT_ADVANCE + 1); 1108 vm_page_wakeup(m); 1109 ++counts[3]; 1110 return 0; 1111 } 1112 1113 /* 1114 * If the upper level VM system doesn't know anything about 1115 * the page being dirty, we have to check for it again. As 1116 * far as the VM code knows, any partially dirty pages are 1117 * fully dirty. 1118 * 1119 * Pages marked PG_WRITEABLE may be mapped into the user 1120 * address space of a process running on another cpu. A 1121 * user process (without holding the MP lock) running on 1122 * another cpu may be able to touch the page while we are 1123 * trying to remove it. vm_page_cache() will handle this 1124 * case for us. 1125 */ 1126 if (m->dirty == 0) { 1127 vm_page_test_dirty(m); 1128 } else { 1129 vm_page_dirty(m); 1130 } 1131 1132 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1133 /* 1134 * Invalid pages can be easily freed 1135 */ 1136 vm_pageout_page_free(m); 1137 mycpu->gd_cnt.v_dfree++; 1138 ++count; 1139 ++counts[1]; 1140 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1141 /* 1142 * Clean pages can be placed onto the cache queue. 1143 * This effectively frees them. 1144 */ 1145 vm_page_cache(m); 1146 ++count; 1147 ++counts[1]; 1148 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { 1149 /* 1150 * Dirty pages need to be paged out, but flushing 1151 * a page is extremely expensive verses freeing 1152 * a clean page. Rather then artificially limiting 1153 * the number of pages we can flush, we instead give 1154 * dirty pages extra priority on the inactive queue 1155 * by forcing them to be cycled through the queue 1156 * twice before being flushed, after which the 1157 * (now clean) page will cycle through once more 1158 * before being freed. This significantly extends 1159 * the thrash point for a heavily loaded machine. 1160 */ 1161 ++counts[2]; 1162 vm_page_flag_set(m, PG_WINATCFLS); 1163 vm_page_and_queue_spin_lock(m); 1164 if (m->queue - m->pc == PQ_INACTIVE) { 1165 TAILQ_REMOVE( 1166 &vm_page_queues[m->queue].pl, m, pageq); 1167 TAILQ_INSERT_TAIL( 1168 &vm_page_queues[m->queue].pl, m, pageq); 1169 } 1170 vm_page_and_queue_spin_unlock(m); 1171 vm_page_wakeup(m); 1172 } else if (*max_launderp > 0) { 1173 /* 1174 * We always want to try to flush some dirty pages if 1175 * we encounter them, to keep the system stable. 1176 * Normally this number is small, but under extreme 1177 * pressure where there are insufficient clean pages 1178 * on the inactive queue, we may have to go all out. 1179 */ 1180 int swap_pageouts_ok; 1181 struct vnode *vp = NULL; 1182 1183 if ((m->flags & PG_WINATCFLS) == 0) 1184 vm_page_flag_set(m, PG_WINATCFLS); 1185 swap_pageouts_ok = 0; 1186 object = m->object; 1187 if (object && 1188 (object->type != OBJT_SWAP) && 1189 (object->type != OBJT_DEFAULT)) { 1190 swap_pageouts_ok = 1; 1191 } else { 1192 swap_pageouts_ok = !(defer_swap_pageouts || 1193 disable_swap_pageouts); 1194 swap_pageouts_ok |= (!disable_swap_pageouts && 1195 defer_swap_pageouts && 1196 vm_paging_min()); 1197 } 1198 1199 /* 1200 * We don't bother paging objects that are "dead". 1201 * Those objects are in a "rundown" state. 
1202 */ 1203 if (!swap_pageouts_ok || 1204 (object == NULL) || 1205 (object->flags & OBJ_DEAD)) { 1206 vm_page_and_queue_spin_lock(m); 1207 if (m->queue - m->pc == PQ_INACTIVE) { 1208 TAILQ_REMOVE( 1209 &vm_page_queues[m->queue].pl, 1210 m, pageq); 1211 TAILQ_INSERT_TAIL( 1212 &vm_page_queues[m->queue].pl, 1213 m, pageq); 1214 } 1215 vm_page_and_queue_spin_unlock(m); 1216 vm_page_wakeup(m); 1217 return 0; 1218 } 1219 1220 /* 1221 * (m) is still busied. 1222 * 1223 * The object is already known NOT to be dead. It 1224 * is possible for the vget() to block the whole 1225 * pageout daemon, but the new low-memory handling 1226 * code should prevent it. 1227 * 1228 * The previous code skipped locked vnodes and, worse, 1229 * reordered pages in the queue. This results in 1230 * completely non-deterministic operation because, 1231 * quite often, a vm_fault has initiated an I/O and 1232 * is holding a locked vnode at just the point where 1233 * the pageout daemon is woken up. 1234 * 1235 * We can't wait forever for the vnode lock, we might 1236 * deadlock due to a vn_read() getting stuck in 1237 * vm_wait while holding this vnode. We skip the 1238 * vnode if we can't get it in a reasonable amount 1239 * of time. 1240 * 1241 * vpfailed is used to (try to) avoid the case where 1242 * a large number of pages are associated with a 1243 * locked vnode, which could cause the pageout daemon 1244 * to stall for an excessive amount of time. 1245 */ 1246 if (object->type == OBJT_VNODE) { 1247 int flags; 1248 1249 vp = object->handle; 1250 flags = LK_EXCLUSIVE; 1251 if (vp == *vpfailedp) 1252 flags |= LK_NOWAIT; 1253 else 1254 flags |= LK_TIMELOCK; 1255 vm_page_hold(m); 1256 vm_page_wakeup(m); 1257 1258 /* 1259 * We have unbusied (m) temporarily so we can 1260 * acquire the vp lock without deadlocking. 1261 * (m) is held to prevent destruction. 1262 */ 1263 if (vget(vp, flags) != 0) { 1264 *vpfailedp = vp; 1265 ++pageout_lock_miss; 1266 if (object->flags & OBJ_MIGHTBEDIRTY) 1267 ++*vnodes_skippedp; 1268 vm_page_unhold(m); 1269 return 0; 1270 } 1271 1272 /* 1273 * The page might have been moved to another 1274 * queue during potential blocking in vget() 1275 * above. The page might have been freed and 1276 * reused for another vnode. The object might 1277 * have been reused for another vnode. 1278 */ 1279 if (m->queue - m->pc != PQ_INACTIVE || 1280 m->object != object || 1281 object->handle != vp) { 1282 if (object->flags & OBJ_MIGHTBEDIRTY) 1283 ++*vnodes_skippedp; 1284 vput(vp); 1285 vm_page_unhold(m); 1286 return 0; 1287 } 1288 1289 /* 1290 * The page may have been busied during the 1291 * blocking in vput(); We don't move the 1292 * page back onto the end of the queue so that 1293 * statistics are more correct if we don't. 1294 */ 1295 if (vm_page_busy_try(m, TRUE)) { 1296 vput(vp); 1297 vm_page_unhold(m); 1298 return 0; 1299 } 1300 vm_page_unhold(m); 1301 1302 /* 1303 * If it was wired while we didn't own it. 1304 */ 1305 if (m->wire_count) { 1306 vm_page_unqueue_nowakeup(m); 1307 vput(vp); 1308 vm_page_wakeup(m); 1309 return 0; 1310 } 1311 1312 /* 1313 * (m) is busied again 1314 * 1315 * We own the busy bit and remove our hold 1316 * bit. If the page is still held it 1317 * might be undergoing I/O, so skip it. 
1318 */ 1319 if (m->hold_count) { 1320 rebusy_failed: 1321 vm_page_and_queue_spin_lock(m); 1322 if (m->queue - m->pc == PQ_INACTIVE) { 1323 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 1324 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq); 1325 } 1326 vm_page_and_queue_spin_unlock(m); 1327 if (object->flags & OBJ_MIGHTBEDIRTY) 1328 ++*vnodes_skippedp; 1329 vm_page_wakeup(m); 1330 vput(vp); 1331 return 0; 1332 } 1333 1334 /* 1335 * Recheck queue, object, and vp now that we have 1336 * rebusied the page. 1337 */ 1338 if (m->queue - m->pc != PQ_INACTIVE || 1339 m->object != object || 1340 object->handle != vp) { 1341 kprintf("vm_pageout_page: " 1342 "rebusy %p failed(A)\n", 1343 m); 1344 goto rebusy_failed; 1345 } 1346 1347 /* 1348 * Check page validity 1349 */ 1350 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1351 kprintf("vm_pageout_page: " 1352 "rebusy %p failed(B)\n", 1353 m); 1354 goto rebusy_failed; 1355 } 1356 if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1357 kprintf("vm_pageout_page: " 1358 "rebusy %p failed(C)\n", 1359 m); 1360 goto rebusy_failed; 1361 } 1362 1363 /* (m) is left busied as we fall through */ 1364 } 1365 1366 /* 1367 * page is busy and not held here. 1368 * 1369 * If a page is dirty, then it is either being washed 1370 * (but not yet cleaned) or it is still in the 1371 * laundry. If it is still in the laundry, then we 1372 * start the cleaning operation. 1373 * 1374 * decrement inactive_shortage on success to account 1375 * for the (future) cleaned page. Otherwise we 1376 * could wind up laundering or cleaning too many 1377 * pages. 1378 * 1379 * NOTE: Cleaning the page here does not cause 1380 * force_deficit to be adjusted, because the 1381 * page is not being freed or moved to the 1382 * cache. 1383 */ 1384 count = vm_pageout_clean_helper(m, vmflush_flags); 1385 counts[0] += count; 1386 *max_launderp -= count; 1387 1388 /* 1389 * Clean ate busy, page no longer accessible 1390 */ 1391 if (vp != NULL) 1392 vput(vp); 1393 } else { 1394 vm_page_wakeup(m); 1395 } 1396 return count; 1397 } 1398 1399 /* 1400 * Scan active queue 1401 * 1402 * WARNING! Can be called from two pagedaemon threads simultaneously. 1403 */ 1404 static int 1405 vm_pageout_scan_active(int pass, int q, 1406 long avail_shortage, long inactive_shortage, 1407 struct vm_page *marker, 1408 long *recycle_countp) 1409 { 1410 vm_page_t m; 1411 int actcount; 1412 long delta = 0; 1413 long maxscan; 1414 int isep; 1415 1416 isep = (curthread == emergpager); 1417 1418 /* 1419 * We want to move pages from the active queue to the inactive 1420 * queue to get the inactive queue to the inactive target. If 1421 * we still have a page shortage from above we try to directly free 1422 * clean pages instead of moving them. 1423 * 1424 * If we do still have a shortage we keep track of the number of 1425 * pages we free or cache (recycle_count) as a measure of thrashing 1426 * between the active and inactive queues. 1427 * 1428 * If we were able to completely satisfy the free+cache targets 1429 * from the inactive pool we limit the number of pages we move 1430 * from the active pool to the inactive pool to 2x the pages we 1431 * had removed from the inactive pool (with a minimum of 1/5 the 1432 * inactive target). If we were not able to completely satisfy 1433 * the free+cache targets we go for the whole target aggressively. 1434 * 1435 * NOTE: Both variables can end up negative. 1436 * NOTE: We are still in a critical section. 1437 * 1438 * NOTE! 
THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 1439 * PAGES. 1440 */ 1441 1442 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1443 maxscan = (vm_page_queues[PQ_ACTIVE + q].lcnt + MAXSCAN_DIVIDER - 1) / 1444 MAXSCAN_DIVIDER + 1; 1445 1446 /* 1447 * Queue locked at top of loop to avoid stack marker issues. 1448 */ 1449 while ((m = TAILQ_NEXT(marker, pageq)) != NULL && 1450 maxscan-- > 0 && (avail_shortage - delta > 0 || 1451 inactive_shortage > 0)) 1452 { 1453 KKASSERT(m->queue == PQ_ACTIVE + q); 1454 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1455 marker, pageq); 1456 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1457 marker, pageq); 1458 1459 /* 1460 * Skip marker pages (atomic against other markers to avoid 1461 * infinite hop-over scans). 1462 */ 1463 if (m->flags & PG_MARKER) 1464 continue; 1465 1466 /* 1467 * Try to busy the page. Don't mess with pages which are 1468 * already busy or reorder them in the queue. 1469 */ 1470 if (vm_page_busy_try(m, TRUE)) 1471 continue; 1472 1473 /* 1474 * Remaining operations run with the page busy and neither 1475 * the page or the queue will be spin-locked. 1476 */ 1477 KKASSERT(m->queue == PQ_ACTIVE + q); 1478 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1479 1480 #if 0 1481 /* 1482 * Don't deactivate pages that are held, even if we can 1483 * busy them. (XXX why not?) 1484 */ 1485 if (m->hold_count) { 1486 vm_page_and_queue_spin_lock(m); 1487 if (m->queue - m->pc == PQ_ACTIVE) { 1488 TAILQ_REMOVE( 1489 &vm_page_queues[PQ_ACTIVE + q].pl, 1490 m, pageq); 1491 TAILQ_INSERT_TAIL( 1492 &vm_page_queues[PQ_ACTIVE + q].pl, 1493 m, pageq); 1494 } 1495 vm_page_and_queue_spin_unlock(m); 1496 vm_page_wakeup(m); 1497 goto next; 1498 } 1499 #endif 1500 /* 1501 * We can just remove wired pages from the queue 1502 */ 1503 if (m->wire_count) { 1504 vm_page_unqueue_nowakeup(m); 1505 vm_page_wakeup(m); 1506 goto next; 1507 } 1508 1509 /* 1510 * The emergency pager ignores vnode-backed pages as these 1511 * are the pages that probably bricked the main pager. 1512 */ 1513 if (isep && m->object && m->object->type == OBJT_VNODE) { 1514 #if 0 1515 vm_page_and_queue_spin_lock(m); 1516 if (m->queue - m->pc == PQ_ACTIVE) { 1517 TAILQ_REMOVE( 1518 &vm_page_queues[PQ_ACTIVE + q].pl, 1519 m, pageq); 1520 TAILQ_INSERT_TAIL( 1521 &vm_page_queues[PQ_ACTIVE + q].pl, 1522 m, pageq); 1523 } 1524 vm_page_and_queue_spin_unlock(m); 1525 #endif 1526 vm_page_wakeup(m); 1527 goto next; 1528 } 1529 1530 /* 1531 * The count for pagedaemon pages is done after checking the 1532 * page for eligibility... 1533 */ 1534 mycpu->gd_cnt.v_pdpages++; 1535 1536 /* 1537 * Check to see "how much" the page has been used and clear 1538 * the tracking access bits. If the object has no references 1539 * don't bother paying the expense. 1540 */ 1541 actcount = 0; 1542 if (m->object && m->object->ref_count != 0) { 1543 if (m->flags & PG_REFERENCED) 1544 ++actcount; 1545 actcount += pmap_ts_referenced(m); 1546 if (actcount) { 1547 m->act_count += ACT_ADVANCE + actcount; 1548 if (m->act_count > ACT_MAX) 1549 m->act_count = ACT_MAX; 1550 } 1551 } 1552 vm_page_flag_clear(m, PG_REFERENCED); 1553 1554 /* 1555 * actcount is only valid if the object ref_count is non-zero. 1556 * If the page does not have an object, actcount will be zero. 
1557 */ 1558 if (actcount && m->object->ref_count != 0) { 1559 #if 0 1560 vm_page_and_queue_spin_lock(m); 1561 if (m->queue - m->pc == PQ_ACTIVE) { 1562 TAILQ_REMOVE( 1563 &vm_page_queues[PQ_ACTIVE + q].pl, 1564 m, pageq); 1565 TAILQ_INSERT_TAIL( 1566 &vm_page_queues[PQ_ACTIVE + q].pl, 1567 m, pageq); 1568 } 1569 vm_page_and_queue_spin_unlock(m); 1570 #endif 1571 vm_page_wakeup(m); 1572 } else { 1573 switch(m->object->type) { 1574 case OBJT_DEFAULT: 1575 case OBJT_SWAP: 1576 m->act_count -= min(m->act_count, 1577 vm_anonmem_decline); 1578 break; 1579 default: 1580 m->act_count -= min(m->act_count, 1581 vm_filemem_decline); 1582 break; 1583 } 1584 if (vm_pageout_algorithm || 1585 (m->object == NULL) || 1586 (m->object && (m->object->ref_count == 0)) || 1587 m->act_count < pass + 1 1588 ) { 1589 /* 1590 * Deactivate the page. If we had a 1591 * shortage from our inactive scan try to 1592 * free (cache) the page instead. 1593 * 1594 * Don't just blindly cache the page if 1595 * we do not have a shortage from the 1596 * inactive scan, that could lead to 1597 * gigabytes being moved. 1598 */ 1599 --inactive_shortage; 1600 if (avail_shortage - delta > 0 || 1601 (m->object && (m->object->ref_count == 0))) 1602 { 1603 if (avail_shortage - delta > 0) 1604 ++*recycle_countp; 1605 vm_page_protect(m, VM_PROT_NONE); 1606 if (m->dirty == 0 && 1607 (m->flags & PG_NEED_COMMIT) == 0 && 1608 avail_shortage - delta > 0) { 1609 vm_page_cache(m); 1610 } else { 1611 vm_page_deactivate(m); 1612 vm_page_wakeup(m); 1613 } 1614 } else { 1615 vm_page_deactivate(m); 1616 vm_page_wakeup(m); 1617 } 1618 ++delta; 1619 } else { 1620 /* 1621 * Do nothing 1622 */ 1623 #if 0 1624 vm_page_and_queue_spin_lock(m); 1625 if (m->queue - m->pc == PQ_ACTIVE) { 1626 TAILQ_REMOVE( 1627 &vm_page_queues[PQ_ACTIVE + q].pl, 1628 m, pageq); 1629 TAILQ_INSERT_TAIL( 1630 &vm_page_queues[PQ_ACTIVE + q].pl, 1631 m, pageq); 1632 } 1633 vm_page_and_queue_spin_unlock(m); 1634 #endif 1635 vm_page_wakeup(m); 1636 } 1637 } 1638 next: 1639 lwkt_yield(); 1640 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1641 } 1642 1643 /* 1644 * Clean out our local marker. 1645 * 1646 * Page queue still spin-locked. 1647 */ 1648 if (m == NULL) { 1649 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1650 marker, pageq); 1651 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, 1652 marker, pageq); 1653 } 1654 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1655 1656 return (delta); 1657 } 1658 1659 /* 1660 * The number of actually free pages can drop down to v_free_reserved, 1661 * we try to build the free count back above v_free_min, to v_free_target. 1662 * 1663 * Cache pages are already counted as being free-ish. 1664 * 1665 * NOTE: we are still in a critical section. 1666 * 1667 * Pages moved from PQ_CACHE to totally free are not counted in the 1668 * pages_freed counter. 1669 * 1670 * WARNING! Can be called from two pagedaemon threads simultaneously. 1671 */ 1672 static void 1673 vm_pageout_scan_cache(long avail_shortage, int pass, 1674 long vnodes_skipped, long recycle_count) 1675 { 1676 static int lastkillticks; 1677 struct vm_pageout_scan_info info; 1678 vm_page_t m; 1679 int isep; 1680 1681 isep = (curthread == emergpager); 1682 1683 /* 1684 * Test conditions also include a safeety against v_free_min in 1685 * case the sysop messes up the sysctls. 1686 * 1687 * Also include a test to avoid degenerate scans. 
1688 */ 1689 while ((vmstats.v_free_count < vmstats.v_free_target || 1690 vmstats.v_free_count < vmstats.v_free_min) && 1691 vmstats.v_cache_count > VM_CACHE_SCAN_MIN) 1692 { 1693 /* 1694 * This steals some code from vm/vm_page.c 1695 * 1696 * Create two rovers and adjust the code to reduce 1697 * chances of them winding up at the same index (which 1698 * can cause a lot of contention). 1699 */ 1700 static int cache_rover[2] = { 0, PQ_L2_MASK / 2 }; 1701 1702 if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0) 1703 goto next_rover; 1704 1705 m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK); 1706 if (m == NULL) 1707 break; 1708 /* 1709 * page is returned removed from its queue and spinlocked. 1710 * 1711 * If the busy attempt fails we can still deactivate the page. 1712 */ 1713 if (vm_page_busy_try(m, TRUE)) { 1714 vm_page_deactivate_locked(m); 1715 vm_page_spin_unlock(m); 1716 continue; 1717 } 1718 vm_page_spin_unlock(m); 1719 pagedaemon_wakeup(); 1720 lwkt_yield(); 1721 1722 /* 1723 * Report a possible edge case. This shouldn't happen but 1724 * actually I think it can race against e.g. 1725 * vm_page_lookup()/busy sequences. If the page isn't 1726 * in a cache-like state we will deactivate and skip it. 1727 */ 1728 if ((m->flags & PG_MAPPED) || (m->valid & m->dirty)) { 1729 kprintf("WARNING! page race during find/busy: %p " 1730 "queue == %d dirty=%02x\n", 1731 m, m->queue - m->pc, m->dirty); 1732 } 1733 1734 /* 1735 * Remaining operations run with the page busy and neither 1736 * the page or the queue will be spin-locked. 1737 */ 1738 if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_MAPPED)) || 1739 m->hold_count || 1740 m->wire_count || 1741 (m->valid & m->dirty)) 1742 { 1743 vm_page_deactivate(m); 1744 vm_page_wakeup(m); 1745 continue; 1746 } 1747 1748 /* 1749 * Because the page is in the cache, it shouldn't be mapped. 1750 */ 1751 pmap_mapped_sync(m); 1752 KKASSERT((m->flags & PG_MAPPED) == 0); 1753 KKASSERT(m->dirty == 0); 1754 vm_pageout_page_free(m); 1755 mycpu->gd_cnt.v_dfree++; 1756 next_rover: 1757 if (isep) 1758 cache_rover[1] -= PQ_PRIME2; 1759 else 1760 cache_rover[0] += PQ_PRIME2; 1761 } 1762 1763 /* 1764 * If we didn't get enough free pages, and we have skipped a vnode 1765 * in a writeable object, wakeup the sync daemon. And kick swapout 1766 * if we did not get enough free pages. 1767 */ 1768 if (vm_paging_target1()) { 1769 if (vnodes_skipped && vm_paging_min()) 1770 speedup_syncer(NULL); 1771 #if !defined(NO_SWAPPING) 1772 if (vm_swap_enabled && vm_paging_target1()) 1773 vm_req_vmdaemon(); 1774 #endif 1775 } 1776 1777 /* 1778 * Handle catastrophic conditions. Under good conditions we should 1779 * be at the target, well beyond our minimum. If we could not even 1780 * reach our minimum the system is under heavy stress. But just being 1781 * under heavy stress does not trigger process killing. 1782 * 1783 * We consider ourselves to have run out of memory if the swap pager 1784 * is full and avail_shortage is still positive. The secondary check 1785 * ensures that we do not kill processes if the instantanious 1786 * availability is good, even if the pageout demon pass says it 1787 * couldn't get to the target. 1788 * 1789 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL 1790 * SITUATIONS. 
1791 */ 1792 if (swap_pager_almost_full && 1793 pass > 0 && 1794 isep == 0 && 1795 (vm_paging_min_dnc(recycle_count) || avail_shortage > 0)) { 1796 kprintf("Warning: system low on memory+swap " 1797 "shortage %ld for %d ticks!\n", 1798 avail_shortage, ticks - swap_fail_ticks); 1799 if (bootverbose) { 1800 kprintf("Metrics: spaf=%d spf=%d pass=%d " 1801 "availshrt=%ld tgt=%d/%d inacshrt=%ld " 1802 "last=%u\n", 1803 swap_pager_almost_full, 1804 swap_pager_full, 1805 pass, 1806 avail_shortage, 1807 vm_paging_target1(), 1808 vm_paging_target2(), 1809 vm_paging_target2_count(), 1810 (unsigned int)(ticks - lastkillticks)); 1811 } 1812 } 1813 if (swap_pager_full && 1814 pass > 1 && 1815 isep == 0 && 1816 avail_shortage > 0 && 1817 vm_paging_target1() && 1818 (unsigned int)(ticks - lastkillticks) >= hz) 1819 { 1820 /* 1821 * Kill something, maximum rate once per second to give 1822 * the process time to free up sufficient memory. 1823 */ 1824 lastkillticks = ticks; 1825 info.bigproc = NULL; 1826 info.bigsize = 0; 1827 allproc_scan(vm_pageout_scan_callback, &info, 0); 1828 if (info.bigproc != NULL) { 1829 kprintf("Try to kill process %d %s\n", 1830 info.bigproc->p_pid, info.bigproc->p_comm); 1831 info.bigproc->p_nice = PRIO_MIN; 1832 info.bigproc->p_usched->resetpriority( 1833 FIRST_LWP_IN_PROC(info.bigproc)); 1834 atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL); 1835 killproc(info.bigproc, "out of swap space"); 1836 wakeup(&vmstats.v_free_count); 1837 PRELE(info.bigproc); 1838 } 1839 } 1840 } 1841 1842 static int 1843 vm_pageout_scan_callback(struct proc *p, void *data) 1844 { 1845 struct vm_pageout_scan_info *info = data; 1846 vm_offset_t size; 1847 1848 /* 1849 * Never kill system processes or init. If we have configured swap 1850 * then try to avoid killing low-numbered pids. 1851 */ 1852 if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) || 1853 ((p->p_pid < 48) && (vm_swap_size != 0))) { 1854 return (0); 1855 } 1856 1857 lwkt_gettoken(&p->p_token); 1858 1859 /* 1860 * if the process is in a non-running type state, 1861 * don't touch it. 1862 */ 1863 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 1864 lwkt_reltoken(&p->p_token); 1865 return (0); 1866 } 1867 1868 /* 1869 * Get the approximate process size. Note that anonymous pages 1870 * with backing swap will be counted twice, but there should not 1871 * be too many such pages due to the stress the VM system is 1872 * under at this point. 1873 */ 1874 size = vmspace_anonymous_count(p->p_vmspace) + 1875 vmspace_swap_count(p->p_vmspace); 1876 1877 /* 1878 * If the this process is bigger than the biggest one 1879 * remember it. 1880 */ 1881 if (info->bigsize < size) { 1882 if (info->bigproc) 1883 PRELE(info->bigproc); 1884 PHOLD(p); 1885 info->bigproc = p; 1886 info->bigsize = size; 1887 } 1888 lwkt_reltoken(&p->p_token); 1889 lwkt_yield(); 1890 1891 return(0); 1892 } 1893 1894 /* 1895 * This old guy slowly walks PQ_HOLD looking for pages which need to be 1896 * moved back to PQ_FREE. It is possible for pages to accumulate here 1897 * when vm_page_free() races against vm_page_unhold(), resulting in a 1898 * page being left on a PQ_HOLD queue with hold_count == 0. 1899 * 1900 * It is easier to handle this edge condition here, in non-critical code, 1901 * rather than enforce a spin-lock for every 1->0 transition in 1902 * vm_page_unhold(). 1903 * 1904 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue. 
 */
static void
vm_pageout_scan_hold(int q, struct vm_page *marker)
{
        vm_page_t m;
        long pcount;

        pcount = vm_page_queues[PQ_HOLD + q].lcnt;
        if (pcount > vm_pageout_stats_scan)
                pcount = vm_pageout_stats_scan;

        vm_page_queues_spin_lock(PQ_HOLD + q);
        while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
               pcount-- > 0)
        {
                KKASSERT(m->queue == PQ_HOLD + q);
                TAILQ_REMOVE(&vm_page_queues[PQ_HOLD + q].pl, marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_HOLD + q].pl, m,
                                   marker, pageq);

                if (m->flags & PG_MARKER)
                        continue;

                /*
                 * Process one page and return
                 */
                if (m->hold_count)
                        break;
                kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
                vm_page_hold(m);
                vm_page_queues_spin_unlock(PQ_HOLD + q);
                vm_page_unhold(m);      /* reprocess */
                vm_page_queues_spin_lock(PQ_HOLD + q);
        }

        /*
         * If the queue was exhausted move the marker back to the head.
         */
        if (m == NULL) {
                TAILQ_REMOVE(&vm_page_queues[PQ_HOLD + q].pl,
                             marker, pageq);
                TAILQ_INSERT_HEAD(&vm_page_queues[PQ_HOLD + q].pl,
                                  marker, pageq);
        }

        vm_page_queues_spin_unlock(PQ_HOLD + q);
}

/*
 * This code maintains m->act_count for active pages.  The scan occurs only
 * as long as the pageout daemon is not running or the inactive target has
 * not been reached.
 *
 * The restrictions prevent an idle machine from degrading m->act_count on
 * all VM pages to 0 or nearly 0, which would make the field useless (for
 * example, when a workstation user goes to bed).
 */
static void
vm_pageout_page_stats(int q, struct vm_page *marker, long *counterp)
{
        struct vpgqueues *pq = &vm_page_queues[PQ_ACTIVE + q];
        vm_page_t m;
        long pcount;                    /* Number of pages to check */

        /*
         * No point scanning the active queue if it is smaller than
         * 1/2 of usable memory.  This most typically occurs at system
         * startup or if a huge amount of memory has just been freed.
         */
        if (vmstats.v_active_count < vmstats.v_free_count +
                                     vmstats.v_cache_count +
                                     vmstats.v_inactive_count)
        {
                return;
        }

        /*
         * Generally do not scan if the pageout daemon is not running
         * or the inactive target has been reached.  However, we override
         * this and scan anyway for vm_pageout_stats_rsecs seconds after
         * the pageout daemon last ran.
         *
         * This last bit is designed to give the system a little time to
         * stage more pages for potential deactivation.  In this situation,
         * if the inactive target has been met, we just update m->act_count
         * and do not otherwise mess with the page.  But we don't want the
         * scan to run forever because that would make m->act_count useless
         * if the machine were to become idle.
         */
        if (vm_pages_needed == 0 && !vm_paging_inactive()) {
                if (time_uptime - vm_pagedaemon_uptime > vm_pageout_stats_rsecs)
                        return;
        }

        if (vm_pageout_debug) {
                static time_t save_time;
                if (save_time != time_uptime) {
                        save_time = time_uptime;
                        kprintf("DEACTIVATE Q=%4d N=%ld\n",
                                q, vm_paging_inactive_count());
                }
        }

        /*
         * Limit the scan to reduce cpu glitches, in case
         * pmap_ts_referenced() burns a lot of CPU.
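         *
         * As with the PQ_HOLD scan, at most vm_pageout_stats_scan pages
         * are examined per call, with the persistent marker recording
         * where to resume on the next call.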
         */
        pcount = pq->lcnt;
        if (pcount > vm_pageout_stats_scan)
                pcount = vm_pageout_stats_scan;

        vm_page_queues_spin_lock(PQ_ACTIVE + q);

        /*
         * Queue locked at top of loop to avoid stack marker issues.
         */
        while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
               pcount-- > 0)
        {
                int actcount;

                KKASSERT(m->queue == PQ_ACTIVE + q);
                TAILQ_REMOVE(&pq->pl, marker, pageq);
                TAILQ_INSERT_AFTER(&pq->pl, m, marker, pageq);

                /*
                 * Skip marker pages (atomic against other markers to avoid
                 * infinite hop-over scans).
                 */
                if (m->flags & PG_MARKER)
                        continue;

                ++counterp[0];

                /*
                 * Ignore pages we can't busy.
                 */
                if (vm_page_busy_try(m, TRUE)) {
                        continue;
                }

                /*
                 * The remaining operations run with the page busy and with
                 * neither the page nor the queue spin-locked.
                 */
                KKASSERT(m->queue == PQ_ACTIVE + q);
                vm_page_queues_spin_unlock(PQ_ACTIVE + q);

                /*
                 * We can just remove wired pages from the queue.
                 */
                if (m->wire_count) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_wakeup(m);
                        goto next;
                }

                /*
                 * We now have a safely busied page; the page and queue
                 * spinlocks have been released.
                 *
                 * Ignore held and wired pages.
                 */
                if (m->hold_count || m->wire_count) {
                        vm_page_wakeup(m);
                        goto next;
                }

                /*
                 * Calculate activity.
                 */
                actcount = 0;
                if (m->flags & PG_REFERENCED) {
                        vm_page_flag_clear(m, PG_REFERENCED);
                        actcount += 1;
                }
                actcount += pmap_ts_referenced(m);

                /*
                 * Update act_count and move the page to the end of the
                 * queue.
                 */
                if (actcount) {
                        m->act_count += ACT_ADVANCE + actcount;
                        if (m->act_count > ACT_MAX)
                                m->act_count = ACT_MAX;
#if 0
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(&pq->pl, m, pageq);
                                TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
#endif
                        vm_page_wakeup(m);
                        goto next;
                }

                if (m->act_count == 0) {
                        /*
                         * If the deactivation target has not been reached
                         * we try to deactivate the page.
                         *
                         * If the deactivation target has been reached it
                         * is a complete waste of time (both now and later)
                         * to try to deactivate more pages.
                         */
                        if (vm_paging_inactive()) {
                                vm_page_protect(m, VM_PROT_NONE);
                                vm_page_deactivate(m);
                        }
                        ++counterp[1];
                } else {
                        m->act_count -= min(m->act_count, ACT_DECLINE);
#if 0
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(&pq->pl, m, pageq);
                                TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
#endif

                        if (m->act_count < vm_pageout_stats_actcmp) {
                                if (vm_paging_inactive()) {
                                        vm_page_protect(m, VM_PROT_NONE);
                                        vm_page_deactivate(m);
                                }
                                ++counterp[1];
                        }
                }
                vm_page_wakeup(m);
next:
                vm_page_queues_spin_lock(PQ_ACTIVE + q);
        }

        /*
         * If the queue has been exhausted move the marker back to the head.
         */
        if (m == NULL) {
                TAILQ_REMOVE(&pq->pl, marker, pageq);
                TAILQ_INSERT_HEAD(&pq->pl, marker, pageq);
        }

        /*
         * The persistent marker is left on the queue for the next call.
         *
         * Page queue still spin-locked.
         */
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);

        /*
         * After roughly every (inalim) pages scanned, determine whether we
         * are making appropriate progress.  If we are, reduce the comparison
         * point for act_count; if we are not, raise the comparison point.
         *
         * This allows us to handle heavier loads and also balances the
         * code, particularly at startup.
         */
        if (counterp[0] > vm_pageout_stats_inalim) {
                if (counterp[1] < vm_pageout_stats_inamin) {
                        if (vm_pageout_stats_actcmp < ACT_MAX * 3 / 4)
                                ++vm_pageout_stats_actcmp;
                } else {
                        if (vm_pageout_stats_actcmp > 0)
                                --vm_pageout_stats_actcmp;
                }
                counterp[0] = 0;
                counterp[1] = 0;
        }
}

static void
vm_pageout_free_page_calc(vm_size_t count)
{
        /*
         * v_free_min             normal allocations
         * v_free_reserved        system allocations
         * v_pageout_free_min     allocations by pageout daemon
         * v_interrupt_free_min   low level allocations (e.g. swap structures)
         *
         * v_free_min is used to generate several other baselines, and they
         * can get pretty silly on systems with a lot of memory.
         */
        vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
        vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
        vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
        vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
        vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
}

/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFS's.
 */
static void
vm_pageout_thread(void)
{
        int pass;
        int q;
        int q1iterator = 0;
        int q2iterator = 0;
        int q3iterator = 0;
        int isep;
        enum { PAGING_IDLE, PAGING_TARGET1, PAGING_TARGET2 } state;
        struct markers *markers;
        long scounter[2] = { 0, 0 };
        time_t warn_time;

        curthread->td_flags |= TDF_SYSTHREAD;
        state = PAGING_IDLE;

        /*
         * Allocate contiguous markers for the hold, stats (active), and
         * paging active queue scans.  These scans occur incrementally.
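         *
         * Three markers are allocated for each PQ_L2_SIZE queue index:
         * one for the PQ_HOLD scan, one for the PQ_ACTIVE page-stats
         * scan, and one for the paging active scan.  They are inserted
         * once here and stay on their queues for the life of the daemon.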
         */
        markers = kmalloc(sizeof(*markers) * PQ_L2_SIZE,
                          M_PAGEOUT, M_WAITOK | M_ZERO);

        for (q = 0; q < PQ_L2_SIZE; ++q) {
                struct markers *mark = &markers[q];

                mark->hold.flags = PG_FICTITIOUS | PG_MARKER;
                mark->hold.busy_count = PBUSY_LOCKED;
                mark->hold.queue = PQ_HOLD + q;
                mark->hold.pc = PQ_HOLD + q;
                mark->hold.wire_count = 1;
                vm_page_queues_spin_lock(PQ_HOLD + q);
                TAILQ_INSERT_HEAD(&vm_page_queues[PQ_HOLD + q].pl,
                                  &mark->hold, pageq);
                vm_page_queues_spin_unlock(PQ_HOLD + q);

                mark->stat.flags = PG_FICTITIOUS | PG_MARKER;
                mark->stat.busy_count = PBUSY_LOCKED;
                mark->stat.queue = PQ_ACTIVE + q;
                mark->stat.pc = PQ_ACTIVE + q;
                mark->stat.wire_count = 1;
                vm_page_queues_spin_lock(PQ_ACTIVE + q);
                TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
                                  &mark->stat, pageq);
                vm_page_queues_spin_unlock(PQ_ACTIVE + q);

                mark->pact.flags = PG_FICTITIOUS | PG_MARKER;
                mark->pact.busy_count = PBUSY_LOCKED;
                mark->pact.queue = PQ_ACTIVE + q;
                mark->pact.pc = PQ_ACTIVE + q;
                mark->pact.wire_count = 1;
                vm_page_queues_spin_lock(PQ_ACTIVE + q);
                TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
                                  &mark->pact, pageq);
                vm_page_queues_spin_unlock(PQ_ACTIVE + q);
        }

        /*
         * We only need to do the setup once.
         */
        isep = 0;
        if (curthread == emergpager) {
                isep = 1;
                goto skip_setup;
        }

        /*
         * Initialize vm_max_launder per pageout pass to be 1/256
         * of total physical memory, plus a little slop.
         */
        if (vm_max_launder == 0)
                vm_max_launder = physmem / 256 + 16;

        /*
         * Initialize some paging parameters.
         */
        vm_pageout_free_page_calc(vmstats.v_page_count);

        /*
         * Basic pageout daemon paging operation settings.
         */
        vmstats.v_free_target = vmstats.v_free_min * 2;

        vmstats.v_paging_wait = vmstats.v_free_min * 2;
        vmstats.v_paging_start = vmstats.v_free_min * 3;
        vmstats.v_paging_target1 = vmstats.v_free_min * 4;
        vmstats.v_paging_target2 = vmstats.v_free_min * 5;

        /*
         * NOTE: With the new buffer cache b_act_count we want the default
         *       inactive target to be a percentage of available memory.
         *
         *       The inactive target essentially determines the minimum
         *       number of 'temporary' pages capable of caching one-time-use
         *       files when the VM system is otherwise full of pages
         *       belonging to multi-time-use files or active program data.
         *
         * NOTE: The inactive target is aggressively pursued only if the
         *       inactive queue becomes too small.  If the inactive queue
         *       is large enough to satisfy page movement to free+cache
         *       then it is repopulated more slowly from the active queue.
         *       This allows a general inactive_target default to be set.
         *
         *       There is an issue here for processes which sit mostly idle
         *       'overnight', such as sshd, tcsh, and X.  Any movement from
         *       the active queue will eventually cause such pages to
         *       recycle, eventually causing a lot of paging in the morning.
         *       To reduce the incidence of this, pages cycled out of the
         *       buffer cache are moved directly to the inactive queue if
         *       they were only used once or twice.
         *
         *       The vfs.vm_cycle_point sysctl can be used to adjust this.
         *       Increasing the value (up to 64) increases the number of
         *       buffer recyclements which go directly to the inactive queue.
         *
         * NOTE: There is no 'cache target'.  The combined (free + cache)
         *       target is handled by the v_paging_* targets above.
         */
        vmstats.v_inactive_target = vmstats.v_free_count / 16;
        //vmstats.v_inactive_target = vmstats.v_free_min * 4;

        /* XXX does not really belong here */
        if (vm_page_max_wired == 0)
                vm_page_max_wired = vmstats.v_free_count / 3;

        /*
         * Page stats operation.
         *
         * scan  - needs to be large enough for decent turn-around but
         *         not so large that it eats a ton of CPU.  Pages per run.
         *
         * ticks - interval per run, in ticks.
         *
         * run   - number of seconds after the pagedaemon has run that
         *         we continue to collect page stats, after which we stop.
         *
         * Calculated for 50% coverage.
         */
        if (vm_pageout_stats_scan == 0) {
                vm_pageout_stats_scan = vmstats.v_free_count / PQ_L2_SIZE / 16;
                if (vm_pageout_stats_scan < 16)
                        vm_pageout_stats_scan = 16;
        }

        if (vm_pageout_stats_ticks == 0)
                vm_pageout_stats_ticks = hz / 10;

        vm_pagedaemon_uptime = time_uptime;

        swap_pager_swap_init();

        atomic_swap_int(&sequence_emerg_pager, 1);
        wakeup(&sequence_emerg_pager);

skip_setup:
        /*
         * Sequence emergency pager startup.
         */
        if (isep) {
                while (sequence_emerg_pager == 0)
                        tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
        }

        pass = 0;
        warn_time = time_uptime;

        /*
         * The pageout daemon is never done, so loop forever.
         *
         * WARNING! This code is being executed by two kernel threads
         *          potentially simultaneously.
         */
        while (TRUE) {
                int error;
                long avail_shortage;
                long inactive_shortage;
                long vnodes_skipped = 0;
                long recycle_count = 0;
                long tmp;

                /*
                 * Don't let pass overflow.
                 */
                if (pass > 0x7FFF0000)
                        pass = 0x70000000;

                /*
                 * Wait for an action request.  If we time out, check to
                 * see if paging is needed (in case the normal wakeup
                 * code raced us).
                 */
                if (isep) {
                        /*
                         * The emergency pagedaemon monitors the primary
                         * pagedaemon while vm_pages_needed != 0.
                         *
                         * The emergency pagedaemon only runs if VM paging
                         * is needed and the primary pagedaemon has not
                         * updated vm_pagedaemon_uptime for more than 2
                         * seconds.
                         */
                        if (vm_pages_needed)
                                tsleep(&vm_pagedaemon_uptime, 0, "psleep", hz);
                        else
                                tsleep(&vm_pagedaemon_uptime, 0, "psleep", hz*10);
                        if (vm_pages_needed == 0) {
                                pass = 0;
                                continue;
                        }
                        if ((int)(time_uptime - vm_pagedaemon_uptime) < 2) {
                                pass = 0;
                                continue;
                        }
                } else {
                        /*
                         * Primary pagedaemon.
                         *
                         * Do an unconditional partial scan to deal with
                         * PQ_HOLD races and to maintain active stats on
                         * pages that are in PQ_ACTIVE.
                         */
                        vm_pageout_scan_hold(q3iterator & PQ_L2_MASK,
                                      &markers[q3iterator & PQ_L2_MASK].hold);
                        vm_pageout_page_stats(q3iterator & PQ_L2_MASK,
                                      &markers[q3iterator & PQ_L2_MASK].stat,
                                      scounter);
                        ++q3iterator;

                        /*
                         * Primary idle sleep loop, check the condition
                         * after the sleep.
                         *
                         * NOTE: State will not be IDLE if vm_pages_needed
                         *       is non-zero.
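                         *
                         * The sleep below times out every
                         * vm_pageout_stats_ticks ticks; if no paging is
                         * needed when it expires we simply loop and run
                         * another incremental hold/stats scan.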
                         */
                        if (vm_pages_needed == 0) {
                                error = tsleep(&vm_pages_needed,
                                               0, "psleep",
                                               vm_pageout_stats_ticks);
                                if (error &&
                                    vm_paging_start(0) == 0 &&
                                    vm_pages_needed == 0)
                                {
                                        continue;
                                }
                                vm_pagedaemon_uptime = time_uptime;
                                vm_pages_needed = 1;
                                state = PAGING_TARGET1;

                                /*
                                 * Wake the emergency pagedaemon up so it
                                 * can monitor us.  It will automatically
                                 * go back into a long sleep when
                                 * vm_pages_needed returns to 0.
                                 */
                                wakeup(&vm_pagedaemon_uptime);
                        }
                }

                mycpu->gd_cnt.v_pdwakeups++;

                /*
                 * Scan for INACTIVE->CLEAN/PAGEOUT
                 *
                 * This routine tries to avoid thrashing the system with
                 * unnecessary activity.
                 *
                 * Calculate our target for the number of free+cache pages
                 * we want to get to.  This is higher than the number that
                 * causes allocations to stall (severe) in order to provide
                 * hysteresis, and if we don't make it all the way but get
                 * to the minimum we're happy.  Goose it a bit if there are
                 * multiple requests for memory.
                 *
                 * Don't reduce avail_shortage inside the loop or the
                 * PQAVERAGE() calculation will break.
                 *
                 * NOTE! deficit is differentiated from avail_shortage as
                 *       REQUIRING at least (deficit) pages to be cleaned,
                 *       even if the page queues are in good shape.  This
                 *       is used primarily for handling per-process
                 *       RLIMIT_RSS and may also see small values when
                 *       processes block due to low memory.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_uptime = time_uptime;

                if (state == PAGING_TARGET1) {
                        avail_shortage = vm_paging_target1_count() +
                                         vm_pageout_deficit;
                } else {
                        avail_shortage = vm_paging_target2_count() +
                                         vm_pageout_deficit;
                }
                vm_pageout_deficit = 0;

                if (avail_shortage > 0) {
                        long delta = 0;
                        long counts[4] = { 0, 0, 0, 0 };
                        long use = avail_shortage;
                        int qq;

                        if (vm_pageout_debug) {
                                static time_t save_time3;
                                if (save_time3 != time_uptime) {
                                        save_time3 = time_uptime;
                                        kprintf("scan_inactive "
                                                "pass %d isep=%d\n",
                                                pass, isep);
                                }
                        }

                        /*
                         * Once target1 is achieved we move on to target2,
                         * but page out more lazily, in smaller batches.
                         */
                        if (state == PAGING_TARGET2 &&
                            use > vmstats.v_inactive_target / 10)
                        {
                                use = vmstats.v_inactive_target / 10 + 1;
                        }

                        qq = q1iterator;
                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                delta += vm_pageout_scan_inactive(
                                                pass / MAXSCAN_DIVIDER,
                                                qq & PQ_L2_MASK,
                                                PQAVERAGE(use),
                                                &vnodes_skipped, counts);
                                if (isep)
                                        --qq;
                                else
                                        ++qq;
                                if (avail_shortage - delta <= 0)
                                        break;

                                /*
                                 * It is possible for avail_shortage to be
                                 * very large.  If a large program exits or
                                 * frees a ton of memory all at once, we do
                                 * not have to continue deactivations.
                                 *
                                 * (We will still run the active->inactive
                                 * target, however).
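                                 *
                                 * Note that each per-queue call above is
                                 * asked to recover only PQAVERAGE(use)
                                 * pages, spreading the work across the
                                 * PQ_L2_SIZE inactive queues rather than
                                 * draining one queue at a time.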
                                 */
                                if (!vm_paging_target2() &&
                                    !vm_paging_min_dnc(vm_page_free_hysteresis)) {
                                        avail_shortage = 0;
                                        break;
                                }
                        }
                        if (vm_pageout_debug) {
                                static time_t save_time2;
                                if (save_time2 != time_uptime) {
                                        save_time2 = time_uptime;
                                        kprintf("flsh %ld cln %ld "
                                                "lru2 %ld react %ld "
                                                "delta %ld\n",
                                                counts[0], counts[1],
                                                counts[2], counts[3],
                                                delta);
                                }
                        }
                        avail_shortage -= delta;
                        q1iterator = qq;
                }

                /*
                 * Figure out how many active pages we must deactivate.  If
                 * we were able to reach our target with just the inactive
                 * scan above we limit the number of active pages we
                 * deactivate to reduce unnecessary work.
                 *
                 * When calculating inactive_shortage notice that we are
                 * departing from what vm_paging_inactive_count() does.
                 * During paging, the free + cache queues are assumed to
                 * be under stress, so only a pure inactive target is
                 * calculated without taking into account v_free_min,
                 * v_free_count, or v_cache_count.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_uptime = time_uptime;
                inactive_shortage = vmstats.v_inactive_target -
                                    vmstats.v_inactive_count;

                /*
                 * If we were unable to free sufficient inactive pages to
                 * satisfy the free/cache queue requirements then simply
                 * reaching the inactive target may not be good enough.
                 * Try to deactivate pages in excess of the target based
                 * on the shortfall.
                 *
                 * However, to prevent thrashing the VM system do not
                 * deactivate more than an additional 1/10 of the inactive
                 * target's worth of active pages.
                 */
                if (avail_shortage > 0) {
                        tmp = avail_shortage * 2;
                        if (tmp > vmstats.v_inactive_target / 10)
                                tmp = vmstats.v_inactive_target / 10;
                        inactive_shortage += tmp;
                }

                /*
                 * Only trigger a pmap cleanup on inactive shortage.
                 */
                if (isep == 0 && inactive_shortage > 0) {
                        pmap_collect();
                }

                /*
                 * Scan for ACTIVE->INACTIVE
                 *
                 * Only trigger on inactive shortage.  Triggering on
                 * avail_shortage can starve the active queue with
                 * unnecessary active->inactive transitions and destroy
                 * performance.
                 *
                 * If this is the emergency pager, always try to move
                 * a few pages from active to inactive because the inactive
                 * queue might have enough pages, but not enough anonymous
                 * pages.
                 */
                if (isep && inactive_shortage < vm_emerg_launder)
                        inactive_shortage = vm_emerg_launder;

                if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
                        long delta = 0;
                        int qq;

                        qq = q2iterator;
                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                delta += vm_pageout_scan_active(
                                                pass / MAXSCAN_DIVIDER,
                                                qq & PQ_L2_MASK,
                                                PQAVERAGE(avail_shortage),
                                                PQAVERAGE(inactive_shortage),
                                                &markers[qq & PQ_L2_MASK].pact,
                                                &recycle_count);
                                if (isep)
                                        --qq;
                                else
                                        ++qq;
                                if (inactive_shortage - delta <= 0 &&
                                    avail_shortage - delta <= 0) {
                                        break;
                                }

                                /*
                                 * inactive_shortage can be a very large
                                 * number.  This check is intended to break
                                 * out early if our inactive_target has been
                                 * reached due to other system activity.
                                 */
                                if (vmstats.v_inactive_count >
                                    vmstats.v_inactive_target)
                                {
                                        inactive_shortage = 0;
                                        break;
                                }
                        }
                        inactive_shortage -= delta;
                        avail_shortage -= delta;
                        q2iterator = qq;
                }

                /*
                 * Scan for CACHE->FREE
                 *
                 * Finally, free enough cache pages to meet our free page
                 * requirement and take more drastic measures if we are
                 * still in trouble.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_uptime = time_uptime;
                vm_pageout_scan_cache(avail_shortage, pass / MAXSCAN_DIVIDER,
                                      vnodes_skipped, recycle_count);

                /*
                 * This is a bit sophisticated because we do not necessarily
                 * want to force paging until our targets are reached if we
                 * were able to successfully retire the shortage we
                 * calculated.
                 */
                if (avail_shortage > 0) {
                        /*
                         * If we did not retire enough pages continue the
                         * pageout operation until we are able to.  It
                         * takes MAXSCAN_DIVIDER passes to cover the entire
                         * inactive list.
                         *
                         * We used to throw delays in here if paging went on
                         * continuously but that really just makes things
                         * worse.  Just keep going.
                         */
                        if (pass == 0)
                                warn_time = time_uptime;
                        ++pass;
                        if (isep == 0 && time_uptime - warn_time >= 60) {
                                kprintf("pagedaemon: WARNING! Continuous "
                                        "paging for %ld minutes\n",
                                        (time_uptime - warn_time) / 60);
                                warn_time = time_uptime;
                        }

                        if (vm_pages_needed) {
                                /*
                                 * Normal operation, additional processes
                                 * have already kicked us.  Retry immediately
                                 * unless swap space is completely full, in
                                 * which case delay a bit.
                                 */
                                if (swap_pager_full) {
                                        tsleep(&vm_pages_needed, 0, "pdelay",
                                               hz / 5);
                                } /* else immediate loop */
                        } /* else immediate loop */
                } else {
                        /*
                         * Reset pass.
                         */
                        pass = 0;

                        if (vm_paging_start(0) ||
                            vm_paging_min_dnc(vm_page_free_hysteresis))
                        {
                                /*
                                 * Free pages are sufficiently exhausted to
                                 * start the page-daemon in TARGET1 mode.
                                 */
                                state = PAGING_TARGET1;
                                vm_pages_needed = 2;

                                /*
                                 * We can wake up waiters if we are above
                                 * the wait point.
                                 */
                                if (!vm_paging_wait())
                                        wakeup(&vmstats.v_free_count);
                        } else if (vm_pages_needed) {
                                /*
                                 * Continue paging until TARGET2 is reached,
                                 * but waiters can be woken up.
                                 *
                                 * The PAGING_TARGET2 state tells the
                                 * pagedaemon to work a little less hard.
                                 */
                                if (vm_paging_target1()) {
                                        state = PAGING_TARGET1;
                                        vm_pages_needed = 2;
                                } else if (vm_paging_target2()) {
                                        state = PAGING_TARGET2;
                                        vm_pages_needed = 2;
                                } else {
                                        vm_pages_needed = 0;
                                }
                                wakeup(&vmstats.v_free_count);
                        } /* else nothing to do here */
                }
        }
}

static struct kproc_desc pg1_kp = {
        "pagedaemon",
        vm_pageout_thread,
        &pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

static struct kproc_desc pg2_kp = {
        "emergpager",
        vm_pageout_thread,
        &emergpager
};
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);

/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
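 *
 * While vm_pages_needed is still 0 or 1 the caller issues the wakeup
 * itself; once it has been raised beyond that, callers merely bump the
 * counter (and only while the system is below the paging minimum), so
 * the pagedaemon is not bombarded with redundant wakeups.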
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active, bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
        if (vm_paging_start(0) && curthread != pagethread) {
                if (vm_pages_needed <= 1) {
                        vm_pages_needed = 1;            /* SMP race ok */
                        wakeup(&vm_pages_needed);       /* tickle pageout */
                } else if (vm_paging_min()) {
                        ++vm_pages_needed;              /* SMP race ok */
                        /* a wakeup() would be wasted here */
                }
        }
}

#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
 */
static void
vm_req_vmdaemon(void)
{
        static int lastrun = 0;

        if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
                wakeup(&vm_daemon_needed);
                lastrun = ticks;
        }
}

static int vm_daemon_callback(struct proc *p, void *data __unused);

/*
 * No requirements.
 *
 * Scan processes for exceeding their rlimits and deactivate pages
 * when RSS is exceeded.
 */
static void
vm_daemon(void)
{
        while (TRUE) {
                tsleep(&vm_daemon_needed, 0, "psleep", 0);
                allproc_scan(vm_daemon_callback, NULL, 0);
        }
}

static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
        struct vmspace *vm;
        vm_pindex_t limit, size;

        /*
         * If this is a system process or if we have already
         * looked at this process, skip it.
         */
        lwkt_gettoken(&p->p_token);

        if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
                lwkt_reltoken(&p->p_token);
                return (0);
        }

        /*
         * If the process is in a non-running type state, don't touch it.
         */
        if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
                lwkt_reltoken(&p->p_token);
                return (0);
        }

        /*
         * Get a limit.
         */
        limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
                                p->p_rlimit[RLIMIT_RSS].rlim_max));

        vm = p->p_vmspace;
        vmspace_hold(vm);
        size = pmap_resident_tlnw_count(&vm->vm_pmap);
        if (limit >= 0 && size > 4096 &&
            size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
                vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
        }
        vmspace_drop(vm);

        lwkt_reltoken(&p->p_token);

        return (0);
}

#endif