/*
 * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * The proverbial page-out daemon, rewritten many times over the decades.
 */
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/vmmeter.h>
#include <sys/conf.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/*
 * Persistent markers held by pageout daemon (array)
 */
struct markers {
	struct vm_page	hold;
	struct vm_page	stat;
	struct vm_page	pact;
};

/*
 * System initialization
 */

/* the kernel process "vm_pageout" */
static int vm_pageout_page(vm_page_t m, long *max_launderp,
			   long *vnodes_skippedp, struct vnode **vpfailedp,
			   int pass, int vmflush_flags, long *counts);
static int vm_pageout_clean_helper (vm_page_t, int);
static void vm_pageout_free_page_calc (vm_size_t count);
static void vm_pageout_page_free(vm_page_t m);
__read_frequently struct thread *emergpager;
__read_frequently struct thread *pagethread;
static int sequence_emerg_pager;

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon" */
static void vm_daemon (void);
static struct thread *vmthread;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmthread
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif

__read_mostly int vm_pages_needed = 0;	/* pageout daemon tsleep event */
__read_mostly int vm_pageout_deficit = 0; /* Estimated number of pages deficit */
__read_mostly int vm_pageout_pages_needed = 0; /* pageout daemon needs pages */
__read_mostly int vm_page_free_hysteresis = 16;
__read_mostly static time_t vm_pagedaemon_uptime;

#if !defined(NO_SWAPPING)
static int vm_daemon_needed;
#endif
__read_mostly static int vm_queue_idle_perc = 20;
__read_mostly static int vm_max_launder = 0;
__read_mostly static int vm_emerg_launder = 100;
__read_mostly static int vm_pageout_stats_actcmp = 0;
__read_mostly static int vm_pageout_stats_inamin = 16;
__read_mostly static int vm_pageout_stats_inalim = 4096;
__read_mostly static int vm_pageout_stats_scan = 0;
__read_mostly static int vm_pageout_stats_ticks = 0;
__read_mostly static int vm_pageout_algorithm = 0;
__read_mostly static int defer_swap_pageouts = 0;
__read_mostly static int disable_swap_pageouts = 0;
__read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
__read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
__read_mostly static int vm_pageout_debug;
__read_mostly static long vm_pageout_stats_rsecs = 300;

#if defined(NO_SWAPPING)
__read_mostly static int vm_swap_enabled = 0;
#else
__read_mostly static int vm_swap_enabled = 1;
#endif

/* 0=disable, 1=passive, 2=active swp, 3=active swp + single-queue dirty pages */
__read_mostly int vm_pageout_memuse_mode = 2;
__read_mostly int vm_pageout_allow_active = 1;

SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
	CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");
SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
	CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");

SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
	CTLFLAG_RW, &vm_page_free_hysteresis, 0,
	"Free more pages than the minimum required");

SYSCTL_INT(_vm, OID_AUTO, queue_idle_perc,
	CTLFLAG_RW, &vm_queue_idle_perc, 0, "page stats stop point, percent");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
	CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_actcmp,
	CTLFLAG_RW, &vm_pageout_stats_actcmp, 0,
	"Current dynamic act_count comparator");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_inamin,
	CTLFLAG_RW, &vm_pageout_stats_inamin, 0,
	"min out of lim tests must match");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_inalim,
	CTLFLAG_RW, &vm_pageout_stats_inalim, 0,
	"min out of lim tests must match");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_ticks,
	CTLFLAG_RW, &vm_pageout_stats_ticks, 0,
	"Interval for partial stats scan");
SYSCTL_INT(_vm, OID_AUTO, pageout_stats_scan,
	CTLFLAG_RW, &vm_pageout_stats_scan, 0,
	"hold/ACT scan count per interval");
SYSCTL_LONG(_vm, OID_AUTO, pageout_stats_rsecs,
	CTLFLAG_RW, &vm_pageout_stats_rsecs, 0,
	"Continue stats scan this many seconds after pageout daemon activity");

SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
	CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active,
	CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active");
SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
	CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */

static MALLOC_DEFINE(M_PAGEOUT, "pageout", "Pageout structures");

#if !defined(NO_SWAPPING)
static void vm_req_vmdaemon (void);
#endif

#define MAXSCAN_DIVIDER		10

#define VM_CACHE_SCAN_MIN	16
#define VM_CACHE_SCAN_NOM	(VM_CACHE_SCAN_MIN * 4)
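/*
 * Illustrative only: the knobs above are runtime-tunable via sysctl(8).
 * For example, from a root shell (values are hypothetical examples, not
 * recommendations):
 *
 *	sysctl vm.max_launder=4096	(allow more dirty flushes per pass)
 *	sysctl vm.pageout_memuse_mode=2	(enforce RSS limits with pageouts)
 *	sysctl vm.pageout_debug=1	(log pageout page activity)
 */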
/*
 * Calculate approximately how many pages on each queue to try to
 * clean.  An exact calculation creates an edge condition when the
 * queues are unbalanced so add significant slop.  The queue scans
 * will stop early when targets are reached and will start where they
 * left off on the next pass.
 *
 * We need to be generous here because there are all sorts of loading
 * conditions that can cause edge cases if we try to average over all
 * queues.  In particular, storage subsystems have become so fast that
 * paging activity can become quite frantic.  Eventually we will probably
 * need two paging threads, one for dirty pages and one for clean, to deal
 * with the bandwidth requirements.
 *
 * So what we do is calculate a value that can be satisfied nominally by
 * only having to scan half the queues.
 */
static __inline long
PQAVERAGE(long n)
{
	long avg;

	if (n >= 0) {
		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
	} else {
		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
	}
	return avg;
}
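/*
 * Worked example (illustrative; PQ_L2_SIZE is build-dependent, 512 is
 * assumed here): for a shortage of n = 1000 pages,
 *
 *	avg = (1000 + 511) / 256 + 1 = 5 + 1 = 6
 *
 * so each of the 512 sub-queues is asked for ~6 pages.  All sub-queues
 * together could yield ~3072 pages, roughly 3n, which means the target
 * is nominally met after scanning only about half of the queues.
 */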
/*
 * vm_pageout_clean_helper:
 *
 * Clean the page and remove it from the laundry.  The page must be busied
 * by the caller and will be disposed of (put away, flushed) by this routine.
 */
static int
vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
{
	vm_object_t object;
	vm_page_t mc[BLIST_MAX_ALLOC];
	int error;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	object = m->object;

	/*
	 * Don't mess with the page if it's held or special.  Theoretically
	 * we can pageout held pages but there is no real need to press our
	 * luck, so don't.
	 */
	if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * Place page in cluster.  Align cluster for optimal swap space
	 * allocation (whether it is swap or not).  This is typically ~16-32
	 * pages, which also tends to align the cluster to multiples of the
	 * filesystem block size if backed by a filesystem.
	 */
	page_base = pindex % BLIST_MAX_ALLOC;
	mc[page_base] = m;
	ib = page_base - 1;
	is = page_base + 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 */
	vm_object_hold(object);

	while (ib >= 0) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if ((p->queue - p->pc) == PQ_CACHE ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[ib] = p;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pindex - page_base + is < object->size) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[is] = p;
		++is;
	}

	vm_object_drop(object);

	/*
	 * We allow reads during pageouts...
	 */
	return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
}
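/*
 * Clustering example (illustrative; BLIST_MAX_ALLOC is assumed to be 32
 * here): for pindex 70, page_base = 70 % 32 = 6, so mc[6] = m and the
 * base of the cluster is pindex 64.  The backward scan probes pindex
 * 69..64 (slots 5..0) and the forward scan probes pindex 71..95
 * (slots 7..31), keeping the flush aligned to a 32-page swap
 * allocation boundary.
 */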
/*
 * vm_pageout_flush() - launder the given pages
 *
 * The given pages are laundered.  Note that we setup for the start of
 * I/O (i.e. busy the page), mark it read-only, and bump the object
 * reference count all in here rather than in the parent.  If we want
 * the parent to do more sophisticated things we may have to change
 * the ordering.
 *
 * The pages in the array must be busied by the caller and will be
 * unbusied by this function.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
{
	vm_object_t object;
	int pageout_status[count];
	int numpagedout = 0;
	int i;

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
			("vm_pageout_flush page %p index %d/%d: partially "
			 "invalid page", mc[i], i, count));
		vm_page_io_start(mc[i]);
	}

	/*
	 * We must make the pages read-only.  This will also force the
	 * modified bit in the related pmaps to be cleared.  The pager
	 * cannot clear the bit for us since the I/O completion code
	 * typically runs from an interrupt.  The act of making the page
	 * read-only handles the case for us.
	 *
	 * Then we can unbusy the pages, we still hold a reference by virtue
	 * of our soft-busy.
	 */
	for (i = 0; i < count; i++) {
		if (vmflush_flags & OBJPC_TRY_TO_CACHE)
			vm_page_protect(mc[i], VM_PROT_NONE);
		else
			vm_page_protect(mc[i], VM_PROT_READ);
		vm_page_wakeup(mc[i]);
	}

	object = mc[0]->object;
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count,
			   (vmflush_flags |
			    ((object == kernel_object) ? OBJPC_SYNC : 0)),
			   pageout_status);

	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			numpagedout++;
			break;
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object.  Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			vm_page_busy_wait(mt, FALSE, "pgbad");
			pmap_clear_modify(mt);
			vm_page_undirty(mt);
			vm_page_wakeup(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * A page typically cannot be paged out when we
			 * have run out of swap.  We leave the page
			 * marked inactive and will try to page it out
			 * again later.
			 *
			 * Starvation of the active page list is used to
			 * determine when the system is massively memory
			 * starved.
			 */
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If not PENDing this was a synchronous operation and we
		 * clean up after the I/O.  If it is PENDing the mess is
		 * cleaned up asynchronously.
		 *
		 * Also nominally act on the caller's wishes if the caller
		 * wants to try to really clean (cache or free) the page.
		 *
		 * Also nominally deactivate the page if the system is
		 * memory-stressed.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_page_busy_wait(mt, FALSE, "pgouw");
			vm_page_io_finish(mt);
			if (vmflush_flags & OBJPC_TRY_TO_CACHE) {
				vm_page_try_to_cache(mt);
			} else if (vm_paging_severe()) {
				vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vm_page_wakeup(mt);
			}
			vm_object_pip_wakeup(object);
		}
	}
	return numpagedout;
}
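/*
 * Minimal usage sketch (illustrative, not a real call site): the caller
 * busies a contiguous run of fully-valid pages and hands them to
 * vm_pageout_flush(), which unbusies them and returns the number that
 * were queued for I/O:
 *
 *	vm_page_t mc[2] = { p0, p1 };	(both busied, fully valid)
 *	int n = vm_pageout_flush(mc, 2, OBJPC_TRY_TO_CACHE);
 *
 * vm_pageout_clean_helper() above shows the real call pattern.
 */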
#if !defined(NO_SWAPPING)

/*
 * Callback function, page busied for us.  We must dispose of the busy
 * condition.  Any related pmap pages may be held but will not be locked.
 */
static
int
vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
			vm_page_t p)
{
	int actcount;
	int cleanit = 0;

	/*
	 * Basic tests - There should never be a marker, and we can stop
	 *		 once the RSS is below the required level.
	 */
	KKASSERT((p->flags & PG_MARKER) == 0);
	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
		vm_page_wakeup(p);
		return(-1);
	}

	mycpu->gd_cnt.v_pdpages++;

	if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
		vm_page_wakeup(p);
		goto done;
	}

	++info->actioncount;

	/*
	 * Check if the page has been referenced recently.  If it has,
	 * activate it and skip.
	 */
	actcount = pmap_ts_referenced(p);
	if (actcount) {
		vm_page_flag_set(p, PG_REFERENCED);
	} else if (p->flags & PG_REFERENCED) {
		actcount = 1;
	}

	if (actcount) {
		if (p->queue - p->pc != PQ_ACTIVE) {
			vm_page_and_queue_spin_lock(p);
			if (p->queue - p->pc != PQ_ACTIVE) {
				vm_page_and_queue_spin_unlock(p);
				vm_page_activate(p);
			} else {
				vm_page_and_queue_spin_unlock(p);
			}
		} else {
			p->act_count += actcount;
			if (p->act_count > ACT_MAX)
				p->act_count = ACT_MAX;
		}
		vm_page_flag_clear(p, PG_REFERENCED);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Remove the page from this particular pmap.  Once we do this, our
	 * pmap scans will not see it again (unless it gets faulted in), so
	 * we must actively dispose of or deal with the page.
	 */
	pmap_remove_specific(info->pmap, p);

	/*
	 * If the page is not mapped to another process (i.e. as would be
	 * typical if this were a shared page from a library) then deactivate
	 * the page and clean it in two passes only.
	 *
	 * If the page hasn't been referenced since the last check, remove it
	 * from the pmap.  If it is no longer mapped, deactivate it
	 * immediately, accelerating the normal decline.
	 *
	 * Once the page has been removed from the pmap the RSS code no
	 * longer tracks it so we have to make sure that it is staged for
	 * potential flush action.
	 *
	 * XXX
	 */
	if ((p->flags & PG_MAPPED) == 0 ||
	    (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
		if (p->queue - p->pc == PQ_ACTIVE) {
			vm_page_deactivate(p);
		}
		if (p->queue - p->pc == PQ_INACTIVE) {
			cleanit = 1;
		}
	}

	/*
	 * Ok, try to fully clean the page and any nearby pages such that at
	 * least the requested page is freed or moved to the cache queue.
	 *
	 * We usually do this synchronously to allow us to get the page into
	 * the CACHE queue quickly, which will prevent memory exhaustion if
	 * a process with a memoryuse limit is running away.  However, the
	 * sysadmin may desire to set vm.swap_user_async which relaxes this
	 * and improves write performance.
	 */
	if (cleanit) {
		long max_launder = 0x7FFF;
		long vnodes_skipped = 0;
		long counts[4] = { 0, 0, 0, 0 };
		int vmflush_flags;
		struct vnode *vpfailed = NULL;

		info->offset = va;

		if (vm_pageout_memuse_mode >= 2) {
			vmflush_flags = OBJPC_TRY_TO_CACHE |
					OBJPC_ALLOW_ACTIVE;
			if (swap_user_async == 0)
				vmflush_flags |= OBJPC_SYNC;
			vm_page_flag_set(p, PG_WINATCFLS);
			info->cleancount +=
				vm_pageout_page(p, &max_launder,
						&vnodes_skipped,
						&vpfailed, 1, vmflush_flags,
						counts);
		} else {
			vm_page_wakeup(p);
			++info->cleancount;
		}
	} else {
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races.
	 */
done:
	lwkt_user_yield();
	return 0;
}

/*
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits;
 * doing this fairly is relatively difficult.  We try to keep track of
 * where we left off last time to reduce scan overhead.
 *
 * Called when vm_pageout_memuse_mode is >= 1.
 */
void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
{
	vm_offset_t pgout_offset;
	struct pmap_pgscan_info info;
	int retries = 3;

	pgout_offset = map->pgout_offset;
again:
#if 0
	kprintf("%016jx ", pgout_offset);
#endif
	if (pgout_offset < VM_MIN_USER_ADDRESS)
		pgout_offset = VM_MIN_USER_ADDRESS;
	if (pgout_offset >= VM_MAX_USER_ADDRESS)
		pgout_offset = 0;
	info.pmap = vm_map_pmap(map);
	info.limit = limit;
	info.beg_addr = pgout_offset;
	info.end_addr = VM_MAX_USER_ADDRESS;
	info.callback = vm_pageout_mdp_callback;
	info.cleancount = 0;
	info.actioncount = 0;
	info.busycount = 0;

	pmap_pgscan(&info);
	pgout_offset = info.offset;
#if 0
	kprintf("%016jx %08lx %08lx\n", pgout_offset,
		info.cleancount, info.actioncount);
#endif

	if (pgout_offset != VM_MAX_USER_ADDRESS &&
	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		goto again;
	} else if (retries &&
		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		--retries;
		goto again;
	}
	map->pgout_offset = pgout_offset;
}
#endif
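/*
 * Illustrative flow (numbers hypothetical): with RLIMIT_RSS set to 64MB
 * on a 4KB-page system, limit is 16384 pages.  When
 * pmap_resident_tlnw_count() exceeds that, a caller such as vm_daemon
 * invokes vm_pageout_map_deactivate_pages(map, 16384), which pgscans
 * from the saved map->pgout_offset until the RSS drops below the limit
 * or the retries are exhausted.
 */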
/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
	struct proc *bigproc;
	vm_offset_t bigsize;
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

/*
 * Scan inactive queue for pages we can cache or free.
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
			 long *vnodes_skipped, long *counts)
{
	vm_page_t m;
	struct vm_page marker;
	struct vnode *vpfailed;		/* warning, allowed to be stale */
	long maxscan;
	long delta = 0;
	long max_launder;
	int isep;
	int vmflush_flags;

	isep = (curthread == emergpager);

	/*
	 * This routine is called for each of PQ_L2_SIZE inactive queues.
	 * We want the vm_max_launder parameter to apply to the whole
	 * queue (i.e. per-whole-queue pass, not per-sub-queue).
	 *
	 * In each successive full-pass when the page target is not met we
	 * allow the per-queue max_launder to increase up to a maximum of
	 * vm_max_launder / 16.
	 */
	max_launder = (long)vm_max_launder / PQ_L2_SIZE;
	if (pass)
		max_launder *= 2;
	max_launder = (max_launder + MAXSCAN_DIVIDER - 1) / MAXSCAN_DIVIDER;

	if (max_launder <= 1)
		max_launder = 1;
	if (max_launder >= vm_max_launder / 16)
		max_launder = vm_max_launder / 16 + 1;

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	  PAGES.
	 */

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_INACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	/*
	 * Inactive queue scan.
	 *
	 * We pick off approximately 1/10 of each queue.  Each queue is
	 * effectively organized LRU so scanning the entire queue would
	 * improperly pick up pages that might still be in regular use.
	 *
	 * NOTE: The vm_page must be spinlocked before the queue to avoid
	 *	 deadlocks, so it is easiest to simply iterate the loop
	 *	 with the queue unlocked at the top.
	 */
	vpfailed = NULL;

	vm_page_queues_spin_lock(PQ_INACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	maxscan = (vm_page_queues[PQ_INACTIVE + q].lcnt + MAXSCAN_DIVIDER - 1) /
		  MAXSCAN_DIVIDER + 1;
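	/*
	 * Example (illustrative): with ~1,000,000 pages on this inactive
	 * sub-queue and MAXSCAN_DIVIDER 10, maxscan computes to ~100,001,
	 * so a single call visits at most ~10% of the queue; any remaining
	 * shortage is handled on subsequent passes.
	 */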
	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && avail_shortage - delta > 0)
	{
		int count;

		KKASSERT(m->queue == PQ_INACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
				   &marker, pageq);
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_INACTIVE + q);
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);

		/*
		 * The emergency pager runs when the primary pager gets
		 * stuck, which typically means the primary pager deadlocked
		 * on a vnode-backed page.  Therefore, the emergency pager
		 * must skip any complex objects.
		 *
		 * We disallow VNODEs unless they are VCHR whose device ops
		 * do not set D_NOEMERGPGR.
		 */
		if (isep && m->object) {
			struct vnode *vp;

			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				/*
				 * Allow anonymous memory and assume that
				 * swap devices are not complex, since it's
				 * kinda worthless if we can't swap out dirty
				 * anonymous pages.
				 */
				break;
			case OBJT_VNODE:
				/*
				 * Allow VCHR device if the D_NOEMERGPGR
				 * flag is not set, deny other vnode types
				 * as being too complex.
				 */
				vp = m->object->handle;
				if (vp && vp->v_type == VCHR &&
				    vp->v_rdev && vp->v_rdev->si_ops &&
				    (vp->v_rdev->si_ops->head.flags &
				     D_NOEMERGPGR) == 0) {
					break;
				}
				/* Deny - fall through */
			default:
				/*
				 * Deny
				 */
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(PQ_INACTIVE + q);
				lwkt_yield();
				continue;
			}
		}

		/*
		 * Try to pageout the page and perhaps other nearby pages.
		 * We want to get the pages into the cache eventually (first
		 * or second pass).  Otherwise the pages can wind up just
		 * cycling in the inactive queue, getting flushed over
		 * and over again.
		 *
		 * Generally speaking we recycle dirty pages within PQ_INACTIVE
		 * twice (double LRU) before paging them out.  If the
		 * memuse_mode is >= 3 we run them single-LRU like we do clean
		 * pages.
		 */
		if (vm_pageout_memuse_mode >= 3)
			vm_page_flag_set(m, PG_WINATCFLS);

		vmflush_flags = 0;
		if (vm_pageout_allow_active)
			vmflush_flags |= OBJPC_ALLOW_ACTIVE;
		if (m->flags & PG_WINATCFLS)
			vmflush_flags |= OBJPC_TRY_TO_CACHE;
		count = vm_pageout_page(m, &max_launder, vnodes_skipped,
					&vpfailed, pass, vmflush_flags, counts);
		delta += count;
		/*
		 * Systems with a ton of memory can wind up with huge
		 * deactivation counts.  Because the inactive scan is
		 * doing a lot of flushing, the combination can result
		 * in excessive paging even in situations where other
		 * unrelated threads free up sufficient VM.
		 *
		 * To deal with this we abort the nominal active->inactive
		 * scan before we hit the inactive target when free+cache
		 * levels have reached a reasonable target.
		 *
		 * When deciding to stop early we need to add some slop to
		 * the test and we need to return full completion to the caller
		 * to prevent the caller from thinking there is something
		 * wrong and issuing a low-memory+swap warning or pkill.
		 *
		 * A deficit forces paging regardless of the state of the
		 * VM page queues (used for RSS enforcement).
		 */
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_INACTIVE + q);

		/* if (vm_paging_target() < -vm_max_launder) */
		if (!vm_paging_target2()) {
			/*
			 * Stopping early, return full completion to caller.
			 */
			if (delta < avail_shortage)
				delta = avail_shortage;
			break;
		}
	}

	/* page queue still spin-locked */
	TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_INACTIVE + q);

	return (delta);
}
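/*
 * Disposition summary for vm_pageout_page() below (paraphrased from the
 * code; see the function body for the authoritative tests):
 *
 *	wired			-> unqueue, leave alone
 *	held			-> requeue to tail, skip
 *	referenced		-> reactivate, counts[3]
 *	invalid			-> free, counts[1]
 *	clean			-> move to PQ_CACHE, counts[1]
 *	dirty, first pass	-> set PG_WINATCFLS, requeue (double-LRU),
 *				   counts[2]
 *	dirty, seen again	-> launder via vm_pageout_clean_helper(),
 *				   counts[0]
 */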
/*
 * Pageout the specified page, return the total number of pages paged out
 * (this routine may cluster).
 *
 * The page must be busied and soft-busied by the caller and will be disposed
 * of by this function.
 */
static int
vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
		struct vnode **vpfailedp, int pass, int vmflush_flags,
		long *counts)
{
	vm_object_t object;
	int actcount;
	int count = 0;

	/*
	 * Wiring no longer removes a page from its queue.  The last unwiring
	 * will requeue the page.  Obviously wired pages cannot be paged out
	 * so unqueue it and return.
	 */
	if (m->wire_count) {
		vm_page_unqueue_nowakeup(m);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * A held page may be undergoing I/O, so skip it.
	 */
	if (m->hold_count) {
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
		return 0;
	}

	if (m->object == NULL || m->object->ref_count == 0) {
		/*
		 * If the object is not being used, we ignore previous
		 * references.
		 */
		vm_page_flag_clear(m, PG_REFERENCED);
		pmap_clear_reference(m);
		/* fall through to end */
	} else if (((m->flags & PG_REFERENCED) == 0) &&
		   (actcount = pmap_ts_referenced(m))) {
		/*
		 * Otherwise, if the page has been referenced while
		 * in the inactive queue, we bump the "activation
		 * count" upwards, making it less likely that the
		 * page will be added back to the inactive queue
		 * prematurely again.  Here we check the page tables
		 * (or emulated bits, if any), since the upper level
		 * VM system does not know anything about existing
		 * references.
		 */
		++counts[3];
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * (m) is still busied.
	 *
	 * If the upper level VM system knows about any page
	 * references, we activate the page.  We also set the
	 * "activation count" higher than normal so that we will be
	 * less likely to place pages back onto the inactive queue again.
	 */
	if ((m->flags & PG_REFERENCED) != 0) {
		vm_page_flag_clear(m, PG_REFERENCED);
		actcount = pmap_ts_referenced(m);
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE + 1);
		vm_page_wakeup(m);
		++counts[3];
		return 0;
	}

	/*
	 * If the upper level VM system doesn't know anything about
	 * the page being dirty, we have to check for it again.  As
	 * far as the VM code knows, any partially dirty pages are
	 * fully dirty.
	 *
	 * Pages marked PG_WRITEABLE may be mapped into the user
	 * address space of a process running on another cpu.  A
	 * user process (without holding the MP lock) running on
	 * another cpu may be able to touch the page while we are
	 * trying to remove it.  vm_page_cache() will handle this
	 * case for us.
	 */
	if (m->dirty == 0) {
		vm_page_test_dirty(m);
	} else {
		vm_page_dirty(m);
	}

	if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Invalid pages can be easily freed
		 */
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
		++count;
		++counts[1];
	} else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Clean pages can be placed onto the cache queue.
		 * This effectively frees them.
		 */
		vm_page_cache(m);
		++count;
		++counts[1];
	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
		/*
		 * Dirty pages need to be paged out, but flushing
		 * a page is extremely expensive versus freeing
		 * a clean page.  Rather than artificially limiting
		 * the number of pages we can flush, we instead give
		 * dirty pages extra priority on the inactive queue
		 * by forcing them to be cycled through the queue
		 * twice before being flushed, after which the
		 * (now clean) page will cycle through once more
		 * before being freed.  This significantly extends
		 * the thrash point for a heavily loaded machine.
		 */
		++counts[2];
		vm_page_flag_set(m, PG_WINATCFLS);
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
	} else if (*max_launderp > 0) {
		/*
		 * We always want to try to flush some dirty pages if
		 * we encounter them, to keep the system stable.
		 * Normally this number is small, but under extreme
		 * pressure where there are insufficient clean pages
		 * on the inactive queue, we may have to go all out.
		 */
		int swap_pageouts_ok;
		struct vnode *vp = NULL;

		if ((m->flags & PG_WINATCFLS) == 0)
			vm_page_flag_set(m, PG_WINATCFLS);
		swap_pageouts_ok = 0;
		object = m->object;
		if (object &&
		    (object->type != OBJT_SWAP) &&
		    (object->type != OBJT_DEFAULT)) {
			swap_pageouts_ok = 1;
		} else {
			swap_pageouts_ok = !(defer_swap_pageouts ||
					     disable_swap_pageouts);
			swap_pageouts_ok |= (!disable_swap_pageouts &&
					     defer_swap_pageouts &&
					     vm_paging_min());
		}

		/*
		 * We don't bother paging objects that are "dead".
		 * Those objects are in a "rundown" state.
		 */
		if (!swap_pageouts_ok ||
		    (object == NULL) ||
		    (object->flags & OBJ_DEAD)) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_INACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[m->queue].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[m->queue].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			return 0;
		}

		/*
		 * (m) is still busied.
		 *
		 * The object is already known NOT to be dead.  It
		 * is possible for the vget() to block the whole
		 * pageout daemon, but the new low-memory handling
		 * code should prevent it.
		 *
		 * The previous code skipped locked vnodes and, worse,
		 * reordered pages in the queue.  This results in
		 * completely non-deterministic operation because,
		 * quite often, a vm_fault has initiated an I/O and
		 * is holding a locked vnode at just the point where
		 * the pageout daemon is woken up.
		 *
		 * We can't wait forever for the vnode lock, we might
		 * deadlock due to a vn_read() getting stuck in
		 * vm_wait while holding this vnode.  We skip the
		 * vnode if we can't get it in a reasonable amount
		 * of time.
		 *
		 * vpfailed is used to (try to) avoid the case where
		 * a large number of pages are associated with a
		 * locked vnode, which could cause the pageout daemon
		 * to stall for an excessive amount of time.
		 */
		if (object->type == OBJT_VNODE) {
			int flags;

			vp = object->handle;
			flags = LK_EXCLUSIVE;
			if (vp == *vpfailedp)
				flags |= LK_NOWAIT;
			else
				flags |= LK_TIMELOCK;
			vm_page_hold(m);
			vm_page_wakeup(m);

			/*
			 * We have unbusied (m) temporarily so we can
			 * acquire the vp lock without deadlocking.
			 * (m) is held to prevent destruction.
			 */
			if (vget(vp, flags) != 0) {
				*vpfailedp = vp;
				++pageout_lock_miss;
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page might have been moved to another
			 * queue during potential blocking in vget()
			 * above.  The page might have been freed and
			 * reused for another vnode.  The object might
			 * have been reused for another vnode.
			 */
			if (m->queue - m->pc != PQ_INACTIVE ||
			    m->object != object ||
			    object->handle != vp) {
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page may have been busied during the
			 * blocking in vget().  We don't move the
			 * page back onto the end of the queue;
			 * statistics are more correct if we don't.
			 */
			if (vm_page_busy_try(m, TRUE)) {
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}
			vm_page_unhold(m);

			/*
			 * If the page was wired while we didn't own it,
			 * just unqueue it and bail.
			 */
			if (m->wire_count) {
				vm_page_unqueue_nowakeup(m);
				vput(vp);
				vm_page_wakeup(m);
				return 0;
			}

			/*
			 * (m) is busied again
			 *
			 * We own the busy bit and remove our hold
			 * bit.  If the page is still held it
			 * might be undergoing I/O, so skip it.
			 */
			if (m->hold_count) {
rebusy_failed:
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_INACTIVE) {
					TAILQ_REMOVE(
					    &vm_page_queues[m->queue].pl,
					    m, pageq);
					TAILQ_INSERT_TAIL(
					    &vm_page_queues[m->queue].pl,
					    m, pageq);
				}
				vm_page_and_queue_spin_unlock(m);
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_wakeup(m);
				vput(vp);
				return 0;
			}

			/*
			 * Recheck queue, object, and vp now that we have
			 * rebusied the page.
			 */
			if (m->queue - m->pc != PQ_INACTIVE ||
			    m->object != object ||
			    object->handle != vp) {
				kprintf("vm_pageout_page: "
					"rebusy %p failed(A)\n",
					m);
				goto rebusy_failed;
			}

			/*
			 * Check page validity
			 */
			if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
				kprintf("vm_pageout_page: "
					"rebusy %p failed(B)\n",
					m);
				goto rebusy_failed;
			}
			if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
				kprintf("vm_pageout_page: "
					"rebusy %p failed(C)\n",
					m);
				goto rebusy_failed;
			}

			/* (m) is left busied as we fall through */
		}

		/*
		 * Page is busy and not held here.
		 *
		 * If a page is dirty, then it is either being washed
		 * (but not yet cleaned) or it is still in the
		 * laundry.  If it is still in the laundry, then we
		 * start the cleaning operation.
		 *
		 * Decrement inactive_shortage on success to account
		 * for the (future) cleaned page.  Otherwise we
		 * could wind up laundering or cleaning too many
		 * pages.
		 *
		 * NOTE: Cleaning the page here does not cause
		 *	 force_deficit to be adjusted, because the
		 *	 page is not being freed or moved to the
		 *	 cache.
		 */
		count = vm_pageout_clean_helper(m, vmflush_flags);
		counts[0] += count;
		*max_launderp -= count;

		/*
		 * Clean ate busy, page no longer accessible
		 */
		if (vp != NULL)
			vput(vp);
	} else {
		vm_page_wakeup(m);
	}
	return count;
}
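/*
 * Vnode locking strategy used above (descriptive recap): the first time
 * a vnode blocks the pageout path it is remembered in *vpfailedp;
 * subsequent pages belonging to that same vnode are attempted with
 * LK_NOWAIT instead of LK_TIMELOCK, so one stuck vnode with many dirty
 * pages cannot stall the daemon over and over again.
 */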
/*
 * Scan active queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_active(int pass, int q,
		       long avail_shortage, long inactive_shortage,
		       struct vm_page *marker,
		       long *recycle_countp)
{
	vm_page_t m;
	int actcount;
	long delta = 0;
	long maxscan;
	int isep;

	isep = (curthread == emergpager);

	/*
	 * We want to move pages from the active queue to the inactive
	 * queue to get the inactive queue to the inactive target.  If
	 * we still have a page shortage from above we try to directly free
	 * clean pages instead of moving them.
	 *
	 * If we do still have a shortage we keep track of the number of
	 * pages we free or cache (recycle_count) as a measure of thrashing
	 * between the active and inactive queues.
	 *
	 * If we were able to completely satisfy the free+cache targets
	 * from the inactive pool we limit the number of pages we move
	 * from the active pool to the inactive pool to 2x the pages we
	 * had removed from the inactive pool (with a minimum of 1/5 the
	 * inactive target).  If we were not able to completely satisfy
	 * the free+cache targets we go for the whole target aggressively.
	 *
	 * NOTE: Both variables can end up negative.
	 * NOTE: We are still in a critical section.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	  PAGES.
	 */

	vm_page_queues_spin_lock(PQ_ACTIVE + q);
	maxscan = (vm_page_queues[PQ_ACTIVE + q].lcnt + MAXSCAN_DIVIDER - 1) /
		  MAXSCAN_DIVIDER + 1;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
	       maxscan-- > 0 && (avail_shortage - delta > 0 ||
				 inactive_shortage > 0))
	{
		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
			     marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
				   marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_ACTIVE + q);
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);

#if 0
		/*
		 * Don't deactivate pages that are held, even if we can
		 * busy them.  (XXX why not?)
		 */
		if (m->hold_count) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}
#endif
		/*
		 * We can just remove wired pages from the queue
		 */
		if (m->wire_count) {
			vm_page_unqueue_nowakeup(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * The emergency pager ignores vnode-backed pages as these
		 * are the pages that probably bricked the main pager.
		 */
		if (isep && m->object && m->object->type == OBJT_VNODE) {
#if 0
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
#endif
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Check to see "how much" the page has been used and clear
		 * the tracking access bits.  If the object has no references
		 * don't bother paying the expense.
		 */
		actcount = 0;
		if (m->object && m->object->ref_count != 0) {
			if (m->flags & PG_REFERENCED)
				++actcount;
			actcount += pmap_ts_referenced(m);
			if (actcount) {
				m->act_count += ACT_ADVANCE + actcount;
				if (m->act_count > ACT_MAX)
					m->act_count = ACT_MAX;
			}
		}
		vm_page_flag_clear(m, PG_REFERENCED);

		/*
		 * actcount is only valid if the object ref_count is non-zero.
		 * If the page does not have an object, actcount will be zero.
		 */
		if (actcount && m->object->ref_count != 0) {
#if 0
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
#endif
			vm_page_wakeup(m);
		} else {
			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				m->act_count -= min(m->act_count,
						    vm_anonmem_decline);
				break;
			default:
				m->act_count -= min(m->act_count,
						    vm_filemem_decline);
				break;
			}
			if (vm_pageout_algorithm ||
			    (m->object == NULL) ||
			    (m->object && (m->object->ref_count == 0)) ||
			    m->act_count < pass + 1
			) {
				/*
				 * Deactivate the page.  If we had a
				 * shortage from our inactive scan try to
				 * free (cache) the page instead.
				 *
				 * Don't just blindly cache the page if
				 * we do not have a shortage from the
				 * inactive scan, that could lead to
				 * gigabytes being moved.
				 */
				--inactive_shortage;
				if (avail_shortage - delta > 0 ||
				    (m->object && (m->object->ref_count == 0)))
				{
					if (avail_shortage - delta > 0)
						++*recycle_countp;
					vm_page_protect(m, VM_PROT_NONE);
					if (m->dirty == 0 &&
					    (m->flags & PG_NEED_COMMIT) == 0 &&
					    avail_shortage - delta > 0) {
						vm_page_cache(m);
					} else {
						vm_page_deactivate(m);
						vm_page_wakeup(m);
					}
				} else {
					vm_page_deactivate(m);
					vm_page_wakeup(m);
				}
				++delta;
			} else {
				/*
				 * Do nothing
				 */
#if 0
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_ACTIVE) {
					TAILQ_REMOVE(
					    &vm_page_queues[PQ_ACTIVE + q].pl,
					    m, pageq);
					TAILQ_INSERT_TAIL(
					    &vm_page_queues[PQ_ACTIVE + q].pl,
					    m, pageq);
				}
				vm_page_and_queue_spin_unlock(m);
#endif
				vm_page_wakeup(m);
			}
		}
next:
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Clean out our local marker.
	 *
	 * Page queue still spin-locked.
	 */
	if (m == NULL) {
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
			     marker, pageq);
		TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
				  marker, pageq);
	}
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);

	return (delta);
}
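/*
 * Decline example (illustrative; assumes the stock ACT_DECLINE of 1, so
 * vm_anonmem_decline = 1 and vm_filemem_decline = 2 by default): an
 * unreferenced file-backed page with act_count 24 loses 2 per scan and
 * is deactivated once act_count falls below pass + 1, while an
 * anonymous page decays at half that rate.
 */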
/*
 * The number of actually free pages can drop down to v_free_reserved,
 * we try to build the free count back above v_free_min, to v_free_target.
 *
 * Cache pages are already counted as being free-ish.
 *
 * NOTE: we are still in a critical section.
 *
 * Pages moved from PQ_CACHE to totally free are not counted in the
 * pages_freed counter.
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static void
vm_pageout_scan_cache(long avail_shortage, int pass,
		      long vnodes_skipped, long recycle_count)
{
	static int lastkillticks;
	struct vm_pageout_scan_info info;
	vm_page_t m;
	int isep;

	isep = (curthread == emergpager);

	/*
	 * Test conditions also include a safety against v_free_min in
	 * case the sysop messes up the sysctls.
	 *
	 * Also include a test to avoid degenerate scans.
	 */
	while ((vmstats.v_free_count < vmstats.v_free_target ||
		vmstats.v_free_count < vmstats.v_free_min) &&
	       vmstats.v_cache_count > VM_CACHE_SCAN_MIN)
	{
		/*
		 * This steals some code from vm/vm_page.c
		 *
		 * Create two rovers and adjust the code to reduce
		 * chances of them winding up at the same index (which
		 * can cause a lot of contention).
		 */
		static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };

		if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
			goto next_rover;

		m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
		if (m == NULL)
			break;

		/*
		 * Page is returned removed from its queue and spinlocked.
		 *
		 * If the busy attempt fails we can still deactivate the page.
		 */
		if (vm_page_busy_try(m, TRUE)) {
			vm_page_deactivate_locked(m);
			vm_page_spin_unlock(m);
			continue;
		}
		vm_page_spin_unlock(m);
		pagedaemon_wakeup();
		lwkt_yield();

		/*
		 * Report a possible edge case.  This shouldn't happen but
		 * actually I think it can race against e.g.
		 * vm_page_lookup()/busy sequences.  If the page isn't
		 * in a cache-like state we will deactivate and skip it.
		 */
		if ((m->flags & PG_MAPPED) || (m->valid & m->dirty)) {
			kprintf("WARNING! page race during find/busy: %p "
				"queue == %d dirty=%02x\n",
				m, m->queue - m->pc, m->dirty);
		}

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT | PG_MAPPED)) ||
		    m->hold_count ||
		    m->wire_count ||
		    (m->valid & m->dirty))
		{
			vm_page_deactivate(m);
			vm_page_wakeup(m);
			continue;
		}

		/*
		 * Because the page is in the cache, it shouldn't be mapped.
		 */
		pmap_mapped_sync(m);
		KKASSERT((m->flags & PG_MAPPED) == 0);
		KKASSERT(m->dirty == 0);
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
next_rover:
		if (isep)
			cache_rover[1] -= PQ_PRIME2;
		else
			cache_rover[0] += PQ_PRIME2;
	}

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target1()) {
		if (vnodes_skipped && vm_paging_min())
			speedup_syncer(NULL);
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_paging_target1())
			vm_req_vmdaemon();
#endif
	}
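	/*
	 * Note on the rovers above (descriptive): the primary pager walks
	 * cache_rover[0] forward by PQ_PRIME2 while the emergency pager
	 * walks cache_rover[1] backward, starting PQ_L2_MASK / 2 apart,
	 * so the two rarely reference the same PQ_CACHE sub-queue; the
	 * xor test skips the iteration outright when they collide.
	 */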
	/*
	 * Handle catastrophic conditions.  Under good conditions we should
	 * be at the target, well beyond our minimum.  If we could not even
	 * reach our minimum the system is under heavy stress.  But just being
	 * under heavy stress does not trigger process killing.
	 *
	 * We consider ourselves to have run out of memory if the swap pager
	 * is full and avail_shortage is still positive.  The secondary check
	 * ensures that we do not kill processes if the instantaneous
	 * availability is good, even if the pageout daemon pass says it
	 * couldn't get to the target.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
	 *	  SITUATIONS.
	 */
	if (swap_pager_almost_full &&
	    pass > 0 &&
	    isep == 0 &&
	    (vm_paging_min_dnc(recycle_count) || avail_shortage > 0)) {
		kprintf("Warning: system low on memory+swap "
			"shortage %ld for %d ticks!\n",
			avail_shortage, ticks - swap_fail_ticks);
		if (bootverbose) {
			kprintf("Metrics: spaf=%d spf=%d pass=%d "
				"availshrt=%ld tgt=%d/%d inacshrt=%ld "
				"last=%u\n",
				swap_pager_almost_full,
				swap_pager_full,
				pass,
				avail_shortage,
				vm_paging_target1(),
				vm_paging_target2(),
				vm_paging_target2_count(),
				(unsigned int)(ticks - lastkillticks));
		}
	}
	if (swap_pager_full &&
	    pass > 1 &&
	    isep == 0 &&
	    avail_shortage > 0 &&
	    vm_paging_target1() &&
	    (unsigned int)(ticks - lastkillticks) >= hz)
	{
		/*
		 * Kill something, maximum rate once per second to give
		 * the process time to free up sufficient memory.
		 */
		lastkillticks = ticks;
		info.bigproc = NULL;
		info.bigsize = 0;
		allproc_scan(vm_pageout_scan_callback, &info, 0);
		if (info.bigproc != NULL) {
			kprintf("Try to kill process %d %s\n",
				info.bigproc->p_pid, info.bigproc->p_comm);
			info.bigproc->p_nice = PRIO_MIN;
			info.bigproc->p_usched->resetpriority(
				FIRST_LWP_IN_PROC(info.bigproc));
			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
			killproc(info.bigproc, "out of swap space");
			wakeup(&vmstats.v_free_count);
			PRELE(info.bigproc);
		}
	}
}

static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
	struct vm_pageout_scan_info *info = data;
	vm_offset_t size;

	/*
	 * Never kill system processes or init.  If we have configured swap
	 * then try to avoid killing low-numbered pids.
	 */
	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
		return (0);
	}

	lwkt_gettoken(&p->p_token);

	/*
	 * If the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get the approximate process size.  Note that anonymous pages
	 * with backing swap will be counted twice, but there should not
	 * be too many such pages due to the stress the VM system is
	 * under at this point.
	 */
	size = vmspace_anonymous_count(p->p_vmspace) +
	       vmspace_swap_count(p->p_vmspace);

	/*
	 * If this process is bigger than the biggest one,
	 * remember it.
	 */
	if (info->bigsize < size) {
		if (info->bigproc)
			PRELE(info->bigproc);
		PHOLD(p);
		info->bigproc = p;
		info->bigsize = size;
	}
	lwkt_reltoken(&p->p_token);
	lwkt_yield();

	return(0);
}
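/*
 * Selection example (illustrative): given two eligible candidates, a
 * process with 2GB of anonymous+swap use and one with 300MB, the
 * callback above records the 2GB process in info->bigproc.  The
 * PHOLD()/PRELE() pairing keeps the chosen process from exiting
 * between selection and the killproc() call in the scan above.
 */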
/*
 * This old guy slowly walks PQ_HOLD looking for pages which need to be
 * moved back to PQ_FREE.  It is possible for pages to accumulate here
 * when vm_page_free() races against vm_page_unhold(), resulting in a
 * page being left on a PQ_HOLD queue with hold_count == 0.
 *
 * It is easier to handle this edge condition here, in non-critical code,
 * rather than enforce a spin-lock for every 1->0 transition in
 * vm_page_unhold().
 *
 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
 */
static void
vm_pageout_scan_hold(int q, struct vm_page *marker)
{
	vm_page_t m;
	long pcount;

	pcount = vm_page_queues[PQ_HOLD + q].lcnt;
	if (pcount > vm_pageout_stats_scan)
		pcount = vm_pageout_stats_scan;

	vm_page_queues_spin_lock(PQ_HOLD + q);
	while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
	       pcount-- > 0)
	{
		KKASSERT(m->queue == PQ_HOLD + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_HOLD + q].pl, marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_HOLD + q].pl, m,
				   marker, pageq);

		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Stop at the first page with a non-zero hold count,
		 * otherwise run the stuck page through a hold/unhold
		 * cycle so the 1->0 transition reprocesses it.
		 */
		if (m->hold_count)
			break;
		kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
		vm_page_hold(m);
		vm_page_queues_spin_unlock(PQ_HOLD + q);
		vm_page_unhold(m);	/* reprocess */
		vm_page_queues_spin_lock(PQ_HOLD + q);
	}

	/*
	 * If the queue is exhausted move the marker back to the head.
	 */
	if (m == NULL) {
		TAILQ_REMOVE(&vm_page_queues[PQ_HOLD + q].pl,
			     marker, pageq);
		TAILQ_INSERT_HEAD(&vm_page_queues[PQ_HOLD + q].pl,
				  marker, pageq);
	}

	vm_page_queues_spin_unlock(PQ_HOLD + q);
}
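
/*
 * Editorial sketch, not part of the original code: the marker-hopping
 * idiom used by vm_pageout_scan_hold() and vm_pageout_page_stats().  The
 * scanner's position is a dummy entry in the list itself; each step moves
 * the marker past the element just examined, so the scan can drop and
 * reacquire the queue lock without losing its place.  Plain sys/queue.h
 * TAILQ macros; the element type is hypothetical.
 */
#if 0
#include <sys/queue.h>

struct elem {
	TAILQ_ENTRY(elem) link;
	int	is_marker;
};
TAILQ_HEAD(elemq, elem);

static void
scan_some(struct elemq *q, struct elem *marker, int limit)
{
	struct elem *e;

	while ((e = TAILQ_NEXT(marker, link)) != NULL && limit-- > 0) {
		/* hop the marker over the element we are about to examine */
		TAILQ_REMOVE(q, marker, link);
		TAILQ_INSERT_AFTER(q, e, marker, link);
		if (e->is_marker)	/* skip other scanners' markers */
			continue;
		/* ... examine e; the lock may be dropped here safely ... */
	}
	if (e == NULL) {		/* queue exhausted: rewind marker */
		TAILQ_REMOVE(q, marker, link);
		TAILQ_INSERT_HEAD(q, marker, link);
	}
}
#endif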
/*
 * This code maintains the m->act_count field for active pages.  The scan
 * occurs only as long as the pageout daemon is not running or the inactive
 * target has not been reached.
 *
 * The restrictions prevent an idle machine from degrading the m->act_count
 * of every VM page to 0 or nearly 0, which would make the field useless,
 * for example when a workstation user goes to bed.
 */
static void
vm_pageout_page_stats(int q, struct vm_page *marker, long *counterp)
{
	struct vpgqueues *pq = &vm_page_queues[PQ_ACTIVE + q];
	vm_page_t m;
	long pcount;			/* Number of pages to check */

	/*
	 * No point scanning the active queue if it is smaller than
	 * 1/2 of usable memory.  This most typically occurs at system
	 * startup or if a huge amount of memory has just been freed.
	 */
	if (vmstats.v_active_count < vmstats.v_free_count +
				     vmstats.v_cache_count +
				     vmstats.v_inactive_count)
	{
		return;
	}

	/*
	 * Generally do not scan if the pageout daemon is not running
	 * or the inactive target has been reached.  However, we override
	 * this and scan anyway for N seconds after the pageout daemon last
	 * ran.
	 *
	 * This last bit is designed to give the system a little time to
	 * stage more pages for potential deactivation.  In this situation,
	 * if the inactive target has been met, we just update m->act_count
	 * and do not otherwise mess with the page.  But we don't want it
	 * to run forever because that would cause m->act_count to become
	 * unusable if the machine were to become idle.
	 */
	if (vm_pages_needed == 0 && !vm_paging_inactive()) {
		if (time_uptime - vm_pagedaemon_uptime > vm_pageout_stats_rsecs)
			return;
	}

	if (vm_pageout_debug) {
		static time_t save_time;
		if (save_time != time_uptime) {
			save_time = time_uptime;
			kprintf("DEACTIVATE Q=%4d N=%ld\n",
				q, vm_paging_inactive_count());
		}
	}

	/*
	 * Limit the scan to reduce cpu glitches, just in case
	 * pmap_ts_referenced() burns a lot of CPU.
	 */
	pcount = pq->lcnt;
	if (pcount > vm_pageout_stats_scan)
		pcount = vm_pageout_stats_scan;

	vm_page_queues_spin_lock(PQ_ACTIVE + q);

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(marker, pageq)) != NULL &&
	       pcount-- > 0)
	{
		int actcount;

		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&pq->pl, marker, pageq);
		TAILQ_INSERT_AFTER(&pq->pl, m, marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		++counterp[0];

		/*
		 * Ignore pages we can't busy
		 */
		if (vm_page_busy_try(m, TRUE)) {
			continue;
		}

		/*
		 * The remaining operations run with the page busy and
		 * neither the page nor the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_ACTIVE + q);
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);

		/*
		 * We can just remove wired pages from the queue
		 */
		if (m->wire_count) {
			vm_page_unqueue_nowakeup(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * We now have a safely busied page, and the page and queue
		 * spinlocks have been released.
		 *
		 * Ignore held and wired pages
		 */
		if (m->hold_count || m->wire_count) {
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * Calculate activity
		 */
		actcount = 0;
		if (m->flags & PG_REFERENCED) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount += 1;
		}
		actcount += pmap_ts_referenced(m);

		/*
		 * If the page was referenced, advance act_count and we
		 * are done with it for this pass.
		 */
		if (actcount) {
			m->act_count += ACT_ADVANCE + actcount;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
#if 0
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(&pq->pl, m, pageq);
				TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
#endif
			vm_page_wakeup(m);
			goto next;
		}

		if (m->act_count == 0) {
			/*
			 * If the deactivation target has not been reached
			 * we try to deactivate the page.
			 *
			 * If the deactivation target has been reached it
			 * is a complete waste of time (both now and later)
			 * to try to deactivate more pages.
			 */
			if (vm_paging_inactive()) {
				vm_page_protect(m, VM_PROT_NONE);
				vm_page_deactivate(m);
			}
			++counterp[1];
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
#if 0
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(&pq->pl, m, pageq);
				TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
#endif

			if (m->act_count < vm_pageout_stats_actcmp) {
				if (vm_paging_inactive()) {
					vm_page_protect(m, VM_PROT_NONE);
					vm_page_deactivate(m);
				}
				++counterp[1];
			}
		}
		vm_page_wakeup(m);
next:
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * If the queue has been exhausted move the marker back to the head.
	 */
	if (m == NULL) {
		TAILQ_REMOVE(&pq->pl, marker, pageq);
		TAILQ_INSERT_HEAD(&pq->pl, marker, pageq);
	}
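
	/*
	 * Editorial sketch, not part of the original code: the act_count
	 * lifecycle implemented by the loop above.  The counter saturates
	 * at ACT_MAX while the page keeps getting referenced, decays by
	 * ACT_DECLINE per pass otherwise, and the page becomes a
	 * deactivation candidate once the counter drops below the adaptive
	 * comparison point.  All constants are hypothetical stand-ins.
	 */
#if 0
	{
		int act = 0;		/* stands in for m->act_count */
		int refs = 1;		/* pmap_ts_referenced() result */
		int adv = 3, dec = 1, max = 64;
		int cmp = 4;		/* vm_pageout_stats_actcmp */
		int deactivate = 0;

		if (refs) {
			act += adv + refs;	/* referenced: advance */
			if (act > max)
				act = max;	/* ...saturating at max */
		} else {
			act -= (act < dec) ? act : dec;	/* idle: decay */
			deactivate = (act == 0 || act < cmp);
		}
		(void)deactivate;
	}
#endif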
	/*
	 * Scan complete; the page queue is still spin-locked.
	 */
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);

	/*
	 * After roughly every (inalim) pages determine whether we are
	 * making appropriate progress.  If we are, reduce the act_count
	 * comparison point; if we are not, increase it.
	 *
	 * This allows us to handle heavier loads and also balances the
	 * code, particularly at startup.
	 */
	if (counterp[0] > vm_pageout_stats_inalim) {
		if (counterp[1] < vm_pageout_stats_inamin) {
			if (vm_pageout_stats_actcmp < ACT_MAX * 3 / 4)
				++vm_pageout_stats_actcmp;
		} else {
			if (vm_pageout_stats_actcmp > 0)
				--vm_pageout_stats_actcmp;
		}
		counterp[0] = 0;
		counterp[1] = 0;
	}
}

static void
vm_pageout_free_page_calc(vm_size_t count)
{
	/*
	 * v_free_min		normal allocations
	 * v_free_reserved	system allocations
	 * v_pageout_free_min	allocations by pageout daemon
	 * v_interrupt_free_min	low level allocations (e.g. swap structures)
	 *
	 * v_free_min is used to generate several other baselines, and they
	 * can get pretty silly on systems with a lot of memory.
	 */
	vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
	vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
	vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
	vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
	vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
}
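
/*
 * Worked example (editorial, not from the original source): assuming a
 * hypothetical machine with 1048576 pages (4GB at 4K pages), the bases
 * computed above come out to roughly:
 *
 *	v_free_min		= 64 + 1048576 / 200	= 5306 pages (~20MB)
 *	v_free_reserved		= 5306 * 4 / 8 + 7	= 2660
 *	v_free_severe		= 5306 * 4 / 8 + 0	= 2653
 *	v_pageout_free_min	= 5306 * 2 / 8 + 7	= 1333
 *	v_interrupt_free_min	= 5306 * 1 / 8 + 7	= 670
 */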
/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFSs.
 */
static void
vm_pageout_thread(void)
{
	int pass;
	int q;
	int q1iterator = 0;
	int q2iterator = 0;
	int q3iterator = 0;
	int isep;
	enum { PAGING_IDLE, PAGING_TARGET1, PAGING_TARGET2 } state;
	struct markers *markers;
	long scounter[2] = { 0, 0 };
	time_t warn_time;

	curthread->td_flags |= TDF_SYSTHREAD;
	state = PAGING_IDLE;

	/*
	 * Allocate contiguous markers for the hold, stats (active), and
	 * paging active queue scans.  These scans occur incrementally.
	 */
	markers = kmalloc(sizeof(*markers) * PQ_L2_SIZE,
			  M_PAGEOUT, M_WAITOK | M_ZERO);

	for (q = 0; q < PQ_L2_SIZE; ++q) {
		struct markers *mark = &markers[q];

		mark->hold.flags = PG_FICTITIOUS | PG_MARKER;
		mark->hold.busy_count = PBUSY_LOCKED;
		mark->hold.queue = PQ_HOLD + q;
		mark->hold.pc = PQ_HOLD + q;
		mark->hold.wire_count = 1;
		vm_page_queues_spin_lock(PQ_HOLD + q);
		TAILQ_INSERT_HEAD(&vm_page_queues[PQ_HOLD + q].pl,
				  &mark->hold, pageq);
		vm_page_queues_spin_unlock(PQ_HOLD + q);

		mark->stat.flags = PG_FICTITIOUS | PG_MARKER;
		mark->stat.busy_count = PBUSY_LOCKED;
		mark->stat.queue = PQ_ACTIVE + q;
		mark->stat.pc = PQ_ACTIVE + q;
		mark->stat.wire_count = 1;
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
		TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
				  &mark->stat, pageq);
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);

		mark->pact.flags = PG_FICTITIOUS | PG_MARKER;
		mark->pact.busy_count = PBUSY_LOCKED;
		mark->pact.queue = PQ_ACTIVE + q;
		mark->pact.pc = PQ_ACTIVE + q;
		mark->pact.wire_count = 1;
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
		TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl,
				  &mark->pact, pageq);
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
	}

	/*
	 * We only need to do the setup once.
	 */
	isep = 0;
	if (curthread == emergpager) {
		isep = 1;
		goto skip_setup;
	}

	/*
	 * Initialize vm_max_launder per pageout pass to be 1/256
	 * of total physical memory, plus a little slop.
	 */
	if (vm_max_launder == 0)
		vm_max_launder = physmem / 256 + 16;

	/*
	 * Initialize some paging parameters.
	 */
	vm_pageout_free_page_calc(vmstats.v_page_count);

	/*
	 * Basic pageout daemon paging operation settings
	 */
	vmstats.v_free_target = vmstats.v_free_min * 2;

	vmstats.v_paging_wait = vmstats.v_free_min * 2;
	vmstats.v_paging_start = vmstats.v_free_min * 3;
	vmstats.v_paging_target1 = vmstats.v_free_min * 4;
	vmstats.v_paging_target2 = vmstats.v_free_min * 5;
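
	/*
	 * Worked example (editorial, not from the original source):
	 * continuing the hypothetical 1048576-page machine from above
	 * (v_free_min = 5306), the ladder of thresholds set above becomes:
	 *
	 *	v_free_target    = 10612
	 *	v_paging_wait    = 10612
	 *	v_paging_start   = 15918
	 *	v_paging_target1 = 21224
	 *	v_paging_target2 = 26530
	 *
	 * That is, the daemon starts before allocators are forced to wait
	 * and keeps going well past the wakeup point to build hysteresis.
	 */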
	/*
	 * NOTE: With the new buffer cache b_act_count we want the default
	 *	 inactive target to be a percentage of available memory.
	 *
	 *	 The inactive target essentially determines the minimum
	 *	 number of 'temporary' pages capable of caching one-time-use
	 *	 files when the VM system is otherwise full of pages
	 *	 belonging to multi-time-use files or active program data.
	 *
	 * NOTE: The inactive target is aggressively pursued only if the
	 *	 inactive queue becomes too small.  If the inactive queue
	 *	 is large enough to satisfy page movement to free+cache
	 *	 then it is repopulated more slowly from the active queue.
	 *	 This allows a general inactive_target default to be set.
	 *
	 *	 There is an issue here for processes which sit mostly idle
	 *	 'overnight', such as sshd, tcsh, and X.  Any movement from
	 *	 the active queue will eventually cause such pages to
	 *	 recycle, eventually causing a lot of paging in the morning.
	 *	 To reduce the incidence of this, pages cycled out of the
	 *	 buffer cache are moved directly to the inactive queue if
	 *	 they were only used once or twice.
	 *
	 *	 The vfs.vm_cycle_point sysctl can be used to adjust this.
	 *	 Increasing the value (up to 64) increases the number of
	 *	 buffer recyclements which go directly to the inactive queue.
	 *
	 * NOTE: There is no separate 'cache target'.  The combined
	 *	 (free + cache) target is handled by the v_paging_* targets
	 *	 above.
	 */
	vmstats.v_inactive_target = vmstats.v_free_count / 16;
	//vmstats.v_inactive_target = vmstats.v_free_min * 4;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = vmstats.v_free_count / 3;

	/*
	 * Page stats operation:
	 *
	 * scan	 - needs to be large enough for decent turn-around but
	 *	   not so large that it eats a ton of CPU.  Pages per run.
	 *
	 * ticks - interval per run in ticks.
	 *
	 * run	 - number of seconds after the pagedaemon has run that
	 *	   we continue to collect page stats, after which we stop.
	 *
	 *	   Calculated for 50% coverage.
	 */
	if (vm_pageout_stats_scan == 0) {
		vm_pageout_stats_scan = vmstats.v_free_count / PQ_L2_SIZE / 16;
		if (vm_pageout_stats_scan < 16)
			vm_pageout_stats_scan = 16;
	}

	if (vm_pageout_stats_ticks == 0)
		vm_pageout_stats_ticks = hz / 10;

	vm_pagedaemon_uptime = time_uptime;

	swap_pager_swap_init();

	atomic_swap_int(&sequence_emerg_pager, 1);
	wakeup(&sequence_emerg_pager);

skip_setup:
	/*
	 * Sequence the emergency pager startup.
	 */
	if (isep) {
		while (sequence_emerg_pager == 0)
			tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
	}

	pass = 0;
	warn_time = time_uptime;

	/*
	 * The pageout daemon is never done, so loop forever.
	 *
	 * WARNING! This code is being executed by two kernel threads
	 *	    potentially simultaneously.
	 */
	while (TRUE) {
		int error;
		long avail_shortage;
		long inactive_shortage;
		long vnodes_skipped = 0;
		long recycle_count = 0;
		long tmp;

		/*
		 * Don't let pass overflow.
		 */
		if (pass > 0x7FFF0000)
			pass = 0x70000000;

		/*
		 * Wait for an action request.  If we timeout check to
		 * see if paging is needed (in case the normal wakeup
		 * code raced us).
		 */
		if (isep) {
			/*
			 * The emergency pagedaemon monitors the primary
			 * pagedaemon while vm_pages_needed != 0.
			 *
			 * The emergency pagedaemon only runs if VM paging
			 * is needed and the primary pagedaemon has not
			 * updated vm_pagedaemon_uptime for more than 2
			 * seconds.
			 */
			if (vm_pages_needed)
				tsleep(&vm_pagedaemon_uptime, 0, "psleep", hz);
			else
				tsleep(&vm_pagedaemon_uptime, 0, "psleep",
				       hz * 10);
			if (vm_pages_needed == 0) {
				pass = 0;
				continue;
			}
			if ((int)(time_uptime - vm_pagedaemon_uptime) < 2) {
				pass = 0;
				continue;
			}
		} else {
			/*
			 * Primary pagedaemon
			 *
			 * Do an unconditional partial scan to deal with
			 * PQ_HOLD races and to maintain active stats on
			 * pages that are in PQ_ACTIVE.
			 */
			vm_pageout_scan_hold(q3iterator & PQ_L2_MASK,
				      &markers[q3iterator & PQ_L2_MASK].hold);
			vm_pageout_page_stats(q3iterator & PQ_L2_MASK,
				      &markers[q3iterator & PQ_L2_MASK].stat,
				      scounter);
			++q3iterator;
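
			/*
			 * Editorial sketch, not part of the original code:
			 * the heartbeat protocol between the two pager
			 * threads.  The primary refreshes
			 * vm_pagedaemon_uptime while it makes progress;
			 * the emergency pager acts only when paging is
			 * needed AND the heartbeat is at least 2 seconds
			 * stale, which is taken to mean the primary is
			 * stuck (e.g. deadlocked against a VFS).  The
			 * values below are hypothetical.
			 */
#if 0
			{
				long now = 1000;	/* time_uptime */
				long beat = 997;	/* last update */
				int needed = 1;		/* vm_pages_needed */
				int emerg_runs;

				emerg_runs = needed && (now - beat >= 2);
				(void)emerg_runs;	/* 1: primary stuck */
			}
#endif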
			/*
			 * Primary idle sleep loop, check the condition
			 * after the sleep.
			 *
			 * NOTE: State will not be IDLE if vm_pages_needed
			 *	 is non-zero.
			 */
			if (vm_pages_needed == 0) {
				error = tsleep(&vm_pages_needed,
					       0, "psleep",
					       vm_pageout_stats_ticks);
				if (error &&
				    vm_paging_start(0) == 0 &&
				    vm_pages_needed == 0)
				{
					continue;
				}
				vm_pagedaemon_uptime = time_uptime;
				vm_pages_needed = 1;
				state = PAGING_TARGET1;

				/*
				 * Wake the emergency pagedaemon up so it
				 * can monitor us.  It will automatically
				 * go back into a long sleep when
				 * vm_pages_needed returns to 0.
				 */
				wakeup(&vm_pagedaemon_uptime);
			}
		}

		mycpu->gd_cnt.v_pdwakeups++;

		/*
		 * Scan for INACTIVE->CLEAN/PAGEOUT
		 *
		 * This routine tries to avoid thrashing the system with
		 * unnecessary activity.
		 *
		 * Calculate our target for the number of free+cache pages
		 * we want to get to.  This is higher than the number that
		 * causes allocations to stall (severe) in order to provide
		 * hysteresis, and if we don't make it all the way but get
		 * to the minimum we're happy.  Goose it a bit if there are
		 * multiple requests for memory.
		 *
		 * Don't reduce avail_shortage inside the loop or the
		 * PQAVERAGE() calculation will break.
		 *
		 * NOTE! deficit is differentiated from avail_shortage as
		 *	 REQUIRING at least (deficit) pages to be cleaned,
		 *	 even if the page queues are in good shape.  This
		 *	 is used primarily for handling per-process
		 *	 RLIMIT_RSS and may also see small values when
		 *	 processes block due to low memory.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_uptime = time_uptime;

		if (state == PAGING_TARGET1) {
			avail_shortage = vm_paging_target1_count() +
					 vm_pageout_deficit;
		} else {
			avail_shortage = vm_paging_target2_count() +
					 vm_pageout_deficit;
		}
		vm_pageout_deficit = 0;

		if (avail_shortage > 0) {
			long delta = 0;
			long counts[4] = { 0, 0, 0, 0 };
			long use = avail_shortage;
			int qq;

			if (vm_pageout_debug) {
				static time_t save_time3;
				if (save_time3 != time_uptime) {
					save_time3 = time_uptime;
					kprintf("scan_inactive "
						"pass %d isep=%d\n",
						pass, isep);
				}
			}

			/*
			 * Once target1 is achieved we move on to target2,
			 * but page out more lazily in smaller batches.
			 */
			if (state == PAGING_TARGET2 &&
			    use > vmstats.v_inactive_target / 10)
			{
				use = vmstats.v_inactive_target / 10 + 1;
			}

			qq = q1iterator;
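
			/*
			 * Editorial sketch, not part of the original code:
			 * the shortage is spread across the PQ_L2_SIZE
			 * sub-queues by the loop below, quitting as soon
			 * as the running total covers it.  The rounded-up
			 * per-queue quota here is an assumption standing
			 * in for PQAVERAGE(), which is defined earlier in
			 * this file.
			 */
#if 0
			{
				long shortage = 1000;
				int nq = 256;	/* PQ_L2_SIZE stand-in */
				long quota = (shortage + nq - 1) / nq;
				long done = 0;
				int i;

				for (i = 0; i < nq; ++i) {
					done += quota;	/* one queue scanned */
					if (shortage - done <= 0)
						break;	/* covered early */
				}
			}
#endif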
			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_inactive(
						pass / MAXSCAN_DIVIDER,
						qq & PQ_L2_MASK,
						PQAVERAGE(use),
						&vnodes_skipped, counts);
				if (isep)
					--qq;
				else
					++qq;
				if (avail_shortage - delta <= 0)
					break;

				/*
				 * It is possible for avail_shortage to be
				 * very large.  If a large program exits or
				 * frees a ton of memory all at once, we do
				 * not have to continue deactivations.
				 *
				 * (We will still run the active->inactive
				 * target, however).
				 */
				if (!vm_paging_target2() &&
				    !vm_paging_min_dnc(vm_page_free_hysteresis))
				{
					avail_shortage = 0;
					break;
				}
			}
			if (vm_pageout_debug) {
				static time_t save_time2;
				if (save_time2 != time_uptime) {
					save_time2 = time_uptime;
					kprintf("flsh %ld cln %ld "
						"lru2 %ld react %ld "
						"delta %ld\n",
						counts[0], counts[1],
						counts[2], counts[3],
						delta);
				}
			}
			avail_shortage -= delta;
			q1iterator = qq;
		}

		/*
		 * Figure out how many active pages we must deactivate.  If
		 * we were able to reach our target with just the inactive
		 * scan above we limit the number of active pages we
		 * deactivate to reduce unnecessary work.
		 *
		 * When calculating inactive_shortage notice that we are
		 * departing from what vm_paging_inactive_count() does.
		 * During paging, the free + cache queues are assumed to
		 * be under stress, so only a pure inactive target is
		 * calculated without taking into account v_free_min,
		 * v_free_count, or v_cache_count.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_uptime = time_uptime;
		inactive_shortage = vmstats.v_inactive_target -
				    vmstats.v_inactive_count;

		/*
		 * If we were unable to free sufficient inactive pages to
		 * satisfy the free/cache queue requirements then simply
		 * reaching the inactive target may not be good enough.
		 * Try to deactivate pages in excess of the target based
		 * on the shortfall.
		 *
		 * However, to prevent thrashing the VM system do not
		 * deactivate more than an additional 1/10 of the inactive
		 * target's worth of active pages.
		 */
		if (avail_shortage > 0) {
			tmp = avail_shortage * 2;
			if (tmp > vmstats.v_inactive_target / 10)
				tmp = vmstats.v_inactive_target / 10;
			inactive_shortage += tmp;
		}

		/*
		 * Only trigger a pmap cleanup on inactive shortage.
		 */
		if (isep == 0 && inactive_shortage > 0) {
			pmap_collect();
		}

		/*
		 * Scan for ACTIVE->INACTIVE
		 *
		 * Only trigger on inactive shortage.  Triggering on
		 * avail_shortage can starve the active queue with
		 * unnecessary active->inactive transitions and destroy
		 * performance.
		 *
		 * If this is the emergency pager, always try to move
		 * a few pages from active to inactive because the inactive
		 * queue might have enough pages, but not enough anonymous
		 * pages.
		 */
		if (isep && inactive_shortage < vm_emerg_launder)
			inactive_shortage = vm_emerg_launder;
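
		/*
		 * Worked example (editorial, not from the original source):
		 * if the inactive scan above left avail_shortage at 500
		 * pages and v_inactive_target is 60000, the boost computed
		 * above is min(500 * 2, 60000 / 10) = 1000 extra pages of
		 * deactivation, i.e. the shortfall is compensated for but
		 * never by more than a tenth of the inactive target.
		 */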
		if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
			long delta = 0;
			int qq;

			qq = q2iterator;
			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_active(
						pass / MAXSCAN_DIVIDER,
						qq & PQ_L2_MASK,
						PQAVERAGE(avail_shortage),
						PQAVERAGE(inactive_shortage),
						&markers[qq & PQ_L2_MASK].pact,
						&recycle_count);
				if (isep)
					--qq;
				else
					++qq;
				if (inactive_shortage - delta <= 0 &&
				    avail_shortage - delta <= 0) {
					break;
				}

				/*
				 * inactive_shortage can be a very large
				 * number.  This check is intended to break
				 * out early if our inactive_target has been
				 * reached due to other system activity.
				 */
				if (vmstats.v_inactive_count >
				    vmstats.v_inactive_target)
				{
					inactive_shortage = 0;
					break;
				}
			}
			inactive_shortage -= delta;
			avail_shortage -= delta;
			q2iterator = qq;
		}

		/*
		 * Scan for CACHE->FREE
		 *
		 * Finally, free enough cache pages to meet our free page
		 * requirement and take more drastic measures if we are
		 * still in trouble.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_uptime = time_uptime;
		vm_pageout_scan_cache(avail_shortage, pass / MAXSCAN_DIVIDER,
				      vnodes_skipped, recycle_count);

		/*
		 * This is a bit sophisticated because we do not necessarily
		 * want to force paging until our targets are reached if we
		 * were able to successfully retire the shortage we
		 * calculated.
		 */
		if (avail_shortage > 0) {
			/*
			 * If we did not retire enough pages continue the
			 * pageout operation until we are able to.  It
			 * takes MAXSCAN_DIVIDER passes to cover the entire
			 * inactive list.
			 *
			 * We used to throw delays in here if paging went on
			 * continuously but that really just makes things
			 * worse.  Just keep going.
			 */
			if (pass == 0)
				warn_time = time_uptime;
			++pass;
			if (isep == 0 && time_uptime - warn_time >= 60) {
				kprintf("pagedaemon: WARNING! Continuous "
					"paging for %ld minutes\n",
					(time_uptime - warn_time) / 60);
				warn_time = time_uptime;
			}

			if (vm_pages_needed) {
				/*
				 * Normal operation, additional processes
				 * have already kicked us.  Retry immediately
				 * unless swap space is completely full, in
				 * which case delay a bit.
				 */
				if (swap_pager_full) {
					tsleep(&vm_pages_needed, 0, "pdelay",
					       hz / 5);
				} /* else immediate loop */
			} /* else immediate loop */
		} else {
			/*
			 * Reset pass
			 */
			pass = 0;

			if (vm_paging_start(0) ||
			    vm_paging_min_dnc(vm_page_free_hysteresis))
			{
				/*
				 * Pages are sufficiently exhausted to start
				 * the page-daemon in TARGET1 mode.
				 */
				state = PAGING_TARGET1;
				vm_pages_needed = 2;

				/*
				 * We can wake up waiters if we are above
				 * the wait point.
				 */
				if (!vm_paging_wait())
					wakeup(&vmstats.v_free_count);
			} else if (vm_pages_needed) {
				/*
				 * Continue paging until TARGET2 is reached,
				 * but waiters can be woken up.
				 *
				 * The PAGING_TARGET2 state tells the
				 * pagedaemon to work a little less hard.
				 */
				if (vm_paging_target1()) {
					state = PAGING_TARGET1;
					vm_pages_needed = 2;
				} else if (vm_paging_target2()) {
					state = PAGING_TARGET2;
					vm_pages_needed = 2;
				} else {
					vm_pages_needed = 0;
				}
				wakeup(&vmstats.v_free_count);
			} /* else nothing to do here */
		}
	}
}

static struct kproc_desc pg1_kp = {
	"pagedaemon",
	vm_pageout_thread,
	&pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

static struct kproc_desc pg2_kp = {
	"emergpager",
	vm_pageout_thread,
	&emergpager
};
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);
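
/*
 * Editorial sketch, not part of the original code: the state ladder driven
 * at the bottom of vm_pageout_thread()'s main loop.  The daemon works hard
 * toward target1, relaxes toward target2, and goes idle only once both are
 * satisfied.  The predicate parameters below are hypothetical stand-ins
 * for vm_paging_target1()/vm_paging_target2().
 */
#if 0
enum pager_state { IDLE, TARGET1, TARGET2 };

static enum pager_state
next_state(int below_target1, int below_target2, int *pages_needed)
{
	if (below_target1) {		/* still short: keep working hard */
		*pages_needed = 2;
		return TARGET1;
	}
	if (below_target2) {		/* easier phase: smaller batches */
		*pages_needed = 2;
		return TARGET2;
	}
	*pages_needed = 0;		/* both targets met: go idle */
	return IDLE;
}
#endif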
/*
 * Called after allocating a page out of the cache or free queue to
 * possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
	if (vm_paging_start(0) && curthread != pagethread) {
		if (vm_pages_needed <= 1) {
			vm_pages_needed = 1;		/* SMP race ok */
			wakeup(&vm_pages_needed);	/* tickle pageout */
		} else if (vm_paging_min()) {
			++vm_pages_needed;		/* SMP race ok */
			/* a wakeup() would be wasted here */
		}
	}
}

#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
 */
static void
vm_req_vmdaemon(void)
{
	static int lastrun = 0;

	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);
		lastrun = ticks;
	}
}

static int vm_daemon_callback(struct proc *p, void *data __unused);

/*
 * No requirements.
 *
 * Scan processes for exceeding their rlimits, deactivating pages
 * when RSS is exceeded.
 */
static void
vm_daemon(void)
{
	while (TRUE) {
		tsleep(&vm_daemon_needed, 0, "psleep", 0);
		allproc_scan(vm_daemon_callback, NULL, 0);
	}
}

static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
	struct vmspace *vm;
	vm_pindex_t limit, size;

	/*
	 * If this is a system process or if we have already
	 * looked at this process, skip it.
	 */
	lwkt_gettoken(&p->p_token);

	if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * If the process is in a non-running type state, don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get a limit
	 */
	limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
				p->p_rlimit[RLIMIT_RSS].rlim_max));

	vm = p->p_vmspace;
	vmspace_hold(vm);
	size = pmap_resident_tlnw_count(&vm->vm_pmap);
	if (limit >= 0 && size > 4096 &&
	    size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
		vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
	}
	vmspace_drop(vm);

	lwkt_reltoken(&p->p_token);

	return (0);
}

#endif
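
/*
 * Worked example (editorial, not from the original source): the RSS check
 * in vm_daemon_callback() above.  With RLIMIT_RSS set to 256MB and 4K
 * pages, OFF_TO_IDX() converts the byte limit to a 65536-page limit.  The
 * process map is only deactivated when vm_pageout_memuse_mode >= 1 and the
 * resident count reported by pmap_resident_tlnw_count() exceeds the limit
 * by at least the hardwired 4096-page (16MB) slop.
 */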