1 /* 2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1991 Regents of the University of California. 35 * All rights reserved. 36 * Copyright (c) 1994 John S. Dyson 37 * All rights reserved. 38 * Copyright (c) 1994 David Greenman 39 * All rights reserved. 40 * 41 * This code is derived from software contributed to Berkeley by 42 * The Mach Operating System project at Carnegie-Mellon University. 43 * 44 * Redistribution and use in source and binary forms, with or without 45 * modification, are permitted provided that the following conditions 46 * are met: 47 * 1. Redistributions of source code must retain the above copyright 48 * notice, this list of conditions and the following disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 3. Neither the name of the University nor the names of its contributors 53 * may be used to endorse or promote products derived from this software 54 * without specific prior written permission. 55 * 56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 59 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 66 * SUCH DAMAGE. 67 * 68 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 69 * 70 * 71 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 72 * All rights reserved. 73 * 74 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 75 * 76 * Permission to use, copy, modify and distribute this software and 77 * its documentation is hereby granted, provided that both the copyright 78 * notice and this permission notice appear in all copies of the 79 * software, derivative works or modified versions, and any portions 80 * thereof, and that both notices appear in supporting documentation. 81 * 82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 85 * 86 * Carnegie Mellon requests users of this software to return to 87 * 88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 89 * School of Computer Science 90 * Carnegie Mellon University 91 * Pittsburgh PA 15213-3890 92 * 93 * any improvements or extensions that they make and grant Carnegie the 94 * rights to redistribute these changes. 95 */ 96 97 /* 98 * The proverbial page-out daemon, rewritten many times over the decades. 
99 */ 100 101 #include "opt_vm.h" 102 #include <sys/param.h> 103 #include <sys/systm.h> 104 #include <sys/kernel.h> 105 #include <sys/proc.h> 106 #include <sys/kthread.h> 107 #include <sys/resourcevar.h> 108 #include <sys/signalvar.h> 109 #include <sys/vnode.h> 110 #include <sys/vmmeter.h> 111 #include <sys/conf.h> 112 #include <sys/sysctl.h> 113 114 #include <vm/vm.h> 115 #include <vm/vm_param.h> 116 #include <sys/lock.h> 117 #include <vm/vm_object.h> 118 #include <vm/vm_page.h> 119 #include <vm/vm_map.h> 120 #include <vm/vm_pageout.h> 121 #include <vm/vm_pager.h> 122 #include <vm/swap_pager.h> 123 #include <vm/vm_extern.h> 124 125 #include <sys/spinlock2.h> 126 #include <vm/vm_page2.h> 127 128 /* 129 * System initialization 130 */ 131 132 /* the kernel process "vm_pageout"*/ 133 static int vm_pageout_page(vm_page_t m, long *max_launderp, 134 long *vnodes_skippedp, struct vnode **vpfailedp, 135 int pass, int vmflush_flags, long *counts); 136 static int vm_pageout_clean_helper (vm_page_t, int); 137 static void vm_pageout_free_page_calc (vm_size_t count); 138 static void vm_pageout_page_free(vm_page_t m) ; 139 __read_frequently struct thread *emergpager; 140 __read_frequently struct thread *pagethread; 141 static int sequence_emerg_pager; 142 143 #if !defined(NO_SWAPPING) 144 /* the kernel process "vm_daemon"*/ 145 static void vm_daemon (void); 146 static struct thread *vmthread; 147 148 static struct kproc_desc vm_kp = { 149 "vmdaemon", 150 vm_daemon, 151 &vmthread 152 }; 153 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); 154 #endif 155 156 __read_mostly int vm_pages_needed = 0; /* pageout daemon tsleep event */ 157 __read_mostly int vm_pageout_deficit = 0;/* Estimated number of pages deficit */ 158 __read_mostly int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */ 159 __read_mostly int vm_page_free_hysteresis = 16; 160 __read_mostly static int vm_pagedaemon_time; 161 162 #if !defined(NO_SWAPPING) 163 static int vm_pageout_req_swapout; 164 static int vm_daemon_needed; 165 #endif 166 __read_mostly static int vm_max_launder = 0; 167 __read_mostly static int vm_emerg_launder = 100; 168 __read_mostly static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; 169 __read_mostly static int vm_pageout_full_stats_interval = 0; 170 __read_mostly static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; 171 __read_mostly static int defer_swap_pageouts=0; 172 __read_mostly static int disable_swap_pageouts=0; 173 __read_mostly static u_int vm_anonmem_decline = ACT_DECLINE; 174 __read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2; 175 __read_mostly static int vm_pageout_debug; 176 177 #if defined(NO_SWAPPING) 178 __read_mostly static int vm_swap_enabled=0; 179 __read_mostly static int vm_swap_idle_enabled=0; 180 #else 181 __read_mostly static int vm_swap_enabled=1; 182 __read_mostly static int vm_swap_idle_enabled=0; 183 #endif 184 185 /* 0-disable, 1-passive, 2-active swp, 3-acive swp + single-queue dirty pages*/ 186 __read_mostly int vm_pageout_memuse_mode=2; 187 __read_mostly int vm_pageout_allow_active=1; 188 189 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline, 190 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory"); 191 192 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline, 193 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache"); 194 195 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis, 196 CTLFLAG_RW, &vm_page_free_hysteresis, 0, 197 "Free more pages than the minimum required"); 198 199 
SYSCTL_INT(_vm, OID_AUTO, max_launder, 200 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 201 SYSCTL_INT(_vm, OID_AUTO, emerg_launder, 202 CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum"); 203 204 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, 205 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 206 207 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, 208 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); 209 210 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, 211 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); 212 213 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, 214 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); 215 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode, 216 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode"); 217 SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active, 218 CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active"); 219 SYSCTL_INT(_vm, OID_AUTO, pageout_debug, 220 CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)"); 221 222 223 #if defined(NO_SWAPPING) 224 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 225 CTLFLAG_RD, &vm_swap_enabled, 0, ""); 226 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 227 CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); 228 #else 229 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 230 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 231 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 232 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 233 #endif 234 235 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 236 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 237 238 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 239 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 240 241 static int pageout_lock_miss; 242 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 243 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 244 245 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 246 247 #if !defined(NO_SWAPPING) 248 static void vm_req_vmdaemon (void); 249 #endif 250 static void vm_pageout_page_stats(int q); 251 252 #define MAXSCAN_DIVIDER 10 253 254 /* 255 * Calculate approximately how many pages on each queue to try to 256 * clean. An exact calculation creates an edge condition when the 257 * queues are unbalanced so add significant slop. The queue scans 258 * will stop early when targets are reached and will start where they 259 * left off on the next pass. 260 * 261 * We need to be generous here because there are all sorts of loading 262 * conditions that can cause edge cases if try to average over all queues. 263 * In particular, storage subsystems have become so fast that paging 264 * activity can become quite frantic. Eventually we will probably need 265 * two paging threads, one for dirty pages and one for clean, to deal 266 * with the bandwidth requirements. 267 268 * So what we do is calculate a value that can be satisfied nominally by 269 * only having to scan half the queues. 270 */ 271 static __inline long 272 PQAVERAGE(long n) 273 { 274 long avg; 275 276 if (n >= 0) { 277 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1); 278 } else { 279 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1); 280 } 281 return avg; 282 } 283 284 /* 285 * vm_pageout_clean_helper: 286 * 287 * Clean the page and remove it from the laundry. 
The page must be busied 288 * by the caller and will be disposed of (put away, flushed) by this routine. 289 */ 290 static int 291 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags) 292 { 293 vm_object_t object; 294 vm_page_t mc[BLIST_MAX_ALLOC]; 295 int error; 296 int ib, is, page_base; 297 vm_pindex_t pindex = m->pindex; 298 299 object = m->object; 300 301 /* 302 * Don't mess with the page if it's held or special. Theoretically 303 * we can pageout held pages but there is no real need to press our 304 * luck, so don't. 305 */ 306 if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) { 307 vm_page_wakeup(m); 308 return 0; 309 } 310 311 /* 312 * Place page in cluster. Align cluster for optimal swap space 313 * allocation (whether it is swap or not). This is typically ~16-32 314 * pages, which also tends to align the cluster to multiples of the 315 * filesystem block size if backed by a filesystem. 316 */ 317 page_base = pindex % BLIST_MAX_ALLOC; 318 mc[page_base] = m; 319 ib = page_base - 1; 320 is = page_base + 1; 321 322 /* 323 * Scan object for clusterable pages. 324 * 325 * We can cluster ONLY if: ->> the page is NOT 326 * clean, wired, busy, held, or mapped into a 327 * buffer, and one of the following: 328 * 1) The page is inactive, or a seldom used 329 * active page. 330 * -or- 331 * 2) we force the issue. 332 * 333 * During heavy mmap/modification loads the pageout 334 * daemon can really fragment the underlying file 335 * due to flushing pages out of order and not trying 336 * align the clusters (which leave sporatic out-of-order 337 * holes). To solve this problem we do the reverse scan 338 * first and attempt to align our cluster, then do a 339 * forward scan if room remains. 340 */ 341 vm_object_hold(object); 342 343 while (ib >= 0) { 344 vm_page_t p; 345 346 p = vm_page_lookup_busy_try(object, pindex - page_base + ib, 347 TRUE, &error); 348 if (error || p == NULL) 349 break; 350 if ((p->queue - p->pc) == PQ_CACHE || 351 (p->flags & PG_UNQUEUED)) { 352 vm_page_wakeup(p); 353 break; 354 } 355 vm_page_test_dirty(p); 356 if (((p->dirty & p->valid) == 0 && 357 (p->flags & PG_NEED_COMMIT) == 0) || 358 p->wire_count != 0 || /* may be held by buf cache */ 359 p->hold_count != 0) { /* may be undergoing I/O */ 360 vm_page_wakeup(p); 361 break; 362 } 363 if (p->queue - p->pc != PQ_INACTIVE) { 364 if (p->queue - p->pc != PQ_ACTIVE || 365 (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) { 366 vm_page_wakeup(p); 367 break; 368 } 369 } 370 371 /* 372 * Try to maintain page groupings in the cluster. 
373 */ 374 if (m->flags & PG_WINATCFLS) 375 vm_page_flag_set(p, PG_WINATCFLS); 376 else 377 vm_page_flag_clear(p, PG_WINATCFLS); 378 p->act_count = m->act_count; 379 380 mc[ib] = p; 381 --ib; 382 } 383 ++ib; /* fixup */ 384 385 while (is < BLIST_MAX_ALLOC && 386 pindex - page_base + is < object->size) { 387 vm_page_t p; 388 389 p = vm_page_lookup_busy_try(object, pindex - page_base + is, 390 TRUE, &error); 391 if (error || p == NULL) 392 break; 393 if (((p->queue - p->pc) == PQ_CACHE) || 394 (p->flags & PG_UNQUEUED)) { 395 vm_page_wakeup(p); 396 break; 397 } 398 vm_page_test_dirty(p); 399 if (((p->dirty & p->valid) == 0 && 400 (p->flags & PG_NEED_COMMIT) == 0) || 401 p->wire_count != 0 || /* may be held by buf cache */ 402 p->hold_count != 0) { /* may be undergoing I/O */ 403 vm_page_wakeup(p); 404 break; 405 } 406 if (p->queue - p->pc != PQ_INACTIVE) { 407 if (p->queue - p->pc != PQ_ACTIVE || 408 (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) { 409 vm_page_wakeup(p); 410 break; 411 } 412 } 413 414 /* 415 * Try to maintain page groupings in the cluster. 416 */ 417 if (m->flags & PG_WINATCFLS) 418 vm_page_flag_set(p, PG_WINATCFLS); 419 else 420 vm_page_flag_clear(p, PG_WINATCFLS); 421 p->act_count = m->act_count; 422 423 mc[is] = p; 424 ++is; 425 } 426 427 vm_object_drop(object); 428 429 /* 430 * we allow reads during pageouts... 431 */ 432 return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags); 433 } 434 435 /* 436 * vm_pageout_flush() - launder the given pages 437 * 438 * The given pages are laundered. Note that we setup for the start of 439 * I/O ( i.e. busy the page ), mark it read-only, and bump the object 440 * reference count all in here rather then in the parent. If we want 441 * the parent to do more sophisticated things we may have to change 442 * the ordering. 443 * 444 * The pages in the array must be busied by the caller and will be 445 * unbusied by this function. 446 */ 447 int 448 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags) 449 { 450 vm_object_t object; 451 int pageout_status[count]; 452 int numpagedout = 0; 453 int i; 454 455 /* 456 * Initiate I/O. Bump the vm_page_t->busy counter. 457 */ 458 for (i = 0; i < count; i++) { 459 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 460 ("vm_pageout_flush page %p index %d/%d: partially " 461 "invalid page", mc[i], i, count)); 462 vm_page_io_start(mc[i]); 463 } 464 465 /* 466 * We must make the pages read-only. This will also force the 467 * modified bit in the related pmaps to be cleared. The pager 468 * cannot clear the bit for us since the I/O completion code 469 * typically runs from an interrupt. The act of making the page 470 * read-only handles the case for us. 471 * 472 * Then we can unbusy the pages, we still hold a reference by virtue 473 * of our soft-busy. 474 */ 475 for (i = 0; i < count; i++) { 476 if (vmflush_flags & OBJPC_TRY_TO_CACHE) 477 vm_page_protect(mc[i], VM_PROT_NONE); 478 else 479 vm_page_protect(mc[i], VM_PROT_READ); 480 vm_page_wakeup(mc[i]); 481 } 482 483 object = mc[0]->object; 484 vm_object_pip_add(object, count); 485 486 vm_pager_put_pages(object, mc, count, 487 (vmflush_flags | 488 ((object == &kernel_object) ? 489 OBJPC_SYNC : 0)), 490 pageout_status); 491 492 for (i = 0; i < count; i++) { 493 vm_page_t mt = mc[i]; 494 495 switch (pageout_status[i]) { 496 case VM_PAGER_OK: 497 numpagedout++; 498 break; 499 case VM_PAGER_PEND: 500 numpagedout++; 501 break; 502 case VM_PAGER_BAD: 503 /* 504 * Page outside of range of object. 
Right now we 505 * essentially lose the changes by pretending it 506 * worked. 507 */ 508 vm_page_busy_wait(mt, FALSE, "pgbad"); 509 pmap_clear_modify(mt); 510 vm_page_undirty(mt); 511 vm_page_wakeup(mt); 512 break; 513 case VM_PAGER_ERROR: 514 case VM_PAGER_FAIL: 515 /* 516 * A page typically cannot be paged out when we 517 * have run out of swap. We leave the page 518 * marked inactive and will try to page it out 519 * again later. 520 * 521 * Starvation of the active page list is used to 522 * determine when the system is massively memory 523 * starved. 524 */ 525 break; 526 case VM_PAGER_AGAIN: 527 break; 528 } 529 530 /* 531 * If not PENDing this was a synchronous operation and we 532 * clean up after the I/O. If it is PENDing the mess is 533 * cleaned up asynchronously. 534 * 535 * Also nominally act on the caller's wishes if the caller 536 * wants to try to really clean (cache or free) the page. 537 * 538 * Also nominally deactivate the page if the system is 539 * memory-stressed. 540 */ 541 if (pageout_status[i] != VM_PAGER_PEND) { 542 vm_page_busy_wait(mt, FALSE, "pgouw"); 543 vm_page_io_finish(mt); 544 if (vmflush_flags & OBJPC_TRY_TO_CACHE) { 545 vm_page_try_to_cache(mt); 546 } else if (vm_page_count_severe()) { 547 vm_page_deactivate(mt); 548 vm_page_wakeup(mt); 549 } else { 550 vm_page_wakeup(mt); 551 } 552 vm_object_pip_wakeup(object); 553 } 554 } 555 return numpagedout; 556 } 557 558 #if !defined(NO_SWAPPING) 559 560 /* 561 * Callback function, page busied for us. We must dispose of the busy 562 * condition. Any related pmap pages may be held but will not be locked. 563 */ 564 static 565 int 566 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va, 567 vm_page_t p) 568 { 569 int actcount; 570 int cleanit = 0; 571 572 /* 573 * Basic tests - There should never be a marker, and we can stop 574 * once the RSS is below the required level. 575 */ 576 KKASSERT((p->flags & PG_MARKER) == 0); 577 if (pmap_resident_tlnw_count(info->pmap) <= info->limit) { 578 vm_page_wakeup(p); 579 return(-1); 580 } 581 582 mycpu->gd_cnt.v_pdpages++; 583 584 if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) { 585 vm_page_wakeup(p); 586 goto done; 587 } 588 589 ++info->actioncount; 590 591 /* 592 * Check if the page has been referened recently. If it has, 593 * activate it and skip. 594 */ 595 actcount = pmap_ts_referenced(p); 596 if (actcount) { 597 vm_page_flag_set(p, PG_REFERENCED); 598 } else if (p->flags & PG_REFERENCED) { 599 actcount = 1; 600 } 601 602 if (actcount) { 603 if (p->queue - p->pc != PQ_ACTIVE) { 604 vm_page_and_queue_spin_lock(p); 605 if (p->queue - p->pc != PQ_ACTIVE) { 606 vm_page_and_queue_spin_unlock(p); 607 vm_page_activate(p); 608 } else { 609 vm_page_and_queue_spin_unlock(p); 610 } 611 } else { 612 p->act_count += actcount; 613 if (p->act_count > ACT_MAX) 614 p->act_count = ACT_MAX; 615 } 616 vm_page_flag_clear(p, PG_REFERENCED); 617 vm_page_wakeup(p); 618 goto done; 619 } 620 621 /* 622 * Remove the page from this particular pmap. Once we do this, our 623 * pmap scans will not see it again (unless it gets faulted in), so 624 * we must actively dispose of or deal with the page. 625 */ 626 pmap_remove_specific(info->pmap, p); 627 628 /* 629 * If the page is not mapped to another process (i.e. as would be 630 * typical if this were a shared page from a library) then deactivate 631 * the page and clean it in two passes only. 632 * 633 * If the page hasn't been referenced since the last check, remove it 634 * from the pmap. 
If it is no longer mapped, deactivate it 635 * immediately, accelerating the normal decline. 636 * 637 * Once the page has been removed from the pmap the RSS code no 638 * longer tracks it so we have to make sure that it is staged for 639 * potential flush action. 640 * 641 * XXX 642 */ 643 if ((p->flags & PG_MAPPED) == 0 || 644 (pmap_mapped_sync(p) & PG_MAPPED) == 0) { 645 if (p->queue - p->pc == PQ_ACTIVE) { 646 vm_page_deactivate(p); 647 } 648 if (p->queue - p->pc == PQ_INACTIVE) { 649 cleanit = 1; 650 } 651 } 652 653 /* 654 * Ok, try to fully clean the page and any nearby pages such that at 655 * least the requested page is freed or moved to the cache queue. 656 * 657 * We usually do this synchronously to allow us to get the page into 658 * the CACHE queue quickly, which will prevent memory exhaustion if 659 * a process with a memoryuse limit is running away. However, the 660 * sysadmin may desire to set vm.swap_user_async which relaxes this 661 * and improves write performance. 662 */ 663 if (cleanit) { 664 long max_launder = 0x7FFF; 665 long vnodes_skipped = 0; 666 long counts[4] = { 0, 0, 0, 0 }; 667 int vmflush_flags; 668 struct vnode *vpfailed = NULL; 669 670 info->offset = va; 671 672 if (vm_pageout_memuse_mode >= 2) { 673 vmflush_flags = OBJPC_TRY_TO_CACHE | 674 OBJPC_ALLOW_ACTIVE; 675 if (swap_user_async == 0) 676 vmflush_flags |= OBJPC_SYNC; 677 vm_page_flag_set(p, PG_WINATCFLS); 678 info->cleancount += 679 vm_pageout_page(p, &max_launder, 680 &vnodes_skipped, 681 &vpfailed, 1, vmflush_flags, 682 counts); 683 } else { 684 vm_page_wakeup(p); 685 ++info->cleancount; 686 } 687 } else { 688 vm_page_wakeup(p); 689 } 690 691 /* 692 * Must be at end to avoid SMP races. 693 */ 694 done: 695 lwkt_user_yield(); 696 return 0; 697 } 698 699 /* 700 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits. 701 * that is relatively difficult to do. We try to keep track of where we 702 * left off last time to reduce scan overhead. 703 * 704 * Called when vm_pageout_memuse_mode is >= 1. 705 */ 706 void 707 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit) 708 { 709 vm_offset_t pgout_offset; 710 struct pmap_pgscan_info info; 711 int retries = 3; 712 713 pgout_offset = map->pgout_offset; 714 again: 715 #if 0 716 kprintf("%016jx ", pgout_offset); 717 #endif 718 if (pgout_offset < VM_MIN_USER_ADDRESS) 719 pgout_offset = VM_MIN_USER_ADDRESS; 720 if (pgout_offset >= VM_MAX_USER_ADDRESS) 721 pgout_offset = 0; 722 info.pmap = vm_map_pmap(map); 723 info.limit = limit; 724 info.beg_addr = pgout_offset; 725 info.end_addr = VM_MAX_USER_ADDRESS; 726 info.callback = vm_pageout_mdp_callback; 727 info.cleancount = 0; 728 info.actioncount = 0; 729 info.busycount = 0; 730 731 pmap_pgscan(&info); 732 pgout_offset = info.offset; 733 #if 0 734 kprintf("%016jx %08lx %08lx\n", pgout_offset, 735 info.cleancount, info.actioncount); 736 #endif 737 738 if (pgout_offset != VM_MAX_USER_ADDRESS && 739 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 740 goto again; 741 } else if (retries && 742 pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) { 743 --retries; 744 goto again; 745 } 746 map->pgout_offset = pgout_offset; 747 } 748 #endif 749 750 /* 751 * Called when the pageout scan wants to free a page. We no longer 752 * try to cycle the vm_object here with a reference & dealloc, which can 753 * cause a non-trivial object collapse in a critical path. 
754 * 755 * It is unclear why we cycled the ref_count in the past, perhaps to try 756 * to optimize shadow chain collapses but I don't quite see why it would 757 * be necessary. An OBJ_DEAD object should terminate any and all vm_pages 758 * synchronously and not have to be kicked-start. 759 */ 760 static void 761 vm_pageout_page_free(vm_page_t m) 762 { 763 vm_page_protect(m, VM_PROT_NONE); 764 vm_page_free(m); 765 } 766 767 /* 768 * vm_pageout_scan does the dirty work for the pageout daemon. 769 */ 770 struct vm_pageout_scan_info { 771 struct proc *bigproc; 772 vm_offset_t bigsize; 773 }; 774 775 static int vm_pageout_scan_callback(struct proc *p, void *data); 776 777 /* 778 * Scan inactive queue 779 * 780 * WARNING! Can be called from two pagedaemon threads simultaneously. 781 */ 782 static int 783 vm_pageout_scan_inactive(int pass, int q, long avail_shortage, 784 long *vnodes_skipped, long *counts) 785 { 786 vm_page_t m; 787 struct vm_page marker; 788 struct vnode *vpfailed; /* warning, allowed to be stale */ 789 long maxscan; 790 long delta = 0; 791 long max_launder; 792 int isep; 793 int vmflush_flags; 794 795 isep = (curthread == emergpager); 796 if ((unsigned)pass > 1000) 797 pass = 1000; 798 799 /* 800 * This routine is called for each of PQ_L2_SIZE inactive queues. 801 * We want the vm_max_launder parameter to apply to the whole 802 * queue (i.e. per-whole-queue pass, not per-sub-queue). 803 * 804 * In each successive full-pass when the page target is not met we 805 * allow the per-queue max_launder to increase up to a maximum of 806 * vm_max_launder / 16. 807 */ 808 if (pass) 809 max_launder = (long)vm_max_launder * (pass + 1) / PQ_L2_SIZE; 810 else 811 max_launder = (long)vm_max_launder / PQ_L2_SIZE; 812 max_launder /= MAXSCAN_DIVIDER; 813 814 if (max_launder <= 1) 815 max_launder = 1; 816 if (max_launder >= vm_max_launder / 16) 817 max_launder = vm_max_launder / 16 + 1; 818 819 /* 820 * Start scanning the inactive queue for pages we can move to the 821 * cache or free. The scan will stop when the target is reached or 822 * we have scanned the entire inactive queue. Note that m->act_count 823 * is not used to form decisions for the inactive queue, only for the 824 * active queue. 825 * 826 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 827 * PAGES. 828 */ 829 830 /* 831 * Initialize our marker 832 */ 833 bzero(&marker, sizeof(marker)); 834 marker.flags = PG_FICTITIOUS | PG_MARKER; 835 marker.busy_count = PBUSY_LOCKED; 836 marker.queue = PQ_INACTIVE + q; 837 marker.pc = q; 838 marker.wire_count = 1; 839 840 /* 841 * Inactive queue scan. 842 * 843 * We pick off approximately 1/10 of each queue. Each queue is 844 * effectively organized LRU so scanning the entire queue would 845 * improperly pick up pages that might still be in regular use. 846 * 847 * NOTE: The vm_page must be spinlocked before the queue to avoid 848 * deadlocks, so it is easiest to simply iterate the loop 849 * with the queue unlocked at the top. 850 */ 851 vpfailed = NULL; 852 853 vm_page_queues_spin_lock(PQ_INACTIVE + q); 854 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 855 maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1; 856 857 /* 858 * Queue locked at top of loop to avoid stack marker issues. 
859 */ 860 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 861 maxscan-- > 0 && avail_shortage - delta > 0) 862 { 863 int count; 864 865 KKASSERT(m->queue == PQ_INACTIVE + q); 866 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, 867 &marker, pageq); 868 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m, 869 &marker, pageq); 870 mycpu->gd_cnt.v_pdpages++; 871 872 /* 873 * Skip marker pages (atomic against other markers to avoid 874 * infinite hop-over scans). 875 */ 876 if (m->flags & PG_MARKER) 877 continue; 878 879 /* 880 * Try to busy the page. Don't mess with pages which are 881 * already busy or reorder them in the queue. 882 */ 883 if (vm_page_busy_try(m, TRUE)) 884 continue; 885 886 /* 887 * Remaining operations run with the page busy and neither 888 * the page or the queue will be spin-locked. 889 */ 890 KKASSERT(m->queue == PQ_INACTIVE + q); 891 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 892 893 /* 894 * The emergency pager runs when the primary pager gets 895 * stuck, which typically means the primary pager deadlocked 896 * on a vnode-backed page. Therefore, the emergency pager 897 * must skip any complex objects. 898 * 899 * We disallow VNODEs unless they are VCHR whos device ops 900 * does not flag D_NOEMERGPGR. 901 */ 902 if (isep && m->object) { 903 struct vnode *vp; 904 905 switch(m->object->type) { 906 case OBJT_DEFAULT: 907 case OBJT_SWAP: 908 /* 909 * Allow anonymous memory and assume that 910 * swap devices are not complex, since its 911 * kinda worthless if we can't swap out dirty 912 * anonymous pages. 913 */ 914 break; 915 case OBJT_VNODE: 916 /* 917 * Allow VCHR device if the D_NOEMERGPGR 918 * flag is not set, deny other vnode types 919 * as being too complex. 920 */ 921 vp = m->object->handle; 922 if (vp && vp->v_type == VCHR && 923 vp->v_rdev && vp->v_rdev->si_ops && 924 (vp->v_rdev->si_ops->head.flags & 925 D_NOEMERGPGR) == 0) { 926 break; 927 } 928 /* Deny - fall through */ 929 default: 930 /* 931 * Deny 932 */ 933 vm_page_wakeup(m); 934 vm_page_queues_spin_lock(PQ_INACTIVE + q); 935 lwkt_yield(); 936 continue; 937 } 938 } 939 940 /* 941 * Try to pageout the page and perhaps other nearby pages. 942 * We want to get the pages into the cache eventually ( 943 * first or second pass). Otherwise the pages can wind up 944 * just cycling in the inactive queue, getting flushed over 945 * and over again. 946 * 947 * Generally speaking we recycle dirty pages within PQ_INACTIVE 948 * twice (double LRU) before paging them out. If the 949 * memuse_mode is >= 3 we run them single-LRU like we do clean 950 * pages. 951 */ 952 if (vm_pageout_memuse_mode >= 3) 953 vm_page_flag_set(m, PG_WINATCFLS); 954 955 vmflush_flags = 0; 956 if (vm_pageout_allow_active) 957 vmflush_flags |= OBJPC_ALLOW_ACTIVE; 958 if (m->flags & PG_WINATCFLS) 959 vmflush_flags |= OBJPC_TRY_TO_CACHE; 960 count = vm_pageout_page(m, &max_launder, vnodes_skipped, 961 &vpfailed, pass, vmflush_flags, counts); 962 delta += count; 963 964 /* 965 * Systems with a ton of memory can wind up with huge 966 * deactivation counts. Because the inactive scan is 967 * doing a lot of flushing, the combination can result 968 * in excessive paging even in situations where other 969 * unrelated threads free up sufficient VM. 970 * 971 * To deal with this we abort the nominal active->inactive 972 * scan before we hit the inactive target when free+cache 973 * levels have reached a reasonable target. 
974 * 975 * When deciding to stop early we need to add some slop to 976 * the test and we need to return full completion to the caller 977 * to prevent the caller from thinking there is something 978 * wrong and issuing a low-memory+swap warning or pkill. 979 * 980 * A deficit forces paging regardless of the state of the 981 * VM page queues (used for RSS enforcement). 982 */ 983 lwkt_yield(); 984 vm_page_queues_spin_lock(PQ_INACTIVE + q); 985 if (vm_paging_target() < -vm_max_launder) { 986 /* 987 * Stopping early, return full completion to caller. 988 */ 989 if (delta < avail_shortage) 990 delta = avail_shortage; 991 break; 992 } 993 } 994 995 /* page queue still spin-locked */ 996 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 997 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 998 999 return (delta); 1000 } 1001 1002 /* 1003 * Pageout the specified page, return the total number of pages paged out 1004 * (this routine may cluster). 1005 * 1006 * The page must be busied and soft-busied by the caller and will be disposed 1007 * of by this function. 1008 */ 1009 static int 1010 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp, 1011 struct vnode **vpfailedp, int pass, int vmflush_flags, 1012 long *counts) 1013 { 1014 vm_object_t object; 1015 int actcount; 1016 int count = 0; 1017 1018 /* 1019 * Wiring no longer removes a page from its queue. The last unwiring 1020 * will requeue the page. Obviously wired pages cannot be paged out 1021 * so unqueue it and return. 1022 */ 1023 if (m->wire_count) { 1024 vm_page_unqueue_nowakeup(m); 1025 vm_page_wakeup(m); 1026 return 0; 1027 } 1028 1029 /* 1030 * A held page may be undergoing I/O, so skip it. 1031 */ 1032 if (m->hold_count) { 1033 vm_page_and_queue_spin_lock(m); 1034 if (m->queue - m->pc == PQ_INACTIVE) { 1035 TAILQ_REMOVE( 1036 &vm_page_queues[m->queue].pl, m, pageq); 1037 TAILQ_INSERT_TAIL( 1038 &vm_page_queues[m->queue].pl, m, pageq); 1039 } 1040 vm_page_and_queue_spin_unlock(m); 1041 vm_page_wakeup(m); 1042 return 0; 1043 } 1044 1045 if (m->object == NULL || m->object->ref_count == 0) { 1046 /* 1047 * If the object is not being used, we ignore previous 1048 * references. 1049 */ 1050 vm_page_flag_clear(m, PG_REFERENCED); 1051 pmap_clear_reference(m); 1052 /* fall through to end */ 1053 } else if (((m->flags & PG_REFERENCED) == 0) && 1054 (actcount = pmap_ts_referenced(m))) { 1055 /* 1056 * Otherwise, if the page has been referenced while 1057 * in the inactive queue, we bump the "activation 1058 * count" upwards, making it less likely that the 1059 * page will be added back to the inactive queue 1060 * prematurely again. Here we check the page tables 1061 * (or emulated bits, if any), given the upper level 1062 * VM system not knowing anything about existing 1063 * references. 1064 */ 1065 ++counts[3]; 1066 vm_page_activate(m); 1067 m->act_count += (actcount + ACT_ADVANCE); 1068 vm_page_wakeup(m); 1069 return 0; 1070 } 1071 1072 /* 1073 * (m) is still busied. 1074 * 1075 * If the upper level VM system knows about any page 1076 * references, we activate the page. We also set the 1077 * "activation count" higher than normal so that we will less 1078 * likely place pages back onto the inactive queue again. 
1079 */ 1080 if ((m->flags & PG_REFERENCED) != 0) { 1081 vm_page_flag_clear(m, PG_REFERENCED); 1082 actcount = pmap_ts_referenced(m); 1083 vm_page_activate(m); 1084 m->act_count += (actcount + ACT_ADVANCE + 1); 1085 vm_page_wakeup(m); 1086 ++counts[3]; 1087 return 0; 1088 } 1089 1090 /* 1091 * If the upper level VM system doesn't know anything about 1092 * the page being dirty, we have to check for it again. As 1093 * far as the VM code knows, any partially dirty pages are 1094 * fully dirty. 1095 * 1096 * Pages marked PG_WRITEABLE may be mapped into the user 1097 * address space of a process running on another cpu. A 1098 * user process (without holding the MP lock) running on 1099 * another cpu may be able to touch the page while we are 1100 * trying to remove it. vm_page_cache() will handle this 1101 * case for us. 1102 */ 1103 if (m->dirty == 0) { 1104 vm_page_test_dirty(m); 1105 } else { 1106 vm_page_dirty(m); 1107 } 1108 1109 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1110 /* 1111 * Invalid pages can be easily freed 1112 */ 1113 vm_pageout_page_free(m); 1114 mycpu->gd_cnt.v_dfree++; 1115 ++count; 1116 ++counts[1]; 1117 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1118 /* 1119 * Clean pages can be placed onto the cache queue. 1120 * This effectively frees them. 1121 */ 1122 vm_page_cache(m); 1123 ++count; 1124 ++counts[1]; 1125 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { 1126 /* 1127 * Dirty pages need to be paged out, but flushing 1128 * a page is extremely expensive verses freeing 1129 * a clean page. Rather then artificially limiting 1130 * the number of pages we can flush, we instead give 1131 * dirty pages extra priority on the inactive queue 1132 * by forcing them to be cycled through the queue 1133 * twice before being flushed, after which the 1134 * (now clean) page will cycle through once more 1135 * before being freed. This significantly extends 1136 * the thrash point for a heavily loaded machine. 1137 */ 1138 ++counts[2]; 1139 vm_page_flag_set(m, PG_WINATCFLS); 1140 vm_page_and_queue_spin_lock(m); 1141 if (m->queue - m->pc == PQ_INACTIVE) { 1142 TAILQ_REMOVE( 1143 &vm_page_queues[m->queue].pl, m, pageq); 1144 TAILQ_INSERT_TAIL( 1145 &vm_page_queues[m->queue].pl, m, pageq); 1146 } 1147 vm_page_and_queue_spin_unlock(m); 1148 vm_page_wakeup(m); 1149 } else if (*max_launderp > 0) { 1150 /* 1151 * We always want to try to flush some dirty pages if 1152 * we encounter them, to keep the system stable. 1153 * Normally this number is small, but under extreme 1154 * pressure where there are insufficient clean pages 1155 * on the inactive queue, we may have to go all out. 1156 */ 1157 int swap_pageouts_ok; 1158 struct vnode *vp = NULL; 1159 1160 if ((m->flags & PG_WINATCFLS) == 0) 1161 vm_page_flag_set(m, PG_WINATCFLS); 1162 swap_pageouts_ok = 0; 1163 object = m->object; 1164 if (object && 1165 (object->type != OBJT_SWAP) && 1166 (object->type != OBJT_DEFAULT)) { 1167 swap_pageouts_ok = 1; 1168 } else { 1169 swap_pageouts_ok = !(defer_swap_pageouts || 1170 disable_swap_pageouts); 1171 swap_pageouts_ok |= (!disable_swap_pageouts && 1172 defer_swap_pageouts && 1173 vm_page_count_min(0)); 1174 } 1175 1176 /* 1177 * We don't bother paging objects that are "dead". 1178 * Those objects are in a "rundown" state. 
1179 */ 1180 if (!swap_pageouts_ok || 1181 (object == NULL) || 1182 (object->flags & OBJ_DEAD)) { 1183 vm_page_and_queue_spin_lock(m); 1184 if (m->queue - m->pc == PQ_INACTIVE) { 1185 TAILQ_REMOVE( 1186 &vm_page_queues[m->queue].pl, 1187 m, pageq); 1188 TAILQ_INSERT_TAIL( 1189 &vm_page_queues[m->queue].pl, 1190 m, pageq); 1191 } 1192 vm_page_and_queue_spin_unlock(m); 1193 vm_page_wakeup(m); 1194 return 0; 1195 } 1196 1197 /* 1198 * (m) is still busied. 1199 * 1200 * The object is already known NOT to be dead. It 1201 * is possible for the vget() to block the whole 1202 * pageout daemon, but the new low-memory handling 1203 * code should prevent it. 1204 * 1205 * The previous code skipped locked vnodes and, worse, 1206 * reordered pages in the queue. This results in 1207 * completely non-deterministic operation because, 1208 * quite often, a vm_fault has initiated an I/O and 1209 * is holding a locked vnode at just the point where 1210 * the pageout daemon is woken up. 1211 * 1212 * We can't wait forever for the vnode lock, we might 1213 * deadlock due to a vn_read() getting stuck in 1214 * vm_wait while holding this vnode. We skip the 1215 * vnode if we can't get it in a reasonable amount 1216 * of time. 1217 * 1218 * vpfailed is used to (try to) avoid the case where 1219 * a large number of pages are associated with a 1220 * locked vnode, which could cause the pageout daemon 1221 * to stall for an excessive amount of time. 1222 */ 1223 if (object->type == OBJT_VNODE) { 1224 int flags; 1225 1226 vp = object->handle; 1227 flags = LK_EXCLUSIVE; 1228 if (vp == *vpfailedp) 1229 flags |= LK_NOWAIT; 1230 else 1231 flags |= LK_TIMELOCK; 1232 vm_page_hold(m); 1233 vm_page_wakeup(m); 1234 1235 /* 1236 * We have unbusied (m) temporarily so we can 1237 * acquire the vp lock without deadlocking. 1238 * (m) is held to prevent destruction. 1239 */ 1240 if (vget(vp, flags) != 0) { 1241 *vpfailedp = vp; 1242 ++pageout_lock_miss; 1243 if (object->flags & OBJ_MIGHTBEDIRTY) 1244 ++*vnodes_skippedp; 1245 vm_page_unhold(m); 1246 return 0; 1247 } 1248 1249 /* 1250 * The page might have been moved to another 1251 * queue during potential blocking in vget() 1252 * above. The page might have been freed and 1253 * reused for another vnode. The object might 1254 * have been reused for another vnode. 1255 */ 1256 if (m->queue - m->pc != PQ_INACTIVE || 1257 m->object != object || 1258 object->handle != vp) { 1259 if (object->flags & OBJ_MIGHTBEDIRTY) 1260 ++*vnodes_skippedp; 1261 vput(vp); 1262 vm_page_unhold(m); 1263 return 0; 1264 } 1265 1266 /* 1267 * The page may have been busied during the 1268 * blocking in vput(); We don't move the 1269 * page back onto the end of the queue so that 1270 * statistics are more correct if we don't. 1271 */ 1272 if (vm_page_busy_try(m, TRUE)) { 1273 vput(vp); 1274 vm_page_unhold(m); 1275 return 0; 1276 } 1277 vm_page_unhold(m); 1278 1279 /* 1280 * If it was wired while we didn't own it. 1281 */ 1282 if (m->wire_count) { 1283 vm_page_unqueue_nowakeup(m); 1284 vput(vp); 1285 vm_page_wakeup(m); 1286 return 0; 1287 } 1288 1289 /* 1290 * (m) is busied again 1291 * 1292 * We own the busy bit and remove our hold 1293 * bit. If the page is still held it 1294 * might be undergoing I/O, so skip it. 
1295 */ 1296 if (m->hold_count) { 1297 rebusy_failed: 1298 vm_page_and_queue_spin_lock(m); 1299 if (m->queue - m->pc == PQ_INACTIVE) { 1300 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 1301 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq); 1302 } 1303 vm_page_and_queue_spin_unlock(m); 1304 if (object->flags & OBJ_MIGHTBEDIRTY) 1305 ++*vnodes_skippedp; 1306 vm_page_wakeup(m); 1307 vput(vp); 1308 return 0; 1309 } 1310 1311 /* 1312 * Recheck queue, object, and vp now that we have 1313 * rebusied the page. 1314 */ 1315 if (m->queue - m->pc != PQ_INACTIVE || 1316 m->object != object || 1317 object->handle != vp) { 1318 kprintf("vm_pageout_page: " 1319 "rebusy %p failed(A)\n", 1320 m); 1321 goto rebusy_failed; 1322 } 1323 1324 /* 1325 * Check page validity 1326 */ 1327 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1328 kprintf("vm_pageout_page: " 1329 "rebusy %p failed(B)\n", 1330 m); 1331 goto rebusy_failed; 1332 } 1333 if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1334 kprintf("vm_pageout_page: " 1335 "rebusy %p failed(C)\n", 1336 m); 1337 goto rebusy_failed; 1338 } 1339 1340 /* (m) is left busied as we fall through */ 1341 } 1342 1343 /* 1344 * page is busy and not held here. 1345 * 1346 * If a page is dirty, then it is either being washed 1347 * (but not yet cleaned) or it is still in the 1348 * laundry. If it is still in the laundry, then we 1349 * start the cleaning operation. 1350 * 1351 * decrement inactive_shortage on success to account 1352 * for the (future) cleaned page. Otherwise we 1353 * could wind up laundering or cleaning too many 1354 * pages. 1355 * 1356 * NOTE: Cleaning the page here does not cause 1357 * force_deficit to be adjusted, because the 1358 * page is not being freed or moved to the 1359 * cache. 1360 */ 1361 count = vm_pageout_clean_helper(m, vmflush_flags); 1362 counts[0] += count; 1363 *max_launderp -= count; 1364 1365 /* 1366 * Clean ate busy, page no longer accessible 1367 */ 1368 if (vp != NULL) 1369 vput(vp); 1370 } else { 1371 vm_page_wakeup(m); 1372 } 1373 return count; 1374 } 1375 1376 /* 1377 * Scan active queue 1378 * 1379 * WARNING! Can be called from two pagedaemon threads simultaneously. 1380 */ 1381 static int 1382 vm_pageout_scan_active(int pass, int q, 1383 long avail_shortage, long inactive_shortage, 1384 long *recycle_countp) 1385 { 1386 struct vm_page marker; 1387 vm_page_t m; 1388 int actcount; 1389 long delta = 0; 1390 long maxscan; 1391 int isep; 1392 1393 isep = (curthread == emergpager); 1394 1395 /* 1396 * We want to move pages from the active queue to the inactive 1397 * queue to get the inactive queue to the inactive target. If 1398 * we still have a page shortage from above we try to directly free 1399 * clean pages instead of moving them. 1400 * 1401 * If we do still have a shortage we keep track of the number of 1402 * pages we free or cache (recycle_count) as a measure of thrashing 1403 * between the active and inactive queues. 1404 * 1405 * If we were able to completely satisfy the free+cache targets 1406 * from the inactive pool we limit the number of pages we move 1407 * from the active pool to the inactive pool to 2x the pages we 1408 * had removed from the inactive pool (with a minimum of 1/5 the 1409 * inactive target). If we were not able to completely satisfy 1410 * the free+cache targets we go for the whole target aggressively. 1411 * 1412 * NOTE: Both variables can end up negative. 1413 * NOTE: We are still in a critical section. 1414 * 1415 * NOTE! 
THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 1416 * PAGES. 1417 */ 1418 1419 bzero(&marker, sizeof(marker)); 1420 marker.flags = PG_FICTITIOUS | PG_MARKER; 1421 marker.busy_count = PBUSY_LOCKED; 1422 marker.queue = PQ_ACTIVE + q; 1423 marker.pc = q; 1424 marker.wire_count = 1; 1425 1426 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1427 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1428 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1; 1429 1430 /* 1431 * Queue locked at top of loop to avoid stack marker issues. 1432 */ 1433 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1434 maxscan-- > 0 && (avail_shortage - delta > 0 || 1435 inactive_shortage > 0)) 1436 { 1437 KKASSERT(m->queue == PQ_ACTIVE + q); 1438 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1439 &marker, pageq); 1440 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1441 &marker, pageq); 1442 1443 /* 1444 * Skip marker pages (atomic against other markers to avoid 1445 * infinite hop-over scans). 1446 */ 1447 if (m->flags & PG_MARKER) 1448 continue; 1449 1450 /* 1451 * Try to busy the page. Don't mess with pages which are 1452 * already busy or reorder them in the queue. 1453 */ 1454 if (vm_page_busy_try(m, TRUE)) 1455 continue; 1456 1457 /* 1458 * Remaining operations run with the page busy and neither 1459 * the page or the queue will be spin-locked. 1460 */ 1461 KKASSERT(m->queue == PQ_ACTIVE + q); 1462 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1463 1464 #if 0 1465 /* 1466 * Don't deactivate pages that are held, even if we can 1467 * busy them. (XXX why not?) 1468 */ 1469 if (m->hold_count) { 1470 vm_page_and_queue_spin_lock(m); 1471 if (m->queue - m->pc == PQ_ACTIVE) { 1472 TAILQ_REMOVE( 1473 &vm_page_queues[PQ_ACTIVE + q].pl, 1474 m, pageq); 1475 TAILQ_INSERT_TAIL( 1476 &vm_page_queues[PQ_ACTIVE + q].pl, 1477 m, pageq); 1478 } 1479 vm_page_and_queue_spin_unlock(m); 1480 vm_page_wakeup(m); 1481 goto next; 1482 } 1483 #endif 1484 /* 1485 * We can just remove wired pages from the queue 1486 */ 1487 if (m->wire_count) { 1488 vm_page_unqueue_nowakeup(m); 1489 vm_page_wakeup(m); 1490 goto next; 1491 } 1492 1493 /* 1494 * The emergency pager ignores vnode-backed pages as these 1495 * are the pages that probably bricked the main pager. 1496 */ 1497 if (isep && m->object && m->object->type == OBJT_VNODE) { 1498 vm_page_and_queue_spin_lock(m); 1499 if (m->queue - m->pc == PQ_ACTIVE) { 1500 TAILQ_REMOVE( 1501 &vm_page_queues[PQ_ACTIVE + q].pl, 1502 m, pageq); 1503 TAILQ_INSERT_TAIL( 1504 &vm_page_queues[PQ_ACTIVE + q].pl, 1505 m, pageq); 1506 } 1507 vm_page_and_queue_spin_unlock(m); 1508 vm_page_wakeup(m); 1509 goto next; 1510 } 1511 1512 /* 1513 * The count for pagedaemon pages is done after checking the 1514 * page for eligibility... 1515 */ 1516 mycpu->gd_cnt.v_pdpages++; 1517 1518 /* 1519 * Check to see "how much" the page has been used and clear 1520 * the tracking access bits. If the object has no references 1521 * don't bother paying the expense. 1522 */ 1523 actcount = 0; 1524 if (m->object && m->object->ref_count != 0) { 1525 if (m->flags & PG_REFERENCED) 1526 ++actcount; 1527 actcount += pmap_ts_referenced(m); 1528 if (actcount) { 1529 m->act_count += ACT_ADVANCE + actcount; 1530 if (m->act_count > ACT_MAX) 1531 m->act_count = ACT_MAX; 1532 } 1533 } 1534 vm_page_flag_clear(m, PG_REFERENCED); 1535 1536 /* 1537 * actcount is only valid if the object ref_count is non-zero. 1538 * If the page does not have an object, actcount will be zero. 
1539 */ 1540 if (actcount && m->object->ref_count != 0) { 1541 vm_page_and_queue_spin_lock(m); 1542 if (m->queue - m->pc == PQ_ACTIVE) { 1543 TAILQ_REMOVE( 1544 &vm_page_queues[PQ_ACTIVE + q].pl, 1545 m, pageq); 1546 TAILQ_INSERT_TAIL( 1547 &vm_page_queues[PQ_ACTIVE + q].pl, 1548 m, pageq); 1549 } 1550 vm_page_and_queue_spin_unlock(m); 1551 vm_page_wakeup(m); 1552 } else { 1553 switch(m->object->type) { 1554 case OBJT_DEFAULT: 1555 case OBJT_SWAP: 1556 m->act_count -= min(m->act_count, 1557 vm_anonmem_decline); 1558 break; 1559 default: 1560 m->act_count -= min(m->act_count, 1561 vm_filemem_decline); 1562 break; 1563 } 1564 if (vm_pageout_algorithm || 1565 (m->object == NULL) || 1566 (m->object && (m->object->ref_count == 0)) || 1567 m->act_count < pass + 1 1568 ) { 1569 /* 1570 * Deactivate the page. If we had a 1571 * shortage from our inactive scan try to 1572 * free (cache) the page instead. 1573 * 1574 * Don't just blindly cache the page if 1575 * we do not have a shortage from the 1576 * inactive scan, that could lead to 1577 * gigabytes being moved. 1578 */ 1579 --inactive_shortage; 1580 if (avail_shortage - delta > 0 || 1581 (m->object && (m->object->ref_count == 0))) 1582 { 1583 if (avail_shortage - delta > 0) 1584 ++*recycle_countp; 1585 vm_page_protect(m, VM_PROT_NONE); 1586 if (m->dirty == 0 && 1587 (m->flags & PG_NEED_COMMIT) == 0 && 1588 avail_shortage - delta > 0) { 1589 vm_page_cache(m); 1590 } else { 1591 vm_page_deactivate(m); 1592 vm_page_wakeup(m); 1593 } 1594 } else { 1595 vm_page_deactivate(m); 1596 vm_page_wakeup(m); 1597 } 1598 ++delta; 1599 } else { 1600 vm_page_and_queue_spin_lock(m); 1601 if (m->queue - m->pc == PQ_ACTIVE) { 1602 TAILQ_REMOVE( 1603 &vm_page_queues[PQ_ACTIVE + q].pl, 1604 m, pageq); 1605 TAILQ_INSERT_TAIL( 1606 &vm_page_queues[PQ_ACTIVE + q].pl, 1607 m, pageq); 1608 } 1609 vm_page_and_queue_spin_unlock(m); 1610 vm_page_wakeup(m); 1611 } 1612 } 1613 next: 1614 lwkt_yield(); 1615 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1616 } 1617 1618 /* 1619 * Clean out our local marker. 1620 * 1621 * Page queue still spin-locked. 1622 */ 1623 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1624 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1625 1626 return (delta); 1627 } 1628 1629 /* 1630 * The number of actually free pages can drop down to v_free_reserved, 1631 * we try to build the free count back above v_free_min. Note that 1632 * vm_paging_needed() also returns TRUE if v_free_count is not at 1633 * least v_free_min so that is the minimum we must build the free 1634 * count to. 1635 * 1636 * We use a slightly higher target to improve hysteresis, 1637 * ((v_free_target + v_free_min) / 2). Since v_free_target 1638 * is usually the same as v_cache_min this maintains about 1639 * half the pages in the free queue as are in the cache queue, 1640 * providing pretty good pipelining for pageout operation. 1641 * 1642 * The system operator can manipulate vm.v_cache_min and 1643 * vm.v_free_target to tune the pageout demon. Be sure 1644 * to keep vm.v_free_min < vm.v_free_target. 1645 * 1646 * Note that the original paging target is to get at least 1647 * (free_min + cache_min) into (free + cache). The slightly 1648 * higher target will shift additional pages from cache to free 1649 * without effecting the original paging target in order to 1650 * maintain better hysteresis and not have the free count always 1651 * be dead-on v_free_min. 1652 * 1653 * NOTE: we are still in a critical section. 
1654 * 1655 * Pages moved from PQ_CACHE to totally free are not counted in the 1656 * pages_freed counter. 1657 * 1658 * WARNING! Can be called from two pagedaemon threads simultaneously. 1659 */ 1660 static void 1661 vm_pageout_scan_cache(long avail_shortage, int pass, 1662 long vnodes_skipped, long recycle_count) 1663 { 1664 static int lastkillticks; 1665 struct vm_pageout_scan_info info; 1666 vm_page_t m; 1667 int isep; 1668 1669 isep = (curthread == emergpager); 1670 1671 while (vmstats.v_free_count < 1672 (vmstats.v_free_min + vmstats.v_free_target) / 2) { 1673 /* 1674 * This steals some code from vm/vm_page.c 1675 * 1676 * Create two rovers and adjust the code to reduce 1677 * chances of them winding up at the same index (which 1678 * can cause a lot of contention). 1679 */ 1680 static int cache_rover[2] = { 0, PQ_L2_MASK / 2 }; 1681 1682 if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0) 1683 goto next_rover; 1684 1685 m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK); 1686 if (m == NULL) 1687 break; 1688 /* 1689 * page is returned removed from its queue and spinlocked 1690 * 1691 * If the busy attempt fails we can still deactivate the page. 1692 */ 1693 if (vm_page_busy_try(m, TRUE)) { 1694 vm_page_deactivate_locked(m); 1695 vm_page_spin_unlock(m); 1696 continue; 1697 } 1698 vm_page_spin_unlock(m); 1699 pagedaemon_wakeup(); 1700 lwkt_yield(); 1701 1702 /* 1703 * Remaining operations run with the page busy and neither 1704 * the page or the queue will be spin-locked. 1705 */ 1706 if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) || 1707 m->hold_count || 1708 m->wire_count) { 1709 vm_page_deactivate(m); 1710 vm_page_wakeup(m); 1711 continue; 1712 } 1713 1714 /* 1715 * Because the page is in the cache, it shouldn't be mapped. 1716 */ 1717 pmap_mapped_sync(m); 1718 KKASSERT((m->flags & PG_MAPPED) == 0); 1719 KKASSERT(m->dirty == 0); 1720 vm_pageout_page_free(m); 1721 mycpu->gd_cnt.v_dfree++; 1722 next_rover: 1723 if (isep) 1724 cache_rover[1] -= PQ_PRIME2; 1725 else 1726 cache_rover[0] += PQ_PRIME2; 1727 } 1728 1729 #if !defined(NO_SWAPPING) 1730 /* 1731 * Idle process swapout -- run once per second. 1732 */ 1733 if (vm_swap_idle_enabled) { 1734 static time_t lsec; 1735 if (time_uptime != lsec) { 1736 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE); 1737 vm_req_vmdaemon(); 1738 lsec = time_uptime; 1739 } 1740 } 1741 #endif 1742 1743 /* 1744 * If we didn't get enough free pages, and we have skipped a vnode 1745 * in a writeable object, wakeup the sync daemon. And kick swapout 1746 * if we did not get enough free pages. 1747 */ 1748 if (vm_paging_target() > 0) { 1749 if (vnodes_skipped && vm_page_count_min(0)) 1750 speedup_syncer(NULL); 1751 #if !defined(NO_SWAPPING) 1752 if (vm_swap_enabled && vm_page_count_target()) { 1753 atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL); 1754 vm_req_vmdaemon(); 1755 } 1756 #endif 1757 } 1758 1759 /* 1760 * Handle catastrophic conditions. Under good conditions we should 1761 * be at the target, well beyond our minimum. If we could not even 1762 * reach our minimum the system is under heavy stress. But just being 1763 * under heavy stress does not trigger process killing. 1764 * 1765 * We consider ourselves to have run out of memory if the swap pager 1766 * is full and avail_shortage is still positive. The secondary check 1767 * ensures that we do not kill processes if the instantanious 1768 * availability is good, even if the pageout demon pass says it 1769 * couldn't get to the target. 1770 * 1771 * NOTE! 
THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL 1772 * SITUATIONS. 1773 */ 1774 if (swap_pager_almost_full && 1775 pass > 0 && 1776 isep == 0 && 1777 (vm_page_count_min(recycle_count) || avail_shortage > 0)) { 1778 kprintf("Warning: system low on memory+swap " 1779 "shortage %ld for %d ticks!\n", 1780 avail_shortage, ticks - swap_fail_ticks); 1781 if (bootverbose) 1782 kprintf("Metrics: spaf=%d spf=%d pass=%d " 1783 "avail=%ld target=%ld last=%u\n", 1784 swap_pager_almost_full, 1785 swap_pager_full, 1786 pass, 1787 avail_shortage, 1788 vm_paging_target(), 1789 (unsigned int)(ticks - lastkillticks)); 1790 } 1791 if (swap_pager_full && 1792 pass > 1 && 1793 isep == 0 && 1794 avail_shortage > 0 && 1795 vm_paging_target() > 0 && 1796 (unsigned int)(ticks - lastkillticks) >= hz) { 1797 /* 1798 * Kill something, maximum rate once per second to give 1799 * the process time to free up sufficient memory. 1800 */ 1801 lastkillticks = ticks; 1802 info.bigproc = NULL; 1803 info.bigsize = 0; 1804 allproc_scan(vm_pageout_scan_callback, &info, 0); 1805 if (info.bigproc != NULL) { 1806 kprintf("Try to kill process %d %s\n", 1807 info.bigproc->p_pid, info.bigproc->p_comm); 1808 info.bigproc->p_nice = PRIO_MIN; 1809 info.bigproc->p_usched->resetpriority( 1810 FIRST_LWP_IN_PROC(info.bigproc)); 1811 atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL); 1812 killproc(info.bigproc, "out of swap space"); 1813 wakeup(&vmstats.v_free_count); 1814 PRELE(info.bigproc); 1815 } 1816 } 1817 } 1818 1819 static int 1820 vm_pageout_scan_callback(struct proc *p, void *data) 1821 { 1822 struct vm_pageout_scan_info *info = data; 1823 vm_offset_t size; 1824 1825 /* 1826 * Never kill system processes or init. If we have configured swap 1827 * then try to avoid killing low-numbered pids. 1828 */ 1829 if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) || 1830 ((p->p_pid < 48) && (vm_swap_size != 0))) { 1831 return (0); 1832 } 1833 1834 lwkt_gettoken(&p->p_token); 1835 1836 /* 1837 * if the process is in a non-running type state, 1838 * don't touch it. 1839 */ 1840 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 1841 lwkt_reltoken(&p->p_token); 1842 return (0); 1843 } 1844 1845 /* 1846 * Get the approximate process size. Note that anonymous pages 1847 * with backing swap will be counted twice, but there should not 1848 * be too many such pages due to the stress the VM system is 1849 * under at this point. 1850 */ 1851 size = vmspace_anonymous_count(p->p_vmspace) + 1852 vmspace_swap_count(p->p_vmspace); 1853 1854 /* 1855 * If the this process is bigger than the biggest one 1856 * remember it. 1857 */ 1858 if (info->bigsize < size) { 1859 if (info->bigproc) 1860 PRELE(info->bigproc); 1861 PHOLD(p); 1862 info->bigproc = p; 1863 info->bigsize = size; 1864 } 1865 lwkt_reltoken(&p->p_token); 1866 lwkt_yield(); 1867 1868 return(0); 1869 } 1870 1871 /* 1872 * This old guy slowly walks PQ_HOLD looking for pages which need to be 1873 * moved back to PQ_FREE. It is possible for pages to accumulate here 1874 * when vm_page_free() races against vm_page_unhold(), resulting in a 1875 * page being left on a PQ_HOLD queue with hold_count == 0. 1876 * 1877 * It is easier to handle this edge condition here, in non-critical code, 1878 * rather than enforce a spin-lock for every 1->0 transition in 1879 * vm_page_unhold(). 1880 * 1881 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue. 

/*
 * This old guy slowly walks PQ_HOLD looking for pages which need to be
 * moved back to PQ_FREE.  It is possible for pages to accumulate here
 * when vm_page_free() races against vm_page_unhold(), resulting in a
 * page being left on a PQ_HOLD queue with hold_count == 0.
 *
 * It is easier to handle this edge condition here, in non-critical code,
 * rather than enforce a spin-lock for every 1->0 transition in
 * vm_page_unhold().
 *
 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
 */
static void
vm_pageout_scan_hold(int q)
{
        vm_page_t m;

        vm_page_queues_spin_lock(PQ_HOLD + q);
        TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
                if (m->flags & PG_MARKER)
                        continue;

                /*
                 * Process one page and return
                 */
                if (m->hold_count)
                        break;
                kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
                vm_page_hold(m);
                vm_page_queues_spin_unlock(PQ_HOLD + q);
                vm_page_unhold(m);      /* reprocess */
                return;
        }
        vm_page_queues_spin_unlock(PQ_HOLD + q);
}

/*
 * This routine tries to maintain the pseudo-LRU active queue so that some
 * statistic accumulation still occurs during long periods in which there
 * is no paging.  This helps the situation where paging just starts to
 * occur.
 */
static void
vm_pageout_page_stats(int q)
{
        static int fullintervalcount = 0;
        struct vm_page marker;
        vm_page_t m;
        long pcount, tpcount;           /* Number of pages to check */
        long page_shortage;

        page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
                         vmstats.v_free_min) -
                        (vmstats.v_free_count + vmstats.v_inactive_count +
                         vmstats.v_cache_count);

        if (page_shortage <= 0)
                return;

        pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
        fullintervalcount += vm_pageout_stats_interval;
        if (fullintervalcount < vm_pageout_full_stats_interval) {
                tpcount = (vm_pageout_stats_max * pcount) /
                          vmstats.v_page_count + 1;
                if (pcount > tpcount)
                        pcount = tpcount;
        } else {
                fullintervalcount = 0;
        }
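
        /*
         * Illustrative note (editor's example, the figures are hypothetical):
         * between full-interval scans only a fraction of the active queue is
         * sampled.  With vm_pageout_stats_max = 25,000, an active queue
         * holding 8,000 pages, and v_page_count = 1,000,000, the clamp above
         * limits the scan to (25000 * 8000) / 1000000 + 1 = 201 pages.
         */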

        bzero(&marker, sizeof(marker));
        marker.flags = PG_FICTITIOUS | PG_MARKER;
        marker.busy_count = PBUSY_LOCKED;
        marker.queue = PQ_ACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;

        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);

        /*
         * Queue locked at top of loop to avoid stack marker issues.
         */
        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               pcount-- > 0)
        {
                int actcount;

                KKASSERT(m->queue == PQ_ACTIVE + q);
                TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
                                   &marker, pageq);

                /*
                 * Skip marker pages (atomic against other markers to avoid
                 * infinite hop-over scans).
                 */
                if (m->flags & PG_MARKER)
                        continue;

                /*
                 * Ignore pages we can't busy
                 */
                if (vm_page_busy_try(m, TRUE))
                        continue;

                /*
                 * Remaining operations run with the page busy and neither
                 * the page nor the queue will be spin-locked.
                 */
                KKASSERT(m->queue == PQ_ACTIVE + q);
                vm_page_queues_spin_unlock(PQ_ACTIVE + q);

                /*
                 * We can just remove wired pages from the queue
                 */
                if (m->wire_count) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_wakeup(m);
                        goto next;
                }

                /*
                 * We now have a safely busied page, the page and queue
                 * spinlocks have been released.
                 *
                 * Ignore held and wired pages
                 */
                if (m->hold_count || m->wire_count) {
                        vm_page_wakeup(m);
                        goto next;
                }

                /*
                 * Calculate activity
                 */
                actcount = 0;
                if (m->flags & PG_REFERENCED) {
                        vm_page_flag_clear(m, PG_REFERENCED);
                        actcount += 1;
                }
                actcount += pmap_ts_referenced(m);

                /*
                 * Update act_count and move page to end of queue.
                 */
                if (actcount) {
                        m->act_count += ACT_ADVANCE + actcount;
                        if (m->act_count > ACT_MAX)
                                m->act_count = ACT_MAX;
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                        goto next;
                }

                if (m->act_count == 0) {
                        /*
                         * We turn off page access, so that we have
                         * more accurate RSS stats.  We don't do this
                         * in the normal page deactivation when the
                         * system is loaded VM wise, because the
                         * cost of the large number of page protect
                         * operations would be higher than the value
                         * of doing the operation.
                         *
                         * We use the marker to save our place so
                         * we can release the spin lock.  Both (m)
                         * and (next) will be invalid.
                         */
                        vm_page_protect(m, VM_PROT_NONE);
                        vm_page_deactivate(m);
                } else {
                        m->act_count -= min(m->act_count, ACT_DECLINE);
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                }
                vm_page_wakeup(m);
next:
                vm_page_queues_spin_lock(PQ_ACTIVE + q);
        }

        /*
         * Remove our local marker
         *
         * Page queue still spin-locked.
         */
        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

static void
vm_pageout_free_page_calc(vm_size_t count)
{
        /*
         * v_free_min            normal allocations
         * v_free_reserved       system allocations
         * v_pageout_free_min    allocations by pageout daemon
         * v_interrupt_free_min  low level allocations (e.g. swap structures)
         *
         * v_free_min is used to generate several other baselines, and they
         * can get pretty silly on systems with a lot of memory.
         */
        vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
        vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
        vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
        vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
        vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
}
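
/*
 * Illustrative note (editor's example, the figures are hypothetical): on a
 * machine with v_page_count = 1,048,576 (4GB of 4KB pages) the formulas
 * above work out to roughly:
 *
 *      v_free_min              = 64 + 1048576 / 200    = 5306 pages (~21MB)
 *      v_free_reserved         = 5306 * 4 / 8 + 7      = 2660
 *      v_free_severe           = 5306 * 4 / 8          = 2653
 *      v_pageout_free_min      = 5306 * 2 / 8 + 7      = 1333
 *      v_interrupt_free_min    = 5306 * 1 / 8 + 7      = 670
 */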

/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFS's.
 */
static void
vm_pageout_thread(void)
{
        int pass;
        int q;
        int q1iterator = 0;
        int q2iterator = 0;
        int q3iterator = 0;
        int isep;

        curthread->td_flags |= TDF_SYSTHREAD;

        /*
         * We only need to setup once.
         */
        isep = 0;
        if (curthread == emergpager) {
                isep = 1;
                goto skip_setup;
        }

        /*
         * Initialize vm_max_launder per pageout pass to be 1/16
         * of total physical memory, plus a little slop.
         */
        if (vm_max_launder == 0)
                vm_max_launder = physmem / 256 + 16;

        /*
         * Initialize some paging parameters.
         */
        vm_pageout_free_page_calc(vmstats.v_page_count);

        /*
         * v_free_target and v_cache_min control pageout hysteresis.  Note
         * that these are more a measure of the VM cache queue hysteresis
         * than the VM free queue.  Specifically, v_free_target is the
         * high water mark (free+cache pages).
         *
         * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
         * low water mark, while v_free_min is the stop.  v_cache_min must
         * be big enough to handle memory needs while the pageout daemon
         * is signalled and run to free more pages.
         */
        vmstats.v_free_target = 4 * vmstats.v_free_min +
                                vmstats.v_free_reserved;

        /*
         * NOTE: With the new buffer cache b_act_count we want the default
         *       inactive target to be a percentage of available memory.
         *
         *       The inactive target essentially determines the minimum
         *       number of 'temporary' pages capable of caching one-time-use
         *       files when the VM system is otherwise full of pages
         *       belonging to multi-time-use files or active program data.
         *
         * NOTE: The inactive target is aggressively pursued only if the
         *       inactive queue becomes too small.  If the inactive queue
         *       is large enough to satisfy page movement to free+cache
         *       then it is repopulated more slowly from the active queue.
         *       This allows a general inactive_target default to be set.
         *
         *       There is an issue here for processes which sit mostly idle
         *       'overnight', such as sshd, tcsh, and X.  Any movement from
         *       the active queue will eventually cause such pages to
         *       recycle, eventually causing a lot of paging in the morning.
         *       To reduce the incidence of this, pages cycled out of the
         *       buffer cache are moved directly to the inactive queue if
         *       they were only used once or twice.
         *
         *       The vfs.vm_cycle_point sysctl can be used to adjust this.
         *       Increasing the value (up to 64) increases the number of
         *       buffer recyclements which go directly to the inactive queue.
         */
        if (vmstats.v_free_count > 2048) {
                vmstats.v_cache_min = vmstats.v_free_target;
                vmstats.v_cache_max = 2 * vmstats.v_cache_min;
        } else {
                vmstats.v_cache_min = 0;
                vmstats.v_cache_max = 0;
        }
        vmstats.v_inactive_target = vmstats.v_free_count / 4;

        /* XXX does not really belong here */
        if (vm_page_max_wired == 0)
                vm_page_max_wired = vmstats.v_free_count / 3;

        if (vm_pageout_stats_max == 0)
                vm_pageout_stats_max = vmstats.v_free_target;
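
        /*
         * Illustrative note (editor's example, continuing the hypothetical
         * 4GB machine above): v_free_target = 4 * 5306 + 2660 = 23,884 pages
         * (~93MB) of free+cache, v_cache_min = 23,884 and v_cache_max =
         * 47,768.  With roughly 1,000,000 pages still free at this point the
         * default inactive target becomes about 250,000 pages (~1GB).
         */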

        /*
         * Set interval in seconds for stats scan.
         */
        if (vm_pageout_stats_interval == 0)
                vm_pageout_stats_interval = 5;
        if (vm_pageout_full_stats_interval == 0)
                vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;

        /*
         * Set maximum free per pass
         */
        if (vm_pageout_stats_free_max == 0)
                vm_pageout_stats_free_max = 5;

        swap_pager_swap_init();
        pass = 0;

        atomic_swap_int(&sequence_emerg_pager, 1);
        wakeup(&sequence_emerg_pager);

skip_setup:
        /*
         * Sequence emergency pager startup
         */
        if (isep) {
                while (sequence_emerg_pager == 0)
                        tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
        }

        /*
         * The pageout daemon is never done, so loop forever.
         *
         * WARNING! This code is being executed by two kernel threads
         *          potentially simultaneously.
         */
        while (TRUE) {
                int error;
                long avail_shortage;
                long inactive_shortage;
                long vnodes_skipped = 0;
                long recycle_count = 0;
                long tmp;

                /*
                 * Wait for an action request.  If we timeout check to
                 * see if paging is needed (in case the normal wakeup
                 * code raced us).
                 */
                if (isep) {
                        /*
                         * Emergency pagedaemon monitors the primary
                         * pagedaemon while vm_pages_needed != 0.
                         *
                         * The emergency pagedaemon only runs if VM paging
                         * is needed and the primary pagedaemon has not
                         * updated vm_pagedaemon_time for more than 2 seconds.
                         */
                        if (vm_pages_needed)
                                tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
                        else
                                tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
                        if (vm_pages_needed == 0) {
                                pass = 0;
                                continue;
                        }
                        if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
                                pass = 0;
                                continue;
                        }
                } else {
                        /*
                         * Primary pagedaemon
                         *
                         * NOTE: We unconditionally cleanup PQ_HOLD even
                         *       when there is no work to do.
                         */
                        vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
                        ++q3iterator;

                        if (vm_pages_needed == 0) {
                                error = tsleep(&vm_pages_needed,
                                               0, "psleep",
                                               vm_pageout_stats_interval * hz);
                                if (error &&
                                    vm_paging_needed(0) == 0 &&
                                    vm_pages_needed == 0) {
                                        for (q = 0; q < PQ_L2_SIZE; ++q)
                                                vm_pageout_page_stats(q);
                                        continue;
                                }
                                vm_pagedaemon_time = ticks;
                                vm_pages_needed = 1;

                                /*
                                 * Wake the emergency pagedaemon up so it
                                 * can monitor us.  It will automatically
                                 * go back into a long sleep when
                                 * vm_pages_needed returns to 0.
                                 */
                                wakeup(&vm_pagedaemon_time);
                        }
                }

                mycpu->gd_cnt.v_pdwakeups++;
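
                /*
                 * Illustrative note (editor's example, assuming the common
                 * hz = 100): while vm_pages_needed is set the emergency
                 * daemon wakes every second and only starts scanning if the
                 * primary daemon has not updated vm_pagedaemon_time for
                 * 2 * hz = 200 ticks, i.e. it has apparently been stuck for
                 * about two seconds.  An idle primary daemon sleeps for
                 * vm_pageout_stats_interval (default 5 seconds) and runs
                 * vm_pageout_page_stats() when it times out with no paging
                 * needed.
                 */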

                /*
                 * Scan for INACTIVE->CLEAN/PAGEOUT
                 *
                 * This routine tries to avoid thrashing the system with
                 * unnecessary activity.
                 *
                 * Calculate our target for the number of free+cache pages we
                 * want to get to.  This is higher than the number that causes
                 * allocations to stall (severe) in order to provide
                 * hysteresis, and if we don't make it all the way but get to
                 * the minimum we're happy.  Goose it a bit if there are
                 * multiple requests for memory.
                 *
                 * Don't reduce avail_shortage inside the loop or the
                 * PQAVERAGE() calculation will break.
                 *
                 * NOTE! deficit is differentiated from avail_shortage as
                 *       REQUIRING at least (deficit) pages to be cleaned,
                 *       even if the page queues are in good shape.  This
                 *       is used primarily for handling per-process
                 *       RLIMIT_RSS and may also see small values when
                 *       processes block due to low memory.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                avail_shortage = vm_paging_target() + vm_pageout_deficit;
                vm_pageout_deficit = 0;

                if (avail_shortage > 0) {
                        long delta = 0;
                        long counts[4] = { 0, 0, 0, 0 };
                        int qq;

                        if (vm_pageout_debug) {
                                kprintf("scan_inactive pass %d isep=%d\t",
                                        pass / MAXSCAN_DIVIDER, isep);
                        }

                        qq = q1iterator;
                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                delta += vm_pageout_scan_inactive(
                                                pass / MAXSCAN_DIVIDER,
                                                qq & PQ_L2_MASK,
                                                PQAVERAGE(avail_shortage),
                                                &vnodes_skipped, counts);
                                if (isep)
                                        --qq;
                                else
                                        ++qq;
                                if (avail_shortage - delta <= 0)
                                        break;

                                /*
                                 * It is possible for avail_shortage to be
                                 * very large.  If a large program exits or
                                 * frees a ton of memory all at once, we do
                                 * not have to continue deactivations.
                                 *
                                 * (We will still run the active->inactive
                                 * target, however).
                                 */
                                if (!vm_page_count_target() &&
                                    !vm_page_count_min(
                                                vm_page_free_hysteresis)) {
                                        avail_shortage = 0;
                                        break;
                                }
                        }
                        if (vm_pageout_debug) {
                                kprintf("flushed %ld cleaned %ld "
                                        "lru2 %ld react %ld "
                                        "delta %ld\n",
                                        counts[0], counts[1],
                                        counts[2], counts[3],
                                        delta);
                        }
                        avail_shortage -= delta;
                        q1iterator = qq;
                }

                /*
                 * Figure out how many active pages we must deactivate.  If
                 * we were able to reach our target with just the inactive
                 * scan above we limit the number of active pages we
                 * deactivate to reduce unnecessary work.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                inactive_shortage = vmstats.v_inactive_target -
                                    vmstats.v_inactive_count;

                /*
                 * If we were unable to free sufficient inactive pages to
                 * satisfy the free/cache queue requirements then simply
                 * reaching the inactive target may not be good enough.
                 * Try to deactivate pages in excess of the target based
                 * on the shortfall.
                 *
                 * However, to prevent thrashing the VM system, do not
                 * deactivate more than an additional 1/10 the inactive
                 * target's worth of active pages.
                 */
                if (avail_shortage > 0) {
                        tmp = avail_shortage * 2;
                        if (tmp > vmstats.v_inactive_target / 10)
                                tmp = vmstats.v_inactive_target / 10;
                        inactive_shortage += tmp;
                }

                /*
                 * Only trigger a pmap cleanup on inactive shortage.
                 */
                if (isep == 0 && inactive_shortage > 0) {
                        pmap_collect();
                }
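
                /*
                 * Illustrative note (editor's example, the figures are
                 * hypothetical): if the inactive scan leaves avail_shortage
                 * at 1,000 pages and v_inactive_target is 250,000 pages, the
                 * extra deactivation applied above is
                 * min(2 * 1000, 250000 / 10) = 2,000 pages on top of the
                 * normal inactive_shortage.
                 */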

                /*
                 * Scan for ACTIVE->INACTIVE
                 *
                 * Only trigger on inactive shortage.  Triggering on
                 * avail_shortage can starve the active queue with
                 * unnecessary active->inactive transitions and destroy
                 * performance.
                 *
                 * If this is the emergency pager, always try to move
                 * a few pages from active to inactive because the inactive
                 * queue might have enough pages, but not enough anonymous
                 * pages.
                 */
                if (isep && inactive_shortage < vm_emerg_launder)
                        inactive_shortage = vm_emerg_launder;

                if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
                        long delta = 0;
                        int qq;

                        qq = q2iterator;
                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                delta += vm_pageout_scan_active(
                                                pass / MAXSCAN_DIVIDER,
                                                qq & PQ_L2_MASK,
                                                PQAVERAGE(avail_shortage),
                                                PQAVERAGE(inactive_shortage),
                                                &recycle_count);
                                if (isep)
                                        --qq;
                                else
                                        ++qq;
                                if (inactive_shortage - delta <= 0 &&
                                    avail_shortage - delta <= 0) {
                                        break;
                                }

                                /*
                                 * inactive_shortage can be a very large
                                 * number.  This is intended to break out
                                 * early if our inactive_target has been
                                 * reached due to other system activity.
                                 */
                                if (vmstats.v_inactive_count >
                                    vmstats.v_inactive_target) {
                                        inactive_shortage = 0;
                                        break;
                                }
                        }
                        inactive_shortage -= delta;
                        avail_shortage -= delta;
                        q2iterator = qq;
                }

                /*
                 * Scan for CACHE->FREE
                 *
                 * Finally free enough cache pages to meet our free page
                 * requirement and take more drastic measures if we are
                 * still in trouble.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                vm_pageout_scan_cache(avail_shortage, pass / MAXSCAN_DIVIDER,
                                      vnodes_skipped, recycle_count);

                /*
                 * This is a bit sophisticated because we do not necessarily
                 * want to force paging until our targets are reached if we
                 * were able to successfully retire the shortage we calculated.
                 */
                if (avail_shortage > 0) {
                        /*
                         * If we did not retire enough pages, continue the
                         * pageout operation until we are able to.  It
                         * takes MAXSCAN_DIVIDER passes to cover the entire
                         * inactive list.
                         */
                        ++pass;

                        if (pass / MAXSCAN_DIVIDER < 10 &&
                            vm_pages_needed > 1) {
                                /*
                                 * Normal operation, additional processes
                                 * have already kicked us.  Retry immediately
                                 * unless swap space is completely full in
                                 * which case delay a bit.
                                 */
                                if (swap_pager_full) {
                                        tsleep(&vm_pages_needed, 0, "pdelay",
                                               hz / 5);
                                }       /* else immediate retry */
                        } else if (pass / MAXSCAN_DIVIDER < 10) {
                                /*
                                 * Do a short sleep for the first 10 passes,
                                 * allow the sleep to be woken up by resetting
                                 * vm_pages_needed to 1 (NOTE: we are still
                                 * actively paging!).
                                 */
                                if (isep == 0)
                                        vm_pages_needed = 1;
                                tsleep(&vm_pages_needed, 0, "pdelay", 2);
                        } else if (swap_pager_full == 0) {
                                /*
                                 * We've taken too many passes, force a
                                 * longer delay.
                                 */
                                tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
                        } else {
                                /*
                                 * Running out of memory, catastrophic
                                 * back-off to one-second intervals.
                                 */
                                tsleep(&vm_pages_needed, 0, "pdelay", hz);
                        }
                } else if (vm_pages_needed) {
                        /*
                         * We retired our calculated shortage but we may have
                         * to continue paging if threads drain memory too far
                         * below our target.
                         *
                         * Similar to vm_page_free_wakeup() in vm_page.c.
                         */
                        pass = 0;
                        if (!vm_paging_needed(0)) {
                                /* still more than half-way to our target */
                                vm_pages_needed = 0;
                                wakeup(&vmstats.v_free_count);
                        } else
                        if (!vm_page_count_min(vm_page_free_hysteresis)) {
                                /*
                                 * Continue operations with wakeup
                                 * (set variable to avoid overflow)
                                 */
                                vm_pages_needed = 2;
                                wakeup(&vmstats.v_free_count);
                        } else {
                                /*
                                 * No wakeup() needed, continue operations.
                                 * (set variable to avoid overflow)
                                 */
                                vm_pages_needed = 2;
                        }
                } else {
                        /*
                         * Turn paging back on immediately if we are under
                         * our minimum.
                         */
                        pass = 0;
                }
        }
}

static struct kproc_desc pg1_kp = {
        "pagedaemon",
        vm_pageout_thread,
        &pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

static struct kproc_desc pg2_kp = {
        "emergpager",
        vm_pageout_thread,
        &emergpager
};
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);

/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
        if (vm_paging_needed(0) && curthread != pagethread) {
                if (vm_pages_needed <= 1) {
                        vm_pages_needed = 1;            /* SMP race ok */
                        wakeup(&vm_pages_needed);       /* tickle pageout */
                } else if (vm_page_count_min(0)) {
                        ++vm_pages_needed;              /* SMP race ok */
                        /* a wakeup() would be wasted here */
                }
        }
}

#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
 */
static void
vm_req_vmdaemon(void)
{
        static int lastrun = 0;

        if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
                wakeup(&vm_daemon_needed);
                lastrun = ticks;
        }
}

static int vm_daemon_callback(struct proc *p, void *data __unused);

/*
 * No requirements.
 */
static void
vm_daemon(void)
{
        int req_swapout;

        while (TRUE) {
                tsleep(&vm_daemon_needed, 0, "psleep", 0);
                req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);

                /*
                 * forced swapouts
                 */
                if (req_swapout)
                        swapout_procs(vm_pageout_req_swapout);

                /*
                 * scan the processes for exceeding their rlimits or if
                 * process is swapped out -- deactivate pages
                 */
                allproc_scan(vm_daemon_callback, NULL, 0);
        }
}

static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
        struct vmspace *vm;
        vm_pindex_t limit, size;

        /*
         * If this is a system process or if we have already
         * looked at this process, skip it.
         */
        lwkt_gettoken(&p->p_token);

        if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
                lwkt_reltoken(&p->p_token);
                return (0);
        }

        /*
         * If the process is in a non-running type state,
         * don't touch it.
         */
        if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
                lwkt_reltoken(&p->p_token);
                return (0);
        }

        /*
         * Get a limit
         */
        limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
                                p->p_rlimit[RLIMIT_RSS].rlim_max));

        /*
         * Let processes that are swapped out really be
         * swapped out.  Set the limit to nothing to get as
         * many pages out to swap as possible.
         */
        if (p->p_flags & P_SWAPPEDOUT)
                limit = 0;

        vm = p->p_vmspace;
        vmspace_hold(vm);
        size = pmap_resident_tlnw_count(&vm->vm_pmap);
        if (limit >= 0 && size > 4096 &&
            size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
                vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
        }
        vmspace_drop(vm);

        lwkt_reltoken(&p->p_token);

        return (0);
}
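
/*
 * Illustrative note (editor's example, the figures are hypothetical): with
 * RLIMIT_RSS set to 256MB the limit above becomes 65,536 4KB pages.  A
 * process whose pmap_resident_tlnw_count() reports 80,000 pages satisfies
 * size > 4096 and size - 4096 >= limit, so with vm_pageout_memuse_mode of
 * at least 1 its map is asked to deactivate pages back down toward the
 * 65,536 page limit.
 */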

#endif