/*
 * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * The proverbial page-out daemon, rewritten many times over the decades.
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/conf.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout" */
static int vm_pageout_page(vm_page_t m, long *max_launderp,
			long *vnodes_skippedp, struct vnode **vpfailedp,
			int pass, int vmflush_flags, long *counts);
static int vm_pageout_clean_helper (vm_page_t, int);
static void vm_pageout_free_page_calc (vm_size_t count);
static void vm_pageout_page_free(vm_page_t m);
__read_frequently struct thread *emergpager;
__read_frequently struct thread *pagethread;
static int sequence_emerg_pager;

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon" */
static void vm_daemon (void);
static struct thread *vmthread;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmthread
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif

__read_mostly int vm_pages_needed = 0;	/* pageout daemon tsleep event */
__read_mostly int vm_pageout_deficit = 0; /* Estimated number of pages deficit */
__read_mostly int vm_pageout_pages_needed = 0; /* pageout daemon needs pages */
__read_mostly int vm_page_free_hysteresis = 16;
__read_mostly static int vm_pagedaemon_time;

#if !defined(NO_SWAPPING)
static int vm_daemon_needed;
#endif
__read_mostly static int vm_max_launder = 0;
__read_mostly static int vm_emerg_launder = 100;
__read_mostly static int vm_pageout_stats_max = 0, vm_pageout_stats_interval = 0;
__read_mostly static int vm_pageout_full_stats_interval = 0;
__read_mostly static int vm_pageout_stats_free_max = 0, vm_pageout_algorithm = 0;
__read_mostly static int defer_swap_pageouts = 0;
__read_mostly static int disable_swap_pageouts = 0;
__read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
__read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
__read_mostly static int vm_pageout_debug;

#if defined(NO_SWAPPING)
__read_mostly static int vm_swap_enabled = 0;
#else
__read_mostly static int vm_swap_enabled = 1;
#endif

/* 0-disable, 1-passive, 2-active swp, 3-active swp + single-queue dirty pages */
__read_mostly int vm_pageout_memuse_mode = 2;
__read_mostly int vm_pageout_allow_active = 1;

SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
	CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");

SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
	CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");

SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
	CTLFLAG_RW, &vm_page_free_hysteresis, 0,
	"Free more pages than the minimum required");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO,
	emerg_launder,
	CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
	CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active,
	CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active");
SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
	CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");


#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */

#if !defined(NO_SWAPPING)
static void vm_req_vmdaemon (void);
#endif
static void vm_pageout_page_stats(int q);

#define MAXSCAN_DIVIDER		10

/*
 * Calculate approximately how many pages on each queue to try to
 * clean.  An exact calculation creates an edge condition when the
 * queues are unbalanced so add significant slop.  The queue scans
 * will stop early when targets are reached and will start where they
 * left off on the next pass.
 *
 * We need to be generous here because there are all sorts of loading
 * conditions that can cause edge cases if we try to average over all queues.
 * In particular, storage subsystems have become so fast that paging
 * activity can become quite frantic.  Eventually we will probably need
 * two paging threads, one for dirty pages and one for clean, to deal
 * with the bandwidth requirements.
 *
 * So what we do is calculate a value that can be satisfied nominally by
 * only having to scan half the queues.
 */
static __inline long
PQAVERAGE(long n)
{
	long avg;

	if (n >= 0) {
		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
	} else {
		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
	}
	return avg;
}

/*
 * vm_pageout_clean_helper:
 *
 * Clean the page and remove it from the laundry.  The page must be busied
 * by the caller and will be disposed of (put away, flushed) by this routine.
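 *
 * The return value is the number of pages actually laundered by the call;
 * because the helper clusters nearby pages, it may flush more than one page.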
 */
static int
vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
{
	vm_object_t object;
	vm_page_t mc[BLIST_MAX_ALLOC];
	int error;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	object = m->object;

	/*
	 * Don't mess with the page if it's held or special.  Theoretically
	 * we can pageout held pages but there is no real need to press our
	 * luck, so don't.
	 */
	if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * Place page in cluster.  Align cluster for optimal swap space
	 * allocation (whether it is swap or not).  This is typically ~16-32
	 * pages, which also tends to align the cluster to multiples of the
	 * filesystem block size if backed by a filesystem.
	 */
	page_base = pindex % BLIST_MAX_ALLOC;
	mc[page_base] = m;
	ib = page_base - 1;
	is = page_base + 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 */
	vm_object_hold(object);

	while (ib >= 0) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if ((p->queue - p->pc) == PQ_CACHE ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[ib] = p;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pindex - page_base + is < object->size) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & OBJPC_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[is] = p;
		++is;
	}

	vm_object_drop(object);

	/*
	 * we allow reads during pageouts...
	 */
	return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 * The given pages are laundered.  Note that we setup for the start of
 * I/O (i.e. busy the page), mark it read-only, and bump the object
 * reference count all in here rather than in the parent.  If we want
 * the parent to do more sophisticated things we may have to change
 * the ordering.
 *
 * The pages in the array must be busied by the caller and will be
 * unbusied by this function.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
{
	vm_object_t object;
	int pageout_status[count];
	int numpagedout = 0;
	int i;

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
			("vm_pageout_flush page %p index %d/%d: partially "
			 "invalid page", mc[i], i, count));
		vm_page_io_start(mc[i]);
	}

	/*
	 * We must make the pages read-only.  This will also force the
	 * modified bit in the related pmaps to be cleared.  The pager
	 * cannot clear the bit for us since the I/O completion code
	 * typically runs from an interrupt.  The act of making the page
	 * read-only handles the case for us.
	 *
	 * Then we can unbusy the pages, we still hold a reference by virtue
	 * of our soft-busy.
	 */
	for (i = 0; i < count; i++) {
		if (vmflush_flags & OBJPC_TRY_TO_CACHE)
			vm_page_protect(mc[i], VM_PROT_NONE);
		else
			vm_page_protect(mc[i], VM_PROT_READ);
		vm_page_wakeup(mc[i]);
	}

	object = mc[0]->object;
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count,
			   (vmflush_flags |
			    ((object == &kernel_object) ?
				OBJPC_SYNC : 0)),
			   pageout_status);

	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			numpagedout++;
			break;
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object.  Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			vm_page_busy_wait(mt, FALSE, "pgbad");
			pmap_clear_modify(mt);
			vm_page_undirty(mt);
			vm_page_wakeup(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * A page typically cannot be paged out when we
			 * have run out of swap.  We leave the page
			 * marked inactive and will try to page it out
			 * again later.
			 *
			 * Starvation of the active page list is used to
			 * determine when the system is massively memory
			 * starved.
			 */
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If not PENDing this was a synchronous operation and we
		 * clean up after the I/O.  If it is PENDing the mess is
		 * cleaned up asynchronously.
		 *
		 * Also nominally act on the caller's wishes if the caller
		 * wants to try to really clean (cache or free) the page.
		 *
		 * Also nominally deactivate the page if the system is
		 * memory-stressed.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_page_busy_wait(mt, FALSE, "pgouw");
			vm_page_io_finish(mt);
			if (vmflush_flags & OBJPC_TRY_TO_CACHE) {
				vm_page_try_to_cache(mt);
			} else if (vm_page_count_severe()) {
				vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vm_page_wakeup(mt);
			}
			vm_object_pip_wakeup(object);
		}
	}
	return numpagedout;
}

#if !defined(NO_SWAPPING)

/*
 * Callback function, page busied for us.  We must dispose of the busy
 * condition.  Any related pmap pages may be held but will not be locked.
 */
static
int
vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
			vm_page_t p)
{
	int actcount;
	int cleanit = 0;

	/*
	 * Basic tests - There should never be a marker, and we can stop
	 *		 once the RSS is below the required level.
	 */
	KKASSERT((p->flags & PG_MARKER) == 0);
	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
		vm_page_wakeup(p);
		return(-1);
	}

	mycpu->gd_cnt.v_pdpages++;

	if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
		vm_page_wakeup(p);
		goto done;
	}

	++info->actioncount;

	/*
	 * Check if the page has been referenced recently.  If it has,
	 * activate it and skip.
	 */
	actcount = pmap_ts_referenced(p);
	if (actcount) {
		vm_page_flag_set(p, PG_REFERENCED);
	} else if (p->flags & PG_REFERENCED) {
		actcount = 1;
	}

	if (actcount) {
		if (p->queue - p->pc != PQ_ACTIVE) {
			vm_page_and_queue_spin_lock(p);
			if (p->queue - p->pc != PQ_ACTIVE) {
				vm_page_and_queue_spin_unlock(p);
				vm_page_activate(p);
			} else {
				vm_page_and_queue_spin_unlock(p);
			}
		} else {
			p->act_count += actcount;
			if (p->act_count > ACT_MAX)
				p->act_count = ACT_MAX;
		}
		vm_page_flag_clear(p, PG_REFERENCED);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Remove the page from this particular pmap.  Once we do this, our
	 * pmap scans will not see it again (unless it gets faulted in), so
	 * we must actively dispose of or deal with the page.
	 */
	pmap_remove_specific(info->pmap, p);

	/*
	 * If the page is not mapped to another process (i.e. as would be
	 * typical if this were a shared page from a library) then deactivate
	 * the page and clean it in two passes only.
	 *
	 * If the page hasn't been referenced since the last check, remove it
	 * from the pmap.  If it is no longer mapped, deactivate it
	 * immediately, accelerating the normal decline.
	 *
	 * Once the page has been removed from the pmap the RSS code no
	 * longer tracks it so we have to make sure that it is staged for
	 * potential flush action.
	 *
	 * XXX
	 */
	if ((p->flags & PG_MAPPED) == 0 ||
	    (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
		if (p->queue - p->pc == PQ_ACTIVE) {
			vm_page_deactivate(p);
		}
		if (p->queue - p->pc == PQ_INACTIVE) {
			cleanit = 1;
		}
	}

	/*
	 * Ok, try to fully clean the page and any nearby pages such that at
	 * least the requested page is freed or moved to the cache queue.
	 *
	 * We usually do this synchronously to allow us to get the page into
	 * the CACHE queue quickly, which will prevent memory exhaustion if
	 * a process with a memoryuse limit is running away.  However, the
	 * sysadmin may desire to set vm.swap_user_async which relaxes this
	 * and improves write performance.
	 */
	if (cleanit) {
		long max_launder = 0x7FFF;
		long vnodes_skipped = 0;
		long counts[4] = { 0, 0, 0, 0 };
		int vmflush_flags;
		struct vnode *vpfailed = NULL;

		info->offset = va;

		if (vm_pageout_memuse_mode >= 2) {
			vmflush_flags = OBJPC_TRY_TO_CACHE |
					OBJPC_ALLOW_ACTIVE;
			if (swap_user_async == 0)
				vmflush_flags |= OBJPC_SYNC;
			vm_page_flag_set(p, PG_WINATCFLS);
			info->cleancount +=
				vm_pageout_page(p, &max_launder,
						&vnodes_skipped,
						&vpfailed, 1, vmflush_flags,
						counts);
		} else {
			vm_page_wakeup(p);
			++info->cleancount;
		}
	} else {
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races.
	 */
done:
	lwkt_user_yield();
	return 0;
}

/*
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
 * which is relatively difficult to do.  We try to keep track of where we
 * left off last time to reduce scan overhead.
 *
 * Called when vm_pageout_memuse_mode is >= 1.
 */
void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
{
	vm_offset_t pgout_offset;
	struct pmap_pgscan_info info;
	int retries = 3;

	pgout_offset = map->pgout_offset;
again:
#if 0
	kprintf("%016jx ", pgout_offset);
#endif
	if (pgout_offset < VM_MIN_USER_ADDRESS)
		pgout_offset = VM_MIN_USER_ADDRESS;
	if (pgout_offset >= VM_MAX_USER_ADDRESS)
		pgout_offset = 0;
	info.pmap = vm_map_pmap(map);
	info.limit = limit;
	info.beg_addr = pgout_offset;
	info.end_addr = VM_MAX_USER_ADDRESS;
	info.callback = vm_pageout_mdp_callback;
	info.cleancount = 0;
	info.actioncount = 0;
	info.busycount = 0;

	pmap_pgscan(&info);
	pgout_offset = info.offset;
#if 0
	kprintf("%016jx %08lx %08lx\n", pgout_offset,
		info.cleancount, info.actioncount);
#endif

	if (pgout_offset != VM_MAX_USER_ADDRESS &&
	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		goto again;
	} else if (retries &&
		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		--retries;
		goto again;
	}
	map->pgout_offset = pgout_offset;
}
#endif

/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
	struct proc *bigproc;
	vm_offset_t bigsize;
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

/*
 * Scan inactive queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
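 *	    (the primary pageout daemon and the emergency pager).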
 */
static int
vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
			 long *vnodes_skipped, long *counts)
{
	vm_page_t m;
	struct vm_page marker;
	struct vnode *vpfailed;		/* warning, allowed to be stale */
	long maxscan;
	long delta = 0;
	long max_launder;
	int isep;
	int vmflush_flags;

	isep = (curthread == emergpager);
	if ((unsigned)pass > 1000)
		pass = 1000;

	/*
	 * This routine is called for each of PQ_L2_SIZE inactive queues.
	 * We want the vm_max_launder parameter to apply to the whole
	 * queue (i.e. per-whole-queue pass, not per-sub-queue).
	 *
	 * In each successive full-pass when the page target is not met we
	 * allow the per-queue max_launder to increase up to a maximum of
	 * vm_max_launder / 16.
	 */
	if (pass)
		max_launder = (long)vm_max_launder * (pass + 1) / PQ_L2_SIZE;
	else
		max_launder = (long)vm_max_launder / PQ_L2_SIZE;
	max_launder /= MAXSCAN_DIVIDER;

	if (max_launder <= 1)
		max_launder = 1;
	if (max_launder >= vm_max_launder / 16)
		max_launder = vm_max_launder / 16 + 1;

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	  PAGES.
	 */

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_INACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	/*
	 * Inactive queue scan.
	 *
	 * We pick off approximately 1/10 of each queue.  Each queue is
	 * effectively organized LRU so scanning the entire queue would
	 * improperly pick up pages that might still be in regular use.
	 *
	 * NOTE: The vm_page must be spinlocked before the queue to avoid
	 *	 deadlocks, so it is easiest to simply iterate the loop
	 *	 with the queue unlocked at the top.
	 */
	vpfailed = NULL;

	vm_page_queues_spin_lock(PQ_INACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && avail_shortage - delta > 0)
	{
		int count;

		KKASSERT(m->queue == PQ_INACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
				   &marker, pageq);
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page nor the queue will be spin-locked.
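		 *
		 * The marker re-inserted after (m) above preserves our scan
		 * position while the queue spinlock is released, so the loop
		 * can resume from it when the lock is re-acquired below.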
		 */
		KKASSERT(m->queue == PQ_INACTIVE + q);
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);

		/*
		 * The emergency pager runs when the primary pager gets
		 * stuck, which typically means the primary pager deadlocked
		 * on a vnode-backed page.  Therefore, the emergency pager
		 * must skip any complex objects.
		 *
		 * We disallow VNODEs unless they are VCHR whose device ops
		 * do not flag D_NOEMERGPGR.
		 */
		if (isep && m->object) {
			struct vnode *vp;

			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				/*
				 * Allow anonymous memory and assume that
				 * swap devices are not complex, since it's
				 * kinda worthless if we can't swap out dirty
				 * anonymous pages.
				 */
				break;
			case OBJT_VNODE:
				/*
				 * Allow VCHR device if the D_NOEMERGPGR
				 * flag is not set, deny other vnode types
				 * as being too complex.
				 */
				vp = m->object->handle;
				if (vp && vp->v_type == VCHR &&
				    vp->v_rdev && vp->v_rdev->si_ops &&
				    (vp->v_rdev->si_ops->head.flags &
				     D_NOEMERGPGR) == 0) {
					break;
				}
				/* Deny - fall through */
			default:
				/*
				 * Deny
				 */
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(PQ_INACTIVE + q);
				lwkt_yield();
				continue;
			}
		}

		/*
		 * Try to pageout the page and perhaps other nearby pages.
		 * We want to get the pages into the cache eventually (first
		 * or second pass).  Otherwise the pages can wind up
		 * just cycling in the inactive queue, getting flushed over
		 * and over again.
		 *
		 * Generally speaking we recycle dirty pages within PQ_INACTIVE
		 * twice (double LRU) before paging them out.  If the
		 * memuse_mode is >= 3 we run them single-LRU like we do clean
		 * pages.
		 */
		if (vm_pageout_memuse_mode >= 3)
			vm_page_flag_set(m, PG_WINATCFLS);

		vmflush_flags = 0;
		if (vm_pageout_allow_active)
			vmflush_flags |= OBJPC_ALLOW_ACTIVE;
		if (m->flags & PG_WINATCFLS)
			vmflush_flags |= OBJPC_TRY_TO_CACHE;
		count = vm_pageout_page(m, &max_launder, vnodes_skipped,
					&vpfailed, pass, vmflush_flags, counts);
		delta += count;

		/*
		 * Systems with a ton of memory can wind up with huge
		 * deactivation counts.  Because the inactive scan is
		 * doing a lot of flushing, the combination can result
		 * in excessive paging even in situations where other
		 * unrelated threads free up sufficient VM.
		 *
		 * To deal with this we abort the nominal active->inactive
		 * scan before we hit the inactive target when free+cache
		 * levels have reached a reasonable target.
		 *
		 * When deciding to stop early we need to add some slop to
		 * the test and we need to return full completion to the caller
		 * to prevent the caller from thinking there is something
		 * wrong and issuing a low-memory+swap warning or pkill.
		 *
		 * A deficit forces paging regardless of the state of the
		 * VM page queues (used for RSS enforcement).
		 */
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_INACTIVE + q);
		if (vm_paging_target() < -vm_max_launder) {
			/*
			 * Stopping early, return full completion to caller.
			 */
			if (delta < avail_shortage)
				delta = avail_shortage;
			break;
		}
	}

	/* page queue still spin-locked */
	TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_INACTIVE + q);

	return (delta);
}

/*
 * Pageout the specified page, return the total number of pages paged out
 * (this routine may cluster).
 *
 * The page must be busied and soft-busied by the caller and will be disposed
 * of by this function.
 */
static int
vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
		struct vnode **vpfailedp, int pass, int vmflush_flags,
		long *counts)
{
	vm_object_t object;
	int actcount;
	int count = 0;

	/*
	 * Wiring no longer removes a page from its queue.  The last unwiring
	 * will requeue the page.  Obviously wired pages cannot be paged out
	 * so unqueue it and return.
	 */
	if (m->wire_count) {
		vm_page_unqueue_nowakeup(m);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * A held page may be undergoing I/O, so skip it.
	 */
	if (m->hold_count) {
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
		return 0;
	}

	if (m->object == NULL || m->object->ref_count == 0) {
		/*
		 * If the object is not being used, we ignore previous
		 * references.
		 */
		vm_page_flag_clear(m, PG_REFERENCED);
		pmap_clear_reference(m);
		/* fall through to end */
	} else if (((m->flags & PG_REFERENCED) == 0) &&
		   (actcount = pmap_ts_referenced(m))) {
		/*
		 * Otherwise, if the page has been referenced while
		 * in the inactive queue, we bump the "activation
		 * count" upwards, making it less likely that the
		 * page will be added back to the inactive queue
		 * prematurely again.  Here we check the page tables
		 * (or emulated bits, if any), given the upper level
		 * VM system not knowing anything about existing
		 * references.
		 */
		++counts[3];
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * (m) is still busied.
	 *
	 * If the upper level VM system knows about any page
	 * references, we activate the page.  We also set the
	 * "activation count" higher than normal so that we will be
	 * less likely to place pages back onto the inactive queue again.
	 */
	if ((m->flags & PG_REFERENCED) != 0) {
		vm_page_flag_clear(m, PG_REFERENCED);
		actcount = pmap_ts_referenced(m);
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE + 1);
		vm_page_wakeup(m);
		++counts[3];
		return 0;
	}

	/*
	 * If the upper level VM system doesn't know anything about
	 * the page being dirty, we have to check for it again.  As
	 * far as the VM code knows, any partially dirty pages are
	 * fully dirty.
	 *
	 * Pages marked PG_WRITEABLE may be mapped into the user
	 * address space of a process running on another cpu.  A
	 * user process (without holding the MP lock) running on
	 * another cpu may be able to touch the page while we are
	 * trying to remove it.  vm_page_cache() will handle this
	 * case for us.
	 */
	if (m->dirty == 0) {
		vm_page_test_dirty(m);
	} else {
		vm_page_dirty(m);
	}

	if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Invalid pages can be easily freed
		 */
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
		++count;
		++counts[1];
	} else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Clean pages can be placed onto the cache queue.
		 * This effectively frees them.
		 */
		vm_page_cache(m);
		++count;
		++counts[1];
	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
		/*
		 * Dirty pages need to be paged out, but flushing
		 * a page is extremely expensive versus freeing
		 * a clean page.  Rather than artificially limiting
		 * the number of pages we can flush, we instead give
		 * dirty pages extra priority on the inactive queue
		 * by forcing them to be cycled through the queue
		 * twice before being flushed, after which the
		 * (now clean) page will cycle through once more
		 * before being freed.  This significantly extends
		 * the thrash point for a heavily loaded machine.
		 */
		++counts[2];
		vm_page_flag_set(m, PG_WINATCFLS);
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
	} else if (*max_launderp > 0) {
		/*
		 * We always want to try to flush some dirty pages if
		 * we encounter them, to keep the system stable.
		 * Normally this number is small, but under extreme
		 * pressure where there are insufficient clean pages
		 * on the inactive queue, we may have to go all out.
		 */
		int swap_pageouts_ok;
		struct vnode *vp = NULL;

		if ((m->flags & PG_WINATCFLS) == 0)
			vm_page_flag_set(m, PG_WINATCFLS);
		swap_pageouts_ok = 0;
		object = m->object;
		if (object &&
		    (object->type != OBJT_SWAP) &&
		    (object->type != OBJT_DEFAULT)) {
			swap_pageouts_ok = 1;
		} else {
			swap_pageouts_ok = !(defer_swap_pageouts ||
					     disable_swap_pageouts);
			swap_pageouts_ok |= (!disable_swap_pageouts &&
					     defer_swap_pageouts &&
					     vm_page_count_min(0));
		}

		/*
		 * We don't bother paging objects that are "dead".
		 * Those objects are in a "rundown" state.
		 */
		if (!swap_pageouts_ok ||
		    (object == NULL) ||
		    (object->flags & OBJ_DEAD)) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_INACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[m->queue].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[m->queue].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			return 0;
		}

		/*
		 * (m) is still busied.
		 *
		 * The object is already known NOT to be dead.  It
		 * is possible for the vget() to block the whole
		 * pageout daemon, but the new low-memory handling
		 * code should prevent it.
		 *
		 * The previous code skipped locked vnodes and, worse,
		 * reordered pages in the queue.  This results in
		 * completely non-deterministic operation because,
		 * quite often, a vm_fault has initiated an I/O and
		 * is holding a locked vnode at just the point where
		 * the pageout daemon is woken up.
		 *
		 * We can't wait forever for the vnode lock, we might
		 * deadlock due to a vn_read() getting stuck in
		 * vm_wait while holding this vnode.  We skip the
		 * vnode if we can't get it in a reasonable amount
		 * of time.
		 *
		 * vpfailed is used to (try to) avoid the case where
		 * a large number of pages are associated with a
		 * locked vnode, which could cause the pageout daemon
		 * to stall for an excessive amount of time.
		 */
		if (object->type == OBJT_VNODE) {
			int flags;

			vp = object->handle;
			flags = LK_EXCLUSIVE;
			if (vp == *vpfailedp)
				flags |= LK_NOWAIT;
			else
				flags |= LK_TIMELOCK;
			vm_page_hold(m);
			vm_page_wakeup(m);

			/*
			 * We have unbusied (m) temporarily so we can
			 * acquire the vp lock without deadlocking.
			 * (m) is held to prevent destruction.
			 */
			if (vget(vp, flags) != 0) {
				*vpfailedp = vp;
				++pageout_lock_miss;
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page might have been moved to another
			 * queue during potential blocking in vget()
			 * above.  The page might have been freed and
			 * reused for another vnode.  The object might
			 * have been reused for another vnode.
			 */
			if (m->queue - m->pc != PQ_INACTIVE ||
			    m->object != object ||
			    object->handle != vp) {
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page may have been busied during the
			 * blocking in vput();  We don't move the
			 * page back onto the end of the queue so that
			 * statistics are more correct if we don't.
			 */
			if (vm_page_busy_try(m, TRUE)) {
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}
			vm_page_unhold(m);

			/*
			 * If it was wired while we didn't own it.
			 */
			if (m->wire_count) {
				vm_page_unqueue_nowakeup(m);
				vput(vp);
				vm_page_wakeup(m);
				return 0;
			}

			/*
			 * (m) is busied again
			 *
			 * We own the busy bit and remove our hold
			 * bit.  If the page is still held it
			 * might be undergoing I/O, so skip it.
			 */
			if (m->hold_count) {
rebusy_failed:
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_INACTIVE) {
					TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
					TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
				}
				vm_page_and_queue_spin_unlock(m);
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_wakeup(m);
				vput(vp);
				return 0;
			}

			/*
			 * Recheck queue, object, and vp now that we have
			 * rebusied the page.
			 */
			if (m->queue - m->pc != PQ_INACTIVE ||
			    m->object != object ||
			    object->handle != vp) {
				kprintf("vm_pageout_page: "
					"rebusy %p failed(A)\n",
					m);
				goto rebusy_failed;
			}

			/*
			 * Check page validity
			 */
			if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
				kprintf("vm_pageout_page: "
					"rebusy %p failed(B)\n",
					m);
				goto rebusy_failed;
			}
			if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
				kprintf("vm_pageout_page: "
					"rebusy %p failed(C)\n",
					m);
				goto rebusy_failed;
			}

			/* (m) is left busied as we fall through */
		}

		/*
		 * page is busy and not held here.
		 *
		 * If a page is dirty, then it is either being washed
		 * (but not yet cleaned) or it is still in the
		 * laundry.  If it is still in the laundry, then we
		 * start the cleaning operation.
		 *
		 * decrement inactive_shortage on success to account
		 * for the (future) cleaned page.  Otherwise we
		 * could wind up laundering or cleaning too many
		 * pages.
		 *
		 * NOTE: Cleaning the page here does not cause
		 *	 force_deficit to be adjusted, because the
		 *	 page is not being freed or moved to the
		 *	 cache.
		 */
		count = vm_pageout_clean_helper(m, vmflush_flags);
		counts[0] += count;
		*max_launderp -= count;

		/*
		 * Clean ate busy, page no longer accessible
		 */
		if (vp != NULL)
			vput(vp);
	} else {
		vm_page_wakeup(m);
	}
	return count;
}

/*
 * Scan active queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_active(int pass, int q,
		       long avail_shortage, long inactive_shortage,
		       long *recycle_countp)
{
	struct vm_page marker;
	vm_page_t m;
	int actcount;
	long delta = 0;
	long maxscan;
	int isep;

	isep = (curthread == emergpager);

	/*
	 * We want to move pages from the active queue to the inactive
	 * queue to get the inactive queue to the inactive target.  If
	 * we still have a page shortage from above we try to directly free
	 * clean pages instead of moving them.
	 *
	 * If we do still have a shortage we keep track of the number of
	 * pages we free or cache (recycle_count) as a measure of thrashing
	 * between the active and inactive queues.
	 *
	 * If we were able to completely satisfy the free+cache targets
	 * from the inactive pool we limit the number of pages we move
	 * from the active pool to the inactive pool to 2x the pages we
	 * had removed from the inactive pool (with a minimum of 1/5 the
	 * inactive target).  If we were not able to completely satisfy
	 * the free+cache targets we go for the whole target aggressively.
	 *
	 * NOTE: Both variables can end up negative.
	 * NOTE: We are still in a critical section.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	  PAGES.
	 */

	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_ACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	vm_page_queues_spin_lock(PQ_ACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt / MAXSCAN_DIVIDER + 1;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && (avail_shortage - delta > 0 ||
	       inactive_shortage > 0))
	{
		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
				   &marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page nor the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_ACTIVE + q);
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);

#if 0
		/*
		 * Don't deactivate pages that are held, even if we can
		 * busy them.  (XXX why not?)
		 */
		if (m->hold_count) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}
#endif
		/*
		 * We can just remove wired pages from the queue
		 */
		if (m->wire_count) {
			vm_page_unqueue_nowakeup(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * The emergency pager ignores vnode-backed pages as these
		 * are the pages that probably bricked the main pager.
		 */
		if (isep && m->object && m->object->type == OBJT_VNODE) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Check to see "how much" the page has been used and clear
		 * the tracking access bits.  If the object has no references
		 * don't bother paying the expense.
		 */
		actcount = 0;
		if (m->object && m->object->ref_count != 0) {
			if (m->flags & PG_REFERENCED)
				++actcount;
			actcount += pmap_ts_referenced(m);
			if (actcount) {
				m->act_count += ACT_ADVANCE + actcount;
				if (m->act_count > ACT_MAX)
					m->act_count = ACT_MAX;
			}
		}
		vm_page_flag_clear(m, PG_REFERENCED);

		/*
		 * actcount is only valid if the object ref_count is non-zero.
		 * If the page does not have an object, actcount will be zero.
		 */
		if (actcount && m->object->ref_count != 0) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
		} else {
			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				m->act_count -= min(m->act_count,
						    vm_anonmem_decline);
				break;
			default:
				m->act_count -= min(m->act_count,
						    vm_filemem_decline);
				break;
			}
			if (vm_pageout_algorithm ||
			    (m->object == NULL) ||
			    (m->object && (m->object->ref_count == 0)) ||
			    m->act_count < pass + 1
			) {
				/*
				 * Deactivate the page.  If we had a
				 * shortage from our inactive scan try to
				 * free (cache) the page instead.
				 *
				 * Don't just blindly cache the page if
				 * we do not have a shortage from the
				 * inactive scan, that could lead to
				 * gigabytes being moved.
				 */
				--inactive_shortage;
				if (avail_shortage - delta > 0 ||
				    (m->object && (m->object->ref_count == 0)))
				{
					if (avail_shortage - delta > 0)
						++*recycle_countp;
					vm_page_protect(m, VM_PROT_NONE);
					if (m->dirty == 0 &&
					    (m->flags & PG_NEED_COMMIT) == 0 &&
					    avail_shortage - delta > 0) {
						vm_page_cache(m);
					} else {
						vm_page_deactivate(m);
						vm_page_wakeup(m);
					}
				} else {
					vm_page_deactivate(m);
					vm_page_wakeup(m);
				}
				++delta;
			} else {
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_ACTIVE) {
					TAILQ_REMOVE(
						&vm_page_queues[PQ_ACTIVE + q].pl,
						m, pageq);
					TAILQ_INSERT_TAIL(
						&vm_page_queues[PQ_ACTIVE + q].pl,
						m, pageq);
				}
				vm_page_and_queue_spin_unlock(m);
				vm_page_wakeup(m);
			}
		}
next:
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Clean out our local marker.
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);

	return (delta);
}

/*
 * The number of actually free pages can drop down to v_free_reserved,
 * we try to build the free count back above v_free_min.  Note that
 * vm_paging_needed() also returns TRUE if v_free_count is not at
 * least v_free_min so that is the minimum we must build the free
 * count to.
 *
 * We use a slightly higher target to improve hysteresis,
 * ((v_free_target + v_free_min) / 2).  Since v_free_target
 * is usually the same as v_cache_min this maintains about
 * half the pages in the free queue as are in the cache queue,
 * providing pretty good pipelining for pageout operation.
 *
 * The system operator can manipulate vm.v_cache_min and
 * vm.v_free_target to tune the pageout daemon.  Be sure
 * to keep vm.v_free_min < vm.v_free_target.
 *
 * Note that the original paging target is to get at least
 * (free_min + cache_min) into (free + cache).  The slightly
 * higher target will shift additional pages from cache to free
 * without affecting the original paging target in order to
 * maintain better hysteresis and not have the free count always
 * be dead-on v_free_min.
 *
 * NOTE: we are still in a critical section.
 *
 * Pages moved from PQ_CACHE to totally free are not counted in the
 * pages_freed counter.
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static void
vm_pageout_scan_cache(long avail_shortage, int pass,
		      long vnodes_skipped, long recycle_count)
{
	static int lastkillticks;
	struct vm_pageout_scan_info info;
	vm_page_t m;
	int isep;

	isep = (curthread == emergpager);

	while (vmstats.v_free_count <
	       (vmstats.v_free_min + vmstats.v_free_target) / 2) {
		/*
		 * This steals some code from vm/vm_page.c
		 *
		 * Create two rovers and adjust the code to reduce
		 * chances of them winding up at the same index (which
		 * can cause a lot of contention).
		 */
		static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };

		if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
			goto next_rover;

		m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
		if (m == NULL)
			break;
		/*
		 * page is returned removed from its queue and spinlocked
		 *
		 * If the busy attempt fails we can still deactivate the page.
		 */
		if (vm_page_busy_try(m, TRUE)) {
			vm_page_deactivate_locked(m);
			vm_page_spin_unlock(m);
			continue;
		}
		vm_page_spin_unlock(m);
		pagedaemon_wakeup();
		lwkt_yield();

		/*
		 * Remaining operations run with the page busy and neither
		 * the page nor the queue will be spin-locked.
		 */
		if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) ||
		    m->hold_count ||
		    m->wire_count) {
			vm_page_deactivate(m);
			vm_page_wakeup(m);
			continue;
		}

		/*
		 * Because the page is in the cache, it shouldn't be mapped.
		 */
		pmap_mapped_sync(m);
		KKASSERT((m->flags & PG_MAPPED) == 0);
		KKASSERT(m->dirty == 0);
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
next_rover:
		if (isep)
			cache_rover[1] -= PQ_PRIME2;
		else
			cache_rover[0] += PQ_PRIME2;
	}

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target() > 0) {
		if (vnodes_skipped && vm_page_count_min(0))
			speedup_syncer(NULL);
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_page_count_target())
			vm_req_vmdaemon();
#endif
	}

	/*
	 * Handle catastrophic conditions.  Under good conditions we should
	 * be at the target, well beyond our minimum.  If we could not even
	 * reach our minimum the system is under heavy stress.  But just being
	 * under heavy stress does not trigger process killing.
	 *
	 * We consider ourselves to have run out of memory if the swap pager
	 * is full and avail_shortage is still positive.  The secondary check
	 * ensures that we do not kill processes if the instantaneous
	 * availability is good, even if the pageout daemon pass says it
	 * couldn't get to the target.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
	 *	  SITUATIONS.
	 */
	if (swap_pager_almost_full &&
	    pass > 0 &&
	    isep == 0 &&
	    (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
		kprintf("Warning: system low on memory+swap "
			"shortage %ld for %d ticks!\n",
			avail_shortage, ticks - swap_fail_ticks);
		if (bootverbose)
			kprintf("Metrics: spaf=%d spf=%d pass=%d "
				"avail=%ld target=%ld last=%u\n",
				swap_pager_almost_full,
				swap_pager_full,
				pass,
				avail_shortage,
				vm_paging_target(),
				(unsigned int)(ticks - lastkillticks));
	}
	if (swap_pager_full &&
	    pass > 1 &&
	    isep == 0 &&
	    avail_shortage > 0 &&
	    vm_paging_target() > 0 &&
	    (unsigned int)(ticks - lastkillticks) >= hz) {
		/*
		 * Kill something, maximum rate once per second to give
		 * the process time to free up sufficient memory.
		 */
		lastkillticks = ticks;
		info.bigproc = NULL;
		info.bigsize = 0;
		allproc_scan(vm_pageout_scan_callback, &info, 0);
		if (info.bigproc != NULL) {
			kprintf("Try to kill process %d %s\n",
				info.bigproc->p_pid, info.bigproc->p_comm);
			info.bigproc->p_nice = PRIO_MIN;
			info.bigproc->p_usched->resetpriority(
				FIRST_LWP_IN_PROC(info.bigproc));
			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
			killproc(info.bigproc, "out of swap space");
			wakeup(&vmstats.v_free_count);
			PRELE(info.bigproc);
		}
	}
}

static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
	struct vm_pageout_scan_info *info = data;
	vm_offset_t size;

	/*
	 * Never kill system processes or init.  If we have configured swap
	 * then try to avoid killing low-numbered pids.
	 */
	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
		return (0);
	}

	lwkt_gettoken(&p->p_token);

	/*
	 * if the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get the approximate process size.  Note that anonymous pages
	 * with backing swap will be counted twice, but there should not
	 * be too many such pages due to the stress the VM system is
	 * under at this point.
	 */
	size = vmspace_anonymous_count(p->p_vmspace) +
	       vmspace_swap_count(p->p_vmspace);

	/*
	 * If this process is bigger than the biggest one,
	 * remember it.
	 */
	if (info->bigsize < size) {
		if (info->bigproc)
			PRELE(info->bigproc);
		PHOLD(p);
		info->bigproc = p;
		info->bigsize = size;
	}
	lwkt_reltoken(&p->p_token);
	lwkt_yield();

	return(0);
}

/*
 * This old guy slowly walks PQ_HOLD looking for pages which need to be
 * moved back to PQ_FREE.  It is possible for pages to accumulate here
 * when vm_page_free() races against vm_page_unhold(), resulting in a
 * page being left on a PQ_HOLD queue with hold_count == 0.
 *
 * It is easier to handle this edge condition here, in non-critical code,
 * rather than enforce a spin-lock for every 1->0 transition in
 * vm_page_unhold().
 *
 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
 */
static void
vm_pageout_scan_hold(int q)
{
	vm_page_t m;

	vm_page_queues_spin_lock(PQ_HOLD + q);
	TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Process one page and return
		 */
		if (m->hold_count)
			break;
		kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
		vm_page_hold(m);
		vm_page_queues_spin_unlock(PQ_HOLD + q);
		vm_page_unhold(m);	/* reprocess */
		return;
	}
	vm_page_queues_spin_unlock(PQ_HOLD + q);
}

/*
 * This routine tries to maintain the pseudo LRU active queue,
 * so that during long periods of time where there is no paging,
 * some statistic accumulation still occurs.  This code
 * helps the situation where paging just starts to occur.
/*
 * This routine helps maintain the pseudo-LRU active queue so that some
 * statistics continue to accumulate during long periods in which there
 * is no other paging activity.  It improves behavior when paging first
 * starts to occur.
 */
static void
vm_pageout_page_stats(int q)
{
        static int fullintervalcount = 0;
        struct vm_page marker;
        vm_page_t m;
        long pcount, tpcount;           /* Number of pages to check */
        long page_shortage;

        page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
                         vmstats.v_free_min) -
                        (vmstats.v_free_count + vmstats.v_inactive_count +
                         vmstats.v_cache_count);

        if (page_shortage <= 0)
                return;

        pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
        fullintervalcount += vm_pageout_stats_interval;
        if (fullintervalcount < vm_pageout_full_stats_interval) {
                tpcount = (vm_pageout_stats_max * pcount) /
                          vmstats.v_page_count + 1;
                if (pcount > tpcount)
                        pcount = tpcount;
        } else {
                fullintervalcount = 0;
        }

        bzero(&marker, sizeof(marker));
        marker.flags = PG_FICTITIOUS | PG_MARKER;
        marker.busy_count = PBUSY_LOCKED;
        marker.queue = PQ_ACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;

        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);

        /*
         * Queue locked at top of loop to avoid stack marker issues.
         */
        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               pcount-- > 0)
        {
                int actcount;

                KKASSERT(m->queue == PQ_ACTIVE + q);
                TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
                                   &marker, pageq);

                /*
                 * Skip marker pages (atomic against other markers to avoid
                 * infinite hop-over scans).
                 */
                if (m->flags & PG_MARKER)
                        continue;

                /*
                 * Ignore pages we can't busy.
                 */
                if (vm_page_busy_try(m, TRUE))
                        continue;

                /*
                 * Remaining operations run with the page busy and neither
                 * the page nor the queue will be spin-locked.
                 */
                KKASSERT(m->queue == PQ_ACTIVE + q);
                vm_page_queues_spin_unlock(PQ_ACTIVE + q);

                /*
                 * We can just remove wired pages from the queue.
                 */
                if (m->wire_count) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_wakeup(m);
                        goto next;
                }

                /*
                 * We now have a safely busied page, the page and queue
                 * spinlocks have been released.
                 *
                 * Ignore held and wired pages.
                 */
                if (m->hold_count || m->wire_count) {
                        vm_page_wakeup(m);
                        goto next;
                }

                /*
                 * Calculate activity.
                 */
                actcount = 0;
                if (m->flags & PG_REFERENCED) {
                        vm_page_flag_clear(m, PG_REFERENCED);
                        actcount += 1;
                }
                actcount += pmap_ts_referenced(m);

                /*
                 * Update act_count and move the page to the end of the queue.
                 */
                if (actcount) {
                        m->act_count += ACT_ADVANCE + actcount;
                        if (m->act_count > ACT_MAX)
                                m->act_count = ACT_MAX;
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                        goto next;
                }

                if (m->act_count == 0) {
                        /*
                         * We turn off page access so that we have more
                         * accurate RSS stats.  We don't do this in the
                         * normal page deactivation when the system is
                         * loaded VM-wise, because the cost of the large
                         * number of page protect operations would be
                         * higher than the value of doing the operation.
                         *
                         * We use the marker to save our place so we can
                         * release the spin lock.  Both (m) and (next)
                         * will be invalid.
                         */
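                        /*
                         * Illustrative dynamics (ACT_ADVANCE, ACT_DECLINE and
                         * ACT_MAX are defined elsewhere): a page found
                         * referenced in two of its pmaps above gains
                         * ACT_ADVANCE + 2 (clamped to ACT_MAX) and is
                         * requeued at the tail, while an unreferenced page
                         * decays by ACT_DECLINE per visit; only a page whose
                         * act_count has already reached zero takes this
                         * branch and is unmapped and deactivated.
                         */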
                        vm_page_protect(m, VM_PROT_NONE);
                        vm_page_deactivate(m);
                } else {
                        m->act_count -= min(m->act_count, ACT_DECLINE);
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                }
                vm_page_wakeup(m);
next:
                vm_page_queues_spin_lock(PQ_ACTIVE + q);
        }

        /*
         * Remove our local marker.
         *
         * Page queue still spin-locked.
         */
        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

static void
vm_pageout_free_page_calc(vm_size_t count)
{
        /*
         * v_free_min             normal allocations
         * v_free_reserved        system allocations
         * v_pageout_free_min     allocations by pageout daemon
         * v_interrupt_free_min   low level allocations (e.g. swap structures)
         *
         * v_free_min is used to generate several other baselines, and they
         * can get pretty silly on systems with a lot of memory.
         */
        vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
        vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
        vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
        vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
        vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
}


/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFS's.
 */
static void
vm_pageout_thread(void)
{
        int pass;
        int q;
        int q1iterator = 0;
        int q2iterator = 0;
        int q3iterator = 0;
        int isep;

        curthread->td_flags |= TDF_SYSTHREAD;

        /*
         * We only need to set up once.
         */
        isep = 0;
        if (curthread == emergpager) {
                isep = 1;
                goto skip_setup;
        }

        /*
         * Initialize vm_max_launder per pageout pass to be 1/16
         * of total physical memory, plus a little slop.
         */
        if (vm_max_launder == 0)
                vm_max_launder = physmem / 256 + 16;

        /*
         * Initialize some paging parameters.
         */
        vm_pageout_free_page_calc(vmstats.v_page_count);

        /*
         * v_free_target and v_cache_min control pageout hysteresis.  Note
         * that these are more a measure of the VM cache queue hysteresis
         * than of the VM free queue.  Specifically, v_free_target is the
         * high water mark (free+cache pages).
         *
         * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
         * low water mark, while v_free_min is the stop.  v_cache_min must
         * be big enough to handle memory needs while the pageout daemon
         * is signalled and runs to free more pages.
         */
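        /*
         * Worked example (illustrative only; the numbers vary with RAM size
         * and are rounded by integer division): with 1GB of RAM and 4KB
         * pages, v_page_count is roughly 262144, so
         * vm_pageout_free_page_calc() above yields approximately:
         *
         *      v_free_min           = 64 + 262144 / 200 = 1374
         *      v_free_reserved      = 1374 * 4 / 8 + 7   = 694
         *      v_free_severe        = 1374 * 4 / 8       = 687
         *      v_pageout_free_min   = 1374 * 2 / 8 + 7   = 350
         *      v_interrupt_free_min = 1374 * 1 / 8 + 7   = 178
         *
         * and the assignment below then sets
         *
         *      v_free_target = 4 * 1374 + 694 = 6190 pages (~24MB).
         */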
        vmstats.v_free_target = 4 * vmstats.v_free_min +
                                vmstats.v_free_reserved;

        /*
         * NOTE: With the new buffer cache b_act_count we want the default
         *       inactive target to be a percentage of available memory.
         *
         *       The inactive target essentially determines the minimum
         *       number of 'temporary' pages capable of caching one-time-use
         *       files when the VM system is otherwise full of pages
         *       belonging to multi-time-use files or active program data.
         *
         * NOTE: The inactive target is aggressively pursued only if the
         *       inactive queue becomes too small.  If the inactive queue
         *       is large enough to satisfy page movement to free+cache
         *       then it is repopulated more slowly from the active queue.
         *       This allows a general inactive_target default to be set.
         *
         *       There is an issue here for processes which sit mostly idle
         *       'overnight', such as sshd, tcsh, and X.  Any movement from
         *       the active queue will eventually cause such pages to be
         *       recycled, causing a lot of paging in the morning.  To reduce
         *       the incidence of this, pages cycled out of the buffer cache
         *       are moved directly to the inactive queue if they were only
         *       used once or twice.
         *
         *       The vfs.vm_cycle_point sysctl can be used to adjust this.
         *       Increasing the value (up to 64) increases the number of
         *       buffer recyclements which go directly to the inactive queue.
         */
        if (vmstats.v_free_count > 2048) {
                vmstats.v_cache_min = vmstats.v_free_target;
                vmstats.v_cache_max = 2 * vmstats.v_cache_min;
        } else {
                vmstats.v_cache_min = 0;
                vmstats.v_cache_max = 0;
        }
        vmstats.v_inactive_target = vmstats.v_free_count / 4;

        /* XXX does not really belong here */
        if (vm_page_max_wired == 0)
                vm_page_max_wired = vmstats.v_free_count / 3;

        if (vm_pageout_stats_max == 0)
                vm_pageout_stats_max = vmstats.v_free_target;

        /*
         * Set interval in seconds for stats scan.
         */
        if (vm_pageout_stats_interval == 0)
                vm_pageout_stats_interval = 5;
        if (vm_pageout_full_stats_interval == 0)
                vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;


        /*
         * Set maximum free per pass.
         */
        if (vm_pageout_stats_free_max == 0)
                vm_pageout_stats_free_max = 5;

        swap_pager_swap_init();
        pass = 0;

        atomic_swap_int(&sequence_emerg_pager, 1);
        wakeup(&sequence_emerg_pager);

skip_setup:
        /*
         * Sequence emergency pager startup.
         */
        if (isep) {
                while (sequence_emerg_pager == 0)
                        tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
        }

        /*
         * The pageout daemon is never done, so loop forever.
         *
         * WARNING! This code is being executed by two kernel threads
         *          potentially simultaneously.
         */
        while (TRUE) {
                int error;
                long avail_shortage;
                long inactive_shortage;
                long vnodes_skipped = 0;
                long recycle_count = 0;
                long tmp;

                /*
                 * Wait for an action request.  If we timeout check to
                 * see if paging is needed (in case the normal wakeup
                 * code raced us).
                 */
                if (isep) {
                        /*
                         * The emergency pagedaemon monitors the primary
                         * pagedaemon while vm_pages_needed != 0.
                         *
                         * The emergency pagedaemon only runs if VM paging
                         * is needed and the primary pagedaemon has not
                         * updated vm_pagedaemon_time for more than 2
                         * seconds.
                         */
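                        /*
                         * Illustrative note (hz is configuration-dependent;
                         * 100 is used here only as an example): the test
                         * below, (ticks - vm_pagedaemon_time) < hz * 2,
                         * treats the primary pagedaemon as healthy if it has
                         * checkpointed vm_pagedaemon_time within the last 2
                         * seconds.  With hz = 100, a reading at tick 10350
                         * against a checkpoint at tick 10200 gives a delta
                         * of 150 < 200, so the emergency pager stands down
                         * for another sleep cycle.
                         */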
                        if (vm_pages_needed)
                                tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
                        else
                                tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
                        if (vm_pages_needed == 0) {
                                pass = 0;
                                continue;
                        }
                        if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
                                pass = 0;
                                continue;
                        }
                } else {
                        /*
                         * Primary pagedaemon
                         *
                         * NOTE: We unconditionally clean up PQ_HOLD even
                         *       when there is no work to do.
                         */
                        vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
                        ++q3iterator;

                        if (vm_pages_needed == 0) {
                                error = tsleep(&vm_pages_needed,
                                               0, "psleep",
                                               vm_pageout_stats_interval * hz);
                                if (error &&
                                    vm_paging_needed(0) == 0 &&
                                    vm_pages_needed == 0) {
                                        for (q = 0; q < PQ_L2_SIZE; ++q)
                                                vm_pageout_page_stats(q);
                                        continue;
                                }
                                vm_pagedaemon_time = ticks;
                                vm_pages_needed = 1;

                                /*
                                 * Wake the emergency pagedaemon up so it
                                 * can monitor us.  It will automatically
                                 * go back into a long sleep when
                                 * vm_pages_needed returns to 0.
                                 */
                                wakeup(&vm_pagedaemon_time);
                        }
                }

                mycpu->gd_cnt.v_pdwakeups++;

                /*
                 * Scan for INACTIVE->CLEAN/PAGEOUT
                 *
                 * This routine tries to avoid thrashing the system with
                 * unnecessary activity.
                 *
                 * Calculate our target for the number of free+cache pages we
                 * want to get to.  This is higher than the number that causes
                 * allocations to stall (severe) in order to provide
                 * hysteresis, and if we don't make it all the way but get to
                 * the minimum we're happy.  Goose it a bit if there are
                 * multiple requests for memory.
                 *
                 * Don't reduce avail_shortage inside the loop or the
                 * PQAVERAGE() calculation will break.
                 *
                 * NOTE! deficit is differentiated from avail_shortage as
                 *       REQUIRING at least (deficit) pages to be cleaned,
                 *       even if the page queues are in good shape.  This
                 *       is used primarily for handling per-process
                 *       RLIMIT_RSS and may also see small values when
                 *       processes block due to low memory.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                avail_shortage = vm_paging_target() + vm_pageout_deficit;
                vm_pageout_deficit = 0;

                if (avail_shortage > 0) {
                        long delta = 0;
                        long counts[4] = { 0, 0, 0, 0 };
                        int qq;

                        if (vm_pageout_debug) {
                                kprintf("scan_inactive pass %d isep=%d\t",
                                        pass / MAXSCAN_DIVIDER, isep);
                        }

                        qq = q1iterator;
                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                delta += vm_pageout_scan_inactive(
                                                pass / MAXSCAN_DIVIDER,
                                                qq & PQ_L2_MASK,
                                                PQAVERAGE(avail_shortage),
                                                &vnodes_skipped, counts);
                                if (isep)
                                        --qq;
                                else
                                        ++qq;
                                if (avail_shortage - delta <= 0)
                                        break;

                                /*
                                 * It is possible for avail_shortage to be
                                 * very large.  If a large program exits or
                                 * frees a ton of memory all at once, we do
                                 * not have to continue deactivations.
                                 *
                                 * (We will still run the active->inactive
                                 * target, however).
                                 */
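                                /*
                                 * Illustrative example: if a process with a
                                 * multi-gigabyte anonymous footprint exits in
                                 * the middle of this pass, the freed pages
                                 * can put us past both the paging target and
                                 * the free hysteresis point, and the check
                                 * below ends the inactive scan early rather
                                 * than deactivating pages for no reason.
                                 */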
                                if (!vm_page_count_target() &&
                                    !vm_page_count_min(
                                                vm_page_free_hysteresis)) {
                                        avail_shortage = 0;
                                        break;
                                }
                        }
                        if (vm_pageout_debug) {
                                kprintf("flushed %ld cleaned %ld "
                                        "lru2 %ld react %ld "
                                        "delta %ld\n",
                                        counts[0], counts[1],
                                        counts[2], counts[3],
                                        delta);
                        }
                        avail_shortage -= delta;
                        q1iterator = qq;
                }

                /*
                 * Figure out how many active pages we must deactivate.  If
                 * we were able to reach our target with just the inactive
                 * scan above we limit the number of active pages we
                 * deactivate to reduce unnecessary work.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                inactive_shortage = vmstats.v_inactive_target -
                                    vmstats.v_inactive_count;

                /*
                 * If we were unable to free sufficient inactive pages to
                 * satisfy the free/cache queue requirements then simply
                 * reaching the inactive target may not be good enough.
                 * Try to deactivate pages in excess of the target based
                 * on the shortfall.
                 *
                 * However, to prevent thrashing the VM system, do not
                 * deactivate more than an additional 1/10 of the inactive
                 * target's worth of active pages.
                 */
                if (avail_shortage > 0) {
                        tmp = avail_shortage * 2;
                        if (tmp > vmstats.v_inactive_target / 10)
                                tmp = vmstats.v_inactive_target / 10;
                        inactive_shortage += tmp;
                }

                /*
                 * Only trigger a pmap cleanup on inactive shortage.
                 */
                if (isep == 0 && inactive_shortage > 0) {
                        pmap_collect();
                }

                /*
                 * Scan for ACTIVE->INACTIVE
                 *
                 * Only trigger on inactive shortage.  Triggering on
                 * avail_shortage can starve the active queue with
                 * unnecessary active->inactive transitions and destroy
                 * performance.
                 *
                 * If this is the emergency pager, always try to move
                 * a few pages from active to inactive because the inactive
                 * queue might have enough pages, but not enough anonymous
                 * pages.
                 */
                if (isep && inactive_shortage < vm_emerg_launder)
                        inactive_shortage = vm_emerg_launder;

                if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
                        long delta = 0;
                        int qq;

                        qq = q2iterator;
                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                delta += vm_pageout_scan_active(
                                                pass / MAXSCAN_DIVIDER,
                                                qq & PQ_L2_MASK,
                                                PQAVERAGE(avail_shortage),
                                                PQAVERAGE(inactive_shortage),
                                                &recycle_count);
                                if (isep)
                                        --qq;
                                else
                                        ++qq;
                                if (inactive_shortage - delta <= 0 &&
                                    avail_shortage - delta <= 0) {
                                        break;
                                }

                                /*
                                 * inactive_shortage can be a very large
                                 * number.  This is intended to break out
                                 * early if our inactive_target has been
                                 * reached due to other system activity.
                                 */
                                if (vmstats.v_inactive_count >
                                    vmstats.v_inactive_target) {
                                        inactive_shortage = 0;
                                        break;
                                }
                        }
                        inactive_shortage -= delta;
                        avail_shortage -= delta;
                        q2iterator = qq;
                }

                /*
                 * Scan for CACHE->FREE
                 *
                 * Finally free enough cache pages to meet our free page
                 * requirement and take more drastic measures if we are
                 * still in trouble.
                 */
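                /*
                 * ("More drastic measures" here means the low-memory
                 * warnings and, ultimately, the out-of-swap process kill
                 * implemented at the end of vm_pageout_scan_cache() above.)
                 */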
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                vm_pageout_scan_cache(avail_shortage, pass / MAXSCAN_DIVIDER,
                                      vnodes_skipped, recycle_count);

                /*
                 * This is a bit sophisticated because we do not necessarily
                 * want to force paging until our targets are reached if we
                 * were able to successfully retire the shortage we calculated.
                 */
                if (avail_shortage > 0) {
                        /*
                         * If we did not retire enough pages, continue the
                         * pageout operation until we are able to.  It
                         * takes MAXSCAN_DIVIDER passes to cover the entire
                         * inactive list.
                         */
                        ++pass;

                        if (pass / MAXSCAN_DIVIDER < 10 &&
                            vm_pages_needed > 1) {
                                /*
                                 * Normal operation, additional processes
                                 * have already kicked us.  Retry immediately
                                 * unless swap space is completely full, in
                                 * which case delay a bit.
                                 */
                                if (swap_pager_full) {
                                        tsleep(&vm_pages_needed, 0, "pdelay",
                                               hz / 5);
                                } /* else immediate retry */
                        } else if (pass / MAXSCAN_DIVIDER < 10) {
                                /*
                                 * Do a short sleep for the first 10 passes,
                                 * allow the sleep to be woken up by resetting
                                 * vm_pages_needed to 1 (NOTE: we are still
                                 * actively paging!).
                                 */
                                if (isep == 0)
                                        vm_pages_needed = 1;
                                tsleep(&vm_pages_needed, 0, "pdelay", 2);
                        } else if (swap_pager_full == 0) {
                                /*
                                 * We've taken too many passes, force a
                                 * longer delay.
                                 */
                                tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
                        } else {
                                /*
                                 * Running out of memory, catastrophic
                                 * back-off to one-second intervals.
                                 */
                                tsleep(&vm_pages_needed, 0, "pdelay", hz);
                        }
                } else if (vm_pages_needed) {
                        /*
                         * We retired our calculated shortage but we may have
                         * to continue paging if threads drain memory too far
                         * below our target.
                         *
                         * Similar to vm_page_free_wakeup() in vm_page.c.
                         */
                        pass = 0;
                        if (!vm_paging_needed(0)) {
                                /* still more than half-way to our target */
                                vm_pages_needed = 0;
                                wakeup(&vmstats.v_free_count);
                        } else
                        if (!vm_page_count_min(vm_page_free_hysteresis)) {
                                /*
                                 * Continue operations with wakeup
                                 * (set variable to avoid overflow)
                                 */
                                vm_pages_needed = 2;
                                wakeup(&vmstats.v_free_count);
                        } else {
                                /*
                                 * No wakeup() needed, continue operations.
                                 * (set variable to avoid overflow)
                                 */
                                vm_pages_needed = 2;
                        }
                } else {
                        /*
                         * Turn paging back on immediately if we are under
                         * the minimum.
                         */
                        pass = 0;
                }
        }
}

static struct kproc_desc pg1_kp = {
        "pagedaemon",
        vm_pageout_thread,
        &pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

static struct kproc_desc pg2_kp = {
        "emergpager",
        vm_pageout_thread,
        &emergpager
};
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);

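/*
 * Summary of the retry/back-off ladder used by vm_pageout_thread() above
 * when a pass fails to retire its calculated shortage.  Timings in
 * parentheses assume hz = 100 purely for illustration (hz is
 * configuration-dependent), and a "sweep" is pass / MAXSCAN_DIVIDER,
 * i.e. enough passes to cover the entire inactive queue once:
 *
 *      under 10 sweeps, other waiters, swap not full   retry immediately
 *      under 10 sweeps, other waiters, swap full       hz / 5  (~200ms)
 *      under 10 sweeps, no other waiters               2 ticks (~20ms)
 *      10+ sweeps, swap not full                       hz / 10 (~100ms)
 *      10+ sweeps, swap full                           hz      (1 second)
 */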
2576 * 2577 * If the pagedaemon is already active bump vm_pages_needed as a hint 2578 * that there are even more requests pending. 2579 * 2580 * SMP races ok? 2581 * No requirements. 2582 */ 2583 void 2584 pagedaemon_wakeup(void) 2585 { 2586 if (vm_paging_needed(0) && curthread != pagethread) { 2587 if (vm_pages_needed <= 1) { 2588 vm_pages_needed = 1; /* SMP race ok */ 2589 wakeup(&vm_pages_needed); /* tickle pageout */ 2590 } else if (vm_page_count_min(0)) { 2591 ++vm_pages_needed; /* SMP race ok */ 2592 /* a wakeup() would be wasted here */ 2593 } 2594 } 2595 } 2596 2597 #if !defined(NO_SWAPPING) 2598 2599 /* 2600 * SMP races ok? 2601 * No requirements. 2602 */ 2603 static void 2604 vm_req_vmdaemon(void) 2605 { 2606 static int lastrun = 0; 2607 2608 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 2609 wakeup(&vm_daemon_needed); 2610 lastrun = ticks; 2611 } 2612 } 2613 2614 static int vm_daemon_callback(struct proc *p, void *data __unused); 2615 2616 /* 2617 * No requirements. 2618 * 2619 * Scan processes for exceeding their rlimits, deactivate pages 2620 * when RSS is exceeded. 2621 */ 2622 static void 2623 vm_daemon(void) 2624 { 2625 while (TRUE) { 2626 tsleep(&vm_daemon_needed, 0, "psleep", 0); 2627 allproc_scan(vm_daemon_callback, NULL, 0); 2628 } 2629 } 2630 2631 static int 2632 vm_daemon_callback(struct proc *p, void *data __unused) 2633 { 2634 struct vmspace *vm; 2635 vm_pindex_t limit, size; 2636 2637 /* 2638 * if this is a system process or if we have already 2639 * looked at this process, skip it. 2640 */ 2641 lwkt_gettoken(&p->p_token); 2642 2643 if (p->p_flags & (P_SYSTEM | P_WEXIT)) { 2644 lwkt_reltoken(&p->p_token); 2645 return (0); 2646 } 2647 2648 /* 2649 * if the process is in a non-running type state, 2650 * don't touch it. 2651 */ 2652 if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) { 2653 lwkt_reltoken(&p->p_token); 2654 return (0); 2655 } 2656 2657 /* 2658 * get a limit 2659 */ 2660 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, 2661 p->p_rlimit[RLIMIT_RSS].rlim_max)); 2662 2663 vm = p->p_vmspace; 2664 vmspace_hold(vm); 2665 size = pmap_resident_tlnw_count(&vm->vm_pmap); 2666 if (limit >= 0 && size > 4096 && 2667 size - 4096 >= limit && vm_pageout_memuse_mode >= 1) { 2668 vm_pageout_map_deactivate_pages(&vm->vm_map, limit); 2669 } 2670 vmspace_drop(vm); 2671 2672 lwkt_reltoken(&p->p_token); 2673 2674 return (0); 2675 } 2676 2677 #endif 2678