/*
 * Copyright (c) 2004,2013 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vfs_spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vfs_spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vfs_spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vfs_spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vfs_spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */
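/*
 * Illustrative sketch (not part of the original file): the lock ordering
 * the table above implies for a CACHED/INACTIVE -> ACTIVE transition, as
 * implemented by vget() below.  _vactivate() acquires vfs_spin itself,
 * so the caller supplies the vnode lock and v_spin:
 *
 *	vn_lock(vp, LK_SHARED | LK_RETRY);	// any vnode lock suffices
 *	spin_lock(&vp->v_spin);
 *	_vactivate(vp);			// takes and releases vfs_spin
 *	spin_unlock(&vp->v_spin);
 */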
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, all others are inserted at
 * the tail.
 */
TAILQ_HEAD(freelst, vnode);
static struct freelst	vnode_active_list;
static struct freelst	vnode_inactive_list;
static struct vnode	vnode_active_rover;	/* marker used by
						 * cleanfreevnode() to scan
						 * the active list */
static struct spinlock	vfs_spin = SPINLOCK_INITIALIZER(vfs_spin, "vfs_spin");

int  activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active vnodes");
int  cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Total number of cached vnodes");
int  inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive vnodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif
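/*
 * Illustrative note (not part of the original file): the counters above
 * live under the _debug sysctl tree and can be inspected from userland,
 * e.g.:
 *
 *	$ sysctl debug.activevnodes debug.inactivevnodes debug.cachedvnodes
 */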
/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	TAILQ_INIT(&vnode_inactive_list);
	TAILQ_INIT(&vnode_active_list);
	TAILQ_INSERT_TAIL(&vnode_active_list, &vnode_active_rover, v_list);
	spin_init(&vfs_spin, "vfslock");
	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vfs_spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		panic("_vactivate: already active");
		/* NOT REACHED */
		spin_unlock(&vfs_spin);
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
		--inactivevnodes;
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vnode_active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	++activevnodes;

	spin_unlock(&vfs_spin);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vfs_spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vnode_active_list, vp, v_list);
		--activevnodes;
		break;
	case VS_INACTIVE:
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		spin_unlock(&vfs_spin);
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vnode_inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
	}
	++inactivevnodes;
	vp->v_state = VS_INACTIVE;

	spin_unlock(&vfs_spin);
}

static __inline
void
_vinactive_tail(struct vnode *vp)
{
	spin_lock(&vfs_spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vnode_active_list, vp, v_list);
		--activevnodes;
		break;
	case VS_INACTIVE:
		panic("_vinactive_tail: already inactive");
		/* NOT REACHED */
		spin_unlock(&vfs_spin);
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
	++inactivevnodes;
	vp->v_state = VS_INACTIVE;

	spin_unlock(&vfs_spin);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

/*
 * Count the number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path, particularly
 * not updating the global.  Each cpu tracks its own accumulator.
 * The individual accumulators are not accurate and must be summed
 * together.
 */
int
countcachedvnodes(int gupdate)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	if (gupdate)
		cachedvnodes = n;
	return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}
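/*
 * Illustrative sketch (not part of the original file): a caller that
 * wants the 1->0 transition to deactivate the vnode requests
 * finalization first; vx_put() below uses exactly this pattern for
 * unwanted new vnodes:
 *
 *	atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
 *	vrele(vp);		// 1->0 now runs vnode_terminate()
 */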
/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
		/* might deactivate pages */
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *			VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
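/*
 * Illustrative sketch (not part of the original file): the canonical
 * vget()/vput() pattern for a vnode found via an external reference
 * such as a namecache entry:
 *
 *	error = vget(vp, LK_SHARED);
 *	if (error == 0) {
 *		...operate on the referenced, locked, ACTIVE vnode...
 *		vput(vp);	// vn_unlock() + vrele()
 *	}
 */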
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		error = 0;
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif
/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockcountnb(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}
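/*
 * Illustrative sketch (an assumption, not original text): a filesystem
 * discarding a half-constructed vnode typically marks it dead first so
 * the VNON/VBAD test in vx_put() queues it for immediate finalization:
 *
 *	vp->v_type = VBAD;
 *	vx_put(vp);	// sets VREF_FINALIZE, unlocks, vrele()s
 */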
/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes(0) < inactivevnodes)
		goto skip;

	for (count = 0; count < maxcount * 2; count++) {
		spin_lock(&vfs_spin);

		vp = TAILQ_NEXT(&vnode_active_rover, v_list);
		TAILQ_REMOVE(&vnode_active_list, &vnode_active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vnode_active_list,
					  &vnode_active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vnode_active_list, vp,
					   &vnode_active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vfs_spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vfs_spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * Decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vfs_spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vfs_spin);
		vrele(vp);
	}

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	for (count = 0; count < maxcount; count++) {
		spin_lock(&vfs_spin);

		vp = TAILQ_FIRST(&vnode_inactive_list);
		if (vp == NULL) {
			spin_unlock(&vfs_spin);
			break;
		}

		/*
		 * The non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
			spin_unlock(&vfs_spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vfs_spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vfs_spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vnode_inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vnode_inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vfs_spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vfs_spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vfs_spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
		--inactivevnodes;
		vp->v_state = VS_DYING;
		spin_unlock(&vfs_spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		return(vp);
	}
	return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We always allocate the vnode.  Attempting to recycle existing vnodes
 * here can lead to numerous deadlocks, particularly with softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	atomic_add_int(&numvnodes, 1);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}
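/*
 * Illustrative numeric example (assumption: maxvnodes = 100000) of the
 * thresholds described above and in allocvnode_gc() below: vnlru begins
 * cleaning at 90000 vnodes (9/10's), allocvnode() flags the thread at
 * 110000 (11/10's), and allocvnode_gc() then frees batches on the
 * userexit path while numvnodes is still >= 100000 (10/10's).
 */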
/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10's, vnlru runs at 9/10's.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    point).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedvnodes(0) + inactivevnodes >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}