/*
 * Copyright (c) 2004,2013 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vfs_spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)	      + v_spin + vfs_spin
 *	INACTIVE -> ACTIVE		vn_lock(any)  + v_spin + vfs_spin
 *	CACHED   -> ACTIVE		vn_lock(any)  + v_spin + vfs_spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vfs_spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */
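/*
 * Illustrative sketch (not normative): a vget() on a CACHED vnode follows
 * the CACHED -> ACTIVE row above.  The caller's vn_lock() (shared is
 * sufficient) plus the vref taken by vget() allow _vactivate() to acquire
 * v_spin and vfs_spin and move the vnode onto the active list.
 */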
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 */
TAILQ_HEAD(freelst, vnode);
static struct freelst	vnode_active_list;
static struct freelst	vnode_inactive_list;
static struct vnode	vnode_active_rover;
static struct spinlock	vfs_spin = SPINLOCK_INITIALIZER(vfs_spin, "vfs_spin");

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static ulong trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif
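/*
 * Layout sketch (illustrative only): vnode_active_rover is a placeholder
 * vnode embedded in the active list and advanced one entry per
 * cleanfreevnode() scan step; the inactive list keeps reclaimed vnodes
 * near the head and vnodes that still carry cached data at the tail:
 *
 *	vnode_active_list:    head -> vp ... [vnode_active_rover] ... vp
 *	vnode_inactive_list:  head -> (VRECLAIMED vps) ... (cached vps)
 */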
/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	TAILQ_INIT(&vnode_inactive_list);
	TAILQ_INIT(&vnode_active_list);
	TAILQ_INSERT_TAIL(&vnode_active_list, &vnode_active_rover, v_list);
	spin_init(&vfs_spin, "vfslock");
	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((ulong)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vfs_spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		panic("_vactivate: already active");
		/* NOT REACHED */
		spin_unlock(&vfs_spin);
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
		--inactivevnodes;
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vnode_active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	++activevnodes;

	spin_unlock(&vfs_spin);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
#ifdef TRACKVNODE
	if ((ulong)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vfs_spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vnode_active_list, vp, v_list);
		--activevnodes;
		break;
	case VS_INACTIVE:
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		spin_unlock(&vfs_spin);
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vnode_inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
	}
	++inactivevnodes;
	vp->v_state = VS_INACTIVE;

	spin_unlock(&vfs_spin);
}

static __inline
void
_vinactive_tail(struct vnode *vp)
{
	spin_lock(&vfs_spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vnode_active_list, vp, v_list);
		--activevnodes;
		break;
	case VS_INACTIVE:
		panic("_vinactive_tail: already inactive");
		/* NOT REACHED */
		spin_unlock(&vfs_spin);
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
	++inactivevnodes;
	vp->v_state = VS_INACTIVE;

	spin_unlock(&vfs_spin);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}
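/*
 * Usage sketch (illustrative only): the common pattern for a caller that
 * already knows the vnode is active, e.g. one obtained from a held
 * namecache entry:
 *
 *	vref(vp);
 *	...use vp; the ref keeps cleanfreevnode() from deactivating it,
 *	   and no vnode lock is needed just to hold the ref...
 *	vrele(vp);
 */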
/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
		/* might deactivate page */
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *		      VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
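/*
 * Usage sketch (illustrative only), assuming the caller holds an auxiliary
 * or namecache reference that keeps vp from being destroyed and that a
 * shared lock is sufficient for its purposes:
 *
 *	if ((error = vget(vp, LK_SHARED)) == 0) {
 *		...vp is now referenced, locked, and guaranteed active...
 *		vput(vp);	(vput == vn_unlock + vrele)
 *	}
 */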
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		error = 0;
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockcountnb(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}
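/*
 * Usage sketch (illustrative only): vx_get()/vx_put() bracket deactivation
 * or reclamation work without reactivating the vnode:
 *
 *	vx_get(vp);		(ref + exclusive v_lock)
 *	...reclaim/deactivate-side work; the vnode is not moved to the
 *	   active list...
 *	vx_put(vp);		(unlock + vrele)
 */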
/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (cachedvnodes < inactivevnodes)
		goto skip;

	for (count = 0; count < maxcount * 2; count++) {
		spin_lock(&vfs_spin);

		vp = TAILQ_NEXT(&vnode_active_rover, v_list);
		TAILQ_REMOVE(&vnode_active_list, &vnode_active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vnode_active_list,
					  &vnode_active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vnode_active_list, vp,
					   &vnode_active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vfs_spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vfs_spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vfs_spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vfs_spin);
		vrele(vp);
	}

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	for (count = 0; count < maxcount; count++) {
		spin_lock(&vfs_spin);

		vp = TAILQ_FIRST(&vnode_inactive_list);
		if (vp == NULL) {
			spin_unlock(&vfs_spin);
			break;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
			spin_unlock(&vfs_spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vfs_spin);
#ifdef TRACKVNODE
		if ((ulong)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vfs_spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vnode_inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vnode_inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vfs_spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vfs_spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vfs_spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
		--inactivevnodes;
		vp->v_state = VS_DYING;
		spin_unlock(&vfs_spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		return(vp);
	}
	return(NULL);
}
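/*
 * Worked example (illustrative only): the trigger computed above scales
 * with memory size.  With vmstats.v_page_count = 1000000 pages and
 * activevnodes = 100000, trigger = 1000000 / 200001 = 4 (integer math),
 * so vnodes whose VM objects cache at least 4 resident pages age down by
 * only 1 per rover pass instead of VACT_INC (2), keeping data-heavy
 * vnodes on the active list longer.
 */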
/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We always allocate the vnode.  Attempting to recycle existing vnodes
 * here can lead to numerous deadlocks, particularly with softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= desiredvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= desiredvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	atomic_add_int(&numvnodes, 1);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= desiredvnodes &&
	    cachedvnodes + inactivevnodes >= desiredvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}
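/*
 * Worked example (illustrative only, hypothetical tuning): with
 * desiredvnodes = 100000, allocvnode() flags LWP_MP_VNLRU once numvnodes
 * reaches 110000 (11/10's), provided at least half of desiredvnodes are
 * freeable (cached + inactive).  allocvnode_gc() then frees up to
 * batchfreevnodes (default 5) vnodes per userexit while numvnodes stays
 * at or above 100000, and per the comment above vnlru works the same
 * problem in the background at the 9/10's level (90000).
 */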
int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}