/*
 * Copyright (c) 2004,2013-2022 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing but not quite).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE_OBJ(M_VNODE, sizeof(struct vnode), "vnodes", "vnodes");
static MALLOC_DEFINE(M_VNODE_HASH, "vnodelsthash", "vnode list hash");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");

static long auxrecovervnodes1;
SYSCTL_LONG(_debug, OID_AUTO, auxrecovervnodes1, CTLFLAG_RW,
	&auxrecovervnodes1, 0, "vnlru auxiliary vnodes recovered (active scan)");
static long auxrecovervnodes2;
SYSCTL_LONG(_debug, OID_AUTO, auxrecovervnodes2, CTLFLAG_RW,
	&auxrecovervnodes2, 0, "vnlru auxiliary vnodes recovered (inactive scan)");

#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_obj_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE_HASH, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

void
vref_special(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
}

void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}
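
/*
 * Illustrative sketch (not part of the original source): the common
 * ref/unref pairing for a vnode already known to be referenced and
 * active, e.g. one reached via a held namecache entry.  vref() may
 * only be used when the vnode cannot be VS_INACTIVE; otherwise vget()
 * must be used instead.
 *
 *	vref(vp);		add our own reference
 *	...
 *	vrele(vp);		drop it; the 1->0 transition stays
 *				lockless unless VREF_FINALIZE is set
 */
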
/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	int count;

#if 1
	count = vp->v_refcnt;
	cpu_ccfence();

	for (;;) {
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, count - 1)) {
				break;
			}
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_fcmpset_int(&vp->v_refcnt, &count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		cpu_pause();
		/* retry */
	}
#else
	/*
	 * XXX NOT YET WORKING!  Multiple threads can reference the vnode
	 * after dropping their count, racing destruction, because this
	 * code is not directly transitioning from 1->VREF_FINALIZE.
	 */
	/*
	 * Drop the ref-count.  On the 1->0 transition we check VREF_FINALIZE
	 * and attempt to acquire VREF_TERMINATE if set.  It is possible for
	 * concurrent vref/vrele to race and bounce 0->1, 1->0, etc, but
	 * only one will be able to transition the vnode into the
	 * VREF_TERMINATE state.
	 *
	 * NOTE: VREF_TERMINATE is *in* VREF_MASK, so the vnode may only enter
	 *	 this state once.
	 */
	count = atomic_fetchadd_int(&vp->v_refcnt, -1);
	if ((count & VREF_MASK) == 1) {
		atomic_add_int(&mycpu->gd_cachedvnodes, 1);
		--count;
		while ((count & (VREF_MASK | VREF_FINALIZE)) == VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, VREF_TERMINATE)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, -1);
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		}
	}
#endif
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
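
/*
 * Illustrative sketch (not part of the original source): an auxiliary
 * reference is what a data structure takes when it only needs the
 * struct vnode memory to remain valid, not the vnode's identity or
 * active state.
 *
 *	vhold(vp);		vp cannot be kfree()'d
 *	...block, e.g. wait for I/O or a lock...
 *	vdrop(vp);		vp may now be destroyed again
 *
 * The vnode can still be deactivated or reclaimed while held, so
 * anything beyond the structure itself must be revalidated afterwards.
 */
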
/*
 * Set VREF_FINALIZE to request that the vnode be inactivated
 * as soon as possible (on the 1->0 transition of its refs).
 *
 * Caller must have a ref on the vnode.
 *
 * This function has no effect if the vnode is already in termination
 * processing.
 */
void
vfinalize(struct vnode *vp)
{
	if ((vp->v_refcnt & VREF_MASK) > 0)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
}

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	spin_lock_update_only(&vp->v_spin);
}

void
vx_unlock(struct vnode *vp)
{
	spin_unlock_update_only(&vp->v_spin);
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/*
 * Downgrades a VX lock to a normal VN lock.  The lock remains EXCLUSIVE.
 *
 * Generally required after calling getnewvnode() if the intention is
 * to return a normal locked vnode to the caller.
 */
void
vx_downgrade(struct vnode *vp)
{
	spin_unlock_update_only(&vp->v_spin);
}

/****************************************************************
 *		   VNODE ACQUISITION FUNCTIONS			*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired (see the
 * illustrative sketch below).
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
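
/*
 * Illustrative sketch (not part of the original source): a caller
 * holding only an auxiliary reference (e.g. via the namecache) that
 * wants to actually use the vnode reactivates it with vget() and
 * releases it with vput():
 *
 *	if ((error = vget(vp, LK_SHARED)) == 0) {
 *		...use the referenced, locked, VS_ACTIVE vnode...
 *		vput(vp);	unlock + vrele in one call
 *	}
 *
 * vget() returns ENOENT if the vnode was reclaimed out from under the
 * caller, so the return value must always be checked.
 */
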
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is a
		 *	 shared lock.  This race is allowed.
		 */
		if (vp->v_flag & VINACTIVE)
			_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		if (vp->v_act < VACT_MAX) {
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
		}
		error = 0;
		if (vp->v_refcnt & VREF_TERMINATE)	/* SMP race ok */
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
							VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	spin_lock_update_only(&vp->v_spin);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		spin_lock_update_only(&vp->v_spin);
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	spin_unlock_update_only(&vp->v_spin);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}
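
/*
 * Illustrative sketch (not part of the original source): the cleanup
 * scans below use the non-blocking form so they never stall on a vnode
 * another thread is busy with.  vx_get_nonblock() both refs and
 * VX-locks the vnode on success, so a bare vx_put() undoes it:
 *
 *	if (vx_get_nonblock(vp) == 0) {
 *		...inspect or reclaim the vnode...
 *		vx_put(vp);	drops the VX lock and the ref
 *	}
 */
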
/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 *	 The returned vnode will be VX locked.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;
	int cachedvnodes;

	/*
	 * Try to deactivate some vnodes cached on the active list.  We
	 * generally want a 50-50 balance active vs inactive.
	 */
	cachedvnodes = countcachedvnodes();
	if (cachedvnodes < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		/* each pcpu list is worked for 16 rover iterations at a time */
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Don't try to deactivate if someone has the vp referenced.
		 */
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * Calculate the deactivation weight.  Reduce v_act less
		 * if the vnode's object has a lot of VM pages.
		 *
		 * XXX obj race
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;

			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger)
			{
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * If v_auxrefs is not the expected value the vnode might
		 * reside in the namecache topology on an internal node and
		 * not at a leaf.  v_auxrefs can be wrong for other reasons,
		 * but this is the most likely.
		 *
		 * Such vnodes will not be recycled by vnlru later on in
		 * its inactive scan, so try to make the vnode presentable
		 * and only move it to the inactive queue if we can.
		 *
		 * On success, the vnode is disconnected from the namecache
		 * topology entirely, making vnodes above it in the topology
		 * recyclable.  This will allow the active scan to continue
		 * to make progress in balancing the active and inactive
		 * lists.
		 */
		if (vp->v_auxrefs != vp->v_namecache_count) {
			if (vx_get_nonblock(vp) == 0) {
				spin_unlock(&vi->spin);
				if ((vp->v_refcnt & VREF_MASK) == 1)
					cache_inval_vp_quick(vp);
				if (vp->v_auxrefs == vp->v_namecache_count)
					++auxrecovervnodes1;
				vx_put(vp);
			} else {
				spin_unlock(&vi->spin);
			}
			continue;
		}

		/*
		 * Try to deactivate the vnode.  It is ok if v_auxrefs
		 * races every once in a while, we just don't want an
		 * excess of unreclaimable vnodes on the inactive list.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vi->spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vi->spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * The active scan already did this, but some leakage can
		 * happen.  Don't let an easily recyclable vnode go to
		 * waste!
		 */
		if (vp->v_auxrefs != vp->v_namecache_count &&
		    (vp->v_refcnt & ~VREF_FINALIZE) == VREF_TERMINATE + 1)
		{
			cache_inval_vp_quick(vp);
			if (vp->v_auxrefs == vp->v_namecache_count)
				++auxrecovervnodes2;
		}

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs != vp->v_namecache_count ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 *
		 * The cache_inval_vp() can fail if any of the namecache
		 * elements are actively locked, preventing the vnode from
		 * being reclaimed.  This is the desired operation as it
		 * gives the namecache code certain guarantees just by
		 * holding a ncp.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which terminates
	 * the vnode will cause it to be returned to the same pcpu
	 * structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vi->spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vi->spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				TAILQ_REMOVE(&vi->inactive_list,
					     vp, v_list);
				TAILQ_INSERT_TAIL(&vi->inactive_list,
						  vp, v_list);
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		vx_unlock(vp);		/* safety: keep the API clean */
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc_obj(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	vx_lock(vp);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    point).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10)
	{
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree_obj(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}