/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin,
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing but not quite).
 */
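
/*
 * For example, the INACTIVE/CACHED -> ACTIVE transition performed by
 * vget() below acquires the vnode lock first (a shared lock is
 * sufficient), then vp->v_spin, and finally the per-cpu list spinlock
 * (vi->spin, taken inside _vactivate()) before moving the vnode onto
 * the active list.
 */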

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

/*
 * Vnode activity aging: v_act is bumped by VACT_INC when a vnode is
 * used (vget()) and decayed by the cleanfreevnode() rover, saturating
 * at VACT_MAX.
 */
#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active vnodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached vnodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive vnodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

/*
 * Add a ref to a vnode which may be in any state.  On the 0->1
 * transition the vnode is no longer accounted as "cached", so the
 * per-cpu cachedvnodes accumulator is adjusted accordingly.
 */
void
vref_special(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
}

/*
 * Sum the per-cpu accumulators into the global, sysctl-visible totals.
 */
void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	int count;

#if 1
	count = vp->v_refcnt;
	cpu_ccfence();

	for (;;) {
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, count - 1)) {
				break;
			}
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_fcmpset_int(&vp->v_refcnt, &count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		cpu_pause();
		/* retry */
	}
#else
	/*
	 * XXX NOT YET WORKING!  Multiple threads can reference the vnode
	 * after dropping their count, racing destruction, because this
	 * code is not directly transitioning from 1->VREF_FINALIZE.
	 */
	/*
	 * Drop the ref-count.  On the 1->0 transition we check VREF_FINALIZE
	 * and attempt to acquire VREF_TERMINATE if set.  It is possible for
	 * concurrent vref/vrele to race and bounce 0->1, 1->0, etc, but
	 * only one will be able to transition the vnode into the
	 * VREF_TERMINATE state.
	 *
	 * NOTE: VREF_TERMINATE is *in* VREF_MASK, so the vnode may only enter
	 *	 this state once.
	 */
	count = atomic_fetchadd_int(&vp->v_refcnt, -1);
	if ((count & VREF_MASK) == 1) {
		atomic_add_int(&mycpu->gd_cachedvnodes, 1);
		--count;
		while ((count & (VREF_MASK | VREF_FINALIZE)) == VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, VREF_TERMINATE)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, -1);
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		}
	}
#endif
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
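
/*
 * Illustrative sketch of auxiliary reference usage: a subsystem that
 * must prevent the vnode structure from being kfree()'d across a
 * blocking operation, but does not need to prevent deactivation or
 * reclamation, brackets the operation with vhold()/vdrop():
 *
 *	vhold(vp);
 *	...potentially blocking work that does not require vp locked...
 *	vdrop(vp);
 */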

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	spin_lock_update_only(&vp->v_spin);
}

void
vx_unlock(struct vnode *vp)
{
	spin_unlock_update_only(&vp->v_spin);
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/*
 * Downgrades a VX lock to a normal VN lock.  The lock remains EXCLUSIVE.
 *
 * Generally required after calling getnewvnode() if the intention is
 * to return a normal locked vnode to the caller.
 */
void
vx_downgrade(struct vnode *vp)
{
	spin_unlock_update_only(&vp->v_spin);
}

/****************************************************************
 *		VNODE ACQUISITION FUNCTIONS			*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
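
/*
 * Illustrative sketch of the common vget()/vput() pattern for a caller
 * already holding a namecache (auxiliary) reference on the vnode:
 *
 *	if ((error = vget(vp, LK_SHARED)) == 0) {
 *		...vnode is referenced, locked, and (re)activated...
 *		vput(vp);		(vn_unlock() + vrele())
 *	}
 */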
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		if (vp->v_flag & VINACTIVE)
			_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		if (vp->v_act < VACT_MAX) {
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
		}
		error = 0;
		if (vp->v_refcnt & VREF_TERMINATE)	/* SMP race ok */
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
							VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	spin_lock_update_only(&vp->v_spin);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		spin_lock_update_only(&vp->v_spin);
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	spin_unlock_update_only(&vp->v_spin);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}
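
/*
 * Illustrative sketch: a filesystem that allocated a new vnode but then
 * failed its setup path typically leaves v_type as VNON (or sets VBAD)
 * and disposes of the vnode with vx_put(), which flags it for
 * finalization so vrele() can queue it appropriately:
 *
 *	vp->v_type = VBAD;
 *	vx_put(vp);
 */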

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 *	 The returned vnode will be VX locked.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes() < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * Decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs != vp->v_namecache_count ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 *
		 * The cache_inval_vp() can fail if any of the namecache
		 * elements are actively locked, preventing the vnode from
		 * being reclaimed.  This is the desired operation as it
		 * gives the namecache code certain guarantees just by
		 * holding a ncp.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}
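
	/*
	 * For example, with maxvnodes = 100000 the LWP_MP_VNLRU flag is
	 * only set once numvnodes reaches 110000 (11/10's) and at least
	 * 50000 vnodes are cached or inactive; vnlru normally keeps the
	 * count near 90000 (9/10's) so this slow path rarely triggers.
	 */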

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which occurs
	 * which terminates the vnode will cause it to be returned to the
	 * same pcpu structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				TAILQ_REMOVE(&vi->inactive_list,
					     vp, v_list);
				TAILQ_INSERT_TAIL(&vi->inactive_list,
						  vp, v_list);
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		vx_unlock(vp);		/* safety: keep the API clean */
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	vx_lock(vp);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}
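
/*
 * Illustrative sketch of typical allocvnode() consumption (via a
 * getnewvnode()-style wrapper): the caller initializes the VX-locked,
 * referenced vnode and then downgrades the VX lock to a normal
 * exclusive vnode lock before returning it:
 *
 *	vp = allocvnode(0, 0);
 *	vp->v_type = VREG;
 *	...attach v_data, the vop vector, and mount linkage...
 *	vx_downgrade(vp);	(now a normally locked, referenced vnode)
 */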

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}