/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing but not quite).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE_OBJ(M_VNODE, sizeof(struct vnode), "vnodes", "vnodes");
static MALLOC_DEFINE(M_VNODE_HASH, "vnodelsthash", "vnode list hash");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
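
/*
 * VLIST_HASH() below is purely a distribution function: the vnode
 * pointer is xor'd with a constant, reduced modulo a large prime to
 * scatter the bits, and then reduced modulo ncpus to select one of
 * the per-cpu vnode_index structures.  It implies no locking of its
 * own; the selected vnode_index's spinlock still governs list access.
 */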

#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_obj_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE_HASH, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

void
vref_special(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
}

void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	int count;

#if 1
	count = vp->v_refcnt;
	cpu_ccfence();

	for (;;) {
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, count - 1)) {
				break;
			}
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_fcmpset_int(&vp->v_refcnt, &count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		cpu_pause();
		/* retry */
	}
#else
	/*
	 * XXX NOT YET WORKING!  Multiple threads can reference the vnode
	 * after dropping their count, racing destruction, because this
	 * code is not directly transitioning from 1->VREF_FINALIZE.
	 */
	/*
	 * Drop the ref-count.  On the 1->0 transition we check VREF_FINALIZE
	 * and attempt to acquire VREF_TERMINATE if set.  It is possible for
	 * concurrent vref/vrele to race and bounce 0->1, 1->0, etc, but
	 * only one will be able to transition the vnode into the
	 * VREF_TERMINATE state.
	 *
	 * NOTE: VREF_TERMINATE is *in* VREF_MASK, so the vnode may only enter
	 *	 this state once.
	 */
	count = atomic_fetchadd_int(&vp->v_refcnt, -1);
	if ((count & VREF_MASK) == 1) {
		atomic_add_int(&mycpu->gd_cachedvnodes, 1);
		--count;
		while ((count & (VREF_MASK | VREF_FINALIZE)) == VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_fcmpset_int(&vp->v_refcnt,
					       &count, VREF_TERMINATE)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, -1);
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		}
	}
#endif
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
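
/*
 * Typical usage sketch for the aux ref above: code that must guarantee
 * the vnode structure is not destroyed (kfree()'d) across a window in
 * which it holds no other lock or ref can bracket that window with an
 * auxiliary reference.  The aux ref does not prevent deactivation or
 * reclamation, only destruction:
 *
 *	vhold(vp);
 *	... vp may be deactivated or even reclaimed here, but its
 *	    memory remains valid ...
 *	vdrop(vp);
 */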

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	spin_lock_update_only(&vp->v_spin);
}

void
vx_unlock(struct vnode *vp)
{
	spin_unlock_update_only(&vp->v_spin);
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/*
 * Downgrades a VX lock to a normal VN lock.  The lock remains EXCLUSIVE.
 *
 * Generally required after calling getnewvnode() if the intention is
 * to return a normal locked vnode to the caller.
 */
void
vx_downgrade(struct vnode *vp)
{
	spin_unlock_update_only(&vp->v_spin);
}

/****************************************************************
 *		VNODE ACQUISITION FUNCTIONS			*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
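
/*
 * Usage sketch: a caller already protected by an auxiliary or namecache
 * reference typically brackets its access like this (LK_SHARED is just
 * an example lock type):
 *
 *	if (vget(vp, LK_SHARED) == 0) {
 *		... vnode is now referenced, locked, and active ...
 *		vput(vp);		(unlock + vrele)
 *	}
 *
 * vx_get()/vx_put() form the equivalent bracket when reactivation is
 * not desired (e.g. in reclamation paths).
 */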

int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		if (vp->v_flag & VINACTIVE)
			_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		if (vp->v_act < VACT_MAX) {
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
		}
		error = 0;
		if (vp->v_refcnt & VREF_TERMINATE)	/* SMP race ok */
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
							VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	spin_lock_update_only(&vp->v_spin);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		spin_lock_update_only(&vp->v_spin);
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	spin_unlock_update_only(&vp->v_spin);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 *	 The returned vnode will be VX locked.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes() < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs != vp->v_namecache_count ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 *
		 * The cache_inval_vp() can fail if any of the namecache
		 * elements are actively locked, preventing the vnode from
		 * being reclaimed.  This is the desired operation as it
		 * gives the namecache code certain guarantees just by
		 * holding a ncp.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which occurs
	 * which terminates the vnode will cause it to be returned to the
	 * same pcpu structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				TAILQ_REMOVE(&vi->inactive_list,
					     vp, v_list);
				TAILQ_INSERT_TAIL(&vi->inactive_list,
						  vp, v_list);
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		vx_unlock(vp);		/* safety: keep the API clean */
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc_obj(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	vx_lock(vp);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree_obj(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}