/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)

static struct vnode_index *vnode_list_hash;
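
/*
 * Illustration: VLIST_HASH() picks the per-cpu vnode_index for a vnode
 * from its pointer value.  For example, with ncpus == 4 a vnode at
 * address P is assigned list index
 *
 *	((P ^ VLIST_XOR) % VLIST_PRIME2) % 4
 *
 * which scatters vnodes across the per-cpu lists and limits spinlock
 * and cache line contention as described above.
 */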

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}
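
/*
 * Illustration: callers reactivate a vnode while holding its v_spin;
 * _vactivate() then takes the per-cpu list spinlock internally, matching
 * the locking table at the top of this file.  This is roughly the
 * pattern vget() uses below:
 *
 *	spin_lock(&vp->v_spin);
 *	if (vp->v_state == VS_INACTIVE || vp->v_state == VS_CACHED)
 *		_vactivate(vp);
 *	spin_unlock(&vp->v_spin);
 */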

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

static __inline
void
_vinactive_tail(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive_tail: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
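
/*
 * Illustration: an auxiliary reference only guarantees that the vnode
 * structure will not be kfree()'d; it does not keep the vnode active or
 * prevent reclamation.  A hypothetical consumer holding a weak pointer
 * might bracket it as
 *
 *	vhold(vp);
 *	remember_vnode(vp);		(hypothetical helper)
 *	...
 *	vdrop(vp);
 *
 * and must still obtain a real ref (vref()/vget()) before using the
 * vnode's contents.
 */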

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *			VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in an SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
The solution is to 564 * unconditionally clear VREF_TERMINATE here as well. 565 * 566 * NOTE! Multiple threads may clear VINACTIVE if this is 567 * shared lock. This race is allowed. 568 */ 569 _vclrflags(vp, VINACTIVE); /* SMP race ok */ 570 vp->v_act += VACT_INC; 571 if (vp->v_act > VACT_MAX) /* SMP race ok */ 572 vp->v_act = VACT_MAX; 573 error = 0; 574 atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE); 575 } else { 576 /* 577 * If the vnode is not VS_ACTIVE it must be reactivated 578 * in addition to clearing VINACTIVE. An exclusive spin_lock 579 * is needed to manipulate the vnode's list. 580 * 581 * Because the lockmgr lock might be shared, we might race 582 * another reactivation, which we handle. In this situation, 583 * however, the refcnt prevents other v_state races. 584 * 585 * As with above, clearing VINACTIVE is allowed to race other 586 * clearings of VINACTIVE. 587 * 588 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when 589 * the refcnt is non-zero and the vnode has not been 590 * reclaimed. This also means that the transitions do 591 * not affect cachedvnodes. 592 * 593 * It is possible for a shared lock to cause a race with 594 * another thread that is also in the process of clearing 595 * VREF_TERMINATE, meaning that we might return with it still 596 * set and then assert in a later vref(). The solution is to 597 * unconditionally clear VREF_TERMINATE here as well. 598 */ 599 _vclrflags(vp, VINACTIVE); 600 vp->v_act += VACT_INC; 601 if (vp->v_act > VACT_MAX) /* SMP race ok */ 602 vp->v_act = VACT_MAX; 603 spin_lock(&vp->v_spin); 604 605 switch(vp->v_state) { 606 case VS_INACTIVE: 607 _vactivate(vp); 608 atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE | 609 VREF_FINALIZE); 610 spin_unlock(&vp->v_spin); 611 break; 612 case VS_CACHED: 613 _vactivate(vp); 614 atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE | 615 VREF_FINALIZE); 616 spin_unlock(&vp->v_spin); 617 break; 618 case VS_ACTIVE: 619 atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE | 620 VREF_TERMINATE); 621 spin_unlock(&vp->v_spin); 622 break; 623 case VS_DYING: 624 spin_unlock(&vp->v_spin); 625 panic("Impossible VS_DYING state"); 626 break; 627 } 628 error = 0; 629 } 630 return(error); 631 } 632 633 #ifdef DEBUG_VPUT 634 635 void 636 debug_vput(struct vnode *vp, const char *filename, int line) 637 { 638 kprintf("vput(%p) %s:%d\n", vp, filename, line); 639 vn_unlock(vp); 640 vrele(vp); 641 } 642 643 #else 644 645 void 646 vput(struct vnode *vp) 647 { 648 vn_unlock(vp); 649 vrele(vp); 650 } 651 652 #endif 653 654 /* 655 * Acquire the vnode lock unguarded. 656 * 657 * The non-blocking version also uses a slightly different mechanic. 658 * This function will explicitly fail not only if it cannot acquire 659 * the lock normally, but also if the caller already holds a lock. 660 * 661 * The adjusted mechanic is used to close a loophole where complex 662 * VOP_RECLAIM code can circle around recursively and allocate the 663 * same vnode it is trying to destroy from the freelist. 664 * 665 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can 666 * cause the incorrect behavior to occur. If not for that lockmgr() 667 * would do the right thing. 668 * 669 * XXX The vx_*() locks should use auxrefs, not the main reference counter. 

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes() < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which occurs
	 * which terminates the vnode will cause it to be returned to the
	 * same pcpu structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}
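
/*
 * Illustration: allocvnode() returns a vnode that is VX locked and
 * referenced.  A hypothetical caller that fails to complete its setup
 * can discard the still-VNON vnode with vx_put(), which flags it for
 * finalization as described above:
 *
 *	vp = allocvnode(0, 0);
 *	if (setup_failed)		(hypothetical condition)
 *		vx_put(vp);
 *
 * Otherwise the caller initializes v_type and related fields before
 * exposing the vnode.
 */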

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}
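
/*
 * Illustration of the thresholds above: with maxvnodes == 100000, vnlru
 * targets cleanup around 90000 vnodes (9/10), allocvnode() flags the lwp
 * once numvnodes reaches 110000 (11/10) with at least 50000 cached +
 * inactive vnodes (5/10), and allocvnode_gc() then frees batches of
 * batchfreevnodes (default 5) vnodes while numvnodes >= 100000 and
 * countcachedandinactivevnodes() >= 50000.
 */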