/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
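 *
 * VLIST_HASH() below scrambles the vnode pointer with VLIST_XOR and
 * reduces it modulo a large prime and then modulo ncpus, so a given
 * vnode always maps to the same per-cpu vnode_index bucket and its
 * spinlock.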
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) %	\
			 VLIST_PRIME2 % (unsigned)ncpus)

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
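 *
 * Reclaimed vnodes are placed at the head of the inactive list and
 * all others at the tail; the per-hash vi->spin is acquired here.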
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

static __inline
void
_vinactive_tail(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive_tail: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
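 *
 * The per-cpu gd_cachedvnodes fields are read without locks, so the
 * returned total is only an approximation while vnodes are actively
 * being cached and uncached.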
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 * The VX lock is required when transitioning to VS_CACHED but is
 * not sufficient for the vshouldfree() interlocked test or when
 * transitioning away from VS_CACHED.  v_spin is also required for
 * those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *		VNODE ACQUISITION FUNCTIONS			*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in an SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		error = 0;
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
							VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
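 *
 * Rough usage sketch (illustrative only):
 *
 *	vx_get(vp);		(ref the vnode and acquire v_lock excl)
 *	... deactivation or reclamation related work ...
 *	vx_put(vp);		(release v_lock and drop the ref via vrele)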
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes() < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
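	 *
	 * The free_rover index advances once per iteration; because the
	 * hash bucket is selected with (ri >> 4), sixteen consecutive
	 * iterations stay on the same per-cpu inactive list before the
	 * scan moves on to the next one.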
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
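		 *
		 * Clearing VREF_TERMINATE and VREF_FINALIZE below leaves
		 * only the single base reference we obtained through
		 * vx_get_nonblock(), which the KASSERT verifies.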
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which occurs
	 * which terminates the vnode will cause it to be returned to the
	 * same pcpu structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
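		 *
		 * The refcnt test below accepts exactly one real reference
		 * (our own, from vx_get_nonblock()) plus VREF_TERMINATE,
		 * ignoring VREF_FINALIZE.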
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}