/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)        + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any)  + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any)  + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
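
/*
 * VLIST_HASH() spreads vnodes across the per-cpu vnode_index structures
 * defined below: the vnode pointer is xor'd with a constant and reduced
 * modulo a large prime before the final modulo ncpus, so pointer
 * alignment does not bias which cpu's lists a vnode lands on.
 */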

#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)

TAILQ_HEAD(freelst, vnode);

struct vnode_index {
        struct freelst	active_list;
        struct vnode	active_rover;
        struct freelst	inactive_list;
        struct spinlock	spin;
        int	deac_rover;
        int	free_rover;
} __cachealign;

static struct vnode_index *vnode_list_hash;

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
        &activevnodes, 0, "Number of active vnodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
        &cachedvnodes, 0, "Number of total cached vnodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
        &inactivevnodes, 0, "Number of inactive vnodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
        &batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
        &trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
        int i;

        kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
        vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
                                  M_VNODE, M_ZERO | M_WAITOK);
        for (i = 0; i < ncpus; ++i) {
                struct vnode_index *vi = &vnode_list_hash[i];

                TAILQ_INIT(&vi->inactive_list);
                TAILQ_INIT(&vi->active_list);
                TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
                spin_init(&vi->spin, "vfslock");
        }
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
        atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
        atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
        _vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
        _vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
        struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
        if ((u_long)vp == trackvnode)
                kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
        spin_lock(&vi->spin);

        switch(vp->v_state) {
        case VS_ACTIVE:
                spin_unlock(&vi->spin);
                panic("_vactivate: already active");
                /* NOT REACHED */
                return;
        case VS_INACTIVE:
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                break;
        case VS_CACHED:
        case VS_DYING:
                break;
        }
        TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
        vp->v_state = VS_ACTIVE;
        spin_unlock(&vi->spin);
        atomic_add_int(&mycpu->gd_activevnodes, 1);
}
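
/*
 * Note that the per-cpu gd_*vnodes counters are adjusted on the current
 * cpu's globaldata even when the vnode hashes to another cpu's list;
 * only the sum across all cpus is meaningful (see
 * synchronizevnodecount() below).
 */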

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
        struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
        if ((u_long)vp == trackvnode) {
                kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
                print_backtrace(-1);
        }
#endif
        spin_lock(&vi->spin);

        /*
         * Remove from active list if it is sitting on it
         */
        switch(vp->v_state) {
        case VS_ACTIVE:
                TAILQ_REMOVE(&vi->active_list, vp, v_list);
                atomic_add_int(&mycpu->gd_activevnodes, -1);
                break;
        case VS_INACTIVE:
                spin_unlock(&vi->spin);
                panic("_vinactive: already inactive");
                /* NOT REACHED */
                return;
        case VS_CACHED:
        case VS_DYING:
                break;
        }

        /*
         * Distinguish between basically dead vnodes, vnodes with cached
         * data, and vnodes without cached data.  A rover will shift the
         * vnodes around as their cache status is lost.
         */
        if (vp->v_flag & VRECLAIMED) {
                TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
        } else {
                TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
        }
        vp->v_state = VS_INACTIVE;
        spin_unlock(&vi->spin);
        atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

static __inline
void
_vinactive_tail(struct vnode *vp)
{
        struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

        spin_lock(&vi->spin);

        /*
         * Remove from active list if it is sitting on it
         */
        switch(vp->v_state) {
        case VS_ACTIVE:
                TAILQ_REMOVE(&vi->active_list, vp, v_list);
                atomic_add_int(&mycpu->gd_activevnodes, -1);
                break;
        case VS_INACTIVE:
                spin_unlock(&vi->spin);
                panic("_vinactive_tail: already inactive");
                /* NOT REACHED */
                return;
        case VS_CACHED:
        case VS_DYING:
                break;
        }

        TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
        vp->v_state = VS_INACTIVE;
        spin_unlock(&vi->spin);
        atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
        KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
                ("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
        atomic_add_int(&vp->v_refcnt, 1);
}

void
synchronizevnodecount(void)
{
        int nca = 0;
        int act = 0;
        int ina = 0;
        int i;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                nca += gd->gd_cachedvnodes;
                act += gd->gd_activevnodes;
                ina += gd->gd_inactivevnodes;
        }
        cachedvnodes = nca;
        activevnodes = act;
        inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
        int i;
        int n = 0;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                n += gd->gd_cachedvnodes;
        }
        return n;
}

int
countcachedandinactivevnodes(void)
{
        int i;
        int n = 0;

        for (i = 0; i < ncpus; ++i) {
                globaldata_t gd = globaldata_find(i);
                n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
        }
        return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
        for (;;) {
                int count = vp->v_refcnt;
                cpu_ccfence();
                KKASSERT((count & VREF_MASK) > 0);
                KKASSERT(vp->v_state == VS_ACTIVE ||
                         vp->v_state == VS_INACTIVE);

                /*
                 * 2+ case
                 */
                if ((count & VREF_MASK) > 1) {
                        if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
                                break;
                        continue;
                }

                /*
                 * 1->0 transition case must handle possible finalization.
                 * When finalizing we transition 1->0x40000000.  Note that
                 * cachedvnodes is only adjusted on transitions to ->0.
                 *
                 * WARNING! VREF_TERMINATE can be cleared at any point
                 *	    when the refcnt is non-zero (by vget()) and
                 *	    the vnode has not been reclaimed.  Thus
                 *	    transitions out of VREF_TERMINATE do not have
                 *	    to mess with cachedvnodes.
                 */
                if (count & VREF_FINALIZE) {
                        vx_lock(vp);
                        if (atomic_cmpset_int(&vp->v_refcnt,
                                              count, VREF_TERMINATE)) {
                                vnode_terminate(vp);
                                break;
                        }
                        vx_unlock(vp);
                } else {
                        if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
                                atomic_add_int(&mycpu->gd_cachedvnodes, 1);
                                break;
                        }
                }
                /* retry */
        }
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
        atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
        atomic_add_int(&vp->v_auxrefs, -1);
}
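
/*
 * Illustrative auxiliary-ref pattern: code which must continue to
 * dereference vp after dropping its real ref can bracket the window
 * with vhold()/vdrop().  The vnode may be deactivated or even reclaimed
 * in that window, but it cannot be kfree()'d:
 *
 *	vhold(vp);
 *	vrele(vp);
 *	... vp remains safe to dereference ...
 *	vdrop(vp);
 */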

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
        KKASSERT(vp->v_state == VS_ACTIVE);

        if ((vp->v_flag & VINACTIVE) == 0) {
                _vsetflags(vp, VINACTIVE);
                if (vp->v_mount)
                        VOP_INACTIVE(vp);
        }
        spin_lock(&vp->v_spin);
        _vinactive(vp);
        spin_unlock(&vp->v_spin);

        vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
        lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
        lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *		VNODE ACQUISITION FUNCTIONS			*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in an SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
        int error;

        /*
         * A lock type must be passed
         */
        if ((flags & LK_TYPE_MASK) == 0) {
                panic("vget() called with no lock specified!");
                /* NOT REACHED */
        }

        /*
         * Reference the structure and then acquire the lock.
         *
         * NOTE: The requested lock might be a shared lock and does
         *	 not protect our access to the refcnt or other fields.
         */
        if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                atomic_add_int(&mycpu->gd_cachedvnodes, -1);

        if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
                /*
                 * The lock failed, undo and return an error.  This will not
                 * normally trigger a termination.
                 */
                vrele(vp);
        } else if (vp->v_flag & VRECLAIMED) {
                /*
                 * The node is being reclaimed and cannot be reactivated
                 * any more, undo and return ENOENT.
                 */
                vn_unlock(vp);
                vrele(vp);
                error = ENOENT;
        } else if (vp->v_state == VS_ACTIVE) {
                /*
                 * A VS_ACTIVE vnode coupled with the fact that we have
                 * a vnode lock (even if shared) prevents v_state from
                 * changing.  Since the vnode is not in a VRECLAIMED state,
                 * we can safely clear VINACTIVE.
                 *
                 * NOTE! Multiple threads may clear VINACTIVE if this is
                 *	 shared lock.  This race is allowed.
                 */
                _vclrflags(vp, VINACTIVE);	/* SMP race ok */
                vp->v_act += VACT_INC;
                if (vp->v_act > VACT_MAX)	/* SMP race ok */
                        vp->v_act = VACT_MAX;
                error = 0;
        } else {
                /*
                 * If the vnode is not VS_ACTIVE it must be reactivated
                 * in addition to clearing VINACTIVE.  An exclusive spin_lock
                 * is needed to manipulate the vnode's list.
                 *
                 * Because the lockmgr lock might be shared, we might race
                 * another reactivation, which we handle.  In this situation,
                 * however, the refcnt prevents other v_state races.
                 *
                 * As with above, clearing VINACTIVE is allowed to race other
                 * clearings of VINACTIVE.
                 *
                 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
                 * the refcnt is non-zero and the vnode has not been
                 * reclaimed.  This also means that the transitions do
                 * not affect cachedvnodes.
                 */
                _vclrflags(vp, VINACTIVE);
                vp->v_act += VACT_INC;
                if (vp->v_act > VACT_MAX)	/* SMP race ok */
                        vp->v_act = VACT_MAX;
                spin_lock(&vp->v_spin);

                switch(vp->v_state) {
                case VS_INACTIVE:
                        _vactivate(vp);
                        atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
                                                        VREF_FINALIZE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_CACHED:
                        _vactivate(vp);
                        atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
                                                        VREF_FINALIZE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_ACTIVE:
                        atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE);
                        spin_unlock(&vp->v_spin);
                        break;
                case VS_DYING:
                        spin_unlock(&vp->v_spin);
                        panic("Impossible VS_DYING state");
                        break;
                }
                error = 0;
        }
        return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
        kprintf("vput(%p) %s:%d\n", vp, filename, line);
        vn_unlock(vp);
        vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
        vn_unlock(vp);
        vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
        if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                atomic_add_int(&mycpu->gd_cachedvnodes, -1);
        lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
        int error;

        if (lockinuse(&vp->v_lock))
                return(EBUSY);
        error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
        if (error == 0) {
                if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                        atomic_add_int(&mycpu->gd_cachedvnodes, -1);
        }
        return(error);
}
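
/*
 * Typical pairings (illustrative):
 *
 *	if (vget(vp, LK_SHARED) == 0) {
 *		...			(referenced, locked, reactivated)
 *		vput(vp);		(unlock + vrele)
 *	}
 *
 *	vx_get(vp);			(ref + excl lock, no reactivation)
 *	...
 *	vx_put(vp);			(unlock + vrele, see below)
 */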

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
        if (vp->v_type == VNON || vp->v_type == VBAD)
                atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
        lockmgr(&vp->v_lock, LK_RELEASE);
        vrele(vp);
}

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
        struct vnode_index *vi;
        struct vnode *vp;
        int count;
        int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
        int ri;
        int cpu_count;

        /*
         * Try to deactivate some vnodes cached on the active list.
         */
        if (countcachedvnodes() < inactivevnodes)
                goto skip;

        ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

        for (count = 0; count < maxcount * 2; ++count, ++ri) {
                vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

                spin_lock(&vi->spin);

                vp = TAILQ_NEXT(&vi->active_rover, v_list);
                TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
                if (vp == NULL) {
                        TAILQ_INSERT_HEAD(&vi->active_list,
                                          &vi->active_rover, v_list);
                } else {
                        TAILQ_INSERT_AFTER(&vi->active_list, vp,
                                           &vi->active_rover, v_list);
                }
                if (vp == NULL) {
                        spin_unlock(&vi->spin);
                        continue;
                }
                if ((vp->v_refcnt & VREF_MASK) != 0) {
                        spin_unlock(&vi->spin);
                        vp->v_act += VACT_INC;
                        if (vp->v_act > VACT_MAX)	/* SMP race ok */
                                vp->v_act = VACT_MAX;
                        continue;
                }

                /*
                 * decrement by less if the vnode's object has a lot of
                 * VM pages.  XXX possible SMP races.
                 */
                if (vp->v_act > 0) {
                        vm_object_t obj;
                        if ((obj = vp->v_object) != NULL &&
                            obj->resident_page_count >= trigger) {
                                vp->v_act -= 1;
                        } else {
                                vp->v_act -= VACT_INC;
                        }
                        if (vp->v_act < 0)
                                vp->v_act = 0;
                        spin_unlock(&vi->spin);
                        continue;
                }

                /*
                 * Try to deactivate the vnode.
                 */
                if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
                        atomic_add_int(&mycpu->gd_cachedvnodes, -1);
                atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

                spin_unlock(&vi->spin);
                vrele(vp);
        }

        vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
        /*
         * Loop trying to lock the first vnode on the free list.
         * Cycle if we can't.
         */
        cpu_count = ncpus;
        ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

        for (count = 0; count < maxcount; ++count, ++ri) {
                vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

                spin_lock(&vi->spin);

                vp = TAILQ_FIRST(&vi->inactive_list);
                if (vp == NULL) {
                        spin_unlock(&vi->spin);
                        if (--cpu_count == 0)
                                break;
                        ri = (ri + 16) & ~15;
                        --ri;
                        continue;
                }

                /*
                 * non-blocking vx_get will also ref the vnode on success.
                 */
                if (vx_get_nonblock(vp)) {
                        KKASSERT(vp->v_state == VS_INACTIVE);
                        TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                        TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
                        spin_unlock(&vi->spin);
                        continue;
                }

                /*
                 * Because we are holding vfs_spin the vnode should currently
                 * be inactive and VREF_TERMINATE should still be set.
                 *
                 * Once vfs_spin is released the vnode's state should remain
                 * unmodified due to both the lock and ref on it.
                 */
                KKASSERT(vp->v_state == VS_INACTIVE);
                spin_unlock(&vi->spin);
#ifdef TRACKVNODE
                if ((u_long)vp == trackvnode)
                        kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

                /*
                 * Do not reclaim/reuse a vnode while auxiliary refs exist.
                 * This includes namecache refs due to a related ncp being
                 * locked or having children, a VM object association, or
                 * other hold users.
                 *
                 * Do not reclaim/reuse a vnode if someone else has a real
                 * ref on it.  This can occur if a filesystem temporarily
                 * releases the vnode lock during VOP_RECLAIM.
                 */
                if (vp->v_auxrefs ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
                        if (vp->v_state == VS_INACTIVE) {
                                spin_lock(&vi->spin);
                                if (vp->v_state == VS_INACTIVE) {
                                        TAILQ_REMOVE(&vi->inactive_list,
                                                     vp, v_list);
                                        TAILQ_INSERT_TAIL(&vi->inactive_list,
                                                          vp, v_list);
                                }
                                spin_unlock(&vi->spin);
                        }
                        vx_put(vp);
                        continue;
                }

                /*
                 * VINACTIVE and VREF_TERMINATE are expected to both be set
                 * for vnodes pulled from the inactive list, and cannot be
                 * changed while we hold the vx lock.
                 *
                 * Try to reclaim the vnode.
                 */
                KKASSERT(vp->v_flag & VINACTIVE);
                KKASSERT(vp->v_refcnt & VREF_TERMINATE);

                if ((vp->v_flag & VRECLAIMED) == 0) {
                        if (cache_inval_vp_nonblock(vp))
                                goto failed;
                        vgone_vxlocked(vp);
                        /* vnode is still VX locked */
                }

                /*
                 * At this point if there are no other refs or auxrefs on
                 * the vnode with the inactive list locked, and we remove
                 * the vnode from the inactive list, it should not be
                 * possible for anyone else to access the vnode any more.
                 *
                 * Since the vnode is in a VRECLAIMED state, no new
                 * namecache associations could have been made and the
                 * vnode should have already been removed from its mountlist.
                 *
                 * Since we hold a VX lock on the vnode it cannot have been
                 * reactivated (moved out of the inactive list).
                 */
                KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
                spin_lock(&vi->spin);
                if (vp->v_auxrefs ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
                        spin_unlock(&vi->spin);
                        goto failed;
                }
                KKASSERT(vp->v_state == VS_INACTIVE);
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                vp->v_state = VS_DYING;
                spin_unlock(&vi->spin);

                /*
                 * Nothing should have been able to access this vp.  Only
                 * our ref should remain now.
                 */
                atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
                KASSERT(vp->v_refcnt == 1,
                        ("vp %p badrefs %08x", vp, vp->v_refcnt));

                /*
                 * Return a VX locked vnode suitable for reuse.
                 */
                vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
                return(vp);
        }
        vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
        return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
        struct vnode *vp;
        struct vnode_index *vi;

        /*
         * lktimeout only applies when LK_TIMELOCK is used, and only
         * the pageout daemon uses it.  The timeout may not be zero
         * or the pageout daemon can deadlock in low-VM situations.
         */
        if (lktimeout == 0)
                lktimeout = hz / 10;

        /*
         * Do not flag for synchronous recyclement unless there are enough
         * freeable vnodes to recycle and the number of vnodes has
         * significantly exceeded our target.  We want the normal vnlru
         * process to handle the cleaning (at 9/10's) before we are forced
         * to flag it here at 11/10's for userexit path processing.
         */
        if (numvnodes >= maxvnodes * 11 / 10 &&
            cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
                struct thread *td = curthread;
                if (td->td_lwp)
                        atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
        }

        /*
         * Try to trivially reuse a reclaimed vnode from the head of the
         * inactive list for this cpu.  Any vnode cycling which occurs
         * which terminates the vnode will cause it to be returned to the
         * same pcpu structure (e.g. unlink calls).
         */
        vi = &vnode_list_hash[mycpuid];
        spin_lock(&vi->spin);

        vp = TAILQ_FIRST(&vi->inactive_list);
        if (vp && (vp->v_flag & VRECLAIMED)) {
                /*
                 * non-blocking vx_get will also ref the vnode on success.
                 */
                if (vx_get_nonblock(vp)) {
                        KKASSERT(vp->v_state == VS_INACTIVE);
                        TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                        TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
                        spin_unlock(&vi->spin);
                        goto slower;
                }

                /*
                 * Because we are holding vfs_spin the vnode should currently
                 * be inactive and VREF_TERMINATE should still be set.
                 *
                 * Once vfs_spin is released the vnode's state should remain
                 * unmodified due to both the lock and ref on it.
                 */
                KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
                if ((u_long)vp == trackvnode)
                        kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

                /*
                 * Do not reclaim/reuse a vnode while auxiliary refs exist.
                 * This includes namecache refs due to a related ncp being
                 * locked or having children, a VM object association, or
                 * other hold users.
                 *
                 * Do not reclaim/reuse a vnode if someone else has a real
                 * ref on it.  This can occur if a filesystem temporarily
                 * releases the vnode lock during VOP_RECLAIM.
                 */
                if (vp->v_auxrefs ||
                    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
                        if (vp->v_state == VS_INACTIVE) {
                                if (vp->v_state == VS_INACTIVE) {
                                        TAILQ_REMOVE(&vi->inactive_list,
                                                     vp, v_list);
                                        TAILQ_INSERT_TAIL(&vi->inactive_list,
                                                          vp, v_list);
                                }
                        }
                        spin_unlock(&vi->spin);
                        vx_put(vp);
                        goto slower;
                }

                /*
                 * VINACTIVE and VREF_TERMINATE are expected to both be set
                 * for vnodes pulled from the inactive list, and cannot be
                 * changed while we hold the vx lock.
                 *
                 * Try to reclaim the vnode.
                 */
                KKASSERT(vp->v_flag & VINACTIVE);
                KKASSERT(vp->v_refcnt & VREF_TERMINATE);

                if ((vp->v_flag & VRECLAIMED) == 0) {
                        spin_unlock(&vi->spin);
                        vx_put(vp);
                        goto slower;
                }

                /*
                 * At this point if there are no other refs or auxrefs on
                 * the vnode with the inactive list locked, and we remove
                 * the vnode from the inactive list, it should not be
                 * possible for anyone else to access the vnode any more.
                 *
                 * Since the vnode is in a VRECLAIMED state, no new
                 * namecache associations could have been made and the
                 * vnode should have already been removed from its mountlist.
                 *
                 * Since we hold a VX lock on the vnode it cannot have been
                 * reactivated (moved out of the inactive list).
                 */
                KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
                KKASSERT(vp->v_state == VS_INACTIVE);
                TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
                atomic_add_int(&mycpu->gd_inactivevnodes, -1);
                vp->v_state = VS_DYING;
                spin_unlock(&vi->spin);

                /*
                 * Nothing should have been able to access this vp.  Only
                 * our ref should remain now.
                 *
                 * At this point we can kfree() the vnode if we want to.
                 * Instead, we reuse it for the allocation.
                 */
                atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
                KASSERT(vp->v_refcnt == 1,
                        ("vp %p badrefs %08x", vp, vp->v_refcnt));
                bzero(vp, sizeof(*vp));
        } else {
                spin_unlock(&vi->spin);
slower:
                vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
                atomic_add_int(&numvnodes, 1);
        }

        lwkt_token_init(&vp->v_token, "vnode");
        lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
        TAILQ_INIT(&vp->v_namecache);
        RB_INIT(&vp->v_rbclean_tree);
        RB_INIT(&vp->v_rbdirty_tree);
        RB_INIT(&vp->v_rbhash_tree);
        spin_init(&vp->v_spin, "allocvnode");

        lockmgr(&vp->v_lock, LK_EXCLUSIVE);
        vp->v_refcnt = 1;
        vp->v_flag = VAGE0 | VAGE1;
        vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

        KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
        /* exclusive lock still held */

        vp->v_filesize = NOOFFSET;
        vp->v_type = VNON;
        vp->v_tag = 0;
        vp->v_state = VS_CACHED;
        _vactivate(vp);

        return (vp);
}
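
/*
 * A vnode returned by allocvnode() is VX locked (exclusive), has
 * v_refcnt == 1, v_type == VNON, the VAGE0|VAGE1 flags set, and has been
 * placed in the VS_ACTIVE state on one of the per-cpu active lists.
 * An unwanted new vnode can be disposed of with vx_put(), which flags
 * VNON/VBAD vnodes for finalization.
 */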

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
        if (numvnodes >= maxvnodes &&
            countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
                freesomevnodes(batchfreevnodes);
        }
}

int
freesomevnodes(int n)
{
        struct vnode *vp;
        int count = 0;

        while (n) {
                if ((vp = cleanfreevnode(n)) == NULL)
                        break;
                vx_unlock(vp);
                --n;
                ++count;
                kfree(vp, M_VNODE);
                atomic_add_int(&numvnodes, -1);
        }
        return(count);
}