/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)

static struct vnode_index *vnode_list_hash;
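
/*
 * Illustrative note (not part of the original comments): VLIST_HASH()
 * picks one of the ncpus vnode_index structures for a given vnode.  The
 * pointer is XORed with VLIST_XOR and reduced modulo the large prime
 * VLIST_PRIME2 before the final modulo ncpus, so vnodes allocated close
 * together do not all land on the same per-cpu list head.  The usual
 * access pattern, as used by the helpers below, is:
 *
 *	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
 *
 *	spin_lock(&vi->spin);
 *	... manipulate vi->active_list / vi->inactive_list ...
 *	spin_unlock(&vi->spin);
 */
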
int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active nodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached nodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive nodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}
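
/*
 * Illustrative note (not part of the original comments): the head/tail
 * split above is what allocvnode() and cleanfreevnode() rely on.  A
 * VRECLAIMED (basically dead) vnode is inserted at the head of the
 * per-cpu inactive list, so TAILQ_FIRST() finds it immediately and it
 * can be reused without throwing away cached data, while vnodes that may
 * still carry useful buffers or VM pages age out from the tail.
 */
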
static __inline
void
_vinactive_tail(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive_tail: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}

void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
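
/*
 * Illustrative sketch (not part of the original code): an auxiliary
 * reference is what a data structure such as a namecache entry uses to
 * pin the vnode memory without keeping the vnode active:
 *
 *	vhold(vp);		// vp may still be deactivated/reclaimed,
 *	...			// but it cannot be kfree()'d
 *	vdrop(vp);
 *
 * Code that actually needs to use the vnode must still obtain a real
 * reference via vget() or vref().
 */
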
/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}
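
/*
 * Illustrative summary (not part of the original comments): the normal
 * deactivation path is therefore
 *
 *	vrele()                        last real ref, VREF_FINALIZE set
 *	  -> vx_lock() + refcnt 1->VREF_TERMINATE
 *	    -> vnode_terminate()
 *	      -> VOP_INACTIVE()        filesystem flushes/invalidates
 *	      -> _vinactive()          vnode moved to the inactive list
 *	      -> vx_unlock()
 *
 * and only a subsequent vget() can move the vnode back to VS_ACTIVE.
 */
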
/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *			VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in an SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is a
		 *	 shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		error = 0;
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
							VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif
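
/*
 * Illustrative usage sketch (not part of the original code): a caller
 * holding only a namecache or other auxiliary reference reactivates and
 * uses a vnode roughly as follows; error handling is abbreviated.
 *
 *	if ((error = vget(vp, LK_SHARED)) == 0) {
 *		... issue VOPs against the now-active, locked vnode ...
 *		vput(vp);		// vn_unlock() + vrele()
 *	}
 *
 * Deactivation/reclamation paths that must not reactivate the vnode use
 * vx_get()/vx_put() below instead.
 */
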
/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes() < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}
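
/*
 * Illustrative note (not part of the original comments): the VX locked
 * vnode returned above is consumed by freesomevnodes(), at the bottom of
 * this file, which vx_unlock()s and kfree()s it to shrink the vnode
 * cache.  NULL is a normal return and simply ends a freeing batch.
 */
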
/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which ends up
	 * terminating the vnode will cause it to be returned to the
	 * same pcpu structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}
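
/*
 * Illustrative note (not part of the original comments): a filesystem
 * that obtains a vnode here but then decides not to use it (for example,
 * because it lost a race instantiating the inode) should dispose of it
 * with vx_put().  Because the vnode is still VNON, vx_put() flags it
 * with VREF_FINALIZE so it is queued for reuse rather than lingering.
 */
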
/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}