/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none) + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)

TAILQ_HEAD(freelst, vnode);

struct vnode_index {
	struct freelst	active_list;
	struct vnode	active_rover;
	struct freelst	inactive_list;
	struct spinlock	spin;
	int	deac_rover;
	int	free_rover;
} __cachealign;

static struct vnode_index *vnode_list_hash;
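
/*
 * Illustrative sketch (not part of this file's interface): how the
 * per-cpu vnode_index bucket for a vnode would be looked up.  The vnode
 * pointer is xor'd with a constant, folded over a large prime, and then
 * reduced modulo ncpus, which is exactly what _vactivate() and
 * _vinactive() below do inline.  The helper name is hypothetical.
 *
 *	static __inline struct vnode_index *
 *	example_vnode_bucket(struct vnode *vp)
 *	{
 *		return (&vnode_list_hash[VLIST_HASH(vp)]);
 *	}
 */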

int activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	   &activevnodes, 0, "Number of active vnodes");
int cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	   &cachedvnodes, 0, "Number of total cached vnodes");
int inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	   &inactivevnodes, 0, "Number of inactive vnodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	   &batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
	     &trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&inactivevnodes, 1);
}

static __inline
void
_vinactive_tail(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive_tail: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}
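
/*
 * Illustrative sketch (hypothetical caller, not part of this file):
 * vref() is only legal when the caller already knows the vnode has a
 * ref and is not inactive, e.g. when handing an additional reference
 * to another structure.  Anything that might find the vnode inactive
 * must go through vget() instead so the vnode is properly reactivated.
 *
 *	// caller already holds a ref on vp (so vp is not VS_INACTIVE)
 *	vref(vp);			// cheap, lockless extra ref
 *	some_struct->s_vp = vp;		// hypothetical consumer of the ref
 *	...
 *	vrele(vp);			// drop the caller's original ref
 */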

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path, particularly
 * not updating the global.  Each cpu tracks its own accumulator.
 * The individual accumulators are not accurate and must be summed
 * together.
 */
int
countcachedvnodes(int gupdate)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	if (gupdate)
		cachedvnodes = n;
	return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING! vhold() must not acquire v_spin.  The spinlock may or may not
 *	    already be held by the caller.  vdrop() will clean up the
 *	    free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
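
/*
 * Illustrative sketch (hypothetical caller, not part of this file): an
 * auxiliary reference only keeps the vnode structure from being
 * kfree()'d; it does not keep the vnode active or prevent reclamation.
 * A subsystem that caches a vnode pointer without holding a real ref
 * would bracket the pointer's lifetime with vhold()/vdrop() and still
 * has to vget() before actually using the vnode.
 *
 *	vhold(vp);			// vp cannot be destroyed now
 *	cache_slot->c_vp = vp;		// hypothetical weak-ref holder
 *	...
 *	vdrop(cache_slot->c_vp);	// allow destruction again
 *	cache_slot->c_vp = NULL;
 */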

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
		/* might deactivate page */
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *			VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		error = 0;
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}
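
/*
 * Illustrative sketch (hypothetical caller, not part of this file): the
 * typical pattern for code that holds only an auxiliary or namecache-
 * derived reference.  vget() supplies the real ref, reactivates the
 * vnode if it was cached/inactive, and fails with ENOENT if the vnode
 * was reclaimed out from under the caller (or EBUSY for LK_NOWAIT lock
 * failures).
 *
 *	error = vget(vp, LK_EXCLUSIVE);
 *	if (error == 0) {
 *		... operate on the now-active, locked vnode ...
 *		vput(vp);		// vn_unlock() + vrele()
 *	}
 */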

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (aka UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that, lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes(0) < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vfs_spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vfs_spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}
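
/*
 * Worked example of the deactivation aging above (illustrative numbers
 * only): with vmstats.v_page_count = 1000000 pages and activevnodes =
 * 50000, trigger = 1000000 / (50000 * 2 + 1) = 9.  A vnode whose VM
 * object still holds at least 9 resident pages only loses 1 point of
 * v_act per rover pass, while a vnode with little or no cached data
 * loses VACT_INC (2) per pass, so starting from VACT_MAX (10) the
 * former survives roughly twice as many passes before it becomes a
 * deactivation candidate.
 */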

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We always allocate the vnode.  Attempting to recycle existing vnodes
 * here can lead to numerous deadlocks, particularly with softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;

	/*
	 * Do not flag for synchronous recyclement unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	atomic_add_int(&numvnodes, 1);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}
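
/*
 * Illustrative sketch (hypothetical filesystem code, not from this file):
 * a filesystem that allocates a fresh vnode but then hits an error can
 * hand the still VX-locked, ref'd vnode back via vx_put().  Because the
 * vnode is still VNON (or has been set to VBAD), vx_put() flags it with
 * VREF_FINALIZE so the vrele() it performs deactivates the vnode instead
 * of leaving it on the active list.
 *
 *	vp = allocvnode(0, 0);			// VX locked, refcnt 1, VNON
 *	error = hypothetical_fs_attach(vp);	// hypothetical helper
 *	if (error) {
 *		vx_put(vp);			// discard the unwanted vnode
 *		return (error);
 *	}
 */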

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedvnodes(0) + inactivevnodes >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}