/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/vfs_mount.c,v 1.17 2006/05/06 02:43:12 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/eventhandler.h>
#include <sys/kthread.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

struct mountscan_info {
	TAILQ_ENTRY(mountscan_info) msi_entry;
	int msi_how;
	struct mount *msi_node;
};

struct vmntvnodescan_info {
	TAILQ_ENTRY(vmntvnodescan_info) entry;
	struct vnode *vp;
};

static int vnlru_nowhere = 0;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RD,
	   &vnlru_nowhere, 0,
	   "Number of times the vnlru process ran without success");


static struct lwkt_token mntid_token;

static struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
static TAILQ_HEAD(,mountscan_info) mountscan_list;
static struct lwkt_token mountlist_token;
static TAILQ_HEAD(,vmntvnodescan_info) mntvnodescan_list;
struct lwkt_token mntvnode_token;

/*
 * Called from vfsinit()
 */
void
vfs_mount_init(void)
{
	lwkt_token_init(&mountlist_token);
	lwkt_token_init(&mntvnode_token);
	lwkt_token_init(&mntid_token);
	TAILQ_INIT(&mountscan_list);
	TAILQ_INIT(&mntvnodescan_list);
}

/*
 * Support function called with mntvnode_token held to remove a vnode
 * from the mountlist.  We must update any list scans which are in progress.
 */
static void
vremovevnodemnt(struct vnode *vp)
{
	struct vmntvnodescan_info *info;

	TAILQ_FOREACH(info, &mntvnodescan_list, entry) {
		if (info->vp == vp)
			info->vp = TAILQ_NEXT(vp, v_nmntvnodes);
	}
	TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
}

/*
 * Support function called with mntvnode_token held to move a vnode to
 * the end of the list.
 */
static void
vmovevnodetoend(struct mount *mp, struct vnode *vp)
{
	vremovevnodemnt(vp);
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
}


/*
 * Allocate a new vnode and associate it with a tag, mount point, and
 * operations vector.
 *
 * A VX locked and refd vnode is returned.  The caller should set up the
 * remaining fields and vx_put() or, if he wishes to leave a vref,
 * vx_unlock() the vnode.
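 *
 * A hypothetical in-filesystem caller might look like the sketch below
 * (the tag, type, and "ip" private data are illustrative, not part of
 * this interface):
 *
 *	error = getnewvnode(VT_UFS, mp, &vp, 0, 0);
 *	if (error == 0) {
 *		vp->v_data = ip;	(filesystem-private data)
 *		vp->v_type = VREG;	(the vnode stays inert while VNON)
 *		vx_unlock(vp);		(leave the vref, drop the VX lock)
 *	}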
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp,
	    struct vnode **vpp, int lktimeout, int lkflags)
{
	struct vnode *vp;

	KKASSERT(mp != NULL);

	vp = allocvnode(lktimeout, lkflags);
	vp->v_tag = tag;
	vp->v_data = NULL;

	/*
	 * By default the vnode is assigned the mount point's normal
	 * operations vector.
	 */
	vp->v_ops = &mp->mnt_vn_use_ops;

	/*
	 * Placing the vnode on the mount point's queue makes it visible.
	 * VNON prevents it from being messed with, however.
	 */
	insmntque(vp, mp);

	/*
	 * A VX locked & refd vnode is returned.
	 */
	*vpp = vp;
	return (0);
}

/*
 * This function creates vnodes with special operations vectors.  The
 * mount point is optional.
 *
 * This routine is being phased out.
 */
int
getspecialvnode(enum vtagtype tag, struct mount *mp,
		struct vop_ops **ops_pp,
		struct vnode **vpp, int lktimeout, int lkflags)
{
	struct vnode *vp;

	vp = allocvnode(lktimeout, lkflags);
	vp->v_tag = tag;
	vp->v_data = NULL;
	vp->v_ops = ops_pp;

	/*
	 * Placing the vnode on the mount point's queue makes it visible.
	 * VNON prevents it from being messed with, however.
	 */
	insmntque(vp, mp);

	/*
	 * A VX locked & refd vnode is returned.
	 */
	*vpp = vp;
	return (0);
}

/*
 * Interlock against an unmount, return 0 on success, non-zero on failure.
 *
 * The passed flag may be 0 or LK_NOWAIT and is only used if an unmount
 * is in-progress.
 *
 * If no unmount is in-progress LK_NOWAIT is ignored.  No other flag bits
 * are used.  A shared lock will be obtained and the filesystem will not
 * be unmountable until the lock is released.
 */
int
vfs_busy(struct mount *mp, int flags)
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		/* XXX not MP safe */
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		tsleep((caddr_t)mp, 0, "vfs_busy", 0);
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (lockmgr(&mp->mnt_lock, lkflags))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{
	lockmgr(&mp->mnt_lock, LK_RELEASE);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
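 *
 * The mount returned is busied.  An illustrative boot-time sequence
 * (a sketch, not the literal root mount code) would be:
 *
 *	struct mount *mp;
 *
 *	if (vfs_rootmountalloc("ufs", "root_device", &mp) == 0) {
 *		(... VFS_MOUNT the filesystem, fill in f_mntonname ...)
 *		vfs_unbusy(mp);
 *	}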
 */
int
vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp)
{
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	}
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, "vfslock", VLKTIMEOUT, LK_NOPAUSE);
	vfs_busy(mp, LK_NOWAIT);
	TAILQ_INIT(&mp->mnt_nvnodelist);
	TAILQ_INIT(&mp->mnt_reservedvnlist);
	TAILQ_INIT(&mp->mnt_jlist);
	mp->mnt_nvnodelistsize = 0;
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mountlist_token);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			break;
		}
	}
	lwkt_reltoken(&ilock);
	return (mp);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static u_int16_t mntid_base;
	lwkt_tokref ilock;
	fsid_t tfsid;
	int mtype;

	lwkt_gettoken(&ilock, &mntid_token);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makeudev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	lwkt_reltoken(&ilock);
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed:
 * the buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.
 * These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */

/*
 * This is a quick non-blocking check to determine if the vnode is a good
 * candidate for being (eventually) vgone()'d.  Returns 0 if the vnode is
 * not a good candidate, 1 if it is.
 *
 * vnodes marked VFREE are already on the free list, but may still need
 * to be recycled due to eating namecache resources and potentially blocking
 * the namecache directory chain and related vnodes from being freed.
 */
static __inline int
vmightfree(struct vnode *vp, int page_count)
{
	if (vp->v_flag & VRECLAIMED)
		return (0);
	if ((vp->v_flag & VFREE) && TAILQ_EMPTY(&vp->v_namecache))
		return (0);
	if (vp->v_usecount != 0)
		return (0);
	if (vp->v_object && vp->v_object->resident_page_count >= page_count)
		return (0);
	return (1);
}

/*
 * The vnode was found to be possibly vgone()able and the caller has locked it
 * (thus the usecount should be 1 now).  Determine if the vnode is actually
 * vgone()able, doing some cleanups in the process.  Returns 1 if the vnode
 * can be vgone()'d, 0 otherwise.
 *
 * Note that v_holdcnt may be non-zero because (A) this vnode is not a leaf
 * in the namecache topology and (B) this vnode has buffer cache bufs.
 * We cannot remove vnodes with non-leaf namecache associations.  We do a
 * tentative leaf check prior to attempting to flush out any buffers but the
 * 'real' test when all is said and done is that v_holdcnt must become 0 for
 * the vnode to be freeable.
 *
 * We could theoretically just unconditionally flush when v_holdcnt != 0,
 * but flushing data associated with non-leaf nodes (which are always
 * directories), just throws it away for no benefit.  It is the buffer
 * cache's responsibility to choose buffers to recycle from the cached
 * data point of view.
 */
static int
visleaf(struct vnode *vp)
{
	struct namecache *ncp;

	TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
		if (!TAILQ_EMPTY(&ncp->nc_list))
			return(0);
	}
	return(1);
}

/*
 * Try to clean up the vnode to the point where it can be vgone()'d, returning
 * 0 if it cannot be vgone()'d (or already has been), 1 if it can.  Unlike
 * vmightfree() this routine may flush the vnode and block.  Vnodes marked
 * VFREE are still candidates for vgone()ing because they may hold namecache
 * resources and could be blocking the namecache directory hierarchy (and
 * related vnodes) from being freed.
 */
static int
vtrytomakegoneable(struct vnode *vp, int page_count)
{
	if (vp->v_flag & VRECLAIMED)
		return (0);
	if (vp->v_usecount != 1)
		return (0);
	if (vp->v_object && vp->v_object->resident_page_count >= page_count)
		return (0);
	if (vp->v_holdcnt && visleaf(vp)) {
		vinvalbuf(vp, V_SAVE, 0, 0);
#if 0	/* DEBUG */
		printf((vp->v_holdcnt ? "vrecycle: vp %p failed: %s\n" :
			"vrecycle: vp %p succeeded: %s\n"), vp,
		       (TAILQ_FIRST(&vp->v_namecache) ?
			TAILQ_FIRST(&vp->v_namecache)->nc_name : "?"));
#endif
	}
	return(vp->v_usecount == 1 && vp->v_holdcnt == 0);
}

/*
 * Reclaim up to 1/10 of the vnodes associated with a mount point.  Try
 * to avoid vnodes which have lots of resident pages (we are trying to free
 * vnodes, not memory).
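 *
 * As a worked illustration of the trigger check below (the numbers are
 * made up): with vmstats.v_page_count = 1000000, desiredvnodes = 100000
 * and trigger_mult = 0, trigger = 1000000 * 2 / 100000 = 20, so vnodes
 * caching 20 or more resident pages are passed over.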
 *
 * This routine is a callback from the mountlist scan.  The mount point
 * in question will be busied.
 */
static int
vlrureclaim(struct mount *mp, void *data)
{
	struct vnode *vp;
	lwkt_tokref ilock;
	int done;
	int trigger;
	int usevnodes;
	int count;
	int trigger_mult = vnlru_nowhere;

	/*
	 * Calculate the trigger point for the resident pages check.  The
	 * minimum trigger value is approximately the number of pages in
	 * the system divided by the number of vnodes.  However, due to
	 * various other system memory overheads unrelated to data caching
	 * it is a good idea to double the trigger (at least).
	 *
	 * trigger_mult starts at 0.  If the recycler is having problems
	 * finding enough freeable vnodes it will increase trigger_mult.
	 * This should not happen in normal operation, even on machines with
	 * low amounts of memory, but extraordinary memory use by the system
	 * versus the amount of cached data can trigger it.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = vmstats.v_page_count * (trigger_mult + 2) / usevnodes;

	done = 0;
	lwkt_gettoken(&ilock, &mntvnode_token);
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
		/*
		 * __VNODESCAN__
		 *
		 * The VP will stick around while we hold mntvnode_token,
		 * at least until we block, so we can safely do an initial
		 * check, and then must check again after we lock the vnode.
		 */
		if (vp->v_type == VNON ||	/* XXX */
		    vp->v_type == VBAD ||	/* XXX */
		    !vmightfree(vp, trigger)	/* critical path opt */
		) {
			vmovevnodetoend(mp, vp);
			--count;
			continue;
		}

		/*
		 * VX get the candidate vnode.  If the VX get fails the
		 * vnode might still be on the mountlist.  Our loop depends
		 * on us at least cycling the vnode to the end of the
		 * mountlist.
		 */
		if (vx_get_nonblock(vp) != 0) {
			if (vp->v_mount == mp)
				vmovevnodetoend(mp, vp);
			--count;
			continue;
		}

		/*
		 * Since we blocked locking the vp, make sure it is still
		 * a candidate for reclamation.  That is, it has not already
		 * been reclaimed and only has our VX reference associated
		 * with it.
		 */
		if (vp->v_type == VNON ||	/* XXX */
		    vp->v_type == VBAD ||	/* XXX */
		    (vp->v_flag & VRECLAIMED) ||
		    vp->v_mount != mp ||
		    !vtrytomakegoneable(vp, trigger)	/* critical path opt */
		) {
			if (vp->v_mount == mp)
				vmovevnodetoend(mp, vp);
			--count;
			vx_put(vp);
			continue;
		}

		/*
		 * All right, we are good, move the vp to the end of the
		 * mountlist and clean it out.  The vget will have returned
		 * an error if the vnode was destroyed (VRECLAIMED set), so we
		 * do not have to check again.  The vput() will move the
		 * vnode to the free list if the vgone() was successful.
		 */
		KKASSERT(vp->v_mount == mp);
		vmovevnodetoend(mp, vp);
		vgone(vp);
		vx_put(vp);
		++done;
		--count;
	}
	lwkt_reltoken(&ilock);
	return (done);
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of file system code has some
 * interesting deadlock problems.
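 *
 * A hypothetical caller that finds the vnode limit exceeded would poke
 * the recycler thread and wait, along these lines (sketch only):
 *
 *	while (numvnodes - freevnodes > desiredvnodes)
 *		vnlru_proc_wait();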
 */
static struct thread *vnlruthread;
static int vnlruproc_sig;

void
vnlru_proc_wait(void)
{
	if (vnlruproc_sig == 0) {
		vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
		wakeup(vnlruthread);
	}
	tsleep(&vnlruproc_sig, 0, "vlruwk", hz);
}

static void
vnlru_proc(void)
{
	struct thread *td = curthread;
	int done;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
	    SHUTDOWN_PRI_FIRST);

	crit_enter();
	for (;;) {
		kproc_suspend_loop();
		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			tsleep(td, 0, "vlruwt", hz);
			continue;
		}
		cache_cleanneg(0);
		done = mountlist_scan(vlrureclaim, NULL, MNTSCAN_FORWARD);

		/*
		 * The vlrureclaim() call only processes 1/10 of the vnodes
		 * on each mount.  If we couldn't find any repeat the loop
		 * at least enough times to cover all available vnodes before
		 * we start sleeping.  Complain if the failure extends past
		 * 30 seconds, every 30 seconds.
		 */
		if (done == 0) {
			++vnlru_nowhere;
			if (vnlru_nowhere % 10 == 0)
				tsleep(td, 0, "vlrup", hz * 3);
			if (vnlru_nowhere % 100 == 0)
				printf("vnlru_proc: vnode recycler stopped working!\n");
			if (vnlru_nowhere == 1000)
				vnlru_nowhere = 900;
		} else {
			vnlru_nowhere = 0;
		}
	}
	crit_exit();
}

/*
 * MOUNTLIST FUNCTIONS
 */

/*
 * mountlist_insert (MP SAFE)
 *
 * Add a new mount point to the mount list.
 */
void
mountlist_insert(struct mount *mp, int how)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mountlist_token);
	if (how == MNTINS_FIRST)
		TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
	else
		TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	lwkt_reltoken(&ilock);
}

/*
 * mountlist_interlock (MP SAFE)
 *
 * Execute the specified interlock function with the mountlist token
 * held.  The function will be called in a serialized fashion versus
 * other functions called through this mechanism.
 */
int
mountlist_interlock(int (*callback)(struct mount *), struct mount *mp)
{
	lwkt_tokref ilock;
	int error;

	lwkt_gettoken(&ilock, &mountlist_token);
	error = callback(mp);
	lwkt_reltoken(&ilock);
	return (error);
}

/*
 * mountlist_boot_getfirst (DURING BOOT ONLY)
 *
 * This function returns the first mount on the mountlist, which is
 * expected to be the root mount.  Since no interlocks are obtained
 * this function is only safe to use during booting.
 */

struct mount *
mountlist_boot_getfirst(void)
{
	return(TAILQ_FIRST(&mountlist));
}

/*
 * mountlist_remove (MP SAFE)
 *
 * Remove a node from the mountlist.  If this node is the next scan node
 * for any active mountlist scans, the active mountlist scan will be
 * adjusted to skip the node, thus allowing removals during mountlist
 * scans.
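 *
 * For example (illustrative): a mountlist_scan() callback that unmounts
 * the mount handed to it reaches this function via dounmount(); the
 * scanner then notices that info.msi_node has moved on and skips both
 * the vfs_unbusy() and the normal iteration step for that node.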
 */
void
mountlist_remove(struct mount *mp)
{
	struct mountscan_info *msi;
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mountlist_token);
	TAILQ_FOREACH(msi, &mountscan_list, msi_entry) {
		if (msi->msi_node == mp) {
			if (msi->msi_how & MNTSCAN_FORWARD)
				msi->msi_node = TAILQ_NEXT(mp, mnt_list);
			else
				msi->msi_node = TAILQ_PREV(mp, mntlist, mnt_list);
		}
	}
	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	lwkt_reltoken(&ilock);
}

/*
 * mountlist_scan (MP SAFE)
 *
 * Safely scan the mount points on the mount list.  Unless otherwise
 * specified each mount point will be busied prior to the callback and
 * unbusied afterwards.  The callback may safely remove any mount point
 * without interfering with the scan.  If the current callback
 * mount is removed the scanner will not attempt to unbusy it.
 *
 * If a mount node cannot be busied it is silently skipped.
 *
 * The callback return value is aggregated and a total is returned.  A return
 * value of < 0 is not aggregated and will terminate the scan.
 *
 * MNTSCAN_FORWARD	- the mountlist is scanned in the forward direction
 * MNTSCAN_REVERSE	- the mountlist is scanned in reverse
 * MNTSCAN_NOBUSY	- the scanner will make the callback without busying
 *			  the mount node.
 */
int
mountlist_scan(int (*callback)(struct mount *, void *), void *data, int how)
{
	struct mountscan_info info;
	lwkt_tokref ilock;
	struct mount *mp;
	thread_t td;
	int count;
	int res;

	lwkt_gettoken(&ilock, &mountlist_token);

	info.msi_how = how;
	info.msi_node = NULL;	/* paranoia */
	TAILQ_INSERT_TAIL(&mountscan_list, &info, msi_entry);

	res = 0;
	td = curthread;

	if (how & MNTSCAN_FORWARD) {
		info.msi_node = TAILQ_FIRST(&mountlist);
		while ((mp = info.msi_node) != NULL) {
			if (how & MNTSCAN_NOBUSY) {
				count = callback(mp, data);
			} else if (vfs_busy(mp, LK_NOWAIT) == 0) {
				count = callback(mp, data);
				if (mp == info.msi_node)
					vfs_unbusy(mp);
			} else {
				count = 0;
			}
			if (count < 0)
				break;
			res += count;
			if (mp == info.msi_node)
				info.msi_node = TAILQ_NEXT(mp, mnt_list);
		}
	} else if (how & MNTSCAN_REVERSE) {
		info.msi_node = TAILQ_LAST(&mountlist, mntlist);
		while ((mp = info.msi_node) != NULL) {
			if (how & MNTSCAN_NOBUSY) {
				count = callback(mp, data);
			} else if (vfs_busy(mp, LK_NOWAIT) == 0) {
				count = callback(mp, data);
				if (mp == info.msi_node)
					vfs_unbusy(mp);
			} else {
				count = 0;
			}
			if (count < 0)
				break;
			res += count;
			if (mp == info.msi_node)
				info.msi_node = TAILQ_PREV(mp, mntlist, mnt_list);
		}
	}
	TAILQ_REMOVE(&mountscan_list, &info, msi_entry);
	lwkt_reltoken(&ilock);
	return(res);
}

/*
 * MOUNT RELATED VNODE FUNCTIONS
 */

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruthread
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(struct vnode *vp, struct mount *mp)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mntvnode_token);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL) {
		KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
			("bad mount point vnode list size"));
		vremovevnodemnt(vp);
		vp->v_mount->mnt_nvnodelistsize--;
	}
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		lwkt_reltoken(&ilock);
		return;
	}
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize++;
	lwkt_reltoken(&ilock);
}


/*
 * Scan the vnodes under a mount point and issue appropriate callbacks.
 *
 * The fastfunc() callback is called with just the mntvnode_token held
 * (no vnode lock).  It may not block and the vnode may be undergoing
 * modifications while the caller is processing it.  The vnode will
 * not be entirely destroyed, however, due to the fact that the
 * mntvnode_token is held.  A return value < 0 skips to the next vnode
 * without calling the slowfunc(), a return value > 0 terminates the loop.
 *
 * The slowfunc() callback is called after the vnode has been successfully
 * locked based on passed flags.  The vnode is skipped if it gets rearranged
 * or destroyed while blocking on the lock.  A non-zero return value from
 * the slow function terminates the loop.  The slow function is allowed to
 * arbitrarily block.  The scanning code guarantees consistency of operation
 * even if the slow function deletes or moves the node, or blocks and some
 * other thread deletes or moves the node.
 */
int
vmntvnodescan(
    struct mount *mp,
    int flags,
    int (*fastfunc)(struct mount *mp, struct vnode *vp, void *data),
    int (*slowfunc)(struct mount *mp, struct vnode *vp, void *data),
    void *data
) {
	struct vmntvnodescan_info info;
	lwkt_tokref ilock;
	struct vnode *vp;
	int r = 0;
	int maxcount = 1000000;

	lwkt_gettoken(&ilock, &mntvnode_token);

	info.vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
	TAILQ_INSERT_TAIL(&mntvnodescan_list, &info, entry);
	while ((vp = info.vp) != NULL) {
		if (--maxcount == 0)
			panic("maxcount reached during vmntvnodescan");

		if (vp->v_type == VNON)		/* visible but not ready */
			goto next;
		KKASSERT(vp->v_mount == mp);

		/*
		 * Quick test.  A negative return continues the loop without
		 * calling the slow test.  0 continues onto the slow test.
		 * A positive number aborts the loop.
		 */
		if (fastfunc) {
			if ((r = fastfunc(mp, vp, data)) < 0)
				goto next;
			if (r)
				break;
		}

		/*
		 * Get a vxlock on the vnode, retry if it has moved or isn't
		 * in the mountlist where we expect it.
		 */
		if (slowfunc) {
			int error;

			switch(flags) {
			case VMSC_GETVP:
				error = vget(vp, LK_EXCLUSIVE);
				break;
			case VMSC_GETVP|VMSC_NOWAIT:
				error = vget(vp, LK_EXCLUSIVE|LK_NOWAIT);
				break;
			case VMSC_GETVX:
				error = vx_get(vp);
				break;
			case VMSC_REFVP:
				vref(vp);
				/* fall through */
			default:
				error = 0;
				break;
			}
			if (error)
				goto next;
			/*
			 * Do not call the slow function if the vnode is
			 * invalid or if it was ripped out from under us
			 * while we (potentially) blocked.
			 */
			if (info.vp == vp && vp->v_type != VNON)
				r = slowfunc(mp, vp, data);

			/*
			 * Cleanup
			 */
			switch(flags) {
			case VMSC_GETVP:
			case VMSC_GETVP|VMSC_NOWAIT:
				vput(vp);
				break;
			case VMSC_GETVX:
				vx_put(vp);
				break;
			case VMSC_REFVP:
				vrele(vp);
				/* fall through */
			default:
				break;
			}
			if (r != 0)
				break;
		}

		/*
		 * Iterate.  If the vnode was ripped out from under us
		 * info.vp will already point to the next vnode, otherwise
		 * we have to obtain the next valid vnode ourselves.
		 */
next:
		if (info.vp == vp)
			info.vp = TAILQ_NEXT(vp, v_nmntvnodes);
	}
	TAILQ_REMOVE(&mntvnodescan_list, &info, entry);
	lwkt_reltoken(&ilock);
	return(r);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error).  If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
 *
 * `rootrefs' specifies the base reference count for the root vnode
 * of this filesystem.  The root vnode is considered busy if its
 * v_usecount exceeds this value.  On a successful return, vflush()
 * will call vrele() on the root vnode exactly rootrefs times.
 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 * be zero.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

static int vflush_scan(struct mount *mp, struct vnode *vp, void *data);

struct vflush_info {
	int flags;
	int busy;
	thread_t td;
};

int
vflush(struct mount *mp, int rootrefs, int flags)
{
	struct thread *td = curthread;	/* XXX */
	struct vnode *rootvp = NULL;
	int error;
	struct vflush_info vflush_info;

	if (rootrefs > 0) {
		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
			("vflush: bad args"));
		/*
		 * Get the filesystem root vnode.  We can vput() it
		 * immediately, since with rootrefs > 0, it won't go away.
		 */
		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
			return (error);
		vput(rootvp);
	}

	vflush_info.busy = 0;
	vflush_info.flags = flags;
	vflush_info.td = td;
	vmntvnodescan(mp, VMSC_GETVX, NULL, vflush_scan, &vflush_info);

	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
		/*
		 * If just the root vnode is busy, and if its refcount
		 * is equal to `rootrefs', then go ahead and kill it.
		 */
		KASSERT(vflush_info.busy > 0, ("vflush: not busy"));
		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
		if (vflush_info.busy == 1 && rootvp->v_usecount == rootrefs) {
			if (vx_lock(rootvp) == 0) {
				vgone(rootvp);
				vx_unlock(rootvp);
				vflush_info.busy = 0;
			}
		}
	}
	if (vflush_info.busy)
		return (EBUSY);
	for (; rootrefs > 0; rootrefs--)
		vrele(rootvp);
	return (0);
}

/*
 * The scan callback is made with a VX locked vnode.
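 *
 * vflush() passes a vflush_info structure in the data argument.  Vnodes
 * that cannot be flushed are counted in vflush_info.busy rather than
 * aborting the scan, which lets vflush() report EBUSY after a full pass.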
 */
static int
vflush_scan(struct mount *mp, struct vnode *vp, void *data)
{
	struct vflush_info *info = data;
	struct vattr vattr;

	/*
	 * Skip over vnodes marked VSYSTEM.
	 */
	if ((info->flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
		return(0);
	}

	/*
	 * If WRITECLOSE is set, flush out unlinked but still open
	 * files (even if open only for reading) and regular file
	 * vnodes open for writing.
	 */
	if ((info->flags & WRITECLOSE) &&
	    (vp->v_type == VNON ||
	    (VOP_GETATTR(vp, &vattr) == 0 &&
	    vattr.va_nlink > 0)) &&
	    (vp->v_writecount == 0 || vp->v_type != VREG)) {
		return(0);
	}

	/*
	 * With v_usecount == 1 (just our own VX reference from the scan),
	 * all we need to do is clear out the vnode data structures and
	 * we are done.
	 */
	if (vp->v_usecount == 1) {
		vgone(vp);
		return(0);
	}

	/*
	 * If FORCECLOSE is set, forcibly close the vnode.  For block
	 * or character devices, revert to an anonymous device.  For
	 * all other files, just kill them.
	 */
	if (info->flags & FORCECLOSE) {
		if (vp->v_type != VBLK && vp->v_type != VCHR) {
			vgone(vp);
		} else {
			vclean(vp, 0);
			vp->v_ops = &spec_vnode_vops;
			insmntque(vp, NULL);
		}
		return(0);
	}
#ifdef DIAGNOSTIC
	if (busyprt)
		vprint("vflush: busy vnode", vp);
#endif
	++info->busy;
	return(0);
}
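
#if 0
/*
 * Example (illustrative only, not compiled): counting the vnodes on a
 * mount with vmntvnodescan().  myfs_count_scan() and myfs_count() are
 * hypothetical names, not part of this file's interface.
 */
static int
myfs_count_scan(struct mount *mp, struct vnode *vp, void *data)
{
	int *countp = data;

	++*countp;		/* fast pass only; the vnode is not locked */
	return (-1);		/* negative return: skip the slowfunc */
}

static int
myfs_count(struct mount *mp)
{
	int count = 0;

	vmntvnodescan(mp, 0, myfs_count_scan, NULL, &count);
	return (count);
}
#endif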