/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
 * $DragonFly: src/sys/kern/vfs_subr.c,v 1.2 2003/06/17 04:28:42 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	insmntque __P((struct vnode *vp, struct mount *mp));
static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
static unsigned long	numvnodes;
static void	vlruvp(struct vnode *vp);
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */

static u_long wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
static u_long freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
static int nameileafonly = 0;
SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");

#ifdef ENABLE_VFS_IOOPT
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
struct simplelock mountlist_slock;
struct simplelock mntvnode_slock;
int	nfs_mount_type = -1;
#ifndef NULL_SIMPLELOCKS
static struct simplelock mntid_slock;
static struct simplelock vnode_free_list_slock;
static struct simplelock spechash_slock;
#endif
struct nfs_public nfs_pub;	/* publicly exported FS */
static vm_zone_t vnode_zone;

/*
 * The workitem queue.
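 * (A fuller description of how the syncer's delay wheel works appears
 * in the workitem queue comment above vn_syncer_add_to_worklist(),
 * further down in this file.)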
 */
#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;		/* max time to delay syncing data */
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");
static int minvnodes;
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &minvnodes, 0, "Minimum number of vnodes");
static int vnlru_nowhere = 0;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
    "Number of times the vnlru process ran without success");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	desiredvnodes = maxproc + cnt.v_page_count / 4;
	minvnodes = desiredvnodes / 4;
	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_free_list);
	simple_lock_init(&vnode_free_list_slock);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		if (interlkp) {
			simple_unlock(interlkp);
		}
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
		if (interlkp) {
			simple_lock(interlkp);
		}
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
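 * (This releases the busy reference acquired with vfs_busy() above.)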
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", VLKTIMEOUT, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	TAILQ_INIT(&mp->mnt_nvnodelist);
	TAILQ_INIT(&mp->mnt_reservedvnlist);
	mp->mnt_nvnodelistsize = 0;
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers.  We end up with unique val[0]'s for
 * the first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.
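 *
 * As a worked example of the packing below (hypothetical values): with
 * vfc_typenum 5 and mntid_base 0x1234, the minor number handed to
 * makeudev() is 0x05120034: the type in bits 24-31, the high byte of
 * mntid_base in bits 16-23, and its low byte in bits 0-7.
 *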
 * Starting the search one past where the previous search terminated is
 * both a micro-optimization and a defense against returning the same
 * fsid to different mounts.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_int16_t mntid_base;
	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makeudev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	simple_unlock(&mntid_slock);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */
static int
vlrureclaim(struct mount *mp)
{
	struct vnode *vp;
	int done;
	int trigger;
	int usevnodes;
	int count;

	/*
	 * Calculate the trigger point, don't allow user
	 * screwups to blow us up.
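	 *
	 * (As a hypothetical example: with 262144 physical pages and
	 * desiredvnodes at 32768, the trigger computed below is 16, so
	 * vnodes caching 16 or more resident pages are passed over.)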
	 * The trigger prevents us from recycling vnodes with lots of
	 * resident pages.  We aren't trying to free memory, we are
	 * trying to free vnodes.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = cnt.v_page_count * 2 / usevnodes;

	done = 0;
	simple_lock(&mntvnode_slock);
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);

		if (vp->v_type != VNON &&
		    vp->v_type != VBAD &&
		    VMIGHTFREE(vp) &&		/* critical path opt */
		    (vp->v_object == NULL || vp->v_object->resident_page_count < trigger) &&
		    simple_lock_try(&vp->v_interlock)
		) {
			simple_unlock(&mntvnode_slock);
			if (VMIGHTFREE(vp)) {
				vgonel(vp, curproc);
				done++;
			} else {
				simple_unlock(&vp->v_interlock);
			}
			simple_lock(&mntvnode_slock);
		}
		--count;
	}
	simple_unlock(&mntvnode_slock);
	return done;
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of file system code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;

static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	int s;
	int done;
	struct proc *p = vnlruproc;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
	    SHUTDOWN_PRI_FIRST);

	s = splbio();
	for (;;) {
		kproc_suspend_loop(p);
		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			tsleep(vnlruproc, PVFS, "vlruwt", hz);
			continue;
		}
		done = 0;
		simple_lock(&mountlist_slock);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			done += vlrureclaim(mp);
			simple_lock(&mountlist_slock);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, p);
		}
		simple_unlock(&mountlist_slock);
		if (done == 0) {
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		}
	}
	splx(s);
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp = NULL;
	vm_object_t object;

	s = splbio();

	/*
	 * Try to reuse vnodes if we hit the max.  This situation only
	 * occurs in certain large-memory (2G+) situations.  We cannot
	 * attempt to directly reclaim vnodes due to nasty recursion
	 * problems.
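	 * Instead we wake the vnlru kthread below and wait for it to
	 * bring (numvnodes - freevnodes) back under desiredvnodes.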
	 */
	while (numvnodes - freevnodes > desiredvnodes) {
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
			wakeup(vnlruproc);
		}
		tsleep(&vnlruproc_sig, PVFS, "vlruwk", hz);
	}


	/*
	 * Attempt to reuse a vnode already on the free list, allocating
	 * a new vnode if we can't find one or if we have not reached a
	 * good minimum for good LRU performance.
	 */
	simple_lock(&vnode_free_list_slock);
	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
		int count;

		for (count = 0; count < freevnodes; count++) {
			vp = TAILQ_FIRST(&vnode_free_list);
			if (vp == NULL || vp->v_usecount)
				panic("getnewvnode: free vnode isn't");

			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			if ((VOP_GETVOBJECT(vp, &object) == 0 &&
			    (object->resident_page_count || object->ref_count)) ||
			    !simple_lock_try(&vp->v_interlock)) {
				TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
				vp = NULL;
				continue;
			}
			if (LIST_FIRST(&vp->v_cache_src)) {
				/*
				 * note: nameileafonly sysctl is temporary,
				 * for debugging only, and will eventually be
				 * removed.
				 */
				if (nameileafonly > 0) {
					/*
					 * Do not reuse namei-cached directory
					 * vnodes that have cached
					 * subdirectories.
					 */
					if (cache_leaf_test(vp) < 0) {
						simple_unlock(&vp->v_interlock);
						TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
						vp = NULL;
						continue;
					}
				} else if (nameileafonly < 0 ||
					    vmiodirenable == 0) {
					/*
					 * Do not reuse namei-cached directory
					 * vnodes if nameileafonly is -1 or
					 * if VMIO backing for directories is
					 * turned off (otherwise we reuse them
					 * too quickly).
					 */
					simple_unlock(&vp->v_interlock);
					TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
					vp = NULL;
					continue;
				}
			}
			break;
		}
	}

	if (vp) {
		vp->v_flag |= VDOOMED;
		vp->v_flag &= ~VFREE;
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, p);
		} else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
	} else {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		simple_lock_init(&vp->v_interlock);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL) {
		KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
			("bad mount point vnode list size"));
		TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
		vp->v_mount->mnt_nvnodelistsize--;
	}
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize++;
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			    (flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						VOP_BWRITE(bp->b_vp, bp);
					}
				} else {
					bremfree(bp);
					(void) VOP_BWRITE(bp->b_vp, bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
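	 * Hence the outer loop below: wait out the vnode's writes, then
	 * the object's paging-in-progress count, and re-check.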
	 */
	do {
		while (vp->v_numoutput > 0) {
			vp->v_flag |= VBWAIT;
			tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
		}
		if (VOP_GETVOBJECT(vp, &object) == 0) {
			while (object->paging_in_progress)
				vm_object_pip_sleep(object, "vnvlbx");
		}
	} while (vp->v_numoutput > 0);

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	simple_lock(&vp->v_interlock);
	if (VOP_GETVOBJECT(vp, &object) == 0) {
		vm_object_page_remove(object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	simple_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					VOP_BWRITE(bp->b_vp, bp);
				}
				goto restartsync;
			}

		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	bp->b_dev = vn_todev(vp);
	/*
	 * Insert onto list for new vnode.
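	 * New buffers always start out on the clean list; reassignbuf()
	 * moves them to the dirty list if they are later dirtied.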
	 */
	s = splbio();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, mounted block devices
 * are delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed.  Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	s = splbio();

	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}

struct proc *updateproc;
static void sched_sync __P((void));
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	long starttime;
	int s;
	struct proc *p = updateproc;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		kproc_suspend_loop(p);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.
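		 * (Each pass through this loop consumes one slot of the
		 * delay wheel described in the workitem queue comment
		 * above, advancing syncer_delayno by one.)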
		 * Be careful of interrupt race on slp queue.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (VOP_ISLOCKED(vp, NULL) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
				VOP_UNLOCK(vp, 0, p);
			}
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: v_tag VT_VFS vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL))
					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
			splx(s);
		}

		/*
		 * Do soft update processing.
		 */
		if (bioops.io_sync)
			(*bioops.io_sync)(NULL);

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process.  A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP.  Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{
	int s;

	s = splhigh();
	if (updateproc->p_wchan == &lbolt)
		setrunnable(updateproc);
	splx(s);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return(0);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

	/* XXX REMOVE ME */
	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
	if ((bp->b_flags & B_PAGING) == 0) {
		panic(
		    "pbreassignbuf() on non phys bp %p",
		    bp
		);
	}
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise append to end (but before metadata)
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & BX_VNDIRTY)) {
				/*
				 * Found the best place to insert the buffer
				 */
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				/*
				 * Missed, append to end, but before meta-data.
				 * We know that the head buffer in the list is
				 * not meta-data due to prior conditionals.
				 *
				 * Indirect effects:  NFS second stage write
				 * tends to wind up here, giving maximum
				 * distance between the unstable write and the
				 * commit rpc.
				 */
				tbp = TAILQ_LAST(listheadp, buflists);
				while (tbp && tbp->b_lblkno < 0)
					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	splx(s);
}

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	addalias(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecount for
 * the vnodes needs to be accumulated.  vcount() does that.
 */
void
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
}

void
addalias(nvp, dev)
	struct vnode *nvp;
	dev_t dev;
{

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addalias on non-special vnode");

	nvp->v_rdev = dev;
	simple_lock(&spechash_slock);
	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	simple_unlock(&spechash_slock);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  The vnode lock bit is set if the
 * vnode is being eliminated in vgone.  The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
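	 * (VXLOCK is set by vclean(), below, for the duration of a
	 * reclaim.)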
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		if (vp->v_vxproc == curproc) {
#if 0
			/* this can now occur in normal operation */
			log(LOG_INFO, "VXLOCK interlock avoided\n");
#endif
		} else {
			vp->v_flag |= VXWANT;
			simple_unlock(&vp->v_interlock);
			tsleep((caddr_t)vp, PINOD, "vget", 0);
			return (ENOENT);
		}
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			else
				vlruvp(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

void
vref(struct vnode *vp)
{
	simple_lock(&vp->v_interlock);
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		simple_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {
		vp->v_usecount--;
		/*
		 * We must call VOP_INACTIVE with the node locked.
		 * If we are doing a vput, the node is already locked,
		 * but, in the case of vrele, we must explicitly lock
		 * the vnode before calling VOP_INACTIVE.
		 */

		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0)
			VOP_INACTIVE(vp, p);
		if (VSHOULDFREE(vp))
			vfree(vp);
		else
			vlruvp(vp);
	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		simple_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {
		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);
		return;
	}

	if (vp->v_usecount == 1) {
		vp->v_usecount--;
		/*
		 * We must call VOP_INACTIVE with the node locked.
		 * If we are doing a vput, the node is already locked,
		 * so we just need to release the vnode mutex.
		 */
		simple_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);
		if (VSHOULDFREE(vp))
			vfree(vp);
		else
			vlruvp(vp);
	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * One less who cares about this vnode.
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
 *
 * `rootrefs' specifies the base reference count for the root vnode
 * of this filesystem. The root vnode is considered busy if its
 * v_usecount exceeds this value. On a successful return, vflush()
 * will call vrele() on the root vnode exactly rootrefs times.
 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 * be zero.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, rootrefs, flags)
	struct mount *mp;
	int rootrefs;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp, *rootvp = NULL;
	struct vattr vattr;
	int busy = 0, error;

	if (rootrefs > 0) {
		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
		    ("vflush: bad args"));
		/*
		 * Get the filesystem root vnode. We can vput() it
		 * immediately, since with rootrefs > 0, it won't go away.
		 */
		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
			return (error);
		vput(rootvp);
	}
	simple_lock(&mntvnode_slock);
loop:
	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = TAILQ_NEXT(vp, v_nmntvnodes);

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, flush out unlinked but still open
		 * files (even if open only for reading) and regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_type == VNON ||
		    (VOP_GETATTR(vp, &vattr, p->p_ucred, p) == 0 &&
		    vattr.va_nlink > 0)) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
		/*
		 * If just the root vnode is busy, and if its refcount
		 * is equal to `rootrefs', then go ahead and kill it.
		 */
		simple_lock(&rootvp->v_interlock);
		KASSERT(busy > 0, ("vflush: not busy"));
		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
		if (busy == 1 && rootvp->v_usecount == rootrefs) {
			vgonel(rootvp, p);
			busy = 0;
		} else
			simple_unlock(&rootvp->v_interlock);
	}
	if (busy)
		return (EBUSY);
	for (; rootrefs > 0; rootrefs--)
		vrele(rootvp);
	return (0);
}

/*
 * We do not want to recycle the vnode too quickly.
 *
 * XXX we can't move vp's around the nvnodelist without really screwing
 * up the efficiency of filesystem SYNC and friends.  This code is
 * disabled until we fix the syncing code's scanning algorithm.
 */
static void
vlruvp(struct vnode *vp)
{
#if 0
	struct mount *mp;

	if ((mp = vp->v_mount) != NULL) {
		simple_lock(&mntvnode_slock);
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		simple_unlock(&mntvnode_slock);
	}
#endif
}

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	vp->v_vxproc = curproc;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);

	VOP_DESTROYVOBJECT(vp);

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			vfree(vp);
		}
		simple_unlock(&vp->v_interlock);
	}

	cache_purge(vp);
	vp->v_vnlock = NULL;

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	vp->v_vxproc = NULL;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	dev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
		return (0);
	}
	dev = vp->v_rdev;
	for (;;) {
		simple_lock(&spechash_slock);
		vq = SLIST_FIRST(&dev->si_hlist);
		simple_unlock(&spechash_slock);
		if (!vq)
			break;
		vgone(vq);
	}
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	simple_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	simple_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
		simple_lock(&spechash_slock);
		SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
		freedev(vp->v_rdev);
		simple_unlock(&spechash_slock);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list.  The test of the
	 * VDOOMED flag and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE)
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	simple_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;

	simple_lock(&spechash_slock);
	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
		if (type == vp->v_type) {
			*vpp = vp;
			simple_unlock(&spechash_slock);
			return (1);
		}
	}
	simple_unlock(&spechash_slock);
	return (0);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int count;

	count = 0;
	simple_lock(&spechash_slock);
	SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
		count += vq->v_usecount;
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Same as above, but using the dev_t as the argument.
 */
int
count_dev(dev)
	dev_t dev;
{
	struct vnode *vp;

	vp = SLIST_FIRST(&dev->si_hlist);
	if (vp == NULL)
		return (0);
	return (vcount(vp));
}
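
/*
 * Illustrative sketch (not part of this file): a device driver can
 * consult count_dev() (or vcount() when it holds a vnode) to decide
 * whether any opens of the device remain across all of its aliased
 * vnodes.  The driver routine below is hypothetical.
 */
#if 0
static int
example_close(dev_t dev, int fflag, int devtype, struct proc *p)
{
	/*
	 * count_dev() sums v_usecount over every vnode aliased to dev,
	 * so a nonzero result means the device is still open somewhere.
	 */
	if (count_dev(dev) > 0)
		return (0);		/* other opens remain */
	/* ... last reference: quiesce the hardware ... */
	return (0);
}
#endif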

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: %p: ", label, (void *)vp);
	else
		printf("%p: ", (void *)vp);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VDOOMED)
		strcat(buf, "|VDOOMED");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
#include <ddb/ddb.h>
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
			if (VOP_ISLOCKED(vp, NULL))
				vprint((char *)0, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int	sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));

static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;

#if 1 || defined(COMPAT_PRELITE2)
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

#ifdef notyet
	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);	/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
#endif
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	}
	return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	"Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error)
			return error;
	}
	return 0;
}

#endif /* 1 || COMPAT_PRELITE2 */

#if 0
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
static int
sysctl_vnode(SYSCTL_HANDLER_ARGS)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)

	req->lock = 0;
	if (!req->oldptr) /* Make an estimate */
		return (SYSCTL_OUT(req, 0,
		    (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

	simple_lock(&mountlist_slock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
again:
		simple_lock(&mntvnode_slock);
		for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				goto again;
			}
			nvp = TAILQ_NEXT(vp, v_nmntvnodes);
			simple_unlock(&mntvnode_slock);
			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
				return (error);
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);

	return (0);
}
#endif
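
/*
 * Illustrative userland sketch (not part of this file): the MIB layout
 * that vfs_sysctl() implements can be walked from user space with
 * sysctl(3).  {CTL_VFS, VFS_GENERIC, VFS_MAXTYPENUM} returns the bound
 * on filesystem type numbers, and {CTL_VFS, VFS_GENERIC, VFS_CONF, n}
 * returns the vfsconf record for type n (EOPNOTSUPP if that typenum is
 * unused).  Assumes <sys/param.h>, <sys/sysctl.h>, <sys/mount.h>,
 * <stdio.h>, and <err.h>.
 */
#if 0
	int mib[4], maxtypenum, i;
	struct vfsconf vfc;
	size_t len;

	mib[0] = CTL_VFS;
	mib[1] = VFS_GENERIC;
	mib[2] = VFS_MAXTYPENUM;
	len = sizeof(maxtypenum);
	if (sysctl(mib, 3, &maxtypenum, &len, NULL, 0) == -1)
		err(1, "vfs.generic.maxtypenum");
	mib[2] = VFS_CONF;
	for (i = 0; i < maxtypenum; i++) {
		mib[3] = i;
		len = sizeof(vfc);
		if (sysctl(mib, 4, &vfc, &len, NULL, 0) == -1)
			continue;	/* no filesystem with this typenum */
		printf("%s: refcount %d\n", vfc.vfc_name, vfc.vfc_refcount);
	}
#endif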

/*
 * XXX
 * Exporting the vnode list on large systems causes them to crash.
 * Exporting the vnode list on medium systems causes sysctl to coredump.
 */
#if 0
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,vnode", "");
#endif

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	return (0);
}

/*
 * Unmount all filesystems.  The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp;
	struct proc *p;
	int error;

	if (curproc != NULL)
		p = curproc;
	else
		p = initproc;	/* XXX XXX should this be proc0? */
	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	while (!TAILQ_EMPTY(&mountlist)) {
		mp = TAILQ_LAST(&mountlist, mntlist);
		error = dounmount(mp, MNT_FORCE, p);
		if (error) {
			TAILQ_REMOVE(&mountlist, mp, mnt_list);
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		} else {
			/* The unmount has removed mp from the mountlist */
		}
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by vfs_export() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}

	if (argp->ex_addrlen > MLEN)
		return (EINVAL);

	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * It seems silly to initialize every address family
		 * when most are never used; do so on demand here.
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}
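
/*
 * Illustrative sketch (not part of this file): a filesystem's
 * VFS_MOUNT routine typically forwards MNT_UPDATE requests whose only
 * purpose is to change the export configuration straight to
 * vfs_export().  The ufs-style names (args, ump, um_export) are
 * hypothetical stand-ins for the filesystem's own mount argument and
 * per-mount structures.
 */
#if 0
	if (mp->mnt_flag & MNT_UPDATE) {
		/*
		 * No new special device was named, so this update only
		 * adjusts the export lists hanging off the mount point.
		 */
		if (args.fspec == 0)
			return (vfs_export(mp, &ump->um_export, &args.export));
	}
#endif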

/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055).
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info; the FS is
	 * no longer exported.  May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
				    (*rnh->rnh_matchaddr)((caddr_t)saddr,
				    rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags)
{
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int tries;

	tries = 5;
	simple_lock(&mntvnode_slock);
loop:
	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
		if (vp->v_mount != mp) {
			if (--tries > 0)
				goto loop;
			break;
		}
		nvp = TAILQ_NEXT(vp, v_nmntvnodes);

		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
			continue;

		/*
		 * There could be hundreds of thousands of vnodes; we cannot
		 * afford to do anything heavy-weight until we have a fairly
		 * good indication that there is something to do.
		 */
		if ((vp->v_flag & VOBJDIRTY) &&
		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
			simple_unlock(&mntvnode_slock);
			if (!vget(vp,
			    LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
				if (VOP_GETVOBJECT(vp, &obj) == 0) {
					vm_object_page_clean(obj, 0, 0,
					    flags == MNT_WAIT ?
					    OBJPC_SYNC : OBJPC_NOSYNC);
				}
				vput(vp);
			}
			simple_lock(&mntvnode_slock);
			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
				if (--tries > 0)
					goto loop;
				break;
			}
		}
	}
	simple_unlock(&mntvnode_slock);
}
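
/*
 * Illustrative sketch (not part of this file): the sync(2) system call
 * and the periodic syncer are the usual callers of vfs_msync().  Each
 * busies the mount point first, which is what the "must be locked"
 * requirement above refers to.  The loop below mirrors the sync path's
 * traversal of the mount list.
 */
#if 0
	simple_lock(&mountlist_slock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		if ((mp->mnt_flag & MNT_RDONLY) == 0)
			vfs_msync(mp, MNT_NOWAIT);
		simple_lock(&mountlist_slock);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
#endif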

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems can
 * take advantage of the additional metadata buffering provided by
 * the VMIO code by making the device node VMIO-backed as well.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	return (VOP_CREATEVOBJECT(vp, cred, p));
}

void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
	freevnodes--;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		simple_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	simple_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}
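
/*
 * Illustrative sketch (not part of this file): a filesystem pairs
 * these routines as follows.  Its VOP_POLL entry calls vn_pollrecord()
 * to either report pending events or register interest, and the code
 * that later makes the vnode ready calls vn_pollevent() (or the
 * VN_POLLEVENT macro, which skips the call when nobody is polling).
 * The fifo-style routine below is hypothetical.
 */
#if 0
static int
example_poll(ap)
	struct vop_poll_args /* {
		struct vnode *a_vp;
		int a_events;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	/* Report pending events, or record interest in future ones. */
	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
}

	/* ... and at the point where new data arrives on the vnode ... */
	VN_POLLEVENT(vp, POLLIN | POLLRDNORM);
#endif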

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
static int	sync_fsync __P((struct vop_fsync_args *));
static int	sync_inactive __P((struct vop_inactive_args *));
static int	sync_reclaim __P((struct vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

static vop_t **sync_vnodeop_p;
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
int
vfs_allocate_syncvnode(mp)
	struct mount *mp;
{
	struct vnode *vp;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist.  We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	mp->mnt_syncer = vp;
	return (0);
}
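
/*
 * Illustrative sketch (not part of this file): the generic mount path
 * typically allocates the syncer vnode once a filesystem is mounted
 * read-write, and drops it again when the mount becomes read-only.
 * Releasing the last reference lets sync_inactive() vgone the vnode.
 */
#if 0
	if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) {
		error = vfs_allocate_syncvnode(mp);
	} else if ((mp->mnt_flag & MNT_RDONLY) != 0 &&
	    mp->mnt_syncer != NULL) {
		vrele(mp->mnt_syncer);
		mp->mnt_syncer = NULL;
	}
#endif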

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	struct proc *p = ap->a_p;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	simple_lock(&mountlist_slock);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
		simple_unlock(&mountlist_slock);
		return (0);
	}
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vfs_unbusy(mp, p);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected at splbio().
 */
static int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int s;

	s = splbio();
	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}
	splx(s);

	return (0);
}

/*
 * Print out a syncer vnode.
 */
static int
sync_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	printf("syncer vnode");
	if (vp->v_vnlock != NULL)
		lockmgr_printinfo(vp->v_vnlock);
	printf("\n");
	return (0);
}

/*
 * Extract the dev_t from a VBLK or VCHR vnode.
 */
dev_t
vn_todev(vp)
	struct vnode *vp;
{
	if (vp->v_type != VBLK && vp->v_type != VCHR)
		return (NODEV);
	return (vp->v_rdev);
}

/*
 * Check if a vnode represents a disk device.
 */
int
vn_isdisk(vp, errp)
	struct vnode *vp;
	int *errp;
{
	if (vp->v_type != VBLK && vp->v_type != VCHR) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (vp->v_rdev == NULL) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if (!devsw(vp->v_rdev)) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if (!(devsw(vp->v_rdev)->d_flags & D_DISK)) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (errp != NULL)
		*errp = 0;
	return (1);
}

void
NDFREE(ndp, flags)
	struct nameidata *ndp;
	const uint flags;
{
	if (!(flags & NDF_NO_FREE_PNBUF) &&
	    (ndp->ni_cnd.cn_flags & HASBUF)) {
		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
		ndp->ni_cnd.cn_flags &= ~HASBUF;
	}
	if (!(flags & NDF_NO_DVP_UNLOCK) &&
	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
	    ndp->ni_dvp != ndp->ni_vp)
		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
	if (!(flags & NDF_NO_DVP_RELE) &&
	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
		vrele(ndp->ni_dvp);
		ndp->ni_dvp = NULL;
	}
	if (!(flags & NDF_NO_VP_UNLOCK) &&
	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
	if (!(flags & NDF_NO_VP_RELE) &&
	    ndp->ni_vp) {
		vrele(ndp->ni_vp);
		ndp->ni_vp = NULL;
	}
	if (!(flags & NDF_NO_STARTDIR_RELE) &&
	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
		vrele(ndp->ni_startdir);
		ndp->ni_startdir = NULL;
	}
}
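
/*
 * Illustrative sketch (not part of this file): a typical namei()
 * consumer frees the pathname buffer with NDFREE() as soon as the
 * lookup returns, then releases the vnode itself when done.
 * NDF_ONLY_PNBUF (the complement of NDF_NO_FREE_PNBUF) asks NDFREE
 * to free only the buffer and leave all vnode references and locks
 * intact.
 */
#if 0
	struct nameidata nd;
	struct vnode *vp;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);	/* done with the path string */
	vp = nd.ni_vp;
	/* ... operate on the locked vnode ... */
	vput(vp);			/* unlock and release */
#endif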