/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
 * $DragonFly: src/sys/kern/vfs_subr.c,v 1.30 2004/05/19 22:52:58 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	insmntque (struct vnode *vp, struct mount *mp);
static void	vclean (struct vnode *vp, lwkt_tokref_t vlock, int flags, struct thread *td);
static unsigned long numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */

static u_long wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
static u_long freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");

#ifdef ENABLE_VFS_IOOPT
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
struct lwkt_token mountlist_token;
struct lwkt_token mntvnode_token;
int	nfs_mount_type = -1;
static struct lwkt_token mntid_token;
static struct lwkt_token vnode_free_list_token;
static struct lwkt_token spechash_token;
struct nfs_public nfs_pub;	/* publicly exported FS */
static vm_zone_t vnode_zone;

/*
 * The workitem queue.
 */
#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;		/* max time to delay syncing data */
SYSCTL_INT(_kern, OID_AUTO, syncdelay, CTLFLAG_RW, &syncdelay, 0,
	"VFS data synchronization delay");
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
	"File synchronization delay");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
	"Directory synchronization delay");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
	"VFS metadata synchronization delay");
static int rushjob;			/* number of slots to run ASAP */
static int stat_rush_requests;		/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
	&desiredvnodes, 0, "Maximum number of vnodes");
static int minvnodes;
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
	&minvnodes, 0, "Minimum number of vnodes");
static int vnlru_nowhere = 0;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
	"Number of times the vnlru process ran without success");

static void	vfs_free_addrlist (struct netexport *nep);
static int	vfs_free_netcred (struct radix_node *rn, void *w);
static int	vfs_hang_addrlist (struct mount *mp, struct netexport *nep,
				   struct export_args *argp);

#define VSHOULDFREE(vp) \
	(!((vp)->v_flag & (VFREE|VDOOMED)) && \
	 !(vp)->v_holdcnt && !(vp)->v_usecount && \
	 (!(vp)->v_object || \
	  !((vp)->v_object->ref_count || (vp)->v_object->resident_page_count)))

#define VMIGHTFREE(vp) \
	(((vp)->v_flag & (VFREE|VDOOMED|VXLOCK)) == 0 && \
	 cache_leaf_test(vp) == 0 && (vp)->v_usecount == 0)

#define VSHOULDBUSY(vp) \
	(((vp)->v_flag & VFREE) && \
	 ((vp)->v_holdcnt || (vp)->v_usecount))

static void vbusy(struct vnode *vp);
static void vfree(struct vnode *vp);
static void vmaybefree(struct vnode *vp);

extern int dev_ref_debug;

/*
 * NOTE: the vnode interlock must be held on call.
 */
static __inline void
vmaybefree(struct vnode *vp)
{
	if (VSHOULDFREE(vp))
		vfree(vp);
}

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	/*
	 * Desired vnodes is a result of the physical page count
	 * and the size of the kernel's heap.  It scales in proportion
	 * to the amount of available physical memory.  This can
	 * cause trouble on 64-bit and large memory platforms.
	 */
	/* desiredvnodes = maxproc + vmstats.v_page_count / 4; */
	desiredvnodes =
		min(maxproc + vmstats.v_page_count / 4,
		    2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
		    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));

	minvnodes = desiredvnodes / 4;
	lwkt_token_init(&mountlist_token);
	lwkt_token_init(&mntvnode_token);
	lwkt_token_init(&mntid_token);
	lwkt_token_init(&spechash_token);
	TAILQ_INIT(&vnode_free_list);
	lwkt_token_init(&vnode_free_list_token);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
		&syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 */
int
vfs_busy(struct mount *mp, int flags, lwkt_tokref_t interlkp, struct thread *td)
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 *
		 * note: interlkp is a serializer and thus can be safely
		 * held through any sleep
		 */
		tsleep((caddr_t)mp, 0, "vfs_busy", 0);
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp, struct thread *td)
{
	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp)
{
	struct thread *td = curthread;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, 0, "vfslock", VLKTIMEOUT, LK_NOPAUSE);
	vfs_busy(mp, LK_NOWAIT, NULL, td);
	TAILQ_INIT(&mp->mnt_nvnodelist);
	TAILQ_INIT(&mp->mnt_reservedvnlist);
	mp->mnt_nvnodelistsize = 0;
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root.
 * If a filesystem has not been preselected, walk through the list of
 * known filesystems trying those that have mountroot routines, and
 * try them until one works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) (void);
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	struct mount *mp;
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mountlist_token);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			break;
		}
	}
	lwkt_reltoken(&ilock);
	return (mp);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_int16_t mntid_base;
	lwkt_tokref ilock;
	fsid_t tfsid;
	int mtype;

	lwkt_gettoken(&ilock, &mntid_token);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makeudev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	lwkt_reltoken(&ilock);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */
static int
vlrureclaim(struct mount *mp)
{
	struct vnode *vp;
	lwkt_tokref ilock;
	lwkt_tokref vlock;
	int done;
	int trigger;
	int usevnodes;
	int count;

	/*
	 * Calculate the trigger point, don't allow user
	 * screwups to blow us up.  This prevents us from
	 * recycling vnodes with lots of resident pages.  We
	 * aren't trying to free memory, we are trying to
	 * free vnodes.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = vmstats.v_page_count * 2 / usevnodes;

	done = 0;
	lwkt_gettoken(&ilock, &mntvnode_token);
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
		/*
		 * __VNODESCAN__
		 *
		 * The VP will stick around while we hold mntvnode_token,
		 * at least until we block, so we can safely do an initial
		 * check.  But we have to check again after obtaining
		 * the vnode interlock.  vp->v_interlock points to stable
		 * storage so it's ok if the vp gets ripped out from
		 * under us while we are blocked.
		 */
		if (vp->v_type == VNON ||
		    vp->v_type == VBAD ||
		    !VMIGHTFREE(vp) ||		/* critical path opt */
		    (vp->v_object &&
		     vp->v_object->resident_page_count >= trigger)
		) {
			TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
			TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
			--count;
			continue;
		}

		/*
		 * Get the interlock, delay moving the node to the tail so
		 * we don't race against new additions to the mountlist.
		 */
		lwkt_gettoken(&vlock, vp->v_interlock);
		if (TAILQ_FIRST(&mp->mnt_nvnodelist) != vp) {
			lwkt_reltoken(&vlock);
			continue;
		}
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);

		/*
		 * Must check again
		 */
		if (vp->v_type == VNON ||
		    vp->v_type == VBAD ||
		    !VMIGHTFREE(vp) ||		/* critical path opt */
		    (vp->v_object &&
		     vp->v_object->resident_page_count >= trigger)
		) {
			lwkt_reltoken(&vlock);
			--count;
			continue;
		}
		vgonel(vp, &vlock, curthread);
		++done;
		--count;
	}
	lwkt_reltoken(&ilock);
	return done;
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of file system code has some
 * interesting deadlock problems.
 */
static struct thread *vnlruthread;
static int vnlruproc_sig;

static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	lwkt_tokref ilock;
	int s;
	int done;
	struct thread *td = curthread;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
	    SHUTDOWN_PRI_FIRST);

	s = splbio();
	for (;;) {
		kproc_suspend_loop();
		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			tsleep(td, 0, "vlruwt", hz);
			continue;
		}
		done = 0;
		lwkt_gettoken(&ilock, &mountlist_token);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			done += vlrureclaim(mp);
			lwkt_gettokref(&ilock);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, td);
		}
		lwkt_reltoken(&ilock);
		if (done == 0) {
			vnlru_nowhere++;
			tsleep(td, 0, "vlrup", hz * 3);
		}
	}
	splx(s);
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruthread
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s;
	struct thread *td = curthread;	/* XXX */
	struct vnode *vp = NULL;
	struct vnode *xvp;
	vm_object_t object;
	lwkt_tokref ilock;
	lwkt_tokref vlock;

	s = splbio();

	/*
	 * Try to reuse vnodes if we hit the max.  This situation only
	 * occurs in certain large-memory (2G+) situations.  We cannot
	 * attempt to directly reclaim vnodes due to nasty recursion
	 * problems.
	 */
	while (numvnodes - freevnodes > desiredvnodes) {
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
			wakeup(vnlruthread);
		}
		tsleep(&vnlruproc_sig, 0, "vlruwk", hz);
	}

	/*
	 * Attempt to reuse a vnode already on the free list, allocating
	 * a new vnode if we can't find one or if we have not reached a
	 * good minimum for good LRU performance.
	 */
	lwkt_gettoken(&ilock, &vnode_free_list_token);
	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
		int count;

		for (count = 0; count < freevnodes; count++) {
			/*
			 * __VNODESCAN__
			 *
			 * Pull the next vnode off the free list and do some
			 * sanity checks.
			 * Note that regardless of how we block, if
			 * freevnodes is non-zero there had better be
			 * something on the list.
			 */
			vp = TAILQ_FIRST(&vnode_free_list);
			if (vp == NULL)
				panic("getnewvnode: free vnode isn't");

			/*
			 * Move the vnode to the end of the list so other
			 * processes do not double-block trying to recycle
			 * the same vnode (as an optimization), then get
			 * the interlock.
			 */
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);

			/*
			 * Skip vnodes that are in the process of being
			 * held or referenced.  Since the act of adding or
			 * removing a vnode on the freelist requires a token
			 * and may block, the ref count may be adjusted
			 * prior to its addition or removal.
			 */
			if (VSHOULDBUSY(vp)) {
				vp = NULL;
				continue;
			}

			/*
			 * Obtain the vnode interlock and check that the
			 * vnode is still on the free list.
			 *
			 * This normally devolves into a degenerate case so
			 * it is optimal.  Loop up if it isn't.  Note that
			 * the vnode could be in the middle of being moved
			 * off the free list (the VSHOULDBUSY() check) and
			 * must be skipped if so.
			 */
			lwkt_gettoken(&vlock, vp->v_interlock);
			TAILQ_FOREACH_REVERSE(xvp, &vnode_free_list,
			    freelst, v_freelist) {
				if (vp == xvp)
					break;
			}
			if (vp != xvp || VSHOULDBUSY(vp)) {
				vp = NULL;
				continue;
			}

			/*
			 * We now safely own the vnode.  If the vnode has
			 * an object do not recycle it if its VM object
			 * has resident pages or references.
			 */
			if ((VOP_GETVOBJECT(vp, &object) == 0 &&
			    (object->resident_page_count || object->ref_count))
			) {
				lwkt_reltoken(&vlock);
				vp = NULL;
				continue;
			}

			/*
			 * We can almost reuse this vnode.  But we don't want
			 * to recycle it if the vnode has children in the
			 * namecache because that breaks the namecache's
			 * path element chain.  (YYY use nc_refs for the
			 * check?)
			 */
			KKASSERT(vp->v_flag & VFREE);
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);

			if (TAILQ_FIRST(&vp->v_namecache) == NULL ||
			    cache_leaf_test(vp) >= 0) {
				/* ok, we can reuse this vnode */
				break;
			}
			lwkt_reltoken(&vlock);
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			vp = NULL;
		}
	}

	/*
	 * If vp is non-NULL we hold its interlock.
	 */
	if (vp) {
		vp->v_flag |= VDOOMED;
		vp->v_flag &= ~VFREE;
		freevnodes--;
		lwkt_reltoken(&ilock);
		cache_purge(vp);	/* YYY may block */
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, &vlock, td);
		} else {
			lwkt_reltoken(&vlock);
		}

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
	} else {
		lwkt_reltoken(&ilock);
		vp = zalloc(vnode_zone);
		bzero(vp, sizeof(*vp));
		vp->v_interlock = lwkt_token_pool_get(vp);
		lwkt_token_init(&vp->v_pollinfo.vpi_token);
		cache_purge(vp);
		TAILQ_INIT(&vp->v_namecache);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, td);
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vp, mp)
	struct vnode *vp;
	struct mount *mp;
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mntvnode_token);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL) {
		KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
			("bad mount point vnode list size"));
		TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
		vp->v_mount->mnt_nvnodelistsize--;
	}
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		lwkt_reltoken(&ilock);
		return;
	}
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize++;
	lwkt_reltoken(&ilock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	struct buf *bp;
{
	struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(struct vnode *vp, int flags, struct thread *td,
	int slpflag, int slptimeo)
{
	struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;
	lwkt_tokref vlock;

	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag, "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			    (flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						VOP_BWRITE(bp->b_vp, bp);
					}
				} else {
					bremfree(bp);
					(void) VOP_BWRITE(bp->b_vp, bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		while (vp->v_numoutput > 0) {
			vp->v_flag |= VBWAIT;
			tsleep(&vp->v_numoutput, 0, "vnvlbv", 0);
		}
		if (VOP_GETVOBJECT(vp, &object) == 0) {
			while (object->paging_in_progress)
				vm_object_pip_sleep(object, "vnvlbx");
		}
	} while (vp->v_numoutput > 0);

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	lwkt_gettoken(&vlock, vp->v_interlock);
	if (VOP_GETVOBJECT(vp, &object) == 0) {
		vm_object_page_remove(object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	lwkt_reltoken(&vlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(struct vnode *vp, struct thread *td, off_t length, int blksize)
{
	struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
				     (nbp->b_vp != vp) ||
				     (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
				     (nbp->b_vp != vp) ||
				     (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					VOP_BWRITE(bp->b_vp, bp);
				}
				goto restartsync;
			}
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, 0, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	struct vnode *vp;
	struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	bp->b_dev = vn_todev(vp);
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, filesystems mounted on
 * block devices are delayed only about half the time that file data is
 * delayed.  Similarly, directory updates are more critical, so they are
 * delayed only about a third of the time that file data is delayed.
 * Thus, there are SYNCER_MAXDELAY queues that are processed round-robin
 * at a rate of one each second (driven off the filesystem syncer
 * process).  The syncer_delayno variable indicates the next queue that
 * is to be processed.  Items that need to be processed soon are placed
 * in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	s = splbio();

	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}

struct thread *updatethread;
static void sched_sync (void);
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updatethread
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	long starttime;
	int s;
	struct thread *td = curthread;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		kproc_suspend_loop();

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (VOP_ISLOCKED(vp, NULL) == 0) {
				vn_lock(vp, NULL, LK_EXCLUSIVE | LK_RETRY, td);
				(void) VOP_FSYNC(vp, MNT_LAZY, td);
				VOP_UNLOCK(vp, NULL, 0, td);
			}
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: v_tag VT_VFS vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL))
					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
			splx(s);
		}

		/*
		 * Do soft update processing.
		 */
		if (bioops.io_sync)
			(*bioops.io_sync)(NULL);

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process.  A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP.  Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, 0, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 *
 * YYY wchan field protected by the BGL.
 */
int
speedup_syncer()
{
	crit_enter();
	if (updatethread->td_wchan == &lbolt) { /* YYY */
		unsleep(updatethread);
		lwkt_schedule(updatethread);
	}
	crit_exit();
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return(0);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
	struct vnode *vp;
	struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

	/* XXX REMOVE ME */
	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

/*
 * Reassign a p-buffer to a different vnode.
 */
void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
	if ((bp->b_flags & B_PAGING) == 0) {
		panic(
		    "pbreassignbuf() on non phys bp %p",
		    bp
		);
	}
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
			case VBLK:
				if (newvp->v_rdev &&
				    newvp->v_rdev->si_mountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise append to end (but before metadata)
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & BX_VNDIRTY)) {
				/*
				 * Found the best place to insert the buffer
				 */
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				/*
				 * Missed, append to end, but before meta-data.
				 * We know that the head buffer in the list is
				 * not meta-data due to prior conditionals.
				 *
				 * Indirect effects: NFS second stage write
				 * tends to wind up here, giving maximum
				 * distance between the unstable write and the
				 * commit rpc.
				 */
				tbp = TAILQ_LAST(listheadp, buflists);
				while (tbp && tbp->b_lblkno < 0)
					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	splx(s);
}

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev_t dev, struct vnode **vpp)
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	vp->v_udev = dev->si_udev;
	*vpp = vp;
	return (0);
}

/*
 * Associate a vnode with a device and place it on the device's alias list.
 */
int
v_associate_rdev(struct vnode *vp, dev_t dev)
{
	lwkt_tokref ilock;

	if (dev == NULL || dev == NODEV)
		return(ENXIO);
	if (dev_is_good(dev) == 0)
		return(ENXIO);
	KKASSERT(vp->v_rdev == NULL);
	if (dev_ref_debug)
		printf("Z1");
	vp->v_rdev = reference_dev(dev);
	lwkt_gettoken(&ilock, &spechash_token);
	SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_specnext);
	lwkt_reltoken(&ilock);
	return(0);
}

/*
 * Disassociate a vnode from its device, if it has one.
 */
void
v_release_rdev(struct vnode *vp)
{
	lwkt_tokref ilock;
	dev_t dev;

	if ((dev = vp->v_rdev) != NULL) {
		lwkt_gettoken(&ilock, &spechash_token);
		SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_specnext);
		if (dev_ref_debug)
			printf("Y2");
		vp->v_rdev = NULL;
		release_dev(dev);
		lwkt_reltoken(&ilock);
	}
}

/*
 * Add a vnode to the alias list hung off the dev_t.  We only associate
 * the device number with the vnode.  The actual device is not associated
 * until the vnode is opened (usually in spec_open()), and will be
 * disassociated on last close.
 */
void
addaliasu(struct vnode *nvp, udev_t nvp_udev)
{
	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	nvp->v_udev = nvp_udev;
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  The vnode lock bit is set if the
 * vnode is being eliminated in vgone.  The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 *
 * This code is very sensitive.  We are depending on the vnode interlock
 * to be maintained through to the vn_lock() call, which means that we
 * cannot block which means that we cannot call vbusy() until after
 * vn_lock().  If the interlock is not maintained, the VXLOCK check will
 * not properly interlock against a vclean()'s LK_DRAIN operation on the
 * lock.
 */
int
vget(struct vnode *vp, lwkt_tokref_t vlock, int flags, thread_t td)
{
	int error;
	lwkt_tokref vvlock;

	/*
	 * We need the interlock to safely modify the v_ fields.  ZZZ it is
	 * only legal to pass (1) the vnode's interlock and (2) only pass
	 * NULL w/o LK_INTERLOCK if the vnode is *ALREADY* referenced or
	 * held.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		lwkt_gettoken(&vvlock, vp->v_interlock);
		vlock = &vvlock;
	}

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.  It is possible for the vnode to be
	 * self-referenced during the cleaning operation.
	 */
	if (vp->v_flag & VXLOCK) {
		if (vp->v_vxthread == curthread) {
#if 0
			/* this can now occur in normal operation */
			log(LOG_INFO, "VXLOCK interlock avoided\n");
#endif
		} else {
			vp->v_flag |= VXWANT;
			lwkt_reltoken(vlock);
			tsleep((caddr_t)vp, 0, "vget", 0);
			return (ENOENT);
		}
	}

	/*
	 * Bump v_usecount to prevent the vnode from being recycled.  The
	 * usecount needs to be bumped before we successfully get our lock.
	 */
	vp->v_usecount++;
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, vlock, flags | LK_INTERLOCK, td)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			lwkt_gettokref(vlock);
			vp->v_usecount--;
			vmaybefree(vp);
			lwkt_reltoken(vlock);
		}
		return (error);
	}
	if (VSHOULDBUSY(vp))
		vbusy(vp);	/* interlock must be held on call */
	lwkt_reltoken(vlock);
	return (0);
}

/*
 * Add an additional reference to a vnode.
 */
void
vref(struct vnode *vp)
{
	crit_enter();	/* YYY use crit section for moment / BGL protected */
	vp->v_usecount++;
	crit_exit();
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(struct vnode *vp)
{
	struct thread *td = curthread;	/* XXX */
	lwkt_tokref vlock;

	KASSERT(vp != NULL && vp->v_usecount >= 0,
	    ("vrele: null vp or <=0 v_usecount"));

	lwkt_gettoken(&vlock, vp->v_interlock);

	if (vp->v_usecount > 1) {
		vp->v_usecount--;
		lwkt_reltoken(&vlock);
		return;
	}

	if (vp->v_usecount == 1) {
		vp->v_usecount--;
		/*
		 * We must call VOP_INACTIVE with the node locked and the
		 * usecount 0.  If we are doing a vput, the node is already
		 * locked, but, in the case of vrele, we must explicitly lock
		 * the vnode before calling VOP_INACTIVE.
		 */

		if (vn_lock(vp, NULL, LK_EXCLUSIVE, td) == 0)
			VOP_INACTIVE(vp, td);
		vmaybefree(vp);
		lwkt_reltoken(&vlock);
	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
#endif
		lwkt_reltoken(&vlock);
		panic("vrele: negative ref cnt");
	}
}

void
vput(struct vnode *vp)
{
	struct thread *td = curthread;	/* XXX */
	lwkt_tokref vlock;

	KASSERT(vp != NULL, ("vput: null vp"));

	lwkt_gettoken(&vlock, vp->v_interlock);

	if (vp->v_usecount > 1) {
		vp->v_usecount--;
		VOP_UNLOCK(vp, &vlock, LK_INTERLOCK, td);
		return;
	}

	if (vp->v_usecount == 1) {
		vp->v_usecount--;
		/*
		 * We must call VOP_INACTIVE with the node locked.
		 * If we are doing a vput, the node is already locked,
		 * so we just need to release the vnode mutex.
		 */
		VOP_INACTIVE(vp, td);
		vmaybefree(vp);
		lwkt_reltoken(&vlock);
	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		lwkt_reltoken(&vlock);
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.  ZZZ vnode interlock should
 * be held but isn't.
 */
void
vhold(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);	/* interlock must be held on call */
	splx(s);
}

/*
 * One less who cares about this vnode.
 */
void
vdrop(vp)
	struct vnode *vp;
{
	lwkt_tokref vlock;

	lwkt_gettoken(&vlock, vp->v_interlock);
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	vmaybefree(vp);
	lwkt_reltoken(&vlock);
}

/*
 * Scan the vnodes under a mount point.  The fast function is called
 * with just the mntvnode_token held, the slow function is called with
 * the vnode interlock held as well and is responsible for releasing it.
 * A placemarker vnode is used so the scan can survive blocking.
 */
int
vmntvnodescan(
    struct mount *mp,
    int (*fastfunc)(struct mount *mp, struct vnode *vp, void *data),
    int (*slowfunc)(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data),
    void *data
) {
	lwkt_tokref ilock;
	lwkt_tokref vlock;
	struct vnode *pvp;
	struct vnode *vp;
	int r = 0;

	/*
	 * Scan the vnodes on the mount's vnode list.  Use a placemarker
	 */
	pvp = zalloc(vnode_zone);
	pvp->v_flag |= VPLACEMARKER;

	lwkt_gettoken(&ilock, &mntvnode_token);
	TAILQ_INSERT_HEAD(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);

	while ((vp = TAILQ_NEXT(pvp, v_nmntvnodes)) != NULL) {
		/*
		 * Move the placemarker and skip other placemarkers we
		 * encounter.  Nothing can get in our way here, so the
		 * mount point on the vp must be valid.
		 */
		TAILQ_REMOVE(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);
		TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, pvp, v_nmntvnodes);
		if (vp->v_flag & VPLACEMARKER)
			continue;
		KKASSERT(vp->v_mount == mp);

		/*
		 * Quick test
		 */
		if (fastfunc) {
			if ((r = fastfunc(mp, vp, data)) < 0)
				continue;
			if (r)
				break;
		}

		/*
		 * Get the vnode's interlock and make sure it is still on the
		 * mount list.  Skip it if it has moved (we may encounter it
		 * later).  Then do the with-interlock test.  The callback
		 * is responsible for releasing the vnode interlock.
		 *
		 * The interlock is type-stable.
		 */
		if (slowfunc) {
			lwkt_gettoken(&vlock, vp->v_interlock);
			if (vp != TAILQ_PREV(pvp, vnodelst, v_nmntvnodes)) {
				printf("vmntvnodescan (debug info only): f=%p vp=%p vnode ripped out from under us\n", slowfunc, vp);
				lwkt_reltoken(&vlock);
				continue;
			}
			if ((r = slowfunc(mp, vp, &vlock, data)) != 0) {
				KKASSERT(lwkt_havetokref(&vlock) == 0);
				break;
			}
			KKASSERT(lwkt_havetokref(&vlock) == 0);
		}
	}
	TAILQ_REMOVE(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);
	zfree(vnode_zone, pvp);
	lwkt_reltoken(&ilock);
	return(r);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
 *
 * `rootrefs' specifies the base reference count for the root vnode
 * of this filesystem.  The root vnode is considered busy if its
 * v_usecount exceeds this value.  On a successful return, vflush()
 * will call vrele() on the root vnode exactly rootrefs times.
 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 * be zero.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

static int vflush_scan(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data);

struct vflush_info {
	int flags;
	int busy;
	thread_t td;
};

int
vflush(mp, rootrefs, flags)
	struct mount *mp;
	int rootrefs;
	int flags;
{
	struct thread *td = curthread;	/* XXX */
	struct vnode *rootvp = NULL;
	int error;
	lwkt_tokref vlock;
	struct vflush_info vflush_info;

	if (rootrefs > 0) {
		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
		    ("vflush: bad args"));
		/*
		 * Get the filesystem root vnode.  We can vput() it
		 * immediately, since with rootrefs > 0, it won't go away.
		 */
		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
			return (error);
		vput(rootvp);
	}

	vflush_info.busy = 0;
	vflush_info.flags = flags;
	vflush_info.td = td;
	vmntvnodescan(mp, NULL, vflush_scan, &vflush_info);

	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
		/*
		 * If just the root vnode is busy, and if its refcount
		 * is equal to `rootrefs', then go ahead and kill it.
		 */
		lwkt_gettoken(&vlock, rootvp->v_interlock);
		KASSERT(vflush_info.busy > 0, ("vflush: not busy"));
		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
		if (vflush_info.busy == 1 && rootvp->v_usecount == rootrefs) {
			vgonel(rootvp, &vlock, td);
			vflush_info.busy = 0;
		} else {
			lwkt_reltoken(&vlock);
		}
	}
	if (vflush_info.busy)
		return (EBUSY);
	for (; rootrefs > 0; rootrefs--)
		vrele(rootvp);
	return (0);
}

/*
 * The scan callback is made with an interlocked vnode.
 */
static int
vflush_scan(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data)
{
	struct vflush_info *info = data;
	struct vattr vattr;

	/*
	 * Skip over vnodes marked VSYSTEM.
	 */
	if ((info->flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
		lwkt_reltoken(vlock);
		return(0);
	}

	/*
	 * If WRITECLOSE is set, flush out unlinked but still open
	 * files (even if open only for reading) and regular file
	 * vnodes open for writing.
	 */
	if ((info->flags & WRITECLOSE) &&
	    (vp->v_type == VNON ||
	     (VOP_GETATTR(vp, &vattr, info->td) == 0 &&
	      vattr.va_nlink > 0)) &&
	    (vp->v_writecount == 0 || vp->v_type != VREG)) {
		lwkt_reltoken(vlock);
		return(0);
	}

	/*
	 * With v_usecount == 0, all we need to do is clear out the
	 * vnode data structures and we are done.
	 */
	if (vp->v_usecount == 0) {
		vgonel(vp, vlock, info->td);
		return(0);
	}

	/*
	 * If FORCECLOSE is set, forcibly close the vnode.  For block
	 * or character devices, revert to an anonymous device.  For
	 * all other files, just kill them.
2019 */ 2020 if (info->flags & FORCECLOSE) { 2021 if (vp->v_type != VBLK && vp->v_type != VCHR) { 2022 vgonel(vp, vlock, info->td); 2023 } else { 2024 vclean(vp, vlock, 0, info->td); 2025 vp->v_op = spec_vnodeop_p; 2026 insmntque(vp, (struct mount *) 0); 2027 } 2028 return(0); 2029 } 2030 #ifdef DIAGNOSTIC 2031 if (busyprt) 2032 vprint("vflush: busy vnode", vp); 2033 #endif 2034 lwkt_reltoken(vlock); 2035 ++info->busy; 2036 return(0); 2037 } 2038 2039 /* 2040 * Disassociate the underlying file system from a vnode. 2041 */ 2042 static void 2043 vclean(struct vnode *vp, lwkt_tokref_t vlock, int flags, struct thread *td) 2044 { 2045 int active; 2046 2047 /* 2048 * Check to see if the vnode is in use. If so we have to reference it 2049 * before we clean it out so that its count cannot fall to zero and 2050 * generate a race against ourselves to recycle it. 2051 */ 2052 if ((active = vp->v_usecount)) 2053 vp->v_usecount++; 2054 2055 /* 2056 * Prevent the vnode from being recycled or brought into use while we 2057 * clean it out. 2058 */ 2059 if (vp->v_flag & VXLOCK) 2060 panic("vclean: deadlock"); 2061 vp->v_flag |= VXLOCK; 2062 vp->v_vxthread = curthread; 2063 2064 /* 2065 * Even if the count is zero, the VOP_INACTIVE routine may still 2066 * have the object locked while it cleans it out. The VOP_LOCK 2067 * ensures that the VOP_INACTIVE routine is done with its work. 2068 * For active vnodes, it ensures that no other activity can 2069 * occur while the underlying object is being cleaned out. 2070 * 2071 * NOTE: we continue to hold the vnode interlock through to the 2072 * end of vclean(). 2073 */ 2074 VOP_LOCK(vp, NULL, LK_DRAIN, td); 2075 2076 /* 2077 * Clean out any buffers associated with the vnode. 2078 */ 2079 vinvalbuf(vp, V_SAVE, td, 0, 0); 2080 VOP_DESTROYVOBJECT(vp); 2081 2082 /* 2083 * If purging an active vnode, it must be closed and 2084 * deactivated before being reclaimed. Note that the 2085 * VOP_INACTIVE will unlock the vnode. 2086 */ 2087 if (active) { 2088 if (flags & DOCLOSE) 2089 VOP_CLOSE(vp, FNONBLOCK, td); 2090 VOP_INACTIVE(vp, td); 2091 } else { 2092 /* 2093 * Any other processes trying to obtain this lock must first 2094 * wait for VXLOCK to clear, then call the new lock operation. 2095 */ 2096 VOP_UNLOCK(vp, NULL, 0, td); 2097 } 2098 /* 2099 * Reclaim the vnode. 2100 */ 2101 if (VOP_RECLAIM(vp, td)) 2102 panic("vclean: cannot reclaim"); 2103 2104 if (active) { 2105 /* 2106 * Inline copy of vrele() since VOP_INACTIVE 2107 * has already been called. 2108 */ 2109 if (--vp->v_usecount <= 0) { 2110 #ifdef DIAGNOSTIC 2111 if (vp->v_usecount < 0 || vp->v_writecount != 0) { 2112 vprint("vclean: bad ref count", vp); 2113 panic("vclean: ref cnt"); 2114 } 2115 #endif 2116 vfree(vp); 2117 } 2118 } 2119 2120 cache_purge(vp); 2121 vp->v_vnlock = NULL; 2122 vmaybefree(vp); 2123 2124 /* 2125 * Done with purge, notify sleepers of the grim news. 2126 */ 2127 vp->v_op = dead_vnodeop_p; 2128 vn_pollgone(vp); 2129 vp->v_tag = VT_NON; 2130 vp->v_flag &= ~VXLOCK; 2131 vp->v_vxthread = NULL; 2132 if (vp->v_flag & VXWANT) { 2133 vp->v_flag &= ~VXWANT; 2134 wakeup((caddr_t) vp); 2135 } 2136 lwkt_reltoken(vlock); 2137 } 2138 2139 /* 2140 * Eliminate all activity associated with the requested vnode 2141 * and with all vnodes aliased to the requested vnode. 
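*
* Editor's illustrative note (not part of the original source): this is
* normally reached via the revoke(2) path, which ends up issuing
* something like
*
*	VOP_REVOKE(vp, REVOKEALL);
*
* on, e.g., a controlling terminal's vnode so that every alias of the
* device is torn down.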
2142 */ 2143 int 2144 vop_revoke(ap) 2145 struct vop_revoke_args /* { 2146 struct vnode *a_vp; 2147 int a_flags; 2148 } */ *ap; 2149 { 2150 struct vnode *vp, *vq; 2151 lwkt_tokref ilock; 2152 dev_t dev; 2153 2154 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); 2155 2156 vp = ap->a_vp; 2157 /* 2158 * If a vgone (or vclean) is already in progress, 2159 * wait until it is done and return. 2160 */ 2161 if (vp->v_flag & VXLOCK) { 2162 vp->v_flag |= VXWANT; 2163 /*lwkt_reltoken(vlock); ZZZ */ 2164 tsleep((caddr_t)vp, 0, "vop_revokeall", 0); 2165 return (0); 2166 } 2167 2168 /* 2169 * If the vnode has a device association, scrap all vnodes associated 2170 * with the device. Don't let the device disappear on us while we 2171 * are scrapping the vnodes. 2172 */ 2173 if (vp->v_type != VCHR && vp->v_type != VBLK) 2174 return(0); 2175 if ((dev = vp->v_rdev) == NULL) { 2176 if ((dev = udev2dev(vp->v_udev, vp->v_type == VBLK)) == NODEV) 2177 return(0); 2178 } 2179 reference_dev(dev); 2180 for (;;) { 2181 lwkt_gettoken(&ilock, &spechash_token); 2182 vq = SLIST_FIRST(&dev->si_hlist); 2183 lwkt_reltoken(&ilock); 2184 if (vq == NULL) 2185 break; 2186 vgone(vq); 2187 } 2188 release_dev(dev); 2189 return (0); 2190 } 2191 2192 /* 2193 * Recycle an unused vnode to the front of the free list. 2194 * Release the passed interlock if the vnode will be recycled. 2195 */ 2196 int 2197 vrecycle(struct vnode *vp, lwkt_tokref_t inter_lkp, struct thread *td) 2198 { 2199 lwkt_tokref vlock; 2200 2201 lwkt_gettoken(&vlock, vp->v_interlock); 2202 if (vp->v_usecount == 0) { 2203 if (inter_lkp) 2204 lwkt_reltoken(inter_lkp); 2205 vgonel(vp, &vlock, td); 2206 return (1); 2207 } 2208 lwkt_reltoken(&vlock); 2209 return (0); 2210 } 2211 2212 /* 2213 * Eliminate all activity associated with a vnode 2214 * in preparation for reuse. 2215 */ 2216 void 2217 vgone(struct vnode *vp) 2218 { 2219 struct thread *td = curthread; /* XXX */ 2220 lwkt_tokref vlock; 2221 2222 lwkt_gettoken(&vlock, vp->v_interlock); 2223 vgonel(vp, &vlock, td); 2224 } 2225 2226 /* 2227 * vgone, with the vp interlock held. 2228 */ 2229 void 2230 vgonel(struct vnode *vp, lwkt_tokref_t vlock, struct thread *td) 2231 { 2232 lwkt_tokref ilock; 2233 int s; 2234 2235 /* 2236 * If a vgone (or vclean) is already in progress, 2237 * wait until it is done and return. 2238 */ 2239 if (vp->v_flag & VXLOCK) { 2240 vp->v_flag |= VXWANT; 2241 lwkt_reltoken(vlock); 2242 tsleep((caddr_t)vp, 0, "vgone", 0); 2243 return; 2244 } 2245 2246 /* 2247 * Clean out the filesystem specific data. 2248 */ 2249 vclean(vp, vlock, DOCLOSE, td); 2250 lwkt_gettokref(vlock); 2251 2252 /* 2253 * Delete from old mount point vnode list, if on one. 2254 */ 2255 if (vp->v_mount != NULL) 2256 insmntque(vp, (struct mount *)0); 2257 /* 2258 * If special device, remove it from special device alias list 2259 * if it is on one. 2260 */ 2261 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) { 2262 v_release_rdev(vp); 2263 } 2264 2265 /* 2266 * If it is on the freelist and not already at the head, 2267 * move it to the head of the list. The test of the 2268 * VDOOMED flag and the reference count of zero is because 2269 * it will be removed from the free list by getnewvnode, 2270 * but will not have its reference count incremented until 2271 * after calling vgone. If the reference count were 2272 * incremented first, vgone would (incorrectly) try to 2273 * close the previous instance of the underlying object. 
2274 */ 2275 if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { 2276 s = splbio(); 2277 lwkt_gettoken(&ilock, &vnode_free_list_token); 2278 if (vp->v_flag & VFREE) 2279 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2280 else 2281 freevnodes++; 2282 vp->v_flag |= VFREE; 2283 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2284 lwkt_reltoken(&ilock); 2285 splx(s); 2286 } 2287 vp->v_type = VBAD; 2288 lwkt_reltoken(vlock); 2289 } 2290 2291 /* 2292 * Lookup a vnode by device number. 2293 */ 2294 int 2295 vfinddev(dev, type, vpp) 2296 dev_t dev; 2297 enum vtype type; 2298 struct vnode **vpp; 2299 { 2300 lwkt_tokref ilock; 2301 struct vnode *vp; 2302 2303 lwkt_gettoken(&ilock, &spechash_token); 2304 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { 2305 if (type == vp->v_type) { 2306 *vpp = vp; 2307 lwkt_reltoken(&ilock); 2308 return (1); 2309 } 2310 } 2311 lwkt_reltoken(&ilock); 2312 return (0); 2313 } 2314 2315 /* 2316 * Calculate the total number of references to a special device. This 2317 * routine may only be called for VBLK and VCHR vnodes since v_rdev is 2318 * an overloaded field. Since udev2dev can now return NODEV, we have 2319 * to check for a NULL v_rdev. 2320 */ 2321 int 2322 count_dev(dev_t dev) 2323 { 2324 lwkt_tokref ilock; 2325 struct vnode *vp; 2326 int count = 0; 2327 2328 if (SLIST_FIRST(&dev->si_hlist)) { 2329 lwkt_gettoken(&ilock, &spechash_token); 2330 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { 2331 count += vp->v_usecount; 2332 } 2333 lwkt_reltoken(&ilock); 2334 } 2335 return(count); 2336 } 2337 2338 int 2339 count_udev(udev_t udev) 2340 { 2341 dev_t dev; 2342 2343 if ((dev = udev2dev(udev, 0)) == NODEV) 2344 return(0); 2345 return(count_dev(dev)); 2346 } 2347 2348 int 2349 vcount(struct vnode *vp) 2350 { 2351 if (vp->v_rdev == NULL) 2352 return(0); 2353 return(count_dev(vp->v_rdev)); 2354 } 2355 2356 /* 2357 * Print out a description of a vnode. 2358 */ 2359 static char *typename[] = 2360 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 2361 2362 void 2363 vprint(label, vp) 2364 char *label; 2365 struct vnode *vp; 2366 { 2367 char buf[96]; 2368 2369 if (label != NULL) 2370 printf("%s: %p: ", label, (void *)vp); 2371 else 2372 printf("%p: ", (void *)vp); 2373 printf("type %s, usecount %d, writecount %d, refcount %d,", 2374 typename[vp->v_type], vp->v_usecount, vp->v_writecount, 2375 vp->v_holdcnt); 2376 buf[0] = '\0'; 2377 if (vp->v_flag & VROOT) 2378 strcat(buf, "|VROOT"); 2379 if (vp->v_flag & VTEXT) 2380 strcat(buf, "|VTEXT"); 2381 if (vp->v_flag & VSYSTEM) 2382 strcat(buf, "|VSYSTEM"); 2383 if (vp->v_flag & VXLOCK) 2384 strcat(buf, "|VXLOCK"); 2385 if (vp->v_flag & VXWANT) 2386 strcat(buf, "|VXWANT"); 2387 if (vp->v_flag & VBWAIT) 2388 strcat(buf, "|VBWAIT"); 2389 if (vp->v_flag & VDOOMED) 2390 strcat(buf, "|VDOOMED"); 2391 if (vp->v_flag & VFREE) 2392 strcat(buf, "|VFREE"); 2393 if (vp->v_flag & VOBJBUF) 2394 strcat(buf, "|VOBJBUF"); 2395 if (buf[0] != '\0') 2396 printf(" flags (%s)", &buf[1]); 2397 if (vp->v_data == NULL) { 2398 printf("\n"); 2399 } else { 2400 printf("\n\t"); 2401 VOP_PRINT(vp); 2402 } 2403 } 2404 2405 #ifdef DDB 2406 #include <ddb/ddb.h> 2407 /* 2408 * List all of the locked vnodes in the system. 2409 * Called when debugging the kernel. 
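*
* Editor's usage note (not part of the original source): with DDB
* compiled in, this is invoked from the debugger prompt as
*
*	db> show lockedvnodes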
2410 */ 2411 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) 2412 { 2413 struct thread *td = curthread; /* XXX */ 2414 lwkt_tokref ilock; 2415 struct mount *mp, *nmp; 2416 struct vnode *vp; 2417 2418 printf("Locked vnodes\n"); 2419 lwkt_gettoken(&ilock, &mountlist_token); 2420 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2421 if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) { 2422 nmp = TAILQ_NEXT(mp, mnt_list); 2423 continue; 2424 } 2425 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 2426 if (VOP_ISLOCKED(vp, NULL)) 2427 vprint((char *)0, vp); 2428 } 2429 lwkt_gettokref(&ilock); 2430 nmp = TAILQ_NEXT(mp, mnt_list); 2431 vfs_unbusy(mp, td); 2432 } 2433 lwkt_reltoken(&ilock); 2434 } 2435 #endif 2436 2437 /* 2438 * Top level filesystem related information gathering. 2439 */ 2440 static int sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS); 2441 2442 static int 2443 vfs_sysctl(SYSCTL_HANDLER_ARGS) 2444 { 2445 int *name = (int *)arg1 - 1; /* XXX */ 2446 u_int namelen = arg2 + 1; /* XXX */ 2447 struct vfsconf *vfsp; 2448 2449 #if 1 || defined(COMPAT_PRELITE2) 2450 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ 2451 if (namelen == 1) 2452 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2453 #endif 2454 2455 #ifdef notyet 2456 /* all sysctl names at this level are at least name and field */ 2457 if (namelen < 2) 2458 return (ENOTDIR); /* overloaded */ 2459 if (name[0] != VFS_GENERIC) { 2460 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2461 if (vfsp->vfc_typenum == name[0]) 2462 break; 2463 if (vfsp == NULL) 2464 return (EOPNOTSUPP); 2465 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2466 oldp, oldlenp, newp, newlen, p)); 2467 } 2468 #endif 2469 switch (name[1]) { 2470 case VFS_MAXTYPENUM: 2471 if (namelen != 2) 2472 return (ENOTDIR); 2473 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2474 case VFS_CONF: 2475 if (namelen != 3) 2476 return (ENOTDIR); /* overloaded */ 2477 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2478 if (vfsp->vfc_typenum == name[2]) 2479 break; 2480 if (vfsp == NULL) 2481 return (EOPNOTSUPP); 2482 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); 2483 } 2484 return (EOPNOTSUPP); 2485 } 2486 2487 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, 2488 "Generic filesystem"); 2489 2490 #if 1 || defined(COMPAT_PRELITE2) 2491 2492 static int 2493 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 2494 { 2495 int error; 2496 struct vfsconf *vfsp; 2497 struct ovfsconf ovfs; 2498 2499 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 2500 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2501 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2502 ovfs.vfc_index = vfsp->vfc_typenum; 2503 ovfs.vfc_refcount = vfsp->vfc_refcount; 2504 ovfs.vfc_flags = vfsp->vfc_flags; 2505 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2506 if (error) 2507 return error; 2508 } 2509 return 0; 2510 } 2511 2512 #endif /* 1 || COMPAT_PRELITE2 */ 2513 2514 #if 0 2515 #define KINFO_VNODESLOP 10 2516 /* 2517 * Dump vnode list (via sysctl). 2518 * Copyout address of vnode followed by vnode. 
2519 */ 2520 /* ARGSUSED */ 2521 static int 2522 sysctl_vnode(SYSCTL_HANDLER_ARGS) 2523 { 2524 struct proc *p = curproc; /* XXX */ 2525 struct mount *mp, *nmp; 2526 struct vnode *nvp, *vp; 2527 lwkt_tokref ilock; 2528 lwkt_tokref jlock; 2529 int error; 2530 2531 #define VPTRSZ sizeof (struct vnode *) 2532 #define VNODESZ sizeof (struct vnode) 2533 2534 req->lock = 0; 2535 if (!req->oldptr) /* Make an estimate */ 2536 return (SYSCTL_OUT(req, 0, 2537 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); 2538 2539 lwkt_gettoken(&ilock, &mountlist_token); 2540 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2541 if (vfs_busy(mp, LK_NOWAIT, &ilock, p)) { 2542 nmp = TAILQ_NEXT(mp, mnt_list); 2543 continue; 2544 } 2545 lwkt_gettoken(&jlock, &mntvnode_token); 2546 again: 2547 for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 2548 vp != NULL; 2549 vp = nvp) { 2550 /* 2551 * Check that the vp is still associated with 2552 * this filesystem. RACE: could have been 2553 * recycled onto the same filesystem. 2554 */ 2555 if (vp->v_mount != mp) 2556 goto again; 2557 nvp = TAILQ_NEXT(vp, v_nmntvnodes); 2558 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2559 (error = SYSCTL_OUT(req, vp, VNODESZ))) { 2560 lwkt_reltoken(&jlock); 2561 return (error); 2562 } 2563 } 2564 lwkt_reltoken(&jlock); 2565 lwkt_gettokref(&ilock); 2566 nmp = TAILQ_NEXT(mp, mnt_list); /* ZZZ */ 2567 vfs_unbusy(mp, p); 2568 } 2569 lwkt_reltoken(&ilock); 2570 2571 return (0); 2572 } 2573 #endif 2574 2575 /* 2576 * XXX 2577 * Exporting the vnode list on large systems causes them to crash. 2578 * Exporting the vnode list on medium systems causes sysctl to coredump. 2579 */ 2580 #if 0 2581 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2582 0, 0, sysctl_vnode, "S,vnode", ""); 2583 #endif 2584 2585 /* 2586 * Check to see if a filesystem is mounted on a block device. 2587 */ 2588 int 2589 vfs_mountedon(struct vnode *vp) 2590 { 2591 dev_t dev; 2592 2593 if ((dev = vp->v_rdev) == NULL) 2594 dev = udev2dev(vp->v_udev, (vp->v_type == VBLK)); 2595 if (dev != NODEV && dev->si_mountpoint) 2596 return (EBUSY); 2597 return (0); 2598 } 2599 2600 /* 2601 * Unmount all filesystems. The list is traversed in reverse order 2602 * of mounting to avoid dependencies. 2603 */ 2604 void 2605 vfs_unmountall() 2606 { 2607 struct mount *mp; 2608 struct thread *td = curthread; 2609 int error; 2610 2611 if (td->td_proc == NULL) 2612 td = initproc->p_thread; /* XXX XXX use proc0 instead? */ 2613 2614 /* 2615 * Since this only runs when rebooting, it is not interlocked. 2616 */ 2617 while(!TAILQ_EMPTY(&mountlist)) { 2618 mp = TAILQ_LAST(&mountlist, mntlist); 2619 error = dounmount(mp, MNT_FORCE, td); 2620 if (error) { 2621 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2622 printf("unmount of %s failed (", 2623 mp->mnt_stat.f_mntonname); 2624 if (error == EBUSY) 2625 printf("BUSY)\n"); 2626 else 2627 printf("%d)\n", error); 2628 } else { 2629 /* The unmount has removed mp from the mountlist */ 2630 } 2631 } 2632 } 2633 2634 /* 2635 * Build hash lists of net addresses and hang them off the mount point. 2636 * Called by ufs_mount() to set up the lists of export addresses. 
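*
* Editor's illustrative sketch (not part of the original source, flag
* and field names assumed from the standard export_args interface): an
* exporting caller typically fills in something like
*
*	bzero(&ea, sizeof(ea));
*	ea.ex_flags = MNT_EXPORTED | MNT_EXRDONLY;
*	ea.ex_addr = (struct sockaddr *)&net;	(e.g. 192.168.1.0)
*	ea.ex_addrlen = sizeof(net);
*	ea.ex_mask = (struct sockaddr *)&mask;	(e.g. 255.255.255.0)
*	ea.ex_masklen = sizeof(mask);
*
* An ex_addrlen of zero instead establishes the default export
* (MNT_DEFEXPORTED) handled at the top of this function.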
2637 */ 2638 static int 2639 vfs_hang_addrlist(mp, nep, argp) 2640 struct mount *mp; 2641 struct netexport *nep; 2642 struct export_args *argp; 2643 { 2644 struct netcred *np; 2645 struct radix_node_head *rnh; 2646 int i; 2647 struct radix_node *rn; 2648 struct sockaddr *saddr, *smask = 0; 2649 struct domain *dom; 2650 int error; 2651 2652 if (argp->ex_addrlen == 0) { 2653 if (mp->mnt_flag & MNT_DEFEXPORTED) 2654 return (EPERM); 2655 np = &nep->ne_defexported; 2656 np->netc_exflags = argp->ex_flags; 2657 np->netc_anon = argp->ex_anon; 2658 np->netc_anon.cr_ref = 1; 2659 mp->mnt_flag |= MNT_DEFEXPORTED; 2660 return (0); 2661 } 2662 2663 if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN) 2664 return (EINVAL); 2665 if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN) 2666 return (EINVAL); 2667 2668 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; 2669 np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); 2670 bzero((caddr_t) np, i); 2671 saddr = (struct sockaddr *) (np + 1); 2672 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) 2673 goto out; 2674 if (saddr->sa_len > argp->ex_addrlen) 2675 saddr->sa_len = argp->ex_addrlen; 2676 if (argp->ex_masklen) { 2677 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); 2678 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); 2679 if (error) 2680 goto out; 2681 if (smask->sa_len > argp->ex_masklen) 2682 smask->sa_len = argp->ex_masklen; 2683 } 2684 i = saddr->sa_family; 2685 if ((rnh = nep->ne_rtable[i]) == 0) { 2686 /* 2687 * Seems silly to initialize every AF when most are not used, 2688 * do so on demand here 2689 */ 2690 for (dom = domains; dom; dom = dom->dom_next) 2691 if (dom->dom_family == i && dom->dom_rtattach) { 2692 dom->dom_rtattach((void **) &nep->ne_rtable[i], 2693 dom->dom_rtoffset); 2694 break; 2695 } 2696 if ((rnh = nep->ne_rtable[i]) == 0) { 2697 error = ENOBUFS; 2698 goto out; 2699 } 2700 } 2701 rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, 2702 np->netc_rnodes); 2703 if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ 2704 error = EPERM; 2705 goto out; 2706 } 2707 np->netc_exflags = argp->ex_flags; 2708 np->netc_anon = argp->ex_anon; 2709 np->netc_anon.cr_ref = 1; 2710 return (0); 2711 out: 2712 free(np, M_NETADDR); 2713 return (error); 2714 } 2715 2716 /* ARGSUSED */ 2717 static int 2718 vfs_free_netcred(rn, w) 2719 struct radix_node *rn; 2720 void *w; 2721 { 2722 struct radix_node_head *rnh = (struct radix_node_head *) w; 2723 2724 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); 2725 free((caddr_t) rn, M_NETADDR); 2726 return (0); 2727 } 2728 2729 /* 2730 * Free the net address hash lists that are hanging off the mount points. 
2731 */ 2732 static void 2733 vfs_free_addrlist(nep) 2734 struct netexport *nep; 2735 { 2736 int i; 2737 struct radix_node_head *rnh; 2738 2739 for (i = 0; i <= AF_MAX; i++) 2740 if ((rnh = nep->ne_rtable[i])) { 2741 (*rnh->rnh_walktree) (rnh, vfs_free_netcred, 2742 (caddr_t) rnh); 2743 free((caddr_t) rnh, M_RTABLE); 2744 nep->ne_rtable[i] = 0; 2745 } 2746 } 2747 2748 int 2749 vfs_export(mp, nep, argp) 2750 struct mount *mp; 2751 struct netexport *nep; 2752 struct export_args *argp; 2753 { 2754 int error; 2755 2756 if (argp->ex_flags & MNT_DELEXPORT) { 2757 if (mp->mnt_flag & MNT_EXPUBLIC) { 2758 vfs_setpublicfs(NULL, NULL, NULL); 2759 mp->mnt_flag &= ~MNT_EXPUBLIC; 2760 } 2761 vfs_free_addrlist(nep); 2762 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); 2763 } 2764 if (argp->ex_flags & MNT_EXPORTED) { 2765 if (argp->ex_flags & MNT_EXPUBLIC) { 2766 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) 2767 return (error); 2768 mp->mnt_flag |= MNT_EXPUBLIC; 2769 } 2770 if ((error = vfs_hang_addrlist(mp, nep, argp))) 2771 return (error); 2772 mp->mnt_flag |= MNT_EXPORTED; 2773 } 2774 return (0); 2775 } 2776 2777 2778 /* 2779 * Set the publicly exported filesystem (WebNFS). Currently, only 2780 * one public filesystem is possible in the spec (RFC 2054 and 2055) 2781 */ 2782 int 2783 vfs_setpublicfs(mp, nep, argp) 2784 struct mount *mp; 2785 struct netexport *nep; 2786 struct export_args *argp; 2787 { 2788 int error; 2789 struct vnode *rvp; 2790 char *cp; 2791 2792 /* 2793 * mp == NULL -> invalidate the current info, the FS is 2794 * no longer exported. May be called from either vfs_export 2795 * or unmount, so check if it hasn't already been done. 2796 */ 2797 if (mp == NULL) { 2798 if (nfs_pub.np_valid) { 2799 nfs_pub.np_valid = 0; 2800 if (nfs_pub.np_index != NULL) { 2801 FREE(nfs_pub.np_index, M_TEMP); 2802 nfs_pub.np_index = NULL; 2803 } 2804 } 2805 return (0); 2806 } 2807 2808 /* 2809 * Only one allowed at a time. 2810 */ 2811 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) 2812 return (EBUSY); 2813 2814 /* 2815 * Get real filehandle for root of exported FS. 2816 */ 2817 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); 2818 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; 2819 2820 if ((error = VFS_ROOT(mp, &rvp))) 2821 return (error); 2822 2823 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) 2824 return (error); 2825 2826 vput(rvp); 2827 2828 /* 2829 * If an indexfile was specified, pull it in. 2830 */ 2831 if (argp->ex_indexfile != NULL) { 2832 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, 2833 M_WAITOK); 2834 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, 2835 MAXNAMLEN, (size_t *)0); 2836 if (!error) { 2837 /* 2838 * Check for illegal filenames. 2839 */ 2840 for (cp = nfs_pub.np_index; *cp; cp++) { 2841 if (*cp == '/') { 2842 error = EINVAL; 2843 break; 2844 } 2845 } 2846 } 2847 if (error) { 2848 FREE(nfs_pub.np_index, M_TEMP); 2849 return (error); 2850 } 2851 } 2852 2853 nfs_pub.np_mount = mp; 2854 nfs_pub.np_valid = 1; 2855 return (0); 2856 } 2857 2858 struct netcred * 2859 vfs_export_lookup(mp, nep, nam) 2860 struct mount *mp; 2861 struct netexport *nep; 2862 struct sockaddr *nam; 2863 { 2864 struct netcred *np; 2865 struct radix_node_head *rnh; 2866 struct sockaddr *saddr; 2867 2868 np = NULL; 2869 if (mp->mnt_flag & MNT_EXPORTED) { 2870 /* 2871 * Lookup in the export list first. 
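* (Editor's note, not in the original source: callers such as the NFS
* server's filehandle-to-vnode path pass the client's socket address as
* `nam'; a NULL return from this function means the client has no
* access to the export.)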
2872 */
2873 if (nam != NULL) {
2874 saddr = nam;
2875 rnh = nep->ne_rtable[saddr->sa_family];
2876 if (rnh != NULL) {
2877 np = (struct netcred *)
2878 (*rnh->rnh_matchaddr)((caddr_t)saddr,
2879 rnh);
2880 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2881 np = NULL;
2882 }
2883 }
2884 /*
2885 * If no address match, use the default if it exists.
2886 */
2887 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2888 np = &nep->ne_defexported;
2889 }
2890 return (np);
2891 }
2892 
2893 /*
2894 * Perform msync on all vnodes under a mount point.  The mount point must
2895 * be locked.  This code is also responsible for lazy-freeing unreferenced
2896 * vnodes whose VM objects no longer contain pages.
2897 *
2898 * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.
2899 */
2900 static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data);
2901 static int vfs_msync_scan2(struct mount *mp, struct vnode *vp,
2902 lwkt_tokref_t vlock, void *data);
2903 
2904 void
2905 vfs_msync(struct mount *mp, int flags) 
2906 {
2907 vmntvnodescan(mp, vfs_msync_scan1, vfs_msync_scan2, (void *)flags);
2908 }
2909 
2910 /*
2911 * scan1 is a fast pre-check.  There could be hundreds of thousands of
2912 * vnodes, so we cannot afford to do anything heavyweight until we have a
2913 * fairly good indication that there is work to do.
2914 */
2915 static
2916 int
2917 vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data)
2918 {
2919 int flags = (int)data;
2920 
2921 if ((vp->v_flag & VXLOCK) == 0) {
2922 if (VSHOULDFREE(vp))
2923 return(0);
2924 if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
2925 (vp->v_flag & VOBJDIRTY) &&
2926 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
2927 return(0);
2928 }
2929 }
2930 return(-1);
2931 }
2932 
2933 static
2934 int
2935 vfs_msync_scan2(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock, void *data)
2936 {
2937 vm_object_t obj;
2938 int error;
2939 int flags = (int)data;
2940 
2941 if (vp->v_flag & VXLOCK) {	/* the scan callback must still release the interlock */
2942 lwkt_reltoken(vlock); return(0);
2943 }
2944 if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
2945 (vp->v_flag & VOBJDIRTY) &&
2946 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
2947 error = vget(vp, vlock, LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ | LK_INTERLOCK, curthread);
2948 if (error == 0) {
2949 if (VOP_GETVOBJECT(vp, &obj) == 0) {
2950 vm_object_page_clean(obj, 0, 0,
2951 flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
2952 }
2953 vput(vp);
2954 }
2955 return(0);
2956 }
2957 vmaybefree(vp);
2958 lwkt_reltoken(vlock);
2959 return(0);
2960 }
2961 
2962 /*
2963 * Create the VM object needed for VMIO and mmap support.  This
2964 * is done for all VREG files in the system.  Some filesystems might
2965 * also want to take advantage of the additional metadata buffering
2966 * capability of the VMIO code by making the device node VMIO mode too.
2967 *
2968 * vp must be locked when vfs_object_create is called.
2969 */
2970 int
2971 vfs_object_create(struct vnode *vp, struct thread *td)
2972 {
2973 return (VOP_CREATEVOBJECT(vp, td));
2974 }
2975 
2976 /*
2977 * NOTE: the vnode interlock must be held during the call.  We have to recheck
2978 * the VFREE flag since the vnode may have been removed from the free list
2979 * while we were blocked on vnode_free_list_token.  The use or hold count
2980 * must have already been bumped by the caller.
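*
* Editor's illustrative note (not part of the original source): the
* canonical way the hold count gets bumped is vhold()/vdrop() earlier
* in this file, e.g.
*
*	vp->v_holdcnt++;
*	if (VSHOULDBUSY(vp))
*		vbusy(vp);
*
* executed with the vnode interlock held, exactly as vbusy() expects.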
2981 */
2982 static void
2983 vbusy(struct vnode *vp)
2984 {
2985 lwkt_tokref ilock;
2986 
2987 lwkt_gettoken(&ilock, &vnode_free_list_token);
2988 if ((vp->v_flag & VFREE) != 0) {
2989 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2990 freevnodes--;
2991 vp->v_flag &= ~(VFREE|VAGE);
2992 }
2993 lwkt_reltoken(&ilock);
2994 }
2995 
2996 /*
2997 * NOTE: the vnode interlock must be held during the call.  The use or hold
2998 * count must have already been bumped by the caller.  We use the VINFREE
2999 * flag to interlock against other calls to vfree() which might occur
3000 * while we are blocked.  The vnode cannot be reused until it has actually
3001 * been placed on the free list, so there are no other races even though
3002 * the use and hold counts are 0.
3003 */
3004 static void
3005 vfree(struct vnode *vp)
3006 {
3007 lwkt_tokref ilock;
3008 
3009 if ((vp->v_flag & VINFREE) == 0) {
3010 vp->v_flag |= VINFREE;
3011 lwkt_gettoken(&ilock, &vnode_free_list_token); /* can block */
3012 KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
3013 if (vp->v_flag & VAGE) {
3014 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3015 } else {
3016 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3017 }
3018 freevnodes++;
3019 vp->v_flag &= ~(VAGE|VINFREE);
3020 vp->v_flag |= VFREE;
3021 lwkt_reltoken(&ilock); /* can block */
3022 }
3023 }
3024 
3025 
3026 /*
3027 * Record a process's interest in events which might happen to
3028 * a vnode.  Because poll uses the historic select-style interface
3029 * internally, this routine serves as both the ``check for any
3030 * pending events'' and the ``record my interest in future events''
3031 * functions.  (These are done together, while the lock is held,
3032 * to avoid race conditions.)
3033 */
3034 int
3035 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3036 {
3037 lwkt_tokref ilock;
3038 
3039 lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
3040 if (vp->v_pollinfo.vpi_revents & events) {
3041 /*
3042 * This leaves events we are not interested
3043 * in available for the other process which
3044 * presumably had requested them
3045 * (otherwise they would never have been
3046 * recorded).
3047 */
3048 events &= vp->v_pollinfo.vpi_revents;
3049 vp->v_pollinfo.vpi_revents &= ~events;
3050 
3051 lwkt_reltoken(&ilock);
3052 return events;
3053 }
3054 vp->v_pollinfo.vpi_events |= events;
3055 selrecord(td, &vp->v_pollinfo.vpi_selinfo);
3056 lwkt_reltoken(&ilock);
3057 return 0;
3058 }
3059 
3060 /*
3061 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
3062 * it is possible for us to miss an event due to race conditions, but
3063 * that condition is expected to be rare, so for the moment it is the
3064 * preferred interface.
3065 */
3066 void
3067 vn_pollevent(vp, events)
3068 struct vnode *vp;
3069 short events;
3070 {
3071 lwkt_tokref ilock;
3072 
3073 lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
3074 if (vp->v_pollinfo.vpi_events & events) {
3075 /*
3076 * We clear vpi_events so that we don't
3077 * call selwakeup() twice if two events are
3078 * posted before the polling process(es) is
3079 * awakened.  This also ensures that we take at
3080 * most one selwakeup() if the polling process
3081 * is no longer interested.  However, it does
3082 * mean that only one event can be noticed at
3083 * a time.  (Perhaps we should only clear those
3084 * event bits which we note?) XXX
3085 */
3086 vp->v_pollinfo.vpi_events = 0; /* &= ~events ???
*/ 3087 vp->v_pollinfo.vpi_revents |= events; 3088 selwakeup(&vp->v_pollinfo.vpi_selinfo); 3089 } 3090 lwkt_reltoken(&ilock); 3091 } 3092 3093 /* 3094 * Wake up anyone polling on vp because it is being revoked. 3095 * This depends on dead_poll() returning POLLHUP for correct 3096 * behavior. 3097 */ 3098 void 3099 vn_pollgone(vp) 3100 struct vnode *vp; 3101 { 3102 lwkt_tokref ilock; 3103 3104 lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token); 3105 if (vp->v_pollinfo.vpi_events) { 3106 vp->v_pollinfo.vpi_events = 0; 3107 selwakeup(&vp->v_pollinfo.vpi_selinfo); 3108 } 3109 lwkt_reltoken(&ilock); 3110 } 3111 3112 3113 3114 /* 3115 * Routine to create and manage a filesystem syncer vnode. 3116 */ 3117 #define sync_close ((int (*) (struct vop_close_args *))nullop) 3118 static int sync_fsync (struct vop_fsync_args *); 3119 static int sync_inactive (struct vop_inactive_args *); 3120 static int sync_reclaim (struct vop_reclaim_args *); 3121 #define sync_lock ((int (*) (struct vop_lock_args *))vop_nolock) 3122 #define sync_unlock ((int (*) (struct vop_unlock_args *))vop_nounlock) 3123 static int sync_print (struct vop_print_args *); 3124 #define sync_islocked ((int(*) (struct vop_islocked_args *))vop_noislocked) 3125 3126 static vop_t **sync_vnodeop_p; 3127 static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { 3128 { &vop_default_desc, (vop_t *) vop_eopnotsupp }, 3129 { &vop_close_desc, (vop_t *) sync_close }, /* close */ 3130 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ 3131 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ 3132 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ 3133 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ 3134 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ 3135 { &vop_print_desc, (vop_t *) sync_print }, /* print */ 3136 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ 3137 { NULL, NULL } 3138 }; 3139 static struct vnodeopv_desc sync_vnodeop_opv_desc = 3140 { &sync_vnodeop_p, sync_vnodeop_entries }; 3141 3142 VNODEOP_SET(sync_vnodeop_opv_desc); 3143 3144 /* 3145 * Create a new filesystem syncer vnode for the specified mount point. 3146 * This vnode is placed on the worklist and is responsible for sync'ing 3147 * the filesystem. 3148 * 3149 * NOTE: read-only mounts are also placed on the worklist. The filesystem 3150 * sync code is also responsible for cleaning up vnodes. 3151 */ 3152 int 3153 vfs_allocate_syncvnode(struct mount *mp) 3154 { 3155 struct vnode *vp; 3156 static long start, incr, next; 3157 int error; 3158 3159 /* Allocate a new vnode */ 3160 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { 3161 mp->mnt_syncer = NULL; 3162 return (error); 3163 } 3164 vp->v_type = VNON; 3165 /* 3166 * Place the vnode onto the syncer worklist. We attempt to 3167 * scatter them about on the list so that they will go off 3168 * at evenly distributed times even if all the filesystems 3169 * are mounted at once. 3170 */ 3171 next += incr; 3172 if (next == 0 || next > syncer_maxdelay) { 3173 start /= 2; 3174 incr /= 2; 3175 if (start == 0) { 3176 start = syncer_maxdelay / 2; 3177 incr = syncer_maxdelay; 3178 } 3179 next = start; 3180 } 3181 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); 3182 mp->mnt_syncer = vp; 3183 return (0); 3184 } 3185 3186 /* 3187 * Do a lazy sync of the filesystem. 
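*
* Editor's illustrative note (not part of the original source): the
* syncer daemon is expected to push this vnode with a call along the
* lines of
*
*	VOP_FSYNC(syncvp, cred, MNT_LAZY, td);
*
* any other a_waitfor value is treated as a no-op below.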
3188 */
3189 static int
3190 sync_fsync(ap)
3191 struct vop_fsync_args /* {
3192 struct vnode *a_vp;
3193 struct ucred *a_cred;
3194 int a_waitfor;
3195 struct thread *a_td;
3196 } */ *ap;
3197 {
3198 struct vnode *syncvp = ap->a_vp;
3199 struct mount *mp = syncvp->v_mount;
3200 struct thread *td = ap->a_td;
3201 lwkt_tokref ilock;
3202 int asyncflag;
3203 
3204 /*
3205 * We only need to do something if this is a lazy evaluation.
3206 */
3207 if (ap->a_waitfor != MNT_LAZY)
3208 return (0);
3209 
3210 /*
3211 * Move ourselves to the back of the sync list.
3212 */
3213 vn_syncer_add_to_worklist(syncvp, syncdelay);
3214 
3215 /*
3216 * Walk the list of vnodes pushing all that are dirty and
3217 * not already on the sync list, and freeing vnodes which have
3218 * no refs and whose VM objects are empty.  vfs_msync() handles
3219 * the VM issues and must be called whether the mount is readonly
3220 * or not.
3221 */
3222 lwkt_gettoken(&ilock, &mountlist_token);
3223 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &ilock, td) != 0) {
3224 lwkt_reltoken(&ilock);
3225 return (0);
3226 }
3227 if (mp->mnt_flag & MNT_RDONLY) {
3228 vfs_msync(mp, MNT_NOWAIT);
3229 } else {
3230 asyncflag = mp->mnt_flag & MNT_ASYNC;
3231 mp->mnt_flag &= ~MNT_ASYNC; /* ZZZ hack */
3232 vfs_msync(mp, MNT_NOWAIT);
3233 VFS_SYNC(mp, MNT_LAZY, td);
3234 if (asyncflag)
3235 mp->mnt_flag |= MNT_ASYNC;
3236 }
3237 vfs_unbusy(mp, td);
3238 return (0);
3239 }
3240 
3241 /*
3242 * The syncer vnode is no longer referenced.
3243 */
3244 static int
3245 sync_inactive(ap)
3246 struct vop_inactive_args /* {
3247 struct vnode *a_vp;
3248 struct proc *a_p;
3249 } */ *ap;
3250 {
3251 
3252 vgone(ap->a_vp);
3253 return (0);
3254 }
3255 
3256 /*
3257 * The syncer vnode is no longer needed and is being decommissioned.
3258 *
3259 * Modifications to the worklist must be protected at splbio().
3260 */
3261 static int
3262 sync_reclaim(ap)
3263 struct vop_reclaim_args /* {
3264 struct vnode *a_vp;
3265 } */ *ap;
3266 {
3267 struct vnode *vp = ap->a_vp;
3268 int s;
3269 
3270 s = splbio();
3271 vp->v_mount->mnt_syncer = NULL;
3272 if (vp->v_flag & VONWORKLST) {
3273 LIST_REMOVE(vp, v_synclist);
3274 vp->v_flag &= ~VONWORKLST;
3275 }
3276 splx(s);
3277 
3278 return (0);
3279 }
3280 
3281 /*
3282 * Print out a syncer vnode.
3283 */
3284 static int
3285 sync_print(ap)
3286 struct vop_print_args /* {
3287 struct vnode *a_vp;
3288 } */ *ap;
3289 {
3290 struct vnode *vp = ap->a_vp;
3291 
3292 printf("syncer vnode");
3293 if (vp->v_vnlock != NULL)
3294 lockmgr_printinfo(vp->v_vnlock);
3295 printf("\n");
3296 return (0);
3297 }
3298 
3299 /*
3300 * Extract the dev_t from a VBLK or VCHR.  The vnode must have been opened
3301 * (or v_rdev might be NULL).
3302 */
3303 dev_t
3304 vn_todev(struct vnode *vp)
3305 {
3306 if (vp->v_type != VBLK && vp->v_type != VCHR)
3307 return (NODEV);
3308 KKASSERT(vp->v_rdev != NULL);
3309 return (vp->v_rdev);
3310 }
3311 
3312 /*
3313 * Check if vnode represents a disk device.  The vnode does not need to be
3314 * opened.
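*
* Editor's illustrative note (not part of the original source): mount
* code typically vets a device vnode with
*
*	if (!vn_isdisk(devvp, &error))
*		return (error);
*
* before trying to put a filesystem on it.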
3315 */ 3316 int 3317 vn_isdisk(struct vnode *vp, int *errp) 3318 { 3319 dev_t dev; 3320 3321 if (vp->v_type != VBLK && vp->v_type != VCHR) { 3322 if (errp != NULL) 3323 *errp = ENOTBLK; 3324 return (0); 3325 } 3326 3327 if ((dev = vp->v_rdev) == NULL) 3328 dev = udev2dev(vp->v_udev, (vp->v_type == VBLK)); 3329 if (dev == NULL || dev == NODEV) { 3330 if (errp != NULL) 3331 *errp = ENXIO; 3332 return (0); 3333 } 3334 if (dev_is_good(dev) == 0) { 3335 if (errp != NULL) 3336 *errp = ENXIO; 3337 return (0); 3338 } 3339 if ((dev_dflags(dev) & D_DISK) == 0) { 3340 if (errp != NULL) 3341 *errp = ENOTBLK; 3342 return (0); 3343 } 3344 if (errp != NULL) 3345 *errp = 0; 3346 return (1); 3347 } 3348 3349 void 3350 NDFREE(ndp, flags) 3351 struct nameidata *ndp; 3352 const uint flags; 3353 { 3354 if (!(flags & NDF_NO_FREE_PNBUF) && 3355 (ndp->ni_cnd.cn_flags & CNP_HASBUF)) { 3356 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 3357 ndp->ni_cnd.cn_flags &= ~CNP_HASBUF; 3358 } 3359 if (!(flags & NDF_NO_DNCP_RELE) && 3360 (ndp->ni_cnd.cn_flags & CNP_WANTDNCP) && 3361 ndp->ni_dncp) { 3362 cache_drop(ndp->ni_dncp); 3363 ndp->ni_dncp = NULL; 3364 } 3365 if (!(flags & NDF_NO_NCP_RELE) && 3366 (ndp->ni_cnd.cn_flags & CNP_WANTNCP) && 3367 ndp->ni_ncp) { 3368 cache_drop(ndp->ni_ncp); 3369 ndp->ni_ncp = NULL; 3370 } 3371 if (!(flags & NDF_NO_DVP_UNLOCK) && 3372 (ndp->ni_cnd.cn_flags & CNP_LOCKPARENT) && 3373 ndp->ni_dvp != ndp->ni_vp) { 3374 VOP_UNLOCK(ndp->ni_dvp, NULL, 0, ndp->ni_cnd.cn_td); 3375 } 3376 if (!(flags & NDF_NO_DVP_RELE) && 3377 (ndp->ni_cnd.cn_flags & (CNP_LOCKPARENT|CNP_WANTPARENT))) { 3378 vrele(ndp->ni_dvp); 3379 ndp->ni_dvp = NULL; 3380 } 3381 if (!(flags & NDF_NO_VP_UNLOCK) && 3382 (ndp->ni_cnd.cn_flags & CNP_LOCKLEAF) && ndp->ni_vp) { 3383 VOP_UNLOCK(ndp->ni_vp, NULL, 0, ndp->ni_cnd.cn_td); 3384 } 3385 if (!(flags & NDF_NO_VP_RELE) && 3386 ndp->ni_vp) { 3387 vrele(ndp->ni_vp); 3388 ndp->ni_vp = NULL; 3389 } 3390 if (!(flags & NDF_NO_STARTDIR_RELE) && 3391 (ndp->ni_cnd.cn_flags & CNP_SAVESTART)) { 3392 vrele(ndp->ni_startdir); 3393 ndp->ni_startdir = NULL; 3394 } 3395 } 3396 3397 #ifdef DEBUG_VFS_LOCKS 3398 3399 void 3400 assert_vop_locked(struct vnode *vp, const char *str) 3401 { 3402 3403 if (vp && IS_LOCKING_VFS(vp) && !VOP_ISLOCKED(vp, NULL)) { 3404 panic("%s: %p is not locked shared but should be", str, vp); 3405 } 3406 } 3407 3408 void 3409 assert_vop_unlocked(struct vnode *vp, const char *str) 3410 { 3411 3412 if (vp && IS_LOCKING_VFS(vp)) { 3413 if (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) { 3414 panic("%s: %p is locked but should not be", str, vp); 3415 } 3416 } 3417 } 3418 3419 #endif 3420
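
/*
 * Editor's illustrative sketch (not part of the original source): a
 * hypothetical helper demonstrating the vmntvnodescan() callback
 * contract described earlier in this file.  The fast callback runs
 * without the vnode interlock and only filters; the slow callback runs
 * with the interlock held and must release it on every return path.
 */
#if 0
static int
example_count_fast(struct mount *mp, struct vnode *vp, void *data)
{
	if (vp->v_type != VREG)
		return(-1);	/* negative: skip this vnode entirely */
	return(0);		/* zero: go on to the interlocked callback */
}

static int
example_count_slow(struct mount *mp, struct vnode *vp, lwkt_tokref_t vlock,
		   void *data)
{
	int *countp = data;

	++*countp;
	lwkt_reltoken(vlock);	/* the callback must drop the interlock */
	return(0);		/* non-zero would terminate the scan */
}

static int
example_count_vreg_vnodes(struct mount *mp)
{
	int count = 0;

	vmntvnodescan(mp, example_count_fast, example_count_slow, &count);
	return(count);
}
#endif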