/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
 * $DragonFly: src/sys/kern/vfs_subr.c,v 1.64 2005/09/17 07:43:00 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

int numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
int vfs_fastdev = 1;
SYSCTL_INT(_vfs, OID_AUTO, fastdev, CTLFLAG_RW, &vfs_fastdev, 0, "");

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW,
	&reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW,
	&reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW,
	&reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW,
	&reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW,
	&reassignbufmethod, 0, "");

int	nfs_mount_type = -1;
static struct lwkt_token spechash_token;
struct nfs_public nfs_pub;	/* publicly exported FS */

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
	&desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist (struct netexport *nep);
static int	vfs_free_netcred (struct radix_node *rn, void *w);
static int	vfs_hang_addrlist (struct mount *mp, struct netexport *nep,
				   struct export_args *argp);

extern int dev_ref_debug;
extern struct vnodeopv_entry_desc spec_vnodeop_entries[];

/*
 * Red black tree functions
 */
static int rb_buf_compare(struct buf *b1, struct buf *b2);
RB_GENERATE(buf_rb_tree, buf, b_rbnode, rb_buf_compare);

static int
rb_buf_compare(struct buf *b1, struct buf *b2)
{
	if (b1->b_lblkno < b2->b_lblkno)
		return(-1);
	if (b1->b_lblkno > b2->b_lblkno)
		return(1);
	return(0);
}
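/*
 * Illustrative sketch (added for exposition, not part of the original
 * file): the per-vnode buffer trees are keyed on b_lblkno through
 * rb_buf_compare() above, so ranged or full walks are done with
 * RB_SCAN() as seen throughout this file.  The callback and the
 * example_count_bufs() helper below are hypothetical.
 */
#if 0
static int
example_count_cb(struct buf *bp, void *data)
{
	++*(int *)data;		/* count every buffer visited */
	return(0);		/* 0 = keep scanning */
}

static int
example_count_bufs(struct vnode *vp)
{
	int count = 0;

	/* a NULL compare callback means "visit the entire tree" */
	RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, NULL,
		example_count_cb, &count);
	return (count);
}
#endif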
/*
 * Return 0 if the vnode is already on the free list or cannot be placed
 * on the free list.  Return 1 if the vnode can be placed on the free list.
 */
static __inline int
vshouldfree(struct vnode *vp, int usecount)
{
	if (vp->v_flag & VFREE)
		return (0);		/* already free */
	if (vp->v_holdcnt != 0 || vp->v_usecount != usecount)
		return (0);		/* other holders */
	if (vp->v_object &&
	    (vp->v_object->ref_count || vp->v_object->resident_page_count)) {
		return (0);
	}
	return (1);
}

/*
 * Initialize the vnode management data structures.
 *
 * Called from vfsinit()
 */
void
vfs_subr_init(void)
{
	/*
	 * Desired vnodes is a result of the physical page count
	 * and the size of the kernel's heap.  It scales in proportion
	 * to the amount of available physical memory.  This can
	 * cause trouble on 64-bit and large memory platforms.
	 */
	/* desiredvnodes = maxproc + vmstats.v_page_count / 4; */
	desiredvnodes =
		min(maxproc + vmstats.v_page_count / 4,
		    2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
		    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));

	lwkt_token_init(&spechash_token);
}

/*
 * Knob to control the precision of file timestamps:
 *
 *   0 = seconds only; nanoseconds zeroed.
 *   1 = seconds and nanoseconds, accurate within 1/HZ.
 *   2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
	&timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{
	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
	vap->va_fsmid = VNOVAL;
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(struct buf *bp)
{
	struct vnode *vp;

	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}
}
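/*
 * Illustrative sketch (added for exposition, not in the original file):
 * a typical VOP_SETATTR-style caller first neutralizes every field with
 * vattr_null() so only the fields it explicitly sets are acted upon,
 * then stamps a timestamp at the configured precision.  The helper name
 * and the use of proc0's credential are assumptions for illustration.
 */
#if 0
static int
example_touch_mtime(struct vnode *vp, struct thread *td)
{
	struct vattr va;

	vattr_null(&va);		/* all fields = VNOVAL (unchanged) */
	vfs_timestamp(&va.va_mtime);	/* honors vfs.timestamp_precision */
	return (VOP_SETATTR(vp, &va, proc0.p_ucred, td));
}
#endif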
/*
 * Flush out and invalidate all buffers associated with a vnode.
 *
 * vp must be locked.
 */
static int vinvalbuf_bp(struct buf *bp, void *data);

struct vinvalbuf_bp_info {
	struct vnode *vp;
	int slptimeo;
	int slpflag;
	int flags;
};

int
vinvalbuf(struct vnode *vp, int flags, struct thread *td,
	int slpflag, int slptimeo)
{
	struct vinvalbuf_bp_info info;
	int error;
	vm_object_t object;

	/*
	 * If we are being asked to save, call fsync to ensure that the inode
	 * is updated.
	 */
	if (flags & V_SAVE) {
		crit_enter();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag, "vinvlbuf", slptimeo);
			if (error) {
				crit_exit();
				return (error);
			}
		}
		if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
			crit_exit();
			if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0)
				return (error);
			crit_enter();
			if (vp->v_numoutput > 0 ||
			    !RB_EMPTY(&vp->v_rbdirty_tree))
				panic("vinvalbuf: dirty bufs");
		}
		crit_exit();
	}
	crit_enter();
	info.slptimeo = slptimeo;
	info.slpflag = slpflag;
	info.flags = flags;
	info.vp = vp;

	/*
	 * Flush the buffer cache until nothing is left.
	 */
	while (!RB_EMPTY(&vp->v_rbclean_tree) ||
	    !RB_EMPTY(&vp->v_rbdirty_tree)) {
		error = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, NULL,
				vinvalbuf_bp, &info);
		if (error == 0) {
			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
					vinvalbuf_bp, &info);
		}
	}

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		while (vp->v_numoutput > 0) {
			vp->v_flag |= VBWAIT;
			tsleep(&vp->v_numoutput, 0, "vnvlbv", 0);
		}
		if (VOP_GETVOBJECT(vp, &object) == 0) {
			while (object->paging_in_progress)
				vm_object_pip_sleep(object, "vnvlbx");
		}
	} while (vp->v_numoutput > 0);

	crit_exit();

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if (VOP_GETVOBJECT(vp, &object) == 0) {
		vm_object_page_remove(object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}

	if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree))
		panic("vinvalbuf: flush failed");
	return (0);
}
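/*
 * Illustrative sketch (added for exposition, not in the original file):
 * vclean() later in this file uses vinvalbuf(vp, V_SAVE, ...) to flush
 * dirty buffers before reclaiming a vnode.  A filesystem invalidating a
 * file's cache without saving would instead pass 0 for flags.  The
 * example_purge_cache() wrapper is hypothetical.
 */
#if 0
static int
example_purge_cache(struct vnode *vp, struct thread *td)
{
	/* flags = 0: discard everything, no fsync of dirty data first */
	return (vinvalbuf(vp, 0, td, 0, 0));
}
#endif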
static int
vinvalbuf_bp(struct buf *bp, void *data)
{
	struct vinvalbuf_bp_info *info = data;
	int error;

	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		error = BUF_TIMELOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL,
		    "vinvalbuf", info->slpflag, info->slptimeo);
		if (error == 0) {
			BUF_UNLOCK(bp);
			error = ENOLCK;
		}
		if (error == ENOLCK)
			return(0);
		return (-error);
	}

	/*
	 * XXX Since there are no node locks for NFS, I
	 * believe there is a slight chance that a delayed
	 * write will occur while sleeping just above, so
	 * check for it.  Note that vfs_bio_awrite expects
	 * buffers to reside on a queue, while VOP_BWRITE and
	 * brelse do not.
	 */
	if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
	    (info->flags & V_SAVE)) {
		if (bp->b_vp == info->vp) {
			if (bp->b_flags & B_CLUSTEROK) {
				BUF_UNLOCK(bp);
				vfs_bio_awrite(bp);
			} else {
				bremfree(bp);
				bp->b_flags |= B_ASYNC;
				VOP_BWRITE(bp->b_vp, bp);
			}
		} else {
			bremfree(bp);
			VOP_BWRITE(bp->b_vp, bp);
		}
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
	}
	return(0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * The vnode must be locked.
 */
static int vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
static int vtruncbuf_bp_trunc(struct buf *bp, void *data);
static int vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
static int vtruncbuf_bp_metasync(struct buf *bp, void *data);

int
vtruncbuf(struct vnode *vp, struct thread *td, off_t length, int blksize)
{
	daddr_t trunclbn;
	int count;

	/*
	 * Round up to the *next* lbn, then destroy the buffers in question.
	 * Since we are only removing some of the buffers we must rely on the
	 * scan count to determine whether a loop is necessary.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	crit_enter();
	do {
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &trunclbn);
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &trunclbn);
	} while(count);

	/*
	 * For safety, fsync any remaining metadata if the file is not being
	 * truncated to 0.  Since the metadata does not represent the entire
	 * dirty list we have to rely on the hit count to ensure that we get
	 * all of it.
	 */
	if (length > 0) {
		do {
			count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
					vtruncbuf_bp_metasync_cmp,
					vtruncbuf_bp_metasync, vp);
		} while (count);
	}

	/*
	 * Wait for any in-progress I/O to complete before returning (why?)
	 */
	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, 0, "vbtrunc", 0);
	}

	crit_exit();

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * The callback buffer is beyond the new file EOF and must be destroyed.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
static int
vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
{
	if (bp->b_lblkno >= *(daddr_t *)data)
		return(0);
	return(-1);
}

static int
vtruncbuf_bp_trunc(struct buf *bp, void *data)
{
	/*
	 * Do not try to use a buffer we cannot immediately lock, but sleep
	 * anyway to prevent a livelock.  The code will loop until all buffers
	 * can be acted upon.
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
			BUF_UNLOCK(bp);
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
	}
	return(1);
}
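/*
 * Illustrative sketch (added for exposition, not in the original file):
 * a filesystem's truncate path would call vtruncbuf() after adjusting
 * the inode size, passing the file's block size so buffers straddling
 * the new EOF land on the correct boundary.  The helper name and the
 * 16K block size are hypothetical.
 */
#if 0
static int
example_truncate(struct vnode *vp, struct thread *td, off_t newsize)
{
	/* destroy buffers past newsize, fsync remaining metadata */
	return (vtruncbuf(vp, td, newsize, 16384));
}
#endif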
/*
 * Fsync all meta-data after truncating a file to be non-zero.  Only metadata
 * blocks (with a negative lblkno) are scanned.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
static int
vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
{
	if (bp->b_lblkno < 0)
		return(0);
	return(1);
}

static int
vtruncbuf_bp_metasync(struct buf *bp, void *data)
{
	struct vnode *vp = data;

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not try to use a buffer we cannot immediately lock,
		 * but sleep anyway to prevent a livelock.  The code will
		 * loop until all buffers can be acted upon.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
				BUF_UNLOCK(bp);
		} else {
			bremfree(bp);
			if (bp->b_vp == vp) {
				bp->b_flags |= B_ASYNC;
			} else {
				bp->b_flags &= ~B_ASYNC;
			}
			VOP_BWRITE(bp->b_vp, bp);
		}
		return(1);
	} else {
		return(0);
	}
}

/*
 * vfsync - implements a multipass fsync on a file which understands
 * dependencies and meta-data.  The passed vnode must be locked.  The
 * waitfor argument may be MNT_WAIT, MNT_NOWAIT, or MNT_LAZY.
 *
 * When fsyncing data asynchronously just do one consolidated pass starting
 * with the most negative block number.  This may not get all the data due
 * to dependencies.
 *
 * When fsyncing data synchronously do a data pass, then a metadata pass,
 * then do additional data+metadata passes to try to get all the data out.
 */
static int vfsync_wait_output(struct vnode *vp,
		int (*waitoutput)(struct vnode *, struct thread *));
static int vfsync_data_only_cmp(struct buf *bp, void *data);
static int vfsync_meta_only_cmp(struct buf *bp, void *data);
static int vfsync_lazy_range_cmp(struct buf *bp, void *data);
static int vfsync_bp(struct buf *bp, void *data);

struct vfsync_info {
	struct vnode *vp;
	int synchronous;
	int syncdeps;
	int lazycount;
	int lazylimit;
	daddr_t lbn;
	int (*checkdef)(struct buf *);
};

int
vfsync(struct vnode *vp, int waitfor, int passes, daddr_t lbn,
	int (*checkdef)(struct buf *),
	int (*waitoutput)(struct vnode *, struct thread *))
{
	struct vfsync_info info;
	int error;

	bzero(&info, sizeof(info));
	info.vp = vp;
	info.lbn = lbn;
	if ((info.checkdef = checkdef) == NULL)
		info.syncdeps = 1;

	crit_enter();

	switch(waitfor) {
	case MNT_LAZY:
		/*
		 * Lazy (filesystem syncer typical).  Asynchronous, plus we
		 * limit the number of data (not meta) pages we try to flush
		 * to 1MB.  A non-zero return means the lazy limit was
		 * reached.
		 */
		info.lazylimit = 1024 * 1024;
		info.syncdeps = 1;
		error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				vfsync_lazy_range_cmp, vfsync_bp, &info);
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
			vfsync_meta_only_cmp, vfsync_bp, &info);
		if (error == 0)
			vp->v_lazyw = 0;
		else if (!RB_EMPTY(&vp->v_rbdirty_tree))
			vn_syncer_add_to_worklist(vp, 1);
		error = 0;
		break;
	case MNT_NOWAIT:
		/*
		 * Asynchronous.  Do a data-only pass and a meta-only pass.
		 */
		info.syncdeps = 1;
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
			vfsync_bp, &info);
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_meta_only_cmp,
			vfsync_bp, &info);
		error = 0;
		break;
	default:
		/*
		 * Synchronous.  Do a data-only pass, then a meta-data+data
		 * pass, then additional integrated passes to try to get
		 * all the dependencies flushed.
		 */
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
			vfsync_bp, &info);
		error = vfsync_wait_output(vp, waitoutput);
		if (error == 0) {
			RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
				vfsync_bp, &info);
			error = vfsync_wait_output(vp, waitoutput);
		}
		while (error == 0 && passes > 0 &&
		    !RB_EMPTY(&vp->v_rbdirty_tree)) {
			if (--passes == 0) {
				info.synchronous = 1;
				info.syncdeps = 1;
			}
			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
					vfsync_bp, &info);
			if (error < 0)
				error = -error;
			info.syncdeps = 1;
			if (error == 0)
				error = vfsync_wait_output(vp, waitoutput);
		}
		break;
	}
	crit_exit();
	return(error);
}

static int
vfsync_wait_output(struct vnode *vp,
	int (*waitoutput)(struct vnode *, struct thread *))
{
	int error = 0;

	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, 0, "fsfsn", 0);
	}
	if (waitoutput)
		error = waitoutput(vp, curthread);
	return(error);
}

static int
vfsync_data_only_cmp(struct buf *bp, void *data)
{
	if (bp->b_lblkno < 0)
		return(-1);
	return(0);
}

static int
vfsync_meta_only_cmp(struct buf *bp, void *data)
{
	if (bp->b_lblkno < 0)
		return(0);
	return(1);
}

static int
vfsync_lazy_range_cmp(struct buf *bp, void *data)
{
	struct vfsync_info *info = data;

	if (bp->b_lblkno < info->vp->v_lazyw)
		return(-1);
	return(0);
}
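/*
 * Illustrative sketch (added for exposition, not in the original file):
 * a filesystem's VOP_FSYNC implementation might hand the heavy lifting
 * to vfsync().  Passing lbn = -1 disables the legacy EOF invalidation
 * check, and NULL callbacks mean "no dependency tracking, nothing extra
 * to wait for".  The example_fsync() name is hypothetical.
 */
#if 0
static int
example_fsync(struct vnode *vp, int waitfor)
{
	/* two extra integrated passes before forcing synchronous I/O */
	return (vfsync(vp, waitfor, 2, (daddr_t)-1, NULL, NULL));
}
#endif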
static int
vfsync_bp(struct buf *bp, void *data)
{
	struct vfsync_info *info = data;
	struct vnode *vp = info->vp;
	int error;

	/*
	 * If syncdeps is not set we do not try to write buffers which have
	 * dependencies.
	 */
	if (!info->synchronous && info->syncdeps == 0 && info->checkdef(bp))
		return(0);

	/*
	 * Ignore buffers that we cannot immediately lock.  XXX
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
		return(0);
	if ((bp->b_flags & B_DELWRI) == 0)
		panic("vfsync_bp: buffer not dirty");
	if (vp != bp->b_vp)
		panic("vfsync_bp: buffer vp mismatch");

	/*
	 * B_NEEDCOMMIT (primarily used by NFS) is a state where the buffer
	 * has been written but an additional handshake with the device
	 * is required before we can dispose of the buffer.  We have no idea
	 * how to do this so we have to skip these buffers.
	 */
	if (bp->b_flags & B_NEEDCOMMIT) {
		BUF_UNLOCK(bp);
		return(0);
	}

	/*
	 * (LEGACY FROM UFS, REMOVE WHEN POSSIBLE) - invalidate any dirty
	 * buffers beyond the file EOF.
	 */
	if (info->lbn != (daddr_t)-1 && vp->v_type == VREG &&
	    bp->b_lblkno >= info->lbn) {
		bremfree(bp);
		bp->b_flags |= B_INVAL | B_NOCACHE;
		crit_exit();
		brelse(bp);
		crit_enter();
	}

	if (info->synchronous) {
		/*
		 * Synchronous flushing.  An error may be returned.
		 */
		bremfree(bp);
		crit_exit();
		error = bwrite(bp);
		crit_enter();
	} else {
		/*
		 * Asynchronous flushing.  A negative return value simply
		 * stops the scan and is not considered an error.  We use
		 * this to support limited MNT_LAZY flushes.
		 */
		vp->v_lazyw = bp->b_lblkno;
		if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
			BUF_UNLOCK(bp);
			info->lazycount += vfs_bio_awrite(bp);
		} else {
			info->lazycount += bp->b_bufsize;
			bremfree(bp);
			crit_exit();
			bawrite(bp);
			crit_enter();
		}
		if (info->lazylimit && info->lazycount >= info->lazylimit)
			error = 1;
		else
			error = 0;
	}
	return(-error);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{
	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	bp->b_dev = vn_todev(vp);

	/*
	 * Insert onto list for new vnode.
	 */
	crit_enter();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp))
		panic("reassignbuf: dup lblk vp %p bp %p", vp, bp);
	crit_exit();
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct vnode *vp;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	crit_enter();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
		else
			buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	crit_exit();
	bp->b_vp = NULL;
	vdrop(vp);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer, i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(struct vnode *vp, struct buf *bp)
{
	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(struct buf *bp)
{
	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

	bp->b_vp = NULL;
	bp->b_flags &= ~B_PAGING;
}

void
pbreassignbuf(struct buf *bp, struct vnode *newvp)
{
	if ((bp->b_flags & B_PAGING) == 0)
		panic("pbreassignbuf() on non phys bp %p", bp);
	bp->b_vp = newvp;
}
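/*
 * Illustrative sketch (added for exposition, not in the original file):
 * bgetvp()/brelvp() bracket a buffer's association with a vnode.  A
 * hypothetical buffer-setup path might look like this; allocation and
 * error handling are elided.
 */
#if 0
static void
example_attach_buf(struct vnode *vp, struct buf *bp, daddr_t lblkno)
{
	bp->b_lblkno = lblkno;	/* key for the per-vnode RB tree */
	bgetvp(vp, bp);		/* holds the vnode, inserts on clean tree */
	/* ... perform I/O on bp ... */
	brelvp(bp);		/* removes from the tree, drops the hold */
}
#endif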
/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(struct buf *bp, struct vnode *newvp)
{
	int delay;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	crit_enter();

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbdirty_tree, bp);
		else
			buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbclean_tree, bp);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}

	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
			case VBLK:
				if (newvp->v_rdev &&
				    newvp->v_rdev->si_mountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		if (buf_rb_tree_RB_INSERT(&newvp->v_rbdirty_tree, bp))
			panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp);
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		if (buf_rb_tree_RB_INSERT(&newvp->v_rbclean_tree, bp))
			panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp);
		if ((newvp->v_flag & VONWORKLST) &&
		    RB_EMPTY(&newvp->v_rbdirty_tree)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	crit_exit();
}

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev_t dev, struct vnode **vpp)
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getspecialvnode(VT_NON, NULL, &spec_vnode_vops, &nvp, 0, 0);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	vp->v_udev = dev->si_udev;
	vx_unlock(vp);
	*vpp = vp;
	return (0);
}

int
v_associate_rdev(struct vnode *vp, dev_t dev)
{
	lwkt_tokref ilock;

	if (dev == NULL || dev == NODEV)
		return(ENXIO);
	if (dev_is_good(dev) == 0)
		return(ENXIO);
	KKASSERT(vp->v_rdev == NULL);
	if (dev_ref_debug)
		printf("Z1");
	vp->v_rdev = reference_dev(dev);
	lwkt_gettoken(&ilock, &spechash_token);
	SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_specnext);
	lwkt_reltoken(&ilock);
	return(0);
}

void
v_release_rdev(struct vnode *vp)
{
	lwkt_tokref ilock;
	dev_t dev;

	if ((dev = vp->v_rdev) != NULL) {
		lwkt_gettoken(&ilock, &spechash_token);
		SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_specnext);
		if (dev_ref_debug && vp->v_opencount != 0) {
			printf("releasing rdev with non-0 "
			    "v_opencount(%d) (revoked?)\n",
			    vp->v_opencount);
		}
		vp->v_rdev = NULL;
		vp->v_opencount = 0;
		release_dev(dev);
		lwkt_reltoken(&ilock);
	}
}
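/*
 * Illustrative sketch (added for exposition, not in the original file):
 * early root-mount code uses bdevvp() to manufacture a vnode for the
 * root block device before any filesystem is mounted.  rootdev is the
 * kernel's root device global; example_get_rootvp() is hypothetical.
 */
#if 0
static int
example_get_rootvp(struct vnode **vpp)
{
	if (bdevvp(rootdev, vpp))
		panic("example_get_rootvp: cannot alloc root vnode");
	return (0);
}
#endif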
/*
 * Add a vnode to the alias list hung off the dev_t.  We only associate
 * the device number with the vnode.  The actual device is not associated
 * until the vnode is opened (usually in spec_open()), and will be
 * disassociated on last close.
 */
void
addaliasu(struct vnode *nvp, udev_t nvp_udev)
{
	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	nvp->v_udev = nvp_udev;
}

/*
 * Disassociate a vnode from its underlying filesystem.
 *
 * The vnode must be VX locked and refd.
 *
 * If there are v_usecount references to the vnode other than ours we have
 * to VOP_CLOSE the vnode before we can deactivate and reclaim it.
 */
void
vclean(struct vnode *vp, int flags, struct thread *td)
{
	int active;
	int retflags = 0;

	/*
	 * If the vnode has already been reclaimed we have nothing to do.
	 */
	if (vp->v_flag & VRECLAIMED)
		return;
	vp->v_flag |= VRECLAIMED;

	/*
	 * Scrap the vfs cache
	 */
	while (cache_inval_vp(vp, 0, &retflags) != 0) {
		printf("Warning: vnode %p clean/cache_resolution race detected\n", vp);
		tsleep(vp, 0, "vclninv", 2);
	}

	/*
	 * Check to see if the vnode is in use.  If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	active = (vp->v_usecount > 1);

	/*
	 * Clean out any buffers associated with the vnode and destroy its
	 * object, if it has one.
	 */
	vinvalbuf(vp, V_SAVE, td, 0, 0);
	VOP_DESTROYVOBJECT(vp);

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  XXX
	 *
	 * Note that neither of these routines unlocks the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, td);
	}

	/*
	 * If the vnode has not been deactivated, deactivate it.
	 */
	if ((vp->v_flag & VINACTIVE) == 0) {
		vp->v_flag |= VINACTIVE;
		VOP_INACTIVE(vp, td);
	}

	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, retflags, td))
		panic("vclean: cannot reclaim");

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_ops = &dead_vnode_vops;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
}
/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 *
 * The vnode must be referenced and vx_lock()'d
 *
 * revoke { struct vnode *a_vp, int a_flags }
 */
int
vop_stdrevoke(struct vop_revoke_args *ap)
{
	struct vnode *vp, *vq;
	lwkt_tokref ilock;
	dev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;

	/*
	 * If the vnode is already dead don't try to revoke it
	 */
	if (vp->v_flag & VRECLAIMED)
		return (0);

	/*
	 * If the vnode has a device association, scrap all vnodes associated
	 * with the device.  Don't let the device disappear on us while we
	 * are scrapping the vnodes.
	 *
	 * The passed vp will probably show up in the list, do not VX lock
	 * it twice!
	 */
	if (vp->v_type != VCHR && vp->v_type != VBLK)
		return(0);
	if ((dev = vp->v_rdev) == NULL) {
		if ((dev = udev2dev(vp->v_udev, vp->v_type == VBLK)) == NODEV)
			return(0);
	}
	reference_dev(dev);
	lwkt_gettoken(&ilock, &spechash_token);
	while ((vq = SLIST_FIRST(&dev->si_hlist)) != NULL) {
		if (vp == vq || vx_get(vq) == 0) {
			if (vq == SLIST_FIRST(&dev->si_hlist))
				vgone(vq);
			if (vp != vq)
				vx_put(vq);
		}
	}
	lwkt_reltoken(&ilock);
	release_dev(dev);
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 *
 * Returns 1 if we were successfully able to recycle the vnode,
 * 0 otherwise.
 */
int
vrecycle(struct vnode *vp, struct thread *td)
{
	if (vp->v_usecount == 1) {
		vgone(vp);
		return (1);
	}
	return (0);
}

/*
 * Eliminate all activity associated with a vnode in preparation for reuse.
 *
 * The vnode must be VX locked and refd and will remain VX locked and refd
 * on return.  This routine may be called with the vnode in any state, as
 * long as it is VX locked.  The vnode will be cleaned out and marked
 * VRECLAIMED but will not actually be reused until all existing refs and
 * holds go away.
 *
 * NOTE: This routine may be called on a vnode which has not yet been
 * deactivated (VOP_INACTIVE), or on a vnode which has already been
 * reclaimed.
 *
 * This routine is not responsible for placing us back on the freelist.
 * Instead, it happens automatically when the caller releases the VX lock
 * (assuming there aren't any other references).
 */
void
vgone(struct vnode *vp)
{
	/*
	 * Assert that the VX lock is held.  This is an absolute requirement
	 * now for vgone() to be called.
	 */
	KKASSERT(vp->v_lock.lk_exclusivecount == 1);

	/*
	 * Clean out the filesystem specific data and set the VRECLAIMED
	 * bit.  Also deactivate the vnode if necessary.
	 */
	vclean(vp, DOCLOSE, curthread);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, NULL);

	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.  This should normally only occur if a vnode is
	 * being revoked as the device should otherwise have been released
	 * naturally.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
		v_release_rdev(vp);
	}

	/*
	 * Set us to VBAD
	 */
	vp->v_type = VBAD;
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
{
	lwkt_tokref ilock;
	struct vnode *vp;

	lwkt_gettoken(&ilock, &spechash_token);
	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
		if (type == vp->v_type) {
			*vpp = vp;
			lwkt_reltoken(&ilock);
			return (1);
		}
	}
	lwkt_reltoken(&ilock);
	return (0);
}
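/*
 * Illustrative sketch (added for exposition, not in the original file):
 * vfinddev() answers "is there already a vnode aliasing this device?".
 * It returns 1 on success and fills in *vpp without adding a reference,
 * so the result is only a hint unless the caller adds its own ref.
 * example_dev_in_use() is hypothetical.
 */
#if 0
static int
example_dev_in_use(dev_t dev)
{
	struct vnode *vp;

	if (vfinddev(dev, VCHR, &vp))
		return (vp->v_usecount > 0);
	return (0);
}
#endif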
/*
 * Calculate the total number of references to a special device.  This
 * routine may only be called for VBLK and VCHR vnodes since v_rdev is
 * an overloaded field.  Since udev2dev can now return NODEV, we have
 * to check for a NULL v_rdev.
 */
int
count_dev(dev_t dev)
{
	lwkt_tokref ilock;
	struct vnode *vp;
	int count = 0;

	if (SLIST_FIRST(&dev->si_hlist)) {
		lwkt_gettoken(&ilock, &spechash_token);
		SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
			count += vp->v_usecount;
		}
		lwkt_reltoken(&ilock);
	}
	return(count);
}

int
count_udev(udev_t udev)
{
	dev_t dev;

	if ((dev = udev2dev(udev, 0)) == NODEV)
		return(0);
	return(count_dev(dev));
}

int
vcount(struct vnode *vp)
{
	if (vp->v_rdev == NULL)
		return(0);
	return(count_dev(vp->v_rdev));
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(char *label, struct vnode *vp)
{
	char buf[96];

	if (label != NULL)
		printf("%s: %p: ", label, (void *)vp);
	else
		printf("%p: ", (void *)vp);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
#include <ddb/ddb.h>

static int db_show_locked_vnodes(struct mount *mp, void *data);

/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	printf("Locked vnodes\n");
	mountlist_scan(db_show_locked_vnodes, NULL,
	    MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
}

static int
db_show_locked_vnodes(struct mount *mp, void *data __unused)
{
	struct vnode *vp;

	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
		if (VOP_ISLOCKED(vp, NULL))
			vprint((char *)0, vp);
	}
	return(0);
}
#endif
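/*
 * Illustrative sketch (added for exposition, not in the original file):
 * vprint() is the standard debugging dump for a vnode; the DDB command
 * above calls it for every locked vnode.  Ad hoc debugging code might
 * invoke it directly, as in this hypothetical check:
 */
#if 0
static void
example_debug_vnode(struct vnode *vp)
{
	if (vp->v_usecount < 0)
		vprint("bogus usecount", vp);
}
#endif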
/*
 * Top level filesystem related information gathering.
 */
static int	sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);

static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;

#if 1 || defined(COMPAT_PRELITE2)
	/*
	 * Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC.
	 */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

#ifdef notyet
	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
#endif
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	}
	return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	"Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		bzero(&ovfs, sizeof(ovfs));
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error)
			return error;
	}
	return 0;
}

#endif /* 1 || COMPAT_PRELITE2 */

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(struct vnode *vp)
{
	dev_t dev;

	if ((dev = vp->v_rdev) == NULL)
		dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
	if (dev != NODEV && dev->si_mountpoint)
		return (EBUSY);
	return (0);
}

/*
 * Unmount all filesystems.  The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
static int vfs_umountall_callback(struct mount *mp, void *data);

void
vfs_unmountall(void)
{
	struct thread *td = curthread;
	int count;

	if (td->td_proc == NULL)
		td = initproc->p_thread;	/* XXX XXX use proc0 instead? */

	do {
		count = mountlist_scan(vfs_umountall_callback,
		    &td, MNTSCAN_REVERSE|MNTSCAN_NOBUSY);
	} while (count);
}

static int
vfs_umountall_callback(struct mount *mp, void *data)
{
	struct thread *td = *(struct thread **)data;
	int error;

	error = dounmount(mp, MNT_FORCE, td);
	if (error) {
		mountlist_remove(mp);
		printf("unmount of filesystem mounted from %s failed (",
		    mp->mnt_stat.f_mntfromname);
		if (error == EBUSY)
			printf("BUSY)\n");
		else
			printf("%d)\n", error);
	}
	return(1);
}
/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
	struct export_args *argp)
{
	struct netcred *np;
	struct radix_node_head *rnh;
	int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}

	if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN)
		return (EINVAL);
	if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN)
		return (EINVAL);

	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not
		 * used; do so on demand here.
		 */
		SLIST_FOREACH(dom, &domains, dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((char *) saddr, (char *) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
static int
vfs_free_netcred(struct radix_node *rn, void *w)
{
	struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}
/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(struct netexport *nep)
{
	int i;
	struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(struct mount *mp, struct netexport *nep, struct export_args *argp)
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055).
 */
int
vfs_setpublicfs(struct mount *mp, struct netexport *nep,
	struct export_args *argp)
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info; the FS is
	 * no longer exported.  May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		int namelen;

		error = vn_get_namelen(rvp, &namelen);
		if (error)
			return (error);
		MALLOC(nfs_pub.np_index, char *, namelen, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    namelen, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}
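/*
 * Illustrative sketch (added for exposition, not in the original file):
 * a filesystem's VFS_MOUNT update path typically forwards the export
 * information from userland straight to vfs_export().  The helper and
 * the idea of caller-supplied netexport storage are assumptions here.
 */
#if 0
static int
example_update_exports(struct mount *mp, struct netexport *nep,
	struct export_args *argp)
{
	/* MNT_DELEXPORT in argp->ex_flags tears the exports down again */
	return (vfs_export(mp, nep, argp));
}
#endif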
struct netcred *
vfs_export_lookup(struct mount *mp, struct netexport *nep,
	struct sockaddr *nam)
{
	struct netcred *np;
	struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
				    (*rnh->rnh_matchaddr)((char *)saddr, rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Perform msync on all vnodes under a mount point.  The mount point must
 * be locked.  This code is also responsible for lazy-freeing unreferenced
 * vnodes whose VM objects no longer contain pages.
 *
 * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.
 */
static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data);
static int vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data);

void
vfs_msync(struct mount *mp, int flags)
{
	vmntvnodescan(mp, VMSC_REFVP, vfs_msync_scan1, vfs_msync_scan2,
	    (void *)flags);
}

/*
 * scan1 is a fast pre-check.  There could be hundreds of thousands of
 * vnodes, so we cannot afford to do anything heavy-weight until we have
 * a fairly good indication that there is work to do.
 */
static int
vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data)
{
	int flags = (int)data;

	if ((vp->v_flag & VRECLAIMED) == 0) {
		if (vshouldfree(vp, 0))
			return(0);	/* call scan2 */
		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
		    (vp->v_flag & VOBJDIRTY) &&
		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
			return(0);	/* call scan2 */
		}
	}

	/*
	 * do not call scan2, continue the loop
	 */
	return(-1);
}

static int
vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data)
{
	vm_object_t obj;
	int flags = (int)data;

	if (vp->v_flag & VRECLAIMED)
		return(0);

	if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
	    (vp->v_flag & VOBJDIRTY) &&
	    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
		if (VOP_GETVOBJECT(vp, &obj) == 0) {
			vm_object_page_clean(obj, 0, 0,
			    flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
		}
	}
	return(0);
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(struct vnode *vp, struct thread *td)
{
	return (VOP_CREATEVOBJECT(vp, td));
}
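/*
 * Illustrative sketch (added for exposition, not in the original file):
 * a periodic syncer would call vfs_msync(mp, MNT_NOWAIT) to push dirty
 * mmap()ed pages opportunistically, while unmount would use MNT_WAIT so
 * nothing is left behind.  example_sync_pass() is hypothetical.
 */
#if 0
static void
example_sync_pass(struct mount *mp, int final)
{
	vfs_msync(mp, final ? MNT_WAIT : MNT_NOWAIT);
}
#endif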
/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		lwkt_reltoken(&ilock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(td, &vp->v_pollinfo.vpi_selinfo);
	lwkt_reltoken(&ilock);
	return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(struct vnode *vp, int events)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	lwkt_reltoken(&ilock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(struct vnode *vp)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	lwkt_reltoken(&ilock);
}

/*
 * Extract the dev_t from a VBLK or VCHR.  The vnode must have been opened
 * (or v_rdev might be NULL).
 */
dev_t
vn_todev(struct vnode *vp)
{
	if (vp->v_type != VBLK && vp->v_type != VCHR)
		return (NODEV);
	KKASSERT(vp->v_rdev != NULL);
	return (vp->v_rdev);
}
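/*
 * Illustrative sketch (added for exposition, not in the original file):
 * a filesystem's write path would notify pollers via the VN_POLLEVENT
 * macro or vn_pollevent() once data becomes readable.  The helper name
 * and the specific poll flags (from <sys/poll.h>) are assumptions.
 */
#if 0
static void
example_note_write(struct vnode *vp)
{
	/* wake anyone select()ing or poll()ing for readability */
	vn_pollevent(vp, POLLIN | POLLRDNORM);
}
#endif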
/*
 * Check if vnode represents a disk device.  The vnode does not need to be
 * opened.
 */
int
vn_isdisk(struct vnode *vp, int *errp)
{
	dev_t dev;

	if (vp->v_type != VBLK && vp->v_type != VCHR) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}

	if ((dev = vp->v_rdev) == NULL)
		dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
	if (dev == NULL || dev == NODEV) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if (dev_is_good(dev) == 0) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if ((dev_dflags(dev) & D_DISK) == 0) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (errp != NULL)
		*errp = 0;
	return (1);
}

#ifdef DEBUG_VFS_LOCKS

void
assert_vop_locked(struct vnode *vp, const char *str)
{
	if (vp && IS_LOCKING_VFS(vp) && !VOP_ISLOCKED(vp, NULL)) {
		panic("%s: %p is not locked shared but should be", str, vp);
	}
}

void
assert_vop_unlocked(struct vnode *vp, const char *str)
{
	if (vp && IS_LOCKING_VFS(vp)) {
		if (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) {
			panic("%s: %p is locked but should not be", str, vp);
		}
	}
}

#endif

int
vn_get_namelen(struct vnode *vp, int *namelen)
{
	int error, retval[2];

	error = VOP_PATHCONF(vp, _PC_NAME_MAX, retval);
	if (error)
		return (error);
	*namelen = *retval;
	return (0);
}

int
vop_write_dirent(int *error, struct uio *uio, ino_t d_ino, uint8_t d_type,
	uint16_t d_namlen, const char *d_name)
{
	struct dirent *dp;
	size_t len;

	len = _DIRENT_RECLEN(d_namlen);
	if (len > uio->uio_resid)
		return(1);

	dp = malloc(len, M_TEMP, M_WAITOK | M_ZERO);

	dp->d_ino = d_ino;
	dp->d_namlen = d_namlen;
	dp->d_type = d_type;
	bcopy(d_name, dp->d_name, d_namlen);

	*error = uiomove((caddr_t)dp, len, uio);

	free(dp, M_TEMP);

	return(0);
}
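/*
 * Illustrative sketch (added for exposition, not in the original file):
 * a VOP_READDIR implementation emits entries through vop_write_dirent().
 * A return of 1 means the uio ran out of room, so the directory scan
 * should stop and resume later.  The helper name and entry values are
 * hypothetical; DT_DIR comes from <sys/dirent.h>, already included.
 */
#if 0
static int
example_emit_dot(struct uio *uio, ino_t ino)
{
	int error;

	if (vop_write_dirent(&error, uio, ino, DT_DIR, 1, "."))
		return (EINVAL);	/* no room; caller should stop */
	return (error);
}
#endif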