/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
 * $DragonFly: src/sys/kern/vfs_subr.c,v 1.66 2006/02/17 19:18:06 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

int numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
int vfs_fastdev = 1;
SYSCTL_INT(_vfs, OID_AUTO, fastdev, CTLFLAG_RW, &vfs_fastdev, 0, "");

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW,
	&reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW,
	&reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW,
	&reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW,
	&reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW,
	&reassignbufmethod, 0, "");

int nfs_mount_type = -1;
static struct lwkt_token spechash_token;
struct nfs_public nfs_pub;	/* publicly exported FS */

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
	&desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist (struct netexport *nep);
static int	vfs_free_netcred (struct radix_node *rn, void *w);
static int	vfs_hang_addrlist (struct mount *mp, struct netexport *nep,
				struct export_args *argp);

extern int dev_ref_debug;
extern struct vnodeopv_entry_desc spec_vnodeop_entries[];

/*
 * Red black tree functions
 */
static int rb_buf_compare(struct buf *b1, struct buf *b2);
RB_GENERATE(buf_rb_tree, buf, b_rbnode, rb_buf_compare);

static int
rb_buf_compare(struct buf *b1, struct buf *b2)
{
	if (b1->b_lblkno < b2->b_lblkno)
		return(-1);
	if (b1->b_lblkno > b2->b_lblkno)
		return(1);
	return(0);
}
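
/*
 * Note (added commentary): rb_buf_compare() above is the three-way
 * comparator consumed by RB_GENERATE(); buffers within a vnode's RB
 * trees are ordered strictly by logical block number (b_lblkno).  The
 * RB_SCAN() calls throughout this file follow the same convention for
 * their range-compare callbacks: a negative return means "left of the
 * range" (skip), zero means "inside the range" (invoke the callback),
 * and a positive return means "right of the range" (prune).
 *
 * Illustrative sketch, with hypothetical names, of a scan restricted
 * to the data buffers of a vnode:
 *
 *	error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
 *			vfsync_data_only_cmp, my_callback, &my_info);
 */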

/*
 * Return 0 if the vnode is already on the free list or cannot be placed
 * on the free list.  Return 1 if the vnode can be placed on the free list.
 */
static __inline int
vshouldfree(struct vnode *vp, int usecount)
{
	if (vp->v_flag & VFREE)
		return (0);		/* already free */
	if (vp->v_holdcnt != 0 || vp->v_usecount != usecount)
		return (0);		/* other holders */
	if (vp->v_object &&
	    (vp->v_object->ref_count || vp->v_object->resident_page_count)) {
		return (0);
	}
	return (1);
}

/*
 * Initialize the vnode management data structures.
 *
 * Called from vfsinit()
 */
void
vfs_subr_init(void)
{
	/*
	 * Desired vnodes is a result of the physical page count
	 * and the size of the kernel's heap.  It scales in proportion
	 * to the amount of available physical memory.  This can
	 * cause trouble on 64-bit and large memory platforms.
	 */
	/* desiredvnodes = maxproc + vmstats.v_page_count / 4; */
	desiredvnodes =
		min(maxproc + vmstats.v_page_count / 4,
		    2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
		    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));

	lwkt_token_init(&spechash_token);
}

/*
 * Knob to control the precision of file timestamps:
 *
 *   0 = seconds only; nanoseconds zeroed.
 *   1 = seconds and nanoseconds, accurate within 1/HZ.
 *   2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
	&timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{
	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
	vap->va_fsmid = VNOVAL;
}
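
/*
 * Illustrative use of vattr_null() (added sketch, not part of the
 * original sources; the setattr argument list follows this era's vnode
 * interface): a caller changing a single attribute clears the whole
 * block first so every untouched field reads VNOVAL and is ignored by
 * the filesystem's setattr code:
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);
 *	va.va_size = 0;			(all other fields remain VNOVAL)
 *	error = VOP_SETATTR(vp, &va, cred, td);
 */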

/*
 * Flush out and invalidate all buffers associated with a vnode.
 *
 * vp must be locked.
 */
static int vinvalbuf_bp(struct buf *bp, void *data);

struct vinvalbuf_bp_info {
	struct vnode *vp;
	int slptimeo;
	int slpflag;
	int flags;
};

int
vinvalbuf(struct vnode *vp, int flags, struct thread *td,
	int slpflag, int slptimeo)
{
	struct vinvalbuf_bp_info info;
	int error;
	vm_object_t object;

	/*
	 * If we are being asked to save, call fsync to ensure that the inode
	 * is updated.
	 */
	if (flags & V_SAVE) {
		crit_enter();
		while (vp->v_track_write.bk_active) {
			vp->v_track_write.bk_waitflag = 1;
			error = tsleep(&vp->v_track_write, slpflag,
					"vinvlbuf", slptimeo);
			if (error) {
				crit_exit();
				return (error);
			}
		}
		if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
			crit_exit();
			if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0)
				return (error);
			crit_enter();
			if (vp->v_track_write.bk_active > 0 ||
			    !RB_EMPTY(&vp->v_rbdirty_tree))
				panic("vinvalbuf: dirty bufs");
		}
		crit_exit();
	}
	crit_enter();
	info.slptimeo = slptimeo;
	info.slpflag = slpflag;
	info.flags = flags;
	info.vp = vp;

	/*
	 * Flush the buffer cache until nothing is left.
	 */
	while (!RB_EMPTY(&vp->v_rbclean_tree) ||
	    !RB_EMPTY(&vp->v_rbdirty_tree)) {
		error = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, NULL,
				vinvalbuf_bp, &info);
		if (error == 0) {
			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
					vinvalbuf_bp, &info);
		}
	}

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		while (vp->v_track_write.bk_active > 0) {
			vp->v_track_write.bk_waitflag = 1;
			tsleep(&vp->v_track_write, 0, "vnvlbv", 0);
		}
		if (VOP_GETVOBJECT(vp, &object) == 0) {
			while (object->paging_in_progress)
				vm_object_pip_sleep(object, "vnvlbx");
		}
	} while (vp->v_track_write.bk_active > 0);

	crit_exit();

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if (VOP_GETVOBJECT(vp, &object) == 0) {
		vm_object_page_remove(object, 0, 0,
			(flags & V_SAVE) ? TRUE : FALSE);
	}

	if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree))
		panic("vinvalbuf: flush failed");
	return (0);
}

static int
vinvalbuf_bp(struct buf *bp, void *data)
{
	struct vinvalbuf_bp_info *info = data;
	int error;

	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		error = BUF_TIMELOCK(bp,
				LK_EXCLUSIVE | LK_SLEEPFAIL,
				"vinvalbuf", info->slpflag, info->slptimeo);
		if (error == 0) {
			BUF_UNLOCK(bp);
			error = ENOLCK;
		}
		if (error == ENOLCK)
			return(0);
		return (-error);
	}
	/*
	 * XXX Since there are no node locks for NFS, I
	 * believe there is a slight chance that a delayed
	 * write will occur while sleeping just above, so
	 * check for it.  Note that vfs_bio_awrite expects
	 * buffers to reside on a queue, while VOP_BWRITE and
	 * brelse do not.
	 */
	if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
	    (info->flags & V_SAVE)) {
		if (bp->b_vp == info->vp) {
			if (bp->b_flags & B_CLUSTEROK) {
				BUF_UNLOCK(bp);
				vfs_bio_awrite(bp);
			} else {
				bremfree(bp);
				bp->b_flags |= B_ASYNC;
				VOP_BWRITE(bp->b_vp, bp);
			}
		} else {
			bremfree(bp);
			VOP_BWRITE(bp->b_vp, bp);
		}
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
	}
	return(0);
}
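
/*
 * Typical invocation (added commentary; see vclean() later in this
 * file): flushing everything while preserving dirty data is done with
 *
 *	vinvalbuf(vp, V_SAVE, td, 0, 0);
 *
 * With flags == 0, dirty buffers are discarded rather than written.
 */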

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * The vnode must be locked.
 */
static int vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
static int vtruncbuf_bp_trunc(struct buf *bp, void *data);
static int vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
static int vtruncbuf_bp_metasync(struct buf *bp, void *data);

int
vtruncbuf(struct vnode *vp, struct thread *td, off_t length, int blksize)
{
	daddr_t trunclbn;
	int count;

	/*
	 * Round up to the *next* lbn, then destroy the buffers in question.
	 * Since we are only removing some of the buffers we must rely on the
	 * scan count to determine whether a loop is necessary.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	crit_enter();
	do {
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &trunclbn);
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &trunclbn);
	} while(count);

	/*
	 * For safety, fsync any remaining metadata if the file is not being
	 * truncated to 0.  Since the metadata does not represent the entire
	 * dirty list we have to rely on the hit count to ensure that we get
	 * all of it.
	 */
	if (length > 0) {
		do {
			count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
					vtruncbuf_bp_metasync_cmp,
					vtruncbuf_bp_metasync, vp);
		} while (count);
	}

	/*
	 * Wait for any in-progress I/O to complete before returning (why?)
	 */
	while (vp->v_track_write.bk_active > 0) {
		vp->v_track_write.bk_waitflag = 1;
		tsleep(&vp->v_track_write, 0, "vbtrunc", 0);
	}

	crit_exit();

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * The callback buffer is beyond the new file EOF and must be destroyed.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
static
int
vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
{
	if (bp->b_lblkno >= *(daddr_t *)data)
		return(0);
	return(-1);
}

static
int
vtruncbuf_bp_trunc(struct buf *bp, void *data)
{
	/*
	 * Do not try to use a buffer we cannot immediately lock, but sleep
	 * anyway to prevent a livelock.  The code will loop until all buffers
	 * can be acted upon.
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
			BUF_UNLOCK(bp);
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
	}
	return(1);
}
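
/*
 * Added commentary: a filesystem truncate path would typically call
 *
 *	error = vtruncbuf(vp, td, length, fs_blocksize);
 *
 * (fs_blocksize being a hypothetical name for the filesystem's block
 * size).  Buffers whose logical block lies at or beyond the rounded-up
 * truncation point are destroyed by vtruncbuf_bp_trunc(); any remaining
 * metadata (negative lblkno) is then fsync'd by vtruncbuf_bp_metasync()
 * below.
 */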

/*
 * Fsync all meta-data after truncating a file to be non-zero.  Only metadata
 * blocks (with a negative lblkno) are scanned.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
static int
vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
{
	if (bp->b_lblkno < 0)
		return(0);
	return(1);
}

static int
vtruncbuf_bp_metasync(struct buf *bp, void *data)
{
	struct vnode *vp = data;

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not try to use a buffer we cannot immediately lock,
		 * but sleep anyway to prevent a livelock.  The code will
		 * loop until all buffers can be acted upon.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
			if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
				BUF_UNLOCK(bp);
		} else {
			bremfree(bp);
			if (bp->b_vp == vp) {
				bp->b_flags |= B_ASYNC;
			} else {
				bp->b_flags &= ~B_ASYNC;
			}
			VOP_BWRITE(bp->b_vp, bp);
		}
		return(1);
	} else {
		return(0);
	}
}

/*
 * vfsync - implements a multipass fsync on a file which understands
 * dependencies and meta-data.  The passed vnode must be locked.  The
 * waitfor argument may be MNT_WAIT, MNT_NOWAIT, or MNT_LAZY.
 *
 * When fsyncing data asynchronously just do one consolidated pass starting
 * with the most negative block number.  This may not get all the data due
 * to dependencies.
 *
 * When fsyncing data synchronously do a data pass, then a metadata pass,
 * then do additional data+metadata passes to try to get all the data out.
 */
static int vfsync_wait_output(struct vnode *vp,
		int (*waitoutput)(struct vnode *, struct thread *));
static int vfsync_data_only_cmp(struct buf *bp, void *data);
static int vfsync_meta_only_cmp(struct buf *bp, void *data);
static int vfsync_lazy_range_cmp(struct buf *bp, void *data);
static int vfsync_bp(struct buf *bp, void *data);

struct vfsync_info {
	struct vnode *vp;
	int synchronous;
	int syncdeps;
	int lazycount;
	int lazylimit;
	daddr_t lbn;
	int (*checkdef)(struct buf *);
};
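
/*
 * Orientation note (added commentary, not from the original sources):
 * checkdef is the filesystem-supplied "does this buffer still have a
 * dependency" predicate; when it is NULL, vfsync() sets syncdeps and
 * writes buffers regardless of dependencies.  waitoutput, likewise
 * filesystem-supplied, is called to wait for output to drain.
 * lazylimit/lazycount bound the number of bytes flushed in the
 * MNT_LAZY case.
 */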

int
vfsync(struct vnode *vp, int waitfor, int passes, daddr_t lbn,
	int (*checkdef)(struct buf *),
	int (*waitoutput)(struct vnode *, struct thread *))
{
	struct vfsync_info info;
	int error;

	bzero(&info, sizeof(info));
	info.vp = vp;
	info.lbn = lbn;
	if ((info.checkdef = checkdef) == NULL)
		info.syncdeps = 1;

	crit_enter();

	switch(waitfor) {
	case MNT_LAZY:
		/*
		 * Lazy (filesystem syncer typical).  Asynchronous, plus limit
		 * the number of data (not meta) pages we try to flush to 1MB.
		 * A non-zero return means that the lazy limit was reached.
		 */
		info.lazylimit = 1024 * 1024;
		info.syncdeps = 1;
		error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				vfsync_lazy_range_cmp, vfsync_bp, &info);
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				vfsync_meta_only_cmp, vfsync_bp, &info);
		if (error == 0)
			vp->v_lazyw = 0;
		else if (!RB_EMPTY(&vp->v_rbdirty_tree))
			vn_syncer_add_to_worklist(vp, 1);
		error = 0;
		break;
	case MNT_NOWAIT:
		/*
		 * Asynchronous.  Do a data-only pass and a meta-only pass.
		 */
		info.syncdeps = 1;
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
			vfsync_bp, &info);
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_meta_only_cmp,
			vfsync_bp, &info);
		error = 0;
		break;
	default:
		/*
		 * Synchronous.  Do a data-only pass, then a meta-data+data
		 * pass, then additional integrated passes to try to get
		 * all the dependencies flushed.
		 */
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
			vfsync_bp, &info);
		error = vfsync_wait_output(vp, waitoutput);
		if (error == 0) {
			RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
				vfsync_bp, &info);
			error = vfsync_wait_output(vp, waitoutput);
		}
		while (error == 0 && passes > 0 &&
		    !RB_EMPTY(&vp->v_rbdirty_tree)) {
			if (--passes == 0) {
				info.synchronous = 1;
				info.syncdeps = 1;
			}
			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
				vfsync_bp, &info);
			if (error < 0)
				error = -error;
			info.syncdeps = 1;
			if (error == 0)
				error = vfsync_wait_output(vp, waitoutput);
		}
		break;
	}
	crit_exit();
	return(error);
}

static int
vfsync_wait_output(struct vnode *vp, int (*waitoutput)(struct vnode *, struct thread *))
{
	int error = 0;

	while (vp->v_track_write.bk_active) {
		vp->v_track_write.bk_waitflag = 1;
		tsleep(&vp->v_track_write, 0, "fsfsn", 0);
	}
	if (waitoutput)
		error = waitoutput(vp, curthread);
	return(error);
}
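
/*
 * Illustrative call (added sketch; the myfs_* names are hypothetical):
 * a filesystem's VOP_FSYNC implementation might do
 *
 *	error = vfsync(vp, waitfor, 3, (daddr_t)-1,
 *		       myfs_checkdeferred, myfs_waitoutput);
 *
 * i.e. up to three extra integrated passes, no EOF-based invalidation
 * (lbn == -1), with filesystem-specific dependency and wait callbacks.
 */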

static int
vfsync_data_only_cmp(struct buf *bp, void *data)
{
	if (bp->b_lblkno < 0)
		return(-1);
	return(0);
}

static int
vfsync_meta_only_cmp(struct buf *bp, void *data)
{
	if (bp->b_lblkno < 0)
		return(0);
	return(1);
}

static int
vfsync_lazy_range_cmp(struct buf *bp, void *data)
{
	struct vfsync_info *info = data;

	if (bp->b_lblkno < info->vp->v_lazyw)
		return(-1);
	return(0);
}

static int
vfsync_bp(struct buf *bp, void *data)
{
	struct vfsync_info *info = data;
	struct vnode *vp = info->vp;
	int error;

	/*
	 * If syncdeps is not set we do not try to write buffers which have
	 * dependencies.
	 */
	if (!info->synchronous && info->syncdeps == 0 && info->checkdef(bp))
		return(0);

	/*
	 * Ignore buffers that we cannot immediately lock.  XXX
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
		return(0);
	if ((bp->b_flags & B_DELWRI) == 0)
		panic("vfsync_bp: buffer not dirty");
	if (vp != bp->b_vp)
		panic("vfsync_bp: buffer vp mismatch");

	/*
	 * B_NEEDCOMMIT (primarily used by NFS) is a state where the buffer
	 * has been written but an additional handshake with the device
	 * is required before we can dispose of the buffer.  We have no idea
	 * how to do this so we have to skip these buffers.
	 */
	if (bp->b_flags & B_NEEDCOMMIT) {
		BUF_UNLOCK(bp);
		return(0);
	}

	/*
	 * (LEGACY FROM UFS, REMOVE WHEN POSSIBLE) - invalidate any dirty
	 * buffers beyond the file EOF.
	 */
	if (info->lbn != (daddr_t)-1 && vp->v_type == VREG &&
	    bp->b_lblkno >= info->lbn) {
		bremfree(bp);
		bp->b_flags |= B_INVAL | B_NOCACHE;
		crit_exit();
		brelse(bp);
		crit_enter();
	}

	if (info->synchronous) {
		/*
		 * Synchronous flushing.  An error may be returned.
		 */
		bremfree(bp);
		crit_exit();
		error = bwrite(bp);
		crit_enter();
	} else {
		/*
		 * Asynchronous flushing.  A negative return value simply
		 * stops the scan and is not considered an error.  We use
		 * this to support limited MNT_LAZY flushes.
		 */
		vp->v_lazyw = bp->b_lblkno;
		if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
			BUF_UNLOCK(bp);
			info->lazycount += vfs_bio_awrite(bp);
		} else {
			info->lazycount += bp->b_bufsize;
			bremfree(bp);
			crit_exit();
			bawrite(bp);
			crit_enter();
		}
		if (info->lazylimit && info->lazycount >= info->lazylimit)
			error = 1;
		else
			error = 0;
	}
	return(-error);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{
	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	/*
	 * Insert onto list for new vnode.
	 */
	crit_enter();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp))
		panic("reassignbuf: dup lblk vp %p bp %p", vp, bp);
	crit_exit();
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct vnode *vp;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	crit_enter();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
		else
			buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	crit_exit();
	bp->b_vp = NULL;
	vdrop(vp);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(struct vnode *vp, struct buf *bp)
{
	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(struct buf *bp)
{
	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

	bp->b_vp = NULL;
	bp->b_flags &= ~B_PAGING;
}

/*
 * Reassign a p-buffer to a new vnode; only valid for B_PAGING buffers.
 */
void
pbreassignbuf(struct buf *bp, struct vnode *newvp)
{
	if ((bp->b_flags & B_PAGING) == 0)
		panic("pbreassignbuf() on non phys bp %p", bp);
	bp->b_vp = newvp;
}
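
/*
 * Note (added commentary): p-buffers are the pager's own buffers.  They
 * carry B_PAGING, never appear in the vnode's clean/dirty RB trees, and
 * do not hold a reference on the vnode, which is why pbgetvp()/pbrelvp()
 * are trivial compared to bgetvp()/brelvp() above.
 */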

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(struct buf *bp, struct vnode *newvp)
{
	int delay;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	crit_enter();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbdirty_tree, bp);
		else
			buf_rb_tree_RB_REMOVE(&bp->b_vp->v_rbclean_tree, bp);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
			case VBLK:
				if (newvp->v_rdev &&
				    newvp->v_rdev->si_mountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		if (buf_rb_tree_RB_INSERT(&newvp->v_rbdirty_tree, bp))
			panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp);
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		if (buf_rb_tree_RB_INSERT(&newvp->v_rbclean_tree, bp))
			panic("reassignbuf: dup lblk vp %p bp %p", newvp, bp);
		if ((newvp->v_flag & VONWORKLST) &&
		    RB_EMPTY(&newvp->v_rbdirty_tree)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	crit_exit();
}

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev_t dev, struct vnode **vpp)
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getspecialvnode(VT_NON, NULL, &spec_vnode_vops, &nvp, 0, 0);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	vp->v_udev = dev->si_udev;
	vx_unlock(vp);
	*vpp = vp;
	return (0);
}

int
v_associate_rdev(struct vnode *vp, dev_t dev)
{
	lwkt_tokref ilock;

	if (dev == NULL || dev == NODEV)
		return(ENXIO);
	if (dev_is_good(dev) == 0)
		return(ENXIO);
	KKASSERT(vp->v_rdev == NULL);
	if (dev_ref_debug)
		printf("Z1");
	vp->v_rdev = reference_dev(dev);
	lwkt_gettoken(&ilock, &spechash_token);
	SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_specnext);
	lwkt_reltoken(&ilock);
	return(0);
}

void
v_release_rdev(struct vnode *vp)
{
	lwkt_tokref ilock;
	dev_t dev;

	if ((dev = vp->v_rdev) != NULL) {
		lwkt_gettoken(&ilock, &spechash_token);
		SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_specnext);
		if (dev_ref_debug && vp->v_opencount != 0) {
			printf("releasing rdev with non-0 "
				"v_opencount(%d) (revoked?)\n",
				vp->v_opencount);
		}
		vp->v_rdev = NULL;
		vp->v_opencount = 0;
		release_dev(dev);
		lwkt_reltoken(&ilock);
	}
}
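
/*
 * Illustrative use of bdevvp() (added commentary; rootdev and rootvp
 * are the traditional kernel globals): during root mount the kernel
 * does roughly
 *
 *	if (bdevvp(rootdev, &rootvp))
 *		panic("cannot mount root");
 *
 * yielding an unlocked VCHR vnode whose v_udev identifies the device.
 */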

/*
 * Add a vnode to the alias list hung off the dev_t.  We only associate
 * the device number with the vnode.  The actual device is not associated
 * until the vnode is opened (usually in spec_open()), and will be
 * disassociated on last close.
 */
void
addaliasu(struct vnode *nvp, udev_t nvp_udev)
{
	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	nvp->v_udev = nvp_udev;
}

/*
 * Disassociate a vnode from its underlying filesystem.
 *
 * The vnode must be VX locked and refd
 *
 * If there are v_usecount references to the vnode other than ours we have
 * to VOP_CLOSE the vnode before we can deactivate and reclaim it.
 */
void
vclean(struct vnode *vp, int flags, struct thread *td)
{
	int active;
	int retflags = 0;

	/*
	 * If the vnode has already been reclaimed we have nothing to do.
	 */
	if (vp->v_flag & VRECLAIMED)
		return;
	vp->v_flag |= VRECLAIMED;

	/*
	 * Scrap the vfs cache
	 */
	while (cache_inval_vp(vp, 0, &retflags) != 0) {
		printf("Warning: vnode %p clean/cache_resolution race detected\n", vp);
		tsleep(vp, 0, "vclninv", 2);
	}

	/*
	 * Check to see if the vnode is in use.  If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	active = (vp->v_usecount > 1);

	/*
	 * Clean out any buffers associated with the vnode and destroy its
	 * object, if it has one.
	 */
	vinvalbuf(vp, V_SAVE, td, 0, 0);
	VOP_DESTROYVOBJECT(vp);

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  XXX
	 *
	 * Note that neither of these routines unlocks the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, td);
	}

	/*
	 * If the vnode has not been deactivated, deactivate it.
	 */
	if ((vp->v_flag & VINACTIVE) == 0) {
		vp->v_flag |= VINACTIVE;
		VOP_INACTIVE(vp, td);
	}

	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, retflags, td))
		panic("vclean: cannot reclaim");

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_ops = &dead_vnode_vops;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
}
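
/*
 * Added commentary: within this file vclean() is reached through
 * vgone() below, which asserts the VX lock and then calls
 * vclean(vp, DOCLOSE, curthread); revocation (vop_stdrevoke) in turn
 * funnels every alias of a device vnode through vgone().
 */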

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 *
 * The vnode must be referenced and vx_lock()'d
 *
 * revoke { struct vnode *a_vp, int a_flags }
 */
int
vop_stdrevoke(struct vop_revoke_args *ap)
{
	struct vnode *vp, *vq;
	lwkt_tokref ilock;
	dev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;

	/*
	 * If the vnode is already dead don't try to revoke it
	 */
	if (vp->v_flag & VRECLAIMED)
		return (0);

	/*
	 * If the vnode has a device association, scrap all vnodes associated
	 * with the device.  Don't let the device disappear on us while we
	 * are scrapping the vnodes.
	 *
	 * The passed vp will probably show up in the list, do not VX lock
	 * it twice!
	 */
	if (vp->v_type != VCHR && vp->v_type != VBLK)
		return(0);
	if ((dev = vp->v_rdev) == NULL) {
		if ((dev = udev2dev(vp->v_udev, vp->v_type == VBLK)) == NODEV)
			return(0);
	}
	reference_dev(dev);
	lwkt_gettoken(&ilock, &spechash_token);
	while ((vq = SLIST_FIRST(&dev->si_hlist)) != NULL) {
		if (vp == vq || vx_get(vq) == 0) {
			if (vq == SLIST_FIRST(&dev->si_hlist))
				vgone(vq);
			if (vp != vq)
				vx_put(vq);
		}
	}
	lwkt_reltoken(&ilock);
	release_dev(dev);
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 *
 * Returns 1 if we were successfully able to recycle the vnode,
 * 0 otherwise.
 */
int
vrecycle(struct vnode *vp, struct thread *td)
{
	if (vp->v_usecount == 1) {
		vgone(vp);
		return (1);
	}
	return (0);
}

/*
 * Eliminate all activity associated with a vnode in preparation for reuse.
 *
 * The vnode must be VX locked and refd and will remain VX locked and refd
 * on return.  This routine may be called with the vnode in any state, as
 * long as it is VX locked.  The vnode will be cleaned out and marked
 * VRECLAIMED but will not actually be reused until all existing refs and
 * holds go away.
 *
 * NOTE: This routine may be called on a vnode which has not yet been
 * deactivated (VOP_INACTIVE), or on a vnode which has already been
 * reclaimed.
 *
 * This routine is not responsible for placing us back on the freelist.
 * Instead, it happens automatically when the caller releases the VX lock
 * (assuming there aren't any other references).
 */
void
vgone(struct vnode *vp)
{
	/*
	 * Assert that the VX lock is held.  This is an absolute requirement
	 * now for vgone() to be called.
	 */
	KKASSERT(vp->v_lock.lk_exclusivecount == 1);

	/*
	 * Clean out the filesystem specific data and set the VRECLAIMED
	 * bit.  Also deactivate the vnode if necessary.
	 */
	vclean(vp, DOCLOSE, curthread);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, NULL);

	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.  This should normally only occur if a vnode is
	 * being revoked as the device should otherwise have been released
	 * naturally.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
		v_release_rdev(vp);
	}

	/*
	 * Set us to VBAD
	 */
	vp->v_type = VBAD;
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
{
	lwkt_tokref ilock;
	struct vnode *vp;

	lwkt_gettoken(&ilock, &spechash_token);
	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
		if (type == vp->v_type) {
			*vpp = vp;
			lwkt_reltoken(&ilock);
			return (1);
		}
	}
	lwkt_reltoken(&ilock);
	return (0);
}
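
/*
 * Illustrative use of vfinddev() (added commentary): a caller that
 * needs the vnode currently aliasing a device can probe the spechash
 * list,
 *
 *	struct vnode *vp;
 *
 *	if (vfinddev(dev, VCHR, &vp))
 *		vprint("device vnode", vp);
 *
 * Note the returned vnode is neither referenced nor locked by
 * vfinddev() itself.
 */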

/*
 * Calculate the total number of references to a special device.  This
 * routine may only be called for VBLK and VCHR vnodes since v_rdev is
 * an overloaded field.  Since udev2dev can now return NODEV, we have
 * to check for a NULL v_rdev.
 */
int
count_dev(dev_t dev)
{
	lwkt_tokref ilock;
	struct vnode *vp;
	int count = 0;

	if (SLIST_FIRST(&dev->si_hlist)) {
		lwkt_gettoken(&ilock, &spechash_token);
		SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
			count += vp->v_usecount;
		}
		lwkt_reltoken(&ilock);
	}
	return(count);
}

int
count_udev(udev_t udev)
{
	dev_t dev;

	if ((dev = udev2dev(udev, 0)) == NODEV)
		return(0);
	return(count_dev(dev));
}

int
vcount(struct vnode *vp)
{
	if (vp->v_rdev == NULL)
		return(0);
	return(count_dev(vp->v_rdev));
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(char *label, struct vnode *vp)
{
	char buf[96];

	if (label != NULL)
		printf("%s: %p: ", label, (void *)vp);
	else
		printf("%p: ", (void *)vp);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
#include <ddb/ddb.h>

static int db_show_locked_vnodes(struct mount *mp, void *data);

/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	printf("Locked vnodes\n");
	mountlist_scan(db_show_locked_vnodes, NULL,
		MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
}

static int
db_show_locked_vnodes(struct mount *mp, void *data __unused)
{
	struct vnode *vp;

	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
		if (VOP_ISLOCKED(vp, NULL))
			vprint((char *)0, vp);
	}
	return(0);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int	sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);

static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;

#if 1 || defined(COMPAT_PRELITE2)
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

#ifdef notyet
	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
#endif
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	}
	return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	"Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		bzero(&ovfs, sizeof(ovfs));
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error)
			return error;
	}
	return 0;
}

#endif /* 1 || COMPAT_PRELITE2 */

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(struct vnode *vp)
{
	dev_t dev;

	if ((dev = vp->v_rdev) == NULL)
		dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
	if (dev != NODEV && dev->si_mountpoint)
		return (EBUSY);
	return (0);
}

/*
 * Unmount all filesystems.  The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */

static int vfs_umountall_callback(struct mount *mp, void *data);

void
vfs_unmountall(void)
{
	struct thread *td = curthread;
	int count;

	if (td->td_proc == NULL)
		td = initproc->p_thread;	/* XXX XXX use proc0 instead? */

	do {
		count = mountlist_scan(vfs_umountall_callback,
					&td, MNTSCAN_REVERSE|MNTSCAN_NOBUSY);
	} while (count);
}

static
int
vfs_umountall_callback(struct mount *mp, void *data)
{
	struct thread *td = *(struct thread **)data;
	int error;

	error = dounmount(mp, MNT_FORCE, td);
	if (error) {
		mountlist_remove(mp);
		printf("unmount of filesystem mounted from %s failed (",
			mp->mnt_stat.f_mntfromname);
		if (error == EBUSY)
			printf("BUSY)\n");
		else
			printf("%d)\n", error);
	}
	return(1);
}
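
/*
 * Added commentary: vfs_unmountall() is intended for the final stages
 * of a reboot/shutdown; each pass force-unmounts (MNT_FORCE) in reverse
 * mount order and repeats until mountlist_scan() finds nothing left.
 */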

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
		struct export_args *argp)
{
	struct netcred *np;
	struct radix_node_head *rnh;
	int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}

	if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN)
		return (EINVAL);
	if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN)
		return (EINVAL);

	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not used,
		 * do so on demand here
		 */
		SLIST_FOREACH(dom, &domains, dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((char *) saddr, (char *) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
static int
vfs_free_netcred(struct radix_node *rn, void *w)
{
	struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(struct netexport *nep)
{
	int i;
	struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(struct mount *mp, struct netexport *nep, struct export_args *argp)
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}


/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055)
 */
int
vfs_setpublicfs(struct mount *mp, struct netexport *nep,
		struct export_args *argp)
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info, the FS is
	 * no longer exported. May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		int namelen;

		error = vn_get_namelen(rvp, &namelen);
		if (error)
			return (error);
		MALLOC(nfs_pub.np_index, char *, namelen, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    namelen, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}
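
/*
 * Added commentary: the usual flow is that a filesystem's mount/update
 * path hands the user-supplied export_args to vfs_export(), which
 * installs (vfs_hang_addrlist) or tears down (vfs_free_addrlist) the
 * per-address-family radix trees; vfs_export_lookup() below is then
 * the read side, consulted when an NFS request arrives from a given
 * client address.
 */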

struct netcred *
vfs_export_lookup(struct mount *mp, struct netexport *nep,
		struct sockaddr *nam)
{
	struct netcred *np;
	struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((char *)saddr,
							    rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Perform msync on all vnodes under a mount point.  The mount point must
 * be locked.  This code is also responsible for lazy-freeing unreferenced
 * vnodes whose VM objects no longer contain pages.
 *
 * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.
 *
 * NOTE: XXX VOP_PUTPAGES and friends require that the vnode be locked,
 * but vnode_pager_putpages() doesn't lock the vnode.  We have to do it
 * way up in this high level function.
 */
static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data);
static int vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data);

void
vfs_msync(struct mount *mp, int flags)
{
	int vmsc_flags;

	vmsc_flags = VMSC_GETVP;
	if (flags != MNT_WAIT)
		vmsc_flags |= VMSC_NOWAIT;
	vmntvnodescan(mp, vmsc_flags, vfs_msync_scan1, vfs_msync_scan2,
			(void *)flags);
}

/*
 * scan1 is a fast pre-check.  There could be hundreds of thousands of
 * vnodes, we cannot afford to do anything heavy-weight until we have a
 * fairly good indication that there is work to do.
 */
static
int
vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data)
{
	int flags = (int)data;

	if ((vp->v_flag & VRECLAIMED) == 0) {
		if (vshouldfree(vp, 0))
			return(0);	/* call scan2 */
		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
		    (vp->v_flag & VOBJDIRTY) &&
		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
			return(0);	/* call scan2 */
		}
	}

	/*
	 * do not call scan2, continue the loop
	 */
	return(-1);
}

/*
 * This callback is handed a locked vnode.
 */
static
int
vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data)
{
	vm_object_t obj;
	int flags = (int)data;

	if (vp->v_flag & VRECLAIMED)
		return(0);

	if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
	    (vp->v_flag & VOBJDIRTY)) {
		if (VOP_GETVOBJECT(vp, &obj) == 0) {
			vm_object_page_clean(obj, 0, 0,
			    flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
		}
	}
	return(0);
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(struct vnode *vp, struct thread *td)
{
	return (VOP_CREATEVOBJECT(vp, td));
}
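
/*
 * Added commentary: the periodic syncer is the typical caller here,
 * e.g. vfs_msync(mp, MNT_NOWAIT) to opportunistically push dirty
 * mmap'd pages, while unmount uses the MNT_WAIT form so that every
 * object is cleaned synchronously before the filesystem goes away.
 */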

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		lwkt_reltoken(&ilock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(td, &vp->v_pollinfo.vpi_selinfo);
	lwkt_reltoken(&ilock);
	return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(struct vnode *vp, int events)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	lwkt_reltoken(&ilock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(struct vnode *vp)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	lwkt_reltoken(&ilock);
}

/*
 * Extract the dev_t from a VBLK or VCHR.  The vnode must have been opened
 * (or v_rdev might be NULL).
 */
dev_t
vn_todev(struct vnode *vp)
{
	if (vp->v_type != VBLK && vp->v_type != VCHR)
		return (NODEV);
	KKASSERT(vp->v_rdev != NULL);
	return (vp->v_rdev);
}
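
/*
 * Added commentary: producers normally post events through the
 * VN_POLLEVENT() wrapper macro, which checks vpi_events before paying
 * for the vn_pollevent() function call; as the comment above notes,
 * that unlocked pre-check can race and occasionally drop an event.
 */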

/*
 * Check if vnode represents a disk device.  The vnode does not need to be
 * opened.
 */
int
vn_isdisk(struct vnode *vp, int *errp)
{
	dev_t dev;

	if (vp->v_type != VBLK && vp->v_type != VCHR) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}

	if ((dev = vp->v_rdev) == NULL)
		dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
	if (dev == NULL || dev == NODEV) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if (dev_is_good(dev) == 0) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if ((dev_dflags(dev) & D_DISK) == 0) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (errp != NULL)
		*errp = 0;
	return (1);
}

#ifdef DEBUG_VFS_LOCKS

void
assert_vop_locked(struct vnode *vp, const char *str)
{
	if (vp && IS_LOCKING_VFS(vp) && !VOP_ISLOCKED(vp, NULL)) {
		panic("%s: %p is not locked shared but should be", str, vp);
	}
}

void
assert_vop_unlocked(struct vnode *vp, const char *str)
{
	if (vp && IS_LOCKING_VFS(vp)) {
		if (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) {
			panic("%s: %p is locked but should not be", str, vp);
		}
	}
}

#endif

/*
 * Query the maximum filename length for the filesystem backing vp,
 * via VOP_PATHCONF(_PC_NAME_MAX).
 */
int
vn_get_namelen(struct vnode *vp, int *namelen)
{
	int error, retval[2];

	error = VOP_PATHCONF(vp, _PC_NAME_MAX, retval);
	if (error)
		return (error);
	*namelen = *retval;
	return (0);
}

/*
 * Emit a single directory entry into a uio.  Returns 1 (without
 * consuming any uio space) if the entry will not fit in the remaining
 * uio_resid, otherwise 0 with *error set by the uiomove.
 */
int
vop_write_dirent(int *error, struct uio *uio, ino_t d_ino, uint8_t d_type,
		uint16_t d_namlen, const char *d_name)
{
	struct dirent *dp;
	size_t len;

	len = _DIRENT_RECLEN(d_namlen);
	if (len > uio->uio_resid)
		return(1);

	dp = malloc(len, M_TEMP, M_WAITOK | M_ZERO);

	dp->d_ino = d_ino;
	dp->d_namlen = d_namlen;
	dp->d_type = d_type;
	bcopy(d_name, dp->d_name, d_namlen);

	*error = uiomove((caddr_t)dp, len, uio);

	free(dp, M_TEMP);

	return(0);
}
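
/*
 * Illustrative readdir loop using vop_write_dirent() (added sketch
 * with hypothetical names, not part of this file):
 *
 *	for each entry (ino, type, name) in the directory {
 *		if (vop_write_dirent(&error, uio, ino, type,
 *				     strlen(name), name))
 *			break;		(uio full, stop cleanly)
 *		if (error)
 *			return (error);
 *	}
 */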