/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
 * $DragonFly: src/sys/kern/vfs_subr.c,v 1.70 2006/03/05 18:38:34 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

int numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
int vfs_fastdev = 1;
SYSCTL_INT(_vfs, OID_AUTO, fastdev, CTLFLAG_RW, &vfs_fastdev, 0, "");

enum vtype iftovt_tab[16] = {
        VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
        VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
        0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
        S_IFSOCK, S_IFIFO, S_IFMT,
};

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW,
        &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW,
        &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW,
        &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW,
        &reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW,
        &reassignbufmethod, 0, "");

int nfs_mount_type = -1;
static struct lwkt_token spechash_token;
struct nfs_public nfs_pub;      /* publicly exported FS */

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
        &desiredvnodes, 0, "Maximum number of vnodes");

static void     vfs_free_addrlist (struct netexport *nep);
static int      vfs_free_netcred (struct radix_node *rn, void *w);
static int      vfs_hang_addrlist (struct mount *mp, struct netexport *nep,
                                struct export_args *argp);

extern int dev_ref_debug;
extern struct vnodeopv_entry_desc spec_vnodeop_entries[];

/*
 * Red black tree functions
 */
static int rb_buf_compare(struct buf *b1, struct buf *b2);
RB_GENERATE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, daddr_t, b_lblkno);
RB_GENERATE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, daddr_t, b_lblkno);

static int
rb_buf_compare(struct buf *b1, struct buf *b2)
{
        if (b1->b_lblkno < b2->b_lblkno)
                return(-1);
        if (b1->b_lblkno > b2->b_lblkno)
                return(1);
        return(0);
}
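
#if 0
/*
 * Illustrative sketch (not compiled in): the RB_SCAN pattern used
 * throughout this file.  The optional compare callback prunes the
 * traversal (-1 = before range, 0 = in range, 1 = past range) and the
 * scan callback returns a negative value to abort the scan.
 * example_count_bp() and example_count_clean() are hypothetical names.
 */
static int
example_count_bp(struct buf *bp, void *data)
{
        ++*(int *)data;         /* count this buffer, keep scanning */
        return(0);
}

static int
example_count_clean(struct vnode *vp)
{
        int count = 0;

        RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, NULL,
                example_count_bp, &count);
        return(count);
}
#endif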

/*
 * Return 0 if the vnode is already on the free list or cannot be placed
 * on the free list.  Return 1 if the vnode can be placed on the free list.
 */
static __inline int
vshouldfree(struct vnode *vp, int usecount)
{
        if (vp->v_flag & VFREE)
                return (0);             /* already free */
        if (vp->v_holdcnt != 0 || vp->v_usecount != usecount)
                return (0);             /* other holders */
        if (vp->v_object &&
            (vp->v_object->ref_count || vp->v_object->resident_page_count)) {
                return (0);
        }
        return (1);
}

/*
 * Initialize the vnode management data structures.
 *
 * Called from vfsinit()
 */
void
vfs_subr_init(void)
{
        /*
         * Desired vnodes is a result of the physical page count
         * and the size of kernel's heap.  It scales in proportion
         * to the amount of available physical memory.  This can
         * cause trouble on 64-bit and large memory platforms.
         */
        /* desiredvnodes = maxproc + vmstats.v_page_count / 4; */
        desiredvnodes =
                min(maxproc + vmstats.v_page_count / 4,
                    2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
                    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));

        lwkt_token_init(&spechash_token);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
        &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
        struct timeval tv;

        switch (timestamp_precision) {
        case TSP_SEC:
                tsp->tv_sec = time_second;
                tsp->tv_nsec = 0;
                break;
        case TSP_HZ:
                getnanotime(tsp);
                break;
        case TSP_USEC:
                microtime(&tv);
                TIMEVAL_TO_TIMESPEC(&tv, tsp);
                break;
        case TSP_NSEC:
        default:
                nanotime(tsp);
                break;
        }
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{
        vap->va_type = VNON;
        vap->va_size = VNOVAL;
        vap->va_bytes = VNOVAL;
        vap->va_mode = VNOVAL;
        vap->va_nlink = VNOVAL;
        vap->va_uid = VNOVAL;
        vap->va_gid = VNOVAL;
        vap->va_fsid = VNOVAL;
        vap->va_fileid = VNOVAL;
        vap->va_blocksize = VNOVAL;
        vap->va_rdev = VNOVAL;
        vap->va_atime.tv_sec = VNOVAL;
        vap->va_atime.tv_nsec = VNOVAL;
        vap->va_mtime.tv_sec = VNOVAL;
        vap->va_mtime.tv_nsec = VNOVAL;
        vap->va_ctime.tv_sec = VNOVAL;
        vap->va_ctime.tv_nsec = VNOVAL;
        vap->va_flags = VNOVAL;
        vap->va_gen = VNOVAL;
        vap->va_vaflags = 0;
        vap->va_fsmid = VNOVAL;
}
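
#if 0
/*
 * Illustrative sketch (not compiled in): the usual vattr_null() idiom.
 * A caller resets every attribute to VNOVAL and then fills in only the
 * fields it wants VOP_SETATTR() to change.  example_truncate() and its
 * use of proc0's credentials are hypothetical.
 */
static int
example_truncate(struct vnode *vp, off_t length, struct thread *td)
{
        struct vattr va;

        vattr_null(&va);                /* everything starts as VNOVAL */
        va.va_size = length;            /* ...change only the size */
        return (VOP_SETATTR(vp, &va, proc0.p_ucred, td));
}
#endif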

/*
 * Flush out and invalidate all buffers associated with a vnode.
 *
 * vp must be locked.
 */
static int vinvalbuf_bp(struct buf *bp, void *data);

struct vinvalbuf_bp_info {
        struct vnode *vp;
        int slptimeo;
        int lkflags;
        int flags;
};

int
vinvalbuf(struct vnode *vp, int flags, struct thread *td,
        int slpflag, int slptimeo)
{
        struct vinvalbuf_bp_info info;
        int error;
        vm_object_t object;

        /*
         * If we are being asked to save, call fsync to ensure that the inode
         * is updated.
         */
        if (flags & V_SAVE) {
                crit_enter();
                while (vp->v_track_write.bk_active) {
                        vp->v_track_write.bk_waitflag = 1;
                        error = tsleep(&vp->v_track_write, slpflag,
                                        "vinvlbuf", slptimeo);
                        if (error) {
                                crit_exit();
                                return (error);
                        }
                }
                if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
                        crit_exit();
                        if ((error = VOP_FSYNC(vp, MNT_WAIT, td)) != 0)
                                return (error);
                        crit_enter();
                        if (vp->v_track_write.bk_active > 0 ||
                            !RB_EMPTY(&vp->v_rbdirty_tree))
                                panic("vinvalbuf: dirty bufs");
                }
                crit_exit();
        }
        crit_enter();
        info.slptimeo = slptimeo;
        info.lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL;
        if (slpflag & PCATCH)
                info.lkflags |= LK_PCATCH;
        info.flags = flags;
        info.vp = vp;

        /*
         * Flush the buffer cache until nothing is left.
         */
        while (!RB_EMPTY(&vp->v_rbclean_tree) ||
               !RB_EMPTY(&vp->v_rbdirty_tree)) {
                error = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, NULL,
                                vinvalbuf_bp, &info);
                if (error == 0) {
                        error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
                                        vinvalbuf_bp, &info);
                }
        }

        /*
         * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
         * have write I/O in-progress but if there is a VM object then the
         * VM object can also have read-I/O in-progress.
         */
        do {
                while (vp->v_track_write.bk_active > 0) {
                        vp->v_track_write.bk_waitflag = 1;
                        tsleep(&vp->v_track_write, 0, "vnvlbv", 0);
                }
                if (VOP_GETVOBJECT(vp, &object) == 0) {
                        while (object->paging_in_progress)
                                vm_object_pip_sleep(object, "vnvlbx");
                }
        } while (vp->v_track_write.bk_active > 0);

        crit_exit();

        /*
         * Destroy the copy in the VM cache, too.
         */
        if (VOP_GETVOBJECT(vp, &object) == 0) {
                vm_object_page_remove(object, 0, 0,
                        (flags & V_SAVE) ? TRUE : FALSE);
        }

        if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree))
                panic("vinvalbuf: flush failed");
        if (!RB_EMPTY(&vp->v_rbhash_tree))
                panic("vinvalbuf: flush failed, buffers still present");
        return (0);
}

static int
vinvalbuf_bp(struct buf *bp, void *data)
{
        struct vinvalbuf_bp_info *info = data;
        int error;

        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                error = BUF_TIMELOCK(bp, info->lkflags,
                                "vinvalbuf", info->slptimeo);
                if (error == 0) {
                        BUF_UNLOCK(bp);
                        error = ENOLCK;
                }
                if (error == ENOLCK)
                        return(0);
                return (-error);
        }

        KKASSERT(bp->b_vp == info->vp);

        /*
         * XXX Since there are no node locks for NFS, I
         * believe there is a slight chance that a delayed
         * write will occur while sleeping just above, so
         * check for it.  Note that vfs_bio_awrite expects
         * buffers to reside on a queue, while VOP_BWRITE and
         * brelse do not.
         */
        if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
            (info->flags & V_SAVE)) {
                if (bp->b_vp == info->vp) {
                        if (bp->b_flags & B_CLUSTEROK) {
                                vfs_bio_awrite(bp);
                        } else {
                                bremfree(bp);
                                bp->b_flags |= B_ASYNC;
                                VOP_BWRITE(bp->b_vp, bp);
                        }
                } else {
                        bremfree(bp);
                        VOP_BWRITE(bp->b_vp, bp);
                }
        } else {
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
                bp->b_flags &= ~B_ASYNC;
                brelse(bp);
        }
        return(0);
}
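
#if 0
/*
 * Illustrative sketch (not compiled in): the typical vinvalbuf() call as
 * made by vclean() later in this file.  V_SAVE asks for dirty buffers to
 * be fsync'd out before everything is invalidated; passing 0/0 for
 * slpflag/slptimeo means the sleeps are uninterruptible and untimed.
 */
static void
example_scrap_buffers(struct vnode *vp, struct thread *td)
{
        vinvalbuf(vp, V_SAVE, td, 0, 0);
}
#endif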

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * The vnode must be locked.
 */
static int vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
static int vtruncbuf_bp_trunc(struct buf *bp, void *data);
static int vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
static int vtruncbuf_bp_metasync(struct buf *bp, void *data);

int
vtruncbuf(struct vnode *vp, struct thread *td, off_t length, int blksize)
{
        daddr_t trunclbn;
        int count;

        /*
         * Round up to the *next* lbn, then destroy the buffers in question.
         * Since we are only removing some of the buffers we must rely on the
         * scan count to determine whether a loop is necessary.
         */
        trunclbn = (length + blksize - 1) / blksize;

        crit_enter();
        do {
                count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
                                vtruncbuf_bp_trunc_cmp,
                                vtruncbuf_bp_trunc, &trunclbn);
                count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                vtruncbuf_bp_trunc_cmp,
                                vtruncbuf_bp_trunc, &trunclbn);
        } while(count);

        /*
         * For safety, fsync any remaining metadata if the file is not being
         * truncated to 0.  Since the metadata does not represent the entire
         * dirty list we have to rely on the hit count to ensure that we get
         * all of it.
         */
        if (length > 0) {
                do {
                        count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                        vtruncbuf_bp_metasync_cmp,
                                        vtruncbuf_bp_metasync, vp);
                } while (count);
        }

        /*
         * Wait for any in-progress I/O to complete before returning (why?)
         */
        while (vp->v_track_write.bk_active > 0) {
                vp->v_track_write.bk_waitflag = 1;
                tsleep(&vp->v_track_write, 0, "vbtrunc", 0);
        }

        crit_exit();

        vnode_pager_setsize(vp, length);

        return (0);
}
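
#if 0
/*
 * Illustrative sketch (not compiled in): a filesystem truncate path would
 * use vtruncbuf() to destroy buffers beyond the new EOF before adjusting
 * its on-disk block map.  The 8192 byte block size is a hypothetical
 * filesystem parameter.
 */
static int
example_truncate_bufs(struct vnode *vp, struct thread *td, off_t length)
{
        return (vtruncbuf(vp, td, length, 8192));
}
#endif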

/*
 * The callback buffer is beyond the new file EOF and must be destroyed.
 * Note that the compare function must conform to the RB_SCAN's requirements.
 */
static
int
vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
{
        if (bp->b_lblkno >= *(daddr_t *)data)
                return(0);
        return(-1);
}

static
int
vtruncbuf_bp_trunc(struct buf *bp, void *data)
{
        /*
         * Do not try to use a buffer we cannot immediately lock, but sleep
         * anyway to prevent a livelock.  The code will loop until all buffers
         * can be acted upon.
         */
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
                        BUF_UNLOCK(bp);
        } else {
                bremfree(bp);
                bp->b_flags |= (B_INVAL | B_RELBUF);
                bp->b_flags &= ~B_ASYNC;
                brelse(bp);
        }
        return(1);
}

/*
 * Fsync all meta-data after truncating a file to be non-zero.  Only metadata
 * blocks (with a negative lblkno) are scanned.
 * Note that the compare function must conform to the RB_SCAN's requirements.
 */
static int
vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
{
        if (bp->b_lblkno < 0)
                return(0);
        return(1);
}

static int
vtruncbuf_bp_metasync(struct buf *bp, void *data)
{
        struct vnode *vp = data;

        if (bp->b_flags & B_DELWRI) {
                /*
                 * Do not try to use a buffer we cannot immediately lock,
                 * but sleep anyway to prevent a livelock.  The code will
                 * loop until all buffers can be acted upon.
                 */
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                        if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
                                BUF_UNLOCK(bp);
                } else {
                        bremfree(bp);
                        if (bp->b_vp == vp) {
                                bp->b_flags |= B_ASYNC;
                        } else {
                                bp->b_flags &= ~B_ASYNC;
                        }
                        VOP_BWRITE(bp->b_vp, bp);
                }
                return(1);
        } else {
                return(0);
        }
}

/*
 * vfsync - implements a multipass fsync on a file which understands
 * dependencies and meta-data.  The passed vnode must be locked.  The
 * waitfor argument may be MNT_WAIT, MNT_NOWAIT, or MNT_LAZY.
 *
 * When fsyncing data asynchronously just do one consolidated pass starting
 * with the most negative block number.  This may not get all the data due
 * to dependencies.
 *
 * When fsyncing data synchronously do a data pass, then a metadata pass,
 * then do additional data+metadata passes to try to get all the data out.
 */
static int vfsync_wait_output(struct vnode *vp,
                int (*waitoutput)(struct vnode *, struct thread *));
static int vfsync_data_only_cmp(struct buf *bp, void *data);
static int vfsync_meta_only_cmp(struct buf *bp, void *data);
static int vfsync_lazy_range_cmp(struct buf *bp, void *data);
static int vfsync_bp(struct buf *bp, void *data);

struct vfsync_info {
        struct vnode *vp;
        int synchronous;
        int syncdeps;
        int lazycount;
        int lazylimit;
        daddr_t lbn;
        int (*checkdef)(struct buf *);
};

int
vfsync(struct vnode *vp, int waitfor, int passes, daddr_t lbn,
        int (*checkdef)(struct buf *),
        int (*waitoutput)(struct vnode *, struct thread *))
{
        struct vfsync_info info;
        int error;

        bzero(&info, sizeof(info));
        info.vp = vp;
        info.lbn = lbn;
        if ((info.checkdef = checkdef) == NULL)
                info.syncdeps = 1;

        crit_enter();

        switch(waitfor) {
        case MNT_LAZY:
                /*
                 * Lazy (filesystem syncer, typically): asynchronous, plus
                 * limit the number of data (not meta) pages we try to
                 * flush to 1MB.  A non-zero return means the lazy limit
                 * was reached.
                 */
                info.lazylimit = 1024 * 1024;
                info.syncdeps = 1;
                error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                                vfsync_lazy_range_cmp, vfsync_bp, &info);
                RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
                        vfsync_meta_only_cmp, vfsync_bp, &info);
                if (error == 0)
                        vp->v_lazyw = 0;
                else if (!RB_EMPTY(&vp->v_rbdirty_tree))
                        vn_syncer_add_to_worklist(vp, 1);
                error = 0;
                break;
        case MNT_NOWAIT:
                /*
                 * Asynchronous.  Do a data-only pass and a meta-only pass.
                 */
                info.syncdeps = 1;
                RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
                        vfsync_bp, &info);
                RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_meta_only_cmp,
                        vfsync_bp, &info);
                error = 0;
                break;
        default:
                /*
                 * Synchronous.  Do a data-only pass, then a meta-data+data
                 * pass, then additional integrated passes to try to get
                 * all the dependencies flushed.
                 */
                RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
                        vfsync_bp, &info);
                error = vfsync_wait_output(vp, waitoutput);
                if (error == 0) {
                        RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
                                vfsync_bp, &info);
                        error = vfsync_wait_output(vp, waitoutput);
                }
                while (error == 0 && passes > 0 &&
                    !RB_EMPTY(&vp->v_rbdirty_tree)) {
                        if (--passes == 0) {
                                info.synchronous = 1;
                                info.syncdeps = 1;
                        }
                        error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
                                vfsync_bp, &info);
                        if (error < 0)
                                error = -error;
                        info.syncdeps = 1;
                        if (error == 0)
                                error = vfsync_wait_output(vp, waitoutput);
                }
                break;
        }
        crit_exit();
        return(error);
}
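
#if 0
/*
 * Illustrative sketch (not compiled in): a simple filesystem fsync entry
 * point built on vfsync().  With no dependency callback, one extra pass,
 * and an lbn of (daddr_t)-1 (meaning "no EOF invalidation"), this is the
 * minimal form; example_fsync() is a hypothetical name.
 */
static int
example_fsync(struct vnode *vp, int waitfor, struct thread *td)
{
        return (vfsync(vp, waitfor, 1, (daddr_t)-1, NULL, NULL));
}
#endif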

static int
vfsync_wait_output(struct vnode *vp, int (*waitoutput)(struct vnode *, struct thread *))
{
        int error = 0;

        while (vp->v_track_write.bk_active) {
                vp->v_track_write.bk_waitflag = 1;
                tsleep(&vp->v_track_write, 0, "fsfsn", 0);
        }
        if (waitoutput)
                error = waitoutput(vp, curthread);
        return(error);
}

static int
vfsync_data_only_cmp(struct buf *bp, void *data)
{
        if (bp->b_lblkno < 0)
                return(-1);
        return(0);
}

static int
vfsync_meta_only_cmp(struct buf *bp, void *data)
{
        if (bp->b_lblkno < 0)
                return(0);
        return(1);
}

static int
vfsync_lazy_range_cmp(struct buf *bp, void *data)
{
        struct vfsync_info *info = data;

        if (bp->b_lblkno < info->vp->v_lazyw)
                return(-1);
        return(0);
}

static int
vfsync_bp(struct buf *bp, void *data)
{
        struct vfsync_info *info = data;
        struct vnode *vp = info->vp;
        int error;

        /*
         * If syncdeps is not set we do not try to write buffers which have
         * dependencies.
         */
        if (!info->synchronous && info->syncdeps == 0 && info->checkdef(bp))
                return(0);

        /*
         * Ignore buffers that we cannot immediately lock.  XXX
         */
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT))
                return(0);
        if ((bp->b_flags & B_DELWRI) == 0)
                panic("vfsync_bp: buffer not dirty");
        if (vp != bp->b_vp)
                panic("vfsync_bp: buffer vp mismatch");

        /*
         * B_NEEDCOMMIT (primarily used by NFS) is a state where the buffer
         * has been written but an additional handshake with the device
         * is required before we can dispose of the buffer.  We have no idea
         * how to do this so we have to skip these buffers.
         */
        if (bp->b_flags & B_NEEDCOMMIT) {
                BUF_UNLOCK(bp);
                return(0);
        }

        /*
         * (LEGACY FROM UFS, REMOVE WHEN POSSIBLE) - invalidate any dirty
         * buffers beyond the file EOF.
         */
        if (info->lbn != (daddr_t)-1 && vp->v_type == VREG &&
            bp->b_lblkno >= info->lbn) {
                bremfree(bp);
                bp->b_flags |= B_INVAL | B_NOCACHE;
                crit_exit();
                brelse(bp);
                crit_enter();
        }

        if (info->synchronous) {
                /*
                 * Synchronous flushing.  An error may be returned.
                 */
                bremfree(bp);
                crit_exit();
                error = bwrite(bp);
                crit_enter();
        } else {
                /*
                 * Asynchronous flushing.  A negative return value simply
                 * stops the scan and is not considered an error.  We use
                 * this to support limited MNT_LAZY flushes.
                 */
                vp->v_lazyw = bp->b_lblkno;
                if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
                        info->lazycount += vfs_bio_awrite(bp);
                } else {
                        info->lazycount += bp->b_bufsize;
                        bremfree(bp);
                        crit_exit();
                        bawrite(bp);
                        crit_enter();
                }
                if (info->lazylimit && info->lazycount >= info->lazylimit)
                        error = 1;
                else
                        error = 0;
        }
        return(-error);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{
        KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
        KKASSERT((bp->b_flags & (B_HASHED|B_DELWRI)) == 0);
        KKASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0);

        vhold(vp);
        /*
         * Insert onto list for new vnode.
         */
        crit_enter();
        bp->b_vp = vp;
        bp->b_flags |= B_HASHED;
        if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp))
                panic("reassignbuf: dup lblk vp %p bp %p", vp, bp);

        bp->b_xflags |= BX_VNCLEAN;
        if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp))
                panic("reassignbuf: dup lblk/clean vp %p bp %p", vp, bp);
        crit_exit();
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
        struct vnode *vp;

        KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

        /*
         * Delete from old vnode list, if on one.
         */
        vp = bp->b_vp;
        crit_enter();
        if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
                if (bp->b_xflags & BX_VNDIRTY)
                        buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
                else
                        buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
                bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
        }
        if (bp->b_flags & B_HASHED) {
                buf_rb_hash_RB_REMOVE(&vp->v_rbhash_tree, bp);
                bp->b_flags &= ~B_HASHED;
        }
        if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
                vp->v_flag &= ~VONWORKLST;
                LIST_REMOVE(vp, v_synclist);
        }
        crit_exit();
        bp->b_vp = NULL;
        vdrop(vp);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(struct vnode *vp, struct buf *bp)
{
        KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
        KKASSERT((bp->b_flags & B_HASHED) == 0);

        bp->b_vp = vp;
        bp->b_flags |= B_PAGING;
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(struct buf *bp)
{
        KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
        KKASSERT((bp->b_flags & B_HASHED) == 0);

        bp->b_vp = NULL;
        bp->b_flags &= ~B_PAGING;
}
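
#if 0
/*
 * Illustrative sketch (not compiled in): the buffer/vnode association
 * life cycle.  bgetvp() vhold()s the vnode and inserts the buffer into
 * the hash and clean trees; brelvp() undoes both and vdrop()s the vnode.
 * example_associate() is a hypothetical name.
 */
static void
example_associate(struct vnode *vp, struct buf *bp)
{
        bgetvp(vp, bp);
        /* ... buffer lives in the vnode's clean/dirty trees ... */
        brelvp(bp);
}
#endif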

/*
 * Reassign the buffer to the proper clean/dirty list based on B_DELWRI.
 * This routine is called when the state of the B_DELWRI bit is changed.
 */
void
reassignbuf(struct buf *bp)
{
        struct vnode *vp = bp->b_vp;
        int delay;

        KKASSERT(vp != NULL);
        ++reassignbufcalls;

        /*
         * B_PAGING flagged buffers cannot be reassigned because their vp
         * is not fully linked in.
         */
        if (bp->b_flags & B_PAGING)
                panic("cannot reassign paging buffer");

        crit_enter();
        if (bp->b_flags & B_DELWRI) {
                /*
                 * Move to the dirty list, add the vnode to the worklist
                 */
                if (bp->b_xflags & BX_VNCLEAN) {
                        buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
                        bp->b_xflags &= ~BX_VNCLEAN;
                }
                if ((bp->b_xflags & BX_VNDIRTY) == 0) {
                        if (buf_rb_tree_RB_INSERT(&vp->v_rbdirty_tree, bp)) {
                                panic("reassignbuf: dup lblk vp %p bp %p",
                                      vp, bp);
                        }
                        bp->b_xflags |= BX_VNDIRTY;
                }
                if ((vp->v_flag & VONWORKLST) == 0) {
                        switch (vp->v_type) {
                        case VDIR:
                                delay = dirdelay;
                                break;
                        case VCHR:
                        case VBLK:
                                if (vp->v_rdev &&
                                    vp->v_rdev->si_mountpoint != NULL) {
                                        delay = metadelay;
                                        break;
                                }
                                /* fall through */
                        default:
                                delay = filedelay;
                        }
                        vn_syncer_add_to_worklist(vp, delay);
                }
        } else {
                /*
                 * Move to the clean list, remove the vnode from the worklist
                 * if no dirty blocks remain.
                 */
                if (bp->b_xflags & BX_VNDIRTY) {
                        buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
                        bp->b_xflags &= ~BX_VNDIRTY;
                }
                if ((bp->b_xflags & BX_VNCLEAN) == 0) {
                        if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) {
                                panic("reassignbuf: dup lblk vp %p bp %p",
                                      vp, bp);
                        }
                        bp->b_xflags |= BX_VNCLEAN;
                }
                if ((vp->v_flag & VONWORKLST) &&
                    RB_EMPTY(&vp->v_rbdirty_tree)) {
                        vp->v_flag &= ~VONWORKLST;
                        LIST_REMOVE(vp, v_synclist);
                }
        }
        crit_exit();
}

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev_t dev, struct vnode **vpp)
{
        struct vnode *vp;
        struct vnode *nvp;
        int error;

        if (dev == NODEV) {
                *vpp = NULLVP;
                return (ENXIO);
        }
        error = getspecialvnode(VT_NON, NULL, &spec_vnode_vops, &nvp, 0, 0);
        if (error) {
                *vpp = NULLVP;
                return (error);
        }
        vp = nvp;
        vp->v_type = VCHR;
        vp->v_udev = dev->si_udev;
        vx_unlock(vp);
        *vpp = vp;
        return (0);
}

int
v_associate_rdev(struct vnode *vp, dev_t dev)
{
        lwkt_tokref ilock;

        if (dev == NULL || dev == NODEV)
                return(ENXIO);
        if (dev_is_good(dev) == 0)
                return(ENXIO);
        KKASSERT(vp->v_rdev == NULL);
        if (dev_ref_debug)
                printf("Z1");
        vp->v_rdev = reference_dev(dev);
        lwkt_gettoken(&ilock, &spechash_token);
        SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_specnext);
        lwkt_reltoken(&ilock);
        return(0);
}

void
v_release_rdev(struct vnode *vp)
{
        lwkt_tokref ilock;
        dev_t dev;

        if ((dev = vp->v_rdev) != NULL) {
                lwkt_gettoken(&ilock, &spechash_token);
                SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_specnext);
                if (dev_ref_debug && vp->v_opencount != 0) {
                        printf("releasing rdev with non-0 "
                                "v_opencount(%d) (revoked?)\n",
                                vp->v_opencount);
                }
                vp->v_rdev = NULL;
                vp->v_opencount = 0;
                release_dev(dev);
                lwkt_reltoken(&ilock);
        }
}
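
#if 0
/*
 * Illustrative sketch (not compiled in): how root-mount code might obtain
 * a vnode for the root block device with bdevvp().  rootdev is assumed to
 * have been set by the device configuration code.
 */
static struct vnode *
example_get_rootvp(void)
{
        struct vnode *vp;

        if (bdevvp(rootdev, &vp) != 0)
                panic("example_get_rootvp: cannot allocate root vnode");
        return (vp);
}
#endif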

/*
 * Add a vnode to the alias list hung off the dev_t.  We only associate
 * the device number with the vnode.  The actual device is not associated
 * until the vnode is opened (usually in spec_open()), and will be
 * disassociated on last close.
 */
void
addaliasu(struct vnode *nvp, udev_t nvp_udev)
{
        if (nvp->v_type != VBLK && nvp->v_type != VCHR)
                panic("addaliasu on non-special vnode");
        nvp->v_udev = nvp_udev;
}

/*
 * Disassociate a vnode from its underlying filesystem.
 *
 * The vnode must be VX locked and refd
 *
 * If there are v_usecount references to the vnode other than ours we have
 * to VOP_CLOSE the vnode before we can deactivate and reclaim it.
 */
void
vclean(struct vnode *vp, int flags, struct thread *td)
{
        int active;
        int retflags = 0;

        /*
         * If the vnode has already been reclaimed we have nothing to do.
         */
        if (vp->v_flag & VRECLAIMED)
                return;
        vp->v_flag |= VRECLAIMED;

        /*
         * Scrap the vfs cache
         */
        while (cache_inval_vp(vp, 0, &retflags) != 0) {
                printf("Warning: vnode %p clean/cache_resolution race detected\n", vp);
                tsleep(vp, 0, "vclninv", 2);
        }

        /*
         * Check to see if the vnode is in use.  If so we have to reference it
         * before we clean it out so that its count cannot fall to zero and
         * generate a race against ourselves to recycle it.
         */
        active = (vp->v_usecount > 1);

        /*
         * Clean out any buffers associated with the vnode and destroy its
         * object, if it has one.
         */
        vinvalbuf(vp, V_SAVE, td, 0, 0);
        VOP_DESTROYVOBJECT(vp);

        /*
         * If purging an active vnode, it must be closed and
         * deactivated before being reclaimed.  XXX
         *
         * Note that neither of these routines unlocks the vnode.
         */
        if (active) {
                if (flags & DOCLOSE)
                        VOP_CLOSE(vp, FNONBLOCK, td);
        }

        /*
         * If the vnode has not been deactivated, deactivate it.
         */
        if ((vp->v_flag & VINACTIVE) == 0) {
                vp->v_flag |= VINACTIVE;
                VOP_INACTIVE(vp, td);
        }

        /*
         * Reclaim the vnode.
         */
        if (VOP_RECLAIM(vp, retflags, td))
                panic("vclean: cannot reclaim");

        /*
         * Done with purge, notify sleepers of the grim news.
         */
        vp->v_ops = &dead_vnode_vops;
        vn_pollgone(vp);
        vp->v_tag = VT_NON;
}
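
#if 0
/*
 * Illustrative sketch (not compiled in): the locking discipline around
 * vgone().  The caller must hold a ref and the VX lock; vx_get()/vx_put()
 * provide both, as in vop_stdrevoke() below.  example_force_reclaim() is
 * a hypothetical name.
 */
static void
example_force_reclaim(struct vnode *vp)
{
        if (vx_get(vp) == 0) {          /* ref + VX lock */
                vgone(vp);
                vx_put(vp);             /* release both */
        }
}
#endif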

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 *
 * The vnode must be referenced and vx_lock()'d
 *
 * revoke { struct vnode *a_vp, int a_flags }
 */
int
vop_stdrevoke(struct vop_revoke_args *ap)
{
        struct vnode *vp, *vq;
        lwkt_tokref ilock;
        dev_t dev;

        KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

        vp = ap->a_vp;

        /*
         * If the vnode is already dead don't try to revoke it
         */
        if (vp->v_flag & VRECLAIMED)
                return (0);

        /*
         * If the vnode has a device association, scrap all vnodes associated
         * with the device.  Don't let the device disappear on us while we
         * are scrapping the vnodes.
         *
         * The passed vp will probably show up in the list, do not VX lock
         * it twice!
         */
        if (vp->v_type != VCHR && vp->v_type != VBLK)
                return(0);
        if ((dev = vp->v_rdev) == NULL) {
                if ((dev = udev2dev(vp->v_udev, vp->v_type == VBLK)) == NODEV)
                        return(0);
        }
        reference_dev(dev);
        lwkt_gettoken(&ilock, &spechash_token);
        while ((vq = SLIST_FIRST(&dev->si_hlist)) != NULL) {
                if (vp == vq || vx_get(vq) == 0) {
                        if (vq == SLIST_FIRST(&dev->si_hlist))
                                vgone(vq);
                        if (vp != vq)
                                vx_put(vq);
                }
        }
        lwkt_reltoken(&ilock);
        release_dev(dev);
        return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 *
 * Returns 1 if we were successfully able to recycle the vnode,
 * 0 otherwise.
 */
int
vrecycle(struct vnode *vp, struct thread *td)
{
        if (vp->v_usecount == 1) {
                vgone(vp);
                return (1);
        }
        return (0);
}

/*
 * Eliminate all activity associated with a vnode in preparation for reuse.
 *
 * The vnode must be VX locked and refd and will remain VX locked and refd
 * on return.  This routine may be called with the vnode in any state, as
 * long as it is VX locked.  The vnode will be cleaned out and marked
 * VRECLAIMED but will not actually be reused until all existing refs and
 * holds go away.
 *
 * NOTE: This routine may be called on a vnode which has not yet been
 * deactivated (VOP_INACTIVE), or on a vnode which has already been
 * reclaimed.
 *
 * This routine is not responsible for placing us back on the freelist.
 * Instead, it happens automatically when the caller releases the VX lock
 * (assuming there aren't any other references).
 */
void
vgone(struct vnode *vp)
{
        /*
         * assert that the VX lock is held.  This is an absolute requirement
         * now for vgone() to be called.
         */
        KKASSERT(vp->v_lock.lk_exclusivecount == 1);

        /*
         * Clean out the filesystem specific data and set the VRECLAIMED
         * bit.  Also deactivate the vnode if necessary.
         */
        vclean(vp, DOCLOSE, curthread);

        /*
         * Delete from old mount point vnode list, if on one.
         */
        if (vp->v_mount != NULL)
                insmntque(vp, NULL);

        /*
         * If special device, remove it from special device alias list
         * if it is on one.  This should normally only occur if a vnode is
         * being revoked as the device should otherwise have been released
         * naturally.
         */
        if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
                v_release_rdev(vp);
        }

        /*
         * Set us to VBAD
         */
        vp->v_type = VBAD;
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
{
        lwkt_tokref ilock;
        struct vnode *vp;

        lwkt_gettoken(&ilock, &spechash_token);
        SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
                if (type == vp->v_type) {
                        *vpp = vp;
                        lwkt_reltoken(&ilock);
                        return (1);
                }
        }
        lwkt_reltoken(&ilock);
        return (0);
}
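
#if 0
/*
 * Illustrative sketch (not compiled in): using vfinddev() to find an
 * active vnode aliased to a device.  It returns 1 and fills in *vpp on a
 * hit, 0 otherwise; example_find_chr_vnode() is a hypothetical name.
 */
static struct vnode *
example_find_chr_vnode(dev_t dev)
{
        struct vnode *vp;

        if (vfinddev(dev, VCHR, &vp))
                return (vp);
        return (NULL);
}
#endif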

/*
 * Calculate the total number of references to a special device.  This
 * routine may only be called for VBLK and VCHR vnodes since v_rdev is
 * an overloaded field.  Since udev2dev can now return NODEV, we have
 * to check for a NULL v_rdev.
 */
int
count_dev(dev_t dev)
{
        lwkt_tokref ilock;
        struct vnode *vp;
        int count = 0;

        if (SLIST_FIRST(&dev->si_hlist)) {
                lwkt_gettoken(&ilock, &spechash_token);
                SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
                        count += vp->v_usecount;
                }
                lwkt_reltoken(&ilock);
        }
        return(count);
}

int
count_udev(udev_t udev)
{
        dev_t dev;

        if ((dev = udev2dev(udev, 0)) == NODEV)
                return(0);
        return(count_dev(dev));
}

int
vcount(struct vnode *vp)
{
        if (vp->v_rdev == NULL)
                return(0);
        return(count_dev(vp->v_rdev));
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(char *label, struct vnode *vp)
{
        char buf[96];

        if (label != NULL)
                printf("%s: %p: ", label, (void *)vp);
        else
                printf("%p: ", (void *)vp);
        printf("type %s, usecount %d, writecount %d, refcount %d,",
            typename[vp->v_type], vp->v_usecount, vp->v_writecount,
            vp->v_holdcnt);
        buf[0] = '\0';
        if (vp->v_flag & VROOT)
                strcat(buf, "|VROOT");
        if (vp->v_flag & VTEXT)
                strcat(buf, "|VTEXT");
        if (vp->v_flag & VSYSTEM)
                strcat(buf, "|VSYSTEM");
        if (vp->v_flag & VFREE)
                strcat(buf, "|VFREE");
        if (vp->v_flag & VOBJBUF)
                strcat(buf, "|VOBJBUF");
        if (buf[0] != '\0')
                printf(" flags (%s)", &buf[1]);
        if (vp->v_data == NULL) {
                printf("\n");
        } else {
                printf("\n\t");
                VOP_PRINT(vp);
        }
}

#ifdef DDB
#include <ddb/ddb.h>

static int db_show_locked_vnodes(struct mount *mp, void *data);

/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
        printf("Locked vnodes\n");
        mountlist_scan(db_show_locked_vnodes, NULL,
                        MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
}

static int
db_show_locked_vnodes(struct mount *mp, void *data __unused)
{
        struct vnode *vp;

        TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
                if (VOP_ISLOCKED(vp, NULL))
                        vprint((char *)0, vp);
        }
        return(0);
}
#endif
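
#if 0
/*
 * Illustrative sketch (not compiled in): vcount() sums v_usecount across
 * every vnode aliased to the same device, so "is this device open through
 * any alias besides ours" checks can be written like this hypothetical
 * helper.
 */
static int
example_dev_open_elsewhere(struct vnode *vp)
{
        return (vcount(vp) > vp->v_usecount);
}
#endif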

/*
 * Top level filesystem related information gathering.
 */
static int      sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);

static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
        int *name = (int *)arg1 - 1;    /* XXX */
        u_int namelen = arg2 + 1;       /* XXX */
        struct vfsconf *vfsp;

#if 1 || defined(COMPAT_PRELITE2)
        /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
        if (namelen == 1)
                return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

#ifdef notyet
        /* all sysctl names at this level are at least name and field */
        if (namelen < 2)
                return (ENOTDIR);               /* overloaded */
        if (name[0] != VFS_GENERIC) {
                for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
                        if (vfsp->vfc_typenum == name[0])
                                break;
                if (vfsp == NULL)
                        return (EOPNOTSUPP);
                return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
                    oldp, oldlenp, newp, newlen, p));
        }
#endif
        switch (name[1]) {
        case VFS_MAXTYPENUM:
                if (namelen != 2)
                        return (ENOTDIR);
                return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
        case VFS_CONF:
                if (namelen != 3)
                        return (ENOTDIR);       /* overloaded */
                for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
                        if (vfsp->vfc_typenum == name[2])
                                break;
                if (vfsp == NULL)
                        return (EOPNOTSUPP);
                return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
        }
        return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
        "Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
        int error;
        struct vfsconf *vfsp;
        struct ovfsconf ovfs;

        for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
                bzero(&ovfs, sizeof(ovfs));
                ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
                strcpy(ovfs.vfc_name, vfsp->vfc_name);
                ovfs.vfc_index = vfsp->vfc_typenum;
                ovfs.vfc_refcount = vfsp->vfc_refcount;
                ovfs.vfc_flags = vfsp->vfc_flags;
                error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
                if (error)
                        return error;
        }
        return 0;
}

#endif /* 1 || COMPAT_PRELITE2 */

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(struct vnode *vp)
{
        dev_t dev;

        if ((dev = vp->v_rdev) == NULL)
                dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
        if (dev != NODEV && dev->si_mountpoint)
                return (EBUSY);
        return (0);
}

/*
 * Unmount all filesystems.  The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */

static int vfs_umountall_callback(struct mount *mp, void *data);

void
vfs_unmountall(void)
{
        struct thread *td = curthread;
        int count;

        if (td->td_proc == NULL)
                td = initproc->p_thread;        /* XXX XXX use proc0 instead? */

        do {
                count = mountlist_scan(vfs_umountall_callback,
                                        &td, MNTSCAN_REVERSE|MNTSCAN_NOBUSY);
        } while (count);
}

static
int
vfs_umountall_callback(struct mount *mp, void *data)
{
        struct thread *td = *(struct thread **)data;
        int error;

        error = dounmount(mp, MNT_FORCE, td);
        if (error) {
                mountlist_remove(mp);
                printf("unmount of filesystem mounted from %s failed (",
                        mp->mnt_stat.f_mntfromname);
                if (error == EBUSY)
                        printf("BUSY)\n");
                else
                        printf("%d)\n", error);
        }
        return(1);
}
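
#if 0
/*
 * Illustrative sketch (not compiled in): the mountlist_scan() callback
 * pattern used by vfs_unmountall() above and the DDB command earlier.
 * The hypothetical helper below just counts the mounts it visits.
 */
static int
example_count_mounts_cb(struct mount *mp, void *data)
{
        ++*(int *)data;
        return(0);
}

static int
example_count_mounts(void)
{
        int count = 0;

        mountlist_scan(example_count_mounts_cb, &count,
                        MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
        return(count);
}
#endif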

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by vfs_export() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
                struct export_args *argp)
{
        struct netcred *np;
        struct radix_node_head *rnh;
        int i;
        struct radix_node *rn;
        struct sockaddr *saddr, *smask = 0;
        struct domain *dom;
        int error;

        if (argp->ex_addrlen == 0) {
                if (mp->mnt_flag & MNT_DEFEXPORTED)
                        return (EPERM);
                np = &nep->ne_defexported;
                np->netc_exflags = argp->ex_flags;
                np->netc_anon = argp->ex_anon;
                np->netc_anon.cr_ref = 1;
                mp->mnt_flag |= MNT_DEFEXPORTED;
                return (0);
        }

        if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN)
                return (EINVAL);
        if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN)
                return (EINVAL);

        i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
        np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
        bzero((caddr_t) np, i);
        saddr = (struct sockaddr *) (np + 1);
        if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
                goto out;
        if (saddr->sa_len > argp->ex_addrlen)
                saddr->sa_len = argp->ex_addrlen;
        if (argp->ex_masklen) {
                smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
                error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
                if (error)
                        goto out;
                if (smask->sa_len > argp->ex_masklen)
                        smask->sa_len = argp->ex_masklen;
        }
        i = saddr->sa_family;
        if ((rnh = nep->ne_rtable[i]) == 0) {
                /*
                 * Seems silly to initialize every AF when most are not
                 * used, do so on demand here.
                 */
                SLIST_FOREACH(dom, &domains, dom_next)
                        if (dom->dom_family == i && dom->dom_rtattach) {
                                dom->dom_rtattach((void **) &nep->ne_rtable[i],
                                    dom->dom_rtoffset);
                                break;
                        }
                if ((rnh = nep->ne_rtable[i]) == 0) {
                        error = ENOBUFS;
                        goto out;
                }
        }
        rn = (*rnh->rnh_addaddr) ((char *) saddr, (char *) smask, rnh,
            np->netc_rnodes);
        if (rn == 0 || np != (struct netcred *) rn) {   /* already exists */
                error = EPERM;
                goto out;
        }
        np->netc_exflags = argp->ex_flags;
        np->netc_anon = argp->ex_anon;
        np->netc_anon.cr_ref = 1;
        return (0);
out:
        free(np, M_NETADDR);
        return (error);
}

/* ARGSUSED */
static int
vfs_free_netcred(struct radix_node *rn, void *w)
{
        struct radix_node_head *rnh = (struct radix_node_head *) w;

        (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
        free((caddr_t) rn, M_NETADDR);
        return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(struct netexport *nep)
{
        int i;
        struct radix_node_head *rnh;

        for (i = 0; i <= AF_MAX; i++)
                if ((rnh = nep->ne_rtable[i])) {
                        (*rnh->rnh_walktree) (rnh, vfs_free_netcred,
                            (caddr_t) rnh);
                        free((caddr_t) rnh, M_RTABLE);
                        nep->ne_rtable[i] = 0;
                }
}

int
vfs_export(struct mount *mp, struct netexport *nep, struct export_args *argp)
{
        int error;

        if (argp->ex_flags & MNT_DELEXPORT) {
                if (mp->mnt_flag & MNT_EXPUBLIC) {
                        vfs_setpublicfs(NULL, NULL, NULL);
                        mp->mnt_flag &= ~MNT_EXPUBLIC;
                }
                vfs_free_addrlist(nep);
                mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
        }
        if (argp->ex_flags & MNT_EXPORTED) {
                if (argp->ex_flags & MNT_EXPUBLIC) {
                        if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
                                return (error);
                        mp->mnt_flag |= MNT_EXPUBLIC;
                }
                if ((error = vfs_hang_addrlist(mp, nep, argp)))
                        return (error);
                mp->mnt_flag |= MNT_EXPORTED;
        }
        return (0);
}

/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055)
 */
int
vfs_setpublicfs(struct mount *mp, struct netexport *nep,
                struct export_args *argp)
{
        int error;
        struct vnode *rvp;
        char *cp;

        /*
         * mp == NULL -> invalidate the current info, the FS is
         * no longer exported.  May be called from either vfs_export
         * or unmount, so check if it hasn't already been done.
         */
        if (mp == NULL) {
                if (nfs_pub.np_valid) {
                        nfs_pub.np_valid = 0;
                        if (nfs_pub.np_index != NULL) {
                                FREE(nfs_pub.np_index, M_TEMP);
                                nfs_pub.np_index = NULL;
                        }
                }
                return (0);
        }

        /*
         * Only one allowed at a time.
         */
        if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
                return (EBUSY);

        /*
         * Get real filehandle for root of exported FS.
         */
        bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
        nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

        if ((error = VFS_ROOT(mp, &rvp)))
                return (error);

        if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) {
                vput(rvp);
                return (error);
        }

        /*
         * If an indexfile was specified, pull it in.  Keep rvp locked and
         * referenced until we are done with it, since vn_get_namelen()
         * issues a VOP on it.
         */
        if (argp->ex_indexfile != NULL) {
                int namelen;

                error = vn_get_namelen(rvp, &namelen);
                if (error) {
                        vput(rvp);
                        return (error);
                }
                MALLOC(nfs_pub.np_index, char *, namelen, M_TEMP,
                    M_WAITOK);
                error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
                    namelen, (size_t *)0);
                if (!error) {
                        /*
                         * Check for illegal filenames.
                         */
                        for (cp = nfs_pub.np_index; *cp; cp++) {
                                if (*cp == '/') {
                                        error = EINVAL;
                                        break;
                                }
                        }
                }
                if (error) {
                        FREE(nfs_pub.np_index, M_TEMP);
                        vput(rvp);
                        return (error);
                }
        }
        vput(rvp);

        nfs_pub.np_mount = mp;
        nfs_pub.np_valid = 1;
        return (0);
}

struct netcred *
vfs_export_lookup(struct mount *mp, struct netexport *nep,
                struct sockaddr *nam)
{
        struct netcred *np;
        struct radix_node_head *rnh;
        struct sockaddr *saddr;

        np = NULL;
        if (mp->mnt_flag & MNT_EXPORTED) {
                /*
                 * Lookup in the export list first.
                 */
                if (nam != NULL) {
                        saddr = nam;
                        rnh = nep->ne_rtable[saddr->sa_family];
                        if (rnh != NULL) {
                                np = (struct netcred *)
                                        (*rnh->rnh_matchaddr)((char *)saddr,
                                                              rnh);
                                if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
                                        np = NULL;
                        }
                }
                /*
                 * If no address match, use the default if it exists.
                 */
                if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
                        np = &nep->ne_defexported;
        }
        return (np);
}
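
#if 0
/*
 * Illustrative sketch (not compiled in): how an NFS-style server maps a
 * client address to export credentials with vfs_export_lookup(), denying
 * service when the host matches nothing; example_check_export() is a
 * hypothetical name.
 */
static int
example_check_export(struct mount *mp, struct netexport *nep,
                struct sockaddr *nam, struct netcred **npp)
{
        *npp = vfs_export_lookup(mp, nep, nam);
        if (*npp == NULL)
                return (EACCES);        /* host is not in the export list */
        return (0);
}
#endif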

/*
 * Perform msync on all vnodes under a mount point.  The mount point must
 * be locked.  This code is also responsible for lazy-freeing unreferenced
 * vnodes whose VM objects no longer contain pages.
 *
 * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.
 *
 * NOTE: XXX VOP_PUTPAGES and friends require that the vnode be locked,
 * but vnode_pager_putpages() doesn't lock the vnode.  We have to do it
 * way up in this high level function.
 */
static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data);
static int vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data);

void
vfs_msync(struct mount *mp, int flags)
{
        int vmsc_flags;

        vmsc_flags = VMSC_GETVP;
        if (flags != MNT_WAIT)
                vmsc_flags |= VMSC_NOWAIT;
        vmntvnodescan(mp, vmsc_flags, vfs_msync_scan1, vfs_msync_scan2,
                        (void *)flags);
}

/*
 * scan1 is a fast pre-check.  There could be hundreds of thousands of
 * vnodes, we cannot afford to do anything heavy weight until we have a
 * fairly good indication that there is work to do.
 */
static
int
vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data)
{
        int flags = (int)data;

        if ((vp->v_flag & VRECLAIMED) == 0) {
                if (vshouldfree(vp, 0))
                        return(0);      /* call scan2 */
                if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
                    (vp->v_flag & VOBJDIRTY) &&
                    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
                        return(0);      /* call scan2 */
                }
        }

        /*
         * do not call scan2, continue the loop
         */
        return(-1);
}

/*
 * This callback is handed a locked vnode.
 */
static
int
vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data)
{
        vm_object_t obj;
        int flags = (int)data;

        if (vp->v_flag & VRECLAIMED)
                return(0);

        if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
            (vp->v_flag & VOBJDIRTY)) {
                if (VOP_GETVOBJECT(vp, &obj) == 0) {
                        vm_object_page_clean(obj, 0, 0,
                            flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
                }
        }
        return(0);
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(struct vnode *vp, struct thread *td)
{
        return (VOP_CREATEVOBJECT(vp, td));
}
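
#if 0
/*
 * Illustrative sketch (not compiled in): vfs_msync() is driven with
 * MNT_NOWAIT for opportunistic background flushing (e.g. by the syncer)
 * and with MNT_WAIT when the caller needs the msync to complete.
 */
static void
example_msync_all_nowait(struct mount *mp)
{
        vfs_msync(mp, MNT_NOWAIT);
}
#endif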

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{
        lwkt_tokref ilock;

        lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
        if (vp->v_pollinfo.vpi_revents & events) {
                /*
                 * This leaves events we are not interested
                 * in available for the other process which
                 * presumably had requested them
                 * (otherwise they would never have been
                 * recorded).
                 */
                events &= vp->v_pollinfo.vpi_revents;
                vp->v_pollinfo.vpi_revents &= ~events;

                lwkt_reltoken(&ilock);
                return events;
        }
        vp->v_pollinfo.vpi_events |= events;
        selrecord(td, &vp->v_pollinfo.vpi_selinfo);
        lwkt_reltoken(&ilock);
        return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(struct vnode *vp, int events)
{
        lwkt_tokref ilock;

        lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
        if (vp->v_pollinfo.vpi_events & events) {
                /*
                 * We clear vpi_events so that we don't
                 * call selwakeup() twice if two events are
                 * posted before the polling process(es) is
                 * awakened.  This also ensures that we take at
                 * most one selwakeup() if the polling process
                 * is no longer interested.  However, it does
                 * mean that only one event can be noticed at
                 * a time.  (Perhaps we should only clear those
                 * event bits which we note?) XXX
                 */
                vp->v_pollinfo.vpi_events = 0;  /* &= ~events ??? */
                vp->v_pollinfo.vpi_revents |= events;
                selwakeup(&vp->v_pollinfo.vpi_selinfo);
        }
        lwkt_reltoken(&ilock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(struct vnode *vp)
{
        lwkt_tokref ilock;

        lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
        if (vp->v_pollinfo.vpi_events) {
                vp->v_pollinfo.vpi_events = 0;
                selwakeup(&vp->v_pollinfo.vpi_selinfo);
        }
        lwkt_reltoken(&ilock);
}

/*
 * Extract the dev_t from a VBLK or VCHR.  The vnode must have been opened
 * (or v_rdev might be NULL).
 */
dev_t
vn_todev(struct vnode *vp)
{
        if (vp->v_type != VBLK && vp->v_type != VCHR)
                return (NODEV);
        KKASSERT(vp->v_rdev != NULL);
        return (vp->v_rdev);
}
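
#if 0
/*
 * Illustrative sketch (not compiled in): a filesystem noting that a vnode
 * has become readable, waking any pollers that registered interest via
 * vn_pollrecord().  POLLIN/POLLRDNORM come from <sys/poll.h>.
 */
static void
example_note_readable(struct vnode *vp)
{
        vn_pollevent(vp, POLLIN | POLLRDNORM);
}
#endif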

/*
 * Check if vnode represents a disk device.  The vnode does not need to be
 * opened.
 */
int
vn_isdisk(struct vnode *vp, int *errp)
{
        dev_t dev;

        if (vp->v_type != VBLK && vp->v_type != VCHR) {
                if (errp != NULL)
                        *errp = ENOTBLK;
                return (0);
        }

        if ((dev = vp->v_rdev) == NULL)
                dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
        if (dev == NULL || dev == NODEV) {
                if (errp != NULL)
                        *errp = ENXIO;
                return (0);
        }
        if (dev_is_good(dev) == 0) {
                if (errp != NULL)
                        *errp = ENXIO;
                return (0);
        }
        if ((dev_dflags(dev) & D_DISK) == 0) {
                if (errp != NULL)
                        *errp = ENOTBLK;
                return (0);
        }
        if (errp != NULL)
                *errp = 0;
        return (1);
}

#ifdef DEBUG_VFS_LOCKS

void
assert_vop_locked(struct vnode *vp, const char *str)
{
        if (vp && IS_LOCKING_VFS(vp) && !VOP_ISLOCKED(vp, NULL)) {
                panic("%s: %p is not locked shared but should be", str, vp);
        }
}

void
assert_vop_unlocked(struct vnode *vp, const char *str)
{
        if (vp && IS_LOCKING_VFS(vp)) {
                if (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) {
                        panic("%s: %p is locked but should not be", str, vp);
                }
        }
}

#endif

int
vn_get_namelen(struct vnode *vp, int *namelen)
{
        int error, retval[2];

        error = VOP_PATHCONF(vp, _PC_NAME_MAX, retval);
        if (error)
                return (error);
        *namelen = *retval;
        return (0);
}

int
vop_write_dirent(int *error, struct uio *uio, ino_t d_ino, uint8_t d_type,
                uint16_t d_namlen, const char *d_name)
{
        struct dirent *dp;
        size_t len;

        len = _DIRENT_RECLEN(d_namlen);
        if (len > uio->uio_resid)
                return(1);

        dp = malloc(len, M_TEMP, M_WAITOK | M_ZERO);

        dp->d_ino = d_ino;
        dp->d_namlen = d_namlen;
        dp->d_type = d_type;
        bcopy(d_name, dp->d_name, d_namlen);

        *error = uiomove((caddr_t)dp, len, uio);

        free(dp, M_TEMP);

        return(0);
}
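
#if 0
/*
 * Illustrative sketch (not compiled in): a VOP_READDIR implementation
 * emits entries through vop_write_dirent() and stops when it returns
 * non-zero, meaning the uio has no room left.  example_emit() and its
 * fixed DT_REG type are hypothetical.
 */
static int
example_emit(struct uio *uio, ino_t ino, const char *name)
{
        int error;

        if (vop_write_dirent(&error, uio, ino, DT_REG,
                             strlen(name), name))
                return (EINVAL);        /* out of space in the uio */
        return (error);
}
#endif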