/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
 * $DragonFly: src/sys/kern/vfs_subr.c,v 1.118 2008/09/17 21:44:18 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

#include <sys/buf2.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

int numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
int vfs_fastdev = 1;
SYSCTL_INT(_vfs, OID_AUTO, fastdev, CTLFLAG_RW, &vfs_fastdev, 0, "");

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW,
	&reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW,
	&reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW,
	&reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW,
	&reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW,
	&reassignbufmethod, 0, "");

int	nfs_mount_type = -1;
static struct lwkt_token spechash_token;
struct nfs_public nfs_pub;	/* publicly exported FS */

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
	&desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist (struct netexport *nep);
static int	vfs_free_netcred (struct radix_node *rn, void *w);
static int	vfs_hang_addrlist (struct mount *mp, struct netexport *nep,
				   const struct export_args *argp);

extern int dev_ref_debug;

/*
 * Red black tree functions
 */
static int rb_buf_compare(struct buf *b1, struct buf *b2);
RB_GENERATE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, off_t, b_loffset);
RB_GENERATE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, off_t, b_loffset);

static int
rb_buf_compare(struct buf *b1, struct buf *b2)
{
	if (b1->b_loffset < b2->b_loffset)
		return(-1);
	if (b1->b_loffset > b2->b_loffset)
		return(1);
	return(0);
}
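
/*
 * Reader's note on the RB_SCAN pattern used throughout this file (a
 * sketch inferred from the call sites below, not from the tree
 * implementation itself): the compare callback partitions the tree
 * (negative = before the range of interest, 0 = inside it, positive =
 * beyond it), and the scan callback's return value accumulates into
 * RB_SCAN's result, with a negative return aborting the scan.  E.g.:
 *
 *	count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
 *			some_range_cmp, some_callback, &info);
 */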

/*
 * Returns non-zero if the vnode is a candidate for lazy msyncing.
 */
static __inline int
vshouldmsync(struct vnode *vp)
{
	if (vp->v_auxrefs != 0 || vp->v_sysref.refcnt > 0)
		return (0);		/* other holders */
	if (vp->v_object &&
	    (vp->v_object->ref_count || vp->v_object->resident_page_count)) {
		return (0);
	}
	return (1);
}

/*
 * Initialize the vnode management data structures.
 *
 * Called from vfsinit()
 */
void
vfs_subr_init(void)
{
	/*
	 * Desiredvnodes is kern.maxvnodes.  We want to scale it
	 * according to available system memory but we may also have
	 * to limit it based on available KVM, which is capped on 32 bit
	 * systems.
	 */
	desiredvnodes = min(maxproc + vmstats.v_page_count / 4,
			    KvaSize / (20 *
			    (sizeof(struct vm_object) + sizeof(struct vnode))));

	lwkt_token_init(&spechash_token);
}

/*
 * Knob to control the precision of file timestamps:
 *
 *   0 = seconds only; nanoseconds zeroed.
 *   1 = seconds and nanoseconds, accurate within 1/HZ.
 *   2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
	&timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{
	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rmajor = VNOVAL;
	vap->va_rminor = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
	vap->va_fsmid = VNOVAL;
	/* va_*_uuid fields are only valid if related flags are set */
}
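
/*
 * Illustrative use of vattr_null() (a sketch of the common VOP_SETATTR
 * idiom, not code from this file): initialize every attribute to VNOVAL
 * and then set only the fields being changed, e.g. to truncate a file:
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);
 *	va.va_size = 0;
 *	error = VOP_SETATTR(vp, &va, cred);
 */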

/*
 * Flush out and invalidate all buffers associated with a vnode.
 *
 * vp must be locked.
 */
static int vinvalbuf_bp(struct buf *bp, void *data);

struct vinvalbuf_bp_info {
	struct vnode *vp;
	int slptimeo;
	int lkflags;
	int flags;
};

void
vupdatefsmid(struct vnode *vp)
{
	atomic_set_int(&vp->v_flag, VFSMID);
}

int
vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
{
	struct vinvalbuf_bp_info info;
	int error;
	vm_object_t object;

	/*
	 * If we are being asked to save, call fsync to ensure that the inode
	 * is updated.
	 */
	if (flags & V_SAVE) {
		crit_enter();
		while (vp->v_track_write.bk_active) {
			vp->v_track_write.bk_waitflag = 1;
			error = tsleep(&vp->v_track_write, slpflag,
					"vinvlbuf", slptimeo);
			if (error) {
				crit_exit();
				return (error);
			}
		}
		if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
			crit_exit();
			if ((error = VOP_FSYNC(vp, MNT_WAIT)) != 0)
				return (error);
			crit_enter();

			/*
			 * Dirty bufs may be left or generated via races
			 * in circumstances where vinvalbuf() is called on
			 * a vnode not undergoing reclamation.  Only
			 * panic if we are trying to reclaim the vnode.
			 */
			if ((vp->v_flag & VRECLAIMED) &&
			    (vp->v_track_write.bk_active > 0 ||
			     !RB_EMPTY(&vp->v_rbdirty_tree))) {
				panic("vinvalbuf: dirty bufs");
			}
		}
		crit_exit();
	}
	crit_enter();
	info.slptimeo = slptimeo;
	info.lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL;
	if (slpflag & PCATCH)
		info.lkflags |= LK_PCATCH;
	info.flags = flags;
	info.vp = vp;

	/*
	 * Flush the buffer cache until nothing is left.
	 */
	while (!RB_EMPTY(&vp->v_rbclean_tree) ||
	       !RB_EMPTY(&vp->v_rbdirty_tree)) {
		error = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, NULL,
				vinvalbuf_bp, &info);
		if (error == 0) {
			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
					vinvalbuf_bp, &info);
		}
	}

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		while (vp->v_track_write.bk_active > 0) {
			vp->v_track_write.bk_waitflag = 1;
			tsleep(&vp->v_track_write, 0, "vnvlbv", 0);
		}
		if ((object = vp->v_object) != NULL) {
			while (object->paging_in_progress)
				vm_object_pip_sleep(object, "vnvlbx");
		}
	} while (vp->v_track_write.bk_active > 0);

	crit_exit();

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if ((object = vp->v_object) != NULL) {
		vm_object_page_remove(object, 0, 0,
			(flags & V_SAVE) ? TRUE : FALSE);
	}

	if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree))
		panic("vinvalbuf: flush failed");
	if (!RB_EMPTY(&vp->v_rbhash_tree))
		panic("vinvalbuf: flush failed, buffers still present");
	return (0);
}
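
/*
 * Illustrative call (a sketch of the common reclaim-time idiom; the
 * V_SAVE form matches how vclean_vxlocked() below uses it): a filesystem
 * tearing down an inode typically flushes and saves any dirty buffers
 * first,
 *
 *	error = vinvalbuf(vp, V_SAVE, 0, 0);
 *
 * while passing flags == 0 simply discards the buffers.
 */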

static int
vinvalbuf_bp(struct buf *bp, void *data)
{
	struct vinvalbuf_bp_info *info = data;
	int error;

	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		error = BUF_TIMELOCK(bp, info->lkflags,
				     "vinvalbuf", info->slptimeo);
		if (error == 0) {
			BUF_UNLOCK(bp);
			error = ENOLCK;
		}
		if (error == ENOLCK)
			return(0);
		return (-error);
	}

	KKASSERT(bp->b_vp == info->vp);

	/*
	 * XXX Since there are no node locks for NFS, I
	 * believe there is a slight chance that a delayed
	 * write will occur while sleeping just above, so
	 * check for it.  Note that vfs_bio_awrite expects
	 * buffers to reside on a queue, while bwrite() and
	 * brelse() do not.
	 *
	 * NOTE: NO B_LOCKED CHECK.  Also no buf_checkwrite()
	 * check.  This code will write out the buffer, period.
	 */
	if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
	    (info->flags & V_SAVE)) {
		if (bp->b_vp == info->vp) {
			if (bp->b_flags & B_CLUSTEROK) {
				vfs_bio_awrite(bp);
			} else {
				bremfree(bp);
				bp->b_flags |= B_ASYNC;
				bwrite(bp);
			}
		} else {
			bremfree(bp);
			bwrite(bp);
		}
	} else if (info->flags & V_SAVE) {
		/*
		 * Cannot set B_NOCACHE on a clean buffer as this will
		 * destroy the VM backing store which might actually
		 * be dirty (and unsynchronized).
		 */
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
	}
	return(0);
}
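
/*
 * Illustrative call for vtruncbuf() below (a sketch of the usual
 * VOP_SETATTR truncation path, not code from this file): after the
 * filesystem shrinks the inode's size it discards buffers beyond the
 * new EOF,
 *
 *	error = vtruncbuf(vp, new_size, fs_block_size);
 */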

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * The vnode must be locked.
 */
static int vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
static int vtruncbuf_bp_trunc(struct buf *bp, void *data);
static int vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
static int vtruncbuf_bp_metasync(struct buf *bp, void *data);

int
vtruncbuf(struct vnode *vp, off_t length, int blksize)
{
	off_t truncloffset;
	int count;
	const char *filename;

	/*
	 * Round up to the *next* block, then destroy the buffers in question.
	 * Since we are only removing some of the buffers we must rely on the
	 * scan count to determine whether a loop is necessary.
	 */
	if ((count = (int)(length % blksize)) != 0)
		truncloffset = length + (blksize - count);
	else
		truncloffset = length;

	crit_enter();
	do {
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &truncloffset);
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &truncloffset);
	} while(count);

	/*
	 * For safety, fsync any remaining metadata if the file is not being
	 * truncated to 0.  Since the metadata does not represent the entire
	 * dirty list we have to rely on the hit count to ensure that we get
	 * all of it.
	 */
	if (length > 0) {
		do {
			count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
					vtruncbuf_bp_metasync_cmp,
					vtruncbuf_bp_metasync, vp);
		} while (count);
	}

	/*
	 * Clean out any left over VM backing store.
	 */
	crit_exit();

	vnode_pager_setsize(vp, length);

	crit_enter();

	/*
	 * It is possible to have in-progress I/O from buffers that were
	 * not part of the truncation.  This should not happen if we
	 * are truncating to 0-length.
	 */
	filename = TAILQ_FIRST(&vp->v_namecache) ?
		   TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";

	while ((count = vp->v_track_write.bk_active) > 0) {
		vp->v_track_write.bk_waitflag = 1;
		tsleep(&vp->v_track_write, 0, "vbtrunc", 0);
		if (length == 0) {
			kprintf("Warning: vtruncbuf(): Had to wait for "
				"%d buffer I/Os to finish in %s\n",
				count, filename);
		}
	}

	/*
	 * Make sure no buffers were instantiated while we were trying
	 * to clean out the remaining VM pages.  This could occur due
	 * to busy dirty VM pages being flushed out to disk.
	 */
	do {
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &truncloffset);
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				vtruncbuf_bp_trunc_cmp,
				vtruncbuf_bp_trunc, &truncloffset);
		if (count) {
			kprintf("Warning: vtruncbuf(): Had to re-clean %d "
				"left over buffers in %s\n", count, filename);
		}
	} while(count);

	crit_exit();

	return (0);
}
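
/*
 * Worked example of the rounding in vtruncbuf() above (illustrative):
 * with length = 5000 and blksize = 4096, length % blksize = 904, so
 * truncloffset = 5000 + (4096 - 904) = 8192.  Buffers at logical offsets
 * >= 8192 are destroyed while the partially-valid block at offset 4096
 * is kept.
 */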

/*
 * The callback buffer is beyond the new file EOF and must be destroyed.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
static
int
vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset >= *(off_t *)data)
		return(0);
	return(-1);
}

static
int
vtruncbuf_bp_trunc(struct buf *bp, void *data)
{
	/*
	 * Do not try to use a buffer we cannot immediately lock, but sleep
	 * anyway to prevent a livelock.  The code will loop until all buffers
	 * can be acted upon.
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
			BUF_UNLOCK(bp);
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
	}
	return(1);
}

/*
 * Fsync all meta-data after truncating a file to be non-zero.  Only metadata
 * blocks (with a negative loffset) are scanned.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
static int
vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset < 0)
		return(0);
	return(1);
}

static int
vtruncbuf_bp_metasync(struct buf *bp, void *data)
{
	struct vnode *vp = data;

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not try to use a buffer we cannot immediately lock,
		 * but sleep anyway to prevent a livelock.  The code will
		 * loop until all buffers can be acted upon.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
			if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
				BUF_UNLOCK(bp);
		} else {
			bremfree(bp);
			if (bp->b_vp == vp) {
				bp->b_flags |= B_ASYNC;
			} else {
				bp->b_flags &= ~B_ASYNC;
			}
			bwrite(bp);
		}
		return(1);
	} else {
		return(0);
	}
}

/*
 * vfsync - implements a multipass fsync on a file which understands
 * dependencies and meta-data.  The passed vnode must be locked.  The
 * waitfor argument may be MNT_WAIT, MNT_NOWAIT, or MNT_LAZY.
 *
 * When fsyncing data asynchronously just do one consolidated pass starting
 * with the most negative block number.  This may not get all the data due
 * to dependencies.
 *
 * When fsyncing data synchronously do a data pass, then a metadata pass,
 * then do additional data+metadata passes to try to get all the data out.
 */
static int vfsync_wait_output(struct vnode *vp,
		    int (*waitoutput)(struct vnode *, struct thread *));
static int vfsync_data_only_cmp(struct buf *bp, void *data);
static int vfsync_meta_only_cmp(struct buf *bp, void *data);
static int vfsync_lazy_range_cmp(struct buf *bp, void *data);
static int vfsync_bp(struct buf *bp, void *data);

struct vfsync_info {
	struct vnode *vp;
	int synchronous;
	int syncdeps;
	int lazycount;
	int lazylimit;
	int skippedbufs;
	int (*checkdef)(struct buf *);
};
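
/*
 * Illustrative caller for vfsync() below (a sketch of a typical
 * filesystem VOP_FSYNC, not code from this file; xxx_checkdef and
 * xxx_waitoutput are hypothetical helpers supplied by the fs):
 *
 *	static int
 *	xxx_fsync(struct vop_fsync_args *ap)
 *	{
 *		return (vfsync(ap->a_vp, ap->a_waitfor, 1,
 *			       xxx_checkdef, xxx_waitoutput));
 *	}
 */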

int
vfsync(struct vnode *vp, int waitfor, int passes,
	int (*checkdef)(struct buf *),
	int (*waitoutput)(struct vnode *, struct thread *))
{
	struct vfsync_info info;
	int error;

	bzero(&info, sizeof(info));
	info.vp = vp;
	if ((info.checkdef = checkdef) == NULL)
		info.syncdeps = 1;

	crit_enter_id("vfsync");

	switch(waitfor) {
	case MNT_LAZY:
		/*
		 * Lazy (filesystem syncer typical): asynchronous, plus limit
		 * the number of data (not meta) pages we try to flush to 1MB.
		 * A non-zero return means that the lazy limit was reached.
		 */
		info.lazylimit = 1024 * 1024;
		info.syncdeps = 1;
		error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				vfsync_lazy_range_cmp, vfsync_bp, &info);
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
			vfsync_meta_only_cmp, vfsync_bp, &info);
		if (error == 0)
			vp->v_lazyw = 0;
		else if (!RB_EMPTY(&vp->v_rbdirty_tree))
			vn_syncer_add_to_worklist(vp, 1);
		error = 0;
		break;
	case MNT_NOWAIT:
		/*
		 * Asynchronous.  Do a data-only pass and a meta-only pass.
		 */
		info.syncdeps = 1;
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
			vfsync_bp, &info);
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_meta_only_cmp,
			vfsync_bp, &info);
		error = 0;
		break;
	default:
		/*
		 * Synchronous.  Do a data-only pass, then a meta-data+data
		 * pass, then additional integrated passes to try to get
		 * all the dependencies flushed.
		 */
		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
			vfsync_bp, &info);
		error = vfsync_wait_output(vp, waitoutput);
		if (error == 0) {
			info.skippedbufs = 0;
			RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
				vfsync_bp, &info);
			error = vfsync_wait_output(vp, waitoutput);
			if (info.skippedbufs)
				kprintf("Warning: vfsync skipped %d dirty "
					"bufs in pass2!\n", info.skippedbufs);
		}
		while (error == 0 && passes > 0 &&
		       !RB_EMPTY(&vp->v_rbdirty_tree)) {
			if (--passes == 0) {
				info.synchronous = 1;
				info.syncdeps = 1;
			}
			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
					vfsync_bp, &info);
			if (error < 0)
				error = -error;
			info.syncdeps = 1;
			if (error == 0)
				error = vfsync_wait_output(vp, waitoutput);
		}
		break;
	}
	crit_exit_id("vfsync");
	return(error);
}

static int
vfsync_wait_output(struct vnode *vp,
		   int (*waitoutput)(struct vnode *, struct thread *))
{
	int error = 0;

	while (vp->v_track_write.bk_active) {
		vp->v_track_write.bk_waitflag = 1;
		tsleep(&vp->v_track_write, 0, "fsfsn", 0);
	}
	if (waitoutput)
		error = waitoutput(vp, curthread);
	return(error);
}

static int
vfsync_data_only_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset < 0)
		return(-1);
	return(0);
}

static int
vfsync_meta_only_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset < 0)
		return(0);
	return(1);
}

static int
vfsync_lazy_range_cmp(struct buf *bp, void *data)
{
	struct vfsync_info *info = data;
	if (bp->b_loffset < info->vp->v_lazyw)
		return(-1);
	return(0);
}
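
/*
 * Reader's summary of the comparators above (descriptive, derived from
 * the code itself): data_only selects loffset >= 0 (file data),
 * meta_only selects loffset < 0 (metadata by convention), and lazy_range
 * resumes at v_lazyw, so successive MNT_LAZY passes rotate through the
 * file instead of restarting at offset 0 each time.
 */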

static int
vfsync_bp(struct buf *bp, void *data)
{
	struct vfsync_info *info = data;
	struct vnode *vp = info->vp;
	int error;

	/*
	 * If syncdeps is not set we do not try to write buffers which have
	 * dependencies.
	 */
	if (!info->synchronous && info->syncdeps == 0 && info->checkdef(bp))
		return(0);

	/*
	 * Ignore buffers that we cannot immediately lock.  XXX
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		kprintf("Warning: vfsync_bp skipping dirty buffer %p\n", bp);
		++info->skippedbufs;
		return(0);
	}
	if ((bp->b_flags & B_DELWRI) == 0)
		panic("vfsync_bp: buffer not dirty");
	if (vp != bp->b_vp)
		panic("vfsync_bp: buffer vp mismatch");

	/*
	 * B_NEEDCOMMIT (primarily used by NFS) is a state where the buffer
	 * has been written but an additional handshake with the device
	 * is required before we can dispose of the buffer.  We have no idea
	 * how to do this so we have to skip these buffers.
	 */
	if (bp->b_flags & B_NEEDCOMMIT) {
		BUF_UNLOCK(bp);
		return(0);
	}

	/*
	 * Ask bioops if it is ok to sync
	 */
	if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) {
		bremfree(bp);
		brelse(bp);
		return(0);
	}

	if (info->synchronous) {
		/*
		 * Synchronous flushing.  An error may be returned.
		 */
		bremfree(bp);
		crit_exit_id("vfsync");
		error = bwrite(bp);
		crit_enter_id("vfsync");
	} else {
		/*
		 * Asynchronous flushing.  A negative return value simply
		 * stops the scan and is not considered an error.  We use
		 * this to support limited MNT_LAZY flushes.
		 */
		vp->v_lazyw = bp->b_loffset;
		if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
			info->lazycount += vfs_bio_awrite(bp);
		} else {
			info->lazycount += bp->b_bufsize;
			bremfree(bp);
			crit_exit_id("vfsync");
			bawrite(bp);
			crit_enter_id("vfsync");
		}
		if (info->lazylimit && info->lazycount >= info->lazylimit)
			error = 1;
		else
			error = 0;
	}
	return(-error);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{
	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
	KKASSERT((bp->b_flags & (B_HASHED|B_DELWRI|B_VNCLEAN|B_VNDIRTY)) == 0);

	vhold(vp);
	/*
	 * Insert onto list for new vnode.
	 */
	crit_enter();
	bp->b_vp = vp;
	bp->b_flags |= B_HASHED;
	if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp))
		panic("bgetvp: dup lblk vp %p bp %p", vp, bp);

	bp->b_flags |= B_VNCLEAN;
	if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp))
		panic("bgetvp: dup lblk/clean vp %p bp %p", vp, bp);
	crit_exit();
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct vnode *vp;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	crit_enter();
	if (bp->b_flags & (B_VNDIRTY | B_VNCLEAN)) {
		if (bp->b_flags & B_VNDIRTY)
			buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
		else
			buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
		bp->b_flags &= ~(B_VNDIRTY | B_VNCLEAN);
	}
	if (bp->b_flags & B_HASHED) {
		buf_rb_hash_RB_REMOVE(&vp->v_rbhash_tree, bp);
		bp->b_flags &= ~B_HASHED;
	}
	if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	crit_exit();
	bp->b_vp = NULL;
	vdrop(vp);
}
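
/*
 * Reader's sketch of the association lifecycle (illustrative): bgetvp()
 * is typically invoked when a buffer is instantiated for a vnode (e.g.
 * from getblk()), reassignbuf() below moves the buffer between the clean
 * and dirty trees as B_DELWRI changes, and the association ends with
 * brelvp().  A buffer therefore lives in v_rbhash_tree for its whole
 * association and in exactly one of v_rbclean_tree or v_rbdirty_tree.
 */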

/*
 * Reassign the buffer to the proper clean/dirty list based on B_DELWRI.
 * This routine is called when the state of the B_DELWRI bit is changed.
 */
void
reassignbuf(struct buf *bp)
{
	struct vnode *vp = bp->b_vp;
	int delay;

	KKASSERT(vp != NULL);
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	crit_enter();
	if (bp->b_flags & B_DELWRI) {
		/*
		 * Move to the dirty list, add the vnode to the worklist
		 */
		if (bp->b_flags & B_VNCLEAN) {
			buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
			bp->b_flags &= ~B_VNCLEAN;
		}
		if ((bp->b_flags & B_VNDIRTY) == 0) {
			if (buf_rb_tree_RB_INSERT(&vp->v_rbdirty_tree, bp)) {
				panic("reassignbuf: dup lblk vp %p bp %p",
				      vp, bp);
			}
			bp->b_flags |= B_VNDIRTY;
		}
		if ((vp->v_flag & VONWORKLST) == 0) {
			switch (vp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
			case VBLK:
				if (vp->v_rdev &&
				    vp->v_rdev->si_mountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(vp, delay);
		}
	} else {
		/*
		 * Move to the clean list, remove the vnode from the worklist
		 * if no dirty blocks remain.
		 */
		if (bp->b_flags & B_VNDIRTY) {
			buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
			bp->b_flags &= ~B_VNDIRTY;
		}
		if ((bp->b_flags & B_VNCLEAN) == 0) {
			if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) {
				panic("reassignbuf: dup lblk vp %p bp %p",
				      vp, bp);
			}
			bp->b_flags |= B_VNCLEAN;
		}
		if ((vp->v_flag & VONWORKLST) &&
		    RB_EMPTY(&vp->v_rbdirty_tree)) {
			vp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(vp, v_synclist);
		}
	}
	crit_exit();
}

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 */
int
bdevvp(cdev_t dev, struct vnode **vpp)
{
	struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NULL) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getspecialvnode(VT_NON, NULL, &spec_vnode_vops_p, &nvp, 0, 0);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	vp->v_umajor = dev->si_umajor;
	vp->v_uminor = dev->si_uminor;
	vx_unlock(vp);
	*vpp = vp;
	return (0);
}

int
v_associate_rdev(struct vnode *vp, cdev_t dev)
{
	lwkt_tokref ilock;

	if (dev == NULL)
		return(ENXIO);
	if (dev_is_good(dev) == 0)
		return(ENXIO);
	KKASSERT(vp->v_rdev == NULL);
	if (dev_ref_debug)
		kprintf("Z1");
	vp->v_rdev = reference_dev(dev);
	lwkt_gettoken(&ilock, &spechash_token);
	SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_cdevnext);
	lwkt_reltoken(&ilock);
	return(0);
}

void
v_release_rdev(struct vnode *vp)
{
	lwkt_tokref ilock;
	cdev_t dev;

	if ((dev = vp->v_rdev) != NULL) {
		lwkt_gettoken(&ilock, &spechash_token);
		SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_cdevnext);
		vp->v_rdev = NULL;
		release_dev(dev);
		lwkt_reltoken(&ilock);
	}
}

/*
 * Add a vnode to the alias list hung off the cdev_t.  We only associate
 * the device number with the vnode.  The actual device is not associated
 * until the vnode is opened (usually in spec_open()), and will be
 * disassociated on last close.
 */
void
addaliasu(struct vnode *nvp, int x, int y)
{
	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	nvp->v_umajor = x;
	nvp->v_uminor = y;
}

/*
 * Simple call that a filesystem can make to try to get rid of a
 * vnode.  It will fail if anyone is referencing the vnode (including
 * the caller).
 *
 * The filesystem can check whether its in-memory inode structure still
 * references the vp on return.
 */
void
vclean_unlocked(struct vnode *vp)
{
	vx_get(vp);
	if (sysref_isactive(&vp->v_sysref) == 0)
		vgone_vxlocked(vp);
	vx_put(vp);
}
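
/*
 * Illustrative use of vclean_unlocked() (a sketch, not code from this
 * file; ip->i_vnode is a hypothetical inode field): a filesystem scanning
 * its in-memory inodes can opportunistically try to free an idle vnode
 * and then check whether the inode still points at one:
 *
 *	vclean_unlocked(ip->i_vnode);
 *	if (ip->i_vnode == NULL)
 *		;	// reclaimed, inode can be torn down
 */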

/*
 * Disassociate a vnode from its underlying filesystem.
 *
 * The vnode must be VX locked and referenced.  In all normal situations
 * there are no active references.  If vclean_vxlocked() is called while
 * there are active references, the vnode is being ripped out and we have
 * to call VOP_CLOSE() as appropriate before we can reclaim it.
 */
void
vclean_vxlocked(struct vnode *vp, int flags)
{
	int active;
	int n;
	vm_object_t object;

	/*
	 * If the vnode has already been reclaimed we have nothing to do.
	 */
	if (vp->v_flag & VRECLAIMED)
		return;
	vp->v_flag |= VRECLAIMED;

	/*
	 * Scrap the vfs cache
	 */
	while (cache_inval_vp(vp, 0) != 0) {
		kprintf("Warning: vnode %p clean/cache_resolution "
			"race detected\n", vp);
		tsleep(vp, 0, "vclninv", 2);
	}

	/*
	 * Check to see if the vnode is in use.  If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	active = sysref_isactive(&vp->v_sysref);

	/*
	 * Clean out any buffers associated with the vnode and destroy its
	 * object, if it has one.
	 */
	vinvalbuf(vp, V_SAVE, 0, 0);

	/*
	 * If purging an active vnode (typically during a forced unmount
	 * or reboot), it must be closed and deactivated before being
	 * reclaimed.  This isn't really all that safe, but what can
	 * we do? XXX.
	 *
	 * Note that neither of these routines unlocks the vnode.
	 */
	if (active && (flags & DOCLOSE)) {
		while ((n = vp->v_opencount) != 0) {
			if (vp->v_writecount)
				VOP_CLOSE(vp, FWRITE|FNONBLOCK);
			else
				VOP_CLOSE(vp, FNONBLOCK);
			if (vp->v_opencount == n) {
				kprintf("Warning: unable to force-close"
					" vnode %p\n", vp);
				break;
			}
		}
	}

	/*
	 * If the vnode has not been deactivated, deactivate it.  Deactivation
	 * can create new buffers and VM pages so we have to call vinvalbuf()
	 * again to make sure they all get flushed.
	 *
	 * This can occur if a file with a link count of 0 needs to be
	 * truncated.
	 */
	if ((vp->v_flag & VINACTIVE) == 0) {
		vp->v_flag |= VINACTIVE;
		VOP_INACTIVE(vp);
		vinvalbuf(vp, V_SAVE, 0, 0);
	}

	/*
	 * If the vnode has an object, destroy it.
	 */
	if ((object = vp->v_object) != NULL) {
		if (object->ref_count == 0) {
			if ((object->flags & OBJ_DEAD) == 0)
				vm_object_terminate(object);
		} else {
			vm_pager_deallocate(object);
		}
		vp->v_flag &= ~VOBJBUF;
	}
	KKASSERT((vp->v_flag & VOBJBUF) == 0);

	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp))
		panic("vclean: cannot reclaim");

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_ops = &dead_vnode_vops_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;

	/*
	 * If we are destroying an active vnode, reactivate it now that
	 * we have reassociated it with deadfs.  This prevents the system
	 * from crashing on the vnode due to it being unexpectedly marked
	 * as inactive or reclaimed.
	 */
	if (active && (flags & DOCLOSE)) {
		vp->v_flag &= ~(VINACTIVE|VRECLAIMED);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 *
 * The vnode must be referenced and vx_lock()'d
 *
 * revoke { struct vnode *a_vp, int a_flags }
 */
int
vop_stdrevoke(struct vop_revoke_args *ap)
{
	struct vnode *vp, *vq;
	lwkt_tokref ilock;
	cdev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;

	/*
	 * If the vnode is already dead don't try to revoke it
	 */
	if (vp->v_flag & VRECLAIMED)
		return (0);

	/*
	 * If the vnode has a device association, scrap all vnodes associated
	 * with the device.  Don't let the device disappear on us while we
	 * are scrapping the vnodes.
	 *
	 * The passed vp will probably show up in the list, do not VX lock
	 * it twice!
	 */
	if (vp->v_type != VCHR)
		return(0);
	if ((dev = vp->v_rdev) == NULL) {
		if ((dev = get_dev(vp->v_umajor, vp->v_uminor)) == NULL)
			return(0);
	}
	reference_dev(dev);
	lwkt_gettoken(&ilock, &spechash_token);
	while ((vq = SLIST_FIRST(&dev->si_hlist)) != NULL) {
		if (vp != vq)
			vx_get(vq);
		if (vq == SLIST_FIRST(&dev->si_hlist))
			vgone_vxlocked(vq);
		if (vp != vq)
			vx_put(vq);
	}
	lwkt_reltoken(&ilock);
	release_dev(dev);
	return (0);
}

/*
 * This is called when the object underlying a vnode is being destroyed,
 * such as in a remove().  Try to recycle the vnode immediately if the
 * only active reference is our reference.
 *
 * Directory vnodes in the namecache with children cannot be immediately
 * recycled because numerous VOP_N*() ops require them to be stable.
 */
int
vrecycle(struct vnode *vp)
{
	if (vp->v_sysref.refcnt <= 1) {
		if (cache_inval_vp_nonblock(vp))
			return(0);
		vgone_vxlocked(vp);
		return (1);
	}
	return (0);
}

/*
 * Return the maximum I/O size allowed for strategy calls on VP.
 *
 * If vp is VCHR or VBLK we dive the device, otherwise we use
 * the vp's mount info.
 */
int
vmaxiosize(struct vnode *vp)
{
	if (vp->v_type == VBLK || vp->v_type == VCHR) {
		return(vp->v_rdev->si_iosize_max);
	} else {
		return(vp->v_mount->mnt_iosize_max);
	}
}
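
/*
 * Illustrative use of vrecycle() above (a sketch of the classic
 * VOP_INACTIVE idiom, not code from this file; ip->i_nlink is a
 * hypothetical inode field): when the last link to a file is gone the
 * filesystem can recycle the vnode immediately instead of leaving it
 * cached:
 *
 *	if (ip->i_nlink == 0)
 *		vrecycle(vp);
 */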

/*
 * Eliminate all activity associated with a vnode in preparation for reuse.
 *
 * The vnode must be VX locked and refd and will remain VX locked and refd
 * on return.  This routine may be called with the vnode in any state, as
 * long as it is VX locked.  The vnode will be cleaned out and marked
 * VRECLAIMED but will not actually be reused until all existing refs and
 * holds go away.
 *
 * NOTE: This routine may be called on a vnode which has not yet been
 * deactivated (VOP_INACTIVE), or on a vnode which has already been
 * reclaimed.
 *
 * This routine is not responsible for placing us back on the freelist.
 * Instead, it happens automatically when the caller releases the VX lock
 * (assuming there aren't any other references).
 */
void
vgone_vxlocked(struct vnode *vp)
{
	/*
	 * assert that the VX lock is held.  This is an absolute requirement
	 * now for vgone_vxlocked() to be called.
	 */
	KKASSERT(vp->v_lock.lk_exclusivecount == 1);

	/*
	 * Clean out the filesystem specific data and set the VRECLAIMED
	 * bit.  Also deactivate the vnode if necessary.
	 */
	vclean_vxlocked(vp, DOCLOSE);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, NULL);

	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.  This should normally only occur if a vnode is
	 * being revoked as the device should otherwise have been released
	 * naturally.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
		v_release_rdev(vp);
	}

	/*
	 * Set us to VBAD
	 */
	vp->v_type = VBAD;
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(cdev_t dev, enum vtype type, struct vnode **vpp)
{
	lwkt_tokref ilock;
	struct vnode *vp;

	lwkt_gettoken(&ilock, &spechash_token);
	SLIST_FOREACH(vp, &dev->si_hlist, v_cdevnext) {
		if (type == vp->v_type) {
			*vpp = vp;
			lwkt_reltoken(&ilock);
			return (1);
		}
	}
	lwkt_reltoken(&ilock);
	return (0);
}

/*
 * Calculate the total number of references to a special device.  This
 * routine may only be called for VBLK and VCHR vnodes since v_rdev is
 * an overloaded field.  Since udev2dev can now return NULL, we have
 * to check for a NULL v_rdev.
 */
int
count_dev(cdev_t dev)
{
	lwkt_tokref ilock;
	struct vnode *vp;
	int count = 0;

	if (SLIST_FIRST(&dev->si_hlist)) {
		lwkt_gettoken(&ilock, &spechash_token);
		SLIST_FOREACH(vp, &dev->si_hlist, v_cdevnext) {
			if (vp->v_sysref.refcnt > 0)
				count += vp->v_sysref.refcnt;
		}
		lwkt_reltoken(&ilock);
	}
	return(count);
}

int
count_udev(int x, int y)
{
	cdev_t dev;

	if ((dev = get_dev(x, y)) == NULL)
		return(0);
	return(count_dev(dev));
}

int
vcount(struct vnode *vp)
{
	if (vp->v_rdev == NULL)
		return(0);
	return(count_dev(vp->v_rdev));
}

/*
 * Initialize VMIO for a vnode.  This routine MUST be called before a
 * VFS can issue buffer cache ops on a vnode.  It is typically called
 * when a vnode is initialized from its inode.
 */
int
vinitvmio(struct vnode *vp, off_t filesize)
{
	vm_object_t object;
	int error = 0;

retry:
	if ((object = vp->v_object) == NULL) {
		object = vnode_pager_alloc(vp, filesize, 0, 0);
		/*
		 * Dereference the reference we just created.  This assumes
		 * that the object is associated with the vp.
		 */
		object->ref_count--;
		vrele(vp);
	} else {
		if (object->flags & OBJ_DEAD) {
			vn_unlock(vp);
			vm_object_dead_sleep(object, "vodead");
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			goto retry;
		}
	}
	KASSERT(vp->v_object != NULL, ("vinitvmio: NULL object"));
	vp->v_flag |= VOBJBUF;
	return (error);
}
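
/*
 * Illustrative call (a sketch of the usual inode-initialization path,
 * not code from this file; ip->i_size is a hypothetical inode field):
 * once a regular file's size is known the VFS enables VMIO on the new
 * vnode, e.g.
 *
 *	if (vp->v_type == VREG)
 *		error = vinitvmio(vp, ip->i_size);
 */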

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(char *label, struct vnode *vp)
{
	char buf[96];

	if (label != NULL)
		kprintf("%s: %p: ", label, (void *)vp);
	else
		kprintf("%p: ", (void *)vp);
	kprintf("type %s, sysrefs %d, writecount %d, holdcnt %d,",
		typename[vp->v_type],
		vp->v_sysref.refcnt, vp->v_writecount, vp->v_auxrefs);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VPFSROOT)
		strcat(buf, "|VPFSROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		kprintf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		kprintf("\n");
	} else {
		kprintf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
#include <ddb/ddb.h>

static int db_show_locked_vnodes(struct mount *mp, void *data);

/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	kprintf("Locked vnodes\n");
	mountlist_scan(db_show_locked_vnodes, NULL,
		       MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
}

static int
db_show_locked_vnodes(struct mount *mp, void *data __unused)
{
	struct vnode *vp;

	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
		if (vn_islocked(vp))
			vprint((char *)0, vp);
	}
	return(0);
}
#endif
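
/*
 * Usage note (assuming the standard DDB "show" command naming): the
 * command registered above is invoked from the kernel debugger prompt as
 *
 *	db> show lockedvnodes
 */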

/*
 * Top level filesystem related information gathering.
 */
static int	sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);

static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;
	int maxtypenum;

#if 1 || defined(COMPAT_PRELITE2)
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

#ifdef notyet
	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		vfsp = vfsconf_find_by_typenum(name[0]);
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
#endif
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		maxtypenum = vfsconf_get_maxtypenum();
		return (SYSCTL_OUT(req, &maxtypenum, sizeof(maxtypenum)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		vfsp = vfsconf_find_by_typenum(name[2]);
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	}
	return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	"Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf_iter(struct vfsconf *vfsp, void *data)
{
	int error;
	struct ovfsconf ovfs;
	struct sysctl_req *req = (struct sysctl_req*) data;

	bzero(&ovfs, sizeof(ovfs));
	ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
	strcpy(ovfs.vfc_name, vfsp->vfc_name);
	ovfs.vfc_index = vfsp->vfc_typenum;
	ovfs.vfc_refcount = vfsp->vfc_refcount;
	ovfs.vfc_flags = vfsp->vfc_flags;
	error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
	if (error)
		return error;	/* abort iteration with error code */
	else
		return 0;	/* continue iterating with next element */
}

static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
	return vfsconf_each(sysctl_ovfs_conf_iter, (void*)req);
}

#endif /* 1 || COMPAT_PRELITE2 */

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(struct vnode *vp)
{
	cdev_t dev;

	if ((dev = vp->v_rdev) == NULL) {
		if (vp->v_type != VBLK)
			dev = get_dev(vp->v_umajor, vp->v_uminor);
	}
	if (dev != NULL && dev->si_mountpoint)
		return (EBUSY);
	return (0);
}

/*
 * Unmount all filesystems.  The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */

static int vfs_umountall_callback(struct mount *mp, void *data);

void
vfs_unmountall(void)
{
	int count;

	do {
		count = mountlist_scan(vfs_umountall_callback,
				       NULL, MNTSCAN_REVERSE|MNTSCAN_NOBUSY);
	} while (count);
}

static
int
vfs_umountall_callback(struct mount *mp, void *data)
{
	int error;

	error = dounmount(mp, MNT_FORCE);
	if (error) {
		mountlist_remove(mp);
		kprintf("unmount of filesystem mounted from %s failed (",
			mp->mnt_stat.f_mntfromname);
		if (error == EBUSY)
			kprintf("BUSY)\n");
		else
			kprintf("%d)\n", error);
	}
	return(1);
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
		  const struct export_args *argp)
{
	struct netcred *np;
	struct radix_node_head *rnh;
	int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}

	if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN)
		return (EINVAL);
	if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN)
		return (EINVAL);

	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) kmalloc(i, M_NETADDR, M_WAITOK | M_ZERO);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not used,
		 * do so on demand here
		 */
		SLIST_FOREACH(dom, &domains, dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((char *) saddr, (char *) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	kfree(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
static int
vfs_free_netcred(struct radix_node *rn, void *w)
{
	struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	kfree((caddr_t) rn, M_NETADDR);
	return (0);
}
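
/*
 * Reader's sketch of how the export machinery ties together
 * (illustrative, not code from this file; ump and args are hypothetical
 * per-fs structures): a filesystem handling an MNT_UPDATE mount with
 * export arguments typically forwards them as
 *
 *	error = vfs_export(mp, &ump->um_export, &args->export);
 *
 * where um_export is the per-mount struct netexport.  vfs_export() below
 * calls vfs_hang_addrlist() above to build the per-AF radix trees that
 * vfs_export_lookup() later consults on each NFS request.
 */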

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(struct netexport *nep)
{
	int i;
	struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			kfree((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(struct mount *mp, struct netexport *nep,
	   const struct export_args *argp)
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}


/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055)
 */
int
vfs_setpublicfs(struct mount *mp, struct netexport *nep,
		const struct export_args *argp)
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info, the FS is
	 * no longer exported. May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		int namelen;

		error = vn_get_namelen(rvp, &namelen);
		if (error)
			return (error);
		MALLOC(nfs_pub.np_index, char *, namelen, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    namelen, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}
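
/*
 * Illustrative use of vfs_export_lookup() below (a sketch of the usual
 * NFS server check, not code from this file): given the client's
 * sockaddr nam, the server validates access and picks up the export's
 * credential:
 *
 *	np = vfs_export_lookup(mp, nep, nam);
 *	if (np == NULL)
 *		return (EACCES);
 */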

struct netcred *
vfs_export_lookup(struct mount *mp, struct netexport *nep,
		  struct sockaddr *nam)
{
	struct netcred *np;
	struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((char *)saddr,
							      rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Perform msync on all vnodes under a mount point.  The mount point must
 * be locked.  This code is also responsible for lazy-freeing unreferenced
 * vnodes whose VM objects no longer contain pages.
 *
 * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.
 *
 * NOTE: XXX VOP_PUTPAGES and friends requires that the vnode be locked,
 * but vnode_pager_putpages() doesn't lock the vnode.  We have to do it
 * way up in this high level function.
 */
static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data);
static int vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data);

void
vfs_msync(struct mount *mp, int flags)
{
	int vmsc_flags;

	vmsc_flags = VMSC_GETVP;
	if (flags != MNT_WAIT)
		vmsc_flags |= VMSC_NOWAIT;
	vmntvnodescan(mp, vmsc_flags, vfs_msync_scan1, vfs_msync_scan2,
		      (void *)flags);
}

/*
 * scan1 is a fast pre-check.  There could be hundreds of thousands of
 * vnodes, we cannot afford to do anything heavy weight until we have a
 * fairly good indication that there is work to do.
 */
static
int
vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data)
{
	int flags = (int)data;

	if ((vp->v_flag & VRECLAIMED) == 0) {
		if (vshouldmsync(vp))
			return(0);	/* call scan2 */
		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
		    (vp->v_flag & VOBJDIRTY) &&
		    (flags == MNT_WAIT || vn_islocked(vp) == 0)) {
			return(0);	/* call scan2 */
		}
	}

	/*
	 * do not call scan2, continue the loop
	 */
	return(-1);
}

/*
 * This callback is handed a locked vnode.
 */
static
int
vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data)
{
	vm_object_t obj;
	int flags = (int)data;

	if (vp->v_flag & VRECLAIMED)
		return(0);

	if ((mp->mnt_flag & MNT_RDONLY) == 0 && (vp->v_flag & VOBJDIRTY)) {
		if ((obj = vp->v_object) != NULL) {
			vm_object_page_clean(obj, 0, 0,
			 flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
		}
	}
	return(0);
}
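
/*
 * Illustrative callers of vfs_msync() above (a sketch, not code from
 * this file): the periodic filesystem syncer typically issues
 * vfs_msync(mp, MNT_NOWAIT) to trickle dirty mmap'd pages out, while
 * unmount and sync paths use vfs_msync(mp, MNT_WAIT) to wait for the
 * page cleaning to complete.
 */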

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(struct vnode *vp, int events)
{
	lwkt_tokref ilock;

	KKASSERT(curthread->td_proc != NULL);

	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		lwkt_reltoken(&ilock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(curthread, &vp->v_pollinfo.vpi_selinfo);
	lwkt_reltoken(&ilock);
	return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(struct vnode *vp, int events)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	lwkt_reltoken(&ilock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(struct vnode *vp)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	lwkt_reltoken(&ilock);
}

/*
 * Extract the cdev_t from a VBLK or VCHR.  The vnode must have been opened
 * (or v_rdev might be NULL).
 */
cdev_t
vn_todev(struct vnode *vp)
{
	if (vp->v_type != VBLK && vp->v_type != VCHR)
		return (NULL);
	KKASSERT(vp->v_rdev != NULL);
	return (vp->v_rdev);
}
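
/*
 * Illustrative use of vn_isdisk() below (a sketch of the common mount
 * pre-check, not code from this file): a disk-backed filesystem verifies
 * the device vnode it was handed before mounting,
 *
 *	if (!vn_isdisk(devvp, &error))
 *		return (error);
 */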

/*
 * Check if vnode represents a disk device.  The vnode does not need to be
 * opened.
 */
int
vn_isdisk(struct vnode *vp, int *errp)
{
	cdev_t dev;

	if (vp->v_type != VCHR) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}

	if ((dev = vp->v_rdev) == NULL)
		dev = get_dev(vp->v_umajor, vp->v_uminor);

	if (dev == NULL) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if (dev_is_good(dev) == 0) {
		if (errp != NULL)
			*errp = ENXIO;
		return (0);
	}
	if ((dev_dflags(dev) & D_DISK) == 0) {
		if (errp != NULL)
			*errp = ENOTBLK;
		return (0);
	}
	if (errp != NULL)
		*errp = 0;
	return (1);
}

int
vn_get_namelen(struct vnode *vp, int *namelen)
{
	int error, retval[2];

	error = VOP_PATHCONF(vp, _PC_NAME_MAX, retval);
	if (error)
		return (error);
	*namelen = *retval;
	return (0);
}

int
vop_write_dirent(int *error, struct uio *uio, ino_t d_ino, uint8_t d_type,
		 uint16_t d_namlen, const char *d_name)
{
	struct dirent *dp;
	size_t len;

	len = _DIRENT_RECLEN(d_namlen);
	if (len > uio->uio_resid)
		return(1);

	dp = kmalloc(len, M_TEMP, M_WAITOK | M_ZERO);

	dp->d_ino = d_ino;
	dp->d_namlen = d_namlen;
	dp->d_type = d_type;
	bcopy(d_name, dp->d_name, d_namlen);

	*error = uiomove((caddr_t)dp, len, uio);

	kfree(dp, M_TEMP);

	return(0);
}

void
vn_mark_atime(struct vnode *vp, struct thread *td)
{
	struct proc *p = td->td_proc;
	struct ucred *cred = p ? p->p_ucred : proc0.p_ucred;

	if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) {
		VOP_MARKATIME(vp, cred);
	}
}