xref: /dragonfly/sys/kern/vfs_subr.c (revision 685c703c)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39  * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $
40  * $DragonFly: src/sys/kern/vfs_subr.c,v 1.91 2006/07/18 22:22:12 dillon Exp $
41  */
42 
43 /*
44  * External virtual filesystem routines
45  */
46 #include "opt_ddb.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/buf.h>
51 #include <sys/conf.h>
52 #include <sys/dirent.h>
53 #include <sys/domain.h>
54 #include <sys/eventhandler.h>
55 #include <sys/fcntl.h>
56 #include <sys/kernel.h>
57 #include <sys/kthread.h>
58 #include <sys/malloc.h>
59 #include <sys/mbuf.h>
60 #include <sys/mount.h>
61 #include <sys/proc.h>
62 #include <sys/reboot.h>
63 #include <sys/socket.h>
64 #include <sys/stat.h>
65 #include <sys/sysctl.h>
66 #include <sys/syslog.h>
67 #include <sys/unistd.h>
68 #include <sys/vmmeter.h>
69 #include <sys/vnode.h>
70 
71 #include <machine/limits.h>
72 
73 #include <vm/vm.h>
74 #include <vm/vm_object.h>
75 #include <vm/vm_extern.h>
76 #include <vm/vm_kern.h>
77 #include <vm/pmap.h>
78 #include <vm/vm_map.h>
79 #include <vm/vm_page.h>
80 #include <vm/vm_pager.h>
81 #include <vm/vnode_pager.h>
82 #include <vm/vm_zone.h>
83 
84 #include <sys/buf2.h>
85 #include <sys/thread2.h>
86 
87 static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
88 
89 int numvnodes;
90 SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
91 int vfs_fastdev = 1;
92 SYSCTL_INT(_vfs, OID_AUTO, fastdev, CTLFLAG_RW, &vfs_fastdev, 0, "");
93 
94 enum vtype iftovt_tab[16] = {
95 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
96 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
97 };
98 int vttoif_tab[9] = {
99 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
100 	S_IFSOCK, S_IFIFO, S_IFMT,
101 };
102 
103 static int reassignbufcalls;
104 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW,
105 		&reassignbufcalls, 0, "");
106 static int reassignbufloops;
107 SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW,
108 		&reassignbufloops, 0, "");
109 static int reassignbufsortgood;
110 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW,
111 		&reassignbufsortgood, 0, "");
112 static int reassignbufsortbad;
113 SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW,
114 		&reassignbufsortbad, 0, "");
115 static int reassignbufmethod = 1;
116 SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW,
117 		&reassignbufmethod, 0, "");
118 
119 int	nfs_mount_type = -1;
120 static struct lwkt_token spechash_token;
121 struct nfs_public nfs_pub;	/* publicly exported FS */
122 
123 int desiredvnodes;
124 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
125 		&desiredvnodes, 0, "Maximum number of vnodes");
126 
127 static void	vfs_free_addrlist (struct netexport *nep);
128 static int	vfs_free_netcred (struct radix_node *rn, void *w);
129 static int	vfs_hang_addrlist (struct mount *mp, struct netexport *nep,
130 				       struct export_args *argp);
131 
132 extern int dev_ref_debug;
133 
134 /*
135  * Red black tree functions
136  */
137 static int rb_buf_compare(struct buf *b1, struct buf *b2);
138 RB_GENERATE2(buf_rb_tree, buf, b_rbnode, rb_buf_compare, off_t, b_loffset);
139 RB_GENERATE2(buf_rb_hash, buf, b_rbhash, rb_buf_compare, off_t, b_loffset);
140 
141 static int
142 rb_buf_compare(struct buf *b1, struct buf *b2)
143 {
144 	if (b1->b_loffset < b2->b_loffset)
145 		return(-1);
146 	if (b1->b_loffset > b2->b_loffset)
147 		return(1);
148 	return(0);
149 }
150 
151 /*
152  * Return 0 if the vnode is already on the free list or cannot be placed
153  * on the free list.  Return 1 if the vnode can be placed on the free list.
154  */
155 static __inline int
156 vshouldfree(struct vnode *vp, int usecount)
157 {
158 	if (vp->v_flag & VFREE)
159 		return (0);		/* already free */
160 	if (vp->v_holdcnt != 0 || vp->v_usecount != usecount)
161 		return (0);		/* other holderse */
162 	if (vp->v_object &&
163 	    (vp->v_object->ref_count || vp->v_object->resident_page_count)) {
164 		return (0);
165 	}
166 	return (1);
167 }
168 
169 /*
170  * Initialize the vnode management data structures.
171  *
172  * Called from vfsinit()
173  */
174 void
175 vfs_subr_init(void)
176 {
177 	/*
178 	 * Desired vnodes is a result of the physical page count
179 	 * and the size of kernel's heap.  It scales in proportion
180 	 * to the amount of available physical memory.  This can
181 	 * cause trouble on 64-bit and large memory platforms.
182 	 */
183 	/* desiredvnodes = maxproc + vmstats.v_page_count / 4; */
184 	desiredvnodes =
185 		min(maxproc + vmstats.v_page_count /4,
186 		    2 * (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
187 		    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
188 
189 	lwkt_token_init(&spechash_token);
190 }
191 
192 /*
193  * Knob to control the precision of file timestamps:
194  *
195  *   0 = seconds only; nanoseconds zeroed.
196  *   1 = seconds and nanoseconds, accurate within 1/HZ.
197  *   2 = seconds and nanoseconds, truncated to microseconds.
198  * >=3 = seconds and nanoseconds, maximum precision.
199  */
200 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
201 
202 static int timestamp_precision = TSP_SEC;
203 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
204 		&timestamp_precision, 0, "");
205 
206 /*
207  * Get a current timestamp.
208  */
209 void
210 vfs_timestamp(struct timespec *tsp)
211 {
212 	struct timeval tv;
213 
214 	switch (timestamp_precision) {
215 	case TSP_SEC:
216 		tsp->tv_sec = time_second;
217 		tsp->tv_nsec = 0;
218 		break;
219 	case TSP_HZ:
220 		getnanotime(tsp);
221 		break;
222 	case TSP_USEC:
223 		microtime(&tv);
224 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
225 		break;
226 	case TSP_NSEC:
227 	default:
228 		nanotime(tsp);
229 		break;
230 	}
231 }
232 
233 /*
234  * Set vnode attributes to VNOVAL
235  */
236 void
237 vattr_null(struct vattr *vap)
238 {
239 	vap->va_type = VNON;
240 	vap->va_size = VNOVAL;
241 	vap->va_bytes = VNOVAL;
242 	vap->va_mode = VNOVAL;
243 	vap->va_nlink = VNOVAL;
244 	vap->va_uid = VNOVAL;
245 	vap->va_gid = VNOVAL;
246 	vap->va_fsid = VNOVAL;
247 	vap->va_fileid = VNOVAL;
248 	vap->va_blocksize = VNOVAL;
249 	vap->va_rdev = VNOVAL;
250 	vap->va_atime.tv_sec = VNOVAL;
251 	vap->va_atime.tv_nsec = VNOVAL;
252 	vap->va_mtime.tv_sec = VNOVAL;
253 	vap->va_mtime.tv_nsec = VNOVAL;
254 	vap->va_ctime.tv_sec = VNOVAL;
255 	vap->va_ctime.tv_nsec = VNOVAL;
256 	vap->va_flags = VNOVAL;
257 	vap->va_gen = VNOVAL;
258 	vap->va_vaflags = 0;
259 	vap->va_fsmid = VNOVAL;
260 }
261 
262 /*
263  * Flush out and invalidate all buffers associated with a vnode.
264  *
265  * vp must be locked.
266  */
267 static int vinvalbuf_bp(struct buf *bp, void *data);
268 
269 struct vinvalbuf_bp_info {
270 	struct vnode *vp;
271 	int slptimeo;
272 	int lkflags;
273 	int flags;
274 };
275 
276 void
277 vupdatefsmid(struct vnode *vp)
278 {
279 	atomic_set_int(&vp->v_flag, VFSMID);
280 }
281 
282 int
283 vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
284 {
285 	struct vinvalbuf_bp_info info;
286 	int error;
287 	vm_object_t object;
288 
289 	/*
290 	 * If we are being asked to save, call fsync to ensure that the inode
291 	 * is updated.
292 	 */
293 	if (flags & V_SAVE) {
294 		crit_enter();
295 		while (vp->v_track_write.bk_active) {
296 			vp->v_track_write.bk_waitflag = 1;
297 			error = tsleep(&vp->v_track_write, slpflag,
298 					"vinvlbuf", slptimeo);
299 			if (error) {
300 				crit_exit();
301 				return (error);
302 			}
303 		}
304 		if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
305 			crit_exit();
306 			if ((error = VOP_FSYNC(vp, MNT_WAIT)) != 0)
307 				return (error);
308 			crit_enter();
309 			if (vp->v_track_write.bk_active > 0 ||
310 			    !RB_EMPTY(&vp->v_rbdirty_tree))
311 				panic("vinvalbuf: dirty bufs");
312 		}
313 		crit_exit();
314   	}
315 	crit_enter();
316 	info.slptimeo = slptimeo;
317 	info.lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL;
318 	if (slpflag & PCATCH)
319 		info.lkflags |= LK_PCATCH;
320 	info.flags = flags;
321 	info.vp = vp;
322 
323 	/*
324 	 * Flush the buffer cache until nothing is left.
325 	 */
326 	while (!RB_EMPTY(&vp->v_rbclean_tree) ||
327 	    !RB_EMPTY(&vp->v_rbdirty_tree)) {
328 		error = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree, NULL,
329 				vinvalbuf_bp, &info);
330 		if (error == 0) {
331 			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
332 					vinvalbuf_bp, &info);
333 		}
334 	}
335 
336 	/*
337 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
338 	 * have write I/O in-progress but if there is a VM object then the
339 	 * VM object can also have read-I/O in-progress.
340 	 */
341 	do {
342 		while (vp->v_track_write.bk_active > 0) {
343 			vp->v_track_write.bk_waitflag = 1;
344 			tsleep(&vp->v_track_write, 0, "vnvlbv", 0);
345 		}
346 		if ((object = vp->v_object) != NULL) {
347 			while (object->paging_in_progress)
348 				vm_object_pip_sleep(object, "vnvlbx");
349 		}
350 	} while (vp->v_track_write.bk_active > 0);
351 
352 	crit_exit();
353 
354 	/*
355 	 * Destroy the copy in the VM cache, too.
356 	 */
357 	if ((object = vp->v_object) != NULL) {
358 		vm_object_page_remove(object, 0, 0,
359 			(flags & V_SAVE) ? TRUE : FALSE);
360 	}
361 
362 	if (!RB_EMPTY(&vp->v_rbdirty_tree) || !RB_EMPTY(&vp->v_rbclean_tree))
363 		panic("vinvalbuf: flush failed");
364 	if (!RB_EMPTY(&vp->v_rbhash_tree))
365 		panic("vinvalbuf: flush failed, buffers still present");
366 	return (0);
367 }
368 
369 static int
370 vinvalbuf_bp(struct buf *bp, void *data)
371 {
372 	struct vinvalbuf_bp_info *info = data;
373 	int error;
374 
375 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
376 		error = BUF_TIMELOCK(bp, info->lkflags,
377 				     "vinvalbuf", info->slptimeo);
378 		if (error == 0) {
379 			BUF_UNLOCK(bp);
380 			error = ENOLCK;
381 		}
382 		if (error == ENOLCK)
383 			return(0);
384 		return (-error);
385 	}
386 
387 	KKASSERT(bp->b_vp == info->vp);
388 
389 	/*
390 	 * XXX Since there are no node locks for NFS, I
391 	 * believe there is a slight chance that a delayed
392 	 * write will occur while sleeping just above, so
393 	 * check for it.  Note that vfs_bio_awrite expects
394 	 * buffers to reside on a queue, while bwrite() and
395 	 * brelse() do not.
396 	 */
397 	if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
398 	    (info->flags & V_SAVE)) {
399 		if (bp->b_vp == info->vp) {
400 			if (bp->b_flags & B_CLUSTEROK) {
401 				vfs_bio_awrite(bp);
402 			} else {
403 				bremfree(bp);
404 				bp->b_flags |= B_ASYNC;
405 				bwrite(bp);
406 			}
407 		} else {
408 			bremfree(bp);
409 			bwrite(bp);
410 		}
411 	} else if (info->flags & V_SAVE) {
412 		/*
413 		 * Cannot set B_NOCACHE on a clean buffer as this will
414 		 * destroy the VM backing store which might actually
415 		 * be dirty (and unsynchronized).
416 		 */
417 		bremfree(bp);
418 		bp->b_flags |= (B_INVAL | B_RELBUF);
419 		bp->b_flags &= ~B_ASYNC;
420 		brelse(bp);
421 	} else {
422 		bremfree(bp);
423 		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
424 		bp->b_flags &= ~B_ASYNC;
425 		brelse(bp);
426 	}
427 	return(0);
428 }
429 
430 /*
431  * Truncate a file's buffer and pages to a specified length.  This
432  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
433  * sync activity.
434  *
435  * The vnode must be locked.
436  */
437 static int vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
438 static int vtruncbuf_bp_trunc(struct buf *bp, void *data);
439 static int vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
440 static int vtruncbuf_bp_metasync(struct buf *bp, void *data);
441 
442 int
443 vtruncbuf(struct vnode *vp, off_t length, int blksize)
444 {
445 	off_t truncloffset;
446 	int count;
447 	const char *filename;
448 
449 	/*
450 	 * Round up to the *next* block, then destroy the buffers in question.
451 	 * Since we are only removing some of the buffers we must rely on the
452 	 * scan count to determine whether a loop is necessary.
453 	 */
454 	if ((count = (int)(length % blksize)) != 0)
455 		truncloffset = length + (blksize - count);
456 	else
457 		truncloffset = length;
458 
459 	crit_enter();
460 	do {
461 		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
462 				vtruncbuf_bp_trunc_cmp,
463 				vtruncbuf_bp_trunc, &truncloffset);
464 		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
465 				vtruncbuf_bp_trunc_cmp,
466 				vtruncbuf_bp_trunc, &truncloffset);
467 	} while(count);
468 
469 	/*
470 	 * For safety, fsync any remaining metadata if the file is not being
471 	 * truncated to 0.  Since the metadata does not represent the entire
472 	 * dirty list we have to rely on the hit count to ensure that we get
473 	 * all of it.
474 	 */
475 	if (length > 0) {
476 		do {
477 			count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
478 					vtruncbuf_bp_metasync_cmp,
479 					vtruncbuf_bp_metasync, vp);
480 		} while (count);
481 	}
482 
483 	/*
484 	 * Clean out any left over VM backing store.
485 	 */
486 	crit_exit();
487 
488 	vnode_pager_setsize(vp, length);
489 
490 	crit_enter();
491 
492 	/*
493 	 * It is possible to have in-progress I/O from buffers that were
494 	 * not part of the truncation.  This should not happen if we
495 	 * are truncating to 0-length.
496 	 */
497 	filename = TAILQ_FIRST(&vp->v_namecache) ?
498 		   TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
499 
500 	while ((count = vp->v_track_write.bk_active) > 0) {
501 		vp->v_track_write.bk_waitflag = 1;
502 		tsleep(&vp->v_track_write, 0, "vbtrunc", 0);
503 		if (length == 0) {
504 			printf("Warning: vtruncbuf(): Had to wait for "
505 			       "%d buffer I/Os to finish in %s\n",
506 			       count, filename);
507 		}
508 	}
509 
510 	/*
511 	 * Make sure no buffers were instantiated while we were trying
512 	 * to clean out the remaining VM pages.  This could occur due
513 	 * to busy dirty VM pages being flushed out to disk.
514 	 */
515 	do {
516 		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
517 				vtruncbuf_bp_trunc_cmp,
518 				vtruncbuf_bp_trunc, &truncloffset);
519 		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
520 				vtruncbuf_bp_trunc_cmp,
521 				vtruncbuf_bp_trunc, &truncloffset);
522 		if (count) {
523 			printf("Warning: vtruncbuf():  Had to re-clean %d "
524 			       "left over buffers in %s\n", count, filename);
525 		}
526 	} while(count);
527 
528 	crit_exit();
529 
530 	return (0);
531 }
532 
533 /*
534  * The callback buffer is beyond the new file EOF and must be destroyed.
535  * Note that the compare function must conform to the RB_SCAN's requirements.
536  */
537 static
538 int
539 vtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
540 {
541 	if (bp->b_loffset >= *(off_t *)data)
542 		return(0);
543 	return(-1);
544 }
545 
546 static
547 int
548 vtruncbuf_bp_trunc(struct buf *bp, void *data)
549 {
550 	/*
551 	 * Do not try to use a buffer we cannot immediately lock, but sleep
552 	 * anyway to prevent a livelock.  The code will loop until all buffers
553 	 * can be acted upon.
554 	 */
555 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
556 		if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
557 			BUF_UNLOCK(bp);
558 	} else {
559 		bremfree(bp);
560 		bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
561 		bp->b_flags &= ~B_ASYNC;
562 		brelse(bp);
563 	}
564 	return(1);
565 }
566 
567 /*
568  * Fsync all meta-data after truncating a file to be non-zero.  Only metadata
569  * blocks (with a negative loffset) are scanned.
570  * Note that the compare function must conform to the RB_SCAN's requirements.
571  */
572 static int
573 vtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
574 {
575 	if (bp->b_loffset < 0)
576 		return(0);
577 	return(1);
578 }
579 
580 static int
581 vtruncbuf_bp_metasync(struct buf *bp, void *data)
582 {
583 	struct vnode *vp = data;
584 
585 	if (bp->b_flags & B_DELWRI) {
586 		/*
587 		 * Do not try to use a buffer we cannot immediately lock,
588 		 * but sleep anyway to prevent a livelock.  The code will
589 		 * loop until all buffers can be acted upon.
590 		 */
591 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
592 			if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
593 				BUF_UNLOCK(bp);
594 		} else {
595 			bremfree(bp);
596 			if (bp->b_vp == vp) {
597 				bp->b_flags |= B_ASYNC;
598 			} else {
599 				bp->b_flags &= ~B_ASYNC;
600 			}
601 			bwrite(bp);
602 		}
603 		return(1);
604 	} else {
605 		return(0);
606 	}
607 }
608 
609 /*
610  * vfsync - implements a multipass fsync on a file which understands
611  * dependancies and meta-data.  The passed vnode must be locked.  The
612  * waitfor argument may be MNT_WAIT or MNT_NOWAIT, or MNT_LAZY.
613  *
614  * When fsyncing data asynchronously just do one consolidated pass starting
615  * with the most negative block number.  This may not get all the data due
616  * to dependancies.
617  *
618  * When fsyncing data synchronously do a data pass, then a metadata pass,
619  * then do additional data+metadata passes to try to get all the data out.
620  */
621 static int vfsync_wait_output(struct vnode *vp,
622 			    int (*waitoutput)(struct vnode *, struct thread *));
623 static int vfsync_data_only_cmp(struct buf *bp, void *data);
624 static int vfsync_meta_only_cmp(struct buf *bp, void *data);
625 static int vfsync_lazy_range_cmp(struct buf *bp, void *data);
626 static int vfsync_bp(struct buf *bp, void *data);
627 
628 struct vfsync_info {
629 	struct vnode *vp;
630 	int synchronous;
631 	int syncdeps;
632 	int lazycount;
633 	int lazylimit;
634 	int skippedbufs;
635 	int (*checkdef)(struct buf *);
636 };
637 
638 int
639 vfsync(struct vnode *vp, int waitfor, int passes,
640 	int (*checkdef)(struct buf *),
641 	int (*waitoutput)(struct vnode *, struct thread *))
642 {
643 	struct vfsync_info info;
644 	int error;
645 
646 	bzero(&info, sizeof(info));
647 	info.vp = vp;
648 	if ((info.checkdef = checkdef) == NULL)
649 		info.syncdeps = 1;
650 
651 	crit_enter_id("vfsync");
652 
653 	switch(waitfor) {
654 	case MNT_LAZY:
655 		/*
656 		 * Lazy (filesystem syncer typ) Asynchronous plus limit the
657 		 * number of data (not meta) pages we try to flush to 1MB.
658 		 * A non-zero return means that lazy limit was reached.
659 		 */
660 		info.lazylimit = 1024 * 1024;
661 		info.syncdeps = 1;
662 		error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
663 				vfsync_lazy_range_cmp, vfsync_bp, &info);
664 		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
665 				vfsync_meta_only_cmp, vfsync_bp, &info);
666 		if (error == 0)
667 			vp->v_lazyw = 0;
668 		else if (!RB_EMPTY(&vp->v_rbdirty_tree))
669 			vn_syncer_add_to_worklist(vp, 1);
670 		error = 0;
671 		break;
672 	case MNT_NOWAIT:
673 		/*
674 		 * Asynchronous.  Do a data-only pass and a meta-only pass.
675 		 */
676 		info.syncdeps = 1;
677 		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
678 			vfsync_bp, &info);
679 		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_meta_only_cmp,
680 			vfsync_bp, &info);
681 		error = 0;
682 		break;
683 	default:
684 		/*
685 		 * Synchronous.  Do a data-only pass, then a meta-data+data
686 		 * pass, then additional integrated passes to try to get
687 		 * all the dependancies flushed.
688 		 */
689 		RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, vfsync_data_only_cmp,
690 			vfsync_bp, &info);
691 		error = vfsync_wait_output(vp, waitoutput);
692 		if (error == 0) {
693 			info.skippedbufs = 0;
694 			RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
695 				vfsync_bp, &info);
696 			error = vfsync_wait_output(vp, waitoutput);
697 			if (info.skippedbufs)
698 				printf("Warning: vfsync skipped %d dirty bufs in pass2!\n", info.skippedbufs);
699 		}
700 		while (error == 0 && passes > 0 &&
701 		    !RB_EMPTY(&vp->v_rbdirty_tree)) {
702 			if (--passes == 0) {
703 				info.synchronous = 1;
704 				info.syncdeps = 1;
705 			}
706 			error = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree, NULL,
707 				vfsync_bp, &info);
708 			if (error < 0)
709 				error = -error;
710 			info.syncdeps = 1;
711 			if (error == 0)
712 				error = vfsync_wait_output(vp, waitoutput);
713 		}
714 		break;
715 	}
716 	crit_exit_id("vfsync");
717 	return(error);
718 }
719 
720 static int
721 vfsync_wait_output(struct vnode *vp, int (*waitoutput)(struct vnode *, struct thread *))
722 {
723 	int error = 0;
724 
725 	while (vp->v_track_write.bk_active) {
726 		vp->v_track_write.bk_waitflag = 1;
727 		tsleep(&vp->v_track_write, 0, "fsfsn", 0);
728 	}
729 	if (waitoutput)
730 		error = waitoutput(vp, curthread);
731 	return(error);
732 }
733 
734 static int
735 vfsync_data_only_cmp(struct buf *bp, void *data)
736 {
737 	if (bp->b_loffset < 0)
738 		return(-1);
739 	return(0);
740 }
741 
742 static int
743 vfsync_meta_only_cmp(struct buf *bp, void *data)
744 {
745 	if (bp->b_loffset < 0)
746 		return(0);
747 	return(1);
748 }
749 
750 static int
751 vfsync_lazy_range_cmp(struct buf *bp, void *data)
752 {
753 	struct vfsync_info *info = data;
754 	if (bp->b_loffset < info->vp->v_lazyw)
755 		return(-1);
756 	return(0);
757 }
758 
759 static int
760 vfsync_bp(struct buf *bp, void *data)
761 {
762 	struct vfsync_info *info = data;
763 	struct vnode *vp = info->vp;
764 	int error;
765 
766 	/*
767 	 * if syncdeps is not set we do not try to write buffers which have
768 	 * dependancies.
769 	 */
770 	if (!info->synchronous && info->syncdeps == 0 && info->checkdef(bp))
771 		return(0);
772 
773 	/*
774 	 * Ignore buffers that we cannot immediately lock.  XXX
775 	 */
776 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
777 		printf("Warning: vfsync_bp skipping dirty buffer %p\n", bp);
778 		++info->skippedbufs;
779 		return(0);
780 	}
781 	if ((bp->b_flags & B_DELWRI) == 0)
782 		panic("vfsync_bp: buffer not dirty");
783 	if (vp != bp->b_vp)
784 		panic("vfsync_bp: buffer vp mismatch");
785 
786 	/*
787 	 * B_NEEDCOMMIT (primarily used by NFS) is a state where the buffer
788 	 * has been written but an additional handshake with the device
789 	 * is required before we can dispose of the buffer.  We have no idea
790 	 * how to do this so we have to skip these buffers.
791 	 */
792 	if (bp->b_flags & B_NEEDCOMMIT) {
793 		BUF_UNLOCK(bp);
794 		return(0);
795 	}
796 
797 	if (info->synchronous) {
798 		/*
799 		 * Synchronous flushing.  An error may be returned.
800 		 */
801 		bremfree(bp);
802 		crit_exit_id("vfsync");
803 		error = bwrite(bp);
804 		crit_enter_id("vfsync");
805 	} else {
806 		/*
807 		 * Asynchronous flushing.  A negative return value simply
808 		 * stops the scan and is not considered an error.  We use
809 		 * this to support limited MNT_LAZY flushes.
810 		 */
811 		vp->v_lazyw = bp->b_loffset;
812 		if ((vp->v_flag & VOBJBUF) && (bp->b_flags & B_CLUSTEROK)) {
813 			info->lazycount += vfs_bio_awrite(bp);
814 		} else {
815 			info->lazycount += bp->b_bufsize;
816 			bremfree(bp);
817 			crit_exit_id("vfsync");
818 			bawrite(bp);
819 			crit_enter_id("vfsync");
820 		}
821 		if (info->lazylimit && info->lazycount >= info->lazylimit)
822 			error = 1;
823 		else
824 			error = 0;
825 	}
826 	return(-error);
827 }
828 
829 /*
830  * Associate a buffer with a vnode.
831  */
832 void
833 bgetvp(struct vnode *vp, struct buf *bp)
834 {
835 	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
836 	KKASSERT((bp->b_flags & (B_HASHED|B_DELWRI|B_VNCLEAN|B_VNDIRTY)) == 0);
837 
838 	vhold(vp);
839 	/*
840 	 * Insert onto list for new vnode.
841 	 */
842 	crit_enter();
843 	bp->b_vp = vp;
844 	bp->b_flags |= B_HASHED;
845 	if (buf_rb_hash_RB_INSERT(&vp->v_rbhash_tree, bp))
846 		panic("reassignbuf: dup lblk vp %p bp %p", vp, bp);
847 
848 	bp->b_flags |= B_VNCLEAN;
849 	if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp))
850 		panic("reassignbuf: dup lblk/clean vp %p bp %p", vp, bp);
851 	crit_exit();
852 }
853 
854 /*
855  * Disassociate a buffer from a vnode.
856  */
857 void
858 brelvp(struct buf *bp)
859 {
860 	struct vnode *vp;
861 
862 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
863 
864 	/*
865 	 * Delete from old vnode list, if on one.
866 	 */
867 	vp = bp->b_vp;
868 	crit_enter();
869 	if (bp->b_flags & (B_VNDIRTY | B_VNCLEAN)) {
870 		if (bp->b_flags & B_VNDIRTY)
871 			buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
872 		else
873 			buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
874 		bp->b_flags &= ~(B_VNDIRTY | B_VNCLEAN);
875 	}
876 	if (bp->b_flags & B_HASHED) {
877 		buf_rb_hash_RB_REMOVE(&vp->v_rbhash_tree, bp);
878 		bp->b_flags &= ~B_HASHED;
879 	}
880 	if ((vp->v_flag & VONWORKLST) && RB_EMPTY(&vp->v_rbdirty_tree)) {
881 		vp->v_flag &= ~VONWORKLST;
882 		LIST_REMOVE(vp, v_synclist);
883 	}
884 	crit_exit();
885 	bp->b_vp = NULL;
886 	vdrop(vp);
887 }
888 
889 /*
890  * Reassign the buffer to the proper clean/dirty list based on B_DELWRI.
891  * This routine is called when the state of the B_DELWRI bit is changed.
892  */
893 void
894 reassignbuf(struct buf *bp)
895 {
896 	struct vnode *vp = bp->b_vp;
897 	int delay;
898 
899 	KKASSERT(vp != NULL);
900 	++reassignbufcalls;
901 
902 	/*
903 	 * B_PAGING flagged buffers cannot be reassigned because their vp
904 	 * is not fully linked in.
905 	 */
906 	if (bp->b_flags & B_PAGING)
907 		panic("cannot reassign paging buffer");
908 
909 	crit_enter();
910 	if (bp->b_flags & B_DELWRI) {
911 		/*
912 		 * Move to the dirty list, add the vnode to the worklist
913 		 */
914 		if (bp->b_flags & B_VNCLEAN) {
915 			buf_rb_tree_RB_REMOVE(&vp->v_rbclean_tree, bp);
916 			bp->b_flags &= ~B_VNCLEAN;
917 		}
918 		if ((bp->b_flags & B_VNDIRTY) == 0) {
919 			if (buf_rb_tree_RB_INSERT(&vp->v_rbdirty_tree, bp)) {
920 				panic("reassignbuf: dup lblk vp %p bp %p",
921 				      vp, bp);
922 			}
923 			bp->b_flags |= B_VNDIRTY;
924 		}
925 		if ((vp->v_flag & VONWORKLST) == 0) {
926 			switch (vp->v_type) {
927 			case VDIR:
928 				delay = dirdelay;
929 				break;
930 			case VCHR:
931 			case VBLK:
932 				if (vp->v_rdev &&
933 				    vp->v_rdev->si_mountpoint != NULL) {
934 					delay = metadelay;
935 					break;
936 				}
937 				/* fall through */
938 			default:
939 				delay = filedelay;
940 			}
941 			vn_syncer_add_to_worklist(vp, delay);
942 		}
943 	} else {
944 		/*
945 		 * Move to the clean list, remove the vnode from the worklist
946 		 * if no dirty blocks remain.
947 		 */
948 		if (bp->b_flags & B_VNDIRTY) {
949 			buf_rb_tree_RB_REMOVE(&vp->v_rbdirty_tree, bp);
950 			bp->b_flags &= ~B_VNDIRTY;
951 		}
952 		if ((bp->b_flags & B_VNCLEAN) == 0) {
953 			if (buf_rb_tree_RB_INSERT(&vp->v_rbclean_tree, bp)) {
954 				panic("reassignbuf: dup lblk vp %p bp %p",
955 				      vp, bp);
956 			}
957 			bp->b_flags |= B_VNCLEAN;
958 		}
959 		if ((vp->v_flag & VONWORKLST) &&
960 		    RB_EMPTY(&vp->v_rbdirty_tree)) {
961 			vp->v_flag &= ~VONWORKLST;
962 			LIST_REMOVE(vp, v_synclist);
963 		}
964 	}
965 	crit_exit();
966 }
967 
968 /*
969  * Create a vnode for a block device.
970  * Used for mounting the root file system.
971  */
972 int
973 bdevvp(dev_t dev, struct vnode **vpp)
974 {
975 	struct vnode *vp;
976 	struct vnode *nvp;
977 	int error;
978 
979 	if (dev == NODEV) {
980 		*vpp = NULLVP;
981 		return (ENXIO);
982 	}
983 	error = getspecialvnode(VT_NON, NULL, &spec_vnode_vops_p, &nvp, 0, 0);
984 	if (error) {
985 		*vpp = NULLVP;
986 		return (error);
987 	}
988 	vp = nvp;
989 	vp->v_type = VCHR;
990 	vp->v_udev = dev->si_udev;
991 	vx_unlock(vp);
992 	*vpp = vp;
993 	return (0);
994 }
995 
996 int
997 v_associate_rdev(struct vnode *vp, dev_t dev)
998 {
999 	lwkt_tokref ilock;
1000 
1001 	if (dev == NULL || dev == NODEV)
1002 		return(ENXIO);
1003 	if (dev_is_good(dev) == 0)
1004 		return(ENXIO);
1005 	KKASSERT(vp->v_rdev == NULL);
1006 	if (dev_ref_debug)
1007 		printf("Z1");
1008 	vp->v_rdev = reference_dev(dev);
1009 	lwkt_gettoken(&ilock, &spechash_token);
1010 	SLIST_INSERT_HEAD(&dev->si_hlist, vp, v_specnext);
1011 	lwkt_reltoken(&ilock);
1012 	return(0);
1013 }
1014 
1015 void
1016 v_release_rdev(struct vnode *vp)
1017 {
1018 	lwkt_tokref ilock;
1019 	dev_t dev;
1020 
1021 	if ((dev = vp->v_rdev) != NULL) {
1022 		lwkt_gettoken(&ilock, &spechash_token);
1023 		SLIST_REMOVE(&dev->si_hlist, vp, vnode, v_specnext);
1024 		vp->v_rdev = NULL;
1025 		release_dev(dev);
1026 		lwkt_reltoken(&ilock);
1027 	}
1028 }
1029 
1030 /*
1031  * Add a vnode to the alias list hung off the dev_t.  We only associate
1032  * the device number with the vnode.  The actual device is not associated
1033  * until the vnode is opened (usually in spec_open()), and will be
1034  * disassociated on last close.
1035  */
1036 void
1037 addaliasu(struct vnode *nvp, udev_t nvp_udev)
1038 {
1039 	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1040 		panic("addaliasu on non-special vnode");
1041 	nvp->v_udev = nvp_udev;
1042 }
1043 
1044 /*
1045  * Disassociate a vnode from its underlying filesystem.
1046  *
1047  * The vnode must be VX locked and refd
1048  *
1049  * If there are v_usecount references to the vnode other then ours we have
1050  * to VOP_CLOSE the vnode before we can deactivate and reclaim it.
1051  */
1052 void
1053 vclean(struct vnode *vp, int flags)
1054 {
1055 	int active;
1056 	int n;
1057 	vm_object_t object;
1058 
1059 	/*
1060 	 * If the vnode has already been reclaimed we have nothing to do.
1061 	 */
1062 	if (vp->v_flag & VRECLAIMED)
1063 		return;
1064 	vp->v_flag |= VRECLAIMED;
1065 
1066 	/*
1067 	 * Scrap the vfs cache
1068 	 */
1069 	while (cache_inval_vp(vp, 0) != 0) {
1070 		printf("Warning: vnode %p clean/cache_resolution race detected\n", vp);
1071 		tsleep(vp, 0, "vclninv", 2);
1072 	}
1073 
1074 	/*
1075 	 * Check to see if the vnode is in use. If so we have to reference it
1076 	 * before we clean it out so that its count cannot fall to zero and
1077 	 * generate a race against ourselves to recycle it.
1078 	 */
1079 	active = (vp->v_usecount > 1);
1080 
1081 	/*
1082 	 * Clean out any buffers associated with the vnode and destroy its
1083 	 * object, if it has one.
1084 	 */
1085 	vinvalbuf(vp, V_SAVE, 0, 0);
1086 
1087 	/*
1088 	 * If purging an active vnode (typically during a forced unmount
1089 	 * or reboot), it must be closed and deactivated before being
1090 	 * reclaimed.  This isn't really all that safe, but what can
1091 	 * we do? XXX.
1092 	 *
1093 	 * Note that neither of these routines unlocks the vnode.
1094 	 */
1095 	if (active && (flags & DOCLOSE)) {
1096 		while ((n = vp->v_opencount) != 0) {
1097 			if (vp->v_writecount)
1098 				VOP_CLOSE(vp, FWRITE|FNONBLOCK);
1099 			else
1100 				VOP_CLOSE(vp, FNONBLOCK);
1101 			if (vp->v_opencount == n) {
1102 				printf("Warning: unable to force-close"
1103 				       " vnode %p\n", vp);
1104 				break;
1105 			}
1106 		}
1107 	}
1108 
1109 	/*
1110 	 * If the vnode has not be deactivated, deactivated it.  Deactivation
1111 	 * can create new buffers and VM pages so we have to call vinvalbuf()
1112 	 * again to make sure they all get flushed.
1113 	 *
1114 	 * This can occur if a file with a link count of 0 needs to be
1115 	 * truncated.
1116 	 */
1117 	if ((vp->v_flag & VINACTIVE) == 0) {
1118 		vp->v_flag |= VINACTIVE;
1119 		VOP_INACTIVE(vp);
1120 		vinvalbuf(vp, V_SAVE, 0, 0);
1121 	}
1122 
1123 	/*
1124 	 * If the vnode has an object, destroy it.
1125 	 */
1126 	if ((object = vp->v_object) != NULL) {
1127 		if (object->ref_count == 0) {
1128 			if ((object->flags & OBJ_DEAD) == 0)
1129 				vm_object_terminate(object);
1130 		} else {
1131 			vm_pager_deallocate(object);
1132 		}
1133 		vp->v_flag &= ~VOBJBUF;
1134 	}
1135 	KKASSERT((vp->v_flag & VOBJBUF) == 0);
1136 
1137 
1138 	/*
1139 	 * Reclaim the vnode.
1140 	 */
1141 	if (VOP_RECLAIM(vp))
1142 		panic("vclean: cannot reclaim");
1143 
1144 	/*
1145 	 * Done with purge, notify sleepers of the grim news.
1146 	 */
1147 	vp->v_ops = &dead_vnode_vops_p;
1148 	vn_pollgone(vp);
1149 	vp->v_tag = VT_NON;
1150 }
1151 
1152 /*
1153  * Eliminate all activity associated with the requested vnode
1154  * and with all vnodes aliased to the requested vnode.
1155  *
1156  * The vnode must be referenced and vx_lock()'d
1157  *
1158  * revoke { struct vnode *a_vp, int a_flags }
1159  */
1160 int
1161 vop_stdrevoke(struct vop_revoke_args *ap)
1162 {
1163 	struct vnode *vp, *vq;
1164 	lwkt_tokref ilock;
1165 	dev_t dev;
1166 
1167 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1168 
1169 	vp = ap->a_vp;
1170 
1171 	/*
1172 	 * If the vnode is already dead don't try to revoke it
1173 	 */
1174 	if (vp->v_flag & VRECLAIMED)
1175 		return (0);
1176 
1177 	/*
1178 	 * If the vnode has a device association, scrap all vnodes associated
1179 	 * with the device.  Don't let the device disappear on us while we
1180 	 * are scrapping the vnodes.
1181 	 *
1182 	 * The passed vp will probably show up in the list, do not VX lock
1183 	 * it twice!
1184 	 */
1185 	if (vp->v_type != VCHR && vp->v_type != VBLK)
1186 		return(0);
1187 	if ((dev = vp->v_rdev) == NULL) {
1188 		if ((dev = udev2dev(vp->v_udev, vp->v_type == VBLK)) == NODEV)
1189 			return(0);
1190 	}
1191 	reference_dev(dev);
1192 	lwkt_gettoken(&ilock, &spechash_token);
1193 	while ((vq = SLIST_FIRST(&dev->si_hlist)) != NULL) {
1194 		if (vp == vq || vx_get(vq) == 0) {
1195 			if (vq == SLIST_FIRST(&dev->si_hlist))
1196 				vgone(vq);
1197 			if (vp != vq)
1198 				vx_put(vq);
1199 		}
1200 	}
1201 	lwkt_reltoken(&ilock);
1202 	release_dev(dev);
1203 	return (0);
1204 }
1205 
1206 /*
1207  * Recycle an unused vnode to the front of the free list.
1208  *
1209  * Returns 1 if we were successfully able to recycle the vnode,
1210  * 0 otherwise.
1211  */
1212 int
1213 vrecycle(struct vnode *vp)
1214 {
1215 	if (vp->v_usecount == 1) {
1216 		vgone(vp);
1217 		return (1);
1218 	}
1219 	return (0);
1220 }
1221 
1222 /*
1223  * Eliminate all activity associated with a vnode in preparation for reuse.
1224  *
1225  * The vnode must be VX locked and refd and will remain VX locked and refd
1226  * on return.  This routine may be called with the vnode in any state, as
1227  * long as it is VX locked.  The vnode will be cleaned out and marked
1228  * VRECLAIMED but will not actually be reused until all existing refs and
1229  * holds go away.
1230  *
1231  * NOTE: This routine may be called on a vnode which has not yet been
1232  * already been deactivated (VOP_INACTIVE), or on a vnode which has
1233  * already been reclaimed.
1234  *
1235  * This routine is not responsible for placing us back on the freelist.
1236  * Instead, it happens automatically when the caller releases the VX lock
1237  * (assuming there aren't any other references).
1238  */
1239 void
1240 vgone(struct vnode *vp)
1241 {
1242 	/*
1243 	 * assert that the VX lock is held.  This is an absolute requirement
1244 	 * now for vgone() to be called.
1245 	 */
1246 	KKASSERT(vp->v_lock.lk_exclusivecount == 1);
1247 
1248 	/*
1249 	 * Clean out the filesystem specific data and set the VRECLAIMED
1250 	 * bit.  Also deactivate the vnode if necessary.
1251 	 */
1252 	vclean(vp, DOCLOSE);
1253 
1254 	/*
1255 	 * Delete from old mount point vnode list, if on one.
1256 	 */
1257 	if (vp->v_mount != NULL)
1258 		insmntque(vp, NULL);
1259 
1260 	/*
1261 	 * If special device, remove it from special device alias list
1262 	 * if it is on one.  This should normally only occur if a vnode is
1263 	 * being revoked as the device should otherwise have been released
1264 	 * naturally.
1265 	 */
1266 	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
1267 		v_release_rdev(vp);
1268 	}
1269 
1270 	/*
1271 	 * Set us to VBAD
1272 	 */
1273 	vp->v_type = VBAD;
1274 }
1275 
1276 /*
1277  * Lookup a vnode by device number.
1278  */
1279 int
1280 vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
1281 {
1282 	lwkt_tokref ilock;
1283 	struct vnode *vp;
1284 
1285 	lwkt_gettoken(&ilock, &spechash_token);
1286 	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
1287 		if (type == vp->v_type) {
1288 			*vpp = vp;
1289 			lwkt_reltoken(&ilock);
1290 			return (1);
1291 		}
1292 	}
1293 	lwkt_reltoken(&ilock);
1294 	return (0);
1295 }
1296 
1297 /*
1298  * Calculate the total number of references to a special device.  This
1299  * routine may only be called for VBLK and VCHR vnodes since v_rdev is
1300  * an overloaded field.  Since udev2dev can now return NODEV, we have
1301  * to check for a NULL v_rdev.
1302  */
1303 int
1304 count_dev(dev_t dev)
1305 {
1306 	lwkt_tokref ilock;
1307 	struct vnode *vp;
1308 	int count = 0;
1309 
1310 	if (SLIST_FIRST(&dev->si_hlist)) {
1311 		lwkt_gettoken(&ilock, &spechash_token);
1312 		SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
1313 			count += vp->v_usecount;
1314 		}
1315 		lwkt_reltoken(&ilock);
1316 	}
1317 	return(count);
1318 }
1319 
1320 int
1321 count_udev(udev_t udev)
1322 {
1323 	dev_t dev;
1324 
1325 	if ((dev = udev2dev(udev, 0)) == NODEV)
1326 		return(0);
1327 	return(count_dev(dev));
1328 }
1329 
1330 int
1331 vcount(struct vnode *vp)
1332 {
1333 	if (vp->v_rdev == NULL)
1334 		return(0);
1335 	return(count_dev(vp->v_rdev));
1336 }
1337 
1338 /*
1339  * Initialize VMIO for a vnode.  This routine MUST be called before a
1340  * VFS can issue buffer cache ops on a vnode.  It is typically called
1341  * when a vnode is initialized from its inode.
1342  */
1343 int
1344 vinitvmio(struct vnode *vp, off_t filesize)
1345 {
1346 	vm_object_t object;
1347 	int error = 0;
1348 
1349 retry:
1350 	if ((object = vp->v_object) == NULL) {
1351 		object = vnode_pager_alloc(vp, filesize, 0, 0);
1352 		/*
1353 		 * Dereference the reference we just created.  This assumes
1354 		 * that the object is associated with the vp.
1355 		 */
1356 		object->ref_count--;
1357 		vp->v_usecount--;
1358 	} else {
1359 		if (object->flags & OBJ_DEAD) {
1360 			VOP_UNLOCK(vp, 0);
1361 			tsleep(object, 0, "vodead", 0);
1362 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1363 			goto retry;
1364 		}
1365 	}
1366 	KASSERT(vp->v_object != NULL, ("vinitvmio: NULL object"));
1367 	vp->v_flag |= VOBJBUF;
1368 	return (error);
1369 }
1370 
1371 
1372 /*
1373  * Print out a description of a vnode.
1374  */
1375 static char *typename[] =
1376 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1377 
1378 void
1379 vprint(char *label, struct vnode *vp)
1380 {
1381 	char buf[96];
1382 
1383 	if (label != NULL)
1384 		printf("%s: %p: ", label, (void *)vp);
1385 	else
1386 		printf("%p: ", (void *)vp);
1387 	printf("type %s, usecount %d, writecount %d, refcount %d,",
1388 	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1389 	    vp->v_holdcnt);
1390 	buf[0] = '\0';
1391 	if (vp->v_flag & VROOT)
1392 		strcat(buf, "|VROOT");
1393 	if (vp->v_flag & VTEXT)
1394 		strcat(buf, "|VTEXT");
1395 	if (vp->v_flag & VSYSTEM)
1396 		strcat(buf, "|VSYSTEM");
1397 	if (vp->v_flag & VFREE)
1398 		strcat(buf, "|VFREE");
1399 	if (vp->v_flag & VOBJBUF)
1400 		strcat(buf, "|VOBJBUF");
1401 	if (buf[0] != '\0')
1402 		printf(" flags (%s)", &buf[1]);
1403 	if (vp->v_data == NULL) {
1404 		printf("\n");
1405 	} else {
1406 		printf("\n\t");
1407 		VOP_PRINT(vp);
1408 	}
1409 }
1410 
1411 #ifdef DDB
1412 #include <ddb/ddb.h>
1413 
1414 static int db_show_locked_vnodes(struct mount *mp, void *data);
1415 
1416 /*
1417  * List all of the locked vnodes in the system.
1418  * Called when debugging the kernel.
1419  */
1420 DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
1421 {
1422 	printf("Locked vnodes\n");
1423 	mountlist_scan(db_show_locked_vnodes, NULL,
1424 			MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
1425 }
1426 
1427 static int
1428 db_show_locked_vnodes(struct mount *mp, void *data __unused)
1429 {
1430 	struct vnode *vp;
1431 
1432 	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
1433 		if (VOP_ISLOCKED(vp, NULL))
1434 			vprint((char *)0, vp);
1435 	}
1436 	return(0);
1437 }
1438 #endif
1439 
1440 /*
1441  * Top level filesystem related information gathering.
1442  */
1443 static int	sysctl_ovfs_conf (SYSCTL_HANDLER_ARGS);
1444 
1445 static int
1446 vfs_sysctl(SYSCTL_HANDLER_ARGS)
1447 {
1448 	int *name = (int *)arg1 - 1;	/* XXX */
1449 	u_int namelen = arg2 + 1;	/* XXX */
1450 	struct vfsconf *vfsp;
1451 
1452 #if 1 || defined(COMPAT_PRELITE2)
1453 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
1454 	if (namelen == 1)
1455 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
1456 #endif
1457 
1458 #ifdef notyet
1459 	/* all sysctl names at this level are at least name and field */
1460 	if (namelen < 2)
1461 		return (ENOTDIR);		/* overloaded */
1462 	if (name[0] != VFS_GENERIC) {
1463 		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1464 			if (vfsp->vfc_typenum == name[0])
1465 				break;
1466 		if (vfsp == NULL)
1467 			return (EOPNOTSUPP);
1468 		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
1469 		    oldp, oldlenp, newp, newlen, p));
1470 	}
1471 #endif
1472 	switch (name[1]) {
1473 	case VFS_MAXTYPENUM:
1474 		if (namelen != 2)
1475 			return (ENOTDIR);
1476 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
1477 	case VFS_CONF:
1478 		if (namelen != 3)
1479 			return (ENOTDIR);	/* overloaded */
1480 		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1481 			if (vfsp->vfc_typenum == name[2])
1482 				break;
1483 		if (vfsp == NULL)
1484 			return (EOPNOTSUPP);
1485 		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
1486 	}
1487 	return (EOPNOTSUPP);
1488 }
1489 
1490 SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
1491 	"Generic filesystem");
1492 
1493 #if 1 || defined(COMPAT_PRELITE2)
1494 
1495 static int
1496 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
1497 {
1498 	int error;
1499 	struct vfsconf *vfsp;
1500 	struct ovfsconf ovfs;
1501 
1502 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1503 		bzero(&ovfs, sizeof(ovfs));
1504 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
1505 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
1506 		ovfs.vfc_index = vfsp->vfc_typenum;
1507 		ovfs.vfc_refcount = vfsp->vfc_refcount;
1508 		ovfs.vfc_flags = vfsp->vfc_flags;
1509 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
1510 		if (error)
1511 			return error;
1512 	}
1513 	return 0;
1514 }
1515 
1516 #endif /* 1 || COMPAT_PRELITE2 */
1517 
1518 /*
1519  * Check to see if a filesystem is mounted on a block device.
1520  */
1521 int
1522 vfs_mountedon(struct vnode *vp)
1523 {
1524 	dev_t dev;
1525 
1526 	if ((dev = vp->v_rdev) == NULL)
1527 		dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
1528 	if (dev != NODEV && dev->si_mountpoint)
1529 		return (EBUSY);
1530 	return (0);
1531 }
1532 
1533 /*
1534  * Unmount all filesystems. The list is traversed in reverse order
1535  * of mounting to avoid dependencies.
1536  */
1537 
1538 static int vfs_umountall_callback(struct mount *mp, void *data);
1539 
1540 void
1541 vfs_unmountall(void)
1542 {
1543 	struct thread *td = curthread;
1544 	int count;
1545 
1546 	if (td->td_proc == NULL)
1547 		td = initproc->p_thread;	/* XXX XXX use proc0 instead? */
1548 
1549 	do {
1550 		count = mountlist_scan(vfs_umountall_callback,
1551 					NULL, MNTSCAN_REVERSE|MNTSCAN_NOBUSY);
1552 	} while (count);
1553 }
1554 
1555 static
1556 int
1557 vfs_umountall_callback(struct mount *mp, void *data)
1558 {
1559 	int error;
1560 
1561 	error = dounmount(mp, MNT_FORCE);
1562 	if (error) {
1563 		mountlist_remove(mp);
1564 		printf("unmount of filesystem mounted from %s failed (",
1565 			mp->mnt_stat.f_mntfromname);
1566 		if (error == EBUSY)
1567 			printf("BUSY)\n");
1568 		else
1569 			printf("%d)\n", error);
1570 	}
1571 	return(1);
1572 }
1573 
1574 /*
1575  * Build hash lists of net addresses and hang them off the mount point.
1576  * Called by ufs_mount() to set up the lists of export addresses.
1577  */
1578 static int
1579 vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
1580 		struct export_args *argp)
1581 {
1582 	struct netcred *np;
1583 	struct radix_node_head *rnh;
1584 	int i;
1585 	struct radix_node *rn;
1586 	struct sockaddr *saddr, *smask = 0;
1587 	struct domain *dom;
1588 	int error;
1589 
1590 	if (argp->ex_addrlen == 0) {
1591 		if (mp->mnt_flag & MNT_DEFEXPORTED)
1592 			return (EPERM);
1593 		np = &nep->ne_defexported;
1594 		np->netc_exflags = argp->ex_flags;
1595 		np->netc_anon = argp->ex_anon;
1596 		np->netc_anon.cr_ref = 1;
1597 		mp->mnt_flag |= MNT_DEFEXPORTED;
1598 		return (0);
1599 	}
1600 
1601 	if (argp->ex_addrlen < 0 || argp->ex_addrlen > MLEN)
1602 		return (EINVAL);
1603 	if (argp->ex_masklen < 0 || argp->ex_masklen > MLEN)
1604 		return (EINVAL);
1605 
1606 	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
1607 	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
1608 	bzero((caddr_t) np, i);
1609 	saddr = (struct sockaddr *) (np + 1);
1610 	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
1611 		goto out;
1612 	if (saddr->sa_len > argp->ex_addrlen)
1613 		saddr->sa_len = argp->ex_addrlen;
1614 	if (argp->ex_masklen) {
1615 		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
1616 		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
1617 		if (error)
1618 			goto out;
1619 		if (smask->sa_len > argp->ex_masklen)
1620 			smask->sa_len = argp->ex_masklen;
1621 	}
1622 	i = saddr->sa_family;
1623 	if ((rnh = nep->ne_rtable[i]) == 0) {
1624 		/*
1625 		 * Seems silly to initialize every AF when most are not used,
1626 		 * do so on demand here
1627 		 */
1628 		SLIST_FOREACH(dom, &domains, dom_next)
1629 			if (dom->dom_family == i && dom->dom_rtattach) {
1630 				dom->dom_rtattach((void **) &nep->ne_rtable[i],
1631 				    dom->dom_rtoffset);
1632 				break;
1633 			}
1634 		if ((rnh = nep->ne_rtable[i]) == 0) {
1635 			error = ENOBUFS;
1636 			goto out;
1637 		}
1638 	}
1639 	rn = (*rnh->rnh_addaddr) ((char *) saddr, (char *) smask, rnh,
1640 	    np->netc_rnodes);
1641 	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
1642 		error = EPERM;
1643 		goto out;
1644 	}
1645 	np->netc_exflags = argp->ex_flags;
1646 	np->netc_anon = argp->ex_anon;
1647 	np->netc_anon.cr_ref = 1;
1648 	return (0);
1649 out:
1650 	free(np, M_NETADDR);
1651 	return (error);
1652 }
1653 
1654 /* ARGSUSED */
1655 static int
1656 vfs_free_netcred(struct radix_node *rn, void *w)
1657 {
1658 	struct radix_node_head *rnh = (struct radix_node_head *) w;
1659 
1660 	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
1661 	free((caddr_t) rn, M_NETADDR);
1662 	return (0);
1663 }
1664 
1665 /*
1666  * Free the net address hash lists that are hanging off the mount points.
1667  */
1668 static void
1669 vfs_free_addrlist(struct netexport *nep)
1670 {
1671 	int i;
1672 	struct radix_node_head *rnh;
1673 
1674 	for (i = 0; i <= AF_MAX; i++)
1675 		if ((rnh = nep->ne_rtable[i])) {
1676 			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
1677 			    (caddr_t) rnh);
1678 			free((caddr_t) rnh, M_RTABLE);
1679 			nep->ne_rtable[i] = 0;
1680 		}
1681 }
1682 
1683 int
1684 vfs_export(struct mount *mp, struct netexport *nep, struct export_args *argp)
1685 {
1686 	int error;
1687 
1688 	if (argp->ex_flags & MNT_DELEXPORT) {
1689 		if (mp->mnt_flag & MNT_EXPUBLIC) {
1690 			vfs_setpublicfs(NULL, NULL, NULL);
1691 			mp->mnt_flag &= ~MNT_EXPUBLIC;
1692 		}
1693 		vfs_free_addrlist(nep);
1694 		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
1695 	}
1696 	if (argp->ex_flags & MNT_EXPORTED) {
1697 		if (argp->ex_flags & MNT_EXPUBLIC) {
1698 			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
1699 				return (error);
1700 			mp->mnt_flag |= MNT_EXPUBLIC;
1701 		}
1702 		if ((error = vfs_hang_addrlist(mp, nep, argp)))
1703 			return (error);
1704 		mp->mnt_flag |= MNT_EXPORTED;
1705 	}
1706 	return (0);
1707 }
1708 
1709 
1710 /*
1711  * Set the publicly exported filesystem (WebNFS). Currently, only
1712  * one public filesystem is possible in the spec (RFC 2054 and 2055)
1713  */
1714 int
1715 vfs_setpublicfs(struct mount *mp, struct netexport *nep,
1716 		struct export_args *argp)
1717 {
1718 	int error;
1719 	struct vnode *rvp;
1720 	char *cp;
1721 
1722 	/*
1723 	 * mp == NULL -> invalidate the current info, the FS is
1724 	 * no longer exported. May be called from either vfs_export
1725 	 * or unmount, so check if it hasn't already been done.
1726 	 */
1727 	if (mp == NULL) {
1728 		if (nfs_pub.np_valid) {
1729 			nfs_pub.np_valid = 0;
1730 			if (nfs_pub.np_index != NULL) {
1731 				FREE(nfs_pub.np_index, M_TEMP);
1732 				nfs_pub.np_index = NULL;
1733 			}
1734 		}
1735 		return (0);
1736 	}
1737 
1738 	/*
1739 	 * Only one allowed at a time.
1740 	 */
1741 	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
1742 		return (EBUSY);
1743 
1744 	/*
1745 	 * Get real filehandle for root of exported FS.
1746 	 */
1747 	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
1748 	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
1749 
1750 	if ((error = VFS_ROOT(mp, &rvp)))
1751 		return (error);
1752 
1753 	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
1754 		return (error);
1755 
1756 	vput(rvp);
1757 
1758 	/*
1759 	 * If an indexfile was specified, pull it in.
1760 	 */
1761 	if (argp->ex_indexfile != NULL) {
1762 		int namelen;
1763 
1764 		error = vn_get_namelen(rvp, &namelen);
1765 		if (error)
1766 			return (error);
1767 		MALLOC(nfs_pub.np_index, char *, namelen, M_TEMP,
1768 		    M_WAITOK);
1769 		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
1770 		    namelen, (size_t *)0);
1771 		if (!error) {
1772 			/*
1773 			 * Check for illegal filenames.
1774 			 */
1775 			for (cp = nfs_pub.np_index; *cp; cp++) {
1776 				if (*cp == '/') {
1777 					error = EINVAL;
1778 					break;
1779 				}
1780 			}
1781 		}
1782 		if (error) {
1783 			FREE(nfs_pub.np_index, M_TEMP);
1784 			return (error);
1785 		}
1786 	}
1787 
1788 	nfs_pub.np_mount = mp;
1789 	nfs_pub.np_valid = 1;
1790 	return (0);
1791 }
1792 
1793 struct netcred *
1794 vfs_export_lookup(struct mount *mp, struct netexport *nep,
1795 		struct sockaddr *nam)
1796 {
1797 	struct netcred *np;
1798 	struct radix_node_head *rnh;
1799 	struct sockaddr *saddr;
1800 
1801 	np = NULL;
1802 	if (mp->mnt_flag & MNT_EXPORTED) {
1803 		/*
1804 		 * Lookup in the export list first.
1805 		 */
1806 		if (nam != NULL) {
1807 			saddr = nam;
1808 			rnh = nep->ne_rtable[saddr->sa_family];
1809 			if (rnh != NULL) {
1810 				np = (struct netcred *)
1811 					(*rnh->rnh_matchaddr)((char *)saddr,
1812 							      rnh);
1813 				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
1814 					np = NULL;
1815 			}
1816 		}
1817 		/*
1818 		 * If no address match, use the default if it exists.
1819 		 */
1820 		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
1821 			np = &nep->ne_defexported;
1822 	}
1823 	return (np);
1824 }
1825 
1826 /*
1827  * perform msync on all vnodes under a mount point.  The mount point must
1828  * be locked.  This code is also responsible for lazy-freeing unreferenced
1829  * vnodes whos VM objects no longer contain pages.
1830  *
1831  * NOTE: MNT_WAIT still skips vnodes in the VXLOCK state.
1832  *
1833  * NOTE: XXX VOP_PUTPAGES and friends requires that the vnode be locked,
1834  * but vnode_pager_putpages() doesn't lock the vnode.  We have to do it
1835  * way up in this high level function.
1836  */
1837 static int vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data);
1838 static int vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data);
1839 
1840 void
1841 vfs_msync(struct mount *mp, int flags)
1842 {
1843 	int vmsc_flags;
1844 
1845 	vmsc_flags = VMSC_GETVP;
1846 	if (flags != MNT_WAIT)
1847 		vmsc_flags |= VMSC_NOWAIT;
1848 	vmntvnodescan(mp, vmsc_flags, vfs_msync_scan1, vfs_msync_scan2,
1849 			(void *)flags);
1850 }
1851 
1852 /*
1853  * scan1 is a fast pre-check.  There could be hundreds of thousands of
1854  * vnodes, we cannot afford to do anything heavy weight until we have a
1855  * fairly good indication that there is work to do.
1856  */
1857 static
1858 int
1859 vfs_msync_scan1(struct mount *mp, struct vnode *vp, void *data)
1860 {
1861 	int flags = (int)data;
1862 
1863 	if ((vp->v_flag & VRECLAIMED) == 0) {
1864 		if (vshouldfree(vp, 0))
1865 			return(0);	/* call scan2 */
1866 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
1867 		    (vp->v_flag & VOBJDIRTY) &&
1868 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
1869 			return(0);	/* call scan2 */
1870 		}
1871 	}
1872 
1873 	/*
1874 	 * do not call scan2, continue the loop
1875 	 */
1876 	return(-1);
1877 }
1878 
1879 /*
1880  * This callback is handed a locked vnode.
1881  */
1882 static
1883 int
1884 vfs_msync_scan2(struct mount *mp, struct vnode *vp, void *data)
1885 {
1886 	vm_object_t obj;
1887 	int flags = (int)data;
1888 
1889 	if (vp->v_flag & VRECLAIMED)
1890 		return(0);
1891 
1892 	if ((mp->mnt_flag & MNT_RDONLY) == 0 && (vp->v_flag & VOBJDIRTY)) {
1893 		if ((obj = vp->v_object) != NULL) {
1894 			vm_object_page_clean(obj, 0, 0,
1895 			 flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
1896 		}
1897 	}
1898 	return(0);
1899 }
1900 
1901 /*
1902  * Record a process's interest in events which might happen to
1903  * a vnode.  Because poll uses the historic select-style interface
1904  * internally, this routine serves as both the ``check for any
1905  * pending events'' and the ``record my interest in future events''
1906  * functions.  (These are done together, while the lock is held,
1907  * to avoid race conditions.)
1908  */
1909 int
1910 vn_pollrecord(struct vnode *vp, int events)
1911 {
1912 	lwkt_tokref ilock;
1913 
1914 	KKASSERT(curthread->td_proc != NULL);
1915 
1916 	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
1917 	if (vp->v_pollinfo.vpi_revents & events) {
1918 		/*
1919 		 * This leaves events we are not interested
1920 		 * in available for the other process which
1921 		 * which presumably had requested them
1922 		 * (otherwise they would never have been
1923 		 * recorded).
1924 		 */
1925 		events &= vp->v_pollinfo.vpi_revents;
1926 		vp->v_pollinfo.vpi_revents &= ~events;
1927 
1928 		lwkt_reltoken(&ilock);
1929 		return events;
1930 	}
1931 	vp->v_pollinfo.vpi_events |= events;
1932 	selrecord(curthread, &vp->v_pollinfo.vpi_selinfo);
1933 	lwkt_reltoken(&ilock);
1934 	return 0;
1935 }
1936 
1937 /*
1938  * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
1939  * it is possible for us to miss an event due to race conditions, but
1940  * that condition is expected to be rare, so for the moment it is the
1941  * preferred interface.
1942  */
1943 void
1944 vn_pollevent(struct vnode *vp, int events)
1945 {
1946 	lwkt_tokref ilock;
1947 
1948 	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
1949 	if (vp->v_pollinfo.vpi_events & events) {
1950 		/*
1951 		 * We clear vpi_events so that we don't
1952 		 * call selwakeup() twice if two events are
1953 		 * posted before the polling process(es) is
1954 		 * awakened.  This also ensures that we take at
1955 		 * most one selwakeup() if the polling process
1956 		 * is no longer interested.  However, it does
1957 		 * mean that only one event can be noticed at
1958 		 * a time.  (Perhaps we should only clear those
1959 		 * event bits which we note?) XXX
1960 		 */
1961 		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
1962 		vp->v_pollinfo.vpi_revents |= events;
1963 		selwakeup(&vp->v_pollinfo.vpi_selinfo);
1964 	}
1965 	lwkt_reltoken(&ilock);
1966 }
1967 
1968 /*
1969  * Wake up anyone polling on vp because it is being revoked.
1970  * This depends on dead_poll() returning POLLHUP for correct
1971  * behavior.
1972  */
1973 void
1974 vn_pollgone(struct vnode *vp)
1975 {
1976 	lwkt_tokref ilock;
1977 
1978 	lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
1979 	if (vp->v_pollinfo.vpi_events) {
1980 		vp->v_pollinfo.vpi_events = 0;
1981 		selwakeup(&vp->v_pollinfo.vpi_selinfo);
1982 	}
1983 	lwkt_reltoken(&ilock);
1984 }
1985 
1986 /*
1987  * extract the dev_t from a VBLK or VCHR.  The vnode must have been opened
1988  * (or v_rdev might be NULL).
1989  */
1990 dev_t
1991 vn_todev(struct vnode *vp)
1992 {
1993 	if (vp->v_type != VBLK && vp->v_type != VCHR)
1994 		return (NODEV);
1995 	KKASSERT(vp->v_rdev != NULL);
1996 	return (vp->v_rdev);
1997 }
1998 
1999 /*
2000  * Check if vnode represents a disk device.  The vnode does not need to be
2001  * opened.
2002  */
2003 int
2004 vn_isdisk(struct vnode *vp, int *errp)
2005 {
2006 	dev_t dev;
2007 
2008 	if (vp->v_type != VBLK && vp->v_type != VCHR) {
2009 		if (errp != NULL)
2010 			*errp = ENOTBLK;
2011 		return (0);
2012 	}
2013 
2014 	if ((dev = vp->v_rdev) == NULL)
2015 		dev = udev2dev(vp->v_udev, (vp->v_type == VBLK));
2016 	if (dev == NULL || dev == NODEV) {
2017 		if (errp != NULL)
2018 			*errp = ENXIO;
2019 		return (0);
2020 	}
2021 	if (dev_is_good(dev) == 0) {
2022 		if (errp != NULL)
2023 			*errp = ENXIO;
2024 		return (0);
2025 	}
2026 	if ((dev_dflags(dev) & D_DISK) == 0) {
2027 		if (errp != NULL)
2028 			*errp = ENOTBLK;
2029 		return (0);
2030 	}
2031 	if (errp != NULL)
2032 		*errp = 0;
2033 	return (1);
2034 }
2035 
2036 #ifdef DEBUG_VFS_LOCKS
2037 
2038 void
2039 assert_vop_locked(struct vnode *vp, const char *str)
2040 {
2041 	if (vp && !VOP_ISLOCKED(vp, NULL)) {
2042 		panic("%s: %p is not locked shared but should be", str, vp);
2043 	}
2044 }
2045 
2046 void
2047 assert_vop_unlocked(struct vnode *vp, const char *str)
2048 {
2049 	if (vp) {
2050 		if (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) {
2051 			panic("%s: %p is locked but should not be", str, vp);
2052 		}
2053 	}
2054 }
2055 
2056 #endif
2057 
2058 int
2059 vn_get_namelen(struct vnode *vp, int *namelen)
2060 {
2061 	int error, retval[2];
2062 
2063 	error = VOP_PATHCONF(vp, _PC_NAME_MAX, retval);
2064 	if (error)
2065 		return (error);
2066 	*namelen = *retval;
2067 	return (0);
2068 }
2069 
2070 int
2071 vop_write_dirent(int *error, struct uio *uio, ino_t d_ino, uint8_t d_type,
2072 		uint16_t d_namlen, const char *d_name)
2073 {
2074 	struct dirent *dp;
2075 	size_t len;
2076 
2077 	len = _DIRENT_RECLEN(d_namlen);
2078 	if (len > uio->uio_resid)
2079 		return(1);
2080 
2081 	dp = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
2082 
2083 	dp->d_ino = d_ino;
2084 	dp->d_namlen = d_namlen;
2085 	dp->d_type = d_type;
2086 	bcopy(d_name, dp->d_name, d_namlen);
2087 
2088 	*error = uiomove((caddr_t)dp, len, uio);
2089 
2090 	free(dp, M_TEMP);
2091 
2092 	return(0);
2093 }
2094 
2095