1 /*	$OpenBSD: vfs_subr.c,v 1.325 2024/10/31 10:06:51 mvs Exp $	*/
2 /*	$NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1989, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * (c) UNIX System Laboratories, Inc.
8  * All or some portions of this file are derived from material licensed
9  * to the University of California by American Telephone and Telegraph
10  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
11  * the permission of UNIX System Laboratories, Inc.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
38  */
39 
40 /*
41  * External virtual filesystem routines
42  */
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/proc.h>
47 #include <sys/sysctl.h>
48 #include <sys/mount.h>
49 #include <sys/fcntl.h>
50 #include <sys/conf.h>
51 #include <sys/vnode.h>
52 #include <sys/lock.h>
53 #include <sys/lockf.h>
54 #include <sys/stat.h>
55 #include <sys/acct.h>
56 #include <sys/namei.h>
57 #include <sys/ucred.h>
58 #include <sys/buf.h>
59 #include <sys/errno.h>
60 #include <sys/malloc.h>
61 #include <sys/mbuf.h>
62 #include <sys/syscallargs.h>
63 #include <sys/pool.h>
64 #include <sys/tree.h>
65 #include <sys/specdev.h>
66 #include <sys/atomic.h>
67 
68 #include <netinet/in.h>
69 
70 #include <uvm/uvm_extern.h>
71 #include <uvm/uvm_vnode.h>
72 
73 #include "softraid.h"
74 
75 /*
76  * Locks used to protect data:
77  *	a	atomic
78  */
79 
80 void sr_quiesce(void);
81 
82 enum vtype iftovt_tab[16] = {
83 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
84 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
85 };
86 
87 int	vttoif_tab[9] = {
88 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
89 	S_IFSOCK, S_IFIFO, S_IFMT,
90 };
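
/*
 * Illustrative sketch (hypothetical, compiled out; EXAMPLE_SKETCH is never
 * defined): the two tables above back the IFTOVT()/VTTOIF() conversions
 * declared in <sys/vnode.h>, mapping between inode format bits and vnode
 * types, e.g. when a filesystem loads an on-disk inode.
 */
#ifdef EXAMPLE_SKETCH
static enum vtype
example_mode_to_vtype(mode_t mode)
{
	/* same indexing as IFTOVT(): use the file format bits as the index */
	return (iftovt_tab[(mode & S_IFMT) >> 12]);
}
#endif /* EXAMPLE_SKETCH */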
91 
92 int prtactive = 0;	/* 1 => print out reclaim of active vnodes */
93 int suid_clear = 1;	/* [a] 1 => clear SUID / SGID on owner change */
94 
95 /*
96  * Insq/Remq for the vnode usage lists.
97  */
98 #define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
99 #define	bufremvn(bp) {							\
100 	LIST_REMOVE(bp, b_vnbufs);					\
101 	LIST_NEXT(bp, b_vnbufs) = NOLIST;				\
102 }
103 
104 TAILQ_HEAD(freelst, vnode);
105 struct freelst vnode_hold_list;	/* list of vnodes referencing buffers */
106 struct freelst vnode_free_list;	/* vnode free list */
107 
108 struct mntlist mountlist;	/* mounted filesystem list */
109 
110 void	vclean(struct vnode *, int, struct proc *);
111 
112 void insmntque(struct vnode *, struct mount *);
113 int getdevvp(dev_t, struct vnode **, enum vtype);
114 
115 int vfs_hang_addrlist(struct mount *, struct netexport *,
116 				  struct export_args *);
117 int vfs_free_netcred(struct radix_node *, void *, u_int);
118 void vfs_free_addrlist(struct netexport *);
119 void vputonfreelist(struct vnode *);
120 
121 int vflush_vnode(struct vnode *, void *);
122 int maxvnodes;
123 
124 struct mutex vnode_mtx = MUTEX_INITIALIZER(IPL_BIO);
125 
126 void vfs_unmountall(void);
127 
128 #ifdef DEBUG
129 void printlockedvnodes(void);
130 #endif
131 
132 struct pool vnode_pool;
133 struct pool uvm_vnode_pool;
134 
135 static inline int rb_buf_compare(const struct buf *b1, const struct buf *b2);
136 RBT_GENERATE(buf_rb_bufs, buf, b_rbbufs, rb_buf_compare);
137 
138 static inline int
139 rb_buf_compare(const struct buf *b1, const struct buf *b2)
140 {
141 	if (b1->b_lblkno < b2->b_lblkno)
142 		return(-1);
143 	if (b1->b_lblkno > b2->b_lblkno)
144 		return(1);
145 	return(0);
146 }
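
/*
 * Illustrative sketch (hypothetical, compiled out): a vnode's buffers live
 * in v_bufs_tree keyed on b_lblkno, so a lookup is an RBT_FIND() with a
 * key buffer on the stack, matching the comparator above.
 */
#ifdef EXAMPLE_SKETCH
static struct buf *
example_find_buf(struct vnode *vp, daddr_t lblkno)
{
	struct buf key;

	key.b_lblkno = lblkno;
	return (RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &key));
}
#endif /* EXAMPLE_SKETCH */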
147 
148 /*
149  * Initialize the vnode management data structures.
150  */
151 void
152 vntblinit(void)
153 {
154 	/* buffer cache may need a vnode for each buffer */
155 	maxvnodes = 2 * initialvnodes;
156 	pool_init(&vnode_pool, sizeof(struct vnode), 0, IPL_NONE,
157 	    PR_WAITOK, "vnodes", NULL);
158 	pool_init(&uvm_vnode_pool, sizeof(struct uvm_vnode), 0, IPL_NONE,
159 	    PR_WAITOK, "uvmvnodes", NULL);
160 	TAILQ_INIT(&vnode_hold_list);
161 	TAILQ_INIT(&vnode_free_list);
162 	TAILQ_INIT(&mountlist);
163 	/*
164 	 * Initialize the filesystem syncer.
165 	 */
166 	vn_initialize_syncerd();
167 
168 #ifdef NFSSERVER
169 	rn_init(sizeof(struct sockaddr_in));
170 #endif /* NFSSERVER */
171 }
172 
173 /*
174  * Allocate a mount point.
175  *
176  * The returned mount point is marked as busy.
177  */
178 struct mount *
179 vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp)
180 {
181 	struct mount *mp;
182 
183 	mp = malloc(sizeof(*mp), M_MOUNT, M_WAITOK|M_ZERO);
184 	rw_init_flags(&mp->mnt_lock, "vfslock", RWL_IS_VNODE);
185 	(void)vfs_busy(mp, VB_READ|VB_NOWAIT);
186 
187 	TAILQ_INIT(&mp->mnt_vnodelist);
188 	mp->mnt_vnodecovered = vp;
189 
190 	atomic_inc_int(&vfsp->vfc_refcount);
191 	mp->mnt_vfc = vfsp;
192 	mp->mnt_op = vfsp->vfc_vfsops;
193 	mp->mnt_flag = vfsp->vfc_flags;
194 	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
195 
196 	return (mp);
197 }
198 
199 /*
200  * Release a mount point.
201  */
202 void
203 vfs_mount_free(struct mount *mp)
204 {
205 	atomic_dec_int(&mp->mnt_vfc->vfc_refcount);
206 	free(mp, M_MOUNT, sizeof(*mp));
207 }
208 
209 /*
210  * Mark a mount point as busy. Used to synchronize access and to delay
211  * unmounting.
212  *
213  * Default behaviour is to attempt getting a READ lock and in case of an
214  * ongoing unmount, to wait for it to finish and then return failure.
215  */
216 int
217 vfs_busy(struct mount *mp, int flags)
218 {
219 	int rwflags = 0;
220 
221 	if (flags & VB_WRITE)
222 		rwflags |= RW_WRITE;
223 	else
224 		rwflags |= RW_READ;
225 
226 	if (flags & VB_WAIT)
227 		rwflags |= RW_SLEEPFAIL;
228 	else
229 		rwflags |= RW_NOSLEEP;
230 
231 #ifdef WITNESS
232 	if (flags & VB_DUPOK)
233 		rwflags |= RW_DUPOK;
234 #endif
235 
236 	if (rw_enter(&mp->mnt_lock, rwflags))
237 		return (EBUSY);
238 
239 	return (0);
240 }
241 
242 /*
243  * Free a busy file system
244  */
245 void
246 vfs_unbusy(struct mount *mp)
247 {
248 	rw_exit(&mp->mnt_lock);
249 }
250 
251 int
252 vfs_isbusy(struct mount *mp)
253 {
254 	return (rw_status(&mp->mnt_lock) != 0);
255 }
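
/*
 * Illustrative sketch (hypothetical, compiled out): the usual consumer
 * pattern is to busy a mount point for reading before inspecting it and to
 * unbusy it afterwards, the same way printlockedvnodes() does further down.
 */
#ifdef EXAMPLE_SKETCH
static void
example_walk_mounts(void)
{
	struct mount *mp;

	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (vfs_busy(mp, VB_READ|VB_NOWAIT))
			continue;	/* unmount in progress, skip it */
		/* ... inspect mp; it cannot be unmounted here ... */
		vfs_unbusy(mp);
	}
}
#endif /* EXAMPLE_SKETCH */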
256 
257 /*
258  * Lookup a filesystem type, and if found allocate and initialize
259  * a mount structure for it.
260  *
261  * Devname is usually updated by mount(8) after booting.
262  */
263 int
264 vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp)
265 {
266 	struct vfsconf *vfsp;
267 	struct mount *mp;
268 
269 	vfsp = vfs_byname(fstypename);
270 	if (vfsp == NULL)
271 		return (ENODEV);
272 	mp = vfs_mount_alloc(NULLVP, vfsp);
273 	mp->mnt_flag |= MNT_RDONLY;
274 	mp->mnt_stat.f_mntonname[0] = '/';
275 	strlcpy(mp->mnt_stat.f_mntfromname, devname, MNAMELEN);
276 	strlcpy(mp->mnt_stat.f_mntfromspec, devname, MNAMELEN);
277 	*mpp = mp;
278 	return (0);
279 }
280 
281 /*
282  * Lookup a mount point by filesystem identifier.
283  */
284 struct mount *
285 vfs_getvfs(fsid_t *fsid)
286 {
287 	struct mount *mp;
288 
289 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
290 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
291 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
292 			return (mp);
293 		}
294 	}
295 
296 	return (NULL);
297 }
298 
299 
300 /*
301  * Get a new unique fsid
302  */
303 void
304 vfs_getnewfsid(struct mount *mp)
305 {
306 	static u_short xxxfs_mntid;
307 
308 	fsid_t tfsid;
309 	int mtype;
310 
311 	mtype = mp->mnt_vfc->vfc_typenum;
312 	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
313 	mp->mnt_stat.f_fsid.val[1] = mtype;
314 	if (xxxfs_mntid == 0)
315 		++xxxfs_mntid;
316 	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
317 	tfsid.val[1] = mtype;
318 	if (!TAILQ_EMPTY(&mountlist)) {
319 		while (vfs_getvfs(&tfsid)) {
320 			tfsid.val[0]++;
321 			xxxfs_mntid++;
322 		}
323 	}
324 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
325 }
326 
327 /*
328  * Set vnode attributes to VNOVAL
329  */
330 void
331 vattr_null(struct vattr *vap)
332 {
333 
334 	vap->va_type = VNON;
335 	/*
336 	 * Don't get fancy: u_quad_t = u_int = VNOVAL leaves the u_quad_t
337 	 * with 2^31-1 instead of 2^64-1.  Just write'm out and let
338 	 * the compiler do its job.
339 	 */
340 	vap->va_mode = VNOVAL;
341 	vap->va_nlink = VNOVAL;
342 	vap->va_uid = VNOVAL;
343 	vap->va_gid = VNOVAL;
344 	vap->va_fsid = VNOVAL;
345 	vap->va_fileid = VNOVAL;
346 	vap->va_size = VNOVAL;
347 	vap->va_blocksize = VNOVAL;
348 	vap->va_atime.tv_sec = VNOVAL;
349 	vap->va_atime.tv_nsec = VNOVAL;
350 	vap->va_mtime.tv_sec = VNOVAL;
351 	vap->va_mtime.tv_nsec = VNOVAL;
352 	vap->va_ctime.tv_sec = VNOVAL;
353 	vap->va_ctime.tv_nsec = VNOVAL;
354 	vap->va_gen = VNOVAL;
355 	vap->va_flags = VNOVAL;
356 	vap->va_rdev = VNOVAL;
357 	vap->va_bytes = VNOVAL;
358 	vap->va_filerev = VNOVAL;
359 	vap->va_vaflags = 0;
360 }
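
/*
 * Illustrative sketch (hypothetical, compiled out): callers typically null
 * out a vattr and then set only the fields they intend to change before
 * handing it to VOP_SETATTR(), e.g. truncating a file to zero length.
 */
#ifdef EXAMPLE_SKETCH
static int
example_truncate(struct vnode *vp, struct ucred *cred, struct proc *p)
{
	struct vattr va;

	vattr_null(&va);	/* everything starts out as VNOVAL */
	va.va_size = 0;		/* change only the size */
	return (VOP_SETATTR(vp, &va, cred, p));
}
#endif /* EXAMPLE_SKETCH */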
361 
362 /*
363  * Routines having to do with the management of the vnode table.
364  */
365 long numvnodes;
366 
367 /*
368  * Return the next vnode from the free list.
369  */
370 int
371 getnewvnode(enum vtagtype tag, struct mount *mp, const struct vops *vops,
372     struct vnode **vpp)
373 {
374 	struct proc *p = curproc;
375 	struct freelst *listhd;
376 	static int toggle;
377 	struct vnode *vp;
378 	int s;
379 
380 	/*
381 	 * allow maxvnodes to increase if the buffer cache itself
382 	 * is big enough to justify it. (we don't shrink it ever)
383 	 */
384 	maxvnodes = maxvnodes < bcstats.numbufs ? bcstats.numbufs
385 	    : maxvnodes;
386 
387 	/*
388 	 * We must choose whether to allocate a new vnode or recycle an
389 	 * existing one. The criterion for allocating a new one is that
390 	 * the total number of vnodes is less than the number desired or
391 	 * there are no vnodes on either free list. Generally we only
392 	 * want to recycle vnodes that have no buffers associated with
393 	 * them, so we look first on the vnode_free_list. If it is empty,
394 	 * we next consider vnodes with referencing buffers on the
395 	 * vnode_hold_list. The toggle ensures that half the time we
396 	 * will use a buffer from the vnode_hold_list, and half the time
397 	 * we will allocate a new one unless the list has grown to twice
398 	 * the desired size. We are reluctant to recycle vnodes from the
399 	 * vnode_hold_list because we will lose the identity of all its
400 	 * referencing buffers.
401 	 */
402 	toggle ^= 1;
403 	if (numvnodes / 2 > maxvnodes)
404 		toggle = 0;
405 
406 	s = splbio();
407 	if ((numvnodes < maxvnodes) ||
408 	    ((TAILQ_FIRST(listhd = &vnode_free_list) == NULL) &&
409 	    ((TAILQ_FIRST(listhd = &vnode_hold_list) == NULL) || toggle))) {
410 		splx(s);
411 		vp = pool_get(&vnode_pool, PR_WAITOK | PR_ZERO);
412 		vp->v_uvm = pool_get(&uvm_vnode_pool, PR_WAITOK | PR_ZERO);
413 		vp->v_uvm->u_vnode = vp;
414 		uvm_obj_init(&vp->v_uvm->u_obj, &uvm_vnodeops, 0);
415 		RBT_INIT(buf_rb_bufs, &vp->v_bufs_tree);
416 		cache_tree_init(&vp->v_nc_tree);
417 		TAILQ_INIT(&vp->v_cache_dst);
418 		numvnodes++;
419 	} else {
420 		TAILQ_FOREACH(vp, listhd, v_freelist) {
421 			if (VOP_ISLOCKED(vp) == 0)
422 				break;
423 		}
424 		/*
425 		 * Unless this is a bad time of the month, at most
426 		 * the first NCPUS items on the free list are
427 		 * locked, so this is close enough to being empty.
428 		 */
429 		if (vp == NULL) {
430 			splx(s);
431 			tablefull("vnode");
432 			*vpp = NULL;
433 			return (ENFILE);
434 		}
435 
436 #ifdef DIAGNOSTIC
437 		if (vp->v_usecount) {
438 			vprint("free vnode", vp);
439 			panic("free vnode isn't");
440 		}
441 #endif
442 
443 		TAILQ_REMOVE(listhd, vp, v_freelist);
444 		vp->v_bioflag &= ~VBIOONFREELIST;
445 		splx(s);
446 
447 		if (vp->v_type != VBAD)
448 			vgonel(vp, p);
449 #ifdef DIAGNOSTIC
450 		if (vp->v_data) {
451 			vprint("cleaned vnode", vp);
452 			panic("cleaned vnode isn't");
453 		}
454 		s = splbio();
455 		if (vp->v_numoutput)
456 			panic("Clean vnode has pending I/O's");
457 		splx(s);
458 #endif
459 		vp->v_flag = 0;
460 		vp->v_socket = NULL;
461 	}
462 	cache_purge(vp);
463 	vp->v_type = VNON;
464 	vp->v_tag = tag;
465 	vp->v_op = vops;
466 	insmntque(vp, mp);
467 	*vpp = vp;
468 	vp->v_usecount = 1;
469 	vp->v_data = NULL;
470 	return (0);
471 }
472 
473 /*
474  * Move a vnode from one mount queue to another.
475  */
476 void
477 insmntque(struct vnode *vp, struct mount *mp)
478 {
479 	/*
480 	 * Delete from old mount point vnode list, if on one.
481 	 */
482 	if (vp->v_mount != NULL)
483 		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
484 	/*
485 	 * Insert into list of vnodes for the new mount point, if available.
486 	 */
487 	if ((vp->v_mount = mp) != NULL)
488 		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
489 }
490 
491 /*
492  * Create a vnode for a block device.
493  * Used for root filesystem, argdev, and swap areas.
494  * Also used for memory file system special devices.
495  */
496 int
497 bdevvp(dev_t dev, struct vnode **vpp)
498 {
499 	return (getdevvp(dev, vpp, VBLK));
500 }
501 
502 /*
503  * Create a vnode for a character device.
504  * Used for console handling.
505  */
506 int
507 cdevvp(dev_t dev, struct vnode **vpp)
508 {
509 	return (getdevvp(dev, vpp, VCHR));
510 }
511 
512 /*
513  * Create a vnode for a device.
514  * Used by bdevvp (block device) for root file system etc.,
515  * and by cdevvp (character device) for console.
516  */
517 int
518 getdevvp(dev_t dev, struct vnode **vpp, enum vtype type)
519 {
520 	struct vnode *vp;
521 	struct vnode *nvp;
522 	int error;
523 
524 	if (dev == NODEV) {
525 		*vpp = NULLVP;
526 		return (0);
527 	}
528 	error = getnewvnode(VT_NON, NULL, &spec_vops, &nvp);
529 	if (error) {
530 		*vpp = NULLVP;
531 		return (error);
532 	}
533 	vp = nvp;
534 	vp->v_type = type;
535 	if ((nvp = checkalias(vp, dev, NULL)) != NULL) {
536 		vput(vp);
537 		vp = nvp;
538 	}
539 	if (vp->v_type == VCHR && cdevsw[major(vp->v_rdev)].d_type == D_TTY)
540 		vp->v_flag |= VISTTY;
541 	*vpp = vp;
542 	return (0);
543 }
544 
545 /*
546  * Check to see if the new vnode represents a special device
547  * for which we already have a vnode (either because of
548  * bdevvp() or because of a different vnode representing
549  * the same block device). If such an alias exists, deallocate
550  * the existing contents and return the aliased vnode. The
551  * caller is responsible for filling it with its new contents.
552  */
553 struct vnode *
554 checkalias(struct vnode *nvp, dev_t nvp_rdev, struct mount *mp)
555 {
556 	struct proc *p = curproc;
557 	struct vnode *vp;
558 	struct vnodechain *vchain;
559 
560 	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
561 		return (NULLVP);
562 
563 	vchain = &speclisth[SPECHASH(nvp_rdev)];
564 loop:
565 	SLIST_FOREACH(vp, vchain, v_specnext) {
566 		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) {
567 			continue;
568 		}
569 		/*
570 		 * Alias, but not in use, so flush it out.
571 		 */
572 		if (vp->v_usecount == 0) {
573 			vgonel(vp, p);
574 			goto loop;
575 		}
576 		if (vget(vp, LK_EXCLUSIVE)) {
577 			goto loop;
578 		}
579 		break;
580 	}
581 
582 	/*
583 	 * Common case is actually in the if statement
584 	 */
585 	if (vp == NULL || !(vp->v_tag == VT_NON && vp->v_type == VBLK)) {
586 		nvp->v_specinfo = malloc(sizeof(struct specinfo), M_VNODE,
587 			M_WAITOK);
588 		nvp->v_rdev = nvp_rdev;
589 		nvp->v_hashchain = vchain;
590 		nvp->v_specmountpoint = NULL;
591 		nvp->v_speclockf = NULL;
592 		nvp->v_specbitmap = NULL;
593 		if (nvp->v_type == VCHR &&
594 		    (cdevsw[major(nvp_rdev)].d_flags & D_CLONE) &&
595 		    (minor(nvp_rdev) >> CLONE_SHIFT == 0)) {
596 			if (vp != NULLVP)
597 				nvp->v_specbitmap = vp->v_specbitmap;
598 			else
599 				nvp->v_specbitmap = malloc(CLONE_MAPSZ,
600 				    M_VNODE, M_WAITOK | M_ZERO);
601 		}
602 		SLIST_INSERT_HEAD(vchain, nvp, v_specnext);
603 		if (vp != NULLVP) {
604 			nvp->v_flag |= VALIASED;
605 			vp->v_flag |= VALIASED;
606 			vput(vp);
607 		}
608 		return (NULLVP);
609 	}
610 
611 	/*
612 	 * This code is the uncommon case. It is called in case
613 	 * we found an alias that was VT_NON && vtype of VBLK
614 	 * This means we found a block device that was created
615 	 * using bdevvp.
616 	 * An example of such a vnode is the root partition device vnode
617 	 * created in ffs_mountroot.
618 	 *
619 	 * The vnodes created by bdevvp should not be aliased (why?).
620 	 */
621 
622 	VOP_UNLOCK(vp);
623 	vclean(vp, 0, p);
624 	vp->v_op = nvp->v_op;
625 	vp->v_tag = nvp->v_tag;
626 	nvp->v_type = VNON;
627 	insmntque(vp, mp);
628 	return (vp);
629 }
630 
631 /*
632  * Grab a particular vnode from the free list, increment its
633  * reference count and lock it. If the vnode lock bit is set,
634  * the vnode is being eliminated in vgone. In that case, we
635  * cannot grab it, so the process is awakened when the
636  * transition is completed, and an error code is returned to
637  * indicate that the vnode is no longer usable, possibly
638  * having been changed to a new file system type.
639  */
640 int
641 vget(struct vnode *vp, int flags)
642 {
643 	int error, s, onfreelist;
644 
645 	/*
646 	 * If the vnode is in the process of being cleaned out for
647 	 * another use, we wait for the cleaning to finish and then
648 	 * return failure. Cleaning is determined by checking that
649 	 * the VXLOCK flag is set.
650 	 */
651 	mtx_enter(&vnode_mtx);
652 	if (vp->v_lflag & VXLOCK) {
653 		if (flags & LK_NOWAIT) {
654 			mtx_leave(&vnode_mtx);
655 			return (EBUSY);
656 		}
657 
658 		vp->v_lflag |= VXWANT;
659 		msleep_nsec(vp, &vnode_mtx, PINOD, "vget", INFSLP);
660 		mtx_leave(&vnode_mtx);
661 		return (ENOENT);
662 	}
663 	mtx_leave(&vnode_mtx);
664 
665 	s = splbio();
666 	onfreelist = vp->v_bioflag & VBIOONFREELIST;
667 	if (vp->v_usecount == 0 && onfreelist) {
668 		if (vp->v_holdcnt > 0)
669 			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
670 		else
671 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
672 		vp->v_bioflag &= ~VBIOONFREELIST;
673 	}
674 	splx(s);
675 
676 	vp->v_usecount++;
677 	if (flags & LK_TYPE_MASK) {
678 		if ((error = vn_lock(vp, flags)) != 0) {
679 			vp->v_usecount--;
680 			if (vp->v_usecount == 0 && onfreelist)
681 				vputonfreelist(vp);
682 		}
683 		return (error);
684 	}
685 
686 	return (0);
687 }
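
/*
 * Illustrative sketch (hypothetical, compiled out): vget() with a lock type
 * both references and locks the vnode, so the matching release is vput(),
 * the same pairing checkalias() uses above.
 */
#ifdef EXAMPLE_SKETCH
static int
example_use_vnode(struct vnode *vp)
{
	int error;

	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
		return (error);		/* vnode was being cleaned out */
	/* ... operate on the locked, referenced vnode ... */
	vput(vp);			/* unlock and drop the reference */
	return (0);
}
#endif /* EXAMPLE_SKETCH */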
688 
689 
690 /* Vnode reference. */
691 void
692 vref(struct vnode *vp)
693 {
694 	KERNEL_ASSERT_LOCKED();
695 
696 #ifdef DIAGNOSTIC
697 	if (vp->v_usecount == 0)
698 		panic("vref used where vget required");
699 	if (vp->v_type == VNON)
700 		panic("vref on a VNON vnode");
701 #endif
702 	vp->v_usecount++;
703 }
704 
705 void
706 vputonfreelist(struct vnode *vp)
707 {
708 	int s;
709 	struct freelst *lst;
710 
711 	s = splbio();
712 #ifdef DIAGNOSTIC
713 	if (vp->v_usecount != 0)
714 		panic("Use count is not zero!");
715 
716 	/*
717 	 * If the hold count is still positive, one or many threads could still
718 	 * be waiting on the vnode lock inside uvn_io().
719 	 */
720 	if (vp->v_holdcnt == 0 && vp->v_lockcount != 0)
721 		panic("%s: lock count is not zero", __func__);
722 
723 	if (vp->v_bioflag & VBIOONFREELIST) {
724 		vprint("vnode already on free list: ", vp);
725 		panic("vnode already on free list");
726 	}
727 #endif
728 
729 	vp->v_bioflag |= VBIOONFREELIST;
730 	vp->v_bioflag &= ~VBIOERROR;
731 
732 	if (vp->v_holdcnt > 0)
733 		lst = &vnode_hold_list;
734 	else
735 		lst = &vnode_free_list;
736 
737 	if (vp->v_type == VBAD)
738 		TAILQ_INSERT_HEAD(lst, vp, v_freelist);
739 	else
740 		TAILQ_INSERT_TAIL(lst, vp, v_freelist);
741 
742 	splx(s);
743 }
744 
745 /*
746  * vput(), just unlock and vrele()
747  */
748 void
749 vput(struct vnode *vp)
750 {
751 	struct proc *p = curproc;
752 	int s;
753 
754 #ifdef DIAGNOSTIC
755 	if (vp == NULL)
756 		panic("vput: null vp");
757 #endif
758 
759 #ifdef DIAGNOSTIC
760 	if (vp->v_usecount == 0) {
761 		vprint("vput: bad ref count", vp);
762 		panic("vput: ref cnt");
763 	}
764 #endif
765 	vp->v_usecount--;
766 	KASSERT(vp->v_usecount > 0 || vp->v_uvcount == 0);
767 	if (vp->v_usecount > 0) {
768 		VOP_UNLOCK(vp);
769 		return;
770 	}
771 
772 #ifdef DIAGNOSTIC
773 	if (vp->v_writecount != 0) {
774 		vprint("vput: bad writecount", vp);
775 		panic("vput: v_writecount != 0");
776 	}
777 #endif
778 
779 	VOP_INACTIVE(vp, p);
780 
781 	s = splbio();
782 	if (vp->v_usecount == 0 && !(vp->v_bioflag & VBIOONFREELIST))
783 		vputonfreelist(vp);
784 	splx(s);
785 }
786 
787 /*
788  * Vnode release - use for active VNODES.
789  * If count drops to zero, call inactive routine and return to freelist.
790  * Returns 0 if it did not sleep.
791  */
792 int
793 vrele(struct vnode *vp)
794 {
795 	struct proc *p = curproc;
796 	int s;
797 
798 #ifdef DIAGNOSTIC
799 	if (vp == NULL)
800 		panic("vrele: null vp");
801 #endif
802 #ifdef DIAGNOSTIC
803 	if (vp->v_usecount == 0) {
804 		vprint("vrele: bad ref count", vp);
805 		panic("vrele: ref cnt");
806 	}
807 #endif
808 	vp->v_usecount--;
809 	if (vp->v_usecount > 0) {
810 		return (0);
811 	}
812 
813 #ifdef DIAGNOSTIC
814 	if (vp->v_writecount != 0) {
815 		vprint("vrele: bad writecount", vp);
816 		panic("vrele: v_writecount != 0");
817 	}
818 #endif
819 
820 	if (vn_lock(vp, LK_EXCLUSIVE)) {
821 #ifdef DIAGNOSTIC
822 		vprint("vrele: cannot lock", vp);
823 #endif
824 		return (1);
825 	}
826 
827 	VOP_INACTIVE(vp, p);
828 
829 	s = splbio();
830 	if (vp->v_usecount == 0 && !(vp->v_bioflag & VBIOONFREELIST))
831 		vputonfreelist(vp);
832 	splx(s);
833 	return (1);
834 }
835 
836 /* Page or buffer structure gets a reference. */
837 void
838 vhold(struct vnode *vp)
839 {
840 	int s;
841 
842 	s = splbio();
843 
844 	/*
845 	 * If it is on the freelist and the hold count is currently
846 	 * zero, move it to the hold list.
847 	 */
848 	if ((vp->v_bioflag & VBIOONFREELIST) &&
849 	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
850 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
851 		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
852 	}
853 	vp->v_holdcnt++;
854 
855 	splx(s);
856 }
857 
858 /* Lose interest in a vnode. */
859 void
860 vdrop(struct vnode *vp)
861 {
862 	int s;
863 
864 	s = splbio();
865 
866 #ifdef DIAGNOSTIC
867 	if (vp->v_holdcnt == 0)
868 		panic("vdrop: zero holdcnt");
869 #endif
870 
871 	vp->v_holdcnt--;
872 
873 	/*
874 	 * If it is on the holdlist and the hold count drops to
875 	 * zero, move it to the free list.
876 	 */
877 	if ((vp->v_bioflag & VBIOONFREELIST) &&
878 	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
879 		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
880 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
881 	}
882 
883 	splx(s);
884 }
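
/*
 * Illustrative sketch (hypothetical, compiled out): vhold()/vdrop() bracket
 * a buffer's (or page's) reference to its vnode; bgetvp() below takes such
 * a hold so that an unused vnode with cached buffers sits on
 * vnode_hold_list instead of vnode_free_list, and the hold is dropped when
 * the buffer is disassociated again.
 */
#ifdef EXAMPLE_SKETCH
static void
example_hold_vnode(struct vnode *vp)
{
	vhold(vp);	/* a buffer now references vp */
	/* ... */
	vdrop(vp);	/* reference gone; vp may return to the free list */
}
#endif /* EXAMPLE_SKETCH */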
885 
886 /*
887  * Remove any vnodes in the vnode table belonging to mount point mp.
888  *
889  * If MNT_NOFORCE is specified, there should not be any active ones,
890  * return error if any are found (nb: this is a user error, not a
891  * system error). If MNT_FORCE is specified, detach any active vnodes
892  * that are found.
893  */
894 #ifdef DEBUG_SYSCTL
895 int busyprt = 0;	/* print out busy vnodes */
896 struct ctldebug debug_vfs_busyprt = { "vfs_busyprt", &busyprt };
897 #endif
898 
899 int
900 vfs_mount_foreach_vnode(struct mount *mp,
901     int (*func)(struct vnode *, void *), void *arg) {
902 	struct vnode *vp, *nvp;
903 	int error = 0;
904 
905 loop:
906 	TAILQ_FOREACH_SAFE(vp, &mp->mnt_vnodelist, v_mntvnodes, nvp) {
907 		if (vp->v_mount != mp)
908 			goto loop;
909 
910 		error = func(vp, arg);
911 
912 		if (error != 0)
913 			break;
914 	}
915 
916 	return (error);
917 }
918 
919 struct vflush_args {
920 	struct vnode *skipvp;
921 	int busy;
922 	int flags;
923 };
924 
925 int
926 vflush_vnode(struct vnode *vp, void *arg)
927 {
928 	struct vflush_args *va = arg;
929 	struct proc *p = curproc;
930 	int empty, s;
931 
932 	if (vp == va->skipvp) {
933 		return (0);
934 	}
935 
936 	if ((va->flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
937 		return (0);
938 	}
939 
940 	/*
941 	 * If WRITECLOSE is set, only flush out regular file
942 	 * vnodes open for writing.
943 	 */
944 	if ((va->flags & WRITECLOSE) &&
945 	    (vp->v_writecount == 0 || vp->v_type != VREG)) {
946 		return (0);
947 	}
948 
949 	/*
950 	 * With v_usecount == 0, all we need to do is clear
951 	 * out the vnode data structures and we are done.
952 	 */
953 	if (vp->v_usecount == 0) {
954 		vgonel(vp, p);
955 		return (0);
956 	}
957 
958 	/*
959 	 * If FORCECLOSE is set, forcibly close the vnode.
960 	 * For block or character devices, revert to an
961 	 * anonymous device. For all other files, just kill them.
962 	 */
963 	if (va->flags & FORCECLOSE) {
964 		if (vp->v_type != VBLK && vp->v_type != VCHR) {
965 			vgonel(vp, p);
966 		} else {
967 			vclean(vp, 0, p);
968 			vp->v_op = &spec_vops;
969 			insmntque(vp, NULL);
970 		}
971 		return (0);
972 	}
973 
974 	/*
975 	 * If set, this is allowed to ignore vnodes which don't
976 	 * have changes pending to disk.
977 	 * XXX Might be nice to check per-fs "inode" flags, but
978 	 * generally the filesystem is sync'd already, right?
979 	 */
980 	s = splbio();
981 	empty = (va->flags & IGNORECLEAN) && LIST_EMPTY(&vp->v_dirtyblkhd);
982 	splx(s);
983 
984 	if (empty)
985 		return (0);
986 
987 #if defined(DEBUG_SYSCTL) && (defined(DEBUG) || defined(DIAGNOSTIC))
988 	if (busyprt)
989 		vprint("vflush: busy vnode", vp);
990 #endif
991 	va->busy++;
992 	return (0);
993 }
994 
995 int
996 vflush(struct mount *mp, struct vnode *skipvp, int flags)
997 {
998 	struct vflush_args va;
999 	va.skipvp = skipvp;
1000 	va.busy = 0;
1001 	va.flags = flags;
1002 
1003 	vfs_mount_foreach_vnode(mp, vflush_vnode, &va);
1004 
1005 	if (va.busy)
1006 		return (EBUSY);
1007 	return (0);
1008 }
1009 
1010 /*
1011  * Disassociate the underlying file system from a vnode.
1012  */
1013 void
1014 vclean(struct vnode *vp, int flags, struct proc *p)
1015 {
1016 	int active, do_wakeup = 0;
1017 	int s;
1018 
1019 	/*
1020 	 * Check to see if the vnode is in use.
1021 	 * If so we have to reference it before we clean it out
1022 	 * so that its count cannot fall to zero and generate a
1023 	 * race against ourselves to recycle it.
1024 	 */
1025 	if ((active = vp->v_usecount) != 0)
1026 		vp->v_usecount++;
1027 
1028 	/*
1029 	 * Prevent the vnode from being recycled or
1030 	 * brought into use while we clean it out.
1031 	 */
1032 	mtx_enter(&vnode_mtx);
1033 	if (vp->v_lflag & VXLOCK)
1034 		panic("vclean: deadlock");
1035 	vp->v_lflag |= VXLOCK;
1036 
1037 	if (vp->v_lockcount > 0) {
1038 		/*
1039 		 * Ensure that any thread currently waiting on the same lock has
1040 		 * observed that the vnode is about to be exclusively locked
1041 		 * before continuing.
1042 		 */
1043 		msleep_nsec(&vp->v_lockcount, &vnode_mtx, PINOD, "vop_lock",
1044 		    INFSLP);
1045 		KASSERT(vp->v_lockcount == 0);
1046 	}
1047 	mtx_leave(&vnode_mtx);
1048 
1049 	/*
1050 	 * Even if the count is zero, the VOP_INACTIVE routine may still
1051 	 * have the object locked while it cleans it out. The VOP_LOCK
1052 	 * ensures that the VOP_INACTIVE routine is done with its work.
1053 	 * For active vnodes, it ensures that no other activity can
1054 	 * occur while the underlying object is being cleaned out.
1055 	 */
1056 	VOP_LOCK(vp, LK_EXCLUSIVE | LK_DRAIN);
1057 
1058 	/*
1059 	 * Clean out any VM data associated with the vnode.
1060 	 */
1061 	uvm_vnp_terminate(vp);
1062 	/*
1063 	 * Clean out any buffers associated with the vnode.
1064 	 */
1065 	if (flags & DOCLOSE)
1066 		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, INFSLP);
1067 	/*
1068 	 * If purging an active vnode, it must be closed and
1069 	 * deactivated before being reclaimed. Note that the
1070 	 * VOP_INACTIVE will unlock the vnode
1071 	 */
1072 	if (active) {
1073 		if (flags & DOCLOSE)
1074 			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1075 		VOP_INACTIVE(vp, p);
1076 	} else {
1077 		/*
1078 		 * Any other processes trying to obtain this lock must first
1079 		 * wait for VXLOCK to clear, then call the new lock operation.
1080 		 */
1081 		VOP_UNLOCK(vp);
1082 	}
1083 
1084 	/*
1085 	 * Reclaim the vnode.
1086 	 */
1087 	if (VOP_RECLAIM(vp, p))
1088 		panic("vclean: cannot reclaim");
1089 	if (active) {
1090 		vp->v_usecount--;
1091 		if (vp->v_usecount == 0) {
1092 			s = splbio();
1093 			if (vp->v_holdcnt > 0)
1094 				panic("vclean: not clean");
1095 			vputonfreelist(vp);
1096 			splx(s);
1097 		}
1098 	}
1099 	cache_purge(vp);
1100 
1101 	/*
1102 	 * Done with purge, notify sleepers of the grim news.
1103 	 */
1104 	vp->v_op = &dead_vops;
1105 	VN_KNOTE(vp, NOTE_REVOKE);
1106 	vp->v_tag = VT_NON;
1107 #ifdef VFSLCKDEBUG
1108 	vp->v_flag &= ~VLOCKSWORK;
1109 #endif
1110 	mtx_enter(&vnode_mtx);
1111 	vp->v_lflag &= ~VXLOCK;
1112 	if (vp->v_lflag & VXWANT) {
1113 		vp->v_lflag &= ~VXWANT;
1114 		do_wakeup = 1;
1115 	}
1116 	mtx_leave(&vnode_mtx);
1117 	if (do_wakeup)
1118 		wakeup(vp);
1119 }
1120 
1121 /*
1122  * Recycle an unused vnode to the front of the free list.
1123  */
1124 int
1125 vrecycle(struct vnode *vp, struct proc *p)
1126 {
1127 	if (vp->v_usecount == 0) {
1128 		vgonel(vp, p);
1129 		return (1);
1130 	}
1131 	return (0);
1132 }
1133 
1134 /*
1135  * Eliminate all activity associated with a vnode
1136  * in preparation for reuse.
1137  */
1138 void
1139 vgone(struct vnode *vp)
1140 {
1141 	struct proc *p = curproc;
1142 	vgonel(vp, p);
1143 }
1144 
1145 /*
1146  * vgone, with struct proc.
1147  */
1148 void
1149 vgonel(struct vnode *vp, struct proc *p)
1150 {
1151 	struct vnode *vq;
1152 	struct vnode *vx;
1153 	int s;
1154 
1155 	KASSERT(vp->v_uvcount == 0);
1156 
1157 	/*
1158 	 * If a vgone (or vclean) is already in progress,
1159 	 * wait until it is done and return.
1160 	 */
1161 	mtx_enter(&vnode_mtx);
1162 	if (vp->v_lflag & VXLOCK) {
1163 		vp->v_lflag |= VXWANT;
1164 		msleep_nsec(vp, &vnode_mtx, PINOD, "vgone", INFSLP);
1165 		mtx_leave(&vnode_mtx);
1166 		return;
1167 	}
1168 	mtx_leave(&vnode_mtx);
1169 
1170 	/*
1171 	 * Clean out the filesystem specific data.
1172 	 */
1173 	vclean(vp, DOCLOSE, p);
1174 	/*
1175 	 * Delete from old mount point vnode list, if on one.
1176 	 */
1177 	if (vp->v_mount != NULL)
1178 		insmntque(vp, NULL);
1179 	/*
1180 	 * If special device, remove it from special device alias list
1181 	 * if it is on one.
1182 	 */
1183 	if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
1184 	    vp->v_specinfo != NULL) {
1185 		if ((vp->v_flag & VALIASED) == 0 && vp->v_type == VCHR &&
1186 		    (cdevsw[major(vp->v_rdev)].d_flags & D_CLONE) &&
1187 		    (minor(vp->v_rdev) >> CLONE_SHIFT == 0)) {
1188 			free(vp->v_specbitmap, M_VNODE, CLONE_MAPSZ);
1189 		}
1190 		SLIST_REMOVE(vp->v_hashchain, vp, vnode, v_specnext);
1191 		if (vp->v_flag & VALIASED) {
1192 			vx = NULL;
1193 			SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
1194 				if (vq->v_rdev != vp->v_rdev ||
1195 				    vq->v_type != vp->v_type)
1196 					continue;
1197 				if (vx)
1198 					break;
1199 				vx = vq;
1200 			}
1201 			if (vx == NULL)
1202 				panic("missing alias");
1203 			if (vq == NULL)
1204 				vx->v_flag &= ~VALIASED;
1205 			vp->v_flag &= ~VALIASED;
1206 		}
1207 		lf_purgelocks(&vp->v_speclockf);
1208 		free(vp->v_specinfo, M_VNODE, sizeof(struct specinfo));
1209 		vp->v_specinfo = NULL;
1210 	}
1211 	/*
1212 	 * If it is on the freelist and not already at the head,
1213 	 * move it to the head of the list.
1214 	 */
1215 	vp->v_type = VBAD;
1216 
1217 	/*
1218 	 * Move onto the free list, unless we were called from
1219 	 * getnewvnode and we're not on any free list
1220 	 */
1221 	s = splbio();
1222 	if (vp->v_usecount == 0 &&
1223 	    (vp->v_bioflag & VBIOONFREELIST)) {
1224 		if (vp->v_holdcnt > 0)
1225 			panic("vgonel: not clean");
1226 
1227 		if (TAILQ_FIRST(&vnode_free_list) != vp) {
1228 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1229 			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1230 		}
1231 	}
1232 	splx(s);
1233 }
1234 
1235 /*
1236  * Lookup a vnode by device number.
1237  */
1238 int
1239 vfinddev(dev_t dev, enum vtype type, struct vnode **vpp)
1240 {
1241 	struct vnode *vp;
1242 	int rc = 0;
1243 
1244 	SLIST_FOREACH(vp, &speclisth[SPECHASH(dev)], v_specnext) {
1245 		if (dev != vp->v_rdev || type != vp->v_type)
1246 			continue;
1247 		*vpp = vp;
1248 		rc = 1;
1249 		break;
1250 	}
1251 	return (rc);
1252 }
1253 
1254 /*
1255  * Revoke all the vnodes corresponding to the specified minor number
1256  * range (endpoints inclusive) of the specified major.
1257  */
1258 void
1259 vdevgone(int maj, int minl, int minh, enum vtype type)
1260 {
1261 	struct vnode *vp;
1262 	int mn;
1263 
1264 	for (mn = minl; mn <= minh; mn++)
1265 		if (vfinddev(makedev(maj, mn), type, &vp))
1266 			VOP_REVOKE(vp, REVOKEALL);
1267 }
1268 
1269 /*
1270  * Calculate the total number of references to a special device.
1271  */
1272 int
1273 vcount(struct vnode *vp)
1274 {
1275 	struct vnode *vq;
1276 	int count;
1277 
1278 loop:
1279 	if ((vp->v_flag & VALIASED) == 0)
1280 		return (vp->v_usecount);
1281 	count = 0;
1282 	SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
1283 		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1284 			continue;
1285 		/*
1286 		 * Alias, but not in use, so flush it out.
1287 		 */
1288 		if (vq->v_usecount == 0 && vq != vp) {
1289 			vgone(vq);
1290 			goto loop;
1291 		}
1292 		count += vq->v_usecount;
1293 	}
1294 	return (count);
1295 }
1296 
1297 #if defined(DEBUG) || defined(DIAGNOSTIC)
1298 /*
1299  * Print out a description of a vnode.
1300  */
1301 static char *typename[] =
1302    { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
1303 
1304 void
1305 vprint(char *label, struct vnode *vp)
1306 {
1307 	char buf[64];
1308 
1309 	if (label != NULL)
1310 		printf("%s: ", label);
1311 	printf("%p, type %s, use %u, write %u, hold %u,",
1312 		vp, typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1313 		vp->v_holdcnt);
1314 	buf[0] = '\0';
1315 	if (vp->v_flag & VROOT)
1316 		strlcat(buf, "|VROOT", sizeof buf);
1317 	if (vp->v_flag & VTEXT)
1318 		strlcat(buf, "|VTEXT", sizeof buf);
1319 	if (vp->v_flag & VSYSTEM)
1320 		strlcat(buf, "|VSYSTEM", sizeof buf);
1321 	if (vp->v_lflag & VXLOCK)
1322 		strlcat(buf, "|VXLOCK", sizeof buf);
1323 	if (vp->v_lflag & VXWANT)
1324 		strlcat(buf, "|VXWANT", sizeof buf);
1325 	if (vp->v_bioflag & VBIOWAIT)
1326 		strlcat(buf, "|VBIOWAIT", sizeof buf);
1327 	if (vp->v_bioflag & VBIOONFREELIST)
1328 		strlcat(buf, "|VBIOONFREELIST", sizeof buf);
1329 	if (vp->v_bioflag & VBIOONSYNCLIST)
1330 		strlcat(buf, "|VBIOONSYNCLIST", sizeof buf);
1331 	if (vp->v_flag & VALIASED)
1332 		strlcat(buf, "|VALIASED", sizeof buf);
1333 	if (buf[0] != '\0')
1334 		printf(" flags (%s)", &buf[1]);
1335 	if (vp->v_data == NULL) {
1336 		printf("\n");
1337 	} else {
1338 		printf("\n\t");
1339 		VOP_PRINT(vp);
1340 	}
1341 }
1342 #endif /* DEBUG || DIAGNOSTIC */
1343 
1344 #ifdef DEBUG
1345 /*
1346  * List all of the locked vnodes in the system.
1347  * Called when debugging the kernel.
1348  */
1349 void
1350 printlockedvnodes(void)
1351 {
1352 	struct mount *mp;
1353 	struct vnode *vp;
1354 
1355 	printf("Locked vnodes\n");
1356 
1357 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
1358 		if (vfs_busy(mp, VB_READ|VB_NOWAIT))
1359 			continue;
1360 		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
1361 			if (VOP_ISLOCKED(vp))
1362 				vprint(NULL, vp);
1363 		}
1364 		vfs_unbusy(mp);
1365 	}
1366 
1367 }
1368 #endif
1369 
1370 /*
1371  * Top level filesystem related information gathering.
1372  */
1373 int
1374 vfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
1375     size_t newlen, struct proc *p)
1376 {
1377 	struct vfsconf *vfsp, *tmpvfsp;
1378 	int ret;
1379 
1380 	/* all sysctl names at this level are at least name and field */
1381 	if (namelen < 2)
1382 		return (ENOTDIR);		/* overloaded */
1383 
1384 	if (name[0] != VFS_GENERIC) {
1385 		vfsp = vfs_bytypenum(name[0]);
1386 		if (vfsp == NULL || vfsp->vfc_vfsops->vfs_sysctl == NULL)
1387 			return (EOPNOTSUPP);
1388 
1389 		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
1390 		    oldp, oldlenp, newp, newlen, p));
1391 	}
1392 
1393 	switch (name[1]) {
1394 	case VFS_MAXTYPENUM:
1395 		return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
1396 
1397 	case VFS_CONF:
1398 		if (namelen < 3)
1399 			return (ENOTDIR);	/* overloaded */
1400 
1401 		vfsp = vfs_bytypenum(name[2]);
1402 		if (vfsp == NULL)
1403 			return (EOPNOTSUPP);
1404 
1405 		/* Make a copy, clear out kernel pointers */
1406 		tmpvfsp = malloc(sizeof(*tmpvfsp), M_TEMP, M_WAITOK|M_ZERO);
1407 		memcpy(tmpvfsp, vfsp, sizeof(*tmpvfsp));
1408 		tmpvfsp->vfc_vfsops = NULL;
1409 
1410 		ret = sysctl_rdstruct(oldp, oldlenp, newp, tmpvfsp,
1411 		    sizeof(struct vfsconf));
1412 
1413 		free(tmpvfsp, M_TEMP, sizeof(*tmpvfsp));
1414 		return (ret);
1415 	case VFS_BCACHESTAT:	/* buffer cache statistics */
1416 		ret = sysctl_rdstruct(oldp, oldlenp, newp, &bcstats,
1417 		    sizeof(struct bcachestats));
1418 		return (ret);
1419 	}
1420 	return (EOPNOTSUPP);
1421 }
1422 
1423 /*
1424  * Check to see if a filesystem is mounted on a block device.
1425  */
1426 int
1427 vfs_mountedon(struct vnode *vp)
1428 {
1429 	struct vnode *vq;
1430 	int error = 0;
1431 
1432 	if (vp->v_specmountpoint != NULL)
1433 		return (EBUSY);
1434 	if (vp->v_flag & VALIASED) {
1435 		SLIST_FOREACH(vq, vp->v_hashchain, v_specnext) {
1436 			if (vq->v_rdev != vp->v_rdev ||
1437 			    vq->v_type != vp->v_type)
1438 				continue;
1439 			if (vq->v_specmountpoint != NULL) {
1440 				error = EBUSY;
1441 				break;
1442 			}
1443 		}
1444 	}
1445 	return (error);
1446 }
1447 
1448 #ifdef NFSSERVER
1449 /*
1450  * Build hash lists of net addresses and hang them off the mount point.
1451  * Called by vfs_export() to set up the lists of export addresses.
1452  */
1453 int
1454 vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
1455     struct export_args *argp)
1456 {
1457 	struct netcred *np;
1458 	struct radix_node_head *rnh;
1459 	int nplen, i;
1460 	struct radix_node *rn;
1461 	struct sockaddr *saddr, *smask = NULL;
1462 	int error;
1463 
1464 	if (argp->ex_addrlen == 0) {
1465 		if (mp->mnt_flag & MNT_DEFEXPORTED)
1466 			return (EPERM);
1467 		np = &nep->ne_defexported;
1468 		/* fill in the kernel's ucred from userspace's xucred */
1469 		if ((error = crfromxucred(&np->netc_anon, &argp->ex_anon)))
1470 			return (error);
1471 		mp->mnt_flag |= MNT_DEFEXPORTED;
1472 		goto finish;
1473 	}
1474 	if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN ||
1475 	    argp->ex_addrlen < 0 || argp->ex_masklen < 0)
1476 		return (EINVAL);
1477 	nplen = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
1478 	np = (struct netcred *)malloc(nplen, M_NETADDR, M_WAITOK|M_ZERO);
1479 	np->netc_len = nplen;
1480 	saddr = (struct sockaddr *)(np + 1);
1481 	error = copyin(argp->ex_addr, saddr, argp->ex_addrlen);
1482 	if (error)
1483 		goto out;
1484 	if (saddr->sa_len > argp->ex_addrlen)
1485 		saddr->sa_len = argp->ex_addrlen;
1486 	if (argp->ex_masklen) {
1487 		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
1488 		error = copyin(argp->ex_mask, smask, argp->ex_masklen);
1489 		if (error)
1490 			goto out;
1491 		if (smask->sa_len > argp->ex_masklen)
1492 			smask->sa_len = argp->ex_masklen;
1493 	}
1494 	/* fill in the kernel's ucred from userspace's xucred */
1495 	if ((error = crfromxucred(&np->netc_anon, &argp->ex_anon)))
1496 		goto out;
1497 	i = saddr->sa_family;
1498 	switch (i) {
1499 	case AF_INET:
1500 		if ((rnh = nep->ne_rtable_inet) == NULL) {
1501 			if (!rn_inithead((void **)&nep->ne_rtable_inet,
1502 			    offsetof(struct sockaddr_in, sin_addr))) {
1503 				error = ENOBUFS;
1504 				goto out;
1505 			}
1506 			rnh = nep->ne_rtable_inet;
1507 		}
1508 		break;
1509 	default:
1510 		error = EINVAL;
1511 		goto out;
1512 	}
1513 	rn = rn_addroute(saddr, smask, rnh, np->netc_rnodes, 0);
1514 	if (rn == NULL || np != (struct netcred *)rn) { /* already exists */
1515 		error = EPERM;
1516 		goto out;
1517 	}
1518 finish:
1519 	np->netc_exflags = argp->ex_flags;
1520 	return (0);
1521 out:
1522 	free(np, M_NETADDR, np->netc_len);
1523 	return (error);
1524 }
1525 
1526 int
1527 vfs_free_netcred(struct radix_node *rn, void *w, u_int id)
1528 {
1529 	struct radix_node_head *rnh = (struct radix_node_head *)w;
1530 	struct netcred * np = (struct netcred *)rn;
1531 
1532 	rn_delete(rn->rn_key, rn->rn_mask, rnh, NULL);
1533 	free(np, M_NETADDR, np->netc_len);
1534 	return (0);
1535 }
1536 
1537 /*
1538  * Free the net address hash lists that are hanging off the mount points.
1539  */
1540 void
1541 vfs_free_addrlist(struct netexport *nep)
1542 {
1543 	struct radix_node_head *rnh;
1544 
1545 	if ((rnh = nep->ne_rtable_inet) != NULL) {
1546 		rn_walktree(rnh, vfs_free_netcred, rnh);
1547 		free(rnh, M_RTABLE, sizeof(*rnh));
1548 		nep->ne_rtable_inet = NULL;
1549 	}
1550 }
1551 #endif /* NFSSERVER */
1552 
1553 int
1554 vfs_export(struct mount *mp, struct netexport *nep, struct export_args *argp)
1555 {
1556 #ifdef NFSSERVER
1557 	int error;
1558 
1559 	if (argp->ex_flags & MNT_DELEXPORT) {
1560 		vfs_free_addrlist(nep);
1561 		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
1562 	}
1563 	if (argp->ex_flags & MNT_EXPORTED) {
1564 		if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
1565 			return (error);
1566 		mp->mnt_flag |= MNT_EXPORTED;
1567 	}
1568 	return (0);
1569 #else
1570 	return (ENOTSUP);
1571 #endif /* NFSSERVER */
1572 }
1573 
1574 struct netcred *
1575 vfs_export_lookup(struct mount *mp, struct netexport *nep, struct mbuf *nam)
1576 {
1577 #ifdef NFSSERVER
1578 	struct netcred *np;
1579 	struct radix_node_head *rnh;
1580 	struct sockaddr *saddr;
1581 
1582 	np = NULL;
1583 	if (mp->mnt_flag & MNT_EXPORTED) {
1584 		/*
1585 		 * Lookup in the export list first.
1586 		 */
1587 		if (nam != NULL) {
1588 			saddr = mtod(nam, struct sockaddr *);
1589 			switch (saddr->sa_family) {
1590 			case AF_INET:
1591 				rnh = nep->ne_rtable_inet;
1592 				break;
1593 			default:
1594 				rnh = NULL;
1595 				break;
1596 			}
1597 			if (rnh != NULL)
1598 				np = (struct netcred *)rn_match(saddr, rnh);
1599 		}
1600 		/*
1601 		 * If no address match, use the default if it exists.
1602 		 */
1603 		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
1604 			np = &nep->ne_defexported;
1605 	}
1606 	return (np);
1607 #else
1608 	return (NULL);
1609 #endif /* NFSSERVER */
1610 }
1611 
1612 /*
1613  * Do the usual access checking.
1614  * file_mode, uid and gid are from the vnode in question,
1615  * while acc_mode and cred are from the VOP_ACCESS parameter list
1616  */
1617 int
1618 vaccess(enum vtype type, mode_t file_mode, uid_t uid, gid_t gid,
1619     mode_t acc_mode, struct ucred *cred)
1620 {
1621 	mode_t mask;
1622 
1623 	/* User id 0 always gets read/write access. */
1624 	if (cred->cr_uid == 0) {
1625 		/* For VEXEC, at least one of the execute bits must be set. */
1626 		if ((acc_mode & VEXEC) && type != VDIR &&
1627 		    (file_mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)
1628 			return EACCES;
1629 		return 0;
1630 	}
1631 
1632 	mask = 0;
1633 
1634 	/* Otherwise, check the owner. */
1635 	if (cred->cr_uid == uid) {
1636 		if (acc_mode & VEXEC)
1637 			mask |= S_IXUSR;
1638 		if (acc_mode & VREAD)
1639 			mask |= S_IRUSR;
1640 		if (acc_mode & VWRITE)
1641 			mask |= S_IWUSR;
1642 		return (file_mode & mask) == mask ? 0 : EACCES;
1643 	}
1644 
1645 	/* Otherwise, check the groups. */
1646 	if (groupmember(gid, cred)) {
1647 		if (acc_mode & VEXEC)
1648 			mask |= S_IXGRP;
1649 		if (acc_mode & VREAD)
1650 			mask |= S_IRGRP;
1651 		if (acc_mode & VWRITE)
1652 			mask |= S_IWGRP;
1653 		return (file_mode & mask) == mask ? 0 : EACCES;
1654 	}
1655 
1656 	/* Otherwise, check everyone else. */
1657 	if (acc_mode & VEXEC)
1658 		mask |= S_IXOTH;
1659 	if (acc_mode & VREAD)
1660 		mask |= S_IROTH;
1661 	if (acc_mode & VWRITE)
1662 		mask |= S_IWOTH;
1663 	return (file_mode & mask) == mask ? 0 : EACCES;
1664 }
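
/*
 * Illustrative sketch (hypothetical, compiled out): a filesystem's access
 * VOP typically forwards its vnode's cached attributes to vaccess(), e.g.
 * to decide whether cred may write the file.
 */
#ifdef EXAMPLE_SKETCH
static int
example_can_write(struct vnode *vp, struct vattr *va, struct ucred *cred)
{
	return (vaccess(vp->v_type, va->va_mode, va->va_uid, va->va_gid,
	    VWRITE, cred));
}
#endif /* EXAMPLE_SKETCH */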
1665 
1666 int
1667 vnoperm(struct vnode *vp)
1668 {
1669 	if (vp->v_flag & VROOT || vp->v_mount == NULL)
1670 		return 0;
1671 
1672 	return (vp->v_mount->mnt_flag & MNT_NOPERM);
1673 }
1674 
1675 struct rwlock vfs_stall_lock = RWLOCK_INITIALIZER("vfs_stall");
1676 unsigned int vfs_stalling = 0;
1677 
1678 int
1679 vfs_stall(struct proc *p, int stall)
1680 {
1681 	struct mount *mp;
1682 	int allerror = 0, error;
1683 
1684 	if (stall) {
1685 		atomic_inc_int(&vfs_stalling);
1686 		rw_enter_write(&vfs_stall_lock);
1687 	}
1688 
1689 	/*
1690 	 * The loop variable mp is protected by vfs_busy() so that it cannot
1691 	 * be unmounted while VFS_SYNC() sleeps.  Traverse forward to keep the
1692 	 * lock order consistent with dounmount().
1693 	 */
1694 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
1695 		if (stall) {
1696 			error = vfs_busy(mp, VB_WRITE|VB_WAIT|VB_DUPOK);
1697 			if (error) {
1698 				printf("%s: busy\n", mp->mnt_stat.f_mntonname);
1699 				allerror = error;
1700 				continue;
1701 			}
1702 			uvm_vnp_sync(mp);
1703 			error = VFS_SYNC(mp, MNT_WAIT, stall, p->p_ucred, p);
1704 			if (error) {
1705 				printf("%s: failed to sync\n",
1706 				    mp->mnt_stat.f_mntonname);
1707 				vfs_unbusy(mp);
1708 				allerror = error;
1709 				continue;
1710 			}
1711 			mp->mnt_flag |= MNT_STALLED;
1712 		} else {
1713 			if (mp->mnt_flag & MNT_STALLED) {
1714 				vfs_unbusy(mp);
1715 				mp->mnt_flag &= ~MNT_STALLED;
1716 			}
1717 		}
1718 	}
1719 
1720 	if (!stall) {
1721 		rw_exit_write(&vfs_stall_lock);
1722 		atomic_dec_int(&vfs_stalling);
1723 	}
1724 
1725 	return (allerror);
1726 }
1727 
1728 void
1729 vfs_stall_barrier(void)
1730 {
1731 	if (__predict_false(vfs_stalling)) {
1732 		rw_enter_read(&vfs_stall_lock);
1733 		rw_exit_read(&vfs_stall_lock);
1734 	}
1735 }
1736 
1737 /*
1738  * Unmount all file systems.
1739  * We traverse the list in reverse order under the assumption that doing so
1740  * will avoid needing to worry about dependencies.
1741  */
1742 void
1743 vfs_unmountall(void)
1744 {
1745 	struct mount *mp, *nmp;
1746 	int allerror, error, again = 1;
1747 
1748  retry:
1749 	allerror = 0;
1750 	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, nmp) {
1751 		if (vfs_busy(mp, VB_WRITE|VB_NOWAIT))
1752 			continue;
1753 		/* XXX Here is a race, the next pointer is not locked. */
1754 		if ((error = dounmount(mp, MNT_FORCE, curproc)) != 0) {
1755 			printf("unmount of %s failed with error %d\n",
1756 			    mp->mnt_stat.f_mntonname, error);
1757 			allerror = 1;
1758 		}
1759 	}
1760 
1761 	if (allerror) {
1762 		printf("WARNING: some file systems would not unmount\n");
1763 		if (again) {
1764 			printf("retrying\n");
1765 			again = 0;
1766 			goto retry;
1767 		}
1768 	}
1769 }
1770 
1771 /*
1772  * Sync and unmount file systems before shutting down.
1773  */
1774 void
1775 vfs_shutdown(struct proc *p)
1776 {
1777 #ifdef ACCOUNTING
1778 	acct_shutdown();
1779 #endif
1780 
1781 	printf("syncing disks...");
1782 
1783 	if (panicstr == NULL) {
1784 		/* Sync before unmount, in case we hang on something. */
1785 		sys_sync(p, NULL, NULL);
1786 		vfs_unmountall();
1787 	}
1788 
1789 #if NSOFTRAID > 0
1790 	sr_quiesce();
1791 #endif
1792 
1793 	if (vfs_syncwait(p, 1))
1794 		printf(" giving up\n");
1795 	else
1796 		printf(" done\n");
1797 }
1798 
1799 /*
1800  * perform sync() operation and wait for buffers to flush.
1801  */
1802 int
1803 vfs_syncwait(struct proc *p, int verbose)
1804 {
1805 	struct buf *bp;
1806 	int iter, nbusy, dcount, s;
1807 #ifdef MULTIPROCESSOR
1808 	int hold_count;
1809 #endif
1810 
1811 	sys_sync(p, NULL, NULL);
1812 
1813 	/* Wait for sync to finish. */
1814 	dcount = 10000;
1815 	for (iter = 0; iter < 20; iter++) {
1816 		nbusy = 0;
1817 		LIST_FOREACH(bp, &bufhead, b_list) {
1818 			if ((bp->b_flags & (B_BUSY|B_INVAL|B_READ)) == B_BUSY)
1819 				nbusy++;
1820 			/*
1821 			 * With soft updates, some buffers that are
1822 			 * written will be remarked as dirty until other
1823 			 * buffers are written.
1824 			 *
1825 			 * XXX here be dragons. this should really go away
1826 			 * but should be carefully made to go away on its
1827 			 * own with testing.. XXX
1828 			 */
1829 			if (bp->b_flags & B_DELWRI) {
1830 				s = splbio();
1831 				bremfree(bp);
1832 				buf_acquire(bp);
1833 				splx(s);
1834 				nbusy++;
1835 				bawrite(bp);
1836 				if (dcount-- <= 0) {
1837 					if (verbose)
1838 						printf("softdep ");
1839 					return 1;
1840 				}
1841 			}
1842 		}
1843 		if (nbusy == 0)
1844 			break;
1845 		if (verbose)
1846 			printf("%d ", nbusy);
1847 #ifdef MULTIPROCESSOR
1848 		if (_kernel_lock_held())
1849 			hold_count = __mp_release_all(&kernel_lock);
1850 		else
1851 			hold_count = 0;
1852 #endif
1853 		DELAY(40000 * iter);
1854 #ifdef MULTIPROCESSOR
1855 		if (hold_count)
1856 			__mp_acquire_count(&kernel_lock, hold_count);
1857 #endif
1858 	}
1859 
1860 	return nbusy;
1861 }
1862 
1863 /*
1864  * posix file system related system variables.
1865  */
1866 int
1867 fs_posix_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
1868     void *newp, size_t newlen, struct proc *p)
1869 {
1870 	/* all sysctl names at this level are terminal */
1871 	if (namelen != 1)
1872 		return (ENOTDIR);
1873 
1874 	switch (name[0]) {
1875 	case FS_POSIX_SETUID:
1876 		return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen,
1877 		    &suid_clear));
1878 	default:
1879 		return (EOPNOTSUPP);
1880 	}
1881 	/* NOTREACHED */
1882 }
1883 
1884 /*
1885  * file system related system variables.
1886  */
1887 int
1888 fs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
1889     size_t newlen, struct proc *p)
1890 {
1891 	sysctlfn *fn;
1892 
1893 	switch (name[0]) {
1894 	case FS_POSIX:
1895 		fn = fs_posix_sysctl;
1896 		break;
1897 	default:
1898 		return (EOPNOTSUPP);
1899 	}
1900 	return (*fn)(name + 1, namelen - 1, oldp, oldlenp, newp, newlen, p);
1901 }
1902 
1903 
1904 /*
1905  * Routines dealing with vnodes and buffers
1906  */
1907 
1908 /*
1909  * Wait for all outstanding I/Os to complete
1910  *
1911  * Manipulates v_numoutput. Must be called at splbio()
1912  */
1913 int
1914 vwaitforio(struct vnode *vp, int slpflag, char *wmesg, uint64_t timeo)
1915 {
1916 	int error = 0;
1917 
1918 	splassert(IPL_BIO);
1919 
1920 	while (vp->v_numoutput) {
1921 		vp->v_bioflag |= VBIOWAIT;
1922 		error = tsleep_nsec(&vp->v_numoutput,
1923 		    slpflag | (PRIBIO + 1), wmesg, timeo);
1924 		if (error)
1925 			break;
1926 	}
1927 
1928 	return (error);
1929 }
1930 
1931 /*
1932  * Update outstanding I/O count and do wakeup if requested.
1933  *
1934  * Manipulates v_numoutput. Must be called at splbio()
1935  */
1936 void
1937 vwakeup(struct vnode *vp)
1938 {
1939 	splassert(IPL_BIO);
1940 
1941 	if (vp != NULL) {
1942 		if (vp->v_numoutput-- == 0)
1943 			panic("vwakeup: neg numoutput");
1944 		if ((vp->v_bioflag & VBIOWAIT) && vp->v_numoutput == 0) {
1945 			vp->v_bioflag &= ~VBIOWAIT;
1946 			wakeup(&vp->v_numoutput);
1947 		}
1948 	}
1949 }
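
/*
 * Illustrative sketch (hypothetical, compiled out): both routines above
 * must be called at splbio(), so callers raise the IPL around them, the
 * same way vinvalbuf() below drains pending writes before invalidating
 * buffers.
 */
#ifdef EXAMPLE_SKETCH
static void
example_drain_io(struct vnode *vp)
{
	int s;

	s = splbio();
	vwaitforio(vp, 0, "drainio", INFSLP);
	splx(s);
}
#endif /* EXAMPLE_SKETCH */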
1950 
1951 /*
1952  * Flush out and invalidate all buffers associated with a vnode.
1953  * Called with the underlying object locked.
1954  */
1955 int
1956 vinvalbuf(struct vnode *vp, int flags, struct ucred *cred, struct proc *p,
1957     int slpflag, uint64_t slptimeo)
1958 {
1959 	struct buf *bp;
1960 	struct buf *nbp, *blist;
1961 	int s, error;
1962 
1963 #ifdef VFSLCKDEBUG
1964 	if ((vp->v_flag & VLOCKSWORK) && !VOP_ISLOCKED(vp))
1965 		panic("%s: vp isn't locked, vp %p", __func__, vp);
1966 #endif
1967 
1968 	if (flags & V_SAVE) {
1969 		s = splbio();
1970 		vwaitforio(vp, 0, "vinvalbuf", INFSLP);
1971 		if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
1972 			splx(s);
1973 			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
1974 				return (error);
1975 			s = splbio();
1976 			if (vp->v_numoutput > 0 ||
1977 			    !LIST_EMPTY(&vp->v_dirtyblkhd))
1978 				panic("%s: dirty bufs, vp %p", __func__, vp);
1979 		}
1980 		splx(s);
1981 	}
1982 loop:
1983 	s = splbio();
1984 	for (;;) {
1985 		int count = 0;
1986 		if ((blist = LIST_FIRST(&vp->v_cleanblkhd)) &&
1987 		    (flags & V_SAVEMETA))
1988 			while (blist && blist->b_lblkno < 0)
1989 				blist = LIST_NEXT(blist, b_vnbufs);
1990 		if (blist == NULL &&
1991 		    (blist = LIST_FIRST(&vp->v_dirtyblkhd)) &&
1992 		    (flags & V_SAVEMETA))
1993 			while (blist && blist->b_lblkno < 0)
1994 				blist = LIST_NEXT(blist, b_vnbufs);
1995 		if (!blist)
1996 			break;
1997 
1998 		for (bp = blist; bp; bp = nbp) {
1999 			nbp = LIST_NEXT(bp, b_vnbufs);
2000 			if (flags & V_SAVEMETA && bp->b_lblkno < 0)
2001 				continue;
2002 			if (bp->b_flags & B_BUSY) {
2003 				bp->b_flags |= B_WANTED;
2004 				error = tsleep_nsec(bp, slpflag | (PRIBIO + 1),
2005 				    "vinvalbuf", slptimeo);
2006 				if (error) {
2007 					splx(s);
2008 					return (error);
2009 				}
2010 				break;
2011 			}
2012 			bremfree(bp);
2013 			/*
2014 			 * XXX Since there are no node locks for NFS, I believe
2015 			 * there is a slight chance that a delayed write will
2016 			 * occur while sleeping just above, so check for it.
2017 			 */
2018 			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
2019 				buf_acquire(bp);
2020 				splx(s);
2021 				(void) VOP_BWRITE(bp);
2022 				goto loop;
2023 			}
2024 			buf_acquire_nomap(bp);
2025 			bp->b_flags |= B_INVAL;
2026 			brelse(bp);
2027 			count++;
2028 			/*
2029 			 * XXX Temporary workaround XXX
2030 			 *
2031 			 * If this is a gigantic vnode and we are
2032 			 * trashing a ton of buffers, drop the lock
2033 			 * and yield every so often. The longer term
2034 			 * fix is to add a separate list for these
2035 			 * invalid buffers so we don't have to do the
2036 			 * work to free these here.
2037 			 */
2038 			if (count > 100) {
2039 				splx(s);
2040 				sched_pause(yield);
2041 				goto loop;
2042 			}
2043 		}
2044 	}
2045 	if (!(flags & V_SAVEMETA) &&
2046 	    (!LIST_EMPTY(&vp->v_dirtyblkhd) || !LIST_EMPTY(&vp->v_cleanblkhd)))
2047 		panic("%s: flush failed, vp %p", __func__, vp);
2048 	splx(s);
2049 	return (0);
2050 }
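/*
 * Usage sketch (illustrative only): a reclaim or truncation path that
 * wants dirty data written back first would call, roughly,
 *
 *	error = vinvalbuf(vp, V_SAVE, cred, p, 0, INFSLP);
 *
 * whereas V_SAVEMETA keeps the metadata buffers (negative b_lblkno)
 * attached to the vnode.
 */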
2051 
2052 void
2053 vflushbuf(struct vnode *vp, int sync)
2054 {
2055 	struct buf *bp, *nbp;
2056 	int s;
2057 
2058 loop:
2059 	s = splbio();
2060 	LIST_FOREACH_SAFE(bp, &vp->v_dirtyblkhd, b_vnbufs, nbp) {
2061 		if ((bp->b_flags & B_BUSY))
2062 			continue;
2063 		if ((bp->b_flags & B_DELWRI) == 0)
2064 			panic("vflushbuf: not dirty");
2065 		bremfree(bp);
2066 		buf_acquire(bp);
2067 		splx(s);
2068 		/*
2069 		 * Wait for I/O associated with indirect blocks to complete,
2070 		 * since there is no way to quickly wait for them below.
2071 		 */
2072 		if (bp->b_vp == vp || sync == 0)
2073 			(void) bawrite(bp);
2074 		else
2075 			(void) bwrite(bp);
2076 		goto loop;
2077 	}
2078 	if (sync == 0) {
2079 		splx(s);
2080 		return;
2081 	}
2082 	vwaitforio(vp, 0, "vflushbuf", INFSLP);
2083 	if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
2084 		splx(s);
2085 #ifdef DIAGNOSTIC
2086 		vprint("vflushbuf: dirty", vp);
2087 #endif
2088 		goto loop;
2089 	}
2090 	splx(s);
2091 }
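/*
 * Sketch (illustrative only): with sync == 0 the dirty buffers are only
 * started with bawrite() and the function returns immediately; with a
 * non-zero sync argument it also waits until v_dirtyblkhd drains:
 *
 *	vflushbuf(vp, 1);		(start writes and wait)
 */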
2092 
2093 /*
2094  * Associate a buffer with a vnode.
2095  *
2096  * Manipulates buffer vnode queues. Must be called at splbio().
2097  */
2098 void
2099 bgetvp(struct vnode *vp, struct buf *bp)
2100 {
2101 	splassert(IPL_BIO);
2102 
2103 
2104 	if (bp->b_vp)
2105 		panic("bgetvp: not free");
2106 	vhold(vp);
2107 	bp->b_vp = vp;
2108 	if (vp->v_type == VBLK || vp->v_type == VCHR)
2109 		bp->b_dev = vp->v_rdev;
2110 	else
2111 		bp->b_dev = NODEV;
2112 	/*
2113 	 * Insert onto list for new vnode.
2114 	 */
2115 	bufinsvn(bp, &vp->v_cleanblkhd);
2116 }
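/*
 * Usage sketch (illustrative only): a buffer with no vnode attached is
 * associated at splbio() and later detached with brelvp():
 *
 *	s = splbio();
 *	bgetvp(vp, bp);
 *	splx(s);
 */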
2117 
2118 /*
2119  * Disassociate a buffer from a vnode.
2120  *
2121  * Manipulates vnode buffer queues. Must be called at splbio().
2122  */
2123 void
2124 brelvp(struct buf *bp)
2125 {
2126 	struct vnode *vp;
2127 
2128 	splassert(IPL_BIO);
2129 
2130 	if ((vp = bp->b_vp) == NULL)
2131 		panic("brelvp: NULL");
2132 	/*
2133 	 * Delete from old vnode list, if on one.
2134 	 */
2135 	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
2136 		bufremvn(bp);
2137 	if ((vp->v_bioflag & VBIOONSYNCLIST) &&
2138 	    LIST_EMPTY(&vp->v_dirtyblkhd)) {
2139 		vp->v_bioflag &= ~VBIOONSYNCLIST;
2140 		LIST_REMOVE(vp, v_synclist);
2141 	}
2142 	bp->b_vp = NULL;
2143 
2144 	vdrop(vp);
2145 }
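/*
 * Sketch (illustrative only): brelvp() is the counterpart of bgetvp();
 * it also drops the vnode from the syncer worklist once no dirty
 * buffers remain and releases the hold taken by vhold():
 *
 *	s = splbio();
 *	brelvp(bp);			(undoes an earlier bgetvp())
 *	splx(s);
 */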
2146 
2147 /*
2148  * Replaces the current vnode associated with the buffer, if any,
2149  * with a new vnode.
2150  *
2151  * If an output I/O is pending on the buffer, the old vnode
2152  * I/O count is adjusted.
2153  *
2154  * Ignores vnode buffer queues. Must be called at splbio().
2155  */
2156 void
2157 buf_replacevnode(struct buf *bp, struct vnode *newvp)
2158 {
2159 	struct vnode *oldvp = bp->b_vp;
2160 
2161 	splassert(IPL_BIO);
2162 
2163 	if (oldvp)
2164 		brelvp(bp);
2165 
2166 	if ((bp->b_flags & (B_READ | B_DONE)) == 0) {
2167 		newvp->v_numoutput++;	/* put it on swapdev */
2168 		vwakeup(oldvp);
2169 	}
2170 
2171 	bgetvp(newvp, bp);
2172 	bufremvn(bp);
2173 }
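/*
 * Sketch (illustrative only): when the buffer carries a pending write
 * (neither B_READ nor B_DONE), the accounting moves with it; the new
 * vnode's v_numoutput is raised and vwakeup() on the old vnode lets a
 * sleeper in vwaitforio() proceed:
 *
 *	s = splbio();
 *	buf_replacevnode(bp, newvp);
 *	splx(s);
 */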
2174 
2175 /*
2176  * Used to assign buffers to the appropriate clean or dirty list on
2177  * the vnode and to add newly dirty vnodes to the appropriate
2178  * filesystem syncer list.
2179  *
2180  * Manipulates vnode buffer queues. Must be called at splbio().
2181  */
2182 void
2183 reassignbuf(struct buf *bp)
2184 {
2185 	struct buflists *listheadp;
2186 	int delay;
2187 	struct vnode *vp = bp->b_vp;
2188 
2189 	splassert(IPL_BIO);
2190 
2191 	/*
2192 	 * Delete from old vnode list, if on one.
2193 	 */
2194 	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
2195 		bufremvn(bp);
2196 
2197 	/*
2198 	 * If dirty, put on list of dirty buffers;
2199 	 * otherwise insert onto list of clean buffers.
2200 	 */
2201 	if ((bp->b_flags & B_DELWRI) == 0) {
2202 		listheadp = &vp->v_cleanblkhd;
2203 		if ((vp->v_bioflag & VBIOONSYNCLIST) &&
2204 		    LIST_EMPTY(&vp->v_dirtyblkhd)) {
2205 			vp->v_bioflag &= ~VBIOONSYNCLIST;
2206 			LIST_REMOVE(vp, v_synclist);
2207 		}
2208 	} else {
2209 		listheadp = &vp->v_dirtyblkhd;
2210 		if ((vp->v_bioflag & VBIOONSYNCLIST) == 0) {
2211 			switch (vp->v_type) {
2212 			case VDIR:
2213 				delay = syncdelay / 2;
2214 				break;
2215 			case VBLK:
2216 				if (vp->v_specmountpoint != NULL) {
2217 					delay = syncdelay / 3;
2218 					break;
2219 				}
2220 				/* FALLTHROUGH */
2221 			default:
2222 				delay = syncdelay;
2223 			}
2224 			vn_syncer_add_to_worklist(vp, delay);
2225 		}
2226 	}
2227 	bufinsvn(bp, listheadp);
2228 }
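/*
 * Usage sketch (illustrative only): after a buffer is marked for delayed
 * write, reassigning it moves it to v_dirtyblkhd and queues the vnode to
 * the syncer, roughly:
 *
 *	s = splbio();
 *	bp->b_flags |= B_DELWRI;
 *	reassignbuf(bp);
 *	splx(s);
 */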
2229 
2230 #ifdef DDB
2231 #include <machine/db_machdep.h>
2232 #include <ddb/db_interface.h>
2233 
2234 void
2235 vfs_buf_print(void *b, int full,
2236     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2237 {
2238 	struct buf *bp = b;
2239 
2240 	(*pr)("  vp %p lblkno 0x%llx blkno 0x%llx dev 0x%x\n"
2241 	      "  proc %p error %d flags %lb\n",
2242 	    bp->b_vp, (int64_t)bp->b_lblkno, (int64_t)bp->b_blkno, bp->b_dev,
2243 	    bp->b_proc, bp->b_error, bp->b_flags, B_BITS);
2244 
2245 	(*pr)("  bufsize 0x%lx bcount 0x%lx resid 0x%lx\n"
2246 	      "  data %p saveaddr %p iodone %p\n",
2247 	    bp->b_bufsize, bp->b_bcount, (long)bp->b_resid,
2248 	    bp->b_data, bp->b_saveaddr,
2249 	    bp->b_iodone);
2250 
2251 	(*pr)("  dirty {off 0x%x end 0x%x} valid {off 0x%x end 0x%x}\n",
2252 	    bp->b_dirtyoff, bp->b_dirtyend, bp->b_validoff, bp->b_validend);
2253 
2254 }
2255 
2256 const char *vtypes[] = { VTYPE_NAMES };
2257 const char *vtags[] = { VTAG_NAMES };
2258 
2259 void
2260 vfs_vnode_print(void *v, int full,
2261     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2262 {
2263 	struct vnode *vp = v;
2264 
2265 	(*pr)("tag %s(%d) type %s(%d) mount %p typedata %p\n",
2266 	      (u_int)vp->v_tag >= nitems(vtags)? "<unk>":vtags[vp->v_tag],
2267 	      vp->v_tag,
2268 	      (u_int)vp->v_type >= nitems(vtypes)? "<unk>":vtypes[vp->v_type],
2269 	      vp->v_type, vp->v_mount, vp->v_mountedhere);
2270 
2271 	(*pr)("data %p usecount %d writecount %d holdcnt %d numoutput %d\n",
2272 	      vp->v_data, vp->v_usecount, vp->v_writecount,
2273 	      vp->v_holdcnt, vp->v_numoutput);
2274 
2275 	/* uvm_object_printit(&vp->v_uobj, full, pr); */
2276 
2277 	if (full) {
2278 		struct buf *bp;
2279 
2280 		(*pr)("clean bufs:\n");
2281 		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
2282 			(*pr)(" bp %p\n", bp);
2283 			vfs_buf_print(bp, full, pr);
2284 		}
2285 
2286 		(*pr)("dirty bufs:\n");
2287 		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
2288 			(*pr)(" bp %p\n", bp);
2289 			vfs_buf_print(bp, full, pr);
2290 		}
2291 	}
2292 }
2293 
2294 void
2295 vfs_mount_print(struct mount *mp, int full,
2296     int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2297 {
2298 	struct vfsconf *vfc = mp->mnt_vfc;
2299 	struct vnode *vp;
2300 	int cnt;
2301 
2302 	(*pr)("flags %b\nvnodecovered %p syncer %p data %p\n",
2303 	    mp->mnt_flag, MNT_BITS,
2304 	    mp->mnt_vnodecovered, mp->mnt_syncer, mp->mnt_data);
2305 
2306 	(*pr)("vfsconf: ops %p name \"%s\" num %d ref %u flags 0x%x\n",
2307 	    vfc->vfc_vfsops, vfc->vfc_name, vfc->vfc_typenum,
2308 	    vfc->vfc_refcount, vfc->vfc_flags);
2309 
2310 	(*pr)("statvfs cache: bsize %x iosize %x\n"
2311 	    "blocks %llu free %llu avail %lld\n",
2312 	    mp->mnt_stat.f_bsize, mp->mnt_stat.f_iosize, mp->mnt_stat.f_blocks,
2313 	    mp->mnt_stat.f_bfree, mp->mnt_stat.f_bavail);
2314 
2315 	(*pr)("  files %llu ffiles %llu favail %lld\n", mp->mnt_stat.f_files,
2316 	    mp->mnt_stat.f_ffree, mp->mnt_stat.f_favail);
2317 
2318 	(*pr)("  f_fsidx {0x%x, 0x%x} owner %u ctime 0x%llx\n",
2319 	    mp->mnt_stat.f_fsid.val[0], mp->mnt_stat.f_fsid.val[1],
2320 	    mp->mnt_stat.f_owner, mp->mnt_stat.f_ctime);
2321 
2322 	(*pr)("  syncwrites %llu asyncwrites = %llu\n",
2323 	    mp->mnt_stat.f_syncwrites, mp->mnt_stat.f_asyncwrites);
2324 
2325 	(*pr)("  syncreads %llu asyncreads = %llu\n",
2326 	    mp->mnt_stat.f_syncreads, mp->mnt_stat.f_asyncreads);
2327 
2328 	(*pr)("  fstype \"%s\" mnton \"%s\" mntfrom \"%s\" mntspec \"%s\"\n",
2329 	    mp->mnt_stat.f_fstypename, mp->mnt_stat.f_mntonname,
2330 	    mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntfromspec);
2331 
2332 	(*pr)("locked vnodes:");
2333 	/* XXX would take mountlist lock, except ddb has no context */
2334 	cnt = 0;
2335 	TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2336 		if (VOP_ISLOCKED(vp)) {
2337 			if (cnt == 0)
2338 				(*pr)("\n  %p", vp);
2339 			else if ((cnt % (72 / (sizeof(void *) * 2 + 4))) == 0)
2340 				(*pr)(",\n  %p", vp);
2341 			else
2342 				(*pr)(", %p", vp);
2343 			cnt++;
2344 		}
2345 	}
2346 	(*pr)("\n");
2347 
2348 	if (full) {
2349 		(*pr)("all vnodes:");
2350 		/* XXX would take mountlist lock, except ddb has no context */
2351 		cnt = 0;
2352 		TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2353 			if (cnt == 0)
2354 				(*pr)("\n  %p", vp);
2355 			else if ((cnt % (72 / (sizeof(void *) * 2 + 4))) == 0)
2356 				(*pr)(",\n  %p", vp);
2357 			else
2358 				(*pr)(", %p", vp);
2359 			cnt++;
2360 		}
2361 		(*pr)("\n");
2362 	}
2363 }
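/*
 * These printers are reached from ddb(4); the entry points are believed
 * to be commands along the lines of "show buf <addr>", "show vnode
 * <addr>" and "show mount <addr>" (illustrative only; see ddb(4) for
 * the authoritative command set).
 */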
2364 #endif /* DDB */
2365 
2366 void
2367 copy_statfs_info(struct statfs *sbp, const struct mount *mp)
2368 {
2369 	const struct statfs *mbp;
2370 
2371 	strncpy(sbp->f_fstypename, mp->mnt_vfc->vfc_name, MFSNAMELEN);
2372 
2373 	if (sbp == (mbp = &mp->mnt_stat))
2374 		return;
2375 
2376 	sbp->f_fsid = mbp->f_fsid;
2377 	sbp->f_owner = mbp->f_owner;
2378 	sbp->f_flags = mbp->f_flags;
2379 	sbp->f_syncwrites = mbp->f_syncwrites;
2380 	sbp->f_asyncwrites = mbp->f_asyncwrites;
2381 	sbp->f_syncreads = mbp->f_syncreads;
2382 	sbp->f_asyncreads = mbp->f_asyncreads;
2383 	sbp->f_namemax = mbp->f_namemax;
2384 	memcpy(sbp->f_mntonname, mp->mnt_stat.f_mntonname, MNAMELEN);
2385 	memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname, MNAMELEN);
2386 	memcpy(sbp->f_mntfromspec, mp->mnt_stat.f_mntfromspec, MNAMELEN);
2387 	memcpy(&sbp->mount_info, &mp->mnt_stat.mount_info,
2388 	    sizeof(union mount_info));
2389 }
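/*
 * Usage sketch (illustrative only): an individual filesystem's statfs
 * routine typically fills in its own block and inode counts and then
 * calls this helper for the generic, mount-wide fields, roughly:
 *
 *	sbp->f_blocks = ...;		(filesystem-specific fields first)
 *	copy_statfs_info(sbp, mp);	(then the shared, mount-wide ones)
 */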
2390