/*
 * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/vfs_mount.c,v 1.2 2004/10/19 05:55:34 dillon Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/eventhandler.h>
#include <sys/kthread.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <sys/buf2.h>
#include <sys/thread2.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

static int vnlru_nowhere = 0;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
	    &vnlru_nowhere, 0,
	    "Number of times the vnlru process ran without success");


static struct lwkt_token mntid_token;

struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
struct lwkt_token mountlist_token;
struct lwkt_token mntvnode_token;


/*
 * Called from vfsinit()
 */
void
vfs_mount_init(void)
{
	lwkt_token_init(&mountlist_token);
	lwkt_token_init(&mntvnode_token);
	lwkt_token_init(&mntid_token);
}

/*
 * Allocate a new vnode and associate it with a tag, mount point, and
 * operations vector.
 *
 * A VX locked and refd vnode is returned.  The caller should set up the
 * remaining fields and vx_put() or, if it wishes to leave a vref,
 * vx_unlock() the vnode.
 */
int
getnewvnode(enum vtagtype tag, struct mount *mp, struct vop_ops *ops,
		struct vnode **vpp, int lktimeout, int lkflags)
{
	struct vnode *vp;

	vp = allocvnode(lktimeout, lkflags);
	vp->v_tag = tag;
	vp->v_ops = ops;
	vp->v_data = NULL;

	/*
	 * Placing the vnode on the mount point's queue makes it visible.
	 * VNON prevents it from being messed with, however.
	 */
	insmntque(vp, mp);
	vfs_object_create(vp, curthread);

	/*
	 * A VX locked & refd vnode is returned.
	 */
	*vpp = vp;
	return (0);
}
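
/*
 * Example (illustrative sketch only, with hypothetical "myfs_*" names): a
 * filesystem typically wraps getnewvnode() like this, filling in the
 * remaining fields and then dropping the VX lock while keeping the ref:
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	error = getnewvnode(VT_UFS, mp, myfs_vnode_vops, &vp, 0, 0);
 *	if (error)
 *		return (error);
 *	vp->v_data = myfs_alloc_inode();
 *	vp->v_type = VREG;		(makes the vnode "ready"; see above)
 *	vx_unlock(vp);			(return it referenced but unlocked)
 *	*vpp = vp;
 */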

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(struct mount *mp, int flags,
	lwkt_tokref_t interlkp, struct thread *td)
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 *
		 * note: interlkp is a serializer and thus can be safely
		 * held through any sleep
		 */
		tsleep((caddr_t)mp, 0, "vfs_busy", 0);
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp, struct thread *td)
{
	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
}
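
/*
 * Usage sketch: callers bracket work on a mount point with vfs_busy() /
 * vfs_unbusy(), commonly while walking the mountlist under
 * mountlist_token.  This is the same pattern vnlru_proc() uses below:
 *
 *	lwkt_gettoken(&ilock, &mountlist_token);
 *	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 *		if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) {
 *			nmp = TAILQ_NEXT(mp, mnt_list);
 *			continue;
 *		}
 *		(... operate on mp, possibly blocking ...)
 *		lwkt_gettokref(&ilock);
 *		nmp = TAILQ_NEXT(mp, mnt_list);
 *		vfs_unbusy(mp, td);
 *	}
 *	lwkt_reltoken(&ilock);
 */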

/*
 * Look up a filesystem type and, if found, allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp)
{
	struct thread *td = curthread;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	}
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, 0, "vfslock", VLKTIMEOUT, LK_NOPAUSE);
	vfs_busy(mp, LK_NOWAIT, NULL, td);
	TAILQ_INIT(&mp->mnt_nvnodelist);
	TAILQ_INIT(&mp->mnt_reservedvnlist);
	mp->mnt_nvnodelistsize = 0;
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}
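
/*
 * Example (sketch): the root filesystem mount code typically does
 * something like the following; "ufs" and "root_device" are only
 * illustrative and normally come from the boot configuration:
 *
 *	struct mount *mp;
 *	int error;
 *
 *	error = vfs_rootmountalloc("ufs", "root_device", &mp);
 *	if (error == 0) {
 *		(... call the filesystem's VFS_MOUNT op to do the mount ...)
 *		vfs_unbusy(mp, curthread);
 *	}
 */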

/*
 * Look up a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mountlist_token);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			break;
		}
	}
	lwkt_reltoken(&ilock);
	return (mp);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static u_int16_t mntid_base;
	lwkt_tokref ilock;
	fsid_t tfsid;
	int mtype;

	lwkt_gettoken(&ilock, &mntid_token);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makeudev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	lwkt_reltoken(&ilock);
}
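
/*
 * Worked example of the packing above (assuming the conventional
 * makeudev() encoding in which the major number occupies bits 8-15 of
 * the returned udev): with vfc_typenum = 5 and mntid_base = 0x1234 the
 * minor argument becomes
 *
 *	(5 << 24) | ((0x1234 & 0xFF00) << 8) | (0x1234 & 0xFF)
 *	    = 0x05000000 | 0x00120000 | 0x00000034 = 0x05120034
 *
 * so the low 16 bits of val[0] depend only on the major (255) and the
 * low byte of mntid_base, which is why uniqueness mod 2^16 only lasts
 * for the first 2^8 mounts while full val[0] uniqueness lasts 2^16.
 */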

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */

/*
 * Return 0 if the vnode is not already on the free list; return 1 if the
 * vnode, with some additional work, could possibly be placed on the free
 * list.
 */
static __inline int
vmightfree(struct vnode *vp, int use_count, int page_count)
{
	if (vp->v_flag & VFREE)
		return (0);
	if (vp->v_usecount != use_count || vp->v_holdcnt)
		return (0);
	if (vp->v_object && vp->v_object->resident_page_count >= page_count)
		return (0);
	return (1);
}


static int
vlrureclaim(struct mount *mp)
{
	struct vnode *vp;
	lwkt_tokref ilock;
	int done;
	int trigger;
	int usevnodes;
	int count;

	/*
	 * Calculate the trigger point; don't allow user
	 * screwups to blow us up.  This prevents us from
	 * recycling vnodes with lots of resident pages.  We
	 * aren't trying to free memory, we are trying to
	 * free vnodes.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = vmstats.v_page_count * 2 / usevnodes;
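	/*
	 * For example (illustrative numbers): with vmstats.v_page_count =
	 * 262144 (1GB of 4K pages) and desiredvnodes = 65536 this gives
	 * trigger = 262144 * 2 / 65536 = 8, so vmightfree() below skips
	 * any vnode caching 8 or more resident pages.
	 */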

	done = 0;
	lwkt_gettoken(&ilock, &mntvnode_token);
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
		/*
		 * __VNODESCAN__
		 *
		 * The VP will stick around while we hold mntvnode_token,
		 * at least until we block, so we can safely do an initial
		 * check, and then must check again after we lock the vnode.
		 */
		if (vp->v_type == VNON ||	/* XXX */
		    vp->v_type == VBAD ||	/* XXX */
		    !vmightfree(vp, 0, trigger)	/* critical path opt */
		) {
			TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
			TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
			--count;
			continue;
		}

		/*
		 * VX get the candidate vnode.  If the VX get fails the
		 * vnode might still be on the mountlist.  Our loop depends
		 * on us at least cycling the vnode to the end of the
		 * mountlist.
		 */
		if (vx_get_nonblock(vp) != 0) {
			if (vp->v_mount == mp) {
				TAILQ_REMOVE(&mp->mnt_nvnodelist,
						vp, v_nmntvnodes);
				TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist,
						vp, v_nmntvnodes);
			}
			--count;
			continue;
		}

		/*
		 * Since we blocked locking the vp, make sure it is still
		 * a candidate for reclamation.  That is, it has not already
		 * been reclaimed and only has our VX reference associated
		 * with it.
		 */
		if (vp->v_type == VNON ||	/* XXX */
		    vp->v_type == VBAD ||	/* XXX */
		    (vp->v_flag & VRECLAIMED) ||
		    vp->v_mount != mp ||
		    !vmightfree(vp, 1, trigger)	/* critical path opt */
		) {
			if (vp->v_mount == mp) {
				TAILQ_REMOVE(&mp->mnt_nvnodelist,
						vp, v_nmntvnodes);
				TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist,
						vp, v_nmntvnodes);
			}
			--count;
			vx_put(vp);
			continue;
		}

		/*
		 * All right, we are good, move the vp to the end of the
		 * mountlist and clean it out.  The vget will have returned
		 * an error if the vnode was destroyed (VRECLAIMED set), so we
		 * do not have to check again.  The vput() will move the
		 * vnode to the free list if the vgone() was successful.
		 */
		KKASSERT(vp->v_mount == mp);
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		vgone(vp);
		vx_put(vp);
		++done;
		--count;
	}
	lwkt_reltoken(&ilock);
	return (done);
}

/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of file system code has some
 * interesting deadlock problems.
 */
static struct thread *vnlruthread;
static int vnlruproc_sig;

void
vnlru_proc_wait(void)
{
	if (vnlruproc_sig == 0) {
		vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
		wakeup(vnlruthread);
	}
	tsleep(&vnlruproc_sig, 0, "vlruwk", hz);
}

static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	lwkt_tokref ilock;
	int s;
	int done;
	struct thread *td = curthread;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, td,
	    SHUTDOWN_PRI_FIRST);

	s = splbio();
	for (;;) {
		kproc_suspend_loop();
		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			tsleep(td, 0, "vlruwt", hz);
			continue;
		}
		done = 0;
		cache_cleanneg(0);
		lwkt_gettoken(&ilock, &mountlist_token);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, LK_NOWAIT, &ilock, td)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			done += vlrureclaim(mp);
			lwkt_gettokref(&ilock);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, td);
		}
		lwkt_reltoken(&ilock);
		if (done == 0) {
			++vnlru_nowhere;
			tsleep(td, 0, "vlrup", hz * 3);
			if (vnlru_nowhere % 10 == 0)
				printf("vnlru_proc: vnode recycler stopped working!\n");
		} else {
			vnlru_nowhere = 0;
		}
	}
	splx(s);
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruthread
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(struct vnode *vp, struct mount *mp)
{
	lwkt_tokref ilock;

	lwkt_gettoken(&ilock, &mntvnode_token);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL) {
		KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
			("bad mount point vnode list size"));
		TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
		vp->v_mount->mnt_nvnodelistsize--;
	}
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		lwkt_reltoken(&ilock);
		return;
	}
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize++;
	lwkt_reltoken(&ilock);
}


/*
 * Scan the vnodes under a mount point.  The first function is called
 * with just the mntvnode token held (no vnode lock).  The second
 * function is called with the vnode VX locked.
 */
int
vmntvnodescan(
    struct mount *mp,
    int flags,
    int (*fastfunc)(struct mount *mp, struct vnode *vp, void *data),
    int (*slowfunc)(struct mount *mp, struct vnode *vp, void *data),
    void *data
) {
	lwkt_tokref ilock;
	struct vnode *pvp;
	struct vnode *vp;
	int r = 0;

	/*
	 * Scan the vnodes on the mount's vnode list.  Use a placemarker
	 * to hold our place in the list across blocking operations.
	 */
	pvp = allocvnode_placemarker();

	lwkt_gettoken(&ilock, &mntvnode_token);
	TAILQ_INSERT_HEAD(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);

	while ((vp = TAILQ_NEXT(pvp, v_nmntvnodes)) != NULL) {
		/*
		 * Move the placemarker and skip other placemarkers we
		 * encounter.  Then nothing can get in our way, so the
		 * mount point on the vp must be valid.
		 */
		TAILQ_REMOVE(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);
		TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, pvp, v_nmntvnodes);
		if (vp->v_flag & VPLACEMARKER)	/* another proc's placemarker */
			continue;
		if (vp->v_type == VNON)		/* visible but not ready */
			continue;
		KKASSERT(vp->v_mount == mp);

		/*
		 * Quick test.  A negative return continues the loop without
		 * calling the slow test.  0 continues onto the slow test.
		 * A positive number aborts the loop.
		 */
		if (fastfunc) {
			if ((r = fastfunc(mp, vp, data)) < 0)
				continue;
			if (r)
				break;
		}

		/*
		 * Get a vxlock on the vnode, retry if it has moved or isn't
		 * in the mountlist where we expect it.
		 */
		if (slowfunc) {
			int error;

			switch(flags) {
			case VMSC_GETVP:
				error = vget(vp, LK_EXCLUSIVE, curthread);
				break;
			case VMSC_GETVP|VMSC_NOWAIT:
				error = vget(vp, LK_EXCLUSIVE|LK_NOWAIT,
						curthread);
				break;
			case VMSC_GETVX:
				error = vx_get(vp);
				break;
			case VMSC_REFVP:
				vref(vp);
				/* fall through */
			default:
				error = 0;
				break;
			}
			if (error)
				continue;
			if (TAILQ_PREV(pvp, vnodelst, v_nmntvnodes) != vp)
				goto skip;
			if (vp->v_type == VNON)
				goto skip;
			r = slowfunc(mp, vp, data);
skip:
			switch(flags) {
			case VMSC_GETVP:
			case VMSC_GETVP|VMSC_NOWAIT:
				vput(vp);
				break;
			case VMSC_GETVX:
				vx_put(vp);
				break;
			case VMSC_REFVP:
				vrele(vp);
				/* fall through */
			default:
				break;
			}
			if (r != 0)
				break;
		}
	}
	TAILQ_REMOVE(&mp->mnt_nvnodelist, pvp, v_nmntvnodes);
	freevnode_placemarker(pvp);
	lwkt_reltoken(&ilock);
	return(r);
}
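
/*
 * Example (sketch, with hypothetical "myfs_*" names): a caller usually
 * pairs a cheap fastfunc filter, run with only mntvnode_token held, with
 * a slowfunc that does the real work on a vnode acquired per the flags:
 *
 *	static int
 *	myfs_scan_fast(struct mount *mp, struct vnode *vp, void *data)
 *	{
 *		if (... vp obviously has nothing to do ...)
 *			return(-1);	(skip it, slowfunc is not called)
 *		return(0);		(fall through to the slowfunc)
 *	}
 *
 *	static int
 *	myfs_scan_slow(struct mount *mp, struct vnode *vp, void *data)
 *	{
 *		(vp is VX locked here when VMSC_GETVX was passed)
 *		return(0);		(non-zero aborts the scan)
 *	}
 *
 *	vmntvnodescan(mp, VMSC_GETVX, myfs_scan_fast, myfs_scan_slow, NULL);
 *
 * vflush() below uses exactly this mechanism with a NULL fastfunc.
 */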

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones;
 * return error if any are found (nb: this is a user error, not a
 * system error). If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
 *
 * `rootrefs' specifies the base reference count for the root vnode
 * of this filesystem. The root vnode is considered busy if its
 * v_usecount exceeds this value. On a successful return, vflush()
 * will call vrele() on the root vnode exactly rootrefs times.
 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 * be zero.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

static int vflush_scan(struct mount *mp, struct vnode *vp, void *data);

struct vflush_info {
	int flags;
	int busy;
	thread_t td;
};

int
vflush(struct mount *mp, int rootrefs, int flags)
{
	struct thread *td = curthread;	/* XXX */
	struct vnode *rootvp = NULL;
	int error;
	struct vflush_info vflush_info;

	if (rootrefs > 0) {
		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
		    ("vflush: bad args"));
		/*
		 * Get the filesystem root vnode. We can vput() it
		 * immediately, since with rootrefs > 0, it won't go away.
		 */
		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
			return (error);
		vput(rootvp);
	}

	vflush_info.busy = 0;
	vflush_info.flags = flags;
	vflush_info.td = td;
	vmntvnodescan(mp, VMSC_GETVX, NULL, vflush_scan, &vflush_info);

	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
		/*
		 * If just the root vnode is busy, and if its refcount
		 * is equal to `rootrefs', then go ahead and kill it.
		 */
		KASSERT(vflush_info.busy > 0, ("vflush: not busy"));
		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
		if (vflush_info.busy == 1 && rootvp->v_usecount == rootrefs) {
			if (vx_lock(rootvp) == 0) {
				vgone(rootvp);
				vx_unlock(rootvp);
				vflush_info.busy = 0;
			}
		}
	}
	if (vflush_info.busy)
		return (EBUSY);
	for (; rootrefs > 0; rootrefs--)
		vrele(rootvp);
	return (0);
}
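
/*
 * Example (sketch): a filesystem's unmount code typically calls vflush()
 * while holding some number of references on its root vnode, e.g.
 *
 *	flags = 0;
 *	if (mntflags & MNT_FORCE)
 *		flags |= FORCECLOSE;
 *	error = vflush(mp, 1, flags);	(1 == our ref on the root vnode)
 *	if (error)
 *		return (error);
 *
 * The exact rootrefs value depends on how many references the particular
 * filesystem keeps on its root vnode; 1 here is only an illustration.
 */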

/*
 * The scan callback is made with a VX locked vnode.
 */
static int
vflush_scan(struct mount *mp, struct vnode *vp, void *data)
{
	struct vflush_info *info = data;
	struct vattr vattr;

	/*
	 * Skip over vnodes marked VSYSTEM.
	 */
	if ((info->flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
		return(0);
	}

	/*
	 * If WRITECLOSE is set, flush out unlinked but still open
	 * files (even if open only for reading) and regular file
	 * vnodes open for writing.
	 */
	if ((info->flags & WRITECLOSE) &&
	    (vp->v_type == VNON ||
	    (VOP_GETATTR(vp, &vattr, info->td) == 0 &&
	    vattr.va_nlink > 0)) &&
	    (vp->v_writecount == 0 || vp->v_type != VREG)) {
		return(0);
	}

	/*
	 * Our VX reference accounts for one v_usecount.  If that is the
	 * only reference, all we need to do is clear out the vnode data
	 * structures and we are done.
	 */
	if (vp->v_usecount == 1) {
		vgone(vp);
		return(0);
	}

	/*
	 * If FORCECLOSE is set, forcibly close the vnode. For block
	 * or character devices, revert to an anonymous device. For
	 * all other files, just kill them.
	 */
	if (info->flags & FORCECLOSE) {
		if (vp->v_type != VBLK && vp->v_type != VCHR) {
			vgone(vp);
		} else {
			vclean(vp, 0, info->td);
			vp->v_ops = spec_vnode_vops;
			insmntque(vp, NULL);
		}
		return(0);
	}
#ifdef DIAGNOSTIC
	if (busyprt)
		vprint("vflush: busy vnode", vp);
#endif
	++info->busy;
	return(0);
}