xref: /dragonfly/sys/kern/vfs_lock.c (revision 59b0b316)
1 /*
2  * Copyright (c) 2004,2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * External lock/ref-related vnode functions
37  *
38  * vs_state transition locking requirements:
39  *
40  *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vfs_spin
41  *	DYING    -> CACHED		vx_lock(excl)
42  *	ACTIVE   -> INACTIVE		(none)       + v_spin + vfs_spin
43  *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vfs_spin
44  *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vfs_spin
45  *
46  * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vfs_spin.
47  *
48  *	 Switching into ACTIVE also requires a vref and vnode lock; however,
49  *	 the vnode lock is allowed to be SHARED.
50  *
51  *	 Switching into a CACHED or DYING state requires an exclusive vnode
52  *	 lock or vx_lock (which is almost the same thing).
53  */
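/*
 * Illustrative sketch of the CACHED -> ACTIVE requirements above, shown as
 * a minimal locking sequence.  It mirrors what vget() below actually does;
 * see that function for the real code.
 *
 *	vn_lock(vp, LK_SHARED);		(any lock type suffices)
 *	spin_lock(&vp->v_spin);
 *	_vactivate(vp);			(takes vfs_spin internally and moves
 *					 vp to vnode_active_list)
 *	spin_unlock(&vp->v_spin);
 */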
54 
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/kernel.h>
58 #include <sys/malloc.h>
59 #include <sys/mount.h>
60 #include <sys/proc.h>
61 #include <sys/vnode.h>
62 #include <sys/buf.h>
63 #include <sys/sysctl.h>
64 
65 #include <machine/limits.h>
66 
67 #include <vm/vm.h>
68 #include <vm/vm_object.h>
69 
70 #include <sys/buf2.h>
71 #include <sys/thread2.h>
72 
73 #define VACT_MAX	10
74 #define VACT_INC	2
75 
76 static void vnode_terminate(struct vnode *vp);
77 
78 static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
79 
80 /*
81  * The vnode free list holds inactive vnodes.  Reclaimed vnodes are
82  * inserted at the head of the list so they are reused first; all other
83  * inactive vnodes are inserted at the tail.
84  */
85 TAILQ_HEAD(freelst, vnode);
86 static struct freelst	vnode_active_list;
87 static struct freelst	vnode_inactive_list;
88 static struct vnode	vnode_active_rover;
89 static struct spinlock	vfs_spin = SPINLOCK_INITIALIZER(vfs_spin, "vfs_spin");
90 
91 int  activevnodes = 0;
92 SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
93 	&activevnodes, 0, "Number of active vnodes");
94 int  cachedvnodes = 0;
95 SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
96 	&cachedvnodes, 0, "Total number of cached vnodes");
97 int  inactivevnodes = 0;
98 SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
99 	&inactivevnodes, 0, "Number of inactive vnodes");
100 static int batchfreevnodes = 5;
101 SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
102 	&batchfreevnodes, 0, "Number of vnodes to free at once");
103 #ifdef TRACKVNODE
104 static u_long trackvnode;
105 SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
106 		&trackvnode, 0, "");
107 #endif
108 
109 /*
110  * Called from vfsinit()
111  */
112 void
113 vfs_lock_init(void)
114 {
115 	TAILQ_INIT(&vnode_inactive_list);
116 	TAILQ_INIT(&vnode_active_list);
117 	TAILQ_INSERT_TAIL(&vnode_active_list, &vnode_active_rover, v_list);
118 	spin_init(&vfs_spin, "vfslock");
119 	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
120 }
121 
122 /*
123  * Misc functions
124  */
125 static __inline
126 void
127 _vsetflags(struct vnode *vp, int flags)
128 {
129 	atomic_set_int(&vp->v_flag, flags);
130 }
131 
132 static __inline
133 void
134 _vclrflags(struct vnode *vp, int flags)
135 {
136 	atomic_clear_int(&vp->v_flag, flags);
137 }
138 
139 void
140 vsetflags(struct vnode *vp, int flags)
141 {
142 	_vsetflags(vp, flags);
143 }
144 
145 void
146 vclrflags(struct vnode *vp, int flags)
147 {
148 	_vclrflags(vp, flags);
149 }
150 
151 /*
152  * Place the vnode on the active list.
153  *
154  * Caller must hold vp->v_spin
155  */
156 static __inline
157 void
158 _vactivate(struct vnode *vp)
159 {
160 #ifdef TRACKVNODE
161 	if ((u_long)vp == trackvnode)
162 		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
163 #endif
164 	spin_lock(&vfs_spin);
165 
166 	switch(vp->v_state) {
167 	case VS_ACTIVE:
168 		panic("_vactivate: already active");
169 		/* NOT REACHED */
170 		spin_unlock(&vfs_spin);
171 		return;
172 	case VS_INACTIVE:
173 		TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
174 		--inactivevnodes;
175 		break;
176 	case VS_CACHED:
177 	case VS_DYING:
178 		break;
179 	}
180 	TAILQ_INSERT_TAIL(&vnode_active_list, vp, v_list);
181 	vp->v_state = VS_ACTIVE;
182 	++activevnodes;
183 
184 	spin_unlock(&vfs_spin);
185 }
186 
187 /*
188  * Put a vnode on the inactive list.
189  *
190  * Caller must hold v_spin
191  */
192 static __inline
193 void
194 _vinactive(struct vnode *vp)
195 {
196 #ifdef TRACKVNODE
197 	if ((u_long)vp == trackvnode) {
198 		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
199 		print_backtrace(-1);
200 	}
201 #endif
202 	spin_lock(&vfs_spin);
203 
204 	/*
205 	 * Remove from active list if it is sitting on it
206 	 */
207 	switch(vp->v_state) {
208 	case VS_ACTIVE:
209 		TAILQ_REMOVE(&vnode_active_list, vp, v_list);
210 		--activevnodes;
211 		break;
212 	case VS_INACTIVE:
213 		panic("_vinactive: already inactive");
214 		/* NOT REACHED */
215 		spin_unlock(&vfs_spin);
216 		return;
217 	case VS_CACHED:
218 	case VS_DYING:
219 		break;
220 	}
221 
222 	/*
223 	 * Distinguish between basically dead vnodes, vnodes with cached
224 	 * data, and vnodes without cached data.  A rover will shift the
225 	 * vnodes around as their cache status is lost.
226 	 */
227 	if (vp->v_flag & VRECLAIMED) {
228 		TAILQ_INSERT_HEAD(&vnode_inactive_list, vp, v_list);
229 	} else {
230 		TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
231 	}
232 	++inactivevnodes;
233 	vp->v_state = VS_INACTIVE;
234 
235 	spin_unlock(&vfs_spin);
236 }
237 
238 static __inline
239 void
240 _vinactive_tail(struct vnode *vp)
241 {
242 	spin_lock(&vfs_spin);
243 
244 	/*
245 	 * Remove from active list if it is sitting on it
246 	 */
247 	switch(vp->v_state) {
248 	case VS_ACTIVE:
249 		TAILQ_REMOVE(&vnode_active_list, vp, v_list);
250 		--activevnodes;
251 		break;
252 	case VS_INACTIVE:
253 		panic("_vinactive_tail: already inactive");
254 		/* NOT REACHED */
255 		spin_unlock(&vfs_spin);
256 		return;
257 	case VS_CACHED:
258 	case VS_DYING:
259 		break;
260 	}
261 
262 	TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
263 	++inactivevnodes;
264 	vp->v_state = VS_INACTIVE;
265 
266 	spin_unlock(&vfs_spin);
267 }
268 
269 /*
270  * Add a ref to an active vnode.  This function should never be called
271  * with an inactive vnode (use vget() instead), but might be called
272  * with other states.
273  */
274 void
275 vref(struct vnode *vp)
276 {
277 	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
278 		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
279 	atomic_add_int(&vp->v_refcnt, 1);
280 }
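/*
 * Usage sketch (illustrative): a caller that already knows the vnode is
 * active, for example because it holds an existing ref through an open
 * file, may bump and drop the ref directly:
 *
 *	vref(vp);
 *	... use vp under whatever locking the access requires ...
 *	vrele(vp);
 *
 * If the vnode might be inactive or merely cached, vget()/vput() must be
 * used instead so the vnode is properly reactivated.
 */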
281 
282 /*
283  * Count the number of cached vnodes.  This is moderately expensive so
284  * avoid making this call in the critical path, and in particular avoid
285  * updating the global.  Each cpu tracks its own accumulator.
286  * The individual accumulators are not accurate and must be summed
287  * together.
288  */
289 int
290 countcachedvnodes(int gupdate)
291 {
292 	int i;
293 	int n = 0;
294 
295 	for (i = 0; i < ncpus; ++i) {
296 		globaldata_t gd = globaldata_find(i);
297 		n += gd->gd_cachedvnodes;
298 	}
299 	if (gupdate)
300 		cachedvnodes = n;
301 	return n;
302 }
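/*
 * Illustrative example of why summing is required: vget() decrements the
 * accumulator of whatever cpu it happens to run on while vrele() may
 * increment a different cpu's accumulator, so an individual
 * gd_cachedvnodes value can legitimately be negative, e.g.
 *
 *	cpu0: +5000,  cpu1: -4800   =>   countcachedvnodes() returns ~200
 *
 * Only the sum approximates the true count.
 */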
303 
304 /*
305  * Release a ref on an active or inactive vnode.
306  *
307  * Caller has no other requirements.
308  *
309  * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
310  * transition, otherwise we leave the vnode in the active list and
311  * do a lockless transition to 0, which is very important for the
312  * critical path.
313  *
314  * (vrele() is not called when a vnode is being destroyed w/kfree)
315  */
316 void
317 vrele(struct vnode *vp)
318 {
319 	for (;;) {
320 		int count = vp->v_refcnt;
321 		cpu_ccfence();
322 		KKASSERT((count & VREF_MASK) > 0);
323 		KKASSERT(vp->v_state == VS_ACTIVE ||
324 			 vp->v_state == VS_INACTIVE);
325 
326 		/*
327 		 * 2+ case
328 		 */
329 		if ((count & VREF_MASK) > 1) {
330 			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
331 				break;
332 			continue;
333 		}
334 
335 		/*
336 		 * 1->0 transition case must handle possible finalization.
337 		 * When finalizing we transition 1->0x40000000.  Note that
338 		 * cachedvnodes is only adjusted on transitions to ->0.
339 		 *
340 		 * WARNING! VREF_TERMINATE can be cleared at any point
341 		 *	    when the refcnt is non-zero (by vget()) and
342 		 *	    the vnode has not been reclaimed.  Thus
343 		 *	    transitions out of VREF_TERMINATE do not have
344 		 *	    to mess with cachedvnodes.
345 		 */
346 		if (count & VREF_FINALIZE) {
347 			vx_lock(vp);
348 			if (atomic_cmpset_int(&vp->v_refcnt,
349 					      count, VREF_TERMINATE)) {
350 				vnode_terminate(vp);
351 				break;
352 			}
353 			vx_unlock(vp);
354 		} else {
355 			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
356 				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
357 				break;
358 			}
359 		}
360 		/* retry */
361 	}
362 }
363 
364 /*
365  * Add an auxiliary data structure reference to the vnode.  Auxiliary
366  * references do not change the state of the vnode or prevent deactivation
367  * or reclamation of the vnode, but will prevent the vnode from being
368  * destroyed (kfree()'d).
369  *
370  * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
371  *	     already be held by the caller.  vdrop() will clean up the
372  *	     free list state.
373  */
374 void
375 vhold(struct vnode *vp)
376 {
377 	atomic_add_int(&vp->v_auxrefs, 1);
378 }
379 
380 /*
381  * Remove an auxiliary reference from the vnode.
382  */
383 void
384 vdrop(struct vnode *vp)
385 {
386 	atomic_add_int(&vp->v_auxrefs, -1);
387 }
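/*
 * Usage sketch (illustrative): a subsystem that stashes a vnode pointer
 * in an auxiliary structure holds the vnode against kfree() but not
 * against deactivation or reclamation:
 *
 *	vhold(vp);
 *	... record vp in the auxiliary structure ...
 *	vdrop(vp);			(once the pointer is forgotten)
 */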
388 
389 /*
390  * This function is called on the 1->0 transition (which is actually
391  * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
392  * of the vnode.
393  *
394  * Additional vrefs are allowed to race but will not result in a reentrant
395  * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
396  * prevents additional 1->0 transitions.
397  *
398  * ONLY A VGET() CAN REACTIVATE THE VNODE.
399  *
400  * Caller must hold the VX lock.
401  *
402  * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
403  *
404  * NOTE: The vnode may be marked inactive with dirty buffers
405  *	 or dirty pages in its cached VM object still present.
406  *
407  * NOTE: VS_FREE should not be set on entry (the vnode was expected to
408  *	 previously be active).  We lose control of the vnode the instant
409  *	 it is placed on the free list.
410  *
411  *	 The VX lock is required when transitioning to VS_CACHED but is
412  *	 not sufficient for the vshouldfree() interlocked test or when
413  *	 transitioning away from VS_CACHED.  v_spin is also required for
414  *	 those cases.
415  */
416 static
417 void
418 vnode_terminate(struct vnode *vp)
419 {
420 	KKASSERT(vp->v_state == VS_ACTIVE);
421 
422 	if ((vp->v_flag & VINACTIVE) == 0) {
423 		_vsetflags(vp, VINACTIVE);
424 		if (vp->v_mount)
425 			VOP_INACTIVE(vp);
426 		/* might deactivate page */
427 	}
428 	spin_lock(&vp->v_spin);
429 	_vinactive(vp);
430 	spin_unlock(&vp->v_spin);
431 
432 	vx_unlock(vp);
433 }
434 
435 /****************************************************************
436  *			VX LOCKING FUNCTIONS			*
437  ****************************************************************
438  *
439  * These functions lock vnodes for reclamation and deactivation related
440  * activities.  The caller must already be holding some sort of reference
441  * on the vnode.
442  */
443 void
444 vx_lock(struct vnode *vp)
445 {
446 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
447 }
448 
449 void
450 vx_unlock(struct vnode *vp)
451 {
452 	lockmgr(&vp->v_lock, LK_RELEASE);
453 }
454 
455 /****************************************************************
456  *			VNODE ACQUISITION FUNCTIONS		*
457  ****************************************************************
458  *
459  * These functions must be used when accessing a vnode that has no
460  * chance of being destroyed in a SMP race.  That means the caller will
461  * chance of being destroyed in an SMP race.  That means the caller will
462  * or hold some other lock that ensures that the vnode cannot be destroyed.
463  *
464  * These functions are MANDATORY for any code chain accessing a vnode
465  * whose activation state is not known.
466  *
467  * vget() can be called with LK_NOWAIT and will return EBUSY if the
468  * lock cannot be immediately acquired.
469  *
470  * vget()/vput() are used when reactivation is desired.
471  *
472  * vx_get() and vx_put() are used when reactivation is not desired.
473  */
474 int
475 vget(struct vnode *vp, int flags)
476 {
477 	int error;
478 
479 	/*
480 	 * A lock type must be passed
481 	 */
482 	if ((flags & LK_TYPE_MASK) == 0) {
483 		panic("vget() called with no lock specified!");
484 		/* NOT REACHED */
485 	}
486 
487 	/*
488 	 * Reference the structure and then acquire the lock.
489 	 *
490 	 * NOTE: The requested lock might be a shared lock and does
491 	 *	 not protect our access to the refcnt or other fields.
492 	 */
493 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
494 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
495 
496 	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
497 		/*
498 		 * The lock failed, undo and return an error.  This will not
499 		 * normally trigger a termination.
500 		 */
501 		vrele(vp);
502 	} else if (vp->v_flag & VRECLAIMED) {
503 		/*
504 		 * The node is being reclaimed and cannot be reactivated
505 		 * any more, undo and return ENOENT.
506 		 */
507 		vn_unlock(vp);
508 		vrele(vp);
509 		error = ENOENT;
510 	} else if (vp->v_state == VS_ACTIVE) {
511 		/*
512 		 * A VS_ACTIVE vnode coupled with the fact that we have
513 		 * a vnode lock (even if shared) prevents v_state from
514 		 * changing.  Since the vnode is not in a VRECLAIMED state,
515 		 * we can safely clear VINACTIVE.
516 		 *
517 		 * NOTE! Multiple threads may clear VINACTIVE if this is
518  *	 a shared lock.  This race is allowed.
519 		 */
520 		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
521 		vp->v_act += VACT_INC;
522 		if (vp->v_act > VACT_MAX)	/* SMP race ok */
523 			vp->v_act = VACT_MAX;
524 		error = 0;
525 	} else {
526 		/*
527 		 * If the vnode is not VS_ACTIVE it must be reactivated
528 		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
529 		 * is needed to manipulate the vnode's list.
530 		 *
531 		 * Because the lockmgr lock might be shared, we might race
532 		 * another reactivation, which we handle.  In this situation,
533 		 * however, the refcnt prevents other v_state races.
534 		 *
535 		 * As with above, clearing VINACTIVE is allowed to race other
536 		 * clearings of VINACTIVE.
537 		 *
538 		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
539 		 * the refcnt is non-zero and the vnode has not been
540 		 * reclaimed.  This also means that the transitions do
541 		 * not affect cachedvnodes.
542 		 */
543 		_vclrflags(vp, VINACTIVE);
544 		vp->v_act += VACT_INC;
545 		if (vp->v_act > VACT_MAX)	/* SMP race ok */
546 			vp->v_act = VACT_MAX;
547 		spin_lock(&vp->v_spin);
548 
549 		switch(vp->v_state) {
550 		case VS_INACTIVE:
551 			_vactivate(vp);
552 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
553 							VREF_FINALIZE);
554 			spin_unlock(&vp->v_spin);
555 			break;
556 		case VS_CACHED:
557 			_vactivate(vp);
558 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
559 							VREF_FINALIZE);
560 			spin_unlock(&vp->v_spin);
561 			break;
562 		case VS_ACTIVE:
563 			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE);
564 			spin_unlock(&vp->v_spin);
565 			break;
566 		case VS_DYING:
567 			spin_unlock(&vp->v_spin);
568 			panic("Impossible VS_DYING state");
569 			break;
570 		}
571 		error = 0;
572 	}
573 	return(error);
574 }
575 
576 #ifdef DEBUG_VPUT
577 
578 void
579 debug_vput(struct vnode *vp, const char *filename, int line)
580 {
581 	kprintf("vput(%p) %s:%d\n", vp, filename, line);
582 	vn_unlock(vp);
583 	vrele(vp);
584 }
585 
586 #else
587 
588 void
589 vput(struct vnode *vp)
590 {
591 	vn_unlock(vp);
592 	vrele(vp);
593 }
594 
595 #endif
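/*
 * Usage sketch (illustrative): a caller reaching a vnode through an
 * auxiliary reference, such as the namecache, reactivates, uses, and
 * releases it with vget()/vput().  LK_SHARED is shown but any lock type
 * in LK_TYPE_MASK is acceptable:
 *
 *	if (vget(vp, LK_SHARED) == 0) {
 *		... vp is referenced, locked, and VS_ACTIVE ...
 *		vput(vp);		(unlock + vrele)
 *	}
 */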
596 
597 /*
598  * Acquire the vnode lock unguarded.
599  *
600  * The non-blocking version also uses a slightly different mechanic.
601  * This function will explicitly fail not only if it cannot acquire
602  * the lock normally, but also if the caller already holds a lock.
603  *
604  * The adjusted mechanic is used to close a loophole where complex
605  * VOP_RECLAIM code can circle around recursively and allocate the
606  * same vnode it is trying to destroy from the freelist.
607  *
608  * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
609  * cause the incorrect behavior to occur.  If not for that, lockmgr()
610  * would do the right thing.
611  *
612  * XXX The vx_*() locks should use auxrefs, not the main reference counter.
613  */
614 void
615 vx_get(struct vnode *vp)
616 {
617 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
618 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
619 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
620 }
621 
622 int
623 vx_get_nonblock(struct vnode *vp)
624 {
625 	int error;
626 
627 	if (lockcountnb(&vp->v_lock))
628 		return(EBUSY);
629 	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
630 	if (error == 0) {
631 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
632 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
633 	}
634 	return(error);
635 }
636 
637 /*
638  * Release a VX lock that also held a ref on the vnode.  vrele() will handle
639  * any needed state transitions.
640  *
641  * However, filesystems use this function to get rid of unwanted new vnodes
642  * so try to get the vnode on the correct queue in that case.
643  */
644 void
645 vx_put(struct vnode *vp)
646 {
647 	if (vp->v_type == VNON || vp->v_type == VBAD)
648 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
649 	lockmgr(&vp->v_lock, LK_RELEASE);
650 	vrele(vp);
651 }
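/*
 * Usage sketch (illustrative): a filesystem that allocates a new vnode
 * and then decides not to use it disposes of it this way.  Because
 * v_type is still VNON, vx_put() sets VREF_FINALIZE and the vrele()
 * inside it deactivates the vnode:
 *
 *	vp = allocvnode(0, 0);		(returned VX locked and refd)
 *	... setup fails before v_type is assigned ...
 *	vx_put(vp);
 */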
652 
653 /*
654  * Try to reuse a vnode from the free list.  This function is somewhat
655  * advisory in that NULL can be returned as a normal case, even if free
656  * vnodes are present.
657  *
658  * The scan is limited because it can result in excessive CPU use during
659  * periods of extreme vnode use.
660  *
661  * NOTE: The returned vnode is not completely initialized.
662  */
663 static
664 struct vnode *
665 cleanfreevnode(int maxcount)
666 {
667 	struct vnode *vp;
668 	int count;
669 	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
670 
671 	/*
672 	 * Try to deactivate some vnodes cached on the active list.
673 	 */
674 	if (countcachedvnodes(0) < inactivevnodes)
675 		goto skip;
676 
677 	for (count = 0; count < maxcount * 2; count++) {
678 		spin_lock(&vfs_spin);
679 
680 		vp = TAILQ_NEXT(&vnode_active_rover, v_list);
681 		TAILQ_REMOVE(&vnode_active_list, &vnode_active_rover, v_list);
682 		if (vp == NULL) {
683 			TAILQ_INSERT_HEAD(&vnode_active_list,
684 					  &vnode_active_rover, v_list);
685 		} else {
686 			TAILQ_INSERT_AFTER(&vnode_active_list, vp,
687 					   &vnode_active_rover, v_list);
688 		}
689 		if (vp == NULL) {
690 			spin_unlock(&vfs_spin);
691 			continue;
692 		}
693 		if ((vp->v_refcnt & VREF_MASK) != 0) {
694 			spin_unlock(&vfs_spin);
695 			vp->v_act += VACT_INC;
696 			if (vp->v_act > VACT_MAX)	/* SMP race ok */
697 				vp->v_act = VACT_MAX;
698 			continue;
699 		}
700 
701 		/*
702 		 * Decrement by less if the vnode's object has a lot of
703 		 * resident VM pages.  XXX possible SMP races.
704 		 */
705 		if (vp->v_act > 0) {
706 			vm_object_t obj;
707 			if ((obj = vp->v_object) != NULL &&
708 			    obj->resident_page_count >= trigger) {
709 				vp->v_act -= 1;
710 			} else {
711 				vp->v_act -= VACT_INC;
712 			}
713 			if (vp->v_act < 0)
714 				vp->v_act = 0;
715 			spin_unlock(&vfs_spin);
716 			continue;
717 		}
718 
719 		/*
720 		 * Try to deactivate the vnode.
721 		 */
722 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
723 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
724 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
725 
726 		spin_unlock(&vfs_spin);
727 		vrele(vp);
728 	}
729 
730 skip:
731 	/*
732 	 * Loop trying to lock the first vnode on the free list.
733 	 * Cycle if we can't.
734 	 */
735 	for (count = 0; count < maxcount; count++) {
736 		spin_lock(&vfs_spin);
737 
738 		vp = TAILQ_FIRST(&vnode_inactive_list);
739 		if (vp == NULL) {
740 			spin_unlock(&vfs_spin);
741 			break;
742 		}
743 
744 		/*
745 		 * non-blocking vx_get will also ref the vnode on success.
746 		 */
747 		if (vx_get_nonblock(vp)) {
748 			KKASSERT(vp->v_state == VS_INACTIVE);
749 			TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
750 			TAILQ_INSERT_TAIL(&vnode_inactive_list, vp, v_list);
751 			spin_unlock(&vfs_spin);
752 			continue;
753 		}
754 
755 		/*
756 		 * Because we are holding vfs_spin the vnode should currently
757 		 * be inactive and VREF_TERMINATE should still be set.
758 		 *
759 		 * Once vfs_spin is released the vnode's state should remain
760 		 * unmodified due to both the lock and ref on it.
761 		 */
762 		KKASSERT(vp->v_state == VS_INACTIVE);
763 		spin_unlock(&vfs_spin);
764 #ifdef TRACKVNODE
765 		if ((u_long)vp == trackvnode)
766 			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
767 #endif
768 
769 		/*
770 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
771 		 * This includes namecache refs due to a related ncp being
772 		 * locked or having children, a VM object association, or
773 		 * other hold users.
774 		 *
775 		 * Do not reclaim/reuse a vnode if someone else has a real
776 		 * ref on it.  This can occur if a filesystem temporarily
777 		 * releases the vnode lock during VOP_RECLAIM.
778 		 */
779 		if (vp->v_auxrefs ||
780 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
781 failed:
782 			if (vp->v_state == VS_INACTIVE) {
783 				spin_lock(&vfs_spin);
784 				if (vp->v_state == VS_INACTIVE) {
785 					TAILQ_REMOVE(&vnode_inactive_list,
786 						     vp, v_list);
787 					TAILQ_INSERT_TAIL(&vnode_inactive_list,
788 							  vp, v_list);
789 				}
790 				spin_unlock(&vfs_spin);
791 			}
792 			vx_put(vp);
793 			continue;
794 		}
795 
796 		/*
797 		 * VINACTIVE and VREF_TERMINATE are expected to both be set
798 		 * for vnodes pulled from the inactive list, and cannot be
799 		 * changed while we hold the vx lock.
800 		 *
801 		 * Try to reclaim the vnode.
802 		 */
803 		KKASSERT(vp->v_flag & VINACTIVE);
804 		KKASSERT(vp->v_refcnt & VREF_TERMINATE);
805 
806 		if ((vp->v_flag & VRECLAIMED) == 0) {
807 			if (cache_inval_vp_nonblock(vp))
808 				goto failed;
809 			vgone_vxlocked(vp);
810 			/* vnode is still VX locked */
811 		}
812 
813 		/*
814 		 * At this point if there are no other refs or auxrefs on
815 		 * the vnode with the inactive list locked, and we remove
816 		 * the vnode from the inactive list, it should not be
817 		 * possible for anyone else to access the vnode any more.
818 		 *
819 		 * Since the vnode is in a VRECLAIMED state, no new
820 		 * namecache associations could have been made and the
821 		 * vnode should have already been removed from its mountlist.
822 		 *
823 		 * Since we hold a VX lock on the vnode it cannot have been
824 		 * reactivated (moved out of the inactive list).
825 		 */
826 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
827 		spin_lock(&vfs_spin);
828 		if (vp->v_auxrefs ||
829 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
830 			spin_unlock(&vfs_spin);
831 			goto failed;
832 		}
833 		KKASSERT(vp->v_state == VS_INACTIVE);
834 		TAILQ_REMOVE(&vnode_inactive_list, vp, v_list);
835 		--inactivevnodes;
836 		vp->v_state = VS_DYING;
837 		spin_unlock(&vfs_spin);
838 
839 		/*
840 		 * Nothing should have been able to access this vp.  Only
841 		 * our ref should remain now.
842 		 */
843 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
844 		KASSERT(vp->v_refcnt == 1,
845 			("vp %p badrefs %08x", vp, vp->v_refcnt));
846 
847 		/*
848 		 * Return a VX locked vnode suitable for reuse.
849 		 */
850 		return(vp);
851 	}
852 	return(NULL);
853 }
854 
855 /*
856  * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
857  *
858  * All new vnodes set the VAGE flags.  An open() of the vnode will
859  * decrement the (2-bit) flags.  Vnodes which are opened several times
860  * are thus retained in the cache over vnodes which are merely stat()d.
861  *
862  * We always allocate the vnode.  Attempting to recycle existing vnodes
863  * here can lead to numerous deadlocks, particularly with softupdates.
864  */
865 struct vnode *
866 allocvnode(int lktimeout, int lkflags)
867 {
868 	struct vnode *vp;
869 
870 	/*
871 	 * Do not flag for synchronous recycling unless there are enough
872 	 * freeable vnodes to recycle and the number of vnodes has
873 	 * significantly exceeded our target.  We want the normal vnlru
874 	 * process to handle the cleaning (at 9/10's) before we are forced
875 	 * to flag it here at 11/10's for userexit path processing.
876 	 */
877 	if (numvnodes >= maxvnodes * 11 / 10 &&
878 	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
879 		struct thread *td = curthread;
880 		if (td->td_lwp)
881 			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
882 	}
883 
884 	/*
885 	 * lktimeout only applies when LK_TIMELOCK is used, and only
886 	 * the pageout daemon uses it.  The timeout may not be zero
887 	 * or the pageout daemon can deadlock in low-VM situations.
888 	 */
889 	if (lktimeout == 0)
890 		lktimeout = hz / 10;
891 
892 	vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
893 
894 	lwkt_token_init(&vp->v_token, "vnode");
895 	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
896 	TAILQ_INIT(&vp->v_namecache);
897 	RB_INIT(&vp->v_rbclean_tree);
898 	RB_INIT(&vp->v_rbdirty_tree);
899 	RB_INIT(&vp->v_rbhash_tree);
900 	spin_init(&vp->v_spin, "allocvnode");
901 
902 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
903 	atomic_add_int(&numvnodes, 1);
904 	vp->v_refcnt = 1;
905 	vp->v_flag = VAGE0 | VAGE1;
906 	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
907 
908 	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
909 	/* exclusive lock still held */
910 
911 	vp->v_filesize = NOOFFSET;
912 	vp->v_type = VNON;
913 	vp->v_tag = 0;
914 	vp->v_state = VS_CACHED;
915 	_vactivate(vp);
916 
917 	return (vp);
918 }
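/*
 * Illustrative sketch of the normal hand-off (details vary by caller and
 * the field assignments here are hypothetical): the vnode is returned VX
 * locked, referenced, and VS_ACTIVE, so the caller initializes it and
 * then drops the VX lock while keeping the ref:
 *
 *	vp = allocvnode(0, 0);
 *	vp->v_type = VREG;		(caller-specific setup)
 *	vp->v_data = fs_private;	(hypothetical filesystem data)
 *	vx_unlock(vp);			(vp remains referenced)
 */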
919 
920 /*
921  * Called after a process has allocated a vnode via allocvnode()
922  * and we detected that too many vnodes were present.
923  *
924  * This function is called just prior to a return to userland if the
925  * process at some point had to allocate a new vnode during the last
926  * system call and the vnode count was found to be excessive.
927  *
928  * This is a synchronous path that we do not normally want to execute.
929  *
930  * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
931  *
932  * WARNING: Sometimes numvnodes can blow out due to children being
933  *	    present under directory vnodes in the namecache.  For the
934  *	    moment use an if() instead of a while() and note that if
935  *	    we were to use a while() we would still have to break out
936  *	    if freesomevnodes() returned 0.  vnlru will also be trying
937  *	    hard to free vnodes at the same time (with a lower trigger
938  *	    pointer).
939  */
940 void
941 allocvnode_gc(void)
942 {
943 	if (numvnodes >= maxvnodes &&
944 	    countcachedvnodes(0) + inactivevnodes >= maxvnodes * 5 / 10) {
945 		freesomevnodes(batchfreevnodes);
946 	}
947 }
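/*
 * Worked example of the thresholds (illustrative numbers): with
 * maxvnodes = 100000, allocvnode() flags LWP_MP_VNLRU once numvnodes
 * reaches 110000 (11/10) with at least 50000 (5/10) cached+inactive
 * vnodes, and allocvnode_gc() then frees one batch of batchfreevnodes
 * (default 5) vnodes if numvnodes is still >= 100000 and at least 50000
 * vnodes remain cached or inactive.  vnlru itself starts cleaning at
 * 90000 (9/10).
 */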
948 
949 int
950 freesomevnodes(int n)
951 {
952 	struct vnode *vp;
953 	int count = 0;
954 
955 	while (n) {
956 		if ((vp = cleanfreevnode(n)) == NULL)
957 			break;
958 		vx_unlock(vp);
959 		--n;
960 		++count;
961 		kfree(vp, M_VNODE);
962 		atomic_add_int(&numvnodes, -1);
963 	}
964 	return(count);
965 }
966