xref: /dragonfly/sys/kern/vfs_lock.c (revision 81c11cd3)
1 /*
2  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/kern/vfs_lock.c,v 1.30 2008/06/30 03:57:41 dillon Exp $
35  */
36 
37 /*
38  * External virtual filesystem routines
39  */
40 #include "opt_ddb.h"
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/proc.h>
48 #include <sys/vnode.h>
49 #include <sys/buf.h>
50 #include <sys/sysctl.h>
51 
52 #include <machine/limits.h>
53 
54 #include <vm/vm.h>
55 #include <vm/vm_object.h>
56 
57 #include <sys/buf2.h>
58 #include <sys/thread2.h>
59 #include <sys/sysref2.h>
60 #include <sys/mplock2.h>
61 
62 static void vnode_terminate(struct vnode *vp);
63 static boolean_t vnode_ctor(void *obj, void *private, int ocflags);
64 static void vnode_dtor(void *obj, void *private);
65 
66 static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
67 static struct sysref_class vnode_sysref_class = {
68 	.name =		"vnode",
69 	.mtype =	M_VNODE,
70 	.proto =	SYSREF_PROTO_VNODE,
71 	.offset =	offsetof(struct vnode, v_sysref),
72 	.objsize =	sizeof(struct vnode),
73 	.mag_capacity =	256,
74 	.flags =	SRC_MANAGEDINIT,
75 	.ctor =		vnode_ctor,
76 	.dtor =		vnode_dtor,
77 	.ops = {
78 		.terminate = (sysref_terminate_func_t)vnode_terminate,
79 		.lock = (sysref_terminate_func_t)vx_lock,
80 		.unlock = (sysref_terminate_func_t)vx_unlock
81 	}
82 };
83 
84 /*
85  * The vnode free list holds inactive vnodes.  Reclaimed vnodes go at the
86  * head, vnodes with no cached data before mid1, vnodes with only
87  * swap-backed data before mid2, and vnodes with resident pages at the tail.
88  */
89 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
90 static struct vnode	vnode_free_mid1;
91 static struct vnode	vnode_free_mid2;
92 static struct vnode	vnode_free_rover;
93 static struct spinlock	vfs_spin = SPINLOCK_INITIALIZER(vfs_spin);
94 static enum { ROVER_MID1, ROVER_MID2 } rover_state = ROVER_MID2;
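
/*
 * Resulting free list layout, as an informal sketch (see __vfree() and
 * vnode_rover_locked() below):
 *
 *	HEAD  [reclaimed | no cached data]  mid1
 *	      [swap-backed data only]       mid2
 *	      [resident pages]              TAIL
 *
 * Section 1 runs from the head to mid1, section 2 from mid1 to mid2, and
 * section 3 from mid2 to the tail.  The rover roves between mid1 and the
 * tail, shifting vnodes between sections as their cache status changes.
 */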
95 
96 int  freevnodes = 0;
97 SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD,
98 	&freevnodes, 0, "Number of free vnodes");
99 static int wantfreevnodes = 25;
100 SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
101 	&wantfreevnodes, 0, "Desired number of free vnodes");
102 #ifdef TRACKVNODE
103 static ulong trackvnode;
104 SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
105 		&trackvnode, 0, "");
106 #endif
107 
108 /*
109  * Called from vfsinit()
110  */
111 void
112 vfs_lock_init(void)
113 {
114 	TAILQ_INIT(&vnode_free_list);
115 	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_mid1, v_freelist);
116 	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_mid2, v_freelist);
117 	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_rover, v_freelist);
118 	spin_init(&vfs_spin);
119 	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
120 }
121 
122 /*
123  * Misc functions
124  */
125 static __inline
126 void
127 _vsetflags(struct vnode *vp, int flags)
128 {
129 	atomic_set_int(&vp->v_flag, flags);
130 }
131 
132 static __inline
133 void
134 _vclrflags(struct vnode *vp, int flags)
135 {
136 	atomic_clear_int(&vp->v_flag, flags);
137 }
138 
139 void
140 vsetflags(struct vnode *vp, int flags)
141 {
142 	_vsetflags(vp, flags);
143 }
144 
145 void
146 vclrflags(struct vnode *vp, int flags)
147 {
148 	_vclrflags(vp, flags);
149 }
150 
151 /*
152  * Inline helper functions.
153  *
154  * WARNING: vbusy() may only be called while the vnode lock or VX lock
155  *	    is held.  The vnode spinlock need not be held.
156  *
157  * MPSAFE
158  */
159 static __inline
160 void
161 __vbusy_interlocked(struct vnode *vp)
162 {
163 	KKASSERT(vp->v_flag & VFREE);
164 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
165 	freevnodes--;
166 	_vclrflags(vp, VFREE);
167 }
168 
169 static __inline
170 void
171 __vbusy(struct vnode *vp)
172 {
173 #ifdef TRACKVNODE
174 	if ((ulong)vp == trackvnode)
175 		kprintf("__vbusy %p %08x\n", vp, vp->v_flag);
176 #endif
177 	spin_lock(&vfs_spin);
178 	__vbusy_interlocked(vp);
179 	spin_unlock(&vfs_spin);
180 }
181 
182 /*
183  * Put a vnode on the free list.  The caller has cleared VCACHED or owns the
184  * implied sysref related to having removed the vnode from the freelist
185  * (and VCACHED is already clear in that case).
186  *
187  * MPSAFE
188  */
189 static __inline
190 void
191 __vfree(struct vnode *vp)
192 {
193 #ifdef TRACKVNODE
194 	if ((ulong)vp == trackvnode) {
195 		kprintf("__vfree %p %08x\n", vp, vp->v_flag);
196 		print_backtrace(-1);
197 	}
198 #endif
199 	spin_lock(&vfs_spin);
200 	KKASSERT((vp->v_flag & VFREE) == 0);
201 
202 	/*
203 	 * Distinguish between basically dead vnodes, vnodes with cached
204 	 * data, and vnodes without cached data.  A rover will shift the
205 	 * vnodes around as their cache status is lost.
206 	 */
207 	if (vp->v_flag & VRECLAIMED) {
208 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
209 	} else if (vp->v_object && vp->v_object->resident_page_count) {
210 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
211 	} else if (vp->v_object && vp->v_object->swblock_count) {
212 		TAILQ_INSERT_BEFORE(&vnode_free_mid2, vp, v_freelist);
213 	} else {
214 		TAILQ_INSERT_BEFORE(&vnode_free_mid1, vp, v_freelist);
215 	}
216 	freevnodes++;
217 	_vsetflags(vp, VFREE);
218 	spin_unlock(&vfs_spin);
219 }
220 
221 /*
222  * Put a vnode at the tail of the free list.  The caller has cleared VCACHED
223  * or owns the implied sysref related to having removed the vnode from the
224  * freelist (and VCACHED is already clear in that case).
225  *
226  * MPSAFE
227  */
228 static __inline
229 void
230 __vfreetail(struct vnode *vp)
231 {
232 #ifdef TRACKVNODE
233 	if ((ulong)vp == trackvnode)
234 		kprintf("__vfreetail %p %08x\n", vp, vp->v_flag);
235 #endif
236 	spin_lock(&vfs_spin);
237 	KKASSERT((vp->v_flag & VFREE) == 0);
238 	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
239 	freevnodes++;
240 	_vsetflags(vp, VFREE);
241 	spin_unlock(&vfs_spin);
242 }
243 
244 /*
245  * Return a C boolean indicating whether we should put the vnode on the
246  * freelist (VFREE) or leave it / mark it as VCACHED.
247  *
248  * This routine is only valid if the vnode is already either VFREE or
249  * VCACHED, or if it can become VFREE or VCACHED via vnode_terminate().
250  *
251  * WARNING!  This functions is typically called with v_spinlock held.
252  *
253  * MPSAFE
254  */
255 static __inline boolean_t
256 vshouldfree(struct vnode *vp)
257 {
258 	return (vp->v_auxrefs == 0 &&
259 	    (vp->v_object == NULL || vp->v_object->resident_page_count == 0));
260 }
261 
262 /*
263  * Add a ref to an active vnode.  This function should never be called
264  * with an inactive vnode (use vget() instead).
265  *
266  * MPSAFE
267  */
268 void
269 vref(struct vnode *vp)
270 {
271 	KKASSERT(vp->v_sysref.refcnt > 0 &&
272 		 (vp->v_flag & (VFREE|VINACTIVE)) == 0);
273 	sysref_get(&vp->v_sysref);
274 }
275 
276 /*
277  * Release a ref on an active or inactive vnode.  The sysref termination
278  * function will be called when the last active reference is released,
279  * and the vnode is returned to the objcache when the last inactive
280  * reference is released.
281  */
282 void
283 vrele(struct vnode *vp)
284 {
285 	sysref_put(&vp->v_sysref);
286 }
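
/*
 * Example of the active-vnode ref/unref pattern described above.  This is
 * an illustrative sketch only; where vp comes from and what is done while
 * the extra ref is held are hypothetical:
 *
 *	vref(vp);
 *	... pass vp to another structure or thread ...
 *	vrele(vp);
 *
 * vref() asserts that vp is active (not VFREE/VINACTIVE); releasing the
 * last active ref ends up in vnode_terminate() below.
 */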
287 
288 /*
289  * Add an auxiliary data structure reference to the vnode.  Auxiliary
290  * references do not change the state of the vnode or prevent it
291  * from being deactivated, reclaimed, or placed on or removed from
292  * the free list.
293  *
294  * An auxiliary reference DOES prevent the vnode from being destroyed,
295  * allowing you to vx_lock() it, test state, etc.
296  *
297  * An auxiliary reference DOES NOT move a vnode out of the VFREE state
298  * once it has entered it.
299  *
300  * WARNING!  vhold() and vhold_interlocked() must not acquire v_spinlock.
301  *	     The spinlock may or may not already be held by the caller.
302  *	     vdrop() will clean up the free list state.
303  *
304  * MPSAFE
305  */
306 void
307 vhold(struct vnode *vp)
308 {
309 	KKASSERT(vp->v_sysref.refcnt != 0);
310 	atomic_add_int(&vp->v_auxrefs, 1);
311 }
312 
313 void
314 vhold_interlocked(struct vnode *vp)
315 {
316 	atomic_add_int(&vp->v_auxrefs, 1);
317 }
318 
319 /*
320  * Remove an auxiliary reference from the vnode.
321  *
322  * vdrop needs to check for a VCACHED->VFREE transition to catch cases
323  * where a vnode is held past its reclamation.  We use v_spinlock to
324  * interlock VCACHED -> !VCACHED transitions.
325  *
326  * MPSAFE
327  */
328 void
329 vdrop(struct vnode *vp)
330 {
331 	KKASSERT(vp->v_sysref.refcnt != 0 && vp->v_auxrefs > 0);
332 	spin_lock(&vp->v_spinlock);
333 	atomic_subtract_int(&vp->v_auxrefs, 1);
334 	if ((vp->v_flag & VCACHED) && vshouldfree(vp)) {
335 		_vclrflags(vp, VCACHED);
336 		__vfree(vp);
337 	}
338 	spin_unlock(&vp->v_spinlock);
339 }
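
/*
 * Example of the auxiliary-reference pattern described above, as an
 * illustrative sketch.  The caller is assumed to already hold a primary
 * ref (vhold() asserts v_sysref.refcnt != 0):
 *
 *	vhold(vp);
 *	vx_lock(vp);
 *	if ((vp->v_flag & VRECLAIMED) == 0)
 *		... examine or adjust auxiliary state ...
 *	vx_unlock(vp);
 *	vdrop(vp);
 *
 * The aux ref keeps the vnode from being destroyed but does not keep it
 * off the free list; vdrop() handles the VCACHED->VFREE transition.
 */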
340 
341 /*
342  * This function is called when the last active reference on the vnode
343  * is released, typically via vrele().  SYSREF will VX lock the vnode
344  * and then give the vnode a negative ref count, indicating that it is
345  * undergoing termination or is being set aside for the cache, and one
346  * final sysref_put() is required to actually return it to the memory
347  * subsystem.
348  *
349  * Additional inactive sysrefs may race us but that's ok.  Reactivations
350  * cannot race us because the sysref code is interlocked with the VX lock
351  * (which is held on call).
352  *
353  * MPSAFE
354  */
355 void
356 vnode_terminate(struct vnode *vp)
357 {
358 	/*
359 	 * We own the VX lock, so it should not be possible for someone else
360 	 * to have reactivated the vp.
361 	 */
362 	KKASSERT(sysref_isinactive(&vp->v_sysref));
363 
364 	/*
365 	 * Deactivate the vnode by marking it VFREE or VCACHED.
366 	 * The vnode can be reactivated from either state until
367 	 * reclaimed.  These states inherit the 'last' sysref on the
368 	 * vnode.
369 	 *
370 	 * NOTE: There may be additional inactive references from
371 	 * other entities blocking on the VX lock while we hold it,
372 	 * but this does not prevent us from changing the vnode's
373 	 * state.
374 	 *
375 	 * NOTE: The vnode could already be marked inactive.  XXX
376 	 *	 how?
377 	 *
378 	 * NOTE: v_mount may be NULL due to assignment to
379 	 *	 dead_vnode_vops
380 	 *
381 	 * NOTE: The vnode may be marked inactive with dirty buffers
382 	 *	 or dirty pages in its cached VM object still present.
383 	 *
384 	 * NOTE: VCACHED should not be set on entry.  We lose control
385 	 *	 of the sysref the instant the vnode is placed on the
386 	 *	 free list or when VCACHED is set.
387 	 *
388 	 *	 The VX lock is required when transitioning to
389 	 *	 +VCACHED but is not sufficient for the vshouldfree()
390 	 *	 interlocked test or when transitioning to -VCACHED.
391 	 */
392 	if ((vp->v_flag & VINACTIVE) == 0) {
393 		_vsetflags(vp, VINACTIVE);
394 		if (vp->v_mount)
395 			VOP_INACTIVE(vp);
396 	}
397 	spin_lock(&vp->v_spinlock);
398 	KKASSERT((vp->v_flag & (VFREE|VCACHED)) == 0);
399 	if (vshouldfree(vp))
400 		__vfree(vp);
401 	else
402 		_vsetflags(vp, VCACHED); /* inactive but not yet free */
403 	spin_unlock(&vp->v_spinlock);
404 	vx_unlock(vp);
405 }
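
/*
 * In summary, the deactivation path looks like this (informal sketch):
 *
 *	active --last vrele--> VINACTIVE --> VFREE   (no aux refs, no pages)
 *	                                 \-> VCACHED (aux refs or pages remain)
 *
 * Both VFREE and VCACHED vnodes may be reactivated by vget() until the
 * vnode is reclaimed; a reclaimed vnode can only be reused or destroyed.
 */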
406 
407 /*
408  * Physical vnode constructor / destructor.  These are only executed on
409  * the backend of the objcache.  They are NOT executed on every vnode
410  * allocation or deallocation.
411  *
412  * MPSAFE
413  */
414 boolean_t
415 vnode_ctor(void *obj, void *private, int ocflags)
416 {
417 	struct vnode *vp = obj;
418 
419 	lwkt_token_init(&vp->v_token, 1, "vnode");
420 	lockinit(&vp->v_lock, "vnode", 0, 0);
421 	ccms_dataspace_init(&vp->v_ccms);
422 	TAILQ_INIT(&vp->v_namecache);
423 	RB_INIT(&vp->v_rbclean_tree);
424 	RB_INIT(&vp->v_rbdirty_tree);
425 	RB_INIT(&vp->v_rbhash_tree);
426 	return(TRUE);
427 }
428 
429 /*
430  * MPSAFE
431  */
432 void
433 vnode_dtor(void *obj, void *private)
434 {
435 	struct vnode *vp = obj;
436 
437 	KKASSERT((vp->v_flag & (VCACHED|VFREE)) == 0);
438 	ccms_dataspace_destroy(&vp->v_ccms);
439 }
440 
441 /****************************************************************
442  *			VX LOCKING FUNCTIONS			*
443  ****************************************************************
444  *
445  * These functions lock vnodes for reclamation and deactivation related
446  * activities.  The caller must already be holding some sort of reference
447  * on the vnode.
448  *
449  * MPSAFE
450  */
451 void
452 vx_lock(struct vnode *vp)
453 {
454 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
455 }
456 
457 /*
458  * The non-blocking version also uses a slightly different mechanic.
459  * This function will explicitly fail not only if it cannot acquire
460  * the lock normally, but also if the caller already holds a lock.
461  *
462  * The adjusted mechanic is used to close a loophole where complex
463  * VOP_RECLAIM code can circle around recursively and allocate the
464  * same vnode it is trying to destroy from the freelist.
465  *
466  * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
467  * cause the incorrect behavior to occur.  If not for that, lockmgr()
468  * would do the right thing.
469  */
470 static int
471 vx_lock_nonblock(struct vnode *vp)
472 {
473 	if (lockcountnb(&vp->v_lock))
474 		return(EBUSY);
475 	return(lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT | LK_NOSPINWAIT));
476 }
477 
478 void
479 vx_unlock(struct vnode *vp)
480 {
481 	lockmgr(&vp->v_lock, LK_RELEASE);
482 }
483 
484 /****************************************************************
485  *			VNODE ACQUISITION FUNCTIONS		*
486  ****************************************************************
487  *
488  * These functions must be used when accessing a vnode via an auxiliary
489  * reference such as the namecache or free list, or when you wish to
490  * do a combo ref+lock sequence.
491  *
492  * These functions are MANDATORY for any code chain accessing a vnode
493  * whose activation state is not known.
494  *
495  * vget() can be called with LK_NOWAIT and will return EBUSY if the
496  * lock cannot be immediately acquired.
497  *
498  * vget()/vput() are used when reactivation is desired.
499  *
500  * vx_get() and vx_put() are used when reactivation is not desired.
501  */
502 int
503 vget(struct vnode *vp, int flags)
504 {
505 	int error;
506 
507 	/*
508 	 * A lock type must be passed
509 	 */
510 	if ((flags & LK_TYPE_MASK) == 0) {
511 		panic("vget() called with no lock specified!");
512 		/* NOT REACHED */
513 	}
514 
515 	/*
516 	 * Reference the structure and then acquire the lock.  0->1
517 	 * transitions and refs during termination are allowed here so
518 	 * call sysref directly.
519 	 *
520 	 * NOTE: The requested lock might be a shared lock and does
521 	 *	 not protect our access to the refcnt or other fields.
522 	 */
523 	sysref_get(&vp->v_sysref);
524 	if ((error = vn_lock(vp, flags)) != 0) {
525 		/*
526 		 * The lock failed, undo and return an error.
527 		 */
528 		sysref_put(&vp->v_sysref);
529 	} else if (vp->v_flag & VRECLAIMED) {
530 		/*
531 		 * The node is being reclaimed and cannot be reactivated
532 		 * any more, undo and return ENOENT.
533 		 */
534 		vn_unlock(vp);
535 		vrele(vp);
536 		error = ENOENT;
537 	} else {
538 		/*
539 		 * If the vnode is marked VFREE or VCACHED it needs to be
540 		 * reactivated, otherwise it had better already be active.
541 		 * VINACTIVE must also be cleared.
542 		 *
543 		 * In the VFREE/VCACHED case we have to throw away the
544 		 * sysref that was earmarking those cases and preventing
545 		 * the vnode from being destroyed.  Our sysref is still held.
546 		 *
547 		 * We are allowed to reactivate the vnode while we hold
548 		 * the VX lock, assuming it can be reactivated.
549 		 */
550 		spin_lock(&vp->v_spinlock);
551 		if (vp->v_flag & VFREE) {
552 			__vbusy(vp);
553 			sysref_activate(&vp->v_sysref);
554 			spin_unlock(&vp->v_spinlock);
555 			sysref_put(&vp->v_sysref);
556 		} else if (vp->v_flag & VCACHED) {
557 			_vclrflags(vp, VCACHED);
558 			sysref_activate(&vp->v_sysref);
559 			spin_unlock(&vp->v_spinlock);
560 			sysref_put(&vp->v_sysref);
561 		} else {
562 			if (sysref_isinactive(&vp->v_sysref)) {
563 				sysref_activate(&vp->v_sysref);
564 				kprintf("Warning vp %p reactivation race\n",
565 					vp);
566 			}
567 			spin_unlock(&vp->v_spinlock);
568 		}
569 		_vclrflags(vp, VINACTIVE);
570 		error = 0;
571 	}
572 	return(error);
573 }
574 
575 /*
576  * MPSAFE
577  */
578 void
579 vput(struct vnode *vp)
580 {
581 	vn_unlock(vp);
582 	vrele(vp);
583 }
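
/*
 * Typical combo ref+lock usage of vget()/vput(), shown as an illustrative
 * sketch; where vp is obtained from and the work done while locked are
 * hypothetical:
 *
 *	error = vget(vp, LK_EXCLUSIVE);
 *	if (error == 0) {
 *		... vp is referenced, exclusively locked, and active ...
 *		vput(vp);
 *	}
 *
 * vget() returns ENOENT if the vnode was reclaimed out from under the
 * caller, or EBUSY if LK_NOWAIT was passed and the lock was unavailable.
 */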
584 
585 /*
586  * XXX The vx_*() locks should use auxrefs, not the main reference counter.
587  *
588  * MPSAFE
589  */
590 void
591 vx_get(struct vnode *vp)
592 {
593 	sysref_get(&vp->v_sysref);
594 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
595 }
596 
597 /*
598  * MPSAFE
599  */
600 int
601 vx_get_nonblock(struct vnode *vp)
602 {
603 	int error;
604 
605 	sysref_get(&vp->v_sysref);
606 	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
607 	if (error)
608 		sysref_put(&vp->v_sysref);
609 	return(error);
610 }
611 
612 /*
613  * Release a VX lock that also held a ref on the vnode.
614  *
615  * vx_put needs to check for a VCACHED->VFREE transition to catch the
616  * case where e.g. vnlru issues a vgone*().
617  *
618  * MPSAFE
619  */
620 void
621 vx_put(struct vnode *vp)
622 {
623 	spin_lock(&vp->v_spinlock);
624 	if ((vp->v_flag & VCACHED) && vshouldfree(vp)) {
625 		_vclrflags(vp, VCACHED);
626 		__vfree(vp);
627 	}
628 	spin_unlock(&vp->v_spinlock);
629 	lockmgr(&vp->v_lock, LK_RELEASE);
630 	sysref_put(&vp->v_sysref);
631 }
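
/*
 * Sketch of the vx_get()/vx_put() form, used when reactivation is not
 * desired (see the XXX note above about auxrefs):
 *
 *	vx_get(vp);
 *	... inspect or tear down state without reactivating vp ...
 *	vx_put(vp);
 *
 * vx_put() also checks for the VCACHED->VFREE transition described above.
 */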
632 
633 /*
634  * The rover looks for vnodes past the midline with no cached data and
635  * moves them to before the midline.  If we do not do this the midline
636  * can wind up in a degenerate state.
637  */
638 static
639 void
640 vnode_rover_locked(void)
641 {
642 	struct vnode *vp;
643 
644 	/*
645 	 * Get the vnode after the rover.  The rover roves between mid1 and
646 	 * the end so the only special vnode it can encounter is mid2.
647 	 */
648 	vp = TAILQ_NEXT(&vnode_free_rover, v_freelist);
649 	if (vp == &vnode_free_mid2) {
650 		vp = TAILQ_NEXT(vp, v_freelist);
651 		rover_state = ROVER_MID2;
652 	}
653 	KKASSERT(vp != &vnode_free_mid1);
654 
655 	/*
656 	 * Start over if we finished the scan.
657 	 */
658 	TAILQ_REMOVE(&vnode_free_list, &vnode_free_rover, v_freelist);
659 	if (vp == NULL) {
660 		TAILQ_INSERT_AFTER(&vnode_free_list, &vnode_free_mid1,
661 				   &vnode_free_rover, v_freelist);
662 		rover_state = ROVER_MID1;
663 		return;
664 	}
665 	TAILQ_INSERT_AFTER(&vnode_free_list, vp, &vnode_free_rover, v_freelist);
666 
667 	/*
668 	 * Shift vp if appropriate.
669 	 */
670 	if (vp->v_object && vp->v_object->resident_page_count) {
671 		/*
672 		 * Promote vnode with resident pages to section 3.
673 		 * (This case shouldn't happen).
674 		 */
675 		if (rover_state == ROVER_MID1) {
676 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
677 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
678 		}
679 	} else if (vp->v_object && vp->v_object->swblock_count) {
680 		/*
681 		 * Demote vnode with only swap pages to section 2
682 		 */
683 		if (rover_state == ROVER_MID2) {
684 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
685 			TAILQ_INSERT_BEFORE(&vnode_free_mid2, vp, v_freelist);
686 		}
687 	} else {
688 		/*
689 		 * Demote vnode with no cached data to section 1
690 		 */
691 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
692 		TAILQ_INSERT_BEFORE(&vnode_free_mid1, vp, v_freelist);
693 	}
694 }
695 
696 /*
697  * Try to reuse a vnode from the free list.
698  *
699  * NOTE: The returned vnode is not completely initialized.
700  *
701  * WARNING: The freevnodes count can race; NULL can be returned even if
702  *	    freevnodes != 0.
703  *
704  * MPSAFE
705  */
706 static
707 struct vnode *
708 allocfreevnode(void)
709 {
710 	struct vnode *vp;
711 	int count;
712 
713 	for (count = 0; count < freevnodes; count++) {
714 		/*
715 		 * Try to lock the first vnode on the free list.
716 		 * Cycle if we can't.
717 		 *
718 		 * We use a bad hack in vx_lock_nonblock() which avoids
719 		 * the lock order reversal between vfs_spin and v_spinlock.
720 		 * This is very fragile code and I don't want to use
721 		 * vhold here.
722 		 */
723 		spin_lock(&vfs_spin);
724 		vnode_rover_locked();
725 		vnode_rover_locked();
726 		vp = TAILQ_FIRST(&vnode_free_list);
727 		while (vp == &vnode_free_mid1 || vp == &vnode_free_mid2 ||
728 		       vp == &vnode_free_rover) {
729 			vp = TAILQ_NEXT(vp, v_freelist);
730 		}
731 		if (vp == NULL)
732 			break;
733 		if (vx_lock_nonblock(vp)) {
734 			KKASSERT(vp->v_flag & VFREE);
735 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
736 			TAILQ_INSERT_TAIL(&vnode_free_list,
737 					  vp, v_freelist);
738 			spin_unlock(&vfs_spin);
739 			continue;
740 		}
741 
742 		/*
743 		 * We inherit the sysref associated with the vnode on the free
744 		 * list.  Because VCACHED is clear the vnode will not
745 		 * be placed back on the free list.  We own the sysref
746 		 * free and clear and thus control the disposition of
747 		 * the vnode.
748 		 */
749 		__vbusy_interlocked(vp);
750 		spin_unlock(&vfs_spin);
751 #ifdef TRACKVNODE
752 		if ((ulong)vp == trackvnode)
753 			kprintf("allocfreevnode %p %08x\n", vp, vp->v_flag);
754 #endif
755 		/*
756 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
757 		 * This includes namecache refs due to a related ncp being
758 		 * locked or having children.
759 		 *
760 		 * We will make this test several times as auxrefs can
761 		 * get incremented on us without any spinlocks being held
762 		 * until we have removed all namecache and inode references
763 		 * to the vnode.
764 		 *
765 		 * Because VCACHED is already in the correct state (cleared)
766 		 * we cannot race other vdrop()s occurring at the same time
767 		 * and can safely place vp on the free list.
768 		 *
769 		 * The free list association reinherits the sysref.
770 		 */
771 		if (vp->v_auxrefs) {
772 			__vfreetail(vp);
773 			vx_unlock(vp);
774 			continue;
775 		}
776 
777 		/*
778 		 * We inherit the reference that was previously associated
779 		 * with the vnode being on the free list.  VCACHED had better
780 		 * not be set because the reference and VX lock prevent
781 		 * the sysref from transitioning to an active state.
782 		 */
783 		KKASSERT((vp->v_flag & (VINACTIVE|VCACHED)) == VINACTIVE);
784 		KKASSERT(sysref_isinactive(&vp->v_sysref));
785 
786 		/*
787 		 * Holding the VX lock on an inactive vnode prevents it
788 		 * from being reactivated or reused.  New namecache
789 		 * associations can only be made using active vnodes.
790 		 *
791 		 * Another thread may be blocked on our vnode lock while
792 		 * holding a namecache lock.  We can only reuse this vnode
793 		 * if we can clear all namecache associations without
794 		 * blocking.
795 		 *
796 		 * Because VCACHED is already in the correct state (cleared)
797 		 * we cannot race other vdrop()s occurring at the same time
798 		 * and can safely place vp on the free list.
799 		 */
800 		if ((vp->v_flag & VRECLAIMED) == 0) {
801 			if (cache_inval_vp_nonblock(vp)) {
802 				__vfreetail(vp);
803 				vx_unlock(vp);
804 				continue;
805 			}
806 			vgone_vxlocked(vp);
807 			/* vnode is still VX locked */
808 		}
809 
810 		/*
811 		 * We can reuse the vnode if no primary or auxiliary
812 		 * references remain other than ours, else put it
813 		 * back on the free list and keep looking.
814 		 *
815 		 * Either the free list inherits the last reference
816 		 * or we fall through and sysref_activate() the last
817 		 * reference.
818 		 *
819 		 * Since the vnode is in a VRECLAIMED state, no new
820 		 * namecache associations could have been made.
821 		 */
822 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
823 		if (vp->v_auxrefs ||
824 		    !sysref_islastdeactivation(&vp->v_sysref)) {
825 			__vfreetail(vp);
826 			vx_unlock(vp);
827 			continue;
828 		}
829 
830 		/*
831 		 * Return a VX locked vnode suitable for reuse.  The caller
832 		 * inherits the sysref.
833 		 */
834 		return(vp);
835 	}
836 	return(NULL);
837 }
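
/*
 * To summarize the loop above: rove the free list, skip the marker
 * vnodes, try a non-blocking VX lock, push vnodes that still have aux
 * refs or namecache associations back to the tail, vgone() anything not
 * yet reclaimed, and return the first candidate left with no references
 * other than ours.
 */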
838 
839 /*
840  * Obtain a new vnode from the freelist, allocating more if necessary.
841  * The returned vnode is VX locked & refd.
842  *
843  * All new vnodes set the VAGE flags.  An open() of the vnode will
844  * decrement the (2-bit) flags.  Vnodes which are opened several times
845  * are thus retained in the cache over vnodes which are merely stat()d.
846  *
847  * MPSAFE
848  */
849 struct vnode *
850 allocvnode(int lktimeout, int lkflags)
851 {
852 	struct vnode *vp;
853 
854 	/*
855 	 * Try to reuse vnodes if we hit the max.  This situation only
856 	 * occurs in certain large-memory (2G+) situations.  We cannot
857 	 * attempt to directly reclaim vnodes due to nasty recursion
858 	 * problems.
859 	 */
860 	while (numvnodes - freevnodes > desiredvnodes)
861 		vnlru_proc_wait();
862 
863 	/*
864 	 * Try to build up as many vnodes as we can before reallocating
865 	 * from the free list.  A vnode on the free list simply means
866 	 * that it is inactive with no resident pages.  It may or may not
867 	 * have been reclaimed and could have valuable information associated
868 	 * with it that we shouldn't throw away unless we really need to.
869 	 *
870 	 * HAMMER NOTE: Re-establishing a vnode is a fairly expensive
871 	 * operation for HAMMER but this should benefit UFS as well.
872 	 */
873 	if (freevnodes >= wantfreevnodes && numvnodes >= desiredvnodes)
874 		vp = allocfreevnode();
875 	else
876 		vp = NULL;
877 	if (vp == NULL) {
878 		vp = sysref_alloc(&vnode_sysref_class);
879 		KKASSERT((vp->v_flag & (VCACHED|VFREE)) == 0);
880 		lockmgr(&vp->v_lock, LK_EXCLUSIVE);
881 		numvnodes++;
882 	}
883 
884 	/*
885 	 * We are using a managed sysref class, so vnode fields are only
886 	 * zeroed on initial allocation from the backing store, not
887 	 * on reallocation.  Thus we have to clear these fields for both
888 	 * reallocation and reuse.
889 	 */
890 #ifdef INVARIANTS
891 	if (vp->v_data)
892 		panic("cleaned vnode isn't");
893 	if (bio_track_active(&vp->v_track_read) ||
894 	    bio_track_active(&vp->v_track_write)) {
895 		panic("Clean vnode has pending I/O's");
896 	}
897 	if (vp->v_flag & VONWORKLST)
898 		panic("Clean vnode still pending on syncer worklist!");
899 	if (!RB_EMPTY(&vp->v_rbdirty_tree))
900 		panic("Clean vnode still has dirty buffers!");
901 	if (!RB_EMPTY(&vp->v_rbclean_tree))
902 		panic("Clean vnode still has clean buffers!");
903 	if (!RB_EMPTY(&vp->v_rbhash_tree))
904 		panic("Clean vnode still on hash tree!");
905 	KKASSERT(vp->v_mount == NULL);
906 #endif
907 	vp->v_flag = VAGE0 | VAGE1;
908 	vp->v_lastw = 0;
909 	vp->v_lasta = 0;
910 	vp->v_cstart = 0;
911 	vp->v_clen = 0;
912 	vp->v_socket = 0;
913 	vp->v_opencount = 0;
914 	vp->v_writecount = 0;	/* XXX */
915 
916 	/*
917 	 * lktimeout only applies when LK_TIMELOCK is used, and only
918 	 * the pageout daemon uses it.  The timeout may not be zero
919 	 * or the pageout daemon can deadlock in low-VM situations.
920 	 */
921 	if (lktimeout == 0)
922 		lktimeout = hz / 10;
923 	lockreinit(&vp->v_lock, "vnode", lktimeout, lkflags);
924 	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
925 	/* exclusive lock still held */
926 
927 	/*
928 	 * Note: sysref needs to be activated to convert -0x40000000 to +1.
929 	 * The -0x40000000 comes from the last ref on reuse, and from
930 	 * sysref_init() on allocate.
931 	 */
932 	sysref_activate(&vp->v_sysref);
933 	vp->v_filesize = NOOFFSET;
934 	vp->v_type = VNON;
935 	vp->v_tag = 0;
936 	vp->v_ops = NULL;
937 	vp->v_data = NULL;
938 	KKASSERT(vp->v_mount == NULL);
939 
940 	return (vp);
941 }
942 
943 /*
944  * MPSAFE
945  */
946 int
947 freesomevnodes(int n)
948 {
949 	struct vnode *vp;
950 	int count = 0;
951 
952 	while (n) {
953 		--n;
954 		if ((vp = allocfreevnode()) == NULL)
955 			break;
956 		vx_put(vp);
957 		--numvnodes;
		++count;	/* report the number actually freed */
958 	}
959 	return(count);
960 }
961