xref: /dragonfly/sys/kern/vfs_lock.c (revision a32bc35d)
1 /*
2  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * External virtual filesystem routines
37  */
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/kernel.h>
42 #include <sys/malloc.h>
43 #include <sys/mount.h>
44 #include <sys/proc.h>
45 #include <sys/vnode.h>
46 #include <sys/buf.h>
47 #include <sys/sysctl.h>
48 
49 #include <machine/limits.h>
50 
51 #include <vm/vm.h>
52 #include <vm/vm_object.h>
53 
54 #include <sys/buf2.h>
55 #include <sys/thread2.h>
56 #include <sys/sysref2.h>
57 
58 static void vnode_terminate(struct vnode *vp);
59 static boolean_t vnode_ctor(void *obj, void *private, int ocflags);
60 static void vnode_dtor(void *obj, void *private);
61 
62 static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
63 static struct sysref_class vnode_sysref_class = {
64 	.name =		"vnode",
65 	.mtype =	M_VNODE,
66 	.proto =	SYSREF_PROTO_VNODE,
67 	.offset =	offsetof(struct vnode, v_sysref),
68 	.objsize =	sizeof(struct vnode),
69 	.nom_cache =	256,
70 	.flags =	SRC_MANAGEDINIT,
71 	.ctor =		vnode_ctor,
72 	.dtor =		vnode_dtor,
73 	.ops = {
74 		.terminate = (sysref_terminate_func_t)vnode_terminate,
75 		.lock = (sysref_terminate_func_t)vx_lock,
76 		.unlock = (sysref_terminate_func_t)vx_unlock
77 	}
78 };
79 
80 /*
81  * The vnode free list holds inactive vnodes.  Vnodes with no cached data
82  * are inserted prior to the mid points, while vnodes still holding cached
83  * pages are inserted at the tail (see __vfree() and the sketch below).
84  */
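/*
 * Free list layout, a sketch inferred from __vfree() and the rover code
 * below.  The mid1/mid2/rover vnodes declared here are dummies used only
 * as list position markers:
 *
 *	HEAD -> (reclaimed, no cached data) -> mid1 ->
 *		(swap-backed data only)     -> mid2 ->
 *		(resident VM pages)         -> TAIL
 */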
85 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
86 static struct vnode	vnode_free_mid1;
87 static struct vnode	vnode_free_mid2;
88 static struct vnode	vnode_free_rover;
89 static struct spinlock	vfs_spin = SPINLOCK_INITIALIZER(vfs_spin);
90 static enum { ROVER_MID1, ROVER_MID2 } rover_state = ROVER_MID2;
91 
92 int  freevnodes = 0;
93 SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD,
94 	&freevnodes, 0, "Number of free vnodes");
95 static int wantfreevnodes = 25;
96 SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
97 	&wantfreevnodes, 0, "Desired number of free vnodes");
98 #ifdef TRACKVNODE
99 static ulong trackvnode;
100 SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
101 		&trackvnode, 0, "");
102 #endif
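
/*
 * Usage note for the TRACKVNODE option above: writing a vnode's kernel
 * address into the debug.trackvnode sysctl causes __vbusy(), __vfree(),
 * __vfreetail() and allocfreevnode() below to kprintf() that vnode's
 * flag state as it moves on and off the free list.
 */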
103 
104 /*
105  * Called from vfsinit()
106  */
107 void
108 vfs_lock_init(void)
109 {
110 	TAILQ_INIT(&vnode_free_list);
111 	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_mid1, v_freelist);
112 	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_mid2, v_freelist);
113 	TAILQ_INSERT_TAIL(&vnode_free_list, &vnode_free_rover, v_freelist);
114 	spin_init(&vfs_spin);
115 	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
116 }
117 
118 /*
119  * Misc functions
120  */
121 static __inline
122 void
123 _vsetflags(struct vnode *vp, int flags)
124 {
125 	atomic_set_int(&vp->v_flag, flags);
126 }
127 
128 static __inline
129 void
130 _vclrflags(struct vnode *vp, int flags)
131 {
132 	atomic_clear_int(&vp->v_flag, flags);
133 }
134 
135 void
136 vsetflags(struct vnode *vp, int flags)
137 {
138 	_vsetflags(vp, flags);
139 }
140 
141 void
142 vclrflags(struct vnode *vp, int flags)
143 {
144 	_vclrflags(vp, flags);
145 }
146 
147 /*
148  * Inline helper functions.
149  *
150  * WARNING: vbusy() may only be called while the vnode lock or VX lock
151  *	    is held.  The vnode spinlock need not be held.
152  *
153  * MPSAFE
154  */
155 static __inline
156 void
157 __vbusy_interlocked(struct vnode *vp)
158 {
159 	KKASSERT(vp->v_flag & VFREE);
160 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
161 	freevnodes--;
162 	_vclrflags(vp, VFREE);
163 }
164 
165 static __inline
166 void
167 __vbusy(struct vnode *vp)
168 {
169 #ifdef TRACKVNODE
170 	if ((ulong)vp == trackvnode)
171 		kprintf("__vbusy %p %08x\n", vp, vp->v_flag);
172 #endif
173 	spin_lock(&vfs_spin);
174 	__vbusy_interlocked(vp);
175 	spin_unlock(&vfs_spin);
176 }
177 
178 /*
179  * Put a vnode on the free list.  The caller has cleared VCACHED or owns the
180  * implied sysref related to having removed the vnode from the freelist
181  * (and VCACHED is already clear in that case).
182  *
183  * MPSAFE
184  */
185 static __inline
186 void
187 __vfree(struct vnode *vp)
188 {
189 #ifdef TRACKVNODE
190 	if ((ulong)vp == trackvnode) {
191 		kprintf("__vfree %p %08x\n", vp, vp->v_flag);
192 		print_backtrace(-1);
193 	}
194 #endif
195 	spin_lock(&vfs_spin);
196 	KKASSERT((vp->v_flag & VFREE) == 0);
197 
198 	/*
199 	 * Distinguish between basically dead vnodes, vnodes with cached
200 	 * data, and vnodes without cached data.  A rover will shift the
201 	 * vnodes around as their cache status is lost.
202 	 */
203 	if (vp->v_flag & VRECLAIMED) {
204 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
205 	} else if (vp->v_object && vp->v_object->resident_page_count) {
206 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
207 	} else if (vp->v_object && vp->v_object->swblock_count) {
208 		TAILQ_INSERT_BEFORE(&vnode_free_mid2, vp, v_freelist);
209 	} else {
210 		TAILQ_INSERT_BEFORE(&vnode_free_mid1, vp, v_freelist);
211 	}
212 	freevnodes++;
213 	_vsetflags(vp, VFREE);
214 	spin_unlock(&vfs_spin);
215 }
216 
217 /*
218  * Put a vnode on the tail of the free list.  The caller has cleared VCACHED
219  * or owns the implied sysref related to having removed the vnode from the
220  * freelist (and VCACHED is already clear in that case).
221  *
222  * MPSAFE
223  */
224 static __inline
225 void
226 __vfreetail(struct vnode *vp)
227 {
228 #ifdef TRACKVNODE
229 	if ((ulong)vp == trackvnode)
230 		kprintf("__vfreetail %p %08x\n", vp, vp->v_flag);
231 #endif
232 	spin_lock(&vfs_spin);
233 	KKASSERT((vp->v_flag & VFREE) == 0);
234 	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
235 	freevnodes++;
236 	_vsetflags(vp, VFREE);
237 	spin_unlock(&vfs_spin);
238 }
239 
240 /*
241  * Return a C boolean indicating whether we should put the vnode on the
242  * freelist (VFREE), or leave it / mark it as VCACHED.
243  *
244  * This routine is only valid if the vnode is already either VFREE or
245  * VCACHED, or if it can become VFREE or VCACHED via vnode_terminate().
246  *
247  * WARNING!  This function is typically called with v_spin held.
248  *
249  * MPSAFE
250  */
251 static __inline boolean_t
252 vshouldfree(struct vnode *vp)
253 {
254 	return (vp->v_auxrefs == 0 &&
255 	    (vp->v_object == NULL || vp->v_object->resident_page_count == 0));
256 }
257 
258 /*
259  * Add a ref to an active vnode.  This function should never be called
260  * with an inactive vnode (use vget() instead).
261  *
262  * MPSAFE
263  */
264 void
265 vref(struct vnode *vp)
266 {
267 	KKASSERT(vp->v_sysref.refcnt > 0 &&
268 		 (vp->v_flag & (VFREE|VINACTIVE)) == 0);
269 	sysref_get(&vp->v_sysref);
270 }
271 
272 /*
273  * Release a ref on an active or inactive vnode.  The sysref termination
274  * function will be called when the last active reference is released,
275  * and the vnode is returned to the objcache when the last inactive
276  * reference is released.
277  */
278 void
279 vrele(struct vnode *vp)
280 {
281 	sysref_put(&vp->v_sysref);
282 }
283 
284 /*
285  * Add an auxiliary data structure reference to the vnode.  Auxiliary
286  * references do not change the state of the vnode or prevent it
287  * from being deactivated, reclaimed, or placed on or removed from
288  * the free list.
289  *
290  * An auxiliary reference DOES prevent the vnode from being destroyed,
291  * allowing you to vx_lock() it, test state, etc.
292  *
293  * An auxiliary reference DOES NOT move a vnode out of the VFREE state
294  * once it has entered it.
295  *
296  * WARNING!  vhold() and vhold_interlocked() must not acquire v_spin.
297  *	     The spinlock may or may not already be held by the caller.
298  *	     vdrop() will clean up the free list state.
299  *
300  * MPSAFE
301  */
302 void
303 vhold(struct vnode *vp)
304 {
305 	KKASSERT(vp->v_sysref.refcnt != 0);
306 	atomic_add_int(&vp->v_auxrefs, 1);
307 }
308 
309 void
310 vhold_interlocked(struct vnode *vp)
311 {
312 	atomic_add_int(&vp->v_auxrefs, 1);
313 }
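
/*
 * Typical auxiliary-reference pattern (a sketch based on the comments
 * above, not copied from any particular caller): the aux ref keeps the
 * vnode from being destroyed while its state is examined under the VX
 * lock.
 *
 *	vhold(vp);
 *	vx_lock(vp);
 *	... test or tear down vnode state ...
 *	vx_unlock(vp);
 *	vdrop(vp);
 */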
314 
315 /*
316  * Remove an auxiliary reference from the vnode.
317  *
318  * vdrop needs to check for a VCACHED->VFREE transition to catch cases
319  * where a vnode is held past its reclamation.  We use v_spin to
320  * interlock VCACHED -> !VCACHED transitions.
321  *
322  * MPSAFE
323  */
324 void
325 vdrop(struct vnode *vp)
326 {
327 	KKASSERT(vp->v_sysref.refcnt != 0 && vp->v_auxrefs > 0);
328 	spin_lock(&vp->v_spin);
329 	atomic_subtract_int(&vp->v_auxrefs, 1);
330 	if ((vp->v_flag & VCACHED) && vshouldfree(vp)) {
331 		_vclrflags(vp, VCACHED);
332 		__vfree(vp);
333 	}
334 	spin_unlock(&vp->v_spin);
335 }
336 
337 /*
338  * This function is called when the last active reference on the vnode
339  * is released, typically via vrele().  SYSREF will VX lock the vnode
340  * and then give the vnode a negative ref count, indicating that it is
341  * undergoing termination or is being set aside for the cache, and one
342  * final sysref_put() is required to actually return it to the memory
343  * subsystem.
344  *
345  * Additional inactive sysrefs may race us but that's ok.  Reactivations
346  * cannot race us because the sysref code interlocks with the VX lock
347  * (which is held on call).
348  *
349  * MPSAFE
350  */
351 void
352 vnode_terminate(struct vnode *vp)
353 {
354 	/*
355 	 * We own the VX lock, it should not be possible for someone else
356 	 * to have reactivated the vp.
357 	 */
358 	KKASSERT(sysref_isinactive(&vp->v_sysref));
359 
360 	/*
361 	 * Deactivate the vnode by marking it VFREE or VCACHED.
362 	 * The vnode can be reactivated from either state until
363 	 * reclaimed.  These states inherit the 'last' sysref on the
364 	 * vnode.
365 	 *
366 	 * NOTE: There may be additional inactive references from
367 	 * other entities blocking on the VX lock while we hold it,
368 	 * but this does not prevent us from changing the vnode's
369 	 * state.
370 	 *
371 	 * NOTE: The vnode could already be marked inactive.  XXX
372 	 *	 how?
373 	 *
374 	 * NOTE: v_mount may be NULL due to assignment to
375 	 *	 dead_vnode_vops
376 	 *
377 	 * NOTE: The vnode may be marked inactive with dirty buffers
378 	 *	 or dirty pages in its cached VM object still present.
379 	 *
380 	 * NOTE: VCACHED should not be set on entry.  We lose control
381 	 *	 of the sysref the instant the vnode is placed on the
382 	 *	 free list or when VCACHED is set.
383 	 *
384 	 *	 The VX lock is required when transitioning to
385 	 *	 +VCACHED but is not sufficient for the vshouldfree()
386 	 *	 interlocked test or when transitioning to -VCACHED.
387 	 */
388 	if ((vp->v_flag & VINACTIVE) == 0) {
389 		_vsetflags(vp, VINACTIVE);
390 		if (vp->v_mount)
391 			VOP_INACTIVE(vp);
392 	}
393 	spin_lock(&vp->v_spin);
394 	KKASSERT((vp->v_flag & (VFREE|VCACHED)) == 0);
395 	if (vshouldfree(vp))
396 		__vfree(vp);
397 	else
398 		_vsetflags(vp, VCACHED); /* inactive but not yet free */
399 	spin_unlock(&vp->v_spin);
400 	vx_unlock(vp);
401 }
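
/*
 * Summary of the deactivation path implemented above (descriptive only,
 * no new mechanism): the last vrele() drops the last active sysref and
 * vnode_terminate() runs with the VX lock held.  It marks the vnode
 * VINACTIVE (calling VOP_INACTIVE() if v_mount is still set) and then
 * either places it on the free list (VFREE) or marks it VCACHED when
 * auxiliary refs or cached pages remain.  vget() can reactivate the
 * vnode from either state until it is reclaimed.
 */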
402 
403 /*
404  * Physical vnode constructor / destructor.  These are only executed on
405  * the backend of the objcache.  They are NOT executed on every vnode
406  * allocation or deallocation.
407  *
408  * MPSAFE
409  */
410 boolean_t
411 vnode_ctor(void *obj, void *private, int ocflags)
412 {
413 	struct vnode *vp = obj;
414 
415 	lwkt_token_init(&vp->v_token, "vnode");
416 	lockinit(&vp->v_lock, "vnode", 0, 0);
417 	TAILQ_INIT(&vp->v_namecache);
418 	RB_INIT(&vp->v_rbclean_tree);
419 	RB_INIT(&vp->v_rbdirty_tree);
420 	RB_INIT(&vp->v_rbhash_tree);
421 	spin_init(&vp->v_spin);
422 	return(TRUE);
423 }
424 
425 /*
426  * MPSAFE
427  */
428 void
429 vnode_dtor(void *obj, void *private)
430 {
431 	struct vnode *vp = obj;
432 
433 	KKASSERT((vp->v_flag & (VCACHED|VFREE)) == 0);
434 }
435 
436 /****************************************************************
437  *			VX LOCKING FUNCTIONS			*
438  ****************************************************************
439  *
440  * These functions lock vnodes for reclamation and deactivation related
441  * activities.  The caller must already be holding some sort of reference
442  * on the vnode.
443  *
444  * MPSAFE
445  */
446 void
447 vx_lock(struct vnode *vp)
448 {
449 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
450 }
451 
452 /*
453  * The non-blocking version also uses a slightly different mechanic.
454  * This function will explicitly fail not only if it cannot acquire
455  * the lock normally, but also if the caller already holds a lock.
456  *
457  * The adjusted mechanic is used to close a loophole where complex
458  * VOP_RECLAIM code can circle around recursively and allocate the
459  * same vnode it is trying to destroy from the freelist.
460  *
461  * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
462  * cause the incorrect behavior to occur.  If not for that, lockmgr()
463  * would do the right thing.
464  */
465 static int
466 vx_lock_nonblock(struct vnode *vp)
467 {
468 	if (lockcountnb(&vp->v_lock))
469 		return(EBUSY);
470 	return(lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT));
471 }
472 
473 void
474 vx_unlock(struct vnode *vp)
475 {
476 	lockmgr(&vp->v_lock, LK_RELEASE);
477 }
478 
479 /****************************************************************
480  *			VNODE ACQUISITION FUNCTIONS		*
481  ****************************************************************
482  *
483  * These functions must be used when accessing a vnode via an auxiliary
484  * reference such as the namecache or free list, or when you wish to
485  * do a combo ref+lock sequence.
486  *
487  * These functions are MANDATORY for any code chain accessing a vnode
488  * whose activation state is not known.
489  *
490  * vget() can be called with LK_NOWAIT and will return EBUSY if the
491  * lock cannot be immediately acquired.
492  *
493  * vget()/vput() are used when reactivation is desired.
494  *
495  * vx_get() and vx_put() are used when reactivation is not desired.
496  */
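
/*
 * Typical ref+lock usage, shown as a hedged sketch of the rules above
 * rather than code lifted from a real caller:
 *
 *	if (vget(vp, LK_SHARED) == 0) {
 *		... vp is referenced, locked, and active ...
 *		vput(vp);
 *	}
 *
 * vget() returns ENOENT if the vnode was reclaimed out from under the
 * caller, and EBUSY if LK_NOWAIT was passed and the lock could not be
 * acquired immediately.
 */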
497 int
498 vget(struct vnode *vp, int flags)
499 {
500 	int error;
501 
502 	/*
503 	 * A lock type must be passed
504 	 */
505 	if ((flags & LK_TYPE_MASK) == 0) {
506 		panic("vget() called with no lock specified!");
507 		/* NOT REACHED */
508 	}
509 
510 	/*
511 	 * Reference the structure and then acquire the lock.  0->1
512 	 * transitions and refs during termination are allowed here so
513 	 * call sysref directly.
514 	 *
515 	 * NOTE: The requested lock might be a shared lock and does
516 	 *	 not protect our access to the refcnt or other fields.
517 	 */
518 	sysref_get(&vp->v_sysref);
519 	if ((error = vn_lock(vp, flags)) != 0) {
520 		/*
521 		 * The lock failed, undo and return an error.
522 		 */
523 		sysref_put(&vp->v_sysref);
524 	} else if (vp->v_flag & VRECLAIMED) {
525 		/*
526 		 * The node is being reclaimed and cannot be reactivated
527 		 * any more, undo and return ENOENT.
528 		 */
529 		vn_unlock(vp);
530 		vrele(vp);
531 		error = ENOENT;
532 	} else {
533 		/*
534 		 * If the vnode is marked VFREE or VCACHED it needs to be
535 		 * reactivated, otherwise it had better already be active.
536 		 * VINACTIVE must also be cleared.
537 		 *
538 		 * In the VFREE/VCACHED case we have to throw away the
539 		 * sysref that was earmarking those cases and preventing
540 		 * the vnode from being destroyed.  Our sysref is still held.
541 		 *
542 		 * We are allowed to reactivate the vnode while we hold
543 		 * the VX lock, assuming it can be reactivated.
544 		 */
545 		spin_lock(&vp->v_spin);
546 		if (vp->v_flag & VFREE) {
547 			__vbusy(vp);
548 			sysref_activate(&vp->v_sysref);
549 			spin_unlock(&vp->v_spin);
550 			sysref_put(&vp->v_sysref);
551 		} else if (vp->v_flag & VCACHED) {
552 			_vclrflags(vp, VCACHED);
553 			sysref_activate(&vp->v_sysref);
554 			spin_unlock(&vp->v_spin);
555 			sysref_put(&vp->v_sysref);
556 		} else {
557 			if (sysref_isinactive(&vp->v_sysref)) {
558 				sysref_activate(&vp->v_sysref);
559 				kprintf("Warning vp %p reactivation race\n",
560 					vp);
561 			}
562 			spin_unlock(&vp->v_spin);
563 		}
564 		_vclrflags(vp, VINACTIVE);
565 		error = 0;
566 	}
567 	return(error);
568 }
569 
570 #ifdef DEBUG_VPUT
571 
572 void
573 debug_vput(struct vnode *vp, const char *filename, int line)
574 {
575 	kprintf("vput(%p) %s:%d\n", vp, filename, line);
576 	vn_unlock(vp);
577 	vrele(vp);
578 }
579 
580 #else
581 
582 /*
583  * MPSAFE
584  */
585 void
586 vput(struct vnode *vp)
587 {
588 	vn_unlock(vp);
589 	vrele(vp);
590 }
591 
592 #endif
593 
594 /*
595  * XXX The vx_*() locks should use auxrefs, not the main reference counter.
596  *
597  * MPSAFE
598  */
599 void
600 vx_get(struct vnode *vp)
601 {
602 	sysref_get(&vp->v_sysref);
603 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
604 }
605 
606 /*
607  * MPSAFE
608  */
609 int
610 vx_get_nonblock(struct vnode *vp)
611 {
612 	int error;
613 
614 	sysref_get(&vp->v_sysref);
615 	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
616 	if (error)
617 		sysref_put(&vp->v_sysref);
618 	return(error);
619 }
620 
621 /*
622  * Release a VX lock that also held a ref on the vnode.
623  *
624  * vx_put needs to check for a VCACHED->VFREE transition to catch the
625  * case where e.g. vnlru issues a vgone*().
626  *
627  * MPSAFE
628  */
629 void
630 vx_put(struct vnode *vp)
631 {
632 	spin_lock(&vp->v_spin);
633 	if ((vp->v_flag & VCACHED) && vshouldfree(vp)) {
634 		_vclrflags(vp, VCACHED);
635 		__vfree(vp);
636 	}
637 	spin_unlock(&vp->v_spin);
638 	lockmgr(&vp->v_lock, LK_RELEASE);
639 	sysref_put(&vp->v_sysref);
640 }
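
/*
 * Non-reactivating access sketch (based on the comments above): vx_get()
 * refs and VX-locks the vnode without reactivating it, and vx_put()
 * reverses both, possibly moving a VCACHED vnode to the free list.
 *
 *	vx_get(vp);
 *	... inspect or reclaim without reactivating vp ...
 *	vx_put(vp);
 */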
641 
642 /*
643  * The rover looks for vnodes past the midline with no cached data and
644  * moves them to before the midline.  If we do not do this the midline
645  * can wind up in a degenerate state.
646  */
647 static
648 void
649 vnode_rover_locked(void)
650 {
651 	struct vnode *vp;
652 
653 	/*
654 	 * Get the vnode after the rover.  The rover roves between mid1 and
655 	 * the end so the only special vnode it can encounter is mid2.
656 	 */
657 	vp = TAILQ_NEXT(&vnode_free_rover, v_freelist);
658 	if (vp == &vnode_free_mid2) {
659 		vp = TAILQ_NEXT(vp, v_freelist);
660 		rover_state = ROVER_MID2;
661 	}
662 	KKASSERT(vp != &vnode_free_mid1);
663 
664 	/*
665 	 * Start over if we finished the scan.
666 	 */
667 	TAILQ_REMOVE(&vnode_free_list, &vnode_free_rover, v_freelist);
668 	if (vp == NULL) {
669 		TAILQ_INSERT_AFTER(&vnode_free_list, &vnode_free_mid1,
670 				   &vnode_free_rover, v_freelist);
671 		rover_state = ROVER_MID1;
672 		return;
673 	}
674 	TAILQ_INSERT_AFTER(&vnode_free_list, vp, &vnode_free_rover, v_freelist);
675 
676 	/*
677 	 * Shift vp if appropriate.
678 	 */
679 	if (vp->v_object && vp->v_object->resident_page_count) {
680 		/*
681 		 * Promote vnode with resident pages to section 3.
682 		 * (This case shouldn't happen).
683 		 */
684 		if (rover_state == ROVER_MID1) {
685 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
686 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
687 		}
688 	} else if (vp->v_object && vp->v_object->swblock_count) {
689 		/*
690 		 * Demote vnode with only swap pages to section 2
691 		 */
692 		if (rover_state == ROVER_MID2) {
693 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
694 			TAILQ_INSERT_BEFORE(&vnode_free_mid2, vp, v_freelist);
695 		}
696 	} else {
697 		/*
698 		 * Demote vnode with no cached data to section 1
699 		 */
700 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
701 		TAILQ_INSERT_BEFORE(&vnode_free_mid1, vp, v_freelist);
702 	}
703 }
704 
705 /*
706  * Try to reuse a vnode from the free list.
707  *
708  * NOTE: The returned vnode is not completely initialized.
709  *
710  * WARNING: The freevnodes count can race; NULL can be returned even if
711  *	    freevnodes != 0.
712  *
713  * MPSAFE
714  */
715 static
716 struct vnode *
717 allocfreevnode(void)
718 {
719 	struct vnode *vp;
720 	int count;
721 
722 	for (count = 0; count < freevnodes; count++) {
723 		/*
724 		 * Try to lock the first vnode on the free list.
725 		 * Cycle if we can't.
726 		 *
727 		 * We use a bad hack in vx_lock_nonblock() which avoids
728 		 * the lock order reversal between vfs_spin and v_spin.
729 		 * This is very fragile code and I don't want to use
730 		 * vhold here.
731 		 */
732 		spin_lock(&vfs_spin);
733 		vnode_rover_locked();
734 		vnode_rover_locked();
735 		vp = TAILQ_FIRST(&vnode_free_list);
736 		while (vp == &vnode_free_mid1 || vp == &vnode_free_mid2 ||
737 		       vp == &vnode_free_rover) {
738 			vp = TAILQ_NEXT(vp, v_freelist);
739 		}
740 		if (vp == NULL)
741 			break;
742 		if (vx_lock_nonblock(vp)) {
743 			KKASSERT(vp->v_flag & VFREE);
744 			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
745 			TAILQ_INSERT_TAIL(&vnode_free_list,
746 					  vp, v_freelist);
747 			spin_unlock(&vfs_spin);
748 			continue;
749 		}
750 
751 		/*
752 		 * We inherit the sysref associated with the vnode on the free
753 		 * list.  Because VCACHED is clear the vnode will not
754 		 * be placed back on the free list.  We own the sysref
755 		 * free and clear and thus control the disposition of
756 		 * the vnode.
757 		 */
758 		__vbusy_interlocked(vp);
759 		spin_unlock(&vfs_spin);
760 #ifdef TRACKVNODE
761 		if ((ulong)vp == trackvnode)
762 			kprintf("allocfreevnode %p %08x\n", vp, vp->v_flag);
763 #endif
764 		/*
765 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
766 		 * This includes namecache refs due to a related ncp being
767 		 * locked or having children.
768 		 *
769 		 * We will make this test several times as auxrefs can
770 		 * get incremented on us without any spinlocks being held
771 		 * until we have removed all namecache and inode references
772 		 * to the vnode.
773 		 *
774 		 * Because VCACHED is already in the correct state (cleared)
775 		 * we cannot race other vdrop()s occurring at the same time
776 		 * and can safely place vp on the free list.
777 		 *
778 		 * The free list association reinherits the sysref.
779 		 */
780 		if (vp->v_auxrefs) {
781 			__vfreetail(vp);
782 			vx_unlock(vp);
783 			continue;
784 		}
785 
786 		/*
787 		 * We inherit the reference that was previously associated
788 		 * with the vnode being on the free list.  VCACHED had better
789 		 * not be set because the reference and VX lock prevent
790 		 * the sysref from transitioning to an active state.
791 		 */
792 		KKASSERT((vp->v_flag & (VINACTIVE|VCACHED)) == VINACTIVE);
793 		KKASSERT(sysref_isinactive(&vp->v_sysref));
794 
795 		/*
796 		 * Holding the VX lock on an inactive vnode prevents it
797 		 * from being reactivated or reused.  New namecache
798 		 * associations can only be made using active vnodes.
799 		 *
800 		 * Another thread may be blocked on our vnode lock while
801 		 * holding a namecache lock.  We can only reuse this vnode
802 		 * if we can clear all namecache associations without
803 		 * blocking.
804 		 *
805 		 * Because VCACHED is already in the correct state (cleared)
806 		 * we cannot race other vdrop()s occurring at the same time
807 		 * and can safely place vp on the free list.
808 		 */
809 		if ((vp->v_flag & VRECLAIMED) == 0) {
810 			if (cache_inval_vp_nonblock(vp)) {
811 				__vfreetail(vp);
812 				vx_unlock(vp);
813 				continue;
814 			}
815 			vgone_vxlocked(vp);
816 			/* vnode is still VX locked */
817 		}
818 
819 		/*
820 		 * We can reuse the vnode if no primary or auxiliary
821 		 * references remain other than ours, else put it
822 		 * back on the free list and keep looking.
823 		 *
824 		 * Either the free list inherits the last reference
825 		 * or we fall through and sysref_activate() the last
826 		 * reference.
827 		 *
828 		 * Since the vnode is in a VRECLAIMED state, no new
829 		 * namecache associations could have been made.
830 		 */
831 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
832 		if (vp->v_auxrefs ||
833 		    !sysref_islastdeactivation(&vp->v_sysref)) {
834 			__vfreetail(vp);
835 			vx_unlock(vp);
836 			continue;
837 		}
838 
839 		/*
840 		 * Return a VX locked vnode suitable for reuse.  The caller
841 		 * inherits the sysref.
842 		 */
843 		return(vp);
844 	}
845 	return(NULL);
846 }
847 
848 /*
849  * Obtain a new vnode from the freelist, allocating more if necessary.
850  * The returned vnode is VX locked & vrefd.
851  *
852  * All new vnodes have the VAGE flags set.  An open() of the vnode will
853  * decrement the (2-bit) counter.  Vnodes which are opened several times
854  * are thus retained in the cache over vnodes which are merely stat()d.
855  *
856  * MPSAFE
857  */
858 struct vnode *
859 allocvnode(int lktimeout, int lkflags)
860 {
861 	struct vnode *vp;
862 
863 	/*
864 	 * Try to reuse vnodes if we hit the max.  This case only occurs
865 	 * in certain large-memory (2G+) configurations.  We cannot
866 	 * attempt to directly reclaim vnodes due to nasty recursion
867 	 * problems.
868 	 */
869 	while (numvnodes - freevnodes > desiredvnodes)
870 		vnlru_proc_wait();
871 
872 	/*
873 	 * Try to build up as many vnodes as we can before reallocating
874 	 * from the free list.  A vnode on the free list simply means
875 	 * that it is inactive with no resident pages.  It may or may not
876 	 * have been reclaimed and could have valuable information associated
877 	 * with it that we shouldn't throw away unless we really need to.
878 	 *
879 	 * HAMMER NOTE: Re-establishing a vnode is a fairly expensive
880 	 * operation for HAMMER but this should benefit UFS as well.
881 	 */
882 	if (freevnodes >= wantfreevnodes && numvnodes >= desiredvnodes)
883 		vp = allocfreevnode();
884 	else
885 		vp = NULL;
886 	if (vp == NULL) {
887 		vp = sysref_alloc(&vnode_sysref_class);
888 		KKASSERT((vp->v_flag & (VCACHED|VFREE)) == 0);
889 		lockmgr(&vp->v_lock, LK_EXCLUSIVE);
890 		numvnodes++;
891 	}
892 
893 	/*
894 	 * We are using a managed sysref class, so vnode fields are only
895 	 * zeroed on initial allocation from the backing store, not
896 	 * on reallocation.  Thus we have to clear these fields for both
897 	 * reallocation and reuse.
898 	 */
899 #ifdef INVARIANTS
900 	if (vp->v_data)
901 		panic("cleaned vnode isn't");
902 	if (bio_track_active(&vp->v_track_read) ||
903 	    bio_track_active(&vp->v_track_write)) {
904 		panic("Clean vnode has pending I/O's");
905 	}
906 	if (vp->v_flag & VONWORKLST)
907 		panic("Clean vnode still pending on syncer worklist!");
908 	if (!RB_EMPTY(&vp->v_rbdirty_tree))
909 		panic("Clean vnode still has dirty buffers!");
910 	if (!RB_EMPTY(&vp->v_rbclean_tree))
911 		panic("Clean vnode still has clean buffers!");
912 	if (!RB_EMPTY(&vp->v_rbhash_tree))
913 		panic("Clean vnode still on hash tree!");
914 	KKASSERT(vp->v_mount == NULL);
915 #endif
916 	vp->v_flag = VAGE0 | VAGE1;
917 	vp->v_lastw = 0;
918 	vp->v_lasta = 0;
919 	vp->v_cstart = 0;
920 	vp->v_clen = 0;
921 	vp->v_socket = 0;
922 	vp->v_opencount = 0;
923 	vp->v_writecount = 0;	/* XXX */
924 
925 	/*
926 	 * lktimeout only applies when LK_TIMELOCK is used, and only
927 	 * the pageout daemon uses it.  The timeout may not be zero
928 	 * or the pageout daemon can deadlock in low-VM situations.
929 	 */
930 	if (lktimeout == 0)
931 		lktimeout = hz / 10;
932 	lockreinit(&vp->v_lock, "vnode", lktimeout, lkflags);
933 	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
934 	/* exclusive lock still held */
935 
936 	/*
937 	 * Note: sysref needs to be activated to convert -0x40000000 to +1.
938 	 * The -0x40000000 comes from the last ref on reuse, and from
939 	 * sysref_init() on allocate.
940 	 */
941 	sysref_activate(&vp->v_sysref);
942 	vp->v_filesize = NOOFFSET;
943 	vp->v_type = VNON;
944 	vp->v_tag = 0;
945 	vp->v_ops = NULL;
946 	vp->v_data = NULL;
947 	vp->v_pfsmp = NULL;
948 	KKASSERT(vp->v_mount == NULL);
949 
950 	return (vp);
951 }
952 
953 /*
954  * MPSAFE
955  */
956 int
957 freesomevnodes(int n)
958 {
959 	struct vnode *vp;
960 	int count = 0;
961 
962 	while (n) {
963 		--n;
964 		if ((vp = allocfreevnode()) == NULL)
965 			break;
966 		vx_put(vp);
967 		--numvnodes;
		++count;
968 	}
969 	return(count);
970 }
971