xref: /dragonfly/sys/kern/vfs_lock.c (revision 45914ee7)
1 /*
2  * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * External lock/ref-related vnode functions
37  *
38  * vs_state transition locking requirements:
39  *
40  *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
41  *	DYING    -> CACHED		vx_lock(excl)
42  *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
43  *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
44  *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
45  *
46  * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
47  *
48  *	 Switching into ACTIVE also requires a vref and a vnode lock;
49  *	 however, the vnode lock is allowed to be SHARED.
50  *
51  *	 Switching into a CACHED or DYING state requires an exclusive vnode
52  *	 lock or vx_lock (which is almost the same thing).
53  */
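#if 0
/*
 * Illustrative sketch only (not part of the original file): the
 * CACHED/INACTIVE -> ACTIVE rows of the table above as seen from a
 * caller.  A real consumer simply calls vget(); the locks listed in the
 * table (vn_lock, v_spin, vi->spin) are taken internally.  The function
 * name is hypothetical.
 */
static int
example_reactivate(struct vnode *vp)
{
	int error;

	error = vget(vp, LK_SHARED);	/* a shared vnode lock suffices */
	if (error == 0) {
		/* vnode is now referenced, locked, and VS_ACTIVE */
		vput(vp);
	}
	return (error);
}
#endif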
54 
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/kernel.h>
58 #include <sys/malloc.h>
59 #include <sys/mount.h>
60 #include <sys/proc.h>
61 #include <sys/vnode.h>
62 #include <sys/buf.h>
63 #include <sys/sysctl.h>
64 
65 #include <machine/limits.h>
66 
67 #include <vm/vm.h>
68 #include <vm/vm_object.h>
69 
70 #include <sys/buf2.h>
71 #include <sys/thread2.h>
72 
73 #define VACT_MAX	10
74 #define VACT_INC	2
75 
76 static void vnode_terminate(struct vnode *vp);
77 
78 static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
79 
80 /*
81  * The vnode free list holds inactive vnodes.  Aged inactive vnodes
82  * are inserted prior to the midpoint; otherwise they are inserted
83  * at the tail.
84  *
85  * The vnode code goes to great lengths to avoid moving vnodes between
86  * lists, but sometimes it is unavoidable.  For this situation we try to
87  * avoid lock contention but we do not try very hard to avoid cache line
88  * congestion.  A modestly sized hash table is used.
89  */
90 #define VLIST_PRIME2	123462047LU
91 #define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU
92 
93 #define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
94 			 VLIST_PRIME2 % (unsigned)ncpus)
95 
96 TAILQ_HEAD(freelst, vnode);
97 
98 struct vnode_index {
99 	struct freelst	active_list;
100 	struct vnode	active_rover;
101 	struct freelst	inactive_list;
102 	struct spinlock	spin;
103 	int	deac_rover;
104 	int	free_rover;
105 } __cachealign;
106 
107 static struct vnode_index *vnode_list_hash;
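#if 0
/*
 * Illustrative sketch (not part of the original file): how VLIST_HASH
 * is consumed below.  The vnode pointer is xor-scrambled, folded over a
 * large prime, and reduced modulo ncpus, spreading vnodes roughly evenly
 * across the per-cpu vnode_index buckets to limit spinlock contention.
 * The helper name is hypothetical.
 */
static __inline struct vnode_index *
example_vnode_bucket(struct vnode *vp)
{
	return (&vnode_list_hash[VLIST_HASH(vp)]);
}
#endif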
108 
109 int  activevnodes = 0;
110 SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
111 	&activevnodes, 0, "Number of active vnodes");
112 int  cachedvnodes = 0;
113 SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
114 	&cachedvnodes, 0, "Total number of cached vnodes");
115 int  inactivevnodes = 0;
116 SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
117 	&inactivevnodes, 0, "Number of inactive vnodes");
118 static int batchfreevnodes = 5;
119 SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
120 	&batchfreevnodes, 0, "Number of vnodes to free at once");
121 #ifdef TRACKVNODE
122 static u_long trackvnode;
123 SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
124 		&trackvnode, 0, "");
125 #endif
126 
127 /*
128  * Called from vfsinit()
129  */
130 void
131 vfs_lock_init(void)
132 {
133 	int i;
134 
135 	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
136 	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
137 				  M_VNODE, M_ZERO | M_WAITOK);
138 	for (i = 0; i < ncpus; ++i) {
139 		struct vnode_index *vi = &vnode_list_hash[i];
140 
141 		TAILQ_INIT(&vi->inactive_list);
142 		TAILQ_INIT(&vi->active_list);
143 		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
144 		spin_init(&vi->spin, "vfslock");
145 	}
146 }
147 
148 /*
149  * Misc functions
150  */
151 static __inline
152 void
153 _vsetflags(struct vnode *vp, int flags)
154 {
155 	atomic_set_int(&vp->v_flag, flags);
156 }
157 
158 static __inline
159 void
160 _vclrflags(struct vnode *vp, int flags)
161 {
162 	atomic_clear_int(&vp->v_flag, flags);
163 }
164 
165 void
166 vsetflags(struct vnode *vp, int flags)
167 {
168 	_vsetflags(vp, flags);
169 }
170 
171 void
172 vclrflags(struct vnode *vp, int flags)
173 {
174 	_vclrflags(vp, flags);
175 }
176 
177 /*
178  * Place the vnode on the active list.
179  *
180  * Caller must hold vp->v_spin
181  */
182 static __inline
183 void
184 _vactivate(struct vnode *vp)
185 {
186 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
187 
188 #ifdef TRACKVNODE
189 	if ((u_long)vp == trackvnode)
190 		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
191 #endif
192 	spin_lock(&vi->spin);
193 
194 	switch(vp->v_state) {
195 	case VS_ACTIVE:
196 		spin_unlock(&vi->spin);
197 		panic("_vactivate: already active");
198 		/* NOT REACHED */
199 		return;
200 	case VS_INACTIVE:
201 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
202 		atomic_add_int(&inactivevnodes, -1);
203 		break;
204 	case VS_CACHED:
205 	case VS_DYING:
206 		break;
207 	}
208 	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
209 	vp->v_state = VS_ACTIVE;
210 	spin_unlock(&vi->spin);
211 	atomic_add_int(&activevnodes, 1);
212 }
213 
214 /*
215  * Put a vnode on the inactive list.
216  *
217  * Caller must hold v_spin
218  */
219 static __inline
220 void
221 _vinactive(struct vnode *vp)
222 {
223 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
224 
225 #ifdef TRACKVNODE
226 	if ((u_long)vp == trackvnode) {
227 		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
228 		print_backtrace(-1);
229 	}
230 #endif
231 	spin_lock(&vi->spin);
232 
233 	/*
234 	 * Remove from active list if it is sitting on it
235 	 */
236 	switch(vp->v_state) {
237 	case VS_ACTIVE:
238 		TAILQ_REMOVE(&vi->active_list, vp, v_list);
239 		atomic_add_int(&activevnodes, -1);
240 		break;
241 	case VS_INACTIVE:
242 		spin_unlock(&vi->spin);
243 		panic("_vinactive: already inactive");
244 		/* NOT REACHED */
245 		return;
246 	case VS_CACHED:
247 	case VS_DYING:
248 		break;
249 	}
250 
251 	/*
252 	 * Distinguish between basically dead vnodes, vnodes with cached
253 	 * data, and vnodes without cached data.  A rover will shift the
254 	 * vnodes around as their cache status is lost.
255 	 */
256 	if (vp->v_flag & VRECLAIMED) {
257 		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
258 	} else {
259 		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
260 	}
261 	vp->v_state = VS_INACTIVE;
262 	spin_unlock(&vi->spin);
263 	atomic_add_int(&inactivevnodes, 1);
264 }
265 
266 static __inline
267 void
268 _vinactive_tail(struct vnode *vp)
269 {
270 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
271 
272 	spin_lock(&vi->spin);
273 
274 	/*
275 	 * Remove from active list if it is sitting on it
276 	 */
277 	switch(vp->v_state) {
278 	case VS_ACTIVE:
279 		TAILQ_REMOVE(&vi->active_list, vp, v_list);
280 		atomic_add_int(&activevnodes, -1);
281 		break;
282 	case VS_INACTIVE:
283 		spin_unlock(&vi->spin);
284 		panic("_vinactive_tail: already inactive");
285 		/* NOT REACHED */
286 		return;
287 	case VS_CACHED:
288 	case VS_DYING:
289 		break;
290 	}
291 
292 	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
293 	vp->v_state = VS_INACTIVE;
294 	spin_unlock(&vi->spin);
295 	atomic_add_int(&inactivevnodes, 1);
296 }
297 
298 /*
299  * Add a ref to an active vnode.  This function should never be called
300  * with an inactive vnode (use vget() instead), but may be called on
301  * vnodes in other states.
302  */
303 void
304 vref(struct vnode *vp)
305 {
306 	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
307 		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
308 	atomic_add_int(&vp->v_refcnt, 1);
309 }
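#if 0
/*
 * Illustrative sketch (not part of the original file): vref() is only
 * legal when the caller already holds a reference (so the vnode cannot
 * be VS_INACTIVE), e.g. when handing the vnode to another consumer.
 * The function name is hypothetical.
 */
static void
example_pass_extra_ref(struct vnode *vp)	/* caller already holds a ref */
{
	vref(vp);	/* cheap: no list or state manipulation required */
	/* ... hand vp to a consumer which eventually drops the ref ... */
	vrele(vp);	/* stand-in for the consumer's eventual release */
}
#endif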
310 
311 /*
312  * Count the number of cached vnodes.  This is moderately expensive,
313  * so be careful not to make this call in critical paths and, in
314  * particular, avoid updating the global there.  Each cpu tracks its
315  * own accumulator; the individual accumulators are not accurate on
316  * their own and must be summed together.
317  */
318 int
319 countcachedvnodes(int gupdate)
320 {
321 	int i;
322 	int n = 0;
323 
324 	for (i = 0; i < ncpus; ++i) {
325 		globaldata_t gd = globaldata_find(i);
326 		n += gd->gd_cachedvnodes;
327 	}
328 	if (gupdate)
329 		cachedvnodes = n;
330 	return n;
331 }
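#if 0
/*
 * Illustrative sketch (not part of the original file): the producer side
 * of the per-cpu accumulator summed above.  Hot paths adjust only the
 * local cpu's counter (as vrele()/vget() do with gd_cachedvnodes), so
 * the global sysctl value is only approximate until countcachedvnodes()
 * folds the per-cpu deltas together.  The function name is hypothetical.
 */
static __inline void
example_adjust_cached(int delta)
{
	atomic_add_int(&mycpu->gd_cachedvnodes, delta);
}
#endif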
332 
333 /*
334  * Release a ref on an active or inactive vnode.
335  *
336  * Caller has no other requirements.
337  *
338  * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
339  * transition, otherwise we leave the vnode in the active list and
340  * do a lockless transition to 0, which is very important for the
341  * critical path.
342  *
343  * (vrele() is not called when a vnode is being destroyed w/kfree)
344  */
345 void
346 vrele(struct vnode *vp)
347 {
348 	for (;;) {
349 		int count = vp->v_refcnt;
350 		cpu_ccfence();
351 		KKASSERT((count & VREF_MASK) > 0);
352 		KKASSERT(vp->v_state == VS_ACTIVE ||
353 			 vp->v_state == VS_INACTIVE);
354 
355 		/*
356 		 * 2+ case
357 		 */
358 		if ((count & VREF_MASK) > 1) {
359 			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
360 				break;
361 			continue;
362 		}
363 
364 		/*
365 		 * 1->0 transition case must handle possible finalization.
366 		 * When finalizing we transition 1->0x40000000.  Note that
367 		 * cachedvnodes is only adjusted on transitions to ->0.
368 		 *
369 		 * WARNING! VREF_TERMINATE can be cleared at any point
370 		 *	    when the refcnt is non-zero (by vget()) and
371 		 *	    the vnode has not been reclaimed.  Thus
372 		 *	    transitions out of VREF_TERMINATE do not have
373 		 *	    to mess with cachedvnodes.
374 		 */
375 		if (count & VREF_FINALIZE) {
376 			vx_lock(vp);
377 			if (atomic_cmpset_int(&vp->v_refcnt,
378 					      count, VREF_TERMINATE)) {
379 				vnode_terminate(vp);
380 				break;
381 			}
382 			vx_unlock(vp);
383 		} else {
384 			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
385 				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
386 				break;
387 			}
388 		}
389 		/* retry */
390 	}
391 }
392 
393 /*
394  * Add an auxiliary data structure reference to the vnode.  Auxiliary
395  * references do not change the state of the vnode or prevent deactivation
396  * or reclamation of the vnode, but will prevent the vnode from being
397  * destroyed (kfree()'d).
398  *
399  * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
400  *	     already be held by the caller.  vdrop() will clean up the
401  *	     free list state.
402  */
403 void
404 vhold(struct vnode *vp)
405 {
406 	atomic_add_int(&vp->v_auxrefs, 1);
407 }
408 
409 /*
410  * Remove an auxiliary reference from the vnode.
411  */
412 void
413 vdrop(struct vnode *vp)
414 {
415 	atomic_add_int(&vp->v_auxrefs, -1);
416 }
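#if 0
/*
 * Illustrative sketch (not part of the original file): an auxiliary
 * (vhold) reference keeps the struct vnode from being kfree()'d but does
 * not prevent deactivation or reclamation, so the holder must still
 * revalidate with vget() before using the vnode.  The function name is
 * hypothetical.
 */
static int
example_use_held_vnode(struct vnode *vp)
{
	int error;

	vhold(vp);			/* memory cannot be destroyed */
	error = vget(vp, LK_SHARED);	/* fails (ENOENT) if reclaimed */
	if (error == 0)
		vput(vp);
	vdrop(vp);
	return (error);
}
#endif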
417 
418 /*
419  * This function is called on the 1->0 transition (which is actually
420  * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
421  * of the vnode.
422  *
423  * Additional vrefs are allowed to race but will not result in a reentrant
424  * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
425  * prevents additional 1->0 transitions.
426  *
427  * ONLY A VGET() CAN REACTIVATE THE VNODE.
428  *
429  * Caller must hold the VX lock.
430  *
431  * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
432  *
433  * NOTE: The vnode may be marked inactive with dirty buffers
434  *	 or dirty pages in its cached VM object still present.
435  *
436  * NOTE: VS_FREE should not be set on entry (the vnode was expected to
437  *	 previously be active).  We lose control of the vnode the instant
438  *	 it is placed on the free list.
439  *
440  *	 The VX lock is required when transitioning to VS_CACHED but is
441  *	 not sufficient for the vshouldfree() interlocked test or when
442  *	 transitioning away from VS_CACHED.  v_spin is also required for
443  *	 those cases.
444  */
445 static
446 void
447 vnode_terminate(struct vnode *vp)
448 {
449 	KKASSERT(vp->v_state == VS_ACTIVE);
450 
451 	if ((vp->v_flag & VINACTIVE) == 0) {
452 		_vsetflags(vp, VINACTIVE);
453 		if (vp->v_mount)
454 			VOP_INACTIVE(vp);
455 		/* might deactivate page */
456 	}
457 	spin_lock(&vp->v_spin);
458 	_vinactive(vp);
459 	spin_unlock(&vp->v_spin);
460 
461 	vx_unlock(vp);
462 }
463 
464 /****************************************************************
465  *			VX LOCKING FUNCTIONS			*
466  ****************************************************************
467  *
468  * These functions lock vnodes for reclamation and deactivation related
469  * activities.  The caller must already be holding some sort of reference
470  * on the vnode.
471  */
472 void
473 vx_lock(struct vnode *vp)
474 {
475 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
476 }
477 
478 void
479 vx_unlock(struct vnode *vp)
480 {
481 	lockmgr(&vp->v_lock, LK_RELEASE);
482 }
483 
484 /****************************************************************
485  *			VNODE ACQUISITION FUNCTIONS		*
486  ****************************************************************
487  *
488  * These functions must be used when accessing a vnode that has no
489  * chance of being destroyed in an SMP race.  That means the caller will
490  * usually either hold an auxiliary reference (such as the namecache)
491  * or hold some other lock that ensures that the vnode cannot be destroyed.
492  *
493  * These functions are MANDATORY for any code chain accessing a vnode
494  * whose activation state is not known.
495  *
496  * vget() can be called with LK_NOWAIT and will return EBUSY if the
497  * lock cannot be immediately acquired.
498  *
499  * vget()/vput() are used when reactivation is desired.
500  *
501  * vx_get() and vx_put() are used when reactivation is not desired.
502  */
503 int
504 vget(struct vnode *vp, int flags)
505 {
506 	int error;
507 
508 	/*
509 	 * A lock type must be passed
510 	 */
511 	if ((flags & LK_TYPE_MASK) == 0) {
512 		panic("vget() called with no lock specified!");
513 		/* NOT REACHED */
514 	}
515 
516 	/*
517 	 * Reference the structure and then acquire the lock.
518 	 *
519 	 * NOTE: The requested lock might be a shared lock and does
520 	 *	 not protect our access to the refcnt or other fields.
521 	 */
522 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
523 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
524 
525 	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
526 		/*
527 		 * The lock failed, undo and return an error.  This will not
528 		 * normally trigger a termination.
529 		 */
530 		vrele(vp);
531 	} else if (vp->v_flag & VRECLAIMED) {
532 		/*
533 		 * The node is being reclaimed and cannot be reactivated
534 		 * any more, undo and return ENOENT.
535 		 */
536 		vn_unlock(vp);
537 		vrele(vp);
538 		error = ENOENT;
539 	} else if (vp->v_state == VS_ACTIVE) {
540 		/*
541 		 * A VS_ACTIVE vnode coupled with the fact that we have
542 		 * a vnode lock (even if shared) prevents v_state from
543 		 * changing.  Since the vnode is not in a VRECLAIMED state,
544 		 * we can safely clear VINACTIVE.
545 		 *
546 		 * NOTE! Multiple threads may clear VINACTIVE if this is
547 		 *	 a shared lock.  This race is allowed.
548 		 */
549 		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
550 		vp->v_act += VACT_INC;
551 		if (vp->v_act > VACT_MAX)	/* SMP race ok */
552 			vp->v_act = VACT_MAX;
553 		error = 0;
554 	} else {
555 		/*
556 		 * If the vnode is not VS_ACTIVE it must be reactivated
557 		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
558 		 * is needed to manipulate the vnode's list.
559 		 *
560 		 * Because the lockmgr lock might be shared, we might race
561 		 * another reactivation, which we handle.  In this situation,
562 		 * however, the refcnt prevents other v_state races.
563 		 *
564 		 * As with above, clearing VINACTIVE is allowed to race other
565 		 * clearings of VINACTIVE.
566 		 *
567 		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
568 		 * the refcnt is non-zero and the vnode has not been
569 		 * reclaimed.  This also means that the transitions do
570 		 * not affect cachedvnodes.
571 		 */
572 		_vclrflags(vp, VINACTIVE);
573 		vp->v_act += VACT_INC;
574 		if (vp->v_act > VACT_MAX)	/* SMP race ok */
575 			vp->v_act = VACT_MAX;
576 		spin_lock(&vp->v_spin);
577 
578 		switch(vp->v_state) {
579 		case VS_INACTIVE:
580 			_vactivate(vp);
581 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
582 							VREF_FINALIZE);
583 			spin_unlock(&vp->v_spin);
584 			break;
585 		case VS_CACHED:
586 			_vactivate(vp);
587 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
588 							VREF_FINALIZE);
589 			spin_unlock(&vp->v_spin);
590 			break;
591 		case VS_ACTIVE:
592 			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE);
593 			spin_unlock(&vp->v_spin);
594 			break;
595 		case VS_DYING:
596 			spin_unlock(&vp->v_spin);
597 			panic("Impossible VS_DYING state");
598 			break;
599 		}
600 		error = 0;
601 	}
602 	return(error);
603 }
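#if 0
/*
 * Illustrative sketch (not part of the original file): typical use of
 * vget()/vput() as described in the section comment above.  The caller
 * is assumed to hold something (e.g. a namecache entry) that prevents
 * the vnode from being destroyed while it attempts a non-blocking
 * activation.  The function name is hypothetical.
 */
static int
example_try_activate(struct vnode *vp)
{
	int error;

	error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		/* vnode is referenced, locked, and (re)activated */
		vput(vp);
	}
	/* EBUSY: lock not immediately available; ENOENT: vnode reclaimed */
	return (error);
}
#endif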
604 
605 #ifdef DEBUG_VPUT
606 
607 void
608 debug_vput(struct vnode *vp, const char *filename, int line)
609 {
610 	kprintf("vput(%p) %s:%d\n", vp, filename, line);
611 	vn_unlock(vp);
612 	vrele(vp);
613 }
614 
615 #else
616 
617 void
618 vput(struct vnode *vp)
619 {
620 	vn_unlock(vp);
621 	vrele(vp);
622 }
623 
624 #endif
625 
626 /*
627  * Acquire the vnode lock unguarded.
628  *
629  * The non-blocking version also uses a slightly different mechanic.
630  * This function will explicitly fail not only if it cannot acquire
631  * the lock normally, but also if the caller already holds a lock.
632  *
633  * The adjusted mechanic is used to close a loophole where complex
634  * VOP_RECLAIM code can circle around recursively and allocate the
635  * same vnode it is trying to destroy from the freelist.
636  *
637  * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
638  * cause the incorrect behavior to occur.  If not for that lockmgr()
639  * would do the right thing.
640  *
641  * XXX The vx_*() locks should use auxrefs, not the main reference counter.
642  */
643 void
644 vx_get(struct vnode *vp)
645 {
646 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
647 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
648 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
649 }
650 
651 int
652 vx_get_nonblock(struct vnode *vp)
653 {
654 	int error;
655 
656 	if (lockinuse(&vp->v_lock))
657 		return(EBUSY);
658 	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
659 	if (error == 0) {
660 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
661 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
662 	}
663 	return(error);
664 }
665 
666 /*
667  * Release a VX lock that also held a ref on the vnode.  vrele() will handle
668  * any needed state transitions.
669  *
670  * However, filesystems use this function to get rid of unwanted new
671  * vnodes, so in that case we try to get the vnode onto the correct queue.
672  */
673 void
674 vx_put(struct vnode *vp)
675 {
676 	if (vp->v_type == VNON || vp->v_type == VBAD)
677 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
678 	lockmgr(&vp->v_lock, LK_RELEASE);
679 	vrele(vp);
680 }
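#if 0
/*
 * Illustrative sketch (not part of the original file): how a filesystem
 * might discard a freshly allocated vnode it no longer wants, per the
 * comment above.  allocvnode() (below) returns the vnode VX locked,
 * referenced, and still VNON, so vx_put() flags it for finalization and
 * releases both the lock and the ref.  The function name and error value
 * are hypothetical.
 */
static int
example_discard_new_vnode(void)
{
	struct vnode *vp;

	vp = allocvnode(0, 0);
	/* ... suppose filesystem-specific initialization fails here ... */
	vx_put(vp);		/* v_type is VNON -> VREF_FINALIZE is set */
	return (EINVAL);
}
#endif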
681 
682 /*
683  * Try to reuse a vnode from the free list.  This function is somewhat
684  * advisory in that NULL can be returned as a normal case, even if free
685  * vnodes are present.
686  *
687  * The scan is limited because it can result in excessive CPU use during
688  * periods of extreme vnode use.
689  *
690  * NOTE: The returned vnode is not completely initialized.
691  */
692 static
693 struct vnode *
694 cleanfreevnode(int maxcount)
695 {
696 	struct vnode_index *vi;
697 	struct vnode *vp;
698 	int count;
699 	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
700 	int ri;
701 	int cpu_count;
702 
703 	/*
704 	 * Try to deactivate some vnodes cached on the active list.
705 	 */
706 	if (countcachedvnodes(0) < inactivevnodes)
707 		goto skip;
708 
709 	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;
710 
711 	for (count = 0; count < maxcount * 2; ++count, ++ri) {
712 		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];
713 
714 		spin_lock(&vi->spin);
715 
716 		vp = TAILQ_NEXT(&vi->active_rover, v_list);
717 		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
718 		if (vp == NULL) {
719 			TAILQ_INSERT_HEAD(&vi->active_list,
720 					  &vi->active_rover, v_list);
721 		} else {
722 			TAILQ_INSERT_AFTER(&vi->active_list, vp,
723 					   &vi->active_rover, v_list);
724 		}
725 		if (vp == NULL) {
726 			spin_unlock(&vi->spin);
727 			continue;
728 		}
729 		if ((vp->v_refcnt & VREF_MASK) != 0) {
730 			spin_unlock(&vi->spin);
731 			vp->v_act += VACT_INC;
732 			if (vp->v_act > VACT_MAX)	/* SMP race ok */
733 				vp->v_act = VACT_MAX;
734 			continue;
735 		}
736 
737 		/*
738 		 * decrement by less if the vnode's object has a lot of
739 		 * VM pages.  XXX possible SMP races.
740 		 */
741 		if (vp->v_act > 0) {
742 			vm_object_t obj;
743 			if ((obj = vp->v_object) != NULL &&
744 			    obj->resident_page_count >= trigger) {
745 				vp->v_act -= 1;
746 			} else {
747 				vp->v_act -= VACT_INC;
748 			}
749 			if (vp->v_act < 0)
750 				vp->v_act = 0;
751 			spin_unlock(&vi->spin);
752 			continue;
753 		}
754 
755 		/*
756 		 * Try to deactivate the vnode.
757 		 */
758 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
759 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
760 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
761 
762 		spin_unlock(&vi->spin);
763 		vrele(vp);
764 	}
765 
766 	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;
767 
768 skip:
769 	/*
770 	 * Loop trying to lock the first vnode on the free list.
771 	 * Cycle if we can't.
772 	 */
773 	cpu_count = ncpus;
774 	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;
775 
776 	for (count = 0; count < maxcount; ++count, ++ri) {
777 		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];
778 
779 		spin_lock(&vi->spin);
780 
781 		vp = TAILQ_FIRST(&vi->inactive_list);
782 		if (vp == NULL) {
783 			spin_unlock(&vi->spin);
784 			if (--cpu_count == 0)
785 				break;
786 			ri = (ri + 16) & ~15;
787 			--ri;
788 			continue;
789 		}
790 
791 		/*
792 		 * non-blocking vx_get will also ref the vnode on success.
793 		 */
794 		if (vx_get_nonblock(vp)) {
795 			KKASSERT(vp->v_state == VS_INACTIVE);
796 			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
797 			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
798 			spin_unlock(&vi->spin);
799 			continue;
800 		}
801 
802 		/*
803 		 * Because we are holding vi->spin the vnode should currently
804 		 * be inactive and VREF_TERMINATE should still be set.
805 		 *
806 		 * Once vi->spin is released the vnode's state should remain
807 		 * unmodified due to both the lock and ref on it.
808 		 */
809 		KKASSERT(vp->v_state == VS_INACTIVE);
810 		spin_unlock(&vi->spin);
811 #ifdef TRACKVNODE
812 		if ((u_long)vp == trackvnode)
813 			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
814 #endif
815 
816 		/*
817 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
818 		 * This includes namecache refs due to a related ncp being
819 		 * locked or having children, a VM object association, or
820 		 * other hold users.
821 		 *
822 		 * Do not reclaim/reuse a vnode if someone else has a real
823 		 * ref on it.  This can occur if a filesystem temporarily
824 		 * releases the vnode lock during VOP_RECLAIM.
825 		 */
826 		if (vp->v_auxrefs ||
827 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
828 failed:
829 			if (vp->v_state == VS_INACTIVE) {
830 				spin_lock(&vi->spin);
831 				if (vp->v_state == VS_INACTIVE) {
832 					TAILQ_REMOVE(&vi->inactive_list,
833 						     vp, v_list);
834 					TAILQ_INSERT_TAIL(&vi->inactive_list,
835 							  vp, v_list);
836 				}
837 				spin_unlock(&vi->spin);
838 			}
839 			vx_put(vp);
840 			continue;
841 		}
842 
843 		/*
844 		 * VINACTIVE and VREF_TERMINATE are expected to both be set
845 		 * for vnodes pulled from the inactive list, and cannot be
846 		 * changed while we hold the vx lock.
847 		 *
848 		 * Try to reclaim the vnode.
849 		 */
850 		KKASSERT(vp->v_flag & VINACTIVE);
851 		KKASSERT(vp->v_refcnt & VREF_TERMINATE);
852 
853 		if ((vp->v_flag & VRECLAIMED) == 0) {
854 			if (cache_inval_vp_nonblock(vp))
855 				goto failed;
856 			vgone_vxlocked(vp);
857 			/* vnode is still VX locked */
858 		}
859 
860 		/*
861 		 * At this point, if the vnode has no other refs or auxrefs
862 		 * while the inactive list is locked, and we then remove it
863 		 * from the inactive list, it should not be possible for
864 		 * anyone else to access the vnode any more.
865 		 *
866 		 * Since the vnode is in a VRECLAIMED state, no new
867 		 * namecache associations could have been made and the
868 		 * vnode should have already been removed from its mountlist.
869 		 *
870 		 * Since we hold a VX lock on the vnode it cannot have been
871 		 * reactivated (moved out of the inactive list).
872 		 */
873 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
874 		spin_lock(&vi->spin);
875 		if (vp->v_auxrefs ||
876 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
877 			spin_unlock(&vi->spin);
878 			goto failed;
879 		}
880 		KKASSERT(vp->v_state == VS_INACTIVE);
881 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
882 		atomic_add_int(&inactivevnodes, -1);
883 		vp->v_state = VS_DYING;
884 		spin_unlock(&vi->spin);
885 
886 		/*
887 		 * Nothing should have been able to access this vp.  Only
888 		 * our ref should remain now.
889 		 */
890 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
891 		KASSERT(vp->v_refcnt == 1,
892 			("vp %p badrefs %08x", vp, vp->v_refcnt));
893 
894 		/*
895 		 * Return a VX locked vnode suitable for reuse.
896 		 */
897 		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
898 		return(vp);
899 	}
900 	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
901 	return(NULL);
902 }
903 
904 /*
905  * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
906  *
907  * All new vnodes set the VAGE flags.  An open() of the vnode will
908  * decrement the (2-bit) flags.  Vnodes which are opened several times
909  * are thus retained in the cache over vnodes which are merely stat()d.
910  *
911  * We always allocate the vnode.  Attempting to recycle existing vnodes
912  * here can lead to numerous deadlocks, particularly with softupdates.
913  */
914 struct vnode *
915 allocvnode(int lktimeout, int lkflags)
916 {
917 	struct vnode *vp;
918 
919 	/*
920 	 * Do not flag for synchronous recyclement unless there are enough
921 	 * freeable vnodes to recycle and the number of vnodes has
922 	 * significantly exceeded our target.  We want the normal vnlru
923 	 * process to handle the cleaning (at 9/10's) before we are forced
924 	 * to flag it here at 11/10's for userexit path processing.
925 	 */
926 	if (numvnodes >= maxvnodes * 11 / 10 &&
927 	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
928 		struct thread *td = curthread;
929 		if (td->td_lwp)
930 			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
931 	}
932 
933 	/*
934 	 * lktimeout only applies when LK_TIMELOCK is used, and only
935 	 * the pageout daemon uses it.  The timeout may not be zero
936 	 * or the pageout daemon can deadlock in low-VM situations.
937 	 */
938 	if (lktimeout == 0)
939 		lktimeout = hz / 10;
940 
941 	vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
942 
943 	lwkt_token_init(&vp->v_token, "vnode");
944 	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
945 	TAILQ_INIT(&vp->v_namecache);
946 	RB_INIT(&vp->v_rbclean_tree);
947 	RB_INIT(&vp->v_rbdirty_tree);
948 	RB_INIT(&vp->v_rbhash_tree);
949 	spin_init(&vp->v_spin, "allocvnode");
950 
951 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
952 	atomic_add_int(&numvnodes, 1);
953 	vp->v_refcnt = 1;
954 	vp->v_flag = VAGE0 | VAGE1;
955 	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
956 
957 	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
958 	/* exclusive lock still held */
959 
960 	vp->v_filesize = NOOFFSET;
961 	vp->v_type = VNON;
962 	vp->v_tag = 0;
963 	vp->v_state = VS_CACHED;
964 	_vactivate(vp);
965 
966 	return (vp);
967 }
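#if 0
/*
 * Illustrative sketch (not part of the original file): VAGE0/VAGE1 act
 * as a 2-bit age counter, initialized to 3 by allocvnode() above.  One
 * way an open path could decrement it; the real decrement lives in the
 * open code, not in this file, and the bit ordering here is an
 * assumption for illustration.
 */
static void
example_vage_decrement(struct vnode *vp)
{
	/* treat VAGE1 as the high bit and VAGE0 as the low bit: 3->2->1->0 */
	if (vp->v_flag & VAGE0) {
		vclrflags(vp, VAGE0);
	} else if (vp->v_flag & VAGE1) {
		vclrflags(vp, VAGE1);
		vsetflags(vp, VAGE0);
	}
	/*
	 * Per the allocvnode() comment above, repeatedly-opened vnodes
	 * thus outlive vnodes which are merely stat()d.
	 */
}
#endif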
968 
969 /*
970  * Called after a process has allocated a vnode via allocvnode()
971  * and we detected that too many vnodes were present.
972  *
973  * This function is called just prior to a return to userland if the
974  * process at some point had to allocate a new vnode during the last
975  * system call and the vnode count was found to be excessive.
976  *
977  * This is a synchronous path that we do not normally want to execute.
978  *
979  * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
980  *
981  * WARNING: Sometimes numvnodes can blow out due to children being
982  *	    present under directory vnodes in the namecache.  For the
983  *	    moment use an if() instead of a while() and note that if
984  *	    we were to use a while() we would still have to break out
985  *	    if freesomevnodes() returned 0.  vnlru will also be trying
986  *	    hard to free vnodes at the same time (with a lower trigger
987  *	    point).
988  */
989 void
990 allocvnode_gc(void)
991 {
992 	if (numvnodes >= maxvnodes &&
993 	    countcachedvnodes(0) + inactivevnodes >= maxvnodes * 5 / 10) {
994 		freesomevnodes(batchfreevnodes);
995 	}
996 }
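#if 0
/*
 * Illustrative arithmetic only (not part of the original file): the
 * thresholds described above for a hypothetical maxvnodes of 100000.
 * vnlru runs at 9/10, allocvnode_gc() frees at 10/10, and allocvnode()
 * sets the LWP_MP_VNLRU flag at 11/10 (with >= 5/10 freeable).
 */
static void
example_thresholds(void)
{
	int example_maxvnodes = 100000;			/* hypothetical */
	int vnlru_at = example_maxvnodes * 9 / 10;	/*  90000 */
	int gc_at    = example_maxvnodes;		/* 100000 */
	int flag_at  = example_maxvnodes * 11 / 10;	/* 110000 */
	int freeable = example_maxvnodes * 5 / 10;	/*  50000 */

	kprintf("vnlru %d, gc %d, flag %d, freeable %d\n",
		vnlru_at, gc_at, flag_at, freeable);
}
#endif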
997 
998 int
999 freesomevnodes(int n)
1000 {
1001 	struct vnode *vp;
1002 	int count = 0;
1003 
1004 	while (n) {
1005 		if ((vp = cleanfreevnode(n)) == NULL)
1006 			break;
1007 		vx_unlock(vp);
1008 		--n;
1009 		++count;
1010 		kfree(vp, M_VNODE);
1011 		atomic_add_int(&numvnodes, -1);
1012 	}
1013 	return(count);
1014 }
1015