xref: /dragonfly/sys/kern/vfs_lock.c (revision 0b2c5ee3)
1 /*
2  * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * External lock/ref-related vnode functions
37  *
38  * vs_state transition locking requirements:
39  *
40  *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
41  *	DYING    -> CACHED		vx_lock(excl)
42  *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
43  *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
44  *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
45  *
46  * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
47  *
48  *	 Switching into ACTIVE also requires a vref and vnode lock, however
49  *	 the vnode lock is allowed to be SHARED.
50  *
51  *	 Switching into a CACHED or DYING state requires an exclusive vnode
52  *	 lock or vx_lock (which is almost the same thing but not quite).
53  */
54 
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/kernel.h>
58 #include <sys/malloc.h>
59 #include <sys/mount.h>
60 #include <sys/proc.h>
61 #include <sys/vnode.h>
62 #include <sys/spinlock2.h>
63 #include <sys/sysctl.h>
64 
65 #include <machine/limits.h>
66 
67 #include <vm/vm.h>
68 #include <vm/vm_object.h>
69 
70 #define VACT_MAX	10
71 #define VACT_INC	2
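/*
 * v_act is a small aging counter: vget() bumps it by VACT_INC up to
 * VACT_MAX, and the active-list rover in cleanfreevnode() decays it.
 * A vnode only becomes a deactivation candidate once v_act reaches 0.
 */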
72 
73 static void vnode_terminate(struct vnode *vp);
74 
75 static MALLOC_DEFINE_OBJ(M_VNODE, sizeof(struct vnode), "vnodes", "vnodes");
76 static MALLOC_DEFINE(M_VNODE_HASH, "vnodelsthash", "vnode list hash");
77 
78 /*
79  * The vnode free list holds inactive vnodes.  Reclaimed (basically
80  * dead) vnodes are inserted at the head of the list and all other
81  * inactive vnodes at the tail.
82  *
83  * The vnode code goes to great lengths to avoid moving vnodes between
84  * lists, but sometimes it is unavoidable.  For this situation we try to
85  * avoid lock contention but we do not try very hard to avoid cache line
86  * congestion.  A modestly sized hash table is used.
87  */
88 #define VLIST_PRIME2	123462047LU
89 #define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU
90 
91 #define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
92 			 VLIST_PRIME2 % (unsigned)ncpus)
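/*
 * VLIST_HASH() scrambles the vnode pointer with a fixed XOR constant,
 * folds it over a large prime, and reduces the result modulo ncpus to
 * select a per-cpu vnode_index.  A given vnode always maps to the same
 * index, so the bucket it belongs to never changes over its lifetime.
 */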
93 
94 static struct vnode_index *vnode_list_hash;
95 
96 int  activevnodes = 0;
97 SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
98 	&activevnodes, 0, "Number of active nodes");
99 int  cachedvnodes = 0;
100 SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
101 	&cachedvnodes, 0, "Number of total cached nodes");
102 int  inactivevnodes = 0;
103 SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
104 	&inactivevnodes, 0, "Number of inactive nodes");
105 static int batchfreevnodes = 5;
106 SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
107 	&batchfreevnodes, 0, "Number of vnodes to free at once");
108 #ifdef TRACKVNODE
109 static u_long trackvnode;
110 SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
111 		&trackvnode, 0, "");
112 #endif
113 
114 /*
115  * Called from vfsinit()
116  */
117 void
118 vfs_lock_init(void)
119 {
120 	int i;
121 
122 	kmalloc_obj_raise_limit(M_VNODE, 0);	/* unlimited */
123 	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
124 				  M_VNODE_HASH, M_ZERO | M_WAITOK);
125 	for (i = 0; i < ncpus; ++i) {
126 		struct vnode_index *vi = &vnode_list_hash[i];
127 
128 		TAILQ_INIT(&vi->inactive_list);
129 		TAILQ_INIT(&vi->active_list);
130 		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
131 		spin_init(&vi->spin, "vfslock");
132 	}
133 }
134 
135 /*
136  * Misc functions
137  */
138 static __inline
139 void
140 _vsetflags(struct vnode *vp, int flags)
141 {
142 	atomic_set_int(&vp->v_flag, flags);
143 }
144 
145 static __inline
146 void
147 _vclrflags(struct vnode *vp, int flags)
148 {
149 	atomic_clear_int(&vp->v_flag, flags);
150 }
151 
152 void
153 vsetflags(struct vnode *vp, int flags)
154 {
155 	_vsetflags(vp, flags);
156 }
157 
158 void
159 vclrflags(struct vnode *vp, int flags)
160 {
161 	_vclrflags(vp, flags);
162 }
163 
164 /*
165  * Place the vnode on the active list.
166  *
167  * Caller must hold vp->v_spin
168  */
169 static __inline
170 void
171 _vactivate(struct vnode *vp)
172 {
173 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
174 
175 #ifdef TRACKVNODE
176 	if ((u_long)vp == trackvnode)
177 		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
178 #endif
179 	spin_lock(&vi->spin);
180 
181 	switch(vp->v_state) {
182 	case VS_ACTIVE:
183 		spin_unlock(&vi->spin);
184 		panic("_vactivate: already active");
185 		/* NOT REACHED */
186 		return;
187 	case VS_INACTIVE:
188 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
189 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
190 		break;
191 	case VS_CACHED:
192 	case VS_DYING:
193 		break;
194 	}
195 	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
196 	vp->v_state = VS_ACTIVE;
197 	spin_unlock(&vi->spin);
198 	atomic_add_int(&mycpu->gd_activevnodes, 1);
199 }
200 
201 /*
202  * Put a vnode on the inactive list.
203  *
204  * Caller must hold v_spin
205  */
206 static __inline
207 void
208 _vinactive(struct vnode *vp)
209 {
210 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
211 
212 #ifdef TRACKVNODE
213 	if ((u_long)vp == trackvnode) {
214 		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
215 		print_backtrace(-1);
216 	}
217 #endif
218 	spin_lock(&vi->spin);
219 
220 	/*
221 	 * Remove from active list if it is sitting on it
222 	 */
223 	switch(vp->v_state) {
224 	case VS_ACTIVE:
225 		TAILQ_REMOVE(&vi->active_list, vp, v_list);
226 		atomic_add_int(&mycpu->gd_activevnodes, -1);
227 		break;
228 	case VS_INACTIVE:
229 		spin_unlock(&vi->spin);
230 		panic("_vinactive: already inactive");
231 		/* NOT REACHED */
232 		return;
233 	case VS_CACHED:
234 	case VS_DYING:
235 		break;
236 	}
237 
238 	/*
239 	 * Distinguish between basically dead vnodes, vnodes with cached
240 	 * data, and vnodes without cached data.  A rover will shift the
241 	 * vnodes around as their cache status is lost.
242 	 */
243 	if (vp->v_flag & VRECLAIMED) {
244 		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
245 	} else {
246 		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
247 	}
248 	vp->v_state = VS_INACTIVE;
249 	spin_unlock(&vi->spin);
250 	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
251 }
252 
253 /*
254  * Add a ref to an active vnode.  This function should never be called
255  * with an inactive vnode (use vget() instead), but might be called
256  * with other states.
257  */
258 void
259 vref(struct vnode *vp)
260 {
261 	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
262 		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
263 	atomic_add_int(&vp->v_refcnt, 1);
264 }
265 
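/*
 * Add a ref to a vnode which may currently have a zero ref count, such
 * as one being pulled off a free/inactive list.  On the 0->1 transition
 * the vnode no longer counts as 'cached', so the per-cpu cachedvnodes
 * accumulator is adjusted.  Unlike vref(), no assertion is made about
 * the vnode's current state.
 */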
266 void
267 vref_special(struct vnode *vp)
268 {
269 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
270 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
271 }
272 
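/*
 * Fold the per-cpu vnode accumulators into the global, sysctl-visible
 * totals (cachedvnodes, activevnodes, inactivevnodes).
 */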
273 void
274 synchronizevnodecount(void)
275 {
276 	int nca = 0;
277 	int act = 0;
278 	int ina = 0;
279 	int i;
280 
281 	for (i = 0; i < ncpus; ++i) {
282 		globaldata_t gd = globaldata_find(i);
283 		nca += gd->gd_cachedvnodes;
284 		act += gd->gd_activevnodes;
285 		ina += gd->gd_inactivevnodes;
286 	}
287 	cachedvnodes = nca;
288 	activevnodes = act;
289 	inactivevnodes = ina;
290 }
291 
292 /*
293  * Count the number of cached vnodes.  This is moderately expensive, so
294  * avoid making this call in the critical path.  Each cpu tracks
295  * its own accumulator.  The individual accumulators must be summed
296  * together to get an accurate value.
297  */
298 int
299 countcachedvnodes(void)
300 {
301 	int i;
302 	int n = 0;
303 
304 	for (i = 0; i < ncpus; ++i) {
305 		globaldata_t gd = globaldata_find(i);
306 		n += gd->gd_cachedvnodes;
307 	}
308 	return n;
309 }
310 
311 int
312 countcachedandinactivevnodes(void)
313 {
314 	int i;
315 	int n = 0;
316 
317 	for (i = 0; i < ncpus; ++i) {
318 		globaldata_t gd = globaldata_find(i);
319 		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
320 	}
321 	return n;
322 }
323 
324 /*
325  * Release a ref on an active or inactive vnode.
326  *
327  * Caller has no other requirements.
328  *
329  * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
330  * transition, otherwise we leave the vnode in the active list and
331  * do a lockless transition to 0, which is very important for the
332  * critical path.
333  *
334  * (vrele() is not called when a vnode is being destroyed w/kfree)
335  */
336 void
337 vrele(struct vnode *vp)
338 {
339 	int count;
340 
341 #if 1
342 	count = vp->v_refcnt;
343 	cpu_ccfence();
344 
345 	for (;;) {
346 		KKASSERT((count & VREF_MASK) > 0);
347 		KKASSERT(vp->v_state == VS_ACTIVE ||
348 			 vp->v_state == VS_INACTIVE);
349 
350 		/*
351 		 * 2+ case
352 		 */
353 		if ((count & VREF_MASK) > 1) {
354 			if (atomic_fcmpset_int(&vp->v_refcnt,
355 					       &count, count - 1)) {
356 				break;
357 			}
358 			continue;
359 		}
360 
361 		/*
362 		 * 1->0 transition case must handle possible finalization.
363 		 * When finalizing we transition 1->0x40000000.  Note that
364 		 * cachedvnodes is only adjusted on transitions to ->0.
365 		 *
366 		 * WARNING! VREF_TERMINATE can be cleared at any point
367 		 *	    when the refcnt is non-zero (by vget()) and
368 		 *	    the vnode has not been reclaimed.  Thus
369 		 *	    transitions out of VREF_TERMINATE do not have
370 		 *	    to mess with cachedvnodes.
371 		 */
372 		if (count & VREF_FINALIZE) {
373 			vx_lock(vp);
374 			if (atomic_fcmpset_int(&vp->v_refcnt,
375 					      &count, VREF_TERMINATE)) {
376 				vnode_terminate(vp);
377 				break;
378 			}
379 			vx_unlock(vp);
380 		} else {
381 			if (atomic_fcmpset_int(&vp->v_refcnt, &count, 0)) {
382 				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
383 				break;
384 			}
385 		}
386 		cpu_pause();
387 		/* retry */
388 	}
389 #else
390 	/*
391 	 * XXX NOT YET WORKING!  Multiple threads can reference the vnode
392 	 * after dropping their count, racing destruction, because this
393 	 * code is not directly transitioning from 1->VREF_FINALIZE.
394 	 */
395         /*
396          * Drop the ref-count.  On the 1->0 transition we check VREF_FINALIZE
397          * and attempt to acquire VREF_TERMINATE if set.  It is possible for
398          * concurrent vref/vrele to race and bounce 0->1, 1->0, etc, but
399          * only one will be able to transition the vnode into the
400          * VREF_TERMINATE state.
401          *
402          * NOTE: VREF_TERMINATE is *in* VREF_MASK, so the vnode may only enter
403          *       this state once.
404          */
405         count = atomic_fetchadd_int(&vp->v_refcnt, -1);
406         if ((count & VREF_MASK) == 1) {
407                 atomic_add_int(&mycpu->gd_cachedvnodes, 1);
408                 --count;
409                 while ((count & (VREF_MASK | VREF_FINALIZE)) == VREF_FINALIZE) {
410                         vx_lock(vp);
411                         if (atomic_fcmpset_int(&vp->v_refcnt,
412                                                &count, VREF_TERMINATE)) {
413                                 atomic_add_int(&mycpu->gd_cachedvnodes, -1);
414                                 vnode_terminate(vp);
415                                 break;
416                         }
417                         vx_unlock(vp);
418                 }
419         }
420 #endif
421 }
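/*
 * Illustrative sketch (hypothetical helper, compiled out): vref()/vrele()
 * bracket access to a vnode that is already known to be referenced and
 * active, e.g. when handing it to another consumer.
 */
#if 0
static void
example_extra_ref(struct vnode *vp)	/* vp already active + referenced */
{
	vref(vp);	/* cheap atomic increment, no lock required */
	/* ... use vp or hand it to an asynchronous consumer ... */
	vrele(vp);	/* drop the extra ref */
}
#endif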
422 
423 /*
424  * Add an auxiliary data structure reference to the vnode.  Auxiliary
425  * references do not change the state of the vnode or prevent deactivation
426  * or reclamation of the vnode, but will prevent the vnode from being
427  * destroyed (kfree()'d).
428  *
429  * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
430  *	     already be held by the caller.  vdrop() will clean up the
431  *	     free list state.
432  */
433 void
434 vhold(struct vnode *vp)
435 {
436 	atomic_add_int(&vp->v_auxrefs, 1);
437 }
438 
439 /*
440  * Remove an auxiliary reference from the vnode.
441  */
442 void
443 vdrop(struct vnode *vp)
444 {
445 	atomic_add_int(&vp->v_auxrefs, -1);
446 }
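/*
 * Illustrative sketch (hypothetical helper, compiled out): an auxiliary
 * reference keeps the vnode structure from being destroyed across a
 * blocking operation without preventing deactivation or reclamation.
 */
#if 0
static void
example_aux_hold(struct vnode *vp)
{
	vhold(vp);	/* vnode may be deactivated or reclaimed, but
			 * cannot be kfree()'d while we sleep */
	/* ... blocking operation that does not need the vnode lock ... */
	vdrop(vp);	/* release the auxiliary reference */
}
#endif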
447 
448 /*
449  * This function is called on the 1->0 transition (which is actually
450  * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
451  * of the vnode.
452  *
453  * Additional vrefs are allowed to race but will not result in a reentrant
454  * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
455  * prevents additional 1->0 transitions.
456  *
457  * ONLY A VGET() CAN REACTIVATE THE VNODE.
458  *
459  * Caller must hold the VX lock.
460  *
461  * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
462  *
463  * NOTE: The vnode may be marked inactive with dirty buffers
464  *	 or dirty pages in its cached VM object still present.
465  *
466  * NOTE: VS_FREE should not be set on entry (the vnode was expected to
467  *	 previously be active).  We lose control of the vnode the instant
468  *	 it is placed on the free list.
469  *
470  *	 The VX lock is required when transitioning to VS_CACHED but is
471  *	 not sufficient for the vshouldfree() interlocked test or when
472  *	 transitioning away from VS_CACHED.  v_spin is also required for
473  *	 those cases.
474  */
475 static
476 void
477 vnode_terminate(struct vnode *vp)
478 {
479 	KKASSERT(vp->v_state == VS_ACTIVE);
480 
481 	if ((vp->v_flag & VINACTIVE) == 0) {
482 		_vsetflags(vp, VINACTIVE);
483 		if (vp->v_mount)
484 			VOP_INACTIVE(vp);
485 	}
486 	spin_lock(&vp->v_spin);
487 	_vinactive(vp);
488 	spin_unlock(&vp->v_spin);
489 
490 	vx_unlock(vp);
491 }
492 
493 /****************************************************************
494  *			VX LOCKING FUNCTIONS			*
495  ****************************************************************
496  *
497  * These functions lock vnodes for reclamation and deactivation related
498  * activities.  The caller must already be holding some sort of reference
499  * on the vnode.
500  */
501 void
502 vx_lock(struct vnode *vp)
503 {
504 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
505 	spin_lock_update_only(&vp->v_spin);
506 }
507 
508 void
509 vx_unlock(struct vnode *vp)
510 {
511 	spin_unlock_update_only(&vp->v_spin);
512 	lockmgr(&vp->v_lock, LK_RELEASE);
513 }
514 
515 /*
516  * Downgrades a VX lock to a normal VN lock.  The lock remains EXCLUSIVE.
517  *
518  * Generally required after calling getnewvnode() if the intention is
519  * to return a normal locked vnode to the caller.
520  */
521 void
522 vx_downgrade(struct vnode *vp)
523 {
524 	spin_unlock_update_only(&vp->v_spin);
525 }
526 
527 /****************************************************************
528  *			VNODE ACQUISITION FUNCTIONS		*
529  ****************************************************************
530  *
531  * These functions must be used when accessing a vnode that has no
532  * chance of being destroyed in an SMP race.  That means the caller will
533  * usually either hold an auxiliary reference (such as the namecache)
534  * or hold some other lock that ensures that the vnode cannot be destroyed.
535  *
536  * These functions are MANDATORY for any code chain accessing a vnode
537  * whose activation state is not known.
538  *
539  * vget() can be called with LK_NOWAIT and will return EBUSY if the
540  * lock cannot be immediately acquired.
541  *
542  * vget()/vput() are used when reactivation is desired.
543  *
544  * vx_get() and vx_put() are used when reactivation is not desired.
545  */
546 int
547 vget(struct vnode *vp, int flags)
548 {
549 	int error;
550 
551 	/*
552 	 * A lock type must be passed
553 	 */
554 	if ((flags & LK_TYPE_MASK) == 0) {
555 		panic("vget() called with no lock specified!");
556 		/* NOT REACHED */
557 	}
558 
559 	/*
560 	 * Reference the structure and then acquire the lock.
561 	 *
562 	 * NOTE: The requested lock might be a shared lock and does
563 	 *	 not protect our access to the refcnt or other fields.
564 	 */
565 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
566 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
567 
568 	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
569 		/*
570 		 * The lock failed, undo and return an error.  This will not
571 		 * normally trigger a termination.
572 		 */
573 		vrele(vp);
574 	} else if (vp->v_flag & VRECLAIMED) {
575 		/*
576 		 * The node is being reclaimed and cannot be reactivated
577 		 * any more, undo and return ENOENT.
578 		 */
579 		vn_unlock(vp);
580 		vrele(vp);
581 		error = ENOENT;
582 	} else if (vp->v_state == VS_ACTIVE) {
583 		/*
584 		 * A VS_ACTIVE vnode coupled with the fact that we have
585 		 * a vnode lock (even if shared) prevents v_state from
586 		 * changing.  Since the vnode is not in a VRECLAIMED state,
587 		 * we can safely clear VINACTIVE.
588 		 *
589 		 * It is possible for a shared lock to cause a race with
590 		 * another thread that is also in the process of clearing
591 		 * VREF_TERMINATE, meaning that we might return with it still
592 		 * set and then assert in a later vref().  The solution is to
593 		 * unconditionally clear VREF_TERMINATE here as well.
594 		 *
595 		 * NOTE! Multiple threads may clear VINACTIVE if this is
596 		 *	 shared lock.  This race is allowed.
597 		 */
598 		if (vp->v_flag & VINACTIVE)
599 			_vclrflags(vp, VINACTIVE);	/* SMP race ok */
600 		if (vp->v_act < VACT_MAX) {
601 			vp->v_act += VACT_INC;
602 			if (vp->v_act > VACT_MAX)	/* SMP race ok */
603 				vp->v_act = VACT_MAX;
604 		}
605 		error = 0;
606 		if (vp->v_refcnt & VREF_TERMINATE)	/* SMP race ok */
607 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
608 	} else {
609 		/*
610 		 * If the vnode is not VS_ACTIVE it must be reactivated
611 		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
612 		 * is needed to manipulate the vnode's list.
613 		 *
614 		 * Because the lockmgr lock might be shared, we might race
615 		 * another reactivation, which we handle.  In this situation,
616 		 * however, the refcnt prevents other v_state races.
617 		 *
618 		 * As with above, clearing VINACTIVE is allowed to race other
619 		 * clearings of VINACTIVE.
620 		 *
621 		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
622 		 * the refcnt is non-zero and the vnode has not been
623 		 * reclaimed.  This also means that the transitions do
624 		 * not affect cachedvnodes.
625 		 *
626 		 * It is possible for a shared lock to cause a race with
627 		 * another thread that is also in the process of clearing
628 		 * VREF_TERMINATE, meaning that we might return with it still
629 		 * set and then assert in a later vref().  The solution is to
630 		 * unconditionally clear VREF_TERMINATE here as well.
631 		 */
632 		_vclrflags(vp, VINACTIVE);
633 		vp->v_act += VACT_INC;
634 		if (vp->v_act > VACT_MAX)	/* SMP race ok */
635 			vp->v_act = VACT_MAX;
636 		spin_lock(&vp->v_spin);
637 
638 		switch(vp->v_state) {
639 		case VS_INACTIVE:
640 			_vactivate(vp);
641 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
642 							VREF_FINALIZE);
643 			spin_unlock(&vp->v_spin);
644 			break;
645 		case VS_CACHED:
646 			_vactivate(vp);
647 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
648 							VREF_FINALIZE);
649 			spin_unlock(&vp->v_spin);
650 			break;
651 		case VS_ACTIVE:
652 			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
653 							VREF_TERMINATE);
654 			spin_unlock(&vp->v_spin);
655 			break;
656 		case VS_DYING:
657 			spin_unlock(&vp->v_spin);
658 			panic("Impossible VS_DYING state");
659 			break;
660 		}
661 		error = 0;
662 	}
663 	return(error);
664 }
665 
666 #ifdef DEBUG_VPUT
667 
668 void
669 debug_vput(struct vnode *vp, const char *filename, int line)
670 {
671 	kprintf("vput(%p) %s:%d\n", vp, filename, line);
672 	vn_unlock(vp);
673 	vrele(vp);
674 }
675 
676 #else
677 
678 void
679 vput(struct vnode *vp)
680 {
681 	vn_unlock(vp);
682 	vrele(vp);
683 }
684 
685 #endif
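/*
 * Illustrative sketch (hypothetical helper, compiled out): the canonical
 * vget()/vput() pattern for a vnode reached via an auxiliary reference
 * such as a namecache entry.  A lock type must always be supplied;
 * ENOENT indicates the vnode was reclaimed out from under us.
 */
#if 0
static int
example_access_vnode(struct vnode *vp)
{
	int error;

	error = vget(vp, LK_SHARED);	/* ref + lock + reactivate */
	if (error)
		return (error);		/* reclaimed or lock failed */

	/* ... operate on the locked, referenced, active vnode ... */

	vput(vp);			/* vn_unlock() + vrele() */
	return (0);
}
#endif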
686 
687 /*
688  * Acquire the vnode lock unguarded.
689  *
690  * The non-blocking version also uses a slightly different mechanic.
691  * This function will explicitly fail not only if it cannot acquire
692  * the lock normally, but also if the caller already holds a lock.
693  *
694  * The adjusted mechanic is used to close a loophole where complex
695  * VOP_RECLAIM code can circle around recursively and allocate the
696  * same vnode it is trying to destroy from the freelist.
697  *
698  * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
699  * cause the incorrect behavior to occur.  If not for that lockmgr()
700  * would do the right thing.
701  *
702  * XXX The vx_*() locks should use auxrefs, not the main reference counter.
703  */
704 void
705 vx_get(struct vnode *vp)
706 {
707 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
708 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
709 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
710 	spin_lock_update_only(&vp->v_spin);
711 }
712 
713 int
714 vx_get_nonblock(struct vnode *vp)
715 {
716 	int error;
717 
718 	if (lockinuse(&vp->v_lock))
719 		return(EBUSY);
720 	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
721 	if (error == 0) {
722 		spin_lock_update_only(&vp->v_spin);
723 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
724 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
725 	}
726 	return(error);
727 }
728 
729 /*
730  * Release a VX lock that also held a ref on the vnode.  vrele() will handle
731  * any needed state transitions.
732  *
733  * However, filesystems use this function to get rid of unwanted new vnodes
734  * so try to get the vnode on the correct queue in that case.
735  */
736 void
737 vx_put(struct vnode *vp)
738 {
739 	if (vp->v_type == VNON || vp->v_type == VBAD)
740 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
741 	spin_unlock_update_only(&vp->v_spin);
742 	lockmgr(&vp->v_lock, LK_RELEASE);
743 	vrele(vp);
744 }
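/*
 * Illustrative sketch (hypothetical helper, compiled out): vx_get() and
 * vx_put() bracket deactivation/reclamation related work which must not
 * reactivate the vnode.  The caller is assumed to already hold something
 * (e.g. an auxiliary ref) preventing destruction of the vnode.
 */
#if 0
static void
example_vx_access(struct vnode *vp)
{
	vx_get(vp);	/* ref + exclusive vx lock, no reactivation */
	/* ... inspect or tear down filesystem-private state ... */
	vx_put(vp);	/* unlock + vrele() */
}
#endif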
745 
746 /*
747  * Try to reuse a vnode from the free list.  This function is somewhat
748  * advisory in that NULL can be returned as a normal case, even if free
749  * vnodes are present.
750  *
751  * The scan is limited because it can result in excessive CPU use during
752  * periods of extreme vnode use.
753  *
754  * NOTE: The returned vnode is not completely initialized.
755  *	 The returned vnode will be VX locked.
756  */
757 static
758 struct vnode *
759 cleanfreevnode(int maxcount)
760 {
761 	struct vnode_index *vi;
762 	struct vnode *vp;
763 	int count;
764 	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
765 	int ri;
766 	int cpu_count;
767 
768 	/*
769 	 * Try to deactivate some vnodes cached on the active list.
770 	 */
771 	if (countcachedvnodes() < inactivevnodes)
772 		goto skip;
773 
774 	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;
775 
776 	for (count = 0; count < maxcount * 2; ++count, ++ri) {
777 		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];
778 
779 		spin_lock(&vi->spin);
780 
781 		vp = TAILQ_NEXT(&vi->active_rover, v_list);
782 		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
783 		if (vp == NULL) {
784 			TAILQ_INSERT_HEAD(&vi->active_list,
785 					  &vi->active_rover, v_list);
786 		} else {
787 			TAILQ_INSERT_AFTER(&vi->active_list, vp,
788 					   &vi->active_rover, v_list);
789 		}
790 		if (vp == NULL) {
791 			spin_unlock(&vi->spin);
792 			continue;
793 		}
794 		if ((vp->v_refcnt & VREF_MASK) != 0) {
795 			spin_unlock(&vi->spin);
796 			vp->v_act += VACT_INC;
797 			if (vp->v_act > VACT_MAX)	/* SMP race ok */
798 				vp->v_act = VACT_MAX;
799 			continue;
800 		}
801 
802 		/*
803 		 * decrement by less if the vnode's object has a lot of
804 		 * VM pages.  XXX possible SMP races.
805 		 */
806 		if (vp->v_act > 0) {
807 			vm_object_t obj;
808 			if ((obj = vp->v_object) != NULL &&
809 			    obj->resident_page_count >= trigger) {
810 				vp->v_act -= 1;
811 			} else {
812 				vp->v_act -= VACT_INC;
813 			}
814 			if (vp->v_act < 0)
815 				vp->v_act = 0;
816 			spin_unlock(&vi->spin);
817 			continue;
818 		}
819 
820 		/*
821 		 * Try to deactivate the vnode.
822 		 */
823 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
824 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
825 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
826 
827 		spin_unlock(&vi->spin);
828 		vrele(vp);
829 	}
830 
831 	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;
832 
833 skip:
834 	/*
835 	 * Loop trying to lock the first vnode on the free list.
836 	 * Cycle if we can't.
837 	 */
838 	cpu_count = ncpus;
839 	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;
840 
841 	for (count = 0; count < maxcount; ++count, ++ri) {
842 		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];
843 
844 		spin_lock(&vi->spin);
845 
846 		vp = TAILQ_FIRST(&vi->inactive_list);
847 		if (vp == NULL) {
848 			spin_unlock(&vi->spin);
849 			if (--cpu_count == 0)
850 				break;
851 			ri = (ri + 16) & ~15;
852 			--ri;
853 			continue;
854 		}
855 
856 		/*
857 		 * non-blocking vx_get will also ref the vnode on success.
858 		 */
859 		if (vx_get_nonblock(vp)) {
860 			KKASSERT(vp->v_state == VS_INACTIVE);
861 			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
862 			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
863 			spin_unlock(&vi->spin);
864 			continue;
865 		}
866 
867 		/*
868 		 * Because we are holding vi->spin the vnode should currently
869 		 * be inactive and VREF_TERMINATE should still be set.
870 		 *
871 		 * Once vi->spin is released the vnode's state should remain
872 		 * unmodified due to both the lock and ref on it.
873 		 */
874 		KKASSERT(vp->v_state == VS_INACTIVE);
875 		spin_unlock(&vi->spin);
876 #ifdef TRACKVNODE
877 		if ((u_long)vp == trackvnode)
878 			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
879 #endif
880 
881 		/*
882 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
883 		 * This includes namecache refs due to a related ncp being
884 		 * locked or having children, a VM object association, or
885 		 * other hold users.
886 		 *
887 		 * Do not reclaim/reuse a vnode if someone else has a real
888 		 * ref on it.  This can occur if a filesystem temporarily
889 		 * releases the vnode lock during VOP_RECLAIM.
890 		 */
891 		if (vp->v_auxrefs != vp->v_namecache_count ||
892 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
893 failed:
894 			if (vp->v_state == VS_INACTIVE) {
895 				spin_lock(&vi->spin);
896 				if (vp->v_state == VS_INACTIVE) {
897 					TAILQ_REMOVE(&vi->inactive_list,
898 						     vp, v_list);
899 					TAILQ_INSERT_TAIL(&vi->inactive_list,
900 							  vp, v_list);
901 				}
902 				spin_unlock(&vi->spin);
903 			}
904 			vx_put(vp);
905 			continue;
906 		}
907 
908 		/*
909 		 * VINACTIVE and VREF_TERMINATE are expected to both be set
910 		 * for vnodes pulled from the inactive list, and cannot be
911 		 * changed while we hold the vx lock.
912 		 *
913 		 * Try to reclaim the vnode.
914 		 *
915 		 * The cache_inval_vp_nonblock() can fail if any of the namecache
916 		 * elements are actively locked, preventing the vnode from
917 		 * being reclaimed.  This is the desired operation as it gives
918 		 * the namecache code certain guarantees just by holding
919 		 * an ncp.
920 		 */
921 		KKASSERT(vp->v_flag & VINACTIVE);
922 		KKASSERT(vp->v_refcnt & VREF_TERMINATE);
923 
924 		if ((vp->v_flag & VRECLAIMED) == 0) {
925 			if (cache_inval_vp_nonblock(vp))
926 				goto failed;
927 			vgone_vxlocked(vp);
928 			/* vnode is still VX locked */
929 		}
930 
931 		/*
932 		 * At this point if there are no other refs or auxrefs on
933 		 * the vnode with the inactive list locked, and we remove
934 		 * the vnode from the inactive list, it should not be
935 		 * possible for anyone else to access the vnode any more.
936 		 *
937 		 * Since the vnode is in a VRECLAIMED state, no new
938 		 * namecache associations could have been made and the
939 		 * vnode should have already been removed from its mountlist.
940 		 *
941 		 * Since we hold a VX lock on the vnode it cannot have been
942 		 * reactivated (moved out of the inactive list).
943 		 */
944 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
945 		spin_lock(&vi->spin);
946 		if (vp->v_auxrefs ||
947 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
948 			spin_unlock(&vi->spin);
949 			goto failed;
950 		}
951 		KKASSERT(vp->v_state == VS_INACTIVE);
952 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
953 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
954 		vp->v_state = VS_DYING;
955 		spin_unlock(&vi->spin);
956 
957 		/*
958 		 * Nothing should have been able to access this vp.  Only
959 		 * our ref should remain now.
960 		 */
961 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
962 		KASSERT(vp->v_refcnt == 1,
963 			("vp %p badrefs %08x", vp, vp->v_refcnt));
964 
965 		/*
966 		 * Return a VX locked vnode suitable for reuse.
967 		 */
968 		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
969 		return(vp);
970 	}
971 	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
972 	return(NULL);
973 }
974 
975 /*
976  * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
977  *
978  * All new vnodes set the VAGE flags.  An open() of the vnode will
979  * decrement the (2-bit) flags.  Vnodes which are opened several times
980  * are thus retained in the cache over vnodes which are merely stat()d.
981  *
982  * We attempt to reuse an already-recycled vnode from our pcpu inactive
983  * queue first, and allocate otherwise.  Attempting to recycle inactive
984  * vnodes here can lead to numerous deadlocks, particularly with
985  * softupdates.
986  */
987 struct vnode *
988 allocvnode(int lktimeout, int lkflags)
989 {
990 	struct vnode *vp;
991 	struct vnode_index *vi;
992 
993 	/*
994 	 * lktimeout only applies when LK_TIMELOCK is used, and only
995 	 * the pageout daemon uses it.  The timeout may not be zero
996 	 * or the pageout daemon can deadlock in low-VM situations.
997 	 */
998 	if (lktimeout == 0)
999 		lktimeout = hz / 10;
1000 
1001 	/*
1002 	 * Do not flag for synchronous recycling unless there are enough
1003 	 * freeable vnodes to recycle and the number of vnodes has
1004 	 * significantly exceeded our target.  We want the normal vnlru
1005 	 * process to handle the cleaning (at 9/10's) before we are forced
1006 	 * to flag it here at 11/10's for userexit path processing.
1007 	 */
1008 	if (numvnodes >= maxvnodes * 11 / 10 &&
1009 	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
1010 		struct thread *td = curthread;
1011 		if (td->td_lwp)
1012 			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
1013 	}
1014 
1015 	/*
1016 	 * Try to trivially reuse a reclaimed vnode from the head of the
1017 	 * inactive list for this cpu.  Any vnode cycling which terminates
1018 	 * the vnode will cause it to be returned to the
1019 	 * same pcpu structure (e.g. unlink calls).
1020 	 */
1021 	vi = &vnode_list_hash[mycpuid];
1022 	spin_lock(&vi->spin);
1023 
1024 	vp = TAILQ_FIRST(&vi->inactive_list);
1025 	if (vp && (vp->v_flag & VRECLAIMED)) {
1026 		/*
1027 		 * non-blocking vx_get will also ref the vnode on success.
1028 		 */
1029 		if (vx_get_nonblock(vp)) {
1030 			KKASSERT(vp->v_state == VS_INACTIVE);
1031 			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
1032 			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
1033 			spin_unlock(&vi->spin);
1034 			goto slower;
1035 		}
1036 
1037 		/*
1038 		 * Because we are holding vi->spin the vnode should currently
1039 		 * be inactive and VREF_TERMINATE should still be set.
1040 		 *
1041 		 * Once vi->spin is released the vnode's state should remain
1042 		 * unmodified due to both the lock and ref on it.
1043 		 */
1044 		KKASSERT(vp->v_state == VS_INACTIVE);
1045 #ifdef TRACKVNODE
1046 		if ((u_long)vp == trackvnode)
1047 			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
1048 #endif
1049 
1050 		/*
1051 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
1052 		 * This includes namecache refs due to a related ncp being
1053 		 * locked or having children, a VM object association, or
1054 		 * other hold users.
1055 		 *
1056 		 * Do not reclaim/reuse a vnode if someone else has a real
1057 		 * ref on it.  This can occur if a filesystem temporarily
1058 		 * releases the vnode lock during VOP_RECLAIM.
1059 		 */
1060 		if (vp->v_auxrefs ||
1061 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
1062 			if (vp->v_state == VS_INACTIVE) {
1063 				TAILQ_REMOVE(&vi->inactive_list,
1064 					     vp, v_list);
1065 				TAILQ_INSERT_TAIL(&vi->inactive_list,
1066 						  vp, v_list);
1067 			}
1068 			spin_unlock(&vi->spin);
1069 			vx_put(vp);
1070 			goto slower;
1071 		}
1072 
1073 		/*
1074 		 * VINACTIVE and VREF_TERMINATE are expected to both be set
1075 		 * for vnodes pulled from the inactive list, and cannot be
1076 		 * changed while we hold the vx lock.
1077 		 *
1078 		 * Try to reclaim the vnode.
1079 		 */
1080 		KKASSERT(vp->v_flag & VINACTIVE);
1081 		KKASSERT(vp->v_refcnt & VREF_TERMINATE);
1082 
1083 		if ((vp->v_flag & VRECLAIMED) == 0) {
1084 			spin_unlock(&vi->spin);
1085 			vx_put(vp);
1086 			goto slower;
1087 		}
1088 
1089 		/*
1090 		 * At this point if there are no other refs or auxrefs on
1091 		 * the vnode with the inactive list locked, and we remove
1092 		 * the vnode from the inactive list, it should not be
1093 		 * possible for anyone else to access the vnode any more.
1094 		 *
1095 		 * Since the vnode is in a VRECLAIMED state, no new
1096 		 * namecache associations could have been made and the
1097 		 * vnode should have already been removed from its mountlist.
1098 		 *
1099 		 * Since we hold a VX lock on the vnode it cannot have been
1100 		 * reactivated (moved out of the inactive list).
1101 		 */
1102 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
1103 		KKASSERT(vp->v_state == VS_INACTIVE);
1104 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
1105 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
1106 		vp->v_state = VS_DYING;
1107 		spin_unlock(&vi->spin);
1108 
1109 		/*
1110 		 * Nothing should have been able to access this vp.  Only
1111 		 * our ref should remain now.
1112 		 *
1113 		 * At this point we can kfree() the vnode if we want to.
1114 		 * Instead, we reuse it for the allocation.
1115 		 */
1116 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
1117 		KASSERT(vp->v_refcnt == 1,
1118 			("vp %p badrefs %08x", vp, vp->v_refcnt));
1119 		vx_unlock(vp);		/* safety: keep the API clean */
1120 		bzero(vp, sizeof(*vp));
1121 	} else {
1122 		spin_unlock(&vi->spin);
1123 slower:
1124 		vp = kmalloc_obj(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
1125 		atomic_add_int(&numvnodes, 1);
1126 	}
1127 
1128 	lwkt_token_init(&vp->v_token, "vnode");
1129 	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
1130 	TAILQ_INIT(&vp->v_namecache);
1131 	RB_INIT(&vp->v_rbclean_tree);
1132 	RB_INIT(&vp->v_rbdirty_tree);
1133 	RB_INIT(&vp->v_rbhash_tree);
1134 	spin_init(&vp->v_spin, "allocvnode");
1135 
1136 	vx_lock(vp);
1137 	vp->v_refcnt = 1;
1138 	vp->v_flag = VAGE0 | VAGE1;
1139 	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
1140 
1141 	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
1142 	/* exclusive lock still held */
1143 
1144 	vp->v_filesize = NOOFFSET;
1145 	vp->v_type = VNON;
1146 	vp->v_tag = 0;
1147 	vp->v_state = VS_CACHED;
1148 	_vactivate(vp);
1149 
1150 	return (vp);
1151 }
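/*
 * Illustrative sketch (hypothetical helper and field values, compiled
 * out): allocvnode() returns a VX locked, referenced vnode in the
 * VS_ACTIVE state.  A getnewvnode()-style caller typically finishes
 * initializing it and then vx_downgrade()s the lock so a normally
 * (still exclusively) locked vnode is handed back.
 */
#if 0
static struct vnode *
example_new_vnode(void)
{
	struct vnode *vp;

	vp = allocvnode(0, 0);		/* VX locked + vref'd */
	vp->v_type = VREG;		/* hypothetical: regular file */
	vp->v_data = NULL;		/* fs-private data would go here */
	vx_downgrade(vp);		/* VX lock -> normal vnode lock */
	return (vp);			/* still exclusively locked */
}
#endif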
1152 
1153 /*
1154  * Called after a process has allocated a vnode via allocvnode()
1155  * and we detected that too many vnodes were present.
1156  *
1157  * This function is called just prior to a return to userland if the
1158  * process at some point had to allocate a new vnode during the last
1159  * system call and the vnode count was found to be excessive.
1160  *
1161  * This is a synchronous path that we do not normally want to execute.
1162  *
1163  * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
1164  *
1165  * WARNING: Sometimes numvnodes can blow out due to children being
1166  *	    present under directory vnodes in the namecache.  For the
1167  *	    moment use an if() instead of a while() and note that if
1168  *	    we were to use a while() we would still have to break out
1169  *	    if freesomevnodes() returned 0.  vnlru will also be trying
1170  *	    hard to free vnodes at the same time (with a lower trigger
1171  *	    pointer).
1172  */
1173 void
1174 allocvnode_gc(void)
1175 {
1176 	if (numvnodes >= maxvnodes &&
1177 	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
1178 		freesomevnodes(batchfreevnodes);
1179 	}
1180 }
1181 
1182 int
1183 freesomevnodes(int n)
1184 {
1185 	struct vnode *vp;
1186 	int count = 0;
1187 
1188 	while (n) {
1189 		if ((vp = cleanfreevnode(n)) == NULL)
1190 			break;
1191 		vx_unlock(vp);
1192 		--n;
1193 		++count;
1194 		kfree_obj(vp, M_VNODE);
1195 		atomic_add_int(&numvnodes, -1);
1196 	}
1197 	return(count);
1198 }
1199