xref: /dragonfly/sys/kern/vfs_lock.c (revision d8d5b238)
1 /*
2  * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * External lock/ref-related vnode functions
37  *
38  * vs_state transition locking requirements:
39  *
40  *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
41  *	DYING    -> CACHED		vx_lock(excl)
42  *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
43  *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
44  *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
45  *
46  * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
47  *
48  *	 Switching into ACTIVE also requires a vref and a vnode lock; however,
49  *	 the vnode lock is allowed to be SHARED.
50  *
51  *	 Switching into a CACHED or DYING state requires an exclusive vnode
52  *	 lock or vx_lock (which is almost the same thing).
53  */
54 
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/kernel.h>
58 #include <sys/malloc.h>
59 #include <sys/mount.h>
60 #include <sys/proc.h>
61 #include <sys/vnode.h>
62 #include <sys/spinlock2.h>
63 #include <sys/sysctl.h>
64 
65 #include <machine/limits.h>
66 
67 #include <vm/vm.h>
68 #include <vm/vm_object.h>
69 
70 #define VACT_MAX	10
71 #define VACT_INC	2
72 
73 static void vnode_terminate(struct vnode *vp);
74 
75 static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
76 
77 /*
78  * The vnode free list holds inactive vnodes.  Aged inactive vnodes
79  * are inserted prior to the mid point, and otherwise inserted
80  * at the tail.
81  *
82  * The vnode code goes to great lengths to avoid moving vnodes between
83  * lists, but sometimes it is unavoidable.  For this situation we try to
84  * avoid lock contention but we do not try very hard to avoid cache line
85  * congestion.  A modestly sized hash table is used.
86  */
87 #define VLIST_PRIME2	123462047LU
88 #define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU
89 
90 #define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
91 			 VLIST_PRIME2 % (unsigned)ncpus)
92 
93 static struct vnode_index *vnode_list_hash;
94 
95 int  activevnodes = 0;
96 SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
97 	&activevnodes, 0, "Number of active vnodes");
98 int  cachedvnodes = 0;
99 SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
100 	&cachedvnodes, 0, "Number of total cached vnodes");
101 int  inactivevnodes = 0;
102 SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
103 	&inactivevnodes, 0, "Number of inactive vnodes");
104 static int batchfreevnodes = 5;
105 SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
106 	&batchfreevnodes, 0, "Number of vnodes to free at once");
107 #ifdef TRACKVNODE
108 static u_long trackvnode;
109 SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
110 		&trackvnode, 0, "");
111 #endif
112 
113 /*
114  * Called from vfsinit()
115  */
116 void
117 vfs_lock_init(void)
118 {
119 	int i;
120 
121 	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
122 	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
123 				  M_VNODE, M_ZERO | M_WAITOK);
124 	for (i = 0; i < ncpus; ++i) {
125 		struct vnode_index *vi = &vnode_list_hash[i];
126 
127 		TAILQ_INIT(&vi->inactive_list);
128 		TAILQ_INIT(&vi->active_list);
129 		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
130 		spin_init(&vi->spin, "vfslock");
131 	}
132 }
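
/*
 * Editor's illustrative sketch (not part of the original file; wrapped
 * in #if 0 so it is never compiled): VLIST_HASH() above hashes the
 * vnode address and folds it into the number of cpus, so a given vnode
 * always maps to the same per-cpu vnode_index bucket and spinlock.
 */
#if 0	/* example only */
static struct vnode_index *
example_vnode_bucket(struct vnode *vp)
{
	return (&vnode_list_hash[VLIST_HASH(vp)]);
}
#endif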
133 
134 /*
135  * Misc functions
136  */
137 static __inline
138 void
139 _vsetflags(struct vnode *vp, int flags)
140 {
141 	atomic_set_int(&vp->v_flag, flags);
142 }
143 
144 static __inline
145 void
146 _vclrflags(struct vnode *vp, int flags)
147 {
148 	atomic_clear_int(&vp->v_flag, flags);
149 }
150 
151 void
152 vsetflags(struct vnode *vp, int flags)
153 {
154 	_vsetflags(vp, flags);
155 }
156 
157 void
158 vclrflags(struct vnode *vp, int flags)
159 {
160 	_vclrflags(vp, flags);
161 }
162 
163 /*
164  * Place the vnode on the active list.
165  *
166  * Caller must hold vp->v_spin
167  */
168 static __inline
169 void
170 _vactivate(struct vnode *vp)
171 {
172 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
173 
174 #ifdef TRACKVNODE
175 	if ((u_long)vp == trackvnode)
176 		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
177 #endif
178 	spin_lock(&vi->spin);
179 
180 	switch(vp->v_state) {
181 	case VS_ACTIVE:
182 		spin_unlock(&vi->spin);
183 		panic("_vactivate: already active");
184 		/* NOT REACHED */
185 		return;
186 	case VS_INACTIVE:
187 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
188 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
189 		break;
190 	case VS_CACHED:
191 	case VS_DYING:
192 		break;
193 	}
194 	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
195 	vp->v_state = VS_ACTIVE;
196 	spin_unlock(&vi->spin);
197 	atomic_add_int(&mycpu->gd_activevnodes, 1);
198 }
199 
200 /*
201  * Put a vnode on the inactive list.
202  *
203  * Caller must hold v_spin
204  */
205 static __inline
206 void
207 _vinactive(struct vnode *vp)
208 {
209 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
210 
211 #ifdef TRACKVNODE
212 	if ((u_long)vp == trackvnode) {
213 		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
214 		print_backtrace(-1);
215 	}
216 #endif
217 	spin_lock(&vi->spin);
218 
219 	/*
220 	 * Remove from active list if it is sitting on it
221 	 */
222 	switch(vp->v_state) {
223 	case VS_ACTIVE:
224 		TAILQ_REMOVE(&vi->active_list, vp, v_list);
225 		atomic_add_int(&mycpu->gd_activevnodes, -1);
226 		break;
227 	case VS_INACTIVE:
228 		spin_unlock(&vi->spin);
229 		panic("_vinactive: already inactive");
230 		/* NOT REACHED */
231 		return;
232 	case VS_CACHED:
233 	case VS_DYING:
234 		break;
235 	}
236 
237 	/*
238 	 * Distinguish between basically dead vnodes, vnodes with cached
239 	 * data, and vnodes without cached data.  A rover will shift the
240 	 * vnodes around as their cache status is lost.
241 	 */
242 	if (vp->v_flag & VRECLAIMED) {
243 		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
244 	} else {
245 		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
246 	}
247 	vp->v_state = VS_INACTIVE;
248 	spin_unlock(&vi->spin);
249 	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
250 }
251 
252 static __inline
253 void
254 _vinactive_tail(struct vnode *vp)
255 {
256 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
257 
258 	spin_lock(&vi->spin);
259 
260 	/*
261 	 * Remove from active list if it is sitting on it
262 	 */
263 	switch(vp->v_state) {
264 	case VS_ACTIVE:
265 		TAILQ_REMOVE(&vi->active_list, vp, v_list);
266 		atomic_add_int(&mycpu->gd_activevnodes, -1);
267 		break;
268 	case VS_INACTIVE:
269 		spin_unlock(&vi->spin);
270 		panic("_vinactive_tail: already inactive");
271 		/* NOT REACHED */
272 		return;
273 	case VS_CACHED:
274 	case VS_DYING:
275 		break;
276 	}
277 
278 	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
279 	vp->v_state = VS_INACTIVE;
280 	spin_unlock(&vi->spin);
281 	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
282 }
283 
284 /*
285  * Add a ref to an active vnode.  This function should never be called
286  * with an inactive vnode (use vget() instead), but might be called
287  * with other states.
288  */
289 void
290 vref(struct vnode *vp)
291 {
292 	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
293 		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
294 	atomic_add_int(&vp->v_refcnt, 1);
295 }
296 
297 void
298 synchronizevnodecount(void)
299 {
300 	int nca = 0;
301 	int act = 0;
302 	int ina = 0;
303 	int i;
304 
305 	for (i = 0; i < ncpus; ++i) {
306 		globaldata_t gd = globaldata_find(i);
307 		nca += gd->gd_cachedvnodes;
308 		act += gd->gd_activevnodes;
309 		ina += gd->gd_inactivevnodes;
310 	}
311 	cachedvnodes = nca;
312 	activevnodes = act;
313 	inactivevnodes = ina;
314 }
315 
316 /*
317  * Count the number of cached vnodes.  This is moderately expensive, so be
318  * careful not to make this call in the critical path.  Each cpu tracks
319  * its own accumulator.  The individual accumulators must be summed
320  * together to get an accurate value.
321  */
322 int
323 countcachedvnodes(void)
324 {
325 	int i;
326 	int n = 0;
327 
328 	for (i = 0; i < ncpus; ++i) {
329 		globaldata_t gd = globaldata_find(i);
330 		n += gd->gd_cachedvnodes;
331 	}
332 	return n;
333 }
334 
335 int
336 countcachedandinactivevnodes(void)
337 {
338 	int i;
339 	int n = 0;
340 
341 	for (i = 0; i < ncpus; ++i) {
342 		globaldata_t gd = globaldata_find(i);
343 		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
344 	}
345 	return n;
346 }
347 
348 /*
349  * Release a ref on an active or inactive vnode.
350  *
351  * Caller has no other requirements.
352  *
353  * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
354  * transition, otherwise we leave the vnode in the active list and
355  * do a lockless transition to 0, which is very important for the
356  * critical path.
357  *
358  * (vrele() is not called when a vnode is being destroyed w/kfree)
359  */
360 void
361 vrele(struct vnode *vp)
362 {
363 	for (;;) {
364 		int count = vp->v_refcnt;
365 		cpu_ccfence();
366 		KKASSERT((count & VREF_MASK) > 0);
367 		KKASSERT(vp->v_state == VS_ACTIVE ||
368 			 vp->v_state == VS_INACTIVE);
369 
370 		/*
371 		 * 2+ case
372 		 */
373 		if ((count & VREF_MASK) > 1) {
374 			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
375 				break;
376 			continue;
377 		}
378 
379 		/*
380 		 * 1->0 transition case must handle possible finalization.
381 		 * When finalizing we transition 1->0x40000000.  Note that
382 		 * cachedvnodes is only adjusted on transitions to ->0.
383 		 *
384 		 * WARNING! VREF_TERMINATE can be cleared at any point
385 		 *	    when the refcnt is non-zero (by vget()) and
386 		 *	    the vnode has not been reclaimed.  Thus
387 		 *	    transitions out of VREF_TERMINATE do not have
388 		 *	    to mess with cachedvnodes.
389 		 */
390 		if (count & VREF_FINALIZE) {
391 			vx_lock(vp);
392 			if (atomic_cmpset_int(&vp->v_refcnt,
393 					      count, VREF_TERMINATE)) {
394 				vnode_terminate(vp);
395 				break;
396 			}
397 			vx_unlock(vp);
398 		} else {
399 			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
400 				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
401 				break;
402 			}
403 		}
404 		/* retry */
405 	}
406 }
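
/*
 * Editor's illustrative sketch (not part of the original file; wrapped
 * in #if 0 so it is never compiled): the common pattern for taking a
 * temporary reference on a vnode already known to be active.  vref()
 * may not be used on an inactive vnode; vget() below handles that case.
 */
#if 0	/* example only */
static void
example_ref_pattern(struct vnode *vp)
{
	vref(vp);	/* additional ref on an active vnode */
	/* ... use vp; the caller need not hold the vnode lock ... */
	vrele(vp);	/* drop the ref; 1->0 may finalize if flagged */
}
#endif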
407 
408 /*
409  * Add an auxiliary data structure reference to the vnode.  Auxiliary
410  * references do not change the state of the vnode or prevent deactivation
411  * or reclamation of the vnode, but will prevent the vnode from being
412  * destroyed (kfree()'d).
413  *
414  * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
415  *	     already be held by the caller.  vdrop() will clean up the
416  *	     free list state.
417  */
418 void
419 vhold(struct vnode *vp)
420 {
421 	atomic_add_int(&vp->v_auxrefs, 1);
422 }
423 
424 /*
425  * Remove an auxiliary reference from the vnode.
426  */
427 void
428 vdrop(struct vnode *vp)
429 {
430 	atomic_add_int(&vp->v_auxrefs, -1);
431 }
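
/*
 * Editor's illustrative sketch (not part of the original file; wrapped
 * in #if 0 so it is never compiled): an auxiliary reference only keeps
 * the vnode structure from being kfree()'d; the vnode may still be
 * deactivated and reclaimed while the hold is in place.
 */
#if 0	/* example only */
static void
example_hold_pattern(struct vnode *vp)
{
	vhold(vp);	/* structure cannot be destroyed */
	/* ... stash vp in an auxiliary data structure ... */
	vdrop(vp);	/* structure may be freed once all refs are gone */
}
#endif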
432 
433 /*
434  * This function is called on the 1->0 transition (which is actually
435  * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
436  * of the vnode.
437  *
438  * Additional vrefs are allowed to race but will not result in a reentrant
439  * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
440  * prevents additional 1->0 transitions.
441  *
442  * ONLY A VGET() CAN REACTIVATE THE VNODE.
443  *
444  * Caller must hold the VX lock.
445  *
446  * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
447  *
448  * NOTE: The vnode may be marked inactive with dirty buffers
449  *	 or dirty pages in its cached VM object still present.
450  *
451  * NOTE: VS_FREE should not be set on entry (the vnode was expected to
452  *	 previously be active).  We lose control of the vnode the instant
453  *	 it is placed on the free list.
454  *
455  *	 The VX lock is required when transitioning to VS_CACHED but is
456  *	 not sufficient for the vshouldfree() interlocked test or when
457  *	 transitioning away from VS_CACHED.  v_spin is also required for
458  *	 those cases.
459  */
460 static
461 void
462 vnode_terminate(struct vnode *vp)
463 {
464 	KKASSERT(vp->v_state == VS_ACTIVE);
465 
466 	if ((vp->v_flag & VINACTIVE) == 0) {
467 		_vsetflags(vp, VINACTIVE);
468 		if (vp->v_mount)
469 			VOP_INACTIVE(vp);
470 	}
471 	spin_lock(&vp->v_spin);
472 	_vinactive(vp);
473 	spin_unlock(&vp->v_spin);
474 
475 	vx_unlock(vp);
476 }
477 
478 /****************************************************************
479  *			VX LOCKING FUNCTIONS			*
480  ****************************************************************
481  *
482  * These functions lock vnodes for reclamation and deactivation related
483  * activities.  The caller must already be holding some sort of reference
484  * on the vnode.
485  */
486 void
487 vx_lock(struct vnode *vp)
488 {
489 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
490 }
491 
492 void
493 vx_unlock(struct vnode *vp)
494 {
495 	lockmgr(&vp->v_lock, LK_RELEASE);
496 }
497 
498 /****************************************************************
499  *			VNODE ACQUISITION FUNCTIONS		*
500  ****************************************************************
501  *
502  * These functions must be used when accessing a vnode that has no
503  * chance of being destroyed in an SMP race.  That means the caller will
504  * usually either hold an auxiliary reference (such as the namecache)
505  * or hold some other lock that ensures that the vnode cannot be destroyed.
506  *
507  * These functions are MANDATORY for any code chain accessing a vnode
508  * whose activation state is not known.
509  *
510  * vget() can be called with LK_NOWAIT and will return EBUSY if the
511  * lock cannot be immediately acquired.
512  *
513  * vget()/vput() are used when reactivation is desired.
514  *
515  * vx_get() and vx_put() are used when reactivation is not desired.
516  */
517 int
518 vget(struct vnode *vp, int flags)
519 {
520 	int error;
521 
522 	/*
523 	 * A lock type must be passed
524 	 */
525 	if ((flags & LK_TYPE_MASK) == 0) {
526 		panic("vget() called with no lock specified!");
527 		/* NOT REACHED */
528 	}
529 
530 	/*
531 	 * Reference the structure and then acquire the lock.
532 	 *
533 	 * NOTE: The requested lock might be a shared lock and does
534 	 *	 not protect our access to the refcnt or other fields.
535 	 */
536 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
537 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
538 
539 	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
540 		/*
541 		 * The lock failed, undo and return an error.  This will not
542 		 * normally trigger a termination.
543 		 */
544 		vrele(vp);
545 	} else if (vp->v_flag & VRECLAIMED) {
546 		/*
547 		 * The node is being reclaimed and cannot be reactivated
548 		 * any more, undo and return ENOENT.
549 		 */
550 		vn_unlock(vp);
551 		vrele(vp);
552 		error = ENOENT;
553 	} else if (vp->v_state == VS_ACTIVE) {
554 		/*
555 		 * A VS_ACTIVE vnode coupled with the fact that we have
556 		 * a vnode lock (even if shared) prevents v_state from
557 		 * changing.  Since the vnode is not in a VRECLAIMED state,
558 		 * we can safely clear VINACTIVE.
559 		 *
560 		 * It is possible for a shared lock to cause a race with
561 		 * another thread that is also in the process of clearing
562 		 * VREF_TERMINATE, meaning that we might return with it still
563 		 * set and then assert in a later vref().  The solution is to
564 		 * unconditionally clear VREF_TERMINATE here as well.
565 		 *
566 		 * NOTE! Multiple threads may clear VINACTIVE if this is
567 		 *	 a shared lock.  This race is allowed.
568 		 */
569 		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
570 		vp->v_act += VACT_INC;
571 		if (vp->v_act > VACT_MAX)	/* SMP race ok */
572 			vp->v_act = VACT_MAX;
573 		error = 0;
574 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
575 	} else {
576 		/*
577 		 * If the vnode is not VS_ACTIVE it must be reactivated
578 		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
579 		 * is needed to manipulate the vnode's list.
580 		 *
581 		 * Because the lockmgr lock might be shared, we might race
582 		 * another reactivation, which we handle.  In this situation,
583 		 * however, the refcnt prevents other v_state races.
584 		 *
585 		 * As with above, clearing VINACTIVE is allowed to race other
586 		 * clearings of VINACTIVE.
587 		 *
588 		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
589 		 * the refcnt is non-zero and the vnode has not been
590 		 * reclaimed.  This also means that the transitions do
591 		 * not affect cachedvnodes.
592 		 *
593 		 * It is possible for a shared lock to cause a race with
594 		 * another thread that is also in the process of clearing
595 		 * VREF_TERMINATE, meaning that we might return with it still
596 		 * set and then assert in a later vref().  The solution is to
597 		 * unconditionally clear VREF_TERMINATE here as well.
598 		 */
599 		_vclrflags(vp, VINACTIVE);
600 		vp->v_act += VACT_INC;
601 		if (vp->v_act > VACT_MAX)	/* SMP race ok */
602 			vp->v_act = VACT_MAX;
603 		spin_lock(&vp->v_spin);
604 
605 		switch(vp->v_state) {
606 		case VS_INACTIVE:
607 			_vactivate(vp);
608 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
609 							VREF_FINALIZE);
610 			spin_unlock(&vp->v_spin);
611 			break;
612 		case VS_CACHED:
613 			_vactivate(vp);
614 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
615 							VREF_FINALIZE);
616 			spin_unlock(&vp->v_spin);
617 			break;
618 		case VS_ACTIVE:
619 			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
620 							VREF_TERMINATE);
621 			spin_unlock(&vp->v_spin);
622 			break;
623 		case VS_DYING:
624 			spin_unlock(&vp->v_spin);
625 			panic("Impossible VS_DYING state");
626 			break;
627 		}
628 		error = 0;
629 	}
630 	return(error);
631 }
632 
633 #ifdef DEBUG_VPUT
634 
635 void
636 debug_vput(struct vnode *vp, const char *filename, int line)
637 {
638 	kprintf("vput(%p) %s:%d\n", vp, filename, line);
639 	vn_unlock(vp);
640 	vrele(vp);
641 }
642 
643 #else
644 
645 void
646 vput(struct vnode *vp)
647 {
648 	vn_unlock(vp);
649 	vrele(vp);
650 }
651 
652 #endif
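
/*
 * Editor's illustrative sketch (not part of the original file; wrapped
 * in #if 0 so it is never compiled): typical vget()/vput() usage when
 * holding only an auxiliary or namecache reference.  A lock type must
 * be passed; ENOENT is returned if the vnode was reclaimed.
 */
#if 0	/* example only */
static int
example_vget_usage(struct vnode *vp)
{
	int error;

	error = vget(vp, LK_SHARED);
	if (error)
		return (error);		/* e.g. ENOENT if reclaimed */
	/* ... vnode is referenced, locked, and (re)activated ... */
	vput(vp);			/* vn_unlock() + vrele() */
	return (0);
}
#endif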
653 
654 /*
655  * Acquire the vnode lock unguarded.
656  *
657  * The non-blocking version also uses a slightly different mechanic.
658  * This function will explicitly fail not only if it cannot acquire
659  * the lock normally, but also if the caller already holds a lock.
660  *
661  * The adjusted mechanic is used to close a loophole where complex
662  * VOP_RECLAIM code can circle around recursively and allocate the
663  * same vnode it is trying to destroy from the freelist.
664  *
665  * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
666  * cause the incorrect behavior to occur.  If not for that, lockmgr()
667  * would do the right thing.
668  *
669  * XXX The vx_*() locks should use auxrefs, not the main reference counter.
670  */
671 void
672 vx_get(struct vnode *vp)
673 {
674 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
675 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
676 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
677 }
678 
679 int
680 vx_get_nonblock(struct vnode *vp)
681 {
682 	int error;
683 
684 	if (lockinuse(&vp->v_lock))
685 		return(EBUSY);
686 	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
687 	if (error == 0) {
688 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
689 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
690 	}
691 	return(error);
692 }
693 
694 /*
695  * Release a VX lock that also held a ref on the vnode.  vrele() will handle
696  * any needed state transitions.
697  *
698  * However, filesystems use this function to get rid of unwanted new vnodes
699  * so try to get the vnode on the correct queue in that case.
700  */
701 void
702 vx_put(struct vnode *vp)
703 {
704 	if (vp->v_type == VNON || vp->v_type == VBAD)
705 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
706 	lockmgr(&vp->v_lock, LK_RELEASE);
707 	vrele(vp);
708 }
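
/*
 * Editor's illustrative sketch (not part of the original file; wrapped
 * in #if 0 so it is never compiled): vx_get()/vx_put() bracket
 * deactivation or reclamation related work where reactivation of the
 * vnode is not desired.  The lock is always taken exclusively.
 */
#if 0	/* example only */
static void
example_vx_usage(struct vnode *vp)
{
	vx_get(vp);	/* ref + exclusive v_lock, no reactivation */
	/* ... reclaim-side manipulation of the vnode ... */
	vx_put(vp);	/* release the lock and the ref */
}
#endif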
709 
710 /*
711  * Try to reuse a vnode from the free list.  This function is somewhat
712  * advisory in that NULL can be returned as a normal case, even if free
713  * vnodes are present.
714  *
715  * The scan is limited because it can result in excessive CPU use during
716  * periods of extreme vnode use.
717  *
718  * NOTE: The returned vnode is not completely initialized.
719  */
720 static
721 struct vnode *
722 cleanfreevnode(int maxcount)
723 {
724 	struct vnode_index *vi;
725 	struct vnode *vp;
726 	int count;
727 	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
728 	int ri;
729 	int cpu_count;
730 
731 	/*
732 	 * Try to deactivate some vnodes cached on the active list.
733 	 */
734 	if (countcachedvnodes() < inactivevnodes)
735 		goto skip;
736 
737 	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;
738 
739 	for (count = 0; count < maxcount * 2; ++count, ++ri) {
740 		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];
741 
742 		spin_lock(&vi->spin);
743 
744 		vp = TAILQ_NEXT(&vi->active_rover, v_list);
745 		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
746 		if (vp == NULL) {
747 			TAILQ_INSERT_HEAD(&vi->active_list,
748 					  &vi->active_rover, v_list);
749 		} else {
750 			TAILQ_INSERT_AFTER(&vi->active_list, vp,
751 					   &vi->active_rover, v_list);
752 		}
753 		if (vp == NULL) {
754 			spin_unlock(&vi->spin);
755 			continue;
756 		}
757 		if ((vp->v_refcnt & VREF_MASK) != 0) {
758 			spin_unlock(&vi->spin);
759 			vp->v_act += VACT_INC;
760 			if (vp->v_act > VACT_MAX)	/* SMP race ok */
761 				vp->v_act = VACT_MAX;
762 			continue;
763 		}
764 
765 		/*
766 		 * decrement by less if the vnode's object has a lot of
767 		 * VM pages.  XXX possible SMP races.
768 		 */
769 		if (vp->v_act > 0) {
770 			vm_object_t obj;
771 			if ((obj = vp->v_object) != NULL &&
772 			    obj->resident_page_count >= trigger) {
773 				vp->v_act -= 1;
774 			} else {
775 				vp->v_act -= VACT_INC;
776 			}
777 			if (vp->v_act < 0)
778 				vp->v_act = 0;
779 			spin_unlock(&vi->spin);
780 			continue;
781 		}
782 
783 		/*
784 		 * Try to deactivate the vnode.
785 		 */
786 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
787 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
788 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
789 
790 		spin_unlock(&vi->spin);
791 		vrele(vp);
792 	}
793 
794 	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;
795 
796 skip:
797 	/*
798 	 * Loop trying to lock the first vnode on the free list.
799 	 * Cycle if we can't.
800 	 */
801 	cpu_count = ncpus;
802 	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;
803 
804 	for (count = 0; count < maxcount; ++count, ++ri) {
805 		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];
806 
807 		spin_lock(&vi->spin);
808 
809 		vp = TAILQ_FIRST(&vi->inactive_list);
810 		if (vp == NULL) {
811 			spin_unlock(&vi->spin);
812 			if (--cpu_count == 0)
813 				break;
814 			ri = (ri + 16) & ~15;
815 			--ri;
816 			continue;
817 		}
818 
819 		/*
820 		 * non-blocking vx_get will also ref the vnode on success.
821 		 */
822 		if (vx_get_nonblock(vp)) {
823 			KKASSERT(vp->v_state == VS_INACTIVE);
824 			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
825 			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
826 			spin_unlock(&vi->spin);
827 			continue;
828 		}
829 
830 		/*
831 		 * Because we are holding vi->spin the vnode should currently
832 		 * be inactive and VREF_TERMINATE should still be set.
833 		 *
834 		 * Once vi->spin is released the vnode's state should remain
835 		 * unmodified due to both the lock and ref on it.
836 		 */
837 		KKASSERT(vp->v_state == VS_INACTIVE);
838 		spin_unlock(&vi->spin);
839 #ifdef TRACKVNODE
840 		if ((u_long)vp == trackvnode)
841 			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
842 #endif
843 
844 		/*
845 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
846 		 * This includes namecache refs due to a related ncp being
847 		 * locked or having children, a VM object association, or
848 		 * other hold users.
849 		 *
850 		 * Do not reclaim/reuse a vnode if someone else has a real
851 		 * ref on it.  This can occur if a filesystem temporarily
852 		 * releases the vnode lock during VOP_RECLAIM.
853 		 */
854 		if (vp->v_auxrefs ||
855 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
856 failed:
857 			if (vp->v_state == VS_INACTIVE) {
858 				spin_lock(&vi->spin);
859 				if (vp->v_state == VS_INACTIVE) {
860 					TAILQ_REMOVE(&vi->inactive_list,
861 						     vp, v_list);
862 					TAILQ_INSERT_TAIL(&vi->inactive_list,
863 							  vp, v_list);
864 				}
865 				spin_unlock(&vi->spin);
866 			}
867 			vx_put(vp);
868 			continue;
869 		}
870 
871 		/*
872 		 * VINACTIVE and VREF_TERMINATE are expected to both be set
873 		 * for vnodes pulled from the inactive list, and cannot be
874 		 * changed while we hold the vx lock.
875 		 *
876 		 * Try to reclaim the vnode.
877 		 */
878 		KKASSERT(vp->v_flag & VINACTIVE);
879 		KKASSERT(vp->v_refcnt & VREF_TERMINATE);
880 
881 		if ((vp->v_flag & VRECLAIMED) == 0) {
882 			if (cache_inval_vp_nonblock(vp))
883 				goto failed;
884 			vgone_vxlocked(vp);
885 			/* vnode is still VX locked */
886 		}
887 
888 		/*
889 		 * At this point if there are no other refs or auxrefs on
890 		 * the vnode with the inactive list locked, and we remove
891 		 * the vnode from the inactive list, it should not be
892 		 * possible for anyone else to access the vnode any more.
893 		 *
894 		 * Since the vnode is in a VRECLAIMED state, no new
895 		 * namecache associations could have been made and the
896 		 * vnode should have already been removed from its mountlist.
897 		 *
898 		 * Since we hold a VX lock on the vnode it cannot have been
899 		 * reactivated (moved out of the inactive list).
900 		 */
901 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
902 		spin_lock(&vi->spin);
903 		if (vp->v_auxrefs ||
904 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
905 			spin_unlock(&vi->spin);
906 			goto failed;
907 		}
908 		KKASSERT(vp->v_state == VS_INACTIVE);
909 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
910 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
911 		vp->v_state = VS_DYING;
912 		spin_unlock(&vi->spin);
913 
914 		/*
915 		 * Nothing should have been able to access this vp.  Only
916 		 * our ref should remain now.
917 		 */
918 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
919 		KASSERT(vp->v_refcnt == 1,
920 			("vp %p badrefs %08x", vp, vp->v_refcnt));
921 
922 		/*
923 		 * Return a VX locked vnode suitable for reuse.
924 		 */
925 		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
926 		return(vp);
927 	}
928 	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
929 	return(NULL);
930 }
931 
932 /*
933  * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
934  *
935  * All new vnodes set the VAGE flags.  An open() of the vnode will
936  * decrement the (2-bit) flags.  Vnodes which are opened several times
937  * are thus retained in the cache over vnodes which are merely stat()d.
938  *
939  * We attempt to reuse an already-recycled vnode from our pcpu inactive
940  * queue first, and allocate otherwise.  Attempting to recycle inactive
941  * vnodes here can lead to numerous deadlocks, particularly with
942  * softupdates.
943  */
944 struct vnode *
945 allocvnode(int lktimeout, int lkflags)
946 {
947 	struct vnode *vp;
948 	struct vnode_index *vi;
949 
950 	/*
951 	 * lktimeout only applies when LK_TIMELOCK is used, and only
952 	 * the pageout daemon uses it.  The timeout may not be zero
953 	 * or the pageout daemon can deadlock in low-VM situations.
954 	 */
955 	if (lktimeout == 0)
956 		lktimeout = hz / 10;
957 
958 	/*
959 	 * Do not flag for synchronous recyclement unless there are enough
960 	 * freeable vnodes to recycle and the number of vnodes has
961 	 * significantly exceeded our target.  We want the normal vnlru
962 	 * process to handle the cleaning (at 9/10's) before we are forced
963 	 * to flag it here at 11/10's for userexit path processing.
964 	 */
965 	if (numvnodes >= maxvnodes * 11 / 10 &&
966 	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
967 		struct thread *td = curthread;
968 		if (td->td_lwp)
969 			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
970 	}
971 
972 	/*
973 	 * Try to trivially reuse a reclaimed vnode from the head of the
974 	 * inactive list for this cpu.  Any vnode cycling which occurs
975 	 * which terminates the vnode will cause it to be returned to the
976 	 * same pcpu structure (e.g. unlink calls).
977 	 */
978 	vi = &vnode_list_hash[mycpuid];
979 	spin_lock(&vi->spin);
980 
981 	vp = TAILQ_FIRST(&vi->inactive_list);
982 	if (vp && (vp->v_flag & VRECLAIMED)) {
983 		/*
984 		 * non-blocking vx_get will also ref the vnode on success.
985 		 */
986 		if (vx_get_nonblock(vp)) {
987 			KKASSERT(vp->v_state == VS_INACTIVE);
988 			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
989 			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
990 			spin_unlock(&vi->spin);
991 			goto slower;
992 		}
993 
994 		/*
995 		 * Because we are holding vi->spin the vnode should currently
996 		 * be inactive and VREF_TERMINATE should still be set.
997 		 *
998 		 * Once vi->spin is released the vnode's state should remain
999 		 * unmodified due to both the lock and ref on it.
1000 		 */
1001 		KKASSERT(vp->v_state == VS_INACTIVE);
1002 #ifdef TRACKVNODE
1003 		if ((u_long)vp == trackvnode)
1004 			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
1005 #endif
1006 
1007 		/*
1008 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
1009 		 * This includes namecache refs due to a related ncp being
1010 		 * locked or having children, a VM object association, or
1011 		 * other hold users.
1012 		 *
1013 		 * Do not reclaim/reuse a vnode if someone else has a real
1014 		 * ref on it.  This can occur if a filesystem temporarily
1015 		 * releases the vnode lock during VOP_RECLAIM.
1016 		 */
1017 		if (vp->v_auxrefs ||
1018 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
1019 			if (vp->v_state == VS_INACTIVE) {
1020 				if (vp->v_state == VS_INACTIVE) {
1021 					TAILQ_REMOVE(&vi->inactive_list,
1022 						     vp, v_list);
1023 					TAILQ_INSERT_TAIL(&vi->inactive_list,
1024 							  vp, v_list);
1025 				}
1026 			}
1027 			spin_unlock(&vi->spin);
1028 			vx_put(vp);
1029 			goto slower;
1030 		}
1031 
1032 		/*
1033 		 * VINACTIVE and VREF_TERMINATE are expected to both be set
1034 		 * for vnodes pulled from the inactive list, and cannot be
1035 		 * changed while we hold the vx lock.
1036 		 *
1037 		 * Try to reclaim the vnode.
1038 		 */
1039 		KKASSERT(vp->v_flag & VINACTIVE);
1040 		KKASSERT(vp->v_refcnt & VREF_TERMINATE);
1041 
1042 		if ((vp->v_flag & VRECLAIMED) == 0) {
1043 			spin_unlock(&vi->spin);
1044 			vx_put(vp);
1045 			goto slower;
1046 		}
1047 
1048 		/*
1049 		 * At this point if there are no other refs or auxrefs on
1050 		 * the vnode with the inactive list locked, and we remove
1051 		 * the vnode from the inactive list, it should not be
1052 		 * possible for anyone else to access the vnode any more.
1053 		 *
1054 		 * Since the vnode is in a VRECLAIMED state, no new
1055 		 * namecache associations could have been made and the
1056 		 * vnode should have already been removed from its mountlist.
1057 		 *
1058 		 * Since we hold a VX lock on the vnode it cannot have been
1059 		 * reactivated (moved out of the inactive list).
1060 		 */
1061 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
1062 		KKASSERT(vp->v_state == VS_INACTIVE);
1063 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
1064 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
1065 		vp->v_state = VS_DYING;
1066 		spin_unlock(&vi->spin);
1067 
1068 		/*
1069 		 * Nothing should have been able to access this vp.  Only
1070 		 * our ref should remain now.
1071 		 *
1072 		 * At this point we can kfree() the vnode if we want to.
1073 		 * Instead, we reuse it for the allocation.
1074 		 */
1075 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
1076 		KASSERT(vp->v_refcnt == 1,
1077 			("vp %p badrefs %08x", vp, vp->v_refcnt));
1078 		bzero(vp, sizeof(*vp));
1079 	} else {
1080 		spin_unlock(&vi->spin);
1081 slower:
1082 		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
1083 		atomic_add_int(&numvnodes, 1);
1084 	}
1085 
1086 	lwkt_token_init(&vp->v_token, "vnode");
1087 	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
1088 	TAILQ_INIT(&vp->v_namecache);
1089 	RB_INIT(&vp->v_rbclean_tree);
1090 	RB_INIT(&vp->v_rbdirty_tree);
1091 	RB_INIT(&vp->v_rbhash_tree);
1092 	spin_init(&vp->v_spin, "allocvnode");
1093 
1094 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
1095 	vp->v_refcnt = 1;
1096 	vp->v_flag = VAGE0 | VAGE1;
1097 	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
1098 
1099 	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
1100 	/* exclusive lock still held */
1101 
1102 	vp->v_filesize = NOOFFSET;
1103 	vp->v_type = VNON;
1104 	vp->v_tag = 0;
1105 	vp->v_state = VS_CACHED;
1106 	_vactivate(vp);
1107 
1108 	return (vp);
1109 }
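
/*
 * Editor's illustrative sketch (not part of the original file; wrapped
 * in #if 0 so it is never compiled): a caller of allocvnode() receives
 * the vnode VX locked and referenced and is expected to finish
 * initializing it (v_type, v_ops, v_data, ...) before unlocking it,
 * or to flag it bad and release it with vx_put() on failure.
 */
#if 0	/* example only */
static struct vnode *
example_allocvnode_usage(void)
{
	struct vnode *vp;

	vp = allocvnode(0, 0);		/* returned VX locked + vref'd */
	/* ... filesystem-specific initialization would go here ... */
	return (vp);			/* still locked for the caller */
}
#endif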
1110 
1111 /*
1112  * Called after a process has allocated a vnode via allocvnode()
1113  * and we detected that too many vnodes were present.
1114  *
1115  * This function is called just prior to a return to userland if the
1116  * process at some point had to allocate a new vnode during the last
1117  * system call and the vnode count was found to be excessive.
1118  *
1119  * This is a synchronous path that we do not normally want to execute.
1120  *
1121  * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
1122  *
1123  * WARNING: Sometimes numvnodes can blow out due to children being
1124  *	    present under directory vnodes in the namecache.  For the
1125  *	    moment use an if() instead of a while() and note that if
1126  *	    we were to use a while() we would still have to break out
1127  *	    if freesomevnodes() returned 0.  vnlru will also be trying
1128  *	    hard to free vnodes at the same time (with a lower trigger
1129  *	    pointer).
1130  */
1131 void
1132 allocvnode_gc(void)
1133 {
1134 	if (numvnodes >= maxvnodes &&
1135 	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
1136 		freesomevnodes(batchfreevnodes);
1137 	}
1138 }
1139 
1140 int
1141 freesomevnodes(int n)
1142 {
1143 	struct vnode *vp;
1144 	int count = 0;
1145 
1146 	while (n) {
1147 		if ((vp = cleanfreevnode(n)) == NULL)
1148 			break;
1149 		vx_unlock(vp);
1150 		--n;
1151 		++count;
1152 		kfree(vp, M_VNODE);
1153 		atomic_add_int(&numvnodes, -1);
1154 	}
1155 	return(count);
1156 }
1157