xref: /dragonfly/sys/kern/vfs_lock.c (revision 0b29ed9d)
1 /*
2  * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 /*
36  * External lock/ref-related vnode functions
37  *
38  * vs_state transition locking requirements:
39  *
40  *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
41  *	DYING    -> CACHED		vx_lock(excl)
42  *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
43  *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
44  *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
45  *
46  * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
47  *
48  *	 Switching into ACTIVE also requires a vref and vnode lock, however
49  *	 the vnode lock is allowed to be SHARED.
50  *
51  *	 Switching into a CACHED or DYING state requires an exclusive vnode
52  *	 lock or vx_lock (which is almost the same thing).
53  */
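
/*
 * Usage sketch (illustrative only): the common consumer-side pattern
 * that drives the CACHED/INACTIVE -> ACTIVE -> INACTIVE transitions
 * above.  It assumes the caller already has some other guarantee that
 * vp cannot be destroyed (e.g. a namecache or auxiliary reference)
 * before calling vget():
 *
 *	struct vnode *vp = ...;			// protected by other ref
 *	int error;
 *
 *	error = vget(vp, LK_SHARED);		// ref + lock, -> ACTIVE
 *	if (error == 0) {
 *		// ... use vp while referenced and locked ...
 *		vput(vp);			// unlock + vrele()
 *	}
 */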
54 
55 #include <sys/param.h>
56 #include <sys/systm.h>
57 #include <sys/kernel.h>
58 #include <sys/malloc.h>
59 #include <sys/mount.h>
60 #include <sys/proc.h>
61 #include <sys/vnode.h>
62 #include <sys/buf.h>
63 #include <sys/sysctl.h>
64 
65 #include <machine/limits.h>
66 
67 #include <vm/vm.h>
68 #include <vm/vm_object.h>
69 
70 #include <sys/buf2.h>
71 #include <sys/thread2.h>
72 
73 #define VACT_MAX	10
74 #define VACT_INC	2
75 
76 static void vnode_terminate(struct vnode *vp);
77 
78 static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");
79 
80 /*
81  * The vnode free list holds inactive vnodes.  Aged inactive vnodes
82  * are inserted prior to the midpoint, otherwise they are inserted
83  * at the tail.
84  *
85  * The vnode code goes to great lengths to avoid moving vnodes between
86  * lists, but sometimes it is unavoidable.  For this situation we try to
87  * avoid lock contention but we do not try very hard to avoid cache line
88  * congestion.  A modestly sized hash table is used.
89  */
90 #define VLIST_PRIME2	123462047LU
91 #define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU
92 
93 #define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
94 			 VLIST_PRIME2 % (unsigned)ncpus)
95 
96 static struct vnode_index *vnode_list_hash;
97 
98 int  activevnodes = 0;
99 SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
100 	&activevnodes, 0, "Number of active nodes");
101 int  cachedvnodes = 0;
102 SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
103 	&cachedvnodes, 0, "Number of total cached nodes");
104 int  inactivevnodes = 0;
105 SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
106 	&inactivevnodes, 0, "Number of inactive nodes");
107 static int batchfreevnodes = 5;
108 SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
109 	&batchfreevnodes, 0, "Number of vnodes to free at once");
110 #ifdef TRACKVNODE
111 static u_long trackvnode;
112 SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
113 		&trackvnode, 0, "");
114 #endif
115 
116 /*
117  * Called from vfsinit()
118  */
119 void
120 vfs_lock_init(void)
121 {
122 	int i;
123 
124 	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
125 	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
126 				  M_VNODE, M_ZERO | M_WAITOK);
127 	for (i = 0; i < ncpus; ++i) {
128 		struct vnode_index *vi = &vnode_list_hash[i];
129 
130 		TAILQ_INIT(&vi->inactive_list);
131 		TAILQ_INIT(&vi->active_list);
132 		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
133 		spin_init(&vi->spin, "vfslock");
134 	}
135 }
136 
137 /*
138  * Misc functions
139  */
140 static __inline
141 void
142 _vsetflags(struct vnode *vp, int flags)
143 {
144 	atomic_set_int(&vp->v_flag, flags);
145 }
146 
147 static __inline
148 void
149 _vclrflags(struct vnode *vp, int flags)
150 {
151 	atomic_clear_int(&vp->v_flag, flags);
152 }
153 
154 void
155 vsetflags(struct vnode *vp, int flags)
156 {
157 	_vsetflags(vp, flags);
158 }
159 
160 void
161 vclrflags(struct vnode *vp, int flags)
162 {
163 	_vclrflags(vp, flags);
164 }
165 
166 /*
167  * Place the vnode on the active list.
168  *
169  * Caller must hold vp->v_spin
170  */
171 static __inline
172 void
173 _vactivate(struct vnode *vp)
174 {
175 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
176 
177 #ifdef TRACKVNODE
178 	if ((u_long)vp == trackvnode)
179 		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
180 #endif
181 	spin_lock(&vi->spin);
182 
183 	switch(vp->v_state) {
184 	case VS_ACTIVE:
185 		spin_unlock(&vi->spin);
186 		panic("_vactivate: already active");
187 		/* NOT REACHED */
188 		return;
189 	case VS_INACTIVE:
190 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
191 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
192 		break;
193 	case VS_CACHED:
194 	case VS_DYING:
195 		break;
196 	}
197 	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
198 	vp->v_state = VS_ACTIVE;
199 	spin_unlock(&vi->spin);
200 	atomic_add_int(&mycpu->gd_activevnodes, 1);
201 }
202 
203 /*
204  * Put a vnode on the inactive list.
205  *
206  * Caller must hold v_spin
207  */
208 static __inline
209 void
210 _vinactive(struct vnode *vp)
211 {
212 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
213 
214 #ifdef TRACKVNODE
215 	if ((u_long)vp == trackvnode) {
216 		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
217 		print_backtrace(-1);
218 	}
219 #endif
220 	spin_lock(&vi->spin);
221 
222 	/*
223 	 * Remove from active list if it is sitting on it
224 	 */
225 	switch(vp->v_state) {
226 	case VS_ACTIVE:
227 		TAILQ_REMOVE(&vi->active_list, vp, v_list);
228 		atomic_add_int(&mycpu->gd_activevnodes, -1);
229 		break;
230 	case VS_INACTIVE:
231 		spin_unlock(&vi->spin);
232 		panic("_vinactive: already inactive");
233 		/* NOT REACHED */
234 		return;
235 	case VS_CACHED:
236 	case VS_DYING:
237 		break;
238 	}
239 
240 	/*
241 	 * Distinguish between basically dead vnodes, vnodes with cached
242 	 * data, and vnodes without cached data.  A rover will shift the
243 	 * vnodes around as their cache status is lost.
244 	 */
245 	if (vp->v_flag & VRECLAIMED) {
246 		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
247 	} else {
248 		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
249 	}
250 	vp->v_state = VS_INACTIVE;
251 	spin_unlock(&vi->spin);
252 	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
253 }
254 
255 static __inline
256 void
257 _vinactive_tail(struct vnode *vp)
258 {
259 	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];
260 
261 	spin_lock(&vi->spin);
262 
263 	/*
264 	 * Remove from active list if it is sitting on it
265 	 */
266 	switch(vp->v_state) {
267 	case VS_ACTIVE:
268 		TAILQ_REMOVE(&vi->active_list, vp, v_list);
269 		atomic_add_int(&mycpu->gd_activevnodes, -1);
270 		break;
271 	case VS_INACTIVE:
272 		spin_unlock(&vi->spin);
273 		panic("_vinactive_tail: already inactive");
274 		/* NOT REACHED */
275 		return;
276 	case VS_CACHED:
277 	case VS_DYING:
278 		break;
279 	}
280 
281 	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
282 	vp->v_state = VS_INACTIVE;
283 	spin_unlock(&vi->spin);
284 	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
285 }
286 
287 /*
288  * Add a ref to an active vnode.  This function should never be called
289  * with an inactive vnode (use vget() instead), but may be called
290  * on a vnode in other states.
291  */
292 void
293 vref(struct vnode *vp)
294 {
295 	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
296 		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
297 	atomic_add_int(&vp->v_refcnt, 1);
298 }
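
/*
 * Usage sketch (illustrative only): vref()/vrele() form the cheap
 * ref-only pair for a vnode that is already known to be referenced
 * and active, e.g. one obtained earlier via vget().  No vnode lock
 * is involved:
 *
 *	vref(vp);		// vp known active, add another ref
 *	// ... hand vp to code that only needs the ref ...
 *	vrele(vp);		// drop the ref; normally stays ACTIVE
 */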
299 
300 void
301 synchronizevnodecount(void)
302 {
303 	int nca = 0;
304 	int act = 0;
305 	int ina = 0;
306 	int i;
307 
308 	for (i = 0; i < ncpus; ++i) {
309 		globaldata_t gd = globaldata_find(i);
310 		nca += gd->gd_cachedvnodes;
311 		act += gd->gd_activevnodes;
312 		ina += gd->gd_inactivevnodes;
313 	}
314 	cachedvnodes = nca;
315 	activevnodes = act;
316 	inactivevnodes = ina;
317 }
318 
319 /*
320  * Count the number of cached vnodes.  This is moderately expensive, so
321  * be careful not to make this call in the critical path.  Each cpu tracks
322  * its own accumulator.  The individual accumulators must be summed
323  * together to get an accurate value.
324  */
325 int
326 countcachedvnodes(void)
327 {
328 	int i;
329 	int n = 0;
330 
331 	for (i = 0; i < ncpus; ++i) {
332 		globaldata_t gd = globaldata_find(i);
333 		n += gd->gd_cachedvnodes;
334 	}
335 	return n;
336 }
337 
338 int
339 countcachedandinactivevnodes(void)
340 {
341 	int i;
342 	int n = 0;
343 
344 	for (i = 0; i < ncpus; ++i) {
345 		globaldata_t gd = globaldata_find(i);
346 		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
347 	}
348 	return n;
349 }
350 
351 /*
352  * Release a ref on an active or inactive vnode.
353  *
354  * Caller has no other requirements.
355  *
356  * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
357  * transition, otherwise we leave the vnode in the active list and
358  * do a lockless transition to 0, which is very important for the
359  * critical path.
360  *
361  * (vrele() is not called when a vnode is being destroyed w/kfree)
362  */
363 void
364 vrele(struct vnode *vp)
365 {
366 	for (;;) {
367 		int count = vp->v_refcnt;
368 		cpu_ccfence();
369 		KKASSERT((count & VREF_MASK) > 0);
370 		KKASSERT(vp->v_state == VS_ACTIVE ||
371 			 vp->v_state == VS_INACTIVE);
372 
373 		/*
374 		 * 2+ case
375 		 */
376 		if ((count & VREF_MASK) > 1) {
377 			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
378 				break;
379 			continue;
380 		}
381 
382 		/*
383 		 * 1->0 transition case must handle possible finalization.
384 		 * When finalizing we transition 1->0x40000000.  Note that
385 		 * cachedvnodes is only adjusted on transitions to ->0.
386 		 *
387 		 * WARNING! VREF_TERMINATE can be cleared at any point
388 		 *	    when the refcnt is non-zero (by vget()) and
389 		 *	    the vnode has not been reclaimed.  Thus
390 		 *	    transitions out of VREF_TERMINATE do not have
391 		 *	    to mess with cachedvnodes.
392 		 */
393 		if (count & VREF_FINALIZE) {
394 			vx_lock(vp);
395 			if (atomic_cmpset_int(&vp->v_refcnt,
396 					      count, VREF_TERMINATE)) {
397 				vnode_terminate(vp);
398 				break;
399 			}
400 			vx_unlock(vp);
401 		} else {
402 			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
403 				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
404 				break;
405 			}
406 		}
407 		/* retry */
408 	}
409 }
410 
411 /*
412  * Add an auxiliary data structure reference to the vnode.  Auxiliary
413  * references do not change the state of the vnode or prevent deactivation
414  * or reclamation of the vnode, but will prevent the vnode from being
415  * destroyed (kfree()'d).
416  *
417  * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
418  *	     already be held by the caller.  vdrop() will clean up the
419  *	     free list state.
420  */
421 void
422 vhold(struct vnode *vp)
423 {
424 	atomic_add_int(&vp->v_auxrefs, 1);
425 }
426 
427 /*
428  * Remove an auxiliary reference from the vnode.
429  */
430 void
431 vdrop(struct vnode *vp)
432 {
433 	atomic_add_int(&vp->v_auxrefs, -1);
434 }
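
/*
 * Usage sketch (illustrative only): an auxiliary reference only pins
 * the vnode structure itself; the vnode may still be deactivated or
 * reclaimed while the aux ref is held:
 *
 *	vhold(vp);		// structure cannot be kfree()'d
 *	// ... record vp in some auxiliary data structure ...
 *	vdrop(vp);		// drop the aux ref when done
 */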
435 
436 /*
437  * This function is called on the 1->0 transition (which is actually
438  * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
439  * of the vnode.
440  *
441  * Additional vrefs are allowed to race but will not result in a reentrant
442  * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
443  * prevents additional 1->0 transitions.
444  *
445  * ONLY A VGET() CAN REACTIVATE THE VNODE.
446  *
447  * Caller must hold the VX lock.
448  *
449  * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
450  *
451  * NOTE: The vnode may be marked inactive with dirty buffers
452  *	 or dirty pages in its cached VM object still present.
453  *
454  * NOTE: VS_FREE should not be set on entry (the vnode was expected to
455  *	 previously be active).  We lose control of the vnode the instant
456  *	 it is placed on the free list.
457  *
458  *	 The VX lock is required when transitioning to VS_CACHED but is
459  *	 not sufficient for the vshouldfree() interlocked test or when
460  *	 transitioning away from VS_CACHED.  v_spin is also required for
461  *	 those cases.
462  */
463 static
464 void
465 vnode_terminate(struct vnode *vp)
466 {
467 	KKASSERT(vp->v_state == VS_ACTIVE);
468 
469 	if ((vp->v_flag & VINACTIVE) == 0) {
470 		_vsetflags(vp, VINACTIVE);
471 		if (vp->v_mount)
472 			VOP_INACTIVE(vp);
473 	}
474 	spin_lock(&vp->v_spin);
475 	_vinactive(vp);
476 	spin_unlock(&vp->v_spin);
477 
478 	vx_unlock(vp);
479 }
480 
481 /****************************************************************
482  *			VX LOCKING FUNCTIONS			*
483  ****************************************************************
484  *
485  * These functions lock vnodes for reclamation and deactivation related
486  * activities.  The caller must already be holding some sort of reference
487  * on the vnode.
488  */
489 void
490 vx_lock(struct vnode *vp)
491 {
492 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
493 }
494 
495 void
496 vx_unlock(struct vnode *vp)
497 {
498 	lockmgr(&vp->v_lock, LK_RELEASE);
499 }
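
/*
 * Usage sketch (illustrative only): vx_lock()/vx_unlock() are used by
 * code that already holds some sort of reference and needs the
 * exclusive lock for deactivation or reclamation related work:
 *
 *	vx_lock(vp);		// caller already has a ref on vp
 *	// ... deactivation/reclamation related work ...
 *	vx_unlock(vp);
 */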
500 
501 /****************************************************************
502  *			VNODE ACQUISITION FUNCTIONS		*
503  ****************************************************************
504  *
505  * These functions must be used when accessing a vnode that has no
506  * chance of being destroyed in an SMP race.  That means the caller will
507  * usually either hold an auxiliary reference (such as the namecache)
508  * or hold some other lock that ensures that the vnode cannot be destroyed.
509  *
510  * These functions are MANDATORY for any code chain accessing a vnode
511  * whose activation state is not known.
512  *
513  * vget() can be called with LK_NOWAIT and will return EBUSY if the
514  * lock cannot be immediately acquired.
515  *
516  * vget()/vput() are used when reactivation is desired.
517  *
518  * vx_get() and vx_put() are used when reactivation is not desired.
519  */
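
/*
 * Usage sketch (illustrative only): a caller that cannot afford to
 * block can combine a lock type with LK_NOWAIT and handle the EBUSY
 * return:
 *
 *	error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT);
 *	if (error == EBUSY) {
 *		// lock not immediately available, retry later
 *	} else if (error == 0) {
 *		// ... vp is referenced, locked, and reactivated ...
 *		vput(vp);
 *	}
 */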
520 int
521 vget(struct vnode *vp, int flags)
522 {
523 	int error;
524 
525 	/*
526 	 * A lock type must be passed
527 	 */
528 	if ((flags & LK_TYPE_MASK) == 0) {
529 		panic("vget() called with no lock specified!");
530 		/* NOT REACHED */
531 	}
532 
533 	/*
534 	 * Reference the structure and then acquire the lock.
535 	 *
536 	 * NOTE: The requested lock might be a shared lock and does
537 	 *	 not protect our access to the refcnt or other fields.
538 	 */
539 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
540 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
541 
542 	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
543 		/*
544 		 * The lock failed, undo and return an error.  This will not
545 		 * normally trigger a termination.
546 		 */
547 		vrele(vp);
548 	} else if (vp->v_flag & VRECLAIMED) {
549 		/*
550 		 * The node is being reclaimed and cannot be reactivated
551 		 * any more, undo and return ENOENT.
552 		 */
553 		vn_unlock(vp);
554 		vrele(vp);
555 		error = ENOENT;
556 	} else if (vp->v_state == VS_ACTIVE) {
557 		/*
558 		 * A VS_ACTIVE vnode coupled with the fact that we have
559 		 * a vnode lock (even if shared) prevents v_state from
560 		 * changing.  Since the vnode is not in a VRECLAIMED state,
561 		 * we can safely clear VINACTIVE.
562 		 *
563 		 * It is possible for a shared lock to cause a race with
564 		 * another thread that is also in the process of clearing
565 		 * VREF_TERMINATE, meaning that we might return with it still
566 		 * set and then assert in a later vref().  The solution is to
567 		 * unconditionally clear VREF_TERMINATE here as well.
568 		 *
569 		 * NOTE! Multiple threads may clear VINACTIVE if this is
570 		 *	 shared lock.  This race is allowed.
571 		 */
572 		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
573 		vp->v_act += VACT_INC;
574 		if (vp->v_act > VACT_MAX)	/* SMP race ok */
575 			vp->v_act = VACT_MAX;
576 		error = 0;
577 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
578 	} else {
579 		/*
580 		 * If the vnode is not VS_ACTIVE it must be reactivated
581 		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
582 		 * is needed to manipulate the vnode's list.
583 		 *
584 		 * Because the lockmgr lock might be shared, we might race
585 		 * another reactivation, which we handle.  In this situation,
586 		 * however, the refcnt prevents other v_state races.
587 		 *
588 		 * As with above, clearing VINACTIVE is allowed to race other
589 		 * clearings of VINACTIVE.
590 		 *
591 		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
592 		 * the refcnt is non-zero and the vnode has not been
593 		 * reclaimed.  This also means that the transitions do
594 		 * not affect cachedvnodes.
595 		 *
596 		 * It is possible for a shared lock to cause a race with
597 		 * another thread that is also in the process of clearing
598 		 * VREF_TERMINATE, meaning that we might return with it still
599 		 * set and then assert in a later vref().  The solution is to
600 		 * unconditionally clear VREF_TERMINATE here as well.
601 		 */
602 		_vclrflags(vp, VINACTIVE);
603 		vp->v_act += VACT_INC;
604 		if (vp->v_act > VACT_MAX)	/* SMP race ok */
605 			vp->v_act = VACT_MAX;
606 		spin_lock(&vp->v_spin);
607 
608 		switch(vp->v_state) {
609 		case VS_INACTIVE:
610 			_vactivate(vp);
611 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
612 							VREF_FINALIZE);
613 			spin_unlock(&vp->v_spin);
614 			break;
615 		case VS_CACHED:
616 			_vactivate(vp);
617 			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
618 							VREF_FINALIZE);
619 			spin_unlock(&vp->v_spin);
620 			break;
621 		case VS_ACTIVE:
622 			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
623 							VREF_TERMINATE);
624 			spin_unlock(&vp->v_spin);
625 			break;
626 		case VS_DYING:
627 			spin_unlock(&vp->v_spin);
628 			panic("Impossible VS_DYING state");
629 			break;
630 		}
631 		error = 0;
632 	}
633 	return(error);
634 }
635 
636 #ifdef DEBUG_VPUT
637 
638 void
639 debug_vput(struct vnode *vp, const char *filename, int line)
640 {
641 	kprintf("vput(%p) %s:%d\n", vp, filename, line);
642 	vn_unlock(vp);
643 	vrele(vp);
644 }
645 
646 #else
647 
648 void
649 vput(struct vnode *vp)
650 {
651 	vn_unlock(vp);
652 	vrele(vp);
653 }
654 
655 #endif
656 
657 /*
658  * Acquire the vnode lock unguarded.
659  *
660  * The non-blocking version also uses a slightly different mechanic.
661  * This function will explicitly fail not only if it cannot acquire
662  * the lock normally, but also if the caller already holds a lock.
663  *
664  * The adjusted mechanic is used to close a loophole where complex
665  * VOP_RECLAIM code can circle around recursively and allocate the
666  * same vnode it is trying to destroy from the freelist.
667  *
668  * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
669  * cause the incorrect behavior to occur.  If not for that, lockmgr()
670  * would do the right thing.
671  *
672  * XXX The vx_*() locks should use auxrefs, not the main reference counter.
673  */
674 void
675 vx_get(struct vnode *vp)
676 {
677 	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
678 		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
679 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
680 }
681 
682 int
683 vx_get_nonblock(struct vnode *vp)
684 {
685 	int error;
686 
687 	if (lockinuse(&vp->v_lock))
688 		return(EBUSY);
689 	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
690 	if (error == 0) {
691 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
692 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
693 	}
694 	return(error);
695 }
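
/*
 * Usage sketch (illustrative only): scanning code that must not block
 * uses the non-blocking variant and simply skips busy vnodes:
 *
 *	if (vx_get_nonblock(vp) == 0) {
 *		// ... vp is referenced and exclusively locked ...
 *		vx_put(vp);
 *	} else {
 *		// lock busy (or already held), skip this vnode
 *	}
 */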
696 
697 /*
698  * Release a VX lock that also held a ref on the vnode.  vrele() will handle
699  * any needed state transitions.
700  *
701  * However, filesystems use this function to get rid of unwanted new
702  * vnodes, so try to get the vnode onto the correct queue in that case.
703  */
704 void
705 vx_put(struct vnode *vp)
706 {
707 	if (vp->v_type == VNON || vp->v_type == VBAD)
708 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
709 	lockmgr(&vp->v_lock, LK_RELEASE);
710 	vrele(vp);
711 }
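
/*
 * Usage sketch (illustrative only, with a hypothetical fs_setup()
 * helper): a filesystem that obtains a fresh vnode (e.g. via
 * allocvnode() below) but fails its own setup typically leaves v_type
 * as VNON and disposes of the vnode with vx_put(), which flags it for
 * finalization:
 *
 *	vp = allocvnode(0, 0);		// returned VX locked, refcnt 1
 *	error = fs_setup(vp);		// fs_setup() is hypothetical
 *	if (error) {
 *		vx_put(vp);		// unlock + vrele; VNON -> finalize
 *		return (error);
 *	}
 */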
712 
713 /*
714  * Try to reuse a vnode from the free list.  This function is somewhat
715  * advisory in that NULL can be returned as a normal case, even if free
716  * vnodes are present.
717  *
718  * The scan is limited because it can result in excessive CPU use during
719  * periods of extreme vnode use.
720  *
721  * NOTE: The returned vnode is not completely initialized.
722  */
723 static
724 struct vnode *
725 cleanfreevnode(int maxcount)
726 {
727 	struct vnode_index *vi;
728 	struct vnode *vp;
729 	int count;
730 	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
731 	int ri;
732 	int cpu_count;
733 
734 	/*
735 	 * Try to deactivate some vnodes cached on the active list.
736 	 */
737 	if (countcachedvnodes() < inactivevnodes)
738 		goto skip;
739 
740 	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;
741 
742 	for (count = 0; count < maxcount * 2; ++count, ++ri) {
743 		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];
744 
745 		spin_lock(&vi->spin);
746 
747 		vp = TAILQ_NEXT(&vi->active_rover, v_list);
748 		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
749 		if (vp == NULL) {
750 			TAILQ_INSERT_HEAD(&vi->active_list,
751 					  &vi->active_rover, v_list);
752 		} else {
753 			TAILQ_INSERT_AFTER(&vi->active_list, vp,
754 					   &vi->active_rover, v_list);
755 		}
756 		if (vp == NULL) {
757 			spin_unlock(&vi->spin);
758 			continue;
759 		}
760 		if ((vp->v_refcnt & VREF_MASK) != 0) {
761 			spin_unlock(&vi->spin);
762 			vp->v_act += VACT_INC;
763 			if (vp->v_act > VACT_MAX)	/* SMP race ok */
764 				vp->v_act = VACT_MAX;
765 			continue;
766 		}
767 
768 		/*
769 		 * decrement by less if the vnode's object has a lot of
770 		 * VM pages.  XXX possible SMP races.
771 		 */
772 		if (vp->v_act > 0) {
773 			vm_object_t obj;
774 			if ((obj = vp->v_object) != NULL &&
775 			    obj->resident_page_count >= trigger) {
776 				vp->v_act -= 1;
777 			} else {
778 				vp->v_act -= VACT_INC;
779 			}
780 			if (vp->v_act < 0)
781 				vp->v_act = 0;
782 			spin_unlock(&vi->spin);
783 			continue;
784 		}
785 
786 		/*
787 		 * Try to deactivate the vnode.
788 		 */
789 		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
790 			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
791 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
792 
793 		spin_unlock(&vi->spin);
794 		vrele(vp);
795 	}
796 
797 	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;
798 
799 skip:
800 	/*
801 	 * Loop trying to lock the first vnode on the free list.
802 	 * Cycle if we can't.
803 	 */
804 	cpu_count = ncpus;
805 	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;
806 
807 	for (count = 0; count < maxcount; ++count, ++ri) {
808 		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];
809 
810 		spin_lock(&vi->spin);
811 
812 		vp = TAILQ_FIRST(&vi->inactive_list);
813 		if (vp == NULL) {
814 			spin_unlock(&vi->spin);
815 			if (--cpu_count == 0)
816 				break;
817 			ri = (ri + 16) & ~15;
818 			--ri;
819 			continue;
820 		}
821 
822 		/*
823 		 * non-blocking vx_get will also ref the vnode on success.
824 		 */
825 		if (vx_get_nonblock(vp)) {
826 			KKASSERT(vp->v_state == VS_INACTIVE);
827 			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
828 			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
829 			spin_unlock(&vi->spin);
830 			continue;
831 		}
832 
833 		/*
834 		 * Because we are holding vi->spin the vnode should currently
835 		 * be inactive and VREF_TERMINATE should still be set.
836 		 *
837 		 * Once vi->spin is released the vnode's state should remain
838 		 * unmodified due to both the lock and ref on it.
839 		 */
840 		KKASSERT(vp->v_state == VS_INACTIVE);
841 		spin_unlock(&vi->spin);
842 #ifdef TRACKVNODE
843 		if ((u_long)vp == trackvnode)
844 			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
845 #endif
846 
847 		/*
848 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
849 		 * This includes namecache refs due to a related ncp being
850 		 * locked or having children, a VM object association, or
851 		 * other hold users.
852 		 *
853 		 * Do not reclaim/reuse a vnode if someone else has a real
854 		 * ref on it.  This can occur if a filesystem temporarily
855 		 * releases the vnode lock during VOP_RECLAIM.
856 		 */
857 		if (vp->v_auxrefs ||
858 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
859 failed:
860 			if (vp->v_state == VS_INACTIVE) {
861 				spin_lock(&vi->spin);
862 				if (vp->v_state == VS_INACTIVE) {
863 					TAILQ_REMOVE(&vi->inactive_list,
864 						     vp, v_list);
865 					TAILQ_INSERT_TAIL(&vi->inactive_list,
866 							  vp, v_list);
867 				}
868 				spin_unlock(&vi->spin);
869 			}
870 			vx_put(vp);
871 			continue;
872 		}
873 
874 		/*
875 		 * VINACTIVE and VREF_TERMINATE are expected to both be set
876 		 * for vnodes pulled from the inactive list, and cannot be
877 		 * changed while we hold the vx lock.
878 		 *
879 		 * Try to reclaim the vnode.
880 		 */
881 		KKASSERT(vp->v_flag & VINACTIVE);
882 		KKASSERT(vp->v_refcnt & VREF_TERMINATE);
883 
884 		if ((vp->v_flag & VRECLAIMED) == 0) {
885 			if (cache_inval_vp_nonblock(vp))
886 				goto failed;
887 			vgone_vxlocked(vp);
888 			/* vnode is still VX locked */
889 		}
890 
891 		/*
892 		 * At this point, with the inactive list locked, if there
893 		 * are no other refs or auxrefs on the vnode and we remove
894 		 * it from the inactive list, it should not be possible for
895 		 * anyone else to access the vnode any more.
896 		 *
897 		 * Since the vnode is in a VRECLAIMED state, no new
898 		 * namecache associations could have been made and the
899 		 * vnode should have already been removed from its mountlist.
900 		 *
901 		 * Since we hold a VX lock on the vnode it cannot have been
902 		 * reactivated (moved out of the inactive list).
903 		 */
904 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
905 		spin_lock(&vi->spin);
906 		if (vp->v_auxrefs ||
907 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
908 			spin_unlock(&vi->spin);
909 			goto failed;
910 		}
911 		KKASSERT(vp->v_state == VS_INACTIVE);
912 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
913 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
914 		vp->v_state = VS_DYING;
915 		spin_unlock(&vi->spin);
916 
917 		/*
918 		 * Nothing should have been able to access this vp.  Only
919 		 * our ref should remain now.
920 		 */
921 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
922 		KASSERT(vp->v_refcnt == 1,
923 			("vp %p badrefs %08x", vp, vp->v_refcnt));
924 
925 		/*
926 		 * Return a VX locked vnode suitable for reuse.
927 		 */
928 		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
929 		return(vp);
930 	}
931 	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
932 	return(NULL);
933 }
934 
935 /*
936  * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
937  *
938  * All new vnodes set the VAGE flags.  An open() of the vnode will
939  * decrement the (2-bit) flags.  Vnodes which are opened several times
940  * are thus retained in the cache over vnodes which are merely stat()d.
941  *
942  * We attempt to reuse an already-recycled vnode from our pcpu inactive
943  * queue first, and allocate otherwise.  Attempting to recycle inactive
944  * vnodes here can lead to numerous deadlocks, particularly with
945  * softupdates.
946  */
947 struct vnode *
948 allocvnode(int lktimeout, int lkflags)
949 {
950 	struct vnode *vp;
951 	struct vnode_index *vi;
952 
953 	/*
954 	 * lktimeout only applies when LK_TIMELOCK is used, and only
955 	 * the pageout daemon uses it.  The timeout may not be zero
956 	 * or the pageout daemon can deadlock in low-VM situations.
957 	 */
958 	if (lktimeout == 0)
959 		lktimeout = hz / 10;
960 
961 	/*
962 	 * Do not flag for synchronous recycling unless there are enough
963 	 * freeable vnodes to recycle and the number of vnodes has
964 	 * significantly exceeded our target.  We want the normal vnlru
965 	 * process to handle the cleaning (at 9/10's) before we are forced
966 	 * to flag it here at 11/10's for userexit path processing.
967 	 */
968 	if (numvnodes >= maxvnodes * 11 / 10 &&
969 	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
970 		struct thread *td = curthread;
971 		if (td->td_lwp)
972 			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
973 	}
974 
975 	/*
976 	 * Try to trivially reuse a reclaimed vnode from the head of the
977 	 * inactive list for this cpu.  Any vnode cycling which terminates
978 	 * the vnode will cause it to be returned to the
979 	 * same pcpu structure (e.g. unlink calls).
980 	 */
981 	vi = &vnode_list_hash[mycpuid];
982 	spin_lock(&vi->spin);
983 
984 	vp = TAILQ_FIRST(&vi->inactive_list);
985 	if (vp && (vp->v_flag & VRECLAIMED)) {
986 		/*
987 		 * non-blocking vx_get will also ref the vnode on success.
988 		 */
989 		if (vx_get_nonblock(vp)) {
990 			KKASSERT(vp->v_state == VS_INACTIVE);
991 			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
992 			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
993 			spin_unlock(&vi->spin);
994 			goto slower;
995 		}
996 
997 		/*
998 		 * Because we are holding vi->spin the vnode should currently
999 		 * be inactive and VREF_TERMINATE should still be set.
1000 		 *
1001 		 * Once vi->spin is released the vnode's state should remain
1002 		 * unmodified due to both the lock and ref on it.
1003 		 */
1004 		KKASSERT(vp->v_state == VS_INACTIVE);
1005 #ifdef TRACKVNODE
1006 		if ((u_long)vp == trackvnode)
1007 			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
1008 #endif
1009 
1010 		/*
1011 		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
1012 		 * This includes namecache refs due to a related ncp being
1013 		 * locked or having children, a VM object association, or
1014 		 * other hold users.
1015 		 *
1016 		 * Do not reclaim/reuse a vnode if someone else has a real
1017 		 * ref on it.  This can occur if a filesystem temporarily
1018 		 * releases the vnode lock during VOP_RECLAIM.
1019 		 */
1020 		if (vp->v_auxrefs ||
1021 		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
1022 			if (vp->v_state == VS_INACTIVE) {
1023 				TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
1024 				TAILQ_INSERT_TAIL(&vi->inactive_list,
1025 						  vp, v_list);
1026 			}
1030 			spin_unlock(&vi->spin);
1031 			vx_put(vp);
1032 			goto slower;
1033 		}
1034 
1035 		/*
1036 		 * VINACTIVE and VREF_TERMINATE are expected to both be set
1037 		 * for vnodes pulled from the inactive list, and cannot be
1038 		 * changed while we hold the vx lock.
1039 		 *
1040 		 * Try to reclaim the vnode.
1041 		 */
1042 		KKASSERT(vp->v_flag & VINACTIVE);
1043 		KKASSERT(vp->v_refcnt & VREF_TERMINATE);
1044 
1045 		if ((vp->v_flag & VRECLAIMED) == 0) {
1046 			spin_unlock(&vi->spin);
1047 			vx_put(vp);
1048 			goto slower;
1049 		}
1050 
1051 		/*
1052 		 * At this point, with the inactive list locked, if there
1053 		 * are no other refs or auxrefs on the vnode and we remove
1054 		 * it from the inactive list, it should not be possible for
1055 		 * anyone else to access the vnode any more.
1056 		 *
1057 		 * Since the vnode is in a VRECLAIMED state, no new
1058 		 * namecache associations could have been made and the
1059 		 * vnode should have already been removed from its mountlist.
1060 		 *
1061 		 * Since we hold a VX lock on the vnode it cannot have been
1062 		 * reactivated (moved out of the inactive list).
1063 		 */
1064 		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
1065 		KKASSERT(vp->v_state == VS_INACTIVE);
1066 		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
1067 		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
1068 		vp->v_state = VS_DYING;
1069 		spin_unlock(&vi->spin);
1070 
1071 		/*
1072 		 * Nothing should have been able to access this vp.  Only
1073 		 * our ref should remain now.
1074 		 *
1075 		 * At this point we can kfree() the vnode if we want to.
1076 		 * Instead, we reuse it for the allocation.
1077 		 */
1078 		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
1079 		KASSERT(vp->v_refcnt == 1,
1080 			("vp %p badrefs %08x", vp, vp->v_refcnt));
1081 		bzero(vp, sizeof(*vp));
1082 	} else {
1083 		spin_unlock(&vi->spin);
1084 slower:
1085 		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
1086 		atomic_add_int(&numvnodes, 1);
1087 	}
1088 
1089 	lwkt_token_init(&vp->v_token, "vnode");
1090 	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
1091 	TAILQ_INIT(&vp->v_namecache);
1092 	RB_INIT(&vp->v_rbclean_tree);
1093 	RB_INIT(&vp->v_rbdirty_tree);
1094 	RB_INIT(&vp->v_rbhash_tree);
1095 	spin_init(&vp->v_spin, "allocvnode");
1096 
1097 	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
1098 	vp->v_refcnt = 1;
1099 	vp->v_flag = VAGE0 | VAGE1;
1100 	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
1101 
1102 	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
1103 	/* exclusive lock still held */
1104 
1105 	vp->v_filesize = NOOFFSET;
1106 	vp->v_type = VNON;
1107 	vp->v_tag = 0;
1108 	vp->v_state = VS_CACHED;
1109 	_vactivate(vp);
1110 
1111 	return (vp);
1112 }
1113 
1114 /*
1115  * Called after a process has allocated a vnode via allocvnode()
1116  * and we detected that too many vnodes were present.
1117  *
1118  * This function is called just prior to a return to userland if the
1119  * process at some point had to allocate a new vnode during the last
1120  * system call and the vnode count was found to be excessive.
1121  *
1122  * This is a synchronous path that we do not normally want to execute.
1123  *
1124  * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
1125  *
1126  * WARNING: Sometimes numvnodes can blow out due to children being
1127  *	    present under directory vnodes in the namecache.  For the
1128  *	    moment we use an if() instead of a while(); note that even
1129  *	    with a while() we would still have to break out if
1130  *	    freesomevnodes() returned 0.  vnlru will also be trying
1131  *	    hard to free vnodes at the same time (with a lower trigger
1132  *	    point).
1133  */
1134 void
1135 allocvnode_gc(void)
1136 {
1137 	if (numvnodes >= maxvnodes &&
1138 	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
1139 		freesomevnodes(batchfreevnodes);
1140 	}
1141 }
1142 
1143 int
1144 freesomevnodes(int n)
1145 {
1146 	struct vnode *vp;
1147 	int count = 0;
1148 
1149 	while (n) {
1150 		if ((vp = cleanfreevnode(n)) == NULL)
1151 			break;
1152 		vx_unlock(vp);
1153 		--n;
1154 		++count;
1155 		kfree(vp, M_VNODE);
1156 		atomic_add_int(&numvnodes, -1);
1157 	}
1158 	return(count);
1159 }
1160