/* /dragonfly/sys/kern/vfs_lock.c (revision 3c7e5806) */
/*
 * Copyright (c) 2004,2013-2017 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * External lock/ref-related vnode functions
 *
 * vs_state transition locking requirements:
 *
 *	INACTIVE -> CACHED|DYING	vx_lock(excl) + vi->spin
 *	DYING    -> CACHED		vx_lock(excl)
 *	ACTIVE   -> INACTIVE		(none)       + v_spin + vi->spin
 *	INACTIVE -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *	CACHED   -> ACTIVE		vn_lock(any) + v_spin + vi->spin
 *
 * NOTE: Switching to/from ACTIVE/INACTIVE requires v_spin and vi->spin.
 *
 *	 Switching into ACTIVE also requires a vref and vnode lock, however
 *	 the vnode lock is allowed to be SHARED.
 *
 *	 Switching into a CACHED or DYING state requires an exclusive vnode
 *	 lock or vx_lock (which is almost the same thing).
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/sysctl.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

#include <sys/buf2.h>

#define VACT_MAX	10
#define VACT_INC	2

static void vnode_terminate(struct vnode *vp);

static MALLOC_DEFINE(M_VNODE, "vnodes", "vnode structures");

/*
 * The vnode free list holds inactive vnodes.  Aged inactive vnodes
 * are inserted prior to the mid point, and otherwise inserted
 * at the tail.
 *
 * The vnode code goes to great lengths to avoid moving vnodes between
 * lists, but sometimes it is unavoidable.  For this situation we try to
 * avoid lock contention but we do not try very hard to avoid cache line
 * congestion.  A modestly sized hash table is used.
 */
#define VLIST_PRIME2	123462047LU
#define VLIST_XOR	(uintptr_t)0xab4582fa8322fb71LLU

#define VLIST_HASH(vp)	(((uintptr_t)vp ^ VLIST_XOR) % \
			 VLIST_PRIME2 % (unsigned)ncpus)
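
/*
 * Editor's illustrative sketch (not part of the original file): shows how
 * VLIST_HASH() maps a vnode pointer onto one of the ncpus per-cpu
 * vnode_index buckets, mirroring the lookups done in _vactivate() and
 * _vinactive() below.  The helper name is hypothetical and the block is
 * disabled with #if 0; it is only a reading aid.
 */
#if 0
static struct vnode_index *
example_vnode_index(struct vnode *vp)
{
	/* Hash the pointer, then reduce modulo the number of cpus */
	return (&vnode_list_hash[VLIST_HASH(vp)]);
}
#endif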

static struct vnode_index *vnode_list_hash;

int  activevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, activevnodes, CTLFLAG_RD,
	&activevnodes, 0, "Number of active vnodes");
int  cachedvnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, cachedvnodes, CTLFLAG_RD,
	&cachedvnodes, 0, "Number of total cached vnodes");
int  inactivevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, inactivevnodes, CTLFLAG_RD,
	&inactivevnodes, 0, "Number of inactive vnodes");
static int batchfreevnodes = 5;
SYSCTL_INT(_debug, OID_AUTO, batchfreevnodes, CTLFLAG_RW,
	&batchfreevnodes, 0, "Number of vnodes to free at once");
#ifdef TRACKVNODE
static u_long trackvnode;
SYSCTL_ULONG(_debug, OID_AUTO, trackvnode, CTLFLAG_RW,
		&trackvnode, 0, "");
#endif

/*
 * Called from vfsinit()
 */
void
vfs_lock_init(void)
{
	int i;

	kmalloc_raise_limit(M_VNODE, 0);	/* unlimited */
	vnode_list_hash = kmalloc(sizeof(*vnode_list_hash) * ncpus,
				  M_VNODE, M_ZERO | M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		struct vnode_index *vi = &vnode_list_hash[i];

		TAILQ_INIT(&vi->inactive_list);
		TAILQ_INIT(&vi->active_list);
		TAILQ_INSERT_TAIL(&vi->active_list, &vi->active_rover, v_list);
		spin_init(&vi->spin, "vfslock");
	}
}

/*
 * Misc functions
 */
static __inline
void
_vsetflags(struct vnode *vp, int flags)
{
	atomic_set_int(&vp->v_flag, flags);
}

static __inline
void
_vclrflags(struct vnode *vp, int flags)
{
	atomic_clear_int(&vp->v_flag, flags);
}

void
vsetflags(struct vnode *vp, int flags)
{
	_vsetflags(vp, flags);
}

void
vclrflags(struct vnode *vp, int flags)
{
	_vclrflags(vp, flags);
}

/*
 * Place the vnode on the active list.
 *
 * Caller must hold vp->v_spin
 */
static __inline
void
_vactivate(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode)
		kprintf("_vactivate %p %08x\n", vp, vp->v_flag);
#endif
	spin_lock(&vi->spin);

	switch(vp->v_state) {
	case VS_ACTIVE:
		spin_unlock(&vi->spin);
		panic("_vactivate: already active");
		/* NOT REACHED */
		return;
	case VS_INACTIVE:
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		break;
	case VS_CACHED:
	case VS_DYING:
		break;
	}
	TAILQ_INSERT_TAIL(&vi->active_list, vp, v_list);
	vp->v_state = VS_ACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_activevnodes, 1);
}

/*
 * Put a vnode on the inactive list.
 *
 * Caller must hold v_spin
 */
static __inline
void
_vinactive(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

#ifdef TRACKVNODE
	if ((u_long)vp == trackvnode) {
		kprintf("_vinactive %p %08x\n", vp, vp->v_flag);
		print_backtrace(-1);
	}
#endif
	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	/*
	 * Distinguish between basically dead vnodes, vnodes with cached
	 * data, and vnodes without cached data.  A rover will shift the
	 * vnodes around as their cache status is lost.
	 */
	if (vp->v_flag & VRECLAIMED) {
		TAILQ_INSERT_HEAD(&vi->inactive_list, vp, v_list);
	} else {
		TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	}
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

static __inline
void
_vinactive_tail(struct vnode *vp)
{
	struct vnode_index *vi = &vnode_list_hash[VLIST_HASH(vp)];

	spin_lock(&vi->spin);

	/*
	 * Remove from active list if it is sitting on it
	 */
	switch(vp->v_state) {
	case VS_ACTIVE:
		TAILQ_REMOVE(&vi->active_list, vp, v_list);
		atomic_add_int(&mycpu->gd_activevnodes, -1);
		break;
	case VS_INACTIVE:
		spin_unlock(&vi->spin);
		panic("_vinactive_tail: already inactive");
		/* NOT REACHED */
		return;
	case VS_CACHED:
	case VS_DYING:
		break;
	}

	TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
	vp->v_state = VS_INACTIVE;
	spin_unlock(&vi->spin);
	atomic_add_int(&mycpu->gd_inactivevnodes, 1);
}

/*
 * Add a ref to an active vnode.  This function should never be called
 * with an inactive vnode (use vget() instead), but might be called
 * with other states.
 */
void
vref(struct vnode *vp)
{
	KASSERT((VREFCNT(vp) > 0 && vp->v_state != VS_INACTIVE),
		("vref: bad refcnt %08x %d", vp->v_refcnt, vp->v_state));
	atomic_add_int(&vp->v_refcnt, 1);
}
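
/*
 * Editor's illustrative sketch (not part of the original file): vref() is
 * only legal when the caller already knows the vnode is referenced and not
 * inactive; a vnode of unknown state must be (re)acquired with vget().
 * The helper name and the LK_SHARED lock type used here are assumptions
 * for the sketch; the block is disabled with #if 0.
 */
#if 0
static void
example_ref_patterns(struct vnode *held_vp, struct vnode *unknown_vp)
{
	/* Already holding a real ref: a second ref is plain vref()/vrele() */
	vref(held_vp);
	/* ... use held_vp ... */
	vrele(held_vp);

	/* Unknown activation state: must go through vget()/vput() */
	if (vget(unknown_vp, LK_SHARED) == 0) {
		/* ... use unknown_vp with the shared vnode lock held ... */
		vput(unknown_vp);
	}
}
#endif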

void
synchronizevnodecount(void)
{
	int nca = 0;
	int act = 0;
	int ina = 0;
	int i;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		nca += gd->gd_cachedvnodes;
		act += gd->gd_activevnodes;
		ina += gd->gd_inactivevnodes;
	}
	cachedvnodes = nca;
	activevnodes = act;
	inactivevnodes = ina;
}

/*
 * Count number of cached vnodes.  This is middling expensive so be
 * careful not to make this call in the critical path.  Each cpu tracks
 * its own accumulator.  The individual accumulators must be summed
 * together to get an accurate value.
 */
int
countcachedvnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes;
	}
	return n;
}

int
countcachedandinactivevnodes(void)
{
	int i;
	int n = 0;

	for (i = 0; i < ncpus; ++i) {
		globaldata_t gd = globaldata_find(i);
		n += gd->gd_cachedvnodes + gd->gd_inactivevnodes;
	}
	return n;
}

/*
 * Release a ref on an active or inactive vnode.
 *
 * Caller has no other requirements.
 *
 * If VREF_FINALIZE is set this will deactivate the vnode on the 1->0
 * transition, otherwise we leave the vnode in the active list and
 * do a lockless transition to 0, which is very important for the
 * critical path.
 *
 * (vrele() is not called when a vnode is being destroyed w/kfree)
 */
void
vrele(struct vnode *vp)
{
	for (;;) {
		int count = vp->v_refcnt;
		cpu_ccfence();
		KKASSERT((count & VREF_MASK) > 0);
		KKASSERT(vp->v_state == VS_ACTIVE ||
			 vp->v_state == VS_INACTIVE);

		/*
		 * 2+ case
		 */
		if ((count & VREF_MASK) > 1) {
			if (atomic_cmpset_int(&vp->v_refcnt, count, count - 1))
				break;
			continue;
		}

		/*
		 * 1->0 transition case must handle possible finalization.
		 * When finalizing we transition 1->0x40000000.  Note that
		 * cachedvnodes is only adjusted on transitions to ->0.
		 *
		 * WARNING! VREF_TERMINATE can be cleared at any point
		 *	    when the refcnt is non-zero (by vget()) and
		 *	    the vnode has not been reclaimed.  Thus
		 *	    transitions out of VREF_TERMINATE do not have
		 *	    to mess with cachedvnodes.
		 */
		if (count & VREF_FINALIZE) {
			vx_lock(vp);
			if (atomic_cmpset_int(&vp->v_refcnt,
					      count, VREF_TERMINATE)) {
				vnode_terminate(vp);
				break;
			}
			vx_unlock(vp);
		} else {
			if (atomic_cmpset_int(&vp->v_refcnt, count, 0)) {
				atomic_add_int(&mycpu->gd_cachedvnodes, 1);
				break;
			}
		}
		/* retry */
	}
}
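
/*
 * Editor's illustrative sketch (not part of the original file): a caller
 * that wants its final release to force deactivation sets VREF_FINALIZE
 * before the vrele(), the same pattern vx_put() and cleanfreevnode() use
 * later in this file.  The helper name is hypothetical; the block is
 * disabled with #if 0.
 */
#if 0
static void
example_release_and_deactivate(struct vnode *vp)
{
	atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	vrele(vp);	/* on the 1->0 transition this runs vnode_terminate() */
}
#endif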

/*
 * Add an auxiliary data structure reference to the vnode.  Auxiliary
 * references do not change the state of the vnode or prevent deactivation
 * or reclamation of the vnode, but will prevent the vnode from being
 * destroyed (kfree()'d).
 *
 * WARNING!  vhold() must not acquire v_spin.  The spinlock may or may not
 *	     already be held by the caller.  vdrop() will clean up the
 *	     free list state.
 */
void
vhold(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, 1);
}

/*
 * Remove an auxiliary reference from the vnode.
 */
void
vdrop(struct vnode *vp)
{
	atomic_add_int(&vp->v_auxrefs, -1);
}
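
/*
 * Editor's illustrative sketch (not part of the original file): an
 * auxiliary structure caching a vnode pointer wraps it with vhold()/vdrop()
 * so the vnode cannot be kfree()'d out from under it, while deactivation
 * and reclamation remain possible.  The structure and helper names are
 * hypothetical; the block is disabled with #if 0.
 */
#if 0
struct example_vnode_cache {
	struct vnode *vc_vp;
};

static void
example_vnode_cache_set(struct example_vnode_cache *vc, struct vnode *vp)
{
	vhold(vp);		/* prevents destruction, not reclamation */
	vc->vc_vp = vp;
}

static void
example_vnode_cache_clear(struct example_vnode_cache *vc)
{
	vdrop(vc->vc_vp);
	vc->vc_vp = NULL;
}
#endif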

/*
 * This function is called on the 1->0 transition (which is actually
 * 1->VREF_TERMINATE) when VREF_FINALIZE is set, forcing deactivation
 * of the vnode.
 *
 * Additional vrefs are allowed to race but will not result in a reentrant
 * call to vnode_terminate() due to refcnt being VREF_TERMINATE.  This
 * prevents additional 1->0 transitions.
 *
 * ONLY A VGET() CAN REACTIVATE THE VNODE.
 *
 * Caller must hold the VX lock.
 *
 * NOTE: v_mount may be NULL due to assignment to dead_vnode_vops
 *
 * NOTE: The vnode may be marked inactive with dirty buffers
 *	 or dirty pages in its cached VM object still present.
 *
 * NOTE: VS_FREE should not be set on entry (the vnode was expected to
 *	 previously be active).  We lose control of the vnode the instant
 *	 it is placed on the free list.
 *
 *	 The VX lock is required when transitioning to VS_CACHED but is
 *	 not sufficient for the vshouldfree() interlocked test or when
 *	 transitioning away from VS_CACHED.  v_spin is also required for
 *	 those cases.
 */
static
void
vnode_terminate(struct vnode *vp)
{
	KKASSERT(vp->v_state == VS_ACTIVE);

	if ((vp->v_flag & VINACTIVE) == 0) {
		_vsetflags(vp, VINACTIVE);
		if (vp->v_mount)
			VOP_INACTIVE(vp);
	}
	spin_lock(&vp->v_spin);
	_vinactive(vp);
	spin_unlock(&vp->v_spin);

	vx_unlock(vp);
}

/****************************************************************
 *			VX LOCKING FUNCTIONS			*
 ****************************************************************
 *
 * These functions lock vnodes for reclamation and deactivation related
 * activities.  The caller must already be holding some sort of reference
 * on the vnode.
 */
void
vx_lock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

void
vx_unlock(struct vnode *vp)
{
	lockmgr(&vp->v_lock, LK_RELEASE);
}

/****************************************************************
 *			VNODE ACQUISITION FUNCTIONS		*
 ****************************************************************
 *
 * These functions must be used when accessing a vnode that has no
 * chance of being destroyed in a SMP race.  That means the caller will
 * usually either hold an auxiliary reference (such as the namecache)
 * or hold some other lock that ensures that the vnode cannot be destroyed.
 *
 * These functions are MANDATORY for any code chain accessing a vnode
 * whose activation state is not known.
 *
 * vget() can be called with LK_NOWAIT and will return EBUSY if the
 * lock cannot be immediately acquired.
 *
 * vget()/vput() are used when reactivation is desired.
 *
 * vx_get() and vx_put() are used when reactivation is not desired.
 */
int
vget(struct vnode *vp, int flags)
{
	int error;

	/*
	 * A lock type must be passed
	 */
	if ((flags & LK_TYPE_MASK) == 0) {
		panic("vget() called with no lock specified!");
		/* NOT REACHED */
	}

	/*
	 * Reference the structure and then acquire the lock.
	 *
	 * NOTE: The requested lock might be a shared lock and does
	 *	 not protect our access to the refcnt or other fields.
	 */
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);

	if ((error = vn_lock(vp, flags | LK_FAILRECLAIM)) != 0) {
		/*
		 * The lock failed, undo and return an error.  This will not
		 * normally trigger a termination.
		 */
		vrele(vp);
	} else if (vp->v_flag & VRECLAIMED) {
		/*
		 * The node is being reclaimed and cannot be reactivated
		 * any more, undo and return ENOENT.
		 */
		vn_unlock(vp);
		vrele(vp);
		error = ENOENT;
	} else if (vp->v_state == VS_ACTIVE) {
		/*
		 * A VS_ACTIVE vnode coupled with the fact that we have
		 * a vnode lock (even if shared) prevents v_state from
		 * changing.  Since the vnode is not in a VRECLAIMED state,
		 * we can safely clear VINACTIVE.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 *
		 * NOTE! Multiple threads may clear VINACTIVE if this is
		 *	 a shared lock.  This race is allowed.
		 */
		_vclrflags(vp, VINACTIVE);	/* SMP race ok */
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		error = 0;
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE);
	} else {
		/*
		 * If the vnode is not VS_ACTIVE it must be reactivated
		 * in addition to clearing VINACTIVE.  An exclusive spin_lock
		 * is needed to manipulate the vnode's list.
		 *
		 * Because the lockmgr lock might be shared, we might race
		 * another reactivation, which we handle.  In this situation,
		 * however, the refcnt prevents other v_state races.
		 *
		 * As with above, clearing VINACTIVE is allowed to race other
		 * clearings of VINACTIVE.
		 *
		 * VREF_TERMINATE and VREF_FINALIZE can only be cleared when
		 * the refcnt is non-zero and the vnode has not been
		 * reclaimed.  This also means that the transitions do
		 * not affect cachedvnodes.
		 *
		 * It is possible for a shared lock to cause a race with
		 * another thread that is also in the process of clearing
		 * VREF_TERMINATE, meaning that we might return with it still
		 * set and then assert in a later vref().  The solution is to
		 * unconditionally clear VREF_TERMINATE here as well.
		 */
		_vclrflags(vp, VINACTIVE);
		vp->v_act += VACT_INC;
		if (vp->v_act > VACT_MAX)	/* SMP race ok */
			vp->v_act = VACT_MAX;
		spin_lock(&vp->v_spin);

		switch(vp->v_state) {
		case VS_INACTIVE:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_CACHED:
			_vactivate(vp);
			atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE |
							VREF_FINALIZE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_ACTIVE:
			atomic_clear_int(&vp->v_refcnt, VREF_FINALIZE |
							VREF_TERMINATE);
			spin_unlock(&vp->v_spin);
			break;
		case VS_DYING:
			spin_unlock(&vp->v_spin);
			panic("Impossible VS_DYING state");
			break;
		}
		error = 0;
	}
	return(error);
}

#ifdef DEBUG_VPUT

void
debug_vput(struct vnode *vp, const char *filename, int line)
{
	kprintf("vput(%p) %s:%d\n", vp, filename, line);
	vn_unlock(vp);
	vrele(vp);
}

#else

void
vput(struct vnode *vp)
{
	vn_unlock(vp);
	vrele(vp);
}

#endif
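
/*
 * Editor's illustrative sketch (not part of the original file): the normal
 * consumer pattern for a vnode whose activation state is unknown is
 * vget()/vput().  LK_EXCLUSIVE is the usual lockmgr lock type passed in;
 * the helper name is hypothetical and the block is disabled with #if 0.
 */
#if 0
static int
example_use_vnode(struct vnode *vp)
{
	int error;

	error = vget(vp, LK_EXCLUSIVE);	/* ref + lock + reactivate */
	if (error)
		return (error);
	/* ... operate on the locked, active vnode ... */
	vput(vp);			/* unlock and drop the ref */
	return (0);
}
#endif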

/*
 * Acquire the vnode lock unguarded.
 *
 * The non-blocking version also uses a slightly different mechanic.
 * This function will explicitly fail not only if it cannot acquire
 * the lock normally, but also if the caller already holds a lock.
 *
 * The adjusted mechanic is used to close a loophole where complex
 * VOP_RECLAIM code can circle around recursively and allocate the
 * same vnode it is trying to destroy from the freelist.
 *
 * Any filesystem (e.g. UFS) which puts LK_CANRECURSE in lk_flags can
 * cause the incorrect behavior to occur.  If not for that, lockmgr()
 * would do the right thing.
 *
 * XXX The vx_*() locks should use auxrefs, not the main reference counter.
 */
void
vx_get(struct vnode *vp)
{
	if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
		atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
}

int
vx_get_nonblock(struct vnode *vp)
{
	int error;

	if (lockinuse(&vp->v_lock))
		return(EBUSY);
	error = lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT);
	if (error == 0) {
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
	}
	return(error);
}

/*
 * Release a VX lock that also held a ref on the vnode.  vrele() will handle
 * any needed state transitions.
 *
 * However, filesystems use this function to get rid of unwanted new vnodes
 * so try to get the vnode on the correct queue in that case.
 */
void
vx_put(struct vnode *vp)
{
	if (vp->v_type == VNON || vp->v_type == VBAD)
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
	lockmgr(&vp->v_lock, LK_RELEASE);
	vrele(vp);
}

/*
 * Try to reuse a vnode from the free list.  This function is somewhat
 * advisory in that NULL can be returned as a normal case, even if free
 * vnodes are present.
 *
 * The scan is limited because it can result in excessive CPU use during
 * periods of extreme vnode use.
 *
 * NOTE: The returned vnode is not completely initialized.
 */
static
struct vnode *
cleanfreevnode(int maxcount)
{
	struct vnode_index *vi;
	struct vnode *vp;
	int count;
	int trigger = (long)vmstats.v_page_count / (activevnodes * 2 + 1);
	int ri;
	int cpu_count;

	/*
	 * Try to deactivate some vnodes cached on the active list.
	 */
	if (countcachedvnodes() < inactivevnodes)
		goto skip;

	ri = vnode_list_hash[mycpu->gd_cpuid].deac_rover + 1;

	for (count = 0; count < maxcount * 2; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_NEXT(&vi->active_rover, v_list);
		TAILQ_REMOVE(&vi->active_list, &vi->active_rover, v_list);
		if (vp == NULL) {
			TAILQ_INSERT_HEAD(&vi->active_list,
					  &vi->active_rover, v_list);
		} else {
			TAILQ_INSERT_AFTER(&vi->active_list, vp,
					   &vi->active_rover, v_list);
		}
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			continue;
		}
		if ((vp->v_refcnt & VREF_MASK) != 0) {
			spin_unlock(&vi->spin);
			vp->v_act += VACT_INC;
			if (vp->v_act > VACT_MAX)	/* SMP race ok */
				vp->v_act = VACT_MAX;
			continue;
		}

		/*
		 * decrement by less if the vnode's object has a lot of
		 * VM pages.  XXX possible SMP races.
		 */
		if (vp->v_act > 0) {
			vm_object_t obj;
			if ((obj = vp->v_object) != NULL &&
			    obj->resident_page_count >= trigger) {
				vp->v_act -= 1;
			} else {
				vp->v_act -= VACT_INC;
			}
			if (vp->v_act < 0)
				vp->v_act = 0;
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Try to deactivate the vnode.
		 */
		if ((atomic_fetchadd_int(&vp->v_refcnt, 1) & VREF_MASK) == 0)
			atomic_add_int(&mycpu->gd_cachedvnodes, -1);
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);

		spin_unlock(&vi->spin);
		vrele(vp);
	}

	vnode_list_hash[mycpu->gd_cpuid].deac_rover = ri;

skip:
	/*
	 * Loop trying to lock the first vnode on the free list.
	 * Cycle if we can't.
	 */
	cpu_count = ncpus;
	ri = vnode_list_hash[mycpu->gd_cpuid].free_rover + 1;

	for (count = 0; count < maxcount; ++count, ++ri) {
		vi = &vnode_list_hash[((unsigned)ri >> 4) % ncpus];

		spin_lock(&vi->spin);

		vp = TAILQ_FIRST(&vi->inactive_list);
		if (vp == NULL) {
			spin_unlock(&vi->spin);
			if (--cpu_count == 0)
				break;
			ri = (ri + 16) & ~15;
			--ri;
			continue;
		}

		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			continue;
		}

		/*
		 * Because we are holding vi->spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vi->spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
		spin_unlock(&vi->spin);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("cleanfreevnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
failed:
			if (vp->v_state == VS_INACTIVE) {
				spin_lock(&vi->spin);
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
				spin_unlock(&vi->spin);
			}
			vx_put(vp);
			continue;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			if (cache_inval_vp_nonblock(vp))
				goto failed;
			vgone_vxlocked(vp);
			/* vnode is still VX locked */
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		spin_lock(&vi->spin);
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			spin_unlock(&vi->spin);
			goto failed;
		}
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));

		/*
		 * Return a VX locked vnode suitable for reuse.
		 */
		vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
		return(vp);
	}
	vnode_list_hash[mycpu->gd_cpuid].free_rover = ri;
	return(NULL);
}

/*
 * Obtain a new vnode.  The returned vnode is VX locked & vrefd.
 *
 * All new vnodes set the VAGE flags.  An open() of the vnode will
 * decrement the (2-bit) flags.  Vnodes which are opened several times
 * are thus retained in the cache over vnodes which are merely stat()d.
 *
 * We attempt to reuse an already-recycled vnode from our pcpu inactive
 * queue first, and allocate otherwise.  Attempting to recycle inactive
 * vnodes here can lead to numerous deadlocks, particularly with
 * softupdates.
 */
struct vnode *
allocvnode(int lktimeout, int lkflags)
{
	struct vnode *vp;
	struct vnode_index *vi;

	/*
	 * lktimeout only applies when LK_TIMELOCK is used, and only
	 * the pageout daemon uses it.  The timeout may not be zero
	 * or the pageout daemon can deadlock in low-VM situations.
	 */
	if (lktimeout == 0)
		lktimeout = hz / 10;

	/*
	 * Do not flag for synchronous recycling unless there are enough
	 * freeable vnodes to recycle and the number of vnodes has
	 * significantly exceeded our target.  We want the normal vnlru
	 * process to handle the cleaning (at 9/10's) before we are forced
	 * to flag it here at 11/10's for userexit path processing.
	 */
	if (numvnodes >= maxvnodes * 11 / 10 &&
	    cachedvnodes + inactivevnodes >= maxvnodes * 5 / 10) {
		struct thread *td = curthread;
		if (td->td_lwp)
			atomic_set_int(&td->td_lwp->lwp_mpflags, LWP_MP_VNLRU);
	}

	/*
	 * Try to trivially reuse a reclaimed vnode from the head of the
	 * inactive list for this cpu.  Any vnode cycling which terminates
	 * the vnode will cause it to be returned to the same pcpu
	 * structure (e.g. unlink calls).
	 */
	vi = &vnode_list_hash[mycpuid];
	spin_lock(&vi->spin);

	vp = TAILQ_FIRST(&vi->inactive_list);
	if (vp && (vp->v_flag & VRECLAIMED)) {
		/*
		 * non-blocking vx_get will also ref the vnode on success.
		 */
		if (vx_get_nonblock(vp)) {
			KKASSERT(vp->v_state == VS_INACTIVE);
			TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
			TAILQ_INSERT_TAIL(&vi->inactive_list, vp, v_list);
			spin_unlock(&vi->spin);
			goto slower;
		}

		/*
		 * Because we are holding vi->spin the vnode should currently
		 * be inactive and VREF_TERMINATE should still be set.
		 *
		 * Once vi->spin is released the vnode's state should remain
		 * unmodified due to both the lock and ref on it.
		 */
		KKASSERT(vp->v_state == VS_INACTIVE);
#ifdef TRACKVNODE
		if ((u_long)vp == trackvnode)
			kprintf("allocvnode %p %08x\n", vp, vp->v_flag);
#endif

		/*
		 * Do not reclaim/reuse a vnode while auxiliary refs exist.
		 * This includes namecache refs due to a related ncp being
		 * locked or having children, a VM object association, or
		 * other hold users.
		 *
		 * Do not reclaim/reuse a vnode if someone else has a real
		 * ref on it.  This can occur if a filesystem temporarily
		 * releases the vnode lock during VOP_RECLAIM.
		 */
		if (vp->v_auxrefs ||
		    (vp->v_refcnt & ~VREF_FINALIZE) != VREF_TERMINATE + 1) {
			if (vp->v_state == VS_INACTIVE) {
				if (vp->v_state == VS_INACTIVE) {
					TAILQ_REMOVE(&vi->inactive_list,
						     vp, v_list);
					TAILQ_INSERT_TAIL(&vi->inactive_list,
							  vp, v_list);
				}
			}
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * VINACTIVE and VREF_TERMINATE are expected to both be set
		 * for vnodes pulled from the inactive list, and cannot be
		 * changed while we hold the vx lock.
		 *
		 * Try to reclaim the vnode.
		 */
		KKASSERT(vp->v_flag & VINACTIVE);
		KKASSERT(vp->v_refcnt & VREF_TERMINATE);

		if ((vp->v_flag & VRECLAIMED) == 0) {
			spin_unlock(&vi->spin);
			vx_put(vp);
			goto slower;
		}

		/*
		 * At this point if there are no other refs or auxrefs on
		 * the vnode with the inactive list locked, and we remove
		 * the vnode from the inactive list, it should not be
		 * possible for anyone else to access the vnode any more.
		 *
		 * Since the vnode is in a VRECLAIMED state, no new
		 * namecache associations could have been made and the
		 * vnode should have already been removed from its mountlist.
		 *
		 * Since we hold a VX lock on the vnode it cannot have been
		 * reactivated (moved out of the inactive list).
		 */
		KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
		KKASSERT(vp->v_state == VS_INACTIVE);
		TAILQ_REMOVE(&vi->inactive_list, vp, v_list);
		atomic_add_int(&mycpu->gd_inactivevnodes, -1);
		vp->v_state = VS_DYING;
		spin_unlock(&vi->spin);

		/*
		 * Nothing should have been able to access this vp.  Only
		 * our ref should remain now.
		 *
		 * At this point we can kfree() the vnode if we want to.
		 * Instead, we reuse it for the allocation.
		 */
		atomic_clear_int(&vp->v_refcnt, VREF_TERMINATE|VREF_FINALIZE);
		KASSERT(vp->v_refcnt == 1,
			("vp %p badrefs %08x", vp, vp->v_refcnt));
		bzero(vp, sizeof(*vp));
	} else {
		spin_unlock(&vi->spin);
slower:
		vp = kmalloc(sizeof(*vp), M_VNODE, M_ZERO | M_WAITOK);
		atomic_add_int(&numvnodes, 1);
	}

	lwkt_token_init(&vp->v_token, "vnode");
	lockinit(&vp->v_lock, "vnode", lktimeout, lkflags);
	TAILQ_INIT(&vp->v_namecache);
	RB_INIT(&vp->v_rbclean_tree);
	RB_INIT(&vp->v_rbdirty_tree);
	RB_INIT(&vp->v_rbhash_tree);
	spin_init(&vp->v_spin, "allocvnode");

	lockmgr(&vp->v_lock, LK_EXCLUSIVE);
	vp->v_refcnt = 1;
	vp->v_flag = VAGE0 | VAGE1;
	vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;

	KKASSERT(TAILQ_EMPTY(&vp->v_namecache));
	/* exclusive lock still held */

	vp->v_filesize = NOOFFSET;
	vp->v_type = VNON;
	vp->v_tag = 0;
	vp->v_state = VS_CACHED;
	_vactivate(vp);

	return (vp);
}
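
/*
 * Editor's illustrative sketch (not part of the original file): a
 * filesystem typically calls allocvnode(), performs its own setup, and on
 * failure disposes of the still VX locked, still vref'd vnode with
 * vx_put() (v_type is still VNON, so vx_put() flags it for finalization).
 * The helper name, parameters and error condition are hypothetical; the
 * block is disabled with #if 0.
 */
#if 0
static int
example_fs_newvnode(struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	vp = allocvnode(0, 0);		/* returned VX locked + vref'd */
	error = 0;			/* ... filesystem-specific setup ... */
	if (error) {
		vx_put(vp);		/* discard the unwanted new vnode */
		return (error);
	}
	*vpp = vp;			/* still VX locked for the caller */
	return (0);
}
#endif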

/*
 * Called after a process has allocated a vnode via allocvnode()
 * and we detected that too many vnodes were present.
 *
 * This function is called just prior to a return to userland if the
 * process at some point had to allocate a new vnode during the last
 * system call and the vnode count was found to be excessive.
 *
 * This is a synchronous path that we do not normally want to execute.
 *
 * Flagged at >= 11/10's, runs if >= 10/10, vnlru runs at 9/10.
 *
 * WARNING: Sometimes numvnodes can blow out due to children being
 *	    present under directory vnodes in the namecache.  For the
 *	    moment use an if() instead of a while() and note that if
 *	    we were to use a while() we would still have to break out
 *	    if freesomevnodes() returned 0.  vnlru will also be trying
 *	    hard to free vnodes at the same time (with a lower trigger
 *	    pointer).
 */
void
allocvnode_gc(void)
{
	if (numvnodes >= maxvnodes &&
	    countcachedandinactivevnodes() >= maxvnodes * 5 / 10) {
		freesomevnodes(batchfreevnodes);
	}
}

int
freesomevnodes(int n)
{
	struct vnode *vp;
	int count = 0;

	while (n) {
		if ((vp = cleanfreevnode(n)) == NULL)
			break;
		vx_unlock(vp);
		--n;
		++count;
		kfree(vp, M_VNODE);
		atomic_add_int(&numvnodes, -1);
	}
	return(count);
}
1159