xref: /freebsd/sys/kern/vfs_cache.c (revision e17f5b1d)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993, 1995
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_ddb.h"
41 #include "opt_ktrace.h"
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/capsicum.h>
46 #include <sys/counter.h>
47 #include <sys/filedesc.h>
48 #include <sys/fnv_hash.h>
49 #include <sys/kernel.h>
50 #include <sys/ktr.h>
51 #include <sys/lock.h>
52 #include <sys/malloc.h>
53 #include <sys/fcntl.h>
54 #include <sys/mount.h>
55 #include <sys/namei.h>
56 #include <sys/proc.h>
57 #include <sys/rwlock.h>
58 #include <sys/sdt.h>
59 #include <sys/smr.h>
60 #include <sys/smp.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysproto.h>
64 #include <sys/vnode.h>
65 #include <ck_queue.h>
66 #ifdef KTRACE
67 #include <sys/ktrace.h>
68 #endif
69 
70 #ifdef DDB
71 #include <ddb/ddb.h>
72 #endif
73 
74 #include <vm/uma.h>
75 
76 SDT_PROVIDER_DECLARE(vfs);
77 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
78     "struct vnode *");
79 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
80     "char *");
81 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
82 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
83     "char *", "struct vnode *");
84 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
85 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
86     "struct vnode *", "char *");
87 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
88     "struct vnode *");
89 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
90     "struct vnode *", "char *");
91 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
92     "char *");
93 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
94 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
95 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
96 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
97     "struct vnode *");
98 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
99     "char *");
100 SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
101     "char *");
102 
103 /*
104  * The structures below describe the elements in the cache of recent
105  * names looked up by namei; struct negstate holds negative entry state.
106  */
107 struct negstate {
108 	u_char neg_flag;
109 };
110 _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *),
111     "the state must fit in a union with a pointer without growing it");
112 
113 struct	namecache {
114 	CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */
115 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
116 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
117 	struct	vnode *nc_dvp;		/* vnode of parent of name */
118 	union {
119 		struct	vnode *nu_vp;	/* vnode the name refers to */
120 		struct	negstate nu_neg;/* negative entry state */
121 	} n_un;
122 	u_char	nc_flag;		/* flag bits */
123 	u_char	nc_nlen;		/* length of name */
124 	char	nc_name[0];		/* segment name + nul */
125 };
126 
127 /*
128  * struct namecache_ts is used in place of struct namecache when time(s)
129  * need to be stored.  It carries the timestamps and embeds a regular
130  * struct namecache as its last member (nc_nc), which __containerof() maps
131  * back to the enclosing structure.  The nc_dotdottime field is used when
132  * a cache entry maps both a non-dotdot directory name and dotdot for the
133  * directory's parent.
134  */
135 struct	namecache_ts {
136 	struct	timespec nc_time;	/* timespec provided by fs */
137 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
138 	int	nc_ticks;		/* ticks value when entry was added */
139 	struct namecache nc_nc;
140 };
141 
142 #define	nc_vp		n_un.nu_vp
143 #define	nc_neg		n_un.nu_neg
144 
145 /*
146  * Flags in namecache.nc_flag
147  */
148 #define NCF_WHITE	0x01
149 #define NCF_ISDOTDOT	0x02
150 #define	NCF_TS		0x04
151 #define	NCF_DTS		0x08
152 #define	NCF_DVDROP	0x10
153 #define	NCF_NEGATIVE	0x20
154 #define	NCF_INVALID	0x40
155 
156 /*
157  * Flags in negstate.neg_flag
158  */
159 #define NEG_HOT		0x01
160 
161 /*
162  * Mark an entry as invalid.
163  *
164  * This is called before the entry starts getting deconstructed.
165  */
166 static void
167 cache_ncp_invalidate(struct namecache *ncp)
168 {
169 
170 	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
171 	    ("%s: entry %p already invalid", __func__, ncp));
172 	ncp->nc_flag |= NCF_INVALID;
173 	atomic_thread_fence_rel();
174 }
175 
176 /*
177  * Verify validity of an entry.
178  *
179  * All places which elide locks are supposed to call this after they are
180  * done with reading from an entry.
181  */
182 static bool
183 cache_ncp_invalid(struct namecache *ncp)
184 {
185 
186 	atomic_thread_fence_acq();
187 	return ((ncp->nc_flag & NCF_INVALID) != 0);
188 }
189 
190 /*
191  * Name caching works as follows:
192  *
193  * Names found by directory scans are retained in a cache
194  * for future reference.  It is managed LRU, so frequently
195  * used names will hang around.  Cache is indexed by hash value
196  * obtained from (dvp, name) where dvp refers to the directory
197  * containing name.
198  *
199  * If it is a "negative" entry (i.e. for a name that is known NOT to
200  * exist), the vnode pointer will be NULL.
201  *
202  * Upon reaching the last segment of a path, if the reference
203  * is for DELETE, or NOCACHE is set (rewrite), and the
204  * name is located in the cache, it will be dropped.
205  *
206  * These locks are used (in the order in which they can be taken):
207  * NAME		TYPE	ROLE
208  * vnodelock	mtx	vnode lists and v_cache_dd field protection
209  * bucketlock	rwlock	for access to given set of hash buckets
210  * neglist	mtx	negative entry LRU management
211  *
212  * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
213  * shrinking the LRU list.
214  *
215  * It is legal to take multiple vnodelock and bucketlock locks. The locking
216  * order is lower address first. Both are recursive.
217  *
218  * "." lookups are lockless.
219  *
220  * ".." and vnode -> name lookups require vnodelock.
221  *
222  * name -> vnode lookup requires the relevant bucketlock to be held for reading.
223  *
224  * Insertions and removals of entries require involved vnodes and bucketlocks
225  * to be write-locked to prevent other threads from seeing the entry.
226  *
227  * Some lookups result in removal of the found entry (e.g. getting rid of a
228  * negative entry with the intent to create a positive one), which poses a
229  * problem when multiple threads reach the same state. Similarly, two different
230  * threads can purge two different vnodes and try to remove the same name.
231  *
232  * If the already held vnode lock is lower than the second required lock, we
233  * can just take the other lock. However, in the opposite case, this could
234  * deadlock. This is resolved by trylocking; if that fails, the first lock is
235  * dropped, everything is locked in order and the state is revalidated.
236  */
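/*
 * Illustrative sketch (editor's note, not part of the upstream code): the
 * deadlock avoidance described above reduces to a pattern along these lines,
 * where vlp_held is the lock already owned and vlp_other is the second one
 * required (hypothetical names):
 *
 *	if (vlp_other > vlp_held) {
 *		mtx_lock(vlp_other);		// correct order, safe to block
 *	} else if (!mtx_trylock(vlp_other)) {
 *		mtx_unlock(vlp_held);		// back off,
 *		mtx_lock(vlp_other);		// retake both in address order,
 *		mtx_lock(vlp_held);
 *		// and revalidate: the entry may be gone or replaced by now
 *	}
 */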
237 
238 VFS_SMR_DECLARE;
239 
240 /*
241  * Structures associated with name caching.
242  */
243 #define NCHHASH(hash) \
244 	(&nchashtbl[(hash) & nchash])
245 static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
246 static u_long __read_mostly	nchash;			/* size of hash table */
247 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
248     "Size of namecache hash table");
249 static u_long __read_mostly	ncnegfactor = 5; /* ratio of negative entries */
250 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
251     "Ratio of negative namecache entries");
252 static u_long __exclusive_cache_line	numneg;	/* number of negative entries allocated */
253 static u_long __exclusive_cache_line	numcache;/* number of cache entries allocated */
254 u_int ncsizefactor = 2;
255 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
256     "Size factor for namecache");
257 static u_int __read_mostly	ncpurgeminvnodes;
258 SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0,
259     "Number of vnodes below which purgevfs ignores the request");
260 static u_int __read_mostly	ncsize; /* the size as computed on creation or resizing */
261 
262 struct nchstats	nchstats;		/* cache effectiveness statistics */
263 
264 static struct mtx __exclusive_cache_line	ncneg_shrink_lock;
265 
266 struct neglist {
267 	struct mtx		nl_lock;
268 	TAILQ_HEAD(, namecache) nl_list;
269 } __aligned(CACHE_LINE_SIZE);
270 
271 static struct neglist __read_mostly	*neglists;
272 static struct neglist ncneg_hot;
273 static u_long numhotneg;
274 
275 #define	numneglists (ncneghash + 1)
276 static u_int __read_mostly	ncneghash;
277 static inline struct neglist *
278 NCP2NEGLIST(struct namecache *ncp)
279 {
280 
281 	return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]);
282 }
283 
284 static inline struct negstate *
285 NCP2NEGSTATE(struct namecache *ncp)
286 {
287 
288 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
289 	return (&ncp->nc_neg);
290 }
291 
292 #define	numbucketlocks (ncbuckethash + 1)
293 static u_int __read_mostly  ncbuckethash;
294 static struct rwlock_padalign __read_mostly  *bucketlocks;
295 #define	HASH2BUCKETLOCK(hash) \
296 	((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)]))
297 
298 #define	numvnodelocks (ncvnodehash + 1)
299 static u_int __read_mostly  ncvnodehash;
300 static struct mtx __read_mostly *vnodelocks;
301 static inline struct mtx *
302 VP2VNODELOCK(struct vnode *vp)
303 {
304 
305 	return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]);
306 }
307 
308 /*
309  * UMA zones for the VFS cache.
310  *
311  * The small cache is used for entries with short names, which are the
312  * most common.  The large cache is used for entries which are too big to
313  * fit in the small cache.
314  */
315 static uma_zone_t __read_mostly cache_zone_small;
316 static uma_zone_t __read_mostly cache_zone_small_ts;
317 static uma_zone_t __read_mostly cache_zone_large;
318 static uma_zone_t __read_mostly cache_zone_large_ts;
319 
320 #define	CACHE_PATH_CUTOFF	35
321 
322 static struct namecache *
323 cache_alloc(int len, int ts)
324 {
325 	struct namecache_ts *ncp_ts;
326 	struct namecache *ncp;
327 
328 	if (__predict_false(ts)) {
329 		if (len <= CACHE_PATH_CUTOFF)
330 			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
331 		else
332 			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
333 		ncp = &ncp_ts->nc_nc;
334 	} else {
335 		if (len <= CACHE_PATH_CUTOFF)
336 			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
337 		else
338 			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
339 	}
340 	return (ncp);
341 }
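/*
 * Illustrative usage (editor's note): callers size the entry by name length
 * and request the timestamped variant only when a timespec is to be stored,
 * mirroring cache_enter_time() below:
 *
 *	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
 *	ncp->nc_nlen = cnp->cn_namelen;
 *	strlcpy(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen + 1);
 */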
342 
343 static void
344 cache_free(struct namecache *ncp)
345 {
346 	struct namecache_ts *ncp_ts;
347 
348 	if (ncp == NULL)
349 		return;
350 	if ((ncp->nc_flag & NCF_DVDROP) != 0)
351 		vdrop(ncp->nc_dvp);
352 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
353 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
354 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
355 			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
356 		else
357 			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
358 	} else {
359 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
360 			uma_zfree_smr(cache_zone_small, ncp);
361 		else
362 			uma_zfree_smr(cache_zone_large, ncp);
363 	}
364 }
365 
366 static void
367 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
368 {
369 	struct namecache_ts *ncp_ts;
370 
371 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
372 	    (tsp == NULL && ticksp == NULL),
373 	    ("No NCF_TS"));
374 
375 	if (tsp == NULL && ticksp == NULL)
376 		return;
377 
378 	ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
379 	if (tsp != NULL)
380 		*tsp = ncp_ts->nc_time;
381 	if (ticksp != NULL)
382 		*ticksp = ncp_ts->nc_ticks;
383 }
384 
385 #ifdef DEBUG_CACHE
386 static int __read_mostly	doingcache = 1;	/* 1 => enable the cache */
387 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
388     "VFS namecache enabled");
389 #endif
390 
391 /* Export size information to userland */
392 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
393     sizeof(struct namecache), "sizeof(struct namecache)");
394 
395 /*
396  * The new name cache statistics
397  */
398 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
399     "Name cache statistics");
400 #define STATNODE_ULONG(name, descr)					\
401 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
402 #define STATNODE_COUNTER(name, descr)					\
403 	static COUNTER_U64_DEFINE_EARLY(name);				\
404 	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \
405 	    descr);
406 STATNODE_ULONG(numneg, "Number of negative cache entries");
407 STATNODE_ULONG(numcache, "Number of cache entries");
408 STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held");
409 STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit");
410 STATNODE_COUNTER(dothits, "Number of '.' hits");
411 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
412 STATNODE_COUNTER(numchecks, "Number of checks in lookup");
413 STATNODE_COUNTER(nummiss, "Number of cache misses");
414 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
415 STATNODE_COUNTER(numposzaps,
416     "Number of cache hits (positive) we do not want to cache");
417 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
418 STATNODE_COUNTER(numnegzaps,
419     "Number of cache hits (negative) we do not want to cache");
420 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
421 /* These count for vn_getcwd(), too. */
422 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
423 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
424 STATNODE_COUNTER(numfullpathfail2,
425     "Number of fullpath search errors (VOP_VPTOCNP failures)");
426 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
427 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
428 STATNODE_COUNTER(zap_and_exit_bucket_relock_success,
429     "Number of successful removals after relocking");
430 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
431     "Number of times zap_and_exit failed to lock");
432 static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2,
433     "Number of times zap_and_exit failed to lock");
434 static long cache_lock_vnodes_cel_3_failures;
435 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
436     "Number of times 3-way vnode locking failed");
437 STATNODE_ULONG(numhotneg, "Number of hot negative entries");
438 STATNODE_COUNTER(numneg_evicted,
439     "Number of negative entries evicted when adding a new entry");
440 STATNODE_COUNTER(shrinking_skipped,
441     "Number of times shrinking was already in progress");
442 
443 static void cache_zap_locked(struct namecache *ncp);
444 static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
445     char **freebuf, size_t *buflen);
446 static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
447     char *buf, char **retbuf, size_t *buflen);
448 static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
449     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend);
450 
451 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
452 
453 static int cache_yield;
454 SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
455     "Number of times cache called yield");
456 
457 static void __noinline
458 cache_maybe_yield(void)
459 {
460 
461 	if (should_yield()) {
462 		cache_yield++;
463 		kern_yield(PRI_USER);
464 	}
465 }
466 
467 static inline void
468 cache_assert_vlp_locked(struct mtx *vlp)
469 {
470 
471 	if (vlp != NULL)
472 		mtx_assert(vlp, MA_OWNED);
473 }
474 
475 static inline void
476 cache_assert_vnode_locked(struct vnode *vp)
477 {
478 	struct mtx *vlp;
479 
480 	vlp = VP2VNODELOCK(vp);
481 	cache_assert_vlp_locked(vlp);
482 }
483 
484 static uint32_t
485 cache_get_hash(char *name, u_char len, struct vnode *dvp)
486 {
487 	uint32_t hash;
488 
489 	hash = fnv_32_buf(name, len, FNV1_32_INIT);
490 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
491 	return (hash);
492 }
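/*
 * Illustrative usage (editor's note): the same (name, dvp) hash picks both
 * the hash chain and the bucket lock, so lookup and removal agree on which
 * rwlock covers a given entry, as in cache_lookup_nomakeentry() below:
 *
 *	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 *	blp = HASH2BUCKETLOCK(hash);
 *	...
 *	CK_LIST_FOREACH(ncp, NCHHASH(hash), nc_hash)
 */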
493 
494 static inline struct rwlock *
495 NCP2BUCKETLOCK(struct namecache *ncp)
496 {
497 	uint32_t hash;
498 
499 	hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp);
500 	return (HASH2BUCKETLOCK(hash));
501 }
502 
503 #ifdef INVARIANTS
504 static void
505 cache_assert_bucket_locked(struct namecache *ncp, int mode)
506 {
507 	struct rwlock *blp;
508 
509 	blp = NCP2BUCKETLOCK(ncp);
510 	rw_assert(blp, mode);
511 }
512 #else
513 #define cache_assert_bucket_locked(x, y) do { } while (0)
514 #endif
515 
516 #define cache_sort_vnodes(x, y)	_cache_sort_vnodes((void **)(x), (void **)(y))
517 static void
518 _cache_sort_vnodes(void **p1, void **p2)
519 {
520 	void *tmp;
521 
522 	MPASS(*p1 != NULL || *p2 != NULL);
523 
524 	if (*p1 > *p2) {
525 		tmp = *p2;
526 		*p2 = *p1;
527 		*p1 = tmp;
528 	}
529 }
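/*
 * Illustrative usage (editor's note): sorting enforces the lower-address-first
 * lock order described earlier before both vnode locks are taken, e.g. as in
 * cache_zap_unlocked_bucket() below:
 *
 *	cache_sort_vnodes(&dvlp, &vlp);
 *	cache_lock_vnodes(dvlp, vlp);
 */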
530 
531 static void
532 cache_lock_all_buckets(void)
533 {
534 	u_int i;
535 
536 	for (i = 0; i < numbucketlocks; i++)
537 		rw_wlock(&bucketlocks[i]);
538 }
539 
540 static void
541 cache_unlock_all_buckets(void)
542 {
543 	u_int i;
544 
545 	for (i = 0; i < numbucketlocks; i++)
546 		rw_wunlock(&bucketlocks[i]);
547 }
548 
549 static void
550 cache_lock_all_vnodes(void)
551 {
552 	u_int i;
553 
554 	for (i = 0; i < numvnodelocks; i++)
555 		mtx_lock(&vnodelocks[i]);
556 }
557 
558 static void
559 cache_unlock_all_vnodes(void)
560 {
561 	u_int i;
562 
563 	for (i = 0; i < numvnodelocks; i++)
564 		mtx_unlock(&vnodelocks[i]);
565 }
566 
567 static int
568 cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
569 {
570 
571 	cache_sort_vnodes(&vlp1, &vlp2);
572 
573 	if (vlp1 != NULL) {
574 		if (!mtx_trylock(vlp1))
575 			return (EAGAIN);
576 	}
577 	if (!mtx_trylock(vlp2)) {
578 		if (vlp1 != NULL)
579 			mtx_unlock(vlp1);
580 		return (EAGAIN);
581 	}
582 
583 	return (0);
584 }
585 
586 static void
587 cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
588 {
589 
590 	MPASS(vlp1 != NULL || vlp2 != NULL);
591 	MPASS(vlp1 <= vlp2);
592 
593 	if (vlp1 != NULL)
594 		mtx_lock(vlp1);
595 	if (vlp2 != NULL)
596 		mtx_lock(vlp2);
597 }
598 
599 static void
600 cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
601 {
602 
603 	MPASS(vlp1 != NULL || vlp2 != NULL);
604 
605 	if (vlp1 != NULL)
606 		mtx_unlock(vlp1);
607 	if (vlp2 != NULL)
608 		mtx_unlock(vlp2);
609 }
610 
611 static int
612 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
613 {
614 	struct nchstats snap;
615 
616 	if (req->oldptr == NULL)
617 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
618 
619 	snap = nchstats;
620 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
621 	snap.ncs_neghits = counter_u64_fetch(numneghits);
622 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
623 	    counter_u64_fetch(numnegzaps);
624 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
625 	    counter_u64_fetch(nummiss);
626 
627 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
628 }
629 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
630     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
631     "VFS cache effectiveness statistics");
632 
633 #ifdef DIAGNOSTIC
634 /*
635  * Grab an atomic snapshot of the name cache hash chain lengths
636  */
637 static SYSCTL_NODE(_debug, OID_AUTO, hashstat,
638     CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
639     "hash table stats");
640 
641 static int
642 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
643 {
644 	struct nchashhead *ncpp;
645 	struct namecache *ncp;
646 	int i, error, n_nchash, *cntbuf;
647 
648 retry:
649 	n_nchash = nchash + 1;	/* nchash is max index, not count */
650 	if (req->oldptr == NULL)
651 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
652 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
653 	cache_lock_all_buckets();
654 	if (n_nchash != nchash + 1) {
655 		cache_unlock_all_buckets();
656 		free(cntbuf, M_TEMP);
657 		goto retry;
658 	}
659 	/* Scan hash tables counting entries */
660 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
661 		CK_LIST_FOREACH(ncp, ncpp, nc_hash)
662 			cntbuf[i]++;
663 	cache_unlock_all_buckets();
664 	for (error = 0, i = 0; i < n_nchash; i++)
665 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
666 			break;
667 	free(cntbuf, M_TEMP);
668 	return (error);
669 }
670 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
671     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
672     "nchash chain lengths");
673 
674 static int
675 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
676 {
677 	int error;
678 	struct nchashhead *ncpp;
679 	struct namecache *ncp;
680 	int n_nchash;
681 	int count, maxlength, used, pct;
682 
683 	if (!req->oldptr)
684 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
685 
686 	cache_lock_all_buckets();
687 	n_nchash = nchash + 1;	/* nchash is max index, not count */
688 	used = 0;
689 	maxlength = 0;
690 
691 	/* Scan hash tables for applicable entries */
692 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
693 		count = 0;
694 		CK_LIST_FOREACH(ncp, ncpp, nc_hash) {
695 			count++;
696 		}
697 		if (count)
698 			used++;
699 		if (maxlength < count)
700 			maxlength = count;
701 	}
702 	n_nchash = nchash + 1;
703 	cache_unlock_all_buckets();
704 	pct = (used * 100) / (n_nchash / 100);
705 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
706 	if (error)
707 		return (error);
708 	error = SYSCTL_OUT(req, &used, sizeof(used));
709 	if (error)
710 		return (error);
711 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
712 	if (error)
713 		return (error);
714 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
715 	if (error)
716 		return (error);
717 	return (0);
718 }
719 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
720     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
721     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
722 #endif
723 
724 /*
725  * Negative entries management
726  *
727  * A variation of LRU scheme is used. New entries are hashed into one of
728  * numneglists cold lists. Entries get promoted to the hot list on first hit.
729  *
730  * The shrinker will demote hot list head and evict from the cold list in a
731  * round-robin manner.
732  */
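/*
 * Editor's summary of the resulting entry lifecycle (see the functions below):
 *
 *	cache_negative_insert()		new entry appended to the tail of one
 *					of the numneglists cold lists
 *	cache_negative_hit()		first hit promotes it to ncneg_hot and
 *					sets NEG_HOT
 *	cache_negative_zap_one()	demotes the hot list head back to its
 *					cold list and evicts a cold list head
 */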
733 static void
734 cache_negative_init(struct namecache *ncp)
735 {
736 	struct negstate *negstate;
737 
738 	ncp->nc_flag |= NCF_NEGATIVE;
739 	negstate = NCP2NEGSTATE(ncp);
740 	negstate->neg_flag = 0;
741 }
742 
743 static void
744 cache_negative_hit(struct namecache *ncp)
745 {
746 	struct neglist *neglist;
747 	struct negstate *negstate;
748 
749 	negstate = NCP2NEGSTATE(ncp);
750 	if ((negstate->neg_flag & NEG_HOT) != 0)
751 		return;
752 	neglist = NCP2NEGLIST(ncp);
753 	mtx_lock(&ncneg_hot.nl_lock);
754 	mtx_lock(&neglist->nl_lock);
755 	if ((negstate->neg_flag & NEG_HOT) == 0) {
756 		numhotneg++;
757 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
758 		TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst);
759 		negstate->neg_flag |= NEG_HOT;
760 	}
761 	mtx_unlock(&neglist->nl_lock);
762 	mtx_unlock(&ncneg_hot.nl_lock);
763 }
764 
765 static void
766 cache_negative_insert(struct namecache *ncp)
767 {
768 	struct neglist *neglist;
769 
770 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
771 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
772 	neglist = NCP2NEGLIST(ncp);
773 	mtx_lock(&neglist->nl_lock);
774 	TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
775 	mtx_unlock(&neglist->nl_lock);
776 	atomic_add_rel_long(&numneg, 1);
777 }
778 
779 static void
780 cache_negative_remove(struct namecache *ncp)
781 {
782 	struct neglist *neglist;
783 	struct negstate *negstate;
784 	bool hot_locked = false;
785 	bool list_locked = false;
786 
787 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
788 	neglist = NCP2NEGLIST(ncp);
789 	negstate = NCP2NEGSTATE(ncp);
790 	if ((negstate->neg_flag & NEG_HOT) != 0) {
791 		hot_locked = true;
792 		mtx_lock(&ncneg_hot.nl_lock);
793 		if ((negstate->neg_flag & NEG_HOT) == 0) {
794 			list_locked = true;
795 			mtx_lock(&neglist->nl_lock);
796 		}
797 	} else {
798 		list_locked = true;
799 		mtx_lock(&neglist->nl_lock);
800 	}
801 	if ((negstate->neg_flag & NEG_HOT) != 0) {
802 		mtx_assert(&ncneg_hot.nl_lock, MA_OWNED);
803 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
804 		numhotneg--;
805 	} else {
806 		mtx_assert(&neglist->nl_lock, MA_OWNED);
807 		TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst);
808 	}
809 	if (list_locked)
810 		mtx_unlock(&neglist->nl_lock);
811 	if (hot_locked)
812 		mtx_unlock(&ncneg_hot.nl_lock);
813 	atomic_subtract_rel_long(&numneg, 1);
814 }
815 
816 static void
817 cache_negative_shrink_select(struct namecache **ncpp,
818     struct neglist **neglistpp)
819 {
820 	struct neglist *neglist;
821 	struct namecache *ncp;
822 	static u_int cycle;
823 	u_int i;
824 
825 	*ncpp = ncp = NULL;
826 
827 	for (i = 0; i < numneglists; i++) {
828 		neglist = &neglists[(cycle + i) % numneglists];
829 		if (TAILQ_FIRST(&neglist->nl_list) == NULL)
830 			continue;
831 		mtx_lock(&neglist->nl_lock);
832 		ncp = TAILQ_FIRST(&neglist->nl_list);
833 		if (ncp != NULL)
834 			break;
835 		mtx_unlock(&neglist->nl_lock);
836 	}
837 
838 	*neglistpp = neglist;
839 	*ncpp = ncp;
840 	cycle++;
841 }
842 
843 static void
844 cache_negative_zap_one(void)
845 {
846 	struct namecache *ncp, *ncp2;
847 	struct neglist *neglist;
848 	struct negstate *negstate;
849 	struct mtx *dvlp;
850 	struct rwlock *blp;
851 
852 	if (mtx_owner(&ncneg_shrink_lock) != NULL ||
853 	    !mtx_trylock(&ncneg_shrink_lock)) {
854 		counter_u64_add(shrinking_skipped, 1);
855 		return;
856 	}
857 
858 	mtx_lock(&ncneg_hot.nl_lock);
859 	ncp = TAILQ_FIRST(&ncneg_hot.nl_list);
860 	if (ncp != NULL) {
861 		neglist = NCP2NEGLIST(ncp);
862 		negstate = NCP2NEGSTATE(ncp);
863 		mtx_lock(&neglist->nl_lock);
864 		MPASS((negstate->neg_flag & NEG_HOT) != 0);
865 		TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst);
866 		TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst);
867 		negstate->neg_flag &= ~NEG_HOT;
868 		numhotneg--;
869 		mtx_unlock(&neglist->nl_lock);
870 	}
871 	mtx_unlock(&ncneg_hot.nl_lock);
872 
873 	cache_negative_shrink_select(&ncp, &neglist);
874 
875 	mtx_unlock(&ncneg_shrink_lock);
876 	if (ncp == NULL)
877 		return;
878 
879 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
880 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
881 	blp = NCP2BUCKETLOCK(ncp);
882 	mtx_unlock(&neglist->nl_lock);
883 	mtx_lock(dvlp);
884 	rw_wlock(blp);
885 	/*
886 	 * Enter SMR to safely check the negative list.
887 	 * Even if the found pointer matches, the entry may now be reallocated
888 	 * and used by a different vnode.
889 	 */
890 	vfs_smr_enter();
891 	ncp2 = TAILQ_FIRST(&neglist->nl_list);
892 	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
893 	    blp != NCP2BUCKETLOCK(ncp2)) {
894 		vfs_smr_exit();
895 		ncp = NULL;
896 	} else {
897 		vfs_smr_exit();
898 		SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
899 		    ncp->nc_name);
900 		cache_zap_locked(ncp);
901 		counter_u64_add(numneg_evicted, 1);
902 	}
903 	rw_wunlock(blp);
904 	mtx_unlock(dvlp);
905 	cache_free(ncp);
906 }
907 
908 /*
909  * cache_zap_locked():
910  *
911  *   Removes a namecache entry from cache, whether it contains an actual
912  *   pointer to a vnode or if it is just a negative cache entry.
913  */
914 static void
915 cache_zap_locked(struct namecache *ncp)
916 {
917 
918 	if (!(ncp->nc_flag & NCF_NEGATIVE))
919 		cache_assert_vnode_locked(ncp->nc_vp);
920 	cache_assert_vnode_locked(ncp->nc_dvp);
921 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
922 
923 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
924 	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
925 
926 	cache_ncp_invalidate(ncp);
927 
928 	CK_LIST_REMOVE(ncp, nc_hash);
929 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
930 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
931 		    ncp->nc_name, ncp->nc_vp);
932 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
933 		if (ncp == ncp->nc_vp->v_cache_dd)
934 			ncp->nc_vp->v_cache_dd = NULL;
935 	} else {
936 		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
937 		    ncp->nc_name);
938 		cache_negative_remove(ncp);
939 	}
940 	if (ncp->nc_flag & NCF_ISDOTDOT) {
941 		if (ncp == ncp->nc_dvp->v_cache_dd)
942 			ncp->nc_dvp->v_cache_dd = NULL;
943 	} else {
944 		LIST_REMOVE(ncp, nc_src);
945 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
946 			ncp->nc_flag |= NCF_DVDROP;
947 			counter_u64_add(numcachehv, -1);
948 		}
949 	}
950 	atomic_subtract_rel_long(&numcache, 1);
951 }
952 
953 static void
954 cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
955 {
956 	struct rwlock *blp;
957 
958 	MPASS(ncp->nc_dvp == vp);
959 	MPASS(ncp->nc_flag & NCF_NEGATIVE);
960 	cache_assert_vnode_locked(vp);
961 
962 	blp = NCP2BUCKETLOCK(ncp);
963 	rw_wlock(blp);
964 	cache_zap_locked(ncp);
965 	rw_wunlock(blp);
966 }
967 
968 static bool
969 cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
970     struct mtx **vlpp)
971 {
972 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
973 	struct rwlock *blp;
974 
975 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
976 	cache_assert_vnode_locked(vp);
977 
978 	if (ncp->nc_flag & NCF_NEGATIVE) {
979 		if (*vlpp != NULL) {
980 			mtx_unlock(*vlpp);
981 			*vlpp = NULL;
982 		}
983 		cache_zap_negative_locked_vnode_kl(ncp, vp);
984 		return (true);
985 	}
986 
987 	pvlp = VP2VNODELOCK(vp);
988 	blp = NCP2BUCKETLOCK(ncp);
989 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
990 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
991 
992 	if (*vlpp == vlp1 || *vlpp == vlp2) {
993 		to_unlock = *vlpp;
994 		*vlpp = NULL;
995 	} else {
996 		if (*vlpp != NULL) {
997 			mtx_unlock(*vlpp);
998 			*vlpp = NULL;
999 		}
1000 		cache_sort_vnodes(&vlp1, &vlp2);
1001 		if (vlp1 == pvlp) {
1002 			mtx_lock(vlp2);
1003 			to_unlock = vlp2;
1004 		} else {
1005 			if (!mtx_trylock(vlp1))
1006 				goto out_relock;
1007 			to_unlock = vlp1;
1008 		}
1009 	}
1010 	rw_wlock(blp);
1011 	cache_zap_locked(ncp);
1012 	rw_wunlock(blp);
1013 	if (to_unlock != NULL)
1014 		mtx_unlock(to_unlock);
1015 	return (true);
1016 
1017 out_relock:
1018 	mtx_unlock(vlp2);
1019 	mtx_lock(vlp1);
1020 	mtx_lock(vlp2);
1021 	MPASS(*vlpp == NULL);
1022 	*vlpp = vlp1;
1023 	return (false);
1024 }
1025 
1026 static int __noinline
1027 cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
1028 {
1029 	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
1030 	struct rwlock *blp;
1031 	int error = 0;
1032 
1033 	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
1034 	cache_assert_vnode_locked(vp);
1035 
1036 	pvlp = VP2VNODELOCK(vp);
1037 	if (ncp->nc_flag & NCF_NEGATIVE) {
1038 		cache_zap_negative_locked_vnode_kl(ncp, vp);
1039 		goto out;
1040 	}
1041 
1042 	blp = NCP2BUCKETLOCK(ncp);
1043 	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
1044 	vlp2 = VP2VNODELOCK(ncp->nc_vp);
1045 	cache_sort_vnodes(&vlp1, &vlp2);
1046 	if (vlp1 == pvlp) {
1047 		mtx_lock(vlp2);
1048 		to_unlock = vlp2;
1049 	} else {
1050 		if (!mtx_trylock(vlp1)) {
1051 			error = EAGAIN;
1052 			goto out;
1053 		}
1054 		to_unlock = vlp1;
1055 	}
1056 	rw_wlock(blp);
1057 	cache_zap_locked(ncp);
1058 	rw_wunlock(blp);
1059 	mtx_unlock(to_unlock);
1060 out:
1061 	mtx_unlock(pvlp);
1062 	return (error);
1063 }
1064 
1065 /*
1066  * If trylocking failed we can get here. We know enough to take all needed locks
1067  * in the right order and re-lookup the entry.
1068  */
1069 static int
1070 cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1071     struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash,
1072     struct rwlock *blp)
1073 {
1074 	struct namecache *rncp;
1075 
1076 	cache_assert_bucket_locked(ncp, RA_UNLOCKED);
1077 
1078 	cache_sort_vnodes(&dvlp, &vlp);
1079 	cache_lock_vnodes(dvlp, vlp);
1080 	rw_wlock(blp);
1081 	CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
1082 		if (rncp == ncp && rncp->nc_dvp == dvp &&
1083 		    rncp->nc_nlen == cnp->cn_namelen &&
1084 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
1085 			break;
1086 	}
1087 	if (rncp != NULL) {
1088 		cache_zap_locked(rncp);
1089 		rw_wunlock(blp);
1090 		cache_unlock_vnodes(dvlp, vlp);
1091 		counter_u64_add(zap_and_exit_bucket_relock_success, 1);
1092 		return (0);
1093 	}
1094 
1095 	rw_wunlock(blp);
1096 	cache_unlock_vnodes(dvlp, vlp);
1097 	return (EAGAIN);
1098 }
1099 
1100 static int __noinline
1101 cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1102     uint32_t hash, struct rwlock *blp)
1103 {
1104 	struct mtx *dvlp, *vlp;
1105 	struct vnode *dvp;
1106 
1107 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1108 
1109 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1110 	vlp = NULL;
1111 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1112 		vlp = VP2VNODELOCK(ncp->nc_vp);
1113 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1114 		cache_zap_locked(ncp);
1115 		rw_wunlock(blp);
1116 		cache_unlock_vnodes(dvlp, vlp);
1117 		return (0);
1118 	}
1119 
1120 	dvp = ncp->nc_dvp;
1121 	rw_wunlock(blp);
1122 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1123 }
1124 
1125 static int __noinline
1126 cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp,
1127     uint32_t hash, struct rwlock *blp)
1128 {
1129 	struct mtx *dvlp, *vlp;
1130 	struct vnode *dvp;
1131 
1132 	cache_assert_bucket_locked(ncp, RA_RLOCKED);
1133 
1134 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1135 	vlp = NULL;
1136 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1137 		vlp = VP2VNODELOCK(ncp->nc_vp);
1138 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1139 		rw_runlock(blp);
1140 		rw_wlock(blp);
1141 		cache_zap_locked(ncp);
1142 		rw_wunlock(blp);
1143 		cache_unlock_vnodes(dvlp, vlp);
1144 		return (0);
1145 	}
1146 
1147 	dvp = ncp->nc_dvp;
1148 	rw_runlock(blp);
1149 	return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp));
1150 }
1151 
1152 static int
1153 cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
1154     struct mtx **vlpp1, struct mtx **vlpp2)
1155 {
1156 	struct mtx *dvlp, *vlp;
1157 
1158 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
1159 
1160 	dvlp = VP2VNODELOCK(ncp->nc_dvp);
1161 	vlp = NULL;
1162 	if (!(ncp->nc_flag & NCF_NEGATIVE))
1163 		vlp = VP2VNODELOCK(ncp->nc_vp);
1164 	cache_sort_vnodes(&dvlp, &vlp);
1165 
1166 	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
1167 		cache_zap_locked(ncp);
1168 		cache_unlock_vnodes(dvlp, vlp);
1169 		*vlpp1 = NULL;
1170 		*vlpp2 = NULL;
1171 		return (0);
1172 	}
1173 
1174 	if (*vlpp1 != NULL)
1175 		mtx_unlock(*vlpp1);
1176 	if (*vlpp2 != NULL)
1177 		mtx_unlock(*vlpp2);
1178 	*vlpp1 = NULL;
1179 	*vlpp2 = NULL;
1180 
1181 	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
1182 		cache_zap_locked(ncp);
1183 		cache_unlock_vnodes(dvlp, vlp);
1184 		return (0);
1185 	}
1186 
1187 	rw_wunlock(blp);
1188 	*vlpp1 = dvlp;
1189 	*vlpp2 = vlp;
1190 	if (*vlpp1 != NULL)
1191 		mtx_lock(*vlpp1);
1192 	mtx_lock(*vlpp2);
1193 	rw_wlock(blp);
1194 	return (EAGAIN);
1195 }
1196 
1197 static void
1198 cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
1199 {
1200 
1201 	if (blp != NULL) {
1202 		rw_runlock(blp);
1203 	} else {
1204 		mtx_unlock(vlp);
1205 	}
1206 }
1207 
1208 static int __noinline
1209 cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1210     struct timespec *tsp, int *ticksp)
1211 {
1212 	int ltype;
1213 
1214 	*vpp = dvp;
1215 	CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
1216 			dvp, cnp->cn_nameptr);
1217 	counter_u64_add(dothits, 1);
1218 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
1219 	if (tsp != NULL)
1220 		timespecclear(tsp);
1221 	if (ticksp != NULL)
1222 		*ticksp = ticks;
1223 	vrefact(*vpp);
1224 	/*
1225 	 * When we lookup "." we still can be asked to lock it
1226 	 * differently...
1227 	 */
1228 	ltype = cnp->cn_lkflags & LK_TYPE_MASK;
1229 	if (ltype != VOP_ISLOCKED(*vpp)) {
1230 		if (ltype == LK_EXCLUSIVE) {
1231 			vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
1232 			if (VN_IS_DOOMED((*vpp))) {
1233 				/* forced unmount */
1234 				vrele(*vpp);
1235 				*vpp = NULL;
1236 				return (ENOENT);
1237 			}
1238 		} else
1239 			vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
1240 	}
1241 	return (-1);
1242 }
1243 
1244 static __noinline int
1245 cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp,
1246     struct componentname *cnp, struct timespec *tsp, int *ticksp)
1247 {
1248 	struct namecache *ncp;
1249 	struct rwlock *blp;
1250 	struct mtx *dvlp, *dvlp2;
1251 	uint32_t hash;
1252 	int error;
1253 
1254 	if (cnp->cn_namelen == 2 &&
1255 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1256 		counter_u64_add(dotdothits, 1);
1257 		dvlp = VP2VNODELOCK(dvp);
1258 		dvlp2 = NULL;
1259 		mtx_lock(dvlp);
1260 retry_dotdot:
1261 		ncp = dvp->v_cache_dd;
1262 		if (ncp == NULL) {
1263 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1264 			    "..", NULL);
1265 			mtx_unlock(dvlp);
1266 			if (dvlp2 != NULL)
1267 				mtx_unlock(dvlp2);
1268 			return (0);
1269 		}
1270 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1271 			if (ncp->nc_dvp != dvp)
1272 				panic("dvp %p v_cache_dd %p\n", dvp, ncp);
1273 			if (!cache_zap_locked_vnode_kl2(ncp,
1274 			    dvp, &dvlp2))
1275 				goto retry_dotdot;
1276 			MPASS(dvp->v_cache_dd == NULL);
1277 			mtx_unlock(dvlp);
1278 			if (dvlp2 != NULL)
1279 				mtx_unlock(dvlp2);
1280 			cache_free(ncp);
1281 		} else {
1282 			dvp->v_cache_dd = NULL;
1283 			mtx_unlock(dvlp);
1284 			if (dvlp2 != NULL)
1285 				mtx_unlock(dvlp2);
1286 		}
1287 		return (0);
1288 	}
1289 
1290 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1291 	blp = HASH2BUCKETLOCK(hash);
1292 retry:
1293 	if (CK_LIST_EMPTY(NCHHASH(hash)))
1294 		goto out_no_entry;
1295 
1296 	rw_wlock(blp);
1297 
1298 	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1299 		counter_u64_add(numchecks, 1);
1300 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1301 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1302 			break;
1303 	}
1304 
1305 	/* We failed to find an entry */
1306 	if (ncp == NULL) {
1307 		rw_wunlock(blp);
1308 		goto out_no_entry;
1309 	}
1310 
1311 	error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp);
1312 	if (__predict_false(error != 0)) {
1313 		zap_and_exit_bucket_fail++;
1314 		cache_maybe_yield();
1315 		goto retry;
1316 	}
1317 	counter_u64_add(numposzaps, 1);
1318 	cache_free(ncp);
1319 	return (0);
1320 out_no_entry:
1321 	SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL);
1322 	counter_u64_add(nummisszap, 1);
1323 	return (0);
1324 }
1325 
1326 /**
1327  * Lookup a name in the name cache
1328  *
1329  * # Arguments
1330  *
1331  * - dvp:	Parent directory in which to search.
1332  * - vpp:	Return argument.  Will contain desired vnode on cache hit.
1333  * - cnp:	Parameters of the name search.  The most interesting bits of
1334  *   		the cn_flags field have the following meanings:
1335  *   	- MAKEENTRY:	If clear, free an entry from the cache rather than look
1336  *   			it up.
1337  *   	- ISDOTDOT:	Must be set if and only if cn_nameptr == ".."
1338  * - tsp:	Return storage for cache timestamp.  On a successful (positive
1339  *   		or negative) lookup, tsp will be filled with any timespec that
1340  *   		was stored when this cache entry was created.  However, it will
1341  *   		be clear for "." entries.
1342  * - ticks:	Return storage for alternate cache timestamp.  On a successful
1343  *   		(positive or negative) lookup, it will contain the ticks value
1344  *   		that was current when the cache entry was created, unless cnp
1345  *   		was ".".
1346  *
1347  * # Returns
1348  *
1349  * - -1:	A positive cache hit.  vpp will contain the desired vnode.
1350  * - ENOENT:	A negative cache hit, or dvp was recycled out from under us due
1351  *		to a forced unmount.  vpp will not be modified.  If the entry
1352  *		is a whiteout, then the ISWHITEOUT flag will be set in
1353  *		cnp->cn_flags.
1354  * - 0:		A cache miss.  vpp will not be modified.
1355  *
1356  * # Locking
1357  *
1358  * On a cache hit, vpp will be returned locked and ref'd.  If we're looking up
1359  * .., dvp is unlocked.  If we're looking up . an extra ref is taken, but the
1360  * lock is not recursively acquired.
1361  */
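/*
 * Illustrative caller sketch (editor's note): a filesystem lookup routine
 * would typically consume the return values along these lines, with
 * fs_dirscan_lookup() being a hypothetical stand-in for the filesystem's
 * own directory scan:
 *
 *	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 *	switch (error) {
 *	case -1:	// positive hit: *vpp is referenced and locked
 *		return (0);
 *	case ENOENT:	// negative hit: name is known not to exist
 *		return (error);
 *	default:	// 0: cache miss, fall back to scanning the directory
 *		return (fs_dirscan_lookup(dvp, vpp, cnp));
 *	}
 */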
1362 int
1363 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1364     struct timespec *tsp, int *ticksp)
1365 {
1366 	struct namecache_ts *ncp_ts;
1367 	struct namecache *ncp;
1368 	struct negstate *negstate;
1369 	struct rwlock *blp;
1370 	struct mtx *dvlp;
1371 	uint32_t hash;
1372 	enum vgetstate vs;
1373 	int error, ltype;
1374 	bool try_smr, doing_smr, whiteout;
1375 
1376 #ifdef DEBUG_CACHE
1377 	if (__predict_false(!doingcache)) {
1378 		cnp->cn_flags &= ~MAKEENTRY;
1379 		return (0);
1380 	}
1381 #endif
1382 
1383 	if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.'))
1384 		return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp));
1385 
1386 	if ((cnp->cn_flags & MAKEENTRY) == 0)
1387 		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
1388 
1389 	try_smr = true;
1390 	if (cnp->cn_nameiop == CREATE)
1391 		try_smr = false;
1392 retry:
1393 	doing_smr = false;
1394 	blp = NULL;
1395 	dvlp = NULL;
1396 	error = 0;
1397 	if (cnp->cn_namelen == 2 &&
1398 	    cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') {
1399 		counter_u64_add(dotdothits, 1);
1400 		dvlp = VP2VNODELOCK(dvp);
1401 		mtx_lock(dvlp);
1402 		ncp = dvp->v_cache_dd;
1403 		if (ncp == NULL) {
1404 			SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
1405 			    "..", NULL);
1406 			mtx_unlock(dvlp);
1407 			return (0);
1408 		}
1409 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
1410 			if (ncp->nc_flag & NCF_NEGATIVE)
1411 				*vpp = NULL;
1412 			else
1413 				*vpp = ncp->nc_vp;
1414 		} else
1415 			*vpp = ncp->nc_dvp;
1416 		/* Return failure if negative entry was found. */
1417 		if (*vpp == NULL)
1418 			goto negative_success;
1419 		CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
1420 		    dvp, cnp->cn_nameptr, *vpp);
1421 		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
1422 		    *vpp);
1423 		cache_out_ts(ncp, tsp, ticksp);
1424 		if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
1425 		    NCF_DTS && tsp != NULL) {
1426 			ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1427 			*tsp = ncp_ts->nc_dotdottime;
1428 		}
1429 		goto success;
1430 	}
1431 
1432 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
1433 retry_hashed:
1434 	if (try_smr) {
1435 		vfs_smr_enter();
1436 		doing_smr = true;
1437 		try_smr = false;
1438 	} else {
1439 		blp = HASH2BUCKETLOCK(hash);
1440 		rw_rlock(blp);
1441 	}
1442 
1443 	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
1444 		counter_u64_add(numchecks, 1);
1445 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
1446 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
1447 			break;
1448 	}
1449 
1450 	/* We failed to find an entry */
1451 	if (__predict_false(ncp == NULL)) {
1452 		if (doing_smr)
1453 			vfs_smr_exit();
1454 		else
1455 			rw_runlock(blp);
1456 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
1457 		    NULL);
1458 		counter_u64_add(nummiss, 1);
1459 		return (0);
1460 	}
1461 
1462 	if (ncp->nc_flag & NCF_NEGATIVE)
1463 		goto negative_success;
1464 
1465 	/* We found a "positive" match, return the vnode */
1466 	counter_u64_add(numposhits, 1);
1467 	*vpp = ncp->nc_vp;
1468 	CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
1469 	    dvp, cnp->cn_nameptr, *vpp, ncp);
1470 	SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
1471 	    *vpp);
1472 	cache_out_ts(ncp, tsp, ticksp);
1473 success:
1474 	/*
1475 	 * On success we return a locked and ref'd vnode as per the lookup
1476 	 * protocol.
1477 	 */
1478 	MPASS(dvp != *vpp);
1479 	ltype = 0;	/* silence gcc warning */
1480 	if (cnp->cn_flags & ISDOTDOT) {
1481 		ltype = VOP_ISLOCKED(dvp);
1482 		VOP_UNLOCK(dvp);
1483 	}
1484 	if (doing_smr) {
1485 		if (cache_ncp_invalid(ncp)) {
1486 			vfs_smr_exit();
1487 			*vpp = NULL;
1488 			goto retry;
1489 		}
1490 		vs = vget_prep_smr(*vpp);
1491 		vfs_smr_exit();
1492 		if (vs == VGET_NONE) {
1493 			*vpp = NULL;
1494 			goto retry;
1495 		}
1496 	} else {
1497 		vs = vget_prep(*vpp);
1498 		cache_lookup_unlock(blp, dvlp);
1499 	}
1500 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
1501 	if (cnp->cn_flags & ISDOTDOT) {
1502 		vn_lock(dvp, ltype | LK_RETRY);
1503 		if (VN_IS_DOOMED(dvp)) {
1504 			if (error == 0)
1505 				vput(*vpp);
1506 			*vpp = NULL;
1507 			return (ENOENT);
1508 		}
1509 	}
1510 	if (error) {
1511 		*vpp = NULL;
1512 		goto retry;
1513 	}
1514 	if ((cnp->cn_flags & ISLASTCN) &&
1515 	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
1516 		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
1517 	}
1518 	return (-1);
1519 
1520 negative_success:
1521 	/* We found a negative match, and want to create it, so purge */
1522 	if (cnp->cn_nameiop == CREATE) {
1523 		MPASS(!doing_smr);
1524 		counter_u64_add(numnegzaps, 1);
1525 		goto zap_and_exit;
1526 	}
1527 
1528 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
1529 	cache_out_ts(ncp, tsp, ticksp);
1530 	counter_u64_add(numneghits, 1);
1531 	whiteout = (ncp->nc_flag & NCF_WHITE);
1532 
1533 	if (doing_smr) {
1534 		/*
1535 		 * We need to take locks to promote an entry.
1536 		 */
1537 		negstate = NCP2NEGSTATE(ncp);
1538 		if ((negstate->neg_flag & NEG_HOT) == 0 ||
1539 		    cache_ncp_invalid(ncp)) {
1540 			vfs_smr_exit();
1541 			doing_smr = false;
1542 			goto retry_hashed;
1543 		}
1544 		vfs_smr_exit();
1545 	} else {
1546 		cache_negative_hit(ncp);
1547 		cache_lookup_unlock(blp, dvlp);
1548 	}
1549 	if (whiteout)
1550 		cnp->cn_flags |= ISWHITEOUT;
1551 	return (ENOENT);
1552 
1553 zap_and_exit:
1554 	MPASS(!doing_smr);
1555 	if (blp != NULL)
1556 		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
1557 	else
1558 		error = cache_zap_locked_vnode(ncp, dvp);
1559 	if (__predict_false(error != 0)) {
1560 		zap_and_exit_bucket_fail2++;
1561 		cache_maybe_yield();
1562 		goto retry;
1563 	}
1564 	cache_free(ncp);
1565 	return (0);
1566 }
1567 
1568 struct celockstate {
1569 	struct mtx *vlp[3];
1570 	struct rwlock *blp[2];
1571 };
1572 CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
1573 CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
1574 
1575 static inline void
1576 cache_celockstate_init(struct celockstate *cel)
1577 {
1578 
1579 	bzero(cel, sizeof(*cel));
1580 }
1581 
1582 static void
1583 cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
1584     struct vnode *dvp)
1585 {
1586 	struct mtx *vlp1, *vlp2;
1587 
1588 	MPASS(cel->vlp[0] == NULL);
1589 	MPASS(cel->vlp[1] == NULL);
1590 	MPASS(cel->vlp[2] == NULL);
1591 
1592 	MPASS(vp != NULL || dvp != NULL);
1593 
1594 	vlp1 = VP2VNODELOCK(vp);
1595 	vlp2 = VP2VNODELOCK(dvp);
1596 	cache_sort_vnodes(&vlp1, &vlp2);
1597 
1598 	if (vlp1 != NULL) {
1599 		mtx_lock(vlp1);
1600 		cel->vlp[0] = vlp1;
1601 	}
1602 	mtx_lock(vlp2);
1603 	cel->vlp[1] = vlp2;
1604 }
1605 
1606 static void
1607 cache_unlock_vnodes_cel(struct celockstate *cel)
1608 {
1609 
1610 	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
1611 
1612 	if (cel->vlp[0] != NULL)
1613 		mtx_unlock(cel->vlp[0]);
1614 	if (cel->vlp[1] != NULL)
1615 		mtx_unlock(cel->vlp[1]);
1616 	if (cel->vlp[2] != NULL)
1617 		mtx_unlock(cel->vlp[2]);
1618 }
1619 
1620 static bool
1621 cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
1622 {
1623 	struct mtx *vlp;
1624 	bool ret;
1625 
1626 	cache_assert_vlp_locked(cel->vlp[0]);
1627 	cache_assert_vlp_locked(cel->vlp[1]);
1628 	MPASS(cel->vlp[2] == NULL);
1629 
1630 	MPASS(vp != NULL);
1631 	vlp = VP2VNODELOCK(vp);
1632 
1633 	ret = true;
1634 	if (vlp >= cel->vlp[1]) {
1635 		mtx_lock(vlp);
1636 	} else {
1637 		if (mtx_trylock(vlp))
1638 			goto out;
1639 		cache_lock_vnodes_cel_3_failures++;
1640 		cache_unlock_vnodes_cel(cel);
1641 		if (vlp < cel->vlp[0]) {
1642 			mtx_lock(vlp);
1643 			mtx_lock(cel->vlp[0]);
1644 			mtx_lock(cel->vlp[1]);
1645 		} else {
1646 			if (cel->vlp[0] != NULL)
1647 				mtx_lock(cel->vlp[0]);
1648 			mtx_lock(vlp);
1649 			mtx_lock(cel->vlp[1]);
1650 		}
1651 		ret = false;
1652 	}
1653 out:
1654 	cel->vlp[2] = vlp;
1655 	return (ret);
1656 }
1657 
1658 static void
1659 cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
1660     struct rwlock *blp2)
1661 {
1662 
1663 	MPASS(cel->blp[0] == NULL);
1664 	MPASS(cel->blp[1] == NULL);
1665 
1666 	cache_sort_vnodes(&blp1, &blp2);
1667 
1668 	if (blp1 != NULL) {
1669 		rw_wlock(blp1);
1670 		cel->blp[0] = blp1;
1671 	}
1672 	rw_wlock(blp2);
1673 	cel->blp[1] = blp2;
1674 }
1675 
1676 static void
1677 cache_unlock_buckets_cel(struct celockstate *cel)
1678 {
1679 
1680 	if (cel->blp[0] != NULL)
1681 		rw_wunlock(cel->blp[0]);
1682 	rw_wunlock(cel->blp[1]);
1683 }
1684 
1685 /*
1686  * Lock part of the cache affected by the insertion.
1687  *
1688  * This means vnodelocks for dvp, vp and the relevant bucketlock.
1689  * However, insertion can result in removal of an old entry. In this
1690  * case we have an additional vnode and bucketlock pair to lock. If the
1691  * entry is negative, its neglist lock stands in for the third vnode lock.
1692  *
1693  * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
1694  * preserving the locking order (smaller address first).
1695  */
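/*
 * Editor's note: the third vnode lock in the worst case belongs to the vnode
 * pointed at by vp's existing ".." entry (vp->v_cache_dd->nc_vp), which may
 * have to be zapped as part of the insertion; see the loop below.
 */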
1696 static void
1697 cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1698     uint32_t hash)
1699 {
1700 	struct namecache *ncp;
1701 	struct rwlock *blps[2];
1702 
1703 	blps[0] = HASH2BUCKETLOCK(hash);
1704 	for (;;) {
1705 		blps[1] = NULL;
1706 		cache_lock_vnodes_cel(cel, dvp, vp);
1707 		if (vp == NULL || vp->v_type != VDIR)
1708 			break;
1709 		ncp = vp->v_cache_dd;
1710 		if (ncp == NULL)
1711 			break;
1712 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1713 			break;
1714 		MPASS(ncp->nc_dvp == vp);
1715 		blps[1] = NCP2BUCKETLOCK(ncp);
1716 		if (ncp->nc_flag & NCF_NEGATIVE)
1717 			break;
1718 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1719 			break;
1720 		/*
1721 		 * All vnodes got re-locked. Re-validate the state and if
1722 		 * nothing changed we are done. Otherwise restart.
1723 		 */
1724 		if (ncp == vp->v_cache_dd &&
1725 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1726 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1727 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1728 			break;
1729 		cache_unlock_vnodes_cel(cel);
1730 		cel->vlp[0] = NULL;
1731 		cel->vlp[1] = NULL;
1732 		cel->vlp[2] = NULL;
1733 	}
1734 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1735 }
1736 
1737 static void
1738 cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
1739     uint32_t hash)
1740 {
1741 	struct namecache *ncp;
1742 	struct rwlock *blps[2];
1743 
1744 	blps[0] = HASH2BUCKETLOCK(hash);
1745 	for (;;) {
1746 		blps[1] = NULL;
1747 		cache_lock_vnodes_cel(cel, dvp, vp);
1748 		ncp = dvp->v_cache_dd;
1749 		if (ncp == NULL)
1750 			break;
1751 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
1752 			break;
1753 		MPASS(ncp->nc_dvp == dvp);
1754 		blps[1] = NCP2BUCKETLOCK(ncp);
1755 		if (ncp->nc_flag & NCF_NEGATIVE)
1756 			break;
1757 		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
1758 			break;
1759 		if (ncp == dvp->v_cache_dd &&
1760 		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
1761 		    blps[1] == NCP2BUCKETLOCK(ncp) &&
1762 		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
1763 			break;
1764 		cache_unlock_vnodes_cel(cel);
1765 		cel->vlp[0] = NULL;
1766 		cel->vlp[1] = NULL;
1767 		cel->vlp[2] = NULL;
1768 	}
1769 	cache_lock_buckets_cel(cel, blps[0], blps[1]);
1770 }
1771 
1772 static void
1773 cache_enter_unlock(struct celockstate *cel)
1774 {
1775 
1776 	cache_unlock_buckets_cel(cel);
1777 	cache_unlock_vnodes_cel(cel);
1778 }
1779 
1780 static void __noinline
1781 cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp,
1782     struct componentname *cnp)
1783 {
1784 	struct celockstate cel;
1785 	struct namecache *ncp;
1786 	uint32_t hash;
1787 	int len;
1788 
1789 	if (dvp->v_cache_dd == NULL)
1790 		return;
1791 	len = cnp->cn_namelen;
1792 	cache_celockstate_init(&cel);
1793 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1794 	cache_enter_lock_dd(&cel, dvp, vp, hash);
1795 	ncp = dvp->v_cache_dd;
1796 	if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) {
1797 		KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent"));
1798 		cache_zap_locked(ncp);
1799 	} else {
1800 		ncp = NULL;
1801 	}
1802 	dvp->v_cache_dd = NULL;
1803 	cache_enter_unlock(&cel);
1804 	cache_free(ncp);
1805 }
1806 
1807 /*
1808  * Add an entry to the cache.
1809  */
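/*
 * Illustrative usage (editor's note): filesystems commonly reach this through
 * the cache_enter() wrapper (presumably defined in sys/vnode.h to pass NULL
 * timestamps) after a successful directory scan:
 *
 *	if (cnp->cn_flags & MAKEENTRY)
 *		cache_enter(dvp, vp, cnp);
 */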
1810 void
1811 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
1812     struct timespec *tsp, struct timespec *dtsp)
1813 {
1814 	struct celockstate cel;
1815 	struct namecache *ncp, *n2, *ndd;
1816 	struct namecache_ts *ncp_ts, *n2_ts;
1817 	struct nchashhead *ncpp;
1818 	uint32_t hash;
1819 	int flag;
1820 	int len;
1821 	u_long lnumcache;
1822 
1823 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
1824 	VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp,
1825 	    ("cache_enter: Adding a doomed vnode"));
1826 	VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp,
1827 	    ("cache_enter: Doomed vnode used as src"));
1828 
1829 #ifdef DEBUG_CACHE
1830 	if (__predict_false(!doingcache))
1831 		return;
1832 #endif
1833 
1834 	flag = 0;
1835 	if (__predict_false(cnp->cn_nameptr[0] == '.')) {
1836 		if (cnp->cn_namelen == 1)
1837 			return;
1838 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
1839 			cache_enter_dotdot_prep(dvp, vp, cnp);
1840 			flag = NCF_ISDOTDOT;
1841 		}
1842 	}
1843 
1844 	/*
1845 	 * Avoid blowout in namecache entries.
1846 	 */
1847 	lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
1848 	if (__predict_false(lnumcache >= ncsize)) {
1849 		atomic_add_long(&numcache, -1);
1850 		counter_u64_add(numdrops, 1);
1851 		return;
1852 	}
1853 
1854 	cache_celockstate_init(&cel);
1855 	ndd = NULL;
1856 	ncp_ts = NULL;
1857 
1858 	/*
1859 	 * Calculate the hash key and setup as much of the new
1860 	 * namecache entry as possible before acquiring the lock.
1861 	 */
1862 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
1863 	ncp->nc_flag = flag;
1864 	ncp->nc_vp = vp;
1865 	if (vp == NULL)
1866 		cache_negative_init(ncp);
1867 	ncp->nc_dvp = dvp;
1868 	if (tsp != NULL) {
1869 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
1870 		ncp_ts->nc_time = *tsp;
1871 		ncp_ts->nc_ticks = ticks;
1872 		ncp_ts->nc_nc.nc_flag |= NCF_TS;
1873 		if (dtsp != NULL) {
1874 			ncp_ts->nc_dotdottime = *dtsp;
1875 			ncp_ts->nc_nc.nc_flag |= NCF_DTS;
1876 		}
1877 	}
1878 	len = ncp->nc_nlen = cnp->cn_namelen;
1879 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
1880 	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
1881 	cache_enter_lock(&cel, dvp, vp, hash);
1882 
1883 	/*
1884 	 * See if this vnode or negative entry is already in the cache
1885 	 * with this name.  This can happen with concurrent lookups of
1886 	 * the same path name.
1887 	 */
1888 	ncpp = NCHHASH(hash);
1889 	CK_LIST_FOREACH(n2, ncpp, nc_hash) {
1890 		if (n2->nc_dvp == dvp &&
1891 		    n2->nc_nlen == cnp->cn_namelen &&
1892 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
1893 			if (tsp != NULL) {
1894 				KASSERT((n2->nc_flag & NCF_TS) != 0,
1895 				    ("no NCF_TS"));
1896 				n2_ts = __containerof(n2, struct namecache_ts, nc_nc);
1897 				n2_ts->nc_time = ncp_ts->nc_time;
1898 				n2_ts->nc_ticks = ncp_ts->nc_ticks;
1899 				if (dtsp != NULL) {
1900 					n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime;
1901 					n2_ts->nc_nc.nc_flag |= NCF_DTS;
1902 				}
1903 			}
1904 			goto out_unlock_free;
1905 		}
1906 	}
1907 
1908 	if (flag == NCF_ISDOTDOT) {
1909 		/*
1910 		 * See if we are trying to add a ".." entry, but some other
1911 		 * lookup has already populated the v_cache_dd pointer.
1912 		 */
1913 		if (dvp->v_cache_dd != NULL)
1914 			goto out_unlock_free;
1915 		KASSERT(vp == NULL || vp->v_type == VDIR,
1916 		    ("wrong vnode type %p", vp));
1917 		dvp->v_cache_dd = ncp;
1918 	}
1919 
1920 	if (vp != NULL) {
1921 		if (vp->v_type == VDIR) {
1922 			if (flag != NCF_ISDOTDOT) {
1923 				/*
1924 				 * For this case, the cache entry maps both the name of
1925 				 * the directory and, via v_cache_dd, the name ".." for
1926 				 * the directory's parent.
1927 				 */
1928 				if ((ndd = vp->v_cache_dd) != NULL) {
1929 					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
1930 						cache_zap_locked(ndd);
1931 					else
1932 						ndd = NULL;
1933 				}
1934 				vp->v_cache_dd = ncp;
1935 			}
1936 		} else {
1937 			vp->v_cache_dd = NULL;
1938 		}
1939 	}
1940 
1941 	if (flag != NCF_ISDOTDOT) {
1942 		if (LIST_EMPTY(&dvp->v_cache_src)) {
1943 			vhold(dvp);
1944 			counter_u64_add(numcachehv, 1);
1945 		}
1946 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
1947 	}
1948 
1949 	/*
1950 	 * If the entry is "negative", we place it into the
1951 	 * "negative" cache queue; otherwise, we place it into the
1952 	 * destination vnode's cache entries queue.
1953 	 */
1954 	if (vp != NULL) {
1955 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
1956 		SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name,
1957 		    vp);
1958 	} else {
1959 		if (cnp->cn_flags & ISWHITEOUT)
1960 			ncp->nc_flag |= NCF_WHITE;
1961 		cache_negative_insert(ncp);
1962 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
1963 		    ncp->nc_name);
1964 	}
1965 
1966 	atomic_thread_fence_rel();
1967 	/*
1968 	 * Insert the new namecache entry into the appropriate chain
1969 	 * within the cache entries table.
1970 	 */
1971 	CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
1972 
1973 	cache_enter_unlock(&cel);
1974 	if (numneg * ncnegfactor > lnumcache)
1975 		cache_negative_zap_one();
1976 	cache_free(ndd);
1977 	return;
1978 out_unlock_free:
1979 	cache_enter_unlock(&cel);
1980 	cache_free(ncp);
1981 	return;
1982 }
1983 
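/*
 * Return the smallest power of two strictly greater than val, e.g.
 * cache_roundup_2(7) == 8 and cache_roundup_2(8) == 16.  Used to size
 * the hash tables and, minus one, to derive the corresponding masks.
 */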
1984 static u_int
1985 cache_roundup_2(u_int val)
1986 {
1987 	u_int res;
1988 
1989 	for (res = 1; res <= val; res <<= 1)
1990 		continue;
1991 
1992 	return (res);
1993 }
1994 
1995 /*
1996  * Name cache initialization, from vfs_init() when we are booting
1997  */
1998 static void
1999 nchinit(void *dummy __unused)
2000 {
2001 	u_int i;
2002 
2003 	cache_zone_small = uma_zcreate("S VFS Cache",
2004 	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
2005 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
2006 	    UMA_ZONE_ZINIT);
2007 	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
2008 	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
2009 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
2010 	    UMA_ZONE_ZINIT);
2011 	cache_zone_large = uma_zcreate("L VFS Cache",
2012 	    sizeof(struct namecache) + NAME_MAX + 1,
2013 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
2014 	    UMA_ZONE_ZINIT);
2015 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
2016 	    sizeof(struct namecache_ts) + NAME_MAX + 1,
2017 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
2018 	    UMA_ZONE_ZINIT);
2019 
2020 	VFS_SMR_ZONE_SET(cache_zone_small);
2021 	VFS_SMR_ZONE_SET(cache_zone_small_ts);
2022 	VFS_SMR_ZONE_SET(cache_zone_large);
2023 	VFS_SMR_ZONE_SET(cache_zone_large_ts);
2024 
2025 	ncsize = desiredvnodes * ncsizefactor;
2026 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
2027 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
2028 	if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */
2029 		ncbuckethash = 7;
2030 	if (ncbuckethash > nchash)
2031 		ncbuckethash = nchash;
2032 	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
2033 	    M_WAITOK | M_ZERO);
2034 	for (i = 0; i < numbucketlocks; i++)
2035 		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
2036 	ncvnodehash = ncbuckethash;
2037 	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
2038 	    M_WAITOK | M_ZERO);
2039 	for (i = 0; i < numvnodelocks; i++)
2040 		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
2041 	ncpurgeminvnodes = numbucketlocks * 2;
2042 
2043 	ncneghash = 3;
2044 	neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE,
2045 	    M_WAITOK | M_ZERO);
2046 	for (i = 0; i < numneglists; i++) {
2047 		mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
2048 		TAILQ_INIT(&neglists[i].nl_list);
2049 	}
2050 	mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF);
2051 	TAILQ_INIT(&ncneg_hot.nl_list);
2052 
2053 	mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
2054 }
2055 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
2056 
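/*
 * Resize the namecache hash table for a new maxvnodes value and rehash
 * all existing entries into the new table.  Every vnode and bucket lock
 * is held across the swap, so no entry can be added or removed while
 * the tables are exchanged.
 */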
2057 void
2058 cache_changesize(u_long newmaxvnodes)
2059 {
2060 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
2061 	u_long new_nchash, old_nchash;
2062 	struct namecache *ncp;
2063 	uint32_t hash;
2064 	u_long newncsize;
2065 	int i;
2066 
2067 	newncsize = newmaxvnodes * ncsizefactor;
2068 	newmaxvnodes = cache_roundup_2(newmaxvnodes * 2);
2069 	if (newmaxvnodes < numbucketlocks)
2070 		newmaxvnodes = numbucketlocks;
2071 
2072 	new_nchashtbl = hashinit(newmaxvnodes, M_VFSCACHE, &new_nchash);
2073 	/* If same hash table size, nothing to do */
2074 	if (nchash == new_nchash) {
2075 		free(new_nchashtbl, M_VFSCACHE);
2076 		return;
2077 	}
2078 	/*
2079 	 * Move everything from the old hash table to the new table.
2080 	 * While all of the locks are held, none of the entries can be
2081 	 * removed, since removal requires taking them off their hash chains.
2082 	 */
2083 	cache_lock_all_vnodes();
2084 	cache_lock_all_buckets();
2085 	old_nchashtbl = nchashtbl;
2086 	old_nchash = nchash;
2087 	nchashtbl = new_nchashtbl;
2088 	nchash = new_nchash;
2089 	for (i = 0; i <= old_nchash; i++) {
2090 		while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) {
2091 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
2092 			    ncp->nc_dvp);
2093 			CK_LIST_REMOVE(ncp, nc_hash);
2094 			CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
2095 		}
2096 	}
2097 	ncsize = newncsize;
2098 	cache_unlock_all_buckets();
2099 	cache_unlock_all_vnodes();
2100 	free(old_nchashtbl, M_VFSCACHE);
2101 }
2102 
2103 /*
2104  * Invalidate all entries from and to a particular vnode.
2105  */
2106 void
2107 cache_purge(struct vnode *vp)
2108 {
2109 	TAILQ_HEAD(, namecache) ncps;
2110 	struct namecache *ncp, *nnp;
2111 	struct mtx *vlp, *vlp2;
2112 
2113 	CTR1(KTR_VFS, "cache_purge(%p)", vp);
2114 	SDT_PROBE1(vfs, namecache, purge, done, vp);
2115 	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
2116 	    vp->v_cache_dd == NULL)
2117 		return;
2118 	TAILQ_INIT(&ncps);
2119 	vlp = VP2VNODELOCK(vp);
2120 	vlp2 = NULL;
2121 	mtx_lock(vlp);
2122 retry:
2123 	while (!LIST_EMPTY(&vp->v_cache_src)) {
2124 		ncp = LIST_FIRST(&vp->v_cache_src);
2125 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2126 			goto retry;
2127 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2128 	}
2129 	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
2130 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
2131 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2132 			goto retry;
2133 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2134 	}
2135 	ncp = vp->v_cache_dd;
2136 	if (ncp != NULL) {
2137 		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
2138 		   ("lost dotdot link"));
2139 		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
2140 			goto retry;
2141 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2142 	}
2143 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
2144 	mtx_unlock(vlp);
2145 	if (vlp2 != NULL)
2146 		mtx_unlock(vlp2);
2147 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2148 		cache_free(ncp);
2149 	}
2150 }
2151 
2152 /*
2153  * Invalidate all negative entries for a particular directory vnode.
2154  */
2155 void
2156 cache_purge_negative(struct vnode *vp)
2157 {
2158 	TAILQ_HEAD(, namecache) ncps;
2159 	struct namecache *ncp, *nnp;
2160 	struct mtx *vlp;
2161 
2162 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
2163 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
2164 	if (LIST_EMPTY(&vp->v_cache_src))
2165 		return;
2166 	TAILQ_INIT(&ncps);
2167 	vlp = VP2VNODELOCK(vp);
2168 	mtx_lock(vlp);
2169 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
2170 		if (!(ncp->nc_flag & NCF_NEGATIVE))
2171 			continue;
2172 		cache_zap_negative_locked_vnode_kl(ncp, vp);
2173 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
2174 	}
2175 	mtx_unlock(vlp);
2176 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2177 		cache_free(ncp);
2178 	}
2179 }
2180 
2181 /*
2182  * Flush all entries referencing a particular filesystem.
2183  */
2184 void
2185 cache_purgevfs(struct mount *mp, bool force)
2186 {
2187 	TAILQ_HEAD(, namecache) ncps;
2188 	struct mtx *vlp1, *vlp2;
2189 	struct rwlock *blp;
2190 	struct nchashhead *bucket;
2191 	struct namecache *ncp, *nnp;
2192 	u_long i, j, n_nchash;
2193 	int error;
2194 
2195 	/* Scan hash tables for applicable entries */
2196 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
2197 	if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes)
2198 		return;
2199 	TAILQ_INIT(&ncps);
2200 	n_nchash = nchash + 1;
2201 	vlp1 = vlp2 = NULL;
2202 	for (i = 0; i < numbucketlocks; i++) {
2203 		blp = (struct rwlock *)&bucketlocks[i];
2204 		rw_wlock(blp);
2205 		for (j = i; j < n_nchash; j += numbucketlocks) {
2206 retry:
2207 			bucket = &nchashtbl[j];
2208 			CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
2209 				cache_assert_bucket_locked(ncp, RA_WLOCKED);
2210 				if (ncp->nc_dvp->v_mount != mp)
2211 					continue;
2212 				error = cache_zap_wlocked_bucket_kl(ncp, blp,
2213 				    &vlp1, &vlp2);
2214 				if (error != 0)
2215 					goto retry;
2216 				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
2217 			}
2218 		}
2219 		rw_wunlock(blp);
2220 		if (vlp1 == NULL && vlp2 == NULL)
2221 			cache_maybe_yield();
2222 	}
2223 	if (vlp1 != NULL)
2224 		mtx_unlock(vlp1);
2225 	if (vlp2 != NULL)
2226 		mtx_unlock(vlp2);
2227 
2228 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
2229 		cache_free(ncp);
2230 	}
2231 }
2232 
2233 /*
2234  * Perform canonical checks and a cache lookup, and pass on to the
2235  * filesystem through the vop_cachedlookup only if needed.
2236  */
2237 
2238 int
2239 vfs_cache_lookup(struct vop_lookup_args *ap)
2240 {
2241 	struct vnode *dvp;
2242 	int error;
2243 	struct vnode **vpp = ap->a_vpp;
2244 	struct componentname *cnp = ap->a_cnp;
2245 	int flags = cnp->cn_flags;
2246 
2247 	*vpp = NULL;
2248 	dvp = ap->a_dvp;
2249 
2250 	if (dvp->v_type != VDIR)
2251 		return (ENOTDIR);
2252 
2253 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
2254 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
2255 		return (EROFS);
2256 
2257 	error = vn_dir_check_exec(dvp, cnp);
2258 	if (error != 0)
2259 		return (error);
2260 
2261 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
2262 	if (error == 0)
2263 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
2264 	if (error == -1)
2265 		return (0);
2266 	return (error);
2267 }
2268 
2269 /* Implementation of the getcwd syscall. */
2270 int
2271 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
2272 {
2273 	char *buf, *retbuf;
2274 	size_t buflen;
2275 	int error;
2276 
2277 	buflen = uap->buflen;
2278 	if (__predict_false(buflen < 2))
2279 		return (EINVAL);
2280 	if (buflen > MAXPATHLEN)
2281 		buflen = MAXPATHLEN;
2282 
2283 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2284 	error = vn_getcwd(td, buf, &retbuf, &buflen);
2285 	if (error == 0)
2286 		error = copyout(retbuf, uap->buf, buflen);
2287 	free(buf, M_TEMP);
2288 	return (error);
2289 }
2290 
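/*
 * Resolve the current working directory of the thread into a path
 * relative to its root directory; backend for sys___getcwd() above.
 */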
2291 int
2292 vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen)
2293 {
2294 	struct pwd *pwd;
2295 	int error;
2296 
2297 	pwd = pwd_hold(td);
2298 	error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen);
2299 	pwd_drop(pwd);
2300 
2301 #ifdef KTRACE
2302 	if (KTRPOINT(curthread, KTR_NAMEI) && error == 0)
2303 		ktrnamei(*retbuf);
2304 #endif
2305 	return (error);
2306 }
2307 
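/*
 * Common backend for sys___realpathat(): look up the given path, resolve
 * the vnode found to an absolute path (accounting for the final hardlink
 * component) and copy the result out to the user buffer.
 */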
2308 static int
2309 kern___realpathat(struct thread *td, int fd, const char *path, char *buf,
2310     size_t size, int flags, enum uio_seg pathseg)
2311 {
2312 	struct nameidata nd;
2313 	char *retbuf, *freebuf;
2314 	int error;
2315 
2316 	if (flags != 0)
2317 		return (EINVAL);
2318 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1,
2319 	    pathseg, path, fd, &cap_fstat_rights, td);
2320 	if ((error = namei(&nd)) != 0)
2321 		return (error);
2322 	error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size);
2323 	if (error == 0) {
2324 		error = copyout(retbuf, buf, size);
2325 		free(freebuf, M_TEMP);
2326 	}
2327 	NDFREE(&nd, 0);
2328 	return (error);
2329 }
2330 
2331 int
2332 sys___realpathat(struct thread *td, struct __realpathat_args *uap)
2333 {
2334 
2335 	return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size,
2336 	    uap->flags, UIO_USERSPACE));
2337 }
2338 
2339 /*
2340  * Retrieve the full filesystem path that correspond to a vnode from the name
2341  * Retrieve the full filesystem path that corresponds to a vnode from the
2342  * name cache (if available).
2343 int
2344 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
2345 {
2346 	struct pwd *pwd;
2347 	char *buf;
2348 	size_t buflen;
2349 	int error;
2350 
2351 	if (__predict_false(vn == NULL))
2352 		return (EINVAL);
2353 
2354 	buflen = MAXPATHLEN;
2355 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2356 	pwd = pwd_hold(td);
2357 	error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen);
2358 	pwd_drop(pwd);
2359 
2360 	if (!error)
2361 		*freebuf = buf;
2362 	else
2363 		free(buf, M_TEMP);
2364 	return (error);
2365 }
2366 
2367 /*
2368  * This function is similar to vn_fullpath, but it attempts to lookup the
2369  * pathname relative to the global root mount point.  This is required for the
2370  * auditing sub-system, as audited pathnames must be absolute, relative to the
2371  * global root mount point.
2372  */
2373 int
2374 vn_fullpath_global(struct thread *td, struct vnode *vn,
2375     char **retbuf, char **freebuf)
2376 {
2377 	char *buf;
2378 	size_t buflen;
2379 	int error;
2380 
2381 	if (__predict_false(vn == NULL))
2382 		return (EINVAL);
2383 	buflen = MAXPATHLEN;
2384 	buf = malloc(buflen, M_TEMP, M_WAITOK);
2385 	error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen);
2386 	if (!error)
2387 		*freebuf = buf;
2388 	else
2389 		free(buf, M_TEMP);
2390 	return (error);
2391 }
2392 
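/*
 * Translate a vnode into its parent directory and name component: the
 * name is prepended into buf (decrementing *buflen) and *vp is replaced
 * with a referenced parent vnode.  The namecache is consulted first; on
 * a miss the filesystem is asked via VOP_VPTOCNP().
 */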
2393 int
2394 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen)
2395 {
2396 	struct vnode *dvp;
2397 	struct namecache *ncp;
2398 	struct mtx *vlp;
2399 	int error;
2400 
2401 	vlp = VP2VNODELOCK(*vp);
2402 	mtx_lock(vlp);
2403 	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
2404 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2405 			break;
2406 	}
2407 	if (ncp != NULL) {
2408 		if (*buflen < ncp->nc_nlen) {
2409 			mtx_unlock(vlp);
2410 			vrele(*vp);
2411 			counter_u64_add(numfullpathfail4, 1);
2412 			error = ENOMEM;
2413 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2414 			    vp, NULL);
2415 			return (error);
2416 		}
2417 		*buflen -= ncp->nc_nlen;
2418 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
2419 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
2420 		    ncp->nc_name, vp);
2421 		dvp = *vp;
2422 		*vp = ncp->nc_dvp;
2423 		vref(*vp);
2424 		mtx_unlock(vlp);
2425 		vrele(dvp);
2426 		return (0);
2427 	}
2428 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
2429 
2430 	mtx_unlock(vlp);
2431 	vn_lock(*vp, LK_SHARED | LK_RETRY);
2432 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
2433 	vput(*vp);
2434 	if (error) {
2435 		counter_u64_add(numfullpathfail2, 1);
2436 		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
2437 		return (error);
2438 	}
2439 
2440 	*vp = dvp;
2441 	if (VN_IS_DOOMED(dvp)) {
2442 		/* forced unmount */
2443 		vrele(dvp);
2444 		error = ENOENT;
2445 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
2446 		return (error);
2447 	}
2448 	/*
2449 	 * *vp still has its use count incremented.
2450 	 */
2451 
2452 	return (0);
2453 }
2454 
2455 /*
2456  * Resolve a directory to a pathname.
2457  *
2458  * The name of the directory can always be found in the namecache or fetched
2459  * from the filesystem. There is also guaranteed to be only one parent, meaning
2460  * we can just follow vnodes up until we find the root.
2461  *
2462  * The vnode must be referenced.
2463  */
2464 static int
2465 vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir,
2466     char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend)
2467 {
2468 #ifdef KDTRACE_HOOKS
2469 	struct vnode *startvp = vp;
2470 #endif
2471 	struct vnode *vp1;
2472 	size_t buflen;
2473 	int error;
2474 
2475 	VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp);
2476 	VNPASS(vp->v_usecount > 0, vp);
2477 
2478 	buflen = *len;
2479 
2480 	if (!slash_prefixed) {
2481 		MPASS(*len >= 2);
2482 		buflen--;
2483 		buf[buflen] = '\0';
2484 	}
2485 
2486 	error = 0;
2487 
2488 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
2489 	counter_u64_add(numfullpathcalls, 1);
2490 	while (vp != rdir && vp != rootvnode) {
2491 		/*
2492 		 * The vp vnode must already be fully constructed,
2493 		 * since it is either found in the namecache or obtained
2494 		 * from VOP_VPTOCNP().  We may test for VV_ROOT safely
2495 		 * without obtaining the vnode lock.
2496 		 */
2497 		if ((vp->v_vflag & VV_ROOT) != 0) {
2498 			vn_lock(vp, LK_RETRY | LK_SHARED);
2499 
2500 			/*
2501 			 * With the vnode locked, check for races with
2502 			 * unmount, forced or not.  Note that we
2503 			 * already verified that vp is not equal to
2504 			 * the root vnode, which means that
2505 			 * mnt_vnodecovered can be NULL only for the
2506 			 * case of unmount.
2507 			 */
2508 			if (VN_IS_DOOMED(vp) ||
2509 			    (vp1 = vp->v_mount->mnt_vnodecovered) == NULL ||
2510 			    vp1->v_mountedhere != vp->v_mount) {
2511 				vput(vp);
2512 				error = ENOENT;
2513 				SDT_PROBE3(vfs, namecache, fullpath, return,
2514 				    error, vp, NULL);
2515 				break;
2516 			}
2517 
2518 			vref(vp1);
2519 			vput(vp);
2520 			vp = vp1;
2521 			continue;
2522 		}
2523 		if (vp->v_type != VDIR) {
2524 			vrele(vp);
2525 			counter_u64_add(numfullpathfail1, 1);
2526 			error = ENOTDIR;
2527 			SDT_PROBE3(vfs, namecache, fullpath, return,
2528 			    error, vp, NULL);
2529 			break;
2530 		}
2531 		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
2532 		if (error)
2533 			break;
2534 		if (buflen == 0) {
2535 			vrele(vp);
2536 			error = ENOMEM;
2537 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
2538 			    startvp, NULL);
2539 			break;
2540 		}
2541 		buf[--buflen] = '/';
2542 		slash_prefixed = true;
2543 	}
2544 	if (error)
2545 		return (error);
2546 	if (!slash_prefixed) {
2547 		if (buflen == 0) {
2548 			vrele(vp);
2549 			counter_u64_add(numfullpathfail4, 1);
2550 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
2551 			    startvp, NULL);
2552 			return (ENOMEM);
2553 		}
2554 		buf[--buflen] = '/';
2555 	}
2556 	counter_u64_add(numfullpathfound, 1);
2557 	vrele(vp);
2558 
2559 	*retbuf = buf + buflen;
2560 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf);
2561 	*len -= buflen;
2562 	*len += addend;
2563 	return (0);
2564 }
2565 
2566 /*
2567  * Resolve an arbitrary vnode to a pathname.
2568  *
2569  * Note 2 caveats:
2570  * - hardlinks are not tracked, thus if the vnode is not a directory this can
2571  *   resolve to a different path than the one used to find it
2572  * - namecache is not mandatory, meaning names are not guaranteed to be added
2573  *   (in which case resolving fails)
2574  */
2575 static int
2576 vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir,
2577     char *buf, char **retbuf, size_t *buflen)
2578 {
2579 	size_t orig_buflen;
2580 	bool slash_prefixed;
2581 	int error;
2582 
2583 	if (*buflen < 2)
2584 		return (EINVAL);
2585 
2586 	orig_buflen = *buflen;
2587 
2588 	vref(vp);
2589 	slash_prefixed = false;
2590 	if (vp->v_type != VDIR) {
2591 		*buflen -= 1;
2592 		buf[*buflen] = '\0';
2593 		error = vn_vptocnp(&vp, td->td_ucred, buf, buflen);
2594 		if (error)
2595 			return (error);
2596 		if (*buflen == 0) {
2597 			vrele(vp);
2598 			return (ENOMEM);
2599 		}
2600 		*buflen -= 1;
2601 		buf[*buflen] = '/';
2602 		slash_prefixed = true;
2603 	}
2604 
2605 	return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed,
2606 	    orig_buflen - *buflen));
2607 }
2608 
2609 /*
2610  * Resolve an arbitrary vnode to a pathname (taking care of hardlinks).
2611  *
2612  * Since the namecache does not track hardlinks, the caller is expected to first
2613  * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei.
2614  *
2615  * Then we have 2 cases:
2616  * - if the found vnode is a directory, the path can be constructed just by
2617  *   following names up the chain
2618  * - otherwise we populate the buffer with the saved name and start resolving
2619  *   from the parent
2620  */
2621 static int
2622 vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf,
2623     char **freebuf, size_t *buflen)
2624 {
2625 	char *buf, *tmpbuf;
2626 	struct pwd *pwd;
2627 	struct componentname *cnp;
2628 	struct vnode *vp;
2629 	size_t addend;
2630 	int error;
2631 	bool slash_prefixed;
2632 
2633 	if (*buflen < 2)
2634 		return (EINVAL);
2635 	if (*buflen > MAXPATHLEN)
2636 		*buflen = MAXPATHLEN;
2637 
2638 	slash_prefixed = false;
2639 
2640 	buf = malloc(*buflen, M_TEMP, M_WAITOK);
2641 	pwd = pwd_hold(td);
2642 
2643 	addend = 0;
2644 	vp = ndp->ni_vp;
2645 	if (vp->v_type != VDIR) {
2646 		cnp = &ndp->ni_cnd;
2647 		addend = cnp->cn_namelen + 2;
2648 		if (*buflen < addend) {
2649 			error = ENOMEM;
2650 			goto out_bad;
2651 		}
2652 		*buflen -= addend;
2653 		tmpbuf = buf + *buflen;
2654 		tmpbuf[0] = '/';
2655 		memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen);
2656 		tmpbuf[addend - 1] = '\0';
2657 		slash_prefixed = true;
2658 		vp = ndp->ni_dvp;
2659 	}
2660 
2661 	vref(vp);
2662 	error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen,
2663 	    slash_prefixed, addend);
2664 	if (error != 0)
2665 		goto out_bad;
2666 
2667 	pwd_drop(pwd);
2668 	*freebuf = buf;
2669 
2670 	return (0);
2671 out_bad:
2672 	pwd_drop(pwd);
2673 	free(buf, M_TEMP);
2674 	return (error);
2675 }
2676 
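/*
 * Return the parent directory of vp, found via a regular (non-dotdot)
 * namecache entry, referenced and shared-locked, or NULL if no such
 * entry is cached or the lock cannot be acquired without sleeping.
 */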
2677 struct vnode *
2678 vn_dir_dd_ino(struct vnode *vp)
2679 {
2680 	struct namecache *ncp;
2681 	struct vnode *ddvp;
2682 	struct mtx *vlp;
2683 	enum vgetstate vs;
2684 
2685 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
2686 	vlp = VP2VNODELOCK(vp);
2687 	mtx_lock(vlp);
2688 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
2689 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
2690 			continue;
2691 		ddvp = ncp->nc_dvp;
2692 		vs = vget_prep(ddvp);
2693 		mtx_unlock(vlp);
2694 		if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs))
2695 			return (NULL);
2696 		return (ddvp);
2697 	}
2698 	mtx_unlock(vlp);
2699 	return (NULL);
2700 }
2701 
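/*
 * Copy the name vp is cached under (skipping dotdot entries) into buf,
 * NUL-terminated and truncated to at most buflen - 1 characters;
 * returns ENOENT if no such entry exists.
 */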
2702 int
2703 vn_commname(struct vnode *vp, char *buf, u_int buflen)
2704 {
2705 	struct namecache *ncp;
2706 	struct mtx *vlp;
2707 	int l;
2708 
2709 	vlp = VP2VNODELOCK(vp);
2710 	mtx_lock(vlp);
2711 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
2712 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
2713 			break;
2714 	if (ncp == NULL) {
2715 		mtx_unlock(vlp);
2716 		return (ENOENT);
2717 	}
2718 	l = min(ncp->nc_nlen, buflen - 1);
2719 	memcpy(buf, ncp->nc_name, l);
2720 	mtx_unlock(vlp);
2721 	buf[l] = '\0';
2722 	return (0);
2723 }
2724 
2725 /*
2726  * This function updates the path string to the vnode's full global path
2727  * and checks the size of the new path string against the pathlen argument.
2728  *
2729  * Requires a locked, referenced vnode.
2730  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
2731  *
2732  * If vp is a directory, the call to vn_fullpath_global() always succeeds
2733  * because it falls back to the ".." lookup if the namecache lookup fails.
2734  */
2735 int
2736 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
2737     u_int pathlen)
2738 {
2739 	struct nameidata nd;
2740 	struct vnode *vp1;
2741 	char *rpath, *fbuf;
2742 	int error;
2743 
2744 	ASSERT_VOP_ELOCKED(vp, __func__);
2745 
2746 	/* Construct global filesystem path from vp. */
2747 	VOP_UNLOCK(vp);
2748 	error = vn_fullpath_global(td, vp, &rpath, &fbuf);
2749 
2750 	if (error != 0) {
2751 		vrele(vp);
2752 		return (error);
2753 	}
2754 
2755 	if (strlen(rpath) >= pathlen) {
2756 		vrele(vp);
2757 		error = ENAMETOOLONG;
2758 		goto out;
2759 	}
2760 
2761 	/*
2762 	 * Re-lookup the vnode by path to detect a possible rename.
2763 	 * As a side effect, the vnode is relocked.
2764 	 * If vnode was renamed, return ENOENT.
2765 	 */
2766 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
2767 	    UIO_SYSSPACE, path, td);
2768 	error = namei(&nd);
2769 	if (error != 0) {
2770 		vrele(vp);
2771 		goto out;
2772 	}
2773 	NDFREE(&nd, NDF_ONLY_PNBUF);
2774 	vp1 = nd.ni_vp;
2775 	vrele(vp);
2776 	if (vp1 == vp)
2777 		strcpy(path, rpath);
2778 	else {
2779 		vput(vp1);
2780 		error = ENOENT;
2781 	}
2782 
2783 out:
2784 	free(fbuf, M_TEMP);
2785 	return (error);
2786 }
2787 
2788 #ifdef DDB
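/*
 * Print the chain of cached names leading from vp up towards the root,
 * one vnode per line; helper for the "show vpath" DDB command below.
 */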
2789 static void
2790 db_print_vpath(struct vnode *vp)
2791 {
2792 
2793 	while (vp != NULL) {
2794 		db_printf("%p: ", vp);
2795 		if (vp == rootvnode) {
2796 			db_printf("/");
2797 			vp = NULL;
2798 		} else {
2799 			if (vp->v_vflag & VV_ROOT) {
2800 				db_printf("<mount point>");
2801 				vp = vp->v_mount->mnt_vnodecovered;
2802 			} else {
2803 				struct namecache *ncp;
2804 				char *ncn;
2805 				int i;
2806 
2807 				ncp = TAILQ_FIRST(&vp->v_cache_dst);
2808 				if (ncp != NULL) {
2809 					ncn = ncp->nc_name;
2810 					for (i = 0; i < ncp->nc_nlen; i++)
2811 						db_printf("%c", *ncn++);
2812 					vp = ncp->nc_dvp;
2813 				} else {
2814 					vp = NULL;
2815 				}
2816 			}
2817 		}
2818 		db_printf("\n");
2819 	}
2820 
2821 	return;
2822 }
2823 
2824 DB_SHOW_COMMAND(vpath, db_show_vpath)
2825 {
2826 	struct vnode *vp;
2827 
2828 	if (!have_addr) {
2829 		db_printf("usage: show vpath <struct vnode *>\n");
2830 		return;
2831 	}
2832 
2833 	vp = (struct vnode *)addr;
2834 	db_print_vpath(vp);
2835 }
2836 
2837 #endif
2838