xref: /dragonfly/sys/kern/vfs_cache.c (revision 2cd2d2b5)
1 /*
2  * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * Copyright (c) 1989, 1993, 1995
35  *	The Regents of the University of California.  All rights reserved.
36  *
37  * This code is derived from software contributed to Berkeley by
38  * Poul-Henning Kamp of the FreeBSD Project.
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions
42  * are met:
43  * 1. Redistributions of source code must retain the above copyright
44  *    notice, this list of conditions and the following disclaimer.
45  * 2. Redistributions in binary form must reproduce the above copyright
46  *    notice, this list of conditions and the following disclaimer in the
47  *    documentation and/or other materials provided with the distribution.
48  * 3. All advertising materials mentioning features or use of this software
49  *    must display the following acknowledgement:
50  *	This product includes software developed by the University of
51  *	California, Berkeley and its contributors.
52  * 4. Neither the name of the University nor the names of its contributors
53  *    may be used to endorse or promote products derived from this software
54  *    without specific prior written permission.
55  *
56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66  * SUCH DAMAGE.
67  *
68  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
69  * $FreeBSD: src/sys/kern/vfs_cache.c,v 1.42.2.6 2001/10/05 20:07:03 dillon Exp $
70  * $DragonFly: src/sys/kern/vfs_cache.c,v 1.25 2004/07/16 05:51:10 dillon Exp $
71  */
72 
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/mount.h>
78 #include <sys/vnode.h>
79 #include <sys/malloc.h>
80 #include <sys/sysproto.h>
81 #include <sys/proc.h>
82 #include <sys/namei.h>
83 #include <sys/filedesc.h>
84 #include <sys/fnv_hash.h>
85 #include <sys/globaldata.h>
86 #include <sys/kern_syscall.h>
87 
88 /*
89  * Random lookups in the cache are accomplished with a hash table using
90  * a hash key of (nc_src_vp, name).
91  *
92  * Negative entries may exist and correspond to structures where nc_vp
93  * is NULL.  In a negative entry, NCF_WHITEOUT will be set if the entry
94  * corresponds to a whited-out directory entry (verses simply not finding the
95  * entry at all).
96  *
97  * Upon reaching the last segment of a path, if the reference is for DELETE,
98  * or NOCACHE is set (rewrite), and the name is located in the cache, it
99  * will be dropped.
100  */
101 
102 /*
103  * Structures associated with name cacheing.
104  */
105 #define NCHHASH(hash)	(&nchashtbl[(hash) & nchash])
106 #define MINNEG		1024
107 
108 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
109 
110 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
111 static struct namecache_list	ncneglist;		/* instead of vnode */
112 static struct namecache		rootnamecache;		/* Dummy node */
113 
114 static int	nczapcheck;		/* panic on bad release */
115 SYSCTL_INT(_debug, OID_AUTO, nczapcheck, CTLFLAG_RW, &nczapcheck, 0, "");
116 
117 static u_long	nchash;			/* size of hash table */
118 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
119 
120 static u_long	ncnegfactor = 16;	/* ratio of negative entries */
121 SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
122 
123 static u_long	numneg;		/* number of cache entries allocated */
124 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
125 
126 static u_long	numcache;		/* number of cache entries allocated */
127 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
128 
129 static u_long	numunres;		/* number of unresolved entries */
130 SYSCTL_ULONG(_debug, OID_AUTO, numunres, CTLFLAG_RD, &numunres, 0, "");
131 
132 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
133 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
134 
135 /*
136  * The new name cache statistics
137  */
138 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
139 #define STATNODE(mode, name, var) \
140 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
141 STATNODE(CTLFLAG_RD, numneg, &numneg);
142 STATNODE(CTLFLAG_RD, numcache, &numcache);
143 static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
144 static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
145 static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
146 static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
147 static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
148 static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
149 static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
150 static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
151 static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
152 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
153 
154 struct nchstats nchstats[SMP_MAXCPU];
155 /*
156  * Export VFS cache effectiveness statistics to user-land.
157  *
158  * The statistics are left for aggregation to user-land so
159  * neat things can be achieved, like observing per-CPU cache
160  * distribution.
161  */
162 static int
163 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
164 {
165 	struct globaldata *gd;
166 	int i, error;
167 
168 	error = 0;
169 	for (i = 0; i < ncpus; ++i) {
170 		gd = globaldata_find(i);
171 		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
172 			sizeof(struct nchstats))))
173 			break;
174 	}
175 
176 	return (error);
177 }
/* Export the per-cpu statistics array as an opaque vfs.cache.nchstats node. */
SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
  0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");

/* Forward declaration; cache_zap() is used by the hold/drop inlines above it. */
static void cache_zap(struct namecache *ncp);
182 
183 /*
184  * cache_hold() and cache_drop() prevent the premature deletion of a
185  * namecache entry but do not prevent operations (such as zapping) on
186  * that namecache entry.
187  */
188 static __inline
189 struct namecache *
190 _cache_hold(struct namecache *ncp)
191 {
192 	++ncp->nc_refs;
193 	return(ncp);
194 }
195 
196 static __inline
197 void
198 _cache_drop(struct namecache *ncp)
199 {
200 	KKASSERT(ncp->nc_refs > 0);
201 	if (ncp->nc_refs == 1 &&
202 	    (ncp->nc_flag & NCF_UNRESOLVED) &&
203 	    TAILQ_EMPTY(&ncp->nc_list)
204 	) {
205 		cache_zap(ncp);
206 	} else {
207 		--ncp->nc_refs;
208 	}
209 }
210 
/*
 * Exported wrapper: add a reference to a namecache entry.
 */
struct namecache *
cache_hold(struct namecache *ncp)
{
	return(_cache_hold(ncp));
}
216 
/*
 * Exported wrapper: release a reference to a namecache entry, possibly
 * destroying it if it was the last reference on a dead entry.
 */
void
cache_drop(struct namecache *ncp)
{
	_cache_drop(ncp);
}
222 
223 static void
224 cache_link_parent(struct namecache *ncp, struct namecache *par)
225 {
226 	KKASSERT(ncp->nc_parent == NULL);
227 	ncp->nc_parent = par;
228 	if (TAILQ_EMPTY(&par->nc_list)) {
229 		if (par->nc_vp)
230 			vhold(par->nc_vp);
231 	}
232 	TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
233 }
234 
235 static void
236 cache_unlink_parent(struct namecache *ncp)
237 {
238 	struct namecache *par;
239 
240 	if ((par = ncp->nc_parent) != NULL) {
241 		ncp->nc_parent = NULL;
242 		par = cache_hold(par);
243 		TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
244 		if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
245 			vdrop(par->nc_vp);
246 		cache_drop(par);
247 	}
248 }
249 
250 static struct namecache *
251 cache_alloc(struct vnode *vp)
252 {
253 	struct namecache *ncp;
254 
255 	ncp = malloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
256 	TAILQ_INIT(&ncp->nc_list);
257 	ncp->nc_vp = vp;
258 	if (vp != NULL) {
259 		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
260 		++numcache;
261 	} else {
262 		TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
263 		++numneg;
264 	}
265 	return(ncp);
266 }
267 
268 /*
269  * Try to destroy a namecache entry.  The entry is disassociated from its
270  * vnode or ncneglist and reverted to an UNRESOLVED state.
271  *
272  * Then, if there are no additional references to the ncp and we can
273  * successfully delete the children, the entry is also removed from the
274  * namecache hashlist / topology.
275  *
276  * References or undeletable children will prevent the entry from being
277  * removed from the topology.  The entry may be revalidated (typically
278  * by cache_enter()) at a later time.  Children remain because:
279  *
280  *	+ we have tried to delete a node rather then a leaf in the topology.
281  *	+ the presence of negative entries (we try to scrap these).
282  *	+ an entry or child has a non-zero ref count and cannot be scrapped.
283  *
284  * This function must be called with the ncp held and will drop the ref
285  * count during zapping.
286  */
static void
cache_zap(struct namecache *ncp)
{
	struct namecache *par;
	struct vnode *vp;

	/*
	 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		ncp->nc_flag |= NCF_UNRESOLVED;
		++numunres;
		if ((vp = ncp->nc_vp) != NULL) {
			--numcache;
			ncp->nc_vp = NULL;	/* safety */
			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
			/*
			 * The vnode was held (cache_link_parent()) only while
			 * this entry had children; release that hold now that
			 * the entry is being disassociated.
			 */
			if (!TAILQ_EMPTY(&ncp->nc_list))
				vdrop(vp);
		} else {
			TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
			--numneg;
		}
	}

	/*
	 * Try to scrap the entry and possibly tail-recurse on its parent.
	 * We only scrap unref'd (other then our ref) unresolved entries,
	 * we do not scrap 'live' entries.
	 */
	while (ncp->nc_flag & NCF_UNRESOLVED) {
		/*
		 * Someone other then us has a ref, stop.
		 */
		if (ncp->nc_refs > 1)
			goto done;

		/*
		 * We have children, stop.
		 */
		if (!TAILQ_EMPTY(&ncp->nc_list))
			goto done;

		/*
		 * Ok, we can completely destroy and free this entry.  Sanity
		 * check it against our static rootnamecache structure,
		 * then remove it from the hash.
		 */
		KKASSERT(ncp != &rootnamecache);

		if (ncp->nc_flag & NCF_HASHED) {
			ncp->nc_flag &= ~NCF_HASHED;
			LIST_REMOVE(ncp, nc_hash);
		}

		/*
		 * Unlink from its parent and free, then loop on the
		 * parent.  XXX temp hack, in stage-3 parent is never NULL
		 */
		if ((par = ncp->nc_parent) != NULL) {
			par = cache_hold(par);
			TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
			/* parent lost its last child: drop its vnode hold */
			if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
				vdrop(par->nc_vp);
		}
		--numunres;
		ncp->nc_refs = -1;	/* safety */
		ncp->nc_parent = NULL;	/* safety */
		if (ncp->nc_name)
			free(ncp->nc_name, M_VFSCACHE);
		free(ncp, M_VFSCACHE);
		/* continue with the (held) parent, which may now be zappable */
		ncp = par;
		if (par == NULL)	/* temp hack */
			return;		/* temp hack */
	}
done:
	--ncp->nc_refs;		/* drop the caller's (or loop's) reference */
}
364 
365 /*
366  * Lookup an entry in the cache
367  *
368  * Lookup is called with dvp pointing to the directory to search,
369  * cnp pointing to the name of the entry being sought.
370  *
371  * If the lookup succeeds, the vnode is returned in *vpp, and a
372  * status of -1 is returned.
373  *
374  * If the lookup determines that the name does not exist (negative cacheing),
375  * a status of ENOENT is returned.
376  *
377  * If the lookup fails, a status of zero is returned.
378  *
379  * Note that UNRESOLVED entries are ignored.  They are not negative cache
380  * entries.
381  */
int
cache_lookup(struct vnode *dvp, struct namecache *par, struct vnode **vpp,
		struct namecache **ncpp, struct componentname *cnp)
{
	struct namecache *ncp;
	u_int32_t hash;
	globaldata_t gd = mycpu;

	numcalls++;

	/*
	 * Obtain the namecache entry associated with dvp, creating one if
	 * necessary.  If we have to create one we have insufficient
	 * information to hash it or even supply the name, but we still
	 * need one so we can link it in.
	 *
	 * NOTE: in this stage of development, the passed 'par' is
	 * almost always NULL.
	 */
	if (par == NULL) {
		if ((par = TAILQ_FIRST(&dvp->v_namecache)) == NULL)
			par = cache_alloc(dvp);
	}

	/*
	 * Deal with "." and "..".  In this stage of code development we leave
	 * the returned ncpp NULL.  Note that if the namecache is disjoint,
	 * we won't find a vnode for "..".
	 */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			/* "." always resolves to the directory itself */
			*vpp = dvp;
			dothits++;
			numposhits++;	/* include in total statistics */
			return (-1);
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			numposhits++;	/* include in total statistics */
			if ((cnp->cn_flags & CNP_MAKEENTRY) == 0)
				return (0);
			/* disjoint topology: no known parent vnode for ".." */
			if (par->nc_parent == NULL ||
			    par->nc_parent->nc_vp == NULL) {
				return (0);
			}
			*vpp = par->nc_parent->nc_vp;
			return (-1);
		}
	}

	/*
	 * Try to locate an existing entry.  The hash key is the name
	 * combined with the address of the parent namecache entry.
	 */
	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&par, sizeof(par), hash);
	if (nczapcheck > 1)
	    printf("DVP %p/%p %08x %*.*s\n", dvp, par, hash, (int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr);
restart:
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (nczapcheck > 1) {
		    printf("TEST ncp par=%p %*.*s\n",
			ncp->nc_parent, ncp->nc_nlen, ncp->nc_nlen,
			ncp->nc_name);
		}

		/*
		 * Zap entries that have timed out.
		 */
		if (ncp->nc_timeout &&
		    (int)(ncp->nc_timeout - ticks) < 0
		) {
			if (nczapcheck > 1)
			    printf("TIMEOUT\n");
			cache_zap(cache_hold(ncp));
			/* the zap may have edited this hash chain; rescan */
			goto restart;
		}

		/*
		 * Break out if we find a matching entry.  UNRESOLVED entries
		 * never match (they are in the middle of being destroyed).
		 */
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
		    ncp->nc_parent == par &&
		    ncp->nc_nlen == cnp->cn_namelen &&
		    bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) == 0
		) {
			if (nczapcheck > 1)
			    printf("GOOD\n");
			cache_hold(ncp);
			break;
		}
	}

	/*
	 * If we failed to locate an entry, return 0 (indicates failure).
	 */
	if (ncp == NULL) {
		if ((cnp->cn_flags & CNP_MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		gd->gd_nchstats->ncs_miss++;
		if (nczapcheck) {
		    printf("MISS %p/%p %*.*s/%*.*s\n", dvp, par,
			par->nc_nlen, par->nc_nlen, (par->nc_name ? par->nc_name : ""),
			(int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr);
		}
		return (0);
	}

	/*
	 * If we found an entry, but we don't want to have one, we zap it.
	 * (DELETE/rename-style lookups clear CNP_MAKEENTRY.)
	 */
	if ((cnp->cn_flags & CNP_MAKEENTRY) == 0) {
		numposzaps++;
		gd->gd_nchstats->ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	/*
	 * If the vnode is not NULL then return the positive match.
	 */
	if (ncp->nc_vp) {
		numposhits++;
		gd->gd_nchstats->ncs_goodhits++;
		*vpp = ncp->nc_vp;
		cache_drop(ncp);
		return (-1);
	}

	/*
	 * If the vnode is NULL we found a negative match.  If we want to
	 * create it, purge the negative match and return failure (as if
	 * we hadn't found a match in the first place).
	 */
	if (cnp->cn_nameiop == NAMEI_CREATE) {
		numnegzaps++;
		gd->gd_nchstats->ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	numneghits++;

	/*
	 * We found a "negative" match, ENOENT notifies client of this match.
	 * The nc_flag field records whether this is a whiteout.  Since there
	 * is no vnode we can use the vnode tailq link field with ncneglist.
	 * Re-queueing at the tail implements LRU for negative-entry reaping.
	 */
	TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
	TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
	gd->gd_nchstats->ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITEOUT)
		cnp->cn_flags |= CNP_ISWHITEOUT;
	cache_drop(ncp);
	return (ENOENT);
}
542 
543 /*
544  * Generate a special linkage between the mount point and the root of the
545  * mounted filesystem in order to maintain the namecache topology across
546  * a mount point.  The special linkage has a 0-length name component
547  * and sets NCF_MOUNTPT.
548  */
549 void
550 cache_mount(struct vnode *dvp, struct vnode *tvp)
551 {
552 	struct namecache *ncp;
553 	struct namecache *par;
554 	struct nchashhead *nchpp;
555 	u_int32_t hash;
556 
557 	/*
558 	 * If a linkage already exists we do not have to do anything.
559 	 */
560 	hash = fnv_32_buf("", 0, FNV1_32_INIT);
561 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
562 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
563 		numchecks++;
564 		if (ncp->nc_vp == tvp &&
565 		    ncp->nc_nlen == 0 &&
566 		    ncp->nc_parent &&
567 		    ncp->nc_parent->nc_vp == dvp
568 		) {
569 			return;
570 		}
571 	}
572 
573 	if ((par = TAILQ_FIRST(&dvp->v_namecache)) == NULL)
574 		par = cache_alloc(dvp);
575 
576 	/*
577 	 * Otherwise create a new linkage.
578 	 */
579 	ncp = cache_alloc(tvp);
580 	ncp->nc_flag = NCF_MOUNTPT;
581 	cache_link_parent(ncp, par);
582 
583 	/*
584 	 * Hash table
585 	 */
586 	hash = fnv_32_buf("", 0, FNV1_32_INIT);
587 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
588 	nchpp = NCHHASH(hash);
589 	LIST_INSERT_HEAD(nchpp, ncp, nc_hash);
590 
591 	ncp->nc_flag |= NCF_HASHED;
592 }
593 
594 /*
595  * Add an entry to the cache.
596  */
void
cache_enter(struct vnode *dvp, struct namecache *par, struct vnode *vp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct namecache *bpar;
	struct nchashhead *nchpp;
	u_int32_t hash;
	char *name;

	/*
	 * If the directory has no namecache entry we must associate one with
	 * it.  The name of the entry is not known so it isn't hashed.
	 */
	if (par == NULL) {
		if ((par = TAILQ_FIRST(&dvp->v_namecache)) == NULL)
			par = cache_alloc(dvp);
	}

	/*
	 * This may be a bit confusing.  "." and ".." are 'virtual' entries.
	 * We do not actually create a namecache entry representing either.
	 * However, the ".." case is used to linkup a potentially disjoint
	 * directory with its parent, to disconnect a directory from its
	 * parent, or to change an existing linkage that may no longer be
	 * correct (as might occur when a subdirectory is renamed).
	 */

	if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')
		return;
	if (cnp->cn_namelen == 2 && cnp->cn_nameptr[0] == '.' &&
	    cnp->cn_nameptr[1] == '.'
	) {
		if (vp == NULL) {
			/* ".." with no vnode: sever par from its parent */
			if (par->nc_parent)
				cache_unlink_parent(par);
		} else {
			if ((ncp = TAILQ_FIRST(&vp->v_namecache)) == NULL)
				ncp = cache_alloc(vp);
			/* hold par across the unlink/relink so it survives */
			cache_hold(par);
			if (par->nc_parent)
				cache_unlink_parent(par);
			cache_link_parent(par, ncp); /* ncp is parent of par */
			cache_drop(par);
		}
		return;
	}

	/*
	 * Locate other entries associated with this vnode and zap them,
	 * because the purge code may not be able to find them due to
	 * the topology not yet being consistent.  This is a temporary
	 * hack.
	 */
	if (vp) {
again:
		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
			if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
				/* zap may edit the list: restart the scan */
				cache_zap(cache_hold(ncp));
				goto again;
			}
		}
	}

	/* hash key: name + address of parent entry (see cache_lookup()) */
	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	bpar = par;
	hash = fnv_32_buf(&bpar, sizeof(bpar), hash);

	if (nczapcheck > 1)
	    printf("ENTER %p/%p %08x '%*.*s' %p ", dvp, par, hash, (int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr, vp);


	name = malloc(cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
	ncp = cache_alloc(vp);
	if (nczapcheck > 1)
	    printf("alloc\n");

	/*
	 * Set a timeout; 0 means "no timeout" so a tick value that wraps
	 * to 0 is nudged to 1.
	 */
	if (cnp->cn_flags & CNP_CACHETIMEOUT) {
		if ((ncp->nc_timeout = ticks + cnp->cn_timeout) == 0)
			ncp->nc_timeout = 1;
	}

	/*
	 * Linkup the parent pointer, bump the parent vnode's hold
	 * count when we go from 0->1 children.
	 */
	cache_link_parent(ncp, par);

	/*
	 * Add to the hash table.  The name is copied (not NUL-terminated;
	 * nc_nlen carries the length).
	 */
	ncp->nc_name = name;
	ncp->nc_nlen = cnp->cn_namelen;
	bcopy(cnp->cn_nameptr, ncp->nc_name, cnp->cn_namelen);
	nchpp = NCHHASH(hash);
	LIST_INSERT_HEAD(nchpp, ncp, nc_hash);
	ncp->nc_flag |= NCF_HASHED;

	/*
	 * If the target vnode is NULL if this is to be a negative cache
	 * entry.
	 */
	if (vp == NULL) {
		ncp->nc_flag &= ~NCF_WHITEOUT;
		if (cnp->cn_flags & CNP_ISWHITEOUT)
			ncp->nc_flag |= NCF_WHITEOUT;
	}

	/*
	 * Don't cache too many negative hits; reap the oldest (head of
	 * ncneglist, which cache_lookup() maintains in LRU order).
	 */
	if (numneg > MINNEG && numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneglist);
		KKASSERT(ncp != NULL);
		cache_zap(cache_hold(ncp));
	}
}
716 
717 /*
718  * Name cache initialization, from vfsinit() when we are booting
719  *
720  * rootnamecache is initialized such that it cannot be recursively deleted.
721  */
722 void
723 nchinit(void)
724 {
725 	int i;
726 	globaldata_t gd;
727 
728 	/* initialise per-cpu namecache effectiveness statistics. */
729 	for (i = 0; i < ncpus; ++i) {
730 		gd = globaldata_find(i);
731 		gd->gd_nchstats = &nchstats[i];
732 	}
733 
734 	TAILQ_INIT(&ncneglist);
735 	nchashtbl = hashinit(desiredvnodes*2, M_VFSCACHE, &nchash);
736 	TAILQ_INIT(&rootnamecache.nc_list);
737 	rootnamecache.nc_flag |= NCF_HASHED | NCF_ROOT | NCF_UNRESOLVED;
738 	rootnamecache.nc_refs = 1;
739 }
740 
741 /*
742  * vfs_cache_setroot()
743  *
744  *	Create an association between the root of our namecache and
745  *	the root vnode.  This routine may be called several times during
746  *	booting.
747  */
void
vfs_cache_setroot(struct vnode *nvp)
{
	KKASSERT(rootnamecache.nc_refs > 0);	/* don't accidently free */
	/* detach any previous root association; the static entry survives */
	cache_zap(cache_hold(&rootnamecache));

	rootnamecache.nc_vp = nvp;
	rootnamecache.nc_flag &= ~NCF_UNRESOLVED;
	if (nvp) {
		++numcache;
		/*
		 * If the root entry already has children the vnode must be
		 * held, matching the 0->1 child rule in cache_link_parent().
		 */
		if (!TAILQ_EMPTY(&rootnamecache.nc_list))
			vhold(nvp);
		TAILQ_INSERT_HEAD(&nvp->v_namecache, &rootnamecache, nc_vnode);
	} else {
		/* no root vnode yet: park the entry on the negative list */
		++numneg;
		TAILQ_INSERT_TAIL(&ncneglist, &rootnamecache, nc_vnode);
		rootnamecache.nc_flag &= ~NCF_WHITEOUT;
	}
}
767 
768 /*
769  * Invalidate all namecache entries to a particular vnode as well as
770  * any direct children of that vnode in the namecache.  This is a
771  * 'catch all' purge used by filesystems that do not know any better.
772  *
773  * A new vnode v_id is generated.  Note that no vnode will ever have a
774  * v_id of 0.
775  *
776  * Note that the linkage between the vnode and its namecache entries will
777  * be removed, but the namecache entries themselves might stay put due to
778  * active references from elsewhere in the system or due to the existance of
779  * the children.   The namecache topology is left intact even if we do not
780  * know what the vnode association is.  Such entries will be marked
781  * NCF_UNRESOLVED.
782  *
783  * XXX: Only time and the size of v_id prevents this from failing:
784  * XXX: In theory we should hunt down all (struct vnode*, v_id)
785  * XXX: soft references and nuke them, at least on the global
786  * XXX: v_id wraparound.  The period of resistance can be extended
787  * XXX: by incrementing each vnodes v_id individually instead of
788  * XXX: using the global v_id.
789  */
void
cache_purge(struct vnode *vp)
{
	static u_long nextid;
	struct namecache *ncp;
	struct namecache *scan;

	/*
	 * Disassociate the vnode from its namecache entries along with
	 * (for historical reasons) any direct children.
	 */
	while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) {
		cache_hold(ncp);

restart: /* YYY hack, fix me */
		TAILQ_FOREACH(scan, &ncp->nc_list, nc_entry) {
			if ((scan->nc_flag & NCF_UNRESOLVED) == 0) {
				/* the zap may edit nc_list, so rescan */
				cache_zap(cache_hold(scan));
				goto restart;
			}
		}
		/* consumes the hold taken above */
		cache_zap(ncp);
	}

	/*
	 * Calculate a new unique id for ".." handling.
	 * v_id 0 is reserved (no vnode ever has it), so skip it.
	 */
	do {
		nextid++;
	} while (nextid == vp->v_id || nextid == 0);
	vp->v_id = nextid;
}
822 
823 /*
824  * Flush all entries referencing a particular filesystem.
825  *
826  * Since we need to check it anyway, we will flush all the invalid
827  * entries at the same time.
828  */
void
cache_purgevfs(struct mount *mp)
{
	struct nchashhead *nchpp;
	struct namecache *ncp, *nnp;

	/*
	 * Scan hash tables for applicable entries.  The walk is
	 * hand-over-hand: the next entry is held before the current one
	 * is zapped/dropped so it cannot be freed out from under us.
	 */
	for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) {
		ncp = LIST_FIRST(nchpp);
		if (ncp)
			cache_hold(ncp);
		while (ncp) {
			nnp = LIST_NEXT(ncp, nc_hash);
			if (nnp)
				cache_hold(nnp);
			if (ncp->nc_vp && ncp->nc_vp->v_mount == mp)
				cache_zap(ncp);	/* consumes our hold */
			else
				cache_drop(ncp);
			ncp = nnp;
		}
	}
}
854 
855 /*
856  * cache_leaf_test()
857  *
858  *	Test whether the vnode is at a leaf in the nameicache tree.
859  *
860  *	Returns 0 if it is a leaf, -1 if it isn't.
861  */
862 int
863 cache_leaf_test(struct vnode *vp)
864 {
865 	struct namecache *scan;
866 	struct namecache *ncp;
867 
868 	TAILQ_FOREACH(scan, &vp->v_namecache, nc_vnode) {
869 		TAILQ_FOREACH(ncp, &scan->nc_list, nc_entry) {
870 			/* YYY && ncp->nc_vp->v_type == VDIR ? */
871 			if (ncp->nc_vp != NULL)
872 				return(-1);
873 		}
874 	}
875 	return(0);
876 }
877 
878 /*
879  * Perform canonical checks and cache lookup and pass on to filesystem
880  * through the vop_cachedlookup only if needed.
881  *
882  * vop_lookup_args {
883  *	struct vnode a_dvp;
884  *	struct namecache *a_ncp;
885  *	struct vnode **a_vpp;
886  *	struct namecache **a_ncpp;
887  *	struct componentname *a_cnp;
888  * }
889  */
int
vfs_cache_lookup(struct vop_lookup_args *ap)
{
	struct vnode *dvp, *vp;
	int lockparent;
	int error;
	struct namecache *par = ap->a_par;
	struct vnode **vpp = ap->a_vpp;
	struct namecache **ncpp = ap->a_ncpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_td;
	u_long vpid;	/* capability number of vnode */

	*vpp = NULL;
	if (ncpp)
		*ncpp = NULL;
	dvp = ap->a_dvp;
	lockparent = flags & CNP_LOCKPARENT;

	if (dvp->v_type != VDIR)
                return (ENOTDIR);

	/* refuse destructive last-component ops on read-only filesystems */
	if ((flags & CNP_ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == NAMEI_DELETE || cnp->cn_nameiop == NAMEI_RENAME)) {
		return (EROFS);
	}

	error = VOP_ACCESS(dvp, VEXEC, cred, td);

	if (error)
		return (error);

	error = cache_lookup(dvp, par, vpp, ncpp, cnp);

	/* 0 == cache miss: fall through to the filesystem's real lookup */
	if (!error)
		return (VOP_CACHEDLOOKUP(dvp, par, vpp, ncpp, cnp));

	/* ENOENT == authoritative negative cache hit */
	if (error == ENOENT)
		return (error);

	/* -1 == positive hit; *vpp is valid but not yet referenced/locked */
	vp = *vpp;
	vpid = vp->v_id;
	cnp->cn_flags &= ~CNP_PDIRUNLOCK;
	if (dvp == vp) {   /* lookup on "." */
		vref(vp);
		error = 0;
	} else if (flags & CNP_ISDOTDOT) {
		/*
		 * NOTE(review): dvp is unlocked before locking ".."
		 * — presumably to respect parent-before-child lock
		 * ordering; confirm against the namei protocol.
		 */
		VOP_UNLOCK(dvp, NULL, 0, td);
		cnp->cn_flags |= CNP_PDIRUNLOCK;
		error = vget(vp, NULL, LK_EXCLUSIVE, td);
		if (!error && lockparent && (flags & CNP_ISLASTCN)) {
			if ((error = vn_lock(dvp, NULL, LK_EXCLUSIVE, td)) == 0)
				cnp->cn_flags &= ~CNP_PDIRUNLOCK;
		}
	} else {
		error = vget(vp, NULL, LK_EXCLUSIVE, td);
		if (!lockparent || error || !(flags & CNP_ISLASTCN)) {
			VOP_UNLOCK(dvp, NULL, 0, td);
			cnp->cn_flags |= CNP_PDIRUNLOCK;
		}
	}
	/*
	 * Check that the capability number did not change
	 * while we were waiting for the lock.
	 */
	if (!error) {
		if (vpid == vp->v_id)
			return (0);
		/* stale hit: release and retry through the filesystem */
		vput(vp);
		if (lockparent && dvp != vp && (flags & CNP_ISLASTCN)) {
			VOP_UNLOCK(dvp, NULL, 0, td);
			cnp->cn_flags |= CNP_PDIRUNLOCK;
		}
	}
	/* re-lock the parent if it was unlocked along the way */
	if (cnp->cn_flags & CNP_PDIRUNLOCK) {
		error = vn_lock(dvp, NULL, LK_EXCLUSIVE, td);
		if (error)
			return (error);
		cnp->cn_flags &= ~CNP_PDIRUNLOCK;
	}
	return (VOP_CACHEDLOOKUP(dvp, par, vpp, ncpp, cnp));
}
974 
/* debug.disablecwd: make __getcwd() fail with ENODEV when set */
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");

/* getcwd statistics, exported under vfs.cache.* via STATNODE */
static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
984 
985 int
986 __getcwd(struct __getcwd_args *uap)
987 {
988 	int buflen;
989 	int error;
990 	char *buf;
991 	char *bp;
992 
993 	if (disablecwd)
994 		return (ENODEV);
995 
996 	buflen = uap->buflen;
997 	if (buflen < 2)
998 		return (EINVAL);
999 	if (buflen > MAXPATHLEN)
1000 		buflen = MAXPATHLEN;
1001 
1002 	buf = malloc(buflen, M_TEMP, M_WAITOK);
1003 	bp = kern_getcwd(buf, buflen, &error);
1004 	if (error == 0)
1005 		error = copyout(bp, uap->buf, strlen(bp) + 1);
1006 	free(buf, M_TEMP);
1007 	return (error);
1008 }
1009 
/*
 * Construct the path of the current directory of 'curproc' into the
 * caller-supplied buffer.  The path is built backwards from the end of
 * 'buf'; on success a pointer to the start of the NUL-terminated path
 * (somewhere inside 'buf') is returned and *error is set to 0.  On
 * failure NULL is returned and *error holds an errno:
 *
 *	EBADF	- crossed a mount point whose mount was forced away
 *	ENOENT	- a vnode on the path has no usable namecache entry
 *	ENOMEM	- the supplied buffer is too small for the full path
 */
char *
kern_getcwd(char *buf, size_t buflen, int *error)
{
	struct proc *p = curproc;
	char *bp;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numcwdcalls++;
	/* Start at the last byte of the buffer and prepend components. */
	bp = buf;
	bp += buflen - 1;
	*bp = '\0';
	fdp = p->p_fd;
	slash_prefixed = 0;
	/* Walk upwards from the cwd until the process root (or system root). */
	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				*error = EBADF;
				return(NULL);
			}
			/* Hop from a mount root to the vnode it covers. */
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		/*
		 * Find a namecache entry for this vnode that has a resolved
		 * parent and a non-empty name; ncp is NULL if none exists.
		 */
		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
			if (ncp->nc_parent && ncp->nc_parent->nc_vp &&
			    ncp->nc_nlen > 0) {
				break;
			}
		}
		if (ncp == NULL) {
			numcwdfail2++;
			*error = ENOENT;
			return(NULL);
		}
		/* Prepend the component name one byte at a time. */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				numcwdfail4++;
				*error = ENOMEM;
				return(NULL);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			numcwdfail4++;
			*error = ENOMEM;
			return(NULL);
		}
		/* Prepend the '/' separator and move up to the parent. */
		*--bp = '/';
		slash_prefixed = 1;
		vp = ncp->nc_parent->nc_vp;
	}
	/* cwd was the root itself: the path is just "/". */
	if (!slash_prefixed) {
		if (bp == buf) {
			numcwdfail4++;
			*error = ENOMEM;
			return(NULL);
		}
		*--bp = '/';
	}
	numcwdfound++;
	*error = 0;
	return (bp);
}
1075 
1076 /*
1077  * Thus begins the fullpath magic.
1078  */
1079 
#undef STATNODE
/*
 * Redefine STATNODE for the fullpath statistics below: declares a u_int
 * counter and exports it read-only under the vfs.cache sysctl tree.
 */
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

/* Sysctl knob to disable vn_fullpath() entirely (returns ENODEV). */
static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
    &disablefullpath, 0, "");

STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail3);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);
1095 
1096 int
1097 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, char **freebuf)
1098 {
1099 	char *bp, *buf;
1100 	int i, slash_prefixed;
1101 	struct filedesc *fdp;
1102 	struct namecache *ncp;
1103 	struct vnode *vp;
1104 
1105 	numfullpathcalls++;
1106 	if (disablefullpath)
1107 		return (ENODEV);
1108 
1109 	if (p == NULL)
1110 		return (EINVAL);
1111 
1112 	/* vn is NULL, client wants us to use p->p_textvp */
1113 	if (vn == NULL) {
1114 		if ((vn = p->p_textvp) == NULL)
1115 			return (EINVAL);
1116 	}
1117 
1118 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1119 	bp = buf + MAXPATHLEN - 1;
1120 	*bp = '\0';
1121 	fdp = p->p_fd;
1122 	slash_prefixed = 0;
1123 	for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
1124 		if (vp->v_flag & VROOT) {
1125 			if (vp->v_mount == NULL) {	/* forced unmount */
1126 				free(buf, M_TEMP);
1127 				return (EBADF);
1128 			}
1129 			vp = vp->v_mount->mnt_vnodecovered;
1130 			continue;
1131 		}
1132 		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
1133 			if (ncp->nc_parent && ncp->nc_parent->nc_vp &&
1134 			    ncp->nc_nlen > 0) {
1135 				break;
1136 			}
1137 		}
1138 		if (ncp == NULL) {
1139 			numfullpathfail2++;
1140 			free(buf, M_TEMP);
1141 			return (ENOENT);
1142 		}
1143 		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
1144 			if (bp == buf) {
1145 				numfullpathfail4++;
1146 				free(buf, M_TEMP);
1147 				return (ENOMEM);
1148 			}
1149 			*--bp = ncp->nc_name[i];
1150 		}
1151 		if (bp == buf) {
1152 			numfullpathfail4++;
1153 			free(buf, M_TEMP);
1154 			return (ENOMEM);
1155 		}
1156 		*--bp = '/';
1157 		slash_prefixed = 1;
1158 		vp = ncp->nc_parent->nc_vp;
1159 	}
1160 	if (!slash_prefixed) {
1161 		if (bp == buf) {
1162 			numfullpathfail4++;
1163 			free(buf, M_TEMP);
1164 			return (ENOMEM);
1165 		}
1166 		*--bp = '/';
1167 	}
1168 	numfullpathfound++;
1169 	*retbuf = bp;
1170 	*freebuf = buf;
1171 	return (0);
1172 }
1173 
1174