xref: /dragonfly/sys/kern/vfs_cache.c (revision e8364298)
1 /*
2  * Copyright (c) 1989, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com>,
5  *	All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
39  * $FreeBSD: src/sys/kern/vfs_cache.c,v 1.42.2.6 2001/10/05 20:07:03 dillon Exp $
40  * $DragonFly: src/sys/kern/vfs_cache.c,v 1.24 2004/06/25 17:37:19 dillon Exp $
41  */
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/sysctl.h>
47 #include <sys/mount.h>
48 #include <sys/vnode.h>
49 #include <sys/malloc.h>
50 #include <sys/sysproto.h>
51 #include <sys/proc.h>
52 #include <sys/namei.h>
53 #include <sys/filedesc.h>
54 #include <sys/fnv_hash.h>
55 #include <sys/globaldata.h>
56 #include <sys/kern_syscall.h>
57 
58 /*
59  * Random lookups in the cache are accomplished with a hash table using
60  * a hash key of (nc_src_vp, name).
61  *
62  * Negative entries may exist and correspond to structures where nc_vp
63  * is NULL.  In a negative entry, NCF_WHITEOUT will be set if the entry
64  * corresponds to a whited-out directory entry (versus simply not finding the
65  * entry at all).
66  *
67  * Upon reaching the last segment of a path, if the reference is for DELETE,
68  * or NOCACHE is set (rewrite), and the name is located in the cache, it
69  * will be dropped.
70  */
71 
72 /*
73  * Structures associated with name cacheing.
74  */
75 #define NCHHASH(hash)	(&nchashtbl[(hash) & nchash])
76 #define MINNEG		1024
77 
78 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
79 
80 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
81 static struct namecache_list	ncneglist;		/* instead of vnode */
82 static struct namecache		rootnamecache;		/* Dummy node */
83 
84 static int	nczapcheck;		/* panic on bad release */
85 SYSCTL_INT(_debug, OID_AUTO, nczapcheck, CTLFLAG_RW, &nczapcheck, 0, "");
86 
87 static u_long	nchash;			/* size of hash table */
88 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
89 
90 static u_long	ncnegfactor = 16;	/* ratio of negative entries */
91 SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
92 
93 static u_long	numneg;		/* number of cache entries allocated */
94 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
95 
96 static u_long	numcache;		/* number of cache entries allocated */
97 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
98 
99 static u_long	numunres;		/* number of unresolved entries */
100 SYSCTL_ULONG(_debug, OID_AUTO, numunres, CTLFLAG_RD, &numunres, 0, "");
101 
102 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
103 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
104 
105 /*
106  * The new name cache statistics
107  */
108 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
109 #define STATNODE(mode, name, var) \
110 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
111 STATNODE(CTLFLAG_RD, numneg, &numneg);
112 STATNODE(CTLFLAG_RD, numcache, &numcache);
113 static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
114 static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
115 static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
116 static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
117 static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
118 static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
119 static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
120 static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
121 static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
122 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
123 
124 struct nchstats nchstats[SMP_MAXCPU];
125 /*
126  * Export VFS cache effectiveness statistics to user-land.
127  *
128  * The statistics are left for aggregation to user-land so
129  * neat things can be achieved, like observing per-CPU cache
130  * distribution.
131  */
132 static int
133 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
134 {
135 	struct globaldata *gd;
136 	int i, error;
137 
138 	error = 0;
139 	for (i = 0; i < ncpus; ++i) {
140 		gd = globaldata_find(i);
141 		if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
142 			sizeof(struct nchstats))))
143 			break;
144 	}
145 
146 	return (error);
147 }
148 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
149   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
150 
151 static void cache_zap(struct namecache *ncp);
152 
153 /*
154  * cache_hold() and cache_drop() prevent the premature deletion of a
155  * namecache entry but do not prevent operations (such as zapping) on
156  * that namecache entry.
157  */
158 static __inline
159 struct namecache *
160 _cache_hold(struct namecache *ncp)
161 {
162 	++ncp->nc_refs;
163 	return(ncp);
164 }
165 
166 static __inline
167 void
168 _cache_drop(struct namecache *ncp)
169 {
170 	KKASSERT(ncp->nc_refs > 0);
171 	if (ncp->nc_refs == 1 &&
172 	    (ncp->nc_flag & NCF_UNRESOLVED) &&
173 	    TAILQ_EMPTY(&ncp->nc_list)
174 	) {
175 		cache_zap(ncp);
176 	} else {
177 		--ncp->nc_refs;
178 	}
179 }
180 
/*
 * External interface: take a reference on a namecache entry.
 */
struct namecache *
cache_hold(struct namecache *ncp)
{
	return (_cache_hold(ncp));
}
186 
/*
 * External interface: release a reference on a namecache entry,
 * possibly destroying it (see _cache_drop()).
 */
void
cache_drop(struct namecache *ncp)
{
	_cache_drop(ncp);
}
192 
193 static void
194 cache_link_parent(struct namecache *ncp, struct namecache *par)
195 {
196 	KKASSERT(ncp->nc_parent == NULL);
197 	ncp->nc_parent = par;
198 	if (TAILQ_EMPTY(&par->nc_list)) {
199 		if (par->nc_vp)
200 			vhold(par->nc_vp);
201 	}
202 	TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
203 }
204 
/*
 * Disconnect ncp from its parent, reversing cache_link_parent().
 *
 * The parent is held across the removal so the vdrop of its vnode
 * (triggered when the last child leaves) cannot destroy it while we
 * are still touching it; the final cache_drop() may zap the parent
 * if it has become an unreferenced, unresolved leaf.
 */
static void
cache_unlink_parent(struct namecache *ncp)
{
	struct namecache *par;

	if ((par = ncp->nc_parent) != NULL) {
		ncp->nc_parent = NULL;
		par = cache_hold(par);
		TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
		/* losing the last child releases the hold on the parent's vnode */
		if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
			vdrop(par->nc_vp);
		cache_drop(par);
	}
}
219 
220 static struct namecache *
221 cache_alloc(struct vnode *vp)
222 {
223 	struct namecache *ncp;
224 
225 	ncp = malloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
226 	TAILQ_INIT(&ncp->nc_list);
227 	ncp->nc_vp = vp;
228 	if (vp != NULL) {
229 		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
230 		++numcache;
231 	} else {
232 		TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
233 		++numneg;
234 	}
235 	return(ncp);
236 }
237 
238 /*
239  * Try to destroy a namecache entry.  The entry is disassociated from its
240  * vnode or ncneglist and reverted to an UNRESOLVED state.
241  *
242  * Then, if there are no additional references to the ncp and we can
243  * successfully delete the children, the entry is also removed from the
244  * namecache hashlist / topology.
245  *
246  * References or undeletable children will prevent the entry from being
247  * removed from the topology.  The entry may be revalidated (typically
248  * by cache_enter()) at a later time.  Children remain because:
249  *
250  *	+ we have tried to delete a node rather than a leaf in the topology.
251  *	+ the presence of negative entries (we try to scrap these).
252  *	+ an entry or child has a non-zero ref count and cannot be scrapped.
253  *
254  * This function must be called with the ncp held and will drop the ref
255  * count during zapping.
256  */
static void
cache_zap(struct namecache *ncp)
{
	struct namecache *par;
	struct vnode *vp;

	/*
	 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
	 */
	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
		ncp->nc_flag |= NCF_UNRESOLVED;
		++numunres;
		if ((vp = ncp->nc_vp) != NULL) {
			--numcache;
			ncp->nc_vp = NULL;	/* safety */
			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
			/*
			 * An entry with children was holding its vnode
			 * (see cache_link_parent()); release that hold
			 * now that the association is gone.
			 */
			if (!TAILQ_EMPTY(&ncp->nc_list))
				vdrop(vp);
		} else {
			/* negative entry: remove from the negative list */
			TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
			--numneg;
		}
	}

	/*
	 * Try to scrap the entry and possibly tail-recurse on its parent.
	 * We only scrap unref'd (other than our ref) unresolved entries,
	 * we do not scrap 'live' entries.
	 */
	while (ncp->nc_flag & NCF_UNRESOLVED) {
		/*
		 * Someone other than us has a ref, stop.
		 */
		if (ncp->nc_refs > 1)
			goto done;

		/*
		 * We have children, stop.
		 */
		if (!TAILQ_EMPTY(&ncp->nc_list))
			goto done;

		/*
		 * Ok, we can completely destroy and free this entry.  Sanity
		 * check it against our static rootnamecache structure,
		 * then remove it from the hash.
		 */
		KKASSERT(ncp != &rootnamecache);

		if (ncp->nc_flag & NCF_HASHED) {
			ncp->nc_flag &= ~NCF_HASHED;
			LIST_REMOVE(ncp, nc_hash);
		}

		/*
		 * Unlink from its parent and free, then loop on the
		 * parent.  XXX temp hack, in stage-3 parent is never NULL
		 */
		if ((par = ncp->nc_parent) != NULL) {
			par = cache_hold(par);
			TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
			/* last child gone: release hold on parent's vnode */
			if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
				vdrop(par->nc_vp);
		}
		--numunres;
		ncp->nc_refs = -1;	/* safety */
		ncp->nc_parent = NULL;	/* safety */
		if (ncp->nc_name)
			free(ncp->nc_name, M_VFSCACHE);
		free(ncp, M_VFSCACHE);
		/* loop upward, inheriting the hold just taken on the parent */
		ncp = par;
		if (par == NULL)	/* temp hack */
			return;		/* temp hack */
	}
done:
	--ncp->nc_refs;
}
334 
335 /*
336  * Lookup an entry in the cache
337  *
338  * Lookup is called with dvp pointing to the directory to search,
339  * cnp pointing to the name of the entry being sought.
340  *
341  * If the lookup succeeds, the vnode is returned in *vpp, and a
342  * status of -1 is returned.
343  *
344  * If the lookup determines that the name does not exist (negative caching),
345  * a status of ENOENT is returned.
346  *
347  * If the lookup fails, a status of zero is returned.
348  *
349  * Note that UNRESOLVED entries are ignored.  They are not negative cache
350  * entries.
351  */
int
cache_lookup(struct vnode *dvp, struct namecache *par, struct vnode **vpp,
		struct namecache **ncpp, struct componentname *cnp)
{
	struct namecache *ncp;
	u_int32_t hash;
	globaldata_t gd = mycpu;

	numcalls++;

	/*
	 * Obtain the namecache entry associated with dvp, creating one if
	 * necessary.  If we have to create one we have insufficient
	 * information to hash it or even supply the name, but we still
	 * need one so we can link it in.
	 *
	 * NOTE: in this stage of development, the passed 'par' is
	 * almost always NULL.
	 */
	if (par == NULL) {
		if ((par = TAILQ_FIRST(&dvp->v_namecache)) == NULL)
			par = cache_alloc(dvp);
	}

	/*
	 * Deal with "." and "..".  In this stage of code development we leave
	 * the returned ncpp NULL.  Note that if the namecache is disjoint,
	 * we won't find a vnode for "..".
	 */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			/* "." resolves to the directory itself */
			*vpp = dvp;
			dothits++;
			numposhits++;	/* include in total statistics */
			return (-1);
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			numposhits++;	/* include in total statistics */
			if ((cnp->cn_flags & CNP_MAKEENTRY) == 0)
				return (0);
			/* cannot resolve ".." across a disjoint topology */
			if (par->nc_parent == NULL ||
			    par->nc_parent->nc_vp == NULL) {
				return (0);
			}
			*vpp = par->nc_parent->nc_vp;
			return (-1);
		}
	}

	/*
	 * Try to locate an existing entry.  The hash key combines the
	 * name with the parent namecache pointer.
	 */
	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&par, sizeof(par), hash);
	if (nczapcheck > 1)
	    printf("DVP %p/%p %08x %*.*s\n", dvp, par, hash, (int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr);
restart:
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (nczapcheck > 1) {
		    printf("TEST ncp par=%p %*.*s\n",
			ncp->nc_parent, ncp->nc_nlen, ncp->nc_nlen,
			ncp->nc_name);
		}

		/*
		 * Zap entries that have timed out.  Zapping mutates the
		 * hash chain so the scan must restart from the top.
		 */
		if (ncp->nc_timeout &&
		    (int)(ncp->nc_timeout - ticks) < 0
		) {
			if (nczapcheck > 1)
			    printf("TIMEOUT\n");
			cache_zap(cache_hold(ncp));
			goto restart;
		}

		/*
		 * Break out if we find a matching entry.  UNRESOLVED entries
		 * never match (they are in the middle of being destroyed).
		 */
		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
		    ncp->nc_parent == par &&
		    ncp->nc_nlen == cnp->cn_namelen &&
		    bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) == 0
		) {
			if (nczapcheck > 1)
			    printf("GOOD\n");
			cache_hold(ncp);
			break;
		}
	}

	/*
	 * If we failed to locate an entry, return 0 (indicates failure).
	 */
	if (ncp == NULL) {
		if ((cnp->cn_flags & CNP_MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		gd->gd_nchstats->ncs_miss++;
		if (nczapcheck) {
		    printf("MISS %p/%p %*.*s/%*.*s\n", dvp, par,
			par->nc_nlen, par->nc_nlen, (par->nc_name ? par->nc_name : ""),
			(int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr);
		}
		return (0);
	}

	/*
	 * If we found an entry, but we don't want to have one, we zap it.
	 * (ncp is held at this point; cache_zap consumes the hold.)
	 */
	if ((cnp->cn_flags & CNP_MAKEENTRY) == 0) {
		numposzaps++;
		gd->gd_nchstats->ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	/*
	 * If the vnode is not NULL then return the positive match.
	 */
	if (ncp->nc_vp) {
		numposhits++;
		gd->gd_nchstats->ncs_goodhits++;
		*vpp = ncp->nc_vp;
		cache_drop(ncp);
		return (-1);
	}

	/*
	 * If the vnode is NULL we found a negative match.  If we want to
	 * create it, purge the negative match and return failure (as if
	 * we hadn't found a match in the first place).
	 */
	if (cnp->cn_nameiop == NAMEI_CREATE) {
		numnegzaps++;
		gd->gd_nchstats->ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	numneghits++;

	/*
	 * We found a "negative" match, ENOENT notifies client of this match.
	 * The nc_flag field records whether this is a whiteout.  Since there
	 * is no vnode we can use the vnode tailq link field with ncneglist.
	 * Moving the hit to the list tail implements LRU for negative
	 * entries (cache_enter() reclaims from the head).
	 */
	TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
	TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
	gd->gd_nchstats->ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITEOUT)
		cnp->cn_flags |= CNP_ISWHITEOUT;
	cache_drop(ncp);
	return (ENOENT);
}
512 
513 /*
514  * Generate a special linkage between the mount point and the root of the
515  * mounted filesystem in order to maintain the namecache topology across
516  * a mount point.  The special linkage has a 0-length name component
517  * and sets NCF_MOUNTPT.
518  */
519 void
520 cache_mount(struct vnode *dvp, struct vnode *tvp)
521 {
522 	struct namecache *ncp;
523 	struct namecache *par;
524 	struct nchashhead *nchpp;
525 	u_int32_t hash;
526 
527 	/*
528 	 * If a linkage already exists we do not have to do anything.
529 	 */
530 	hash = fnv_32_buf("", 0, FNV1_32_INIT);
531 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
532 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
533 		numchecks++;
534 		if (ncp->nc_vp == tvp &&
535 		    ncp->nc_nlen == 0 &&
536 		    ncp->nc_parent &&
537 		    ncp->nc_parent->nc_vp == dvp
538 		) {
539 			return;
540 		}
541 	}
542 
543 	if ((par = TAILQ_FIRST(&dvp->v_namecache)) == NULL)
544 		par = cache_alloc(dvp);
545 
546 	/*
547 	 * Otherwise create a new linkage.
548 	 */
549 	ncp = cache_alloc(tvp);
550 	ncp->nc_flag = NCF_MOUNTPT;
551 	cache_link_parent(ncp, par);
552 
553 	/*
554 	 * Hash table
555 	 */
556 	hash = fnv_32_buf("", 0, FNV1_32_INIT);
557 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
558 	nchpp = NCHHASH(hash);
559 	LIST_INSERT_HEAD(nchpp, ncp, nc_hash);
560 
561 	ncp->nc_flag |= NCF_HASHED;
562 }
563 
564 /*
565  * Add an entry to the cache.
566  */
void
cache_enter(struct vnode *dvp, struct namecache *par, struct vnode *vp, struct componentname *cnp)
{
	struct namecache *ncp;
	struct namecache *bpar;
	struct nchashhead *nchpp;
	u_int32_t hash;
	char *name;

	/*
	 * If the directory has no namecache entry we must associate one with
	 * it.  The name of the entry is not known so it isn't hashed.
	 */
	if (par == NULL) {
		if ((par = TAILQ_FIRST(&dvp->v_namecache)) == NULL)
			par = cache_alloc(dvp);
	}

	/*
	 * This may be a bit confusing.  "." and ".." are 'virtual' entries.
	 * We do not actually create a namecache entry representing either.
	 * However, the ".." case is used to linkup a potentially disjoint
	 * directory with its parent, to disconnect a directory from its
	 * parent, or to change an existing linkage that may no longer be
	 * correct (as might occur when a subdirectory is renamed).
	 */

	if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')
		return;
	if (cnp->cn_namelen == 2 && cnp->cn_nameptr[0] == '.' &&
	    cnp->cn_nameptr[1] == '.'
	) {
		if (vp == NULL) {
			/* NULL vp: disconnect the directory from its parent */
			if (par->nc_parent)
				cache_unlink_parent(par);
		} else {
			/* (re)link the directory under vp's namecache entry */
			if ((ncp = TAILQ_FIRST(&vp->v_namecache)) == NULL)
				ncp = cache_alloc(vp);
			/* hold par so the unlink cannot zap it mid-relink */
			cache_hold(par);
			if (par->nc_parent)
				cache_unlink_parent(par);
			cache_link_parent(par, ncp); /* ncp is parent of par */
			cache_drop(par);
		}
		return;
	}

	/*
	 * Locate other entries associated with this vnode and zap them,
	 * because the purge code may not be able to find them due to
	 * the topology not yet being consistent.  This is a temporary
	 * hack.
	 */
	if (vp) {
again:
		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
			if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
				/* zapping mutates the list; restart the scan */
				cache_zap(cache_hold(ncp));
				goto again;
			}
		}
	}

	/* hash key is (name, parent namecache pointer) */
	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	bpar = par;
	hash = fnv_32_buf(&bpar, sizeof(bpar), hash);

	if (nczapcheck > 1)
	    printf("ENTER %p/%p %08x '%*.*s' %p ", dvp, par, hash, (int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr, vp);


	name = malloc(cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
	ncp = cache_alloc(vp);
	if (nczapcheck > 1)
	    printf("alloc\n");

	/*
	 * Set a timeout.  A wrapped sum of 0 would mean "no timeout",
	 * so it is bumped to 1.
	 */
	if (cnp->cn_flags & CNP_CACHETIMEOUT) {
		if ((ncp->nc_timeout = ticks + cnp->cn_timeout) == 0)
			ncp->nc_timeout = 1;
	}

	/*
	 * Linkup the parent pointer, bump the parent vnode's hold
	 * count when we go from 0->1 children.
	 */
	cache_link_parent(ncp, par);

	/*
	 * Add to the hash table
	 */
	ncp->nc_name = name;
	ncp->nc_nlen = cnp->cn_namelen;
	bcopy(cnp->cn_nameptr, ncp->nc_name, cnp->cn_namelen);
	nchpp = NCHHASH(hash);
	LIST_INSERT_HEAD(nchpp, ncp, nc_hash);
	ncp->nc_flag |= NCF_HASHED;

	/*
	 * A NULL target vnode makes this a negative cache entry; record
	 * whether it represents a whiteout.
	 */
	if (vp == NULL) {
		ncp->nc_flag &= ~NCF_WHITEOUT;
		if (cnp->cn_flags & CNP_ISWHITEOUT)
			ncp->nc_flag |= NCF_WHITEOUT;
	}

	/*
	 * Don't cache too many negative hits.  Reclaim from the head of
	 * ncneglist (the least-recently-hit entry; see cache_lookup()).
	 */
	if (numneg > MINNEG && numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneglist);
		KKASSERT(ncp != NULL);
		cache_zap(cache_hold(ncp));
	}
}
686 
687 /*
688  * Name cache initialization, from vfsinit() when we are booting
689  *
690  * rootnamecache is initialized such that it cannot be recursively deleted.
691  */
void
nchinit(void)
{
	int i;
	globaldata_t gd;

	/* initialise per-cpu namecache effectiveness statistics. */
	for (i = 0; i < ncpus; ++i) {
		gd = globaldata_find(i);
		gd->gd_nchstats = &nchstats[i];
	}

	TAILQ_INIT(&ncneglist);
	/* hash table sized from desiredvnodes; nchash receives the mask */
	nchashtbl = hashinit(desiredvnodes*2, M_VFSCACHE, &nchash);
	TAILQ_INIT(&rootnamecache.nc_list);
	/*
	 * Start the root UNRESOLVED until vfs_cache_setroot() supplies a
	 * vnode.  nc_refs = 1 guarantees cache_zap() can never free it.
	 * NOTE(review): NCF_HASHED is set even though the root is not on
	 * any hash chain -- presumably to keep hash unlink logic away
	 * from it; confirm.
	 */
	rootnamecache.nc_flag |= NCF_HASHED | NCF_ROOT | NCF_UNRESOLVED;
	rootnamecache.nc_refs = 1;
}
710 
711 /*
712  * vfs_cache_setroot()
713  *
714  *	Create an association between the root of our namecache and
715  *	the root vnode.  This routine may be called several times during
716  *	booting.
717  */
void
vfs_cache_setroot(struct vnode *nvp)
{
	KKASSERT(rootnamecache.nc_refs > 0);	/* don't accidentally free */
	/* detach any prior association; rootnamecache itself survives */
	cache_zap(cache_hold(&rootnamecache));

	rootnamecache.nc_vp = nvp;
	rootnamecache.nc_flag &= ~NCF_UNRESOLVED;
	if (nvp) {
		++numcache;
		/* entries with children hold their vnode */
		if (!TAILQ_EMPTY(&rootnamecache.nc_list))
			vhold(nvp);
		TAILQ_INSERT_HEAD(&nvp->v_namecache, &rootnamecache, nc_vnode);
	} else {
		/* no root vnode: park the root as a negative entry */
		++numneg;
		TAILQ_INSERT_TAIL(&ncneglist, &rootnamecache, nc_vnode);
		rootnamecache.nc_flag &= ~NCF_WHITEOUT;
	}
}
737 
738 /*
739  * Invalidate all namecache entries to a particular vnode as well as
740  * any direct children of that vnode in the namecache.  This is a
741  * 'catch all' purge used by filesystems that do not know any better.
742  *
743  * A new vnode v_id is generated.  Note that no vnode will ever have a
744  * v_id of 0.
745  *
746  * Note that the linkage between the vnode and its namecache entries will
747  * be removed, but the namecache entries themselves might stay put due to
748  * active references from elsewhere in the system or due to the existence of
749  * the children.   The namecache topology is left intact even if we do not
750  * know what the vnode association is.  Such entries will be marked
751  * NCF_UNRESOLVED.
752  *
753  * XXX: Only time and the size of v_id prevents this from failing:
754  * XXX: In theory we should hunt down all (struct vnode*, v_id)
755  * XXX: soft references and nuke them, at least on the global
756  * XXX: v_id wraparound.  The period of resistance can be extended
757  * XXX: by incrementing each vnodes v_id individually instead of
758  * XXX: using the global v_id.
759  */
void
cache_purge(struct vnode *vp)
{
	static u_long nextid;
	struct namecache *ncp;
	struct namecache *scan;

	/*
	 * Disassociate the vnode from its namecache entries along with
	 * (for historical reasons) any direct children.
	 */
	while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) {
		cache_hold(ncp);

restart: /* YYY hack, fix me */
		/*
		 * Zap resolved children first.  Zapping mutates nc_list,
		 * so the scan restarts after every hit.
		 */
		TAILQ_FOREACH(scan, &ncp->nc_list, nc_entry) {
			if ((scan->nc_flag & NCF_UNRESOLVED) == 0) {
				cache_zap(cache_hold(scan));
				goto restart;
			}
		}
		cache_zap(ncp);		/* consumes the hold taken above */
	}

	/*
	 * Calculate a new unique id for ".." handling.  v_id is never 0
	 * and never immediately reuses the vnode's previous id.
	 */
	do {
		nextid++;
	} while (nextid == vp->v_id || nextid == 0);
	vp->v_id = nextid;
}
792 
793 /*
794  * Flush all entries referencing a particular filesystem.
795  *
796  * Since we need to check it anyway, we will flush all the invalid
797  * entries at the same time.
798  */
void
cache_purgevfs(struct mount *mp)
{
	struct nchashhead *nchpp;
	struct namecache *ncp, *nnp;

	/*
	 * Scan hash tables for applicable entries, walking the table
	 * backwards from the last chain (nchash is the hash mask, so
	 * &nchashtbl[nchash] is the final chain head).
	 */
	for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) {
		ncp = LIST_FIRST(nchpp);
		if (ncp)
			cache_hold(ncp);
		while (ncp) {
			/*
			 * Hold the next entry before possibly zapping the
			 * current one so our chain linkage cannot be freed
			 * out from under us.
			 */
			nnp = LIST_NEXT(ncp, nc_hash);
			if (nnp)
				cache_hold(nnp);
			if (ncp->nc_vp && ncp->nc_vp->v_mount == mp)
				cache_zap(ncp);		/* consumes our hold */
			else
				cache_drop(ncp);
			ncp = nnp;
		}
	}
}
824 
825 /*
826  * cache_leaf_test()
827  *
828  *	Test whether the vnode is at a leaf in the namecache tree.
829  *
830  *	Returns 0 if it is a leaf, -1 if it isn't.
831  */
832 int
833 cache_leaf_test(struct vnode *vp)
834 {
835 	struct namecache *scan;
836 	struct namecache *ncp;
837 
838 	TAILQ_FOREACH(scan, &vp->v_namecache, nc_vnode) {
839 		TAILQ_FOREACH(ncp, &scan->nc_list, nc_entry) {
840 			/* YYY && ncp->nc_vp->v_type == VDIR ? */
841 			if (ncp->nc_vp != NULL)
842 				return(-1);
843 		}
844 	}
845 	return(0);
846 }
847 
848 /*
849  * Perform canonical checks and cache lookup and pass on to filesystem
850  * through the vop_cachedlookup only if needed.
851  *
852  * vop_lookup_args {
853  *	struct vnode a_dvp;
854  *	struct namecache *a_ncp;
855  *	struct vnode **a_vpp;
856  *	struct namecache **a_ncpp;
857  *	struct componentname *a_cnp;
858  * }
859  */
int
vfs_cache_lookup(struct vop_lookup_args *ap)
{
	struct vnode *dvp, *vp;
	int lockparent;
	int error;
	struct namecache *par = ap->a_par;
	struct vnode **vpp = ap->a_vpp;
	struct namecache **ncpp = ap->a_ncpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_td;
	u_long vpid;	/* capability number of vnode */

	*vpp = NULL;
	if (ncpp)
		*ncpp = NULL;
	dvp = ap->a_dvp;
	lockparent = flags & CNP_LOCKPARENT;

	/* lookups only make sense inside directories */
	if (dvp->v_type != VDIR)
                return (ENOTDIR);

	/* refuse destructive name ops on a read-only filesystem */
	if ((flags & CNP_ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == NAMEI_DELETE || cnp->cn_nameiop == NAMEI_RENAME)) {
		return (EROFS);
	}

	/* the caller must have search permission on the directory */
	error = VOP_ACCESS(dvp, VEXEC, cred, td);

	if (error)
		return (error);

	error = cache_lookup(dvp, par, vpp, ncpp, cnp);

	/* cache miss (0): hand off to the filesystem's real lookup */
	if (!error)
		return (VOP_CACHEDLOOKUP(dvp, par, vpp, ncpp, cnp));

	/* cached negative hit */
	if (error == ENOENT)
		return (error);

	/*
	 * Positive cache hit.  Acquire the vnode lock per the standard
	 * lookup locking protocol.  Sample v_id first so we can detect
	 * the vnode being recycled while we block on a lock below.
	 */
	vp = *vpp;
	vpid = vp->v_id;
	cnp->cn_flags &= ~CNP_PDIRUNLOCK;
	if (dvp == vp) {   /* lookup on "." */
		vref(vp);
		error = 0;
	} else if (flags & CNP_ISDOTDOT) {
		/* ".." locks upward; unlock dvp first to avoid deadlock */
		VOP_UNLOCK(dvp, NULL, 0, td);
		cnp->cn_flags |= CNP_PDIRUNLOCK;
		error = vget(vp, NULL, LK_EXCLUSIVE, td);
		if (!error && lockparent && (flags & CNP_ISLASTCN)) {
			if ((error = vn_lock(dvp, NULL, LK_EXCLUSIVE, td)) == 0)
				cnp->cn_flags &= ~CNP_PDIRUNLOCK;
		}
	} else {
		error = vget(vp, NULL, LK_EXCLUSIVE, td);
		if (!lockparent || error || !(flags & CNP_ISLASTCN)) {
			VOP_UNLOCK(dvp, NULL, 0, td);
			cnp->cn_flags |= CNP_PDIRUNLOCK;
		}
	}
	/*
	 * Check that the capability number did not change
	 * while we were waiting for the lock.
	 */
	if (!error) {
		if (vpid == vp->v_id)
			return (0);
		/* vnode was recycled: undo the lock and fall through */
		vput(vp);
		if (lockparent && dvp != vp && (flags & CNP_ISLASTCN)) {
			VOP_UNLOCK(dvp, NULL, 0, td);
			cnp->cn_flags |= CNP_PDIRUNLOCK;
		}
	}
	/* re-lock the parent if it was unlocked above */
	if (cnp->cn_flags & CNP_PDIRUNLOCK) {
		error = vn_lock(dvp, NULL, LK_EXCLUSIVE, td);
		if (error)
			return (error);
		cnp->cn_flags &= ~CNP_PDIRUNLOCK;
	}
	return (VOP_CACHEDLOOKUP(dvp, par, vpp, ncpp, cnp));
}
944 
945 static int disablecwd;
946 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");
947 
948 static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
949 static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
950 static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
951 static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
952 static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
953 static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
954 
955 int
956 __getcwd(struct __getcwd_args *uap)
957 {
958 	int buflen;
959 	int error;
960 	char *buf;
961 	char *bp;
962 
963 	if (disablecwd)
964 		return (ENODEV);
965 
966 	buflen = uap->buflen;
967 	if (buflen < 2)
968 		return (EINVAL);
969 	if (buflen > MAXPATHLEN)
970 		buflen = MAXPATHLEN;
971 
972 	buf = malloc(buflen, M_TEMP, M_WAITOK);
973 	bp = kern_getcwd(buf, buflen, &error);
974 	if (error == 0)
975 		error = copyout(bp, uap->buf, strlen(bp) + 1);
976 	free(buf, M_TEMP);
977 	return (error);
978 }
979 
/*
 * Build the current working directory string for curproc by walking
 * namecache parent links from fd_cdir up to fd_rdir (or the global
 * root).  The path is assembled backwards from the end of 'buf'.
 *
 * On success *error is 0 and a pointer to the start of the path
 * (inside buf) is returned; on failure *error is set and NULL is
 * returned.
 */
char *
kern_getcwd(char *buf, size_t buflen, int *error)
{
	struct proc *p = curproc;
	char *bp;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numcwdcalls++;
	bp = buf;
	bp += buflen - 1;
	*bp = '\0';		/* string grows backwards from here */
	fdp = p->p_fd;
	slash_prefixed = 0;
	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				*error = EBADF;
				return(NULL);
			}
			/* step down across the mount point */
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		/*
		 * Find an entry with a resolvable parent and a real name
		 * component; without one the path cannot be reconstructed.
		 */
		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
			if (ncp->nc_parent && ncp->nc_parent->nc_vp &&
			    ncp->nc_nlen > 0) {
				break;
			}
		}
		if (ncp == NULL) {
			numcwdfail2++;
			*error = ENOENT;
			return(NULL);
		}
		/* prepend the name component, watching for overflow */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				numcwdfail4++;
				*error = ENOMEM;
				return(NULL);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			numcwdfail4++;
			*error = ENOMEM;
			return(NULL);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = ncp->nc_parent->nc_vp;
	}
	if (!slash_prefixed) {
		/* cwd is the root itself: the path is just "/" */
		if (bp == buf) {
			numcwdfail4++;
			*error = ENOMEM;
			return(NULL);
		}
		*--bp = '/';
	}
	numcwdfound++;
	*error = 0;
	return (bp);
}
1045 
1046 /*
1047  * Thus begins the fullpath magic.
1048  */
1049 
1050 #undef STATNODE
1051 #define STATNODE(name)							\
1052 	static u_int name;						\
1053 	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")
1054 
1055 static int disablefullpath;
1056 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
1057     &disablefullpath, 0, "");
1058 
1059 STATNODE(numfullpathcalls);
1060 STATNODE(numfullpathfail1);
1061 STATNODE(numfullpathfail2);
1062 STATNODE(numfullpathfail3);
1063 STATNODE(numfullpathfail4);
1064 STATNODE(numfullpathfound);
1065 
/*
 * Reconstruct the full path of vnode 'vn' (or p->p_textvp when vn is
 * NULL) by walking namecache parent links toward the process root.
 *
 * Returns 0 on success with *retbuf pointing at the path (inside the
 * allocated buffer) and *freebuf set to the buffer the caller must
 * release with free(*freebuf, M_TEMP).
 */
int
vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *bp, *buf;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numfullpathcalls++;
	if (disablefullpath)
		return (ENODEV);

	if (p == NULL)
		return (EINVAL);

	/* vn is NULL, client wants us to use p->p_textvp */
	if (vn == NULL) {
		if ((vn = p->p_textvp) == NULL)
			return (EINVAL);
	}

	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';		/* string grows backwards from here */
	fdp = p->p_fd;
	slash_prefixed = 0;
	for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				free(buf, M_TEMP);
				return (EBADF);
			}
			/* step down across the mount point */
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		/* need an entry with a resolvable parent and a real name */
		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
			if (ncp->nc_parent && ncp->nc_parent->nc_vp &&
			    ncp->nc_nlen > 0) {
				break;
			}
		}
		if (ncp == NULL) {
			numfullpathfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		/* prepend the name component, watching for overflow */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				numfullpathfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = ncp->nc_parent->nc_vp;
	}
	if (!slash_prefixed) {
		/* the vnode is the root itself: path is just "/" */
		if (bp == buf) {
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	numfullpathfound++;
	*retbuf = bp;
	*freebuf = buf;
	return (0);
}
1143 
1144