xref: /dragonfly/sys/kern/vfs_cache.c (revision 4caa7869)
1 /*
2  * Copyright (c) 1989, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com>,
5  *	All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Poul-Henning Kamp of the FreeBSD Project.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
39  * $FreeBSD: src/sys/kern/vfs_cache.c,v 1.42.2.6 2001/10/05 20:07:03 dillon Exp $
40  * $DragonFly: src/sys/kern/vfs_cache.c,v 1.12 2003/10/18 05:53:57 dillon Exp $
41  */
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/sysctl.h>
47 #include <sys/mount.h>
48 #include <sys/vnode.h>
49 #include <sys/malloc.h>
50 #include <sys/sysproto.h>
51 #include <sys/proc.h>
52 #include <sys/namei.h>
53 #include <sys/filedesc.h>
54 #include <sys/fnv_hash.h>
55 
56 /*
57  * Random lookups in the cache are accomplished with a hash table using
58  * a hash key of (nc_src_vp, name).
59  *
60  * Negative entries may exist and correspond to structures where nc_vp
61  * is NULL.  In a negative entry, NCF_WHITEOUT will be set if the entry
62  * corresponds to a whited-out directory entry (verses simply not finding the
63  * entry at all).
64  *
65  * Upon reaching the last segment of a path, if the reference is for DELETE,
66  * or NOCACHE is set (rewrite), and the name is located in the cache, it
67  * will be dropped.
68  */
69 
70 /*
71  * Structures associated with name cacheing.
72  */
73 #define NCHHASH(hash)	(&nchashtbl[(hash) & nchash])
74 #define MINNEG		1024
75 
76 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
77 static struct namecache_list	ncneglist;		/* instead of vnode */
78 static struct namecache		rootnamecache;		/* Dummy node */
79 
80 static int	nczapcheck;		/* panic on bad release */
81 SYSCTL_INT(_debug, OID_AUTO, nczapcheck, CTLFLAG_RW, &nczapcheck, 0, "");
82 
83 static u_long	nchash;			/* size of hash table */
84 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "");
85 
86 static u_long	ncnegfactor = 16;	/* ratio of negative entries */
87 SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "");
88 
89 static u_long	numneg;		/* number of cache entries allocated */
90 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "");
91 
92 static u_long	numcache;		/* number of cache entries allocated */
93 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
94 
95 static u_long	numunres;		/* number of unresolved entries */
96 SYSCTL_ULONG(_debug, OID_AUTO, numunres, CTLFLAG_RD, &numunres, 0, "");
97 
98 struct	nchstats nchstats;		/* cache effectiveness statistics */
99 
100 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
101 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
102 
103 /*
104  * The new name cache statistics
105  */
106 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
107 #define STATNODE(mode, name, var) \
108 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, "");
109 STATNODE(CTLFLAG_RD, numneg, &numneg);
110 STATNODE(CTLFLAG_RD, numcache, &numcache);
111 static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls);
112 static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits);
113 static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits);
114 static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks);
115 static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss);
116 static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
117 static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
118 static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
119 static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
120 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
121 
122 
123 static void cache_zap(struct namecache *ncp);
124 
125 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
126 
127 /*
128  * cache_hold() and cache_drop() prevent the premature deletion of a
129  * namecache entry but do not prevent operations (such as zapping) on
130  * that namecache entry.
131  */
132 static __inline
133 struct namecache *
134 _cache_hold(struct namecache *ncp)
135 {
136 	++ncp->nc_refs;
137 	return(ncp);
138 }
139 
140 static __inline
141 void
142 _cache_drop(struct namecache *ncp)
143 {
144 	KKASSERT(ncp->nc_refs > 0);
145 	if (ncp->nc_refs == 1 &&
146 	    (ncp->nc_flag & NCF_UNRESOLVED) &&
147 	    TAILQ_EMPTY(&ncp->nc_list)
148 	) {
149 		cache_zap(ncp);
150 	} else {
151 		--ncp->nc_refs;
152 	}
153 }
154 
155 struct namecache *
156 cache_hold(struct namecache *ncp)
157 {
158 	return(_cache_hold(ncp));
159 }
160 
161 void
162 cache_drop(struct namecache *ncp)
163 {
164 	_cache_drop(ncp);
165 }
166 
167 /*
168  * Try to destroy a namecache entry.  The entry is disassociated from its
169  * vnode or ncneglist and reverted to an UNRESOLVED state.
170  *
171  * Then, if there are no additional references to the ncp and we can
172  * successfully delete the children, the entry is also removed from the
173  * namecache hashlist / topology.
174  *
175  * References or undeletable children will prevent the entry from being
176  * removed from the topology.  The entry may be revalidated (typically
177  * by cache_enter()) at a later time.  Children remain because:
178  *
179  *	+ we have tried to delete a node rather then a leaf in the topology.
180  *	+ the presence of negative entries (we try to scrap these).
181  *	+ an entry or child has a non-zero ref count and cannot be scrapped.
182  *
183  * This function must be called with the ncp held and will drop the ref
184  * count during zapping.
185  */
186 static void
187 cache_zap(struct namecache *ncp)
188 {
189 	struct namecache *par;
190 	struct vnode *vp;
191 
192 	/*
193 	 * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
194 	 */
195 	if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
196 		ncp->nc_flag |= NCF_UNRESOLVED;
197 		++numunres;
198 		if ((vp = ncp->nc_vp) != NULL) {
199 			--numcache;
200 			ncp->nc_vp = NULL;	/* safety */
201 			TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
202 			if (!TAILQ_EMPTY(&ncp->nc_list))
203 				vdrop(vp);
204 		} else {
205 			TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
206 			--numneg;
207 		}
208 	}
209 
210 	/*
211 	 * Try to scrap the entry and possibly tail-recurse on its parent.
212 	 * We only scrap unref'd (other then our ref) unresolved entries,
213 	 * we do not scrap 'live' entries.
214 	 */
215 	while (ncp->nc_flag & NCF_UNRESOLVED) {
216 		/*
217 		 * Someone other then us has a ref, stop.
218 		 */
219 		if (ncp->nc_refs > 1)
220 			goto done;
221 
222 		/*
223 		 * We have children, stop.
224 		 */
225 		if (!TAILQ_EMPTY(&ncp->nc_list))
226 			goto done;
227 
228 		/*
229 		 * Ok, we can completely destroy and free this entry.  Sanity
230 		 * check it against our static rootnamecache structure,
231 		 * then remove it from the hash.
232 		 */
233 		KKASSERT(ncp != &rootnamecache);
234 
235 		if (ncp->nc_flag & NCF_HASHED) {
236 			ncp->nc_flag &= ~NCF_HASHED;
237 			LIST_REMOVE(ncp, nc_hash);
238 		}
239 
240 		/*
241 		 * Unlink from its parent and free, then loop on the
242 		 * parent.  XXX temp hack, in stage-3 parent is never NULL
243 		 */
244 		if ((par = ncp->nc_parent) != NULL) {
245 			par = cache_hold(par);
246 			TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
247 			if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
248 				vdrop(par->nc_vp);
249 		}
250 		--numunres;
251 		ncp->nc_refs = -1;	/* safety */
252 		ncp->nc_parent = NULL;	/* safety */
253 		if (ncp->nc_name)
254 			free(ncp->nc_name, M_VFSCACHE);
255 		free(ncp, M_VFSCACHE);
256 		ncp = par;
257 		if (par == NULL)	/* temp hack */
258 			return;		/* temp hack */
259 	}
260 done:
261 	--ncp->nc_refs;
262 }
263 
264 /*
265  * Lookup an entry in the cache
266  *
267  * Lookup is called with dvp pointing to the directory to search,
268  * cnp pointing to the name of the entry being sought.
269  *
270  * If the lookup succeeds, the vnode is returned in *vpp, and a
271  * status of -1 is returned.
272  *
273  * If the lookup determines that the name does not exist (negative cacheing),
274  * a status of ENOENT is returned.
275  *
276  * If the lookup fails, a status of zero is returned.
277  *
278  * Note that UNRESOLVED entries are ignored.  They are not negative cache
279  * entries.
280  */
281 int
282 cache_lookup(struct vnode *dvp, struct namecache *par, struct vnode **vpp,
283 		struct namecache **ncpp, struct componentname *cnp)
284 {
285 	struct namecache *ncp;
286 	u_int32_t hash;
287 
288 	numcalls++;
289 
290 	if (cnp->cn_nameptr[0] == '.') {
291 		if (cnp->cn_namelen == 1) {
292 			*vpp = dvp;
293 			dothits++;
294 			numposhits++;	/* include in total statistics */
295 			return (-1);
296 		}
297 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
298 			dotdothits++;
299 			numposhits++;	/* include in total statistics */
300 			if (dvp->v_dd->v_id != dvp->v_ddid ||
301 			    (cnp->cn_flags & CNP_MAKEENTRY) == 0) {
302 				dvp->v_ddid = 0;
303 				return (0);
304 			}
305 			*vpp = dvp->v_dd;
306 			return (-1);
307 		}
308 	}
309 
310 	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
311 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
312 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
313 		numchecks++;
314 		if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
315 		    /* ncp->nc_parent == par instead of dvp STAGE-3 */
316 		    ncp->nc_dvp_data == (uintptr_t)dvp && /* STAGE-2 only */
317 		    ncp->nc_dvp_id == dvp->v_id && /* STAGE-2 only */
318 		    ncp->nc_nlen == cnp->cn_namelen &&
319 		    bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen) == 0
320 		) {
321 			cache_hold(ncp);
322 			break;
323 		}
324 	}
325 
326 	/* We failed to find an entry */
327 	if (ncp == NULL) {
328 		if ((cnp->cn_flags & CNP_MAKEENTRY) == 0) {
329 			nummisszap++;
330 		} else {
331 			nummiss++;
332 		}
333 		nchstats.ncs_miss++;
334 		return (0);
335 	}
336 
337 	/* We don't want to have an entry, so dump it */
338 	if ((cnp->cn_flags & CNP_MAKEENTRY) == 0) {
339 		numposzaps++;
340 		nchstats.ncs_badhits++;
341 		cache_zap(ncp);
342 		return (0);
343 	}
344 
345 	/* We found a "positive" match, return the vnode */
346 	if (ncp->nc_vp) {
347 		numposhits++;
348 		nchstats.ncs_goodhits++;
349 		*vpp = ncp->nc_vp;
350 		cache_drop(ncp);
351 		return (-1);
352 	}
353 
354 	/* We found a negative match, and want to create it, so purge */
355 	if (cnp->cn_nameiop == NAMEI_CREATE) {
356 		numnegzaps++;
357 		nchstats.ncs_badhits++;
358 		cache_zap(ncp);
359 		return (0);
360 	}
361 
362 	numneghits++;
363 
364 	/*
365 	 * We found a "negative" match, ENOENT notifies client of this match.
366 	 * The nc_flag field records whether this is a whiteout.  Since there
367 	 * is no vnode we can use the vnode tailq link field with ncneglist.
368 	 */
369 	TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
370 	TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
371 	nchstats.ncs_neghits++;
372 	if (ncp->nc_flag & NCF_WHITEOUT)
373 		cnp->cn_flags |= CNP_ISWHITEOUT;
374 	cache_drop(ncp);
375 	return (ENOENT);
376 }
377 
378 /*
379  * Generate a special linkage between the mount point and the root of the
380  * mounted filesystem in order to maintain the namecache topology across
381  * a mount point.  The special linkage has a 0-length name component
382  * and sets NCF_MOUNTPT.
383  */
384 void
385 cache_mount(struct vnode *dvp, struct vnode *tvp)
386 {
387 	struct namecache *ncp;
388 	struct namecache *par;
389 	struct nchashhead *nchpp;
390 	u_int32_t hash;
391 
392 	/*
393 	 * If a linkage already exists we do not have to do anything.
394 	 */
395 	hash = fnv_32_buf("", 0, FNV1_32_INIT);
396 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
397 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
398 		numchecks++;
399 		if (ncp->nc_vp == tvp &&
400 		    ncp->nc_nlen == 0 &&
401 		    /* ncp->nc_parent == par STAGE-3 par instad of dvp */
402 		    ncp->nc_dvp_data == (uintptr_t)dvp && /* STAGE-2 only */
403 		    ncp->nc_dvp_id == dvp->v_id /* STAGE-2 only */
404 		) {
405 			return;
406 		}
407 	}
408 
409 	/*
410 	 * STAGE-2 par can be NULL
411 	 * STAGE-3 par is passed as argument and cannot be NULL
412 	 */
413 	par = TAILQ_FIRST(&dvp->v_namecache);
414 
415 	/*
416 	 * Otherwise create a new linkage.
417 	 */
418 	ncp = malloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK | M_ZERO);
419 
420 	TAILQ_INIT(&ncp->nc_list);
421 	ncp->nc_flag = NCF_MOUNTPT;
422 	if (par != NULL) {
423 		/* STAGE-3 par never NULL */
424 		ncp->nc_parent = par;
425 		if (TAILQ_EMPTY(&par->nc_list)) {
426 			if (par->nc_vp)
427 				vhold(par->nc_vp);
428 		}
429 		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
430 	}
431 	ncp->nc_dvp_data = (uintptr_t)dvp; /* STAGE-2 ONLY */
432 	ncp->nc_dvp_id = dvp->v_id; 	/* STAGE-2 ONLY */
433 
434 	/*
435 	 * Linkup the target vnode.  The target vnode is NULL if this is
436 	 * to be a negative cache entry.
437 	 */
438 	++numcache;
439 	ncp->nc_vp = tvp;
440 	TAILQ_INSERT_HEAD(&tvp->v_namecache, ncp, nc_vnode);
441 #if 0
442 	if (tvp->v_type == VDIR) {
443 		vp->v_dd = dvp;
444 		vp->v_ddid = dvp->v_id;
445 	}
446 #endif
447 
448 	/*
449 	 * Hash table
450 	 */
451 	hash = fnv_32_buf("", 0, FNV1_32_INIT);
452 	hash = fnv_32_buf(&ncp->nc_dvp_id, sizeof(ncp->nc_dvp_id), hash);
453 	nchpp = NCHHASH(hash);
454 	LIST_INSERT_HEAD(nchpp, ncp, nc_hash);
455 
456 	ncp->nc_flag |= NCF_HASHED;
457 }
458 
459 /*
460  * Add an entry to the cache.
461  */
462 void
463 cache_enter(struct vnode *dvp, struct namecache *par, struct vnode *vp, struct componentname *cnp)
464 {
465 	struct namecache *ncp;
466 	struct nchashhead *nchpp;
467 	u_int32_t hash;
468 
469 	/* YYY use par */
470 
471 	/*
472 	 * "." and ".." are degenerate cases, they are not added to the
473 	 * cache.
474 	 */
475 	if (cnp->cn_nameptr[0] == '.') {
476 		if (cnp->cn_namelen == 1) {
477 			return;
478 		}
479 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
480 			if (vp) {
481 				dvp->v_dd = vp;
482 				dvp->v_ddid = vp->v_id;
483 			} else {
484 				dvp->v_dd = dvp;
485 				dvp->v_ddid = 0;
486 			}
487 			return;
488 		}
489 	}
490 
491 	if (nczapcheck)
492 	    printf("ENTER '%*.*s' %p ", (int)cnp->cn_namelen, (int)cnp->cn_namelen, cnp->cn_nameptr, vp);
493 
494 	/*
495 	 * Locate other entries associated with this vnode and zap them,
496 	 * because the purge code may not be able to find them due to
497 	 * the topology not yet being consistent.  This is a temporary
498 	 * hack.
499 	 */
500 	if (vp) {
501 again:
502 		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
503 			if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
504 				cache_zap(cache_hold(ncp));
505 				goto again;
506 			}
507 		}
508 	}
509 
510 	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
511 	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
512 
513 	ncp = malloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK | M_ZERO);
514 	ncp->nc_name = malloc(cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
515 	TAILQ_INIT(&ncp->nc_list);
516 	if (nczapcheck)
517 	    printf("alloc\n");
518 
519 	/*
520 	 * Linkup the parent pointer, bump the parent vnode's hold
521 	 * count when we go from 0->1 children.
522 	 *
523 	 * STAGE-2 par may be NULL
524 	 * STAGE-3 par may not be NULL, nc_dvp_* removed
525 	 */
526 	par = TAILQ_FIRST(&dvp->v_namecache);
527 	if (par != NULL) {
528 		ncp->nc_parent = par;
529 		if (TAILQ_EMPTY(&par->nc_list)) {
530 			if (par->nc_vp)
531 				vhold(par->nc_vp);
532 		}
533 		TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
534 	}
535 	ncp->nc_dvp_data = (uintptr_t)dvp;
536 	ncp->nc_dvp_id = dvp->v_id;
537 
538 	/*
539 	 * Add to the hash table
540 	 */
541 	ncp->nc_nlen = cnp->cn_namelen;
542 	bcopy(cnp->cn_nameptr, ncp->nc_name, cnp->cn_namelen);
543 	nchpp = NCHHASH(hash);
544 	LIST_INSERT_HEAD(nchpp, ncp, nc_hash);
545 
546 	ncp->nc_flag |= NCF_HASHED;
547 
548 	/*
549 	 * Linkup the target vnode.  The target vnode is NULL if this is
550 	 * to be a negative cache entry.
551 	 */
552 	ncp->nc_vp = vp;
553 	if (vp == NULL) {
554 		++numneg;
555 		TAILQ_INSERT_HEAD(&ncneglist, ncp, nc_vnode);
556 		ncp->nc_flag &= ~NCF_WHITEOUT;
557 		if (cnp->cn_flags & CNP_ISWHITEOUT)
558 			ncp->nc_flag |= NCF_WHITEOUT;
559 	} else {
560 		++numcache;
561 		if (!TAILQ_EMPTY(&ncp->nc_list))
562 			vhold(vp);
563 		TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
564 		if (vp->v_type == VDIR) {
565 			vp->v_dd = dvp;
566 			vp->v_ddid = dvp->v_id;
567 		}
568 	}
569 
570 	/*
571 	 * Don't cache too many negative hits
572 	 */
573 	if (numneg > MINNEG && numneg * ncnegfactor > numcache) {
574 		ncp = TAILQ_FIRST(&ncneglist);
575 		KKASSERT(ncp != NULL);
576 		cache_zap(cache_hold(ncp));
577 	}
578 }
579 
580 /*
581  * Name cache initialization, from vfs_init() when we are booting
582  *
583  * rootnamecache is initialized such that it cannot be recursively deleted.
584  */
585 void
586 nchinit(void)
587 {
588 	TAILQ_INIT(&ncneglist);
589 	nchashtbl = hashinit(desiredvnodes*2, M_VFSCACHE, &nchash);
590 	TAILQ_INIT(&rootnamecache.nc_list);
591 	rootnamecache.nc_flag |= NCF_HASHED | NCF_ROOT | NCF_UNRESOLVED;
592 	rootnamecache.nc_refs = 1;
593 }
594 
595 /*
596  * vfs_cache_setroot()
597  *
598  *	Create an association between the root of our namecache and
599  *	the root vnode.  This routine may be called several times during
600  *	booting.
601  */
602 void
603 vfs_cache_setroot(struct vnode *nvp)
604 {
605 	KKASSERT(rootnamecache.nc_refs > 0);	/* don't accidently free */
606 	cache_zap(cache_hold(&rootnamecache));
607 
608 	rootnamecache.nc_vp = nvp;
609 	rootnamecache.nc_flag &= ~NCF_UNRESOLVED;
610 	if (nvp) {
611 		++numcache;
612 		if (!TAILQ_EMPTY(&rootnamecache.nc_list))
613 			vhold(nvp);
614 		TAILQ_INSERT_HEAD(&nvp->v_namecache, &rootnamecache, nc_vnode);
615 	} else {
616 		++numneg;
617 		TAILQ_INSERT_HEAD(&ncneglist, &rootnamecache, nc_vnode);
618 		rootnamecache.nc_flag &= ~NCF_WHITEOUT;
619 	}
620 }
621 
622 /*
623  * Invalidate all namecache entries to a particular vnode as well as
624  * any direct children of that vnode in the namecache.  This is a
625  * 'catch all' purge used by filesystems that do not know any better.
626  *
627  * A new vnode v_id is generated.  Note that no vnode will ever have a
628  * v_id of 0.
629  *
630  * Note that the linkage between the vnode and its namecache entries will
631  * be removed, but the namecache entries themselves might stay put due to
632  * active references from elsewhere in the system or due to the existance of
633  * the children.   The namecache topology is left intact even if we do not
634  * know what the vnode association is.  Such entries will be marked
635  * NCF_UNRESOLVED.
636  *
637  * XXX: Only time and the size of v_id prevents this from failing:
638  * XXX: In theory we should hunt down all (struct vnode*, v_id)
639  * XXX: soft references and nuke them, at least on the global
640  * XXX: v_id wraparound.  The period of resistance can be extended
641  * XXX: by incrementing each vnodes v_id individually instead of
642  * XXX: using the global v_id.
643  */
644 void
645 cache_purge(struct vnode *vp)
646 {
647 	static u_long nextid;
648 	struct namecache *ncp;
649 	struct namecache *scan;
650 
651 	/*
652 	 * Disassociate the vnode from its namecache entries along with
653 	 * (for historical reasons) any direct children.
654 	 */
655 	while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) {
656 		cache_hold(ncp);
657 
658 restart: /* YYY hack, fix me */
659 		TAILQ_FOREACH(scan, &ncp->nc_list, nc_entry) {
660 			if ((scan->nc_flag & NCF_UNRESOLVED) == 0) {
661 				cache_zap(cache_hold(scan));
662 				goto restart;
663 			}
664 		}
665 		cache_zap(ncp);
666 	}
667 
668 	/*
669 	 * Calculate a new unique id for ".." handling
670 	 */
671 	do {
672 		nextid++;
673 	} while (nextid == vp->v_id || nextid == 0);
674 	vp->v_id = nextid;
675 	vp->v_dd = vp;
676 	vp->v_ddid = 0;
677 }
678 
679 /*
680  * Flush all entries referencing a particular filesystem.
681  *
682  * Since we need to check it anyway, we will flush all the invalid
683  * entries at the same time.
684  */
685 void
686 cache_purgevfs(struct mount *mp)
687 {
688 	struct nchashhead *nchpp;
689 	struct namecache *ncp, *nnp;
690 
691 	/*
692 	 * Scan hash tables for applicable entries.
693 	 */
694 	for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) {
695 		ncp = LIST_FIRST(nchpp);
696 		if (ncp)
697 			cache_hold(ncp);
698 		while (ncp) {
699 			nnp = LIST_NEXT(ncp, nc_hash);
700 			if (nnp)
701 				cache_hold(nnp);
702 			if (ncp->nc_vp && ncp->nc_vp->v_mount == mp)
703 				cache_zap(ncp);
704 			else
705 				cache_drop(ncp);
706 			ncp = nnp;
707 		}
708 	}
709 }
710 
711 /*
712  * cache_leaf_test()
713  *
714  *	Test whether the vnode is at a leaf in the nameicache tree.
715  *
716  *	Returns 0 if it is a leaf, -1 if it isn't.
717  */
718 int
719 cache_leaf_test(struct vnode *vp)
720 {
721 	struct namecache *scan;
722 	struct namecache *ncp;
723 
724 	TAILQ_FOREACH(scan, &vp->v_namecache, nc_vnode) {
725 		TAILQ_FOREACH(ncp, &scan->nc_list, nc_entry) {
726 			/* YYY && ncp->nc_vp->v_type == VDIR ? */
727 			if (ncp->nc_vp != NULL)
728 				return(-1);
729 		}
730 	}
731 	return(0);
732 }
733 
734 /*
735  * Perform canonical checks and cache lookup and pass on to filesystem
736  * through the vop_cachedlookup only if needed.
737  *
738  * vop_lookup_args {
739  *	struct vnode a_dvp;
740  *	struct namecache *a_ncp;
741  *	struct vnode **a_vpp;
742  *	struct namecache **a_ncpp;
743  *	struct componentname *a_cnp;
744  * }
745  */
746 int
747 vfs_cache_lookup(struct vop_lookup_args *ap)
748 {
749 	struct vnode *dvp, *vp;
750 	int lockparent;
751 	int error;
752 	struct namecache *par = ap->a_par;
753 	struct vnode **vpp = ap->a_vpp;
754 	struct namecache **ncpp = ap->a_ncpp;
755 	struct componentname *cnp = ap->a_cnp;
756 	struct ucred *cred = cnp->cn_cred;
757 	int flags = cnp->cn_flags;
758 	struct thread *td = cnp->cn_td;
759 	u_long vpid;	/* capability number of vnode */
760 
761 	*vpp = NULL;
762 	if (ncpp)
763 		*ncpp = NULL;
764 	dvp = ap->a_dvp;
765 	lockparent = flags & CNP_LOCKPARENT;
766 
767 	if (dvp->v_type != VDIR)
768                 return (ENOTDIR);
769 
770 	if ((flags & CNP_ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
771 	    (cnp->cn_nameiop == NAMEI_DELETE || cnp->cn_nameiop == NAMEI_RENAME)) {
772 		return (EROFS);
773 	}
774 
775 	error = VOP_ACCESS(dvp, VEXEC, cred, td);
776 
777 	if (error)
778 		return (error);
779 
780 	error = cache_lookup(dvp, par, vpp, ncpp, cnp);
781 
782 	if (!error)
783 		return (VOP_CACHEDLOOKUP(dvp, par, vpp, ncpp, cnp));
784 
785 	if (error == ENOENT)
786 		return (error);
787 
788 	vp = *vpp;
789 	vpid = vp->v_id;
790 	cnp->cn_flags &= ~CNP_PDIRUNLOCK;
791 	if (dvp == vp) {   /* lookup on "." */
792 		VREF(vp);
793 		error = 0;
794 	} else if (flags & CNP_ISDOTDOT) {
795 		VOP_UNLOCK(dvp, 0, td);
796 		cnp->cn_flags |= CNP_PDIRUNLOCK;
797 		error = vget(vp, LK_EXCLUSIVE, td);
798 		if (!error && lockparent && (flags & CNP_ISLASTCN)) {
799 			if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0)
800 				cnp->cn_flags &= ~CNP_PDIRUNLOCK;
801 		}
802 	} else {
803 		error = vget(vp, LK_EXCLUSIVE, td);
804 		if (!lockparent || error || !(flags & CNP_ISLASTCN)) {
805 			VOP_UNLOCK(dvp, 0, td);
806 			cnp->cn_flags |= CNP_PDIRUNLOCK;
807 		}
808 	}
809 	/*
810 	 * Check that the capability number did not change
811 	 * while we were waiting for the lock.
812 	 */
813 	if (!error) {
814 		if (vpid == vp->v_id)
815 			return (0);
816 		vput(vp);
817 		if (lockparent && dvp != vp && (flags & CNP_ISLASTCN)) {
818 			VOP_UNLOCK(dvp, 0, td);
819 			cnp->cn_flags |= CNP_PDIRUNLOCK;
820 		}
821 	}
822 	if (cnp->cn_flags & CNP_PDIRUNLOCK) {
823 		error = vn_lock(dvp, LK_EXCLUSIVE, td);
824 		if (error)
825 			return (error);
826 		cnp->cn_flags &= ~CNP_PDIRUNLOCK;
827 	}
828 	return (VOP_CACHEDLOOKUP(dvp, par, vpp, ncpp, cnp));
829 }
830 
831 static int disablecwd;
832 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");
833 
834 static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
835 static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
836 static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
837 static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
838 static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
839 static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);
840 
841 int
842 __getcwd(struct __getcwd_args *uap)
843 {
844 	struct proc *p = curproc;
845 	char *bp, *buf;
846 	int error, i, slash_prefixed;
847 	struct filedesc *fdp;
848 	struct namecache *ncp;
849 	struct vnode *vp;
850 
851 	numcwdcalls++;
852 	if (disablecwd)
853 		return (ENODEV);
854 	if (uap->buflen < 2)
855 		return (EINVAL);
856 	if (uap->buflen > MAXPATHLEN)
857 		uap->buflen = MAXPATHLEN;
858 	buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK);
859 	bp += uap->buflen - 1;
860 	*bp = '\0';
861 	fdp = p->p_fd;
862 	slash_prefixed = 0;
863 	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
864 		if (vp->v_flag & VROOT) {
865 			if (vp->v_mount == NULL) {	/* forced unmount */
866 				free(buf, M_TEMP);
867 				return (EBADF);
868 			}
869 			vp = vp->v_mount->mnt_vnodecovered;
870 			continue;
871 		}
872 		if (vp->v_dd->v_id != vp->v_ddid) {
873 			numcwdfail1++;
874 			free(buf, M_TEMP);
875 			return (ENOTDIR);
876 		}
877 		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
878 			/* ncp->nc_parent == par STAGE-3 */
879 			if (ncp->nc_dvp_data == (uintptr_t)vp->v_dd &&
880 			    ncp->nc_dvp_id == vp->v_ddid) {
881 				break;
882 			}
883 		}
884 		if (ncp == NULL) {
885 			numcwdfail2++;
886 			free(buf, M_TEMP);
887 			return (ENOENT);
888 		}
889 		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
890 			if (bp == buf) {
891 				numcwdfail4++;
892 				free(buf, M_TEMP);
893 				return (ENOMEM);
894 			}
895 			*--bp = ncp->nc_name[i];
896 		}
897 		if (bp == buf) {
898 			numcwdfail4++;
899 			free(buf, M_TEMP);
900 			return (ENOMEM);
901 		}
902 		*--bp = '/';
903 		slash_prefixed = 1;
904 		vp = vp->v_dd;
905 	}
906 	if (!slash_prefixed) {
907 		if (bp == buf) {
908 			numcwdfail4++;
909 			free(buf, M_TEMP);
910 			return (ENOMEM);
911 		}
912 		*--bp = '/';
913 	}
914 	numcwdfound++;
915 	error = copyout(bp, uap->buf, strlen(bp) + 1);
916 	free(buf, M_TEMP);
917 	return (error);
918 }
919 
920 /*
921  * Thus begins the fullpath magic.
922  */
923 
924 #undef STATNODE
925 #define STATNODE(name)							\
926 	static u_int name;						\
927 	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")
928 
929 static int disablefullpath;
930 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
931     &disablefullpath, 0, "");
932 
933 STATNODE(numfullpathcalls);
934 STATNODE(numfullpathfail1);
935 STATNODE(numfullpathfail2);
936 STATNODE(numfullpathfail3);
937 STATNODE(numfullpathfail4);
938 STATNODE(numfullpathfound);
939 
940 int
941 textvp_fullpath(struct proc *p, char **retbuf, char **retfreebuf)
942 {
943 	char *bp, *buf;
944 	int i, slash_prefixed;
945 	struct filedesc *fdp;
946 	struct namecache *ncp;
947 	struct vnode *vp, *textvp;
948 
949 	numfullpathcalls++;
950 	if (disablefullpath)
951 		return (ENODEV);
952 	textvp = p->p_textvp;
953 	if (textvp == NULL)
954 		return (EINVAL);
955 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
956 	bp = buf + MAXPATHLEN - 1;
957 	*bp = '\0';
958 	fdp = p->p_fd;
959 	slash_prefixed = 0;
960 	for (vp = textvp; vp != fdp->fd_rdir && vp != rootvnode;) {
961 		if (vp->v_flag & VROOT) {
962 			if (vp->v_mount == NULL) {	/* forced unmount */
963 				free(buf, M_TEMP);
964 				return (EBADF);
965 			}
966 			vp = vp->v_mount->mnt_vnodecovered;
967 			continue;
968 		}
969 		if (vp != textvp && vp->v_dd->v_id != vp->v_ddid) {
970 			numfullpathfail1++;
971 			free(buf, M_TEMP);
972 			return (ENOTDIR);
973 		}
974 		TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
975 			if (vp == textvp)
976 				break;
977 			/* ncp->nc_parent == par STAGE-3 */
978 			if (ncp->nc_dvp_data == (uintptr_t)vp->v_dd &&
979 			    ncp->nc_dvp_id == vp->v_ddid) {
980 				break;
981 			}
982 		}
983 		if (ncp == NULL) {
984 			numfullpathfail2++;
985 			free(buf, M_TEMP);
986 			return (ENOENT);
987 		}
988 		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
989 			if (bp == buf) {
990 				numfullpathfail4++;
991 				free(buf, M_TEMP);
992 				return (ENOMEM);
993 			}
994 			*--bp = ncp->nc_name[i];
995 		}
996 		if (bp == buf) {
997 			numfullpathfail4++;
998 			free(buf, M_TEMP);
999 			return (ENOMEM);
1000 		}
1001 		*--bp = '/';
1002 		slash_prefixed = 1;
1003 		vp = vp->v_dd;
1004 	}
1005 	if (!slash_prefixed) {
1006 		if (bp == buf) {
1007 			numfullpathfail4++;
1008 			free(buf, M_TEMP);
1009 			return (ENOMEM);
1010 		}
1011 		*--bp = '/';
1012 	}
1013 	numfullpathfound++;
1014 	*retbuf = bp;
1015 	*retfreebuf = buf;
1016 	return (0);
1017 }
1018 
1019