xref: /dragonfly/sys/kern/vfs_nlookup.c (revision 3d33658b)
1 /*
2  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
 * nlookup() is the 'new' namei interface.  Rather than return directory and
36  * leaf vnodes (in various lock states) the new interface instead deals in
37  * namecache records.  Namecache records may represent both a positive or
38  * a negative hit.  The namespace is locked via the namecache record instead
39  * of via the vnode, and only the leaf namecache record (representing the
40  * filename) needs to be locked.
41  *
42  * This greatly improves filesystem parallelism and is a huge simplification
 * of the API versus the old vnode locking / namei scheme.
44  *
45  * Filesystems must actively control the caching aspects of the namecache,
46  * and since namecache pointers are used as handles they are non-optional
47  * even for filesystems which do not generally wish to cache things.  It is
48  * intended that a separate cache coherency API will be constructed to handle
49  * these issues.
50  */
51 
52 #include "opt_ktrace.h"
53 
54 #include <sys/param.h>
55 #include <sys/systm.h>
56 #include <sys/kernel.h>
57 #include <sys/vnode.h>
58 #include <sys/mount.h>
59 #include <sys/filedesc.h>
60 #include <sys/proc.h>
61 #include <sys/namei.h>
62 #include <sys/nlookup.h>
63 #include <sys/malloc.h>
64 #include <sys/stat.h>
65 #include <sys/objcache.h>
66 #include <sys/file.h>
67 #include <sys/kcollect.h>
68 
69 #ifdef KTRACE
70 #include <sys/ktrace.h>
71 #endif
72 
73 static int naccess(struct nchandle *nch, int vmode, struct ucred *cred,
74 		int *stickyp);
75 
76 /*
77  * unmount operations flag NLC_IGNBADDIR in order to allow the
78  * umount to successfully issue a nlookup() on the path in order
79  * to extract the mount point.  Allow certain errors through.
80  */
81 static __inline
82 int
83 keeperror(struct nlookupdata *nd, int error)
84 {
85 	if (error) {
86 		if ((nd->nl_flags & NLC_IGNBADDIR) == 0 ||
87 		   (error != EIO && error != EBADRPC && error != ESTALE)) {
88 			return 1;
89 		}
90 	}
91 	return 0;
92 }
93 
94 /*
95  * Initialize a nlookup() structure, early error return for copyin faults
96  * or a degenerate empty string (which is not allowed).
97  *
98  * The first process proc0's credentials are used if the calling thread
99  * is not associated with a process context.
100  *
101  * MPSAFE
102  */
103 int
104 nlookup_init(struct nlookupdata *nd,
105 	     const char *path, enum uio_seg seg, int flags)
106 {
107     size_t pathlen;
108     struct proc *p;
109     thread_t td;
110     int error;
111 
112     td = curthread;
113     p = td->td_proc;
114 
115     /*
116      * note: the pathlen set by copy*str() includes the terminating \0.
117      */
118     bzero(nd, sizeof(struct nlookupdata));
119     nd->nl_path = objcache_get(namei_oc, M_WAITOK);
120     nd->nl_flags |= NLC_HASBUF;
121     if (seg == UIO_SYSSPACE)
122 	error = copystr(path, nd->nl_path, MAXPATHLEN, &pathlen);
123     else
124 	error = copyinstr(path, nd->nl_path, MAXPATHLEN, &pathlen);
125 
126     /*
127      * Don't allow empty pathnames.
128      * POSIX.1 requirement: "" is not a vaild file name.
129      */
130     if (error == 0 && pathlen <= 1)
131 	error = ENOENT;
132 
133     if (error == 0) {
134 	if (p && p->p_fd) {
135 	    cache_copy_ncdir(p, &nd->nl_nch);
136 	    cache_copy(&p->p_fd->fd_nrdir, &nd->nl_rootnch);
137 	    if (p->p_fd->fd_njdir.ncp)
138 		cache_copy(&p->p_fd->fd_njdir, &nd->nl_jailnch);
139 	    nd->nl_cred = td->td_ucred;
140 	    nd->nl_flags |= NLC_BORROWCRED | NLC_NCDIR;
141 	} else {
142 	    cache_copy(&rootnch, &nd->nl_nch);
143 	    cache_copy(&nd->nl_nch, &nd->nl_rootnch);
144 	    cache_copy(&nd->nl_nch, &nd->nl_jailnch);
145 	    nd->nl_cred = proc0.p_ucred;
146 	    nd->nl_flags |= NLC_BORROWCRED;
147 	}
148 	nd->nl_td = td;
149 	nd->nl_flags |= flags;
150     } else {
151 	nlookup_done(nd);
152     }
153     return(error);
154 }
155 
156 
157 /*
158  * nlookup_init() for "at" family of syscalls.
159  *
160  * Works similarly to nlookup_init() but if path is relative and fd is not
161  * AT_FDCWD, path is interpreted relative to the directory pointed to by fd.
162  * In this case, the file entry pointed to by fd is ref'ed and returned in
163  * *fpp.
164  *
165  * If the call succeeds, nlookup_done_at() must be called to clean-up the nd
166  * and release the ref to the file entry.
167  */
168 int
169 nlookup_init_at(struct nlookupdata *nd, struct file **fpp, int fd,
170 		const char *path, enum uio_seg seg, int flags)
171 {
172 	struct thread *td = curthread;
173 	struct file* fp;
174 	struct vnode *vp;
175 	int error;
176 
177 	*fpp = NULL;
178 
179 	if  ((error = nlookup_init(nd, path, seg, flags)) != 0) {
180 		return (error);
181 	}
182 
183 	if (nd->nl_path[0] != '/' && fd != AT_FDCWD) {
184 		if ((error = holdvnode(td, fd, &fp)) != 0)
185 			goto done;
186 		vp = (struct vnode*)fp->f_data;
187 		if (vp->v_type != VDIR || fp->f_nchandle.ncp == NULL) {
188 			fdrop(fp);
189 			fp = NULL;
190 			error = ENOTDIR;
191 			goto done;
192 		}
193 		if (nd->nl_flags & NLC_NCDIR) {
194 			cache_drop_ncdir(&nd->nl_nch);
195 			nd->nl_flags &= ~NLC_NCDIR;
196 		} else {
197 			cache_drop(&nd->nl_nch);
198 		}
199 		cache_copy(&fp->f_nchandle, &nd->nl_nch);
200 		*fpp = fp;
201 	}
202 
203 
204 done:
205 	if (error)
206 		nlookup_done(nd);
207 	return (error);
208 
209 }
210 
211 /*
212  * This works similarly to nlookup_init() but does not assume a process
213  * context.  rootnch is always chosen for the root directory and the cred
214  * and starting directory are supplied in arguments.
215  */
216 int
217 nlookup_init_raw(struct nlookupdata *nd,
218 	     const char *path, enum uio_seg seg, int flags,
219 	     struct ucred *cred, struct nchandle *ncstart)
220 {
221     size_t pathlen;
222     thread_t td;
223     int error;
224 
225     td = curthread;
226 
227     bzero(nd, sizeof(struct nlookupdata));
228     nd->nl_path = objcache_get(namei_oc, M_WAITOK);
229     nd->nl_flags |= NLC_HASBUF;
230     if (seg == UIO_SYSSPACE)
231 	error = copystr(path, nd->nl_path, MAXPATHLEN, &pathlen);
232     else
233 	error = copyinstr(path, nd->nl_path, MAXPATHLEN, &pathlen);
234 
235     /*
236      * Don't allow empty pathnames.
237      * POSIX.1 requirement: "" is not a vaild file name.
238      */
239     if (error == 0 && pathlen <= 1)
240 	error = ENOENT;
241 
242     if (error == 0) {
243 	cache_copy(ncstart, &nd->nl_nch);
244 	cache_copy(&rootnch, &nd->nl_rootnch);
245 	cache_copy(&rootnch, &nd->nl_jailnch);
246 	nd->nl_cred = crhold(cred);
247 	nd->nl_td = td;
248 	nd->nl_flags |= flags;
249     } else {
250 	nlookup_done(nd);
251     }
252     return(error);
253 }
254 
255 /*
256  * This works similarly to nlookup_init_raw() but does not rely
257  * on rootnch being initialized yet.
258  */
259 int
260 nlookup_init_root(struct nlookupdata *nd,
261 	     const char *path, enum uio_seg seg, int flags,
262 	     struct ucred *cred, struct nchandle *ncstart,
263 	     struct nchandle *ncroot)
264 {
265     size_t pathlen;
266     thread_t td;
267     int error;
268 
269     td = curthread;
270 
271     bzero(nd, sizeof(struct nlookupdata));
272     nd->nl_path = objcache_get(namei_oc, M_WAITOK);
273     nd->nl_flags |= NLC_HASBUF;
274     if (seg == UIO_SYSSPACE)
275 	error = copystr(path, nd->nl_path, MAXPATHLEN, &pathlen);
276     else
277 	error = copyinstr(path, nd->nl_path, MAXPATHLEN, &pathlen);
278 
279     /*
280      * Don't allow empty pathnames.
281      * POSIX.1 requirement: "" is not a vaild file name.
282      */
283     if (error == 0 && pathlen <= 1)
284 	error = ENOENT;
285 
286     if (error == 0) {
287 	cache_copy(ncstart, &nd->nl_nch);
288 	cache_copy(ncroot, &nd->nl_rootnch);
289 	cache_copy(ncroot, &nd->nl_jailnch);
290 	nd->nl_cred = crhold(cred);
291 	nd->nl_td = td;
292 	nd->nl_flags |= flags;
293     } else {
294 	nlookup_done(nd);
295     }
296     return(error);
297 }
298 
#if 0
/*
 * Install a different credential in nd; it will be used by future
 * operations performed on nd.nl_open_vp and the nlookupdata structure.
 *
 * A hold is taken on the new credential.  The previous credential is
 * released unless it was only borrowed (NLC_BORROWCRED), and the borrowed
 * flag is cleared.  (Currently compiled out.)
 */
void
nlookup_set_cred(struct nlookupdata *nd, struct ucred *cred)
{
	KKASSERT(nd->nl_cred != NULL);

	if (nd->nl_cred == cred)
		return;

	cred = crhold(cred);
	if (!(nd->nl_flags & NLC_BORROWCRED))
		crfree(nd->nl_cred);
	nd->nl_flags &= ~NLC_BORROWCRED;
	nd->nl_cred = cred;
}
#endif
318 
319 /*
320  * Cleanup a nlookupdata structure after we are through with it.  This may
321  * be called on any nlookupdata structure initialized with nlookup_init().
322  * Calling nlookup_done() is mandatory in all cases except where nlookup_init()
323  * returns an error, even if as a consumer you believe you have taken all
324  * dynamic elements out of the nlookupdata structure.
325  */
326 void
327 nlookup_done(struct nlookupdata *nd)
328 {
329     if (nd->nl_nch.ncp) {
330 	if (nd->nl_flags & NLC_NCPISLOCKED) {
331 	    nd->nl_flags &= ~NLC_NCPISLOCKED;
332 	    cache_unlock(&nd->nl_nch);
333 	}
334 	if (nd->nl_flags & NLC_NCDIR) {
335 		cache_drop_ncdir(&nd->nl_nch);
336 		nd->nl_flags &= ~NLC_NCDIR;
337 	} else {
338 		cache_drop(&nd->nl_nch);	/* NULL's out the nch */
339 	}
340     }
341     if (nd->nl_rootnch.ncp)
342 	cache_drop_and_cache(&nd->nl_rootnch);
343     if (nd->nl_jailnch.ncp)
344 	cache_drop_and_cache(&nd->nl_jailnch);
345     if ((nd->nl_flags & NLC_HASBUF) && nd->nl_path) {
346 	objcache_put(namei_oc, nd->nl_path);
347 	nd->nl_path = NULL;
348     }
349     if (nd->nl_cred) {
350 	if ((nd->nl_flags & NLC_BORROWCRED) == 0)
351 	    crfree(nd->nl_cred);
352 	nd->nl_cred = NULL;
353 	nd->nl_flags &= ~NLC_BORROWCRED;
354     }
355     if (nd->nl_open_vp) {
356 	if (nd->nl_flags & NLC_LOCKVP) {
357 		vn_unlock(nd->nl_open_vp);
358 		nd->nl_flags &= ~NLC_LOCKVP;
359 	}
360 	vn_close(nd->nl_open_vp, nd->nl_vp_fmode, NULL);
361 	nd->nl_open_vp = NULL;
362     }
363     if (nd->nl_dvp) {
364 	vrele(nd->nl_dvp);
365 	nd->nl_dvp = NULL;
366     }
367     nd->nl_flags = 0;	/* clear remaining flags (just clear everything) */
368 }
369 
/*
 * Counterpart of nlookup_done() for a nd that was initialized with
 * nlookup_init_at(): cleans up nd and then drops the file entry ref that
 * nlookup_init_at() handed back, if there is one.
 */
void
nlookup_done_at(struct nlookupdata *nd, struct file *fp)
{
	nlookup_done(nd);
	if (fp)
		fdrop(fp);
}
381 
382 void
383 nlookup_zero(struct nlookupdata *nd)
384 {
385 	bzero(nd, sizeof(struct nlookupdata));
386 }
387 
388 /*
389  * Simple all-in-one nlookup.  Returns a locked namecache structure or NULL
390  * if an error occured.
391  *
392  * Note that the returned ncp is not checked for permissions, though VEXEC
393  * is checked on the directory path leading up to the result.  The caller
394  * must call naccess() to check the permissions of the returned leaf.
395  */
396 struct nchandle
397 nlookup_simple(const char *str, enum uio_seg seg,
398 	       int niflags, int *error)
399 {
400     struct nlookupdata nd;
401     struct nchandle nch;
402 
403     *error = nlookup_init(&nd, str, seg, niflags);
404     if (*error == 0) {
405 	    if ((*error = nlookup(&nd)) == 0) {
406 		    nch = nd.nl_nch;	/* keep hold ref from structure */
407 		    cache_zero(&nd.nl_nch); /* and NULL out */
408 	    } else {
409 		    cache_zero(&nch);
410 	    }
411 	    nlookup_done(&nd);
412     } else {
413 	    cache_zero(&nch);
414     }
415     return(nch);
416 }
417 
/*
 * Returns non-zero if nothing but '/' characters remains in the path at
 * ptr, i.e. the element just processed was the last one.
 */
static
int
islastelement(const char *ptr)
{
	const char *scan;

	/* skip over any run of slashes */
	for (scan = ptr; *scan == '/'; ++scan)
		;
	return (*scan == 0);
}
429 
430 /*
431  * Returns non-zero if we need to lock the namecache element
432  * exclusively.  Unless otherwise requested by NLC_SHAREDLOCK,
433  * the last element of the namecache lookup will be locked
434  * exclusively.
435  *
436  * NOTE: Even if we return on-zero, an unresolved namecache record
437  *	 will always be locked exclusively.
438  */
439 static __inline
440 int
441 wantsexcllock(struct nlookupdata *nd, const char *ptr)
442 {
443 	if ((nd->nl_flags & NLC_SHAREDLOCK) == 0)
444 		return(islastelement(ptr));
445 	return(0);
446 }
447 
448 
/*
 * Do a generic nlookup: walk nd->nl_path one component at a time, starting
 * at nd->nl_nch, and leave the result in nd->nl_nch.
 *
 * Note that the passed nd is not nlookup_done()'d on return, even if an
 * error occurs.  If no error occurs or NLC_CREATE is flagged and ENOENT
 * is returned, then the returned nl_nch is always referenced and locked
 * exclusively.
 *
 * WARNING: For any general error other than ENOENT w/NLC_CREATE, the
 *	    resulting nl_nch may or may not be locked and if locked
 *	    might be locked either shared or exclusive.
 *
 * Intermediate directory elements, including the current directory, require
 * execute (search) permission.  nlookup does not examine the access
 * permissions on the returned element.
 *
 * If NLC_CREATE is set the last directory must allow node creation,
 * and an error code of 0 will be returned for a non-existent
 * target (not ENOENT).
 *
 * If NLC_RENAME_DST is set the last directory must allow node deletion,
 * plus the sticky check is made, and an error code of 0 will be returned
 * for a non-existent target (not ENOENT).
 *
 * If NLC_DELETE is set the last directory must allow node deletion,
 * plus the sticky check is made.
 *
 * If NLC_REFDVP is set nd->nl_dvp will be set to the directory vnode
 * of the returned entry.  The vnode will be referenced, but not locked,
 * and will be released by nlookup_done() along with everything else.
 *
 * NOTE: As an optimization we attempt to obtain a shared namecache lock
 *	 on any intermediate elements.  On success, the returned element
 *	 is ALWAYS locked exclusively.
 *
 * NOTE(review): wantsexcllock() returns 0 for every element when
 *	 NLC_SHAREDLOCK is set, so the "always exclusive" statements above
 *	 presumably only hold when that flag is clear -- confirm against
 *	 the cache_*_maybe_shared() implementations.
 *
 * Returns 0 on success, otherwise an errno value.  Errors generated
 * directly in this function include ENAMETOOLONG (component of 256 or more
 * characters, or an expanded symlink path that is too long), ELOOP (too
 * many symlinks), ENOENT (empty symlink), ENOTDIR (trailing elements after
 * a non-directory), EBUSY (raced an unmount), EROFS (create/rename target
 * with NLC_NFS_RDONLY), and EEXIST/EINVAL/ENOTEMPTY/EACCES for the "." /
 * ".." / root special cases described in the body.
 */
int
nlookup(struct nlookupdata *nd)
{
    globaldata_t gd = mycpu;
    struct nlcomponent nlc;
    struct nchandle nch;
    struct nchandle par;
    struct nchandle nctmp;
    struct mount *mp;
    struct vnode *hvp;		/* hold to prevent recycling */
    int wasdotordotdot;		/* 0 = normal name, 1 = ".", 2 = ".." */
    char *ptr;
    char *nptr;
    int error;
    int len;
    int dflags;
    int hit = 1;		/* cleared if any element had to be resolved */
    /* flags as passed in (minus NLC_NCDIR), restored if we retry */
    int saveflag = nd->nl_flags & ~NLC_NCDIR;
    boolean_t doretry = FALSE;	/* set when a resolve returned ESTALE */
    boolean_t inretry = FALSE;	/* set once the single retry has started */

nlookup_start:
#ifdef KTRACE
    if (KTRPOINT(nd->nl_td, KTR_NAMEI))
	ktrnamei(nd->nl_td->td_lwp, nd->nl_path);
#endif
    bzero(&nlc, sizeof(nlc));

    /*
     * Setup for the loop.  The current working namecache element is
     * always at least referenced.  We lock it as required, but always
     * return a locked, resolved namecache entry.
     */
    nd->nl_loopcnt = 0;
    if (nd->nl_dvp) {
	vrele(nd->nl_dvp);
	nd->nl_dvp = NULL;
    }
    ptr = nd->nl_path;

    /*
     * Loop on the path components.  At the top of the loop nd->nl_nch
     * is ref'd and unlocked and represents our current position.
     */
    for (;;) {
	/*
	 * Make sure nl_nch is locked so we can access the vnode, resolution
	 * state, etc.
	 */
	if ((nd->nl_flags & NLC_NCPISLOCKED) == 0) {
		nd->nl_flags |= NLC_NCPISLOCKED;
		cache_lock_maybe_shared(&nd->nl_nch, wantsexcllock(nd, ptr));
	}

	/*
	 * Check if the root directory should replace the current
	 * directory.  This is done at the start of a translation
	 * or after a symbolic link has been found.  In other cases
	 * ptr will never be pointing at a '/'.
	 */
	if (*ptr == '/') {
	    do {
		++ptr;
	    } while (*ptr == '/');
	    cache_unlock(&nd->nl_nch);
	    cache_get_maybe_shared(&nd->nl_rootnch, &nch,
				   wantsexcllock(nd, ptr));
	    if (nd->nl_flags & NLC_NCDIR) {
		    cache_drop_ncdir(&nd->nl_nch);
		    nd->nl_flags &= ~NLC_NCDIR;
	    } else {
		    cache_drop(&nd->nl_nch);
	    }
	    nd->nl_nch = nch;		/* remains locked */

	    /*
	     * Fast-track termination.  There is no parent directory of
	     * the root in the same mount from the point of view of
	     * the caller so return EACCES if NLC_REFDVP is specified,
	     * and EEXIST if NLC_CREATE is also specified.
	     * e.g. 'rmdir /' or 'mkdir /' are not allowed.
	     */
	    if (*ptr == 0) {
		if (nd->nl_flags & NLC_REFDVP)
			error = (nd->nl_flags & NLC_CREATE) ? EEXIST : EACCES;
		else
			error = 0;
		break;
	    }
	    continue;
	}

	/*
	 * Pre-calculate next path component so we can check whether the
	 * current component directory is the last directory in the path
	 * or not.
	 */
	for (nptr = ptr; *nptr && *nptr != '/'; ++nptr)
		;

	/*
	 * Check directory search permissions (nd->nl_nch is locked & refd).
	 * This will load dflags to obtain directory-special permissions to
	 * be checked along with the last component.
	 *
	 * We only need to pass-in &dflags for the second-to-last component.
	 * Optimize by passing-in NULL for any prior components, which may
	 * allow the code to bypass the naccess() call.
	 */
	dflags = 0;
	if (*nptr == '/')
	    error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, NULL);
	else
	    error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, &dflags);
	if (error) {
	    if (keeperror(nd, error))
		    break;
	    error = 0;		/* ignorable under NLC_IGNBADDIR */
	}

	/*
	 * Extract the next (or last) path component.  Path components are
	 * limited to 255 characters.
	 */
	nlc.nlc_nameptr = ptr;
	nlc.nlc_namelen = nptr - ptr;
	ptr = nptr;
	if (nlc.nlc_namelen >= 256) {
	    error = ENAMETOOLONG;
	    break;
	}

	/*
	 * Lookup the path component in the cache, creating an unresolved
	 * entry if necessary.  We have to handle "." and ".." as special
	 * cases.
	 *
	 * When handling ".." we have to detect a traversal back through a
	 * mount point.   If we are at the root, ".." just returns the root.
	 *
	 * When handling "." or ".." we also have to recalculate dflags
	 * since our dflags will be for some sub-directory instead of the
	 * parent dir.
	 *
	 * This subsection returns a locked, refd 'nch' unless it errors out,
	 * and an unlocked but still ref'd nd->nl_nch.
	 *
	 * The namecache topology is not allowed to be disconnected, so
	 * encountering a NULL parent will generate EINVAL.  This typically
	 * occurs when a directory is removed out from under a process.
	 *
	 * WARNING! The unlocking of nd->nl_nch is sensitive code.
	 */
	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);

	if (nlc.nlc_namelen == 1 && nlc.nlc_nameptr[0] == '.') {
	    /*
	     * "." - re-acquire the current element as 'nch'.
	     */
	    cache_unlock(&nd->nl_nch);
	    nd->nl_flags &= ~NLC_NCPISLOCKED;
	    cache_get_maybe_shared(&nd->nl_nch, &nch, wantsexcllock(nd, ptr));
	    wasdotordotdot = 1;
	} else if (nlc.nlc_namelen == 2 &&
		   nlc.nlc_nameptr[0] == '.' && nlc.nlc_nameptr[1] == '.') {
	    if (nd->nl_nch.mount == nd->nl_rootnch.mount &&
		nd->nl_nch.ncp == nd->nl_rootnch.ncp
	    ) {
		/*
		 * ".." at the root returns the root
		 */
		cache_unlock(&nd->nl_nch);
		nd->nl_flags &= ~NLC_NCPISLOCKED;
		cache_get_maybe_shared(&nd->nl_nch, &nch,
				       wantsexcllock(nd, ptr));
	    } else {
		/*
		 * Locate the parent ncp.  If we are at the root of a
		 * filesystem mount we have to skip to the mounted-on
		 * point in the underlying filesystem.
		 *
		 * Expect the parent to always be good since the
		 * mountpoint doesn't go away.  XXX hack.  cache_get()
		 * requires the ncp to already have a ref as a safety.
		 *
		 * However, a process which has been broken out of a chroot
		 * will wind up with a NULL parent if it tries to '..' above
		 * the real root, deal with the case.  Note that this does
		 * not protect us from a jail breakout, it just stops a panic
		 * if the jail-broken process tries to '..' past the real
		 * root.
		 */
		nctmp = nd->nl_nch;
		while (nctmp.ncp == nctmp.mount->mnt_ncmountpt.ncp) {
			nctmp = nctmp.mount->mnt_ncmounton;
			if (nctmp.ncp == NULL)
				break;
		}
		if (nctmp.ncp == NULL) {
			if (curthread->td_proc) {
				kprintf("vfs_nlookup: '..' traverse broke "
					"jail: pid %d (%s)\n",
					curthread->td_proc->p_pid,
					curthread->td_comm);
			}
			nctmp = nd->nl_rootnch;
		} else {
			nctmp.ncp = nctmp.ncp->nc_parent;
		}
		cache_hold(&nctmp);
		cache_unlock(&nd->nl_nch);
		nd->nl_flags &= ~NLC_NCPISLOCKED;
		cache_get_maybe_shared(&nctmp, &nch, wantsexcllock(nd, ptr));
		cache_drop(&nctmp);		/* NOTE: zero's nctmp */
	    }
	    wasdotordotdot = 2;
	} else {
	    /*
	     * Must unlock nl_nch when traversing down the path.  However,
	     * the child ncp has not yet been found/created and the parent's
	     * child list might be empty.  Thus releasing the lock can
	     * allow a race whereby the parent ncp's vnode is recycled.
	     * This case can occur especially when maxvnodes is set very low.
	     *
	     * We need the parent's ncp to remain resolved for all normal
	     * filesystem activities, so we vhold() the vp during the lookup
	     * to prevent recycling due to vnlru / maxvnodes.
	     *
	     * If we race an unlink or rename the ncp might be marked
	     * DESTROYED after resolution, requiring a retry.
	     */
	    if ((hvp = nd->nl_nch.ncp->nc_vp) != NULL)
		vhold(hvp);
	    cache_unlock(&nd->nl_nch);
	    nd->nl_flags &= ~NLC_NCPISLOCKED;
	    error = cache_nlookup_maybe_shared(&nd->nl_nch, &nlc,
					       wantsexcllock(nd, ptr), &nch);
	    if (error == EWOULDBLOCK) {
		    /*
		     * The maybe-shared lookup could not complete; fall
		     * back to cache_nlookup() and resolve the entry here,
		     * looking it up again for as long as the resolve
		     * returns EAGAIN or the ncp is flagged DESTROYED.
		     */
		    nch = cache_nlookup(&nd->nl_nch, &nlc);
		    if (nch.ncp->nc_flag & NCF_UNRESOLVED)
			hit = 0;
		    for (;;) {
			error = cache_resolve(&nch, nd->nl_cred);
			if (error != EAGAIN &&
			    (nch.ncp->nc_flag & NCF_DESTROYED) == 0) {
				/*
				 * ESTALE requests one retry of the whole
				 * lookup; on the first pass it is turned
				 * into ENOENT.
				 */
				if (error == ESTALE) {
				    if (!inretry)
					error = ENOENT;
				    doretry = TRUE;
				}
				break;
			}
			kprintf("[diagnostic] nlookup: relookup %*.*s\n",
				nch.ncp->nc_nlen, nch.ncp->nc_nlen,
				nch.ncp->nc_name);
			cache_put(&nch);
			nch = cache_nlookup(&nd->nl_nch, &nlc);
		    }
	    }
	    if (hvp)
		vdrop(hvp);
	    wasdotordotdot = 0;
	}

	/*
	 * If the last component was "." or ".." our dflags no longer
	 * represents the parent directory and we have to explicitly
	 * look it up.
	 *
	 * Expect the parent to be good since nch is locked.
	 *
	 * NOTE(review): whatever is left in 'error' by this block is
	 * unconditionally overwritten by the resolve step below, so only
	 * the dflags result appears to survive -- confirm this is intended.
	 */
	if (wasdotordotdot && error == 0) {
	    dflags = 0;
	    if ((par.ncp = nch.ncp->nc_parent) != NULL) {
		par.mount = nch.mount;
		cache_hold(&par);
		cache_lock_maybe_shared(&par, wantsexcllock(nd, ptr));
		error = naccess(&par, 0, nd->nl_cred, &dflags);
		cache_put(&par);
		if (error) {
		    if (!keeperror(nd, error))
			    error = 0;
		}
	    }
	}

	/*
	 * [end of subsection]
	 *
	 * nch is locked and referenced.
	 * nd->nl_nch is unlocked and referenced.
	 *
	 * nl_nch must be unlocked or we could chain lock to the root
	 * if a resolve gets stuck (e.g. in NFS).
	 */
	KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0);

	/*
	 * Resolve the namespace if necessary.  The ncp returned by
	 * cache_nlookup() is referenced and locked.
	 *
	 * XXX neither '.' nor '..' should return EAGAIN since they were
	 * previously resolved and thus cannot be newly created ncp's.
	 */
	if (nch.ncp->nc_flag & NCF_UNRESOLVED) {
	    hit = 0;
	    error = cache_resolve(&nch, nd->nl_cred);
	    if (error == ESTALE) {
		if (!inretry)
		    error = ENOENT;
		doretry = TRUE;
	    }
	    KKASSERT(error != EAGAIN);
	} else {
	    error = nch.ncp->nc_error;
	}

	/*
	 * Early completion.  ENOENT is not an error if this is the last
	 * component and NLC_CREATE or NLC_RENAME_DST (rename target) was
	 * requested.  Note that ncp->nc_error is left as ENOENT in that
	 * case, which we check later on.
	 *
	 * Also handle invalid '.' or '..' components terminating a path
	 * for a create/rename/delete.  The standard requires this and pax
	 * pretty stupidly depends on it.
	 */
	if (islastelement(ptr)) {
	    if (error == ENOENT &&
		(nd->nl_flags & (NLC_CREATE | NLC_RENAME_DST))
	    ) {
		if (nd->nl_flags & NLC_NFS_RDONLY) {
			error = EROFS;
		} else {
			error = naccess(&nch, nd->nl_flags | dflags,
					nd->nl_cred, NULL);
		}
	    }
	    if (error == 0 && wasdotordotdot &&
		(nd->nl_flags & (NLC_CREATE | NLC_DELETE |
				 NLC_RENAME_SRC | NLC_RENAME_DST))) {
		/*
		 * POSIX junk: create -> EEXIST, delete of "." -> EINVAL,
		 * delete of ".." -> ENOTEMPTY, rename -> EINVAL.
		 */
		if (nd->nl_flags & NLC_CREATE)
			error = EEXIST;
		else if (nd->nl_flags & NLC_DELETE)
			error = (wasdotordotdot == 1) ? EINVAL : ENOTEMPTY;
		else
			error = EINVAL;
	    }
	}

	/*
	 * Early completion on error.
	 */
	if (error) {
	    cache_put(&nch);
	    break;
	}

	/*
	 * If the element is a symlink and it is either not the last
	 * element or it is the last element and we are allowed to
	 * follow symlinks, resolve the symlink.
	 */
	if ((nch.ncp->nc_flag & NCF_ISSYMLINK) &&
	    (*ptr || (nd->nl_flags & NLC_FOLLOW))
	) {
	    if (nd->nl_loopcnt++ >= MAXSYMLINKS) {
		error = ELOOP;
		cache_put(&nch);
		break;
	    }
	    error = nreadsymlink(nd, &nch, &nlc);
	    cache_put(&nch);
	    if (error)
		break;

	    /*
	     * Concatenate trailing path elements onto the returned symlink.
	     * Note that if the path component (ptr) is not exhausted, it
	     * will begin with a '/', so we do not have to add another one.
	     *
	     * The symlink may not be empty.
	     */
	    len = strlen(ptr);
	    if (nlc.nlc_namelen == 0 || nlc.nlc_namelen + len >= MAXPATHLEN) {
		error = nlc.nlc_namelen ? ENAMETOOLONG : ENOENT;
		objcache_put(namei_oc, nlc.nlc_nameptr);
		break;
	    }
	    bcopy(ptr, nlc.nlc_nameptr + nlc.nlc_namelen, len + 1);
	    /*
	     * The symlink buffer becomes the new path; release the old
	     * path buffer if we owned it.
	     */
	    if (nd->nl_flags & NLC_HASBUF)
		objcache_put(namei_oc, nd->nl_path);
	    nd->nl_path = nlc.nlc_nameptr;
	    nd->nl_flags |= NLC_HASBUF;
	    ptr = nd->nl_path;

	    /*
	     * Go back up to the top to resolve any initial '/'s in the
	     * symlink.
	     */
	    continue;
	}

	/*
	 * If the element is a directory and we are crossing a mount point,
	 * Locate the mount.
	 */
	while ((nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
	    (nd->nl_flags & NLC_NOCROSSMOUNT) == 0 &&
	    (mp = cache_findmount(&nch)) != NULL
	) {
	    struct vnode *tdp;
	    int vfs_do_busy = 0;

	    /*
	     * VFS must be busied before the namecache entry is locked,
	     * but we don't want to waste time calling vfs_busy() if the
	     * mount point is already resolved.
	     */
again:
	    cache_put(&nch);
	    if (vfs_do_busy) {
		while (vfs_busy(mp, 0)) {
		    if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
			kprintf("nlookup: warning umount race avoided\n");
			cache_dropmount(mp);
			error = EBUSY;
			vfs_do_busy = 0;
			/* nch was already released above */
			goto double_break;
		    }
		}
	    }
	    cache_get_maybe_shared(&mp->mnt_ncmountpt, &nch,
				   wantsexcllock(nd, ptr));

	    if (nch.ncp->nc_flag & NCF_UNRESOLVED) {
		/*
		 * Unresolved mount root: go around once more with the
		 * mount busied, then ask the filesystem for its root.
		 */
		if (vfs_do_busy == 0) {
		    vfs_do_busy = 1;
		    goto again;
		}
		error = VFS_ROOT(mp, &tdp);
		vfs_unbusy(mp);
		vfs_do_busy = 0;
		if (keeperror(nd, error)) {
		    cache_dropmount(mp);
		    break;
		}
		if (error == 0) {
		    cache_setvp(&nch, tdp);
		    vput(tdp);
		}
	    }
	    if (vfs_do_busy)
		vfs_unbusy(mp);
	    cache_dropmount(mp);
	}

	if (keeperror(nd, error)) {
	    cache_put(&nch);
double_break:
	    break;
	}

	/*
	 * Skip any slashes to get to the next element.  If there
	 * are any slashes at all the current element must be a
	 * directory or, in the create case, intended to become a directory.
	 * If it isn't we break without incrementing ptr and fall through
	 * to the failure case below.
	 */
	while (*ptr == '/') {
	    if ((nch.ncp->nc_flag & NCF_ISDIR) == 0 &&
		!(nd->nl_flags & NLC_WILLBEDIR)
	    ) {
		break;
	    }
	    ++ptr;
	}

	/*
	 * Continuation case: additional elements and the current
	 * element is a directory.
	 */
	if (*ptr && (nch.ncp->nc_flag & NCF_ISDIR)) {
	    if (nd->nl_flags & NLC_NCDIR) {
		    cache_drop_ncdir(&nd->nl_nch);
		    nd->nl_flags &= ~NLC_NCDIR;
	    } else {
		    cache_drop(&nd->nl_nch);
	    }
	    cache_unlock(&nch);
	    KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0);
	    nd->nl_nch = nch;
	    continue;
	}

	/*
	 * Failure case: additional elements and the current element
	 * is not a directory
	 */
	if (*ptr) {
	    cache_put(&nch);
	    error = ENOTDIR;
	    break;
	}

	/*
	 * Successful lookup of last element.
	 *
	 * Check permissions if the target exists.  If the target does not
	 * exist directory permissions were already tested in the early
	 * completion code above.
	 *
	 * nd->nl_flags will be adjusted on return with NLC_APPENDONLY
	 * if the file is marked append-only, and NLC_STICKY if the directory
	 * containing the file is sticky.
	 */
	if (nch.ncp->nc_vp && (nd->nl_flags & NLC_ALLCHKS)) {
	    error = naccess(&nch, nd->nl_flags | dflags,
			    nd->nl_cred, NULL);
	    if (keeperror(nd, error)) {
		cache_put(&nch);
		break;
	    }
	}

	/*
	 * Termination: no more elements.
	 *
	 * If NLC_REFDVP is set acquire a referenced parent dvp.
	 */
	if (nd->nl_flags & NLC_REFDVP) {
		cache_lock(&nd->nl_nch);
		error = cache_vref(&nd->nl_nch, nd->nl_cred, &nd->nl_dvp);
		cache_unlock(&nd->nl_nch);
		if (keeperror(nd, error)) {
			kprintf("NLC_REFDVP: Cannot ref dvp of %p\n", nch.ncp);
			cache_put(&nch);
			break;
		}
	}
	/*
	 * Hand the locked leaf over to nd->nl_nch, dropping the previous
	 * position first.
	 */
	if (nd->nl_flags & NLC_NCDIR) {
		cache_drop_ncdir(&nd->nl_nch);
		nd->nl_flags &= ~NLC_NCDIR;
	} else {
		cache_drop(&nd->nl_nch);
	}
	nd->nl_nch = nch;
	nd->nl_flags |= NLC_NCPISLOCKED;
	error = 0;
	break;
    }

    /*
     * Per-cpu statistics: count a long hit only if no element had to be
     * resolved during the walk.
     */
    if (hit)
	++gd->gd_nchstats->ncs_longhits;
    else
	++gd->gd_nchstats->ncs_longmiss;

    if (nd->nl_flags & NLC_NCPISLOCKED)
	KKASSERT(cache_lockstatus(&nd->nl_nch) > 0);

    /*
     * Retry the whole thing if doretry flag is set, but only once.
     * autofs(5) may mount another filesystem under its root directory
     * while resolving a path.
     *
     * Only the current NLC_NCDIR bit is kept (it selects how nl_nch has
     * to be dropped); every other flag is restored from the snapshot
     * taken on entry.
     */
    if (doretry && !inretry) {
	inretry = TRUE;
	nd->nl_flags &= NLC_NCDIR;
	nd->nl_flags |= saveflag;
	goto nlookup_start;
    }

    /*
     * NOTE: If NLC_CREATE was set the ncp may represent a negative hit
     * (ncp->nc_error will be ENOENT), but we will still return an error
     * code of 0.
     */
    return(error);
}
1062 
1063 /*
1064  * Resolve a mount point's glue ncp.  This ncp connects creates the illusion
1065  * of continuity in the namecache tree by connecting the ncp related to the
1066  * vnode under the mount to the ncp related to the mount's root vnode.
1067  *
1068  * If no error occured a locked, ref'd ncp is stored in *ncpp.
1069  */
1070 int
1071 nlookup_mp(struct mount *mp, struct nchandle *nch)
1072 {
1073     struct vnode *vp;
1074     int error;
1075 
1076     error = 0;
1077     cache_get(&mp->mnt_ncmountpt, nch);
1078     if (nch->ncp->nc_flag & NCF_UNRESOLVED) {
1079 	while (vfs_busy(mp, 0))
1080 	    ;
1081 	error = VFS_ROOT(mp, &vp);
1082 	vfs_unbusy(mp);
1083 	if (error) {
1084 	    cache_put(nch);
1085 	} else {
1086 	    cache_setvp(nch, vp);
1087 	    vput(vp);
1088 	}
1089     }
1090     return(error);
1091 }
1092 
1093 /*
1094  * Read the contents of a symlink, allocate a path buffer out of the
1095  * namei_oc and initialize the supplied nlcomponent with the result.
1096  *
1097  * If an error occurs no buffer will be allocated or returned in the nlc.
1098  */
1099 int
1100 nreadsymlink(struct nlookupdata *nd, struct nchandle *nch,
1101 		struct nlcomponent *nlc)
1102 {
1103     struct vnode *vp;
1104     struct iovec aiov;
1105     struct uio auio;
1106     int linklen;
1107     int error;
1108     char *cp;
1109 
1110     nlc->nlc_nameptr = NULL;
1111     nlc->nlc_namelen = 0;
1112     if (nch->ncp->nc_vp == NULL)
1113 	return(ENOENT);
1114     if ((error = cache_vget(nch, nd->nl_cred, LK_SHARED, &vp)) != 0)
1115 	return(error);
1116     cp = objcache_get(namei_oc, M_WAITOK);
1117     aiov.iov_base = cp;
1118     aiov.iov_len = MAXPATHLEN;
1119     auio.uio_iov = &aiov;
1120     auio.uio_iovcnt = 1;
1121     auio.uio_offset = 0;
1122     auio.uio_rw = UIO_READ;
1123     auio.uio_segflg = UIO_SYSSPACE;
1124     auio.uio_td = nd->nl_td;
1125     auio.uio_resid = MAXPATHLEN - 1;
1126     error = VOP_READLINK(vp, &auio, nd->nl_cred);
1127     if (error)
1128 	goto fail;
1129     linklen = MAXPATHLEN - 1 - auio.uio_resid;
1130     if (varsym_enable) {
1131 	linklen = varsymreplace(cp, linklen, MAXPATHLEN - 1);
1132 	if (linklen < 0) {
1133 	    error = ENAMETOOLONG;
1134 	    goto fail;
1135 	}
1136     }
1137     cp[linklen] = 0;
1138     nlc->nlc_nameptr = cp;
1139     nlc->nlc_namelen = linklen;
1140     vput(vp);
1141     return(0);
1142 fail:
1143     objcache_put(namei_oc, cp);
1144     vput(vp);
1145     return(error);
1146 }
1147 
1148 /*
1149  * Check access [XXX cache vattr!] [XXX quota]
1150  *
1151  * Generally check the NLC_* access bits.   All specified bits must pass
1152  * for this function to return 0.
1153  *
1154  * The file does not have to exist when checking NLC_CREATE or NLC_RENAME_DST
1155  * access, otherwise it must exist.  No error is returned in this case.
1156  *
1157  * The file must not exist if NLC_EXCL is specified.
1158  *
1159  * Directory permissions in general are tested for NLC_CREATE if the file
1160  * does not exist, NLC_DELETE if the file does exist, and NLC_RENAME_DST
1161  * whether the file exists or not.
1162  *
1163  * The directory sticky bit is tested for NLC_DELETE and NLC_RENAME_DST,
1164  * the latter is only tested if the target exists.
1165  *
1166  * The passed ncp must be referenced and locked.  If it is already resolved
1167  * it may be locked shared but otherwise should be locked exclusively.
1168  */
1169 
1170 #define S_WXOK_MASK	(S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
1171 
1172 static int
1173 naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp)
1174 {
1175     struct vnode *vp;
1176     struct vattr va;
1177     struct namecache *ncp;
1178     int error;
1179     int cflags;
1180 
1181     KKASSERT(cache_lockstatus(nch) > 0);
1182 
1183     ncp = nch->ncp;
1184     if (ncp->nc_flag & NCF_UNRESOLVED) {
1185 	cache_resolve(nch, cred);
1186 	ncp = nch->ncp;
1187     }
1188     error = ncp->nc_error;
1189 
1190     /*
1191      * Directory permissions checks.  Silently ignore ENOENT if these
1192      * tests pass.  It isn't an error.
1193      *
1194      * We can safely resolve ncp->nc_parent because ncp is currently
1195      * locked.
1196      */
1197     if (nflags & (NLC_CREATE | NLC_DELETE | NLC_RENAME_SRC | NLC_RENAME_DST)) {
1198 	if (((nflags & NLC_CREATE) && ncp->nc_vp == NULL) ||
1199 	    ((nflags & NLC_DELETE) && ncp->nc_vp != NULL) ||
1200 	    ((nflags & NLC_RENAME_SRC) && ncp->nc_vp != NULL) ||
1201 	    (nflags & NLC_RENAME_DST)
1202 	) {
1203 	    struct nchandle par;
1204 
1205 	    if ((par.ncp = ncp->nc_parent) == NULL) {
1206 		if (error != EAGAIN)
1207 			error = EINVAL;
1208 	    } else if (error == 0 || error == ENOENT) {
1209 		par.mount = nch->mount;
1210 		cache_hold(&par);
1211 		cache_lock_maybe_shared(&par, 0);
1212 		error = naccess(&par, NLC_WRITE, cred, NULL);
1213 		cache_put(&par);
1214 	    }
1215 	}
1216     }
1217 
1218     /*
1219      * NLC_EXCL check.  Target file must not exist.
1220      */
1221     if (error == 0 && (nflags & NLC_EXCL) && ncp->nc_vp != NULL)
1222 	error = EEXIST;
1223 
1224     /*
1225      * Try to short-cut the vnode operation for intermediate directory
1226      * components.  This is a major SMP win because it avoids having
1227      * to execute a lot of code for intermediate directory components,
1228      * including shared refs and locks on intermediate directory vnodes.
1229      *
1230      * We can only do this if the caller does not need nflagsp.
1231      */
1232     if (error == 0 && nflagsp == NULL &&
1233 	nflags == NLC_EXEC && (ncp->nc_flag & NCF_WXOK)) {
1234 	return 0;
1235     }
1236 
1237     /*
1238      * Get the vnode attributes so we can do the rest of our checks.
1239      *
1240      * NOTE: We only call naccess_va() if the target exists.
1241      */
1242     if (error == 0) {
1243 	error = cache_vget(nch, cred, LK_SHARED, &vp);
1244 	if (error == ENOENT) {
1245 	    /*
1246 	     * Silently zero-out ENOENT if creating or renaming
1247 	     * (rename target).  It isn't an error.
1248 	     */
1249 	    if (nflags & (NLC_CREATE | NLC_RENAME_DST))
1250 		error = 0;
1251 	} else if (error == 0) {
1252 	    /*
1253 	     * Get the vnode attributes and check for illegal O_TRUNC
1254 	     * requests and read-only mounts.
1255 	     *
1256 	     * NOTE: You can still open devices on read-only mounts for
1257 	     * 	     writing.
1258 	     *
1259 	     * NOTE: creates/deletes/renames are handled by the NLC_WRITE
1260 	     *	     check on the parent directory above.
1261 	     *
1262 	     * XXX cache the va in the namecache or in the vnode
1263 	     */
1264 	    error = VOP_GETATTR(vp, &va);
1265 	    if (error == 0 && (nflags & NLC_TRUNCATE)) {
1266 		switch(va.va_type) {
1267 		case VREG:
1268 		case VDATABASE:
1269 		case VCHR:
1270 		case VBLK:
1271 		case VFIFO:
1272 		    break;
1273 		case VDIR:
1274 		    error = EISDIR;
1275 		    break;
1276 		default:
1277 		    error = EINVAL;
1278 		    break;
1279 		}
1280 	    }
1281 	    if (error == 0 && (nflags & NLC_WRITE) && vp->v_mount &&
1282 		(vp->v_mount->mnt_flag & MNT_RDONLY)
1283 	    ) {
1284 		switch(va.va_type) {
1285 		case VDIR:
1286 		case VLNK:
1287 		case VREG:
1288 		case VDATABASE:
1289 		    error = EROFS;
1290 		    break;
1291 		default:
1292 		    break;
1293 		}
1294 	    }
1295 	    vput(vp);
1296 
1297 	    /*
1298 	     * Check permissions based on file attributes.  The passed
1299 	     * flags (*nflagsp) are modified with feedback based on
1300 	     * special attributes and requirements.
1301 	     */
1302 	    if (error == 0) {
1303 		/*
1304 		 * Adjust the returned (*nflagsp) if non-NULL.
1305 		 */
1306 		if (nflagsp) {
1307 		    if ((va.va_mode & VSVTX) && va.va_uid != cred->cr_uid)
1308 			*nflagsp |= NLC_STICKY;
1309 		    if (va.va_flags & APPEND)
1310 			*nflagsp |= NLC_APPENDONLY;
1311 		    if (va.va_flags & IMMUTABLE)
1312 			*nflagsp |= NLC_IMMUTABLE;
1313 		}
1314 
1315 		/*
1316 		 * NCF_WXOK can be set for world-searchable directories.
1317 		 *
1318 		 * XXX When we implement capabilities this code would also
1319 		 * need a cap check, or only set the flag if there are no
1320 		 * capabilities.
1321 		 */
1322 		cflags = 0;
1323 		if (va.va_type == VDIR &&
1324 		    (va.va_mode & S_WXOK_MASK) == S_WXOK_MASK) {
1325 			cflags |= NCF_WXOK;
1326 		}
1327 
1328 		/*
1329 		 * Track swapcache management flags in the namecache.
1330 		 *
1331 		 * Calculate the flags based on the current vattr info
1332 		 * and recalculate the inherited flags from the parent
1333 		 * (the original cache linkage may have occurred without
1334 		 * getattrs and thus have stale flags).
1335 		 */
1336 		if (va.va_flags & SF_NOCACHE)
1337 			cflags |= NCF_SF_NOCACHE;
1338 		if (va.va_flags & UF_CACHE)
1339 			cflags |= NCF_UF_CACHE;
1340 		if (ncp->nc_parent) {
1341 			if (ncp->nc_parent->nc_flag &
1342 			    (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) {
1343 				cflags |= NCF_SF_PNOCACHE;
1344 			}
1345 			if (ncp->nc_parent->nc_flag &
1346 			    (NCF_UF_CACHE | NCF_UF_PCACHE)) {
1347 				cflags |= NCF_UF_PCACHE;
1348 			}
1349 		}
1350 
1351 		/*
1352 		 * We're not supposed to update nc_flag when holding a shared
1353 		 * lock, but we allow the case for certain flags.  Note that
1354 		 * holding an exclusive lock allows updating nc_flag without
1355 		 * atomics.  nc_flag is not allowe to be updated at all unless
1356 		 * a shared or exclusive lock is held.
1357 		 */
1358 		atomic_clear_short(&ncp->nc_flag,
1359 				   (NCF_SF_NOCACHE | NCF_UF_CACHE |
1360 				   NCF_SF_PNOCACHE | NCF_UF_PCACHE |
1361 				   NCF_WXOK) & ~cflags);
1362 		atomic_set_short(&ncp->nc_flag, cflags);
1363 
1364 		/*
1365 		 * Process general access.
1366 		 */
1367 		error = naccess_va(&va, nflags, cred);
1368 	    }
1369 	}
1370     }
1371     return(error);
1372 }
1373 
1374 /*
1375  * Check the requested access against the given vattr using cred.
1376  */
1377 int
1378 naccess_va(struct vattr *va, int nflags, struct ucred *cred)
1379 {
1380     int i;
1381     int vmode;
1382 
1383     /*
1384      * Test the immutable bit.  Creations, deletions, renames (source
1385      * or destination) are not allowed.  chown/chmod/other is also not
1386      * allowed but is handled by SETATTR.  Hardlinks to the immutable
1387      * file are allowed.
1388      *
1389      * If the directory is set to immutable then creations, deletions,
1390      * renames (source or dest) and hardlinks to files within the directory
1391      * are not allowed, and regular files opened through the directory may
1392      * not be written to or truncated (unless a special device).
1393      *
1394      * NOTE!  New hardlinks to immutable files work but new hardlinks to
1395      * files, immutable or not, sitting inside an immutable directory are
1396      * not allowed.  As always if the file is hardlinked via some other
1397      * path additional hardlinks may be possible even if the file is marked
1398      * immutable.  The sysop needs to create a closure by checking the hard
1399      * link count.  Once closure is achieved you are good, and security
1400      * scripts should check link counts anyway.
1401      *
1402      * Writes and truncations are only allowed on special devices.
1403      */
1404     if ((va->va_flags & IMMUTABLE) || (nflags & NLC_IMMUTABLE)) {
1405 	if ((nflags & NLC_IMMUTABLE) && (nflags & NLC_HLINK))
1406 	    return (EPERM);
1407 	if (nflags & (NLC_CREATE | NLC_DELETE |
1408 		      NLC_RENAME_SRC | NLC_RENAME_DST)) {
1409 	    return (EPERM);
1410 	}
1411 	if (nflags & (NLC_WRITE | NLC_TRUNCATE)) {
1412 	    switch(va->va_type) {
1413 	    case VDIR:
1414 		return (EISDIR);
1415 	    case VLNK:
1416 	    case VREG:
1417 	    case VDATABASE:
1418 		return (EPERM);
1419 	    default:
1420 		break;
1421 	    }
1422 	}
1423     }
1424 
1425     /*
1426      * Test the no-unlink and append-only bits for opens, rename targets,
1427      * and deletions.  These bits are not tested for creations or
1428      * rename sources.
1429      *
1430      * Unlike FreeBSD we allow a file with APPEND set to be renamed.
1431      * If you do not wish this you must also set NOUNLINK.
1432      *
1433      * If the governing directory is marked APPEND-only it implies
1434      * NOUNLINK for all entries in the directory.
1435      */
1436     if (((va->va_flags & NOUNLINK) || (nflags & NLC_APPENDONLY)) &&
1437 	(nflags & (NLC_DELETE | NLC_RENAME_SRC | NLC_RENAME_DST))
1438     ) {
1439 	return (EPERM);
1440     }
1441 
1442     /*
1443      * A file marked append-only may not be deleted but can be renamed.
1444      */
1445     if ((va->va_flags & APPEND) &&
1446 	(nflags & (NLC_DELETE | NLC_RENAME_DST))
1447     ) {
1448 	return (EPERM);
1449     }
1450 
1451     /*
1452      * A file marked append-only which is opened for writing must also
1453      * be opened O_APPEND.
1454      */
1455     if ((va->va_flags & APPEND) && (nflags & (NLC_OPEN | NLC_TRUNCATE))) {
1456 	if (nflags & NLC_TRUNCATE)
1457 	    return (EPERM);
1458 	if ((nflags & (NLC_OPEN | NLC_WRITE)) == (NLC_OPEN | NLC_WRITE)) {
1459 	    if ((nflags & NLC_APPEND) == 0)
1460 		return (EPERM);
1461 	}
1462     }
1463 
1464     /*
1465      * root gets universal access
1466      */
1467     if (cred->cr_uid == 0)
1468 	return(0);
1469 
1470     /*
1471      * Check owner perms.
1472      *
1473      * If NLC_OWN is set the owner of the file is allowed no matter when
1474      * the owner-mode bits say (utimes).
1475      */
1476     vmode = 0;
1477     if (nflags & NLC_READ)
1478 	vmode |= S_IRUSR;
1479     if (nflags & NLC_WRITE)
1480 	vmode |= S_IWUSR;
1481     if (nflags & NLC_EXEC)
1482 	vmode |= S_IXUSR;
1483 
1484     if (cred->cr_uid == va->va_uid) {
1485 	if ((nflags & NLC_OWN) == 0) {
1486 	    if ((vmode & va->va_mode) != vmode)
1487 		return(EACCES);
1488 	}
1489 	return(0);
1490     }
1491 
1492     /*
1493      * If NLC_STICKY is set only the owner may delete or rename a file.
1494      * This bit is typically set on /tmp.
1495      *
1496      * Note that the NLC_READ/WRITE/EXEC bits are not typically set in
1497      * the specific delete or rename case.  For deletions and renames we
1498      * usually just care about directory permissions, not file permissions.
1499      */
1500     if ((nflags & NLC_STICKY) &&
1501 	(nflags & (NLC_RENAME_SRC | NLC_RENAME_DST | NLC_DELETE))) {
1502 	return(EACCES);
1503     }
1504 
1505     /*
1506      * Check group perms
1507      */
1508     vmode >>= 3;
1509     for (i = 0; i < cred->cr_ngroups; ++i) {
1510 	if (va->va_gid == cred->cr_groups[i]) {
1511 	    if ((vmode & va->va_mode) != vmode)
1512 		return(EACCES);
1513 	    return(0);
1514 	}
1515     }
1516 
1517     /*
1518      * Check world perms
1519      */
1520     vmode >>= 3;
1521     if ((vmode & va->va_mode) != vmode)
1522 	return(EACCES);
1523     return(0);
1524 }
1525 
1526 /*
1527  * Long-term (10-second interval) statistics collection
1528  */
1529 static
1530 uint64_t
1531 collect_nlookup_callback(int n)
1532 {
1533 	static uint64_t last_total;
1534 	uint64_t save;
1535 	uint64_t total;
1536 
1537 	total = 0;
1538 	for (n = 0; n < ncpus; ++n) {
1539 		globaldata_t gd = globaldata_find(n);
1540 		struct nchstats *sp;
1541 
1542 		if ((sp = gd->gd_nchstats) != NULL)
1543 			total += sp->ncs_longhits + sp->ncs_longmiss;
1544 	}
1545 	save = total;
1546 	total = total - last_total;
1547 	last_total = save;
1548 
1549 	return total;
1550 }
1551 
1552 static
1553 void
1554 nlookup_collect_init(void *dummy __unused)
1555 {
1556 	kcollect_register(KCOLLECT_NLOOKUP, "nlookup", collect_nlookup_callback,
1557 			  KCOLLECT_SCALE(KCOLLECT_NLOOKUP_FORMAT, 0));
1558 }
1559 SYSINIT(collect_nlookup, SI_SUB_PROP, SI_ORDER_ANY, nlookup_collect_init, 0);
1560