xref: /dragonfly/sys/kern/vfs_nlookup.c (revision 38b5d46c)
1 /*
2  * Copyright (c) 2004 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * nlookup() is the 'new' namei interface.  Rather than return directory and
36  * leaf vnodes (in various lock states) the new interface instead deals in
37  * namecache records.  Namecache records may represent either a positive or
38  * a negative hit.  The namespace is locked via the namecache record instead
39  * of via the vnode, and only the leaf namecache record (representing the
40  * filename) needs to be locked.
41  *
42  * This greatly improves filesystem parallelism and is a huge simplification
43  * of the API versus the old vnode locking / namei scheme.
44  *
45  * Filesystems must actively control the caching aspects of the namecache,
46  * and since namecache pointers are used as handles they are non-optional
47  * even for filesystems which do not generally wish to cache things.  It is
48  * intended that a separate cache coherency API will be constructed to handle
49  * these issues.
50  */
51 
52 #include "opt_ktrace.h"
53 
54 #include <sys/param.h>
55 #include <sys/systm.h>
56 #include <sys/kernel.h>
57 #include <sys/vnode.h>
58 #include <sys/mount.h>
59 #include <sys/filedesc.h>
60 #include <sys/proc.h>
61 #include <sys/namei.h>
62 #include <sys/nlookup.h>
63 #include <sys/malloc.h>
64 #include <sys/stat.h>
65 #include <sys/objcache.h>
66 #include <sys/file.h>
67 
68 #ifdef KTRACE
69 #include <sys/ktrace.h>
70 #endif
71 
72 static int naccess(struct nchandle *nch, int vmode, struct ucred *cred,
73 		int *stickyp);
74 
75 /*
76  * Initialize a nlookup() structure, early error return for copyin faults
77  * or a degenerate empty string (which is not allowed).
78  *
79  * The first process proc0's credentials are used if the calling thread
80  * is not associated with a process context.
81  *
82  * MPSAFE
83  */
84 int
85 nlookup_init(struct nlookupdata *nd,
86 	     const char *path, enum uio_seg seg, int flags)
87 {
88     size_t pathlen;
89     struct proc *p;
90     thread_t td;
91     int error;
92 
93     td = curthread;
94     p = td->td_proc;
95 
96     /*
97      * note: the pathlen set by copy*str() includes the terminating \0.
98      */
99     bzero(nd, sizeof(struct nlookupdata));
100     nd->nl_path = objcache_get(namei_oc, M_WAITOK);
101     nd->nl_flags |= NLC_HASBUF;
102     if (seg == UIO_SYSSPACE)
103 	error = copystr(path, nd->nl_path, MAXPATHLEN, &pathlen);
104     else
105 	error = copyinstr(path, nd->nl_path, MAXPATHLEN, &pathlen);
106 
107     /*
108      * Don't allow empty pathnames.
109      * POSIX.1 requirement: "" is not a vaild file name.
110      */
111     if (error == 0 && pathlen <= 1)
112 	error = ENOENT;
113 
114     if (error == 0) {
115 	if (p && p->p_fd) {
116 	    cache_copy_ncdir(p, &nd->nl_nch);
117 	    cache_copy(&p->p_fd->fd_nrdir, &nd->nl_rootnch);
118 	    if (p->p_fd->fd_njdir.ncp)
119 		cache_copy(&p->p_fd->fd_njdir, &nd->nl_jailnch);
120 	    nd->nl_cred = td->td_ucred;
121 	    nd->nl_flags |= NLC_BORROWCRED | NLC_NCDIR;
122 	} else {
123 	    cache_copy(&rootnch, &nd->nl_nch);
124 	    cache_copy(&nd->nl_nch, &nd->nl_rootnch);
125 	    cache_copy(&nd->nl_nch, &nd->nl_jailnch);
126 	    nd->nl_cred = proc0.p_ucred;
127 	    nd->nl_flags |= NLC_BORROWCRED;
128 	}
129 	nd->nl_td = td;
130 	nd->nl_flags |= flags;
131     } else {
132 	nlookup_done(nd);
133     }
134     return(error);
135 }
136 
137 
138 /*
139  * nlookup_init() for "at" family of syscalls.
140  *
141  * Works similarly to nlookup_init() but if path is relative and fd is not
142  * AT_FDCWD, path is interpreted relative to the directory pointed to by fd.
143  * In this case, the file entry pointed to by fd is ref'ed and returned in
144  * *fpp.
145  *
146  * If the call succeeds, nlookup_done_at() must be called to clean-up the nd
147  * and release the ref to the file entry.
148  */
149 int
150 nlookup_init_at(struct nlookupdata *nd, struct file **fpp, int fd,
151 		const char *path, enum uio_seg seg, int flags)
152 {
153 	struct thread *td = curthread;
154 	struct proc *p = td->td_proc;
155 	struct file* fp;
156 	struct vnode *vp;
157 	int error;
158 
159 	*fpp = NULL;
160 
161 	if  ((error = nlookup_init(nd, path, seg, flags)) != 0) {
162 		return (error);
163 	}
164 
165 	if (nd->nl_path[0] != '/' && fd != AT_FDCWD) {
166 		if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
167 			goto done;
168 		vp = (struct vnode*)fp->f_data;
169 		if (vp->v_type != VDIR || fp->f_nchandle.ncp == NULL) {
170 			fdrop(fp);
171 			fp = NULL;
172 			error = ENOTDIR;
173 			goto done;
174 		}
175 		if (nd->nl_flags & NLC_NCDIR) {
176 			cache_drop_ncdir(&nd->nl_nch);
177 			nd->nl_flags &= ~NLC_NCDIR;
178 		} else {
179 			cache_drop(&nd->nl_nch);
180 		}
181 		cache_copy(&fp->f_nchandle, &nd->nl_nch);
182 		*fpp = fp;
183 	}
184 
185 
186 done:
187 	if (error)
188 		nlookup_done(nd);
189 	return (error);
190 
191 }
192 
193 /*
194  * This works similarly to nlookup_init() but does not assume a process
195  * context.  rootnch is always chosen for the root directory and the cred
196  * and starting directory are supplied in arguments.
197  */
198 int
199 nlookup_init_raw(struct nlookupdata *nd,
200 	     const char *path, enum uio_seg seg, int flags,
201 	     struct ucred *cred, struct nchandle *ncstart)
202 {
203     size_t pathlen;
204     thread_t td;
205     int error;
206 
207     td = curthread;
208 
209     bzero(nd, sizeof(struct nlookupdata));
210     nd->nl_path = objcache_get(namei_oc, M_WAITOK);
211     nd->nl_flags |= NLC_HASBUF;
212     if (seg == UIO_SYSSPACE)
213 	error = copystr(path, nd->nl_path, MAXPATHLEN, &pathlen);
214     else
215 	error = copyinstr(path, nd->nl_path, MAXPATHLEN, &pathlen);
216 
217     /*
218      * Don't allow empty pathnames.
219      * POSIX.1 requirement: "" is not a vaild file name.
220      */
221     if (error == 0 && pathlen <= 1)
222 	error = ENOENT;
223 
224     if (error == 0) {
225 	cache_copy(ncstart, &nd->nl_nch);
226 	cache_copy(&rootnch, &nd->nl_rootnch);
227 	cache_copy(&rootnch, &nd->nl_jailnch);
228 	nd->nl_cred = crhold(cred);
229 	nd->nl_td = td;
230 	nd->nl_flags |= flags;
231     } else {
232 	nlookup_done(nd);
233     }
234     return(error);
235 }
236 
237 /*
238  * This works similarly to nlookup_init_raw() but does not rely
239  * on rootnch being initialized yet.
240  */
241 int
242 nlookup_init_root(struct nlookupdata *nd,
243 	     const char *path, enum uio_seg seg, int flags,
244 	     struct ucred *cred, struct nchandle *ncstart,
245 	     struct nchandle *ncroot)
246 {
247     size_t pathlen;
248     thread_t td;
249     int error;
250 
251     td = curthread;
252 
253     bzero(nd, sizeof(struct nlookupdata));
254     nd->nl_path = objcache_get(namei_oc, M_WAITOK);
255     nd->nl_flags |= NLC_HASBUF;
256     if (seg == UIO_SYSSPACE)
257 	error = copystr(path, nd->nl_path, MAXPATHLEN, &pathlen);
258     else
259 	error = copyinstr(path, nd->nl_path, MAXPATHLEN, &pathlen);
260 
261     /*
262      * Don't allow empty pathnames.
263      * POSIX.1 requirement: "" is not a vaild file name.
264      */
265     if (error == 0 && pathlen <= 1)
266 	error = ENOENT;
267 
268     if (error == 0) {
269 	cache_copy(ncstart, &nd->nl_nch);
270 	cache_copy(ncroot, &nd->nl_rootnch);
271 	cache_copy(ncroot, &nd->nl_jailnch);
272 	nd->nl_cred = crhold(cred);
273 	nd->nl_td = td;
274 	nd->nl_flags |= flags;
275     } else {
276 	nlookup_done(nd);
277     }
278     return(error);
279 }
280 
#if 0
/*
 * Install a different credential in nd (currently compiled out).
 *
 * A reference is taken on the new credential with crhold().  The previous
 * credential is released with crfree() unless it was only borrowed
 * (NLC_BORROWCRED), and NLC_BORROWCRED is cleared afterwards.  Nothing
 * happens when cred is already the credential stored in nd.
 */
void
nlookup_set_cred(struct nlookupdata *nd, struct ucred *cred)
{
	struct ucred *heldcred;

	KKASSERT(nd->nl_cred != NULL);

	if (nd->nl_cred == cred)
		return;

	heldcred = crhold(cred);
	if ((nd->nl_flags & NLC_BORROWCRED) == 0)
		crfree(nd->nl_cred);
	nd->nl_flags &= ~NLC_BORROWCRED;
	nd->nl_cred = heldcred;
}
#endif
300 
301 /*
302  * Cleanup a nlookupdata structure after we are through with it.  This may
303  * be called on any nlookupdata structure initialized with nlookup_init().
304  * Calling nlookup_done() is mandatory in all cases except where nlookup_init()
305  * returns an error, even if as a consumer you believe you have taken all
306  * dynamic elements out of the nlookupdata structure.
307  */
308 void
309 nlookup_done(struct nlookupdata *nd)
310 {
311     if (nd->nl_nch.ncp) {
312 	if (nd->nl_flags & NLC_NCPISLOCKED) {
313 	    nd->nl_flags &= ~NLC_NCPISLOCKED;
314 	    cache_unlock(&nd->nl_nch);
315 	}
316 	if (nd->nl_flags & NLC_NCDIR) {
317 		cache_drop_ncdir(&nd->nl_nch);
318 		nd->nl_flags &= ~NLC_NCDIR;
319 	} else {
320 		cache_drop(&nd->nl_nch);	/* NULL's out the nch */
321 	}
322     }
323     if (nd->nl_rootnch.ncp)
324 	cache_drop_and_cache(&nd->nl_rootnch);
325     if (nd->nl_jailnch.ncp)
326 	cache_drop_and_cache(&nd->nl_jailnch);
327     if ((nd->nl_flags & NLC_HASBUF) && nd->nl_path) {
328 	objcache_put(namei_oc, nd->nl_path);
329 	nd->nl_path = NULL;
330     }
331     if (nd->nl_cred) {
332 	if ((nd->nl_flags & NLC_BORROWCRED) == 0)
333 	    crfree(nd->nl_cred);
334 	nd->nl_cred = NULL;
335 	nd->nl_flags &= ~NLC_BORROWCRED;
336     }
337     if (nd->nl_open_vp) {
338 	if (nd->nl_flags & NLC_LOCKVP) {
339 		vn_unlock(nd->nl_open_vp);
340 		nd->nl_flags &= ~NLC_LOCKVP;
341 	}
342 	vn_close(nd->nl_open_vp, nd->nl_vp_fmode, NULL);
343 	nd->nl_open_vp = NULL;
344     }
345     if (nd->nl_dvp) {
346 	vrele(nd->nl_dvp);
347 	nd->nl_dvp = NULL;
348     }
349     nd->nl_flags = 0;	/* clear remaining flags (just clear everything) */
350 }
351 
/*
 * Counterpart of nlookup_done() for structures set up by
 * nlookup_init_at(): cleans up nd and then drops the file entry
 * reference, if one was returned (fp may be NULL).
 */
void
nlookup_done_at(struct nlookupdata *nd, struct file *fp)
{
	nlookup_done(nd);
	if (fp == NULL)
		return;
	fdrop(fp);
}
363 
364 void
365 nlookup_zero(struct nlookupdata *nd)
366 {
367 	bzero(nd, sizeof(struct nlookupdata));
368 }
369 
370 /*
371  * Simple all-in-one nlookup.  Returns a locked namecache structure or NULL
372  * if an error occured.
373  *
374  * Note that the returned ncp is not checked for permissions, though VEXEC
375  * is checked on the directory path leading up to the result.  The caller
376  * must call naccess() to check the permissions of the returned leaf.
377  */
378 struct nchandle
379 nlookup_simple(const char *str, enum uio_seg seg,
380 	       int niflags, int *error)
381 {
382     struct nlookupdata nd;
383     struct nchandle nch;
384 
385     *error = nlookup_init(&nd, str, seg, niflags);
386     if (*error == 0) {
387 	    if ((*error = nlookup(&nd)) == 0) {
388 		    nch = nd.nl_nch;	/* keep hold ref from structure */
389 		    cache_zero(&nd.nl_nch); /* and NULL out */
390 	    } else {
391 		    cache_zero(&nch);
392 	    }
393 	    nlookup_done(&nd);
394     } else {
395 	    cache_zero(&nch);
396     }
397     return(nch);
398 }
399 
/*
 * Returns non-zero if nothing but '/' characters (possibly none) remain
 * at ptr, i.e. the path element just processed was the last one.
 */
static
int
islastelement(const char *ptr)
{
	const char *scan;

	for (scan = ptr; *scan == '/'; ++scan)
		continue;
	return (*scan == '\0');
}
411 
412 /*
413  * Returns non-zero if we need to lock the namecache element
414  * exclusively.  Unless otherwise requested by NLC_SHAREDLOCK,
415  * the last element of the namecache lookup will be locked
416  * exclusively.
417  *
418  * NOTE: Even if we return on-zero, an unresolved namecache record
419  *	 will always be locked exclusively.
420  */
421 static __inline
422 int
423 wantsexcllock(struct nlookupdata *nd, const char *ptr)
424 {
425 	if ((nd->nl_flags & NLC_SHAREDLOCK) == 0)
426 		return(islastelement(ptr));
427 	return(0);
428 }
429 
430 
431 /*
432  * Do a generic nlookup.  Note that the passed nd is not nlookup_done()'d
433  * on return, even if an error occurs.  If no error occurs or NLC_CREATE
434  * is flagged and ENOENT is returned, then the returned nl_nch is always
435  * referenced and locked exclusively.
436  *
437  * WARNING: For any general error other than ENOENT w/NLC_CREATE,
438  *	    the resulting nl_nch may or may not be locked and if locked
439  *	    might be locked either shared or exclusive.
440  *
441  * Intermediate directory elements, including the current directory, require
442  * execute (search) permission.  nlookup does not examine the access
443  * permissions on the returned element.
444  *
445  * If NLC_CREATE is set the last directory must allow node creation,
446  * and an error code of 0 will be returned for a non-existent
447  * target (not ENOENT).
448  *
449  * If NLC_RENAME_DST is set the last directory must allow node deletion,
450  * plus the sticky check is made, and an error code of 0 will be returned
451  * for a non-existent target (not ENOENT).
452  *
453  * If NLC_DELETE is set the last directory must allow node deletion,
454  * plus the sticky check is made.
455  *
456  * If NLC_REFDVP is set nd->nl_dvp will be set to the directory vnode
457  * of the returned entry.  The vnode will be referenced, but not locked,
458  * and will be released by nlookup_done() along with everything else.
459  *
460  * NOTE: As an optimization we attempt to obtain a shared namecache lock
461  *	 on any intermediate elements.  On success, the returned element
462  *	 is ALWAYS locked exclusively.
463  */
464 int
465 nlookup(struct nlookupdata *nd)
466 {
467     globaldata_t gd = mycpu;
468     struct nlcomponent nlc;
469     struct nchandle nch;
470     struct nchandle par;
471     struct nchandle nctmp;
472     struct mount *mp;
473     struct vnode *hvp;		/* hold to prevent recyclement */
474     int wasdotordotdot;
475     char *ptr;
476     int error;
477     int len;
478     int dflags;
479     int hit = 1;		/* cleared when an unresolved ncp is seen */
480     int saveflag = nd->nl_flags & ~NLC_NCDIR;	/* restored on retry */
481     boolean_t doretry = FALSE;	/* set when cache_resolve() gives ESTALE */
482     boolean_t inretry = FALSE;	/* TRUE while running the retry pass */
483 
484 nlookup_start:
485 #ifdef KTRACE
486     if (KTRPOINT(nd->nl_td, KTR_NAMEI))
487 	ktrnamei(nd->nl_td->td_lwp, nd->nl_path);
488 #endif
489     bzero(&nlc, sizeof(nlc));
490 
491     /*
492      * Setup for the loop.  The current working namecache element is
493      * always at least referenced.  We lock it as required, but always
494      * return a locked, resolved namecache entry.
495      */
496     nd->nl_loopcnt = 0;
497     if (nd->nl_dvp) {
498 	vrele(nd->nl_dvp);
499 	nd->nl_dvp = NULL;
500     }
501     ptr = nd->nl_path;
502 
503     /*
504      * Loop on the path components.  At the top of the loop nd->nl_nch
505      * is ref'd and unlocked and represents our current position.
506      */
507     for (;;) {
508 	/*
509 	 * Make sure nl_nch is locked so we can access the vnode, resolution
510 	 * state, etc.
511 	 */
512 	if ((nd->nl_flags & NLC_NCPISLOCKED) == 0) {
513 		nd->nl_flags |= NLC_NCPISLOCKED;
514 		cache_lock_maybe_shared(&nd->nl_nch, wantsexcllock(nd, ptr));
515 	}
516 
517 	/*
518 	 * Check if the root directory should replace the current
519 	 * directory.  This is done at the start of a translation
520 	 * or after a symbolic link has been found.  In other cases
521 	 * ptr will never be pointing at a '/'.
522 	 */
523 	if (*ptr == '/') {
524 	    do {
525 		++ptr;
526 	    } while (*ptr == '/');
527 	    cache_unlock(&nd->nl_nch);
528 	    cache_get_maybe_shared(&nd->nl_rootnch, &nch,
529 				   wantsexcllock(nd, ptr));
530 	    if (nd->nl_flags & NLC_NCDIR) {
531 		    cache_drop_ncdir(&nd->nl_nch);
532 		    nd->nl_flags &= ~NLC_NCDIR;
533 	    } else {
534 		    cache_drop(&nd->nl_nch);
535 	    }
536 	    nd->nl_nch = nch;		/* remains locked */
537 
538 	    /*
539 	     * Fast-track termination.  There is no parent directory of
540 	     * the root in the same mount from the point of view of
541 	     * the caller so return EACCES if NLC_REFDVP is specified,
542 	     * and EEXIST if NLC_CREATE is also specified.
543 	     * e.g. 'rmdir /' or 'mkdir /' are not allowed.
544 	     */
545 	    if (*ptr == 0) {
546 		if (nd->nl_flags & NLC_REFDVP)
547 			error = (nd->nl_flags & NLC_CREATE) ? EEXIST : EACCES;
548 		else
549 			error = 0;
550 		break;
551 	    }
552 	    continue;
553 	}
554 
555 	/*
556 	 * Check directory search permissions (nd->nl_nch is locked & refd)
557 	 */
558 	dflags = 0;
559 	error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, &dflags);
560 	if (error)
561 	    break;
562 
563 	/*
564 	 * Extract the path component.  Path components are limited to
565 	 * 255 characters.
566 	 */
567 	nlc.nlc_nameptr = ptr;
568 	while (*ptr && *ptr != '/')
569 	    ++ptr;
570 	nlc.nlc_namelen = ptr - nlc.nlc_nameptr;
571 	if (nlc.nlc_namelen >= 256) {
572 	    error = ENAMETOOLONG;
573 	    break;
574 	}
575 
576 	/*
577 	 * Lookup the path component in the cache, creating an unresolved
578 	 * entry if necessary.  We have to handle "." and ".." as special
579 	 * cases.
580 	 *
581 	 * When handling ".." we have to detect a traversal back through a
582 	 * mount point.   If we are at the root, ".." just returns the root.
583 	 *
584 	 * When handling "." or ".." we also have to recalculate dflags
585 	 * since our dflags will be for some sub-directory instead of the
586 	 * parent dir.
587 	 *
588 	 * This subsection returns a locked, refd 'nch' unless it errors out,
589 	 * and an unlocked but still ref'd nd->nl_nch.
590 	 *
591 	 * The namecache topology is not allowed to be disconnected, so
592 	 * encountering a NULL parent will generate EINVAL.  This typically
593 	 * occurs when a directory is removed out from under a process.
594 	 *
595 	 * WARNING! The unlocking of nd->nl_nch is sensitive code.
596 	 */
597 	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
598 
599 	if (nlc.nlc_namelen == 1 && nlc.nlc_nameptr[0] == '.') {
600 	    cache_unlock(&nd->nl_nch);
601 	    nd->nl_flags &= ~NLC_NCPISLOCKED;
602 	    cache_get_maybe_shared(&nd->nl_nch, &nch, wantsexcllock(nd, ptr));
603 	    wasdotordotdot = 1;
604 	} else if (nlc.nlc_namelen == 2 &&
605 		   nlc.nlc_nameptr[0] == '.' && nlc.nlc_nameptr[1] == '.') {
606 	    if (nd->nl_nch.mount == nd->nl_rootnch.mount &&
607 		nd->nl_nch.ncp == nd->nl_rootnch.ncp
608 	    ) {
609 		/*
610 		 * ".." at the root returns the root
611 		 */
612 		cache_unlock(&nd->nl_nch);
613 		nd->nl_flags &= ~NLC_NCPISLOCKED;
614 		cache_get_maybe_shared(&nd->nl_nch, &nch,
615 				       wantsexcllock(nd, ptr));
616 	    } else {
617 		/*
618 		 * Locate the parent ncp.  If we are at the root of a
619 		 * filesystem mount we have to skip to the mounted-on
620 		 * point in the underlying filesystem.
621 		 *
622 		 * Expect the parent to always be good since the
623 		 * mountpoint doesn't go away.  XXX hack.  cache_get()
624 		 * requires the ncp to already have a ref as a safety.
625 		 *
626 		 * However, a process which has been broken out of a chroot
627 		 * will wind up with a NULL parent if it tries to '..' above
628 		 * the real root, deal with the case.  Note that this does
629 		 * not protect us from a jail breakout, it just stops a panic
630 		 * if the jail-broken process tries to '..' past the real
631 		 * root.
632 		 */
633 		nctmp = nd->nl_nch;
634 		while (nctmp.ncp == nctmp.mount->mnt_ncmountpt.ncp) {
635 			nctmp = nctmp.mount->mnt_ncmounton;
636 			if (nctmp.ncp == NULL)
637 				break;
638 		}
639 		if (nctmp.ncp == NULL) {
640 			if (curthread->td_proc) {
641 				kprintf("vfs_nlookup: '..' traverse broke "
642 					"jail: pid %d (%s)\n",
643 					curthread->td_proc->p_pid,
644 					curthread->td_comm);
645 			}
646 			nctmp = nd->nl_rootnch;
647 		} else {
648 			nctmp.ncp = nctmp.ncp->nc_parent;
649 		}
650 		cache_hold(&nctmp);
651 		cache_unlock(&nd->nl_nch);
652 		nd->nl_flags &= ~NLC_NCPISLOCKED;
653 		cache_get_maybe_shared(&nctmp, &nch, wantsexcllock(nd, ptr));
654 		cache_drop(&nctmp);		/* NOTE: zero's nctmp */
655 	    }
656 	    wasdotordotdot = 2;
657 	} else {
658 	    /*
659 	     * Must unlock nl_nch when traversing down the path.  However,
660 	     * the child ncp has not yet been found/created and the parent's
661 	     * child list might be empty.  Thus releasing the lock can
662 	     * allow a race whereby the parent ncp's vnode is recycled.
663 	     * This case can occur especially when maxvnodes is set very low.
664 	     *
665 	     * We need the parent's ncp to remain resolved for all normal
666 	     * filesystem activities, so we vhold() the vp during the lookup
667 	     * to prevent recyclement due to vnlru / maxvnodes.
668 	     *
669 	     * If we race an unlink or rename the ncp might be marked
670 	     * DESTROYED after resolution, requiring a retry.
671 	     */
672 	    if ((hvp = nd->nl_nch.ncp->nc_vp) != NULL)
673 		vhold(hvp);
674 	    cache_unlock(&nd->nl_nch);
675 	    nd->nl_flags &= ~NLC_NCPISLOCKED;
676 	    error = cache_nlookup_maybe_shared(&nd->nl_nch, &nlc,
677 					       wantsexcllock(nd, ptr), &nch);
678 	    if (error == EWOULDBLOCK) {
679 		    nch = cache_nlookup(&nd->nl_nch, &nlc);
680 		    if (nch.ncp->nc_flag & NCF_UNRESOLVED)
681 			hit = 0;
682 		    for (;;) {
683 			error = cache_resolve(&nch, nd->nl_cred);
684 			if (error != EAGAIN &&
685 			    (nch.ncp->nc_flag & NCF_DESTROYED) == 0) {
686 				if (error == ESTALE) {
687 				    if (!inretry)
688 					error = ENOENT;
689 				    doretry = TRUE;
690 				}
691 				break;
692 			}
693 			kprintf("[diagnostic] nlookup: relookup %*.*s\n",
694 				nch.ncp->nc_nlen, nch.ncp->nc_nlen,
695 				nch.ncp->nc_name);
696 			cache_put(&nch);
697 			nch = cache_nlookup(&nd->nl_nch, &nlc);
698 		    }
699 	    }
700 	    if (hvp)
701 		vdrop(hvp);
702 	    wasdotordotdot = 0;
703 	}
704 
705 	/*
706 	 * If the last component was "." or ".." our dflags no longer
707 	 * represents the parent directory and we have to explicitly
708 	 * look it up.
709 	 *
710 	 * Expect the parent to be good since nch is locked.
711 	 */
712 	if (wasdotordotdot && error == 0) {
713 	    dflags = 0;
714 	    if ((par.ncp = nch.ncp->nc_parent) != NULL) {
715 		par.mount = nch.mount;
716 		cache_hold(&par);
717 		cache_lock_maybe_shared(&par, wantsexcllock(nd, ptr));
718 		error = naccess(&par, 0, nd->nl_cred, &dflags);
719 		cache_put(&par);
720 	    }
721 	}
722 
723 	/*
724 	 * [end of subsection]
725 	 *
726 	 * nch is locked and referenced.
727 	 * nd->nl_nch is unlocked and referenced.
728 	 *
729 	 * nl_nch must be unlocked or we could chain lock to the root
730 	 * if a resolve gets stuck (e.g. in NFS).
731 	 */
732 	KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0);
733 
734 	/*
735 	 * Resolve the namespace if necessary.  The ncp returned by
736 	 * cache_nlookup() is referenced and locked.
737 	 *
738 	 * XXX neither '.' nor '..' should return EAGAIN since they were
739 	 * previously resolved and thus cannot be newly created ncp's.
	 *
	 * ESTALE from cache_resolve() always requests a retry (doretry).
	 * On the first pass it is reported as ENOENT; on the retry pass
	 * (inretry) the ESTALE is left in place.
740 	 */
741 	if (nch.ncp->nc_flag & NCF_UNRESOLVED) {
742 	    hit = 0;
743 	    error = cache_resolve(&nch, nd->nl_cred);
744 	    if (error == ESTALE) {
745 		if (!inretry)
746 		    error = ENOENT;
747 		doretry = TRUE;
748 	    }
749 	    KKASSERT(error != EAGAIN);
750 	} else {
751 	    error = nch.ncp->nc_error;
752 	}
753 
754 	/*
755 	 * Early completion.  ENOENT is not an error if this is the last
756 	 * component and NLC_CREATE or NLC_RENAME_DST (rename target) was
757 	 * requested.  Note that ncp->nc_error is left as ENOENT in that
758 	 * case, which we check later on.
759 	 *
760 	 * Also handle invalid '.' or '..' components terminating a path
761 	 * for a create/rename/delete.  The standard requires this and pax
762 	 * pretty stupidly depends on it.
763 	 */
764 	if (islastelement(ptr)) {
765 	    if (error == ENOENT &&
766 		(nd->nl_flags & (NLC_CREATE | NLC_RENAME_DST))
767 	    ) {
768 		if (nd->nl_flags & NLC_NFS_RDONLY) {
769 			error = EROFS;
770 		} else {
771 			error = naccess(&nch, nd->nl_flags | dflags,
772 					nd->nl_cred, NULL);
773 		}
774 	    }
775 	    if (error == 0 && wasdotordotdot &&
776 		(nd->nl_flags & (NLC_CREATE | NLC_DELETE |
777 				 NLC_RENAME_SRC | NLC_RENAME_DST))) {
778 		/*
779 		 * POSIX junk
780 		 */
781 		if (nd->nl_flags & NLC_CREATE)
782 			error = EEXIST;
783 		else if (nd->nl_flags & NLC_DELETE)
784 			error = (wasdotordotdot == 1) ? EINVAL : ENOTEMPTY;
785 		else
786 			error = EINVAL;
787 	    }
788 	}
789 
790 	/*
791 	 * Early completion on error.
792 	 */
793 	if (error) {
794 	    cache_put(&nch);
795 	    break;
796 	}
797 
798 	/*
799 	 * If the element is a symlink and it is either not the last
800 	 * element or it is the last element and we are allowed to
801 	 * follow symlinks, resolve the symlink.
802 	 */
803 	if ((nch.ncp->nc_flag & NCF_ISSYMLINK) &&
804 	    (*ptr || (nd->nl_flags & NLC_FOLLOW))
805 	) {
806 	    if (nd->nl_loopcnt++ >= MAXSYMLINKS) {
807 		error = ELOOP;
808 		cache_put(&nch);
809 		break;
810 	    }
811 	    error = nreadsymlink(nd, &nch, &nlc);
812 	    cache_put(&nch);
813 	    if (error)
814 		break;
815 
816 	    /*
817 	     * Concatenate trailing path elements onto the returned symlink.
818 	     * Note that if the path component (ptr) is not exhausted, it
819 	     * will begin with a '/', so we do not have to add another one.
820 	     *
821 	     * The symlink may not be empty.
822 	     */
823 	    len = strlen(ptr);
824 	    if (nlc.nlc_namelen == 0 || nlc.nlc_namelen + len >= MAXPATHLEN) {
825 		error = nlc.nlc_namelen ? ENAMETOOLONG : ENOENT;
826 		objcache_put(namei_oc, nlc.nlc_nameptr);
827 		break;
828 	    }
829 	    bcopy(ptr, nlc.nlc_nameptr + nlc.nlc_namelen, len + 1);
830 	    if (nd->nl_flags & NLC_HASBUF)
831 		objcache_put(namei_oc, nd->nl_path);
832 	    nd->nl_path = nlc.nlc_nameptr;
833 	    nd->nl_flags |= NLC_HASBUF;
834 	    ptr = nd->nl_path;
835 
836 	    /*
837 	     * Go back up to the top to resolve any initial '/'s in the
838 	     * symlink.
839 	     */
840 	    continue;
841 	}
842 
843 	/*
844 	 * If the element is a directory and we are crossing a mount point,
845 	 * Locate the mount.
846 	 */
847 	while ((nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
848 	    (nd->nl_flags & NLC_NOCROSSMOUNT) == 0 &&
849 	    (mp = cache_findmount(&nch)) != NULL
850 	) {
851 	    struct vnode *tdp;
852 	    int vfs_do_busy = 0;
853 
854 	    /*
855 	     * VFS must be busied before the namecache entry is locked,
856 	     * but we don't want to waste time calling vfs_busy() if the
857 	     * mount point is already resolved.
858 	     */
859 again:
860 	    cache_put(&nch);
861 	    if (vfs_do_busy) {
862 		while (vfs_busy(mp, 0)) {
863 		    if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
864 			kprintf("nlookup: warning umount race avoided\n");
865 			cache_dropmount(mp);
866 			error = EBUSY;
867 			vfs_do_busy = 0;
868 			goto double_break;
869 		    }
870 		}
871 	    }
872 	    cache_get_maybe_shared(&mp->mnt_ncmountpt, &nch,
873 				   wantsexcllock(nd, ptr));
874 
875 	    if (nch.ncp->nc_flag & NCF_UNRESOLVED) {
876 		if (vfs_do_busy == 0) {
877 		    vfs_do_busy = 1;
878 		    goto again;
879 		}
880 		error = VFS_ROOT(mp, &tdp);
881 		vfs_unbusy(mp);
882 		vfs_do_busy = 0;
883 		if (error) {
884 		    cache_dropmount(mp);
885 		    break;
886 		}
887 		cache_setvp(&nch, tdp);
888 		vput(tdp);
889 	    }
890 	    if (vfs_do_busy)
891 		vfs_unbusy(mp);
892 	    cache_dropmount(mp);
893 	}
894 
	/*
	 * The double_break label is reached by goto from the umount race
	 * above, after nch has already been cache_put(), so it sits past
	 * the cache_put() here.
	 */
895 	if (error) {
896 	    cache_put(&nch);
897 double_break:
898 	    break;
899 	}
900 
901 	/*
902 	 * Skip any slashes to get to the next element.  If there
903 	 * are any slashes at all the current element must be a
904 	 * directory or, in the create case, intended to become a directory.
905 	 * If it isn't we break without incrementing ptr and fall through
906 	 * to the failure case below.
907 	 */
908 	while (*ptr == '/') {
909 	    if ((nch.ncp->nc_flag & NCF_ISDIR) == 0 &&
910 		!(nd->nl_flags & NLC_WILLBEDIR)
911 	    ) {
912 		break;
913 	    }
914 	    ++ptr;
915 	}
916 
917 	/*
918 	 * Continuation case: additional elements and the current
919 	 * element is a directory.
920 	 */
921 	if (*ptr && (nch.ncp->nc_flag & NCF_ISDIR)) {
922 	    if (nd->nl_flags & NLC_NCDIR) {
923 		    cache_drop_ncdir(&nd->nl_nch);
924 		    nd->nl_flags &= ~NLC_NCDIR;
925 	    } else {
926 		    cache_drop(&nd->nl_nch);
927 	    }
928 	    cache_unlock(&nch);
929 	    KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0);
930 	    nd->nl_nch = nch;
931 	    continue;
932 	}
933 
934 	/*
935 	 * Failure case: additional elements and the current element
936 	 * is not a directory
937 	 */
938 	if (*ptr) {
939 	    cache_put(&nch);
940 	    error = ENOTDIR;
941 	    break;
942 	}
943 
944 	/*
945 	 * Successful lookup of last element.
946 	 *
947 	 * Check permissions if the target exists.  If the target does not
948 	 * exist directory permissions were already tested in the early
949 	 * completion code above.
950 	 *
951 	 * nd->nl_flags will be adjusted on return with NLC_APPENDONLY
952 	 * if the file is marked append-only, and NLC_STICKY if the directory
953 	 * containing the file is sticky.
954 	 */
955 	if (nch.ncp->nc_vp && (nd->nl_flags & NLC_ALLCHKS)) {
956 	    error = naccess(&nch, nd->nl_flags | dflags,
957 			    nd->nl_cred, NULL);
958 	    if (error) {
959 		cache_put(&nch);
960 		break;
961 	    }
962 	}
963 
964 	/*
965 	 * Termination: no more elements.
966 	 *
967 	 * If NLC_REFDVP is set acquire a referenced parent dvp.
968 	 */
969 	if (nd->nl_flags & NLC_REFDVP) {
970 		cache_lock(&nd->nl_nch);
971 		error = cache_vref(&nd->nl_nch, nd->nl_cred, &nd->nl_dvp);
972 		cache_unlock(&nd->nl_nch);
973 		if (error) {
974 			kprintf("NLC_REFDVP: Cannot ref dvp of %p\n", nch.ncp);
975 			cache_put(&nch);
976 			break;
977 		}
978 	}
979 	if (nd->nl_flags & NLC_NCDIR) {
980 		cache_drop_ncdir(&nd->nl_nch);
981 		nd->nl_flags &= ~NLC_NCDIR;
982 	} else {
983 		cache_drop(&nd->nl_nch);
984 	}
985 	nd->nl_nch = nch;
986 	nd->nl_flags |= NLC_NCPISLOCKED;
987 	error = 0;
988 	break;
989     }
990 
    /*
     * Account the lookup in the per-cpu namecache statistics: a long hit
     * unless an unresolved ncp was encountered along the way.
     */
991     if (hit)
992 	++gd->gd_nchstats->ncs_longhits;
993     else
994 	++gd->gd_nchstats->ncs_longmiss;
995 
996     if (nd->nl_flags & NLC_NCPISLOCKED)
997 	KKASSERT(cache_lockstatus(&nd->nl_nch) > 0);
998 
999     /*
1000      * Retry the whole thing if doretry flag is set, but only once.
1001      * autofs(5) may mount another filesystem under its root directory
1002      * while resolving a path.
     *
     * Before restarting, nl_flags is reduced to just its NLC_NCDIR bit
     * and the flags saved on entry (which exclude NLC_NCDIR) are OR'd
     * back in.
1003      */
1004     if (doretry && !inretry) {
1005 	inretry = TRUE;
1006 	nd->nl_flags &= NLC_NCDIR;
1007 	nd->nl_flags |= saveflag;
1008 	goto nlookup_start;
1009     }
1010 
1011     /*
1012      * NOTE: If NLC_CREATE was set the ncp may represent a negative hit
1013      * (ncp->nc_error will be ENOENT), but we will still return an error
1014      * code of 0.
1015      */
1016     return(error);
1017 }
1018 
1019 /*
1020  * Resolve a mount point's glue ncp.  This ncp connects creates the illusion
1021  * of continuity in the namecache tree by connecting the ncp related to the
1022  * vnode under the mount to the ncp related to the mount's root vnode.
1023  *
1024  * If no error occured a locked, ref'd ncp is stored in *ncpp.
1025  */
1026 int
1027 nlookup_mp(struct mount *mp, struct nchandle *nch)
1028 {
1029     struct vnode *vp;
1030     int error;
1031 
1032     error = 0;
1033     cache_get(&mp->mnt_ncmountpt, nch);
1034     if (nch->ncp->nc_flag & NCF_UNRESOLVED) {
1035 	while (vfs_busy(mp, 0))
1036 	    ;
1037 	error = VFS_ROOT(mp, &vp);
1038 	vfs_unbusy(mp);
1039 	if (error) {
1040 	    cache_put(nch);
1041 	} else {
1042 	    cache_setvp(nch, vp);
1043 	    vput(vp);
1044 	}
1045     }
1046     return(error);
1047 }
1048 
1049 /*
1050  * Read the contents of a symlink, allocate a path buffer out of the
1051  * namei_oc and initialize the supplied nlcomponent with the result.
1052  *
1053  * If an error occurs no buffer will be allocated or returned in the nlc.
1054  */
1055 int
1056 nreadsymlink(struct nlookupdata *nd, struct nchandle *nch,
1057 		struct nlcomponent *nlc)
1058 {
1059     struct vnode *vp;
1060     struct iovec aiov;
1061     struct uio auio;
1062     int linklen;
1063     int error;
1064     char *cp;
1065 
1066     nlc->nlc_nameptr = NULL;
1067     nlc->nlc_namelen = 0;
1068     if (nch->ncp->nc_vp == NULL)
1069 	return(ENOENT);
1070     if ((error = cache_vget(nch, nd->nl_cred, LK_SHARED, &vp)) != 0)
1071 	return(error);
1072     cp = objcache_get(namei_oc, M_WAITOK);
1073     aiov.iov_base = cp;
1074     aiov.iov_len = MAXPATHLEN;
1075     auio.uio_iov = &aiov;
1076     auio.uio_iovcnt = 1;
1077     auio.uio_offset = 0;
1078     auio.uio_rw = UIO_READ;
1079     auio.uio_segflg = UIO_SYSSPACE;
1080     auio.uio_td = nd->nl_td;
1081     auio.uio_resid = MAXPATHLEN - 1;
1082     error = VOP_READLINK(vp, &auio, nd->nl_cred);
1083     if (error)
1084 	goto fail;
1085     linklen = MAXPATHLEN - 1 - auio.uio_resid;
1086     if (varsym_enable) {
1087 	linklen = varsymreplace(cp, linklen, MAXPATHLEN - 1);
1088 	if (linklen < 0) {
1089 	    error = ENAMETOOLONG;
1090 	    goto fail;
1091 	}
1092     }
1093     cp[linklen] = 0;
1094     nlc->nlc_nameptr = cp;
1095     nlc->nlc_namelen = linklen;
1096     vput(vp);
1097     return(0);
1098 fail:
1099     objcache_put(namei_oc, cp);
1100     vput(vp);
1101     return(error);
1102 }
1103 
1104 /*
1105  * Check access [XXX cache vattr!] [XXX quota]
1106  *
1107  * Generally check the NLC_* access bits.   All specified bits must pass
1108  * for this function to return 0.
1109  *
1110  * The file does not have to exist when checking NLC_CREATE or NLC_RENAME_DST
1111  * access, otherwise it must exist.  No error is returned in this case.
1112  *
1113  * The file must not exist if NLC_EXCL is specified.
1114  *
1115  * Directory permissions in general are tested for NLC_CREATE if the file
1116  * does not exist, NLC_DELETE if the file does exist, and NLC_RENAME_DST
1117  * whether the file exists or not.
1118  *
1119  * The directory sticky bit is tested for NLC_DELETE and NLC_RENAME_DST,
1120  * the latter is only tested if the target exists.
1121  *
1122  * The passed ncp must be referenced and locked.  If it is already resolved
1123  * it may be locked shared but otherwise should be locked exclusively.
1124  */
1125 
1126 #define S_WXOK_MASK	(S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH)
1127 
1128 static int
1129 naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp)
1130 {
1131     struct vnode *vp;
1132     struct vattr va;
1133     struct namecache *ncp;
1134     int error;
1135     int cflags;
1136 
1137     KKASSERT(cache_lockstatus(nch) > 0);
1138 
1139     ncp = nch->ncp;
1140     if (ncp->nc_flag & NCF_UNRESOLVED) {
1141 	cache_resolve(nch, cred);
1142 	ncp = nch->ncp;
1143     }
1144     error = ncp->nc_error;
1145 
1146     /*
1147      * Directory permissions checks.  Silently ignore ENOENT if these
1148      * tests pass.  It isn't an error.
1149      *
1150      * We can safely resolve ncp->nc_parent because ncp is currently
1151      * locked.
1152      */
1153     if (nflags & (NLC_CREATE | NLC_DELETE | NLC_RENAME_SRC | NLC_RENAME_DST)) {
1154 	if (((nflags & NLC_CREATE) && ncp->nc_vp == NULL) ||
1155 	    ((nflags & NLC_DELETE) && ncp->nc_vp != NULL) ||
1156 	    ((nflags & NLC_RENAME_SRC) && ncp->nc_vp != NULL) ||
1157 	    (nflags & NLC_RENAME_DST)
1158 	) {
1159 	    struct nchandle par;
1160 
1161 	    if ((par.ncp = ncp->nc_parent) == NULL) {
1162 		if (error != EAGAIN)
1163 			error = EINVAL;
1164 	    } else if (error == 0 || error == ENOENT) {
1165 		par.mount = nch->mount;
1166 		cache_hold(&par);
1167 		cache_lock_maybe_shared(&par, 0);
1168 		error = naccess(&par, NLC_WRITE, cred, NULL);
1169 		cache_put(&par);
1170 	    }
1171 	}
1172     }
1173 
1174     /*
1175      * NLC_EXCL check.  Target file must not exist.
1176      */
1177     if (error == 0 && (nflags & NLC_EXCL) && ncp->nc_vp != NULL)
1178 	error = EEXIST;
1179 
1180     /*
1181      * Try to short-cut the vnode operation for intermediate directory
1182      * components.  This is a major SMP win because it avoids having
1183      * to execute a lot of code for intermediate directory components,
1184      * including shared refs and locks on intermediate directory vnodes.
1185      */
1186     if (error == 0 && nflags == NLC_EXEC && (ncp->nc_flag & NCF_WXOK)) {
1187 	return 0;
1188     }
1189 
1190     /*
1191      * Get the vnode attributes so we can do the rest of our checks.
1192      *
1193      * NOTE: We only call naccess_va() if the target exists.
1194      */
1195     if (error == 0) {
1196 	error = cache_vget(nch, cred, LK_SHARED, &vp);
1197 	if (error == ENOENT) {
1198 	    /*
1199 	     * Silently zero-out ENOENT if creating or renaming
1200 	     * (rename target).  It isn't an error.
1201 	     */
1202 	    if (nflags & (NLC_CREATE | NLC_RENAME_DST))
1203 		error = 0;
1204 	} else if (error == 0) {
1205 	    /*
1206 	     * Get the vnode attributes and check for illegal O_TRUNC
1207 	     * requests and read-only mounts.
1208 	     *
1209 	     * NOTE: You can still open devices on read-only mounts for
1210 	     * 	     writing.
1211 	     *
1212 	     * NOTE: creates/deletes/renames are handled by the NLC_WRITE
1213 	     *	     check on the parent directory above.
1214 	     *
1215 	     * XXX cache the va in the namecache or in the vnode
1216 	     */
1217 	    error = VOP_GETATTR(vp, &va);
1218 	    if (error == 0 && (nflags & NLC_TRUNCATE)) {
1219 		switch(va.va_type) {
1220 		case VREG:
1221 		case VDATABASE:
1222 		case VCHR:
1223 		case VBLK:
1224 		case VFIFO:
1225 		    break;
1226 		case VDIR:
1227 		    error = EISDIR;
1228 		    break;
1229 		default:
1230 		    error = EINVAL;
1231 		    break;
1232 		}
1233 	    }
1234 	    if (error == 0 && (nflags & NLC_WRITE) && vp->v_mount &&
1235 		(vp->v_mount->mnt_flag & MNT_RDONLY)
1236 	    ) {
1237 		switch(va.va_type) {
1238 		case VDIR:
1239 		case VLNK:
1240 		case VREG:
1241 		case VDATABASE:
1242 		    error = EROFS;
1243 		    break;
1244 		default:
1245 		    break;
1246 		}
1247 	    }
1248 	    vput(vp);
1249 
1250 	    /*
1251 	     * Check permissions based on file attributes.  The passed
1252 	     * flags (*nflagsp) are modified with feedback based on
1253 	     * special attributes and requirements.
1254 	     */
1255 	    if (error == 0) {
1256 		/*
1257 		 * Adjust the returned (*nflagsp) if non-NULL.
1258 		 */
1259 		if (nflagsp) {
1260 		    if ((va.va_mode & VSVTX) && va.va_uid != cred->cr_uid)
1261 			*nflagsp |= NLC_STICKY;
1262 		    if (va.va_flags & APPEND)
1263 			*nflagsp |= NLC_APPENDONLY;
1264 		    if (va.va_flags & IMMUTABLE)
1265 			*nflagsp |= NLC_IMMUTABLE;
1266 		}
1267 
1268 		/*
1269 		 * NCF_WXOK can be set for world-searchable directories.
1270 		 *
1271 		 * XXX When we implement capabilities this code would also
1272 		 * need a cap check, or only set the flag if there are no
1273 		 * capabilities.
1274 		 */
1275 		cflags = 0;
1276 		if (va.va_type == VDIR &&
1277 		    (va.va_mode & S_WXOK_MASK) == S_WXOK_MASK) {
1278 			cflags |= NCF_WXOK;
1279 		}
1280 
1281 		/*
1282 		 * Track swapcache management flags in the namecache.
1283 		 *
1284 		 * Calculate the flags based on the current vattr info
1285 		 * and recalculate the inherited flags from the parent
1286 		 * (the original cache linkage may have occurred without
1287 		 * getattrs and thus have stale flags).
1288 		 */
1289 		if (va.va_flags & SF_NOCACHE)
1290 			cflags |= NCF_SF_NOCACHE;
1291 		if (va.va_flags & UF_CACHE)
1292 			cflags |= NCF_UF_CACHE;
1293 		if (ncp->nc_parent) {
1294 			if (ncp->nc_parent->nc_flag &
1295 			    (NCF_SF_NOCACHE | NCF_SF_PNOCACHE)) {
1296 				cflags |= NCF_SF_PNOCACHE;
1297 			}
1298 			if (ncp->nc_parent->nc_flag &
1299 			    (NCF_UF_CACHE | NCF_UF_PCACHE)) {
1300 				cflags |= NCF_UF_PCACHE;
1301 			}
1302 		}
1303 
1304 		/*
1305 		 * We're not supposed to update nc_flag when holding a shared
1306 		 * lock, but we allow the case for certain flags.  Note that
1307 		 * holding an exclusive lock allows updating nc_flag without
1308 		 * atomics.  nc_flag is not allowe to be updated at all unless
1309 		 * a shared or exclusive lock is held.
1310 		 */
1311 		atomic_clear_short(&ncp->nc_flag,
1312 				   (NCF_SF_NOCACHE | NCF_UF_CACHE |
1313 				   NCF_SF_PNOCACHE | NCF_UF_PCACHE |
1314 				   NCF_WXOK) & ~cflags);
1315 		atomic_set_short(&ncp->nc_flag, cflags);
1316 
1317 		/*
1318 		 * Process general access.
1319 		 */
1320 		error = naccess_va(&va, nflags, cred);
1321 	    }
1322 	}
1323     }
1324     return(error);
1325 }
1326 
1327 /*
1328  * Check the requested access against the given vattr using cred.
1329  */
1330 int
1331 naccess_va(struct vattr *va, int nflags, struct ucred *cred)
1332 {
1333     int i;
1334     int vmode;
1335 
1336     /*
1337      * Test the immutable bit.  Creations, deletions, renames (source
1338      * or destination) are not allowed.  chown/chmod/other is also not
1339      * allowed but is handled by SETATTR.  Hardlinks to the immutable
1340      * file are allowed.
1341      *
1342      * If the directory is set to immutable then creations, deletions,
1343      * renames (source or dest) and hardlinks to files within the directory
1344      * are not allowed, and regular files opened through the directory may
1345      * not be written to or truncated (unless a special device).
1346      *
1347      * NOTE!  New hardlinks to immutable files work but new hardlinks to
1348      * files, immutable or not, sitting inside an immutable directory are
1349      * not allowed.  As always if the file is hardlinked via some other
1350      * path additional hardlinks may be possible even if the file is marked
1351      * immutable.  The sysop needs to create a closure by checking the hard
1352      * link count.  Once closure is achieved you are good, and security
1353      * scripts should check link counts anyway.
1354      *
1355      * Writes and truncations are only allowed on special devices.
1356      */
1357     if ((va->va_flags & IMMUTABLE) || (nflags & NLC_IMMUTABLE)) {
1358 	if ((nflags & NLC_IMMUTABLE) && (nflags & NLC_HLINK))
1359 	    return (EPERM);
1360 	if (nflags & (NLC_CREATE | NLC_DELETE |
1361 		      NLC_RENAME_SRC | NLC_RENAME_DST)) {
1362 	    return (EPERM);
1363 	}
1364 	if (nflags & (NLC_WRITE | NLC_TRUNCATE)) {
1365 	    switch(va->va_type) {
1366 	    case VDIR:
1367 		return (EISDIR);
1368 	    case VLNK:
1369 	    case VREG:
1370 	    case VDATABASE:
1371 		return (EPERM);
1372 	    default:
1373 		break;
1374 	    }
1375 	}
1376     }
1377 
1378     /*
1379      * Test the no-unlink and append-only bits for opens, rename targets,
1380      * and deletions.  These bits are not tested for creations or
1381      * rename sources.
1382      *
1383      * Unlike FreeBSD we allow a file with APPEND set to be renamed.
1384      * If you do not wish this you must also set NOUNLINK.
1385      *
1386      * If the governing directory is marked APPEND-only it implies
1387      * NOUNLINK for all entries in the directory.
1388      */
1389     if (((va->va_flags & NOUNLINK) || (nflags & NLC_APPENDONLY)) &&
1390 	(nflags & (NLC_DELETE | NLC_RENAME_SRC | NLC_RENAME_DST))
1391     ) {
1392 	return (EPERM);
1393     }
1394 
1395     /*
1396      * A file marked append-only may not be deleted but can be renamed.
1397      */
1398     if ((va->va_flags & APPEND) &&
1399 	(nflags & (NLC_DELETE | NLC_RENAME_DST))
1400     ) {
1401 	return (EPERM);
1402     }
1403 
1404     /*
1405      * A file marked append-only which is opened for writing must also
1406      * be opened O_APPEND.
1407      */
1408     if ((va->va_flags & APPEND) && (nflags & (NLC_OPEN | NLC_TRUNCATE))) {
1409 	if (nflags & NLC_TRUNCATE)
1410 	    return (EPERM);
1411 	if ((nflags & (NLC_OPEN | NLC_WRITE)) == (NLC_OPEN | NLC_WRITE)) {
1412 	    if ((nflags & NLC_APPEND) == 0)
1413 		return (EPERM);
1414 	}
1415     }
1416 
1417     /*
1418      * root gets universal access
1419      */
1420     if (cred->cr_uid == 0)
1421 	return(0);
1422 
1423     /*
1424      * Check owner perms.
1425      *
1426      * If NLC_OWN is set the owner of the file is allowed no matter when
1427      * the owner-mode bits say (utimes).
1428      */
1429     vmode = 0;
1430     if (nflags & NLC_READ)
1431 	vmode |= S_IRUSR;
1432     if (nflags & NLC_WRITE)
1433 	vmode |= S_IWUSR;
1434     if (nflags & NLC_EXEC)
1435 	vmode |= S_IXUSR;
1436 
1437     if (cred->cr_uid == va->va_uid) {
1438 	if ((nflags & NLC_OWN) == 0) {
1439 	    if ((vmode & va->va_mode) != vmode)
1440 		return(EACCES);
1441 	}
1442 	return(0);
1443     }
1444 
1445     /*
1446      * If NLC_STICKY is set only the owner may delete or rename a file.
1447      * This bit is typically set on /tmp.
1448      *
1449      * Note that the NLC_READ/WRITE/EXEC bits are not typically set in
1450      * the specific delete or rename case.  For deletions and renames we
1451      * usually just care about directory permissions, not file permissions.
1452      */
1453     if ((nflags & NLC_STICKY) &&
1454 	(nflags & (NLC_RENAME_SRC | NLC_RENAME_DST | NLC_DELETE))) {
1455 	return(EACCES);
1456     }
1457 
1458     /*
1459      * Check group perms
1460      */
1461     vmode >>= 3;
1462     for (i = 0; i < cred->cr_ngroups; ++i) {
1463 	if (va->va_gid == cred->cr_groups[i]) {
1464 	    if ((vmode & va->va_mode) != vmode)
1465 		return(EACCES);
1466 	    return(0);
1467 	}
1468     }
1469 
1470     /*
1471      * Check world perms
1472      */
1473     vmode >>= 3;
1474     if ((vmode & va->va_mode) != vmode)
1475 	return(EACCES);
1476     return(0);
1477 }
1478 
1479