xref: /freebsd/sys/kern/vfs_lookup.c (revision 535af610)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)vfs_lookup.c	8.4 (Berkeley) 2/16/94
37  */
38 
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41 
42 #include "opt_capsicum.h"
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/dirent.h>
48 #include <sys/kernel.h>
49 #include <sys/capsicum.h>
50 #include <sys/fcntl.h>
51 #include <sys/jail.h>
52 #include <sys/lock.h>
53 #include <sys/mutex.h>
54 #include <sys/namei.h>
55 #include <sys/vnode.h>
56 #include <sys/mount.h>
57 #include <sys/filedesc.h>
58 #include <sys/proc.h>
59 #include <sys/sdt.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #ifdef KTRACE
63 #include <sys/ktrace.h>
64 #endif
65 #ifdef INVARIANTS
66 #include <machine/_inttypes.h>
67 #endif
68 
69 #include <security/audit/audit.h>
70 #include <security/mac/mac_framework.h>
71 
72 #include <vm/uma.h>
73 
/*
 * NDVALIDATE(ndp) hands the nameidata to NDVALIDATE_impl() together with the
 * line number of the call site when the kernel is built with INVARIANTS; in
 * all other builds it expands to nothing.
 */
#ifndef INVARIANTS
#define NDVALIDATE(ndp)
#else
static void NDVALIDATE_impl(struct nameidata *, int);
#define NDVALIDATE(ndp) NDVALIDATE_impl(ndp, __LINE__)
#endif
80 
/*
 * Prepare namei() for a restart.  Reset the lookup state to its original
 * values and set the ISRESTARTED flag, which tells the underlying lookup
 * code to switch the root from the ABI root to the actual root and prevents
 * any further restart.
 *
 * The argument is parenthesized at every use so that any pointer-valued
 * expression may be passed.  It is evaluated several times and therefore
 * must not have side effects.
 */
#define	NDRESTART(ndp) do {						\
	NDREINIT_DBG(ndp);						\
	(ndp)->ni_resflags = 0;						\
	(ndp)->ni_cnd.cn_flags &= ~NAMEI_INTERNAL_FLAGS;		\
	(ndp)->ni_cnd.cn_flags |= ISRESTARTED;				\
} while (0)
92 
93 SDT_PROVIDER_DEFINE(vfs);
94 SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *",
95     "unsigned long", "bool");
96 SDT_PROBE_DEFINE4(vfs, namei, lookup, return, "int", "struct vnode *", "bool",
97     "struct nameidata");
98 
99 /* Allocation zone for namei. */
100 uma_zone_t namei_zone;
101 
102 /* Placeholder vnode for mp traversal. */
103 static struct vnode *vp_crossmp;
104 
105 static int
106 crossmp_vop_islocked(struct vop_islocked_args *ap)
107 {
108 
109 	return (LK_SHARED);
110 }
111 
112 static int
113 crossmp_vop_lock1(struct vop_lock1_args *ap)
114 {
115 	struct vnode *vp;
116 	struct lock *lk __diagused;
117 	int flags;
118 
119 	vp = ap->a_vp;
120 	lk = vp->v_vnlock;
121 	flags = ap->a_flags;
122 
123 	KASSERT((flags & (LK_SHARED | LK_NOWAIT)) == (LK_SHARED | LK_NOWAIT),
124 	    ("%s: invalid lock request 0x%x for crossmp", __func__, flags));
125 
126 	if ((flags & LK_INTERLOCK) != 0)
127 		VI_UNLOCK(vp);
128 	LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, ap->a_line);
129 	return (0);
130 }
131 
132 static int
133 crossmp_vop_unlock(struct vop_unlock_args *ap)
134 {
135 	struct vnode *vp;
136 	struct lock *lk __diagused;
137 
138 	vp = ap->a_vp;
139 	lk = vp->v_vnlock;
140 
141 	LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE,
142 	    LOCK_LINE);
143 	return (0);
144 }
145 
146 static struct vop_vector crossmp_vnodeops = {
147 	.vop_default =		&default_vnodeops,
148 	.vop_islocked =		crossmp_vop_islocked,
149 	.vop_lock1 =		crossmp_vop_lock1,
150 	.vop_unlock =		crossmp_vop_unlock,
151 };
152 /*
153  * VFS_VOP_VECTOR_REGISTER(crossmp_vnodeops) is not used here since the vnode
154  * gets allocated early. See nameiinit for the direct call below.
155  */
156 
157 struct nameicap_tracker {
158 	struct vnode *dp;
159 	TAILQ_ENTRY(nameicap_tracker) nm_link;
160 };
161 
162 /* Zone for cap mode tracker elements used for dotdot capability checks. */
163 MALLOC_DEFINE(M_NAMEITRACKER, "namei_tracker", "namei tracking for dotdot");
164 
165 static void
166 nameiinit(void *dummy __unused)
167 {
168 
169 	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
170 	    UMA_ALIGN_PTR, 0);
171 	vfs_vector_op_register(&crossmp_vnodeops);
172 	getnewvnode("crossmp", NULL, &crossmp_vnodeops, &vp_crossmp);
173 	vp_crossmp->v_state = VSTATE_CONSTRUCTED;
174 	vp_crossmp->v_irflag |= VIRF_CROSSMP;
175 }
176 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL);
177 
178 static int lookup_cap_dotdot = 1;
179 SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN,
180     &lookup_cap_dotdot, 0,
181     "enables \"..\" components in path lookup in capability mode");
182 static int lookup_cap_dotdot_nonlocal = 1;
183 SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN,
184     &lookup_cap_dotdot_nonlocal, 0,
185     "enables \"..\" components in path lookup in capability mode "
186     "on non-local mount");
187 
188 static void
189 nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
190 {
191 	struct nameicap_tracker *nt;
192 
193 	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR)
194 		return;
195 	nt = TAILQ_LAST(&ndp->ni_cap_tracker, nameicap_tracker_head);
196 	if (nt != NULL && nt->dp == dp)
197 		return;
198 	nt = malloc(sizeof(*nt), M_NAMEITRACKER, M_WAITOK);
199 	vhold(dp);
200 	nt->dp = dp;
201 	TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
202 }
203 
204 static void
205 nameicap_cleanup_from(struct nameidata *ndp, struct nameicap_tracker *first)
206 {
207 	struct nameicap_tracker *nt, *nt1;
208 
209 	nt = first;
210 	TAILQ_FOREACH_FROM_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
211 		TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
212 		vdrop(nt->dp);
213 		free(nt, M_NAMEITRACKER);
214 	}
215 }
216 
217 static void
218 nameicap_cleanup(struct nameidata *ndp)
219 {
220 	KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
221 	    (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative"));
222 	nameicap_cleanup_from(ndp, NULL);
223 }
224 
225 /*
226  * For dotdot lookups in capability mode, only allow the component
227  * lookup to succeed if the resulting directory was already traversed
228  * during the operation.  This catches situations where already
229  * traversed directory is moved to different parent, and then we walk
230  * over it with dotdots.
231  *
232  * Also allow to force failure of dotdot lookups for non-local
233  * filesystems, where external agents might assist local lookups to
234  * escape the compartment.
235  */
236 static int
237 nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
238 {
239 	struct nameicap_tracker *nt;
240 	struct mount *mp;
241 
242 	if (dp == NULL || dp->v_type != VDIR || (ndp->ni_lcf &
243 	    NI_LCF_STRICTRELATIVE) == 0)
244 		return (0);
245 	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0)
246 		return (ENOTCAPABLE);
247 	mp = dp->v_mount;
248 	if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL &&
249 	    (mp->mnt_flag & MNT_LOCAL) == 0)
250 		return (ENOTCAPABLE);
251 	TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head,
252 	    nm_link) {
253 		if (dp == nt->dp) {
254 			nt = TAILQ_NEXT(nt, nm_link);
255 			if (nt != NULL)
256 				nameicap_cleanup_from(ndp, nt);
257 			return (0);
258 		}
259 	}
260 	return (ENOTCAPABLE);
261 }
262 
263 static void
264 namei_cleanup_cnp(struct componentname *cnp)
265 {
266 
267 	uma_zfree(namei_zone, cnp->cn_pnbuf);
268 	cnp->cn_pnbuf = NULL;
269 	cnp->cn_nameptr = NULL;
270 }
271 
272 static int
273 namei_handle_root(struct nameidata *ndp, struct vnode **dpp)
274 {
275 	struct componentname *cnp;
276 
277 	cnp = &ndp->ni_cnd;
278 	if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) {
279 #ifdef KTRACE
280 		if (KTRPOINT(curthread, KTR_CAPFAIL))
281 			ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
282 #endif
283 		return (ENOTCAPABLE);
284 	}
285 	while (*(cnp->cn_nameptr) == '/') {
286 		cnp->cn_nameptr++;
287 		ndp->ni_pathlen--;
288 	}
289 	*dpp = ndp->ni_rootdir;
290 	vrefact(*dpp);
291 	return (0);
292 }
293 
294 static int
295 namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp)
296 {
297 	struct componentname *cnp;
298 	struct thread *td;
299 	struct pwd *pwd;
300 	int error;
301 	bool startdir_used;
302 
303 	cnp = &ndp->ni_cnd;
304 	td = curthread;
305 
306 	startdir_used = false;
307 	*pwdp = NULL;
308 	*dpp = NULL;
309 
310 #ifdef CAPABILITY_MODE
311 	/*
312 	 * In capability mode, lookups must be restricted to happen in
313 	 * the subtree with the root specified by the file descriptor:
314 	 * - The root must be real file descriptor, not the pseudo-descriptor
315 	 *   AT_FDCWD.
316 	 * - The passed path must be relative and not absolute.
317 	 * - If lookup_cap_dotdot is disabled, path must not contain the
318 	 *   '..' components.
319 	 * - If lookup_cap_dotdot is enabled, we verify that all '..'
320 	 *   components lookups result in the directories which were
321 	 *   previously walked by us, which prevents an escape from
322 	 *   the relative root.
323 	 */
324 	if (IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) {
325 		ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
326 		ndp->ni_resflags |= NIRES_STRICTREL;
327 		if (ndp->ni_dirfd == AT_FDCWD) {
328 #ifdef KTRACE
329 			if (KTRPOINT(td, KTR_CAPFAIL))
330 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
331 #endif
332 			return (ECAPMODE);
333 		}
334 	}
335 #endif
336 	error = 0;
337 
338 	/*
339 	 * Get starting point for the translation.
340 	 */
341 	pwd = pwd_hold(td);
342 	/*
343 	 * The reference on ni_rootdir is acquired in the block below to avoid
344 	 * back-to-back atomics for absolute lookups.
345 	 */
346 	namei_setup_rootdir(ndp, cnp, pwd);
347 	ndp->ni_topdir = pwd->pwd_jdir;
348 
349 	if (cnp->cn_pnbuf[0] == '/') {
350 		ndp->ni_resflags |= NIRES_ABS;
351 		error = namei_handle_root(ndp, dpp);
352 	} else {
353 		if (ndp->ni_startdir != NULL) {
354 			*dpp = ndp->ni_startdir;
355 			startdir_used = true;
356 		} else if (ndp->ni_dirfd == AT_FDCWD) {
357 			*dpp = pwd->pwd_cdir;
358 			vrefact(*dpp);
359 		} else {
360 			if (cnp->cn_flags & AUDITVNODE1)
361 				AUDIT_ARG_ATFD1(ndp->ni_dirfd);
362 			if (cnp->cn_flags & AUDITVNODE2)
363 				AUDIT_ARG_ATFD2(ndp->ni_dirfd);
364 
365 			error = fgetvp_lookup(ndp->ni_dirfd, ndp, dpp);
366 		}
367 		if (error == 0 && (*dpp)->v_type != VDIR &&
368 		    (cnp->cn_pnbuf[0] != '\0' ||
369 		    (cnp->cn_flags & EMPTYPATH) == 0))
370 			error = ENOTDIR;
371 	}
372 	if (error == 0 && (cnp->cn_flags & RBENEATH) != 0) {
373 		if (cnp->cn_pnbuf[0] == '/') {
374 			error = ENOTCAPABLE;
375 		} else if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0) {
376 			ndp->ni_lcf |= NI_LCF_STRICTRELATIVE |
377 			    NI_LCF_CAP_DOTDOT;
378 		}
379 	}
380 
381 	/*
382 	 * If we are auditing the kernel pathname, save the user pathname.
383 	 */
384 	if (AUDITING_TD(td)) {
385 		if (cnp->cn_flags & AUDITVNODE1)
386 			AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
387 		if (cnp->cn_flags & AUDITVNODE2)
388 			AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf);
389 	}
390 	if (ndp->ni_startdir != NULL && !startdir_used)
391 		vrele(ndp->ni_startdir);
392 	if (error != 0) {
393 		if (*dpp != NULL)
394 			vrele(*dpp);
395 		pwd_drop(pwd);
396 		return (error);
397 	}
398 	if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 &&
399 	    lookup_cap_dotdot != 0)
400 		ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
401 	SDT_PROBE4(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf,
402 	    cnp->cn_flags, false);
403 	*pwdp = pwd;
404 	return (0);
405 }
406 
407 static int
408 namei_getpath(struct nameidata *ndp)
409 {
410 	struct componentname *cnp;
411 	int error;
412 
413 	cnp = &ndp->ni_cnd;
414 
415 	/*
416 	 * Get a buffer for the name to be translated, and copy the
417 	 * name into the buffer.
418 	 */
419 	cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
420 	if (ndp->ni_segflg == UIO_SYSSPACE) {
421 		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
422 		    &ndp->ni_pathlen);
423 	} else {
424 		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
425 		    &ndp->ni_pathlen);
426 	}
427 
428 	return (error);
429 }
430 
431 static int
432 namei_emptypath(struct nameidata *ndp)
433 {
434 	struct componentname *cnp;
435 	struct pwd *pwd;
436 	struct vnode *dp;
437 	int error;
438 
439 	cnp = &ndp->ni_cnd;
440 	MPASS(*cnp->cn_pnbuf == '\0');
441 	MPASS((cnp->cn_flags & EMPTYPATH) != 0);
442 	MPASS((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) == 0);
443 
444 	ndp->ni_resflags |= NIRES_EMPTYPATH;
445 	error = namei_setup(ndp, &dp, &pwd);
446 	if (error != 0) {
447 		goto errout;
448 	}
449 
450 	/*
451 	 * Usecount on dp already provided by namei_setup.
452 	 */
453 	ndp->ni_vp = dp;
454 	pwd_drop(pwd);
455 	NDVALIDATE(ndp);
456 	if ((cnp->cn_flags & LOCKLEAF) != 0) {
457 		VOP_LOCK(dp, (cnp->cn_flags & LOCKSHARED) != 0 ?
458 		    LK_SHARED : LK_EXCLUSIVE);
459 		if (VN_IS_DOOMED(dp)) {
460 			vput(dp);
461 			error = ENOENT;
462 			goto errout;
463 		}
464 	}
465 	SDT_PROBE4(vfs, namei, lookup, return, 0, ndp->ni_vp, false, ndp);
466 	return (0);
467 
468 errout:
469 	SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp);
470 	namei_cleanup_cnp(cnp);
471 	return (error);
472 }
473 
474 static int __noinline
475 namei_follow_link(struct nameidata *ndp)
476 {
477 	char *cp;
478 	struct iovec aiov;
479 	struct uio auio;
480 	struct componentname *cnp;
481 	struct thread *td;
482 	int error, linklen;
483 
484 	error = 0;
485 	cnp = &ndp->ni_cnd;
486 	td = curthread;
487 
488 	if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
489 		error = ELOOP;
490 		goto out;
491 	}
492 #ifdef MAC
493 	if ((cnp->cn_flags & NOMACCHECK) == 0) {
494 		error = mac_vnode_check_readlink(td->td_ucred, ndp->ni_vp);
495 		if (error != 0)
496 			goto out;
497 	}
498 #endif
499 	if (ndp->ni_pathlen > 1)
500 		cp = uma_zalloc(namei_zone, M_WAITOK);
501 	else
502 		cp = cnp->cn_pnbuf;
503 	aiov.iov_base = cp;
504 	aiov.iov_len = MAXPATHLEN;
505 	auio.uio_iov = &aiov;
506 	auio.uio_iovcnt = 1;
507 	auio.uio_offset = 0;
508 	auio.uio_rw = UIO_READ;
509 	auio.uio_segflg = UIO_SYSSPACE;
510 	auio.uio_td = td;
511 	auio.uio_resid = MAXPATHLEN;
512 	error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
513 	if (error != 0) {
514 		if (ndp->ni_pathlen > 1)
515 			uma_zfree(namei_zone, cp);
516 		goto out;
517 	}
518 	linklen = MAXPATHLEN - auio.uio_resid;
519 	if (linklen == 0) {
520 		if (ndp->ni_pathlen > 1)
521 			uma_zfree(namei_zone, cp);
522 		error = ENOENT;
523 		goto out;
524 	}
525 	if (linklen + ndp->ni_pathlen > MAXPATHLEN) {
526 		if (ndp->ni_pathlen > 1)
527 			uma_zfree(namei_zone, cp);
528 		error = ENAMETOOLONG;
529 		goto out;
530 	}
531 	if (ndp->ni_pathlen > 1) {
532 		bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
533 		uma_zfree(namei_zone, cnp->cn_pnbuf);
534 		cnp->cn_pnbuf = cp;
535 	} else
536 		cnp->cn_pnbuf[linklen] = '\0';
537 	ndp->ni_pathlen += linklen;
538 out:
539 	return (error);
540 }
541 
542 /*
543  * Convert a pathname into a pointer to a locked vnode.
544  *
545  * The FOLLOW flag is set when symbolic links are to be followed
546  * when they occur at the end of the name translation process.
547  * Symbolic links are always followed for all other pathname
548  * components other than the last.
549  *
550  * The segflg defines whether the name is to be copied from user
551  * space or kernel space.
552  *
553  * Overall outline of namei:
554  *
555  *	copy in name
556  *	get starting directory
557  *	while (!done && !error) {
558  *		call lookup to search path.
559  *		if symbolic link, massage name in buffer and continue
560  *	}
 *
 * Returns 0 on success, in which case the result of the lookup is left in
 * the nameidata, and an errno value on failure.  The whole lookup is redone
 * at most once, with ISRESTARTED set, when it fails while pwd_adir differs
 * from pwd_rdir (see NDRESTART).
561  */
562 int
563 namei(struct nameidata *ndp)
564 {
565 	struct vnode *dp;	/* the directory we are searching */
566 	struct componentname *cnp;
567 	struct thread *td;
568 	struct pwd *pwd;
569 	int error;
570 	enum cache_fpl_status status;
571 
572 	cnp = &ndp->ni_cnd;
573 	td = curthread;
574 #ifdef INVARIANTS
575 	KASSERT((ndp->ni_debugflags & NAMEI_DBG_CALLED) == 0,
576 	    ("%s: repeated call to namei without NDREINIT", __func__));
577 	KASSERT(ndp->ni_debugflags == NAMEI_DBG_INITED,
578 	    ("%s: bad debugflags %d", __func__, ndp->ni_debugflags));
579 	ndp->ni_debugflags |= NAMEI_DBG_CALLED;
580 	if (ndp->ni_startdir != NULL)
581 		ndp->ni_debugflags |= NAMEI_DBG_HADSTARTDIR;
582 	if (cnp->cn_flags & FAILIFEXISTS) {
583 		KASSERT(cnp->cn_nameiop == CREATE,
584 		    ("%s: FAILIFEXISTS passed for op %d", __func__, cnp->cn_nameiop));
585 		/*
586 		 * The limitation below is to restrict hairy corner cases.
587 		 */
588 		KASSERT((cnp->cn_flags & (LOCKPARENT | LOCKLEAF)) == LOCKPARENT,
589 		    ("%s: FAILIFEXISTS must be passed with LOCKPARENT and without LOCKLEAF",
590 		    __func__));
591 	}
592 #endif
	/* The lookup always runs with the credentials of the calling thread. */
593 	ndp->ni_cnd.cn_cred = td->td_ucred;
594 	KASSERT(ndp->ni_resflags == 0, ("%s: garbage in ni_resflags: %x\n",
595 	    __func__, ndp->ni_resflags));
596 	KASSERT(cnp->cn_cred && td->td_proc, ("namei: bad cred/proc"));
597 	KASSERT((cnp->cn_flags & NAMEI_INTERNAL_FLAGS) == 0,
598 	    ("namei: unexpected flags: %" PRIx64 "\n",
599 	    cnp->cn_flags & NAMEI_INTERNAL_FLAGS));
600 	if (cnp->cn_flags & NOCACHE)
601 		KASSERT(cnp->cn_nameiop != LOOKUP,
602 		    ("%s: NOCACHE passed with LOOKUP", __func__));
603 	MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR ||
604 	    ndp->ni_startdir->v_type == VBAD);
605 
	/*
	 * Every (re)start begins with cleared per-lookup state and a freshly
	 * copied pathname.
	 */
606 restart:
607 	ndp->ni_lcf = 0;
608 	ndp->ni_loopcnt = 0;
609 	ndp->ni_vp = NULL;
610 
611 	error = namei_getpath(ndp);
612 	if (__predict_false(error != 0)) {
613 		namei_cleanup_cnp(cnp);
614 		SDT_PROBE4(vfs, namei, lookup, return, error, NULL,
615 		    false, ndp);
616 		return (error);
617 	}
618 
619 	cnp->cn_nameptr = cnp->cn_pnbuf;
620 
621 #ifdef KTRACE
622 	if (KTRPOINT(td, KTR_NAMEI)) {
623 		ktrnamei(cnp->cn_pnbuf);
624 	}
625 #endif
626 	TSNAMEI(curthread->td_proc->p_pid, cnp->cn_pnbuf);
627 
628 	/*
629 	 * First try looking up the target without locking any vnodes.
630 	 *
631 	 * We may need to start from scratch or pick up where it left off.
632 	 */
633 	error = cache_fplookup(ndp, &status, &pwd);
634 	switch (status) {
635 	case CACHE_FPL_STATUS_UNSET:
636 		__assert_unreachable();
637 		break;
638 	case CACHE_FPL_STATUS_HANDLED:
		/*
		 * The lockless lookup produced the final result.  A failure
		 * is retried once through NDRESTART when pwd_adir differs
		 * from pwd_rdir and no restart has happened yet.
		 */
639 		if (error == 0)
640 			NDVALIDATE(ndp);
641 		else if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
642 		    (cnp->cn_flags & ISRESTARTED) == 0)) {
643 			namei_cleanup_cnp(cnp);
644 			NDRESTART(ndp);
645 			goto restart;
646 		}
647 		return (error);
648 	case CACHE_FPL_STATUS_PARTIAL:
		/*
		 * Continue with the locked lookup from ni_startdir
		 * (presumably left there by cache_fplookup(); confirm there).
		 */
649 		TAILQ_INIT(&ndp->ni_cap_tracker);
650 		dp = ndp->ni_startdir;
651 		break;
652 	case CACHE_FPL_STATUS_DESTROYED:
		/*
		 * Copy the pathname in again, then start from scratch exactly
		 * as in the aborted case below.
		 */
653 		ndp->ni_loopcnt = 0;
654 		error = namei_getpath(ndp);
655 		if (__predict_false(error != 0)) {
656 			namei_cleanup_cnp(cnp);
657 			return (error);
658 		}
659 		cnp->cn_nameptr = cnp->cn_pnbuf;
660 		/* FALLTHROUGH */
661 	case CACHE_FPL_STATUS_ABORTED:
662 		TAILQ_INIT(&ndp->ni_cap_tracker);
663 		MPASS(ndp->ni_lcf == 0);
		/* An empty path is only valid together with EMPTYPATH. */
664 		if (*cnp->cn_pnbuf == '\0') {
665 			if ((cnp->cn_flags & EMPTYPATH) != 0) {
666 				return (namei_emptypath(ndp));
667 			}
668 			namei_cleanup_cnp(cnp);
669 			SDT_PROBE4(vfs, namei, lookup, return, ENOENT, NULL,
670 			    false, ndp);
671 			return (ENOENT);
672 		}
673 		error = namei_setup(ndp, &dp, &pwd);
674 		if (error != 0) {
675 			namei_cleanup_cnp(cnp);
676 			return (error);
677 		}
678 		break;
679 	}
680 
681 	/*
682 	 * Locked lookup.
683 	 */
684 	for (;;) {
685 		ndp->ni_startdir = dp;
686 		error = vfs_lookup(ndp);
687 		if (error != 0) {
			/*
			 * Only ENOENT qualifies for the one-time restart; any
			 * other error is final.
			 */
688 			if (__predict_false(pwd->pwd_adir != pwd->pwd_rdir &&
689 			    error == ENOENT &&
690 			    (cnp->cn_flags & ISRESTARTED) == 0)) {
691 				nameicap_cleanup(ndp);
692 				pwd_drop(pwd);
693 				namei_cleanup_cnp(cnp);
694 				NDRESTART(ndp);
695 				goto restart;
696 			} else
697 				goto out;
698 		}
699 
700 		/*
701 		 * If not a symbolic link, we're done.
702 		 */
703 		if ((cnp->cn_flags & ISSYMLINK) == 0) {
704 			SDT_PROBE4(vfs, namei, lookup, return, error,
705 			    ndp->ni_vp, false, ndp);
706 			nameicap_cleanup(ndp);
707 			pwd_drop(pwd);
708 			NDVALIDATE(ndp);
709 			return (0);
710 		}
711 		error = namei_follow_link(ndp);
712 		if (error != 0)
713 			break;
		/*
		 * The link text is in the path buffer now.  Release the link
		 * vnode and continue from its parent directory, unless the
		 * new path is absolute.
		 */
714 		vput(ndp->ni_vp);
715 		dp = ndp->ni_dvp;
716 		/*
717 		 * Check if root directory should replace current directory.
718 		 */
719 		cnp->cn_nameptr = cnp->cn_pnbuf;
720 		if (*(cnp->cn_nameptr) == '/') {
721 			/*
722 			 * Reset the lookup to start from the real root without
723 			 * origin path name reloading.
724 			 */
725 			if (__predict_false(ndp->ni_rootdir != pwd->pwd_rdir)) {
726 				cnp->cn_flags |= ISRESTARTED;
727 				ndp->ni_rootdir = pwd->pwd_rdir;
728 			}
729 			vrele(dp);
730 			error = namei_handle_root(ndp, &dp);
731 			if (error != 0)
732 				goto out;
733 		}
734 	}
	/*
	 * Only reached through the break above, i.e. when namei_follow_link()
	 * failed: release the symlink and its parent directory.
	 */
735 	vput(ndp->ni_vp);
736 	ndp->ni_vp = NULL;
737 	vrele(ndp->ni_dvp);
738 out:
739 	MPASS(error != 0);
740 	SDT_PROBE4(vfs, namei, lookup, return, error, NULL, false, ndp);
741 	namei_cleanup_cnp(cnp);
742 	nameicap_cleanup(ndp);
743 	pwd_drop(pwd);
744 	return (error);
745 }
746 
747 static int
748 enforce_lkflags(struct mount *mp, int lkflags)
749 {
750 
751 	if (mp == NULL || ((lkflags & LK_SHARED) &&
752 	    !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) {
753 		lkflags &= ~LK_SHARED;
754 		lkflags |= LK_EXCLUSIVE;
755 	}
756 	lkflags |= LK_NODDLKTREAT;
757 	return (lkflags);
758 }
759 
760 static __inline int
761 needs_exclusive_leaf(struct mount *mp, int flags)
762 {
763 
764 	/*
765 	 * Intermediate nodes can use shared locks, we only need to
766 	 * force an exclusive lock for leaf nodes.
767 	 */
768 	if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF))
769 		return (0);
770 
771 	/* Always use exclusive locks if LOCKSHARED isn't set. */
772 	if (!(flags & LOCKSHARED))
773 		return (1);
774 
775 	/*
776 	 * For lookups during open(), if the mount point supports
777 	 * extended shared operations, then use a shared lock for the
778 	 * leaf node, otherwise use an exclusive lock.
779 	 */
780 	if ((flags & ISOPEN) != 0)
781 		return (!MNT_EXTENDED_SHARED(mp));
782 
783 	/*
784 	 * Lookup requests outside of open() that specify LOCKSHARED
785 	 * only need a shared lock on the leaf vnode.
786 	 */
787 	return (0);
788 }
789 
790 /*
791  * Various filesystems expect to be able to copy a name component with length
792  * bounded by NAME_MAX into a directory entry buffer of size MAXNAMLEN.  Make
793  * sure that these are the same size.
794  */
795 _Static_assert(MAXNAMLEN == NAME_MAX,
796     "MAXNAMLEN and NAME_MAX have different values");
797 
798 static int __noinline
799 vfs_lookup_degenerate(struct nameidata *ndp, struct vnode *dp, int wantparent)
800 {
801 	struct componentname *cnp;
802 	struct mount *mp;
803 	int error;
804 
805 	cnp = &ndp->ni_cnd;
806 
807 	cnp->cn_flags |= ISLASTCN;
808 
809 	mp = atomic_load_ptr(&dp->v_mount);
810 	if (needs_exclusive_leaf(mp, cnp->cn_flags)) {
811 		cnp->cn_lkflags &= ~LK_SHARED;
812 		cnp->cn_lkflags |= LK_EXCLUSIVE;
813 	}
814 
815 	vn_lock(dp, enforce_lkflags(mp, cnp->cn_lkflags | LK_RETRY));
816 
817 	if (dp->v_type != VDIR) {
818 		error = ENOTDIR;
819 		goto bad;
820 	}
821 	if (cnp->cn_nameiop != LOOKUP) {
822 		error = EISDIR;
823 		goto bad;
824 	}
825 	if (wantparent) {
826 		ndp->ni_dvp = dp;
827 		VREF(dp);
828 	}
829 	ndp->ni_vp = dp;
830 	cnp->cn_namelen = 0;
831 
832 	if (cnp->cn_flags & AUDITVNODE1)
833 		AUDIT_ARG_VNODE1(dp);
834 	else if (cnp->cn_flags & AUDITVNODE2)
835 		AUDIT_ARG_VNODE2(dp);
836 
837 	if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
838 		VOP_UNLOCK(dp);
839 	return (0);
840 bad:
841 	VOP_UNLOCK(dp);
842 	return (error);
843 }
844 
845 /*
846  * FAILIFEXISTS handling.
847  *
848  * XXX namei called with LOCKPARENT but not LOCKLEAF has the strange
849  * behaviour of leaving the vnode unlocked if the target is the same
850  * vnode as the parent.
851  */
852 static int __noinline
853 vfs_lookup_failifexists(struct nameidata *ndp)
854 {
855 	struct componentname *cnp __diagused;
856 
857 	cnp = &ndp->ni_cnd;
858 
859 	MPASS((cnp->cn_flags & ISSYMLINK) == 0);
860 	if (ndp->ni_vp == ndp->ni_dvp)
861 		vrele(ndp->ni_dvp);
862 	else
863 		vput(ndp->ni_dvp);
864 	vrele(ndp->ni_vp);
865 	ndp->ni_dvp = NULL;
866 	ndp->ni_vp = NULL;
867 	NDFREE_PNBUF(ndp);
868 	return (EEXIST);
869 }
870 
/*
 * Step from the mounted-on vnode in ni_vp to the root vnode of the
 * filesystem mounted there, repeating for as long as the new root is itself
 * a mount point.
 *
 * ni_vp is expected to be locked on entry (asserted).  Once a mount has been
 * busied, ni_dvp is released and replaced by a referenced vp_crossmp.
 * Returns 0 with ni_vp set to the final root vnode, or the error reported by
 * the lock upgrade or by VFS_ROOT().
 */
871 static int __noinline
872 vfs_lookup_cross_mount(struct nameidata *ndp)
873 {
874 	struct componentname *cnp;
875 	struct mount *mp;
876 	struct vnode *dp, *tdp;
877 	int error, crosslkflags;
878 	bool crosslock;
879 
880 	cnp = &ndp->ni_cnd;
881 	dp = ndp->ni_vp;
882 
883 	/*
884 	 * The vnode has been mounted on, find the root of the mounted
885 	 * filesystem.
886 	 */
887 	for (;;) {
888 		mp = dp->v_mountedhere;
889 		ASSERT_VOP_LOCKED(dp, __func__);
890 		VNPASS((vn_irflag_read(dp) & VIRF_MOUNTPOINT) != 0 && mp != NULL, dp);
891 
892 		crosslock = (dp->v_vflag & VV_CROSSLOCK) != 0;
893 		crosslkflags = enforce_lkflags(mp, cnp->cn_lkflags);
894 		if (__predict_false(crosslock)) {
895 			/*
896 			 * We are going to be holding the vnode lock, which
897 			 * in this case is shared by the root vnode of the
898 			 * filesystem mounted at mp, across the call to
899 			 * VFS_ROOT().  Make the situation clear to the
900 			 * filesystem by passing LK_CANRECURSE if the
901 			 * lock is held exclusive, or by clearing
902 			 * LK_NODDLKTREAT to allow recursion on the shared
903 			 * lock in the presence of an exclusive waiter.
904 			 */
905 			if (VOP_ISLOCKED(dp) == LK_EXCLUSIVE) {
906 				crosslkflags &= ~LK_SHARED;
907 				crosslkflags |= LK_EXCLUSIVE | LK_CANRECURSE;
908 			} else if ((crosslkflags & LK_EXCLUSIVE) != 0) {
909 				error = vn_lock(dp, LK_UPGRADE);
910 				if (error != 0)
911 					break;
				/*
				 * Start over if a different mount is found
				 * here after the upgrade.
				 */
912 				if (dp->v_mountedhere != mp) {
913 					continue;
914 				}
915 			} else
916 				crosslkflags &= ~LK_NODDLKTREAT;
917 		}
		/* Busying the mount failed: re-read v_mountedhere and retry. */
918 		if (vfs_busy(mp, 0) != 0)
919 			continue;
		/* With VV_CROSSLOCK dp stays locked across VFS_ROOT(). */
920 		if (__predict_true(!crosslock))
921 			vput(dp);
922 		if (dp != ndp->ni_dvp)
923 			vput(ndp->ni_dvp);
924 		else
925 			vrele(ndp->ni_dvp);
926 		vrefact(vp_crossmp);
927 		ndp->ni_dvp = vp_crossmp;
928 		error = VFS_ROOT(mp, crosslkflags, &tdp);
929 		vfs_unbusy(mp);
930 		if (__predict_false(crosslock))
931 			vput(dp);
932 		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT))
933 			panic("vp_crossmp exclusively locked or reclaimed");
934 		if (error != 0)
935 			break;
		/* The new root may be mounted on as well; loop if so. */
936 		ndp->ni_vp = dp = tdp;
937 		if ((vn_irflag_read(dp) & VIRF_MOUNTPOINT) == 0)
938 			break;
939 	}
940 
941 	return (error);
942 }
943 
944 /*
945  * Search a pathname.
946  * This is a very central and rather complicated routine.
947  *
948  * The pathname is pointed to by cn_nameptr and is of length ni_pathlen.
949  * The starting directory is taken from ni_startdir. The pathname is
950  * descended until done, or a symbolic link is encountered. The cn_flags
951  * has ISLASTCN or'ed if the path is completed or ISSYMLINK or'ed if a
952  * symbolic link needing interpretation is encountered.
953  *
954  * The cn_nameiop is LOOKUP, CREATE, RENAME, or DELETE depending on
955  * whether the name is to be looked up, created, renamed, or deleted.
956  * When CREATE, RENAME, or DELETE is specified, information usable in
957  * creating, renaming, or deleting a directory entry may be calculated.
958  * If cn_flags has LOCKPARENT or'ed into it, the parent directory is returned
959  * locked. If it has WANTPARENT or'ed into it, the parent directory is
960  * returned unlocked. Otherwise the parent directory is not returned. If
961  * the target of the pathname exists and LOCKLEAF is or'ed into the cn_flags
962  * the target is returned locked, otherwise it is returned unlocked.
963  *
964  * Overall outline of lookup:
965  *
966  *	handle degenerate case where name is null string
967  *
968  * dirloop:
969  *	identify next component of name at ndp->ni_cnd.cn_nameptr
970  *	handle .. special cases related to capabilities, chroot, jail
971  *	if .. and crossing mount points and on mounted filesys, find parent
972  *	call VOP_LOOKUP routine for next component name
973  *	    directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set
974  *	    component vnode returned in ni_vp (if it exists), locked.
975  *	if result vnode is mounted on and crossing mount points,
976  *	    find mounted on vnode
977  *	if more components of name, do next level at dirloop
978  *	if VOP_LOOKUP returns ERELOOKUP, repeat the same level at dirloop
979  *	return the answer in ni_vp, locked if LOCKLEAF set
980  *	    if LOCKPARENT set, return locked parent in ni_dvp
981  *	    if WANTPARENT set, return unlocked parent in ni_dvp
982  */
/*
 * vfs_lookup() takes its starting directory from ndp->ni_startdir (which is
 * reset to NULLVP) and walks the path held in ndp->ni_cnd one component at a
 * time.
 *
 * Returns 0 on success, or the result of vfs_lookup_failifexists() when a
 * vnode was found, FAILIFEXISTS is set and ISSYMLINK is not.  Errors detected
 * in this function exit through the bad2/bad/bad_unlocked labels, which set
 * ndp->ni_vp to NULL and return the error.
 */
983 int
984 vfs_lookup(struct nameidata *ndp)
985 {
986 	char *cp;			/* pointer into pathname argument */
987 	char *prev_ni_next;		/* saved ndp->ni_next */
988 	char *nulchar;			/* location of '\0' in cn_pnbuf */
989 	char *lastchar;			/* location of the last character */
990 	struct vnode *dp = NULL;	/* the directory we are searching */
991 	struct vnode *tdp;		/* saved dp */
992 	struct prison *pr;
993 	size_t prev_ni_pathlen;		/* saved ndp->ni_pathlen */
994 	int docache;			/* == 0 do not cache last component */
995 	int wantparent;			/* 1 => wantparent or lockparent flag */
996 	int rdonly;			/* lookup read-only flag bit */
997 	int error = 0;
998 	int relookup = 0;		/* do not consume the path component */
999 	struct componentname *cnp = &ndp->ni_cnd;
	/* cn_lkflags saved around the VOP_LOOKUP() call and restored after it */
1000 	int lkflags_save;
	/*
	 * State of ndp->ni_dvp as consulted by the bad2 error path:
	 *   0 - not unlocked or released by this function so far,
	 *   1 - unlocked with VOP_UNLOCK() but not released,
	 *   2 - already released with vput()/vrele().
	 */
1001 	int ni_dvp_unlocked;
1002 
1003 	/*
1004 	 * Setup: break out flag bits into variables.
1005 	 */
1006 	ni_dvp_unlocked = 0;
1007 	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
1008 	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
1009 	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
1010 	/*
1011 	 * When set to zero, docache causes the last component of the
1012 	 * pathname to be deleted from the cache and the full lookup
1013 	 * of the name to be done (via VOP_CACHEDLOOKUP()). Often
1014 	 * filesystems need some pre-computed values that are made
1015 	 * during the full lookup, for instance UFS sets dp->i_offset.
1016 	 *
1017 	 * The docache variable is set to zero when requested by the
1018 	 * NOCACHE flag and for all modifying operations except CREATE.
1019 	 */
1020 	docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
1021 	if (cnp->cn_nameiop == DELETE ||
1022 	    (wantparent && cnp->cn_nameiop != CREATE &&
1023 	     cnp->cn_nameiop != LOOKUP))
1024 		docache = 0;
1025 	rdonly = cnp->cn_flags & RDONLY;
1026 	cnp->cn_flags &= ~ISSYMLINK;
1027 	ndp->ni_dvp = NULL;
1028 
1029 	cnp->cn_lkflags = LK_SHARED;
	/*
	 * Take the starting directory from ndp and clear it there.  dp is
	 * only locked further down, right before dirloop, so the degenerate
	 * name case below runs with dp not yet locked by this function.
	 */
1030 	dp = ndp->ni_startdir;
1031 	ndp->ni_startdir = NULLVP;
1032 
1033 	/*
1034 	 * Leading slashes, if any, are supposed to be skipped by the caller.
1035 	 */
1036 	MPASS(cnp->cn_nameptr[0] != '/');
1037 
1038 	/*
1039 	 * Check for degenerate name (e.g. / or "") which is a way of talking
1040 	 * about a directory, e.g. like "/." or ".".
1041 	 */
1042 	if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
1043 		error = vfs_lookup_degenerate(ndp, dp, wantparent);
1044 		if (error == 0)
1045 			goto success_right_lock;
1046 		goto bad_unlocked;
1047 	}
1048 
1049 	/*
1050 	 * Nul-out trailing slashes (e.g., "foo///" -> "foo").
1051 	 *
1052 	 * This must be done before VOP_LOOKUP() because some fs's don't know
1053 	 * about trailing slashes.  Remember if there were trailing slashes to
1054 	 * handle symlinks, existing non-directories and non-existing files
1055 	 * that won't be directories specially later.
1056 	 */
1057 	MPASS(ndp->ni_pathlen >= 2);
1058 	lastchar = &cnp->cn_nameptr[ndp->ni_pathlen - 2];
1059 	if (*lastchar == '/') {
		/*
		 * Overwrite the whole run of trailing slashes, shrinking
		 * ni_pathlen by one for each slash removed.
		 */
1060 		while (lastchar >= cnp->cn_pnbuf) {
1061 			*lastchar = '\0';
1062 			lastchar--;
1063 			ndp->ni_pathlen--;
1064 			if (*lastchar != '/') {
1065 				break;
1066 			}
1067 		}
1068 		cnp->cn_flags |= TRAILINGSLASH;
1069 	}
1070 
1071 	/*
1072 	 * We use shared locks until we hit the parent of the last cn then
1073 	 * we adjust based on the requesting flags.
1074 	 */
1075 	vn_lock(dp,
1076 	    enforce_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY));
1077 
1078 dirloop:
1079 	/*
1080 	 * Search a new directory.
1081 	 *
1082 	 * The last component of the filename is left accessible via
1083 	 * cnp->cn_nameptr. It has to be freed with a call to NDFREE*.
1084 	 *
1085 	 * Store / as a temporary sentinel so that we only have one character
1086 	 * to test for. Pathnames tend to be short so this should not be
1087 	 * resulting in cache misses.
1088 	 */
1089 	nulchar = &cnp->cn_nameptr[ndp->ni_pathlen - 1];
1090 	KASSERT(*nulchar == '\0',
1091 	    ("%s: expected nul at %p; string [%s]\n", __func__, nulchar,
1092 	    cnp->cn_pnbuf));
1093 	*nulchar = '/';
1094 	for (cp = cnp->cn_nameptr; *cp != '/'; cp++) {
1095 		KASSERT(*cp != '\0',
1096 		    ("%s: encountered unexpected nul; string [%s]\n", __func__,
1097 		    cnp->cn_nameptr));
1098 		continue;
1099 	}
1100 	*nulchar = '\0';
1101 	cnp->cn_namelen = cp - cnp->cn_nameptr;
1102 	if (__predict_false(cnp->cn_namelen > NAME_MAX)) {
1103 		error = ENAMETOOLONG;
1104 		goto bad;
1105 	}
	/*
	 * Remember the pre-component values of ni_pathlen and ni_next so
	 * that the relookup case at nextname can rewind to this component.
	 */
1106 	prev_ni_pathlen = ndp->ni_pathlen;
1107 	ndp->ni_pathlen -= cnp->cn_namelen;
1108 	KASSERT(ndp->ni_pathlen <= PATH_MAX,
1109 	    ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
1110 	prev_ni_next = ndp->ni_next;
1111 	ndp->ni_next = cp;
1112 
1113 	/*
1114 	 * Something else should be clearing this.
1115 	 */
1116 	cnp->cn_flags &= ~(ISDOTDOT|ISLASTCN);
1117 
1118 	cnp->cn_flags |= MAKEENTRY;
1119 	if (*cp == '\0' && docache == 0)
1120 		cnp->cn_flags &= ~MAKEENTRY;
1121 	if (cnp->cn_namelen == 2 &&
1122 	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
1123 		cnp->cn_flags |= ISDOTDOT;
1124 	if (*ndp->ni_next == 0) {
1125 		cnp->cn_flags |= ISLASTCN;
1126 
		/*
		 * Refuse to DELETE or RENAME "." when it is the final
		 * component.
		 */
1127 		if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' &&
1128 		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))) {
1129 			error = EINVAL;
1130 			goto bad;
1131 		}
1132 	}
1133 
1134 	nameicap_tracker_add(ndp, dp);
1135 
1136 	/*
1137 	 * Make sure degenerate names don't get here, their handling was
1138 	 * previously found in this spot.
1139 	 */
1140 	MPASS(cnp->cn_nameptr[0] != '\0');
1141 
1142 	/*
1143 	 * Handle "..": six special cases.
1144 	 * 0. If doing a capability lookup and lookup_cap_dotdot is
1145 	 *    disabled, return ENOTCAPABLE.
1146 	 * 1. Return an error if this is the last component of
1147 	 *    the name and the operation is DELETE or RENAME.
1148 	 * 2. If at root directory (e.g. after chroot)
1149 	 *    or at absolute root directory
1150 	 *    then ignore it so can't get out.
1151 	 * 3. If this vnode is the root of a mounted
1152 	 *    filesystem, then replace it with the
1153 	 *    vnode which was mounted on so we take the
1154 	 *    .. in the other filesystem.
1155 	 * 4. If the vnode is the top directory of
1156 	 *    the jail or chroot, don't let them out.
1157 	 * 5. If doing a capability lookup and lookup_cap_dotdot is
1158 	 *    enabled, return ENOTCAPABLE if the lookup would escape
1159 	 *    from the initial file descriptor directory.  Checks are
1160 	 *    done by ensuring that namei() already traversed the
1161 	 *    result of dotdot lookup.
1162 	 */
1163 	if (cnp->cn_flags & ISDOTDOT) {
1164 		if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT))
1165 		    == NI_LCF_STRICTRELATIVE) {
1166 #ifdef KTRACE
1167 			if (KTRPOINT(curthread, KTR_CAPFAIL))
1168 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
1169 #endif
1170 			error = ENOTCAPABLE;
1171 			goto bad;
1172 		}
1173 		if ((cnp->cn_flags & ISLASTCN) != 0 &&
1174 		    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1175 			error = EINVAL;
1176 			goto bad;
1177 		}
1178 		for (;;) {
			/*
			 * Walk the credential's prison chain; pr is left
			 * non-NULL only if dp is the pr_root of one of the
			 * prisons on that chain.
			 */
1179 			for (pr = cnp->cn_cred->cr_prison; pr != NULL;
1180 			     pr = pr->pr_parent)
1181 				if (dp == pr->pr_root)
1182 					break;
1183 			bool isroot = dp == ndp->ni_rootdir ||
1184 			    dp == ndp->ni_topdir || dp == rootvnode ||
1185 			    pr != NULL;
1186 			if (isroot && (ndp->ni_lcf &
1187 			    NI_LCF_STRICTRELATIVE) != 0) {
1188 				error = ENOTCAPABLE;
1189 				goto capdotdot;
1190 			}
			/*
			 * ".." at one of the roots (or at a filesystem root
			 * when mount crossing is disabled) resolves to dp
			 * itself: report dp as both parent and result, with
			 * an extra reference, and skip VOP_LOOKUP().
			 */
1191 			if (isroot || ((dp->v_vflag & VV_ROOT) != 0 &&
1192 			    (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
1193 				ndp->ni_dvp = dp;
1194 				ndp->ni_vp = dp;
1195 				VREF(dp);
1196 				goto nextname;
1197 			}
1198 			if ((dp->v_vflag & VV_ROOT) == 0)
1199 				break;
1200 			if (VN_IS_DOOMED(dp)) {	/* forced unmount */
1201 				error = ENOENT;
1202 				goto bad;
1203 			}
			/*
			 * dp is the root of a mounted filesystem: switch to
			 * the vnode it covers and run the checks again.
			 */
1204 			tdp = dp;
1205 			dp = dp->v_mount->mnt_vnodecovered;
1206 			VREF(dp);
1207 			vput(tdp);
1208 			vn_lock(dp,
1209 			    enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
1210 			    LK_RETRY));
1211 			error = nameicap_check_dotdot(ndp, dp);
1212 			if (error != 0) {
1213 capdotdot:
1214 #ifdef KTRACE
1215 				if (KTRPOINT(curthread, KTR_CAPFAIL))
1216 					ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
1217 #endif
1218 				goto bad;
1219 			}
1220 		}
1221 	}
1222 
1223 	/*
1224 	 * We now have a segment name to search for, and a directory to search.
1225 	 */
1226 unionlookup:
1227 #ifdef MAC
1228 	error = mac_vnode_check_lookup(cnp->cn_cred, dp, cnp);
1229 	if (__predict_false(error))
1230 		goto bad;
1231 #endif
1232 	ndp->ni_dvp = dp;
1233 	ndp->ni_vp = NULL;
1234 	ASSERT_VOP_LOCKED(dp, "lookup");
1235 	/*
1236 	 * If we have a shared lock we may need to upgrade the lock for the
1237 	 * last operation.
1238 	 */
1239 	if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) &&
1240 	    dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED)
1241 		vn_lock(dp, LK_UPGRADE|LK_RETRY);
1242 	if (VN_IS_DOOMED(dp)) {
1243 		error = ENOENT;
1244 		goto bad;
1245 	}
1246 	/*
1247 	 * If we're looking up the last component and we need an exclusive
1248 	 * lock, adjust our lkflags.
1249 	 */
1250 	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags))
1251 		cnp->cn_lkflags = LK_EXCLUSIVE;
1252 	lkflags_save = cnp->cn_lkflags;
1253 	cnp->cn_lkflags = enforce_lkflags(dp->v_mount, cnp->cn_lkflags);
1254 	error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp);
1255 	cnp->cn_lkflags = lkflags_save;
1256 	if (error != 0) {
1257 		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
		/*
		 * Name not found in the root directory of a MNT_UNION
		 * mount: retry the same component in the covered vnode.
		 */
1258 		if ((error == ENOENT) &&
1259 		    (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) &&
1260 		    (dp->v_mount->mnt_flag & MNT_UNION)) {
1261 			tdp = dp;
1262 			dp = dp->v_mount->mnt_vnodecovered;
1263 			VREF(dp);
1264 			vput(tdp);
1265 			vn_lock(dp,
1266 			    enforce_lkflags(dp->v_mount, cnp->cn_lkflags |
1267 			    LK_RETRY));
1268 			nameicap_tracker_add(ndp, dp);
1269 			goto unionlookup;
1270 		}
1271 
		/*
		 * ERELOOKUP: hand the directory itself (with an extra
		 * reference) to the common path as the result.  The
		 * relookup flag makes nextname rewind to this component
		 * and jump back to dirloop.
		 */
1272 		if (error == ERELOOKUP) {
1273 			vref(dp);
1274 			ndp->ni_vp = dp;
1275 			error = 0;
1276 			relookup = 1;
1277 			goto good;
1278 		}
1279 
1280 		if (error != EJUSTRETURN)
1281 			goto bad;
1282 		/*
1283 		 * At this point, we know we're at the end of the
1284 		 * pathname.  If creating / renaming, we can consider
1285 		 * allowing the file or directory to be created / renamed,
1286 		 * provided we're not on a read-only filesystem.
1287 		 */
1288 		if (rdonly) {
1289 			error = EROFS;
1290 			goto bad;
1291 		}
1292 		/* trailing slash only allowed for directories */
1293 		if ((cnp->cn_flags & TRAILINGSLASH) &&
1294 		    !(cnp->cn_flags & WILLBEDIR)) {
1295 			error = ENOENT;
1296 			goto bad;
1297 		}
1298 		if ((cnp->cn_flags & LOCKPARENT) == 0)
1299 			VOP_UNLOCK(dp);
1300 		/*
1301 		 * We return with ni_vp NULL to indicate that the entry
1302 		 * doesn't currently exist, leaving a pointer to the
1303 		 * (possibly locked) directory vnode in ndp->ni_dvp.
1304 		 */
1305 		goto success;
1306 	}
1307 
1308 good:
	/* From here on dp is the vnode found; the parent is ndp->ni_dvp. */
1309 	dp = ndp->ni_vp;
1310 
1311 	/*
1312 	 * Check for symbolic link
1313 	 */
1314 	if ((dp->v_type == VLNK) &&
1315 	    ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) ||
1316 	     *ndp->ni_next == '/')) {
1317 		cnp->cn_flags |= ISSYMLINK;
1318 		if (VN_IS_DOOMED(dp)) {
1319 			/*
1320 			 * We can't know whether the directory was mounted with
1321 			 * NOSYMFOLLOW, so we can't follow safely.
1322 			 */
1323 			error = ENOENT;
1324 			goto bad2;
1325 		}
1326 		if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) {
1327 			error = EACCES;
1328 			goto bad2;
1329 		}
1330 		/*
1331 		 * Symlink code always expects an unlocked dvp.
1332 		 */
1333 		if (ndp->ni_dvp != ndp->ni_vp) {
1334 			VOP_UNLOCK(ndp->ni_dvp);
1335 			ni_dvp_unlocked = 1;
1336 		}
1337 		goto success;
1338 	}
1339 
1340 	if ((vn_irflag_read(dp) & VIRF_MOUNTPOINT) != 0 &&
1341 	    (cnp->cn_flags & NOCROSSMOUNT) == 0) {
1342 		error = vfs_lookup_cross_mount(ndp);
1343 		if (error != 0)
1344 			goto bad_unlocked;
1345 		/*
1346 		 * FALLTHROUGH to nextname
1347 		 */
1348 		dp = ndp->ni_vp;
1349 	}
1350 
1351 nextname:
1352 	/*
1353 	 * Not a symbolic link that we will follow.  Continue with the
1354 	 * next component if there is any; otherwise, we're done.
1355 	 */
1356 	KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/',
1357 	    ("lookup: invalid path state."));
	/*
	 * ERELOOKUP case: restore ni_pathlen and ni_next to the values
	 * saved at dirloop, drop the parent and search the same component
	 * again.  cn_nameptr has not been advanced past it.
	 */
1358 	if (relookup) {
1359 		relookup = 0;
1360 		ndp->ni_pathlen = prev_ni_pathlen;
1361 		ndp->ni_next = prev_ni_next;
1362 		if (ndp->ni_dvp != dp)
1363 			vput(ndp->ni_dvp);
1364 		else
1365 			vrele(ndp->ni_dvp);
1366 		goto dirloop;
1367 	}
1368 	if (cnp->cn_flags & ISDOTDOT) {
1369 		error = nameicap_check_dotdot(ndp, ndp->ni_vp);
1370 		if (error != 0) {
1371 #ifdef KTRACE
1372 			if (KTRPOINT(curthread, KTR_CAPFAIL))
1373 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
1374 #endif
1375 			goto bad2;
1376 		}
1377 	}
	/*
	 * More components follow: skip the run of slashes, drop the parent
	 * and go around again with dp as the directory to search.
	 */
1378 	if (*ndp->ni_next == '/') {
1379 		cnp->cn_nameptr = ndp->ni_next;
1380 		while (*cnp->cn_nameptr == '/') {
1381 			cnp->cn_nameptr++;
1382 			ndp->ni_pathlen--;
1383 		}
1384 		if (ndp->ni_dvp != dp)
1385 			vput(ndp->ni_dvp);
1386 		else
1387 			vrele(ndp->ni_dvp);
1388 		goto dirloop;
1389 	}
1390 	/*
1391 	 * If we're processing a path with a trailing slash,
1392 	 * check that the end result is a directory.
1393 	 */
1394 	if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) {
1395 		error = ENOTDIR;
1396 		goto bad2;
1397 	}
1398 	/*
1399 	 * Disallow directory write attempts on read-only filesystems.
1400 	 */
1401 	if (rdonly &&
1402 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1403 		error = EROFS;
1404 		goto bad2;
1405 	}
	/*
	 * Put the parent into the state the caller asked for.  Without
	 * LOCKPARENT/WANTPARENT it is released outright, and the value 2
	 * tells bad2 not to release it a second time.  With WANTPARENT
	 * only, a parent distinct from dp is unlocked but kept.
	 */
1406 	if (!wantparent) {
1407 		ni_dvp_unlocked = 2;
1408 		if (ndp->ni_dvp != dp)
1409 			vput(ndp->ni_dvp);
1410 		else
1411 			vrele(ndp->ni_dvp);
1412 	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
1413 		VOP_UNLOCK(ndp->ni_dvp);
1414 		ni_dvp_unlocked = 1;
1415 	}
1416 
1417 	if (cnp->cn_flags & AUDITVNODE1)
1418 		AUDIT_ARG_VNODE1(dp);
1419 	else if (cnp->cn_flags & AUDITVNODE2)
1420 		AUDIT_ARG_VNODE2(dp);
1421 
1422 	if ((cnp->cn_flags & LOCKLEAF) == 0)
1423 		VOP_UNLOCK(dp);
1424 success:
1425 	/*
1426 	 * FIXME: for lookups which only cross a mount point to fetch the
1427 	 * root vnode, ni_dvp will be set to vp_crossmp. This can be a problem
1428 	 * if either WANTPARENT or LOCKPARENT is set.
1429 	 */
1430 	/*
1431 	 * Because of shared lookup we may have the vnode shared locked, but
1432 	 * the caller may want it to be exclusively locked.
1433 	 */
1434 	if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) &&
1435 	    VOP_ISLOCKED(dp) != LK_EXCLUSIVE) {
1436 		vn_lock(dp, LK_UPGRADE | LK_RETRY);
1437 		if (VN_IS_DOOMED(dp)) {
1438 			error = ENOENT;
1439 			goto bad2;
1440 		}
1441 	}
1442 success_right_lock:
1443 	if (ndp->ni_vp != NULL) {
1444 		if ((cnp->cn_flags & ISDOTDOT) == 0)
1445 			nameicap_tracker_add(ndp, ndp->ni_vp);
1446 		if ((cnp->cn_flags & (FAILIFEXISTS | ISSYMLINK)) == FAILIFEXISTS)
1447 			return (vfs_lookup_failifexists(ndp));
1448 	}
1449 	return (0);
1450 
	/*
	 * Error exits.  bad2 first drops ni_dvp unless it was already
	 * released (state 2): vput() when it is a distinct vnode that this
	 * function has not unlocked, vrele() otherwise.  It then falls
	 * through to bad.
	 */
1451 bad2:
1452 	if (ni_dvp_unlocked != 2) {
1453 		if (dp != ndp->ni_dvp && !ni_dvp_unlocked)
1454 			vput(ndp->ni_dvp);
1455 		else
1456 			vrele(ndp->ni_dvp);
1457 	}
	/* bad: release dp itself with vput(). */
1458 bad:
1459 	vput(dp);
	/* bad_unlocked: common tail; clear ni_vp and return the error. */
1460 bad_unlocked:
1461 	ndp->ni_vp = NULL;
1462 	return (error);
1463 }
1464 
1465 /*
1466  * relookup - lookup a path name component
1467  *    Used by lookup to re-acquire things.
1468  */
1469 int
1470 vfs_relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
1471     bool refstart)
1472 {
1473 	struct vnode *dp = NULL;		/* the directory we are searching */
1474 	int rdonly;			/* lookup read-only flag bit */
1475 	int error = 0;
1476 
1477 	KASSERT(cnp->cn_flags & ISLASTCN,
1478 	    ("relookup: Not given last component."));
1479 	/*
1480 	 * Setup: break out flag bits into variables.
1481 	 */
1482 	KASSERT((cnp->cn_flags & (LOCKPARENT | WANTPARENT)) != 0,
1483 	    ("relookup: parent not wanted"));
1484 	rdonly = cnp->cn_flags & RDONLY;
1485 	cnp->cn_flags &= ~ISSYMLINK;
1486 	dp = dvp;
1487 	cnp->cn_lkflags = LK_EXCLUSIVE;
1488 	vn_lock(dp, LK_EXCLUSIVE | LK_RETRY);
1489 
1490 	/*
1491 	 * Search a new directory.
1492 	 *
1493 	 * See a comment in vfs_lookup for cnp->cn_nameptr.
1494 	 *
1495 	 * Check for "" which represents the root directory after slash
1496 	 * removal.
1497 	 */
1498 	if (cnp->cn_nameptr[0] == '\0') {
1499 		/*
1500 		 * Support only LOOKUP for "/" because lookup()
1501 		 * can't succeed for CREATE, DELETE and RENAME.
1502 		 */
1503 		KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP"));
1504 		KASSERT(dp->v_type == VDIR, ("dp is not a directory"));
1505 
1506 		if (!(cnp->cn_flags & LOCKLEAF))
1507 			VOP_UNLOCK(dp);
1508 		*vpp = dp;
1509 		/* XXX This should probably move to the top of function. */
1510 		if (refstart)
1511 			panic("lookup: SAVESTART");
1512 		return (0);
1513 	}
1514 
1515 	if (cnp->cn_flags & ISDOTDOT)
1516 		panic ("relookup: lookup on dot-dot");
1517 
1518 	/*
1519 	 * We now have a segment name to search for, and a directory to search.
1520 	 */
1521 	if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) {
1522 		KASSERT(*vpp == NULL, ("leaf should be empty"));
1523 		if (error != EJUSTRETURN)
1524 			goto bad;
1525 		/*
1526 		 * If creating and at end of pathname, then can consider
1527 		 * allowing file to be created.
1528 		 */
1529 		if (rdonly) {
1530 			error = EROFS;
1531 			goto bad;
1532 		}
1533 		/* ASSERT(dvp == ndp->ni_startdir) */
1534 		if (refstart)
1535 			VREF(dvp);
1536 		if ((cnp->cn_flags & LOCKPARENT) == 0)
1537 			VOP_UNLOCK(dp);
1538 		/*
1539 		 * We return with ni_vp NULL to indicate that the entry
1540 		 * doesn't currently exist, leaving a pointer to the
1541 		 * (possibly locked) directory vnode in ndp->ni_dvp.
1542 		 */
1543 		return (0);
1544 	}
1545 
1546 	dp = *vpp;
1547 
1548 	/*
1549 	 * Disallow directory write attempts on read-only filesystems.
1550 	 */
1551 	if (rdonly &&
1552 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
1553 		if (dvp == dp)
1554 			vrele(dvp);
1555 		else
1556 			vput(dvp);
1557 		error = EROFS;
1558 		goto bad;
1559 	}
1560 	/*
1561 	 * Set the parent lock/ref state to the requested state.
1562 	 */
1563 	if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp)
1564 		VOP_UNLOCK(dvp);
1565 	/*
1566 	 * Check for symbolic link
1567 	 */
1568 	KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW),
1569 	    ("relookup: symlink found.\n"));
1570 
1571 	/* ASSERT(dvp == ndp->ni_startdir) */
1572 	if (refstart)
1573 		VREF(dvp);
1574 
1575 	if ((cnp->cn_flags & LOCKLEAF) == 0)
1576 		VOP_UNLOCK(dp);
1577 	return (0);
1578 bad:
1579 	vput(dp);
1580 	*vpp = NULL;
1581 	return (error);
1582 }
1583 
#ifdef INVARIANTS
/*
 * Sanity-check ndp once a lookup has finished: the component name must
 * still carry a pathname buffer.  Panics, naming the calling line, when
 * cn_pnbuf is NULL.
 */
static void
NDVALIDATE_impl(struct nameidata *ndp, int line)
{

	if (ndp->ni_cnd.cn_pnbuf != NULL)
		return;
	panic("%s: got no buf! called from %d", __func__, line);
}

#endif
1599