xref: /dragonfly/sys/kern/vfs_syscalls.c (revision 493fd20c)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35  * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
36  */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/buf.h>
41 #include <sys/conf.h>
42 #include <sys/sysent.h>
43 #include <sys/malloc.h>
44 #include <sys/mount.h>
45 #include <sys/mountctl.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/kernel.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/linker.h>
52 #include <sys/stat.h>
53 #include <sys/unistd.h>
54 #include <sys/vnode.h>
55 #include <sys/proc.h>
56 #include <sys/priv.h>
57 #include <sys/jail.h>
58 #include <sys/namei.h>
59 #include <sys/nlookup.h>
60 #include <sys/dirent.h>
61 #include <sys/extattr.h>
62 #include <sys/spinlock.h>
63 #include <sys/kern_syscall.h>
64 #include <sys/objcache.h>
65 #include <sys/sysctl.h>
66 
67 #include <sys/buf2.h>
68 #include <sys/file2.h>
69 #include <sys/spinlock2.h>
70 
71 #include <vm/vm.h>
72 #include <vm/vm_object.h>
73 #include <vm/vm_page.h>
74 
75 #include <machine/limits.h>
76 #include <machine/stdarg.h>
77 
78 static void mount_warning(struct mount *mp, const char *ctl, ...)
79 		__printflike(2, 3);
80 static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb);
81 static int checkvp_chdir (struct vnode *vn, struct thread *td);
82 static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch);
83 static int chroot_refuse_vdir_fds (thread_t td, struct filedesc *fdp);
84 static int chroot_visible_mnt(struct mount *mp, struct proc *p);
85 static int getutimes (struct timeval *, struct timespec *);
86 static int getutimens (const struct timespec *, struct timespec *, int *);
87 static int setfown (struct mount *, struct vnode *, uid_t, gid_t);
88 static int setfmode (struct vnode *, int);
89 static int setfflags (struct vnode *, int);
90 static int setutimes (struct vnode *, struct vattr *,
91 			const struct timespec *, int);
92 
93 static int	usermount = 0;	/* if 1, non-root can mount fs. */
94 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
95     "Allow non-root users to mount filesystems");
96 
97 static int	debug_unmount = 0; /* if 1 loop until unmount success */
98 SYSCTL_INT(_vfs, OID_AUTO, debug_unmount, CTLFLAG_RW, &debug_unmount, 0,
99     "Stall failed unmounts in loop");
100 /*
101  * Virtual File System System Calls
102  */
103 
104 /*
105  * Mount a file system.
106  *
107  * mount_args(char *type, char *path, int flags, caddr_t data)
108  *
109  * MPALMOSTSAFE
110  */
int
sys_mount(struct mount_args *uap)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct nchandle nch;
	struct mount *mp, *nullmp;
	struct vfsconf *vfsp;
	int error, flag = 0, flag2 = 0;
	int hasmount;
	struct vattr va;
	struct nlookupdata nd;
	char fstypename[MFSNAMELEN];
	struct ucred *cred;

	cred = td->td_ucred;

	/* Mounting is never permitted from within a jail. */
	if (jailed(cred)) {
		error = EPERM;
		goto done;
	}
	if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
		goto done;

	/*
	 * Do not allow NFS export by non-root users.
	 */
	if (uap->flags & MNT_EXPORTED) {
		error = priv_check(td, PRIV_ROOT);
		if (error)
			goto done;
	}
	/*
	 * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
	 */
	if (priv_check(td, PRIV_ROOT))
		uap->flags |= MNT_NOSUID | MNT_NODEV;

	/*
	 * Lookup the requested path and extract the nch and vnode.
	 */
	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0) {
		if ((error = nlookup(&nd)) == 0) {
			if (nd.nl_nch.ncp->nc_vp == NULL)
				error = ENOENT;
		}
	}
	if (error) {
		nlookup_done(&nd);
		goto done;
	}

	/*
	 * If the target filesystem is resolved via a nullfs mount, then
	 * nd.nl_nch.mount will be pointing to the nullfs mount structure
	 * instead of the target file system. We need it in case we are
	 * doing an update.
	 */
	nullmp = nd.nl_nch.mount;

	/*
	 * Extract the locked+refd ncp and cleanup the nd structure
	 */
	nch = nd.nl_nch;
	cache_zero(&nd.nl_nch);
	nlookup_done(&nd);

	/*
	 * Detect whether something is already mounted on this ncp so the
	 * non-update paths below can refuse to stack a second mount.
	 */
	if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
	    (mp = cache_findmount(&nch)) != NULL) {
		cache_dropmount(mp);
		hasmount = 1;
	} else {
		hasmount = 0;
	}


	/*
	 * now we have the locked ref'd nch and unreferenced vnode.
	 */
	vp = nch.ncp->nc_vp;
	if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
		cache_put(&nch);
		goto done;
	}
	cache_unlock(&nch);

	/*
	 * Extract the file system type. We need to know this early, to take
	 * appropriate actions if we are dealing with a nullfs.
	 */
        if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0) {
                cache_drop(&nch);
                vput(vp);
		goto done;
        }

	/*
	 * Now we have an unlocked ref'd nch and a locked ref'd vp
	 *
	 * Every error path from here on must release both: cache_drop(&nch)
	 * and vput(vp).
	 */
	if (uap->flags & MNT_UPDATE) {
		if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
			cache_drop(&nch);
			vput(vp);
			error = EINVAL;
			goto done;
		}

		if (strncmp(fstypename, "null", 5) == 0) {
			KKASSERT(nullmp);
			mp = nullmp;
		} else {
			mp = vp->v_mount;
		}

		/* Save flags so a failed update can be rolled back below. */
		flag = mp->mnt_flag;
		flag2 = mp->mnt_kern_flag;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((uap->flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			cache_drop(&nch);
			vput(vp);
			error = EOPNOTSUPP;	/* Needs translation */
			goto done;
		}
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_stat.f_owner != cred->cr_uid &&
		    (error = priv_check(td, PRIV_ROOT))) {
			cache_drop(&nch);
			vput(vp);
			goto done;
		}
		if (vfs_busy(mp, LK_NOWAIT)) {
			cache_drop(&nch);
			vput(vp);
			error = EBUSY;
			goto done;
		}
		if (hasmount) {
			cache_drop(&nch);
			vfs_unbusy(mp);
			vput(vp);
			error = EBUSY;
			goto done;
		}
		mp->mnt_flag |=
		    uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
		lwkt_gettoken(&mp->mnt_token);
		vn_unlock(vp);
		vfsp = mp->mnt_vfc;
		goto update;
	}

	/*
	 * If the user is not root, ensure that they own the directory
	 * onto which we are attempting to mount.
	 */
	if ((error = VOP_GETATTR(vp, &va)) ||
	    (va.va_uid != cred->cr_uid &&
	     (error = priv_check(td, PRIV_ROOT)))) {
		cache_drop(&nch);
		vput(vp);
		goto done;
	}
	if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
		cache_drop(&nch);
		vput(vp);
		goto done;
	}
	if (vp->v_type != VDIR) {
		cache_drop(&nch);
		vput(vp);
		error = ENOTDIR;
		goto done;
	}
	if (vp->v_mount->mnt_kern_flag & MNTK_NOSTKMNT) {
		cache_drop(&nch);
		vput(vp);
		error = EPERM;
		goto done;
	}
	vfsp = vfsconf_find_by_name(fstypename);
	if (vfsp == NULL) {
		linker_file_t lf;

		/* Only load modules for root (very important!) */
		if ((error = priv_check(td, PRIV_ROOT)) != 0) {
			cache_drop(&nch);
			vput(vp);
			goto done;
		}
		error = linker_load_file(fstypename, &lf);
		if (error || lf == NULL) {
			cache_drop(&nch);
			vput(vp);
			if (lf == NULL)
				error = ENODEV;
			goto done;
		}
		lf->userrefs++;
		/* lookup again, see if the VFS was loaded */
		vfsp = vfsconf_find_by_name(fstypename);
		if (vfsp == NULL) {
			lf->userrefs--;
			linker_file_unload(lf);
			cache_drop(&nch);
			vput(vp);
			error = ENODEV;
			goto done;
		}
	}
	if (hasmount) {
		cache_drop(&nch);
		vput(vp);
		error = EBUSY;
		goto done;
	}

	/*
	 * Allocate and initialize the filesystem.
	 */
	mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
	mount_init(mp);
	vfs_busy(mp, LK_NOWAIT);
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vfc = vfsp;
	mp->mnt_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_owner = cred->cr_uid;
	lwkt_gettoken(&mp->mnt_token);
	vn_unlock(vp);
update:
	/*
	 * (per-mount token acquired at this point)
	 *
	 * Set the mount level flags.
	 */
	if (uap->flags & MNT_RDONLY)
		mp->mnt_flag |= MNT_RDONLY;
	else if (mp->mnt_flag & MNT_RDONLY)
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_ASYNC | MNT_NOATIME |
	    MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_TRIM |
	    MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR |
	    MNT_AUTOMOUNTED);
	mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC |
	    MNT_NODEV | MNT_SYNCHRONOUS | MNT_ASYNC | MNT_FORCE |
	    MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_TRIM |
	    MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR |
	    MNT_AUTOMOUNTED);

	/*
	 * Pre-set the mount's ALL_MPSAFE flags if specified in the vfsconf.
	 * This way the initial VFS_MOUNT() call will also be MPSAFE.
	 */
	if (vfsp->vfc_flags & VFCF_MPSAFE)
		mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;

	/*
	 * Mount the filesystem.
	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
	 * get.
	 */
	error = VFS_MOUNT(mp, uap->path, uap->data, cred);
	if (mp->mnt_flag & MNT_UPDATE) {
		/*
		 * Update path: clear the transient update flags and restore
		 * the saved flags on failure, then release everything.
		 */
		if (mp->mnt_kern_flag & MNTK_WANTRDWR)
			mp->mnt_flag &= ~MNT_RDONLY;
		mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
		if (error) {
			mp->mnt_flag = flag;
			mp->mnt_kern_flag = flag2;
		}
		lwkt_reltoken(&mp->mnt_token);
		vfs_unbusy(mp);
		vrele(vp);
		cache_drop(&nch);
		goto done;
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	/*
	 * Put the new filesystem on the mount list after root.  The mount
	 * point gets its own mnt_ncmountpt (unless the VFS already set one
	 * up) which represents the root of the mount.  The lookup code
	 * detects the mount point going forward and checks the root of
	 * the mount going backwards.
	 *
	 * It is not necessary to invalidate or purge the vnode underneath
	 * because elements under the mount will be given their own glue
	 * namecache record.
	 */
	if (!error) {
		if (mp->mnt_ncmountpt.ncp == NULL) {
			/*
			 * Allocate, then unlock, but leave the ref intact.
			 * This is the mnt_refs (1) that we will retain
			 * through to the unmount.
			 */
			cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
			cache_unlock(&mp->mnt_ncmountpt);
		}
		vn_unlock(vp);
		mp->mnt_ncmounton = nch;		/* inherits ref */
		cache_lock(&nch);
		nch.ncp->nc_flag |= NCF_ISMOUNTPT;
		cache_unlock(&nch);
		cache_ismounting(mp);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

		mountlist_insert(mp, MNTINS_LAST);
		vn_unlock(vp);
		checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
		error = vfs_allocate_syncvnode(mp);
		lwkt_reltoken(&mp->mnt_token);
		vfs_unbusy(mp);
		/*
		 * NOTE(review): any error from vfs_allocate_syncvnode()
		 * above is overwritten here by the VFS_START() result —
		 * confirm this is intentional.
		 */
		error = VFS_START(mp, 0);
		vrele(vp);
		KNOTE(&fs_klist, VQ_MOUNT);
	} else {
		/*
		 * VFS_MOUNT() failed for a fresh mount: tear down the
		 * partially constructed mount structure.
		 */
		vn_syncer_thr_stop(mp);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
		mp->mnt_vfc->vfc_refcount--;
		lwkt_reltoken(&mp->mnt_token);
		vfs_unbusy(mp);
		kfree(mp, M_MOUNT);
		cache_drop(&nch);
		vput(vp);
	}
done:
	return (error);
}
456 
457 /*
458  * Scan all active processes to see if any of them have a current
459  * or root directory onto which the new filesystem has just been
460  * mounted. If so, replace them with the new mount point.
461  *
462  * Both old_nch and new_nch are ref'd on call but not locked.
463  * new_nch must be temporarily locked so it can be associated with the
464  * vnode representing the root of the mount point.
465  */
struct checkdirs_info {
	struct nchandle old_nch;	/* mount point being covered */
	struct nchandle new_nch;	/* root of the covering mount */
	struct vnode *old_vp;		/* NOTE(review): never assigned or
					 * read in the visible code — looks
					 * vestigial; confirm before removal */
	struct vnode *new_vp;		/* resolved root vnode of new mount */
};
472 
473 static int checkdirs_callback(struct proc *p, void *data);
474 
static void
checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
{
	struct checkdirs_info info;
	struct vnode *olddp;
	struct vnode *newdp;
	struct mount *mp;

	/*
	 * If the old mount point's vnode has a usecount of 1, it is not
	 * being held as a descriptor anywhere.
	 */
	olddp = old_nch->ncp->nc_vp;
	if (olddp == NULL || VREFCNT(olddp) == 1)
		return;

	/*
	 * Force the root vnode of the new mount point to be resolved
	 * so we can update any matching processes.
	 *
	 * Note the lock juggling: newdp is unlocked before taking the
	 * namecache lock and relocked afterwards to keep the vnode/ncp
	 * lock ordering consistent.
	 */
	mp = new_nch->mount;
	if (VFS_ROOT(mp, &newdp))
		panic("mount: lost mount");
	vn_unlock(newdp);
	cache_lock(new_nch);
	vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY);
	cache_setunresolved(new_nch);
	cache_setvp(new_nch, newdp);
	cache_unlock(new_nch);

	/*
	 * Special handling of the root node
	 */
	if (rootvnode == olddp) {
		vref(newdp);
		vfs_cache_setroot(newdp, cache_hold(new_nch));
	}

	/*
	 * Pass newdp separately so the callback does not have to access
	 * it via new_nch->ncp->nc_vp.
	 *
	 * NOTE(review): info.old_vp is intentionally left unset; the
	 * callback only reads old_nch/new_nch/new_vp.
	 */
	info.old_nch = *old_nch;
	info.new_nch = *new_nch;
	info.new_vp = newdp;
	allproc_scan(checkdirs_callback, &info, 0);
	vput(newdp);
}
523 
524 /*
525  * NOTE: callback is not MP safe because the scanned process's filedesc
526  * structure can be ripped out from under us, amoung other things.
527  */
static int
checkdirs_callback(struct proc *p, void *data)
{
	struct checkdirs_info *info = data;
	struct filedesc *fdp;
	struct nchandle ncdrop1;	/* deferred cdir nch release */
	struct nchandle ncdrop2;	/* deferred rdir nch release */
	struct vnode *vprele1;		/* deferred cdir vnode release */
	struct vnode *vprele2;		/* deferred rdir vnode release */

	if ((fdp = p->p_fd) != NULL) {
		cache_zero(&ncdrop1);
		cache_zero(&ncdrop2);
		vprele1 = NULL;
		vprele2 = NULL;

		/*
		 * MPUNSAFE - XXX fdp can be pulled out from under a
		 * foreign process.
		 *
		 * A shared filedesc is ok, we don't have to copy it
		 * because we are making this change globally.
		 */
		spin_lock(&fdp->fd_spin);
		/* Swap in the new cwd if it matched the covered mount point. */
		if (fdp->fd_ncdir.mount == info->old_nch.mount &&
		    fdp->fd_ncdir.ncp == info->old_nch.ncp) {
			vprele1 = fdp->fd_cdir;
			vref(info->new_vp);
			fdp->fd_cdir = info->new_vp;
			ncdrop1 = fdp->fd_ncdir;
			cache_copy(&info->new_nch, &fdp->fd_ncdir);
		}
		/* Likewise for the process root directory. */
		if (fdp->fd_nrdir.mount == info->old_nch.mount &&
		    fdp->fd_nrdir.ncp == info->old_nch.ncp) {
			vprele2 = fdp->fd_rdir;
			vref(info->new_vp);
			fdp->fd_rdir = info->new_vp;
			ncdrop2 = fdp->fd_nrdir;
			cache_copy(&info->new_nch, &fdp->fd_nrdir);
		}
		spin_unlock(&fdp->fd_spin);

		/*
		 * The actual releases are deferred until after the spinlock
		 * is dropped because cache_drop()/vrele() can block.
		 */
		if (ncdrop1.ncp)
			cache_drop(&ncdrop1);
		if (ncdrop2.ncp)
			cache_drop(&ncdrop2);
		if (vprele1)
			vrele(vprele1);
		if (vprele2)
			vrele(vprele2);
	}
	return(0);
}
580 
581 /*
582  * Unmount a file system.
583  *
584  * Note: unmount takes a path to the vnode mounted on as argument,
585  * not special file (as before).
586  *
587  * umount_args(char *path, int flags)
588  *
589  * MPALMOSTSAFE
590  */
int
sys_unmount(struct unmount_args *uap)
{
	struct thread *td = curthread;
	struct proc *p __debugvar = td->td_proc;
	struct mount *mp = NULL;
	struct nlookupdata nd;
	int error;

	KKASSERT(p);

	/* No unmounting from inside a jail. */
	if (td->td_ucred->cr_prison != NULL) {
		error = EPERM;
		goto done;
	}
	if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
		goto done;

	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error)
		goto out;

	mp = nd.nl_nch.mount;

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to unmount this filesystem.
	 */
	if ((mp->mnt_stat.f_owner != td->td_ucred->cr_uid) &&
	    (error = priv_check(td, PRIV_ROOT)))
		goto out;

	/*
	 * Don't allow unmounting the root file system.
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Must be the root of the filesystem
	 */
	if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
		error = EINVAL;
		goto out;
	}

	/*
	 * If no error try to issue the unmount.  We lose our cache
	 * ref when we call nlookup_done so we must hold the mount point
	 * to prevent use-after-free races.
	 */
out:
	if (error == 0) {
		mount_hold(mp);
		nlookup_done(&nd);
		error = dounmount(mp, uap->flags, 0);
		mount_drop(mp);
	} else {
		nlookup_done(&nd);
	}
done:
	return (error);
}
657 
658 /*
659  * Do the actual file system unmount (interlocked against the mountlist
660  * token and mp->mnt_token).
661  */
662 static int
663 dounmount_interlock(struct mount *mp)
664 {
665 	if (mp->mnt_kern_flag & MNTK_UNMOUNT)
666 		return (EBUSY);
667 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
668 	return(0);
669 }
670 
671 static int
672 unmount_allproc_cb(struct proc *p, void *arg)
673 {
674 	struct mount *mp;
675 
676 	if (p->p_textnch.ncp == NULL)
677 		return 0;
678 
679 	mp = (struct mount *)arg;
680 	if (p->p_textnch.mount == mp)
681 		cache_drop(&p->p_textnch);
682 
683 	return 0;
684 }
685 
686 /*
687  * The guts of the unmount code.  The mount owns one ref and one hold
688  * count.  If we successfully interlock the unmount, those refs are ours.
689  * (The ref is from mnt_ncmountpt).
690  *
691  * When halting we shortcut certain mount types such as devfs by not actually
692  * issuing the VFS_SYNC() or VFS_UNMOUNT().  They are still disconnected
693  * from the mountlist so higher-level filesytems can unmount cleanly.
694  *
695  * The mount types that allow QUICKHALT are: devfs, tmpfs, procfs.
696  */
int
dounmount(struct mount *mp, int flags, int halting)
{
	struct namecache *ncp;
	struct nchandle nch;
	struct vnode *vp;
	int error;
	int async_flag;		/* saved MNT_ASYNC for restore on failure */
	int lflags;
	int freeok = 1;		/* ok to free mp at the end */
	int hadsyncer = 0;	/* syncer vnode existed; restore on failure */
	int retry;
	int quickhalt;

	lwkt_gettoken(&mp->mnt_token);

	/*
	 * When halting, certain mount points can essentially just
	 * be unhooked and otherwise ignored.
	 */
	if (halting && (mp->mnt_kern_flag & MNTK_QUICKHALT)) {
		quickhalt = 1;
		freeok = 0;
	} else {
		quickhalt = 0;
	}


	/*
	 * Exclusive access for unmounting purposes.
	 */
	if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
		goto out;

	/*
	 * We now 'own' the last mp->mnt_refs
	 *
	 * Allow filesystems to detect that a forced unmount is in progress.
	 */
	if (flags & MNT_FORCE)
		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
	lflags = LK_EXCLUSIVE | ((flags & MNT_FORCE) ? 0 : LK_TIMELOCK);
	error = lockmgr(&mp->mnt_lock, lflags);
	if (error) {
		/* Lock failed: back out the interlock and wake waiters. */
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
		if (mp->mnt_kern_flag & MNTK_MWAIT) {
			mp->mnt_kern_flag &= ~MNTK_MWAIT;
			wakeup(mp);
		}
		goto out;
	}

	if (mp->mnt_flag & MNT_EXPUBLIC)
		vfs_setpublicfs(NULL, NULL, NULL);

	vfs_msync(mp, MNT_WAIT);
	async_flag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &=~ MNT_ASYNC;

	/*
	 * Decomission our special mnt_syncer vnode.  This also stops
	 * the vnlru code.  If we are unable to unmount we recommission
	 * the vnode.
	 *
	 * Then sync the filesystem.
	 */
	if ((vp = mp->mnt_syncer) != NULL) {
		mp->mnt_syncer = NULL;
		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
		vrele(vp);
		hadsyncer = 1;
	}

	/*
	 * Sync normally-mounted filesystem.
	 */
	if (quickhalt == 0) {
		if ((mp->mnt_flag & MNT_RDONLY) == 0)
			VFS_SYNC(mp, MNT_WAIT);
	}

	/*
	 * nchandle records ref the mount structure.  Expect a count of 1
	 * (our mount->mnt_ncmountpt).
	 *
	 * Scans can get temporary refs on a mountpoint (thought really
	 * heavy duty stuff like cache_findmount() do not).
	 */
	for (retry = 0; (retry < 10 || debug_unmount); ++retry) {
		/*
		 * Invalidate the namecache topology under the mount.
		 * nullfs mounts alias a real mount's namecache topology
		 * and it should not be invalidated in that case.
		 */
		if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
			cache_lock(&mp->mnt_ncmountpt);
			cache_inval(&mp->mnt_ncmountpt,
				    CINV_DESTROY|CINV_CHILDREN);
			cache_unlock(&mp->mnt_ncmountpt);
		}

		/*
		 * Clear pcpu caches
		 */
		cache_unmounting(mp);
		if (mp->mnt_refs != 1)
			cache_clearmntcache();

		/*
		 * Break out if we are good.  Don't count ncp refs if the
		 * mount is aliased.
		 */
		ncp = (mp->mnt_kern_flag & MNTK_NCALIASED) ?
		      NULL : mp->mnt_ncmountpt.ncp;
		if (mp->mnt_refs == 1 &&
		    (ncp == NULL || (ncp->nc_refs == 1 &&
				     TAILQ_FIRST(&ncp->nc_list) == NULL))) {
			break;
		}

		/*
		 * If forcing the unmount, clean out any p->p_textnch
		 * nchandles that match this mount.
		 */
		if (flags & MNT_FORCE)
			allproc_scan(&unmount_allproc_cb, mp, 0);

		/*
		 * Sleep and retry.
		 */
		tsleep(&mp->mnt_refs, 0, "mntbsy", hz / 10 + 1);
		if ((retry & 15) == 15) {
			mount_warning(mp,
				      "(%p) debug - retry %d, "
				      "%d namecache refs, %d mount refs",
				      mp, retry,
				      (ncp ? ncp->nc_refs - 1 : 0),
				      mp->mnt_refs - 1);
		}
	}

	/*
	 * Final check: if refs are still outstanding the unmount fails
	 * unless it is being forced (in which case mp must not be freed).
	 */
	error = 0;
	ncp = (mp->mnt_kern_flag & MNTK_NCALIASED) ?
	      NULL : mp->mnt_ncmountpt.ncp;
	if (mp->mnt_refs != 1 ||
	    (ncp != NULL && (ncp->nc_refs != 1 ||
			     TAILQ_FIRST(&ncp->nc_list)))) {
		mount_warning(mp,
			      "(%p): %d namecache refs, %d mount refs "
			      "still present",
			      mp,
			      (ncp ? ncp->nc_refs - 1 : 0),
			      mp->mnt_refs - 1);
		if (flags & MNT_FORCE) {
			freeok = 0;
			mount_warning(mp, "forcing unmount\n");
		} else {
			error = EBUSY;
		}
	}

	/*
	 * So far so good, sync the filesystem once more and
	 * call the VFS unmount code if the sync succeeds.
	 */
	if (error == 0 && quickhalt == 0) {
		if (mp->mnt_flag & MNT_RDONLY) {
			error = VFS_UNMOUNT(mp, flags);
		} else {
			error = VFS_SYNC(mp, MNT_WAIT);
			if ((error == 0) ||
			    (error == EOPNOTSUPP) || /* No sync */
			    (flags & MNT_FORCE)) {
				error = VFS_UNMOUNT(mp, flags);
			}
		}
		if (error) {
			mount_warning(mp,
				      "(%p) unmount: vfs refused to unmount, "
				      "error %d",
				      mp, error);
		}
	}

	/*
	 * If an error occurred we can still recover, restoring the
	 * syncer vnode and misc flags.
	 */
	if (error) {
		if (mp->mnt_syncer == NULL && hadsyncer)
			vfs_allocate_syncvnode(mp);
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
		mp->mnt_flag |= async_flag;
		lockmgr(&mp->mnt_lock, LK_RELEASE);
		if (mp->mnt_kern_flag & MNTK_MWAIT) {
			mp->mnt_kern_flag &= ~MNTK_MWAIT;
			wakeup(mp);
		}
		goto out;
	}
	/*
	 * Clean up any journals still associated with the mount after
	 * filesystem activity has ceased.
	 */
	journal_remove_all_journals(mp,
	    ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));

	mountlist_remove(mp);

	/*
	 * Remove any installed vnode ops here so the individual VFSs don't
	 * have to.
	 *
	 * mnt_refs should go to zero when we scrap mnt_ncmountpt.
	 *
	 * When quickhalting we have to keep these intact because the
	 * underlying vnodes have not been destroyed, and some might be
	 * dirty.
	 */
	if (quickhalt == 0) {
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
	}

	/* Disconnect the mount-root and mounted-on namecache handles. */
	if (mp->mnt_ncmountpt.ncp != NULL) {
		nch = mp->mnt_ncmountpt;
		cache_zero(&mp->mnt_ncmountpt);
		cache_clrmountpt(&nch);
		cache_drop(&nch);
	}
	if (mp->mnt_ncmounton.ncp != NULL) {
		cache_unmounting(mp);
		nch = mp->mnt_ncmounton;
		cache_zero(&mp->mnt_ncmounton);
		cache_clrmountpt(&nch);
		cache_drop(&nch);
	}

	mp->mnt_vfc->vfc_refcount--;

	/*
	 * If not quickhalting the mount, we expect there to be no
	 * vnodes left.
	 */
	if (quickhalt == 0 && !TAILQ_EMPTY(&mp->mnt_nvnodelist))
		panic("unmount: dangling vnode");

	/*
	 * Release the lock
	 */
	lockmgr(&mp->mnt_lock, LK_RELEASE);
	if (mp->mnt_kern_flag & MNTK_MWAIT) {
		mp->mnt_kern_flag &= ~MNTK_MWAIT;
		wakeup(mp);
	}

	/*
	 * If we reach here and freeok != 0 we must free the mount.
	 * mnt_refs should already have dropped to 0, so if it is not
	 * zero we must cycle the caches and wait.
	 *
	 * When we are satisfied that the mount has disconnected we can
	 * drop the hold on the mp that represented the mount (though the
	 * caller might actually have another, so the caller's drop may
	 * do the actual free).
	 */
	if (freeok) {
		if (mp->mnt_refs > 0)
			cache_clearmntcache();
		while (mp->mnt_refs > 0) {
			cache_unmounting(mp);
			wakeup(mp);
			tsleep(&mp->mnt_refs, 0, "umntrwait", hz / 10 + 1);
			cache_clearmntcache();
		}
		lwkt_reltoken(&mp->mnt_token);
		mount_drop(mp);
		mp = NULL;	/* mp may be freed; don't touch it below */
	} else {
		cache_clearmntcache();
	}
	error = 0;
	KNOTE(&fs_klist, VQ_UNMOUNT);
out:
	if (mp)
		lwkt_reltoken(&mp->mnt_token);
	return (error);
}
988 
989 static
990 void
991 mount_warning(struct mount *mp, const char *ctl, ...)
992 {
993 	char *ptr;
994 	char *buf;
995 	__va_list va;
996 
997 	__va_start(va, ctl);
998 	if (cache_fullpath(NULL, &mp->mnt_ncmounton, NULL,
999 			   &ptr, &buf, 0) == 0) {
1000 		kprintf("unmount(%s): ", ptr);
1001 		kvprintf(ctl, va);
1002 		kprintf("\n");
1003 		kfree(buf, M_TEMP);
1004 	} else {
1005 		kprintf("unmount(%p", mp);
1006 		if (mp->mnt_ncmounton.ncp && mp->mnt_ncmounton.ncp->nc_name)
1007 			kprintf(",%s", mp->mnt_ncmounton.ncp->nc_name);
1008 		kprintf("): ");
1009 		kvprintf(ctl, va);
1010 		kprintf("\n");
1011 	}
1012 	__va_end(va);
1013 }
1014 
1015 /*
1016  * Shim cache_fullpath() to handle the case where a process is chrooted into
1017  * a subdirectory of a mount.  In this case if the root mount matches the
1018  * process root directory's mount we have to specify the process's root
1019  * directory instead of the mount point, because the mount point might
1020  * be above the root directory.
1021  */
1022 static
1023 int
1024 mount_path(struct proc *p, struct mount *mp, char **rb, char **fb)
1025 {
1026 	struct nchandle *nch;
1027 
1028 	if (p && p->p_fd->fd_nrdir.mount == mp)
1029 		nch = &p->p_fd->fd_nrdir;
1030 	else
1031 		nch = &mp->mnt_ncmountpt;
1032 	return(cache_fullpath(p, nch, NULL, rb, fb, 0));
1033 }
1034 
1035 /*
1036  * Sync each mounted filesystem.
1037  */
1038 
1039 #ifdef DEBUG
1040 static int syncprt = 0;
1041 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
1042 #endif /* DEBUG */
1043 
1044 static int sync_callback(struct mount *mp, void *data);
1045 
int
sys_sync(struct sync_args *uap)
{
	/* Walk the mountlist front-to-back, syncing each r/w filesystem. */
	mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
	return (0);
}
1052 
1053 static
1054 int
1055 sync_callback(struct mount *mp, void *data __unused)
1056 {
1057 	int asyncflag;
1058 
1059 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1060 		lwkt_gettoken(&mp->mnt_token);
1061 		asyncflag = mp->mnt_flag & MNT_ASYNC;
1062 		mp->mnt_flag &= ~MNT_ASYNC;
1063 		lwkt_reltoken(&mp->mnt_token);
1064 		vfs_msync(mp, MNT_NOWAIT);
1065 		VFS_SYNC(mp, MNT_NOWAIT);
1066 		lwkt_gettoken(&mp->mnt_token);
1067 		mp->mnt_flag |= asyncflag;
1068 		lwkt_reltoken(&mp->mnt_token);
1069 	}
1070 	return(0);
1071 }
1072 
1073 /* XXX PRISON: could be per prison flag */
1074 static int prison_quotas;
1075 #if 0
1076 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
1077 #endif
1078 
1079 /*
1080  *  quotactl_args(char *path, int fcmd, int uid, caddr_t arg)
1081  *
1082  * Change filesystem quotas.
1083  *
1084  * MPALMOSTSAFE
1085  */
int
sys_quotactl(struct quotactl_args *uap)
{
	struct nlookupdata nd;
	struct thread *td;
	struct mount *mp;
	int error;

	td = curthread;
	/* jailed processes may not manage quotas unless prison_quotas is set */
	if (td->td_ucred->cr_prison && !prison_quotas) {
		error = EPERM;
		goto done;
	}

	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0) {
		/* dispatch to the mount containing the resolved path */
		mp = nd.nl_nch.mount;
		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
				    uap->arg, nd.nl_cred);
	}
	nlookup_done(&nd);
done:
	return (error);
}
1112 
1113 /*
1114  * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
1115  *		void *buf, int buflen)
1116  *
1117  * This function operates on a mount point and executes the specified
1118  * operation using the specified control data, and possibly returns data.
1119  *
1120  * The actual number of bytes stored in the result buffer is returned, 0
1121  * if none, otherwise an error is returned.
1122  *
1123  * MPALMOSTSAFE
1124  */
int
sys_mountctl(struct mountctl_args *uap)
{
	struct thread *td = curthread;
	struct file *fp;
	void *ctl = NULL;
	void *buf = NULL;
	char *path = NULL;
	int error;

	/*
	 * Sanity and permissions checks.  We must be root.
	 * (MOUNTCTL_MOUNTFLAGS is the one op allowed without PRIV_ROOT.)
	 */
	if (td->td_ucred->cr_prison != NULL)
		return (EPERM);
	if ((uap->op != MOUNTCTL_MOUNTFLAGS) &&
	    (error = priv_check(td, PRIV_ROOT)) != 0)
		return (error);

	/*
	 * Argument length checks: control data capped at 1KB, result
	 * buffer at 16KB.
	 */
	if (uap->ctllen < 0 || uap->ctllen > 1024)
		return (EINVAL);
	if (uap->buflen < 0 || uap->buflen > 16 * 1024)
		return (EINVAL);
	if (uap->path == NULL)
		return (EINVAL);

	/*
	 * Allocate the necessary buffers and copyin data.  The +1 on the
	 * kmalloc sizes guarantees NUL termination room for string data.
	 */
	path = objcache_get(namei_oc, M_WAITOK);
	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
	if (error)
		goto done;

	if (uap->ctllen) {
		ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
		error = copyin(uap->ctl, ctl, uap->ctllen);
		if (error)
			goto done;
	}
	if (uap->buflen)
		buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);

	/*
	 * Validate the descriptor.  fd is optional; -1 means no file is
	 * associated with the operation.
	 */
	if (uap->fd >= 0) {
		fp = holdfp(td, uap->fd, -1);
		if (fp == NULL) {
			error = EBADF;
			goto done;
		}
	} else {
		fp = NULL;
	}

	/*
	 * Execute the internal kernel function and clean up.  On success
	 * sysmsg_result holds the number of result bytes to copy out.
	 */
	error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen,
			      buf, uap->buflen, &uap->sysmsg_result);
	if (fp)
		dropfp(td, uap->fd, fp);
	if (error == 0 && uap->sysmsg_result > 0)
		error = copyout(buf, uap->buf, uap->sysmsg_result);
done:
	if (path)
		objcache_put(namei_oc, path);
	if (ctl)
		kfree(ctl, M_TEMP);
	if (buf)
		kfree(buf, M_TEMP);
	return (error);
}
1202 
1203 /*
1204  * Execute a mount control operation by resolving the path to a mount point
1205  * and calling vop_mountctl().
1206  *
1207  * Use the mount point from the nch instead of the vnode so nullfs mounts
1208  * can properly spike the VOP.
1209  */
int
kern_mountctl(const char *path, int op, struct file *fp,
		const void *ctl, int ctllen,
		void *buf, int buflen, int *res)
{
	struct vnode *vp;
	struct nlookupdata nd;
	struct nchandle nch;
	struct mount *mp;
	int error;

	*res = 0;
	vp = NULL;
	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
	if (error)
		return (error);
	error = nlookup(&nd);
	if (error) {
		nlookup_done(&nd);
		return (error);
	}
	/* obtain a referenced, locked vnode for the resolved path */
	error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	if (error) {
		nlookup_done(&nd);
		return (error);
	}

	/*
	 * Yes, all this is needed to use the nch.mount below, because
	 * we must maintain a ref on the mount to avoid ripouts (e.g.
	 * due to heavy mount/unmount use by synth or poudriere).
	 */
	nch = nd.nl_nch;
	cache_zero(&nd.nl_nch);
	cache_unlock(&nch);
	nlookup_done(&nd);
	vn_unlock(vp);

	mp = nch.mount;

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
		cache_drop(&nch);
		vrele(vp);
		return (EINVAL);
	}
	/* refuse if the mount is gone or mid-unmount */
	if (mp == NULL || mp->mnt_kern_flag & MNTK_UNMOUNT) {
		kprintf("kern_mountctl: Warning, \"%s\" racing unmount\n",
			path);
		cache_drop(&nch);
		vrele(vp);
		return (EINVAL);
	}
	/* use the nch's mount ops so nullfs overlays can spike the VOP */
	error = vop_mountctl(mp->mnt_vn_use_ops, vp, op, fp, ctl, ctllen,
			     buf, buflen, res);
	vrele(vp);
	cache_drop(&nch);

	return (error);
}
1272 
/*
 * Resolve *nd and copy the statfs information of its mount into *buf.
 * f_mntonname is rewritten relative to the process root via mount_path()
 * so chrooted processes see a sensible path.  The fsid is zeroed for
 * non-root callers.
 */
int
kern_statfs(struct nlookupdata *nd, struct statfs *buf)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct mount *mp;
	struct statfs *sp;
	char *fullpath, *freepath;
	int error;

	if ((error = nlookup(nd)) != 0)
		return (error);
	mp = nd->nl_nch.mount;
	sp = &mp->mnt_stat;
	if ((error = VFS_STATFS(mp, sp, nd->nl_cred)) != 0)
		return (error);

	/* rewrite the mount-on name from the process's point of view */
	error = mount_path(p, mp, &fullpath, &freepath);
	if (error)
		return(error);
	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	kfree(freepath, M_TEMP);

	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	bcopy(sp, buf, sizeof(*buf));
	/* Only root should have access to the fsid's. */
	if (priv_check(td, PRIV_ROOT))
		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
	return (0);
}
1304 
1305 /*
1306  * statfs_args(char *path, struct statfs *buf)
1307  *
1308  * Get filesystem statistics.
1309  */
1310 int
1311 sys_statfs(struct statfs_args *uap)
1312 {
1313 	struct nlookupdata nd;
1314 	struct statfs buf;
1315 	int error;
1316 
1317 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1318 	if (error == 0)
1319 		error = kern_statfs(&nd, &buf);
1320 	nlookup_done(&nd);
1321 	if (error == 0)
1322 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1323 	return (error);
1324 }
1325 
/*
 * statfs the mount underlying open descriptor fd into *buf.  Prefers the
 * namecache handle's mount (overlay-aware) over the vnode's mount.  The
 * fsid is zeroed for non-root callers.
 */
int
kern_fstatfs(int fd, struct statfs *buf)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct mount *mp;
	struct statfs *sp;
	char *fullpath, *freepath;
	int error;

	KKASSERT(p);
	if ((error = holdvnode(td, fd, &fp)) != 0)
		return (error);

	/*
	 * Try to use mount info from any overlays rather than the
	 * mount info for the underlying vnode, otherwise we will
	 * fail when operating on null-mounted paths inside a chroot.
	 */
	if ((mp = fp->f_nchandle.mount) == NULL)
		mp = ((struct vnode *)fp->f_data)->v_mount;
	if (mp == NULL) {
		error = EBADF;
		goto done;
	}
	if (fp->f_cred == NULL) {
		error = EINVAL;
		goto done;
	}
	sp = &mp->mnt_stat;
	if ((error = VFS_STATFS(mp, sp, fp->f_cred)) != 0)
		goto done;

	/* rewrite the mount-on name from the process's point of view */
	if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
		goto done;
	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	kfree(freepath, M_TEMP);

	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	bcopy(sp, buf, sizeof(*buf));

	/* Only root should have access to the fsid's. */
	if (priv_check(td, PRIV_ROOT))
		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
	error = 0;
done:
	fdrop(fp);
	return (error);
}
1377 
1378 /*
1379  * fstatfs_args(int fd, struct statfs *buf)
1380  *
1381  * Get filesystem statistics.
1382  */
1383 int
1384 sys_fstatfs(struct fstatfs_args *uap)
1385 {
1386 	struct statfs buf;
1387 	int error;
1388 
1389 	error = kern_fstatfs(uap->fd, &buf);
1390 
1391 	if (error == 0)
1392 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1393 	return (error);
1394 }
1395 
/*
 * Resolve *nd and copy the statvfs information of its mount into *buf.
 * f_flag is synthesized from the mount flags (ST_RDONLY/ST_NOSUID).
 */
int
kern_statvfs(struct nlookupdata *nd, struct statvfs *buf)
{
	struct mount *mp;
	struct statvfs *sp;
	int error;

	if ((error = nlookup(nd)) != 0)
		return (error);
	mp = nd->nl_nch.mount;
	sp = &mp->mnt_vstat;
	if ((error = VFS_STATVFS(mp, sp, nd->nl_cred)) != 0)
		return (error);

	/* derive the POSIX f_flag bits from the mount flags */
	sp->f_flag = 0;
	if (mp->mnt_flag & MNT_RDONLY)
		sp->f_flag |= ST_RDONLY;
	if (mp->mnt_flag & MNT_NOSUID)
		sp->f_flag |= ST_NOSUID;
	bcopy(sp, buf, sizeof(*buf));
	return (0);
}
1418 
1419 /*
1420  * statfs_args(char *path, struct statfs *buf)
1421  *
1422  * Get filesystem statistics.
1423  */
1424 int
1425 sys_statvfs(struct statvfs_args *uap)
1426 {
1427 	struct nlookupdata nd;
1428 	struct statvfs buf;
1429 	int error;
1430 
1431 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1432 	if (error == 0)
1433 		error = kern_statvfs(&nd, &buf);
1434 	nlookup_done(&nd);
1435 	if (error == 0)
1436 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1437 	return (error);
1438 }
1439 
/*
 * statvfs the mount underlying open descriptor fd into *buf.  Prefers the
 * namecache handle's mount (overlay-aware) over the vnode's mount, same
 * as kern_fstatfs().
 */
int
kern_fstatvfs(int fd, struct statvfs *buf)
{
	struct thread *td = curthread;
	struct file *fp;
	struct mount *mp;
	struct statvfs *sp;
	int error;

	if ((error = holdvnode(td, fd, &fp)) != 0)
		return (error);
	/* prefer overlay mount info; fall back to the vnode's mount */
	if ((mp = fp->f_nchandle.mount) == NULL)
		mp = ((struct vnode *)fp->f_data)->v_mount;
	if (mp == NULL) {
		error = EBADF;
		goto done;
	}
	if (fp->f_cred == NULL) {
		error = EINVAL;
		goto done;
	}
	sp = &mp->mnt_vstat;
	if ((error = VFS_STATVFS(mp, sp, fp->f_cred)) != 0)
		goto done;

	/* derive the POSIX f_flag bits from the mount flags */
	sp->f_flag = 0;
	if (mp->mnt_flag & MNT_RDONLY)
		sp->f_flag |= ST_RDONLY;
	if (mp->mnt_flag & MNT_NOSUID)
		sp->f_flag |= ST_NOSUID;

	bcopy(sp, buf, sizeof(*buf));
	error = 0;
done:
	fdrop(fp);
	return (error);
}
1477 
1478 /*
1479  * fstatfs_args(int fd, struct statfs *buf)
1480  *
1481  * Get filesystem statistics.
1482  */
1483 int
1484 sys_fstatvfs(struct fstatvfs_args *uap)
1485 {
1486 	struct statvfs buf;
1487 	int error;
1488 
1489 	error = kern_fstatvfs(uap->fd, &buf);
1490 
1491 	if (error == 0)
1492 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1493 	return (error);
1494 }
1495 
1496 /*
1497  * getfsstat_args(struct statfs *buf, long bufsize, int flags)
1498  *
1499  * Get statistics on all filesystems.
1500  */
1501 
/*
 * Scan context handed to getfsstat_callback() via mountlist_scan().
 */
struct getfsstat_info {
	struct statfs *sfsp;	/* next user slot to fill; NULL = count only */
	long count;		/* mounts visible to the caller (keeps counting) */
	long maxcount;		/* capacity of the user buffer, in entries */
	int error;		/* first error encountered, 0 if none */
	int flags;		/* caller's MNT_WAIT/MNT_NOWAIT/MNT_LAZY flags */
	struct thread *td;	/* caller, for creds and chroot visibility */
};
1510 
1511 static int getfsstat_callback(struct mount *, void *);
1512 
int
sys_getfsstat(struct getfsstat_args *uap)
{
	struct thread *td = curthread;
	struct getfsstat_info info;

	bzero(&info, sizeof(info));

	/* number of statfs entries that fit in the user buffer */
	info.maxcount = uap->bufsize / sizeof(struct statfs);
	info.sfsp = uap->buf;
	info.count = 0;
	info.flags = uap->flags;
	info.td = td;

	mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
	/*
	 * Report the number of entries copied out; count keeps tallying
	 * past maxcount, so clamp when the buffer was too small.
	 */
	if (info.sfsp && info.count > info.maxcount)
		uap->sysmsg_result = info.maxcount;
	else
		uap->sysmsg_result = info.count;
	return (info.error);
}
1534 
/*
 * Per-mount callback for sys_getfsstat().  Skips mounts invisible to a
 * chrooted caller, optionally refreshes mnt_stat, fixes up f_mntonname
 * relative to the caller's root, and copies one statfs out.  Returns -1
 * to abort the scan on copy/path errors, 0 otherwise.
 */
static int
getfsstat_callback(struct mount *mp, void *data)
{
	struct getfsstat_info *info = data;
	struct statfs *sp;
	char *freepath;
	char *fullpath;
	int error;

	/* hide mounts outside the caller's chroot */
	if (info->td->td_proc && !chroot_visible_mnt(mp, info->td->td_proc))
		return(0);

	if (info->sfsp && info->count < info->maxcount) {
		sp = &mp->mnt_stat;

		/*
		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
		 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
		 * overrides MNT_WAIT.
		 */
		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
		    (info->flags & MNT_WAIT)) &&
		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
			/* statfs failure just skips this mount */
			return(0);
		}
		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;

		/* rewrite the mount-on name from the caller's root */
		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
		if (error) {
			info->error = error;
			return(-1);
		}
		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
		kfree(freepath, M_TEMP);

		error = copyout(sp, info->sfsp, sizeof(*sp));
		if (error) {
			info->error = error;
			return (-1);
		}
		++info->sfsp;
	}
	/* count every visible mount, copied out or not */
	info->count++;
	return(0);
}
1581 
1582 /*
1583  * getvfsstat_args(struct statfs *buf, struct statvfs *vbuf,
1584 		   long bufsize, int flags)
1585  *
1586  * Get statistics on all filesystems.
1587  */
1588 
/*
 * Scan context handed to getvfsstat_callback() via mountlist_scan().
 */
struct getvfsstat_info {
	struct statfs *sfsp;	/* next statfs user slot; advanced in lockstep */
	struct statvfs *vsfsp;	/* next statvfs user slot; NULL = count only */
	long count;		/* mounts visible to the caller (keeps counting) */
	long maxcount;		/* capacity of the user buffers, in entries */
	int error;		/* first error encountered, 0 if none */
	int flags;		/* caller's MNT_WAIT/MNT_NOWAIT/MNT_LAZY flags */
	struct thread *td;	/* caller, for creds and chroot visibility */
};
1598 
1599 static int getvfsstat_callback(struct mount *, void *);
1600 
int
sys_getvfsstat(struct getvfsstat_args *uap)
{
	struct thread *td = curthread;
	struct getvfsstat_info info;

	bzero(&info, sizeof(info));

	/* capacity is sized by the statvfs buffer */
	info.maxcount = uap->vbufsize / sizeof(struct statvfs);
	info.sfsp = uap->buf;
	info.vsfsp = uap->vbuf;
	info.count = 0;
	info.flags = uap->flags;
	info.td = td;

	mountlist_scan(getvfsstat_callback, &info, MNTSCAN_FORWARD);
	/* clamp to capacity if the buffers were too small; see getfsstat */
	if (info.vsfsp && info.count > info.maxcount)
		uap->sysmsg_result = info.maxcount;
	else
		uap->sysmsg_result = info.count;
	return (info.error);
}
1623 
/*
 * Per-mount callback for sys_getvfsstat().  Like getfsstat_callback() but
 * emits a statfs/statvfs pair per mount.  Returns -1 to abort the scan on
 * copy/path errors, 0 otherwise.
 */
static int
getvfsstat_callback(struct mount *mp, void *data)
{
	struct getvfsstat_info *info = data;
	struct statfs *sp;
	struct statvfs *vsp;
	char *freepath;
	char *fullpath;
	int error;

	/* hide mounts outside the caller's chroot */
	if (info->td->td_proc && !chroot_visible_mnt(mp, info->td->td_proc))
		return(0);

	if (info->vsfsp && info->count < info->maxcount) {
		sp = &mp->mnt_stat;
		vsp = &mp->mnt_vstat;

		/*
		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
		 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
		 * overrides MNT_WAIT.
		 */
		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
		    (info->flags & MNT_WAIT)) &&
		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
			/* statfs failure just skips this mount */
			return(0);
		}
		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;

		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
		    (info->flags & MNT_WAIT)) &&
		    (error = VFS_STATVFS(mp, vsp, info->td->td_ucred))) {
			return(0);
		}
		/* derive the POSIX f_flag bits from the mount flags */
		vsp->f_flag = 0;
		if (mp->mnt_flag & MNT_RDONLY)
			vsp->f_flag |= ST_RDONLY;
		if (mp->mnt_flag & MNT_NOSUID)
			vsp->f_flag |= ST_NOSUID;

		/* rewrite the mount-on name from the caller's root */
		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
		if (error) {
			info->error = error;
			return(-1);
		}
		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
		kfree(freepath, M_TEMP);

		error = copyout(sp, info->sfsp, sizeof(*sp));
		if (error == 0)
			error = copyout(vsp, info->vsfsp, sizeof(*vsp));
		if (error) {
			info->error = error;
			return (-1);
		}
		++info->sfsp;
		++info->vsfsp;
	}
	/* count every visible mount, copied out or not */
	info->count++;
	return(0);
}
1686 
1687 
1688 /*
1689  * fchdir_args(int fd)
1690  *
1691  * Change current working directory to a given file descriptor.
1692  */
1693 int
1694 sys_fchdir(struct fchdir_args *uap)
1695 {
1696 	struct thread *td = curthread;
1697 	struct proc *p = td->td_proc;
1698 	struct filedesc *fdp = p->p_fd;
1699 	struct vnode *vp, *ovp;
1700 	struct mount *mp;
1701 	struct file *fp;
1702 	struct nchandle nch, onch, tnch;
1703 	int error;
1704 
1705 	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
1706 		return (error);
1707 	lwkt_gettoken(&p->p_token);
1708 	vp = (struct vnode *)fp->f_data;
1709 	vref(vp);
1710 	vn_lock(vp, LK_SHARED | LK_RETRY);
1711 	if (fp->f_nchandle.ncp == NULL)
1712 		error = ENOTDIR;
1713 	else
1714 		error = checkvp_chdir(vp, td);
1715 	if (error) {
1716 		vput(vp);
1717 		goto done;
1718 	}
1719 	cache_copy(&fp->f_nchandle, &nch);
1720 
1721 	/*
1722 	 * If the ncp has become a mount point, traverse through
1723 	 * the mount point.
1724 	 */
1725 
1726 	while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
1727 	       (mp = cache_findmount(&nch)) != NULL
1728 	) {
1729 		error = nlookup_mp(mp, &tnch);
1730 		if (error == 0) {
1731 			cache_unlock(&tnch);	/* leave ref intact */
1732 			vput(vp);
1733 			vp = tnch.ncp->nc_vp;
1734 			error = vget(vp, LK_SHARED);
1735 			KKASSERT(error == 0);
1736 			cache_drop(&nch);
1737 			nch = tnch;
1738 		}
1739 		cache_dropmount(mp);
1740 	}
1741 	if (error == 0) {
1742 		spin_lock(&fdp->fd_spin);
1743 		ovp = fdp->fd_cdir;
1744 		onch = fdp->fd_ncdir;
1745 		fdp->fd_cdir = vp;
1746 		fdp->fd_ncdir = nch;
1747 		spin_unlock(&fdp->fd_spin);
1748 		vn_unlock(vp);		/* leave ref intact */
1749 		cache_drop(&onch);
1750 		vrele(ovp);
1751 	} else {
1752 		cache_drop(&nch);
1753 		vput(vp);
1754 	}
1755 	fdrop(fp);
1756 done:
1757 	lwkt_reltoken(&p->p_token);
1758 	return (error);
1759 }
1760 
/*
 * Change the current directory to the resolved path in *nd.  On success
 * the vnode ref and the nlookup's nch reference are transferred to
 * fd_cdir/fd_ncdir (nd->nl_nch is zeroed so nlookup_done() won't drop it).
 */
int
kern_chdir(struct nlookupdata *nd)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct vnode *vp, *ovp;
	struct nchandle onch;
	int error;

	nd->nl_flags |= NLC_SHAREDLOCK;
	if ((error = nlookup(nd)) != 0)
		return (error);
	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
		return (ENOENT);
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);

	lwkt_gettoken(&p->p_token);
	error = checkvp_chdir(vp, td);
	vn_unlock(vp);
	if (error == 0) {
		/* swap in the new cwd under fd_spin, release old refs after */
		spin_lock(&fdp->fd_spin);
		ovp = fdp->fd_cdir;
		onch = fdp->fd_ncdir;
		fdp->fd_ncdir = nd->nl_nch;
		fdp->fd_cdir = vp;
		spin_unlock(&fdp->fd_spin);
		cache_unlock(&nd->nl_nch);	/* leave reference intact */
		cache_drop(&onch);
		vrele(ovp);
		cache_zero(&nd->nl_nch);
	} else {
		vrele(vp);
	}
	lwkt_reltoken(&p->p_token);
	return (error);
}
1799 
1800 /*
1801  * chdir_args(char *path)
1802  *
1803  * Change current working directory (``.'').
1804  */
1805 int
1806 sys_chdir(struct chdir_args *uap)
1807 {
1808 	struct nlookupdata nd;
1809 	int error;
1810 
1811 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1812 	if (error == 0)
1813 		error = kern_chdir(&nd);
1814 	nlookup_done(&nd);
1815 	return (error);
1816 }
1817 
1818 /*
1819  * Helper function for raised chroot(2) security function:  Refuse if
1820  * any filedescriptors are open directories.
1821  */
1822 static int
1823 chroot_refuse_vdir_fds(thread_t td, struct filedesc *fdp)
1824 {
1825 	struct vnode *vp;
1826 	struct file *fp;
1827 	int error;
1828 	int fd;
1829 
1830 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1831 		if ((error = holdvnode(td, fd, &fp)) != 0)
1832 			continue;
1833 		vp = (struct vnode *)fp->f_data;
1834 		if (vp->v_type != VDIR) {
1835 			fdrop(fp);
1836 			continue;
1837 		}
1838 		fdrop(fp);
1839 		return(EPERM);
1840 	}
1841 	return (0);
1842 }
1843 
1844 /*
1845  * This sysctl determines if we will allow a process to chroot(2) if it
1846  * has a directory open:
1847  *	0: disallowed for all processes.
1848  *	1: allowed for processes that were not already chroot(2)'ed.
1849  *	2: allowed for all processes.
1850  */
1851 
1852 static int chroot_allow_open_directories = 1;
1853 
1854 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
1855      &chroot_allow_open_directories, 0, "");
1856 
1857 /*
1858  * chroot to the specified namecache entry.  We obtain the vp from the
1859  * namecache data.  The passed ncp must be locked and referenced and will
1860  * remain locked and referenced on return.
1861  */
int
kern_chroot(struct nchandle *nch)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct vnode *vp;
	int error;

	/*
	 * Only privileged user can chroot
	 */
	error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, 0);
	if (error)
		return (error);

	/*
	 * Disallow open directory descriptors (fchdir() breakouts).
	 */
	if (chroot_allow_open_directories == 0 ||
	   (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
		if ((error = chroot_refuse_vdir_fds(td, fdp)) != 0)
			return (error);
	}
	if ((vp = nch->ncp->nc_vp) == NULL)
		return (ENOENT);

	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);

	/*
	 * Check the validity of vp as a directory to change to and
	 * associate it with rdir/jdir.
	 */
	error = checkvp_chdir(vp, td);
	vn_unlock(vp);			/* leave reference intact */
	if (error == 0) {
		lwkt_gettoken(&p->p_token);
		/* vget's ref is inherited by fd_rdir; old rdir ref dropped */
		vrele(fdp->fd_rdir);
		fdp->fd_rdir = vp;	/* reference inherited by fd_rdir */
		cache_drop(&fdp->fd_nrdir);
		cache_copy(nch, &fdp->fd_nrdir);
		/* first chroot also establishes the jail dir (jdir) */
		if (fdp->fd_jdir == NULL) {
			fdp->fd_jdir = vp;
			vref(fdp->fd_jdir);
			cache_copy(nch, &fdp->fd_njdir);
		}
		if ((p->p_flags & P_DIDCHROOT) == 0) {
			p->p_flags |= P_DIDCHROOT;
			/* bump scheduling depth once per process, capped */
			if (p->p_depth <= 65535 - 32)
				p->p_depth += 32;
		}
		lwkt_reltoken(&p->p_token);
	} else {
		vrele(vp);
	}
	return (error);
}
1920 
1921 /*
1922  * chroot_args(char *path)
1923  *
1924  * Change notion of root (``/'') directory.
1925  */
1926 int
1927 sys_chroot(struct chroot_args *uap)
1928 {
1929 	struct thread *td __debugvar = curthread;
1930 	struct nlookupdata nd;
1931 	int error;
1932 
1933 	KKASSERT(td->td_proc);
1934 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1935 	if (error == 0) {
1936 		nd.nl_flags |= NLC_EXEC;
1937 		error = nlookup(&nd);
1938 		if (error == 0)
1939 			error = kern_chroot(&nd.nl_nch);
1940 	}
1941 	nlookup_done(&nd);
1942 	return(error);
1943 }
1944 
/*
 * chroot_kernel(char *path)
 *
 * Set the kernel-wide root vnode/nch (vfs_cache_setroot) to the resolved
 * path.  Requires PRIV_VFS_CHROOT.  Affects the whole system, not just
 * the calling process.
 */
int
sys_chroot_kernel(struct chroot_kernel_args *uap)
{
	struct thread *td = curthread;
	struct nlookupdata nd;
	struct nchandle *nch;
	struct vnode *vp;
	int error;

	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error)
		goto error_nond;

	error = nlookup(&nd);
	if (error)
		goto error_out;

	nch = &nd.nl_nch;

	error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, 0);
	if (error)
		goto error_out;

	/* existence check only; cache_vref() below supplies the real ref */
	if ((vp = nch->ncp->nc_vp) == NULL) {
		error = ENOENT;
		goto error_out;
	}

	if ((error = cache_vref(nch, nd.nl_cred, &vp)) != 0)
		goto error_out;

	/* vp ref and nch hold are consumed by vfs_cache_setroot() */
	kprintf("chroot_kernel: set new rootnch/rootvnode to %s\n", uap->path);
	vfs_cache_setroot(vp, cache_hold(nch));

error_out:
	nlookup_done(&nd);
error_nond:
	return(error);
}
1984 
1985 /*
1986  * Common routine for chroot and chdir.  Given a locked, referenced vnode,
1987  * determine whether it is legal to chdir to the vnode.  The vnode's state
1988  * is not changed by this call.
1989  */
1990 static int
1991 checkvp_chdir(struct vnode *vp, struct thread *td)
1992 {
1993 	int error;
1994 
1995 	if (vp->v_type != VDIR)
1996 		error = ENOTDIR;
1997 	else
1998 		error = VOP_EACCESS(vp, VEXEC, td->td_ucred);
1999 	return (error);
2000 }
2001 
/*
 * Common open(2)/openat(2) backend.  Resolves *nd, opens it with the
 * given open flags and creation mode, reserves and assigns a descriptor,
 * and returns the descriptor index via *res.  oflags are userland O_*
 * flags; mode is the creation mode before umask.
 */
int
kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct lwp *lp = td->td_lwp;
	struct filedesc *fdp = p->p_fd;
	int cmode, flags;
	struct file *nfp;
	struct file *fp;
	struct vnode *vp;
	int type, indx, error = 0;
	struct flock lf;

	/* both access-mode bits set at once is invalid */
	if ((oflags & O_ACCMODE) == O_ACCMODE)
		return (EINVAL);
	flags = FFLAGS(oflags);
	error = falloc(lp, &nfp, NULL);
	if (error)
		return (error);
	fp = nfp;
	/* apply the umask; never allow sticky via open() */
	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;

	/*
	 * XXX p_dupfd is a real mess.  It allows a device to return a
	 * file descriptor to be duplicated rather then doing the open
	 * itself.
	 */
	lp->lwp_dupfd = -1;

	/*
	 * Call vn_open() to do the lookup and assign the vnode to the
	 * file pointer.  vn_open() does not change the ref count on fp
	 * and the vnode, on success, will be inherited by the file pointer
	 * and unlocked.
	 *
	 * Request a shared lock on the vnode if possible.
	 *
	 * Executable binaries can race VTEXT against O_RDWR opens, so
	 * use an exclusive lock for O_RDWR opens as well.
	 *
	 * NOTE: We need a flag to separate terminal vnode locking from
	 *	 parent locking.  O_CREAT needs parent locking, but O_TRUNC
	 *	 and O_RDWR only need to lock the terminal vnode exclusively.
	 */
	nd->nl_flags |= NLC_LOCKVP;
	if ((flags & (O_CREAT|O_TRUNC|O_RDWR)) == 0)
		nd->nl_flags |= NLC_SHAREDLOCK;

	error = vn_open(nd, fp, flags, cmode);
	nlookup_done(nd);

	if (error) {
		/*
		 * handle special fdopen() case.  bleh.  dupfdopen() is
		 * responsible for dropping the old contents of ofiles[indx]
		 * if it succeeds.
		 *
		 * Note that fsetfd() will add a ref to fp which represents
		 * the fd_files[] assignment.  We must still drop our
		 * reference.
		 */
		if ((error == ENODEV || error == ENXIO) && lp->lwp_dupfd >= 0) {
			if (fdalloc(p, 0, &indx) == 0) {
				error = dupfdopen(td, indx, lp->lwp_dupfd, flags, error);
				if (error == 0) {
					*res = indx;
					fdrop(fp);	/* our ref */
					return (0);
				}
				fsetfd(fdp, NULL, indx);
			}
		}
		fdrop(fp);	/* our ref */
		if (error == ERESTART)
			error = EINTR;
		return (error);
	}

	/*
	 * ref the vnode for ourselves so it can't be ripped out from under
	 * is.  XXX need an ND flag to request that the vnode be returned
	 * anyway.
	 *
	 * Reserve a file descriptor but do not assign it until the open
	 * succeeds.
	 */
	vp = (struct vnode *)fp->f_data;
	vref(vp);
	if ((error = fdalloc(p, 0, &indx)) != 0) {
		fdrop(fp);
		vrele(vp);
		return (error);
	}

	/*
	 * If no error occurs the vp will have been assigned to the file
	 * pointer.
	 */
	lp->lwp_dupfd = 0;

	/* apply BSD-style open locks (O_EXLOCK/O_SHLOCK) if requested */
	if (flags & (O_EXLOCK | O_SHLOCK)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (flags & O_EXLOCK)
			lf.l_type = F_WRLCK;
		else
			lf.l_type = F_RDLCK;
		if (flags & FNONBLOCK)
			type = 0;
		else
			type = F_WAIT;

		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
			/*
			 * lock request failed.  Clean up the reserved
			 * descriptor.
			 */
			vrele(vp);
			fsetfd(fdp, NULL, indx);
			fdrop(fp);
			return (error);
		}
		atomic_set_int(&fp->f_flag, FHASLOCK); /* race ok */
	}
#if 0
	/*
	 * Assert that all regular file vnodes were created with a object.
	 */
	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
		("open: regular file has no backing object after vn_open"));
#endif

	vrele(vp);

	/*
	 * release our private reference, leaving the one associated with the
	 * descriptor table intact.
	 */
	if (oflags & O_CLOEXEC)
		fdp->fd_files[indx].fileflags |= UF_EXCLOSE;
	fsetfd(fdp, fp, indx);
	fdrop(fp);
	*res = indx;

	return (error);
}
2150 
2151 /*
2152  * open_args(char *path, int flags, int mode)
2153  *
2154  * Check permissions, allocate an open file structure,
2155  * and call the device open routine if any.
2156  */
2157 int
2158 sys_open(struct open_args *uap)
2159 {
2160 	struct nlookupdata nd;
2161 	int error;
2162 
2163 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2164 	if (error == 0) {
2165 		error = kern_open(&nd, uap->flags,
2166 				    uap->mode, &uap->sysmsg_result);
2167 	}
2168 	nlookup_done(&nd);
2169 	return (error);
2170 }
2171 
2172 /*
2173  * openat_args(int fd, char *path, int flags, int mode)
2174  */
2175 int
2176 sys_openat(struct openat_args *uap)
2177 {
2178 	struct nlookupdata nd;
2179 	int error;
2180 	struct file *fp;
2181 
2182 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2183 	if (error == 0) {
2184 		error = kern_open(&nd, uap->flags, uap->mode,
2185 					&uap->sysmsg_result);
2186 	}
2187 	nlookup_done_at(&nd, fp);
2188 	return (error);
2189 }
2190 
/*
 * Common mknod(2)/mknodat(2) backend.  Creates a special file of the type
 * encoded in mode's S_IFMT bits at the path in *nd, with device numbers
 * rmajor/rminor.  Each file type requires its own privilege.
 */
int
kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	int whiteout = 0;

	KKASSERT(p);

	VATTR_NULL(&vattr);
	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
	vattr.va_rmajor = rmajor;
	vattr.va_rminor = rminor;

	/* map the S_IFMT type to a vnode type and check the matching priv */
	switch (mode & S_IFMT) {
	case S_IFMT:	/* used by badsect to flag bad sectors */
		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_BAD, 0);
		vattr.va_type = VBAD;
		break;
	case S_IFCHR:
		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
		vattr.va_type = VCHR;
		break;
	case S_IFBLK:
		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
		vattr.va_type = VBLK;
		break;
	case S_IFWHT:
		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_WHT, 0);
		whiteout = 1;
		break;
	case S_IFDIR:	/* special directories support for HAMMER */
		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_DIR, 0);
		vattr.va_type = VDIR;
		break;
	default:
		error = EINVAL;
		break;
	}

	if (error)
		return (error);

	/* about to create an inode; let the buffer cache make room */
	bwillinode(1);
	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);
	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);

	if (whiteout) {
		error = VOP_NWHITEOUT(&nd->nl_nch, nd->nl_dvp,
				      nd->nl_cred, NAMEI_CREATE);
	} else {
		vp = NULL;
		error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp,
				   &vp, nd->nl_cred, &vattr);
		if (error == 0)
			vput(vp);
	}
	return (error);
}
2258 
2259 /*
2260  * mknod_args(char *path, int mode, int dev)
2261  *
2262  * Create a special file.
2263  */
2264 int
2265 sys_mknod(struct mknod_args *uap)
2266 {
2267 	struct nlookupdata nd;
2268 	int error;
2269 
2270 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2271 	if (error == 0) {
2272 		error = kern_mknod(&nd, uap->mode,
2273 				   umajor(uap->dev), uminor(uap->dev));
2274 	}
2275 	nlookup_done(&nd);
2276 	return (error);
2277 }
2278 
2279 /*
2280  * mknodat_args(int fd, char *path, mode_t mode, dev_t dev)
2281  *
2282  * Create a special file.  The path is relative to the directory associated
2283  * with fd.
2284  */
2285 int
2286 sys_mknodat(struct mknodat_args *uap)
2287 {
2288 	struct nlookupdata nd;
2289 	struct file *fp;
2290 	int error;
2291 
2292 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2293 	if (error == 0) {
2294 		error = kern_mknod(&nd, uap->mode,
2295 				   umajor(uap->dev), uminor(uap->dev));
2296 	}
2297 	nlookup_done_at(&nd, fp);
2298 	return (error);
2299 }
2300 
2301 int
2302 kern_mkfifo(struct nlookupdata *nd, int mode)
2303 {
2304 	struct thread *td = curthread;
2305 	struct proc *p = td->td_proc;
2306 	struct vattr vattr;
2307 	struct vnode *vp;
2308 	int error;
2309 
2310 	bwillinode(1);
2311 
2312 	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2313 	if ((error = nlookup(nd)) != 0)
2314 		return (error);
2315 	if (nd->nl_nch.ncp->nc_vp)
2316 		return (EEXIST);
2317 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2318 		return (error);
2319 
2320 	VATTR_NULL(&vattr);
2321 	vattr.va_type = VFIFO;
2322 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
2323 	vp = NULL;
2324 	error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp, &vp, nd->nl_cred, &vattr);
2325 	if (error == 0)
2326 		vput(vp);
2327 	return (error);
2328 }
2329 
2330 /*
2331  * mkfifo_args(char *path, int mode)
2332  *
2333  * Create a named pipe.
2334  */
2335 int
2336 sys_mkfifo(struct mkfifo_args *uap)
2337 {
2338 	struct nlookupdata nd;
2339 	int error;
2340 
2341 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2342 	if (error == 0)
2343 		error = kern_mkfifo(&nd, uap->mode);
2344 	nlookup_done(&nd);
2345 	return (error);
2346 }
2347 
2348 /*
2349  * mkfifoat_args(int fd, char *path, mode_t mode)
2350  *
2351  * Create a named pipe.  The path is relative to the directory associated
2352  * with fd.
2353  */
2354 int
2355 sys_mkfifoat(struct mkfifoat_args *uap)
2356 {
2357 	struct nlookupdata nd;
2358 	struct file *fp;
2359 	int error;
2360 
2361 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2362 	if (error == 0)
2363 		error = kern_mkfifo(&nd, uap->mode);
2364 	nlookup_done_at(&nd, fp);
2365 	return (error);
2366 }
2367 
/*
 * Sysctl knobs restricting hard link creation by unprivileged users.
 * Both default to off (0) and are consulted by can_hardlink().
 */
static int hardlink_check_uid = 0;
SYSCTL_INT(_security, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
    &hardlink_check_uid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "users");
static int hardlink_check_gid = 0;
SYSCTL_INT(_security, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
    &hardlink_check_gid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "groups");
2378 
2379 static int
2380 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
2381 {
2382 	struct vattr va;
2383 	int error;
2384 
2385 	/*
2386 	 * Shortcut if disabled
2387 	 */
2388 	if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
2389 		return (0);
2390 
2391 	/*
2392 	 * Privileged user can always hardlink
2393 	 */
2394 	if (priv_check_cred(cred, PRIV_VFS_LINK, 0) == 0)
2395 		return (0);
2396 
2397 	/*
2398 	 * Otherwise only if the originating file is owned by the
2399 	 * same user or group.  Note that any group is allowed if
2400 	 * the file is owned by the caller.
2401 	 */
2402 	error = VOP_GETATTR(vp, &va);
2403 	if (error != 0)
2404 		return (error);
2405 
2406 	if (hardlink_check_uid) {
2407 		if (cred->cr_uid != va.va_uid)
2408 			return (EPERM);
2409 	}
2410 
2411 	if (hardlink_check_gid) {
2412 		if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
2413 			return (EPERM);
2414 	}
2415 
2416 	return (0);
2417 }
2418 
/*
 * Worker for link/linkat: create a hard link to the file resolved by
 * nd at the location resolved by linknd.  Both nlookupdata structs
 * must already be initialized; the caller runs nlookup_done() on both.
 * Returns 0 or an errno.
 */
int
kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
{
	struct thread *td = curthread;
	struct vnode *vp;
	int error;

	/*
	 * Lookup the source and obtain a locked vnode.
	 *
	 * You may only hardlink a file which you have write permission
	 * on or which you own.
	 *
	 * XXX relookup on vget failure / race ?
	 */
	bwillinode(1);
	nd->nl_flags |= NLC_WRITE | NLC_OWN | NLC_HLINK;
	if ((error = nlookup(nd)) != 0)
		return (error);
	vp = nd->nl_nch.ncp->nc_vp;
	KKASSERT(vp != NULL);
	/* Hard links to directories are disallowed */
	if (vp->v_type == VDIR)
		return (EPERM);		/* POSIX */
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	/* Obtain our own reference + lock on the source vnode */
	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
		return (error);

	/*
	 * Unlock the source so we can lookup the target without deadlocking
	 * (XXX vp is locked already, possible other deadlock?).  The target
	 * must not exist.
	 */
	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
	nd->nl_flags &= ~NLC_NCPISLOCKED;
	cache_unlock(&nd->nl_nch);
	vn_unlock(vp);

	linknd->nl_flags |= NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(linknd)) != 0) {
		vrele(vp);	/* drop the reference taken by vget */
		return (error);
	}
	if (linknd->nl_nch.ncp->nc_vp) {
		vrele(vp);
		return (EEXIST);
	}
	/*
	 * Re-lock the (still referenced) source vnode before calling the
	 * VOP.  LK_FAILRECLAIM makes the lock fail if the vnode was
	 * reclaimed while it was unlocked above.
	 */
	VFS_MODIFYING(vp->v_mount);
	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
	if (error) {
		vrele(vp);
		return (error);
	}

	/*
	 * Finally run the new API VOP.
	 */
	error = can_hardlink(vp, td, td->td_ucred);
	if (error == 0) {
		error = VOP_NLINK(&linknd->nl_nch, linknd->nl_dvp,
				  vp, linknd->nl_cred);
	}
	vput(vp);	/* drops both the lock and the reference */
	return (error);
}
2484 
2485 /*
2486  * link_args(char *path, char *link)
2487  *
2488  * Make a hard file link.
2489  */
2490 int
2491 sys_link(struct link_args *uap)
2492 {
2493 	struct nlookupdata nd, linknd;
2494 	int error;
2495 
2496 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2497 	if (error == 0) {
2498 		error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
2499 		if (error == 0)
2500 			error = kern_link(&nd, &linknd);
2501 		nlookup_done(&linknd);
2502 	}
2503 	nlookup_done(&nd);
2504 	return (error);
2505 }
2506 
2507 /*
2508  * linkat_args(int fd1, char *path1, int fd2, char *path2, int flags)
2509  *
2510  * Make a hard file link. The path1 argument is relative to the directory
2511  * associated with fd1, and similarly the path2 argument is relative to
2512  * the directory associated with fd2.
2513  */
2514 int
2515 sys_linkat(struct linkat_args *uap)
2516 {
2517 	struct nlookupdata nd, linknd;
2518 	struct file *fp1, *fp2;
2519 	int error;
2520 
2521 	error = nlookup_init_at(&nd, &fp1, uap->fd1, uap->path1, UIO_USERSPACE,
2522 	    (uap->flags & AT_SYMLINK_FOLLOW) ? NLC_FOLLOW : 0);
2523 	if (error == 0) {
2524 		error = nlookup_init_at(&linknd, &fp2, uap->fd2,
2525 		    uap->path2, UIO_USERSPACE, 0);
2526 		if (error == 0)
2527 			error = kern_link(&nd, &linknd);
2528 		nlookup_done_at(&linknd, fp2);
2529 	}
2530 	nlookup_done_at(&nd, fp1);
2531 	return (error);
2532 }
2533 
2534 int
2535 kern_symlink(struct nlookupdata *nd, char *path, int mode)
2536 {
2537 	struct vattr vattr;
2538 	struct vnode *vp;
2539 	struct vnode *dvp;
2540 	int error;
2541 
2542 	bwillinode(1);
2543 	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2544 	if ((error = nlookup(nd)) != 0)
2545 		return (error);
2546 	if (nd->nl_nch.ncp->nc_vp)
2547 		return (EEXIST);
2548 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2549 		return (error);
2550 	dvp = nd->nl_dvp;
2551 	VATTR_NULL(&vattr);
2552 	vattr.va_mode = mode;
2553 	error = VOP_NSYMLINK(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr, path);
2554 	if (error == 0)
2555 		vput(vp);
2556 	return (error);
2557 }
2558 
2559 /*
2560  * symlink(char *path, char *link)
2561  *
2562  * Make a symbolic link.
2563  */
2564 int
2565 sys_symlink(struct symlink_args *uap)
2566 {
2567 	struct thread *td = curthread;
2568 	struct nlookupdata nd;
2569 	char *path;
2570 	int error;
2571 	int mode;
2572 
2573 	path = objcache_get(namei_oc, M_WAITOK);
2574 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
2575 	if (error == 0) {
2576 		error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
2577 		if (error == 0) {
2578 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2579 			error = kern_symlink(&nd, path, mode);
2580 		}
2581 		nlookup_done(&nd);
2582 	}
2583 	objcache_put(namei_oc, path);
2584 	return (error);
2585 }
2586 
2587 /*
2588  * symlinkat_args(char *path1, int fd, char *path2)
2589  *
2590  * Make a symbolic link.  The path2 argument is relative to the directory
2591  * associated with fd.
2592  */
2593 int
2594 sys_symlinkat(struct symlinkat_args *uap)
2595 {
2596 	struct thread *td = curthread;
2597 	struct nlookupdata nd;
2598 	struct file *fp;
2599 	char *path1;
2600 	int error;
2601 	int mode;
2602 
2603 	path1 = objcache_get(namei_oc, M_WAITOK);
2604 	error = copyinstr(uap->path1, path1, MAXPATHLEN, NULL);
2605 	if (error == 0) {
2606 		error = nlookup_init_at(&nd, &fp, uap->fd, uap->path2,
2607 		    UIO_USERSPACE, 0);
2608 		if (error == 0) {
2609 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2610 			error = kern_symlink(&nd, path1, mode);
2611 		}
2612 		nlookup_done_at(&nd, fp);
2613 	}
2614 	objcache_put(namei_oc, path1);
2615 	return (error);
2616 }
2617 
2618 /*
2619  * undelete_args(char *path)
2620  *
2621  * Delete a whiteout from the filesystem.
2622  */
2623 int
2624 sys_undelete(struct undelete_args *uap)
2625 {
2626 	struct nlookupdata nd;
2627 	int error;
2628 
2629 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2630 	bwillinode(1);
2631 	nd.nl_flags |= NLC_DELETE | NLC_REFDVP;
2632 	if (error == 0)
2633 		error = nlookup(&nd);
2634 	if (error == 0)
2635 		error = ncp_writechk(&nd.nl_nch);
2636 	if (error == 0) {
2637 		error = VOP_NWHITEOUT(&nd.nl_nch, nd.nl_dvp, nd.nl_cred,
2638 				      NAMEI_DELETE);
2639 	}
2640 	nlookup_done(&nd);
2641 	return (error);
2642 }
2643 
2644 int
2645 kern_unlink(struct nlookupdata *nd)
2646 {
2647 	int error;
2648 
2649 	bwillinode(1);
2650 	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
2651 	if ((error = nlookup(nd)) != 0)
2652 		return (error);
2653 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2654 		return (error);
2655 	error = VOP_NREMOVE(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
2656 	return (error);
2657 }
2658 
2659 /*
2660  * unlink_args(char *path)
2661  *
2662  * Delete a name from the filesystem.
2663  */
2664 int
2665 sys_unlink(struct unlink_args *uap)
2666 {
2667 	struct nlookupdata nd;
2668 	int error;
2669 
2670 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2671 	if (error == 0)
2672 		error = kern_unlink(&nd);
2673 	nlookup_done(&nd);
2674 	return (error);
2675 }
2676 
2677 
2678 /*
2679  * unlinkat_args(int fd, char *path, int flags)
2680  *
2681  * Delete the file or directory entry pointed to by fd/path.
2682  */
2683 int
2684 sys_unlinkat(struct unlinkat_args *uap)
2685 {
2686 	struct nlookupdata nd;
2687 	struct file *fp;
2688 	int error;
2689 
2690 	if (uap->flags & ~AT_REMOVEDIR)
2691 		return (EINVAL);
2692 
2693 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2694 	if (error == 0) {
2695 		if (uap->flags & AT_REMOVEDIR)
2696 			error = kern_rmdir(&nd);
2697 		else
2698 			error = kern_unlink(&nd);
2699 	}
2700 	nlookup_done_at(&nd, fp);
2701 	return (error);
2702 }
2703 
/*
 * Worker for lseek: reposition the seek offset of descriptor fd.
 * whence is L_SET/L_INCR/L_XTND; the resulting offset is returned
 * through *res.  Returns 0 or an errno (EBADF for a bad descriptor,
 * ESPIPE for a non-vnode descriptor, EINVAL for a bad whence or a
 * negative offset on a regular file or directory).
 */
int
kern_lseek(int fd, off_t offset, int whence, off_t *res)
{
	struct thread *td = curthread;
	struct file *fp;
	struct vnode *vp;
	struct vattr vattr;
	off_t new_offset;
	int error;

	fp = holdfp(td, fd, -1);
	if (fp == NULL)
		return (EBADF);
	if (fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;

	/*
	 * Every branch of this switch acquires fp->f_spin so the
	 * common exit path below can unconditionally release it.
	 * For L_XTND the VOP_GETATTR must happen before taking the
	 * spinlock (it may block).
	 */
	switch (whence) {
	case L_INCR:
		spin_lock(&fp->f_spin);
		new_offset = fp->f_offset + offset;
		error = 0;
		break;
	case L_XTND:
		/*
		 * NOTE(review): if VOP_GETATTR fails, vattr.va_size is
		 * uninitialized here, but error != 0 suppresses the
		 * store of new_offset below so the garbage is unused.
		 */
		error = VOP_GETATTR(vp, &vattr);
		spin_lock(&fp->f_spin);
		new_offset = offset + vattr.va_size;
		break;
	case L_SET:
		new_offset = offset;
		error = 0;
		spin_lock(&fp->f_spin);
		break;
	default:
		new_offset = 0;
		error = EINVAL;
		spin_lock(&fp->f_spin);
		break;
	}

	/*
	 * Validate the seek position.  Negative offsets are not allowed
	 * for regular files or directories.
	 *
	 * Normally we would also not want to allow negative offsets for
	 * character and block-special devices.  However kvm addresses
	 * on 64 bit architectures might appear to be negative and must
	 * be allowed.
	 */
	if (error == 0) {
		if (new_offset < 0 &&
		    (vp->v_type == VREG || vp->v_type == VDIR)) {
			error = EINVAL;
		} else {
			fp->f_offset = new_offset;
		}
	}
	/* On error the (unchanged) current offset is still returned */
	*res = fp->f_offset;
	spin_unlock(&fp->f_spin);
done:
	dropfp(td, fd, fp);

	return (error);
}
2770 
2771 /*
2772  * lseek_args(int fd, int pad, off_t offset, int whence)
2773  *
2774  * Reposition read/write file offset.
2775  */
2776 int
2777 sys_lseek(struct lseek_args *uap)
2778 {
2779 	int error;
2780 
2781 	error = kern_lseek(uap->fd, uap->offset, uap->whence,
2782 			   &uap->sysmsg_offset);
2783 
2784 	return (error);
2785 }
2786 
2787 /*
2788  * Check if current process can access given file.  amode is a bitmask of *_OK
2789  * access bits.  flags is a bitmask of AT_* flags.
2790  */
2791 int
2792 kern_access(struct nlookupdata *nd, int amode, int flags)
2793 {
2794 	struct vnode *vp;
2795 	int error, mode;
2796 
2797 	if (flags & ~AT_EACCESS)
2798 		return (EINVAL);
2799 	nd->nl_flags |= NLC_SHAREDLOCK;
2800 	if ((error = nlookup(nd)) != 0)
2801 		return (error);
2802 retry:
2803 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
2804 	if (error)
2805 		return (error);
2806 
2807 	/* Flags == 0 means only check for existence. */
2808 	if (amode) {
2809 		mode = 0;
2810 		if (amode & R_OK)
2811 			mode |= VREAD;
2812 		if (amode & W_OK)
2813 			mode |= VWRITE;
2814 		if (amode & X_OK)
2815 			mode |= VEXEC;
2816 		if ((mode & VWRITE) == 0 ||
2817 		    (error = vn_writechk(vp, &nd->nl_nch)) == 0)
2818 			error = VOP_ACCESS_FLAGS(vp, mode, flags, nd->nl_cred);
2819 
2820 		/*
2821 		 * If the file handle is stale we have to re-resolve the
2822 		 * entry with the ncp held exclusively.  This is a hack
2823 		 * at the moment.
2824 		 */
2825 		if (error == ESTALE) {
2826 			vput(vp);
2827 			cache_unlock(&nd->nl_nch);
2828 			cache_lock(&nd->nl_nch);
2829 			cache_setunresolved(&nd->nl_nch);
2830 			error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2831 			if (error == 0) {
2832 				vp = NULL;
2833 				goto retry;
2834 			}
2835 			return(error);
2836 		}
2837 	}
2838 	vput(vp);
2839 	return (error);
2840 }
2841 
2842 /*
2843  * access_args(char *path, int flags)
2844  *
2845  * Check access permissions.
2846  */
2847 int
2848 sys_access(struct access_args *uap)
2849 {
2850 	struct nlookupdata nd;
2851 	int error;
2852 
2853 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2854 	if (error == 0)
2855 		error = kern_access(&nd, uap->flags, 0);
2856 	nlookup_done(&nd);
2857 	return (error);
2858 }
2859 
2860 
2861 /*
2862  * eaccess_args(char *path, int flags)
2863  *
2864  * Check access permissions.
2865  */
2866 int
2867 sys_eaccess(struct eaccess_args *uap)
2868 {
2869 	struct nlookupdata nd;
2870 	int error;
2871 
2872 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2873 	if (error == 0)
2874 		error = kern_access(&nd, uap->flags, AT_EACCESS);
2875 	nlookup_done(&nd);
2876 	return (error);
2877 }
2878 
2879 
2880 /*
2881  * faccessat_args(int fd, char *path, int amode, int flags)
2882  *
2883  * Check access permissions.
2884  */
2885 int
2886 sys_faccessat(struct faccessat_args *uap)
2887 {
2888 	struct nlookupdata nd;
2889 	struct file *fp;
2890 	int error;
2891 
2892 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE,
2893 				NLC_FOLLOW);
2894 	if (error == 0)
2895 		error = kern_access(&nd, uap->amode, uap->flags);
2896 	nlookup_done_at(&nd, fp);
2897 	return (error);
2898 }
2899 
/*
 * Worker for stat/lstat/fstatat: fill *st for the file resolved by nd.
 * Uses a shared lock for the common case and falls back to an
 * exclusive re-resolve when the filesystem reports ESTALE.
 * Returns 0 or an errno.
 */
int
kern_stat(struct nlookupdata *nd, struct stat *st)
{
	int error;
	struct vnode *vp;

	nd->nl_flags |= NLC_SHAREDLOCK;
	if ((error = nlookup(nd)) != 0)
		return (error);
again:
	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
		return (ENOENT);

	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);
	error = vn_stat(vp, st, nd->nl_cred);

	/*
	 * If the file handle is stale we have to re-resolve the
	 * entry with the ncp held exclusively.  This is a hack
	 * at the moment.
	 */
	if (error == ESTALE) {
		vput(vp);
		cache_unlock(&nd->nl_nch);
		cache_lock(&nd->nl_nch);
		cache_setunresolved(&nd->nl_nch);
		error = cache_resolve(&nd->nl_nch, nd->nl_cred);
		if (error == 0)
			goto again;	/* vp re-fetched from the ncp */
	} else {
		vput(vp);
	}
	return (error);
}
2935 
2936 /*
2937  * stat_args(char *path, struct stat *ub)
2938  *
2939  * Get file status; this version follows links.
2940  */
2941 int
2942 sys_stat(struct stat_args *uap)
2943 {
2944 	struct nlookupdata nd;
2945 	struct stat st;
2946 	int error;
2947 
2948 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2949 	if (error == 0) {
2950 		error = kern_stat(&nd, &st);
2951 		if (error == 0)
2952 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2953 	}
2954 	nlookup_done(&nd);
2955 	return (error);
2956 }
2957 
2958 /*
2959  * lstat_args(char *path, struct stat *ub)
2960  *
2961  * Get file status; this version does not follow links.
2962  */
2963 int
2964 sys_lstat(struct lstat_args *uap)
2965 {
2966 	struct nlookupdata nd;
2967 	struct stat st;
2968 	int error;
2969 
2970 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2971 	if (error == 0) {
2972 		error = kern_stat(&nd, &st);
2973 		if (error == 0)
2974 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2975 	}
2976 	nlookup_done(&nd);
2977 	return (error);
2978 }
2979 
2980 /*
2981  * fstatat_args(int fd, char *path, struct stat *sb, int flags)
2982  *
2983  * Get status of file pointed to by fd/path.
2984  */
2985 int
2986 sys_fstatat(struct fstatat_args *uap)
2987 {
2988 	struct nlookupdata nd;
2989 	struct stat st;
2990 	int error;
2991 	int flags;
2992 	struct file *fp;
2993 
2994 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
2995 		return (EINVAL);
2996 
2997 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
2998 
2999 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3000 				UIO_USERSPACE, flags);
3001 	if (error == 0) {
3002 		error = kern_stat(&nd, &st);
3003 		if (error == 0)
3004 			error = copyout(&st, uap->sb, sizeof(*uap->sb));
3005 	}
3006 	nlookup_done_at(&nd, fp);
3007 	return (error);
3008 }
3009 
3010 static int
3011 kern_pathconf(char *path, int name, int flags, register_t *sysmsg_regp)
3012 {
3013 	struct nlookupdata nd;
3014 	struct vnode *vp;
3015 	int error;
3016 
3017 	vp = NULL;
3018 	error = nlookup_init(&nd, path, UIO_USERSPACE, flags);
3019 	if (error == 0)
3020 		error = nlookup(&nd);
3021 	if (error == 0)
3022 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3023 	nlookup_done(&nd);
3024 	if (error == 0) {
3025 		error = VOP_PATHCONF(vp, name, sysmsg_regp);
3026 		vput(vp);
3027 	}
3028 	return (error);
3029 }
3030 
3031 /*
3032  * pathconf_Args(char *path, int name)
3033  *
3034  * Get configurable pathname variables.
3035  */
3036 int
3037 sys_pathconf(struct pathconf_args *uap)
3038 {
3039 	return (kern_pathconf(uap->path, uap->name, NLC_FOLLOW,
3040 		&uap->sysmsg_reg));
3041 }
3042 
3043 /*
3044  * lpathconf_Args(char *path, int name)
3045  *
3046  * Get configurable pathname variables, but don't follow symlinks.
3047  */
3048 int
3049 sys_lpathconf(struct lpathconf_args *uap)
3050 {
3051 	return (kern_pathconf(uap->path, uap->name, 0, &uap->sysmsg_reg));
3052 }
3053 
3054 /*
3055  * XXX: daver
3056  * kern_readlink isn't properly split yet.  There is a copyin burried
3057  * in VOP_READLINK().
3058  */
3059 int
3060 kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
3061 {
3062 	struct thread *td = curthread;
3063 	struct vnode *vp;
3064 	struct iovec aiov;
3065 	struct uio auio;
3066 	int error;
3067 
3068 	nd->nl_flags |= NLC_SHAREDLOCK;
3069 	if ((error = nlookup(nd)) != 0)
3070 		return (error);
3071 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
3072 	if (error)
3073 		return (error);
3074 	if (vp->v_type != VLNK) {
3075 		error = EINVAL;
3076 	} else {
3077 		aiov.iov_base = buf;
3078 		aiov.iov_len = count;
3079 		auio.uio_iov = &aiov;
3080 		auio.uio_iovcnt = 1;
3081 		auio.uio_offset = 0;
3082 		auio.uio_rw = UIO_READ;
3083 		auio.uio_segflg = UIO_USERSPACE;
3084 		auio.uio_td = td;
3085 		auio.uio_resid = count;
3086 		error = VOP_READLINK(vp, &auio, td->td_ucred);
3087 	}
3088 	vput(vp);
3089 	*res = count - auio.uio_resid;
3090 	return (error);
3091 }
3092 
3093 /*
3094  * readlink_args(char *path, char *buf, int count)
3095  *
3096  * Return target name of a symbolic link.
3097  */
3098 int
3099 sys_readlink(struct readlink_args *uap)
3100 {
3101 	struct nlookupdata nd;
3102 	int error;
3103 
3104 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3105 	if (error == 0) {
3106 		error = kern_readlink(&nd, uap->buf, uap->count,
3107 					&uap->sysmsg_result);
3108 	}
3109 	nlookup_done(&nd);
3110 	return (error);
3111 }
3112 
3113 /*
3114  * readlinkat_args(int fd, char *path, char *buf, size_t bufsize)
3115  *
3116  * Return target name of a symbolic link.  The path is relative to the
3117  * directory associated with fd.
3118  */
3119 int
3120 sys_readlinkat(struct readlinkat_args *uap)
3121 {
3122 	struct nlookupdata nd;
3123 	struct file *fp;
3124 	int error;
3125 
3126 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
3127 	if (error == 0) {
3128 		error = kern_readlink(&nd, uap->buf, uap->bufsize,
3129 					&uap->sysmsg_result);
3130 	}
3131 	nlookup_done_at(&nd, fp);
3132 	return (error);
3133 }
3134 
3135 static int
3136 setfflags(struct vnode *vp, int flags)
3137 {
3138 	struct thread *td = curthread;
3139 	int error;
3140 	struct vattr vattr;
3141 
3142 	/*
3143 	 * Prevent non-root users from setting flags on devices.  When
3144 	 * a device is reused, users can retain ownership of the device
3145 	 * if they are allowed to set flags and programs assume that
3146 	 * chown can't fail when done as root.
3147 	 */
3148 	if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
3149 	    ((error = priv_check_cred(td->td_ucred, PRIV_VFS_CHFLAGS_DEV, 0)) != 0))
3150 		return (error);
3151 
3152 	/*
3153 	 * note: vget is required for any operation that might mod the vnode
3154 	 * so VINACTIVE is properly cleared.
3155 	 */
3156 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3157 		VATTR_NULL(&vattr);
3158 		vattr.va_flags = flags;
3159 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3160 		vput(vp);
3161 	}
3162 	return (error);
3163 }
3164 
3165 /*
3166  * chflags(char *path, int flags)
3167  *
3168  * Change flags of a file given a path name.
3169  */
3170 int
3171 sys_chflags(struct chflags_args *uap)
3172 {
3173 	struct nlookupdata nd;
3174 	struct vnode *vp;
3175 	int error;
3176 
3177 	vp = NULL;
3178 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3179 	if (error == 0)
3180 		error = nlookup(&nd);
3181 	if (error == 0)
3182 		error = ncp_writechk(&nd.nl_nch);
3183 	if (error == 0)
3184 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3185 	nlookup_done(&nd);
3186 	if (error == 0) {
3187 		error = setfflags(vp, uap->flags);
3188 		vrele(vp);
3189 	}
3190 	return (error);
3191 }
3192 
3193 /*
3194  * lchflags(char *path, int flags)
3195  *
3196  * Change flags of a file given a path name, but don't follow symlinks.
3197  */
3198 int
3199 sys_lchflags(struct lchflags_args *uap)
3200 {
3201 	struct nlookupdata nd;
3202 	struct vnode *vp;
3203 	int error;
3204 
3205 	vp = NULL;
3206 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3207 	if (error == 0)
3208 		error = nlookup(&nd);
3209 	if (error == 0)
3210 		error = ncp_writechk(&nd.nl_nch);
3211 	if (error == 0)
3212 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3213 	nlookup_done(&nd);
3214 	if (error == 0) {
3215 		error = setfflags(vp, uap->flags);
3216 		vrele(vp);
3217 	}
3218 	return (error);
3219 }
3220 
3221 /*
3222  * fchflags_args(int fd, int flags)
3223  *
3224  * Change flags of a file given a file descriptor.
3225  */
3226 int
3227 sys_fchflags(struct fchflags_args *uap)
3228 {
3229 	struct thread *td = curthread;
3230 	struct file *fp;
3231 	int error;
3232 
3233 	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
3234 		return (error);
3235 	if (fp->f_nchandle.ncp)
3236 		error = ncp_writechk(&fp->f_nchandle);
3237 	if (error == 0)
3238 		error = setfflags((struct vnode *) fp->f_data, uap->flags);
3239 	fdrop(fp);
3240 	return (error);
3241 }
3242 
3243 /*
3244  * chflagsat_args(int fd, const char *path, int flags, int atflags)
3245  * change flags given a pathname relative to a filedescriptor
3246  */
3247 int sys_chflagsat(struct chflagsat_args *uap)
3248 {
3249 	struct nlookupdata nd;
3250 	struct vnode *vp;
3251 	struct file *fp;
3252 	int error;
3253 	int lookupflags;
3254 
3255 	if (uap->atflags & ~AT_SYMLINK_NOFOLLOW)
3256 		return (EINVAL);
3257 
3258 	lookupflags = (uap->atflags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3259 
3260 	vp = NULL;
3261 	error = nlookup_init_at(&nd, &fp, uap->fd,  uap->path, UIO_USERSPACE, lookupflags);
3262 	if (error == 0)
3263 		error = nlookup(&nd);
3264 	if (error == 0)
3265 		error = ncp_writechk(&nd.nl_nch);
3266 	if (error == 0)
3267 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3268 	nlookup_done_at(&nd, fp);
3269 	if (error == 0) {
3270 		error = setfflags(vp, uap->flags);
3271 		vrele(vp);
3272 	}
3273 	return (error);
3274 }
3275 
3276 
3277 static int
3278 setfmode(struct vnode *vp, int mode)
3279 {
3280 	struct thread *td = curthread;
3281 	int error;
3282 	struct vattr vattr;
3283 
3284 	/*
3285 	 * note: vget is required for any operation that might mod the vnode
3286 	 * so VINACTIVE is properly cleared.
3287 	 */
3288 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3289 		VATTR_NULL(&vattr);
3290 		vattr.va_mode = mode & ALLPERMS;
3291 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3292 		cache_inval_wxok(vp);
3293 		vput(vp);
3294 	}
3295 	return error;
3296 }
3297 
3298 int
3299 kern_chmod(struct nlookupdata *nd, int mode)
3300 {
3301 	struct vnode *vp;
3302 	int error;
3303 
3304 	if ((error = nlookup(nd)) != 0)
3305 		return (error);
3306 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3307 		return (error);
3308 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3309 		error = setfmode(vp, mode);
3310 	vrele(vp);
3311 	return (error);
3312 }
3313 
3314 /*
3315  * chmod_args(char *path, int mode)
3316  *
3317  * Change mode of a file given path name.
3318  */
3319 int
3320 sys_chmod(struct chmod_args *uap)
3321 {
3322 	struct nlookupdata nd;
3323 	int error;
3324 
3325 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3326 	if (error == 0)
3327 		error = kern_chmod(&nd, uap->mode);
3328 	nlookup_done(&nd);
3329 	return (error);
3330 }
3331 
3332 /*
3333  * lchmod_args(char *path, int mode)
3334  *
3335  * Change mode of a file given path name (don't follow links.)
3336  */
3337 int
3338 sys_lchmod(struct lchmod_args *uap)
3339 {
3340 	struct nlookupdata nd;
3341 	int error;
3342 
3343 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3344 	if (error == 0)
3345 		error = kern_chmod(&nd, uap->mode);
3346 	nlookup_done(&nd);
3347 	return (error);
3348 }
3349 
3350 /*
3351  * fchmod_args(int fd, int mode)
3352  *
3353  * Change mode of a file given a file descriptor.
3354  */
3355 int
3356 sys_fchmod(struct fchmod_args *uap)
3357 {
3358 	struct thread *td = curthread;
3359 	struct file *fp;
3360 	int error;
3361 
3362 	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
3363 		return (error);
3364 	if (fp->f_nchandle.ncp)
3365 		error = ncp_writechk(&fp->f_nchandle);
3366 	if (error == 0)
3367 		error = setfmode((struct vnode *)fp->f_data, uap->mode);
3368 	fdrop(fp);
3369 	return (error);
3370 }
3371 
3372 /*
3373  * fchmodat_args(char *path, int mode)
3374  *
3375  * Change mode of a file pointed to by fd/path.
3376  */
3377 int
3378 sys_fchmodat(struct fchmodat_args *uap)
3379 {
3380 	struct nlookupdata nd;
3381 	struct file *fp;
3382 	int error;
3383 	int flags;
3384 
3385 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3386 		return (EINVAL);
3387 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3388 
3389 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3390 				UIO_USERSPACE, flags);
3391 	if (error == 0)
3392 		error = kern_chmod(&nd, uap->mode);
3393 	nlookup_done_at(&nd, fp);
3394 	return (error);
3395 }
3396 
3397 static int
3398 setfown(struct mount *mp, struct vnode *vp, uid_t uid, gid_t gid)
3399 {
3400 	struct thread *td = curthread;
3401 	int error;
3402 	struct vattr vattr;
3403 	uid_t o_uid;
3404 	gid_t o_gid;
3405 	uint64_t size;
3406 
3407 	/*
3408 	 * note: vget is required for any operation that might mod the vnode
3409 	 * so VINACTIVE is properly cleared.
3410 	 */
3411 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3412 		if ((error = VOP_GETATTR(vp, &vattr)) != 0)
3413 			return error;
3414 		o_uid = vattr.va_uid;
3415 		o_gid = vattr.va_gid;
3416 		size = vattr.va_size;
3417 
3418 		VATTR_NULL(&vattr);
3419 		vattr.va_uid = uid;
3420 		vattr.va_gid = gid;
3421 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3422 		vput(vp);
3423 	}
3424 
3425 	if (error == 0) {
3426 		if (uid == -1)
3427 			uid = o_uid;
3428 		if (gid == -1)
3429 			gid = o_gid;
3430 		VFS_ACCOUNT(mp, o_uid, o_gid, -size);
3431 		VFS_ACCOUNT(mp,   uid,   gid,  size);
3432 	}
3433 
3434 	return error;
3435 }
3436 
3437 int
3438 kern_chown(struct nlookupdata *nd, int uid, int gid)
3439 {
3440 	struct vnode *vp;
3441 	int error;
3442 
3443 	if ((error = nlookup(nd)) != 0)
3444 		return (error);
3445 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3446 		return (error);
3447 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3448 		error = setfown(nd->nl_nch.mount, vp, uid, gid);
3449 	vrele(vp);
3450 	return (error);
3451 }
3452 
3453 /*
3454  * chown(char *path, int uid, int gid)
3455  *
3456  * Set ownership given a path name.
3457  */
3458 int
3459 sys_chown(struct chown_args *uap)
3460 {
3461 	struct nlookupdata nd;
3462 	int error;
3463 
3464 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3465 	if (error == 0)
3466 		error = kern_chown(&nd, uap->uid, uap->gid);
3467 	nlookup_done(&nd);
3468 	return (error);
3469 }
3470 
3471 /*
3472  * lchown_args(char *path, int uid, int gid)
3473  *
3474  * Set ownership given a path name, do not cross symlinks.
3475  */
3476 int
3477 sys_lchown(struct lchown_args *uap)
3478 {
3479 	struct nlookupdata nd;
3480 	int error;
3481 
3482 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3483 	if (error == 0)
3484 		error = kern_chown(&nd, uap->uid, uap->gid);
3485 	nlookup_done(&nd);
3486 	return (error);
3487 }
3488 
3489 /*
3490  * fchown_args(int fd, int uid, int gid)
3491  *
3492  * Set ownership given a file descriptor.
3493  */
3494 int
3495 sys_fchown(struct fchown_args *uap)
3496 {
3497 	struct thread *td = curthread;
3498 	struct proc *p = td->td_proc;
3499 	struct file *fp;
3500 	int error;
3501 
3502 	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
3503 		return (error);
3504 	if (fp->f_nchandle.ncp)
3505 		error = ncp_writechk(&fp->f_nchandle);
3506 	if (error == 0)
3507 		error = setfown(p->p_fd->fd_ncdir.mount,
3508 			(struct vnode *)fp->f_data, uap->uid, uap->gid);
3509 	fdrop(fp);
3510 	return (error);
3511 }
3512 
3513 /*
3514  * fchownat(int fd, char *path, int uid, int gid, int flags)
3515  *
3516  * Set ownership of file pointed to by fd/path.
3517  */
3518 int
3519 sys_fchownat(struct fchownat_args *uap)
3520 {
3521 	struct nlookupdata nd;
3522 	struct file *fp;
3523 	int error;
3524 	int flags;
3525 
3526 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3527 		return (EINVAL);
3528 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3529 
3530 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3531 				UIO_USERSPACE, flags);
3532 	if (error == 0)
3533 		error = kern_chown(&nd, uap->uid, uap->gid);
3534 	nlookup_done_at(&nd, fp);
3535 	return (error);
3536 }
3537 
3538 
3539 static int
3540 getutimes(struct timeval *tvp, struct timespec *tsp)
3541 {
3542 	struct timeval tv[2];
3543 	int error;
3544 
3545 	if (tvp == NULL) {
3546 		microtime(&tv[0]);
3547 		TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
3548 		tsp[1] = tsp[0];
3549 	} else {
3550 		if ((error = itimerfix(tvp)) != 0)
3551 			return (error);
3552 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3553 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3554 	}
3555 	return 0;
3556 }
3557 
/*
 * Normalize a utimensat()-style timespec pair into newts[], handling the
 * UTIME_NOW and UTIME_OMIT special nsec values.  *nullflag is set when
 * the caller effectively asked for "both times now" (ts == NULL or both
 * entries UTIME_NOW), which relaxes the permission check downstream via
 * VA_UTIMES_NULL.
 */
static int
getutimens(const struct timespec *ts, struct timespec *newts, int *nullflag)
{
	struct timespec tsnow;
	int error;

	*nullflag = 0;
	nanotime(&tsnow);
	if (ts == NULL) {
		/* NULL means "set both times to now" */
		newts[0] = tsnow;
		newts[1] = tsnow;
		*nullflag = 1;
		return (0);
	}

	newts[0] = ts[0];
	newts[1] = ts[1];
	/* Both omitted: nothing to do, leave the copies as-is */
	if (newts[0].tv_nsec == UTIME_OMIT && newts[1].tv_nsec == UTIME_OMIT)
		return (0);
	if (newts[0].tv_nsec == UTIME_NOW && newts[1].tv_nsec == UTIME_NOW)
		*nullflag = 1;

	/* atime: VNOVAL tells VOP_SETATTR to leave the field untouched */
	if (newts[0].tv_nsec == UTIME_OMIT)
		newts[0].tv_sec = VNOVAL;
	else if (newts[0].tv_nsec == UTIME_NOW)
		newts[0] = tsnow;
	else if ((error = itimespecfix(&newts[0])) != 0)
		return (error);

	/* mtime: same treatment as atime above */
	if (newts[1].tv_nsec == UTIME_OMIT)
		newts[1].tv_sec = VNOVAL;
	else if (newts[1].tv_nsec == UTIME_NOW)
		newts[1] = tsnow;
	else if ((error = itimespecfix(&newts[1])) != 0)
		return (error);

	return (0);
}
3596 
/*
 * Apply atime (ts[0]) and mtime (ts[1]) to the vnode via VOP_SETATTR.
 * nullflag sets VA_UTIMES_NULL, which lets the filesystem use the
 * relaxed "utimes(NULL)" permission semantics.  The caller supplies a
 * scratch vattr and must hold the vnode appropriately for VOP_SETATTR.
 */
static int
setutimes(struct vnode *vp, struct vattr *vattr,
	  const struct timespec *ts, int nullflag)
{
	struct thread *td = curthread;
	int error;

	VATTR_NULL(vattr);
	vattr->va_atime = ts[0];
	vattr->va_mtime = ts[1];
	if (nullflag)
		vattr->va_vaflags |= VA_UTIMES_NULL;
	error = VOP_SETATTR(vp, vattr, td->td_ucred);

	return error;
}
3613 
3614 int
3615 kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
3616 {
3617 	struct timespec ts[2];
3618 	int error;
3619 
3620 	if (tptr) {
3621 		if ((error = getutimes(tptr, ts)) != 0)
3622 			return (error);
3623 	}
3624 	error = kern_utimensat(nd, tptr ? ts : NULL, 0);
3625 	return (error);
3626 }
3627 
3628 /*
3629  * utimes_args(char *path, struct timeval *tptr)
3630  *
3631  * Set the access and modification times of a file.
3632  */
3633 int
3634 sys_utimes(struct utimes_args *uap)
3635 {
3636 	struct timeval tv[2];
3637 	struct nlookupdata nd;
3638 	int error;
3639 
3640 	if (uap->tptr) {
3641  		error = copyin(uap->tptr, tv, sizeof(tv));
3642 		if (error)
3643 			return (error);
3644 	}
3645 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3646 	if (error == 0)
3647 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3648 	nlookup_done(&nd);
3649 	return (error);
3650 }
3651 
3652 /*
3653  * lutimes_args(char *path, struct timeval *tptr)
3654  *
3655  * Set the access and modification times of a file.
3656  */
3657 int
3658 sys_lutimes(struct lutimes_args *uap)
3659 {
3660 	struct timeval tv[2];
3661 	struct nlookupdata nd;
3662 	int error;
3663 
3664 	if (uap->tptr) {
3665 		error = copyin(uap->tptr, tv, sizeof(tv));
3666 		if (error)
3667 			return (error);
3668 	}
3669 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3670 	if (error == 0)
3671 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3672 	nlookup_done(&nd);
3673 	return (error);
3674 }
3675 
3676 /*
3677  * Set utimes on a file descriptor.  The creds used to open the
3678  * file are used to determine whether the operation is allowed
3679  * or not.
3680  */
3681 int
3682 kern_futimens(int fd, struct timespec *ts)
3683 {
3684 	struct thread *td = curthread;
3685 	struct timespec newts[2];
3686 	struct file *fp;
3687 	struct vnode *vp;
3688 	struct vattr vattr;
3689 	int nullflag;
3690 	int error;
3691 
3692 	error = getutimens(ts, newts, &nullflag);
3693 	if (error)
3694 		return (error);
3695 	if ((error = holdvnode(td, fd, &fp)) != 0)
3696 		return (error);
3697 	if (fp->f_nchandle.ncp)
3698 		error = ncp_writechk(&fp->f_nchandle);
3699 	if (error == 0) {
3700 		vp = fp->f_data;
3701 		error = vget(vp, LK_EXCLUSIVE);
3702 		if (error == 0) {
3703 			error = VOP_GETATTR(vp, &vattr);
3704 			if (error == 0) {
3705 				error = naccess_va(&vattr, NLC_OWN | NLC_WRITE,
3706 						   fp->f_cred);
3707 			}
3708 			if (error == 0) {
3709 				error = setutimes(vp, &vattr, newts, nullflag);
3710 			}
3711 			vput(vp);
3712 		}
3713 	}
3714 	fdrop(fp);
3715 	return (error);
3716 }
3717 
3718 /*
3719  * futimens_args(int fd, struct timespec *ts)
3720  *
3721  * Set the access and modification times of a file.
3722  */
3723 int
3724 sys_futimens(struct futimens_args *uap)
3725 {
3726 	struct timespec ts[2];
3727 	int error;
3728 
3729 	if (uap->ts) {
3730 		error = copyin(uap->ts, ts, sizeof(ts));
3731 		if (error)
3732 			return (error);
3733 	}
3734 	error = kern_futimens(uap->fd, uap->ts ? ts : NULL);
3735 	return (error);
3736 }
3737 
3738 int
3739 kern_futimes(int fd, struct timeval *tptr)
3740 {
3741 	struct timespec ts[2];
3742 	int error;
3743 
3744 	if (tptr) {
3745 		if ((error = getutimes(tptr, ts)) != 0)
3746 			return (error);
3747 	}
3748 	error = kern_futimens(fd, tptr ? ts : NULL);
3749 	return (error);
3750 }
3751 
3752 /*
3753  * futimes_args(int fd, struct timeval *tptr)
3754  *
3755  * Set the access and modification times of a file.
3756  */
3757 int
3758 sys_futimes(struct futimes_args *uap)
3759 {
3760 	struct timeval tv[2];
3761 	int error;
3762 
3763 	if (uap->tptr) {
3764 		error = copyin(uap->tptr, tv, sizeof(tv));
3765 		if (error)
3766 			return (error);
3767 	}
3768 	error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
3769 	return (error);
3770 }
3771 
/*
 * Common utimensat backend.  ts may be NULL ("set both times to now");
 * flags accepts only AT_SYMLINK_NOFOLLOW (which is acted on by the
 * caller's nlookup flags, not here — it is only validated here).
 */
int
kern_utimensat(struct nlookupdata *nd, const struct timespec *ts, int flags)
{
	struct timespec newts[2];
	struct vnode *vp;
	struct vattr vattr;
	int nullflag;
	int error;

	if (flags & ~AT_SYMLINK_NOFOLLOW)
		return (EINVAL);

	error = getutimens(ts, newts, &nullflag);
	if (error)
		return (error);

	/* nlookup performs the owner/write permission check for us */
	nd->nl_flags |= NLC_OWN | NLC_WRITE;
	if ((error = nlookup(nd)) != 0)
		return (error);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
		return (error);
	/*
	 * cache_vref gave us one ref; vget takes a second plus the lock.
	 * vput drops the lock+ref from vget, vrele drops the original.
	 */
	if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
		error = vget(vp, LK_EXCLUSIVE);
		if (error == 0) {
			error = setutimes(vp, &vattr, newts, nullflag);
			vput(vp);
		}
	}
	vrele(vp);
	return (error);
}
3805 
3806 /*
3807  * utimensat_args(int fd, const char *path, const struct timespec *ts, int flags);
3808  *
3809  * Set file access and modification times of a file.
3810  */
3811 int
3812 sys_utimensat(struct utimensat_args *uap)
3813 {
3814 	struct timespec ts[2];
3815 	struct nlookupdata nd;
3816 	struct file *fp;
3817 	int error;
3818 	int flags;
3819 
3820 	if (uap->ts) {
3821 		error = copyin(uap->ts, ts, sizeof(ts));
3822 		if (error)
3823 			return (error);
3824 	}
3825 
3826 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3827 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3828 	                        UIO_USERSPACE, flags);
3829 	if (error == 0)
3830 		error = kern_utimensat(&nd, uap->ts ? ts : NULL, uap->flags);
3831 	nlookup_done_at(&nd, fp);
3832 	return (error);
3833 }
3834 
/*
 * Path-based truncate backend.  Resolves the lookup with write+truncate
 * intent, locks the vnode, and sets va_size.  When quotas are enabled
 * the size delta is charged to the file's owner via VFS_ACCOUNT().
 */
int
kern_truncate(struct nlookupdata *nd, off_t length)
{
	struct vnode *vp;
	struct vattr vattr;
	int error;
	uid_t uid = 0;
	gid_t gid = 0;
	uint64_t old_size = 0;

	if (length < 0)
		return(EINVAL);
	nd->nl_flags |= NLC_WRITE | NLC_TRUNCATE;
	if ((error = nlookup(nd)) != 0)
		return (error);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
		return (error);
	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
	if (error) {
		vrele(vp);
		return (error);
	}
	if (vp->v_type == VDIR) {
		error = EISDIR;
		goto done;
	}
	/* Snapshot owner and size for the quota accounting delta below */
	if (vfs_quota_enabled) {
		error = VOP_GETATTR(vp, &vattr);
		KASSERT(error == 0, ("kern_truncate(): VOP_GETATTR didn't return 0"));
		uid = vattr.va_uid;
		gid = vattr.va_gid;
		old_size = vattr.va_size;
	}

	if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
		/* charge (or credit, if shrinking) the size change */
		VFS_ACCOUNT(nd->nl_nch.mount, uid, gid, length - old_size);
	}
done:
	/* vput releases both the lock and the cache_vref reference */
	vput(vp);
	return (error);
}
3881 
3882 /*
3883  * truncate(char *path, int pad, off_t length)
3884  *
3885  * Truncate a file given its path name.
3886  */
3887 int
3888 sys_truncate(struct truncate_args *uap)
3889 {
3890 	struct nlookupdata nd;
3891 	int error;
3892 
3893 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3894 	if (error == 0)
3895 		error = kern_truncate(&nd, uap->length);
3896 	nlookup_done(&nd);
3897 	return error;
3898 }
3899 
/*
 * Descriptor-based truncate backend.  The fd must have been opened for
 * writing and must not be append-only.  When quotas are enabled the
 * size delta is charged to the file's owner via VFS_ACCOUNT().
 */
int
kern_ftruncate(int fd, off_t length)
{
	struct thread *td = curthread;
	struct vattr vattr;
	struct vnode *vp;
	struct file *fp;
	int error;
	uid_t uid = 0;
	gid_t gid = 0;
	uint64_t old_size = 0;
	struct mount *mp;

	if (length < 0)
		return(EINVAL);
	if ((error = holdvnode(td, fd, &fp)) != 0)
		return (error);
	/* Check for a read-only mount if the fd has a namecache handle */
	if (fp->f_nchandle.ncp) {
		error = ncp_writechk(&fp->f_nchandle);
		if (error)
			goto done;
	}
	if ((fp->f_flag & FWRITE) == 0) {
		error = EINVAL;
		goto done;
	}
	if (fp->f_flag & FAPPENDONLY) {	/* inode was set append-only */
		error = EINVAL;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR) {
		error = EISDIR;
		vn_unlock(vp);
		goto done;
	}

	/* Snapshot owner and size for the quota accounting delta below */
	if (vfs_quota_enabled) {
		error = VOP_GETATTR(vp, &vattr);
		KASSERT(error == 0, ("kern_ftruncate(): VOP_GETATTR didn't return 0"));
		uid = vattr.va_uid;
		gid = vattr.va_gid;
		old_size = vattr.va_size;
	}

	if ((error = vn_writechk(vp, NULL)) == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
		/* charge (or credit, if shrinking) the size change */
		mp = vq_vptomp(vp);
		VFS_ACCOUNT(mp, uid, gid, length - old_size);
	}
	vn_unlock(vp);
done:
	fdrop(fp);
	return (error);
}
3958 
3959 /*
3960  * ftruncate_args(int fd, int pad, off_t length)
3961  *
3962  * Truncate a file given a file descriptor.
3963  */
3964 int
3965 sys_ftruncate(struct ftruncate_args *uap)
3966 {
3967 	int error;
3968 
3969 	error = kern_ftruncate(uap->fd, uap->length);
3970 
3971 	return (error);
3972 }
3973 
3974 /*
3975  * fsync(int fd)
3976  *
3977  * Sync an open file.
3978  */
3979 int
3980 sys_fsync(struct fsync_args *uap)
3981 {
3982 	struct thread *td = curthread;
3983 	struct vnode *vp;
3984 	struct file *fp;
3985 	vm_object_t obj;
3986 	int error;
3987 
3988 	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
3989 		return (error);
3990 	vp = (struct vnode *)fp->f_data;
3991 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3992 	if ((obj = vp->v_object) != NULL) {
3993 		if (vp->v_mount == NULL ||
3994 		    (vp->v_mount->mnt_kern_flag & MNTK_NOMSYNC) == 0) {
3995 			vm_object_page_clean(obj, 0, 0, 0);
3996 		}
3997 	}
3998 	error = VOP_FSYNC(vp, MNT_WAIT, VOP_FSYNC_SYSCALL);
3999 	if (error == 0 && vp->v_mount)
4000 		error = buf_fsync(vp);
4001 	vn_unlock(vp);
4002 	fdrop(fp);
4003 
4004 	return (error);
4005 }
4006 
/*
 * Common rename backend.  Resolves both the source and target lookups,
 * revalidates them after the unavoidable unlock/relock window, and then
 * issues VOP_NRENAME (or VOP_NREMOVE when source and target turn out to
 * be hardlinks to the same file).  Returns EAGAIN when a namecache race
 * is detected; callers are expected to retry.
 */
int
kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
{
	struct nchandle fnchd;
	struct nchandle tnchd;
	struct namecache *ncp;
	struct vnode *fdvp;
	struct vnode *tdvp;
	struct mount *mp;
	int error;
	u_int fncp_gen;
	u_int tncp_gen;

	bwillinode(1);
	fromnd->nl_flags |= NLC_REFDVP | NLC_RENAME_SRC;
	if ((error = nlookup(fromnd)) != 0)
		return (error);
	if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
		return (ENOENT);
	fnchd.mount = fromnd->nl_nch.mount;
	cache_hold(&fnchd);

	/*
	 * unlock the source nch so we can lookup the target nch without
	 * deadlocking.  The target may or may not exist so we do not check
	 * for a target vp like kern_mkdir() and other creation functions do.
	 *
	 * The source and target directories are ref'd and rechecked after
	 * everything is relocked to determine if the source or target file
	 * has been renamed.
	 */
	KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
	fromnd->nl_flags &= ~NLC_NCPISLOCKED;

	/* generation counter snapshot, revalidated after the relock */
	fncp_gen = fromnd->nl_nch.ncp->nc_generation;

	cache_unlock(&fromnd->nl_nch);

	tond->nl_flags |= NLC_RENAME_DST | NLC_REFDVP;
	if ((error = nlookup(tond)) != 0) {
		cache_drop(&fnchd);
		return (error);
	}
	tncp_gen = tond->nl_nch.ncp->nc_generation;

	if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
		cache_drop(&fnchd);
		return (ENOENT);
	}
	tnchd.mount = tond->nl_nch.mount;
	cache_hold(&tnchd);

	/*
	 * If the source and target are the same there is nothing to do
	 */
	if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (0);
	}

	/*
	 * Mount points cannot be renamed or overwritten
	 */
	if ((fromnd->nl_nch.ncp->nc_flag | tond->nl_nch.ncp->nc_flag) &
	    NCF_ISMOUNTPT
	) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EINVAL);
	}

	/*
	 * Relock the source ncp.  cache_relock() will deal with any
	 * deadlocks against the already-locked tond and will also
	 * make sure both are resolved.
	 *
	 * NOTE AFTER RELOCKING: The source or target ncp may have become
	 * invalid while they were unlocked, nc_vp and nc_mount could
	 * be NULL.
	 */
	cache_relock(&fromnd->nl_nch, fromnd->nl_cred,
		     &tond->nl_nch, tond->nl_cred);
	fromnd->nl_flags |= NLC_NCPISLOCKED;

	/*
	 * If the namecache generation changed for either fromnd or tond,
	 * we must retry.
	 */
	if (fromnd->nl_nch.ncp->nc_generation != fncp_gen ||
	    tond->nl_nch.ncp->nc_generation != tncp_gen) {
		kprintf("kern_rename: retry due to gen on: "
			"\"%s\" -> \"%s\"\n",
			fromnd->nl_nch.ncp->nc_name,
			tond->nl_nch.ncp->nc_name);
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EAGAIN);
	}

	/*
	 * If either fromnd or tond are marked destroyed a ripout occurred
	 * out from under us and we must retry.
	 */
	if ((fromnd->nl_nch.ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED)) ||
	    fromnd->nl_nch.ncp->nc_vp == NULL ||
	    (tond->nl_nch.ncp->nc_flag & NCF_DESTROYED)) {
		kprintf("kern_rename: retry due to ripout on: "
			"\"%s\" -> \"%s\"\n",
			fromnd->nl_nch.ncp->nc_name,
			tond->nl_nch.ncp->nc_name);
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EAGAIN);
	}

	/*
	 * Make sure the parent directories linkages are the same.
	 * XXX shouldn't be needed any more w/ generation check above.
	 */
	if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent ||
	    tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (ENOENT);
	}

	/*
	 * Both the source and target must be within the same filesystem and
	 * in the same filesystem as their parent directories within the
	 * namecache topology.
	 *
	 * NOTE: fromnd's nc_mount or nc_vp could be NULL.
	 */
	mp = fnchd.mount;
	if (mp != tnchd.mount || mp != fromnd->nl_nch.mount ||
	    mp != tond->nl_nch.mount) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EXDEV);
	}

	/*
	 * Make sure the mount point is writable
	 */
	if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (error);
	}

	/*
	 * If the target exists and either the source or target is a directory,
	 * then both must be directories.
	 *
	 * Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp might
	 * have become NULL.
	 */
	if (tond->nl_nch.ncp->nc_vp) {
		if (fromnd->nl_nch.ncp->nc_vp == NULL) {
			error = ENOENT;
		} else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
			if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
				error = ENOTDIR;
		} else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
			error = EISDIR;
		}
	}

	/*
	 * You cannot rename a source into itself or a subdirectory of itself.
	 * We check this by traversing the target directory upwards looking
	 * for a match against the source.
	 *
	 * XXX MPSAFE
	 */
	if (error == 0) {
		for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
			if (fromnd->nl_nch.ncp == ncp) {
				error = EINVAL;
				break;
			}
		}
	}

	cache_drop(&fnchd);
	cache_drop(&tnchd);

	/*
	 * Even though the namespaces are different, they may still represent
	 * hardlinks to the same file.  The filesystem might have a hard time
	 * with this so we issue a NREMOVE of the source instead of a NRENAME
	 * when we detect the situation.
	 */
	if (error == 0) {
		fdvp = fromnd->nl_dvp;
		tdvp = tond->nl_dvp;
		if (fdvp == NULL || tdvp == NULL) {
			error = EPERM;
		} else if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
			error = VOP_NREMOVE(&fromnd->nl_nch, fdvp,
					    fromnd->nl_cred);
		} else {
			error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
					    fdvp, tdvp, tond->nl_cred);
		}
	}
	return (error);
}
4216 
4217 /*
4218  * rename_args(char *from, char *to)
4219  *
4220  * Rename files.  Source and destination must either both be directories,
4221  * or both not be directories.  If target is a directory, it must be empty.
4222  */
4223 int
4224 sys_rename(struct rename_args *uap)
4225 {
4226 	struct nlookupdata fromnd, tond;
4227 	int error;
4228 
4229 	do {
4230 		error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
4231 		if (error == 0) {
4232 			error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
4233 			if (error == 0)
4234 				error = kern_rename(&fromnd, &tond);
4235 			nlookup_done(&tond);
4236 		}
4237 		nlookup_done(&fromnd);
4238 	} while (error == EAGAIN);
4239 	return (error);
4240 }
4241 
4242 /*
4243  * renameat_args(int oldfd, char *old, int newfd, char *new)
4244  *
4245  * Rename files using paths relative to the directories associated with
4246  * oldfd and newfd.  Source and destination must either both be directories,
4247  * or both not be directories.  If target is a directory, it must be empty.
4248  */
4249 int
4250 sys_renameat(struct renameat_args *uap)
4251 {
4252 	struct nlookupdata oldnd, newnd;
4253 	struct file *oldfp, *newfp;
4254 	int error;
4255 
4256 	do {
4257 		error = nlookup_init_at(&oldnd, &oldfp,
4258 					uap->oldfd, uap->old,
4259 					UIO_USERSPACE, 0);
4260 		if (error == 0) {
4261 			error = nlookup_init_at(&newnd, &newfp,
4262 						uap->newfd, uap->new,
4263 						UIO_USERSPACE, 0);
4264 			if (error == 0)
4265 				error = kern_rename(&oldnd, &newnd);
4266 			nlookup_done_at(&newnd, newfp);
4267 		}
4268 		nlookup_done_at(&oldnd, oldfp);
4269 	} while (error == EAGAIN);
4270 	return (error);
4271 }
4272 
/*
 * Common mkdir backend.  Resolves the lookup with create intent,
 * verifies the name does not already exist and the mount is writable,
 * then issues VOP_NMKDIR.  The directory mode is masked by the process
 * umask.
 */
int
kern_mkdir(struct nlookupdata *nd, int mode)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct vattr vattr;
	int error;

	bwillinode(1);
	nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);

	/* an existing vnode at the target name means EEXIST */
	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	VATTR_NULL(&vattr);
	vattr.va_type = VDIR;
	/* apply the process umask to the requested permission bits */
	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;

	vp = NULL;
	error = VOP_NMKDIR(&nd->nl_nch, nd->nl_dvp, &vp, td->td_ucred, &vattr);
	/* on success the new directory vnode comes back locked+ref'd */
	if (error == 0)
		vput(vp);
	return (error);
}
4301 
4302 /*
4303  * mkdir_args(char *path, int mode)
4304  *
4305  * Make a directory file.
4306  */
4307 int
4308 sys_mkdir(struct mkdir_args *uap)
4309 {
4310 	struct nlookupdata nd;
4311 	int error;
4312 
4313 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
4314 	if (error == 0)
4315 		error = kern_mkdir(&nd, uap->mode);
4316 	nlookup_done(&nd);
4317 	return (error);
4318 }
4319 
4320 /*
4321  * mkdirat_args(int fd, char *path, mode_t mode)
4322  *
4323  * Make a directory file.  The path is relative to the directory associated
4324  * with fd.
4325  */
4326 int
4327 sys_mkdirat(struct mkdirat_args *uap)
4328 {
4329 	struct nlookupdata nd;
4330 	struct file *fp;
4331 	int error;
4332 
4333 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
4334 	if (error == 0)
4335 		error = kern_mkdir(&nd, uap->mode);
4336 	nlookup_done_at(&nd, fp);
4337 	return (error);
4338 }
4339 
/*
 * Common rmdir backend.  Resolves the lookup with delete intent and
 * issues VOP_NRMDIR.  Mount points are refused outright.
 */
int
kern_rmdir(struct nlookupdata *nd)
{
	int error;

	bwillinode(1);
	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);

	/*
	 * Do not allow directories representing mount points to be
	 * deleted, even if empty.  Check write perms on mount point
	 * in case the vnode is aliased (aka nullfs).
	 */
	if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
		return (EBUSY);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	error = VOP_NRMDIR(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
	return (error);
}
4362 
4363 /*
4364  * rmdir_args(char *path)
4365  *
4366  * Remove a directory file.
4367  */
4368 int
4369 sys_rmdir(struct rmdir_args *uap)
4370 {
4371 	struct nlookupdata nd;
4372 	int error;
4373 
4374 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
4375 	if (error == 0)
4376 		error = kern_rmdir(&nd);
4377 	nlookup_done(&nd);
4378 	return (error);
4379 }
4380 
/*
 * Common getdirentries backend.  Reads directory entries from fd into
 * buf (at most count bytes) via VOP_READDIR, advancing the file offset.
 * On success *res is the number of bytes produced and, when basep is
 * non-NULL, *basep receives the pre-read seek offset (truncated to
 * long — see the WARNING below).
 */
int
kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
		   enum uio_seg direction)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	off_t loff;
	int error, eofflag;

	if ((error = holdvnode(td, fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		error = EBADF;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;
	if (vp->v_type != VDIR) {
		error = EINVAL;
		goto done;
	}
	/* build a single-iovec uio describing the caller's buffer */
	aiov.iov_base = buf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = direction;
	auio.uio_td = td;
	auio.uio_resid = count;
	loff = auio.uio_offset = fp->f_offset;
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
	fp->f_offset = auio.uio_offset;
	if (error)
		goto done;

	/*
	 * WARNING!  *basep may not be wide enough to accomodate the
	 * seek offset.   XXX should we hack this to return the upper 32 bits
	 * for offsets greater then 4G?
	 */
	if (basep) {
		*basep = (long)loff;
	}
	*res = count - auio.uio_resid;
done:
	fdrop(fp);
	return (error);
}
4431 
4432 /*
4433  * getdirentries_args(int fd, char *buf, u_int conut, long *basep)
4434  *
4435  * Read a block of directory entries in a file system independent format.
4436  */
4437 int
4438 sys_getdirentries(struct getdirentries_args *uap)
4439 {
4440 	long base;
4441 	int error;
4442 
4443 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
4444 				   &uap->sysmsg_result, UIO_USERSPACE);
4445 
4446 	if (error == 0 && uap->basep)
4447 		error = copyout(&base, uap->basep, sizeof(*uap->basep));
4448 	return (error);
4449 }
4450 
4451 /*
4452  * getdents_args(int fd, char *buf, size_t count)
4453  */
4454 int
4455 sys_getdents(struct getdents_args *uap)
4456 {
4457 	int error;
4458 
4459 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
4460 				   &uap->sysmsg_result, UIO_USERSPACE);
4461 
4462 	return (error);
4463 }
4464 
4465 /*
4466  * Set the mode mask for creation of filesystem nodes.
4467  *
4468  * umask(int newmask)
4469  */
4470 int
4471 sys_umask(struct umask_args *uap)
4472 {
4473 	struct thread *td = curthread;
4474 	struct proc *p = td->td_proc;
4475 	struct filedesc *fdp;
4476 
4477 	fdp = p->p_fd;
4478 	uap->sysmsg_result = fdp->fd_cmask;
4479 	fdp->fd_cmask = uap->newmask & ALLPERMS;
4480 	return (0);
4481 }
4482 
4483 /*
4484  * revoke(char *path)
4485  *
4486  * Void all references to file by ripping underlying filesystem
4487  * away from vnode.
4488  */
4489 int
4490 sys_revoke(struct revoke_args *uap)
4491 {
4492 	struct nlookupdata nd;
4493 	struct vattr vattr;
4494 	struct vnode *vp;
4495 	struct ucred *cred;
4496 	int error;
4497 
4498 	vp = NULL;
4499 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4500 	if (error == 0)
4501 		error = nlookup(&nd);
4502 	if (error == 0)
4503 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
4504 	cred = crhold(nd.nl_cred);
4505 	nlookup_done(&nd);
4506 	if (error == 0) {
4507 		if (error == 0)
4508 			error = VOP_GETATTR(vp, &vattr);
4509 		if (error == 0 && cred->cr_uid != vattr.va_uid)
4510 			error = priv_check_cred(cred, PRIV_VFS_REVOKE, 0);
4511 		if (error == 0 && (vp->v_type == VCHR || vp->v_type == VBLK)) {
4512 			if (vcount(vp) > 0)
4513 				error = vrevoke(vp, cred);
4514 		} else if (error == 0) {
4515 			error = vrevoke(vp, cred);
4516 		}
4517 		vrele(vp);
4518 	}
4519 	if (cred)
4520 		crfree(cred);
4521 	return (error);
4522 }
4523 
4524 /*
4525  * getfh_args(char *fname, fhandle_t *fhp)
4526  *
4527  * Get (NFS) file handle
4528  *
4529  * NOTE: We use the fsid of the covering mount, even if it is a nullfs
4530  * mount.  This allows nullfs mounts to be explicitly exported.
4531  *
4532  * WARNING: nullfs mounts of HAMMER PFS ROOTs are safe.
4533  *
4534  * 	    nullfs mounts of subdirectories are not safe.  That is, it will
4535  *	    work, but you do not really have protection against access to
4536  *	    the related parent directories.
4537  */
4538 int
4539 sys_getfh(struct getfh_args *uap)
4540 {
4541 	struct thread *td = curthread;
4542 	struct nlookupdata nd;
4543 	fhandle_t fh;
4544 	struct vnode *vp;
4545 	struct mount *mp;
4546 	int error;
4547 
4548 	/*
4549 	 * Must be super user
4550 	 */
4551 	if ((error = priv_check(td, PRIV_ROOT)) != 0)
4552 		return (error);
4553 
4554 	vp = NULL;
4555 	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
4556 	if (error == 0)
4557 		error = nlookup(&nd);
4558 	if (error == 0)
4559 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4560 	mp = nd.nl_nch.mount;
4561 	nlookup_done(&nd);
4562 	if (error == 0) {
4563 		bzero(&fh, sizeof(fh));
4564 		fh.fh_fsid = mp->mnt_stat.f_fsid;
4565 		error = VFS_VPTOFH(vp, &fh.fh_fid);
4566 		vput(vp);
4567 		if (error == 0)
4568 			error = copyout(&fh, uap->fhp, sizeof(fh));
4569 	}
4570 	return (error);
4571 }
4572 
4573 /*
4574  * fhopen_args(const struct fhandle *u_fhp, int flags)
4575  *
4576  * syscall for the rpc.lockd to use to translate a NFS file handle into
4577  * an open descriptor.
4578  *
4579  * warning: do not remove the priv_check() call or this becomes one giant
4580  * security hole.
4581  */
4582 int
4583 sys_fhopen(struct fhopen_args *uap)
4584 {
4585 	struct thread *td = curthread;
4586 	struct filedesc *fdp = td->td_proc->p_fd;
4587 	struct mount *mp;
4588 	struct vnode *vp;
4589 	struct fhandle fhp;
4590 	struct vattr vat;
4591 	struct vattr *vap = &vat;
4592 	struct flock lf;
4593 	int fmode, mode, error = 0, type;
4594 	struct file *nfp;
4595 	struct file *fp;
4596 	int indx;
4597 
4598 	/*
4599 	 * Must be super user
4600 	 */
4601 	error = priv_check(td, PRIV_ROOT);
4602 	if (error)
4603 		return (error);
4604 
4605 	fmode = FFLAGS(uap->flags);
4606 
4607 	/*
4608 	 * Why not allow a non-read/write open for our lockd?
4609 	 */
4610 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4611 		return (EINVAL);
4612 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4613 	if (error)
4614 		return(error);
4615 
4616 	/*
4617 	 * Find the mount point
4618 	 */
4619 	mp = vfs_getvfs(&fhp.fh_fsid);
4620 	if (mp == NULL) {
4621 		error = ESTALE;
4622 		goto done2;
4623 	}
4624 	/* now give me my vnode, it gets returned to me locked */
4625 	error = VFS_FHTOVP(mp, NULL, &fhp.fh_fid, &vp);
4626 	if (error)
4627 		goto done;
4628  	/*
4629 	 * from now on we have to make sure not
4630 	 * to forget about the vnode
4631 	 * any error that causes an abort must vput(vp)
4632 	 * just set error = err and 'goto bad;'.
4633 	 */
4634 
4635 	/*
4636 	 * from vn_open
4637 	 */
4638 	if (vp->v_type == VLNK) {
4639 		error = EMLINK;
4640 		goto bad;
4641 	}
4642 	if (vp->v_type == VSOCK) {
4643 		error = EOPNOTSUPP;
4644 		goto bad;
4645 	}
4646 	mode = 0;
4647 	if (fmode & (FWRITE | O_TRUNC)) {
4648 		if (vp->v_type == VDIR) {
4649 			error = EISDIR;
4650 			goto bad;
4651 		}
4652 		error = vn_writechk(vp, NULL);
4653 		if (error)
4654 			goto bad;
4655 		mode |= VWRITE;
4656 	}
4657 	if (fmode & FREAD)
4658 		mode |= VREAD;
4659 	if (mode) {
4660 		error = VOP_ACCESS(vp, mode, td->td_ucred);
4661 		if (error)
4662 			goto bad;
4663 	}
4664 	if (fmode & O_TRUNC) {
4665 		vn_unlock(vp);				/* XXX */
4666 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
4667 		VATTR_NULL(vap);
4668 		vap->va_size = 0;
4669 		error = VOP_SETATTR(vp, vap, td->td_ucred);
4670 		if (error)
4671 			goto bad;
4672 	}
4673 
4674 	/*
4675 	 * VOP_OPEN needs the file pointer so it can potentially override
4676 	 * it.
4677 	 *
4678 	 * WARNING! no f_nchandle will be associated when fhopen()ing a
4679 	 * directory.  XXX
4680 	 */
4681 	if ((error = falloc(td->td_lwp, &nfp, &indx)) != 0)
4682 		goto bad;
4683 	fp = nfp;
4684 
4685 	error = VOP_OPEN(vp, fmode, td->td_ucred, fp);
4686 	if (error) {
4687 		/*
4688 		 * setting f_ops this way prevents VOP_CLOSE from being
4689 		 * called or fdrop() releasing the vp from v_data.   Since
4690 		 * the VOP_OPEN failed we don't want to VOP_CLOSE.
4691 		 */
4692 		fp->f_ops = &badfileops;
4693 		fp->f_data = NULL;
4694 		goto bad_drop;
4695 	}
4696 
4697 	/*
4698 	 * The fp is given its own reference, we still have our ref and lock.
4699 	 *
4700 	 * Assert that all regular files must be created with a VM object.
4701 	 */
4702 	if (vp->v_type == VREG && vp->v_object == NULL) {
4703 		kprintf("fhopen: regular file did not "
4704 			"have VM object: %p\n",
4705 			vp);
4706 		goto bad_drop;
4707 	}
4708 
4709 	/*
4710 	 * The open was successful.  Handle any locking requirements.
4711 	 */
4712 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
4713 		lf.l_whence = SEEK_SET;
4714 		lf.l_start = 0;
4715 		lf.l_len = 0;
4716 		if (fmode & O_EXLOCK)
4717 			lf.l_type = F_WRLCK;
4718 		else
4719 			lf.l_type = F_RDLCK;
4720 		if (fmode & FNONBLOCK)
4721 			type = 0;
4722 		else
4723 			type = F_WAIT;
4724 		vn_unlock(vp);
4725 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK,
4726 					 &lf, type)) != 0) {
4727 			/*
4728 			 * release our private reference.
4729 			 */
4730 			fsetfd(fdp, NULL, indx);
4731 			fdrop(fp);
4732 			vrele(vp);
4733 			goto done;
4734 		}
4735 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4736 		atomic_set_int(&fp->f_flag, FHASLOCK);	/* race ok */
4737 	}
4738 
4739 	/*
4740 	 * Clean up.  Associate the file pointer with the previously
4741 	 * reserved descriptor and return it.
4742 	 */
4743 	vput(vp);
4744 	if (uap->flags & O_CLOEXEC)
4745 		fdp->fd_files[indx].fileflags |= UF_EXCLOSE;
4746 	fsetfd(fdp, fp, indx);
4747 	fdrop(fp);
4748 	uap->sysmsg_result = indx;
4749 	mount_drop(mp);
4750 
4751 	return (error);
4752 
4753 bad_drop:
4754 	fsetfd(fdp, NULL, indx);
4755 	fdrop(fp);
4756 bad:
4757 	vput(vp);
4758 done:
4759 	mount_drop(mp);
4760 done2:
4761 	return (error);
4762 }
4763 
4764 /*
4765  * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
4766  */
4767 int
4768 sys_fhstat(struct fhstat_args *uap)
4769 {
4770 	struct thread *td = curthread;
4771 	struct stat sb;
4772 	fhandle_t fh;
4773 	struct mount *mp;
4774 	struct vnode *vp;
4775 	int error;
4776 
4777 	/*
4778 	 * Must be super user
4779 	 */
4780 	error = priv_check(td, PRIV_ROOT);
4781 	if (error)
4782 		return (error);
4783 
4784 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4785 	if (error)
4786 		return (error);
4787 
4788 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
4789 		error = ESTALE;
4790 	if (error == 0) {
4791 		if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) == 0) {
4792 			error = vn_stat(vp, &sb, td->td_ucred);
4793 			vput(vp);
4794 		}
4795 	}
4796 	if (error == 0)
4797 		error = copyout(&sb, uap->sb, sizeof(sb));
4798 	if (mp)
4799 		mount_drop(mp);
4800 
4801 	return (error);
4802 }
4803 
4804 /*
4805  * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
4806  */
int
sys_fhstatfs(struct fhstatfs_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct statfs *sp;
	struct mount *mp;
	struct vnode *vp;
	struct statfs sb;
	char *fullpath, *freepath;
	fhandle_t fh;
	int error;

	/*
	 * Must be super user - the file handle interface bypasses the
	 * normal path-based permission checks.
	 */
	if ((error = priv_check(td, PRIV_ROOT)))
		return (error);

	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
		return (error);

	/*
	 * vfs_getvfs() returns the mount point with a reference held;
	 * it is dropped at 'done'.
	 */
	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
		error = ESTALE;
		goto done;
	}
	/*
	 * Refuse handles on mounts that are not visible from the
	 * process's (chroot'd) root directory.
	 */
	if (p != NULL && !chroot_visible_mnt(mp, p)) {
		error = ESTALE;
		goto done;
	}

	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) != 0)
		goto done;
	/*
	 * NOTE(review): mp is reassigned from the vnode here.  This
	 * assumes vp->v_mount is the same mount we hold the reference
	 * on (plausible since VFS_FHTOVP was called on mp), otherwise
	 * mount_drop() below would drop the wrong reference -- confirm.
	 */
	mp = vp->v_mount;
	sp = &mp->mnt_stat;
	vput(vp);
	if ((error = VFS_STATFS(mp, sp, td->td_ucred)) != 0)
		goto done;

	/*
	 * Rewrite f_mntonname so the mount point is reported relative
	 * to the caller's root directory; freepath is the allocation
	 * backing fullpath.
	 */
	error = mount_path(p, mp, &fullpath, &freepath);
	if (error)
		goto done;
	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	kfree(freepath, M_TEMP);

	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	/*
	 * Hide the fsid from non-root callers by copying out a
	 * scrubbed local copy.  NOTE(review): the priv_check(PRIV_ROOT)
	 * at function entry already guarantees root here, so this
	 * branch appears to be dead code inherited from sys_statfs()
	 * -- confirm before removing.
	 */
	if (priv_check(td, PRIV_ROOT)) {
		bcopy(sp, &sb, sizeof(sb));
		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
		sp = &sb;
	}
	error = copyout(sp, uap->buf, sizeof(*sp));
done:
	if (mp)
		mount_drop(mp);

	return (error);
}
4866 
4867 /*
4868  * fhstatvfs_args(struct fhandle *u_fhp, struct statvfs *buf)
4869  */
int
sys_fhstatvfs(struct fhstatvfs_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct statvfs *sp;
	struct mount *mp;
	struct vnode *vp;
	fhandle_t fh;
	int error;

	/*
	 * Must be super user - the file handle interface bypasses the
	 * normal path-based permission checks.
	 */
	if ((error = priv_check(td, PRIV_ROOT)))
		return (error);

	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
		return (error);

	/*
	 * vfs_getvfs() returns the mount point with a reference held;
	 * it is dropped at 'done'.
	 */
	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
		error = ESTALE;
		goto done;
	}
	/*
	 * Refuse handles on mounts that are not visible from the
	 * process's (chroot'd) root directory.
	 */
	if (p != NULL && !chroot_visible_mnt(mp, p)) {
		error = ESTALE;
		goto done;
	}

	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)))
		goto done;
	/*
	 * NOTE(review): mp is reassigned from the vnode here; this
	 * assumes vp->v_mount matches the mount we hold the reference
	 * on, otherwise mount_drop() below drops the wrong reference
	 * -- confirm (same pattern as sys_fhstatfs()).
	 */
	mp = vp->v_mount;
	sp = &mp->mnt_vstat;
	vput(vp);
	if ((error = VFS_STATVFS(mp, sp, td->td_ucred)) != 0)
		goto done;

	/* Translate mount flags into statvfs f_flag bits. */
	sp->f_flag = 0;
	if (mp->mnt_flag & MNT_RDONLY)
		sp->f_flag |= ST_RDONLY;
	if (mp->mnt_flag & MNT_NOSUID)
		sp->f_flag |= ST_NOSUID;
	error = copyout(sp, uap->buf, sizeof(*sp));
done:
	if (mp)
		mount_drop(mp);
	return (error);
}
4918 
4919 
4920 /*
4921  * Syscall to push extended attribute configuration information into the
4922  * VFS.  Accepts a path, which it converts to a mountpoint, as well as
4923  * a command (int cmd), and attribute name and misc data.  For now, the
4924  * attribute name is left in userspace for consumption by the VFS_op.
4925  * It will probably be changed to be copied into sysspace by the
4926  * syscall in the future, once issues with various consumers of the
4927  * attribute code have raised their hands.
4928  *
4929  * Currently this is used only by UFS Extended Attributes.
4930  */
4931 int
4932 sys_extattrctl(struct extattrctl_args *uap)
4933 {
4934 	struct nlookupdata nd;
4935 	struct vnode *vp;
4936 	char attrname[EXTATTR_MAXNAMELEN];
4937 	int error;
4938 	size_t size;
4939 
4940 	attrname[0] = 0;
4941 	vp = NULL;
4942 	error = 0;
4943 
4944 	if (error == 0 && uap->filename) {
4945 		error = nlookup_init(&nd, uap->filename, UIO_USERSPACE,
4946 				     NLC_FOLLOW);
4947 		if (error == 0)
4948 			error = nlookup(&nd);
4949 		if (error == 0)
4950 			error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
4951 		nlookup_done(&nd);
4952 	}
4953 
4954 	if (error == 0 && uap->attrname) {
4955 		error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
4956 				  &size);
4957 	}
4958 
4959 	if (error == 0) {
4960 		error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4961 		if (error == 0)
4962 			error = nlookup(&nd);
4963 		if (error == 0)
4964 			error = ncp_writechk(&nd.nl_nch);
4965 		if (error == 0) {
4966 			error = VFS_EXTATTRCTL(nd.nl_nch.mount, uap->cmd, vp,
4967 					       uap->attrnamespace,
4968 					       uap->attrname, nd.nl_cred);
4969 		}
4970 		nlookup_done(&nd);
4971 	}
4972 
4973 	return (error);
4974 }
4975 
4976 /*
4977  * Syscall to get a named extended attribute on a file or directory.
4978  */
4979 int
4980 sys_extattr_set_file(struct extattr_set_file_args *uap)
4981 {
4982 	char attrname[EXTATTR_MAXNAMELEN];
4983 	struct nlookupdata nd;
4984 	struct vnode *vp;
4985 	struct uio auio;
4986 	struct iovec aiov;
4987 	int error;
4988 
4989 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
4990 	if (error)
4991 		return (error);
4992 
4993 	vp = NULL;
4994 
4995 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4996 	if (error == 0)
4997 		error = nlookup(&nd);
4998 	if (error == 0)
4999 		error = ncp_writechk(&nd.nl_nch);
5000 	if (error == 0)
5001 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
5002 	if (error) {
5003 		nlookup_done(&nd);
5004 		return (error);
5005 	}
5006 
5007 	bzero(&auio, sizeof(auio));
5008 	aiov.iov_base = uap->data;
5009 	aiov.iov_len = uap->nbytes;
5010 	auio.uio_iov = &aiov;
5011 	auio.uio_iovcnt = 1;
5012 	auio.uio_offset = 0;
5013 	auio.uio_resid = uap->nbytes;
5014 	auio.uio_rw = UIO_WRITE;
5015 	auio.uio_td = curthread;
5016 
5017 	error = VOP_SETEXTATTR(vp, uap->attrnamespace, attrname,
5018 			       &auio, nd.nl_cred);
5019 
5020 	vput(vp);
5021 	nlookup_done(&nd);
5022 	return (error);
5023 }
5024 
5025 /*
5026  * Syscall to get a named extended attribute on a file or directory.
5027  */
5028 int
5029 sys_extattr_get_file(struct extattr_get_file_args *uap)
5030 {
5031 	char attrname[EXTATTR_MAXNAMELEN];
5032 	struct nlookupdata nd;
5033 	struct uio auio;
5034 	struct iovec aiov;
5035 	struct vnode *vp;
5036 	int error;
5037 
5038 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
5039 	if (error)
5040 		return (error);
5041 
5042 	vp = NULL;
5043 
5044 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5045 	if (error == 0)
5046 		error = nlookup(&nd);
5047 	if (error == 0)
5048 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_SHARED, &vp);
5049 	if (error) {
5050 		nlookup_done(&nd);
5051 		return (error);
5052 	}
5053 
5054 	bzero(&auio, sizeof(auio));
5055 	aiov.iov_base = uap->data;
5056 	aiov.iov_len = uap->nbytes;
5057 	auio.uio_iov = &aiov;
5058 	auio.uio_iovcnt = 1;
5059 	auio.uio_offset = 0;
5060 	auio.uio_resid = uap->nbytes;
5061 	auio.uio_rw = UIO_READ;
5062 	auio.uio_td = curthread;
5063 
5064 	error = VOP_GETEXTATTR(vp, uap->attrnamespace, attrname,
5065 				&auio, nd.nl_cred);
5066 	uap->sysmsg_result = uap->nbytes - auio.uio_resid;
5067 
5068 	vput(vp);
5069 	nlookup_done(&nd);
5070 	return(error);
5071 }
5072 
5073 /*
5074  * Syscall to delete a named extended attribute from a file or directory.
5075  * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
5076  */
5077 int
5078 sys_extattr_delete_file(struct extattr_delete_file_args *uap)
5079 {
5080 	char attrname[EXTATTR_MAXNAMELEN];
5081 	struct nlookupdata nd;
5082 	struct vnode *vp;
5083 	int error;
5084 
5085 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
5086 	if (error)
5087 		return(error);
5088 
5089 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5090 	if (error == 0)
5091 		error = nlookup(&nd);
5092 	if (error == 0)
5093 		error = ncp_writechk(&nd.nl_nch);
5094 	if (error == 0) {
5095 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
5096 		if (error == 0) {
5097 			error = VOP_SETEXTATTR(vp, uap->attrnamespace,
5098 					       attrname, NULL, nd.nl_cred);
5099 			vput(vp);
5100 		}
5101 	}
5102 	nlookup_done(&nd);
5103 	return(error);
5104 }
5105 
5106 /*
5107  * Determine if the mount is visible to the process.
5108  */
/*
 * Returns non-zero if the mount point of 'mp' is reachable from (i.e.
 * visible to) process 'p' given its root directory, else zero.
 */
static int
chroot_visible_mnt(struct mount *mp, struct proc *p)
{
	struct nchandle nch;

	/*
	 * Traverse from the mount point upwards.  If we hit the process
	 * root then the mount point is visible to the process.  Each
	 * iteration either hops across a mount boundary (when at the
	 * root ncp of the current mount) or steps to the parent
	 * directory entry within the same mount.
	 */
	nch = mp->mnt_ncmountpt;
	while (nch.ncp) {
		/* Reached the process's root directory: visible. */
		if (nch.mount == p->p_fd->fd_nrdir.mount &&
		    nch.ncp == p->p_fd->fd_nrdir.ncp) {
			return(1);
		}
		if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
			/* At this mount's root, continue on the mount-on ncp */
			nch = nch.mount->mnt_ncmounton;
		} else {
			nch.ncp = nch.ncp->nc_parent;
		}
	}

	/*
	 * If the mount point is not visible to the process, but the
	 * process root is in a subdirectory of the mount, return
	 * TRUE anyway.
	 */
	if (p->p_fd->fd_nrdir.mount == mp)
		return(1);

	return(0);
}
5141 
5142