xref: /dragonfly/sys/kern/vfs_syscalls.c (revision 60e242c5)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35  * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
36  */
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/buf.h>
41 #include <sys/conf.h>
42 #include <sys/sysent.h>
43 #include <sys/malloc.h>
44 #include <sys/mount.h>
45 #include <sys/mountctl.h>
46 #include <sys/sysmsg.h>
47 #include <sys/filedesc.h>
48 #include <sys/kernel.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/linker.h>
52 #include <sys/stat.h>
53 #include <sys/unistd.h>
54 #include <sys/vnode.h>
55 #include <sys/proc.h>
56 #include <sys/caps.h>
57 #include <sys/jail.h>
58 #include <sys/namei.h>
59 #include <sys/nlookup.h>
60 #include <sys/dirent.h>
61 #include <sys/extattr.h>
62 #include <sys/spinlock.h>
63 #include <sys/kern_syscall.h>
64 #include <sys/objcache.h>
65 #include <sys/sysctl.h>
66 
67 #include <sys/buf2.h>
68 #include <sys/file2.h>
69 #include <sys/spinlock2.h>
70 
71 #include <vm/vm.h>
72 #include <vm/vm_object.h>
73 #include <vm/vm_page.h>
74 
75 #include <machine/limits.h>
76 #include <machine/stdarg.h>
77 
78 static void mount_warning(struct mount *mp, const char *ctl, ...)
79 		__printflike(2, 3);
80 static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb);
81 static int checkvp_chdir (struct vnode *vn, struct thread *td);
82 static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch);
83 static int get_fscap(const char *);
84 static int chroot_refuse_vdir_fds (thread_t td, struct filedesc *fdp);
85 static int chroot_visible_mnt(struct mount *mp, struct proc *p);
86 static int getutimes (struct timeval *, struct timespec *);
87 static int getutimens (const struct timespec *, struct timespec *, int *);
88 static int setfown (struct mount *, struct vnode *, uid_t, gid_t);
89 static int setfmode (struct vnode *, int);
90 static int setfflags (struct vnode *, u_long);
91 static int setutimes (struct vnode *, struct vattr *,
92 			const struct timespec *, int);
93 
94 static int	usermount = 0;	/* if 1, non-root can mount fs. */
95 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
96     "Allow non-root users to mount filesystems");
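/*
 * (Illustrative: an administrator would typically enable the knob above
 *  at runtime with something like "sysctl vfs.usermount=1".)
 */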
97 
98 static int	debug_unmount = 0; /* if 1, loop until the unmount succeeds */
99 SYSCTL_INT(_vfs, OID_AUTO, debug_unmount, CTLFLAG_RW, &debug_unmount, 0,
100     "Stall failed unmounts in loop");
101 
102 static struct krate krate_rename = { 1 };
103 
104 /*
105  * Virtual File System System Calls
106  */
107 
108 /*
109  * Mount a file system.
110  *
111  * mount_args(char *type, char *path, int flags, caddr_t data)
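 *
 * Illustrative userland call shape (assumed example, not from this file):
 *	mount("tmpfs", "/mnt/scratch", MNT_RDONLY | MNT_NOSUID, NULL);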
112  *
113  * MPALMOSTSAFE
114  */
115 int
116 sys_mount(struct sysmsg *sysmsg, const struct mount_args *uap)
117 {
118 	struct thread *td = curthread;
119 	struct vnode *vp;
120 	struct nchandle nch;
121 	struct mount *mp, *nullmp;
122 	struct vfsconf *vfsp;
123 	int error, flag = 0, flag2 = 0;
124 	int hasmount;
125 	int priv = 0;
126 	int flags = uap->flags;
127 	struct vattr va;
128 	struct nlookupdata nd;
129 	char fstypename[MFSNAMELEN];
130 	struct ucred *cred;
131 
132 	cred = td->td_ucred;
133 
134 	/* We do not allow user mounts inside a jail for now */
135 	if (usermount && jailed(cred)) {
136 		error = EPERM;
137 		goto done;
138 	}
139 
140 	/*
141 	 * Extract the file system type.  We need to know this early to take
142 	 * the appropriate actions for jails and for the filesystem being mounted.
143 	 */
144 	if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0)
145 		goto done;
146 
147 	/*
148 	 * Select the correct cap according to the file system type.
149 	 */
150 	priv = get_fscap(fstypename);
151 
152 	if (usermount == 0 && (error = caps_priv_check_td(td, priv)))
153 		goto done;
154 
155 	/*
156 	 * Do not allow NFS export by non-root users.
157 	 */
158 	if (flags & MNT_EXPORTED) {
159 		error = caps_priv_check_td(td, priv);
160 		if (error)
161 			goto done;
162 	}
163 	/*
164 	 * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
165 	 */
166 	if (caps_priv_check_td(td, priv))
167 		flags |= MNT_NOSUID | MNT_NODEV;
168 
169 	/*
170 	 * Lookup the requested path and extract the nch and vnode.
171 	 */
172 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
173 	if (error == 0) {
174 		if ((error = nlookup(&nd)) == 0) {
175 			if (nd.nl_nch.ncp->nc_vp == NULL)
176 				error = ENOENT;
177 		}
178 	}
179 	if (error) {
180 		nlookup_done(&nd);
181 		goto done;
182 	}
183 
184 	/*
185 	 * If the target filesystem is resolved via a nullfs mount, then
186 	 * nd.nl_nch.mount will be pointing to the nullfs mount structure
187 	 * instead of the target file system. We need it in case we are
188 	 * doing an update.
189 	 */
190 	nullmp = nd.nl_nch.mount;
191 
192 	/*
193 	 * Extract the locked+refd ncp and cleanup the nd structure
194 	 */
195 	nch = nd.nl_nch;
196 	cache_zero(&nd.nl_nch);
197 	nlookup_done(&nd);
198 
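	/*
	 * Determine whether something is already mounted on this namecache
	 * entry.  If so, the mount attempt is rejected with EBUSY further
	 * below.
	 */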
199 	if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
200 	    (mp = cache_findmount(&nch)) != NULL) {
201 		cache_dropmount(mp);
202 		hasmount = 1;
203 	} else {
204 		hasmount = 0;
205 	}
206 
207 
208 	/*
209 	 * Now we have the locked, ref'd nch and an unreferenced vnode.
210 	 */
211 	vp = nch.ncp->nc_vp;
212 	if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
213 		cache_put(&nch);
214 		goto done;
215 	}
216 	cache_unlock(&nch);
217 
218 	/*
219 	 * Now we have an unlocked ref'd nch and a locked ref'd vp
220 	 */
221 	if (flags & MNT_UPDATE) {
222 		if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
223 			cache_drop(&nch);
224 			vput(vp);
225 			error = EINVAL;
226 			goto done;
227 		}
228 
229 		if (strncmp(fstypename, "null", 5) == 0) {
230 			KKASSERT(nullmp);
231 			mp = nullmp;
232 		} else {
233 			mp = vp->v_mount;
234 		}
235 
236 		flag = mp->mnt_flag;
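		/*
		 * Save the current mount flags so they can be restored if
		 * the MNT_UPDATE call to VFS_MOUNT() fails below.
		 */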
237 		flag2 = mp->mnt_kern_flag;
238 		/*
239 		 * We only allow the filesystem to be reloaded if it
240 		 * is currently mounted read-only.
241 		 */
242 		if ((flags & MNT_RELOAD) &&
243 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
244 			cache_drop(&nch);
245 			vput(vp);
246 			error = EOPNOTSUPP;	/* Needs translation */
247 			goto done;
248 		}
249 		/*
250 		 * Only root, or the user that did the original mount is
251 		 * permitted to update it.
252 		 */
253 		if (mp->mnt_stat.f_owner != cred->cr_uid &&
254 		    (error = caps_priv_check_td(td, priv))) {
255 			cache_drop(&nch);
256 			vput(vp);
257 			goto done;
258 		}
259 		if (vfs_busy(mp, LK_NOWAIT)) {
260 			cache_drop(&nch);
261 			vput(vp);
262 			error = EBUSY;
263 			goto done;
264 		}
265 		if (hasmount) {
266 			cache_drop(&nch);
267 			vfs_unbusy(mp);
268 			vput(vp);
269 			error = EBUSY;
270 			goto done;
271 		}
272 		mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
273 		lwkt_gettoken(&mp->mnt_token);
274 		vn_unlock(vp);
275 		vfsp = mp->mnt_vfc;
276 		goto update;
277 	}
278 
279 	/*
280 	 * If the user is not root, ensure that they own the directory
281 	 * onto which we are attempting to mount.
282 	 */
283 	if ((error = VOP_GETATTR(vp, &va)) ||
284 	    (va.va_uid != cred->cr_uid &&
285 	     (error = caps_priv_check_td(td, priv)))) {
286 		cache_drop(&nch);
287 		vput(vp);
288 		goto done;
289 	}
290 	if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
291 		cache_drop(&nch);
292 		vput(vp);
293 		goto done;
294 	}
295 	if (vp->v_type != VDIR) {
296 		cache_drop(&nch);
297 		vput(vp);
298 		error = ENOTDIR;
299 		goto done;
300 	}
301 	if (vp->v_mount->mnt_kern_flag & MNTK_NOSTKMNT) {
302 		cache_drop(&nch);
303 		vput(vp);
304 		error = EPERM;
305 		goto done;
306 	}
307 	vfsp = vfsconf_find_by_name(fstypename);
308 	if (vfsp == NULL) {
309 		linker_file_t lf;
310 
311 		/* Only load modules for root (very important!) */
312 		error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
313 		if (error) {
314 			cache_drop(&nch);
315 			vput(vp);
316 			goto done;
317 		}
318 		error = linker_load_file(fstypename, &lf);
319 		if (error || lf == NULL) {
320 			cache_drop(&nch);
321 			vput(vp);
322 			if (lf == NULL)
323 				error = ENODEV;
324 			goto done;
325 		}
326 		lf->userrefs++;
327 		/* lookup again, see if the VFS was loaded */
328 		vfsp = vfsconf_find_by_name(fstypename);
329 		if (vfsp == NULL) {
330 			lf->userrefs--;
331 			linker_file_unload(lf);
332 			cache_drop(&nch);
333 			vput(vp);
334 			error = ENODEV;
335 			goto done;
336 		}
337 	}
338 	if (hasmount) {
339 		cache_drop(&nch);
340 		vput(vp);
341 		error = EBUSY;
342 		goto done;
343 	}
344 
345 	/*
346 	 * Allocate and initialize the filesystem.
347 	 */
348 	mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
349 	mount_init(mp, vfsp->vfc_vfsops);
350 	vfs_busy(mp, LK_NOWAIT);
351 	mp->mnt_vfc = vfsp;
352 	mp->mnt_pbuf_count = nswbuf_kva / NSWBUF_SPLIT;
353 	vfsp->vfc_refcount++;
354 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
355 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
356 	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
357 	mp->mnt_stat.f_owner = cred->cr_uid;
358 	lwkt_gettoken(&mp->mnt_token);
359 	vn_unlock(vp);
360 update:
361 	/*
362 	 * (per-mount token acquired at this point)
363 	 *
364 	 * Set the mount level flags.
365 	 */
366 	if (flags & MNT_RDONLY)
367 		mp->mnt_flag |= MNT_RDONLY;
368 	else if (mp->mnt_flag & MNT_RDONLY)
369 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
370 	mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
371 	    MNT_SYNCHRONOUS | MNT_ASYNC | MNT_NOATIME |
372 	    MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_TRIM |
373 	    MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR |
374 	    MNT_AUTOMOUNTED);
375 	mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC |
376 	    MNT_NODEV | MNT_SYNCHRONOUS | MNT_ASYNC | MNT_FORCE |
377 	    MNT_NOSYMFOLLOW | MNT_IGNORE | MNT_TRIM |
378 	    MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR |
379 	    MNT_AUTOMOUNTED);
380 
381 	/*
382 	 * Pre-set the mount's ALL_MPSAFE flags if specified in the vfsconf.
383 	 * This way the initial VFS_MOUNT() call will also be MPSAFE.
384 	 */
385 	if (vfsp->vfc_flags & VFCF_MPSAFE)
386 		mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;
387 
388 	/*
389 	 * Mount the filesystem.
390 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
391 	 * get.
392 	 */
393 	if (mp->mnt_flag & MNT_UPDATE) {
394 		error = VFS_MOUNT(mp, uap->path, uap->data, cred);
395 		if (mp->mnt_kern_flag & MNTK_WANTRDWR)
396 			mp->mnt_flag &= ~MNT_RDONLY;
397 		mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
398 		mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
399 		if (error) {
400 			mp->mnt_flag = flag;
401 			mp->mnt_kern_flag = flag2;
402 		}
403 		lwkt_reltoken(&mp->mnt_token);
404 		vfs_unbusy(mp);
405 		vrele(vp);
406 		cache_drop(&nch);
407 		goto done;
408 	}
409 	mp->mnt_ncmounton = nch;
410 	error = VFS_MOUNT(mp, uap->path, uap->data, cred);
411 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
412 
413 	/*
414 	 * Put the new filesystem on the mount list after root.  The mount
415 	 * point gets its own mnt_ncmountpt (unless the VFS already set one
416 	 * up) which represents the root of the mount.  The lookup code
417 	 * detects the mount point going forward and checks the root of
418 	 * the mount going backwards.
419 	 *
420 	 * It is not necessary to invalidate or purge the vnode underneath
421 	 * because elements under the mount will be given their own glue
422 	 * namecache record.
423 	 */
424 	if (!error) {
425 		if (mp->mnt_ncmountpt.ncp == NULL) {
426 			/*
427 			 * Allocate, then unlock, but leave the ref intact.
428 			 * This is the mnt_refs (1) that we will retain
429 			 * through to the unmount.
430 			 */
431 			cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
432 			cache_unlock(&mp->mnt_ncmountpt);
433 		}
434 		vn_unlock(vp);
435 		cache_lock(&nch);
436 		nch.ncp->nc_flag |= NCF_ISMOUNTPT;
437 		cache_unlock(&nch);
438 		cache_ismounting(mp);
439 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
440 
441 		mountlist_insert(mp, MNTINS_LAST);
442 		vn_unlock(vp);
443 		checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
444 		error = vfs_allocate_syncvnode(mp);
445 		lwkt_reltoken(&mp->mnt_token);
446 		vfs_unbusy(mp);
447 		error = VFS_START(mp, 0);
448 		vrele(vp);
449 		KNOTE(&fs_klist, VQ_MOUNT);
450 	} else {
451 		bzero(&mp->mnt_ncmounton, sizeof(mp->mnt_ncmounton));
452 		vn_syncer_thr_stop(mp);
453 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
454 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
455 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
456 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
457 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
458 		if (mp->mnt_cred) {
459 			crfree(mp->mnt_cred);
460 			mp->mnt_cred = NULL;
461 		}
462 		mp->mnt_vfc->vfc_refcount--;
463 		lwkt_reltoken(&mp->mnt_token);
464 		vfs_unbusy(mp);
465 		kfree(mp, M_MOUNT);
466 		cache_drop(&nch);
467 		vput(vp);
468 	}
469 done:
470 	return (error);
471 }
472 
473 /*
474  * Scan all active processes to see if any of them have a current
475  * or root directory onto which the new filesystem has just been
476  * mounted. If so, replace them with the new mount point.
477  *
478  * Both old_nch and new_nch are ref'd on call but not locked.
479  * new_nch must be temporarily locked so it can be associated with the
480  * vnode representing the root of the mount point.
481  */
482 struct checkdirs_info {
483 	struct nchandle old_nch;
484 	struct nchandle new_nch;
485 	struct vnode *old_vp;
486 	struct vnode *new_vp;
487 };
488 
489 static int checkdirs_callback(struct proc *p, void *data);
490 
491 static void
492 checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
493 {
494 	struct checkdirs_info info;
495 	struct vnode *olddp;
496 	struct vnode *newdp;
497 	struct mount *mp;
498 
499 	/*
500 	 * If the old mount point's vnode has a usecount of 1, it is not
501 	 * being held as a descriptor anywhere.
502 	 */
503 	olddp = old_nch->ncp->nc_vp;
504 	if (olddp == NULL || VREFCNT(olddp) == 1)
505 		return;
506 
507 	/*
508 	 * Force the root vnode of the new mount point to be resolved
509 	 * so we can update any matching processes.
510 	 */
511 	mp = new_nch->mount;
512 	if (VFS_ROOT(mp, &newdp))
513 		panic("mount: lost mount");
514 	vn_unlock(newdp);
515 	cache_lock(new_nch);
516 	vn_lock(newdp, LK_EXCLUSIVE | LK_RETRY);
517 	cache_setunresolved(new_nch);
518 	cache_setvp(new_nch, newdp);
519 	cache_unlock(new_nch);
520 
521 	/*
522 	 * Special handling of the root node
523 	 */
524 	if (rootvnode == olddp) {
525 		vref(newdp);
526 		vfs_cache_setroot(newdp, cache_hold(new_nch));
527 	}
528 
529 	/*
530 	 * Pass newdp separately so the callback does not have to access
531 	 * it via new_nch->ncp->nc_vp.
532 	 */
533 	info.old_nch = *old_nch;
534 	info.new_nch = *new_nch;
535 	info.new_vp = newdp;
536 	allproc_scan(checkdirs_callback, &info, 0);
537 	vput(newdp);
538 }
539 
540 /*
541  * NOTE: callback is not MP safe because the scanned process's filedesc
542  * structure can be ripped out from under us, among other things.
543  */
544 static int
545 checkdirs_callback(struct proc *p, void *data)
546 {
547 	struct checkdirs_info *info = data;
548 	struct filedesc *fdp;
549 	struct nchandle ncdrop1;
550 	struct nchandle ncdrop2;
551 	struct vnode *vprele1;
552 	struct vnode *vprele2;
553 
554 	if ((fdp = p->p_fd) != NULL) {
555 		cache_zero(&ncdrop1);
556 		cache_zero(&ncdrop2);
557 		vprele1 = NULL;
558 		vprele2 = NULL;
559 
560 		/*
561 		 * MPUNSAFE - XXX fdp can be pulled out from under a
562 		 * foreign process.
563 		 *
564 		 * A shared filedesc is ok, we don't have to copy it
565 		 * because we are making this change globally.
566 		 */
567 		spin_lock(&fdp->fd_spin);
568 		if (fdp->fd_ncdir.mount == info->old_nch.mount &&
569 		    fdp->fd_ncdir.ncp == info->old_nch.ncp) {
570 			vprele1 = fdp->fd_cdir;
571 			vref(info->new_vp);
572 			fdp->fd_cdir = info->new_vp;
573 			ncdrop1 = fdp->fd_ncdir;
574 			cache_copy(&info->new_nch, &fdp->fd_ncdir);
575 		}
576 		if (fdp->fd_nrdir.mount == info->old_nch.mount &&
577 		    fdp->fd_nrdir.ncp == info->old_nch.ncp) {
578 			vprele2 = fdp->fd_rdir;
579 			vref(info->new_vp);
580 			fdp->fd_rdir = info->new_vp;
581 			ncdrop2 = fdp->fd_nrdir;
582 			cache_copy(&info->new_nch, &fdp->fd_nrdir);
583 		}
584 		spin_unlock(&fdp->fd_spin);
585 		if (ncdrop1.ncp)
586 			cache_drop(&ncdrop1);
587 		if (ncdrop2.ncp)
588 			cache_drop(&ncdrop2);
589 		if (vprele1)
590 			vrele(vprele1);
591 		if (vprele2)
592 			vrele(vprele2);
593 	}
594 	return(0);
595 }
596 
597 /*
598  * Unmount a file system.
599  *
600  * Note: unmount takes the path of the vnode that is mounted on as its
601  * argument, not the special file (as it did historically).
602  *
603  * umount_args(char *path, int flags)
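 *
 * Illustrative userland call shape (assumed example, not from this file):
 *	unmount("/mnt/scratch", MNT_FORCE);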
604  *
605  * MPALMOSTSAFE
606  */
607 int
608 sys_unmount(struct sysmsg *sysmsg, const struct unmount_args *uap)
609 {
610 	struct thread *td = curthread;
611 	struct proc *p __debugvar = td->td_proc;
612 	struct mount *mp = NULL;
613 	struct nlookupdata nd;
614 	char fstypename[MFSNAMELEN];
615 	int priv = 0;
616 	int error;
617 	struct ucred *cred;
618 
619 	cred = td->td_ucred;
620 
621 	KKASSERT(p);
622 
623 	/* We do not allow user umounts inside a jail for now */
624 	if (usermount && jailed(cred)) {
625 		error = EPERM;
626 		goto done;
627 	}
628 
629 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE,
630 			     NLC_FOLLOW | NLC_IGNBADDIR);
631 	if (error == 0)
632 		error = nlookup(&nd);
633 	if (error)
634 		goto out;
635 
636 	mp = nd.nl_nch.mount;
637 
638 	/* Figure out the fsname in order to select proper privs */
639 	ksnprintf(fstypename, MFSNAMELEN, "%s", mp->mnt_vfc->vfc_name);
640 	priv = get_fscap(fstypename);
641 
642 	if (usermount == 0 && (error = caps_priv_check_td(td, priv))) {
643 		nlookup_done(&nd);
644 		goto done;
645 	}
646 
647 	/*
648 	 * Only root, or the user that did the original mount is
649 	 * permitted to unmount this filesystem.
650 	 */
651 	if ((mp->mnt_stat.f_owner != td->td_ucred->cr_uid) &&
652 	    (error = caps_priv_check_td(td, priv)))
653 	{
654 		goto out;
655 	}
656 
657 	/*
658 	 * Don't allow unmounting the root file system.
659 	 */
660 	if (mp->mnt_flag & MNT_ROOTFS) {
661 		error = EINVAL;
662 		goto out;
663 	}
664 
665 	/*
666 	 * Must be the root of the filesystem
667 	 */
668 	if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
669 		error = EINVAL;
670 		goto out;
671 	}
672 
673 	/* Check if this mount belongs to this prison */
674 	if (jailed(cred) && mp->mnt_cred && (!mp->mnt_cred->cr_prison ||
675 		mp->mnt_cred->cr_prison != cred->cr_prison)) {
676 		kprintf("mountpoint %s does not belong to this jail\n",
677 		    uap->path);
678 		error = EPERM;
679 		goto out;
680 	}
681 
682 	/*
683 	 * If there was no error, try to issue the unmount.  We lose our cache
684 	 * ref when we call nlookup_done so we must hold the mount point
685 	 * to prevent use-after-free races.
686 	 */
687 out:
688 	if (error == 0) {
689 		mount_hold(mp);
690 		nlookup_done(&nd);
691 		error = dounmount(mp, uap->flags, 0);
692 		mount_drop(mp);
693 	} else {
694 		nlookup_done(&nd);
695 	}
696 done:
697 	return (error);
698 }
699 
700 /*
701  * Interlock helper for dounmount(): atomically flag the mount as being
702  * unmounted, or return EBUSY if an unmount is already in progress.
703  */
704 static int
705 dounmount_interlock(struct mount *mp)
706 {
707 	if (mp->mnt_kern_flag & MNTK_UNMOUNT)
708 		return (EBUSY);
709 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
710 	return(0);
711 }
712 
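/*
 * allproc_scan() callback used by forced unmounts: release any process
 * p_textnch namecache handle that still references the mount so its
 * reference count can drain.
 */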
713 static int
714 unmount_allproc_cb(struct proc *p, void *arg)
715 {
716 	struct mount *mp;
717 
718 	if (p->p_textnch.ncp == NULL)
719 		return 0;
720 
721 	mp = (struct mount *)arg;
722 	if (p->p_textnch.mount == mp)
723 		cache_drop(&p->p_textnch);
724 
725 	return 0;
726 }
727 
728 /*
729  * The guts of the unmount code.  The mount owns one ref and one hold
730  * count.  If we successfully interlock the unmount, those refs are ours.
731  * (The ref is from mnt_ncmountpt).
732  *
733  * When halting we shortcut certain mount types such as devfs by not actually
734  * issuing the VFS_SYNC() or VFS_UNMOUNT().  They are still disconnected
735  * from the mountlist so higher-level filesystems can unmount cleanly.
736  *
737  * The mount types that allow QUICKHALT are: devfs, tmpfs, procfs.
738  */
739 int
740 dounmount(struct mount *mp, int flags, int halting)
741 {
742 	struct namecache *ncp;
743 	struct nchandle nch;
744 	struct vnode *vp;
745 	int error;
746 	int async_flag;
747 	int lflags;
748 	int freeok = 1;
749 	int hadsyncer = 0;
750 	int retry;
751 	int quickhalt;
752 
753 	lwkt_gettoken(&mp->mnt_token);
754 
755 	/*
756 	 * When halting, certain mount points can essentially just
757 	 * be unhooked and otherwise ignored.
758 	 */
759 	if (halting && (mp->mnt_kern_flag & MNTK_QUICKHALT)) {
760 		quickhalt = 1;
761 		freeok = 0;
762 	} else {
763 		quickhalt = 0;
764 	}
765 
766 
767 	/*
768 	 * Exclusive access for unmounting purposes.
769 	 */
770 	if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
771 		goto out;
772 
773 	/*
774 	 * We now 'own' the last mp->mnt_refs
775 	 *
776 	 * Allow filesystems to detect that a forced unmount is in progress.
777 	 */
778 	if (flags & MNT_FORCE)
779 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
780 	lflags = LK_EXCLUSIVE | ((flags & MNT_FORCE) ? 0 : LK_TIMELOCK);
781 	error = lockmgr(&mp->mnt_lock, lflags);
782 	if (error) {
783 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
784 		if (mp->mnt_kern_flag & MNTK_MWAIT) {
785 			mp->mnt_kern_flag &= ~MNTK_MWAIT;
786 			wakeup(mp);
787 		}
788 		goto out;
789 	}
790 
791 	if (mp->mnt_flag & MNT_EXPUBLIC)
792 		vfs_setpublicfs(NULL, NULL, NULL);
793 
794 	vfs_msync(mp, MNT_WAIT);
795 	async_flag = mp->mnt_flag & MNT_ASYNC;
796 	mp->mnt_flag &=~ MNT_ASYNC;
797 
798 	/*
799 	 * Decommission our special mnt_syncer vnode.  This also stops
800 	 * the vnlru code.  If we are unable to unmount we recommission
801 	 * the vnode.
802 	 *
803 	 * Then sync the filesystem.
804 	 */
805 	if ((vp = mp->mnt_syncer) != NULL) {
806 		mp->mnt_syncer = NULL;
807 		atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
808 		vrele(vp);
809 		hadsyncer = 1;
810 	}
811 
812 	/*
813 	 * Sync normally-mounted filesystem.
814 	 */
815 	if (quickhalt == 0) {
816 		if ((mp->mnt_flag & MNT_RDONLY) == 0)
817 			VFS_SYNC(mp, MNT_WAIT);
818 	}
819 
820 	/*
821 	 * nchandle records ref the mount structure.  Expect a count of 1
822 	 * (our mount->mnt_ncmountpt).
823 	 *
824 	 * Scans can get temporary refs on a mountpoint (though really
825 	 * heavy-duty operations like cache_findmount() do not).
826 	 */
827 	for (retry = 0; (retry < 10 || debug_unmount); ++retry) {
828 		/*
829 		 * Invalidate the namecache topology under the mount.
830 		 * nullfs mounts alias a real mount's namecache topology
831 		 * and it should not be invalidated in that case.
832 		 */
833 		if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
834 			cache_lock(&mp->mnt_ncmountpt);
835 			cache_inval(&mp->mnt_ncmountpt,
836 				    CINV_DESTROY|CINV_CHILDREN);
837 			cache_unlock(&mp->mnt_ncmountpt);
838 		}
839 
840 		/*
841 		 * Clear pcpu caches
842 		 */
843 		cache_unmounting(mp);
844 		if (mp->mnt_refs != 1)
845 			cache_clearmntcache(mp);
846 
847 		/*
848 		 * Break out if we are good.  Don't count ncp refs if the
849 		 * mount is aliased.
850 		 */
851 		ncp = (mp->mnt_kern_flag & MNTK_NCALIASED) ?
852 		      NULL : mp->mnt_ncmountpt.ncp;
853 		if (mp->mnt_refs == 1 &&
854 		    (ncp == NULL || (ncp->nc_refs == 1 &&
855 				     TAILQ_FIRST(&ncp->nc_list) == NULL))) {
856 			break;
857 		}
858 
859 		/*
860 		 * If forcing the unmount, clean out any p->p_textnch
861 		 * nchandles that match this mount.
862 		 */
863 		if (flags & MNT_FORCE)
864 			allproc_scan(&unmount_allproc_cb, mp, 0);
865 
866 		/*
867 		 * Sleep and retry.
868 		 */
869 		tsleep(&mp->mnt_refs, 0, "mntbsy", hz / 10 + 1);
870 		if ((retry & 15) == 15) {
871 			mount_warning(mp,
872 				      "(%p) debug - retry %d, "
873 				      "%d namecache refs, %d mount refs",
874 				      mp, retry,
875 				      (ncp ? ncp->nc_refs - 1 : 0),
876 				      mp->mnt_refs - 1);
877 		}
878 	}
879 
880 	error = 0;
881 	ncp = (mp->mnt_kern_flag & MNTK_NCALIASED) ?
882 	      NULL : mp->mnt_ncmountpt.ncp;
883 	if (mp->mnt_refs != 1 ||
884 	    (ncp != NULL && (ncp->nc_refs != 1 ||
885 			     TAILQ_FIRST(&ncp->nc_list)))) {
886 		mount_warning(mp,
887 			      "(%p): %d namecache refs, %d mount refs "
888 			      "still present",
889 			      mp,
890 			      (ncp ? ncp->nc_refs - 1 : 0),
891 			      mp->mnt_refs - 1);
892 		if (flags & MNT_FORCE) {
893 			freeok = 0;
894 			mount_warning(mp, "forcing unmount\n");
895 		} else {
896 			error = EBUSY;
897 		}
898 	}
899 
900 	/*
901 	 * So far so good, sync the filesystem once more and
902 	 * call the VFS unmount code if the sync succeeds.
903 	 */
904 	if (error == 0 && quickhalt == 0) {
905 		if (mp->mnt_flag & MNT_RDONLY) {
906 			error = VFS_UNMOUNT(mp, flags);
907 		} else {
908 			error = VFS_SYNC(mp, MNT_WAIT);
909 			if (error == 0 ||		/* no error */
910 			    error == EOPNOTSUPP ||	/* no sync avail */
911 			    (flags & MNT_FORCE)) {	/* force anyway */
912 				error = VFS_UNMOUNT(mp, flags);
913 			}
914 		}
915 		if (error) {
916 			mount_warning(mp,
917 				      "(%p) unmount: vfs refused to unmount, "
918 				      "error %d",
919 				      mp, error);
920 		}
921 	}
922 
923 	/*
924 	 * If an error occurred we can still recover, restoring the
925 	 * syncer vnode and misc flags.
926 	 */
927 	if (error) {
928 		if (mp->mnt_syncer == NULL && hadsyncer)
929 			vfs_allocate_syncvnode(mp);
930 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
931 		mp->mnt_flag |= async_flag;
932 		lockmgr(&mp->mnt_lock, LK_RELEASE);
933 		if (mp->mnt_kern_flag & MNTK_MWAIT) {
934 			mp->mnt_kern_flag &= ~MNTK_MWAIT;
935 			wakeup(mp);
936 		}
937 		goto out;
938 	}
939 	/*
940 	 * Clean up any journals still associated with the mount after
941 	 * filesystem activity has ceased.
942 	 */
943 	journal_remove_all_journals(mp,
944 	    ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));
945 
946 	mountlist_remove(mp);
947 
948 	/*
949 	 * Remove any installed vnode ops here so the individual VFSs don't
950 	 * have to.
951 	 *
952 	 * mnt_refs should go to zero when we scrap mnt_ncmountpt.
953 	 *
954 	 * When quickhalting we have to keep these intact because the
955 	 * underlying vnodes have not been destroyed, and some might be
956 	 * dirty.
957 	 */
958 	if (quickhalt == 0) {
959 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
960 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
961 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
962 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
963 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
964 	}
965 
966 	if (mp->mnt_ncmountpt.ncp != NULL) {
967 		nch = mp->mnt_ncmountpt;
968 		cache_zero(&mp->mnt_ncmountpt);
969 		cache_clrmountpt(&nch);
970 		cache_drop(&nch);
971 	}
972 	if (mp->mnt_ncmounton.ncp != NULL) {
973 		cache_unmounting(mp);
974 		nch = mp->mnt_ncmounton;
975 		cache_zero(&mp->mnt_ncmounton);
976 		cache_clrmountpt(&nch);
977 		cache_drop(&nch);
978 	}
979 
980 	if (mp->mnt_cred) {
981 		crfree(mp->mnt_cred);
982 		mp->mnt_cred = NULL;
983 	}
984 
985 	mp->mnt_vfc->vfc_refcount--;
986 
987 	/*
988 	 * If not quickhalting the mount, we expect there to be no
989 	 * vnodes left.
990 	 */
991 	if (quickhalt == 0 && !TAILQ_EMPTY(&mp->mnt_nvnodelist))
992 		panic("unmount: dangling vnode");
993 
994 	/*
995 	 * Release the lock
996 	 */
997 	lockmgr(&mp->mnt_lock, LK_RELEASE);
998 	if (mp->mnt_kern_flag & MNTK_MWAIT) {
999 		mp->mnt_kern_flag &= ~MNTK_MWAIT;
1000 		wakeup(mp);
1001 	}
1002 
1003 	/*
1004 	 * If we reach here and freeok != 0 we must free the mount.
1005 	 * mnt_refs should already have dropped to 0, so if it is not
1006 	 * zero we must cycle the caches and wait.
1007 	 *
1008 	 * When we are satisfied that the mount has disconnected we can
1009 	 * drop the hold on the mp that represented the mount (though the
1010 	 * caller might actually have another, so the caller's drop may
1011 	 * do the actual free).
1012 	 */
1013 	if (freeok) {
1014 		if (mp->mnt_refs > 0)
1015 			cache_clearmntcache(mp);
1016 		while (mp->mnt_refs > 0) {
1017 			cache_unmounting(mp);
1018 			wakeup(mp);
1019 			tsleep(&mp->mnt_refs, 0, "umntrwait", hz / 10 + 1);
1020 			cache_clearmntcache(mp);
1021 		}
1022 		lwkt_reltoken(&mp->mnt_token);
1023 		mount_drop(mp);
1024 		mp = NULL;
1025 	} else {
1026 		cache_clearmntcache(mp);
1027 	}
1028 	error = 0;
1029 	KNOTE(&fs_klist, VQ_UNMOUNT);
1030 out:
1031 	if (mp)
1032 		lwkt_reltoken(&mp->mnt_token);
1033 	return (error);
1034 }
1035 
1036 static
1037 void
1038 mount_warning(struct mount *mp, const char *ctl, ...)
1039 {
1040 	char *ptr;
1041 	char *buf;
1042 	__va_list va;
1043 
1044 	__va_start(va, ctl);
1045 	if (cache_fullpath(NULL, &mp->mnt_ncmounton, NULL,
1046 			   &ptr, &buf, 0) == 0) {
1047 		kprintf("unmount(%s): ", ptr);
1048 		kvprintf(ctl, va);
1049 		kprintf("\n");
1050 		kfree(buf, M_TEMP);
1051 	} else {
1052 		kprintf("unmount(%p", mp);
1053 		if (mp->mnt_ncmounton.ncp && mp->mnt_ncmounton.ncp->nc_name)
1054 			kprintf(",%s", mp->mnt_ncmounton.ncp->nc_name);
1055 		kprintf("): ");
1056 		kvprintf(ctl, va);
1057 		kprintf("\n");
1058 	}
1059 	__va_end(va);
1060 }
1061 
1062 /*
1063  * Shim cache_fullpath() to handle the case where a process is chrooted into
1064  * a subdirectory of a mount.  In this case, if the given mount matches the
1065  * mount of the process's root directory, we have to use the process's root
1066  * directory instead of the mount point, because the mount point might
1067  * be above the root directory.
1068  */
1069 static
1070 int
1071 mount_path(struct proc *p, struct mount *mp, char **rb, char **fb)
1072 {
1073 	struct nchandle *nch;
1074 
1075 	if (p && p->p_fd->fd_nrdir.mount == mp)
1076 		nch = &p->p_fd->fd_nrdir;
1077 	else
1078 		nch = &mp->mnt_ncmountpt;
1079 	return(cache_fullpath(p, nch, NULL, rb, fb, 0));
1080 }
1081 
1082 /*
1083  * Sync each mounted filesystem.
1084  */
1085 
1086 #ifdef DEBUG
1087 static int syncprt = 0;
1088 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
1089 #endif /* DEBUG */
1090 
1091 static int sync_callback(struct mount *mp, void *data);
1092 
1093 int
1094 sys_sync(struct sysmsg *sysmsg, const struct sync_args *uap)
1095 {
1096 	mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
1097 	return (0);
1098 }
1099 
1100 static
1101 int
1102 sync_callback(struct mount *mp, void *data __unused)
1103 {
1104 	int asyncflag;
1105 
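	/*
	 * Temporarily clear MNT_ASYNC so the writes pushed by this sync are
	 * not deferred by async mode, then restore the flag afterwards.
	 * Read-only mounts have nothing to flush and are skipped.
	 */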
1106 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1107 		lwkt_gettoken(&mp->mnt_token);
1108 		asyncflag = mp->mnt_flag & MNT_ASYNC;
1109 		mp->mnt_flag &= ~MNT_ASYNC;
1110 		lwkt_reltoken(&mp->mnt_token);
1111 		vfs_msync(mp, MNT_NOWAIT);
1112 		VFS_SYNC(mp, MNT_NOWAIT);
1113 		lwkt_gettoken(&mp->mnt_token);
1114 		mp->mnt_flag |= asyncflag;
1115 		lwkt_reltoken(&mp->mnt_token);
1116 	}
1117 	return(0);
1118 }
1119 
1120 /* XXX PRISON: could be per prison flag */
1121 static int prison_quotas;
1122 #if 0
1123 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
1124 #endif
1125 
1126 /*
1127  *  quotactl_args(char *path, int fcmd, int uid, caddr_t arg)
1128  *
1129  * Change filesystem quotas.
1130  *
1131  * MPALMOSTSAFE
1132  */
1133 int
1134 sys_quotactl(struct sysmsg *sysmsg, const struct quotactl_args *uap)
1135 {
1136 	struct nlookupdata nd;
1137 	struct thread *td;
1138 	struct mount *mp;
1139 	int error;
1140 
1141 	td = curthread;
1142 	if (td->td_ucred->cr_prison && !prison_quotas) {
1143 		error = EPERM;
1144 		goto done;
1145 	}
1146 
1147 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1148 	if (error == 0)
1149 		error = nlookup(&nd);
1150 	if (error == 0) {
1151 		mp = nd.nl_nch.mount;
1152 		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
1153 				    uap->arg, nd.nl_cred);
1154 	}
1155 	nlookup_done(&nd);
1156 done:
1157 	return (error);
1158 }
1159 
1160 /*
1161  * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
1162  *		void *buf, int buflen)
1163  *
1164  * This function operates on a mount point and executes the specified
1165  * operation using the specified control data, and possibly returns data.
1166  *
1167  * On success, the actual number of bytes stored in the result buffer is
1168  * returned (0 if none); otherwise an error is returned.
1169  *
1170  * MPALMOSTSAFE
1171  */
1172 int
1173 sys_mountctl(struct sysmsg *sysmsg, const struct mountctl_args *uap)
1174 {
1175 	struct thread *td = curthread;
1176 	struct file *fp;
1177 	void *ctl = NULL;
1178 	void *buf = NULL;
1179 	char *path = NULL;
1180 	int error;
1181 
1182 	/*
1183 	 * Sanity and permissions checks.  We must be root.
1184 	 */
1185 	if (td->td_ucred->cr_prison != NULL)
1186 		return (EPERM);
1187 	if ((uap->op != MOUNTCTL_MOUNTFLAGS) &&
1188 	    (error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT)) != 0)
1189 	{
1190 		return (error);
1191 	}
1192 
1193 	/*
1194 	 * Argument length checks
1195 	 */
1196 	if (uap->ctllen < 0 || uap->ctllen > 1024)
1197 		return (EINVAL);
1198 	if (uap->buflen < 0 || uap->buflen > 16 * 1024)
1199 		return (EINVAL);
1200 	if (uap->path == NULL)
1201 		return (EINVAL);
1202 
1203 	/*
1204 	 * Allocate the necessary buffers and copyin data
1205 	 */
1206 	path = objcache_get(namei_oc, M_WAITOK);
1207 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
1208 	if (error)
1209 		goto done;
1210 
1211 	if (uap->ctllen) {
1212 		ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
1213 		error = copyin(uap->ctl, ctl, uap->ctllen);
1214 		if (error)
1215 			goto done;
1216 	}
1217 	if (uap->buflen)
1218 		buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);
1219 
1220 	/*
1221 	 * Validate the descriptor
1222 	 */
1223 	if (uap->fd >= 0) {
1224 		fp = holdfp(td, uap->fd, -1);
1225 		if (fp == NULL) {
1226 			error = EBADF;
1227 			goto done;
1228 		}
1229 	} else {
1230 		fp = NULL;
1231 	}
1232 
1233 	/*
1234 	 * Execute the internal kernel function and clean up.
1235 	 */
1236 	error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen,
1237 			      buf, uap->buflen, &sysmsg->sysmsg_result);
1238 	if (fp)
1239 		dropfp(td, uap->fd, fp);
1240 	if (error == 0 && sysmsg->sysmsg_result > 0)
1241 		error = copyout(buf, uap->buf, sysmsg->sysmsg_result);
1242 done:
1243 	if (path)
1244 		objcache_put(namei_oc, path);
1245 	if (ctl)
1246 		kfree(ctl, M_TEMP);
1247 	if (buf)
1248 		kfree(buf, M_TEMP);
1249 	return (error);
1250 }
1251 
1252 /*
1253  * Execute a mount control operation by resolving the path to a mount point
1254  * and calling vop_mountctl().
1255  *
1256  * Use the mount point from the nch instead of the vnode so nullfs mounts
1257  * can properly spike the VOP.
1258  */
1259 int
1260 kern_mountctl(const char *path, int op, struct file *fp,
1261 		const void *ctl, int ctllen,
1262 		void *buf, int buflen, int *res)
1263 {
1264 	struct vnode *vp;
1265 	struct nlookupdata nd;
1266 	struct nchandle nch;
1267 	struct mount *mp;
1268 	int error;
1269 
1270 	*res = 0;
1271 	vp = NULL;
1272 	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
1273 	if (error)
1274 		return (error);
1275 	error = nlookup(&nd);
1276 	if (error) {
1277 		nlookup_done(&nd);
1278 		return (error);
1279 	}
1280 	error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
1281 	if (error) {
1282 		nlookup_done(&nd);
1283 		return (error);
1284 	}
1285 
1286 	/*
1287 	 * Yes, all this is needed to use the nch.mount below, because
1288 	 * we must maintain a ref on the mount to avoid ripouts (e.g.
1289 	 * due to heavy mount/unmount use by synth or poudriere).
1290 	 */
1291 	nch = nd.nl_nch;
1292 	cache_zero(&nd.nl_nch);
1293 	cache_unlock(&nch);
1294 	nlookup_done(&nd);
1295 	vn_unlock(vp);
1296 
1297 	mp = nch.mount;
1298 
1299 	/*
1300 	 * Must be the root of the filesystem
1301 	 */
1302 	if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
1303 		cache_drop(&nch);
1304 		vrele(vp);
1305 		return (EINVAL);
1306 	}
1307 	if (mp == NULL || mp->mnt_kern_flag & MNTK_UNMOUNT) {
1308 		kprintf("kern_mountctl: Warning, \"%s\" racing unmount\n",
1309 			path);
1310 		cache_drop(&nch);
1311 		vrele(vp);
1312 		return (EINVAL);
1313 	}
1314 	error = vop_mountctl(mp->mnt_vn_use_ops, vp, op, fp, ctl, ctllen,
1315 			     buf, buflen, res);
1316 	vrele(vp);
1317 	cache_drop(&nch);
1318 
1319 	return (error);
1320 }
1321 
1322 int
1323 kern_statfs(struct nlookupdata *nd, struct statfs *buf)
1324 {
1325 	struct thread *td = curthread;
1326 	struct proc *p = td->td_proc;
1327 	struct mount *mp;
1328 	struct statfs *sp;
1329 	char *fullpath, *freepath;
1330 	int error;
1331 
1332 	if ((error = nlookup(nd)) != 0)
1333 		return (error);
1334 	mp = nd->nl_nch.mount;
1335 	sp = &mp->mnt_stat;
1336 
1337 	/*
1338 	 * Ignore refresh error, user should have visibility.
1339 	 * This can happen if an NFS mount goes bad (e.g. server
1340 	 * revokes perms or goes down).
1341 	 */
1342 	error = VFS_STATFS(mp, sp, nd->nl_cred);
1343 	/* ignore error */
1344 
1345 	error = mount_path(p, mp, &fullpath, &freepath);
1346 	if (error)
1347 		return(error);
1348 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1349 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1350 	kfree(freepath, M_TEMP);
1351 
1352 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1353 	bcopy(sp, buf, sizeof(*buf));
1354 	/* Only root should have access to the fsid's. */
1355 	if (caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT))
1356 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1357 	return (0);
1358 }
1359 
1360 /*
1361  * statfs_args(char *path, struct statfs *buf)
1362  *
1363  * Get filesystem statistics.
1364  */
1365 int
1366 sys_statfs(struct sysmsg *sysmsg, const struct statfs_args *uap)
1367 {
1368 	struct nlookupdata nd;
1369 	struct statfs buf;
1370 	int error;
1371 
1372 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1373 	if (error == 0)
1374 		error = kern_statfs(&nd, &buf);
1375 	nlookup_done(&nd);
1376 	if (error == 0)
1377 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1378 	return (error);
1379 }
1380 
1381 int
1382 kern_fstatfs(int fd, struct statfs *buf)
1383 {
1384 	struct thread *td = curthread;
1385 	struct proc *p = td->td_proc;
1386 	struct file *fp;
1387 	struct mount *mp;
1388 	struct statfs *sp;
1389 	char *fullpath, *freepath;
1390 	int error;
1391 
1392 	KKASSERT(p);
1393 	if ((error = holdvnode(td, fd, &fp)) != 0)
1394 		return (error);
1395 
1396 	/*
1397 	 * Try to use mount info from any overlays rather than the
1398 	 * mount info for the underlying vnode, otherwise we will
1399 	 * fail when operating on null-mounted paths inside a chroot.
1400 	 */
1401 	if ((mp = fp->f_nchandle.mount) == NULL)
1402 		mp = ((struct vnode *)fp->f_data)->v_mount;
1403 	if (mp == NULL) {
1404 		error = EBADF;
1405 		goto done;
1406 	}
1407 	if (fp->f_cred == NULL) {
1408 		error = EINVAL;
1409 		goto done;
1410 	}
1411 
1412 	/*
1413 	 * Ignore refresh error, user should have visibility.
1414 	 * This can happen if an NFS mount goes bad (e.g. server
1415 	 * revokes perms or goes down).
1416 	 */
1417 	sp = &mp->mnt_stat;
1418 	error = VFS_STATFS(mp, sp, fp->f_cred);
1419 
1420 	if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
1421 		goto done;
1422 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1423 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1424 	kfree(freepath, M_TEMP);
1425 
1426 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1427 	bcopy(sp, buf, sizeof(*buf));
1428 
1429 	/* Only root should have access to the fsid's. */
1430 	if (caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT))
1431 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1432 	error = 0;
1433 done:
1434 	fdrop(fp);
1435 	return (error);
1436 }
1437 
1438 /*
1439  * fstatfs_args(int fd, struct statfs *buf)
1440  *
1441  * Get filesystem statistics.
1442  */
1443 int
1444 sys_fstatfs(struct sysmsg *sysmsg, const struct fstatfs_args *uap)
1445 {
1446 	struct statfs buf;
1447 	int error;
1448 
1449 	error = kern_fstatfs(uap->fd, &buf);
1450 
1451 	if (error == 0)
1452 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1453 	return (error);
1454 }
1455 
1456 int
1457 kern_statvfs(struct nlookupdata *nd, struct statvfs *buf)
1458 {
1459 	struct mount *mp;
1460 	struct statvfs *sp;
1461 	int error;
1462 
1463 	if ((error = nlookup(nd)) != 0)
1464 		return (error);
1465 	mp = nd->nl_nch.mount;
1466 	sp = &mp->mnt_vstat;
1467 	if ((error = VFS_STATVFS(mp, sp, nd->nl_cred)) != 0)
1468 		return (error);
1469 
1470 	sp->f_flag = 0;
1471 	if (mp->mnt_flag & MNT_RDONLY)
1472 		sp->f_flag |= ST_RDONLY;
1473 	if (mp->mnt_flag & MNT_NOSUID)
1474 		sp->f_flag |= ST_NOSUID;
1475 	bcopy(sp, buf, sizeof(*buf));
1476 	return (0);
1477 }
1478 
1479 /*
1480  * statvfs_args(char *path, struct statvfs *buf)
1481  *
1482  * Get filesystem statistics.
1483  */
1484 int
1485 sys_statvfs(struct sysmsg *sysmsg, const struct statvfs_args *uap)
1486 {
1487 	struct nlookupdata nd;
1488 	struct statvfs buf;
1489 	int error;
1490 
1491 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1492 	if (error == 0)
1493 		error = kern_statvfs(&nd, &buf);
1494 	nlookup_done(&nd);
1495 	if (error == 0)
1496 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1497 	return (error);
1498 }
1499 
1500 int
1501 kern_fstatvfs(int fd, struct statvfs *buf)
1502 {
1503 	struct thread *td = curthread;
1504 	struct file *fp;
1505 	struct mount *mp;
1506 	struct statvfs *sp;
1507 	int error;
1508 
1509 	if ((error = holdvnode(td, fd, &fp)) != 0)
1510 		return (error);
1511 	if ((mp = fp->f_nchandle.mount) == NULL)
1512 		mp = ((struct vnode *)fp->f_data)->v_mount;
1513 	if (mp == NULL) {
1514 		error = EBADF;
1515 		goto done;
1516 	}
1517 	if (fp->f_cred == NULL) {
1518 		error = EINVAL;
1519 		goto done;
1520 	}
1521 	sp = &mp->mnt_vstat;
1522 	if ((error = VFS_STATVFS(mp, sp, fp->f_cred)) != 0)
1523 		goto done;
1524 
1525 	sp->f_flag = 0;
1526 	if (mp->mnt_flag & MNT_RDONLY)
1527 		sp->f_flag |= ST_RDONLY;
1528 	if (mp->mnt_flag & MNT_NOSUID)
1529 		sp->f_flag |= ST_NOSUID;
1530 
1531 	bcopy(sp, buf, sizeof(*buf));
1532 	error = 0;
1533 done:
1534 	fdrop(fp);
1535 	return (error);
1536 }
1537 
1538 /*
1539  * fstatvfs_args(int fd, struct statvfs *buf)
1540  *
1541  * Get filesystem statistics.
1542  */
1543 int
1544 sys_fstatvfs(struct sysmsg *sysmsg, const struct fstatvfs_args *uap)
1545 {
1546 	struct statvfs buf;
1547 	int error;
1548 
1549 	error = kern_fstatvfs(uap->fd, &buf);
1550 
1551 	if (error == 0)
1552 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1553 	return (error);
1554 }
1555 
1556 /*
1557  * getfsstat_args(struct statfs *buf, long bufsize, int flags)
1558  *
1559  * Get statistics on all filesystems.
1560  */
1561 
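/*
 * Scan context passed to getfsstat_callback() via mountlist_scan().
 */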
1562 struct getfsstat_info {
1563 	struct statfs *sfsp;
1564 	long count;
1565 	long maxcount;
1566 	int error;
1567 	int flags;
1568 	struct thread *td;
1569 };
1570 
1571 static int getfsstat_callback(struct mount *, void *);
1572 
1573 int
1574 sys_getfsstat(struct sysmsg *sysmsg, const struct getfsstat_args *uap)
1575 {
1576 	struct thread *td = curthread;
1577 	struct getfsstat_info info;
1578 
1579 	bzero(&info, sizeof(info));
1580 
1581 	info.maxcount = uap->bufsize / sizeof(struct statfs);
1582 	info.sfsp = uap->buf;
1583 	info.count = 0;
1584 	info.flags = uap->flags;
1585 	info.td = td;
1586 
1587 	mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
1588 	if (info.sfsp && info.count > info.maxcount)
1589 		sysmsg->sysmsg_result = info.maxcount;
1590 	else
1591 		sysmsg->sysmsg_result = info.count;
1592 	return (info.error);
1593 }
1594 
1595 static int
1596 getfsstat_callback(struct mount *mp, void *data)
1597 {
1598 	struct getfsstat_info *info = data;
1599 	struct statfs *sp;
1600 	char *freepath;
1601 	char *fullpath;
1602 	int error;
1603 
1604 	if (info->td->td_proc && !chroot_visible_mnt(mp, info->td->td_proc))
1605 		return(0);
1606 
1607 	if (info->sfsp && info->count < info->maxcount) {
1608 		sp = &mp->mnt_stat;
1609 
1610 		/*
1611 		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1612 		 * refresh the fsstat cache unless MNT_WAIT is also
1613 		 * set (MNT_WAIT overrides MNT_NOWAIT and MNT_LAZY).
1614 		 *
1615 		 * Ignore refresh error, user should have visibility.
1616 		 * This can happen if an NFS mount goes bad (e.g. server
1617 		 * revokes perms or goes down).
1618 		 */
1619 		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1620 		    (info->flags & MNT_WAIT)) &&
1621 		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
1622 			/* ignore error */
1623 		}
1624 		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1625 
1626 		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
1627 		if (error) {
1628 			info->error = error;
1629 			return(-1);
1630 		}
1631 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1632 		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1633 		kfree(freepath, M_TEMP);
1634 
1635 		error = copyout(sp, info->sfsp, sizeof(*sp));
1636 		if (error) {
1637 			info->error = error;
1638 			return (-1);
1639 		}
1640 		++info->sfsp;
1641 	}
1642 	info->count++;
1643 	return(0);
1644 }
1645 
1646 /*
1647  * getvfsstat_args(struct statfs *buf, struct statvfs *vbuf,
1648  *		   long vbufsize, int flags)
1649  *
1650  * Get statistics on all filesystems.
1651  */
1652 
1653 struct getvfsstat_info {
1654 	struct statfs *sfsp;
1655 	struct statvfs *vsfsp;
1656 	long count;
1657 	long maxcount;
1658 	int error;
1659 	int flags;
1660 	struct thread *td;
1661 };
1662 
1663 static int getvfsstat_callback(struct mount *, void *);
1664 
1665 int
1666 sys_getvfsstat(struct sysmsg *sysmsg, const struct getvfsstat_args *uap)
1667 {
1668 	struct thread *td = curthread;
1669 	struct getvfsstat_info info;
1670 
1671 	bzero(&info, sizeof(info));
1672 
1673 	info.maxcount = uap->vbufsize / sizeof(struct statvfs);
1674 	info.sfsp = uap->buf;
1675 	info.vsfsp = uap->vbuf;
1676 	info.count = 0;
1677 	info.flags = uap->flags;
1678 	info.td = td;
1679 
1680 	mountlist_scan(getvfsstat_callback, &info, MNTSCAN_FORWARD);
1681 	if (info.vsfsp && info.count > info.maxcount)
1682 		sysmsg->sysmsg_result = info.maxcount;
1683 	else
1684 		sysmsg->sysmsg_result = info.count;
1685 	return (info.error);
1686 }
1687 
1688 static int
1689 getvfsstat_callback(struct mount *mp, void *data)
1690 {
1691 	struct getvfsstat_info *info = data;
1692 	struct statfs *sp;
1693 	struct statvfs *vsp;
1694 	char *freepath;
1695 	char *fullpath;
1696 	int error;
1697 
1698 	if (info->td->td_proc && !chroot_visible_mnt(mp, info->td->td_proc))
1699 		return(0);
1700 
1701 	if (info->vsfsp && info->count < info->maxcount) {
1702 		sp = &mp->mnt_stat;
1703 		vsp = &mp->mnt_vstat;
1704 
1705 		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1706 		 * refresh the fsstat cache unless MNT_WAIT is also
1707 		 * set (MNT_WAIT overrides MNT_NOWAIT and MNT_LAZY).
1708 		 * overrides MNT_WAIT.
1709 		 *
1710 		 * Ignore refresh error, user should have visibility.
1711 		 * This can happen if an NFS mount goes bad (e.g. server
1712 		 * revokes perms or goes down).
1713 		 */
1714 		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1715 		    (info->flags & MNT_WAIT)) &&
1716 		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
1717 			/* ignore error */
1718 		}
1719 		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1720 
1721 		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1722 		    (info->flags & MNT_WAIT)) &&
1723 		    (error = VFS_STATVFS(mp, vsp, info->td->td_ucred))) {
1724 			/* ignore error */
1725 		}
1726 		vsp->f_flag = 0;
1727 		if (mp->mnt_flag & MNT_RDONLY)
1728 			vsp->f_flag |= ST_RDONLY;
1729 		if (mp->mnt_flag & MNT_NOSUID)
1730 			vsp->f_flag |= ST_NOSUID;
1731 
1732 		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
1733 		if (error) {
1734 			info->error = error;
1735 			return(-1);
1736 		}
1737 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1738 		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1739 		kfree(freepath, M_TEMP);
1740 
1741 		error = copyout(sp, info->sfsp, sizeof(*sp));
1742 		if (error == 0)
1743 			error = copyout(vsp, info->vsfsp, sizeof(*vsp));
1744 		if (error) {
1745 			info->error = error;
1746 			return (-1);
1747 		}
1748 		++info->sfsp;
1749 		++info->vsfsp;
1750 	}
1751 	info->count++;
1752 	return(0);
1753 }
1754 
1755 
1756 /*
1757  * fchdir_args(int fd)
1758  *
1759  * Change current working directory to a given file descriptor.
1760  */
1761 int
1762 sys_fchdir(struct sysmsg *sysmsg, const struct fchdir_args *uap)
1763 {
1764 	struct thread *td = curthread;
1765 	struct proc *p = td->td_proc;
1766 	struct filedesc *fdp = p->p_fd;
1767 	struct vnode *vp, *ovp;
1768 	struct mount *mp;
1769 	struct file *fp;
1770 	struct nchandle nch, onch, tnch;
1771 	int error;
1772 
1773 	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
1774 		return (error);
1775 	lwkt_gettoken(&p->p_token);
1776 	vp = (struct vnode *)fp->f_data;
1777 	vref(vp);
1778 	vn_lock(vp, LK_SHARED | LK_RETRY);
1779 	if (fp->f_nchandle.ncp == NULL)
1780 		error = ENOTDIR;
1781 	else
1782 		error = checkvp_chdir(vp, td);
1783 	if (error) {
1784 		vput(vp);
1785 		goto done;
1786 	}
1787 	cache_copy(&fp->f_nchandle, &nch);
1788 
1789 	/*
1790 	 * If the ncp has become a mount point, traverse through
1791 	 * the mount point.
1792 	 */
1793 
1794 	while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
1795 	       (mp = cache_findmount(&nch)) != NULL
1796 	) {
1797 		error = nlookup_mp(mp, &tnch);
1798 		if (error == 0) {
1799 			cache_unlock(&tnch);	/* leave ref intact */
1800 			vput(vp);
1801 			vp = tnch.ncp->nc_vp;
1802 			error = vget(vp, LK_SHARED);
1803 			KKASSERT(error == 0);
1804 			cache_drop(&nch);
1805 			nch = tnch;
1806 		}
1807 		cache_dropmount(mp);
1808 	}
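	/*
	 * Install the new current directory vnode and nchandle atomically
	 * under the filedesc spinlock; the old references are released
	 * only after the spinlock is dropped.
	 */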
1809 	if (error == 0) {
1810 		spin_lock(&fdp->fd_spin);
1811 		ovp = fdp->fd_cdir;
1812 		onch = fdp->fd_ncdir;
1813 		fdp->fd_cdir = vp;
1814 		fdp->fd_ncdir = nch;
1815 		spin_unlock(&fdp->fd_spin);
1816 		vn_unlock(vp);		/* leave ref intact */
1817 		cache_drop(&onch);
1818 		vrele(ovp);
1819 	} else {
1820 		cache_drop(&nch);
1821 		vput(vp);
1822 	}
1823 	fdrop(fp);
1824 done:
1825 	lwkt_reltoken(&p->p_token);
1826 	return (error);
1827 }
1828 
1829 int
1830 kern_chdir(struct nlookupdata *nd)
1831 {
1832 	struct thread *td = curthread;
1833 	struct proc *p = td->td_proc;
1834 	struct filedesc *fdp = p->p_fd;
1835 	struct vnode *vp, *ovp;
1836 	struct nchandle onch;
1837 	int error;
1838 
1839 	nd->nl_flags |= NLC_SHAREDLOCK;
1840 	if ((error = nlookup(nd)) != 0)
1841 		return (error);
1842 	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
1843 		return (ENOENT);
1844 	if ((error = vget(vp, LK_SHARED)) != 0)
1845 		return (error);
1846 
1847 	lwkt_gettoken(&p->p_token);
1848 	error = checkvp_chdir(vp, td);
1849 	vn_unlock(vp);
1850 	if (error == 0) {
1851 		spin_lock(&fdp->fd_spin);
1852 		ovp = fdp->fd_cdir;
1853 		onch = fdp->fd_ncdir;
1854 		fdp->fd_ncdir = nd->nl_nch;
1855 		fdp->fd_cdir = vp;
1856 		spin_unlock(&fdp->fd_spin);
1857 		cache_unlock(&nd->nl_nch);	/* leave reference intact */
1858 		cache_drop(&onch);
1859 		vrele(ovp);
1860 		cache_zero(&nd->nl_nch);
1861 	} else {
1862 		vrele(vp);
1863 	}
1864 	lwkt_reltoken(&p->p_token);
1865 	return (error);
1866 }
1867 
1868 /*
1869  * chdir_args(char *path)
1870  *
1871  * Change current working directory (``.'').
1872  */
1873 int
1874 sys_chdir(struct sysmsg *sysmsg, const struct chdir_args *uap)
1875 {
1876 	struct nlookupdata nd;
1877 	int error;
1878 
1879 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1880 	if (error == 0)
1881 		error = kern_chdir(&nd);
1882 	nlookup_done(&nd);
1883 	return (error);
1884 }
1885 
1886 /*
1887  * Helper for the hardened chroot(2) security policy:  refuse if any
1888  * file descriptors refer to open directories.
1889  */
1890 static int
1891 chroot_refuse_vdir_fds(thread_t td, struct filedesc *fdp)
1892 {
1893 	struct vnode *vp;
1894 	struct file *fp;
1895 	int error;
1896 	int fd;
1897 
1898 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1899 		if ((error = holdvnode(td, fd, &fp)) != 0)
1900 			continue;
1901 		vp = (struct vnode *)fp->f_data;
1902 		if (vp->v_type != VDIR) {
1903 			fdrop(fp);
1904 			continue;
1905 		}
1906 		fdrop(fp);
1907 		return(EPERM);
1908 	}
1909 	return (0);
1910 }
1911 
1912 /*
1913  * This sysctl determines if we will allow a process to chroot(2) if it
1914  * has a directory open:
1915  *	0: disallowed for all processes.
1916  *	1: allowed for processes that were not already chroot(2)'ed.
1917  *	2: allowed for all processes.
1918  */
1919 
1920 static int chroot_allow_open_directories = 1;
1921 
1922 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
1923      &chroot_allow_open_directories, 0, "");
1924 
1925 /*
1926  * chroot to the specified namecache entry.  We obtain the vp from the
1927  * namecache data.  The passed ncp must be locked and referenced and will
1928  * remain locked and referenced on return.
1929  */
1930 int
1931 kern_chroot(struct nchandle *nch)
1932 {
1933 	struct thread *td = curthread;
1934 	struct proc *p = td->td_proc;
1935 	struct filedesc *fdp = p->p_fd;
1936 	struct vnode *vp;
1937 	int error;
1938 
1939 	/*
1940 	 * Only privileged user can chroot
1941 	 */
1942 	error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_CHROOT);
1943 	if (error)
1944 		return (error);
1945 
1946 	/*
1947 	 * Disallow open directory descriptors (fchdir() breakouts).
1948 	 */
1949 	if (chroot_allow_open_directories == 0 ||
1950 	   (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
1951 		if ((error = chroot_refuse_vdir_fds(td, fdp)) != 0)
1952 			return (error);
1953 	}
1954 	if ((vp = nch->ncp->nc_vp) == NULL)
1955 		return (ENOENT);
1956 
1957 	if ((error = vget(vp, LK_SHARED)) != 0)
1958 		return (error);
1959 
1960 	/*
1961 	 * Check the validity of vp as a directory to change to and
1962 	 * associate it with rdir/jdir.
1963 	 */
1964 	error = checkvp_chdir(vp, td);
1965 	vn_unlock(vp);			/* leave reference intact */
1966 	if (error == 0) {
1967 		lwkt_gettoken(&p->p_token);
1968 		vrele(fdp->fd_rdir);
1969 		fdp->fd_rdir = vp;	/* reference inherited by fd_rdir */
1970 		cache_drop(&fdp->fd_nrdir);
1971 		cache_copy(nch, &fdp->fd_nrdir);
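		/*
		 * fd_jdir records the first chroot directory of the process
		 * and is left untouched by subsequent chroots.
		 */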
1972 		if (fdp->fd_jdir == NULL) {
1973 			fdp->fd_jdir = vp;
1974 			vref(fdp->fd_jdir);
1975 			cache_copy(nch, &fdp->fd_njdir);
1976 		}
1977 		if ((p->p_flags & P_DIDCHROOT) == 0) {
1978 			p->p_flags |= P_DIDCHROOT;
1979 			if (p->p_depth <= 65535 - 32)
1980 				p->p_depth += 32;
1981 		}
1982 		lwkt_reltoken(&p->p_token);
1983 	} else {
1984 		vrele(vp);
1985 	}
1986 	return (error);
1987 }
1988 
1989 /*
1990  * chroot_args(char *path)
1991  *
1992  * Change notion of root (``/'') directory.
1993  */
1994 int
1995 sys_chroot(struct sysmsg *sysmsg, const struct chroot_args *uap)
1996 {
1997 	struct thread *td __debugvar = curthread;
1998 	struct nlookupdata nd;
1999 	int error;
2000 
2001 	KKASSERT(td->td_proc);
2002 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2003 	if (error == 0) {
2004 		nd.nl_flags |= NLC_EXEC;
2005 		error = nlookup(&nd);
2006 		if (error == 0)
2007 			error = kern_chroot(&nd.nl_nch);
2008 	}
2009 	nlookup_done(&nd);
2010 	return(error);
2011 }
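
/*
 * Illustrative userland sketch (not compiled here) of the usual chroot(2)
 * idiom: calling chdir("/") afterwards keeps the working directory inside
 * the new root.  The path "/var/empty" is only a placeholder.
 *
 *	#include <unistd.h>
 *	#include <err.h>
 *
 *	static void
 *	enter_chroot(void)
 *	{
 *		if (chroot("/var/empty") == -1)
 *			err(1, "chroot");
 *		if (chdir("/") == -1)
 *			err(1, "chdir");
 *	}
 */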
2012 
2013 int
2014 sys_chroot_kernel(struct sysmsg *sysmsg, const struct chroot_kernel_args *uap)
2015 {
2016 	struct thread *td = curthread;
2017 	struct nlookupdata nd;
2018 	struct nchandle *nch;
2019 	struct vnode *vp;
2020 	int error;
2021 
2022 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2023 	if (error)
2024 		goto error_nond;
2025 
2026 	error = nlookup(&nd);
2027 	if (error)
2028 		goto error_out;
2029 
2030 	nch = &nd.nl_nch;
2031 
2032 	error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_CHROOT);
2033 	if (error)
2034 		goto error_out;
2035 
2036 	if ((vp = nch->ncp->nc_vp) == NULL) {
2037 		error = ENOENT;
2038 		goto error_out;
2039 	}
2040 
2041 	if ((error = cache_vref(nch, nd.nl_cred, &vp)) != 0)
2042 		goto error_out;
2043 
2044 	vfs_cache_setroot(vp, cache_hold(nch));
2045 
2046 error_out:
2047 	nlookup_done(&nd);
2048 error_nond:
2049 	return(error);
2050 }
2051 
2052 /*
2053  * Common routine for chroot and chdir.  Given a locked, referenced vnode,
2054  * determine whether it is legal to chdir to the vnode.  The vnode's state
2055  * is not changed by this call.
2056  */
2057 static int
2058 checkvp_chdir(struct vnode *vp, struct thread *td)
2059 {
2060 	int error;
2061 
2062 	if (vp->v_type != VDIR)
2063 		error = ENOTDIR;
2064 	else
2065 		error = VOP_EACCESS(vp, VEXEC, td->td_ucred);
2066 	return (error);
2067 }
2068 
2069 int
2070 kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
2071 {
2072 	struct thread *td = curthread;
2073 	struct proc *p = td->td_proc;
2074 	struct lwp *lp = td->td_lwp;
2075 	struct filedesc *fdp = p->p_fd;
2076 	int cmode, flags;
2077 	struct file *nfp;
2078 	struct file *fp;
2079 	int type, indx, error = 0;
2080 	struct flock lf;
2081 
2082 	if ((oflags & O_ACCMODE) == O_ACCMODE)
2083 		return (EINVAL);
2084 	flags = FFLAGS(oflags);
2085 	error = falloc(lp, &nfp, NULL);
2086 	if (error)
2087 		return (error);
2088 	fp = nfp;
2089 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
2090 
2091 	/*
2092 	 * Call vn_open() to do the lookup and assign the vnode to the
2093 	 * file pointer.  vn_open() does not change the ref count on fp,
2094 	 * and on success the vnode will be inherited by the file pointer
2095 	 * and unlocked.
2096 	 *
2097 	 * Request a shared lock on the vnode if possible.
2098 	 *
2099 	 * When NLC_SHAREDLOCK is set we may still need an exclusive vnode
2100 	 * lock for O_RDWR opens on executables in order to avoid a VTEXT
2101 	 * detection race.  The NLC_EXCLLOCK_IFEXEC handles this case.
2102 	 *
2103 	 * NOTE: We need a flag to separate terminal vnode locking from
2104 	 *	 parent locking.  O_CREAT needs parent locking, but O_TRUNC
2105 	 *	 and O_RDWR only need to lock the terminal vnode exclusively.
2106 	 */
2107 	nd->nl_flags |= NLC_LOCKVP;
2108 	if ((flags & (O_CREAT|O_TRUNC)) == 0) {
2109 		nd->nl_flags |= NLC_SHAREDLOCK;
2110 		if (flags & O_RDWR)
2111 			nd->nl_flags |= NLC_EXCLLOCK_IFEXEC;
2112 	}
2113 
2114 	/*
2115 	 * Issue the vn_open(), passing in the referenced fp.  The vn_open()
2116 	 * is allowed to replace fp by fdrop()ing it and returning its own
2117 	 * referenced fp.
2118 	 */
2119 	nfp = fp;
2120 	error = vn_open(nd, &nfp, flags, cmode);
2121 	fp = nfp;
2122 	nlookup_done(nd);
2123 
2124 	/*
2125 	 * Deal with any error condition
2126 	 */
2127 	if (error) {
2128 		fdrop(fp);	/* our ref */
2129 		if (error == ERESTART)
2130 			error = EINTR;
2131 		return (error);
2132 	}
2133 
2134 	/*
2135 	 * Reserve a file descriptor.
2136 	 */
2137 	if ((error = fdalloc(p, 0, &indx)) != 0) {
2138 		fdrop(fp);
2139 		return (error);
2140 	}
2141 
2142 	/*
2143 	 * Handle advisory lock flags.  This is only supported with vnodes.
2144 	 * For things like /dev/fd/N we might not actually get a vnode.
2145 	 */
2146 	if ((flags & (O_EXLOCK | O_SHLOCK)) && fp->f_type == DTYPE_VNODE) {
2147 		struct vnode *vp;
2148 
2149 		vp = (struct vnode *)fp->f_data;
2150 		vref(vp);
2151 
2152 		lf.l_whence = SEEK_SET;
2153 		lf.l_start = 0;
2154 		lf.l_len = 0;
2155 		if (flags & O_EXLOCK)
2156 			lf.l_type = F_WRLCK;
2157 		else
2158 			lf.l_type = F_RDLCK;
2159 		if (flags & FNONBLOCK)
2160 			type = 0;
2161 		else
2162 			type = F_WAIT;
2163 
2164 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
2165 		if (error) {
2166 			/*
2167 			 * lock request failed.  Clean up the reserved
2168 			 * descriptor.
2169 			 */
2170 			vrele(vp);
2171 			fsetfd(fdp, NULL, indx);
2172 			fdrop(fp);
2173 			return (error);
2174 		}
2175 		atomic_set_int(&fp->f_flag, FHASLOCK); /* race ok */
2176 		vrele(vp);
2177 	}
2178 
2179 	/*
2180 	 * release our private reference, leaving the one associated with the
2181 	 * descriptor table intact.
2182 	 */
2183 	if (oflags & O_CLOEXEC)
2184 		fdp->fd_files[indx].fileflags |= UF_EXCLOSE;
2185 	fsetfd(fdp, fp, indx);
2186 	fdrop(fp);
2187 	*res = indx;
2188 
2189 	return (error);
2190 }
2191 
2192 /*
2193  * open_args(char *path, int flags, int mode)
2194  *
2195  * Check permissions, allocate an open file structure,
2196  * and call the device open routine if any.
2197  */
2198 int
2199 sys_open(struct sysmsg *sysmsg, const struct open_args *uap)
2200 {
2201 	struct nlookupdata nd;
2202 	int error;
2203 
2204 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2205 	if (error == 0) {
2206 		error = kern_open(&nd, uap->flags,
2207 				    uap->mode, &sysmsg->sysmsg_result);
2208 	}
2209 	nlookup_done(&nd);
2210 	return (error);
2211 }
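
/*
 * Illustrative userland sketch (not compiled here) of the O_EXLOCK and
 * O_CLOEXEC handling performed by kern_open() above.  With O_NONBLOCK the
 * advisory lock request fails instead of sleeping (F_WAIT is not used).
 * The helper name is hypothetical.
 *
 *	#include <fcntl.h>
 *	#include <err.h>
 *
 *	static int
 *	open_locked(const char *path)
 *	{
 *		int fd = open(path,
 *			      O_RDWR | O_EXLOCK | O_NONBLOCK | O_CLOEXEC);
 *
 *		if (fd == -1)
 *			err(1, "open");
 *		return fd;
 *	}
 */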
2212 
2213 /*
2214  * openat_args(int fd, char *path, int flags, int mode)
2215  */
2216 int
2217 sys_openat(struct sysmsg *sysmsg, const struct openat_args *uap)
2218 {
2219 	struct nlookupdata nd;
2220 	int error;
2221 	struct file *fp;
2222 
2223 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2224 	if (error == 0) {
2225 		error = kern_open(&nd, uap->flags, uap->mode,
2226 					&sysmsg->sysmsg_result);
2227 	}
2228 	nlookup_done_at(&nd, fp);
2229 	return (error);
2230 }
2231 
2232 int
2233 kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
2234 {
2235 	struct thread *td = curthread;
2236 	struct proc *p = td->td_proc;
2237 	struct vnode *vp;
2238 	struct vattr vattr;
2239 	int error;
2240 	int whiteout = 0;
2241 
2242 	KKASSERT(p);
2243 
2244 	VATTR_NULL(&vattr);
2245 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
2246 	vattr.va_rmajor = rmajor;
2247 	vattr.va_rminor = rminor;
2248 
2249 	switch (mode & S_IFMT) {
2250 	case S_IFMT:	/* used by badsect to flag bad sectors */
2251 		error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_MKNOD_BAD);
2252 		vattr.va_type = VBAD;
2253 		break;
2254 	case S_IFCHR:
2255 		error = caps_priv_check_td(td, SYSCAP_NOVFS_MKNOD_DEV);
2256 		vattr.va_type = VCHR;
2257 		break;
2258 	case S_IFBLK:
2259 		error = caps_priv_check_td(td, SYSCAP_NOVFS_MKNOD_DEV);
2260 		vattr.va_type = VBLK;
2261 		break;
2262 	case S_IFWHT:
2263 		error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_MKNOD_WHT);
2264 		whiteout = 1;
2265 		break;
2266 	case S_IFDIR:	/* special directories support for HAMMER */
2267 		error = caps_priv_check(td->td_ucred, SYSCAP_NOVFS_MKNOD_DIR);
2268 		vattr.va_type = VDIR;
2269 		break;
2270 	case S_IFIFO:
2271 		return (kern_mkfifo(nd, mode));
2272 		break;
2273 	default:
2274 		error = EINVAL;
2275 		break;
2276 	}
2277 
2278 	if (error)
2279 		return (error);
2280 
2281 	bwillinode(1);
2282 	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2283 	if ((error = nlookup(nd)) != 0)
2284 		return (error);
2285 	if (nd->nl_nch.ncp->nc_vp)
2286 		return (EEXIST);
2287 	if (nd->nl_dvp == NULL)
2288 		return (EINVAL);
2289 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2290 		return (error);
2291 
2292 	if (whiteout) {
2293 		error = VOP_NWHITEOUT(&nd->nl_nch, nd->nl_dvp,
2294 				      nd->nl_cred, NAMEI_CREATE);
2295 	} else {
2296 		vp = NULL;
2297 		error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp,
2298 				   &vp, nd->nl_cred, &vattr);
2299 		if (error == 0)
2300 			vput(vp);
2301 	}
2302 	return (error);
2303 }
2304 
2305 /*
2306  * mknod_args(char *path, int mode, int dev)
2307  *
2308  * Create a special file.
2309  */
2310 int
2311 sys_mknod(struct sysmsg *sysmsg, const struct mknod_args *uap)
2312 {
2313 	struct nlookupdata nd;
2314 	int error;
2315 
2316 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2317 	if (error == 0) {
2318 		error = kern_mknod(&nd, uap->mode,
2319 				   umajor(uap->dev), uminor(uap->dev));
2320 	}
2321 	nlookup_done(&nd);
2322 	return (error);
2323 }
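
/*
 * Illustrative userland sketch (not compiled here) of the S_IFCHR path
 * handled by kern_mknod() above.  The node path and device numbers are
 * placeholders and the call normally requires device-creation privilege.
 *
 *	#include <sys/types.h>
 *	#include <sys/stat.h>
 *	#include <err.h>
 *
 *	static void
 *	make_char_node(void)
 *	{
 *		if (mknod("/dev/mynode", S_IFCHR | 0600, makedev(12, 0)) == -1)
 *			err(1, "mknod");
 *	}
 */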
2324 
2325 /*
2326  * mknodat_args(int fd, char *path, mode_t mode, dev_t dev)
2327  *
2328  * Create a special file.  The path is relative to the directory associated
2329  * with fd.
2330  */
2331 int
2332 sys_mknodat(struct sysmsg *sysmsg, const struct mknodat_args *uap)
2333 {
2334 	struct nlookupdata nd;
2335 	struct file *fp;
2336 	int error;
2337 
2338 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2339 	if (error == 0) {
2340 		error = kern_mknod(&nd, uap->mode,
2341 				   umajor(uap->dev), uminor(uap->dev));
2342 	}
2343 	nlookup_done_at(&nd, fp);
2344 	return (error);
2345 }
2346 
2347 int
2348 kern_mkfifo(struct nlookupdata *nd, int mode)
2349 {
2350 	struct thread *td = curthread;
2351 	struct proc *p = td->td_proc;
2352 	struct vattr vattr;
2353 	struct vnode *vp;
2354 	int error;
2355 
2356 	bwillinode(1);
2357 
2358 	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2359 	if ((error = nlookup(nd)) != 0)
2360 		return (error);
2361 	if (nd->nl_nch.ncp->nc_vp)
2362 		return (EEXIST);
2363 	if (nd->nl_dvp == NULL)
2364 		return (EINVAL);
2365 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2366 		return (error);
2367 
2368 	VATTR_NULL(&vattr);
2369 	vattr.va_type = VFIFO;
2370 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
2371 	vp = NULL;
2372 	error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp, &vp, nd->nl_cred, &vattr);
2373 	if (error == 0)
2374 		vput(vp);
2375 	return (error);
2376 }
2377 
2378 /*
2379  * mkfifo_args(char *path, int mode)
2380  *
2381  * Create a named pipe.
2382  */
2383 int
2384 sys_mkfifo(struct sysmsg *sysmsg, const struct mkfifo_args *uap)
2385 {
2386 	struct nlookupdata nd;
2387 	int error;
2388 
2389 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2390 	if (error == 0)
2391 		error = kern_mkfifo(&nd, uap->mode);
2392 	nlookup_done(&nd);
2393 	return (error);
2394 }
2395 
2396 /*
2397  * mkfifoat_args(int fd, char *path, mode_t mode)
2398  *
2399  * Create a named pipe.  The path is relative to the directory associated
2400  * with fd.
2401  */
2402 int
2403 sys_mkfifoat(struct sysmsg *sysmsg, const struct mkfifoat_args *uap)
2404 {
2405 	struct nlookupdata nd;
2406 	struct file *fp;
2407 	int error;
2408 
2409 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2410 	if (error == 0)
2411 		error = kern_mkfifo(&nd, uap->mode);
2412 	nlookup_done_at(&nd, fp);
2413 	return (error);
2414 }
2415 
2416 static int hardlink_check_uid = 0;
2417 SYSCTL_INT(_security, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
2418     &hardlink_check_uid, 0,
2419     "Unprivileged processes cannot create hard links to files owned by other "
2420     "users");
2421 static int hardlink_check_gid = 0;
2422 SYSCTL_INT(_security, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
2423     &hardlink_check_gid, 0,
2424     "Unprivileged processes cannot create hard links to files owned by other "
2425     "groups");
2426 
2427 static int
2428 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
2429 {
2430 	struct vattr va;
2431 	int error;
2432 
2433 	/*
2434 	 * Shortcut if disabled
2435 	 */
2436 	if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
2437 		return (0);
2438 
2439 	/*
2440 	 * Privileged user can always hardlink
2441 	 */
2442 	if (caps_priv_check(cred, SYSCAP_NOVFS_LINK) == 0)
2443 		return (0);
2444 
2445 	/*
2446 	 * Otherwise only if the originating file is owned by the
2447 	 * same user or group.  Note that any group is allowed if
2448 	 * the file is owned by the caller.
2449 	 */
2450 	error = VOP_GETATTR(vp, &va);
2451 	if (error != 0)
2452 		return (error);
2453 
2454 	if (hardlink_check_uid) {
2455 		if (cred->cr_uid != va.va_uid)
2456 			return (EPERM);
2457 	}
2458 
2459 	if (hardlink_check_gid) {
2460 		if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
2461 			return (EPERM);
2462 	}
2463 
2464 	return (0);
2465 }
2466 
2467 int
2468 kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
2469 {
2470 	struct thread *td = curthread;
2471 	struct vnode *vp;
2472 	int error;
2473 
2474 	/*
2475 	 * Look up the source and obtain a locked vnode.
2476 	 *
2477 	 * You may only hardlink a file which you have write permission
2478 	 * on or which you own.
2479 	 *
2480 	 * XXX relookup on vget failure / race ?
2481 	 */
2482 	bwillinode(1);
2483 	nd->nl_flags |= NLC_WRITE | NLC_OWN | NLC_HLINK;
2484 	if ((error = nlookup(nd)) != 0)
2485 		return (error);
2486 	vp = nd->nl_nch.ncp->nc_vp;
2487 	KKASSERT(vp != NULL);
2488 	if (vp->v_type == VDIR)
2489 		return (EPERM);		/* POSIX */
2490 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2491 		return (error);
2492 	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
2493 		return (error);
2494 
2495 	/*
2496 	 * Unlock the source so we can lookup the target without deadlocking
2497 	 * (XXX vp is locked already, possible other deadlock?).  The target
2498 	 * must not exist.
2499 	 */
2500 	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
2501 	nd->nl_flags &= ~NLC_NCPISLOCKED;
2502 	cache_unlock(&nd->nl_nch);
2503 	vn_unlock(vp);
2504 
2505 	linknd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2506 	if ((error = nlookup(linknd)) != 0) {
2507 		vrele(vp);
2508 		return (error);
2509 	}
2510 	if (linknd->nl_nch.ncp->nc_vp) {
2511 		vrele(vp);
2512 		return (EEXIST);
2513 	}
2514 	if (linknd->nl_dvp == NULL) {
2515 		vrele(vp);
2516 		return (EINVAL);
2517 	}
2518 	VFS_MODIFYING(vp->v_mount);
2519 	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
2520 	if (error) {
2521 		vrele(vp);
2522 		return (error);
2523 	}
2524 
2525 	/*
2526 	 * Finally run the new API VOP.
2527 	 */
2528 	error = can_hardlink(vp, td, td->td_ucred);
2529 	if (error == 0) {
2530 		error = VOP_NLINK(&linknd->nl_nch, linknd->nl_dvp,
2531 				  vp, linknd->nl_cred);
2532 	}
2533 	vput(vp);
2534 	return (error);
2535 }
2536 
2537 /*
2538  * link_args(char *path, char *link)
2539  *
2540  * Make a hard file link.
2541  */
2542 int
2543 sys_link(struct sysmsg *sysmsg, const struct link_args *uap)
2544 {
2545 	struct nlookupdata nd, linknd;
2546 	int error;
2547 
2548 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2549 	if (error == 0) {
2550 		error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
2551 		if (error == 0)
2552 			error = kern_link(&nd, &linknd);
2553 		nlookup_done(&linknd);
2554 	}
2555 	nlookup_done(&nd);
2556 	return (error);
2557 }
2558 
2559 /*
2560  * linkat_args(int fd1, char *path1, int fd2, char *path2, int flags)
2561  *
2562  * Make a hard file link. The path1 argument is relative to the directory
2563  * associated with fd1, and similarly the path2 argument is relative to
2564  * the directory associated with fd2.
2565  */
2566 int
2567 sys_linkat(struct sysmsg *sysmsg, const struct linkat_args *uap)
2568 {
2569 	struct nlookupdata nd, linknd;
2570 	struct file *fp1, *fp2;
2571 	int error;
2572 
2573 	error = nlookup_init_at(&nd, &fp1, uap->fd1, uap->path1, UIO_USERSPACE,
2574 	    (uap->flags & AT_SYMLINK_FOLLOW) ? NLC_FOLLOW : 0);
2575 	if (error == 0) {
2576 		error = nlookup_init_at(&linknd, &fp2, uap->fd2,
2577 		    uap->path2, UIO_USERSPACE, 0);
2578 		if (error == 0)
2579 			error = kern_link(&nd, &linknd);
2580 		nlookup_done_at(&linknd, fp2);
2581 	}
2582 	nlookup_done_at(&nd, fp1);
2583 	return (error);
2584 }
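
/*
 * Illustrative userland sketch (not compiled here) of linkat(2) as
 * dispatched above: AT_FDCWD resolves both paths relative to the current
 * directory and AT_SYMLINK_FOLLOW maps to NLC_FOLLOW for the source.
 * The file names are placeholders.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <err.h>
 *
 *	static void
 *	make_hardlink(void)
 *	{
 *		if (linkat(AT_FDCWD, "data.txt",
 *			   AT_FDCWD, "data.hardlink", AT_SYMLINK_FOLLOW) == -1)
 *			err(1, "linkat");
 *	}
 */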
2585 
2586 int
2587 kern_symlink(struct nlookupdata *nd, char *path, int mode)
2588 {
2589 	struct vattr vattr;
2590 	struct vnode *vp;
2591 	struct vnode *dvp;
2592 	int error;
2593 
2594 	bwillinode(1);
2595 	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2596 	if ((error = nlookup(nd)) != 0)
2597 		return (error);
2598 	if (nd->nl_nch.ncp->nc_vp)
2599 		return (EEXIST);
2600 	if (nd->nl_dvp == NULL)
2601 		return (EINVAL);
2602 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2603 		return (error);
2604 	dvp = nd->nl_dvp;
2605 	VATTR_NULL(&vattr);
2606 	vattr.va_mode = mode;
2607 	error = VOP_NSYMLINK(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr, path);
2608 	if (error == 0)
2609 		vput(vp);
2610 	return (error);
2611 }
2612 
2613 /*
2614  * symlink(char *path, char *link)
2615  *
2616  * Make a symbolic link.
2617  */
2618 int
2619 sys_symlink(struct sysmsg *sysmsg, const struct symlink_args *uap)
2620 {
2621 	struct thread *td = curthread;
2622 	struct nlookupdata nd;
2623 	char *path;
2624 	int error;
2625 	int mode;
2626 
2627 	path = objcache_get(namei_oc, M_WAITOK);
2628 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
2629 	if (error == 0) {
2630 		error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
2631 		if (error == 0) {
2632 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2633 			error = kern_symlink(&nd, path, mode);
2634 		}
2635 		nlookup_done(&nd);
2636 	}
2637 	objcache_put(namei_oc, path);
2638 	return (error);
2639 }
2640 
2641 /*
2642  * symlinkat_args(char *path1, int fd, char *path2)
2643  *
2644  * Make a symbolic link.  The path2 argument is relative to the directory
2645  * associated with fd.
2646  */
2647 int
2648 sys_symlinkat(struct sysmsg *sysmsg, const struct symlinkat_args *uap)
2649 {
2650 	struct thread *td = curthread;
2651 	struct nlookupdata nd;
2652 	struct file *fp;
2653 	char *path1;
2654 	int error;
2655 	int mode;
2656 
2657 	path1 = objcache_get(namei_oc, M_WAITOK);
2658 	error = copyinstr(uap->path1, path1, MAXPATHLEN, NULL);
2659 	if (error == 0) {
2660 		error = nlookup_init_at(&nd, &fp, uap->fd, uap->path2,
2661 		    UIO_USERSPACE, 0);
2662 		if (error == 0) {
2663 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2664 			error = kern_symlink(&nd, path1, mode);
2665 		}
2666 		nlookup_done_at(&nd, fp);
2667 	}
2668 	objcache_put(namei_oc, path1);
2669 	return (error);
2670 }
2671 
2672 /*
2673  * undelete_args(char *path)
2674  *
2675  * Delete a whiteout from the filesystem.
2676  */
2677 int
2678 sys_undelete(struct sysmsg *sysmsg, const struct undelete_args *uap)
2679 {
2680 	struct nlookupdata nd;
2681 	int error;
2682 
2683 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2684 	bwillinode(1);
2685 	nd.nl_flags |= NLC_DELETE | NLC_REFDVP;
2686 	if (error == 0)
2687 		error = nlookup(&nd);
2688 	if (error == 0 && nd.nl_dvp == NULL)
2689 		error = EINVAL;
2690 	if (error == 0)
2691 		error = ncp_writechk(&nd.nl_nch);
2692 	if (error == 0) {
2693 		error = VOP_NWHITEOUT(&nd.nl_nch, nd.nl_dvp, nd.nl_cred,
2694 				      NAMEI_DELETE);
2695 	}
2696 	nlookup_done(&nd);
2697 	return (error);
2698 }
2699 
2700 int
2701 kern_unlink(struct nlookupdata *nd)
2702 {
2703 	int error;
2704 
2705 	bwillinode(1);
2706 	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
2707 	if ((error = nlookup(nd)) != 0)
2708 		return (error);
2709 	if (nd->nl_dvp == NULL)
2710 		return EINVAL;
2711 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2712 		return (error);
2713 	error = VOP_NREMOVE(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
2714 	return (error);
2715 }
2716 
2717 /*
2718  * unlink_args(char *path)
2719  *
2720  * Delete a name from the filesystem.
2721  */
2722 int
2723 sys_unlink(struct sysmsg *sysmsg, const struct unlink_args *uap)
2724 {
2725 	struct nlookupdata nd;
2726 	int error;
2727 
2728 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2729 	if (error == 0)
2730 		error = kern_unlink(&nd);
2731 	nlookup_done(&nd);
2732 	return (error);
2733 }
2734 
2735 
2736 /*
2737  * unlinkat_args(int fd, char *path, int flags)
2738  *
2739  * Delete the file or directory entry pointed to by fd/path.
2740  */
2741 int
2742 sys_unlinkat(struct sysmsg *sysmsg, const struct unlinkat_args *uap)
2743 {
2744 	struct nlookupdata nd;
2745 	struct file *fp;
2746 	int error;
2747 
2748 	if (uap->flags & ~AT_REMOVEDIR)
2749 		return (EINVAL);
2750 
2751 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2752 	if (error == 0) {
2753 		if (uap->flags & AT_REMOVEDIR)
2754 			error = kern_rmdir(&nd);
2755 		else
2756 			error = kern_unlink(&nd);
2757 	}
2758 	nlookup_done_at(&nd, fp);
2759 	return (error);
2760 }
2761 
2762 int
2763 kern_lseek(int fd, off_t offset, int whence, off_t *res)
2764 {
2765 	struct thread *td = curthread;
2766 	struct file *fp;
2767 	struct vnode *vp;
2768 	struct vattr_lite lva;
2769 	off_t new_offset;
2770 	int error;
2771 
2772 	fp = holdfp(td, fd, -1);
2773 	if (fp == NULL)
2774 		return (EBADF);
2775 	if (fp->f_type != DTYPE_VNODE) {
2776 		error = ESPIPE;
2777 		goto done;
2778 	}
2779 	vp = (struct vnode *)fp->f_data;
2780 
2781 	switch (whence) {
2782 	case L_INCR:
2783 		spin_lock(&fp->f_spin);
2784 		new_offset = fp->f_offset + offset;
2785 		error = 0;
2786 		break;
2787 	case L_XTND:
2788 		error = VOP_GETATTR_LITE(vp, &lva);
2789 		spin_lock(&fp->f_spin);
2790 		new_offset = offset + lva.va_size;
2791 		break;
2792 	case L_SET:
2793 		new_offset = offset;
2794 		error = 0;
2795 		spin_lock(&fp->f_spin);
2796 		break;
2797 	default:
2798 		new_offset = 0;
2799 		error = EINVAL;
2800 		spin_lock(&fp->f_spin);
2801 		break;
2802 	}
2803 
2804 	/*
2805 	 * Validate the seek position.  Negative offsets are not allowed
2806 	 * for regular files or directories.
2807 	 *
2808 	 * Normally we would also not want to allow negative offsets for
2809 	 * character and block-special devices.  However kvm addresses
2810 	 * on 64 bit architectures might appear to be negative and must
2811 	 * be allowed.
2812 	 */
2813 	if (error == 0) {
2814 		if (new_offset < 0 &&
2815 		    (vp->v_type == VREG || vp->v_type == VDIR)) {
2816 			error = EINVAL;
2817 		} else {
2818 			fp->f_offset = new_offset;
2819 		}
2820 	}
2821 	*res = fp->f_offset;
2822 	spin_unlock(&fp->f_spin);
2823 done:
2824 	dropfp(td, fd, fp);
2825 
2826 	return (error);
2827 }
2828 
2829 /*
2830  * lseek_args(int fd, int pad, off_t offset, int whence)
2831  *
2832  * Reposition read/write file offset.
2833  */
2834 int
2835 sys_lseek(struct sysmsg *sysmsg, const struct lseek_args *uap)
2836 {
2837 	int error;
2838 
2839 	error = kern_lseek(uap->fd, uap->offset, uap->whence,
2840 			   &sysmsg->sysmsg_offset);
2841 
2842 	return (error);
2843 }
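
/*
 * Illustrative userland sketch (not compiled here): lseek(2) with SEEK_END
 * (the L_XTND case above) reports the current size of a regular file.
 * Note that it also moves the descriptor's file offset.
 *
 *	#include <sys/types.h>
 *	#include <unistd.h>
 *	#include <err.h>
 *
 *	static off_t
 *	file_size(int fd)
 *	{
 *		off_t size = lseek(fd, 0, SEEK_END);
 *
 *		if (size == -1)
 *			err(1, "lseek");
 *		return size;
 *	}
 */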
2844 
2845 /*
2846  * Check if current process can access given file.  amode is a bitmask of *_OK
2847  * access bits.  flags is a bitmask of AT_* flags.
2848  */
2849 int
2850 kern_access(struct nlookupdata *nd, int amode, int flags)
2851 {
2852 	struct vnode *vp;
2853 	int error, mode;
2854 
2855 	if (flags & ~AT_EACCESS)
2856 		return (EINVAL);
2857 	nd->nl_flags |= NLC_SHAREDLOCK;
2858 	if ((error = nlookup(nd)) != 0)
2859 		return (error);
2860 	if ((amode & W_OK) && (error = ncp_writechk(&nd->nl_nch)) != 0)
2861 		return (error);
2862 retry:
2863 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
2864 	if (error)
2865 		return (error);
2866 
2867 	/* amode == 0 means only check for existence. */
2868 	if (amode) {
2869 		mode = 0;
2870 		if (amode & R_OK)
2871 			mode |= VREAD;
2872 		if (amode & W_OK)
2873 			mode |= VWRITE;
2874 		if (amode & X_OK)
2875 			mode |= VEXEC;
2876 		if ((mode & VWRITE) == 0 ||
2877 		    (error = vn_writechk(vp)) == 0) {
2878 			error = VOP_ACCESS_FLAGS(vp, mode, flags, nd->nl_cred);
2879 		}
2880 
2881 		/*
2882 		 * If the file handle is stale we have to re-resolve the
2883 		 * entry with the ncp held exclusively.  This is a hack
2884 		 * at the moment.
2885 		 */
2886 		if (error == ESTALE) {
2887 			u_int dummy_gen;
2888 
2889 			vput(vp);
2890 			cache_unlock(&nd->nl_nch);
2891 			cache_lock(&nd->nl_nch);
2892 			dummy_gen = nd->nl_nch.ncp->nc_generation;
2893 			cache_setunresolved(&nd->nl_nch);
2894 			error = cache_resolve(&nd->nl_nch, &dummy_gen,
2895 					      nd->nl_cred);
2896 			if (error == 0) {
2897 				vp = NULL;
2898 				goto retry;
2899 			}
2900 			return(error);
2901 		}
2902 	}
2903 	vput(vp);
2904 	return (error);
2905 }
2906 
2907 /*
2908  * access_args(char *path, int flags)
2909  *
2910  * Check access permissions.
2911  */
2912 int
2913 sys_access(struct sysmsg *sysmsg, const struct access_args *uap)
2914 {
2915 	struct nlookupdata nd;
2916 	int error;
2917 
2918 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2919 	if (error == 0)
2920 		error = kern_access(&nd, uap->flags, 0);
2921 	nlookup_done(&nd);
2922 	return (error);
2923 }
2924 
2925 
2926 /*
2927  * eaccess_args(char *path, int flags)
2928  *
2929  * Check access permissions.
2930  */
2931 int
2932 sys_eaccess(struct sysmsg *sysmsg, const struct eaccess_args *uap)
2933 {
2934 	struct nlookupdata nd;
2935 	int error;
2936 
2937 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2938 	if (error == 0)
2939 		error = kern_access(&nd, uap->flags, AT_EACCESS);
2940 	nlookup_done(&nd);
2941 	return (error);
2942 }
2943 
2944 
2945 /*
2946  * faccessat_args(int fd, char *path, int amode, int flags)
2947  *
2948  * Check access permissions.
2949  */
2950 int
2951 sys_faccessat(struct sysmsg *sysmsg, const struct faccessat_args *uap)
2952 {
2953 	struct nlookupdata nd;
2954 	struct file *fp;
2955 	int error;
2956 
2957 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE,
2958 				NLC_FOLLOW);
2959 	if (error == 0)
2960 		error = kern_access(&nd, uap->amode, uap->flags);
2961 	nlookup_done_at(&nd, fp);
2962 	return (error);
2963 }
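
/*
 * Illustrative userland sketch (not compiled here): faccessat(2) with
 * AT_EACCESS checks against the effective rather than the real
 * credentials (the flag is passed through kern_access() above).  The
 * path is a placeholder.
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	static void
 *	check_logfile(void)
 *	{
 *		if (faccessat(AT_FDCWD, "/var/log/messages",
 *			      R_OK | W_OK, AT_EACCESS) == 0)
 *			printf("readable and writable\n");
 *	}
 */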
2964 
2965 int
2966 kern_stat(struct nlookupdata *nd, struct stat *st)
2967 {
2968 	int error;
2969 	struct vnode *vp;
2970 
2971 	nd->nl_flags |= NLC_SHAREDLOCK;
2972 	if ((error = nlookup(nd)) != 0)
2973 		return (error);
2974 again:
2975 	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
2976 		return (ENOENT);
2977 
2978 #if 1
2979 	error = cache_vref(&nd->nl_nch, NULL, &vp);
2980 #else
2981 	error = vget(vp, LK_SHARED);
2982 #endif
2983 	if (error)
2984 		return (error);
2985 	error = vn_stat(vp, st, nd->nl_cred);
2986 
2987 	/*
2988 	 * If the file handle is stale we have to re-resolve the
2989 	 * entry with the ncp held exclusively.  This is a hack
2990 	 * at the moment.
2991 	 */
2992 	if (error == ESTALE) {
2993 		u_int dummy_gen;
2994 #if 1
2995 		vrele(vp);
2996 #else
2997 		vput(vp);
2998 #endif
2999 		cache_unlock(&nd->nl_nch);
3000 		cache_lock(&nd->nl_nch);
3001 		dummy_gen = nd->nl_nch.ncp->nc_generation;
3002 		cache_setunresolved(&nd->nl_nch);
3003 		error = cache_resolve(&nd->nl_nch, &dummy_gen, nd->nl_cred);
3004 		if (error == 0)
3005 			goto again;
3006 	} else {
3007 #if 1
3008 		vrele(vp);
3009 #else
3010 		vput(vp);
3011 #endif
3012 	}
3013 	return (error);
3014 }
3015 
3016 /*
3017  * stat_args(char *path, struct stat *ub)
3018  *
3019  * Get file status; this version follows links.
3020  */
3021 int
3022 sys_stat(struct sysmsg *sysmsg, const struct stat_args *uap)
3023 {
3024 	struct nlookupdata nd;
3025 	struct stat st;
3026 	int error;
3027 
3028 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3029 	if (error == 0) {
3030 		error = kern_stat(&nd, &st);
3031 		if (error == 0)
3032 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
3033 	}
3034 	nlookup_done(&nd);
3035 	return (error);
3036 }
3037 
3038 /*
3039  * lstat_args(char *path, struct stat *ub)
3040  *
3041  * Get file status; this version does not follow links.
3042  */
3043 int
3044 sys_lstat(struct sysmsg *sysmsg, const struct lstat_args *uap)
3045 {
3046 	struct nlookupdata nd;
3047 	struct stat st;
3048 	int error;
3049 
3050 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3051 	if (error == 0) {
3052 		error = kern_stat(&nd, &st);
3053 		if (error == 0)
3054 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
3055 	}
3056 	nlookup_done(&nd);
3057 	return (error);
3058 }
3059 
3060 /*
3061  * fstatat_args(int fd, char *path, struct stat *sb, int flags)
3062  *
3063  * Get status of file pointed to by fd/path.
3064  */
3065 int
3066 sys_fstatat(struct sysmsg *sysmsg, const struct fstatat_args *uap)
3067 {
3068 	struct nlookupdata nd;
3069 	struct stat st;
3070 	int error;
3071 	int flags;
3072 	struct file *fp;
3073 
3074 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3075 		return (EINVAL);
3076 
3077 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3078 
3079 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3080 				UIO_USERSPACE, flags);
3081 	if (error == 0) {
3082 		error = kern_stat(&nd, &st);
3083 		if (error == 0)
3084 			error = copyout(&st, uap->sb, sizeof(*uap->sb));
3085 	}
3086 	nlookup_done_at(&nd, fp);
3087 	return (error);
3088 }
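
/*
 * Illustrative userland sketch (not compiled here): fstatat(2) with
 * AT_SYMLINK_NOFOLLOW gives lstat(2)-like behaviour for a path relative
 * to a directory descriptor.  "dirfd" and the entry name are placeholders.
 *
 *	#include <sys/stat.h>
 *	#include <fcntl.h>
 *	#include <err.h>
 *
 *	static off_t
 *	entry_size(int dirfd)
 *	{
 *		struct stat st;
 *
 *		if (fstatat(dirfd, "entry", &st, AT_SYMLINK_NOFOLLOW) == -1)
 *			err(1, "fstatat");
 *		return st.st_size;
 *	}
 */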
3089 
3090 static int
3091 kern_pathconf(char *path, int name, int flags, register_t *sysmsg_regp)
3092 {
3093 	struct nlookupdata nd;
3094 	struct vnode *vp;
3095 	int error;
3096 
3097 	vp = NULL;
3098 	error = nlookup_init(&nd, path, UIO_USERSPACE, flags);
3099 	if (error == 0)
3100 		error = nlookup(&nd);
3101 	if (error == 0)
3102 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3103 	nlookup_done(&nd);
3104 	if (error == 0) {
3105 		error = VOP_PATHCONF(vp, name, sysmsg_regp);
3106 		vput(vp);
3107 	}
3108 	return (error);
3109 }
3110 
3111 /*
3112  * pathconf_args(char *path, int name)
3113  *
3114  * Get configurable pathname variables.
3115  */
3116 int
3117 sys_pathconf(struct sysmsg *sysmsg, const struct pathconf_args *uap)
3118 {
3119 	return (kern_pathconf(uap->path, uap->name, NLC_FOLLOW,
3120 		&sysmsg->sysmsg_reg));
3121 }
3122 
3123 /*
3124  * lpathconf_args(char *path, int name)
3125  *
3126  * Get configurable pathname variables, but don't follow symlinks.
3127  */
3128 int
3129 sys_lpathconf(struct sysmsg *sysmsg, const struct lpathconf_args *uap)
3130 {
3131 	return (kern_pathconf(uap->path, uap->name, 0, &sysmsg->sysmsg_reg));
3132 }
3133 
3134 /*
3135  * XXX: daver
3136  * kern_readlink isn't properly split yet.  There is a copyout buried
3137  * in VOP_READLINK().
3138  */
3139 int
3140 kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
3141 {
3142 	struct thread *td = curthread;
3143 	struct vnode *vp;
3144 	struct iovec aiov;
3145 	struct uio auio;
3146 	int error;
3147 
3148 	nd->nl_flags |= NLC_SHAREDLOCK;
3149 	if ((error = nlookup(nd)) != 0)
3150 		return (error);
3151 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_SHARED, &vp);
3152 	if (error)
3153 		return (error);
3154 	if (vp->v_type != VLNK) {
3155 		error = EINVAL;
3156 	} else {
3157 		aiov.iov_base = buf;
3158 		aiov.iov_len = count;
3159 		auio.uio_iov = &aiov;
3160 		auio.uio_iovcnt = 1;
3161 		auio.uio_offset = 0;
3162 		auio.uio_rw = UIO_READ;
3163 		auio.uio_segflg = UIO_USERSPACE;
3164 		auio.uio_td = td;
3165 		auio.uio_resid = count;
3166 		error = VOP_READLINK(vp, &auio, td->td_ucred);
3167 	}
3168 	vput(vp);
3169 	*res = count - auio.uio_resid;
3170 	return (error);
3171 }
3172 
3173 /*
3174  * readlink_args(char *path, char *buf, int count)
3175  *
3176  * Return target name of a symbolic link.
3177  */
3178 int
3179 sys_readlink(struct sysmsg *sysmsg, const struct readlink_args *uap)
3180 {
3181 	struct nlookupdata nd;
3182 	int error;
3183 
3184 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3185 	if (error == 0) {
3186 		error = kern_readlink(&nd, uap->buf, uap->count,
3187 					&sysmsg->sysmsg_result);
3188 	}
3189 	nlookup_done(&nd);
3190 	return (error);
3191 }
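
/*
 * Illustrative userland sketch (not compiled here): readlink(2) does not
 * NUL terminate the buffer (kern_readlink() above returns
 * count - uio_resid), so the caller terminates it.
 *
 *	#include <unistd.h>
 *	#include <stdio.h>
 *	#include <err.h>
 *
 *	static void
 *	print_target(const char *path)
 *	{
 *		char buf[1024];
 *		ssize_t n = readlink(path, buf, sizeof(buf) - 1);
 *
 *		if (n == -1)
 *			err(1, "readlink");
 *		buf[n] = '\0';
 *		printf("%s -> %s\n", path, buf);
 *	}
 */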
3192 
3193 /*
3194  * readlinkat_args(int fd, char *path, char *buf, size_t bufsize)
3195  *
3196  * Return target name of a symbolic link.  The path is relative to the
3197  * directory associated with fd.
3198  */
3199 int
3200 sys_readlinkat(struct sysmsg *sysmsg, const struct readlinkat_args *uap)
3201 {
3202 	struct nlookupdata nd;
3203 	struct file *fp;
3204 	int error;
3205 
3206 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
3207 	if (error == 0) {
3208 		error = kern_readlink(&nd, uap->buf, uap->bufsize,
3209 					&sysmsg->sysmsg_result);
3210 	}
3211 	nlookup_done_at(&nd, fp);
3212 	return (error);
3213 }
3214 
3215 static int
3216 setfflags(struct vnode *vp, u_long flags)
3217 {
3218 	struct thread *td = curthread;
3219 	int error;
3220 	struct vattr vattr;
3221 
3222 	/*
3223 	 * Prevent non-root users from setting flags on devices.  When
3224 	 * a device is reused, users can retain ownership of the device
3225 	 * if they are allowed to set flags and programs assume that
3226 	 * chown can't fail when done as root.
3227 	 */
3228 	if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
3229 	    ((error =
3230 		caps_priv_check(td->td_ucred, SYSCAP_NOVFS_CHFLAGS_DEV)) != 0))
3231 	{
3232 		return (error);
3233 	}
3234 
3235 	/*
3236 	 * note: vget is required for any operation that might mod the vnode
3237 	 * so VINACTIVE is properly cleared.
3238 	 */
3239 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3240 		VATTR_NULL(&vattr);
3241 		vattr.va_flags = flags;
3242 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3243 		vput(vp);
3244 	}
3245 	return (error);
3246 }
3247 
3248 /*
3249  * chflags(const char *path, u_long flags)
3250  *
3251  * Change flags of a file given a path name.
3252  */
3253 int
3254 sys_chflags(struct sysmsg *sysmsg, const struct chflags_args *uap)
3255 {
3256 	struct nlookupdata nd;
3257 	struct vnode *vp;
3258 	int error;
3259 
3260 	vp = NULL;
3261 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3262 	if (error == 0)
3263 		error = nlookup(&nd);
3264 	if (error == 0)
3265 		error = ncp_writechk(&nd.nl_nch);
3266 	if (error == 0)
3267 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3268 	nlookup_done(&nd);
3269 	if (error == 0) {
3270 		error = setfflags(vp, uap->flags);
3271 		vrele(vp);
3272 	}
3273 	return (error);
3274 }
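
/*
 * Illustrative userland sketch (not compiled here): chflags(2) setting the
 * user "nodump" flag.  Note that the flags argument replaces the whole
 * flag word; flags on device nodes require privilege (setfflags() above
 * checks this) and the system SF_* flags are generally root-only.
 *
 *	#include <sys/stat.h>
 *	#include <err.h>
 *
 *	static void
 *	mark_nodump(const char *path)
 *	{
 *		if (chflags(path, UF_NODUMP) == -1)
 *			err(1, "chflags");
 *	}
 */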
3275 
3276 /*
3277  * lchflags(const char *path, u_long flags)
3278  *
3279  * Change flags of a file given a path name, but don't follow symlinks.
3280  */
3281 int
3282 sys_lchflags(struct sysmsg *sysmsg, const struct lchflags_args *uap)
3283 {
3284 	struct nlookupdata nd;
3285 	struct vnode *vp;
3286 	int error;
3287 
3288 	vp = NULL;
3289 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3290 	if (error == 0)
3291 		error = nlookup(&nd);
3292 	if (error == 0)
3293 		error = ncp_writechk(&nd.nl_nch);
3294 	if (error == 0)
3295 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3296 	nlookup_done(&nd);
3297 	if (error == 0) {
3298 		error = setfflags(vp, uap->flags);
3299 		vrele(vp);
3300 	}
3301 	return (error);
3302 }
3303 
3304 /*
3305  * fchflags_args(int fd, u_long flags)
3306  *
3307  * Change flags of a file given a file descriptor.
3308  */
3309 int
3310 sys_fchflags(struct sysmsg *sysmsg, const struct fchflags_args *uap)
3311 {
3312 	struct thread *td = curthread;
3313 	struct file *fp;
3314 	int error;
3315 
3316 	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
3317 		return (error);
3318 	if (fp->f_nchandle.ncp)
3319 		error = ncp_writechk(&fp->f_nchandle);
3320 	if (error == 0)
3321 		error = setfflags((struct vnode *) fp->f_data, uap->flags);
3322 	fdrop(fp);
3323 	return (error);
3324 }
3325 
3326 /*
3327  * chflagsat_args(int fd, const char *path, u_long flags, int atflags)
3328  * Change flags given a pathname relative to a file descriptor.
3329  */
3330 int
3331 sys_chflagsat(struct sysmsg *sysmsg, const struct chflagsat_args *uap)
3332 {
3333 	struct nlookupdata nd;
3334 	struct vnode *vp;
3335 	struct file *fp;
3336 	int error;
3337 	int lookupflags;
3338 
3339 	if (uap->atflags & ~AT_SYMLINK_NOFOLLOW)
3340 		return (EINVAL);
3341 
3342 	lookupflags = (uap->atflags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3343 
3344 	vp = NULL;
3345 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
				UIO_USERSPACE, lookupflags);
3346 	if (error == 0)
3347 		error = nlookup(&nd);
3348 	if (error == 0)
3349 		error = ncp_writechk(&nd.nl_nch);
3350 	if (error == 0)
3351 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3352 	nlookup_done_at(&nd, fp);
3353 	if (error == 0) {
3354 		error = setfflags(vp, uap->flags);
3355 		vrele(vp);
3356 	}
3357 	return (error);
3358 }
3359 
3360 
3361 static int
3362 setfmode(struct vnode *vp, int mode)
3363 {
3364 	struct thread *td = curthread;
3365 	int error;
3366 	struct vattr vattr;
3367 
3368 	/*
3369 	 * note: vget is required for any operation that might mod the vnode
3370 	 * so VINACTIVE is properly cleared.
3371 	 */
3372 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3373 		VATTR_NULL(&vattr);
3374 		vattr.va_mode = mode & ALLPERMS;
3375 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3376 		cache_inval_wxok(vp);
3377 		vput(vp);
3378 	}
3379 	return error;
3380 }
3381 
3382 int
3383 kern_chmod(struct nlookupdata *nd, int mode)
3384 {
3385 	struct vnode *vp;
3386 	int error;
3387 
3388 	if ((error = nlookup(nd)) != 0)
3389 		return (error);
3390 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3391 		return (error);
3392 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3393 		error = setfmode(vp, mode);
3394 	vrele(vp);
3395 	return (error);
3396 }
3397 
3398 /*
3399  * chmod_args(char *path, int mode)
3400  *
3401  * Change mode of a file given path name.
3402  */
3403 int
3404 sys_chmod(struct sysmsg *sysmsg, const struct chmod_args *uap)
3405 {
3406 	struct nlookupdata nd;
3407 	int error;
3408 
3409 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3410 	if (error == 0)
3411 		error = kern_chmod(&nd, uap->mode);
3412 	nlookup_done(&nd);
3413 	return (error);
3414 }
3415 
3416 /*
3417  * lchmod_args(char *path, int mode)
3418  *
3419  * Change mode of a file given path name (don't follow links).
3420  */
3421 int
3422 sys_lchmod(struct sysmsg *sysmsg, const struct lchmod_args *uap)
3423 {
3424 	struct nlookupdata nd;
3425 	int error;
3426 
3427 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3428 	if (error == 0)
3429 		error = kern_chmod(&nd, uap->mode);
3430 	nlookup_done(&nd);
3431 	return (error);
3432 }
3433 
3434 /*
3435  * fchmod_args(int fd, int mode)
3436  *
3437  * Change mode of a file given a file descriptor.
3438  */
3439 int
3440 sys_fchmod(struct sysmsg *sysmsg, const struct fchmod_args *uap)
3441 {
3442 	struct thread *td = curthread;
3443 	struct file *fp;
3444 	int error;
3445 
3446 	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
3447 		return (error);
3448 	if (fp->f_nchandle.ncp)
3449 		error = ncp_writechk(&fp->f_nchandle);
3450 	if (error == 0)
3451 		error = setfmode((struct vnode *)fp->f_data, uap->mode);
3452 	fdrop(fp);
3453 	return (error);
3454 }
3455 
3456  * fchmodat_args(int fd, char *path, int mode, int flags)
3457  * fchmodat_args(char *path, int mode)
3458  *
3459  * Change mode of a file pointed to by fd/path.
3460  */
3461 int
3462 sys_fchmodat(struct sysmsg *sysmsg, const struct fchmodat_args *uap)
3463 {
3464 	struct nlookupdata nd;
3465 	struct file *fp;
3466 	int error;
3467 	int flags;
3468 
3469 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3470 		return (EINVAL);
3471 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3472 
3473 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3474 				UIO_USERSPACE, flags);
3475 	if (error == 0)
3476 		error = kern_chmod(&nd, uap->mode);
3477 	nlookup_done_at(&nd, fp);
3478 	return (error);
3479 }
3480 
3481 static int
3482 setfown(struct mount *mp, struct vnode *vp, uid_t uid, gid_t gid)
3483 {
3484 	struct thread *td = curthread;
3485 	int error;
3486 	struct vattr vattr;
3487 	uid_t o_uid;
3488 	gid_t o_gid;
3489 	uint64_t size;
3490 
3491 	/*
3492 	 * note: vget is required for any operation that might mod the vnode
3493 	 * so VINACTIVE is properly cleared.
3494 	 */
3495 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3496 		if ((error = VOP_GETATTR(vp, &vattr)) != 0) {
3497 			vput(vp);	/* drop the ref/lock from vget() */
			return error;
		}
3498 		o_uid = vattr.va_uid;
3499 		o_gid = vattr.va_gid;
3500 		size = vattr.va_size;
3501 
3502 		VATTR_NULL(&vattr);
3503 		vattr.va_uid = uid;
3504 		vattr.va_gid = gid;
3505 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3506 		vput(vp);
3507 	}
3508 
3509 	if (error == 0) {
3510 		if (uid == -1)
3511 			uid = o_uid;
3512 		if (gid == -1)
3513 			gid = o_gid;
3514 		VFS_ACCOUNT(mp, o_uid, o_gid, -size);
3515 		VFS_ACCOUNT(mp,   uid,   gid,  size);
3516 	}
3517 
3518 	return error;
3519 }
3520 
3521 int
3522 kern_chown(struct nlookupdata *nd, int uid, int gid)
3523 {
3524 	struct vnode *vp;
3525 	int error;
3526 
3527 	if ((error = nlookup(nd)) != 0)
3528 		return (error);
3529 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3530 		return (error);
3531 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3532 		error = setfown(nd->nl_nch.mount, vp, uid, gid);
3533 	vrele(vp);
3534 	return (error);
3535 }
3536 
3537 /*
3538  * chown(char *path, int uid, int gid)
3539  *
3540  * Set ownership given a path name.
3541  */
3542 int
3543 sys_chown(struct sysmsg *sysmsg, const struct chown_args *uap)
3544 {
3545 	struct nlookupdata nd;
3546 	int error;
3547 
3548 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3549 	if (error == 0)
3550 		error = kern_chown(&nd, uap->uid, uap->gid);
3551 	nlookup_done(&nd);
3552 	return (error);
3553 }
3554 
3555 /*
3556  * lchown_args(char *path, int uid, int gid)
3557  *
3558  * Set ownership given a path name, do not cross symlinks.
3559  */
3560 int
3561 sys_lchown(struct sysmsg *sysmsg, const struct lchown_args *uap)
3562 {
3563 	struct nlookupdata nd;
3564 	int error;
3565 
3566 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3567 	if (error == 0)
3568 		error = kern_chown(&nd, uap->uid, uap->gid);
3569 	nlookup_done(&nd);
3570 	return (error);
3571 }
3572 
3573 /*
3574  * fchown_args(int fd, int uid, int gid)
3575  *
3576  * Set ownership given a file descriptor.
3577  */
3578 int
3579 sys_fchown(struct sysmsg *sysmsg, const struct fchown_args *uap)
3580 {
3581 	struct thread *td = curthread;
3582 	struct proc *p = td->td_proc;
3583 	struct file *fp;
3584 	int error;
3585 
3586 	if ((error = holdvnode(td, uap->fd, &fp)) != 0)
3587 		return (error);
3588 	if (fp->f_nchandle.ncp)
3589 		error = ncp_writechk(&fp->f_nchandle);
3590 	if (error == 0)
3591 		error = setfown(p->p_fd->fd_ncdir.mount,
3592 			(struct vnode *)fp->f_data, uap->uid, uap->gid);
3593 	fdrop(fp);
3594 	return (error);
3595 }
3596 
3597 /*
3598  * fchownat(int fd, char *path, int uid, int gid, int flags)
3599  *
3600  * Set ownership of file pointed to by fd/path.
3601  */
3602 int
3603 sys_fchownat(struct sysmsg *sysmsg, const struct fchownat_args *uap)
3604 {
3605 	struct nlookupdata nd;
3606 	struct file *fp;
3607 	int error;
3608 	int flags;
3609 
3610 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3611 		return (EINVAL);
3612 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3613 
3614 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3615 				UIO_USERSPACE, flags);
3616 	if (error == 0)
3617 		error = kern_chown(&nd, uap->uid, uap->gid);
3618 	nlookup_done_at(&nd, fp);
3619 	return (error);
3620 }
3621 
3622 
3623 static int
3624 getutimes(struct timeval *tvp, struct timespec *tsp)
3625 {
3626 	struct timeval tv[2];
3627 	int error;
3628 
3629 	if (tvp == NULL) {
3630 		microtime(&tv[0]);
3631 		TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
3632 		tsp[1] = tsp[0];
3633 	} else {
3634 		if ((error = itimerfix(tvp)) != 0)
3635 			return (error);
3636 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3637 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3638 	}
3639 	return 0;
3640 }
3641 
3642 static int
3643 getutimens(const struct timespec *ts, struct timespec *newts, int *nullflag)
3644 {
3645 	struct timespec tsnow;
3646 	int error;
3647 
3648 	*nullflag = 0;
3649 	nanotime(&tsnow);
3650 	if (ts == NULL) {
3651 		newts[0] = tsnow;
3652 		newts[1] = tsnow;
3653 		*nullflag = 1;
3654 		return (0);
3655 	}
3656 
3657 	newts[0] = ts[0];
3658 	newts[1] = ts[1];
3659 	if (newts[0].tv_nsec == UTIME_OMIT && newts[1].tv_nsec == UTIME_OMIT)
3660 		return (0);
3661 	if (newts[0].tv_nsec == UTIME_NOW && newts[1].tv_nsec == UTIME_NOW)
3662 		*nullflag = 1;
3663 
3664 	if (newts[0].tv_nsec == UTIME_OMIT)
3665 		newts[0].tv_sec = VNOVAL;
3666 	else if (newts[0].tv_nsec == UTIME_NOW)
3667 		newts[0] = tsnow;
3668 	else if ((error = itimespecfix(&newts[0])) != 0)
3669 		return (error);
3670 
3671 	if (newts[1].tv_nsec == UTIME_OMIT)
3672 		newts[1].tv_sec = VNOVAL;
3673 	else if (newts[1].tv_nsec == UTIME_NOW)
3674 		newts[1] = tsnow;
3675 	else if ((error = itimespecfix(&newts[1])) != 0)
3676 		return (error);
3677 
3678 	return (0);
3679 }
3680 
3681 static int
3682 setutimes(struct vnode *vp, struct vattr *vattr,
3683 	  const struct timespec *ts, int nullflag)
3684 {
3685 	struct thread *td = curthread;
3686 	int error;
3687 
3688 	VATTR_NULL(vattr);
3689 	vattr->va_atime = ts[0];
3690 	vattr->va_mtime = ts[1];
3691 	if (nullflag)
3692 		vattr->va_vaflags |= VA_UTIMES_NULL;
3693 	error = VOP_SETATTR(vp, vattr, td->td_ucred);
3694 
3695 	return error;
3696 }
3697 
3698 int
3699 kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
3700 {
3701 	struct timespec ts[2];
3702 	int error;
3703 
3704 	if (tptr) {
3705 		if ((error = getutimes(tptr, ts)) != 0)
3706 			return (error);
3707 	}
3708 	error = kern_utimensat(nd, tptr ? ts : NULL, 0);
3709 	return (error);
3710 }
3711 
3712 /*
3713  * utimes_args(char *path, struct timeval *tptr)
3714  *
3715  * Set the access and modification times of a file.
3716  */
3717 int
3718 sys_utimes(struct sysmsg *sysmsg, const struct utimes_args *uap)
3719 {
3720 	struct timeval tv[2];
3721 	struct nlookupdata nd;
3722 	int error;
3723 
3724 	if (uap->tptr) {
3725 		error = copyin(uap->tptr, tv, sizeof(tv));
3726 		if (error)
3727 			return (error);
3728 	}
3729 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3730 	if (error == 0)
3731 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3732 	nlookup_done(&nd);
3733 	return (error);
3734 }
3735 
3736 /*
3737  * lutimes_args(char *path, struct timeval *tptr)
3738  *
3739  * Set the access and modification times of a file.
3740  */
3741 int
3742 sys_lutimes(struct sysmsg *sysmsg, const struct lutimes_args *uap)
3743 {
3744 	struct timeval tv[2];
3745 	struct nlookupdata nd;
3746 	int error;
3747 
3748 	if (uap->tptr) {
3749 		error = copyin(uap->tptr, tv, sizeof(tv));
3750 		if (error)
3751 			return (error);
3752 	}
3753 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3754 	if (error == 0)
3755 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3756 	nlookup_done(&nd);
3757 	return (error);
3758 }
3759 
3760 /*
3761  * Set utimes on a file descriptor.  The creds used to open the
3762  * file are used to determine whether the operation is allowed
3763  * or not.
3764  */
3765 int
3766 kern_futimens(int fd, struct timespec *ts)
3767 {
3768 	struct thread *td = curthread;
3769 	struct timespec newts[2];
3770 	struct file *fp;
3771 	struct vnode *vp;
3772 	struct vattr vattr;
3773 	struct vattr_lite lva;
3774 	int nullflag;
3775 	int error;
3776 
3777 	error = getutimens(ts, newts, &nullflag);
3778 	if (error)
3779 		return (error);
3780 	if ((error = holdvnode(td, fd, &fp)) != 0)
3781 		return (error);
3782 	if (fp->f_nchandle.ncp)
3783 		error = ncp_writechk(&fp->f_nchandle);
3784 	if (error == 0) {
3785 		vp = fp->f_data;
3786 		error = vget(vp, LK_EXCLUSIVE);
3787 		if (error == 0) {
3788 			error = VOP_GETATTR_FP(vp, &vattr, fp);
3789 			if (error == 0) {
3790 				lva.va_type = vattr.va_type;
3791 				lva.va_nlink = vattr.va_nlink;
3792 				lva.va_mode = vattr.va_mode;
3793 				lva.va_uid = vattr.va_uid;
3794 				lva.va_gid = vattr.va_gid;
3795 				lva.va_size = vattr.va_size;
3796 				lva.va_flags = vattr.va_flags;
3797 
3798 				error = naccess_lva(&lva, NLC_OWN | NLC_WRITE,
3799 						   fp->f_cred);
3800 			}
3801 			if (error == 0) {
3802 				error = setutimes(vp, &vattr, newts, nullflag);
3803 			}
3804 			vput(vp);
3805 		}
3806 	}
3807 	fdrop(fp);
3808 	return (error);
3809 }
3810 
3811 /*
3812  * futimens_args(int fd, struct timespec *ts)
3813  *
3814  * Set the access and modification times of a file.
3815  */
3816 int
3817 sys_futimens(struct sysmsg *sysmsg, const struct futimens_args *uap)
3818 {
3819 	struct timespec ts[2];
3820 	int error;
3821 
3822 	if (uap->ts) {
3823 		error = copyin(uap->ts, ts, sizeof(ts));
3824 		if (error)
3825 			return (error);
3826 	}
3827 	error = kern_futimens(uap->fd, uap->ts ? ts : NULL);
3828 	return (error);
3829 }
3830 
3831 int
3832 kern_futimes(int fd, struct timeval *tptr)
3833 {
3834 	struct timespec ts[2];
3835 	int error;
3836 
3837 	if (tptr) {
3838 		if ((error = getutimes(tptr, ts)) != 0)
3839 			return (error);
3840 	}
3841 	error = kern_futimens(fd, tptr ? ts : NULL);
3842 	return (error);
3843 }
3844 
3845 /*
3846  * futimes_args(int fd, struct timeval *tptr)
3847  *
3848  * Set the access and modification times of a file.
3849  */
3850 int
3851 sys_futimes(struct sysmsg *sysmsg, const struct futimes_args *uap)
3852 {
3853 	struct timeval tv[2];
3854 	int error;
3855 
3856 	if (uap->tptr) {
3857 		error = copyin(uap->tptr, tv, sizeof(tv));
3858 		if (error)
3859 			return (error);
3860 	}
3861 	error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
3862 	return (error);
3863 }
3864 
3865 int
3866 kern_utimensat(struct nlookupdata *nd, const struct timespec *ts, int flags)
3867 {
3868 	struct timespec newts[2];
3869 	struct vnode *vp;
3870 	struct vattr vattr;
3871 	int nullflag;
3872 	int error;
3873 
3874 	if (flags & ~AT_SYMLINK_NOFOLLOW)
3875 		return (EINVAL);
3876 
3877 	error = getutimens(ts, newts, &nullflag);
3878 	if (error)
3879 		return (error);
3880 
3881 	nd->nl_flags |= NLC_OWN | NLC_WRITE;
3882 	if ((error = nlookup(nd)) != 0)
3883 		return (error);
3884 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3885 		return (error);
3886 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3887 		return (error);
3888 	if ((error = vn_writechk(vp)) == 0) {
3889 		error = vget(vp, LK_EXCLUSIVE);
3890 		if (error == 0) {
3891 			error = setutimes(vp, &vattr, newts, nullflag);
3892 			vput(vp);
3893 		}
3894 	}
3895 	vrele(vp);
3896 	return (error);
3897 }
3898 
3899 /*
3900  * utimensat_args(int fd, const char *path, const struct timespec *ts, int flags)
3901  *
3902  * Set the access and modification times of a file.
3903  */
3904 int
3905 sys_utimensat(struct sysmsg *sysmsg, const struct utimensat_args *uap)
3906 {
3907 	struct timespec ts[2];
3908 	struct nlookupdata nd;
3909 	struct file *fp;
3910 	int error;
3911 	int flags;
3912 
3913 	if (uap->ts) {
3914 		error = copyin(uap->ts, ts, sizeof(ts));
3915 		if (error)
3916 			return (error);
3917 	}
3918 
3919 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3920 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3921 	                        UIO_USERSPACE, flags);
3922 	if (error == 0)
3923 		error = kern_utimensat(&nd, uap->ts ? ts : NULL, uap->flags);
3924 	nlookup_done_at(&nd, fp);
3925 	return (error);
3926 }
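
/*
 * Illustrative userland sketch (not compiled here): utimensat(2) using the
 * UTIME_NOW/UTIME_OMIT specials interpreted by getutimens() above (tv_sec
 * is ignored when those specials are used).  Leaves the access time alone
 * and sets the modification time to "now".
 *
 *	#include <sys/stat.h>
 *	#include <fcntl.h>
 *	#include <err.h>
 *
 *	static void
 *	touch_mtime(const char *path)
 *	{
 *		struct timespec ts[2];
 *
 *		ts[0].tv_nsec = UTIME_OMIT;	// atime: unchanged
 *		ts[1].tv_nsec = UTIME_NOW;	// mtime: current time
 *		if (utimensat(AT_FDCWD, path, ts, 0) == -1)
 *			err(1, "utimensat");
 *	}
 */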
3927 
3928 int
3929 kern_truncate(struct nlookupdata *nd, off_t length)
3930 {
3931 	struct vnode *vp;
3932 	struct vattr vattr;
3933 	int error;
3934 	uid_t uid = 0;
3935 	gid_t gid = 0;
3936 	uint64_t old_size = 0;
3937 
3938 	if (length < 0)
3939 		return(EINVAL);
3940 	nd->nl_flags |= NLC_WRITE | NLC_TRUNCATE;
3941 	if ((error = nlookup(nd)) != 0)
3942 		return (error);
3943 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3944 		return (error);
3945 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3946 		return (error);
3947 	error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_FAILRECLAIM);
3948 	if (error) {
3949 		vrele(vp);
3950 		return (error);
3951 	}
3952 	if (vp->v_type == VDIR) {
3953 		error = EISDIR;
3954 		goto done;
3955 	}
3956 	if (vfs_quota_enabled) {
3957 		error = VOP_GETATTR(vp, &vattr);
3958 		KASSERT(error == 0, ("kern_truncate(): VOP_GETATTR didn't return 0"));
3959 		uid = vattr.va_uid;
3960 		gid = vattr.va_gid;
3961 		old_size = vattr.va_size;
3962 	}
3963 
3964 	if ((error = vn_writechk(vp)) == 0) {
3965 		VATTR_NULL(&vattr);
3966 		vattr.va_size = length;
3967 		error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
3968 		VFS_ACCOUNT(nd->nl_nch.mount, uid, gid, length - old_size);
3969 	}
3970 done:
3971 	vput(vp);
3972 	return (error);
3973 }
3974 
3975 /*
3976  * truncate(char *path, int pad, off_t length)
3977  *
3978  * Truncate a file given its path name.
3979  */
3980 int
3981 sys_truncate(struct sysmsg *sysmsg, const struct truncate_args *uap)
3982 {
3983 	struct nlookupdata nd;
3984 	int error;
3985 
3986 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3987 	if (error == 0)
3988 		error = kern_truncate(&nd, uap->length);
3989 	nlookup_done(&nd);
3990 	return error;
3991 }
3992 
3993 int
3994 kern_ftruncate(int fd, off_t length)
3995 {
3996 	struct thread *td = curthread;
3997 	struct vattr vattr;
3998 	struct vnode *vp;
3999 	struct file *fp;
4000 	int error;
4001 	uid_t uid = 0;
4002 	gid_t gid = 0;
4003 	uint64_t old_size = 0;
4004 	struct mount *mp;
4005 
4006 	if (length < 0)
4007 		return(EINVAL);
4008 	if ((error = holdvnode(td, fd, &fp)) != 0)
4009 		return (error);
4010 	if (fp->f_nchandle.ncp) {
4011 		error = ncp_writechk(&fp->f_nchandle);
4012 		if (error)
4013 			goto done;
4014 	}
4015 	if ((fp->f_flag & FWRITE) == 0) {
4016 		error = EINVAL;
4017 		goto done;
4018 	}
4019 	if (fp->f_flag & FAPPENDONLY) {	/* inode was set append-only */
4020 		error = EINVAL;
4021 		goto done;
4022 	}
4023 	vp = (struct vnode *)fp->f_data;
4024 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4025 	if (vp->v_type == VDIR) {
4026 		error = EISDIR;
4027 		vn_unlock(vp);
4028 		goto done;
4029 	}
4030 
4031 	if (vfs_quota_enabled) {
4032 		error = VOP_GETATTR_FP(vp, &vattr, fp);
4033 		KASSERT(error == 0, ("kern_ftruncate(): VOP_GETATTR didn't return 0"));
4034 		uid = vattr.va_uid;
4035 		gid = vattr.va_gid;
4036 		old_size = vattr.va_size;
4037 	}
4038 
4039 	if ((error = vn_writechk(vp)) == 0) {
4040 		VATTR_NULL(&vattr);
4041 		vattr.va_size = length;
4042 		error = VOP_SETATTR_FP(vp, &vattr, fp->f_cred, fp);
4043 		mp = vq_vptomp(vp);
4044 		VFS_ACCOUNT(mp, uid, gid, length - old_size);
4045 	}
4046 	vn_unlock(vp);
4047 done:
4048 	fdrop(fp);
4049 	return (error);
4050 }
4051 
4052 /*
4053  * ftruncate_args(int fd, int pad, off_t length)
4054  *
4055  * Truncate a file given a file descriptor.
4056  */
4057 int
4058 sys_ftruncate(struct sysmsg *sysmsg, const struct ftruncate_args *uap)
4059 {
4060 	int error;
4061 
4062 	error = kern_ftruncate(uap->fd, uap->length);
4063 
4064 	return (error);
4065 }
4066 
4067 int
4068 kern_fsync(int fd, bool fullsync)
4069 {
4070 	struct thread *td = curthread;
4071 	struct vnode *vp;
4072 	struct file *fp;
4073 	vm_object_t obj;
4074 	int error;
4075 
4076 	if ((error = holdvnode(td, fd, &fp)) != 0)
4077 		return (error);
4078 	vp = (struct vnode *)fp->f_data;
4079 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4080 	if ((obj = vp->v_object) != NULL) {
4081 		if (vp->v_mount == NULL ||
4082 		    (vp->v_mount->mnt_kern_flag & MNTK_NOMSYNC) == 0) {
4083 			vm_object_page_clean(obj, 0, 0, 0);
4084 		}
4085 	}
4086 	error = fullsync ?
4087 		VOP_FSYNC_FP(vp, MNT_WAIT, VOP_FSYNC_SYSCALL, fp) :
4088 		VOP_FDATASYNC_FP(vp, MNT_WAIT, VOP_FSYNC_SYSCALL, fp);
4089 	if (error == 0 && vp->v_mount)
4090 		error = buf_fsync(vp);
4091 	vn_unlock(vp);
4092 	fdrop(fp);
4093 
4094 	return (error);
4095 }
4096 
4097 /*
4098  * fsync(int fd)
4099  *
4100  * Sync an open file.
4101  */
4102 int
4103 sys_fsync(struct sysmsg *sysmsg, const struct fsync_args *uap)
4104 {
4105 	return (kern_fsync(uap->fd, true));
4106 }
4107 
4108 /*
4109  * fdatasync(int fd)
4110  *
4111  * Data-sync an open file.
4112  */
4113 int
4114 sys_fdatasync(struct sysmsg *sysmsg, const struct fdatasync_args *uap)
4115 {
4116 	return (kern_fsync(uap->fd, false));
4117 }
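
/*
 * Illustrative userland sketch (not compiled here) of the fsync(2) vs
 * fdatasync(2) choice implemented by kern_fsync() above: fdatasync() may
 * skip metadata that is not needed to read the data back, which can be
 * cheaper for append-heavy logs.  The helper name is hypothetical.
 *
 *	#include <unistd.h>
 *	#include <err.h>
 *
 *	static void
 *	flush_record(int fd, int need_metadata)
 *	{
 *		int r = need_metadata ? fsync(fd) : fdatasync(fd);
 *
 *		if (r == -1)
 *			err(1, "sync");
 *	}
 */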
4118 
4119 /*
4120  * rename op.
4121  *
4122  * NOTE: error == 0 with a NULL nl_dvp indicates a mount point; the
4123  *	 operation is disallowed, e.g. when /var/cache is a null-mount.
4125  */
4126 int
4127 kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
4128 {
4129 	struct nchandle fnchd;
4130 	struct nchandle tnchd;
4131 	struct namecache *ncp;
4132 	struct vnode *fdvp;
4133 	struct vnode *tdvp;
4134 	struct mount *mp;
4135 	struct mount *userenlk;
4136 	int error;
4137 	u_int fncp_gen;
4138 	u_int tncp_gen;
4139 
4140 	bwillinode(1);
4141 	fromnd->nl_flags |= NLC_REFDVP | NLC_RENAME_SRC;
4142 	if ((error = nlookup(fromnd)) != 0)
4143 		return (error);
4144 
4145 	/*
4146 	 * Attempt to rename a mount point (from or to)
4147 	 */
4148 	if (error == 0 && fromnd->nl_dvp == NULL)
4149 		return (EINVAL);
4150 
4151 	if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
4152 		return (ENOENT);
4153 	fnchd.mount = fromnd->nl_nch.mount;
4154 	cache_hold(&fnchd);
4155 
4156 	/*
4157 	 * unlock the source nch so we can lookup the target nch without
4158 	 * deadlocking.  The target may or may not exist so we do not check
4159 	 * for a target vp like kern_mkdir() and other creation functions do.
4160 	 *
4161 	 * The source and target directories are ref'd and rechecked after
4162 	 * everything is relocked to determine if the source or target file
4163 	 * has been renamed.
4164 	 */
4165 	KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
4166 	fromnd->nl_flags &= ~NLC_NCPISLOCKED;
4167 	fncp_gen = fromnd->nl_nch.ncp->nc_generation;
4168 
4169 	if (fromnd->nl_nch.ncp->nc_vp &&
4170 	    fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
4171 		userenlk = fnchd.mount;
4172 		cache_unlock(&fromnd->nl_nch);
4173 		lockmgr(&userenlk->mnt_renlock, LK_EXCLUSIVE);
4174 	} else {
4175 		userenlk = NULL;
4176 		cache_unlock(&fromnd->nl_nch);
4177 	}
4178 
4179 	/*
4180 	 * Lookup target
4181 	 */
4182 	tond->nl_flags |= NLC_RENAME_DST | NLC_REFDVP;
4183 	if ((error = nlookup(tond)) != 0) {
4184 		cache_drop(&fnchd);
4185 		goto done;
4186 	}
4187 	tncp_gen = tond->nl_nch.ncp->nc_generation;
4188 
4189 	/*
4190 	 * Attempt to rename a mount point (from or to)
4191 	 */
4192 	if (error == 0 && tond->nl_dvp == NULL) {
4193 		cache_drop(&fnchd);
4194 		error = ENOENT;
4195 		goto done;
4196 	}
4197 
4198 	if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
4199 		cache_drop(&fnchd);
4200 		error = ENOENT;
4201 		goto done;
4202 	}
4203 	tnchd.mount = tond->nl_nch.mount;
4204 	cache_hold(&tnchd);
4205 
4206 	/*
4207 	 * If the source and target are the same there is nothing to do
4208 	 */
4209 	if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
4210 		cache_drop(&fnchd);
4211 		cache_drop(&tnchd);
4212 		error = 0;
4213 		goto done;
4214 	}
4215 
4216 	/*
4217 	 * Mount points cannot be renamed or overwritten
4218 	 */
4219 	if ((fromnd->nl_nch.ncp->nc_flag | tond->nl_nch.ncp->nc_flag) &
4220 	    NCF_ISMOUNTPT
4221 	) {
4222 		cache_drop(&fnchd);
4223 		cache_drop(&tnchd);
4224 		error = EINVAL;
4225 		goto done;
4226 	}
4227 
4228 	/*
4229 	 * Lock all four namecache entries.  tond is already locked.
4230 	 */
4231 	cache_lock4_tondlocked(&fnchd, &fromnd->nl_nch,
4232 			       &tnchd, &tond->nl_nch,
4233 			       fromnd->nl_cred, tond->nl_cred);
4234 	fromnd->nl_flags |= NLC_NCPISLOCKED;
4235 
4236 	/*
4237 	 * If the namecache generation changed for either fromnd or tond,
4238 	 * we must retry.
4239 	 */
4240 	if (((fromnd->nl_nch.ncp->nc_generation - fncp_gen) & ~1) ||
4241 	    ((tond->nl_nch.ncp->nc_generation - tncp_gen) & ~1))
4242 	{
4243 		krateprintf(&krate_rename,
4244 			"kern_rename: retry due to race on: "
4245 			"\"%s\" -> \"%s\" (%d,%d)\n",
4246 			fromnd->nl_nch.ncp->nc_name,
4247 			tond->nl_nch.ncp->nc_name,
4248 			fromnd->nl_nch.ncp->nc_generation - fncp_gen,
4249 			tond->nl_nch.ncp->nc_generation - tncp_gen);
4250 		error = EAGAIN;
4251 		goto finish;
4252 	}
4253 
4254 	/*
4255 	 * If either fromnd or tond is marked destroyed, a ripout occurred
4256 	 * out from under us and we must retry.
4257 	 */
4258 	if ((fromnd->nl_nch.ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED)) ||
4259 	    fromnd->nl_nch.ncp->nc_vp == NULL ||
4260 	    (tond->nl_nch.ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED))) {
4261 		krateprintf(&krate_rename,
4262 			"kern_rename: retry due to ripout on: "
4263 			"\"%s\" -> \"%s\"\n",
4264 			fromnd->nl_nch.ncp->nc_name,
4265 			tond->nl_nch.ncp->nc_name);
4266 		error = EAGAIN;
4267 		goto finish;
4268 	}
4269 
4270 	/*
4271 	 * Make sure the parent directories' linkages are the same.  We have
4272 	 * already checked that fromnd and tond are not mount points so this
4273 	 * should not loop forever on a cross-mount.
4274 	 */
4275 	if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent ||
4276 	    tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
4277 		error = EAGAIN;
4278 		goto finish;
4279 	}
4280 
4281 	/*
4282 	 * Both the source and target must be within the same filesystem and
4283 	 * in the same filesystem as their parent directories within the
4284 	 * namecache topology.
4285 	 *
4286 	 * NOTE: fromnd's nc_mount or nc_vp could be NULL.
4287 	 */
4288 	mp = fnchd.mount;
4289 	if (mp != tnchd.mount || mp != fromnd->nl_nch.mount ||
4290 	    mp != tond->nl_nch.mount) {
4291 		error = EXDEV;
4292 		goto finish;
4293 	}
4294 
4295 	/*
4296 	 * Make sure the mount point is writable
4297 	 */
4298 	if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
4299 		goto finish;
4300 	}
4301 
4302 	/*
4303 	 * If the target exists and either the source or target is a directory,
4304 	 * then both must be directories.
4305 	 *
4306 	 * Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp might
4307 	 * have become NULL.
4308 	 */
4309 	if (tond->nl_nch.ncp->nc_vp) {
4310 		if (fromnd->nl_nch.ncp->nc_vp == NULL) {
4311 			error = ENOENT;
4312 		} else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
4313 			if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
4314 				error = ENOTDIR;
4315 		} else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
4316 			error = EISDIR;
4317 		}
4318 	}
4319 
4320 	/*
4321 	 * You cannot rename a source into itself or a subdirectory of itself.
4322 	 * We check this by traversing the target directory upwards looking
4323 	 * for a match against the source.
4324 	 *
4325 	 * Only required when renaming a directory, in which case userenlk is
4326 	 * non-NULL.
4327 	 */
4328 	if (__predict_false(userenlk && error == 0)) {
4329 		for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
4330 			if (fromnd->nl_nch.ncp == ncp) {
4331 				error = EINVAL;
4332 				break;
4333 			}
4334 		}
4335 	}
4336 
4337 	/*
4338 	 * Even though the namespaces are different, they may still represent
4339 	 * hardlinks to the same file.  The filesystem might have a hard time
4340 	 * with this so we issue a NREMOVE of the source instead of a NRENAME
4341 	 * when we detect the situation.
4342 	 */
4343 	if (error == 0) {
4344 		fdvp = fromnd->nl_dvp;
4345 		tdvp = tond->nl_dvp;
4346 		if (fdvp == NULL || tdvp == NULL) {
4347 			error = EPERM;
4348 		} else if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
4349 			error = VOP_NREMOVE(&fromnd->nl_nch, fdvp,
4350 					    fromnd->nl_cred);
4351 		} else {
4352 			error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
4353 					    fdvp, tdvp, tond->nl_cred);
4354 		}
4355 	}
4356 finish:
4357 	cache_put(&tnchd);
4358 	cache_put(&fnchd);
4359 done:
4360 	if (userenlk)
4361 		lockmgr(&userenlk->mnt_renlock, LK_RELEASE);
4362 	return (error);
4363 }
4364 
4365 /*
4366  * rename_args(char *from, char *to)
4367  *
4368  * Rename files.  Source and destination must either both be directories,
4369  * or both not be directories.  If target is a directory, it must be empty.
4370  */
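/*
 * Example (userland sketch): the libc rename(2) wrapper, assuming the
 * standard prototype from <stdio.h>.  The EAGAIN retry on namecache
 * races is handled here in the kernel, so userland only sees the final
 * result.
 *
 *	#include <stdio.h>
 *
 *	if (rename("/tmp/app.conf.tmp", "/tmp/app.conf") == -1)
 *		perror("rename");
 */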
4371 int
4372 sys_rename(struct sysmsg *sysmsg, const struct rename_args *uap)
4373 {
4374 	struct nlookupdata fromnd, tond;
4375 	int error;
4376 
4377 	do {
4378 		error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
4379 		if (error == 0) {
4380 			error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
4381 			if (error == 0)
4382 				error = kern_rename(&fromnd, &tond);
4383 			nlookup_done(&tond);
4384 		}
4385 		nlookup_done(&fromnd);
4386 	} while (error == EAGAIN);
4387 	return (error);
4388 }
4389 
4390 /*
4391  * renameat_args(int oldfd, char *old, int newfd, char *new)
4392  *
4393  * Rename files using paths relative to the directories associated with
4394  * oldfd and newfd.  Source and destination must either both be directories,
4395  * or both not be directories.  If target is a directory, it must be empty.
4396  */
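/*
 * Example (userland sketch): renameat(2) with both names relative to an
 * open directory descriptor, assuming the POSIX prototype from
 * <stdio.h>.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int dfd = open("/var/db", O_RDONLY);	// illustrative directory
 *	if (dfd >= 0) {
 *		if (renameat(dfd, "cache.tmp", dfd, "cache") == -1)
 *			perror("renameat");
 *		close(dfd);
 *	}
 */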
4397 int
4398 sys_renameat(struct sysmsg *sysmsg, const struct renameat_args *uap)
4399 {
4400 	struct nlookupdata oldnd, newnd;
4401 	struct file *oldfp, *newfp;
4402 	int error;
4403 
4404 	do {
4405 		error = nlookup_init_at(&oldnd, &oldfp,
4406 					uap->oldfd, uap->old,
4407 					UIO_USERSPACE, 0);
4408 		if (error == 0) {
4409 			error = nlookup_init_at(&newnd, &newfp,
4410 						uap->newfd, uap->new,
4411 						UIO_USERSPACE, 0);
4412 			if (error == 0)
4413 				error = kern_rename(&oldnd, &newnd);
4414 			nlookup_done_at(&newnd, newfp);
4415 		}
4416 		nlookup_done_at(&oldnd, oldfp);
4417 	} while (error == EAGAIN);
4418 	return (error);
4419 }
4420 
4421 int
4422 kern_mkdir(struct nlookupdata *nd, int mode)
4423 {
4424 	struct thread *td = curthread;
4425 	struct proc *p = td->td_proc;
4426 	struct vnode *vp;
4427 	struct vattr vattr;
4428 	int error;
4429 
4430 	bwillinode(1);
4431 	nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE | NLC_REFDVP;
4432 	if ((error = nlookup(nd)) != 0)
4433 		return (error);
4434 
4435 	if (nd->nl_nch.ncp->nc_vp)
4436 		return (EEXIST);
4437 	if (nd->nl_dvp == NULL)
4438 		return (EINVAL);
4439 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
4440 		return (error);
4441 	VATTR_NULL(&vattr);
4442 	vattr.va_type = VDIR;
4443 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;
4444 
4445 	vp = NULL;
4446 	error = VOP_NMKDIR(&nd->nl_nch, nd->nl_dvp, &vp, td->td_ucred, &vattr);
4447 	if (error == 0)
4448 		vput(vp);
4449 	return (error);
4450 }
4451 
4452 /*
4453  * mkdir_args(char *path, int mode)
4454  *
4455  * Make a directory file.
4456  */
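/*
 * Example (userland sketch): mkdir(2) with the standard prototype from
 * <sys/stat.h>.  As kern_mkdir() shows above, the requested mode is
 * masked by the per-process umask (fd_cmask) before the VOP is issued.
 *
 *	#include <sys/stat.h>
 *	#include <stdio.h>
 *
 *	if (mkdir("/tmp/newdir", 0755) == -1)	// illustrative path
 *		perror("mkdir");
 */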
4457 int
4458 sys_mkdir(struct sysmsg *sysmsg, const struct mkdir_args *uap)
4459 {
4460 	struct nlookupdata nd;
4461 	int error;
4462 
4463 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
4464 	if (error == 0)
4465 		error = kern_mkdir(&nd, uap->mode);
4466 	nlookup_done(&nd);
4467 	return (error);
4468 }
4469 
4470 /*
4471  * mkdirat_args(int fd, char *path, mode_t mode)
4472  *
4473  * Make a directory file.  The path is relative to the directory associated
4474  * with fd.
4475  */
4476 int
4477 sys_mkdirat(struct sysmsg *sysmsg, const struct mkdirat_args *uap)
4478 {
4479 	struct nlookupdata nd;
4480 	struct file *fp;
4481 	int error;
4482 
4483 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
4484 	if (error == 0)
4485 		error = kern_mkdir(&nd, uap->mode);
4486 	nlookup_done_at(&nd, fp);
4487 	return (error);
4488 }
4489 
4490 int
4491 kern_rmdir(struct nlookupdata *nd)
4492 {
4493 	int error;
4494 
4495 	bwillinode(1);
4496 	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
4497 	if ((error = nlookup(nd)) != 0)
4498 		return (error);
4499 
4500 	/*
4501 	 * Do not allow directories representing mount points to be
4502 	 * deleted, even if empty.  Check write perms on mount point
4503 	 * in case the vnode is aliased (aka nullfs).
4504 	 */
4505 	if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
4506 		return (EBUSY);
4507 	if (nd->nl_dvp == NULL)
4508 		return (EINVAL);
4509 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
4510 		return (error);
4511 	error = VOP_NRMDIR(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
4512 	return (error);
4513 }
4514 
4515 /*
4516  * rmdir_args(char *path)
4517  *
4518  * Remove a directory file.
4519  */
4520 int
4521 sys_rmdir(struct sysmsg *sysmsg, const struct rmdir_args *uap)
4522 {
4523 	struct nlookupdata nd;
4524 	int error;
4525 
4526 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
4527 	if (error == 0)
4528 		error = kern_rmdir(&nd);
4529 	nlookup_done(&nd);
4530 	return (error);
4531 }
4532 
4533 int
4534 kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
4535 		   enum uio_seg direction)
4536 {
4537 	struct thread *td = curthread;
4538 	struct vnode *vp;
4539 	struct file *fp;
4540 	struct uio auio;
4541 	struct iovec aiov;
4542 	off_t loff;
4543 	int error, eofflag;
4544 
4545 	if ((error = holdvnode(td, fd, &fp)) != 0)
4546 		return (error);
4547 	if ((fp->f_flag & FREAD) == 0) {
4548 		error = EBADF;
4549 		goto done;
4550 	}
4551 	vp = (struct vnode *)fp->f_data;
4552 	if (vp->v_type != VDIR) {
4553 		error = EINVAL;
4554 		goto done;
4555 	}
4556 	aiov.iov_base = buf;
4557 	aiov.iov_len = count;
4558 	auio.uio_iov = &aiov;
4559 	auio.uio_iovcnt = 1;
4560 	auio.uio_rw = UIO_READ;
4561 	auio.uio_segflg = direction;
4562 	auio.uio_td = td;
4563 	auio.uio_resid = count;
4564 	loff = auio.uio_offset = fp->f_offset;
4565 	error = VOP_READDIR_FP(vp, &auio, fp->f_cred, &eofflag, NULL, NULL, fp);
4566 	fp->f_offset = auio.uio_offset;
4567 	if (error)
4568 		goto done;
4569 
4570 	/*
4571 	 * WARNING!  *basep may not be wide enough to accommodate the
4572 	 * seek offset.   XXX should we hack this to return the upper 32 bits
4573 	 * for offsets greater than 4G?
4574 	 */
4575 	if (basep) {
4576 		*basep = (long)loff;
4577 	}
4578 	*res = count - auio.uio_resid;
4579 done:
4580 	fdrop(fp);
4581 	return (error);
4582 }
4583 
4584 /*
4585  * getdirentries_args(int fd, char *buf, u_int count, long *basep)
4586  *
4587  * Read a block of directory entries in a file system independent format.
4588  */
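/*
 * Example (userland sketch): most programs consume this interface
 * through opendir(3)/readdir(3), whose buffer is typically refilled
 * with getdirentries() underneath.
 *
 *	#include <dirent.h>
 *	#include <stdio.h>
 *
 *	DIR *dirp = opendir("/etc");
 *	struct dirent *de;
 *
 *	if (dirp != NULL) {
 *		while ((de = readdir(dirp)) != NULL)
 *			printf("%s\n", de->d_name);
 *		closedir(dirp);
 *	}
 */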
4589 int
4590 sys_getdirentries(struct sysmsg *sysmsg, const struct getdirentries_args *uap)
4591 {
4592 	long base;
4593 	int error;
4594 
4595 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
4596 				   &sysmsg->sysmsg_result, UIO_USERSPACE);
4597 
4598 	if (error == 0 && uap->basep)
4599 		error = copyout(&base, uap->basep, sizeof(*uap->basep));
4600 	return (error);
4601 }
4602 
4603 /*
4604  * getdents_args(int fd, char *buf, size_t count)
4605  */
4606 int
4607 sys_getdents(struct sysmsg *sysmsg, const struct getdents_args *uap)
4608 {
4609 	int error;
4610 
4611 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
4612 				   &sysmsg->sysmsg_result, UIO_USERSPACE);
4613 
4614 	return (error);
4615 }
4616 
4617 /*
4618  * Set the mode mask for creation of filesystem nodes.
4619  *
4620  * umask(int newmask)
4621  */
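/*
 * Example (userland sketch): umask(2) cannot fail and always returns
 * the previous mask, so reading the mask non-destructively takes two
 * calls.  Assumes the standard prototype from <sys/stat.h>.
 *
 *	#include <sys/stat.h>
 *
 *	mode_t old = umask(022);	// install new mask, get old one back
 *	umask(old);			// restore the original
 */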
4622 int
4623 sys_umask(struct sysmsg *sysmsg, const struct umask_args *uap)
4624 {
4625 	struct thread *td = curthread;
4626 	struct proc *p = td->td_proc;
4627 	struct filedesc *fdp;
4628 
4629 	fdp = p->p_fd;
4630 	sysmsg->sysmsg_result = fdp->fd_cmask;
4631 	fdp->fd_cmask = uap->newmask & ALLPERMS;
4632 	return (0);
4633 }
4634 
4635 /*
4636  * revoke(char *path)
4637  *
4638  * Void all references to file by ripping underlying filesystem
4639  * away from vnode.
4640  */
4641 int
4642 sys_revoke(struct sysmsg *sysmsg, const struct revoke_args *uap)
4643 {
4644 	struct nlookupdata nd;
4645 	struct vattr vattr;
4646 	struct vnode *vp;
4647 	struct ucred *cred;
4648 	int error;
4649 
4650 	vp = NULL;
4651 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4652 	if (error == 0)
4653 		error = nlookup(&nd);
4654 	if (error == 0)
4655 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
4656 	cred = crhold(nd.nl_cred);
4657 	nlookup_done(&nd);
4658 	if (error == 0) {
4659 		error = VOP_GETATTR(vp, &vattr);
4661 		if (error == 0 && cred->cr_uid != vattr.va_uid)
4662 			error = caps_priv_check(cred, SYSCAP_NOVFS_REVOKE);
4663 		if (error == 0 && (vp->v_type == VCHR || vp->v_type == VBLK)) {
4664 			if (vcount(vp) > 0)
4665 				error = vrevoke(vp, cred);
4666 		} else if (error == 0) {
4667 			error = vrevoke(vp, cred);
4668 		}
4669 		vrele(vp);
4670 	}
4671 	if (cred)
4672 		crfree(cred);
4673 	return (error);
4674 }
4675 
4676 /*
4677  * getfh_args(char *fname, fhandle_t *fhp)
4678  *
4679  * Get (NFS) file handle
4680  *
4681  * NOTE: We use the fsid of the covering mount, even if it is a nullfs
4682  * mount.  This allows nullfs mounts to be explicitly exported.
4683  *
4684  * WARNING: nullfs mounts of HAMMER PFS ROOTs are safe.
4685  *
4686  * 	    nullfs mounts of subdirectories are not safe.  That is, it will
4687  *	    work, but you do not really have protection against access to
4688  *	    the related parent directories.
4689  */
4690 int
4691 sys_getfh(struct sysmsg *sysmsg, const struct getfh_args *uap)
4692 {
4693 	struct nlookupdata nd;
4694 	fhandle_t fh;
4695 	struct vnode *vp;
4696 	struct mount *mp;
4697 	int error;
4698 
4699 	/*
4700 	 * Must be super user
4701 	 */
4702 	if ((error = caps_priv_check_self(SYSCAP_RESTRICTEDROOT)) != 0)
4703 		return (error);
4704 
4705 	vp = NULL;
4706 	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
4707 	if (error == 0)
4708 		error = nlookup(&nd);
4709 	if (error == 0)
4710 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
4711 	mp = nd.nl_nch.mount;
4712 	nlookup_done(&nd);
4713 	if (error == 0) {
4714 		bzero(&fh, sizeof(fh));
4715 		fh.fh_fsid = mp->mnt_stat.f_fsid;
4716 		error = VFS_VPTOFH(vp, &fh.fh_fid);
4717 		vput(vp);
4718 		if (error == 0)
4719 			error = copyout(&fh, uap->fhp, sizeof(fh));
4720 	}
4721 	return (error);
4722 }
4723 
4724 /*
4725  * fhopen_args(const struct fhandle *u_fhp, int flags)
4726  *
4727  * syscall for the rpc.lockd to use to translate an NFS file handle into
4728  * an open descriptor.
4729  *
4730  * WARNING: Do not remove the caps_priv_check() call or this becomes
4731  *	    one giant security hole.
4732  */
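/*
 * Example (userland sketch, super-user only): obtaining a file handle
 * with getfh(2) and re-opening it later with fhopen(2), assuming the
 * BSD prototypes from <sys/param.h> and <sys/mount.h>.  O_CREAT is
 * rejected and at least one of read/write must be requested.
 *
 *	#include <sys/param.h>
 *	#include <sys/mount.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	fhandle_t fh;
 *
 *	if (getfh("/export/somefile", &fh) == 0) {	// illustrative path
 *		int fd = fhopen(&fh, O_RDWR);		// no path lookup
 *		if (fd >= 0)
 *			close(fd);
 *	}
 */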
4733 int
4734 sys_fhopen(struct sysmsg *sysmsg, const struct fhopen_args *uap)
4735 {
4736 	struct thread *td = curthread;
4737 	struct filedesc *fdp = td->td_proc->p_fd;
4738 	struct mount *mp;
4739 	struct vnode *vp;
4740 	struct fhandle fhp;
4741 	struct vattr vat;
4742 	struct vattr *vap = &vat;
4743 	struct flock lf;
4744 	int fmode, mode, error = 0, type;
4745 	struct file *nfp;
4746 	struct file *fp;
4747 	int indx;
4748 
4749 	/*
4750 	 * Must be super user
4751 	 */
4752 	error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
4753 	if (error)
4754 		return (error);
4755 
4756 	fmode = FFLAGS(uap->flags);
4757 
4758 	/*
4759 	 * Why not allow a non-read/write open for our lockd?
4760 	 */
4761 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4762 		return (EINVAL);
4763 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4764 	if (error)
4765 		return(error);
4766 
4767 	/*
4768 	 * Find the mount point
4769 	 */
4770 	mp = vfs_getvfs(&fhp.fh_fsid);
4771 	if (mp == NULL) {
4772 		error = ESTALE;
4773 		goto done2;
4774 	}
4775 	/* now give me my vnode, it gets returned to me locked */
4776 	error = VFS_FHTOVP(mp, NULL, &fhp.fh_fid, &vp);
4777 	if (error)
4778 		goto done;
4779 	/*
4780 	 * From now on we have to make sure not to forget about
4781 	 * the vnode.
4782 	 * Any error that causes an abort must vput(vp): just set
4783 	 * error = err and 'goto bad;'.
4784 	 */
4785 
4786 	/*
4787 	 * from vn_open
4788 	 */
4789 	if (vp->v_type == VLNK) {
4790 		error = EMLINK;
4791 		goto bad;
4792 	}
4793 	if (vp->v_type == VSOCK) {
4794 		error = EOPNOTSUPP;
4795 		goto bad;
4796 	}
4797 	mode = 0;
4798 	if (fmode & (FWRITE | O_TRUNC)) {
4799 		if (vp->v_type == VDIR) {
4800 			error = EISDIR;
4801 			goto bad;
4802 		}
4803 		error = vn_writechk(vp);
4804 		if (error)
4805 			goto bad;
4806 		mode |= VWRITE;
4807 	}
4808 	if (fmode & FREAD)
4809 		mode |= VREAD;
4810 	if (mode) {
4811 		error = VOP_ACCESS(vp, mode, td->td_ucred);
4812 		if (error)
4813 			goto bad;
4814 	}
4815 	if (fmode & O_TRUNC) {
4816 		vn_unlock(vp);				/* XXX */
4817 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
4818 		VATTR_NULL(vap);
4819 		vap->va_size = 0;
4820 		error = VOP_SETATTR(vp, vap, td->td_ucred);
4821 		if (error)
4822 			goto bad;
4823 	}
4824 
4825 	/*
4826 	 * VOP_OPEN needs the file pointer so it can potentially override
4827 	 * it.
4828 	 *
4829 	 * WARNING! no f_nchandle will be associated when fhopen()ing a
4830 	 * directory.  XXX
4831 	 */
4832 	if ((error = falloc(td->td_lwp, &nfp, &indx)) != 0)
4833 		goto bad;
4834 	error = VOP_OPEN(vp, fmode, td->td_ucred, &nfp);
4835 	fp = nfp;
4836 
4837 	if (error) {
4838 		/*
4839 		 * setting f_ops this way prevents VOP_CLOSE from being
4840 		 * called or fdrop() releasing the vp from v_data.   Since
4841 		 * the VOP_OPEN failed we don't want to VOP_CLOSE.
4842 		 */
4843 		fp->f_ops = &badfileops;
4844 		fp->f_data = NULL;
4845 		goto bad_drop;
4846 	}
4847 
4848 	/*
4849 	 * The fp is given its own reference; we still have our ref and lock.
4850 	 *
4851 	 * Assert that all regular files must be created with a VM object.
4852 	 */
4853 	if (vp->v_type == VREG && vp->v_object == NULL) {
4854 		kprintf("fhopen: regular file did not "
4855 			"have VM object: %p\n",
4856 			vp);
4857 		goto bad_drop;
4858 	}
4859 
4860 	/*
4861 	 * The open was successful.  Handle any locking requirements.
4862 	 */
4863 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
4864 		lf.l_whence = SEEK_SET;
4865 		lf.l_start = 0;
4866 		lf.l_len = 0;
4867 		if (fmode & O_EXLOCK)
4868 			lf.l_type = F_WRLCK;
4869 		else
4870 			lf.l_type = F_RDLCK;
4871 		if (fmode & FNONBLOCK)
4872 			type = 0;
4873 		else
4874 			type = F_WAIT;
4875 		vn_unlock(vp);
4876 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK,
4877 					 &lf, type)) != 0) {
4878 			/*
4879 			 * release our private reference.
4880 			 */
4881 			fsetfd(fdp, NULL, indx);
4882 			fdrop(fp);
4883 			vrele(vp);
4884 			goto done;
4885 		}
4886 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4887 		atomic_set_int(&fp->f_flag, FHASLOCK);	/* race ok */
4888 	}
4889 
4890 	/*
4891 	 * Clean up.  Associate the file pointer with the previously
4892 	 * reserved descriptor and return it.
4893 	 */
4894 	vput(vp);
4895 	if (uap->flags & O_CLOEXEC)
4896 		fdp->fd_files[indx].fileflags |= UF_EXCLOSE;
4897 	fsetfd(fdp, fp, indx);
4898 	fdrop(fp);
4899 	sysmsg->sysmsg_result = indx;
4900 	mount_drop(mp);
4901 
4902 	return (error);
4903 
4904 bad_drop:
4905 	fsetfd(fdp, NULL, indx);
4906 	fdrop(fp);
4907 bad:
4908 	vput(vp);
4909 done:
4910 	mount_drop(mp);
4911 done2:
4912 	return (error);
4913 }
4914 
4915 /*
4916  * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
4917  */
4918 int
4919 sys_fhstat(struct sysmsg *sysmsg, const struct fhstat_args *uap)
4920 {
4921 	struct thread *td = curthread;
4922 	struct stat sb;
4923 	fhandle_t fh;
4924 	struct mount *mp;
4925 	struct vnode *vp;
4926 	int error;
4927 
4928 	/*
4929 	 * Must be super user
4930 	 */
4931 	error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
4932 	if (error)
4933 		return (error);
4934 
4935 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4936 	if (error)
4937 		return (error);
4938 
4939 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
4940 		error = ESTALE;
4941 	if (error == 0) {
4942 		if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) == 0) {
4943 			error = vn_stat(vp, &sb, td->td_ucred);
4944 			vput(vp);
4945 		}
4946 	}
4947 	if (error == 0)
4948 		error = copyout(&sb, uap->sb, sizeof(sb));
4949 	if (mp)
4950 		mount_drop(mp);
4951 
4952 	return (error);
4953 }
4954 
4955 /*
4956  * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
4957  */
4958 int
4959 sys_fhstatfs(struct sysmsg *sysmsg, const struct fhstatfs_args *uap)
4960 {
4961 	struct thread *td = curthread;
4962 	struct proc *p = td->td_proc;
4963 	struct statfs *sp;
4964 	struct mount *mp;
4965 	struct vnode *vp;
4966 	struct statfs sb;
4967 	char *fullpath, *freepath;
4968 	fhandle_t fh;
4969 	int error;
4970 
4971 	/*
4972 	 * Must be super user
4973 	 */
4974 	error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT);
4975 	if (error)
4976 		return (error);
4977 
4978 	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
4979 		return (error);
4980 
4981 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
4982 		error = ESTALE;
4983 		goto done;
4984 	}
4985 	if (p != NULL && !chroot_visible_mnt(mp, p)) {
4986 		error = ESTALE;
4987 		goto done;
4988 	}
4989 
4990 	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) != 0)
4991 		goto done;
4992 	mp = vp->v_mount;
4993 	sp = &mp->mnt_stat;
4994 	vput(vp);
4995 	if ((error = VFS_STATFS(mp, sp, td->td_ucred)) != 0)
4996 		goto done;
4997 
4998 	error = mount_path(p, mp, &fullpath, &freepath);
4999 	if (error)
5000 		goto done;
5001 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
5002 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
5003 	kfree(freepath, M_TEMP);
5004 
5005 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
5006 	if (caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT)) {
5007 		bcopy(sp, &sb, sizeof(sb));
5008 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
5009 		sp = &sb;
5010 	}
5011 	error = copyout(sp, uap->buf, sizeof(*sp));
5012 done:
5013 	if (mp)
5014 		mount_drop(mp);
5015 
5016 	return (error);
5017 }
5018 
5019 /*
5020  * fhstatvfs_args(struct fhandle *u_fhp, struct statvfs *buf)
5021  */
5022 int
5023 sys_fhstatvfs(struct sysmsg *sysmsg, const struct fhstatvfs_args *uap)
5024 {
5025 	struct thread *td = curthread;
5026 	struct proc *p = td->td_proc;
5027 	struct statvfs *sp;
5028 	struct mount *mp;
5029 	struct vnode *vp;
5030 	fhandle_t fh;
5031 	int error;
5032 
5033 	/*
5034 	 * Must be super user
5035 	 */
5036 	if ((error = caps_priv_check_td(td, SYSCAP_RESTRICTEDROOT)))
5037 		return (error);
5038 
5039 	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
5040 		return (error);
5041 
5042 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
5043 		error = ESTALE;
5044 		goto done;
5045 	}
5046 	if (p != NULL && !chroot_visible_mnt(mp, p)) {
5047 		error = ESTALE;
5048 		goto done;
5049 	}
5050 
5051 	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)))
5052 		goto done;
5053 	mp = vp->v_mount;
5054 	sp = &mp->mnt_vstat;
5055 	vput(vp);
5056 	if ((error = VFS_STATVFS(mp, sp, td->td_ucred)) != 0)
5057 		goto done;
5058 
5059 	sp->f_flag = 0;
5060 	if (mp->mnt_flag & MNT_RDONLY)
5061 		sp->f_flag |= ST_RDONLY;
5062 	if (mp->mnt_flag & MNT_NOSUID)
5063 		sp->f_flag |= ST_NOSUID;
5064 	error = copyout(sp, uap->buf, sizeof(*sp));
5065 done:
5066 	if (mp)
5067 		mount_drop(mp);
5068 	return (error);
5069 }
5070 
5071 
5072 /*
5073  * Syscall to push extended attribute configuration information into the
5074  * VFS.  Accepts a path, which it converts to a mountpoint, as well as
5075  * a command (int cmd), an attribute name, and misc data.  For now, the
5076  * attribute name is left in userspace for consumption by the VFS_op.
5077  * It will probably be changed to be copied into sysspace by the
5078  * syscall in the future, once the issues with the various consumers
5079  * of the attribute code have been worked out.
5080  *
5081  * Currently this is used only by UFS Extended Attributes.
5082  */
5083 int
5084 sys_extattrctl(struct sysmsg *sysmsg, const struct extattrctl_args *uap)
5085 {
5086 	struct nlookupdata nd;
5087 	struct vnode *vp;
5088 	char attrname[EXTATTR_MAXNAMELEN];
5089 	int error;
5090 	size_t size;
5091 
5092 	attrname[0] = 0;
5093 	vp = NULL;
5094 	error = 0;
5095 
5096 	if (error == 0 && uap->filename) {
5097 		error = nlookup_init(&nd, uap->filename, UIO_USERSPACE,
5098 				     NLC_FOLLOW);
5099 		if (error == 0)
5100 			error = nlookup(&nd);
5101 		if (error == 0)
5102 			error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
5103 		nlookup_done(&nd);
5104 	}
5105 
5106 	if (error == 0 && uap->attrname) {
5107 		error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
5108 				  &size);
5109 	}
5110 
5111 	if (error == 0) {
5112 		error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5113 		if (error == 0)
5114 			error = nlookup(&nd);
5115 		if (error == 0)
5116 			error = ncp_writechk(&nd.nl_nch);
5117 		if (error == 0) {
5118 			error = VFS_EXTATTRCTL(nd.nl_nch.mount, uap->cmd, vp,
5119 					       uap->attrnamespace,
5120 					       uap->attrname, nd.nl_cred);
5121 		}
5122 		nlookup_done(&nd);
5123 	}
5124 
5125 	return (error);
5126 }
5127 
5128 /*
5129  * Syscall to set a named extended attribute on a file or directory.
5130  */
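/*
 * Example (userland sketch): setting and reading back a user-namespace
 * attribute, assuming the BSD-style wrappers declared in
 * <sys/extattr.h>.
 *
 *	#include <sys/types.h>
 *	#include <sys/extattr.h>
 *
 *	char val[64];
 *	ssize_t n;
 *
 *	extattr_set_file("notes.txt", EXTATTR_NAMESPACE_USER,
 *			 "comment", "draft", 5);
 *	n = extattr_get_file("notes.txt", EXTATTR_NAMESPACE_USER,
 *			     "comment", val, sizeof(val));
 */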
5131 int
5132 sys_extattr_set_file(struct sysmsg *sysmsg,
5133 		     const struct extattr_set_file_args *uap)
5134 {
5135 	char attrname[EXTATTR_MAXNAMELEN];
5136 	struct nlookupdata nd;
5137 	struct vnode *vp;
5138 	struct uio auio;
5139 	struct iovec aiov;
5140 	int error;
5141 
5142 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
5143 	if (error)
5144 		return (error);
5145 
5146 	vp = NULL;
5147 
5148 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5149 	if (error == 0)
5150 		error = nlookup(&nd);
5151 	if (error == 0)
5152 		error = ncp_writechk(&nd.nl_nch);
5153 	if (error == 0)
5154 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
5155 	if (error) {
5156 		nlookup_done(&nd);
5157 		return (error);
5158 	}
5159 
5160 	bzero(&auio, sizeof(auio));
5161 	aiov.iov_base = uap->data;
5162 	aiov.iov_len = uap->nbytes;
5163 	auio.uio_iov = &aiov;
5164 	auio.uio_iovcnt = 1;
5165 	auio.uio_offset = 0;
5166 	auio.uio_resid = uap->nbytes;
5167 	auio.uio_rw = UIO_WRITE;
5168 	auio.uio_td = curthread;
5169 
5170 	error = VOP_SETEXTATTR(vp, uap->attrnamespace, attrname,
5171 			       &auio, nd.nl_cred);
5172 
5173 	vput(vp);
5174 	nlookup_done(&nd);
5175 	return (error);
5176 }
5177 
5178 /*
5179  * Syscall to get a named extended attribute on a file or directory.
5180  */
5181 int
5182 sys_extattr_get_file(struct sysmsg *sysmsg,
5183 		     const struct extattr_get_file_args *uap)
5184 {
5185 	char attrname[EXTATTR_MAXNAMELEN];
5186 	struct nlookupdata nd;
5187 	struct uio auio;
5188 	struct iovec aiov;
5189 	struct vnode *vp;
5190 	int error;
5191 
5192 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
5193 	if (error)
5194 		return (error);
5195 
5196 	vp = NULL;
5197 
5198 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5199 	if (error == 0)
5200 		error = nlookup(&nd);
5201 	if (error == 0)
5202 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_SHARED, &vp);
5203 	if (error) {
5204 		nlookup_done(&nd);
5205 		return (error);
5206 	}
5207 
5208 	bzero(&auio, sizeof(auio));
5209 	aiov.iov_base = uap->data;
5210 	aiov.iov_len = uap->nbytes;
5211 	auio.uio_iov = &aiov;
5212 	auio.uio_iovcnt = 1;
5213 	auio.uio_offset = 0;
5214 	auio.uio_resid = uap->nbytes;
5215 	auio.uio_rw = UIO_READ;
5216 	auio.uio_td = curthread;
5217 
5218 	error = VOP_GETEXTATTR(vp, uap->attrnamespace, attrname,
5219 				&auio, nd.nl_cred);
5220 	sysmsg->sysmsg_result = uap->nbytes - auio.uio_resid;
5221 
5222 	vput(vp);
5223 	nlookup_done(&nd);
5224 	return(error);
5225 }
5226 
5227 /*
5228  * Syscall to delete a named extended attribute from a file or directory.
5229  * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
5230  */
5231 int
5232 sys_extattr_delete_file(struct sysmsg *sysmsg,
5233 			const struct extattr_delete_file_args *uap)
5234 {
5235 	char attrname[EXTATTR_MAXNAMELEN];
5236 	struct nlookupdata nd;
5237 	struct vnode *vp;
5238 	int error;
5239 
5240 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
5241 	if (error)
5242 		return(error);
5243 
5244 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5245 	if (error == 0)
5246 		error = nlookup(&nd);
5247 	if (error == 0)
5248 		error = ncp_writechk(&nd.nl_nch);
5249 	if (error == 0) {
5250 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
5251 		if (error == 0) {
5252 			error = VOP_SETEXTATTR(vp, uap->attrnamespace,
5253 					       attrname, NULL, nd.nl_cred);
5254 			vput(vp);
5255 		}
5256 	}
5257 	nlookup_done(&nd);
5258 	return(error);
5259 }
5260 
5261 /*
5262  * Determine if the mount is visible to the process.
5263  */
5264 static int
5265 chroot_visible_mnt(struct mount *mp, struct proc *p)
5266 {
5267 	struct nchandle nch;
5268 
5269 	/*
5270 	 * Traverse from the mount point upwards.  If we hit the process
5271 	 * root then the mount point is visible to the process.
5272 	 */
5273 	nch = mp->mnt_ncmountpt;
5274 	while (nch.ncp) {
5275 		if (nch.mount == p->p_fd->fd_nrdir.mount &&
5276 		    nch.ncp == p->p_fd->fd_nrdir.ncp) {
5277 			return(1);
5278 		}
5279 		if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
5280 			nch = nch.mount->mnt_ncmounton;
5281 		} else {
5282 			nch.ncp = nch.ncp->nc_parent;
5283 		}
5284 	}
5285 
5286 	/*
5287 	 * If the mount point is not visible to the process, but the
5288 	 * process root is in a subdirectory of the mount, return
5289 	 * TRUE anyway.
5290 	 */
5291 	if (p->p_fd->fd_nrdir.mount == mp)
5292 		return(1);
5293 
5294 	return(0);
5295 }
5296 
5297 /*
5298  * Return the appropriate system capability restriction.
5299  */
5300 static int
5301 get_fscap(const char *fsname)
5302 {
5303 
5304 	if (strncmp(fsname, "null", 5) == 0) {
5305 		return SYSCAP_NOMOUNT_NULLFS;
5306 	} else if (strncmp(fsname, "devfs", 6) == 0) {
5307 		return SYSCAP_NOMOUNT_DEVFS;
5308 	} else if (strncmp(fsname, "procfs", 7) == 0) {
5309 		return SYSCAP_NOMOUNT_PROCFS;
5310 	} else if (strncmp(fsname, "tmpfs", 6) == 0) {
5311 		return SYSCAP_NOMOUNT_TMPFS;
5312 	} else if (strncmp(fsname, "fusefs", 7) == 0) {
5313 		return SYSCAP_NOMOUNT_FUSE;
5314 	}
5315 	return SYSCAP_RESTRICTEDROOT;
5316 }
5317 
5318 int
5319 sys___realpath(struct sysmsg *sysmsg, const struct __realpath_args *uap)
5320 {
5321 	struct nlookupdata nd;
5322 	char *rbuf;
5323 	char *fbuf;
5324 	ssize_t rlen;
5325 	int error;
5326 
5327 	/*
5328 	 * Invalid length if less than 0.  0 is allowed
5329 	 */
5330 	if ((ssize_t)uap->len < 0)
5331 		return EINVAL;
5332 
5333 	rbuf = NULL;
5334 	fbuf = NULL;
5335 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
5336 	if (error)
5337 		goto done;
5338 
5339 	nd.nl_flags |= NLC_SHAREDLOCK;
5340 	error = nlookup(&nd);
5341 	if (error)
5342 		goto done;
5343 
5344 	if (nd.nl_nch.ncp->nc_vp == NULL) {
5345 		error = ENOENT;
5346 		goto done;
5347 	}
5348 
5349 	/*
5350 	 * Shortcut test for existence.
5351 	 */
5352 	if (uap->len == 0) {
5353 		error = ENAMETOOLONG;
5354 		goto done;
5355 	}
5356 
5357 	/*
5358 	 * Obtain the path relative to the process root.  The nch must not
5359 	 * be locked for the cache_fullpath() call.
5360 	 */
5361 	if (nd.nl_flags & NLC_NCPISLOCKED) {
5362 		nd.nl_flags &= ~NLC_NCPISLOCKED;
5363 		cache_unlock(&nd.nl_nch);
5364 	}
5365 	error = cache_fullpath(curproc, &nd.nl_nch, NULL, &rbuf, &fbuf, 0);
5366 	if (error)
5367 		goto done;
5368 
5369 	rlen = (ssize_t)strlen(rbuf);
5370 	if (rlen >= uap->len) {
5371 		error = ENAMETOOLONG;
5372 		goto done;
5373 	}
5374 	error = copyout(rbuf, uap->buf, rlen + 1);
5375 	if (error == 0)
5376 		sysmsg->sysmsg_szresult = rlen;
5377 done:
5378 	nlookup_done(&nd);
5379 	if (fbuf)
5380 		kfree(fbuf, M_TEMP);
5381 
5382 	return error;
5383 }
5384 
5385 int
5386 sys_posix_fallocate(struct sysmsg *sysmsg, const struct posix_fallocate_args *uap)
5387 {
5388 	return (kern_posix_fallocate(uap->fd, uap->offset, uap->len));
5389 }
5390 
5391 int
5392 kern_posix_fallocate(int fd, off_t offset, off_t len)
5393 {
5394 	struct thread *td = curthread;
5395 	struct vnode *vp;
5396 	struct file *fp;
5397 	int error;
5398 
5399 	if (offset < 0 || len <= 0)
5400 		return (EINVAL);
5401 	/* Check for wrap. */
5402 	if (offset > OFF_MAX - len)
5403 		return (EFBIG);
5404 
5405 	fp = holdfp(td, fd, -1);
5406 	if (fp == NULL)
5407 		return (EBADF);
5408 
5409 	switch (fp->f_type) {
5410 	case DTYPE_VNODE:
5411 		break;
5412 	case DTYPE_PIPE:
5413 	case DTYPE_FIFO:
5414 		error = ESPIPE;
5415 		goto out;
5416 	default:
5417 		error = ENODEV;
5418 		goto out;
5419 	}
5420 
5421 	if ((fp->f_flag & FWRITE) == 0) {
5422 		error = EBADF;
5423 		goto out;
5424 	}
5425 
5426 	vp = fp->f_data;
5427 	if (vp->v_type != VREG) {
5428 		error = ENODEV;
5429 		goto out;
5430 	}
5431 
5432 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
5433 	error = VOP_ALLOCATE(vp, offset, len);
5434 	vn_unlock(vp);
5435 out:
5436 	dropfp(td, fd, fp);
5437 	return (error);
5438 }
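
/*
 * Example (userland sketch): pre-allocating backing store for a regular
 * file with posix_fallocate(3), assuming the standard prototype from
 * <fcntl.h>.  It is specified to return an error number directly
 * rather than setting errno.
 *
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = open("data.bin", O_RDWR | O_CREAT, 0644);
 *	int error = posix_fallocate(fd, 0, 1024 * 1024);
 *	if (error != 0)
 *		warnc(error, "posix_fallocate");
 *	close(fd);
 */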
5439