xref: /dragonfly/sys/kern/vfs_syscalls.c (revision 9b5a9965)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
39  * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
40  * $DragonFly: src/sys/kern/vfs_syscalls.c,v 1.118 2007/07/19 01:16:39 dillon Exp $
41  */
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/buf.h>
46 #include <sys/conf.h>
47 #include <sys/sysent.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/mountctl.h>
51 #include <sys/sysproto.h>
52 #include <sys/filedesc.h>
53 #include <sys/kernel.h>
54 #include <sys/fcntl.h>
55 #include <sys/file.h>
56 #include <sys/linker.h>
57 #include <sys/stat.h>
58 #include <sys/unistd.h>
59 #include <sys/vnode.h>
60 #include <sys/proc.h>
61 #include <sys/namei.h>
62 #include <sys/nlookup.h>
63 #include <sys/dirent.h>
64 #include <sys/extattr.h>
65 #include <sys/spinlock.h>
66 #include <sys/kern_syscall.h>
67 #include <sys/objcache.h>
68 #include <sys/sysctl.h>
69 #include <sys/file2.h>
70 #include <sys/spinlock2.h>
71 
72 #include <vm/vm.h>
73 #include <vm/vm_object.h>
74 #include <vm/vm_page.h>
75 
76 #include <machine/limits.h>
77 #include <machine/stdarg.h>
78 
79 #include <vfs/union/union.h>
80 
/* Forward declarations for static helpers defined later in this file. */
static void mount_warning(struct mount *mp, const char *ctl, ...);
static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb);
static int checkvp_chdir (struct vnode *vn, struct thread *td);
static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch);
static int chroot_refuse_vdir_fds (struct filedesc *fdp);
static int chroot_visible_mnt(struct mount *mp, struct proc *p);
static int getutimes (const struct timeval *, struct timespec *);
static int setfown (struct vnode *, uid_t, gid_t);
static int setfmode (struct vnode *, int);
static int setfflags (struct vnode *, int);
static int setutimes (struct vnode *, const struct timespec *, int);
static int	usermount = 0;	/* if 1, non-root can mount fs. */

/* NOTE(review): presumably installed by the union filesystem (see
 * <vfs/union/union.h>) to assist directory reads -- confirm at its setter. */
int (*union_dircheckp) (struct thread *, struct vnode **, struct file *);

/* vfs.usermount sysctl: enables mounting/unmounting by non-root users. */
SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "");
97 
98 /*
99  * Virtual File System System Calls
100  */
101 
/*
 * mount_args(char *type, char *path, int flags, caddr_t data)
 *
 * Mount a file system on the directory named by 'path'.  'type' names
 * the VFS, 'flags' holds MNT_* options, and 'data' points at
 * filesystem-specific arguments interpreted by the VFS's VFS_MOUNT().
 * With MNT_UPDATE set, the flags of an existing mount are modified
 * instead of creating a new mount.
 *
 * Returns 0 on success or an errno.
 */
/* ARGSUSED */
int
sys_mount(struct mount_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct nchandle nch;
	struct mount *mp;
	struct vfsconf *vfsp;
	int error, flag = 0, flag2 = 0;	/* saved mnt_flag / mnt_kern_flag */
	int hasmount;
	struct vattr va;
	struct nlookupdata nd;
	char fstypename[MFSNAMELEN];
	struct ucred *cred = p->p_ucred;

	KKASSERT(p);
	/* Jailed processes may never mount. */
	if (cred->cr_prison != NULL)
		return (EPERM);
	/* Non-root may mount only when the vfs.usermount sysctl is set. */
	if (usermount == 0 && (error = suser(td)))
		return (error);
	/*
	 * Do not allow NFS export by non-root users.
	 */
	if (uap->flags & MNT_EXPORTED) {
		error = suser(td);
		if (error)
			return (error);
	}
	/*
	 * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
	 */
	if (suser(td))
		uap->flags |= MNT_NOSUID | MNT_NODEV;

	/*
	 * Lookup the requested path and extract the nch and vnode.
	 */
	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0) {
		if ((error = nlookup(&nd)) == 0) {
			if (nd.nl_nch.ncp->nc_vp == NULL)
				error = ENOENT;
		}
	}
	if (error) {
		nlookup_done(&nd);
		return (error);
	}

	/*
	 * Extract the locked+refd ncp and cleanup the nd structure
	 */
	nch = nd.nl_nch;
	cache_zero(&nd.nl_nch);
	nlookup_done(&nd);

	/*
	 * Remember whether something is already mounted on this ncp so the
	 * EBUSY checks below can refuse to stack a second mount on it.
	 */
	if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) && cache_findmount(&nch))
		hasmount = 1;
	else
		hasmount = 0;


	/*
	 * now we have the locked ref'd nch and unreferenced vnode.
	 */
	vp = nch.ncp->nc_vp;
	if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
		cache_put(&nch);
		return (error);
	}
	cache_unlock(&nch);

	/*
	 * Now we have an unlocked ref'd nch and a locked ref'd vp
	 */
	if (uap->flags & MNT_UPDATE) {
		/* Updates must target the root vnode of an existing mount. */
		if ((vp->v_flag & VROOT) == 0) {
			cache_drop(&nch);
			vput(vp);
			return (EINVAL);
		}
		mp = vp->v_mount;
		/* Save flags so a failed update can restore them below. */
		flag = mp->mnt_flag;
		flag2 = mp->mnt_kern_flag;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((uap->flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			cache_drop(&nch);
			vput(vp);
			return (EOPNOTSUPP);	/* Needs translation */
		}
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_stat.f_owner != cred->cr_uid &&
		    (error = suser(td))) {
			cache_drop(&nch);
			vput(vp);
			return (error);
		}
		if (vfs_busy(mp, LK_NOWAIT)) {
			cache_drop(&nch);
			vput(vp);
			return (EBUSY);
		}
		if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
			cache_drop(&nch);
			vfs_unbusy(mp);
			vput(vp);
			return (EBUSY);
		}
		vp->v_flag |= VMOUNT;
		mp->mnt_flag |=
		    uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
		vn_unlock(vp);
		goto update;
	}
	/*
	 * If the user is not root, ensure that they own the directory
	 * onto which we are attempting to mount.
	 */
	if ((error = VOP_GETATTR(vp, &va)) ||
	    (va.va_uid != cred->cr_uid && (error = suser(td)))) {
		cache_drop(&nch);
		vput(vp);
		return (error);
	}
	/* Flush dirty buffers on the covered vnode before mounting over it. */
	if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
		cache_drop(&nch);
		vput(vp);
		return (error);
	}
	if (vp->v_type != VDIR) {
		cache_drop(&nch);
		vput(vp);
		return (ENOTDIR);
	}
	if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0) {
		cache_drop(&nch);
		vput(vp);
		return (error);
	}
	/* Locate the requested VFS by name in the registered list. */
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	}
	if (vfsp == NULL) {
		linker_file_t lf;

		/* Only load modules for root (very important!) */
		if ((error = suser(td)) != 0) {
			cache_drop(&nch);
			vput(vp);
			return error;
		}
		error = linker_load_file(fstypename, &lf);
		if (error || lf == NULL) {
			cache_drop(&nch);
			vput(vp);
			if (lf == NULL)
				error = ENODEV;
			return error;
		}
		lf->userrefs++;
		/* lookup again, see if the VFS was loaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
			if (!strcmp(vfsp->vfc_name, fstypename))
				break;
		}
		if (vfsp == NULL) {
			/* Module loaded but did not register the VFS. */
			lf->userrefs--;
			linker_file_unload(lf);
			cache_drop(&nch);
			vput(vp);
			return (ENODEV);
		}
	}
	if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
		cache_drop(&nch);
		vput(vp);
		return (EBUSY);
	}
	/* Interlock other mount attempts against this vnode. */
	vp->v_flag |= VMOUNT;

	/*
	 * Allocate and initialize the filesystem.
	 */
	mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
	TAILQ_INIT(&mp->mnt_nvnodelist);
	TAILQ_INIT(&mp->mnt_reservedvnlist);
	TAILQ_INIT(&mp->mnt_jlist);
	mp->mnt_nvnodelistsize = 0;
	lockinit(&mp->mnt_lock, "vfslock", 0, 0);
	vfs_busy(mp, LK_NOWAIT);
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vfc = vfsp;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_owner = cred->cr_uid;
	mp->mnt_iosize_max = DFLTPHYS;
	vn_unlock(vp);
update:
	/*
	 * Set the mount level flags.
	 */
	if (uap->flags & MNT_RDONLY)
		mp->mnt_flag |= MNT_RDONLY;
	else if (mp->mnt_flag & MNT_RDONLY)
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME |
	    MNT_NOSYMFOLLOW | MNT_IGNORE |
	    MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
	mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC |
	    MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE |
	    MNT_NOSYMFOLLOW | MNT_IGNORE |
	    MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
	/*
	 * Mount the filesystem.
	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
	 * get.
	 */
	error = VFS_MOUNT(mp, uap->path, uap->data, cred);
	if (mp->mnt_flag & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR)
			mp->mnt_flag &= ~MNT_RDONLY;
		mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
		if (error) {
			/* Restore the flags saved before the update. */
			mp->mnt_flag = flag;
			mp->mnt_kern_flag = flag2;
		}
		vfs_unbusy(mp);
		vp->v_flag &= ~VMOUNT;
		vrele(vp);
		cache_drop(&nch);
		return (error);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/*
	 * Put the new filesystem on the mount list after root.  The mount
	 * point gets its own mnt_ncmountpt (unless the VFS already set one
	 * up) which represents the root of the mount.  The lookup code
	 * detects the mount point going forward and checks the root of
	 * the mount going backwards.
	 *
	 * It is not necessary to invalidate or purge the vnode underneath
	 * because elements under the mount will be given their own glue
	 * namecache record.
	 */
	if (!error) {
		if (mp->mnt_ncmountpt.ncp == NULL) {
			/*
			 * allocate, then unlock, but leave the ref intact
			 */
			cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
			cache_unlock(&mp->mnt_ncmountpt);
		}
		mp->mnt_ncmounton = nch;		/* inherits ref */
		nch.ncp->nc_flag |= NCF_ISMOUNTPT;

		/* XXX get the root of the fs and cache_setvp(mnt_ncmountpt...) */
		vp->v_flag &= ~VMOUNT;
		mountlist_insert(mp, MNTINS_LAST);
		checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
		vn_unlock(vp);
		/*
		 * NOTE(review): the error from vfs_allocate_syncvnode() is
		 * overwritten by VFS_START() below -- presumably acceptable
		 * since the mount is already committed, but confirm.
		 */
		error = vfs_allocate_syncvnode(mp);
		vfs_unbusy(mp);
		error = VFS_START(mp, 0);
		vrele(vp);
	} else {
		/* VFS_MOUNT failed: tear down the partially built mount. */
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
		vp->v_flag &= ~VMOUNT;
		mp->mnt_vfc->vfc_refcount--;
		vfs_unbusy(mp);
		kfree(mp, M_MOUNT);
		cache_drop(&nch);
		vput(vp);
	}
	return (error);
}
401 
/*
 * Scan all active processes to see if any of them have a current
 * or root directory onto which the new filesystem has just been
 * mounted. If so, replace them with the new mount point.
 *
 * The passed ncp is ref'd and locked (from the mount code) and
 * must be associated with the vnode representing the root of the
 * mount point.
 */
/* Shared state passed from checkdirs() to its allproc_scan() callback. */
struct checkdirs_info {
	struct nchandle old_nch;	/* nchandle now covered by the mount */
	struct nchandle new_nch;	/* root nchandle of the new mount */
	struct vnode *old_vp;		/* NOTE(review): never set or read */
	struct vnode *new_vp;		/* resolved root vnode of new mount */
};

static int checkdirs_callback(struct proc *p, void *data);
419 
static void
checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
{
	struct checkdirs_info info;
	struct vnode *olddp;
	struct vnode *newdp;
	struct mount *mp;

	/*
	 * If the old mount point's vnode has a usecount of 1, it is not
	 * being held as a descriptor anywhere.
	 */
	olddp = old_nch->ncp->nc_vp;
	if (olddp == NULL || olddp->v_sysref.refcnt == 1)
		return;

	/*
	 * Force the root vnode of the new mount point to be resolved
	 * so we can update any matching processes.
	 */
	mp = new_nch->mount;
	if (VFS_ROOT(mp, &newdp))
		panic("mount: lost mount");
	cache_setunresolved(new_nch);
	cache_setvp(new_nch, newdp);

	/*
	 * Special handling of the root node
	 */
	if (rootvnode == olddp) {
		vref(newdp);	/* extra ref consumed by vfs_cache_setroot */
		vfs_cache_setroot(newdp, cache_hold(new_nch));
	}

	/*
	 * Pass newdp separately so the callback does not have to access
	 * it via new_nch->ncp->nc_vp.
	 */
	info.old_nch = *old_nch;
	info.new_nch = *new_nch;
	info.new_vp = newdp;
	allproc_scan(checkdirs_callback, &info);
	vput(newdp);	/* release the vnode obtained from VFS_ROOT() */
}
464 
/*
 * NOTE: callback is not MP safe because the scanned process's filedesc
 * structure can be ripped out from under us, among other things.
 *
 * Per-process worker for checkdirs(): if the process's current or root
 * directory references the old mount point, repoint it at the root of
 * the newly mounted filesystem.  The stale references are dropped only
 * after the filedesc spinlock has been released.
 */
static int
checkdirs_callback(struct proc *p, void *data)
{
	struct checkdirs_info *info = data;
	struct filedesc *fdp;
	struct nchandle ncdrop1;
	struct nchandle ncdrop2;
	struct vnode *vprele1;
	struct vnode *vprele2;

	if ((fdp = p->p_fd) != NULL) {
		cache_zero(&ncdrop1);
		cache_zero(&ncdrop2);
		vprele1 = NULL;
		vprele2 = NULL;

		/*
		 * MPUNSAFE - XXX fdp can be pulled out from under a
		 * foreign process.
		 *
		 * A shared filedesc is ok, we don't have to copy it
		 * because we are making this change globally.
		 */
		spin_lock_wr(&fdp->fd_spin);
		/* Is the process's current directory on the old mount? */
		if (fdp->fd_ncdir.mount == info->old_nch.mount &&
		    fdp->fd_ncdir.ncp == info->old_nch.ncp) {
			vprele1 = fdp->fd_cdir;
			vref(info->new_vp);
			fdp->fd_cdir = info->new_vp;
			ncdrop1 = fdp->fd_ncdir;
			cache_copy(&info->new_nch, &fdp->fd_ncdir);
		}
		/* Is the process's root directory on the old mount? */
		if (fdp->fd_nrdir.mount == info->old_nch.mount &&
		    fdp->fd_nrdir.ncp == info->old_nch.ncp) {
			vprele2 = fdp->fd_rdir;
			vref(info->new_vp);
			fdp->fd_rdir = info->new_vp;
			ncdrop2 = fdp->fd_nrdir;
			cache_copy(&info->new_nch, &fdp->fd_nrdir);
		}
		spin_unlock_wr(&fdp->fd_spin);
		/* Drop the displaced references outside the spinlock. */
		if (ncdrop1.ncp)
			cache_drop(&ncdrop1);
		if (ncdrop2.ncp)
			cache_drop(&ncdrop2);
		if (vprele1)
			vrele(vprele1);
		if (vprele2)
			vrele(vprele2);
	}
	return(0);
}
521 
/*
 * Unmount a file system.
 *
 * Note: unmount takes a path to the vnode mounted on as argument,
 * not special file (as before).
 */
/*
 * umount_args(char *path, int flags)
 */
/* ARGSUSED */
int
sys_unmount(struct unmount_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct mount *mp = NULL;
	int error;
	struct nlookupdata nd;

	KKASSERT(p);
	/* Jailed processes may never unmount. */
	if (p->p_ucred->cr_prison != NULL)
		return (EPERM);
	/* Non-root may unmount only when the vfs.usermount sysctl is set. */
	if (usermount == 0 && (error = suser(td)))
		return (error);

	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error)
		goto out;

	mp = nd.nl_nch.mount;

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to unmount this filesystem.
	 */
	if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) &&
	    (error = suser(td)))
		goto out;

	/*
	 * Don't allow unmounting the root file system.
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Must be the root of the filesystem
	 */
	if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
		error = EINVAL;
		goto out;
	}

out:
	/* Release the namecache lookup before attempting the unmount. */
	nlookup_done(&nd);
	if (error)
		return (error);
	return (dounmount(mp, uap->flags));
}
585 
586 /*
587  * Do the actual file system unmount.
588  */
589 static int
590 dounmount_interlock(struct mount *mp)
591 {
592 	if (mp->mnt_kern_flag & MNTK_UNMOUNT)
593 		return (EBUSY);
594 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
595 	return(0);
596 }
597 
/*
 * Do the actual filesystem unmount.  'flags' carries the MNT_* unmount
 * options; MNT_FORCE turns the namecache/refcount EBUSY checks into
 * warnings and pushes the unmount through.  On a forced unmount with
 * lingering references the mount structure is intentionally not freed
 * (freeok == 0).  Returns 0 on success or an errno.
 */
int
dounmount(struct mount *mp, int flags)
{
	struct namecache *ncp;
	struct nchandle nch;
	int error;
	int async_flag;
	int lflags;
	int freeok = 1;		/* cleared when lingering refs forbid kfree */

	/*
	 * Exclusive access for unmounting purposes
	 */
	if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
		return (error);

	/*
	 * Allow filesystems to detect that a forced unmount is in progress.
	 */
	if (flags & MNT_FORCE)
		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
	/* Non-forced unmounts must not block on the mount lock. */
	lflags = LK_EXCLUSIVE | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT);
	error = lockmgr(&mp->mnt_lock, lflags);
	if (error) {
		/* Back out the interlock claim and wake any waiters. */
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
		if (mp->mnt_kern_flag & MNTK_MWAIT)
			wakeup(mp);
		return (error);
	}

	if (mp->mnt_flag & MNT_EXPUBLIC)
		vfs_setpublicfs(NULL, NULL, NULL);

	vfs_msync(mp, MNT_WAIT);
	async_flag = mp->mnt_flag & MNT_ASYNC;	/* restored on failure */
	mp->mnt_flag &=~ MNT_ASYNC;

	/*
	 * If this filesystem isn't aliasing other filesystems,
	 * try to invalidate any remaining namecache entries and
	 * check the count afterwards.
	 */
	if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
		cache_lock(&mp->mnt_ncmountpt);
		cache_inval(&mp->mnt_ncmountpt, CINV_DESTROY|CINV_CHILDREN);
		cache_unlock(&mp->mnt_ncmountpt);

		if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
		    (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {

			if ((flags & MNT_FORCE) == 0) {
				error = EBUSY;
				mount_warning(mp, "Cannot unmount: "
						  "%d namecache "
						  "references still "
						  "present",
						  ncp->nc_refs - 1);
			} else {
				mount_warning(mp, "Forced unmount: "
						  "%d namecache "
						  "references still "
						  "present",
						  ncp->nc_refs - 1);
				freeok = 0;
			}
		}
	}

	/*
	 * nchandle records ref the mount structure.  Expect a count of 1
	 * (our mount->mnt_ncmountpt).
	 */
	if (mp->mnt_refs != 1) {
		if ((flags & MNT_FORCE) == 0) {
			mount_warning(mp, "Cannot unmount: "
					  "%d process references still "
					  "present", mp->mnt_refs);
			error = EBUSY;
		} else {
			mount_warning(mp, "Forced unmount: "
					  "%d process references still "
					  "present", mp->mnt_refs);
			freeok = 0;
		}
	}

	/*
	 * Sync and issue the VFS unmount.  Read-only mounts skip the data
	 * sync; MNT_FORCE proceeds even if the sync fails.
	 */
	if (error == 0) {
		if (mp->mnt_syncer != NULL)
			vrele(mp->mnt_syncer);
		if (((mp->mnt_flag & MNT_RDONLY) ||
		     (error = VFS_SYNC(mp, MNT_WAIT)) == 0) ||
		    (flags & MNT_FORCE)) {
			error = VFS_UNMOUNT(mp, flags);
		}
	}
	if (error) {
		/*
		 * Unmount failed: restore the syncer vnode and the saved
		 * flags, release the lock and wake any waiters.
		 */
		if (mp->mnt_syncer == NULL)
			vfs_allocate_syncvnode(mp);
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
		mp->mnt_flag |= async_flag;
		lockmgr(&mp->mnt_lock, LK_RELEASE);
		if (mp->mnt_kern_flag & MNTK_MWAIT)
			wakeup(mp);
		return (error);
	}
	/*
	 * Clean up any journals still associated with the mount after
	 * filesystem activity has ceased.
	 */
	journal_remove_all_journals(mp,
	    ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));

	mountlist_remove(mp);

	/*
	 * Remove any installed vnode ops here so the individual VFSs don't
	 * have to.
	 */
	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);

	/* Detach and drop the mount's namecache anchor points. */
	if (mp->mnt_ncmountpt.ncp != NULL) {
		nch = mp->mnt_ncmountpt;
		cache_zero(&mp->mnt_ncmountpt);
		cache_clrmountpt(&nch);
		cache_drop(&nch);
	}
	if (mp->mnt_ncmounton.ncp != NULL) {
		nch = mp->mnt_ncmounton;
		cache_zero(&mp->mnt_ncmounton);
		cache_clrmountpt(&nch);
		cache_drop(&nch);
	}

	mp->mnt_vfc->vfc_refcount--;
	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
		panic("unmount: dangling vnode");
	lockmgr(&mp->mnt_lock, LK_RELEASE);
	if (mp->mnt_kern_flag & MNTK_MWAIT)
		wakeup(mp);
	if (freeok)
		kfree(mp, M_MOUNT);
	return (0);
}
745 
/*
 * Emit a kernel warning message for a mount.  The message is prefixed
 * with the resolved mount-on path when cache_fullpath() succeeds,
 * otherwise with the mount pointer (plus the covering ncp name when
 * available).  'ctl' is a kprintf-style format consumed with the
 * supplied varargs.
 */
static
void
mount_warning(struct mount *mp, const char *ctl, ...)
{
	char *ptr;
	char *buf;
	__va_list va;

	__va_start(va, ctl);
	if (cache_fullpath(NULL, &mp->mnt_ncmounton, &ptr, &buf) == 0) {
		kprintf("unmount(%s): ", ptr);
		kvprintf(ctl, va);
		kprintf("\n");
		kfree(buf, M_TEMP);	/* buf backs ptr; free after use */
	} else {
		kprintf("unmount(%p", mp);
		if (mp->mnt_ncmounton.ncp && mp->mnt_ncmounton.ncp->nc_name)
			kprintf(",%s", mp->mnt_ncmounton.ncp->nc_name);
		kprintf("): ");
		kvprintf(ctl, va);
		kprintf("\n");
	}
	__va_end(va);
}
770 
771 /*
772  * Shim cache_fullpath() to handle the case where a process is chrooted into
773  * a subdirectory of a mount.  In this case if the root mount matches the
774  * process root directory's mount we have to specify the process's root
775  * directory instead of the mount point, because the mount point might
776  * be above the root directory.
777  */
778 static
779 int
780 mount_path(struct proc *p, struct mount *mp, char **rb, char **fb)
781 {
782 	struct nchandle *nch;
783 
784 	if (p && p->p_fd->fd_nrdir.mount == mp)
785 		nch = &p->p_fd->fd_nrdir;
786 	else
787 		nch = &mp->mnt_ncmountpt;
788 	return(cache_fullpath(p, nch, rb, fb));
789 }
790 
/*
 * Sync each mounted filesystem.
 */

#ifdef DEBUG
static int syncprt = 0;		/* debug.syncprt: dump buffer stats on sync */
SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif /* DEBUG */

/* Per-mount worker invoked by sys_sync() via mountlist_scan(). */
static int sync_callback(struct mount *mp, void *data);
801 
802 /* ARGSUSED */
803 int
804 sys_sync(struct sync_args *uap)
805 {
806 	mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
807 #ifdef DEBUG
808 	/*
809 	 * print out buffer pool stat information on each sync() call.
810 	 */
811 	if (syncprt)
812 		vfs_bufstats();
813 #endif /* DEBUG */
814 	return (0);
815 }
816 
817 static
818 int
819 sync_callback(struct mount *mp, void *data __unused)
820 {
821 	int asyncflag;
822 
823 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
824 		asyncflag = mp->mnt_flag & MNT_ASYNC;
825 		mp->mnt_flag &= ~MNT_ASYNC;
826 		vfs_msync(mp, MNT_NOWAIT);
827 		VFS_SYNC(mp, MNT_NOWAIT);
828 		mp->mnt_flag |= asyncflag;
829 	}
830 	return(0);
831 }
832 
/* XXX PRISON: could be per prison flag */
static int prison_quotas;	/* when 0, jailed processes may not quotactl */
#if 0
SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
#endif
838 
839 /*
840  *  quotactl_args(char *path, int fcmd, int uid, caddr_t arg)
841  *
842  * Change filesystem quotas.
843  */
844 /* ARGSUSED */
845 int
846 sys_quotactl(struct quotactl_args *uap)
847 {
848 	struct nlookupdata nd;
849 	struct thread *td;
850 	struct proc *p;
851 	struct mount *mp;
852 	int error;
853 
854 	td = curthread;
855 	p = td->td_proc;
856 	if (p->p_ucred->cr_prison && !prison_quotas)
857 		return (EPERM);
858 
859 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
860 	if (error == 0)
861 		error = nlookup(&nd);
862 	if (error == 0) {
863 		mp = nd.nl_nch.mount;
864 		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
865 				    uap->arg, nd.nl_cred);
866 	}
867 	nlookup_done(&nd);
868 	return (error);
869 }
870 
/*
 * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
 *		void *buf, int buflen)
 *
 * This function operates on a mount point and executes the specified
 * operation using the specified control data, and possibly returns data.
 *
 * The actual number of bytes stored in the result buffer is returned, 0
 * if none, otherwise an error is returned.
 */
/* ARGSUSED */
int
sys_mountctl(struct mountctl_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	void *ctl = NULL;
	void *buf = NULL;
	char *path = NULL;
	int error;

	/*
	 * Sanity and permissions checks.  We must be root.
	 */
	KKASSERT(p);
	if (p->p_ucred->cr_prison != NULL)
		return (EPERM);
	if ((error = suser(td)) != 0)
		return (error);

	/*
	 * Argument length checks
	 */
	if (uap->ctllen < 0 || uap->ctllen > 1024)
		return (EINVAL);
	if (uap->buflen < 0 || uap->buflen > 16 * 1024)
		return (EINVAL);
	if (uap->path == NULL)
		return (EINVAL);

	/*
	 * Allocate the necessary buffers and copyin data
	 */
	path = objcache_get(namei_oc, M_WAITOK);
	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
	if (error)
		goto done;

	/* One extra zeroed byte so string-style ctl data is NUL terminated. */
	if (uap->ctllen) {
		ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
		error = copyin(uap->ctl, ctl, uap->ctllen);
		if (error)
			goto done;
	}
	if (uap->buflen)
		buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);

	/*
	 * Validate the descriptor.  A negative fd means no descriptor is
	 * associated with the operation.
	 */
	if (uap->fd >= 0) {
		fp = holdfp(p->p_fd, uap->fd, -1);
		if (fp == NULL) {
			error = EBADF;
			goto done;
		}
	} else {
		fp = NULL;
	}

	/*
	 * Execute the internal kernel function and clean up.
	 */
	error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen, buf, uap->buflen, &uap->sysmsg_result);
	if (fp)
		fdrop(fp);
	if (error == 0 && uap->sysmsg_result > 0)
		error = copyout(buf, uap->buf, uap->sysmsg_result);
done:
	/* Common exit: release whatever was allocated above. */
	if (path)
		objcache_put(namei_oc, path);
	if (ctl)
		kfree(ctl, M_TEMP);
	if (buf)
		kfree(buf, M_TEMP);
	return (error);
}
959 
/*
 * Execute a mount control operation by resolving the path to a mount point
 * and calling vop_mountctl().
 *
 * 'path' must resolve to the root vnode of a mounted filesystem (VROOT),
 * otherwise EINVAL is returned.  *res receives the result byte count
 * reported by vop_mountctl() (consumed by sys_mountctl's copyout).
 */
int
kern_mountctl(const char *path, int op, struct file *fp,
		const void *ctl, int ctllen,
		void *buf, int buflen, int *res)
{
	struct vnode *vp;
	struct mount *mp;
	struct nlookupdata nd;
	int error;

	*res = 0;
	vp = NULL;
	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	nlookup_done(&nd);
	if (error)
		return (error);

	mp = vp->v_mount;

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vput(vp);
		return (EINVAL);
	}
	error = vop_mountctl(mp->mnt_vn_use_ops, op, fp, ctl, ctllen,
				buf, buflen, res);
	vput(vp);
	return (error);
}
999 
/*
 * Core of statfs(): run the prepared lookup, ask the VFS for statistics
 * and copy them into *buf.  f_mntonname is rewritten relative to the
 * caller's root via mount_path(), and the fsid is zeroed for non-root
 * callers.
 */
int
kern_statfs(struct nlookupdata *nd, struct statfs *buf)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct mount *mp;
	struct statfs *sp;
	char *fullpath, *freepath;
	int error;

	if ((error = nlookup(nd)) != 0)
		return (error);
	mp = nd->nl_nch.mount;
	sp = &mp->mnt_stat;
	if ((error = VFS_STATFS(mp, sp, nd->nl_cred)) != 0)
		return (error);

	/* Rewrite the mount-on path as seen from the process's root. */
	error = mount_path(p, mp, &fullpath, &freepath);
	if (error)
		return(error);
	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	kfree(freepath, M_TEMP);

	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	bcopy(sp, buf, sizeof(*buf));
	/* Only root should have access to the fsid's. */
	if (suser(td))
		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
	return (0);
}
1031 
1032 /*
1033  * statfs_args(char *path, struct statfs *buf)
1034  *
1035  * Get filesystem statistics.
1036  */
1037 int
1038 sys_statfs(struct statfs_args *uap)
1039 {
1040 	struct nlookupdata nd;
1041 	struct statfs buf;
1042 	int error;
1043 
1044 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1045 	if (error == 0)
1046 		error = kern_statfs(&nd, &buf);
1047 	nlookup_done(&nd);
1048 	if (error == 0)
1049 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1050 	return (error);
1051 }
1052 
/*
 * Core of fstatfs(): obtain filesystem statistics for the mount backing
 * open descriptor 'fd'.  Mirrors kern_statfs() but authenticates the
 * VFS_STATFS() call with the file's cached credentials (fp->f_cred).
 */
int
kern_fstatfs(int fd, struct statfs *buf)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct mount *mp;
	struct statfs *sp;
	char *fullpath, *freepath;
	int error;

	KKASSERT(p);
	/* Gains a reference on fp which is dropped at 'done'. */
	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	mp = ((struct vnode *)fp->f_data)->v_mount;
	if (mp == NULL) {
		error = EBADF;
		goto done;
	}
	if (fp->f_cred == NULL) {
		error = EINVAL;
		goto done;
	}
	sp = &mp->mnt_stat;
	if ((error = VFS_STATFS(mp, sp, fp->f_cred)) != 0)
		goto done;

	/* Rewrite the mount-on path as seen from the process's root. */
	if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
		goto done;
	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	kfree(freepath, M_TEMP);

	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	bcopy(sp, buf, sizeof(*buf));

	/* Only root should have access to the fsid's. */
	if (suser(td))
		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
	error = 0;
done:
	fdrop(fp);
	return (error);
}
1097 
1098 /*
1099  * fstatfs_args(int fd, struct statfs *buf)
1100  *
1101  * Get filesystem statistics.
1102  */
1103 int
1104 sys_fstatfs(struct fstatfs_args *uap)
1105 {
1106 	struct statfs buf;
1107 	int error;
1108 
1109 	error = kern_fstatfs(uap->fd, &buf);
1110 
1111 	if (error == 0)
1112 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1113 	return (error);
1114 }
1115 
1116 /*
1117  * getfsstat_args(struct statfs *buf, long bufsize, int flags)
1118  *
1119  * Get statistics on all filesystems.
1120  */
1121 
/*
 * Cookie passed to getfsstat_callback() via mountlist_scan().
 */
struct getfsstat_info {
	struct statfs *sfsp;	/* next slot in the user buffer (NULL: count only) */
	long count;		/* mounts seen so far */
	long maxcount;		/* capacity of the user buffer in entries */
	int error;		/* first hard error encountered, if any */
	int flags;		/* caller's MNT_WAIT/MNT_NOWAIT/MNT_LAZY flags */
	struct proc *p;		/* process, for chroot visibility and creds */
};
1130 
1131 static int getfsstat_callback(struct mount *, void *);
1132 
1133 /* ARGSUSED */
1134 int
1135 sys_getfsstat(struct getfsstat_args *uap)
1136 {
1137 	struct thread *td = curthread;
1138 	struct proc *p = td->td_proc;
1139 	struct getfsstat_info info;
1140 
1141 	bzero(&info, sizeof(info));
1142 
1143 	info.maxcount = uap->bufsize / sizeof(struct statfs);
1144 	info.sfsp = uap->buf;
1145 	info.count = 0;
1146 	info.flags = uap->flags;
1147 	info.p = p;
1148 
1149 	mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
1150 	if (info.sfsp && info.count > info.maxcount)
1151 		uap->sysmsg_result = info.maxcount;
1152 	else
1153 		uap->sysmsg_result = info.count;
1154 	return (info.error);
1155 }
1156 
/*
 * Per-mount callback for sys_getfsstat().  Copies one statfs record into
 * the user buffer described by the getfsstat_info cookie.  Returns -1 to
 * abort the mountlist scan on a hard error, 0 otherwise.
 */
static int
getfsstat_callback(struct mount *mp, void *data)
{
	struct getfsstat_info *info = data;
	struct statfs *sp;
	char *freepath;
	char *fullpath;
	int error;

	if (info->sfsp && info->count < info->maxcount) {
		/* Skip mounts not visible from the process's root. */
		if (info->p && !chroot_visible_mnt(mp, info->p))
			return(0);
		sp = &mp->mnt_stat;

		/*
		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
		 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
		 * overrides MNT_WAIT.
		 */
		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
		    (info->flags & MNT_WAIT)) &&
		    (error = VFS_STATFS(mp, sp, info->p->p_ucred))) {
			return(0);
		}
		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;

		/* Rewrite the mount point path as seen by this process. */
		error = mount_path(info->p, mp, &fullpath, &freepath);
		if (error) {
			info->error = error;
			return(-1);
		}
		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
		kfree(freepath, M_TEMP);

		error = copyout(sp, info->sfsp, sizeof(*sp));
		if (error) {
			info->error = error;
			return (-1);
		}
		++info->sfsp;
	}
	/* Count every mount, even once the user buffer is full. */
	info->count++;
	return(0);
}
1202 
1203 /*
1204  * fchdir_args(int fd)
1205  *
1206  * Change current working directory to a given file descriptor.
1207  */
1208 /* ARGSUSED */
int
sys_fchdir(struct fchdir_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct vnode *vp, *ovp;
	struct mount *mp;
	struct file *fp;
	struct nchandle nch, onch, tnch;
	int error;

	if ((error = holdvnode(fdp, uap->fd, &fp)) != 0)
		return (error);
	/* Take our own ref+lock on the descriptor's vnode. */
	vp = (struct vnode *)fp->f_data;
	vref(vp);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* Must be a searchable directory with a namecache entry. */
	if (vp->v_type != VDIR || fp->f_nchandle.ncp == NULL)
		error = ENOTDIR;
	else
		error = VOP_ACCESS(vp, VEXEC, p->p_ucred);
	if (error) {
		vput(vp);
		fdrop(fp);
		return (error);
	}
	cache_copy(&fp->f_nchandle, &nch);

	/*
	 * If the ncp has become a mount point, traverse through
	 * the mount point.
	 */

	while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
	       (mp = cache_findmount(&nch)) != NULL
	) {
		error = nlookup_mp(mp, &tnch);
		if (error == 0) {
			cache_unlock(&tnch);	/* leave ref intact */
			vput(vp);
			vp = tnch.ncp->nc_vp;
			error = vget(vp, LK_SHARED);
			KKASSERT(error == 0);
			cache_drop(&nch);
			nch = tnch;
		}
	}
	if (error == 0) {
		/* Install the new cwd, then release the previous one. */
		ovp = fdp->fd_cdir;
		onch = fdp->fd_ncdir;
		vn_unlock(vp);		/* leave ref intact */
		fdp->fd_cdir = vp;
		fdp->fd_ncdir = nch;
		cache_drop(&onch);
		vrele(ovp);
	} else {
		cache_drop(&nch);
		vput(vp);
	}
	fdrop(fp);
	return (error);
}
1271 
/*
 * kern_chdir(): common backend for chdir(2).  On success the process's
 * current directory vnode and nchandle are replaced; nd's nchandle is
 * consumed (zeroed) so nlookup_done() will not drop it again.
 */
int
kern_chdir(struct nlookupdata *nd)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct vnode *vp, *ovp;
	struct nchandle onch;
	int error;

	if ((error = nlookup(nd)) != 0)
		return (error);
	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
		return (ENOENT);
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);

	/* Verify vp is a searchable directory before committing. */
	error = checkvp_chdir(vp, td);
	vn_unlock(vp);
	if (error == 0) {
		ovp = fdp->fd_cdir;
		onch = fdp->fd_ncdir;
		cache_unlock(&nd->nl_nch);	/* leave reference intact */
		fdp->fd_ncdir = nd->nl_nch;
		fdp->fd_cdir = vp;
		cache_drop(&onch);
		vrele(ovp);
		cache_zero(&nd->nl_nch);
	} else {
		vrele(vp);
	}
	return (error);
}
1305 
1306 /*
1307  * chdir_args(char *path)
1308  *
1309  * Change current working directory (``.'').
1310  */
1311 int
1312 sys_chdir(struct chdir_args *uap)
1313 {
1314 	struct nlookupdata nd;
1315 	int error;
1316 
1317 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1318 	if (error == 0)
1319 		error = kern_chdir(&nd);
1320 	nlookup_done(&nd);
1321 	return (error);
1322 }
1323 
1324 /*
1325  * Helper function for raised chroot(2) security function:  Refuse if
1326  * any filedescriptors are open directories.
1327  */
1328 static int
1329 chroot_refuse_vdir_fds(struct filedesc *fdp)
1330 {
1331 	struct vnode *vp;
1332 	struct file *fp;
1333 	int error;
1334 	int fd;
1335 
1336 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1337 		if ((error = holdvnode(fdp, fd, &fp)) != 0)
1338 			continue;
1339 		vp = (struct vnode *)fp->f_data;
1340 		if (vp->v_type != VDIR) {
1341 			fdrop(fp);
1342 			continue;
1343 		}
1344 		fdrop(fp);
1345 		return(EPERM);
1346 	}
1347 	return (0);
1348 }
1349 
1350 /*
1351  * This sysctl determines if we will allow a process to chroot(2) if it
1352  * has a directory open:
1353  *	0: disallowed for all processes.
1354  *	1: allowed for processes that were not already chroot(2)'ed.
1355  *	2: allowed for all processes.
1356  */
1357 
/* Current policy value; meanings are documented in the table above. */
static int chroot_allow_open_directories = 1;

SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0, "");
1362 
1363 /*
1364  * chroot to the specified namecache entry.  We obtain the vp from the
1365  * namecache data.  The passed ncp must be locked and referenced and will
1366  * remain locked and referenced on return.
1367  */
int
kern_chroot(struct nchandle *nch)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct vnode *vp;
	int error;

	/*
	 * Only root can chroot
	 */
	if ((error = suser_cred(p->p_ucred, PRISON_ROOT)) != 0)
		return (error);

	/*
	 * Disallow open directory descriptors (fchdir() breakouts).
	 */
	if (chroot_allow_open_directories == 0 ||
	   (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
		if ((error = chroot_refuse_vdir_fds(fdp)) != 0)
			return (error);
	}
	if ((vp = nch->ncp->nc_vp) == NULL)
		return (ENOENT);

	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);

	/*
	 * Check the validity of vp as a directory to change to and
	 * associate it with rdir/jdir.
	 */
	error = checkvp_chdir(vp, td);
	vn_unlock(vp);			/* leave reference intact */
	if (error == 0) {
		vrele(fdp->fd_rdir);
		fdp->fd_rdir = vp;	/* reference inherited by fd_rdir */
		cache_drop(&fdp->fd_nrdir);
		cache_copy(nch, &fdp->fd_nrdir);
		/* The first chroot also establishes the jail directory. */
		if (fdp->fd_jdir == NULL) {
			fdp->fd_jdir = vp;
			vref(fdp->fd_jdir);
			cache_copy(nch, &fdp->fd_njdir);
		}
	} else {
		vrele(vp);
	}
	return (error);
}
1418 
1419 /*
1420  * chroot_args(char *path)
1421  *
1422  * Change notion of root (``/'') directory.
1423  */
1424 /* ARGSUSED */
1425 int
1426 sys_chroot(struct chroot_args *uap)
1427 {
1428 	struct thread *td = curthread;
1429 	struct nlookupdata nd;
1430 	int error;
1431 
1432 	KKASSERT(td->td_proc);
1433 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1434 	if (error) {
1435 		nlookup_done(&nd);
1436 		return(error);
1437 	}
1438 	error = nlookup(&nd);
1439 	if (error == 0)
1440 		error = kern_chroot(&nd.nl_nch);
1441 	nlookup_done(&nd);
1442 	return(error);
1443 }
1444 
1445 /*
1446  * Common routine for chroot and chdir.  Given a locked, referenced vnode,
1447  * determine whether it is legal to chdir to the vnode.  The vnode's state
1448  * is not changed by this call.
1449  */
1450 int
1451 checkvp_chdir(struct vnode *vp, struct thread *td)
1452 {
1453 	int error;
1454 
1455 	if (vp->v_type != VDIR)
1456 		error = ENOTDIR;
1457 	else
1458 		error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred);
1459 	return (error);
1460 }
1461 
/*
 * kern_open(): common backend for open(2).  Allocates a file structure,
 * opens the path described by nd with the given open flags and creation
 * mode, optionally applies O_EXLOCK/O_SHLOCK, and returns the new
 * descriptor index in *res.
 */
int
kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct lwp *lp = td->td_lwp;
	struct filedesc *fdp = p->p_fd;
	int cmode, flags;
	struct file *nfp;
	struct file *fp;
	struct vnode *vp;
	int type, indx, error;
	struct flock lf;

	/* O_RDONLY|O_WRONLY|O_RDWR all at once is invalid. */
	if ((oflags & O_ACCMODE) == O_ACCMODE)
		return (EINVAL);
	flags = FFLAGS(oflags);
	error = falloc(p, &nfp, NULL);
	if (error)
		return (error);
	fp = nfp;
	/* Creation mode: apply umask, strip sticky bit. */
	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;

	/*
	 * XXX p_dupfd is a real mess.  It allows a device to return a
	 * file descriptor to be duplicated rather then doing the open
	 * itself.
	 */
	lp->lwp_dupfd = -1;

	/*
	 * Call vn_open() to do the lookup and assign the vnode to the
	 * file pointer.  vn_open() does not change the ref count on fp
	 * and the vnode, on success, will be inherited by the file pointer
	 * and unlocked.
	 */
	nd->nl_flags |= NLC_LOCKVP;
	error = vn_open(nd, fp, flags, cmode);
	nlookup_done(nd);
	if (error) {
		/*
		 * handle special fdopen() case.  bleh.  dupfdopen() is
		 * responsible for dropping the old contents of ofiles[indx]
		 * if it succeeds.
		 *
		 * Note that fsetfd() will add a ref to fp which represents
		 * the fd_files[] assignment.  We must still drop our
		 * reference.
		 */
		if ((error == ENODEV || error == ENXIO) && lp->lwp_dupfd >= 0) {
			if (fdalloc(p, 0, &indx) == 0) {
				error = dupfdopen(p, indx, lp->lwp_dupfd, flags, error);
				if (error == 0) {
					*res = indx;
					fdrop(fp);	/* our ref */
					return (0);
				}
				fsetfd(p, NULL, indx);
			}
		}
		fdrop(fp);	/* our ref */
		if (error == ERESTART)
			error = EINTR;
		return (error);
	}

	/*
	 * ref the vnode for ourselves so it can't be ripped out from under
	 * is.  XXX need an ND flag to request that the vnode be returned
	 * anyway.
	 *
	 * Reserve a file descriptor but do not assign it until the open
	 * succeeds.
	 */
	vp = (struct vnode *)fp->f_data;
	vref(vp);
	if ((error = fdalloc(p, 0, &indx)) != 0) {
		fdrop(fp);
		vrele(vp);
		return (error);
	}

	/*
	 * If no error occurs the vp will have been assigned to the file
	 * pointer.
	 */
	lp->lwp_dupfd = 0;

	if (flags & (O_EXLOCK | O_SHLOCK)) {
		/* Apply the requested whole-file advisory lock. */
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (flags & O_EXLOCK)
			lf.l_type = F_WRLCK;
		else
			lf.l_type = F_RDLCK;
		if (flags & FNONBLOCK)
			type = 0;
		else
			type = F_WAIT;

		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
			/*
			 * lock request failed.  Clean up the reserved
			 * descriptor.
			 */
			vrele(vp);
			fsetfd(p, NULL, indx);
			fdrop(fp);
			return (error);
		}
		fp->f_flag |= FHASLOCK;
	}
#if 0
	/*
	 * Assert that all regular file vnodes were created with a object.
	 */
	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
		("open: regular file has no backing object after vn_open"));
#endif

	vrele(vp);

	/*
	 * release our private reference, leaving the one associated with the
	 * descriptor table intact.
	 */
	fsetfd(p, fp, indx);
	fdrop(fp);
	*res = indx;
	return (0);
}
1594 
1595 /*
1596  * open_args(char *path, int flags, int mode)
1597  *
1598  * Check permissions, allocate an open file structure,
1599  * and call the device open routine if any.
1600  */
1601 int
1602 sys_open(struct open_args *uap)
1603 {
1604 	struct nlookupdata nd;
1605 	int error;
1606 
1607 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1608 	if (error == 0) {
1609 		error = kern_open(&nd, uap->flags,
1610 				    uap->mode, &uap->sysmsg_result);
1611 	}
1612 	nlookup_done(&nd);
1613 	return (error);
1614 }
1615 
/*
 * kern_mknod(): common backend for mknod(2).  Create a character or
 * block device node, a whiteout, or a VBAD placeholder at the path
 * described by nd.
 */
int
kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	int whiteout = 0;

	KKASSERT(p);

	/* Device nodes require full superuser; the rest allow prison root. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
	case S_IFBLK:
		error = suser(td);
		break;
	default:
		error = suser_cred(p->p_ucred, PRISON_ROOT);
		break;
	}
	if (error)
		return (error);

	bwillwrite();
	nd->nl_flags |= NLC_CREATE;
	if ((error = nlookup(nd)) != 0)
		return (error);
	/* The target name must not already exist. */
	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);

	VATTR_NULL(&vattr);
	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
	vattr.va_rmajor = rmajor;
	vattr.va_rminor = rminor;
	whiteout = 0;

	/* Map the S_IF* type bits onto the vnode type to create. */
	switch (mode & S_IFMT) {
	case S_IFMT:	/* used by badsect to flag bad sectors */
		vattr.va_type = VBAD;
		break;
	case S_IFCHR:
		vattr.va_type = VCHR;
		break;
	case S_IFBLK:
		vattr.va_type = VBLK;
		break;
	case S_IFWHT:
		whiteout = 1;
		break;
	default:
		error = EINVAL;
		break;
	}
	if (error == 0) {
		if (whiteout) {
			error = VOP_NWHITEOUT(&nd->nl_nch, nd->nl_cred, NAMEI_CREATE);
		} else {
			vp = NULL;
			error = VOP_NMKNOD(&nd->nl_nch, &vp, nd->nl_cred, &vattr);
			if (error == 0)
				vput(vp);
		}
	}
	return (error);
}
1684 
1685 /*
1686  * mknod_args(char *path, int mode, int dev)
1687  *
1688  * Create a special file.
1689  */
1690 int
1691 sys_mknod(struct mknod_args *uap)
1692 {
1693 	struct nlookupdata nd;
1694 	int error;
1695 
1696 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1697 	if (error == 0) {
1698 		error = kern_mknod(&nd, uap->mode,
1699 				   umajor(uap->dev), uminor(uap->dev));
1700 	}
1701 	nlookup_done(&nd);
1702 	return (error);
1703 }
1704 
/*
 * kern_mkfifo(): common backend for mkfifo(2).  Create a FIFO node at
 * the path described by nd; mode is masked by the process umask.
 */
int
kern_mkfifo(struct nlookupdata *nd, int mode)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vattr vattr;
	struct vnode *vp;
	int error;

	bwillwrite();

	nd->nl_flags |= NLC_CREATE;
	if ((error = nlookup(nd)) != 0)
		return (error);
	/* The target name must not already exist. */
	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);

	VATTR_NULL(&vattr);
	vattr.va_type = VFIFO;
	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
	vp = NULL;
	error = VOP_NMKNOD(&nd->nl_nch, &vp, nd->nl_cred, &vattr);
	if (error == 0)
		vput(vp);
	return (error);
}
1733 
1734 /*
1735  * mkfifo_args(char *path, int mode)
1736  *
1737  * Create a named pipe.
1738  */
1739 int
1740 sys_mkfifo(struct mkfifo_args *uap)
1741 {
1742 	struct nlookupdata nd;
1743 	int error;
1744 
1745 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1746 	if (error == 0)
1747 		error = kern_mkfifo(&nd, uap->mode);
1748 	nlookup_done(&nd);
1749 	return (error);
1750 }
1751 
/*
 * Sysctl knobs restricting hard link creation by unprivileged processes;
 * enforced by can_hardlink() below.  Both default to off.
 */
static int hardlink_check_uid = 0;
SYSCTL_INT(_kern, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
    &hardlink_check_uid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "users");
static int hardlink_check_gid = 0;
SYSCTL_INT(_kern, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
    &hardlink_check_gid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "groups");
1762 
1763 static int
1764 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
1765 {
1766 	struct vattr va;
1767 	int error;
1768 
1769 	/*
1770 	 * Shortcut if disabled
1771 	 */
1772 	if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
1773 		return (0);
1774 
1775 	/*
1776 	 * root cred can always hardlink
1777 	 */
1778 	if (suser_cred(cred, PRISON_ROOT) == 0)
1779 		return (0);
1780 
1781 	/*
1782 	 * Otherwise only if the originating file is owned by the
1783 	 * same user or group.  Note that any group is allowed if
1784 	 * the file is owned by the caller.
1785 	 */
1786 	error = VOP_GETATTR(vp, &va);
1787 	if (error != 0)
1788 		return (error);
1789 
1790 	if (hardlink_check_uid) {
1791 		if (cred->cr_uid != va.va_uid)
1792 			return (EPERM);
1793 	}
1794 
1795 	if (hardlink_check_gid) {
1796 		if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
1797 			return (EPERM);
1798 	}
1799 
1800 	return (0);
1801 }
1802 
/*
 * kern_link(): common backend for link(2).  nd describes the existing
 * file, linknd the link to create.  The link target name must not
 * already exist.
 */
int
kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
{
	struct thread *td = curthread;
	struct vnode *vp;
	int error;

	/*
	 * Lookup the source and obtained a locked vnode.
	 *
	 * XXX relookup on vget failure / race ?
	 */
	bwillwrite();
	if ((error = nlookup(nd)) != 0)
		return (error);
	vp = nd->nl_nch.ncp->nc_vp;
	KKASSERT(vp != NULL);
	/* Hard links to directories are forbidden. */
	if (vp->v_type == VDIR)
		return (EPERM);		/* POSIX */
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
		return (error);

	/*
	 * Unlock the source so we can lookup the target without deadlocking
	 * (XXX vp is locked already, possible other deadlock?).  The target
	 * must not exist.
	 */
	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
	nd->nl_flags &= ~NLC_NCPISLOCKED;
	cache_unlock(&nd->nl_nch);

	linknd->nl_flags |= NLC_CREATE;
	if ((error = nlookup(linknd)) != 0) {
		vput(vp);
		return (error);
	}
	if (linknd->nl_nch.ncp->nc_vp) {
		vput(vp);
		return (EEXIST);
	}

	/*
	 * Finally run the new API VOP.
	 */
	error = can_hardlink(vp, td, td->td_proc->p_ucred);
	if (error == 0)
		error = VOP_NLINK(&linknd->nl_nch, vp, linknd->nl_cred);
	vput(vp);
	return (error);
}
1855 
1856 /*
1857  * link_args(char *path, char *link)
1858  *
1859  * Make a hard file link.
1860  */
1861 int
1862 sys_link(struct link_args *uap)
1863 {
1864 	struct nlookupdata nd, linknd;
1865 	int error;
1866 
1867 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1868 	if (error == 0) {
1869 		error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
1870 		if (error == 0)
1871 			error = kern_link(&nd, &linknd);
1872 		nlookup_done(&linknd);
1873 	}
1874 	nlookup_done(&nd);
1875 	return (error);
1876 }
1877 
/*
 * kern_symlink(): common backend for symlink(2).  Create a symlink at
 * the path described by nd whose contents are the string 'path'.
 */
int
kern_symlink(struct nlookupdata *nd, char *path, int mode)
{
	struct vattr vattr;
	struct vnode *vp;
	int error;

	bwillwrite();
	nd->nl_flags |= NLC_CREATE;
	if ((error = nlookup(nd)) != 0)
		return (error);
	/* The link name must not already exist. */
	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	VATTR_NULL(&vattr);
	vattr.va_mode = mode;
	error = VOP_NSYMLINK(&nd->nl_nch, &vp, nd->nl_cred, &vattr, path);
	if (error == 0)
		vput(vp);
	return (error);
}
1900 
1901 /*
1902  * symlink(char *path, char *link)
1903  *
1904  * Make a symbolic link.
1905  */
1906 int
1907 sys_symlink(struct symlink_args *uap)
1908 {
1909 	struct thread *td = curthread;
1910 	struct nlookupdata nd;
1911 	char *path;
1912 	int error;
1913 	int mode;
1914 
1915 	path = objcache_get(namei_oc, M_WAITOK);
1916 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
1917 	if (error == 0) {
1918 		error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
1919 		if (error == 0) {
1920 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
1921 			error = kern_symlink(&nd, path, mode);
1922 		}
1923 		nlookup_done(&nd);
1924 	}
1925 	objcache_put(namei_oc, path);
1926 	return (error);
1927 }
1928 
1929 /*
1930  * undelete_args(char *path)
1931  *
1932  * Delete a whiteout from the filesystem.
1933  */
1934 /* ARGSUSED */
1935 int
1936 sys_undelete(struct undelete_args *uap)
1937 {
1938 	struct nlookupdata nd;
1939 	int error;
1940 
1941 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1942 	bwillwrite();
1943 	nd.nl_flags |= NLC_DELETE;
1944 	if (error == 0)
1945 		error = nlookup(&nd);
1946 	if (error == 0)
1947 		error = ncp_writechk(&nd.nl_nch);
1948 	if (error == 0)
1949 		error = VOP_NWHITEOUT(&nd.nl_nch, nd.nl_cred, NAMEI_DELETE);
1950 	nlookup_done(&nd);
1951 	return (error);
1952 }
1953 
1954 int
1955 kern_unlink(struct nlookupdata *nd)
1956 {
1957 	int error;
1958 
1959 	bwillwrite();
1960 	nd->nl_flags |= NLC_DELETE;
1961 	if ((error = nlookup(nd)) != 0)
1962 		return (error);
1963 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
1964 		return (error);
1965 	error = VOP_NREMOVE(&nd->nl_nch, nd->nl_cred);
1966 	return (error);
1967 }
1968 
1969 /*
1970  * unlink_args(char *path)
1971  *
1972  * Delete a name from the filesystem.
1973  */
1974 int
1975 sys_unlink(struct unlink_args *uap)
1976 {
1977 	struct nlookupdata nd;
1978 	int error;
1979 
1980 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1981 	if (error == 0)
1982 		error = kern_unlink(&nd);
1983 	nlookup_done(&nd);
1984 	return (error);
1985 }
1986 
/*
 * kern_lseek(): common backend for lseek(2).  Adjusts the descriptor's
 * file offset according to whence (L_INCR/L_XTND/L_SET) and returns the
 * resulting offset in *res.  Only vnode-backed descriptors are seekable.
 */
int
kern_lseek(int fd, off_t offset, int whence, off_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct vattr vattr;
	int error;

	fp = holdfp(p->p_fd, fd, -1);
	if (fp == NULL)
		return (EBADF);
	if (fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
		goto done;
	}

	switch (whence) {
	case L_INCR:
		fp->f_offset += offset;
		error = 0;
		break;
	case L_XTND:
		/* Seek relative to end-of-file; need the current size. */
		error = VOP_GETATTR((struct vnode *)fp->f_data, &vattr);
		if (error == 0)
			fp->f_offset = offset + vattr.va_size;
		break;
	case L_SET:
		fp->f_offset = offset;
		error = 0;
		break;
	default:
		error = EINVAL;
		break;
	}
	*res = fp->f_offset;
done:
	fdrop(fp);
	return (error);
}
2027 
2028 /*
2029  * lseek_args(int fd, int pad, off_t offset, int whence)
2030  *
2031  * Reposition read/write file offset.
2032  */
2033 int
2034 sys_lseek(struct lseek_args *uap)
2035 {
2036 	int error;
2037 
2038 	error = kern_lseek(uap->fd, uap->offset, uap->whence,
2039 	    &uap->sysmsg_offset);
2040 
2041 	return (error);
2042 }
2043 
/*
 * kern_access(): common backend for access(2).  aflags is a mask of
 * R_OK/W_OK/X_OK; zero checks only for existence.
 */
int
kern_access(struct nlookupdata *nd, int aflags)
{
	struct vnode *vp;
	int error, flags;

	if ((error = nlookup(nd)) != 0)
		return (error);
retry:
	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
	if (error)
		return (error);

	/* Flags == 0 means only check for existence. */
	if (aflags) {
		/* Translate the user access bits into VOP access bits. */
		flags = 0;
		if (aflags & R_OK)
			flags |= VREAD;
		if (aflags & W_OK)
			flags |= VWRITE;
		if (aflags & X_OK)
			flags |= VEXEC;
		/* Write access additionally requires vn_writechk() to pass. */
		if ((flags & VWRITE) == 0 ||
		    (error = vn_writechk(vp, &nd->nl_nch)) == 0)
			error = VOP_ACCESS(vp, flags, nd->nl_cred);

		/*
		 * If the file handle is stale we have to re-resolve the
		 * entry.  This is a hack at the moment.
		 */
		if (error == ESTALE) {
			cache_setunresolved(&nd->nl_nch);
			error = cache_resolve(&nd->nl_nch, nd->nl_cred);
			if (error == 0) {
				vput(vp);
				vp = NULL;
				goto retry;
			}
		}
	}
	vput(vp);
	return (error);
}
2087 
2088 /*
2089  * access_args(char *path, int flags)
2090  *
2091  * Check access permissions.
2092  */
2093 int
2094 sys_access(struct access_args *uap)
2095 {
2096 	struct nlookupdata nd;
2097 	int error;
2098 
2099 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2100 	if (error == 0)
2101 		error = kern_access(&nd, uap->flags);
2102 	nlookup_done(&nd);
2103 	return (error);
2104 }
2105 
2106 int
2107 kern_stat(struct nlookupdata *nd, struct stat *st)
2108 {
2109 	int error;
2110 	struct vnode *vp;
2111 	thread_t td;
2112 
2113 	if ((error = nlookup(nd)) != 0)
2114 		return (error);
2115 again:
2116 	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
2117 		return (ENOENT);
2118 
2119 	td = curthread;
2120 	if ((error = vget(vp, LK_SHARED)) != 0)
2121 		return (error);
2122 	error = vn_stat(vp, st, nd->nl_cred);
2123 
2124 	/*
2125 	 * If the file handle is stale we have to re-resolve the entry.  This
2126 	 * is a hack at the moment.
2127 	 */
2128 	if (error == ESTALE) {
2129 		cache_setunresolved(&nd->nl_nch);
2130 		error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2131 		if (error == 0) {
2132 			vput(vp);
2133 			goto again;
2134 		}
2135 	}
2136 	vput(vp);
2137 	return (error);
2138 }
2139 
2140 /*
2141  * stat_args(char *path, struct stat *ub)
2142  *
2143  * Get file status; this version follows links.
2144  */
2145 int
2146 sys_stat(struct stat_args *uap)
2147 {
2148 	struct nlookupdata nd;
2149 	struct stat st;
2150 	int error;
2151 
2152 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2153 	if (error == 0) {
2154 		error = kern_stat(&nd, &st);
2155 		if (error == 0)
2156 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2157 	}
2158 	nlookup_done(&nd);
2159 	return (error);
2160 }
2161 
2162 /*
2163  * lstat_args(char *path, struct stat *ub)
2164  *
2165  * Get file status; this version does not follow links.
2166  */
2167 int
2168 sys_lstat(struct lstat_args *uap)
2169 {
2170 	struct nlookupdata nd;
2171 	struct stat st;
2172 	int error;
2173 
2174 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2175 	if (error == 0) {
2176 		error = kern_stat(&nd, &st);
2177 		if (error == 0)
2178 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2179 	}
2180 	nlookup_done(&nd);
2181 	return (error);
2182 }
2183 
2184 /*
2185  * pathconf_Args(char *path, int name)
2186  *
2187  * Get configurable pathname variables.
2188  */
2189 /* ARGSUSED */
2190 int
2191 sys_pathconf(struct pathconf_args *uap)
2192 {
2193 	struct nlookupdata nd;
2194 	struct vnode *vp;
2195 	int error;
2196 
2197 	vp = NULL;
2198 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2199 	if (error == 0)
2200 		error = nlookup(&nd);
2201 	if (error == 0)
2202 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
2203 	nlookup_done(&nd);
2204 	if (error == 0) {
2205 		error = VOP_PATHCONF(vp, uap->name, uap->sysmsg_fds);
2206 		vput(vp);
2207 	}
2208 	return (error);
2209 }
2210 
2211 /*
2212  * XXX: daver
2213  * kern_readlink isn't properly split yet.  There is a copyin burried
2214  * in VOP_READLINK().
2215  */
2216 int
2217 kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
2218 {
2219 	struct thread *td = curthread;
2220 	struct proc *p = td->td_proc;
2221 	struct vnode *vp;
2222 	struct iovec aiov;
2223 	struct uio auio;
2224 	int error;
2225 
2226 	if ((error = nlookup(nd)) != 0)
2227 		return (error);
2228 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
2229 	if (error)
2230 		return (error);
2231 	if (vp->v_type != VLNK) {
2232 		error = EINVAL;
2233 	} else {
2234 		aiov.iov_base = buf;
2235 		aiov.iov_len = count;
2236 		auio.uio_iov = &aiov;
2237 		auio.uio_iovcnt = 1;
2238 		auio.uio_offset = 0;
2239 		auio.uio_rw = UIO_READ;
2240 		auio.uio_segflg = UIO_USERSPACE;
2241 		auio.uio_td = td;
2242 		auio.uio_resid = count;
2243 		error = VOP_READLINK(vp, &auio, p->p_ucred);
2244 	}
2245 	vput(vp);
2246 	*res = count - auio.uio_resid;
2247 	return (error);
2248 }
2249 
2250 /*
2251  * readlink_args(char *path, char *buf, int count)
2252  *
2253  * Return target name of a symbolic link.
2254  */
2255 int
2256 sys_readlink(struct readlink_args *uap)
2257 {
2258 	struct nlookupdata nd;
2259 	int error;
2260 
2261 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2262 	if (error == 0) {
2263 		error = kern_readlink(&nd, uap->buf, uap->count,
2264 					&uap->sysmsg_result);
2265 	}
2266 	nlookup_done(&nd);
2267 	return (error);
2268 }
2269 
2270 static int
2271 setfflags(struct vnode *vp, int flags)
2272 {
2273 	struct thread *td = curthread;
2274 	struct proc *p = td->td_proc;
2275 	int error;
2276 	struct vattr vattr;
2277 
2278 	/*
2279 	 * Prevent non-root users from setting flags on devices.  When
2280 	 * a device is reused, users can retain ownership of the device
2281 	 * if they are allowed to set flags and programs assume that
2282 	 * chown can't fail when done as root.
2283 	 */
2284 	if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
2285 	    ((error = suser_cred(p->p_ucred, PRISON_ROOT)) != 0))
2286 		return (error);
2287 
2288 	/*
2289 	 * note: vget is required for any operation that might mod the vnode
2290 	 * so VINACTIVE is properly cleared.
2291 	 */
2292 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2293 		VATTR_NULL(&vattr);
2294 		vattr.va_flags = flags;
2295 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2296 		vput(vp);
2297 	}
2298 	return (error);
2299 }
2300 
2301 /*
2302  * chflags(char *path, int flags)
2303  *
2304  * Change flags of a file given a path name.
2305  */
2306 /* ARGSUSED */
2307 int
2308 sys_chflags(struct chflags_args *uap)
2309 {
2310 	struct nlookupdata nd;
2311 	struct vnode *vp;
2312 	int error;
2313 
2314 	vp = NULL;
2315 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2316 	/* XXX Add NLC flag indicating modifying operation? */
2317 	if (error == 0)
2318 		error = nlookup(&nd);
2319 	if (error == 0)
2320 		error = ncp_writechk(&nd.nl_nch);
2321 	if (error == 0)
2322 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
2323 	nlookup_done(&nd);
2324 	if (error == 0) {
2325 		error = setfflags(vp, uap->flags);
2326 		vrele(vp);
2327 	}
2328 	return (error);
2329 }
2330 
2331 /*
2332  * fchflags_args(int fd, int flags)
2333  *
2334  * Change flags of a file given a file descriptor.
2335  */
2336 /* ARGSUSED */
2337 int
2338 sys_fchflags(struct fchflags_args *uap)
2339 {
2340 	struct thread *td = curthread;
2341 	struct proc *p = td->td_proc;
2342 	struct file *fp;
2343 	int error;
2344 
2345 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2346 		return (error);
2347 	if (fp->f_nchandle.ncp)
2348 		error = ncp_writechk(&fp->f_nchandle);
2349 	if (error == 0)
2350 		error = setfflags((struct vnode *) fp->f_data, uap->flags);
2351 	fdrop(fp);
2352 	return (error);
2353 }
2354 
2355 static int
2356 setfmode(struct vnode *vp, int mode)
2357 {
2358 	struct thread *td = curthread;
2359 	struct proc *p = td->td_proc;
2360 	int error;
2361 	struct vattr vattr;
2362 
2363 	/*
2364 	 * note: vget is required for any operation that might mod the vnode
2365 	 * so VINACTIVE is properly cleared.
2366 	 */
2367 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2368 		VATTR_NULL(&vattr);
2369 		vattr.va_mode = mode & ALLPERMS;
2370 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2371 		vput(vp);
2372 	}
2373 	return error;
2374 }
2375 
2376 int
2377 kern_chmod(struct nlookupdata *nd, int mode)
2378 {
2379 	struct vnode *vp;
2380 	int error;
2381 
2382 	/* XXX Add NLC flag indicating modifying operation? */
2383 	if ((error = nlookup(nd)) != 0)
2384 		return (error);
2385 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2386 		return (error);
2387 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
2388 		error = setfmode(vp, mode);
2389 	vrele(vp);
2390 	return (error);
2391 }
2392 
2393 /*
2394  * chmod_args(char *path, int mode)
2395  *
2396  * Change mode of a file given path name.
2397  */
2398 /* ARGSUSED */
2399 int
2400 sys_chmod(struct chmod_args *uap)
2401 {
2402 	struct nlookupdata nd;
2403 	int error;
2404 
2405 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2406 	if (error == 0)
2407 		error = kern_chmod(&nd, uap->mode);
2408 	nlookup_done(&nd);
2409 	return (error);
2410 }
2411 
2412 /*
2413  * lchmod_args(char *path, int mode)
2414  *
2415  * Change mode of a file given path name (don't follow links.)
2416  */
2417 /* ARGSUSED */
2418 int
2419 sys_lchmod(struct lchmod_args *uap)
2420 {
2421 	struct nlookupdata nd;
2422 	int error;
2423 
2424 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2425 	if (error == 0)
2426 		error = kern_chmod(&nd, uap->mode);
2427 	nlookup_done(&nd);
2428 	return (error);
2429 }
2430 
2431 /*
2432  * fchmod_args(int fd, int mode)
2433  *
2434  * Change mode of a file given a file descriptor.
2435  */
2436 /* ARGSUSED */
2437 int
2438 sys_fchmod(struct fchmod_args *uap)
2439 {
2440 	struct thread *td = curthread;
2441 	struct proc *p = td->td_proc;
2442 	struct file *fp;
2443 	int error;
2444 
2445 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2446 		return (error);
2447 	if (fp->f_nchandle.ncp)
2448 		error = ncp_writechk(&fp->f_nchandle);
2449 	if (error == 0)
2450 		error = setfmode((struct vnode *)fp->f_data, uap->mode);
2451 	fdrop(fp);
2452 	return (error);
2453 }
2454 
2455 static int
2456 setfown(struct vnode *vp, uid_t uid, gid_t gid)
2457 {
2458 	struct thread *td = curthread;
2459 	struct proc *p = td->td_proc;
2460 	int error;
2461 	struct vattr vattr;
2462 
2463 	/*
2464 	 * note: vget is required for any operation that might mod the vnode
2465 	 * so VINACTIVE is properly cleared.
2466 	 */
2467 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2468 		VATTR_NULL(&vattr);
2469 		vattr.va_uid = uid;
2470 		vattr.va_gid = gid;
2471 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2472 		vput(vp);
2473 	}
2474 	return error;
2475 }
2476 
2477 int
2478 kern_chown(struct nlookupdata *nd, int uid, int gid)
2479 {
2480 	struct vnode *vp;
2481 	int error;
2482 
2483 	/* XXX Add NLC flag indicating modifying operation? */
2484 	if ((error = nlookup(nd)) != 0)
2485 		return (error);
2486 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2487 		return (error);
2488 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
2489 		error = setfown(vp, uid, gid);
2490 	vrele(vp);
2491 	return (error);
2492 }
2493 
2494 /*
2495  * chown(char *path, int uid, int gid)
2496  *
2497  * Set ownership given a path name.
2498  */
2499 int
2500 sys_chown(struct chown_args *uap)
2501 {
2502 	struct nlookupdata nd;
2503 	int error;
2504 
2505 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2506 	if (error == 0)
2507 		error = kern_chown(&nd, uap->uid, uap->gid);
2508 	nlookup_done(&nd);
2509 	return (error);
2510 }
2511 
2512 /*
2513  * lchown_args(char *path, int uid, int gid)
2514  *
2515  * Set ownership given a path name, do not cross symlinks.
2516  */
2517 int
2518 sys_lchown(struct lchown_args *uap)
2519 {
2520 	struct nlookupdata nd;
2521 	int error;
2522 
2523 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2524 	if (error == 0)
2525 		error = kern_chown(&nd, uap->uid, uap->gid);
2526 	nlookup_done(&nd);
2527 	return (error);
2528 }
2529 
2530 /*
2531  * fchown_args(int fd, int uid, int gid)
2532  *
2533  * Set ownership given a file descriptor.
2534  */
2535 /* ARGSUSED */
2536 int
2537 sys_fchown(struct fchown_args *uap)
2538 {
2539 	struct thread *td = curthread;
2540 	struct proc *p = td->td_proc;
2541 	struct file *fp;
2542 	int error;
2543 
2544 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2545 		return (error);
2546 	if (fp->f_nchandle.ncp)
2547 		error = ncp_writechk(&fp->f_nchandle);
2548 	if (error == 0)
2549 		error = setfown((struct vnode *)fp->f_data, uap->uid, uap->gid);
2550 	fdrop(fp);
2551 	return (error);
2552 }
2553 
2554 static int
2555 getutimes(const struct timeval *tvp, struct timespec *tsp)
2556 {
2557 	struct timeval tv[2];
2558 
2559 	if (tvp == NULL) {
2560 		microtime(&tv[0]);
2561 		TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
2562 		tsp[1] = tsp[0];
2563 	} else {
2564 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
2565 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
2566 	}
2567 	return 0;
2568 }
2569 
2570 static int
2571 setutimes(struct vnode *vp, const struct timespec *ts, int nullflag)
2572 {
2573 	struct thread *td = curthread;
2574 	struct proc *p = td->td_proc;
2575 	int error;
2576 	struct vattr vattr;
2577 
2578 	/*
2579 	 * note: vget is required for any operation that might mod the vnode
2580 	 * so VINACTIVE is properly cleared.
2581 	 */
2582 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2583 		VATTR_NULL(&vattr);
2584 		vattr.va_atime = ts[0];
2585 		vattr.va_mtime = ts[1];
2586 		if (nullflag)
2587 			vattr.va_vaflags |= VA_UTIMES_NULL;
2588 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2589 		vput(vp);
2590 	}
2591 	return error;
2592 }
2593 
2594 int
2595 kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
2596 {
2597 	struct timespec ts[2];
2598 	struct vnode *vp;
2599 	int error;
2600 
2601 	if ((error = getutimes(tptr, ts)) != 0)
2602 		return (error);
2603 	/* XXX Add NLC flag indicating modifying operation? */
2604 	if ((error = nlookup(nd)) != 0)
2605 		return (error);
2606 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2607 		return (error);
2608 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2609 		return (error);
2610 	error = setutimes(vp, ts, tptr == NULL);
2611 	vrele(vp);
2612 	return (error);
2613 }
2614 
2615 /*
2616  * utimes_args(char *path, struct timeval *tptr)
2617  *
2618  * Set the access and modification times of a file.
2619  */
2620 int
2621 sys_utimes(struct utimes_args *uap)
2622 {
2623 	struct timeval tv[2];
2624 	struct nlookupdata nd;
2625 	int error;
2626 
2627 	if (uap->tptr) {
2628  		error = copyin(uap->tptr, tv, sizeof(tv));
2629 		if (error)
2630 			return (error);
2631 	}
2632 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2633 	if (error == 0)
2634 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
2635 	nlookup_done(&nd);
2636 	return (error);
2637 }
2638 
2639 /*
2640  * lutimes_args(char *path, struct timeval *tptr)
2641  *
2642  * Set the access and modification times of a file.
2643  */
2644 int
2645 sys_lutimes(struct lutimes_args *uap)
2646 {
2647 	struct timeval tv[2];
2648 	struct nlookupdata nd;
2649 	int error;
2650 
2651 	if (uap->tptr) {
2652 		error = copyin(uap->tptr, tv, sizeof(tv));
2653 		if (error)
2654 			return (error);
2655 	}
2656 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2657 	if (error == 0)
2658 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
2659 	nlookup_done(&nd);
2660 	return (error);
2661 }
2662 
2663 int
2664 kern_futimes(int fd, struct timeval *tptr)
2665 {
2666 	struct thread *td = curthread;
2667 	struct proc *p = td->td_proc;
2668 	struct timespec ts[2];
2669 	struct file *fp;
2670 	int error;
2671 
2672 	error = getutimes(tptr, ts);
2673 	if (error)
2674 		return (error);
2675 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
2676 		return (error);
2677 	if (fp->f_nchandle.ncp)
2678 		error = ncp_writechk(&fp->f_nchandle);
2679 	if (error == 0)
2680 		error =  setutimes((struct vnode *)fp->f_data, ts, tptr == NULL);
2681 	fdrop(fp);
2682 	return (error);
2683 }
2684 
2685 /*
2686  * futimes_args(int fd, struct timeval *tptr)
2687  *
2688  * Set the access and modification times of a file.
2689  */
2690 int
2691 sys_futimes(struct futimes_args *uap)
2692 {
2693 	struct timeval tv[2];
2694 	int error;
2695 
2696 	if (uap->tptr) {
2697 		error = copyin(uap->tptr, tv, sizeof(tv));
2698 		if (error)
2699 			return (error);
2700 	}
2701 
2702 	error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
2703 
2704 	return (error);
2705 }
2706 
/*
 * Truncate (or extend) the file named by the already-initialized
 * nlookupdata to the given length.  The caller owns nd and must call
 * nlookup_done() regardless of the return value.  Directories cannot
 * be truncated.
 */
int
kern_truncate(struct nlookupdata *nd, off_t length)
{
	struct vnode *vp;
	struct vattr vattr;
	int error;

	if (length < 0)
		return(EINVAL);
	/* XXX Add NLC flag indicating modifying operation? */
	if ((error = nlookup(nd)) != 0)
		return (error);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
		return (error);
	if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) != 0) {
		vrele(vp);
		return (error);
	}
	if (vp->v_type == VDIR) {
		error = EISDIR;
	} else if ((error = vn_writechk(vp, &nd->nl_nch)) == 0 &&
	    (error = VOP_ACCESS(vp, VWRITE, nd->nl_cred)) == 0) {
		/* writable and accessible: set the new size */
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
	}
	/* vput() drops both the lock and the cache_vref() reference */
	vput(vp);
	return (error);
}
2738 
2739 /*
2740  * truncate(char *path, int pad, off_t length)
2741  *
2742  * Truncate a file given its path name.
2743  */
2744 int
2745 sys_truncate(struct truncate_args *uap)
2746 {
2747 	struct nlookupdata nd;
2748 	int error;
2749 
2750 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2751 	if (error == 0)
2752 		error = kern_truncate(&nd, uap->length);
2753 	nlookup_done(&nd);
2754 	return error;
2755 }
2756 
/*
 * Truncate (or extend) an open file to the given length.  The
 * descriptor must be open for writing; directories cannot be
 * truncated.
 */
int
kern_ftruncate(int fd, off_t length)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vattr vattr;
	struct vnode *vp;
	struct file *fp;
	int error;

	if (length < 0)
		return(EINVAL);
	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	/*
	 * If the descriptor has a namecache handle, verify the mount
	 * point is writable before modifying anything.
	 */
	if (fp->f_nchandle.ncp) {
		error = ncp_writechk(&fp->f_nchandle);
		if (error)
			goto done;
	}
	if ((fp->f_flag & FWRITE) == 0) {
		error = EINVAL;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR) {
		error = EISDIR;
	} else if ((error = vn_writechk(vp, NULL)) == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
	vn_unlock(vp);
done:
	/* drop the hold acquired by holdvnode() on all paths */
	fdrop(fp);
	return (error);
}
2794 
2795 /*
2796  * ftruncate_args(int fd, int pad, off_t length)
2797  *
2798  * Truncate a file given a file descriptor.
2799  */
2800 int
2801 sys_ftruncate(struct ftruncate_args *uap)
2802 {
2803 	int error;
2804 
2805 	error = kern_ftruncate(uap->fd, uap->length);
2806 
2807 	return (error);
2808 }
2809 
2810 /*
2811  * fsync(int fd)
2812  *
2813  * Sync an open file.
2814  */
2815 /* ARGSUSED */
2816 int
2817 sys_fsync(struct fsync_args *uap)
2818 {
2819 	struct thread *td = curthread;
2820 	struct proc *p = td->td_proc;
2821 	struct vnode *vp;
2822 	struct file *fp;
2823 	vm_object_t obj;
2824 	int error;
2825 
2826 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2827 		return (error);
2828 	vp = (struct vnode *)fp->f_data;
2829 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2830 	if ((obj = vp->v_object) != NULL)
2831 		vm_object_page_clean(obj, 0, 0, 0);
2832 	if ((error = VOP_FSYNC(vp, MNT_WAIT)) == 0 &&
2833 	    vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) &&
2834 	    bioops.io_fsync) {
2835 		error = (*bioops.io_fsync)(vp);
2836 	}
2837 	vn_unlock(vp);
2838 	fdrop(fp);
2839 	return (error);
2840 }
2841 
/*
 * Rename the file named by fromnd to the name described by tond.  Both
 * nlookupdata structures must have been initialized by the caller,
 * which also remains responsible for calling nlookup_done() on each.
 * The source and target must reside on the same filesystem; mount
 * points may not be renamed or overwritten.
 */
int
kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
{
	struct nchandle fnchd;
	struct nchandle tnchd;
	struct namecache *ncp;
	struct mount *mp;
	int error;

	bwillwrite();
	if ((error = nlookup(fromnd)) != 0)
		return (error);
	if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
		return (ENOENT);
	fnchd.mount = fromnd->nl_nch.mount;
	cache_hold(&fnchd);

	/*
	 * unlock the source nch so we can lookup the target nch without
	 * deadlocking.  The target may or may not exist so we do not check
	 * for a target vp like kern_mkdir() and other creation functions do.
	 *
	 * The source and target directories are ref'd and rechecked after
	 * everything is relocked to determine if the source or target file
	 * has been renamed.
	 */
	KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
	fromnd->nl_flags &= ~NLC_NCPISLOCKED;
	cache_unlock(&fromnd->nl_nch);

	tond->nl_flags |= NLC_CREATE;
	if ((error = nlookup(tond)) != 0) {
		cache_drop(&fnchd);
		return (error);
	}
	if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
		cache_drop(&fnchd);
		return (ENOENT);
	}
	tnchd.mount = tond->nl_nch.mount;
	cache_hold(&tnchd);

	/*
	 * If the source and target are the same there is nothing to do
	 */
	if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (0);
	}

	/*
	 * Mount points cannot be renamed or overwritten
	 */
	if ((fromnd->nl_nch.ncp->nc_flag | tond->nl_nch.ncp->nc_flag) &
	    NCF_ISMOUNTPT
	) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EINVAL);
	}

	/*
	 * relock the source ncp.  NOTE AFTER RELOCKING: the source ncp
	 * may have become invalid while it was unlocked, nc_vp and nc_mount
	 * could be NULL.
	 */
	if (cache_lock_nonblock(&fromnd->nl_nch) == 0) {
		cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
	} else if (fromnd->nl_nch.ncp > tond->nl_nch.ncp) {
		/* address order already safe, take the source lock */
		cache_lock(&fromnd->nl_nch);
		cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
	} else {
		/* drop the target lock and reacquire both in safe order */
		cache_unlock(&tond->nl_nch);
		cache_lock(&fromnd->nl_nch);
		cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
		cache_lock(&tond->nl_nch);
		cache_resolve(&tond->nl_nch, tond->nl_cred);
	}
	fromnd->nl_flags |= NLC_NCPISLOCKED;

	/*
	 * make sure the parent directories linkages are the same
	 */
	if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent ||
	    tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (ENOENT);
	}

	/*
	 * Both the source and target must be within the same filesystem and
	 * in the same filesystem as their parent directories within the
	 * namecache topology.
	 *
	 * NOTE: fromnd's nc_mount or nc_vp could be NULL.
	 */
	mp = fnchd.mount;
	if (mp != tnchd.mount || mp != fromnd->nl_nch.mount ||
	    mp != tond->nl_nch.mount) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EXDEV);
	}

	/*
	 * Make sure the mount point is writable
	 */
	if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (error);
	}

	/*
	 * If the target exists and either the source or target is a directory,
	 * then both must be directories.
	 *
	 * Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp might
	 * have become NULL.
	 */
	if (tond->nl_nch.ncp->nc_vp) {
		if (fromnd->nl_nch.ncp->nc_vp == NULL) {
			error = ENOENT;
		} else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
			if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
				error = ENOTDIR;
		} else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
			error = EISDIR;
		}
	}

	/*
	 * You cannot rename a source into itself or a subdirectory of itself.
	 * We check this by traversing the target directory upwards looking
	 * for a match against the source.
	 */
	if (error == 0) {
		for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
			if (fromnd->nl_nch.ncp == ncp) {
				error = EINVAL;
				break;
			}
		}
	}

	cache_drop(&fnchd);
	cache_drop(&tnchd);

	/*
	 * Even though the namespaces are different, they may still represent
	 * hardlinks to the same file.  The filesystem might have a hard time
	 * with this so we issue a NREMOVE of the source instead of a NRENAME
	 * when we detect the situation.
	 */
	if (error == 0) {
		if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
			error = VOP_NREMOVE(&fromnd->nl_nch, fromnd->nl_cred);
		} else {
			error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
					    tond->nl_cred);
		}
	}
	return (error);
}
3008 
3009 /*
3010  * rename_args(char *from, char *to)
3011  *
3012  * Rename files.  Source and destination must either both be directories,
3013  * or both not be directories.  If target is a directory, it must be empty.
3014  */
3015 int
3016 sys_rename(struct rename_args *uap)
3017 {
3018 	struct nlookupdata fromnd, tond;
3019 	int error;
3020 
3021 	error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
3022 	if (error == 0) {
3023 		error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
3024 		if (error == 0)
3025 			error = kern_rename(&fromnd, &tond);
3026 		nlookup_done(&tond);
3027 	}
3028 	nlookup_done(&fromnd);
3029 	return (error);
3030 }
3031 
3032 int
3033 kern_mkdir(struct nlookupdata *nd, int mode)
3034 {
3035 	struct thread *td = curthread;
3036 	struct proc *p = td->td_proc;
3037 	struct vnode *vp;
3038 	struct vattr vattr;
3039 	int error;
3040 
3041 	bwillwrite();
3042 	nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE;
3043 	if ((error = nlookup(nd)) != 0)
3044 		return (error);
3045 
3046 	if (nd->nl_nch.ncp->nc_vp)
3047 		return (EEXIST);
3048 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3049 		return (error);
3050 
3051 	VATTR_NULL(&vattr);
3052 	vattr.va_type = VDIR;
3053 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;
3054 
3055 	vp = NULL;
3056 	error = VOP_NMKDIR(&nd->nl_nch, &vp, p->p_ucred, &vattr);
3057 	if (error == 0)
3058 		vput(vp);
3059 	return (error);
3060 }
3061 
3062 /*
3063  * mkdir_args(char *path, int mode)
3064  *
3065  * Make a directory file.
3066  */
3067 /* ARGSUSED */
3068 int
3069 sys_mkdir(struct mkdir_args *uap)
3070 {
3071 	struct nlookupdata nd;
3072 	int error;
3073 
3074 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3075 	if (error == 0)
3076 		error = kern_mkdir(&nd, uap->mode);
3077 	nlookup_done(&nd);
3078 	return (error);
3079 }
3080 
3081 int
3082 kern_rmdir(struct nlookupdata *nd)
3083 {
3084 	int error;
3085 
3086 	bwillwrite();
3087 	nd->nl_flags |= NLC_DELETE;
3088 	if ((error = nlookup(nd)) != 0)
3089 		return (error);
3090 
3091 	/*
3092 	 * Do not allow directories representing mount points to be
3093 	 * deleted, even if empty.  Check write perms on mount point
3094 	 * in case the vnode is aliased (aka nullfs).
3095 	 */
3096 	if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
3097 		return (EINVAL);
3098 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3099 		return (error);
3100 
3101 	error = VOP_NRMDIR(&nd->nl_nch, nd->nl_cred);
3102 	return (error);
3103 }
3104 
3105 /*
3106  * rmdir_args(char *path)
3107  *
3108  * Remove a directory file.
3109  */
3110 /* ARGSUSED */
3111 int
3112 sys_rmdir(struct rmdir_args *uap)
3113 {
3114 	struct nlookupdata nd;
3115 	int error;
3116 
3117 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3118 	if (error == 0)
3119 		error = kern_rmdir(&nd);
3120 	nlookup_done(&nd);
3121 	return (error);
3122 }
3123 
/*
 * Common code for getdirentries() and getdents().  Reads directory
 * entries from the descriptor's current seek position into buf and
 * advances f_offset.  *basep (if non-NULL) receives the offset the
 * read started at; *res receives the number of bytes produced.
 * direction selects user vs. system space for buf.
 */
int
kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
    enum uio_seg direction)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	long loff;
	int error, eofflag;

	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		error = EBADF;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;
unionread:
	if (vp->v_type != VDIR) {
		error = EINVAL;
		goto done;
	}
	/* build a single-segment uio describing the caller's buffer */
	aiov.iov_base = buf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = direction;
	auio.uio_td = td;
	auio.uio_resid = count;
	loff = auio.uio_offset = fp->f_offset;
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
	fp->f_offset = auio.uio_offset;
	if (error)
		goto done;
	if (count == auio.uio_resid) {
		/*
		 * Nothing was transferred.  Give the union filesystem
		 * hook a chance to switch vp to an underlying directory;
		 * a -1 return means "retry the read on the new vp".
		 */
		if (union_dircheckp) {
			error = union_dircheckp(td, &vp, fp);
			if (error == -1)
				goto unionread;
			if (error)
				goto done;
		}
#if 0
		if ((vp->v_flag & VROOT) &&
		    (vp->v_mount->mnt_flag & MNT_UNION)) {
			struct vnode *tvp = vp;
			vp = vp->v_mount->mnt_vnodecovered;
			vref(vp);
			fp->f_data = vp;
			fp->f_offset = 0;
			vrele(tvp);
			goto unionread;
		}
#endif
	}
	if (basep) {
		*basep = loff;
	}
	*res = count - auio.uio_resid;
done:
	fdrop(fp);
	return (error);
}
3191 
3192 /*
3193  * getdirentries_args(int fd, char *buf, u_int conut, long *basep)
3194  *
3195  * Read a block of directory entries in a file system independent format.
3196  */
3197 int
3198 sys_getdirentries(struct getdirentries_args *uap)
3199 {
3200 	long base;
3201 	int error;
3202 
3203 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
3204 	    &uap->sysmsg_result, UIO_USERSPACE);
3205 
3206 	if (error == 0)
3207 		error = copyout(&base, uap->basep, sizeof(*uap->basep));
3208 	return (error);
3209 }
3210 
3211 /*
3212  * getdents_args(int fd, char *buf, size_t count)
3213  */
3214 int
3215 sys_getdents(struct getdents_args *uap)
3216 {
3217 	int error;
3218 
3219 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
3220 	    &uap->sysmsg_result, UIO_USERSPACE);
3221 
3222 	return (error);
3223 }
3224 
3225 /*
3226  * umask(int newmask)
3227  *
3228  * Set the mode mask for creation of filesystem nodes.
3229  *
3230  * MP SAFE
3231  */
3232 int
3233 sys_umask(struct umask_args *uap)
3234 {
3235 	struct thread *td = curthread;
3236 	struct proc *p = td->td_proc;
3237 	struct filedesc *fdp;
3238 
3239 	fdp = p->p_fd;
3240 	uap->sysmsg_result = fdp->fd_cmask;
3241 	fdp->fd_cmask = uap->newmask & ALLPERMS;
3242 	return (0);
3243 }
3244 
3245 /*
3246  * revoke(char *path)
3247  *
3248  * Void all references to file by ripping underlying filesystem
3249  * away from vnode.
3250  */
3251 /* ARGSUSED */
3252 int
3253 sys_revoke(struct revoke_args *uap)
3254 {
3255 	struct nlookupdata nd;
3256 	struct vattr vattr;
3257 	struct vnode *vp;
3258 	struct ucred *cred;
3259 	int error;
3260 
3261 	vp = NULL;
3262 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3263 	if (error == 0)
3264 		error = nlookup(&nd);
3265 	if (error == 0)
3266 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3267 	cred = crhold(nd.nl_cred);
3268 	nlookup_done(&nd);
3269 	if (error == 0) {
3270 		if (vp->v_type != VCHR && vp->v_type != VBLK)
3271 			error = EINVAL;
3272 		if (error == 0)
3273 			error = VOP_GETATTR(vp, &vattr);
3274 		if (error == 0 && cred->cr_uid != vattr.va_uid)
3275 			error = suser_cred(cred, PRISON_ROOT);
3276 		if (error == 0 && count_udev(vp->v_umajor, vp->v_uminor) > 0) {
3277 			error = 0;
3278 			vx_lock(vp);
3279 			VOP_REVOKE(vp, REVOKEALL);
3280 			vx_unlock(vp);
3281 		}
3282 		vrele(vp);
3283 	}
3284 	if (cred)
3285 		crfree(cred);
3286 	return (error);
3287 }
3288 
3289 /*
3290  * getfh_args(char *fname, fhandle_t *fhp)
3291  *
3292  * Get (NFS) file handle
3293  */
3294 int
3295 sys_getfh(struct getfh_args *uap)
3296 {
3297 	struct thread *td = curthread;
3298 	struct nlookupdata nd;
3299 	fhandle_t fh;
3300 	struct vnode *vp;
3301 	int error;
3302 
3303 	/*
3304 	 * Must be super user
3305 	 */
3306 	if ((error = suser(td)) != 0)
3307 		return (error);
3308 
3309 	vp = NULL;
3310 	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
3311 	if (error == 0)
3312 		error = nlookup(&nd);
3313 	if (error == 0)
3314 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3315 	nlookup_done(&nd);
3316 	if (error == 0) {
3317 		bzero(&fh, sizeof(fh));
3318 		fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
3319 		error = VFS_VPTOFH(vp, &fh.fh_fid);
3320 		vput(vp);
3321 		if (error == 0)
3322 			error = copyout(&fh, uap->fhp, sizeof(fh));
3323 	}
3324 	return (error);
3325 }
3326 
3327 /*
3328  * fhopen_args(const struct fhandle *u_fhp, int flags)
3329  *
3330  * syscall for the rpc.lockd to use to translate a NFS file handle into
3331  * an open descriptor.
3332  *
3333  * warning: do not remove the suser() call or this becomes one giant
3334  * security hole.
3335  */
3336 int
3337 sys_fhopen(struct fhopen_args *uap)
3338 {
3339 	struct thread *td = curthread;
3340 	struct proc *p = td->td_proc;
3341 	struct mount *mp;
3342 	struct vnode *vp;
3343 	struct fhandle fhp;
3344 	struct vattr vat;
3345 	struct vattr *vap = &vat;
3346 	struct flock lf;
3347 	int fmode, mode, error, type;
3348 	struct file *nfp;
3349 	struct file *fp;
3350 	int indx;
3351 
3352 	/*
3353 	 * Must be super user
3354 	 */
3355 	error = suser(td);
3356 	if (error)
3357 		return (error);
3358 
3359 	fmode = FFLAGS(uap->flags);
3360 	/* why not allow a non-read/write open for our lockd? */
3361 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
3362 		return (EINVAL);
3363 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
3364 	if (error)
3365 		return(error);
3366 	/* find the mount point */
3367 	mp = vfs_getvfs(&fhp.fh_fsid);
3368 	if (mp == NULL)
3369 		return (ESTALE);
3370 	/* now give me my vnode, it gets returned to me locked */
3371 	error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp);
3372 	if (error)
3373 		return (error);
3374  	/*
3375 	 * from now on we have to make sure not
3376 	 * to forget about the vnode
3377 	 * any error that causes an abort must vput(vp)
3378 	 * just set error = err and 'goto bad;'.
3379 	 */
3380 
3381 	/*
3382 	 * from vn_open
3383 	 */
3384 	if (vp->v_type == VLNK) {
3385 		error = EMLINK;
3386 		goto bad;
3387 	}
3388 	if (vp->v_type == VSOCK) {
3389 		error = EOPNOTSUPP;
3390 		goto bad;
3391 	}
3392 	mode = 0;
3393 	if (fmode & (FWRITE | O_TRUNC)) {
3394 		if (vp->v_type == VDIR) {
3395 			error = EISDIR;
3396 			goto bad;
3397 		}
3398 		error = vn_writechk(vp, NULL);
3399 		if (error)
3400 			goto bad;
3401 		mode |= VWRITE;
3402 	}
3403 	if (fmode & FREAD)
3404 		mode |= VREAD;
3405 	if (mode) {
3406 		error = VOP_ACCESS(vp, mode, p->p_ucred);
3407 		if (error)
3408 			goto bad;
3409 	}
3410 	if (fmode & O_TRUNC) {
3411 		vn_unlock(vp);				/* XXX */
3412 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
3413 		VATTR_NULL(vap);
3414 		vap->va_size = 0;
3415 		error = VOP_SETATTR(vp, vap, p->p_ucred);
3416 		if (error)
3417 			goto bad;
3418 	}
3419 
3420 	/*
3421 	 * VOP_OPEN needs the file pointer so it can potentially override
3422 	 * it.
3423 	 *
3424 	 * WARNING! no f_nchandle will be associated when fhopen()ing a
3425 	 * directory.  XXX
3426 	 */
3427 	if ((error = falloc(p, &nfp, &indx)) != 0)
3428 		goto bad;
3429 	fp = nfp;
3430 
3431 	error = VOP_OPEN(vp, fmode, p->p_ucred, fp);
3432 	if (error) {
3433 		/*
3434 		 * setting f_ops this way prevents VOP_CLOSE from being
3435 		 * called or fdrop() releasing the vp from v_data.   Since
3436 		 * the VOP_OPEN failed we don't want to VOP_CLOSE.
3437 		 */
3438 		fp->f_ops = &badfileops;
3439 		fp->f_data = NULL;
3440 		goto bad_drop;
3441 	}
3442 
3443 	/*
3444 	 * The fp is given its own reference, we still have our ref and lock.
3445 	 *
3446 	 * Assert that all regular files must be created with a VM object.
3447 	 */
3448 	if (vp->v_type == VREG && vp->v_object == NULL) {
3449 		kprintf("fhopen: regular file did not have VM object: %p\n", vp);
3450 		goto bad_drop;
3451 	}
3452 
3453 	/*
3454 	 * The open was successful.  Handle any locking requirements.
3455 	 */
3456 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
3457 		lf.l_whence = SEEK_SET;
3458 		lf.l_start = 0;
3459 		lf.l_len = 0;
3460 		if (fmode & O_EXLOCK)
3461 			lf.l_type = F_WRLCK;
3462 		else
3463 			lf.l_type = F_RDLCK;
3464 		if (fmode & FNONBLOCK)
3465 			type = 0;
3466 		else
3467 			type = F_WAIT;
3468 		vn_unlock(vp);
3469 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
3470 			/*
3471 			 * release our private reference.
3472 			 */
3473 			fsetfd(p, NULL, indx);
3474 			fdrop(fp);
3475 			vrele(vp);
3476 			return (error);
3477 		}
3478 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3479 		fp->f_flag |= FHASLOCK;
3480 	}
3481 
3482 	/*
3483 	 * Clean up.  Associate the file pointer with the previously
3484 	 * reserved descriptor and return it.
3485 	 */
3486 	vput(vp);
3487 	fsetfd(p, fp, indx);
3488 	fdrop(fp);
3489 	uap->sysmsg_result = indx;
3490 	return (0);
3491 
3492 bad_drop:
3493 	fsetfd(p, NULL, indx);
3494 	fdrop(fp);
3495 bad:
3496 	vput(vp);
3497 	return (error);
3498 }
3499 
3500 /*
3501  * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
3502  */
3503 int
3504 sys_fhstat(struct fhstat_args *uap)
3505 {
3506 	struct thread *td = curthread;
3507 	struct stat sb;
3508 	fhandle_t fh;
3509 	struct mount *mp;
3510 	struct vnode *vp;
3511 	int error;
3512 
3513 	/*
3514 	 * Must be super user
3515 	 */
3516 	error = suser(td);
3517 	if (error)
3518 		return (error);
3519 
3520 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
3521 	if (error)
3522 		return (error);
3523 
3524 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
3525 		return (ESTALE);
3526 	if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
3527 		return (error);
3528 	error = vn_stat(vp, &sb, td->td_proc->p_ucred);
3529 	vput(vp);
3530 	if (error)
3531 		return (error);
3532 	error = copyout(&sb, uap->sb, sizeof(sb));
3533 	return (error);
3534 }
3535 
3536 /*
3537  * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
3538  */
3539 int
3540 sys_fhstatfs(struct fhstatfs_args *uap)
3541 {
3542 	struct thread *td = curthread;
3543 	struct proc *p = td->td_proc;
3544 	struct statfs *sp;
3545 	struct mount *mp;
3546 	struct vnode *vp;
3547 	struct statfs sb;
3548 	char *fullpath, *freepath;
3549 	fhandle_t fh;
3550 	int error;
3551 
3552 	/*
3553 	 * Must be super user
3554 	 */
3555 	if ((error = suser(td)))
3556 		return (error);
3557 
3558 	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
3559 		return (error);
3560 
3561 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
3562 		return (ESTALE);
3563 
3564 	if (p != NULL && !chroot_visible_mnt(mp, p))
3565 		return (ESTALE);
3566 
3567 	if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
3568 		return (error);
3569 	mp = vp->v_mount;
3570 	sp = &mp->mnt_stat;
3571 	vput(vp);
3572 	if ((error = VFS_STATFS(mp, sp, p->p_ucred)) != 0)
3573 		return (error);
3574 
3575 	error = mount_path(p, mp, &fullpath, &freepath);
3576 	if (error)
3577 		return(error);
3578 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3579 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
3580 	kfree(freepath, M_TEMP);
3581 
3582 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3583 	if (suser(td)) {
3584 		bcopy(sp, &sb, sizeof(sb));
3585 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
3586 		sp = &sb;
3587 	}
3588 	return (copyout(sp, uap->buf, sizeof(*sp)));
3589 }
3590 
3591 /*
3592  * Syscall to push extended attribute configuration information into the
3593  * VFS.  Accepts a path, which it converts to a mountpoint, as well as
3594  * a command (int cmd), and attribute name and misc data.  For now, the
3595  * attribute name is left in userspace for consumption by the VFS_op.
3596  * It will probably be changed to be copied into sysspace by the
3597  * syscall in the future, once issues with various consumers of the
3598  * attribute code have raised their hands.
3599  *
3600  * Currently this is used only by UFS Extended Attributes.
3601  */
3602 int
3603 sys_extattrctl(struct extattrctl_args *uap)
3604 {
3605 	struct nlookupdata nd;
3606 	struct mount *mp;
3607 	struct vnode *vp;
3608 	int error;
3609 
3610 	vp = NULL;
3611 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3612 	if (error == 0)
3613 		error = nlookup(&nd);
3614 	if (error == 0) {
3615 		mp = nd.nl_nch.mount;
3616 		error = VFS_EXTATTRCTL(mp, uap->cmd,
3617 				uap->attrname, uap->arg,
3618 				nd.nl_cred);
3619 	}
3620 	nlookup_done(&nd);
3621 	return (error);
3622 }
3623 
3624 /*
3625  * Syscall to set a named extended attribute on a file or directory.
3626  * Accepts attribute name, and a uio structure pointing to the data to set.
3627  * The uio is consumed in the style of writev().  The real work happens
3628  * in VOP_SETEXTATTR().
3629  */
3630 int
3631 sys_extattr_set_file(struct extattr_set_file_args *uap)
3632 {
3633 	char attrname[EXTATTR_MAXNAMELEN];
3634 	struct iovec aiov[UIO_SMALLIOV];
3635 	struct iovec *needfree;
3636 	struct nlookupdata nd;
3637 	struct iovec *iov;
3638 	struct vnode *vp;
3639 	struct uio auio;
3640 	u_int iovlen;
3641 	u_int cnt;
3642 	int error;
3643 	int i;
3644 
3645 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3646 	if (error)
3647 		return (error);
3648 
3649 	vp = NULL;
3650 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3651 	if (error == 0)
3652 		error = nlookup(&nd);
3653 	if (error == 0)
3654 		error = ncp_writechk(&nd.nl_nch);
3655 	if (error == 0)
3656 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3657 	if (error) {
3658 		nlookup_done(&nd);
3659 		return (error);
3660 	}
3661 
3662 	needfree = NULL;
3663 	iovlen = uap->iovcnt * sizeof(struct iovec);
3664 	if (uap->iovcnt > UIO_SMALLIOV) {
3665 		if (uap->iovcnt > UIO_MAXIOV) {
3666 			error = EINVAL;
3667 			goto done;
3668 		}
3669 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
3670 		needfree = iov;
3671 	} else {
3672 		iov = aiov;
3673 	}
3674 	auio.uio_iov = iov;
3675 	auio.uio_iovcnt = uap->iovcnt;
3676 	auio.uio_rw = UIO_WRITE;
3677 	auio.uio_segflg = UIO_USERSPACE;
3678 	auio.uio_td = nd.nl_td;
3679 	auio.uio_offset = 0;
3680 	if ((error = copyin(uap->iovp, iov, iovlen)))
3681 		goto done;
3682 	auio.uio_resid = 0;
3683 	for (i = 0; i < uap->iovcnt; i++) {
3684 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
3685 			error = EINVAL;
3686 			goto done;
3687 		}
3688 		auio.uio_resid += iov->iov_len;
3689 		iov++;
3690 	}
3691 	cnt = auio.uio_resid;
3692 	error = VOP_SETEXTATTR(vp, attrname, &auio, nd.nl_cred);
3693 	cnt -= auio.uio_resid;
3694 	uap->sysmsg_result = cnt;
3695 done:
3696 	vput(vp);
3697 	nlookup_done(&nd);
3698 	if (needfree)
3699 		FREE(needfree, M_IOV);
3700 	return (error);
3701 }
3702 
3703 /*
3704  * Syscall to get a named extended attribute on a file or directory.
3705  * Accepts attribute name, and a uio structure pointing to a buffer for the
3706  * data.  The uio is consumed in the style of readv().  The real work
3707  * happens in VOP_GETEXTATTR();
3708  */
3709 int
3710 sys_extattr_get_file(struct extattr_get_file_args *uap)
3711 {
3712 	char attrname[EXTATTR_MAXNAMELEN];
3713 	struct iovec aiov[UIO_SMALLIOV];
3714 	struct iovec *needfree;
3715 	struct nlookupdata nd;
3716 	struct iovec *iov;
3717 	struct vnode *vp;
3718 	struct uio auio;
3719 	u_int iovlen;
3720 	u_int cnt;
3721 	int error;
3722 	int i;
3723 
3724 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3725 	if (error)
3726 		return (error);
3727 
3728 	vp = NULL;
3729 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3730 	if (error == 0)
3731 		error = nlookup(&nd);
3732 	if (error == 0)
3733 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3734 	if (error) {
3735 		nlookup_done(&nd);
3736 		return (error);
3737 	}
3738 
3739 	iovlen = uap->iovcnt * sizeof (struct iovec);
3740 	needfree = NULL;
3741 	if (uap->iovcnt > UIO_SMALLIOV) {
3742 		if (uap->iovcnt > UIO_MAXIOV) {
3743 			error = EINVAL;
3744 			goto done;
3745 		}
3746 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
3747 		needfree = iov;
3748 	} else {
3749 		iov = aiov;
3750 	}
3751 	auio.uio_iov = iov;
3752 	auio.uio_iovcnt = uap->iovcnt;
3753 	auio.uio_rw = UIO_READ;
3754 	auio.uio_segflg = UIO_USERSPACE;
3755 	auio.uio_td = nd.nl_td;
3756 	auio.uio_offset = 0;
3757 	if ((error = copyin(uap->iovp, iov, iovlen)))
3758 		goto done;
3759 	auio.uio_resid = 0;
3760 	for (i = 0; i < uap->iovcnt; i++) {
3761 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
3762 			error = EINVAL;
3763 			goto done;
3764 		}
3765 		auio.uio_resid += iov->iov_len;
3766 		iov++;
3767 	}
3768 	cnt = auio.uio_resid;
3769 	error = VOP_GETEXTATTR(vp, attrname, &auio, nd.nl_cred);
3770 	cnt -= auio.uio_resid;
3771 	uap->sysmsg_result = cnt;
3772 done:
3773 	vput(vp);
3774 	nlookup_done(&nd);
3775 	if (needfree)
3776 		FREE(needfree, M_IOV);
3777 	return(error);
3778 }
3779 
3780 /*
3781  * Syscall to delete a named extended attribute from a file or directory.
3782  * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
3783  */
3784 int
3785 sys_extattr_delete_file(struct extattr_delete_file_args *uap)
3786 {
3787 	char attrname[EXTATTR_MAXNAMELEN];
3788 	struct nlookupdata nd;
3789 	struct vnode *vp;
3790 	int error;
3791 
3792 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3793 	if (error)
3794 		return(error);
3795 
3796 	vp = NULL;
3797 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3798 	if (error == 0)
3799 		error = nlookup(&nd);
3800 	if (error == 0)
3801 		error = ncp_writechk(&nd.nl_nch);
3802 	if (error == 0)
3803 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3804 	if (error) {
3805 		nlookup_done(&nd);
3806 		return (error);
3807 	}
3808 
3809 	error = VOP_SETEXTATTR(vp, attrname, NULL, nd.nl_cred);
3810 	vput(vp);
3811 	nlookup_done(&nd);
3812 	return(error);
3813 }
3814 
3815 /*
3816  * Determine if the mount is visible to the process.
3817  */
3818 static int
3819 chroot_visible_mnt(struct mount *mp, struct proc *p)
3820 {
3821 	struct nchandle nch;
3822 
3823 	/*
3824 	 * Traverse from the mount point upwards.  If we hit the process
3825 	 * root then the mount point is visible to the process.
3826 	 */
3827 	nch = mp->mnt_ncmountpt;
3828 	while (nch.ncp) {
3829 		if (nch.mount == p->p_fd->fd_nrdir.mount &&
3830 		    nch.ncp == p->p_fd->fd_nrdir.ncp) {
3831 			return(1);
3832 		}
3833 		if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
3834 			nch = nch.mount->mnt_ncmounton;
3835 		} else {
3836 			nch.ncp = nch.ncp->nc_parent;
3837 		}
3838 	}
3839 
3840 	/*
3841 	 * If the mount point is not visible to the process, but the
3842 	 * process root is in a subdirectory of the mount, return
3843 	 * TRUE anyway.
3844 	 */
3845 	if (p->p_fd->fd_nrdir.mount == mp)
3846 		return(1);
3847 
3848 	return(0);
3849 }
3850 
3851