xref: /dragonfly/sys/kern/vfs_syscalls.c (revision 3f625015)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
39  * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
40  * $DragonFly: src/sys/kern/vfs_syscalls.c,v 1.116 2007/05/09 00:53:34 dillon Exp $
41  */
42 
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/buf.h>
46 #include <sys/conf.h>
47 #include <sys/sysent.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/mountctl.h>
51 #include <sys/sysproto.h>
52 #include <sys/filedesc.h>
53 #include <sys/kernel.h>
54 #include <sys/fcntl.h>
55 #include <sys/file.h>
56 #include <sys/linker.h>
57 #include <sys/stat.h>
58 #include <sys/unistd.h>
59 #include <sys/vnode.h>
60 #include <sys/proc.h>
61 #include <sys/namei.h>
62 #include <sys/nlookup.h>
63 #include <sys/dirent.h>
64 #include <sys/extattr.h>
65 #include <sys/spinlock.h>
66 #include <sys/kern_syscall.h>
67 #include <sys/objcache.h>
68 #include <sys/sysctl.h>
69 #include <sys/file2.h>
70 #include <sys/spinlock2.h>
71 
72 #include <vm/vm.h>
73 #include <vm/vm_object.h>
74 #include <vm/vm_page.h>
75 
76 #include <machine/limits.h>
77 #include <machine/stdarg.h>
78 
79 #include <vfs/union/union.h>
80 
81 static void mount_warning(struct mount *mp, const char *ctl, ...);
82 static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb);
83 static int checkvp_chdir (struct vnode *vn, struct thread *td);
84 static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch);
85 static int chroot_refuse_vdir_fds (struct filedesc *fdp);
86 static int chroot_visible_mnt(struct mount *mp, struct proc *p);
87 static int getutimes (const struct timeval *, struct timespec *);
88 static int setfown (struct vnode *, uid_t, gid_t);
89 static int setfmode (struct vnode *, int);
90 static int setfflags (struct vnode *, int);
91 static int setutimes (struct vnode *, const struct timespec *, int);
92 static int	usermount = 0;	/* if 1, non-root can mount fs. */
93 
94 int (*union_dircheckp) (struct thread *, struct vnode **, struct file *);
95 
96 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "");
97 
98 /*
99  * Virtual File System System Calls
100  */
101 
102 /*
103  * Mount a file system.
104  */
105 /*
106  * mount_args(char *type, char *path, int flags, caddr_t data)
107  */
108 /* ARGSUSED */
109 int
110 sys_mount(struct mount_args *uap)
111 {
112 	struct thread *td = curthread;
113 	struct proc *p = td->td_proc;
114 	struct vnode *vp;
115 	struct nchandle nch;
116 	struct mount *mp;
117 	struct vfsconf *vfsp;
118 	int error, flag = 0, flag2 = 0;
119 	int hasmount;
120 	struct vattr va;
121 	struct nlookupdata nd;
122 	char fstypename[MFSNAMELEN];
123 	struct ucred *cred = p->p_ucred;
124 
125 	KKASSERT(p);
126 	if (cred->cr_prison != NULL)
127 		return (EPERM);
128 	if (usermount == 0 && (error = suser(td)))
129 		return (error);
130 	/*
131 	 * Do not allow NFS export by non-root users.
132 	 */
133 	if (uap->flags & MNT_EXPORTED) {
134 		error = suser(td);
135 		if (error)
136 			return (error);
137 	}
138 	/*
139 	 * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
140 	 */
141 	if (suser(td))
142 		uap->flags |= MNT_NOSUID | MNT_NODEV;
143 
144 	/*
145 	 * Lookup the requested path and extract the nch and vnode.
146 	 */
147 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
148 	if (error == 0) {
149 		if ((error = nlookup(&nd)) == 0) {
150 			if (nd.nl_nch.ncp->nc_vp == NULL)
151 				error = ENOENT;
152 		}
153 	}
154 	if (error) {
155 		nlookup_done(&nd);
156 		return (error);
157 	}
158 
159 	/*
160 	 * Extract the locked+refd ncp and cleanup the nd structure
161 	 */
162 	nch = nd.nl_nch;
163 	cache_zero(&nd.nl_nch);
164 	nlookup_done(&nd);
165 
166 	if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) && cache_findmount(&nch))
167 		hasmount = 1;
168 	else
169 		hasmount = 0;
170 
171 
172 	/*
173 	 * now we have the locked ref'd nch and unreferenced vnode.
174 	 */
175 	vp = nch.ncp->nc_vp;
176 	if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
177 		cache_put(&nch);
178 		return (error);
179 	}
180 	cache_unlock(&nch);
181 
182 	/*
183 	 * Now we have an unlocked ref'd nch and a locked ref'd vp
184 	 */
185 	if (uap->flags & MNT_UPDATE) {
186 		if ((vp->v_flag & VROOT) == 0) {
187 			cache_drop(&nch);
188 			vput(vp);
189 			return (EINVAL);
190 		}
191 		mp = vp->v_mount;
192 		flag = mp->mnt_flag;
193 		flag2 = mp->mnt_kern_flag;
194 		/*
195 		 * We only allow the filesystem to be reloaded if it
196 		 * is currently mounted read-only.
197 		 */
198 		if ((uap->flags & MNT_RELOAD) &&
199 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
200 			cache_drop(&nch);
201 			vput(vp);
202 			return (EOPNOTSUPP);	/* Needs translation */
203 		}
204 		/*
205 		 * Only root, or the user that did the original mount is
206 		 * permitted to update it.
207 		 */
208 		if (mp->mnt_stat.f_owner != cred->cr_uid &&
209 		    (error = suser(td))) {
210 			cache_drop(&nch);
211 			vput(vp);
212 			return (error);
213 		}
214 		if (vfs_busy(mp, LK_NOWAIT)) {
215 			cache_drop(&nch);
216 			vput(vp);
217 			return (EBUSY);
218 		}
219 		if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
220 			cache_drop(&nch);
221 			vfs_unbusy(mp);
222 			vput(vp);
223 			return (EBUSY);
224 		}
225 		vp->v_flag |= VMOUNT;
226 		mp->mnt_flag |=
227 		    uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
228 		vn_unlock(vp);
229 		goto update;
230 	}
231 	/*
232 	 * If the user is not root, ensure that they own the directory
233 	 * onto which we are attempting to mount.
234 	 */
235 	if ((error = VOP_GETATTR(vp, &va)) ||
236 	    (va.va_uid != cred->cr_uid && (error = suser(td)))) {
237 		cache_drop(&nch);
238 		vput(vp);
239 		return (error);
240 	}
241 	if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
242 		cache_drop(&nch);
243 		vput(vp);
244 		return (error);
245 	}
246 	if (vp->v_type != VDIR) {
247 		cache_drop(&nch);
248 		vput(vp);
249 		return (ENOTDIR);
250 	}
251 	if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0) {
252 		cache_drop(&nch);
253 		vput(vp);
254 		return (error);
255 	}
256 	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
257 		if (!strcmp(vfsp->vfc_name, fstypename))
258 			break;
259 	}
260 	if (vfsp == NULL) {
261 		linker_file_t lf;
262 
263 		/* Only load modules for root (very important!) */
264 		if ((error = suser(td)) != 0) {
265 			cache_drop(&nch);
266 			vput(vp);
267 			return error;
268 		}
269 		error = linker_load_file(fstypename, &lf);
270 		if (error || lf == NULL) {
271 			cache_drop(&nch);
272 			vput(vp);
273 			if (lf == NULL)
274 				error = ENODEV;
275 			return error;
276 		}
277 		lf->userrefs++;
278 		/* lookup again, see if the VFS was loaded */
279 		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
280 			if (!strcmp(vfsp->vfc_name, fstypename))
281 				break;
282 		}
283 		if (vfsp == NULL) {
284 			lf->userrefs--;
285 			linker_file_unload(lf);
286 			cache_drop(&nch);
287 			vput(vp);
288 			return (ENODEV);
289 		}
290 	}
291 	if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
292 		cache_drop(&nch);
293 		vput(vp);
294 		return (EBUSY);
295 	}
296 	vp->v_flag |= VMOUNT;
297 
298 	/*
299 	 * Allocate and initialize the filesystem.
300 	 */
301 	mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
302 	TAILQ_INIT(&mp->mnt_nvnodelist);
303 	TAILQ_INIT(&mp->mnt_reservedvnlist);
304 	TAILQ_INIT(&mp->mnt_jlist);
305 	mp->mnt_nvnodelistsize = 0;
306 	lockinit(&mp->mnt_lock, "vfslock", 0, 0);
307 	vfs_busy(mp, LK_NOWAIT);
308 	mp->mnt_op = vfsp->vfc_vfsops;
309 	mp->mnt_vfc = vfsp;
310 	vfsp->vfc_refcount++;
311 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
312 	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
313 	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
314 	mp->mnt_stat.f_owner = cred->cr_uid;
315 	mp->mnt_iosize_max = DFLTPHYS;
316 	vn_unlock(vp);
317 update:
318 	/*
319 	 * Set the mount level flags.
320 	 */
321 	if (uap->flags & MNT_RDONLY)
322 		mp->mnt_flag |= MNT_RDONLY;
323 	else if (mp->mnt_flag & MNT_RDONLY)
324 		mp->mnt_kern_flag |= MNTK_WANTRDWR;
325 	mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
326 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME |
327 	    MNT_NOSYMFOLLOW | MNT_IGNORE |
328 	    MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
329 	mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC |
330 	    MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE |
331 	    MNT_NOSYMFOLLOW | MNT_IGNORE |
332 	    MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
333 	/*
334 	 * Mount the filesystem.
335 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
336 	 * get.
337 	 */
338 	error = VFS_MOUNT(mp, uap->path, uap->data, cred);
339 	if (mp->mnt_flag & MNT_UPDATE) {
340 		if (mp->mnt_kern_flag & MNTK_WANTRDWR)
341 			mp->mnt_flag &= ~MNT_RDONLY;
342 		mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
343 		mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
344 		if (error) {
345 			mp->mnt_flag = flag;
346 			mp->mnt_kern_flag = flag2;
347 		}
348 		vfs_unbusy(mp);
349 		vp->v_flag &= ~VMOUNT;
350 		vrele(vp);
351 		cache_drop(&nch);
352 		return (error);
353 	}
354 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
355 	/*
356 	 * Put the new filesystem on the mount list after root.  The mount
357 	 * point gets its own mnt_ncmountpt (unless the VFS already set one
358 	 * up) which represents the root of the mount.  The lookup code
359 	 * detects the mount point going forward and checks the root of
360 	 * the mount going backwards.
361 	 *
362 	 * It is not necessary to invalidate or purge the vnode underneath
363 	 * because elements under the mount will be given their own glue
364 	 * namecache record.
365 	 */
366 	if (!error) {
367 		if (mp->mnt_ncmountpt.ncp == NULL) {
368 			/*
369 			 * allocate, then unlock, but leave the ref intact
370 			 */
371 			cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
372 			cache_unlock(&mp->mnt_ncmountpt);
373 		}
374 		mp->mnt_ncmounton = nch;		/* inherits ref */
375 		nch.ncp->nc_flag |= NCF_ISMOUNTPT;
376 
377 		/* XXX get the root of the fs and cache_setvp(mnt_ncmountpt...) */
378 		vp->v_flag &= ~VMOUNT;
379 		mountlist_insert(mp, MNTINS_LAST);
380 		checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
381 		vn_unlock(vp);
382 		error = vfs_allocate_syncvnode(mp);
383 		vfs_unbusy(mp);
384 		error = VFS_START(mp, 0);
385 		vrele(vp);
386 	} else {
387 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
388 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
389 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
390 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
391 		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
392 		vp->v_flag &= ~VMOUNT;
393 		mp->mnt_vfc->vfc_refcount--;
394 		vfs_unbusy(mp);
395 		kfree(mp, M_MOUNT);
396 		cache_drop(&nch);
397 		vput(vp);
398 	}
399 	return (error);
400 }
401 
402 /*
403  * Scan all active processes to see if any of them have a current
404  * or root directory onto which the new filesystem has just been
405  * mounted. If so, replace them with the new mount point.
406  *
407  * The passed ncp is ref'd and locked (from the mount code) and
408  * must be associated with the vnode representing the root of the
409  * mount point.
410  */
411 struct checkdirs_info {
412 	struct nchandle old_nch;
413 	struct nchandle new_nch;
414 	struct vnode *old_vp;
415 	struct vnode *new_vp;
416 };
417 
418 static int checkdirs_callback(struct proc *p, void *data);
419 
420 static void
421 checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
422 {
423 	struct checkdirs_info info;
424 	struct vnode *olddp;
425 	struct vnode *newdp;
426 	struct mount *mp;
427 
428 	/*
429 	 * If the old mount point's vnode has a usecount of 1, it is not
430 	 * being held as a descriptor anywhere.
431 	 */
432 	olddp = old_nch->ncp->nc_vp;
433 	if (olddp == NULL || olddp->v_sysref.refcnt == 1)
434 		return;
435 
436 	/*
437 	 * Force the root vnode of the new mount point to be resolved
438 	 * so we can update any matching processes.
439 	 */
440 	mp = new_nch->mount;
441 	if (VFS_ROOT(mp, &newdp))
442 		panic("mount: lost mount");
443 	cache_setunresolved(new_nch);
444 	cache_setvp(new_nch, newdp);
445 
446 	/*
447 	 * Special handling of the root node
448 	 */
449 	if (rootvnode == olddp) {
450 		vref(newdp);
451 		vfs_cache_setroot(newdp, cache_hold(new_nch));
452 	}
453 
454 	/*
455 	 * Pass newdp separately so the callback does not have to access
456 	 * it via new_nch->ncp->nc_vp.
457 	 */
458 	info.old_nch = *old_nch;
459 	info.new_nch = *new_nch;
460 	info.new_vp = newdp;
461 	allproc_scan(checkdirs_callback, &info);
462 	vput(newdp);
463 }
464 
465 /*
466  * NOTE: callback is not MP safe because the scanned process's filedesc
467  * structure can be ripped out from under us, amoung other things.
468  */
469 static int
470 checkdirs_callback(struct proc *p, void *data)
471 {
472 	struct checkdirs_info *info = data;
473 	struct filedesc *fdp;
474 	struct nchandle ncdrop1;
475 	struct nchandle ncdrop2;
476 	struct vnode *vprele1;
477 	struct vnode *vprele2;
478 
479 	if ((fdp = p->p_fd) != NULL) {
480 		cache_zero(&ncdrop1);
481 		cache_zero(&ncdrop2);
482 		vprele1 = NULL;
483 		vprele2 = NULL;
484 
485 		/*
486 		 * MPUNSAFE - XXX fdp can be pulled out from under a
487 		 * foreign process.
488 		 *
489 		 * A shared filedesc is ok, we don't have to copy it
490 		 * because we are making this change globally.
491 		 */
492 		spin_lock_wr(&fdp->fd_spin);
493 		if (fdp->fd_ncdir.mount == info->old_nch.mount &&
494 		    fdp->fd_ncdir.ncp == info->old_nch.ncp) {
495 			vprele1 = fdp->fd_cdir;
496 			vref(info->new_vp);
497 			fdp->fd_cdir = info->new_vp;
498 			ncdrop1 = fdp->fd_ncdir;
499 			cache_copy(&info->new_nch, &fdp->fd_ncdir);
500 		}
501 		if (fdp->fd_nrdir.mount == info->old_nch.mount &&
502 		    fdp->fd_nrdir.ncp == info->old_nch.ncp) {
503 			vprele2 = fdp->fd_rdir;
504 			vref(info->new_vp);
505 			fdp->fd_rdir = info->new_vp;
506 			ncdrop2 = fdp->fd_nrdir;
507 			cache_copy(&info->new_nch, &fdp->fd_nrdir);
508 		}
509 		spin_unlock_wr(&fdp->fd_spin);
510 		if (ncdrop1.ncp)
511 			cache_drop(&ncdrop1);
512 		if (ncdrop2.ncp)
513 			cache_drop(&ncdrop2);
514 		if (vprele1)
515 			vrele(vprele1);
516 		if (vprele2)
517 			vrele(vprele2);
518 	}
519 	return(0);
520 }
521 
522 /*
523  * Unmount a file system.
524  *
525  * Note: unmount takes a path to the vnode mounted on as argument,
526  * not special file (as before).
527  */
528 /*
529  * umount_args(char *path, int flags)
530  */
531 /* ARGSUSED */
532 int
533 sys_unmount(struct unmount_args *uap)
534 {
535 	struct thread *td = curthread;
536 	struct proc *p = td->td_proc;
537 	struct mount *mp = NULL;
538 	int error;
539 	struct nlookupdata nd;
540 
541 	KKASSERT(p);
542 	if (p->p_ucred->cr_prison != NULL)
543 		return (EPERM);
544 	if (usermount == 0 && (error = suser(td)))
545 		return (error);
546 
547 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
548 	if (error == 0)
549 		error = nlookup(&nd);
550 	if (error)
551 		goto out;
552 
553 	mp = nd.nl_nch.mount;
554 
555 	/*
556 	 * Only root, or the user that did the original mount is
557 	 * permitted to unmount this filesystem.
558 	 */
559 	if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) &&
560 	    (error = suser(td)))
561 		goto out;
562 
563 	/*
564 	 * Don't allow unmounting the root file system.
565 	 */
566 	if (mp->mnt_flag & MNT_ROOTFS) {
567 		error = EINVAL;
568 		goto out;
569 	}
570 
571 	/*
572 	 * Must be the root of the filesystem
573 	 */
574 	if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
575 		error = EINVAL;
576 		goto out;
577 	}
578 
579 out:
580 	nlookup_done(&nd);
581 	if (error)
582 		return (error);
583 	return (dounmount(mp, uap->flags));
584 }
585 
586 /*
587  * Do the actual file system unmount.
588  */
589 static int
590 dounmount_interlock(struct mount *mp)
591 {
592 	if (mp->mnt_kern_flag & MNTK_UNMOUNT)
593 		return (EBUSY);
594 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
595 	return(0);
596 }
597 
598 int
599 dounmount(struct mount *mp, int flags)
600 {
601 	struct namecache *ncp;
602 	struct nchandle nch;
603 	int error;
604 	int async_flag;
605 	int lflags;
606 	int freeok = 1;
607 
608 	/*
609 	 * Exclusive access for unmounting purposes
610 	 */
611 	if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
612 		return (error);
613 
614 	/*
615 	 * Allow filesystems to detect that a forced unmount is in progress.
616 	 */
617 	if (flags & MNT_FORCE)
618 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
619 	lflags = LK_EXCLUSIVE | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT);
620 	error = lockmgr(&mp->mnt_lock, lflags);
621 	if (error) {
622 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
623 		if (mp->mnt_kern_flag & MNTK_MWAIT)
624 			wakeup(mp);
625 		return (error);
626 	}
627 
628 	if (mp->mnt_flag & MNT_EXPUBLIC)
629 		vfs_setpublicfs(NULL, NULL, NULL);
630 
631 	vfs_msync(mp, MNT_WAIT);
632 	async_flag = mp->mnt_flag & MNT_ASYNC;
633 	mp->mnt_flag &=~ MNT_ASYNC;
634 
635 	/*
636 	 * If this filesystem isn't aliasing other filesystems,
637 	 * try to invalidate any remaining namecache entries and
638 	 * check the count afterwords.
639 	 */
640 	if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
641 		cache_lock(&mp->mnt_ncmountpt);
642 		cache_inval(&mp->mnt_ncmountpt, CINV_DESTROY|CINV_CHILDREN);
643 		cache_unlock(&mp->mnt_ncmountpt);
644 
645 		if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
646 		    (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {
647 
648 			if ((flags & MNT_FORCE) == 0) {
649 				error = EBUSY;
650 				mount_warning(mp, "Cannot unmount: "
651 						  "%d namecache "
652 						  "references still "
653 						  "present",
654 						  ncp->nc_refs - 1);
655 			} else {
656 				mount_warning(mp, "Forced unmount: "
657 						  "%d namecache "
658 						  "references still "
659 						  "present",
660 						  ncp->nc_refs - 1);
661 				freeok = 0;
662 			}
663 		}
664 	}
665 
666 	/*
667 	 * nchandle records ref the mount structure.  Expect a count of 1
668 	 * (our mount->mnt_ncmountpt).
669 	 */
670 	if (mp->mnt_refs != 1) {
671 		if ((flags & MNT_FORCE) == 0) {
672 			mount_warning(mp, "Cannot unmount: "
673 					  "%d process references still "
674 					  "present", mp->mnt_refs);
675 			error = EBUSY;
676 		} else {
677 			mount_warning(mp, "Forced unmount: "
678 					  "%d process references still "
679 					  "present", mp->mnt_refs);
680 			freeok = 0;
681 		}
682 	}
683 
684 	if (error == 0) {
685 		if (mp->mnt_syncer != NULL)
686 			vrele(mp->mnt_syncer);
687 		if (((mp->mnt_flag & MNT_RDONLY) ||
688 		     (error = VFS_SYNC(mp, MNT_WAIT)) == 0) ||
689 		    (flags & MNT_FORCE)) {
690 			error = VFS_UNMOUNT(mp, flags);
691 		}
692 	}
693 	if (error) {
694 		if (mp->mnt_syncer == NULL)
695 			vfs_allocate_syncvnode(mp);
696 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
697 		mp->mnt_flag |= async_flag;
698 		lockmgr(&mp->mnt_lock, LK_RELEASE);
699 		if (mp->mnt_kern_flag & MNTK_MWAIT)
700 			wakeup(mp);
701 		return (error);
702 	}
703 	/*
704 	 * Clean up any journals still associated with the mount after
705 	 * filesystem activity has ceased.
706 	 */
707 	journal_remove_all_journals(mp,
708 	    ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));
709 
710 	mountlist_remove(mp);
711 
712 	/*
713 	 * Remove any installed vnode ops here so the individual VFSs don't
714 	 * have to.
715 	 */
716 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
717 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
718 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
719 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
720 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
721 
722 	if (mp->mnt_ncmountpt.ncp != NULL) {
723 		nch = mp->mnt_ncmountpt;
724 		cache_zero(&mp->mnt_ncmountpt);
725 		cache_clrmountpt(&nch);
726 		cache_drop(&nch);
727 	}
728 	if (mp->mnt_ncmounton.ncp != NULL) {
729 		nch = mp->mnt_ncmounton;
730 		cache_zero(&mp->mnt_ncmounton);
731 		cache_clrmountpt(&nch);
732 		cache_drop(&nch);
733 	}
734 
735 	mp->mnt_vfc->vfc_refcount--;
736 	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
737 		panic("unmount: dangling vnode");
738 	lockmgr(&mp->mnt_lock, LK_RELEASE);
739 	if (mp->mnt_kern_flag & MNTK_MWAIT)
740 		wakeup(mp);
741 	if (freeok)
742 		kfree(mp, M_MOUNT);
743 	return (0);
744 }
745 
746 static
747 void
748 mount_warning(struct mount *mp, const char *ctl, ...)
749 {
750 	char *ptr;
751 	char *buf;
752 	__va_list va;
753 
754 	__va_start(va, ctl);
755 	if (cache_fullpath(NULL, &mp->mnt_ncmounton, &ptr, &buf) == 0) {
756 		kprintf("unmount(%s): ", ptr);
757 		kvprintf(ctl, va);
758 		kprintf("\n");
759 		kfree(buf, M_TEMP);
760 	} else {
761 		kprintf("unmount(%p): ", mp);
762 		kvprintf(ctl, va);
763 		kprintf("\n");
764 	}
765 	__va_end(va);
766 }
767 
768 /*
769  * Shim cache_fullpath() to handle the case where a process is chrooted into
770  * a subdirectory of a mount.  In this case if the root mount matches the
771  * process root directory's mount we have to specify the process's root
772  * directory instead of the mount point, because the mount point might
773  * be above the root directory.
774  */
775 static
776 int
777 mount_path(struct proc *p, struct mount *mp, char **rb, char **fb)
778 {
779 	struct nchandle *nch;
780 
781 	if (p && p->p_fd->fd_nrdir.mount == mp)
782 		nch = &p->p_fd->fd_nrdir;
783 	else
784 		nch = &mp->mnt_ncmountpt;
785 	return(cache_fullpath(p, nch, rb, fb));
786 }
787 
788 /*
789  * Sync each mounted filesystem.
790  */
791 
792 #ifdef DEBUG
793 static int syncprt = 0;
794 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
795 #endif /* DEBUG */
796 
797 static int sync_callback(struct mount *mp, void *data);
798 
799 /* ARGSUSED */
800 int
801 sys_sync(struct sync_args *uap)
802 {
803 	mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
804 #ifdef DEBUG
805 	/*
806 	 * print out buffer pool stat information on each sync() call.
807 	 */
808 	if (syncprt)
809 		vfs_bufstats();
810 #endif /* DEBUG */
811 	return (0);
812 }
813 
814 static
815 int
816 sync_callback(struct mount *mp, void *data __unused)
817 {
818 	int asyncflag;
819 
820 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
821 		asyncflag = mp->mnt_flag & MNT_ASYNC;
822 		mp->mnt_flag &= ~MNT_ASYNC;
823 		vfs_msync(mp, MNT_NOWAIT);
824 		VFS_SYNC(mp, MNT_NOWAIT);
825 		mp->mnt_flag |= asyncflag;
826 	}
827 	return(0);
828 }
829 
830 /* XXX PRISON: could be per prison flag */
831 static int prison_quotas;
832 #if 0
833 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
834 #endif
835 
836 /*
837  *  quotactl_args(char *path, int fcmd, int uid, caddr_t arg)
838  *
839  * Change filesystem quotas.
840  */
841 /* ARGSUSED */
842 int
843 sys_quotactl(struct quotactl_args *uap)
844 {
845 	struct nlookupdata nd;
846 	struct thread *td;
847 	struct proc *p;
848 	struct mount *mp;
849 	int error;
850 
851 	td = curthread;
852 	p = td->td_proc;
853 	if (p->p_ucred->cr_prison && !prison_quotas)
854 		return (EPERM);
855 
856 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
857 	if (error == 0)
858 		error = nlookup(&nd);
859 	if (error == 0) {
860 		mp = nd.nl_nch.mount;
861 		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
862 				    uap->arg, nd.nl_cred);
863 	}
864 	nlookup_done(&nd);
865 	return (error);
866 }
867 
868 /*
869  * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
870  *		void *buf, int buflen)
871  *
872  * This function operates on a mount point and executes the specified
873  * operation using the specified control data, and possibly returns data.
874  *
875  * The actual number of bytes stored in the result buffer is returned, 0
876  * if none, otherwise an error is returned.
877  */
878 /* ARGSUSED */
879 int
880 sys_mountctl(struct mountctl_args *uap)
881 {
882 	struct thread *td = curthread;
883 	struct proc *p = td->td_proc;
884 	struct file *fp;
885 	void *ctl = NULL;
886 	void *buf = NULL;
887 	char *path = NULL;
888 	int error;
889 
890 	/*
891 	 * Sanity and permissions checks.  We must be root.
892 	 */
893 	KKASSERT(p);
894 	if (p->p_ucred->cr_prison != NULL)
895 		return (EPERM);
896 	if ((error = suser(td)) != 0)
897 		return (error);
898 
899 	/*
900 	 * Argument length checks
901 	 */
902 	if (uap->ctllen < 0 || uap->ctllen > 1024)
903 		return (EINVAL);
904 	if (uap->buflen < 0 || uap->buflen > 16 * 1024)
905 		return (EINVAL);
906 	if (uap->path == NULL)
907 		return (EINVAL);
908 
909 	/*
910 	 * Allocate the necessary buffers and copyin data
911 	 */
912 	path = objcache_get(namei_oc, M_WAITOK);
913 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
914 	if (error)
915 		goto done;
916 
917 	if (uap->ctllen) {
918 		ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
919 		error = copyin(uap->ctl, ctl, uap->ctllen);
920 		if (error)
921 			goto done;
922 	}
923 	if (uap->buflen)
924 		buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);
925 
926 	/*
927 	 * Validate the descriptor
928 	 */
929 	fp = holdfp(p->p_fd, uap->fd, -1);
930 	if (fp == NULL) {
931 		error = EBADF;
932 		goto done;
933 	}
934 
935 	/*
936 	 * Execute the internal kernel function and clean up.
937 	 */
938 	error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen, buf, uap->buflen, &uap->sysmsg_result);
939 	if (fp)
940 		fdrop(fp);
941 	if (error == 0 && uap->sysmsg_result > 0)
942 		error = copyout(buf, uap->buf, uap->sysmsg_result);
943 done:
944 	if (path)
945 		objcache_put(namei_oc, path);
946 	if (ctl)
947 		kfree(ctl, M_TEMP);
948 	if (buf)
949 		kfree(buf, M_TEMP);
950 	return (error);
951 }
952 
953 /*
954  * Execute a mount control operation by resolving the path to a mount point
955  * and calling vop_mountctl().
956  */
957 int
958 kern_mountctl(const char *path, int op, struct file *fp,
959 		const void *ctl, int ctllen,
960 		void *buf, int buflen, int *res)
961 {
962 	struct vnode *vp;
963 	struct mount *mp;
964 	struct nlookupdata nd;
965 	int error;
966 
967 	*res = 0;
968 	vp = NULL;
969 	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
970 	if (error == 0)
971 		error = nlookup(&nd);
972 	if (error == 0)
973 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
974 	nlookup_done(&nd);
975 	if (error)
976 		return (error);
977 
978 	mp = vp->v_mount;
979 
980 	/*
981 	 * Must be the root of the filesystem
982 	 */
983 	if ((vp->v_flag & VROOT) == 0) {
984 		vput(vp);
985 		return (EINVAL);
986 	}
987 	error = vop_mountctl(mp->mnt_vn_use_ops, op, fp, ctl, ctllen,
988 				buf, buflen, res);
989 	vput(vp);
990 	return (error);
991 }
992 
993 int
994 kern_statfs(struct nlookupdata *nd, struct statfs *buf)
995 {
996 	struct thread *td = curthread;
997 	struct proc *p = td->td_proc;
998 	struct mount *mp;
999 	struct statfs *sp;
1000 	char *fullpath, *freepath;
1001 	int error;
1002 
1003 	if ((error = nlookup(nd)) != 0)
1004 		return (error);
1005 	mp = nd->nl_nch.mount;
1006 	sp = &mp->mnt_stat;
1007 	if ((error = VFS_STATFS(mp, sp, nd->nl_cred)) != 0)
1008 		return (error);
1009 
1010 	error = mount_path(p, mp, &fullpath, &freepath);
1011 	if (error)
1012 		return(error);
1013 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1014 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1015 	kfree(freepath, M_TEMP);
1016 
1017 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1018 	bcopy(sp, buf, sizeof(*buf));
1019 	/* Only root should have access to the fsid's. */
1020 	if (suser(td))
1021 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1022 	return (0);
1023 }
1024 
1025 /*
1026  * statfs_args(char *path, struct statfs *buf)
1027  *
1028  * Get filesystem statistics.
1029  */
1030 int
1031 sys_statfs(struct statfs_args *uap)
1032 {
1033 	struct nlookupdata nd;
1034 	struct statfs buf;
1035 	int error;
1036 
1037 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1038 	if (error == 0)
1039 		error = kern_statfs(&nd, &buf);
1040 	nlookup_done(&nd);
1041 	if (error == 0)
1042 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1043 	return (error);
1044 }
1045 
1046 int
1047 kern_fstatfs(int fd, struct statfs *buf)
1048 {
1049 	struct thread *td = curthread;
1050 	struct proc *p = td->td_proc;
1051 	struct file *fp;
1052 	struct mount *mp;
1053 	struct statfs *sp;
1054 	char *fullpath, *freepath;
1055 	int error;
1056 
1057 	KKASSERT(p);
1058 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
1059 		return (error);
1060 	mp = ((struct vnode *)fp->f_data)->v_mount;
1061 	if (mp == NULL) {
1062 		error = EBADF;
1063 		goto done;
1064 	}
1065 	if (fp->f_cred == NULL) {
1066 		error = EINVAL;
1067 		goto done;
1068 	}
1069 	sp = &mp->mnt_stat;
1070 	if ((error = VFS_STATFS(mp, sp, fp->f_cred)) != 0)
1071 		goto done;
1072 
1073 	if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
1074 		goto done;
1075 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1076 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1077 	kfree(freepath, M_TEMP);
1078 
1079 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1080 	bcopy(sp, buf, sizeof(*buf));
1081 
1082 	/* Only root should have access to the fsid's. */
1083 	if (suser(td))
1084 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1085 	error = 0;
1086 done:
1087 	fdrop(fp);
1088 	return (error);
1089 }
1090 
1091 /*
1092  * fstatfs_args(int fd, struct statfs *buf)
1093  *
1094  * Get filesystem statistics.
1095  */
1096 int
1097 sys_fstatfs(struct fstatfs_args *uap)
1098 {
1099 	struct statfs buf;
1100 	int error;
1101 
1102 	error = kern_fstatfs(uap->fd, &buf);
1103 
1104 	if (error == 0)
1105 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1106 	return (error);
1107 }
1108 
1109 /*
1110  * getfsstat_args(struct statfs *buf, long bufsize, int flags)
1111  *
1112  * Get statistics on all filesystems.
1113  */
1114 
/*
 * Scan state shared between sys_getfsstat() and getfsstat_callback().
 */
struct getfsstat_info {
	struct statfs *sfsp;	/* next user slot to fill (NULL: count only) */
	long count;		/* number of mounts counted so far */
	long maxcount;		/* number of entries the user buffer can hold */
	int error;		/* failure recorded by the callback, else 0 */
	int flags;		/* flags argument passed to getfsstat() */
	struct proc *p;		/* process issuing the system call */
};

static int getfsstat_callback(struct mount *mp, void *data);
1125 
1126 /* ARGSUSED */
1127 int
1128 sys_getfsstat(struct getfsstat_args *uap)
1129 {
1130 	struct thread *td = curthread;
1131 	struct proc *p = td->td_proc;
1132 	struct getfsstat_info info;
1133 
1134 	bzero(&info, sizeof(info));
1135 
1136 	info.maxcount = uap->bufsize / sizeof(struct statfs);
1137 	info.sfsp = uap->buf;
1138 	info.count = 0;
1139 	info.flags = uap->flags;
1140 	info.p = p;
1141 
1142 	mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
1143 	if (info.sfsp && info.count > info.maxcount)
1144 		uap->sysmsg_result = info.maxcount;
1145 	else
1146 		uap->sysmsg_result = info.count;
1147 	return (info.error);
1148 }
1149 
1150 static int
1151 getfsstat_callback(struct mount *mp, void *data)
1152 {
1153 	struct getfsstat_info *info = data;
1154 	struct statfs *sp;
1155 	char *freepath;
1156 	char *fullpath;
1157 	int error;
1158 
1159 	if (info->sfsp && info->count < info->maxcount) {
1160 		if (info->p && !chroot_visible_mnt(mp, info->p))
1161 			return(0);
1162 		sp = &mp->mnt_stat;
1163 
1164 		/*
1165 		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
1166 		 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
1167 		 * overrides MNT_WAIT.
1168 		 */
1169 		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
1170 		    (info->flags & MNT_WAIT)) &&
1171 		    (error = VFS_STATFS(mp, sp, info->p->p_ucred))) {
1172 			return(0);
1173 		}
1174 		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1175 
1176 		error = mount_path(info->p, mp, &fullpath, &freepath);
1177 		if (error) {
1178 			info->error = error;
1179 			return(-1);
1180 		}
1181 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1182 		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1183 		kfree(freepath, M_TEMP);
1184 
1185 		error = copyout(sp, info->sfsp, sizeof(*sp));
1186 		if (error) {
1187 			info->error = error;
1188 			return (-1);
1189 		}
1190 		++info->sfsp;
1191 	}
1192 	info->count++;
1193 	return(0);
1194 }
1195 
1196 /*
1197  * fchdir_args(int fd)
1198  *
1199  * Change current working directory to a given file descriptor.
1200  */
1201 /* ARGSUSED */
1202 int
1203 sys_fchdir(struct fchdir_args *uap)
1204 {
1205 	struct thread *td = curthread;
1206 	struct proc *p = td->td_proc;
1207 	struct filedesc *fdp = p->p_fd;
1208 	struct vnode *vp, *ovp;
1209 	struct mount *mp;
1210 	struct file *fp;
1211 	struct nchandle nch, onch, tnch;
1212 	int error;
1213 
1214 	if ((error = holdvnode(fdp, uap->fd, &fp)) != 0)
1215 		return (error);
1216 	vp = (struct vnode *)fp->f_data;
1217 	vref(vp);
1218 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1219 	if (vp->v_type != VDIR || fp->f_nchandle.ncp == NULL)
1220 		error = ENOTDIR;
1221 	else
1222 		error = VOP_ACCESS(vp, VEXEC, p->p_ucred);
1223 	if (error) {
1224 		vput(vp);
1225 		fdrop(fp);
1226 		return (error);
1227 	}
1228 	cache_copy(&fp->f_nchandle, &nch);
1229 
1230 	/*
1231 	 * If the ncp has become a mount point, traverse through
1232 	 * the mount point.
1233 	 */
1234 
1235 	while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
1236 	       (mp = cache_findmount(&nch)) != NULL
1237 	) {
1238 		error = nlookup_mp(mp, &tnch);
1239 		if (error == 0) {
1240 			cache_unlock(&tnch);	/* leave ref intact */
1241 			vput(vp);
1242 			vp = tnch.ncp->nc_vp;
1243 			error = vget(vp, LK_SHARED);
1244 			KKASSERT(error == 0);
1245 			cache_drop(&nch);
1246 			nch = tnch;
1247 		}
1248 	}
1249 	if (error == 0) {
1250 		ovp = fdp->fd_cdir;
1251 		onch = fdp->fd_ncdir;
1252 		vn_unlock(vp);		/* leave ref intact */
1253 		fdp->fd_cdir = vp;
1254 		fdp->fd_ncdir = nch;
1255 		cache_drop(&onch);
1256 		vrele(ovp);
1257 	} else {
1258 		cache_drop(&nch);
1259 		vput(vp);
1260 	}
1261 	fdrop(fp);
1262 	return (error);
1263 }
1264 
1265 int
1266 kern_chdir(struct nlookupdata *nd)
1267 {
1268 	struct thread *td = curthread;
1269 	struct proc *p = td->td_proc;
1270 	struct filedesc *fdp = p->p_fd;
1271 	struct vnode *vp, *ovp;
1272 	struct nchandle onch;
1273 	int error;
1274 
1275 	if ((error = nlookup(nd)) != 0)
1276 		return (error);
1277 	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
1278 		return (ENOENT);
1279 	if ((error = vget(vp, LK_SHARED)) != 0)
1280 		return (error);
1281 
1282 	error = checkvp_chdir(vp, td);
1283 	vn_unlock(vp);
1284 	if (error == 0) {
1285 		ovp = fdp->fd_cdir;
1286 		onch = fdp->fd_ncdir;
1287 		cache_unlock(&nd->nl_nch);	/* leave reference intact */
1288 		fdp->fd_ncdir = nd->nl_nch;
1289 		fdp->fd_cdir = vp;
1290 		cache_drop(&onch);
1291 		vrele(ovp);
1292 		cache_zero(&nd->nl_nch);
1293 	} else {
1294 		vrele(vp);
1295 	}
1296 	return (error);
1297 }
1298 
1299 /*
1300  * chdir_args(char *path)
1301  *
1302  * Change current working directory (``.'').
1303  */
1304 int
1305 sys_chdir(struct chdir_args *uap)
1306 {
1307 	struct nlookupdata nd;
1308 	int error;
1309 
1310 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1311 	if (error == 0)
1312 		error = kern_chdir(&nd);
1313 	nlookup_done(&nd);
1314 	return (error);
1315 }
1316 
1317 /*
1318  * Helper function for raised chroot(2) security function:  Refuse if
1319  * any filedescriptors are open directories.
1320  */
1321 static int
1322 chroot_refuse_vdir_fds(struct filedesc *fdp)
1323 {
1324 	struct vnode *vp;
1325 	struct file *fp;
1326 	int error;
1327 	int fd;
1328 
1329 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1330 		if ((error = holdvnode(fdp, fd, &fp)) != 0)
1331 			continue;
1332 		vp = (struct vnode *)fp->f_data;
1333 		if (vp->v_type != VDIR) {
1334 			fdrop(fp);
1335 			continue;
1336 		}
1337 		fdrop(fp);
1338 		return(EPERM);
1339 	}
1340 	return (0);
1341 }
1342 
1343 /*
1344  * This sysctl determines if we will allow a process to chroot(2) if it
1345  * has a directory open:
1346  *	0: disallowed for all processes.
1347  *	1: allowed for processes that were not already chroot(2)'ed.
1348  *	2: allowed for all processes.
1349  */
1350 
1351 static int chroot_allow_open_directories = 1;
1352 
1353 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
1354      &chroot_allow_open_directories, 0, "");
1355 
1356 /*
1357  * chroot to the specified namecache entry.  We obtain the vp from the
1358  * namecache data.  The passed ncp must be locked and referenced and will
1359  * remain locked and referenced on return.
1360  */
1361 int
1362 kern_chroot(struct nchandle *nch)
1363 {
1364 	struct thread *td = curthread;
1365 	struct proc *p = td->td_proc;
1366 	struct filedesc *fdp = p->p_fd;
1367 	struct vnode *vp;
1368 	int error;
1369 
1370 	/*
1371 	 * Only root can chroot
1372 	 */
1373 	if ((error = suser_cred(p->p_ucred, PRISON_ROOT)) != 0)
1374 		return (error);
1375 
1376 	/*
1377 	 * Disallow open directory descriptors (fchdir() breakouts).
1378 	 */
1379 	if (chroot_allow_open_directories == 0 ||
1380 	   (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
1381 		if ((error = chroot_refuse_vdir_fds(fdp)) != 0)
1382 			return (error);
1383 	}
1384 	if ((vp = nch->ncp->nc_vp) == NULL)
1385 		return (ENOENT);
1386 
1387 	if ((error = vget(vp, LK_SHARED)) != 0)
1388 		return (error);
1389 
1390 	/*
1391 	 * Check the validity of vp as a directory to change to and
1392 	 * associate it with rdir/jdir.
1393 	 */
1394 	error = checkvp_chdir(vp, td);
1395 	vn_unlock(vp);			/* leave reference intact */
1396 	if (error == 0) {
1397 		vrele(fdp->fd_rdir);
1398 		fdp->fd_rdir = vp;	/* reference inherited by fd_rdir */
1399 		cache_drop(&fdp->fd_nrdir);
1400 		cache_copy(nch, &fdp->fd_nrdir);
1401 		if (fdp->fd_jdir == NULL) {
1402 			fdp->fd_jdir = vp;
1403 			vref(fdp->fd_jdir);
1404 			cache_copy(nch, &fdp->fd_njdir);
1405 		}
1406 	} else {
1407 		vrele(vp);
1408 	}
1409 	return (error);
1410 }
1411 
1412 /*
1413  * chroot_args(char *path)
1414  *
1415  * Change notion of root (``/'') directory.
1416  */
1417 /* ARGSUSED */
1418 int
1419 sys_chroot(struct chroot_args *uap)
1420 {
1421 	struct thread *td = curthread;
1422 	struct nlookupdata nd;
1423 	int error;
1424 
1425 	KKASSERT(td->td_proc);
1426 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1427 	if (error) {
1428 		nlookup_done(&nd);
1429 		return(error);
1430 	}
1431 	error = nlookup(&nd);
1432 	if (error == 0)
1433 		error = kern_chroot(&nd.nl_nch);
1434 	nlookup_done(&nd);
1435 	return(error);
1436 }
1437 
1438 /*
1439  * Common routine for chroot and chdir.  Given a locked, referenced vnode,
1440  * determine whether it is legal to chdir to the vnode.  The vnode's state
1441  * is not changed by this call.
1442  */
1443 int
1444 checkvp_chdir(struct vnode *vp, struct thread *td)
1445 {
1446 	int error;
1447 
1448 	if (vp->v_type != VDIR)
1449 		error = ENOTDIR;
1450 	else
1451 		error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred);
1452 	return (error);
1453 }
1454 
1455 int
1456 kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
1457 {
1458 	struct thread *td = curthread;
1459 	struct proc *p = td->td_proc;
1460 	struct lwp *lp = td->td_lwp;
1461 	struct filedesc *fdp = p->p_fd;
1462 	int cmode, flags;
1463 	struct file *nfp;
1464 	struct file *fp;
1465 	struct vnode *vp;
1466 	int type, indx, error;
1467 	struct flock lf;
1468 
1469 	if ((oflags & O_ACCMODE) == O_ACCMODE)
1470 		return (EINVAL);
1471 	flags = FFLAGS(oflags);
1472 	error = falloc(p, &nfp, NULL);
1473 	if (error)
1474 		return (error);
1475 	fp = nfp;
1476 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
1477 
1478 	/*
1479 	 * XXX p_dupfd is a real mess.  It allows a device to return a
1480 	 * file descriptor to be duplicated rather then doing the open
1481 	 * itself.
1482 	 */
1483 	lp->lwp_dupfd = -1;
1484 
1485 	/*
1486 	 * Call vn_open() to do the lookup and assign the vnode to the
1487 	 * file pointer.  vn_open() does not change the ref count on fp
1488 	 * and the vnode, on success, will be inherited by the file pointer
1489 	 * and unlocked.
1490 	 */
1491 	nd->nl_flags |= NLC_LOCKVP;
1492 	error = vn_open(nd, fp, flags, cmode);
1493 	nlookup_done(nd);
1494 	if (error) {
1495 		/*
1496 		 * handle special fdopen() case.  bleh.  dupfdopen() is
1497 		 * responsible for dropping the old contents of ofiles[indx]
1498 		 * if it succeeds.
1499 		 *
1500 		 * Note that fsetfd() will add a ref to fp which represents
1501 		 * the fd_files[] assignment.  We must still drop our
1502 		 * reference.
1503 		 */
1504 		if ((error == ENODEV || error == ENXIO) && lp->lwp_dupfd >= 0) {
1505 			if (fdalloc(p, 0, &indx) == 0) {
1506 				error = dupfdopen(p, indx, lp->lwp_dupfd, flags, error);
1507 				if (error == 0) {
1508 					*res = indx;
1509 					fdrop(fp);	/* our ref */
1510 					return (0);
1511 				}
1512 				fsetfd(p, NULL, indx);
1513 			}
1514 		}
1515 		fdrop(fp);	/* our ref */
1516 		if (error == ERESTART)
1517 			error = EINTR;
1518 		return (error);
1519 	}
1520 
1521 	/*
1522 	 * ref the vnode for ourselves so it can't be ripped out from under
1523 	 * is.  XXX need an ND flag to request that the vnode be returned
1524 	 * anyway.
1525 	 *
1526 	 * Reserve a file descriptor but do not assign it until the open
1527 	 * succeeds.
1528 	 */
1529 	vp = (struct vnode *)fp->f_data;
1530 	vref(vp);
1531 	if ((error = fdalloc(p, 0, &indx)) != 0) {
1532 		fdrop(fp);
1533 		vrele(vp);
1534 		return (error);
1535 	}
1536 
1537 	/*
1538 	 * If no error occurs the vp will have been assigned to the file
1539 	 * pointer.
1540 	 */
1541 	lp->lwp_dupfd = 0;
1542 
1543 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1544 		lf.l_whence = SEEK_SET;
1545 		lf.l_start = 0;
1546 		lf.l_len = 0;
1547 		if (flags & O_EXLOCK)
1548 			lf.l_type = F_WRLCK;
1549 		else
1550 			lf.l_type = F_RDLCK;
1551 		if (flags & FNONBLOCK)
1552 			type = 0;
1553 		else
1554 			type = F_WAIT;
1555 
1556 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
1557 			/*
1558 			 * lock request failed.  Clean up the reserved
1559 			 * descriptor.
1560 			 */
1561 			vrele(vp);
1562 			fsetfd(p, NULL, indx);
1563 			fdrop(fp);
1564 			return (error);
1565 		}
1566 		fp->f_flag |= FHASLOCK;
1567 	}
1568 #if 0
1569 	/*
1570 	 * Assert that all regular file vnodes were created with a object.
1571 	 */
1572 	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
1573 		("open: regular file has no backing object after vn_open"));
1574 #endif
1575 
1576 	vrele(vp);
1577 
1578 	/*
1579 	 * release our private reference, leaving the one associated with the
1580 	 * descriptor table intact.
1581 	 */
1582 	fsetfd(p, fp, indx);
1583 	fdrop(fp);
1584 	*res = indx;
1585 	return (0);
1586 }
1587 
1588 /*
1589  * open_args(char *path, int flags, int mode)
1590  *
1591  * Check permissions, allocate an open file structure,
1592  * and call the device open routine if any.
1593  */
1594 int
1595 sys_open(struct open_args *uap)
1596 {
1597 	struct nlookupdata nd;
1598 	int error;
1599 
1600 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1601 	if (error == 0) {
1602 		error = kern_open(&nd, uap->flags,
1603 				    uap->mode, &uap->sysmsg_result);
1604 	}
1605 	nlookup_done(&nd);
1606 	return (error);
1607 }
1608 
1609 int
1610 kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
1611 {
1612 	struct thread *td = curthread;
1613 	struct proc *p = td->td_proc;
1614 	struct vnode *vp;
1615 	struct vattr vattr;
1616 	int error;
1617 	int whiteout = 0;
1618 
1619 	KKASSERT(p);
1620 
1621 	switch (mode & S_IFMT) {
1622 	case S_IFCHR:
1623 	case S_IFBLK:
1624 		error = suser(td);
1625 		break;
1626 	default:
1627 		error = suser_cred(p->p_ucred, PRISON_ROOT);
1628 		break;
1629 	}
1630 	if (error)
1631 		return (error);
1632 
1633 	bwillwrite();
1634 	nd->nl_flags |= NLC_CREATE;
1635 	if ((error = nlookup(nd)) != 0)
1636 		return (error);
1637 	if (nd->nl_nch.ncp->nc_vp)
1638 		return (EEXIST);
1639 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
1640 		return (error);
1641 
1642 	VATTR_NULL(&vattr);
1643 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
1644 	vattr.va_rmajor = rmajor;
1645 	vattr.va_rminor = rminor;
1646 	whiteout = 0;
1647 
1648 	switch (mode & S_IFMT) {
1649 	case S_IFMT:	/* used by badsect to flag bad sectors */
1650 		vattr.va_type = VBAD;
1651 		break;
1652 	case S_IFCHR:
1653 		vattr.va_type = VCHR;
1654 		break;
1655 	case S_IFBLK:
1656 		vattr.va_type = VBLK;
1657 		break;
1658 	case S_IFWHT:
1659 		whiteout = 1;
1660 		break;
1661 	default:
1662 		error = EINVAL;
1663 		break;
1664 	}
1665 	if (error == 0) {
1666 		if (whiteout) {
1667 			error = VOP_NWHITEOUT(&nd->nl_nch, nd->nl_cred, NAMEI_CREATE);
1668 		} else {
1669 			vp = NULL;
1670 			error = VOP_NMKNOD(&nd->nl_nch, &vp, nd->nl_cred, &vattr);
1671 			if (error == 0)
1672 				vput(vp);
1673 		}
1674 	}
1675 	return (error);
1676 }
1677 
1678 /*
1679  * mknod_args(char *path, int mode, int dev)
1680  *
1681  * Create a special file.
1682  */
1683 int
1684 sys_mknod(struct mknod_args *uap)
1685 {
1686 	struct nlookupdata nd;
1687 	int error;
1688 
1689 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1690 	if (error == 0) {
1691 		error = kern_mknod(&nd, uap->mode,
1692 				   umajor(uap->dev), uminor(uap->dev));
1693 	}
1694 	nlookup_done(&nd);
1695 	return (error);
1696 }
1697 
1698 int
1699 kern_mkfifo(struct nlookupdata *nd, int mode)
1700 {
1701 	struct thread *td = curthread;
1702 	struct proc *p = td->td_proc;
1703 	struct vattr vattr;
1704 	struct vnode *vp;
1705 	int error;
1706 
1707 	bwillwrite();
1708 
1709 	nd->nl_flags |= NLC_CREATE;
1710 	if ((error = nlookup(nd)) != 0)
1711 		return (error);
1712 	if (nd->nl_nch.ncp->nc_vp)
1713 		return (EEXIST);
1714 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
1715 		return (error);
1716 
1717 	VATTR_NULL(&vattr);
1718 	vattr.va_type = VFIFO;
1719 	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
1720 	vp = NULL;
1721 	error = VOP_NMKNOD(&nd->nl_nch, &vp, nd->nl_cred, &vattr);
1722 	if (error == 0)
1723 		vput(vp);
1724 	return (error);
1725 }
1726 
1727 /*
1728  * mkfifo_args(char *path, int mode)
1729  *
1730  * Create a named pipe.
1731  */
1732 int
1733 sys_mkfifo(struct mkfifo_args *uap)
1734 {
1735 	struct nlookupdata nd;
1736 	int error;
1737 
1738 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1739 	if (error == 0)
1740 		error = kern_mkfifo(&nd, uap->mode);
1741 	nlookup_done(&nd);
1742 	return (error);
1743 }
1744 
1745 static int hardlink_check_uid = 0;
1746 SYSCTL_INT(_kern, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1747     &hardlink_check_uid, 0,
1748     "Unprivileged processes cannot create hard links to files owned by other "
1749     "users");
1750 static int hardlink_check_gid = 0;
1751 SYSCTL_INT(_kern, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1752     &hardlink_check_gid, 0,
1753     "Unprivileged processes cannot create hard links to files owned by other "
1754     "groups");
1755 
1756 static int
1757 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
1758 {
1759 	struct vattr va;
1760 	int error;
1761 
1762 	/*
1763 	 * Shortcut if disabled
1764 	 */
1765 	if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
1766 		return (0);
1767 
1768 	/*
1769 	 * root cred can always hardlink
1770 	 */
1771 	if (suser_cred(cred, PRISON_ROOT) == 0)
1772 		return (0);
1773 
1774 	/*
1775 	 * Otherwise only if the originating file is owned by the
1776 	 * same user or group.  Note that any group is allowed if
1777 	 * the file is owned by the caller.
1778 	 */
1779 	error = VOP_GETATTR(vp, &va);
1780 	if (error != 0)
1781 		return (error);
1782 
1783 	if (hardlink_check_uid) {
1784 		if (cred->cr_uid != va.va_uid)
1785 			return (EPERM);
1786 	}
1787 
1788 	if (hardlink_check_gid) {
1789 		if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
1790 			return (EPERM);
1791 	}
1792 
1793 	return (0);
1794 }
1795 
1796 int
1797 kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
1798 {
1799 	struct thread *td = curthread;
1800 	struct vnode *vp;
1801 	int error;
1802 
1803 	/*
1804 	 * Lookup the source and obtained a locked vnode.
1805 	 *
1806 	 * XXX relookup on vget failure / race ?
1807 	 */
1808 	bwillwrite();
1809 	if ((error = nlookup(nd)) != 0)
1810 		return (error);
1811 	vp = nd->nl_nch.ncp->nc_vp;
1812 	KKASSERT(vp != NULL);
1813 	if (vp->v_type == VDIR)
1814 		return (EPERM);		/* POSIX */
1815 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
1816 		return (error);
1817 	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
1818 		return (error);
1819 
1820 	/*
1821 	 * Unlock the source so we can lookup the target without deadlocking
1822 	 * (XXX vp is locked already, possible other deadlock?).  The target
1823 	 * must not exist.
1824 	 */
1825 	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
1826 	nd->nl_flags &= ~NLC_NCPISLOCKED;
1827 	cache_unlock(&nd->nl_nch);
1828 
1829 	linknd->nl_flags |= NLC_CREATE;
1830 	if ((error = nlookup(linknd)) != 0) {
1831 		vput(vp);
1832 		return (error);
1833 	}
1834 	if (linknd->nl_nch.ncp->nc_vp) {
1835 		vput(vp);
1836 		return (EEXIST);
1837 	}
1838 
1839 	/*
1840 	 * Finally run the new API VOP.
1841 	 */
1842 	error = can_hardlink(vp, td, td->td_proc->p_ucred);
1843 	if (error == 0)
1844 		error = VOP_NLINK(&linknd->nl_nch, vp, linknd->nl_cred);
1845 	vput(vp);
1846 	return (error);
1847 }
1848 
1849 /*
1850  * link_args(char *path, char *link)
1851  *
1852  * Make a hard file link.
1853  */
1854 int
1855 sys_link(struct link_args *uap)
1856 {
1857 	struct nlookupdata nd, linknd;
1858 	int error;
1859 
1860 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1861 	if (error == 0) {
1862 		error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
1863 		if (error == 0)
1864 			error = kern_link(&nd, &linknd);
1865 		nlookup_done(&linknd);
1866 	}
1867 	nlookup_done(&nd);
1868 	return (error);
1869 }
1870 
1871 int
1872 kern_symlink(struct nlookupdata *nd, char *path, int mode)
1873 {
1874 	struct vattr vattr;
1875 	struct vnode *vp;
1876 	int error;
1877 
1878 	bwillwrite();
1879 	nd->nl_flags |= NLC_CREATE;
1880 	if ((error = nlookup(nd)) != 0)
1881 		return (error);
1882 	if (nd->nl_nch.ncp->nc_vp)
1883 		return (EEXIST);
1884 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
1885 		return (error);
1886 	VATTR_NULL(&vattr);
1887 	vattr.va_mode = mode;
1888 	error = VOP_NSYMLINK(&nd->nl_nch, &vp, nd->nl_cred, &vattr, path);
1889 	if (error == 0)
1890 		vput(vp);
1891 	return (error);
1892 }
1893 
1894 /*
1895  * symlink(char *path, char *link)
1896  *
1897  * Make a symbolic link.
1898  */
1899 int
1900 sys_symlink(struct symlink_args *uap)
1901 {
1902 	struct thread *td = curthread;
1903 	struct nlookupdata nd;
1904 	char *path;
1905 	int error;
1906 	int mode;
1907 
1908 	path = objcache_get(namei_oc, M_WAITOK);
1909 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
1910 	if (error == 0) {
1911 		error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
1912 		if (error == 0) {
1913 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
1914 			error = kern_symlink(&nd, path, mode);
1915 		}
1916 		nlookup_done(&nd);
1917 	}
1918 	objcache_put(namei_oc, path);
1919 	return (error);
1920 }
1921 
1922 /*
1923  * undelete_args(char *path)
1924  *
1925  * Delete a whiteout from the filesystem.
1926  */
1927 /* ARGSUSED */
1928 int
1929 sys_undelete(struct undelete_args *uap)
1930 {
1931 	struct nlookupdata nd;
1932 	int error;
1933 
1934 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1935 	bwillwrite();
1936 	nd.nl_flags |= NLC_DELETE;
1937 	if (error == 0)
1938 		error = nlookup(&nd);
1939 	if (error == 0)
1940 		error = ncp_writechk(&nd.nl_nch);
1941 	if (error == 0)
1942 		error = VOP_NWHITEOUT(&nd.nl_nch, nd.nl_cred, NAMEI_DELETE);
1943 	nlookup_done(&nd);
1944 	return (error);
1945 }
1946 
1947 int
1948 kern_unlink(struct nlookupdata *nd)
1949 {
1950 	int error;
1951 
1952 	bwillwrite();
1953 	nd->nl_flags |= NLC_DELETE;
1954 	if ((error = nlookup(nd)) != 0)
1955 		return (error);
1956 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
1957 		return (error);
1958 	error = VOP_NREMOVE(&nd->nl_nch, nd->nl_cred);
1959 	return (error);
1960 }
1961 
1962 /*
1963  * unlink_args(char *path)
1964  *
1965  * Delete a name from the filesystem.
1966  */
1967 int
1968 sys_unlink(struct unlink_args *uap)
1969 {
1970 	struct nlookupdata nd;
1971 	int error;
1972 
1973 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1974 	if (error == 0)
1975 		error = kern_unlink(&nd);
1976 	nlookup_done(&nd);
1977 	return (error);
1978 }
1979 
1980 int
1981 kern_lseek(int fd, off_t offset, int whence, off_t *res)
1982 {
1983 	struct thread *td = curthread;
1984 	struct proc *p = td->td_proc;
1985 	struct file *fp;
1986 	struct vattr vattr;
1987 	int error;
1988 
1989 	fp = holdfp(p->p_fd, fd, -1);
1990 	if (fp == NULL)
1991 		return (EBADF);
1992 	if (fp->f_type != DTYPE_VNODE) {
1993 		error = ESPIPE;
1994 		goto done;
1995 	}
1996 
1997 	switch (whence) {
1998 	case L_INCR:
1999 		fp->f_offset += offset;
2000 		error = 0;
2001 		break;
2002 	case L_XTND:
2003 		error = VOP_GETATTR((struct vnode *)fp->f_data, &vattr);
2004 		if (error == 0)
2005 			fp->f_offset = offset + vattr.va_size;
2006 		break;
2007 	case L_SET:
2008 		fp->f_offset = offset;
2009 		error = 0;
2010 		break;
2011 	default:
2012 		error = EINVAL;
2013 		break;
2014 	}
2015 	*res = fp->f_offset;
2016 done:
2017 	fdrop(fp);
2018 	return (error);
2019 }
2020 
2021 /*
2022  * lseek_args(int fd, int pad, off_t offset, int whence)
2023  *
2024  * Reposition read/write file offset.
2025  */
2026 int
2027 sys_lseek(struct lseek_args *uap)
2028 {
2029 	int error;
2030 
2031 	error = kern_lseek(uap->fd, uap->offset, uap->whence,
2032 	    &uap->sysmsg_offset);
2033 
2034 	return (error);
2035 }
2036 
2037 int
2038 kern_access(struct nlookupdata *nd, int aflags)
2039 {
2040 	struct vnode *vp;
2041 	int error, flags;
2042 
2043 	if ((error = nlookup(nd)) != 0)
2044 		return (error);
2045 retry:
2046 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
2047 	if (error)
2048 		return (error);
2049 
2050 	/* Flags == 0 means only check for existence. */
2051 	if (aflags) {
2052 		flags = 0;
2053 		if (aflags & R_OK)
2054 			flags |= VREAD;
2055 		if (aflags & W_OK)
2056 			flags |= VWRITE;
2057 		if (aflags & X_OK)
2058 			flags |= VEXEC;
2059 		if ((flags & VWRITE) == 0 ||
2060 		    (error = vn_writechk(vp, &nd->nl_nch)) == 0)
2061 			error = VOP_ACCESS(vp, flags, nd->nl_cred);
2062 
2063 		/*
2064 		 * If the file handle is stale we have to re-resolve the
2065 		 * entry.  This is a hack at the moment.
2066 		 */
2067 		if (error == ESTALE) {
2068 			cache_setunresolved(&nd->nl_nch);
2069 			error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2070 			if (error == 0) {
2071 				vput(vp);
2072 				vp = NULL;
2073 				goto retry;
2074 			}
2075 		}
2076 	}
2077 	vput(vp);
2078 	return (error);
2079 }
2080 
2081 /*
2082  * access_args(char *path, int flags)
2083  *
2084  * Check access permissions.
2085  */
2086 int
2087 sys_access(struct access_args *uap)
2088 {
2089 	struct nlookupdata nd;
2090 	int error;
2091 
2092 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2093 	if (error == 0)
2094 		error = kern_access(&nd, uap->flags);
2095 	nlookup_done(&nd);
2096 	return (error);
2097 }
2098 
2099 int
2100 kern_stat(struct nlookupdata *nd, struct stat *st)
2101 {
2102 	int error;
2103 	struct vnode *vp;
2104 	thread_t td;
2105 
2106 	if ((error = nlookup(nd)) != 0)
2107 		return (error);
2108 again:
2109 	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
2110 		return (ENOENT);
2111 
2112 	td = curthread;
2113 	if ((error = vget(vp, LK_SHARED)) != 0)
2114 		return (error);
2115 	error = vn_stat(vp, st, nd->nl_cred);
2116 
2117 	/*
2118 	 * If the file handle is stale we have to re-resolve the entry.  This
2119 	 * is a hack at the moment.
2120 	 */
2121 	if (error == ESTALE) {
2122 		cache_setunresolved(&nd->nl_nch);
2123 		error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2124 		if (error == 0) {
2125 			vput(vp);
2126 			goto again;
2127 		}
2128 	}
2129 	vput(vp);
2130 	return (error);
2131 }
2132 
2133 /*
2134  * stat_args(char *path, struct stat *ub)
2135  *
2136  * Get file status; this version follows links.
2137  */
2138 int
2139 sys_stat(struct stat_args *uap)
2140 {
2141 	struct nlookupdata nd;
2142 	struct stat st;
2143 	int error;
2144 
2145 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2146 	if (error == 0) {
2147 		error = kern_stat(&nd, &st);
2148 		if (error == 0)
2149 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2150 	}
2151 	nlookup_done(&nd);
2152 	return (error);
2153 }
2154 
2155 /*
2156  * lstat_args(char *path, struct stat *ub)
2157  *
2158  * Get file status; this version does not follow links.
2159  */
2160 int
2161 sys_lstat(struct lstat_args *uap)
2162 {
2163 	struct nlookupdata nd;
2164 	struct stat st;
2165 	int error;
2166 
2167 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2168 	if (error == 0) {
2169 		error = kern_stat(&nd, &st);
2170 		if (error == 0)
2171 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2172 	}
2173 	nlookup_done(&nd);
2174 	return (error);
2175 }
2176 
2177 /*
2178  * pathconf_Args(char *path, int name)
2179  *
2180  * Get configurable pathname variables.
2181  */
2182 /* ARGSUSED */
2183 int
2184 sys_pathconf(struct pathconf_args *uap)
2185 {
2186 	struct nlookupdata nd;
2187 	struct vnode *vp;
2188 	int error;
2189 
2190 	vp = NULL;
2191 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2192 	if (error == 0)
2193 		error = nlookup(&nd);
2194 	if (error == 0)
2195 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
2196 	nlookup_done(&nd);
2197 	if (error == 0) {
2198 		error = VOP_PATHCONF(vp, uap->name, uap->sysmsg_fds);
2199 		vput(vp);
2200 	}
2201 	return (error);
2202 }
2203 
2204 /*
2205  * XXX: daver
2206  * kern_readlink isn't properly split yet.  There is a copyin burried
2207  * in VOP_READLINK().
2208  */
2209 int
2210 kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
2211 {
2212 	struct thread *td = curthread;
2213 	struct proc *p = td->td_proc;
2214 	struct vnode *vp;
2215 	struct iovec aiov;
2216 	struct uio auio;
2217 	int error;
2218 
2219 	if ((error = nlookup(nd)) != 0)
2220 		return (error);
2221 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
2222 	if (error)
2223 		return (error);
2224 	if (vp->v_type != VLNK) {
2225 		error = EINVAL;
2226 	} else {
2227 		aiov.iov_base = buf;
2228 		aiov.iov_len = count;
2229 		auio.uio_iov = &aiov;
2230 		auio.uio_iovcnt = 1;
2231 		auio.uio_offset = 0;
2232 		auio.uio_rw = UIO_READ;
2233 		auio.uio_segflg = UIO_USERSPACE;
2234 		auio.uio_td = td;
2235 		auio.uio_resid = count;
2236 		error = VOP_READLINK(vp, &auio, p->p_ucred);
2237 	}
2238 	vput(vp);
2239 	*res = count - auio.uio_resid;
2240 	return (error);
2241 }
2242 
2243 /*
2244  * readlink_args(char *path, char *buf, int count)
2245  *
2246  * Return target name of a symbolic link.
2247  */
2248 int
2249 sys_readlink(struct readlink_args *uap)
2250 {
2251 	struct nlookupdata nd;
2252 	int error;
2253 
2254 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2255 	if (error == 0) {
2256 		error = kern_readlink(&nd, uap->buf, uap->count,
2257 					&uap->sysmsg_result);
2258 	}
2259 	nlookup_done(&nd);
2260 	return (error);
2261 }
2262 
2263 static int
2264 setfflags(struct vnode *vp, int flags)
2265 {
2266 	struct thread *td = curthread;
2267 	struct proc *p = td->td_proc;
2268 	int error;
2269 	struct vattr vattr;
2270 
2271 	/*
2272 	 * Prevent non-root users from setting flags on devices.  When
2273 	 * a device is reused, users can retain ownership of the device
2274 	 * if they are allowed to set flags and programs assume that
2275 	 * chown can't fail when done as root.
2276 	 */
2277 	if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
2278 	    ((error = suser_cred(p->p_ucred, PRISON_ROOT)) != 0))
2279 		return (error);
2280 
2281 	/*
2282 	 * note: vget is required for any operation that might mod the vnode
2283 	 * so VINACTIVE is properly cleared.
2284 	 */
2285 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2286 		VATTR_NULL(&vattr);
2287 		vattr.va_flags = flags;
2288 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2289 		vput(vp);
2290 	}
2291 	return (error);
2292 }
2293 
2294 /*
2295  * chflags(char *path, int flags)
2296  *
2297  * Change flags of a file given a path name.
2298  */
2299 /* ARGSUSED */
2300 int
2301 sys_chflags(struct chflags_args *uap)
2302 {
2303 	struct nlookupdata nd;
2304 	struct vnode *vp;
2305 	int error;
2306 
2307 	vp = NULL;
2308 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2309 	/* XXX Add NLC flag indicating modifying operation? */
2310 	if (error == 0)
2311 		error = nlookup(&nd);
2312 	if (error == 0)
2313 		error = ncp_writechk(&nd.nl_nch);
2314 	if (error == 0)
2315 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
2316 	nlookup_done(&nd);
2317 	if (error == 0) {
2318 		error = setfflags(vp, uap->flags);
2319 		vrele(vp);
2320 	}
2321 	return (error);
2322 }
2323 
2324 /*
2325  * fchflags_args(int fd, int flags)
2326  *
2327  * Change flags of a file given a file descriptor.
2328  */
2329 /* ARGSUSED */
2330 int
2331 sys_fchflags(struct fchflags_args *uap)
2332 {
2333 	struct thread *td = curthread;
2334 	struct proc *p = td->td_proc;
2335 	struct file *fp;
2336 	int error;
2337 
2338 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2339 		return (error);
2340 	if (fp->f_nchandle.ncp)
2341 		error = ncp_writechk(&fp->f_nchandle);
2342 	if (error == 0)
2343 		error = setfflags((struct vnode *) fp->f_data, uap->flags);
2344 	fdrop(fp);
2345 	return (error);
2346 }
2347 
2348 static int
2349 setfmode(struct vnode *vp, int mode)
2350 {
2351 	struct thread *td = curthread;
2352 	struct proc *p = td->td_proc;
2353 	int error;
2354 	struct vattr vattr;
2355 
2356 	/*
2357 	 * note: vget is required for any operation that might mod the vnode
2358 	 * so VINACTIVE is properly cleared.
2359 	 */
2360 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2361 		VATTR_NULL(&vattr);
2362 		vattr.va_mode = mode & ALLPERMS;
2363 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2364 		vput(vp);
2365 	}
2366 	return error;
2367 }
2368 
2369 int
2370 kern_chmod(struct nlookupdata *nd, int mode)
2371 {
2372 	struct vnode *vp;
2373 	int error;
2374 
2375 	/* XXX Add NLC flag indicating modifying operation? */
2376 	if ((error = nlookup(nd)) != 0)
2377 		return (error);
2378 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2379 		return (error);
2380 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
2381 		error = setfmode(vp, mode);
2382 	vrele(vp);
2383 	return (error);
2384 }
2385 
2386 /*
2387  * chmod_args(char *path, int mode)
2388  *
2389  * Change mode of a file given path name.
2390  */
2391 /* ARGSUSED */
2392 int
2393 sys_chmod(struct chmod_args *uap)
2394 {
2395 	struct nlookupdata nd;
2396 	int error;
2397 
2398 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2399 	if (error == 0)
2400 		error = kern_chmod(&nd, uap->mode);
2401 	nlookup_done(&nd);
2402 	return (error);
2403 }
2404 
2405 /*
2406  * lchmod_args(char *path, int mode)
2407  *
2408  * Change mode of a file given path name (don't follow links.)
2409  */
2410 /* ARGSUSED */
2411 int
2412 sys_lchmod(struct lchmod_args *uap)
2413 {
2414 	struct nlookupdata nd;
2415 	int error;
2416 
2417 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2418 	if (error == 0)
2419 		error = kern_chmod(&nd, uap->mode);
2420 	nlookup_done(&nd);
2421 	return (error);
2422 }
2423 
2424 /*
2425  * fchmod_args(int fd, int mode)
2426  *
2427  * Change mode of a file given a file descriptor.
2428  */
2429 /* ARGSUSED */
2430 int
2431 sys_fchmod(struct fchmod_args *uap)
2432 {
2433 	struct thread *td = curthread;
2434 	struct proc *p = td->td_proc;
2435 	struct file *fp;
2436 	int error;
2437 
2438 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2439 		return (error);
2440 	if (fp->f_nchandle.ncp)
2441 		error = ncp_writechk(&fp->f_nchandle);
2442 	if (error == 0)
2443 		error = setfmode((struct vnode *)fp->f_data, uap->mode);
2444 	fdrop(fp);
2445 	return (error);
2446 }
2447 
2448 static int
2449 setfown(struct vnode *vp, uid_t uid, gid_t gid)
2450 {
2451 	struct thread *td = curthread;
2452 	struct proc *p = td->td_proc;
2453 	int error;
2454 	struct vattr vattr;
2455 
2456 	/*
2457 	 * note: vget is required for any operation that might mod the vnode
2458 	 * so VINACTIVE is properly cleared.
2459 	 */
2460 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2461 		VATTR_NULL(&vattr);
2462 		vattr.va_uid = uid;
2463 		vattr.va_gid = gid;
2464 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2465 		vput(vp);
2466 	}
2467 	return error;
2468 }
2469 
2470 int
2471 kern_chown(struct nlookupdata *nd, int uid, int gid)
2472 {
2473 	struct vnode *vp;
2474 	int error;
2475 
2476 	/* XXX Add NLC flag indicating modifying operation? */
2477 	if ((error = nlookup(nd)) != 0)
2478 		return (error);
2479 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2480 		return (error);
2481 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
2482 		error = setfown(vp, uid, gid);
2483 	vrele(vp);
2484 	return (error);
2485 }
2486 
2487 /*
2488  * chown(char *path, int uid, int gid)
2489  *
2490  * Set ownership given a path name.
2491  */
2492 int
2493 sys_chown(struct chown_args *uap)
2494 {
2495 	struct nlookupdata nd;
2496 	int error;
2497 
2498 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2499 	if (error == 0)
2500 		error = kern_chown(&nd, uap->uid, uap->gid);
2501 	nlookup_done(&nd);
2502 	return (error);
2503 }
2504 
2505 /*
2506  * lchown_args(char *path, int uid, int gid)
2507  *
2508  * Set ownership given a path name, do not cross symlinks.
2509  */
2510 int
2511 sys_lchown(struct lchown_args *uap)
2512 {
2513 	struct nlookupdata nd;
2514 	int error;
2515 
2516 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2517 	if (error == 0)
2518 		error = kern_chown(&nd, uap->uid, uap->gid);
2519 	nlookup_done(&nd);
2520 	return (error);
2521 }
2522 
2523 /*
2524  * fchown_args(int fd, int uid, int gid)
2525  *
2526  * Set ownership given a file descriptor.
2527  */
2528 /* ARGSUSED */
2529 int
2530 sys_fchown(struct fchown_args *uap)
2531 {
2532 	struct thread *td = curthread;
2533 	struct proc *p = td->td_proc;
2534 	struct file *fp;
2535 	int error;
2536 
2537 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2538 		return (error);
2539 	if (fp->f_nchandle.ncp)
2540 		error = ncp_writechk(&fp->f_nchandle);
2541 	if (error == 0)
2542 		error = setfown((struct vnode *)fp->f_data, uap->uid, uap->gid);
2543 	fdrop(fp);
2544 	return (error);
2545 }
2546 
2547 static int
2548 getutimes(const struct timeval *tvp, struct timespec *tsp)
2549 {
2550 	struct timeval tv[2];
2551 
2552 	if (tvp == NULL) {
2553 		microtime(&tv[0]);
2554 		TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
2555 		tsp[1] = tsp[0];
2556 	} else {
2557 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
2558 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
2559 	}
2560 	return 0;
2561 }
2562 
2563 static int
2564 setutimes(struct vnode *vp, const struct timespec *ts, int nullflag)
2565 {
2566 	struct thread *td = curthread;
2567 	struct proc *p = td->td_proc;
2568 	int error;
2569 	struct vattr vattr;
2570 
2571 	/*
2572 	 * note: vget is required for any operation that might mod the vnode
2573 	 * so VINACTIVE is properly cleared.
2574 	 */
2575 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2576 		VATTR_NULL(&vattr);
2577 		vattr.va_atime = ts[0];
2578 		vattr.va_mtime = ts[1];
2579 		if (nullflag)
2580 			vattr.va_vaflags |= VA_UTIMES_NULL;
2581 		error = VOP_SETATTR(vp, &vattr, p->p_ucred);
2582 		vput(vp);
2583 	}
2584 	return error;
2585 }
2586 
2587 int
2588 kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
2589 {
2590 	struct timespec ts[2];
2591 	struct vnode *vp;
2592 	int error;
2593 
2594 	if ((error = getutimes(tptr, ts)) != 0)
2595 		return (error);
2596 	/* XXX Add NLC flag indicating modifying operation? */
2597 	if ((error = nlookup(nd)) != 0)
2598 		return (error);
2599 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2600 		return (error);
2601 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2602 		return (error);
2603 	error = setutimes(vp, ts, tptr == NULL);
2604 	vrele(vp);
2605 	return (error);
2606 }
2607 
2608 /*
2609  * utimes_args(char *path, struct timeval *tptr)
2610  *
2611  * Set the access and modification times of a file.
2612  */
2613 int
2614 sys_utimes(struct utimes_args *uap)
2615 {
2616 	struct timeval tv[2];
2617 	struct nlookupdata nd;
2618 	int error;
2619 
2620 	if (uap->tptr) {
2621  		error = copyin(uap->tptr, tv, sizeof(tv));
2622 		if (error)
2623 			return (error);
2624 	}
2625 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2626 	if (error == 0)
2627 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
2628 	nlookup_done(&nd);
2629 	return (error);
2630 }
2631 
2632 /*
2633  * lutimes_args(char *path, struct timeval *tptr)
2634  *
2635  * Set the access and modification times of a file.
2636  */
2637 int
2638 sys_lutimes(struct lutimes_args *uap)
2639 {
2640 	struct timeval tv[2];
2641 	struct nlookupdata nd;
2642 	int error;
2643 
2644 	if (uap->tptr) {
2645 		error = copyin(uap->tptr, tv, sizeof(tv));
2646 		if (error)
2647 			return (error);
2648 	}
2649 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2650 	if (error == 0)
2651 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
2652 	nlookup_done(&nd);
2653 	return (error);
2654 }
2655 
2656 int
2657 kern_futimes(int fd, struct timeval *tptr)
2658 {
2659 	struct thread *td = curthread;
2660 	struct proc *p = td->td_proc;
2661 	struct timespec ts[2];
2662 	struct file *fp;
2663 	int error;
2664 
2665 	error = getutimes(tptr, ts);
2666 	if (error)
2667 		return (error);
2668 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
2669 		return (error);
2670 	if (fp->f_nchandle.ncp)
2671 		error = ncp_writechk(&fp->f_nchandle);
2672 	if (error == 0)
2673 		error =  setutimes((struct vnode *)fp->f_data, ts, tptr == NULL);
2674 	fdrop(fp);
2675 	return (error);
2676 }
2677 
2678 /*
2679  * futimes_args(int fd, struct timeval *tptr)
2680  *
2681  * Set the access and modification times of a file.
2682  */
2683 int
2684 sys_futimes(struct futimes_args *uap)
2685 {
2686 	struct timeval tv[2];
2687 	int error;
2688 
2689 	if (uap->tptr) {
2690 		error = copyin(uap->tptr, tv, sizeof(tv));
2691 		if (error)
2692 			return (error);
2693 	}
2694 
2695 	error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
2696 
2697 	return (error);
2698 }
2699 
2700 int
2701 kern_truncate(struct nlookupdata *nd, off_t length)
2702 {
2703 	struct vnode *vp;
2704 	struct vattr vattr;
2705 	int error;
2706 
2707 	if (length < 0)
2708 		return(EINVAL);
2709 	/* XXX Add NLC flag indicating modifying operation? */
2710 	if ((error = nlookup(nd)) != 0)
2711 		return (error);
2712 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2713 		return (error);
2714 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
2715 		return (error);
2716 	if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) != 0) {
2717 		vrele(vp);
2718 		return (error);
2719 	}
2720 	if (vp->v_type == VDIR) {
2721 		error = EISDIR;
2722 	} else if ((error = vn_writechk(vp, &nd->nl_nch)) == 0 &&
2723 	    (error = VOP_ACCESS(vp, VWRITE, nd->nl_cred)) == 0) {
2724 		VATTR_NULL(&vattr);
2725 		vattr.va_size = length;
2726 		error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
2727 	}
2728 	vput(vp);
2729 	return (error);
2730 }
2731 
2732 /*
2733  * truncate(char *path, int pad, off_t length)
2734  *
2735  * Truncate a file given its path name.
2736  */
2737 int
2738 sys_truncate(struct truncate_args *uap)
2739 {
2740 	struct nlookupdata nd;
2741 	int error;
2742 
2743 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2744 	if (error == 0)
2745 		error = kern_truncate(&nd, uap->length);
2746 	nlookup_done(&nd);
2747 	return error;
2748 }
2749 
2750 int
2751 kern_ftruncate(int fd, off_t length)
2752 {
2753 	struct thread *td = curthread;
2754 	struct proc *p = td->td_proc;
2755 	struct vattr vattr;
2756 	struct vnode *vp;
2757 	struct file *fp;
2758 	int error;
2759 
2760 	if (length < 0)
2761 		return(EINVAL);
2762 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
2763 		return (error);
2764 	if (fp->f_nchandle.ncp) {
2765 		error = ncp_writechk(&fp->f_nchandle);
2766 		if (error)
2767 			goto done;
2768 	}
2769 	if ((fp->f_flag & FWRITE) == 0) {
2770 		error = EINVAL;
2771 		goto done;
2772 	}
2773 	vp = (struct vnode *)fp->f_data;
2774 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2775 	if (vp->v_type == VDIR) {
2776 		error = EISDIR;
2777 	} else if ((error = vn_writechk(vp, NULL)) == 0) {
2778 		VATTR_NULL(&vattr);
2779 		vattr.va_size = length;
2780 		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
2781 	}
2782 	vn_unlock(vp);
2783 done:
2784 	fdrop(fp);
2785 	return (error);
2786 }
2787 
2788 /*
2789  * ftruncate_args(int fd, int pad, off_t length)
2790  *
2791  * Truncate a file given a file descriptor.
2792  */
2793 int
2794 sys_ftruncate(struct ftruncate_args *uap)
2795 {
2796 	int error;
2797 
2798 	error = kern_ftruncate(uap->fd, uap->length);
2799 
2800 	return (error);
2801 }
2802 
2803 /*
2804  * fsync(int fd)
2805  *
2806  * Sync an open file.
2807  */
2808 /* ARGSUSED */
2809 int
2810 sys_fsync(struct fsync_args *uap)
2811 {
2812 	struct thread *td = curthread;
2813 	struct proc *p = td->td_proc;
2814 	struct vnode *vp;
2815 	struct file *fp;
2816 	vm_object_t obj;
2817 	int error;
2818 
2819 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
2820 		return (error);
2821 	vp = (struct vnode *)fp->f_data;
2822 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2823 	if ((obj = vp->v_object) != NULL)
2824 		vm_object_page_clean(obj, 0, 0, 0);
2825 	if ((error = VOP_FSYNC(vp, MNT_WAIT)) == 0 &&
2826 	    vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) &&
2827 	    bioops.io_fsync) {
2828 		error = (*bioops.io_fsync)(vp);
2829 	}
2830 	vn_unlock(vp);
2831 	fdrop(fp);
2832 	return (error);
2833 }
2834 
/*
 * Rename the namecache entry described by fromnd to the one described
 * by tond.  Both nlookupdata structures must already be initialized;
 * the caller remains responsible for nlookup_done() on each.
 *
 * Errors produced directly by this function:
 *	ENOENT	 - source or target has no parent, a parent linkage
 *		   changed while the source was unlocked, or the source
 *		   vnode disappeared while an existing target is present
 *	EINVAL	 - source or target is a mount point, or the source is
 *		   the target's directory or one of its ancestors
 *	EXDEV	 - source, target and their parents are not all on the
 *		   same mount
 *	ENOTDIR	 - source is a directory but the existing target is not
 *	EISDIR	 - existing target is a directory but the source is not
 * Errors from nlookup(), ncp_writechk(), VOP_NREMOVE() and
 * VOP_NRENAME() are passed through.  Returns 0 without doing anything
 * when source and target resolve to the same namecache entry.
 */
int
kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
{
	struct nchandle fnchd;
	struct nchandle tnchd;
	struct namecache *ncp;
	struct mount *mp;
	int error;

	bwillwrite();
	if ((error = nlookup(fromnd)) != 0)
		return (error);
	/* Hold a reference on the source's parent directory entry. */
	if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
		return (ENOENT);
	fnchd.mount = fromnd->nl_nch.mount;
	cache_hold(&fnchd);

	/*
	 * unlock the source nch so we can lookup the target nch without
	 * deadlocking.  The target may or may not exist so we do not check
	 * for a target vp like kern_mkdir() and other creation functions do.
	 *
	 * The source and target directories are ref'd and rechecked after
	 * everything is relocked to determine if the source or target file
	 * has been renamed.
	 */
	KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
	fromnd->nl_flags &= ~NLC_NCPISLOCKED;
	cache_unlock(&fromnd->nl_nch);

	tond->nl_flags |= NLC_CREATE;
	if ((error = nlookup(tond)) != 0) {
		cache_drop(&fnchd);
		return (error);
	}
	/* Hold a reference on the target's parent directory entry. */
	if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
		cache_drop(&fnchd);
		return (ENOENT);
	}
	tnchd.mount = tond->nl_nch.mount;
	cache_hold(&tnchd);

	/*
	 * If the source and target are the same there is nothing to do
	 */
	if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (0);
	}

	/*
	 * Mount points cannot be renamed or overwritten
	 */
	if ((fromnd->nl_nch.ncp->nc_flag | tond->nl_nch.ncp->nc_flag) &
	    NCF_ISMOUNTPT
	) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EINVAL);
	}

	/*
	 * relock the source ncp.  NOTE AFTER RELOCKING: the source ncp
	 * may have become invalid while it was unlocked, nc_vp and nc_mount
	 * could be NULL.
	 *
	 * A non-blocking attempt is made first.  If that fails, a blocking
	 * lock is taken directly only when the source ncp has the higher
	 * address; otherwise the target is unlocked, the source is locked,
	 * and then the target is relocked and re-resolved.
	 */
	if (cache_lock_nonblock(&fromnd->nl_nch) == 0) {
		cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
	} else if (fromnd->nl_nch.ncp > tond->nl_nch.ncp) {
		cache_lock(&fromnd->nl_nch);
		cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
	} else {
		cache_unlock(&tond->nl_nch);
		cache_lock(&fromnd->nl_nch);
		cache_resolve(&fromnd->nl_nch, fromnd->nl_cred);
		cache_lock(&tond->nl_nch);
		cache_resolve(&tond->nl_nch, tond->nl_cred);
	}
	fromnd->nl_flags |= NLC_NCPISLOCKED;

	/*
	 * make sure the parent directories linkages are the same
	 */
	if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent ||
	    tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (ENOENT);
	}

	/*
	 * Both the source and target must be within the same filesystem and
	 * in the same filesystem as their parent directories within the
	 * namecache topology.
	 *
	 * NOTE: fromnd's nc_mount or nc_vp could be NULL.
	 */
	mp = fnchd.mount;
	if (mp != tnchd.mount || mp != fromnd->nl_nch.mount ||
	    mp != tond->nl_nch.mount) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EXDEV);
	}

	/*
	 * Make sure the mount point is writable
	 */
	if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (error);
	}

	/*
	 * If the target exists and either the source or target is a directory,
	 * then both must be directories.
	 *
	 * Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp might
	 * have become NULL.
	 *
	 * error is 0 on entry to this block (ncp_writechk() succeeded above).
	 */
	if (tond->nl_nch.ncp->nc_vp) {
		if (fromnd->nl_nch.ncp->nc_vp == NULL) {
			error = ENOENT;
		} else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
			if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
				error = ENOTDIR;
		} else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
			error = EISDIR;
		}
	}

	/*
	 * You cannot rename a source into itself or a subdirectory of itself.
	 * We check this by traversing the target directory upwards looking
	 * for a match against the source.
	 */
	if (error == 0) {
		for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
			if (fromnd->nl_nch.ncp == ncp) {
				error = EINVAL;
				break;
			}
		}
	}

	/* The parent references are released before the VOP is issued. */
	cache_drop(&fnchd);
	cache_drop(&tnchd);

	/*
	 * Even though the namespaces are different, they may still represent
	 * hardlinks to the same file.  The filesystem might have a hard time
	 * with this so we issue a NREMOVE of the source instead of a NRENAME
	 * when we detect the situation.
	 *
	 * NOTE(review): the source-vanished check above only runs when the
	 * target exists.  If the target does not exist and the source's
	 * nc_vp became NULL, both nc_vp pointers are NULL, compare equal,
	 * and the NREMOVE path is taken - confirm this is intended.
	 */
	if (error == 0) {
		if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
			error = VOP_NREMOVE(&fromnd->nl_nch, fromnd->nl_cred);
		} else {
			error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
					    tond->nl_cred);
		}
	}
	return (error);
}
3001 
3002 /*
3003  * rename_args(char *from, char *to)
3004  *
3005  * Rename files.  Source and destination must either both be directories,
3006  * or both not be directories.  If target is a directory, it must be empty.
3007  */
3008 int
3009 sys_rename(struct rename_args *uap)
3010 {
3011 	struct nlookupdata fromnd, tond;
3012 	int error;
3013 
3014 	error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
3015 	if (error == 0) {
3016 		error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
3017 		if (error == 0)
3018 			error = kern_rename(&fromnd, &tond);
3019 		nlookup_done(&tond);
3020 	}
3021 	nlookup_done(&fromnd);
3022 	return (error);
3023 }
3024 
3025 int
3026 kern_mkdir(struct nlookupdata *nd, int mode)
3027 {
3028 	struct thread *td = curthread;
3029 	struct proc *p = td->td_proc;
3030 	struct vnode *vp;
3031 	struct vattr vattr;
3032 	int error;
3033 
3034 	bwillwrite();
3035 	nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE;
3036 	if ((error = nlookup(nd)) != 0)
3037 		return (error);
3038 
3039 	if (nd->nl_nch.ncp->nc_vp)
3040 		return (EEXIST);
3041 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3042 		return (error);
3043 
3044 	VATTR_NULL(&vattr);
3045 	vattr.va_type = VDIR;
3046 	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;
3047 
3048 	vp = NULL;
3049 	error = VOP_NMKDIR(&nd->nl_nch, &vp, p->p_ucred, &vattr);
3050 	if (error == 0)
3051 		vput(vp);
3052 	return (error);
3053 }
3054 
3055 /*
3056  * mkdir_args(char *path, int mode)
3057  *
3058  * Make a directory file.
3059  */
3060 /* ARGSUSED */
3061 int
3062 sys_mkdir(struct mkdir_args *uap)
3063 {
3064 	struct nlookupdata nd;
3065 	int error;
3066 
3067 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3068 	if (error == 0)
3069 		error = kern_mkdir(&nd, uap->mode);
3070 	nlookup_done(&nd);
3071 	return (error);
3072 }
3073 
3074 int
3075 kern_rmdir(struct nlookupdata *nd)
3076 {
3077 	int error;
3078 
3079 	bwillwrite();
3080 	nd->nl_flags |= NLC_DELETE;
3081 	if ((error = nlookup(nd)) != 0)
3082 		return (error);
3083 
3084 	/*
3085 	 * Do not allow directories representing mount points to be
3086 	 * deleted, even if empty.  Check write perms on mount point
3087 	 * in case the vnode is aliased (aka nullfs).
3088 	 */
3089 	if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
3090 		return (EINVAL);
3091 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
3092 		return (error);
3093 
3094 	error = VOP_NRMDIR(&nd->nl_nch, nd->nl_cred);
3095 	return (error);
3096 }
3097 
3098 /*
3099  * rmdir_args(char *path)
3100  *
3101  * Remove a directory file.
3102  */
3103 /* ARGSUSED */
3104 int
3105 sys_rmdir(struct rmdir_args *uap)
3106 {
3107 	struct nlookupdata nd;
3108 	int error;
3109 
3110 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3111 	if (error == 0)
3112 		error = kern_rmdir(&nd);
3113 	nlookup_done(&nd);
3114 	return (error);
3115 }
3116 
/*
 * Read directory entries from the directory open on fd into buf.
 *
 *	fd	  - descriptor; must be open for reading and refer to a
 *		    directory vnode
 *	buf/count - destination buffer and its size in bytes
 *	basep	  - if non-NULL, receives the file offset the read
 *		    started from
 *	res	  - receives the number of bytes placed in buf
 *	direction - address space of buf, stored in uio_segflg
 *
 * Returns EBADF if fd is not open for reading, EINVAL if it is not a
 * directory, otherwise any error from holdvnode(), VOP_READDIR() or the
 * union_dircheckp hook.  *basep and *res are only written on success.
 */
int
kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
    enum uio_seg direction)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	long loff;
	int error, eofflag;

	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		error = EBADF;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;
unionread:
	/* Re-entered from below when union_dircheckp() asks for a retry. */
	if (vp->v_type != VDIR) {
		error = EINVAL;
		goto done;
	}
	/* Describe the caller's buffer as a single-segment read request. */
	aiov.iov_base = buf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = direction;
	auio.uio_td = td;
	auio.uio_resid = count;
	/* Remember the starting offset so it can be reported via basep. */
	loff = auio.uio_offset = fp->f_offset;
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
	/* The file offset is advanced even when VOP_READDIR() fails. */
	fp->f_offset = auio.uio_offset;
	if (error)
		goto done;
	/* Nothing was transferred: give the union hook, if any, a chance. */
	if (count == auio.uio_resid) {
		if (union_dircheckp) {
			error = union_dircheckp(td, &vp, fp);
			/* -1 from the hook means "retry the read". */
			if (error == -1)
				goto unionread;
			if (error)
				goto done;
		}
#if 0
		if ((vp->v_flag & VROOT) &&
		    (vp->v_mount->mnt_flag & MNT_UNION)) {
			struct vnode *tvp = vp;
			vp = vp->v_mount->mnt_vnodecovered;
			vref(vp);
			fp->f_data = vp;
			fp->f_offset = 0;
			vrele(tvp);
			goto unionread;
		}
#endif
	}
	if (basep) {
		*basep = loff;
	}
	*res = count - auio.uio_resid;
done:
	fdrop(fp);
	return (error);
}
3184 
3185 /*
3186  * getdirentries_args(int fd, char *buf, u_int conut, long *basep)
3187  *
3188  * Read a block of directory entries in a file system independent format.
3189  */
3190 int
3191 sys_getdirentries(struct getdirentries_args *uap)
3192 {
3193 	long base;
3194 	int error;
3195 
3196 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
3197 	    &uap->sysmsg_result, UIO_USERSPACE);
3198 
3199 	if (error == 0)
3200 		error = copyout(&base, uap->basep, sizeof(*uap->basep));
3201 	return (error);
3202 }
3203 
3204 /*
3205  * getdents_args(int fd, char *buf, size_t count)
3206  */
3207 int
3208 sys_getdents(struct getdents_args *uap)
3209 {
3210 	int error;
3211 
3212 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
3213 	    &uap->sysmsg_result, UIO_USERSPACE);
3214 
3215 	return (error);
3216 }
3217 
3218 /*
3219  * umask(int newmask)
3220  *
3221  * Set the mode mask for creation of filesystem nodes.
3222  *
3223  * MP SAFE
3224  */
3225 int
3226 sys_umask(struct umask_args *uap)
3227 {
3228 	struct thread *td = curthread;
3229 	struct proc *p = td->td_proc;
3230 	struct filedesc *fdp;
3231 
3232 	fdp = p->p_fd;
3233 	uap->sysmsg_result = fdp->fd_cmask;
3234 	fdp->fd_cmask = uap->newmask & ALLPERMS;
3235 	return (0);
3236 }
3237 
3238 /*
3239  * revoke(char *path)
3240  *
3241  * Void all references to file by ripping underlying filesystem
3242  * away from vnode.
3243  */
3244 /* ARGSUSED */
3245 int
3246 sys_revoke(struct revoke_args *uap)
3247 {
3248 	struct nlookupdata nd;
3249 	struct vattr vattr;
3250 	struct vnode *vp;
3251 	struct ucred *cred;
3252 	int error;
3253 
3254 	vp = NULL;
3255 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3256 	if (error == 0)
3257 		error = nlookup(&nd);
3258 	if (error == 0)
3259 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3260 	cred = crhold(nd.nl_cred);
3261 	nlookup_done(&nd);
3262 	if (error == 0) {
3263 		if (vp->v_type != VCHR && vp->v_type != VBLK)
3264 			error = EINVAL;
3265 		if (error == 0)
3266 			error = VOP_GETATTR(vp, &vattr);
3267 		if (error == 0 && cred->cr_uid != vattr.va_uid)
3268 			error = suser_cred(cred, PRISON_ROOT);
3269 		if (error == 0 && count_udev(vp->v_umajor, vp->v_uminor) > 0) {
3270 			error = 0;
3271 			vx_lock(vp);
3272 			VOP_REVOKE(vp, REVOKEALL);
3273 			vx_unlock(vp);
3274 		}
3275 		vrele(vp);
3276 	}
3277 	if (cred)
3278 		crfree(cred);
3279 	return (error);
3280 }
3281 
3282 /*
3283  * getfh_args(char *fname, fhandle_t *fhp)
3284  *
3285  * Get (NFS) file handle
3286  */
3287 int
3288 sys_getfh(struct getfh_args *uap)
3289 {
3290 	struct thread *td = curthread;
3291 	struct nlookupdata nd;
3292 	fhandle_t fh;
3293 	struct vnode *vp;
3294 	int error;
3295 
3296 	/*
3297 	 * Must be super user
3298 	 */
3299 	if ((error = suser(td)) != 0)
3300 		return (error);
3301 
3302 	vp = NULL;
3303 	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
3304 	if (error == 0)
3305 		error = nlookup(&nd);
3306 	if (error == 0)
3307 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3308 	nlookup_done(&nd);
3309 	if (error == 0) {
3310 		bzero(&fh, sizeof(fh));
3311 		fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
3312 		error = VFS_VPTOFH(vp, &fh.fh_fid);
3313 		vput(vp);
3314 		if (error == 0)
3315 			error = copyout(&fh, uap->fhp, sizeof(fh));
3316 	}
3317 	return (error);
3318 }
3319 
3320 /*
3321  * fhopen_args(const struct fhandle *u_fhp, int flags)
3322  *
3323  * syscall for the rpc.lockd to use to translate a NFS file handle into
3324  * an open descriptor.
3325  *
3326  * warning: do not remove the suser() call or this becomes one giant
3327  * security hole.
3328  */
3329 int
3330 sys_fhopen(struct fhopen_args *uap)
3331 {
3332 	struct thread *td = curthread;
3333 	struct proc *p = td->td_proc;
3334 	struct mount *mp;
3335 	struct vnode *vp;
3336 	struct fhandle fhp;
3337 	struct vattr vat;
3338 	struct vattr *vap = &vat;
3339 	struct flock lf;
3340 	int fmode, mode, error, type;
3341 	struct file *nfp;
3342 	struct file *fp;
3343 	int indx;
3344 
3345 	/*
3346 	 * Must be super user
3347 	 */
3348 	error = suser(td);
3349 	if (error)
3350 		return (error);
3351 
3352 	fmode = FFLAGS(uap->flags);
3353 	/* why not allow a non-read/write open for our lockd? */
3354 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
3355 		return (EINVAL);
3356 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
3357 	if (error)
3358 		return(error);
3359 	/* find the mount point */
3360 	mp = vfs_getvfs(&fhp.fh_fsid);
3361 	if (mp == NULL)
3362 		return (ESTALE);
3363 	/* now give me my vnode, it gets returned to me locked */
3364 	error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp);
3365 	if (error)
3366 		return (error);
3367  	/*
3368 	 * from now on we have to make sure not
3369 	 * to forget about the vnode
3370 	 * any error that causes an abort must vput(vp)
3371 	 * just set error = err and 'goto bad;'.
3372 	 */
3373 
3374 	/*
3375 	 * from vn_open
3376 	 */
3377 	if (vp->v_type == VLNK) {
3378 		error = EMLINK;
3379 		goto bad;
3380 	}
3381 	if (vp->v_type == VSOCK) {
3382 		error = EOPNOTSUPP;
3383 		goto bad;
3384 	}
3385 	mode = 0;
3386 	if (fmode & (FWRITE | O_TRUNC)) {
3387 		if (vp->v_type == VDIR) {
3388 			error = EISDIR;
3389 			goto bad;
3390 		}
3391 		error = vn_writechk(vp, NULL);
3392 		if (error)
3393 			goto bad;
3394 		mode |= VWRITE;
3395 	}
3396 	if (fmode & FREAD)
3397 		mode |= VREAD;
3398 	if (mode) {
3399 		error = VOP_ACCESS(vp, mode, p->p_ucred);
3400 		if (error)
3401 			goto bad;
3402 	}
3403 	if (fmode & O_TRUNC) {
3404 		vn_unlock(vp);				/* XXX */
3405 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
3406 		VATTR_NULL(vap);
3407 		vap->va_size = 0;
3408 		error = VOP_SETATTR(vp, vap, p->p_ucred);
3409 		if (error)
3410 			goto bad;
3411 	}
3412 
3413 	/*
3414 	 * VOP_OPEN needs the file pointer so it can potentially override
3415 	 * it.
3416 	 *
3417 	 * WARNING! no f_nchandle will be associated when fhopen()ing a
3418 	 * directory.  XXX
3419 	 */
3420 	if ((error = falloc(p, &nfp, &indx)) != 0)
3421 		goto bad;
3422 	fp = nfp;
3423 
3424 	error = VOP_OPEN(vp, fmode, p->p_ucred, fp);
3425 	if (error) {
3426 		/*
3427 		 * setting f_ops this way prevents VOP_CLOSE from being
3428 		 * called or fdrop() releasing the vp from v_data.   Since
3429 		 * the VOP_OPEN failed we don't want to VOP_CLOSE.
3430 		 */
3431 		fp->f_ops = &badfileops;
3432 		fp->f_data = NULL;
3433 		goto bad_drop;
3434 	}
3435 
3436 	/*
3437 	 * The fp is given its own reference, we still have our ref and lock.
3438 	 *
3439 	 * Assert that all regular files must be created with a VM object.
3440 	 */
3441 	if (vp->v_type == VREG && vp->v_object == NULL) {
3442 		kprintf("fhopen: regular file did not have VM object: %p\n", vp);
3443 		goto bad_drop;
3444 	}
3445 
3446 	/*
3447 	 * The open was successful.  Handle any locking requirements.
3448 	 */
3449 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
3450 		lf.l_whence = SEEK_SET;
3451 		lf.l_start = 0;
3452 		lf.l_len = 0;
3453 		if (fmode & O_EXLOCK)
3454 			lf.l_type = F_WRLCK;
3455 		else
3456 			lf.l_type = F_RDLCK;
3457 		if (fmode & FNONBLOCK)
3458 			type = 0;
3459 		else
3460 			type = F_WAIT;
3461 		vn_unlock(vp);
3462 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
3463 			/*
3464 			 * release our private reference.
3465 			 */
3466 			fsetfd(p, NULL, indx);
3467 			fdrop(fp);
3468 			vrele(vp);
3469 			return (error);
3470 		}
3471 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3472 		fp->f_flag |= FHASLOCK;
3473 	}
3474 
3475 	/*
3476 	 * Clean up.  Associate the file pointer with the previously
3477 	 * reserved descriptor and return it.
3478 	 */
3479 	vput(vp);
3480 	fsetfd(p, fp, indx);
3481 	fdrop(fp);
3482 	uap->sysmsg_result = indx;
3483 	return (0);
3484 
3485 bad_drop:
3486 	fsetfd(p, NULL, indx);
3487 	fdrop(fp);
3488 bad:
3489 	vput(vp);
3490 	return (error);
3491 }
3492 
3493 /*
3494  * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
3495  */
3496 int
3497 sys_fhstat(struct fhstat_args *uap)
3498 {
3499 	struct thread *td = curthread;
3500 	struct stat sb;
3501 	fhandle_t fh;
3502 	struct mount *mp;
3503 	struct vnode *vp;
3504 	int error;
3505 
3506 	/*
3507 	 * Must be super user
3508 	 */
3509 	error = suser(td);
3510 	if (error)
3511 		return (error);
3512 
3513 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
3514 	if (error)
3515 		return (error);
3516 
3517 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
3518 		return (ESTALE);
3519 	if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
3520 		return (error);
3521 	error = vn_stat(vp, &sb, td->td_proc->p_ucred);
3522 	vput(vp);
3523 	if (error)
3524 		return (error);
3525 	error = copyout(&sb, uap->sb, sizeof(sb));
3526 	return (error);
3527 }
3528 
3529 /*
3530  * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
3531  */
3532 int
3533 sys_fhstatfs(struct fhstatfs_args *uap)
3534 {
3535 	struct thread *td = curthread;
3536 	struct proc *p = td->td_proc;
3537 	struct statfs *sp;
3538 	struct mount *mp;
3539 	struct vnode *vp;
3540 	struct statfs sb;
3541 	char *fullpath, *freepath;
3542 	fhandle_t fh;
3543 	int error;
3544 
3545 	/*
3546 	 * Must be super user
3547 	 */
3548 	if ((error = suser(td)))
3549 		return (error);
3550 
3551 	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
3552 		return (error);
3553 
3554 	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
3555 		return (ESTALE);
3556 
3557 	if (p != NULL && !chroot_visible_mnt(mp, p))
3558 		return (ESTALE);
3559 
3560 	if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp)))
3561 		return (error);
3562 	mp = vp->v_mount;
3563 	sp = &mp->mnt_stat;
3564 	vput(vp);
3565 	if ((error = VFS_STATFS(mp, sp, p->p_ucred)) != 0)
3566 		return (error);
3567 
3568 	error = mount_path(p, mp, &fullpath, &freepath);
3569 	if (error)
3570 		return(error);
3571 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3572 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
3573 	kfree(freepath, M_TEMP);
3574 
3575 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3576 	if (suser(td)) {
3577 		bcopy(sp, &sb, sizeof(sb));
3578 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
3579 		sp = &sb;
3580 	}
3581 	return (copyout(sp, uap->buf, sizeof(*sp)));
3582 }
3583 
3584 /*
3585  * Syscall to push extended attribute configuration information into the
3586  * VFS.  Accepts a path, which it converts to a mountpoint, as well as
3587  * a command (int cmd), and attribute name and misc data.  For now, the
3588  * attribute name is left in userspace for consumption by the VFS_op.
3589  * It will probably be changed to be copied into sysspace by the
3590  * syscall in the future, once issues with various consumers of the
3591  * attribute code have raised their hands.
3592  *
3593  * Currently this is used only by UFS Extended Attributes.
3594  */
3595 int
3596 sys_extattrctl(struct extattrctl_args *uap)
3597 {
3598 	struct nlookupdata nd;
3599 	struct mount *mp;
3600 	struct vnode *vp;
3601 	int error;
3602 
3603 	vp = NULL;
3604 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3605 	if (error == 0)
3606 		error = nlookup(&nd);
3607 	if (error == 0) {
3608 		mp = nd.nl_nch.mount;
3609 		error = VFS_EXTATTRCTL(mp, uap->cmd,
3610 				uap->attrname, uap->arg,
3611 				nd.nl_cred);
3612 	}
3613 	nlookup_done(&nd);
3614 	return (error);
3615 }
3616 
3617 /*
3618  * Syscall to set a named extended attribute on a file or directory.
3619  * Accepts attribute name, and a uio structure pointing to the data to set.
3620  * The uio is consumed in the style of writev().  The real work happens
3621  * in VOP_SETEXTATTR().
3622  */
3623 int
3624 sys_extattr_set_file(struct extattr_set_file_args *uap)
3625 {
3626 	char attrname[EXTATTR_MAXNAMELEN];
3627 	struct iovec aiov[UIO_SMALLIOV];
3628 	struct iovec *needfree;
3629 	struct nlookupdata nd;
3630 	struct iovec *iov;
3631 	struct vnode *vp;
3632 	struct uio auio;
3633 	u_int iovlen;
3634 	u_int cnt;
3635 	int error;
3636 	int i;
3637 
3638 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3639 	if (error)
3640 		return (error);
3641 
3642 	vp = NULL;
3643 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3644 	if (error == 0)
3645 		error = nlookup(&nd);
3646 	if (error == 0)
3647 		error = ncp_writechk(&nd.nl_nch);
3648 	if (error == 0)
3649 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3650 	if (error) {
3651 		nlookup_done(&nd);
3652 		return (error);
3653 	}
3654 
3655 	needfree = NULL;
3656 	iovlen = uap->iovcnt * sizeof(struct iovec);
3657 	if (uap->iovcnt > UIO_SMALLIOV) {
3658 		if (uap->iovcnt > UIO_MAXIOV) {
3659 			error = EINVAL;
3660 			goto done;
3661 		}
3662 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
3663 		needfree = iov;
3664 	} else {
3665 		iov = aiov;
3666 	}
3667 	auio.uio_iov = iov;
3668 	auio.uio_iovcnt = uap->iovcnt;
3669 	auio.uio_rw = UIO_WRITE;
3670 	auio.uio_segflg = UIO_USERSPACE;
3671 	auio.uio_td = nd.nl_td;
3672 	auio.uio_offset = 0;
3673 	if ((error = copyin(uap->iovp, iov, iovlen)))
3674 		goto done;
3675 	auio.uio_resid = 0;
3676 	for (i = 0; i < uap->iovcnt; i++) {
3677 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
3678 			error = EINVAL;
3679 			goto done;
3680 		}
3681 		auio.uio_resid += iov->iov_len;
3682 		iov++;
3683 	}
3684 	cnt = auio.uio_resid;
3685 	error = VOP_SETEXTATTR(vp, attrname, &auio, nd.nl_cred);
3686 	cnt -= auio.uio_resid;
3687 	uap->sysmsg_result = cnt;
3688 done:
3689 	vput(vp);
3690 	nlookup_done(&nd);
3691 	if (needfree)
3692 		FREE(needfree, M_IOV);
3693 	return (error);
3694 }
3695 
3696 /*
3697  * Syscall to get a named extended attribute on a file or directory.
3698  * Accepts attribute name, and a uio structure pointing to a buffer for the
3699  * data.  The uio is consumed in the style of readv().  The real work
3700  * happens in VOP_GETEXTATTR();
3701  */
3702 int
3703 sys_extattr_get_file(struct extattr_get_file_args *uap)
3704 {
3705 	char attrname[EXTATTR_MAXNAMELEN];
3706 	struct iovec aiov[UIO_SMALLIOV];
3707 	struct iovec *needfree;
3708 	struct nlookupdata nd;
3709 	struct iovec *iov;
3710 	struct vnode *vp;
3711 	struct uio auio;
3712 	u_int iovlen;
3713 	u_int cnt;
3714 	int error;
3715 	int i;
3716 
3717 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3718 	if (error)
3719 		return (error);
3720 
3721 	vp = NULL;
3722 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3723 	if (error == 0)
3724 		error = nlookup(&nd);
3725 	if (error == 0)
3726 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3727 	if (error) {
3728 		nlookup_done(&nd);
3729 		return (error);
3730 	}
3731 
3732 	iovlen = uap->iovcnt * sizeof (struct iovec);
3733 	needfree = NULL;
3734 	if (uap->iovcnt > UIO_SMALLIOV) {
3735 		if (uap->iovcnt > UIO_MAXIOV) {
3736 			error = EINVAL;
3737 			goto done;
3738 		}
3739 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
3740 		needfree = iov;
3741 	} else {
3742 		iov = aiov;
3743 	}
3744 	auio.uio_iov = iov;
3745 	auio.uio_iovcnt = uap->iovcnt;
3746 	auio.uio_rw = UIO_READ;
3747 	auio.uio_segflg = UIO_USERSPACE;
3748 	auio.uio_td = nd.nl_td;
3749 	auio.uio_offset = 0;
3750 	if ((error = copyin(uap->iovp, iov, iovlen)))
3751 		goto done;
3752 	auio.uio_resid = 0;
3753 	for (i = 0; i < uap->iovcnt; i++) {
3754 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
3755 			error = EINVAL;
3756 			goto done;
3757 		}
3758 		auio.uio_resid += iov->iov_len;
3759 		iov++;
3760 	}
3761 	cnt = auio.uio_resid;
3762 	error = VOP_GETEXTATTR(vp, attrname, &auio, nd.nl_cred);
3763 	cnt -= auio.uio_resid;
3764 	uap->sysmsg_result = cnt;
3765 done:
3766 	vput(vp);
3767 	nlookup_done(&nd);
3768 	if (needfree)
3769 		FREE(needfree, M_IOV);
3770 	return(error);
3771 }
3772 
3773 /*
3774  * Syscall to delete a named extended attribute from a file or directory.
3775  * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
3776  */
3777 int
3778 sys_extattr_delete_file(struct extattr_delete_file_args *uap)
3779 {
3780 	char attrname[EXTATTR_MAXNAMELEN];
3781 	struct nlookupdata nd;
3782 	struct vnode *vp;
3783 	int error;
3784 
3785 	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
3786 	if (error)
3787 		return(error);
3788 
3789 	vp = NULL;
3790 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3791 	if (error == 0)
3792 		error = nlookup(&nd);
3793 	if (error == 0)
3794 		error = ncp_writechk(&nd.nl_nch);
3795 	if (error == 0)
3796 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
3797 	if (error) {
3798 		nlookup_done(&nd);
3799 		return (error);
3800 	}
3801 
3802 	error = VOP_SETEXTATTR(vp, attrname, NULL, nd.nl_cred);
3803 	vput(vp);
3804 	nlookup_done(&nd);
3805 	return(error);
3806 }
3807 
3808 /*
3809  * Determine if the mount is visible to the process.
3810  */
3811 static int
3812 chroot_visible_mnt(struct mount *mp, struct proc *p)
3813 {
3814 	struct nchandle nch;
3815 
3816 	/*
3817 	 * Traverse from the mount point upwards.  If we hit the process
3818 	 * root then the mount point is visible to the process.
3819 	 */
3820 	nch = mp->mnt_ncmountpt;
3821 	while (nch.ncp) {
3822 		if (nch.mount == p->p_fd->fd_nrdir.mount &&
3823 		    nch.ncp == p->p_fd->fd_nrdir.ncp) {
3824 			return(1);
3825 		}
3826 		if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
3827 			nch = nch.mount->mnt_ncmounton;
3828 		} else {
3829 			nch.ncp = nch.ncp->nc_parent;
3830 		}
3831 	}
3832 
3833 	/*
3834 	 * If the mount point is not visible to the process, but the
3835 	 * process root is in a subdirectory of the mount, return
3836 	 * TRUE anyway.
3837 	 */
3838 	if (p->p_fd->fd_nrdir.mount == mp)
3839 		return(1);
3840 
3841 	return(0);
3842 }
3843 
3844