xref: /dragonfly/sys/kern/vfs_syscalls.c (revision b58f1e66)
1 /*
2  * Copyright (c) 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
39  * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $
40  */
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/buf.h>
45 #include <sys/conf.h>
46 #include <sys/sysent.h>
47 #include <sys/malloc.h>
48 #include <sys/mount.h>
49 #include <sys/mountctl.h>
50 #include <sys/sysproto.h>
51 #include <sys/filedesc.h>
52 #include <sys/kernel.h>
53 #include <sys/fcntl.h>
54 #include <sys/file.h>
55 #include <sys/linker.h>
56 #include <sys/stat.h>
57 #include <sys/unistd.h>
58 #include <sys/vnode.h>
59 #include <sys/proc.h>
60 #include <sys/priv.h>
61 #include <sys/jail.h>
62 #include <sys/namei.h>
63 #include <sys/nlookup.h>
64 #include <sys/dirent.h>
65 #include <sys/extattr.h>
66 #include <sys/spinlock.h>
67 #include <sys/kern_syscall.h>
68 #include <sys/objcache.h>
69 #include <sys/sysctl.h>
70 
71 #include <sys/buf2.h>
72 #include <sys/file2.h>
73 #include <sys/spinlock2.h>
74 #include <sys/mplock2.h>
75 
76 #include <vm/vm.h>
77 #include <vm/vm_object.h>
78 #include <vm/vm_page.h>
79 
80 #include <machine/limits.h>
81 #include <machine/stdarg.h>
82 
83 #include <vfs/union/union.h>
84 
85 static void mount_warning(struct mount *mp, const char *ctl, ...)
86 		__printflike(2, 3);
87 static int mount_path(struct proc *p, struct mount *mp, char **rb, char **fb);
88 static int checkvp_chdir (struct vnode *vn, struct thread *td);
89 static void checkdirs (struct nchandle *old_nch, struct nchandle *new_nch);
90 static int chroot_refuse_vdir_fds (struct filedesc *fdp);
91 static int chroot_visible_mnt(struct mount *mp, struct proc *p);
92 static int getutimes (const struct timeval *, struct timespec *);
93 static int setfown (struct vnode *, uid_t, gid_t);
94 static int setfmode (struct vnode *, int);
95 static int setfflags (struct vnode *, int);
96 static int setutimes (struct vnode *, struct vattr *,
97 			const struct timespec *, int);
98 static int	usermount = 0;	/* if 1, non-root can mount fs. */
99 
100 int (*union_dircheckp) (struct thread *, struct vnode **, struct file *);
101 
102 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
103     "Allow non-root users to mount filesystems");
104 
105 /*
106  * Virtual File System System Calls
107  */
108 
109 /*
110  * Mount a file system.
111  *
112  * mount_args(char *type, char *path, int flags, caddr_t data)
113  *
114  * MPALMOSTSAFE
115  */
int
sys_mount(struct mount_args *uap)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct nchandle nch;
	struct mount *mp, *nullmp;
	struct vfsconf *vfsp;
	int error, flag = 0, flag2 = 0;	/* saved mnt_flag/mnt_kern_flag for MNT_UPDATE rollback */
	int hasmount;			/* 1 if something is already mounted on the ncp */
	struct vattr va;
	struct nlookupdata nd;
	char fstypename[MFSNAMELEN];
	struct ucred *cred;

	get_mplock();
	cred = td->td_ucred;

	/* Mounting is never permitted from inside a jail. */
	if (jailed(cred)) {
		error = EPERM;
		goto done;
	}
	if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
		goto done;

	/*
	 * Do not allow NFS export by non-root users.
	 */
	if (uap->flags & MNT_EXPORTED) {
		error = priv_check(td, PRIV_ROOT);
		if (error)
			goto done;
	}
	/*
	 * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users
	 */
	if (priv_check(td, PRIV_ROOT))
		uap->flags |= MNT_NOSUID | MNT_NODEV;

	/*
	 * Lookup the requested path and extract the nch and vnode.
	 */
	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0) {
		if ((error = nlookup(&nd)) == 0) {
			if (nd.nl_nch.ncp->nc_vp == NULL)
				error = ENOENT;
		}
	}
	if (error) {
		nlookup_done(&nd);
		goto done;
	}

	/*
	 * If the target filesystem is resolved via a nullfs mount, then
	 * nd.nl_nch.mount will be pointing to the nullfs mount structure
	 * instead of the target file system. We need it in case we are
	 * doing an update.
	 */
	nullmp = nd.nl_nch.mount;

	/*
	 * Extract the locked+refd ncp and cleanup the nd structure
	 */
	nch = nd.nl_nch;
	cache_zero(&nd.nl_nch);
	nlookup_done(&nd);

	/*
	 * Remember whether a filesystem is already mounted here so the
	 * EBUSY checks below can reject stacked mounts on the same point.
	 */
	if ((nch.ncp->nc_flag & NCF_ISMOUNTPT) && cache_findmount(&nch))
		hasmount = 1;
	else
		hasmount = 0;


	/*
	 * now we have the locked ref'd nch and unreferenced vnode.
	 */
	vp = nch.ncp->nc_vp;
	if ((error = vget(vp, LK_EXCLUSIVE)) != 0) {
		cache_put(&nch);
		goto done;
	}
	cache_unlock(&nch);

	/*
	 * Extract the file system type. We need to know this early, to take
	 * appropriate actions if we are dealing with a nullfs.
	 */
	if ((error = copyinstr(uap->type, fstypename, MFSNAMELEN, NULL)) != 0) {
		cache_drop(&nch);
		vput(vp);
		goto done;
	}

	/*
	 * Now we have an unlocked ref'd nch and a locked ref'd vp
	 */
	if (uap->flags & MNT_UPDATE) {
		/* An update must target the root vnode of an existing mount. */
		if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
			cache_drop(&nch);
			vput(vp);
			error = EINVAL;
			goto done;
		}

		/*
		 * For a nullfs update, operate on the nullfs mount saved
		 * above rather than the underlying filesystem's mount.
		 */
		if (strncmp(fstypename, "null", 5) == 0) {
			KKASSERT(nullmp);
			mp = nullmp;
		} else {
			mp = vp->v_mount;
		}

		/* Save flags so a failed update can be rolled back below. */
		flag = mp->mnt_flag;
		flag2 = mp->mnt_kern_flag;
		/*
		 * We only allow the filesystem to be reloaded if it
		 * is currently mounted read-only.
		 */
		if ((uap->flags & MNT_RELOAD) &&
		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
			cache_drop(&nch);
			vput(vp);
			error = EOPNOTSUPP;	/* Needs translation */
			goto done;
		}
		/*
		 * Only root, or the user that did the original mount is
		 * permitted to update it.
		 */
		if (mp->mnt_stat.f_owner != cred->cr_uid &&
		    (error = priv_check(td, PRIV_ROOT))) {
			cache_drop(&nch);
			vput(vp);
			goto done;
		}
		if (vfs_busy(mp, LK_NOWAIT)) {
			cache_drop(&nch);
			vput(vp);
			error = EBUSY;
			goto done;
		}
		/*
		 * VMOUNT acts as a mount-in-progress interlock on the
		 * covered vnode; refuse concurrent or stacked mounts.
		 */
		if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
			cache_drop(&nch);
			vfs_unbusy(mp);
			vput(vp);
			error = EBUSY;
			goto done;
		}
		vsetflags(vp, VMOUNT);
		mp->mnt_flag |=
		    uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
		vn_unlock(vp);
		goto update;
	}
	/*
	 * If the user is not root, ensure that they own the directory
	 * onto which we are attempting to mount.
	 */
	if ((error = VOP_GETATTR(vp, &va)) ||
	    (va.va_uid != cred->cr_uid && (error = priv_check(td, PRIV_ROOT)))) {
		cache_drop(&nch);
		vput(vp);
		goto done;
	}
	if ((error = vinvalbuf(vp, V_SAVE, 0, 0)) != 0) {
		cache_drop(&nch);
		vput(vp);
		goto done;
	}
	if (vp->v_type != VDIR) {
		cache_drop(&nch);
		vput(vp);
		error = ENOTDIR;
		goto done;
	}
	/* The underlying filesystem may forbid stacked mounts on top of it. */
	if (vp->v_mount->mnt_kern_flag & MNTK_NOSTKMNT) {
		cache_drop(&nch);
		vput(vp);
		error = EPERM;
		goto done;
	}
	vfsp = vfsconf_find_by_name(fstypename);
	if (vfsp == NULL) {
		linker_file_t lf;

		/* Only load modules for root (very important!) */
		if ((error = priv_check(td, PRIV_ROOT)) != 0) {
			cache_drop(&nch);
			vput(vp);
			goto done;
		}
		error = linker_load_file(fstypename, &lf);
		if (error || lf == NULL) {
			cache_drop(&nch);
			vput(vp);
			if (lf == NULL)
				error = ENODEV;
			goto done;
		}
		lf->userrefs++;
		/* lookup again, see if the VFS was loaded */
		vfsp = vfsconf_find_by_name(fstypename);
		if (vfsp == NULL) {
			lf->userrefs--;
			linker_file_unload(lf);
			cache_drop(&nch);
			vput(vp);
			error = ENODEV;
			goto done;
		}
	}
	/* Same mount-in-progress / already-mounted interlock as above. */
	if ((vp->v_flag & VMOUNT) != 0 || hasmount) {
		cache_drop(&nch);
		vput(vp);
		error = EBUSY;
		goto done;
	}
	vsetflags(vp, VMOUNT);

	/*
	 * Allocate and initialize the filesystem.
	 */
	mp = kmalloc(sizeof(struct mount), M_MOUNT, M_ZERO|M_WAITOK);
	mount_init(mp);
	vfs_busy(mp, LK_NOWAIT);
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_vfc = vfsp;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_owner = cred->cr_uid;
	vn_unlock(vp);
update:
	/*
	 * Set the mount level flags.  Both the new-mount and the
	 * MNT_UPDATE path converge here.
	 */
	if (uap->flags & MNT_RDONLY)
		mp->mnt_flag |= MNT_RDONLY;
	else if (mp->mnt_flag & MNT_RDONLY)
		mp->mnt_kern_flag |= MNTK_WANTRDWR;
	mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME |
	    MNT_NOSYMFOLLOW | MNT_IGNORE |
	    MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
	mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC |
	    MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE |
	    MNT_NOSYMFOLLOW | MNT_IGNORE |
	    MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR);
	/*
	 * Mount the filesystem.
	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
	 * get.
	 */
	error = VFS_MOUNT(mp, uap->path, uap->data, cred);
	if (mp->mnt_flag & MNT_UPDATE) {
		if (mp->mnt_kern_flag & MNTK_WANTRDWR)
			mp->mnt_flag &= ~MNT_RDONLY;
		mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE);
		mp->mnt_kern_flag &=~ MNTK_WANTRDWR;
		if (error) {
			/* Roll back to the flags saved before the update. */
			mp->mnt_flag = flag;
			mp->mnt_kern_flag = flag2;
		}
		vfs_unbusy(mp);
		vclrflags(vp, VMOUNT);
		vrele(vp);
		cache_drop(&nch);
		goto done;
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/*
	 * Put the new filesystem on the mount list after root.  The mount
	 * point gets its own mnt_ncmountpt (unless the VFS already set one
	 * up) which represents the root of the mount.  The lookup code
	 * detects the mount point going forward and checks the root of
	 * the mount going backwards.
	 *
	 * It is not necessary to invalidate or purge the vnode underneath
	 * because elements under the mount will be given their own glue
	 * namecache record.
	 */
	if (!error) {
		if (mp->mnt_ncmountpt.ncp == NULL) {
			/*
			 * allocate, then unlock, but leave the ref intact
			 */
			cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
			cache_unlock(&mp->mnt_ncmountpt);
		}
		mp->mnt_ncmounton = nch;		/* inherits ref */
		nch.ncp->nc_flag |= NCF_ISMOUNTPT;

		/* XXX get the root of the fs and cache_setvp(mnt_ncmountpt...) */
		vclrflags(vp, VMOUNT);
		mountlist_insert(mp, MNTINS_LAST);
		vn_unlock(vp);
		checkdirs(&mp->mnt_ncmounton, &mp->mnt_ncmountpt);
		error = vfs_allocate_syncvnode(mp);
		vfs_unbusy(mp);
		/*
		 * NOTE(review): any error from vfs_allocate_syncvnode()
		 * above is silently overwritten here; only the VFS_START()
		 * status is returned to the caller.  Confirm whether that
		 * is intentional.
		 */
		error = VFS_START(mp, 0);
		vrele(vp);
	} else {
		/* New-mount failure: tear down the partially set up mount. */
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
		vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);
		vclrflags(vp, VMOUNT);
		mp->mnt_vfc->vfc_refcount--;
		vfs_unbusy(mp);
		kfree(mp, M_MOUNT);
		cache_drop(&nch);
		vput(vp);
	}
done:
	rel_mplock();
	return (error);
}
435 
436 /*
437  * Scan all active processes to see if any of them have a current
438  * or root directory onto which the new filesystem has just been
439  * mounted. If so, replace them with the new mount point.
440  *
441  * The passed ncp is ref'd and locked (from the mount code) and
442  * must be associated with the vnode representing the root of the
443  * mount point.
444  */
/*
 * Parameter bundle passed from checkdirs() to checkdirs_callback()
 * through allproc_scan().
 */
struct checkdirs_info {
	struct nchandle old_nch;	/* covered (old) mount point nchandle */
	struct nchandle new_nch;	/* root of the newly mounted filesystem */
	struct vnode *old_vp;		/* NOTE(review): never set or read in this file chunk — confirm it can be removed */
	struct vnode *new_vp;		/* root vnode of the new fs, passed so the callback avoids nc_vp */
};
451 
452 static int checkdirs_callback(struct proc *p, void *data);
453 
static void
checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
{
	struct checkdirs_info info;
	struct vnode *olddp;
	struct vnode *newdp;
	struct mount *mp;

	/*
	 * If the old mount point's vnode has a usecount of 1, it is not
	 * being held as a descriptor anywhere.
	 */
	olddp = old_nch->ncp->nc_vp;
	if (olddp == NULL || olddp->v_sysref.refcnt == 1)
		return;

	/*
	 * Force the root vnode of the new mount point to be resolved
	 * so we can update any matching processes.
	 */
	mp = new_nch->mount;
	if (VFS_ROOT(mp, &newdp))
		panic("mount: lost mount");
	cache_setunresolved(new_nch);
	cache_setvp(new_nch, newdp);

	/*
	 * Special handling of the root node: if the covered vnode was the
	 * system root, the new filesystem's root becomes the system root.
	 */
	if (rootvnode == olddp) {
		vref(newdp);
		vfs_cache_setroot(newdp, cache_hold(new_nch));
	}

	/*
	 * Pass newdp separately so the callback does not have to access
	 * it via new_nch->ncp->nc_vp.  info.old_vp is intentionally left
	 * unset; the callback does not read it.
	 */
	info.old_nch = *old_nch;
	info.new_nch = *new_nch;
	info.new_vp = newdp;
	allproc_scan(checkdirs_callback, &info);
	vput(newdp);	/* drop the lock+ref acquired by VFS_ROOT() */
}
498 
499 /*
500  * NOTE: callback is not MP safe because the scanned process's filedesc
 * structure can be ripped out from under us, among other things.
502  */
static int
checkdirs_callback(struct proc *p, void *data)
{
	struct checkdirs_info *info = data;
	struct filedesc *fdp;
	struct nchandle ncdrop1;	/* deferred drop of old cwd nchandle */
	struct nchandle ncdrop2;	/* deferred drop of old root nchandle */
	struct vnode *vprele1;		/* deferred release of old cwd vnode */
	struct vnode *vprele2;		/* deferred release of old root vnode */

	if ((fdp = p->p_fd) != NULL) {
		cache_zero(&ncdrop1);
		cache_zero(&ncdrop2);
		vprele1 = NULL;
		vprele2 = NULL;

		/*
		 * MPUNSAFE - XXX fdp can be pulled out from under a
		 * foreign process.
		 *
		 * A shared filedesc is ok, we don't have to copy it
		 * because we are making this change globally.
		 */
		spin_lock(&fdp->fd_spin);
		/* Redirect the process's cwd if it sits on the old mount point. */
		if (fdp->fd_ncdir.mount == info->old_nch.mount &&
		    fdp->fd_ncdir.ncp == info->old_nch.ncp) {
			vprele1 = fdp->fd_cdir;
			vref(info->new_vp);
			fdp->fd_cdir = info->new_vp;
			ncdrop1 = fdp->fd_ncdir;
			cache_copy(&info->new_nch, &fdp->fd_ncdir);
		}
		/* Same for the process's root directory. */
		if (fdp->fd_nrdir.mount == info->old_nch.mount &&
		    fdp->fd_nrdir.ncp == info->old_nch.ncp) {
			vprele2 = fdp->fd_rdir;
			vref(info->new_vp);
			fdp->fd_rdir = info->new_vp;
			ncdrop2 = fdp->fd_nrdir;
			cache_copy(&info->new_nch, &fdp->fd_nrdir);
		}
		spin_unlock(&fdp->fd_spin);
		/*
		 * The drops/releases can block, so they are deferred until
		 * after the spinlock is released.
		 */
		if (ncdrop1.ncp)
			cache_drop(&ncdrop1);
		if (ncdrop2.ncp)
			cache_drop(&ncdrop2);
		if (vprele1)
			vrele(vprele1);
		if (vprele2)
			vrele(vprele2);
	}
	return(0);
}
555 
556 /*
557  * Unmount a file system.
558  *
559  * Note: unmount takes a path to the vnode mounted on as argument,
560  * not special file (as before).
561  *
562  * umount_args(char *path, int flags)
563  *
564  * MPALMOSTSAFE
565  */
int
sys_unmount(struct unmount_args *uap)
{
	struct thread *td = curthread;
	struct proc *p __debugvar = td->td_proc;
	struct mount *mp = NULL;
	struct nlookupdata nd;
	int error;

	KKASSERT(p);
	get_mplock();

	/* Unmounting is never permitted from inside a jail. */
	if (td->td_ucred->cr_prison != NULL) {
		error = EPERM;
		goto done;
	}
	if (usermount == 0 && (error = priv_check(td, PRIV_ROOT)))
		goto done;

	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error)
		goto out;

	mp = nd.nl_nch.mount;

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to unmount this filesystem.
	 */
	if ((mp->mnt_stat.f_owner != td->td_ucred->cr_uid) &&
	    (error = priv_check(td, PRIV_ROOT)))
		goto out;

	/*
	 * Don't allow unmounting the root file system.
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		error = EINVAL;
		goto out;
	}

	/*
	 * Must be the root of the filesystem
	 */
	if (nd.nl_nch.ncp != mp->mnt_ncmountpt.ncp) {
		error = EINVAL;
		goto out;
	}

out:
	/*
	 * The lookup state must be torn down before calling dounmount()
	 * so its namecache reference on the mount is released first
	 * (dounmount() checks the remaining reference count).
	 */
	nlookup_done(&nd);
	if (error == 0)
		error = dounmount(mp, uap->flags);
done:
	rel_mplock();
	return (error);
}
624 
625 /*
626  * Do the actual file system unmount.
627  */
628 static int
629 dounmount_interlock(struct mount *mp)
630 {
631 	if (mp->mnt_kern_flag & MNTK_UNMOUNT)
632 		return (EBUSY);
633 	mp->mnt_kern_flag |= MNTK_UNMOUNT;
634 	return(0);
635 }
636 
637 static int
638 unmount_allproc_cb(struct proc *p, void *arg)
639 {
640 	struct mount *mp;
641 
642 	if (p->p_textnch.ncp == NULL)
643 		return 0;
644 
645 	mp = (struct mount *)arg;
646 	if (p->p_textnch.mount == mp)
647 		cache_drop(&p->p_textnch);
648 
649 	return 0;
650 }
651 
int
dounmount(struct mount *mp, int flags)
{
	struct namecache *ncp;
	struct nchandle nch;
	struct vnode *vp;
	int error;
	int async_flag;	/* saved MNT_ASYNC so a failed unmount can restore it */
	int lflags;
	int freeok = 1;	/* 0 if lingering refs make freeing the mount unsafe */

	/*
	 * Exclusive access for unmounting purposes
	 */
	if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
		return (error);

	/*
	 * Allow filesystems to detect that a forced unmount is in progress.
	 */
	if (flags & MNT_FORCE)
		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
	/* A forced unmount waits for the lock; a normal one fails fast. */
	lflags = LK_EXCLUSIVE | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT);
	error = lockmgr(&mp->mnt_lock, lflags);
	if (error) {
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
		if (mp->mnt_kern_flag & MNTK_MWAIT)
			wakeup(mp);
		return (error);
	}

	if (mp->mnt_flag & MNT_EXPUBLIC)
		vfs_setpublicfs(NULL, NULL, NULL);

	vfs_msync(mp, MNT_WAIT);
	async_flag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &=~ MNT_ASYNC;

	/*
	 * If this filesystem isn't aliasing other filesystems,
	 * try to invalidate any remaining namecache entries and
	 * check the count afterwards.
	 */
	if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
		cache_lock(&mp->mnt_ncmountpt);
		cache_inval(&mp->mnt_ncmountpt, CINV_DESTROY|CINV_CHILDREN);
		cache_unlock(&mp->mnt_ncmountpt);

		/*
		 * If references remain, try releasing process text
		 * (executable) namecache handles on this mount first.
		 */
		if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
		    (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {
			allproc_scan(&unmount_allproc_cb, mp);
		}

		/* Re-check; anything still referenced blocks a normal unmount. */
		if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
		    (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {

			if ((flags & MNT_FORCE) == 0) {
				error = EBUSY;
				mount_warning(mp, "Cannot unmount: "
						  "%d namecache "
						  "references still "
						  "present",
						  ncp->nc_refs - 1);
			} else {
				/*
				 * Forced unmount proceeds, but the mount
				 * structure must be leaked rather than
				 * freed while refs remain.
				 */
				mount_warning(mp, "Forced unmount: "
						  "%d namecache "
						  "references still "
						  "present",
						  ncp->nc_refs - 1);
				freeok = 0;
			}
		}
	}

	/*
	 * nchandle records ref the mount structure.  Expect a count of 1
	 * (our mount->mnt_ncmountpt).
	 */
	if (mp->mnt_refs != 1) {
		if ((flags & MNT_FORCE) == 0) {
			mount_warning(mp, "Cannot unmount: "
					  "%d process references still "
					  "present", mp->mnt_refs);
			error = EBUSY;
		} else {
			mount_warning(mp, "Forced unmount: "
					  "%d process references still "
					  "present", mp->mnt_refs);
			freeok = 0;
		}
	}

	/*
	 * Decommission our special mnt_syncer vnode.  This also stops
	 * the vnlru code.  If we are unable to unmount we recommission
	 * the vnode.
	 */
	if (error == 0) {
		if ((vp = mp->mnt_syncer) != NULL) {
			mp->mnt_syncer = NULL;
			vrele(vp);
		}
		/* Sync first unless read-only; MNT_FORCE unmounts regardless. */
		if (((mp->mnt_flag & MNT_RDONLY) ||
		     (error = VFS_SYNC(mp, MNT_WAIT)) == 0) ||
		    (flags & MNT_FORCE)) {
			error = VFS_UNMOUNT(mp, flags);
		}
	}
	if (error) {
		/* Failure: recommission the syncer and restore prior state. */
		if (mp->mnt_syncer == NULL)
			vfs_allocate_syncvnode(mp);
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
		mp->mnt_flag |= async_flag;
		lockmgr(&mp->mnt_lock, LK_RELEASE);
		if (mp->mnt_kern_flag & MNTK_MWAIT)
			wakeup(mp);
		return (error);
	}
	/*
	 * Clean up any journals still associated with the mount after
	 * filesystem activity has ceased.
	 */
	journal_remove_all_journals(mp,
	    ((flags & MNT_FORCE) ? MC_JOURNAL_STOP_IMM : 0));

	mountlist_remove(mp);

	/*
	 * Remove any installed vnode ops here so the individual VFSs don't
	 * have to.
	 */
	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_norm_ops);
	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_spec_ops);
	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_fifo_ops);

	/* Detach and drop the mount-root and mounted-on namecache handles. */
	if (mp->mnt_ncmountpt.ncp != NULL) {
		nch = mp->mnt_ncmountpt;
		cache_zero(&mp->mnt_ncmountpt);
		cache_clrmountpt(&nch);
		cache_drop(&nch);
	}
	if (mp->mnt_ncmounton.ncp != NULL) {
		nch = mp->mnt_ncmounton;
		cache_zero(&mp->mnt_ncmounton);
		cache_clrmountpt(&nch);
		cache_drop(&nch);
	}

	mp->mnt_vfc->vfc_refcount--;
	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
		panic("unmount: dangling vnode");
	lockmgr(&mp->mnt_lock, LK_RELEASE);
	if (mp->mnt_kern_flag & MNTK_MWAIT)
		wakeup(mp);
	if (freeok)
		kfree(mp, M_MOUNT);
	return (0);
}
812 
/*
 * Print a kernel warning about the given mount, prefixed with its
 * resolved mount path when the full path can be reconstructed from
 * the namecache, otherwise with the mount pointer (and the mounted-on
 * name when available).  'ctl' is a printf-style format for the
 * message body.
 */
static
void
mount_warning(struct mount *mp, const char *ctl, ...)
{
	char *ptr;	/* start of the resolved path within buf */
	char *buf;	/* allocation cache_fullpath() hands back; freed here */
	__va_list va;

	__va_start(va, ctl);
	if (cache_fullpath(NULL, &mp->mnt_ncmounton, &ptr, &buf, 0) == 0) {
		kprintf("unmount(%s): ", ptr);
		kvprintf(ctl, va);
		kprintf("\n");
		kfree(buf, M_TEMP);
	} else {
		/* Path reconstruction failed; fall back to pointer + name. */
		kprintf("unmount(%p", mp);
		if (mp->mnt_ncmounton.ncp && mp->mnt_ncmounton.ncp->nc_name)
			kprintf(",%s", mp->mnt_ncmounton.ncp->nc_name);
		kprintf("): ");
		kvprintf(ctl, va);
		kprintf("\n");
	}
	__va_end(va);
}
837 
838 /*
839  * Shim cache_fullpath() to handle the case where a process is chrooted into
840  * a subdirectory of a mount.  In this case if the root mount matches the
841  * process root directory's mount we have to specify the process's root
842  * directory instead of the mount point, because the mount point might
843  * be above the root directory.
844  */
845 static
846 int
847 mount_path(struct proc *p, struct mount *mp, char **rb, char **fb)
848 {
849 	struct nchandle *nch;
850 
851 	if (p && p->p_fd->fd_nrdir.mount == mp)
852 		nch = &p->p_fd->fd_nrdir;
853 	else
854 		nch = &mp->mnt_ncmountpt;
855 	return(cache_fullpath(p, nch, rb, fb, 0));
856 }
857 
858 /*
859  * Sync each mounted filesystem.
860  */
861 
862 #ifdef DEBUG
863 static int syncprt = 0;
864 SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
865 #endif /* DEBUG */
866 
867 static int sync_callback(struct mount *mp, void *data);
868 
869 /*
870  * MPALMOSTSAFE
871  */
int
sys_sync(struct sync_args *uap)
{
	/* Walk the mount list and flush every writable filesystem. */
	get_mplock();
	mountlist_scan(sync_callback, NULL, MNTSCAN_FORWARD);
#ifdef DEBUG
	/*
	 * print out buffer pool stat information on each sync() call.
	 */
	if (syncprt)
		vfs_bufstats();
#endif /* DEBUG */
	rel_mplock();
	return (0);
}
887 
888 static
889 int
890 sync_callback(struct mount *mp, void *data __unused)
891 {
892 	int asyncflag;
893 
894 	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
895 		asyncflag = mp->mnt_flag & MNT_ASYNC;
896 		mp->mnt_flag &= ~MNT_ASYNC;
897 		vfs_msync(mp, MNT_NOWAIT);
898 		VFS_SYNC(mp, MNT_NOWAIT | MNT_LAZY);
899 		mp->mnt_flag |= asyncflag;
900 	}
901 	return(0);
902 }
903 
904 /* XXX PRISON: could be per prison flag */
905 static int prison_quotas;
906 #if 0
907 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
908 #endif
909 
910 /*
911  *  quotactl_args(char *path, int fcmd, int uid, caddr_t arg)
912  *
913  * Change filesystem quotas.
914  *
915  * MPALMOSTSAFE
916  */
917 int
918 sys_quotactl(struct quotactl_args *uap)
919 {
920 	struct nlookupdata nd;
921 	struct thread *td;
922 	struct proc *p;
923 	struct mount *mp;
924 	int error;
925 
926 	get_mplock();
927 	td = curthread;
928 	p = td->td_proc;
929 	if (td->td_ucred->cr_prison && !prison_quotas) {
930 		error = EPERM;
931 		goto done;
932 	}
933 
934 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
935 	if (error == 0)
936 		error = nlookup(&nd);
937 	if (error == 0) {
938 		mp = nd.nl_nch.mount;
939 		error = VFS_QUOTACTL(mp, uap->cmd, uap->uid,
940 				    uap->arg, nd.nl_cred);
941 	}
942 	nlookup_done(&nd);
943 done:
944 	rel_mplock();
945 	return (error);
946 }
947 
948 /*
949  * mountctl(char *path, int op, int fd, const void *ctl, int ctllen,
950  *		void *buf, int buflen)
951  *
952  * This function operates on a mount point and executes the specified
953  * operation using the specified control data, and possibly returns data.
954  *
955  * The actual number of bytes stored in the result buffer is returned, 0
956  * if none, otherwise an error is returned.
957  *
958  * MPALMOSTSAFE
959  */
int
sys_mountctl(struct mountctl_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	void *ctl = NULL;	/* copied-in control data, kfree'd on exit */
	void *buf = NULL;	/* kernel result buffer, kfree'd on exit */
	char *path = NULL;	/* objcache pathname buffer, returned on exit */
	int error;

	/*
	 * Sanity and permissions checks.  We must be root.
	 */
	KKASSERT(p);
	if (td->td_ucred->cr_prison != NULL)
		return (EPERM);
	/* MOUNTCTL_MOUNTFLAGS is the only op allowed without root. */
	if ((uap->op != MOUNTCTL_MOUNTFLAGS) &&
	    (error = priv_check(td, PRIV_ROOT)) != 0)
		return (error);

	/*
	 * Argument length checks
	 */
	if (uap->ctllen < 0 || uap->ctllen > 1024)
		return (EINVAL);
	if (uap->buflen < 0 || uap->buflen > 16 * 1024)
		return (EINVAL);
	if (uap->path == NULL)
		return (EINVAL);

	/*
	 * Allocate the necessary buffers and copyin data
	 */
	path = objcache_get(namei_oc, M_WAITOK);
	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
	if (error)
		goto done;

	/* +1 and M_ZERO guarantee NUL termination of string control data. */
	if (uap->ctllen) {
		ctl = kmalloc(uap->ctllen + 1, M_TEMP, M_WAITOK|M_ZERO);
		error = copyin(uap->ctl, ctl, uap->ctllen);
		if (error)
			goto done;
	}
	if (uap->buflen)
		buf = kmalloc(uap->buflen + 1, M_TEMP, M_WAITOK|M_ZERO);

	/*
	 * Validate the descriptor.  A negative fd means no descriptor
	 * is associated with the operation.
	 */
	if (uap->fd >= 0) {
		fp = holdfp(p->p_fd, uap->fd, -1);
		if (fp == NULL) {
			error = EBADF;
			goto done;
		}
	} else {
		fp = NULL;
	}

	/*
	 * Execute the internal kernel function and clean up.
	 * sysmsg_result receives the number of result bytes to copy out.
	 */
	get_mplock();
	error = kern_mountctl(path, uap->op, fp, ctl, uap->ctllen, buf, uap->buflen, &uap->sysmsg_result);
	rel_mplock();
	if (fp)
		fdrop(fp);
	if (error == 0 && uap->sysmsg_result > 0)
		error = copyout(buf, uap->buf, uap->sysmsg_result);
done:
	if (path)
		objcache_put(namei_oc, path);
	if (ctl)
		kfree(ctl, M_TEMP);
	if (buf)
		kfree(buf, M_TEMP);
	return (error);
}
1040 
1041 /*
1042  * Execute a mount control operation by resolving the path to a mount point
1043  * and calling vop_mountctl().
1044  *
1045  * Use the mount point from the nch instead of the vnode so nullfs mounts
1046  * can properly spike the VOP.
1047  */
int
kern_mountctl(const char *path, int op, struct file *fp,
		const void *ctl, int ctllen,
		void *buf, int buflen, int *res)
{
	struct vnode *vp;
	struct mount *mp;
	struct nlookupdata nd;
	int error;

	*res = 0;
	vp = NULL;
	error = nlookup_init(&nd, path, UIO_SYSSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	/*
	 * Capture the mount from the nchandle (not the vnode) so nullfs
	 * mounts dispatch through the correct ops vector.  NOTE(review):
	 * mp is read before nlookup_done() releases the lookup's
	 * reference; presumably the vnode ref obtained above keeps the
	 * mount alive — confirm.
	 */
	mp = nd.nl_nch.mount;
	nlookup_done(&nd);
	if (error)
		return (error);
	vn_unlock(vp);	/* keep the ref, drop the vnode lock */

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & (VROOT|VPFSROOT)) == 0) {
		vrele(vp);
		return (EINVAL);
	}
	error = vop_mountctl(mp->mnt_vn_use_ops, vp, op, fp, ctl, ctllen,
			     buf, buflen, res);
	vrele(vp);
	return (error);
}
1083 
/*
 * Resolve the path in *nd and copy the mount's statfs data into *buf,
 * substituting the caller-visible mount path and hiding the fsid from
 * non-root callers.
 */
int
kern_statfs(struct nlookupdata *nd, struct statfs *buf)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct mount *mp;
	struct statfs *sp;
	char *fullpath, *freepath;
	int error;

	if ((error = nlookup(nd)) != 0)
		return (error);
	mp = nd->nl_nch.mount;
	sp = &mp->mnt_stat;
	if ((error = VFS_STATFS(mp, sp, nd->nl_cred)) != 0)
		return (error);

	/*
	 * Replace f_mntonname with the path as seen by this process
	 * (takes chroot into account via mount_path()).
	 */
	error = mount_path(p, mp, &fullpath, &freepath);
	if (error)
		return(error);
	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	kfree(freepath, M_TEMP);

	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	bcopy(sp, buf, sizeof(*buf));
	/* Only root should have access to the fsid's. */
	if (priv_check(td, PRIV_ROOT))
		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
	return (0);
}
1115 
1116 /*
1117  * statfs_args(char *path, struct statfs *buf)
1118  *
1119  * Get filesystem statistics.
1120  *
1121  * MPALMOSTSAFE
1122  */
1123 int
1124 sys_statfs(struct statfs_args *uap)
1125 {
1126 	struct nlookupdata nd;
1127 	struct statfs buf;
1128 	int error;
1129 
1130 	get_mplock();
1131 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1132 	if (error == 0)
1133 		error = kern_statfs(&nd, &buf);
1134 	nlookup_done(&nd);
1135 	if (error == 0)
1136 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1137 	rel_mplock();
1138 	return (error);
1139 }
1140 
1141 /*
1142  * MPALMOSTSAFE
1143  */
1144 int
1145 kern_fstatfs(int fd, struct statfs *buf)
1146 {
1147 	struct thread *td = curthread;
1148 	struct proc *p = td->td_proc;
1149 	struct file *fp;
1150 	struct mount *mp;
1151 	struct statfs *sp;
1152 	char *fullpath, *freepath;
1153 	int error;
1154 
1155 	KKASSERT(p);
1156 	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
1157 		return (error);
1158 	get_mplock();
1159 	mp = ((struct vnode *)fp->f_data)->v_mount;
1160 	if (mp == NULL) {
1161 		error = EBADF;
1162 		goto done;
1163 	}
1164 	if (fp->f_cred == NULL) {
1165 		error = EINVAL;
1166 		goto done;
1167 	}
1168 	sp = &mp->mnt_stat;
1169 	if ((error = VFS_STATFS(mp, sp, fp->f_cred)) != 0)
1170 		goto done;
1171 
1172 	if ((error = mount_path(p, mp, &fullpath, &freepath)) != 0)
1173 		goto done;
1174 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1175 	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
1176 	kfree(freepath, M_TEMP);
1177 
1178 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
1179 	bcopy(sp, buf, sizeof(*buf));
1180 
1181 	/* Only root should have access to the fsid's. */
1182 	if (priv_check(td, PRIV_ROOT))
1183 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
1184 	error = 0;
1185 done:
1186 	rel_mplock();
1187 	fdrop(fp);
1188 	return (error);
1189 }
1190 
1191 /*
1192  * fstatfs_args(int fd, struct statfs *buf)
1193  *
1194  * Get filesystem statistics.
1195  *
1196  * MPSAFE
1197  */
1198 int
1199 sys_fstatfs(struct fstatfs_args *uap)
1200 {
1201 	struct statfs buf;
1202 	int error;
1203 
1204 	error = kern_fstatfs(uap->fd, &buf);
1205 
1206 	if (error == 0)
1207 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1208 	return (error);
1209 }
1210 
/*
 * Resolve the path in *nd and copy the mount's statvfs data into *buf.
 * f_flag is synthesized from the mount flags (ST_RDONLY/ST_NOSUID).
 */
int
kern_statvfs(struct nlookupdata *nd, struct statvfs *buf)
{
	struct mount *mp;
	struct statvfs *sp;
	int error;

	if ((error = nlookup(nd)) != 0)
		return (error);
	mp = nd->nl_nch.mount;
	sp = &mp->mnt_vstat;
	if ((error = VFS_STATVFS(mp, sp, nd->nl_cred)) != 0)
		return (error);

	sp->f_flag = 0;
	if (mp->mnt_flag & MNT_RDONLY)
		sp->f_flag |= ST_RDONLY;
	if (mp->mnt_flag & MNT_NOSUID)
		sp->f_flag |= ST_NOSUID;
	bcopy(sp, buf, sizeof(*buf));
	return (0);
}
1233 
1234 /*
1235  * statfs_args(char *path, struct statfs *buf)
1236  *
1237  * Get filesystem statistics.
1238  *
1239  * MPALMOSTSAFE
1240  */
1241 int
1242 sys_statvfs(struct statvfs_args *uap)
1243 {
1244 	struct nlookupdata nd;
1245 	struct statvfs buf;
1246 	int error;
1247 
1248 	get_mplock();
1249 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1250 	if (error == 0)
1251 		error = kern_statvfs(&nd, &buf);
1252 	nlookup_done(&nd);
1253 	if (error == 0)
1254 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1255 	rel_mplock();
1256 	return (error);
1257 }
1258 
/*
 * Copy the statvfs data for the mount underlying file descriptor fd
 * into *buf.  f_flag is synthesized from the mount flags.
 */
int
kern_fstatvfs(int fd, struct statvfs *buf)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct mount *mp;
	struct statvfs *sp;
	int error;

	KKASSERT(p);
	/* hold the file so it cannot be closed out from under us */
	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	mp = ((struct vnode *)fp->f_data)->v_mount;
	if (mp == NULL) {
		error = EBADF;
		goto done;
	}
	if (fp->f_cred == NULL) {
		error = EINVAL;
		goto done;
	}
	sp = &mp->mnt_vstat;
	if ((error = VFS_STATVFS(mp, sp, fp->f_cred)) != 0)
		goto done;

	sp->f_flag = 0;
	if (mp->mnt_flag & MNT_RDONLY)
		sp->f_flag |= ST_RDONLY;
	if (mp->mnt_flag & MNT_NOSUID)
		sp->f_flag |= ST_NOSUID;

	bcopy(sp, buf, sizeof(*buf));
	error = 0;
done:
	fdrop(fp);
	return (error);
}
1297 
1298 /*
1299  * fstatfs_args(int fd, struct statfs *buf)
1300  *
1301  * Get filesystem statistics.
1302  *
1303  * MPALMOSTSAFE
1304  */
1305 int
1306 sys_fstatvfs(struct fstatvfs_args *uap)
1307 {
1308 	struct statvfs buf;
1309 	int error;
1310 
1311 	get_mplock();
1312 	error = kern_fstatvfs(uap->fd, &buf);
1313 	rel_mplock();
1314 
1315 	if (error == 0)
1316 		error = copyout(&buf, uap->buf, sizeof(*uap->buf));
1317 	return (error);
1318 }
1319 
1320 /*
1321  * getfsstat_args(struct statfs *buf, long bufsize, int flags)
1322  *
1323  * Get statistics on all filesystems.
1324  */
1325 
/*
 * Shared state passed to getfsstat_callback() via mountlist_scan().
 */
struct getfsstat_info {
	struct statfs *sfsp;	/* next userland statfs slot to fill */
	long count;		/* mounts visited (may exceed maxcount) */
	long maxcount;		/* capacity of the userland buffer */
	int error;		/* first error encountered, if any */
	int flags;		/* MNT_WAIT/MNT_NOWAIT/MNT_LAZY from caller */
	struct thread *td;	/* calling thread (creds, chroot checks) */
};

static int getfsstat_callback(struct mount *, void *);
1336 
1337 /*
1338  * MPALMOSTSAFE
1339  */
/*
 * Walk the mount list and copy a statfs record for each visible mount
 * into uap->buf.  Returns (in sysmsg_result) the number of entries
 * copied, or the total number of mounts when buf is NULL.
 */
int
sys_getfsstat(struct getfsstat_args *uap)
{
	struct thread *td = curthread;
	struct getfsstat_info info;

	bzero(&info, sizeof(info));

	info.maxcount = uap->bufsize / sizeof(struct statfs);
	info.sfsp = uap->buf;
	info.count = 0;
	info.flags = uap->flags;
	info.td = td;

	get_mplock();
	mountlist_scan(getfsstat_callback, &info, MNTSCAN_FORWARD);
	rel_mplock();
	/* report how many entries fit; count keeps growing past maxcount */
	if (info.sfsp && info.count > info.maxcount)
		uap->sysmsg_result = info.maxcount;
	else
		uap->sysmsg_result = info.count;
	return (info.error);
}
1363 
/*
 * Per-mount callback for sys_getfsstat().  Returns 0 to continue the
 * scan (including for mounts that are skipped), -1 to abort after
 * recording the error in info->error.
 */
static int
getfsstat_callback(struct mount *mp, void *data)
{
	struct getfsstat_info *info = data;
	struct statfs *sp;
	char *freepath;
	char *fullpath;
	int error;

	if (info->sfsp && info->count < info->maxcount) {
		/* skip mounts not visible from the process's chroot */
		if (info->td->td_proc &&
		    !chroot_visible_mnt(mp, info->td->td_proc)) {
			return(0);
		}
		sp = &mp->mnt_stat;

		/*
		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
		 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
		 * overrides MNT_WAIT.
		 */
		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
		    (info->flags & MNT_WAIT)) &&
		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
			return(0);
		}
		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;

		/* rewrite f_mntonname relative to the caller's root */
		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
		if (error) {
			info->error = error;
			return(-1);
		}
		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
		kfree(freepath, M_TEMP);

		error = copyout(sp, info->sfsp, sizeof(*sp));
		if (error) {
			info->error = error;
			return (-1);
		}
		++info->sfsp;
	}
	info->count++;
	return(0);
}
1411 
1412 /*
1413  * getvfsstat_args(struct statfs *buf, struct statvfs *vbuf,
1414 		   long bufsize, int flags)
1415  *
1416  * Get statistics on all filesystems.
1417  */
1418 
/*
 * Shared state passed to getvfsstat_callback() via mountlist_scan().
 * Parallel statfs and statvfs records are emitted for each mount.
 */
struct getvfsstat_info {
	struct statfs *sfsp;	/* next userland statfs slot to fill */
	struct statvfs *vsfsp;	/* next userland statvfs slot to fill */
	long count;		/* mounts visited (may exceed maxcount) */
	long maxcount;		/* capacity of the userland buffers */
	int error;		/* first error encountered, if any */
	int flags;		/* MNT_WAIT/MNT_NOWAIT/MNT_LAZY from caller */
	struct thread *td;	/* calling thread (creds, chroot checks) */
};

static int getvfsstat_callback(struct mount *, void *);
1430 
1431 /*
1432  * MPALMOSTSAFE
1433  */
/*
 * Walk the mount list and copy parallel statfs/statvfs records for each
 * visible mount into uap->buf / uap->vbuf.  sysmsg_result is the number
 * of entries copied, or the total mount count when vbuf is NULL.
 */
int
sys_getvfsstat(struct getvfsstat_args *uap)
{
	struct thread *td = curthread;
	struct getvfsstat_info info;

	bzero(&info, sizeof(info));

	info.maxcount = uap->vbufsize / sizeof(struct statvfs);
	info.sfsp = uap->buf;
	info.vsfsp = uap->vbuf;
	info.count = 0;
	info.flags = uap->flags;
	info.td = td;

	get_mplock();
	mountlist_scan(getvfsstat_callback, &info, MNTSCAN_FORWARD);
	/* report how many entries fit; count keeps growing past maxcount */
	if (info.vsfsp && info.count > info.maxcount)
		uap->sysmsg_result = info.maxcount;
	else
		uap->sysmsg_result = info.count;
	rel_mplock();
	return (info.error);
}
1458 
/*
 * Per-mount callback for sys_getvfsstat().  Emits both a statfs and a
 * statvfs record per mount.  Returns 0 to continue the scan (including
 * for mounts that are skipped), -1 to abort after recording the error
 * in info->error.
 */
static int
getvfsstat_callback(struct mount *mp, void *data)
{
	struct getvfsstat_info *info = data;
	struct statfs *sp;
	struct statvfs *vsp;
	char *freepath;
	char *fullpath;
	int error;

	if (info->vsfsp && info->count < info->maxcount) {
		/* skip mounts not visible from the process's chroot */
		if (info->td->td_proc &&
		    !chroot_visible_mnt(mp, info->td->td_proc)) {
			return(0);
		}
		sp = &mp->mnt_stat;
		vsp = &mp->mnt_vstat;

		/*
		 * If MNT_NOWAIT or MNT_LAZY is specified, do not
		 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
		 * overrides MNT_WAIT.
		 */
		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
		    (info->flags & MNT_WAIT)) &&
		    (error = VFS_STATFS(mp, sp, info->td->td_ucred))) {
			return(0);
		}
		sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;

		/* same refresh policy for the statvfs side */
		if (((info->flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
		    (info->flags & MNT_WAIT)) &&
		    (error = VFS_STATVFS(mp, vsp, info->td->td_ucred))) {
			return(0);
		}
		vsp->f_flag = 0;
		if (mp->mnt_flag & MNT_RDONLY)
			vsp->f_flag |= ST_RDONLY;
		if (mp->mnt_flag & MNT_NOSUID)
			vsp->f_flag |= ST_NOSUID;

		/* rewrite f_mntonname relative to the caller's root */
		error = mount_path(info->td->td_proc, mp, &fullpath, &freepath);
		if (error) {
			info->error = error;
			return(-1);
		}
		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
		strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
		kfree(freepath, M_TEMP);

		error = copyout(sp, info->sfsp, sizeof(*sp));
		if (error == 0)
			error = copyout(vsp, info->vsfsp, sizeof(*vsp));
		if (error) {
			info->error = error;
			return (-1);
		}
		++info->sfsp;
		++info->vsfsp;
	}
	info->count++;
	return(0);
}
1522 
1523 
1524 /*
1525  * fchdir_args(int fd)
1526  *
1527  * Change current working directory to a given file descriptor.
1528  *
1529  * MPALMOSTSAFE
1530  */
int
sys_fchdir(struct fchdir_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct vnode *vp, *ovp;
	struct mount *mp;
	struct file *fp;
	struct nchandle nch, onch, tnch;
	int error;

	if ((error = holdvnode(fdp, uap->fd, &fp)) != 0)
		return (error);
	get_mplock();
	vp = (struct vnode *)fp->f_data;
	vref(vp);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* a descriptor without a namecache handle cannot become the cwd */
	if (fp->f_nchandle.ncp == NULL)
		error = ENOTDIR;
	else
		error = checkvp_chdir(vp, td);
	if (error) {
		vput(vp);
		goto done;
	}
	cache_copy(&fp->f_nchandle, &nch);

	/*
	 * If the ncp has become a mount point, traverse through
	 * the mount point.
	 */

	while (!error && (nch.ncp->nc_flag & NCF_ISMOUNTPT) &&
	       (mp = cache_findmount(&nch)) != NULL
	) {
		error = nlookup_mp(mp, &tnch);
		if (error == 0) {
			cache_unlock(&tnch);	/* leave ref intact */
			vput(vp);
			vp = tnch.ncp->nc_vp;
			error = vget(vp, LK_SHARED);
			KKASSERT(error == 0);
			cache_drop(&nch);
			nch = tnch;
		}
	}
	if (error == 0) {
		/* swap in the new cwd, then release the old one */
		ovp = fdp->fd_cdir;
		onch = fdp->fd_ncdir;
		vn_unlock(vp);		/* leave ref intact */
		fdp->fd_cdir = vp;
		fdp->fd_ncdir = nch;
		cache_drop(&onch);
		vrele(ovp);
	} else {
		cache_drop(&nch);
		vput(vp);
	}
	fdrop(fp);
done:
	rel_mplock();
	return (error);
}
1595 
/*
 * Resolve *nd and make the result the current working directory.
 * On success the nch reference is transferred to fd_ncdir and the
 * vnode reference to fd_cdir.
 */
int
kern_chdir(struct nlookupdata *nd)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct vnode *vp, *ovp;
	struct nchandle onch;
	int error;

	if ((error = nlookup(nd)) != 0)
		return (error);
	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
		return (ENOENT);
	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);

	error = checkvp_chdir(vp, td);
	vn_unlock(vp);
	if (error == 0) {
		/* swap in the new cwd, then release the old one */
		ovp = fdp->fd_cdir;
		onch = fdp->fd_ncdir;
		cache_unlock(&nd->nl_nch);	/* leave reference intact */
		fdp->fd_ncdir = nd->nl_nch;
		fdp->fd_cdir = vp;
		cache_drop(&onch);
		vrele(ovp);
		/* zero so nlookup_done() does not drop the moved ref */
		cache_zero(&nd->nl_nch);
	} else {
		vrele(vp);
	}
	return (error);
}
1629 
1630 /*
1631  * chdir_args(char *path)
1632  *
1633  * Change current working directory (``.'').
1634  *
1635  * MPALMOSTSAFE
1636  */
1637 int
1638 sys_chdir(struct chdir_args *uap)
1639 {
1640 	struct nlookupdata nd;
1641 	int error;
1642 
1643 	get_mplock();
1644 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1645 	if (error == 0)
1646 		error = kern_chdir(&nd);
1647 	nlookup_done(&nd);
1648 	rel_mplock();
1649 	return (error);
1650 }
1651 
1652 /*
1653  * Helper function for raised chroot(2) security function:  Refuse if
1654  * any filedescriptors are open directories.
1655  */
1656 static int
1657 chroot_refuse_vdir_fds(struct filedesc *fdp)
1658 {
1659 	struct vnode *vp;
1660 	struct file *fp;
1661 	int error;
1662 	int fd;
1663 
1664 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
1665 		if ((error = holdvnode(fdp, fd, &fp)) != 0)
1666 			continue;
1667 		vp = (struct vnode *)fp->f_data;
1668 		if (vp->v_type != VDIR) {
1669 			fdrop(fp);
1670 			continue;
1671 		}
1672 		fdrop(fp);
1673 		return(EPERM);
1674 	}
1675 	return (0);
1676 }
1677 
1678 /*
1679  * This sysctl determines if we will allow a process to chroot(2) if it
1680  * has a directory open:
1681  *	0: disallowed for all processes.
1682  *	1: allowed for processes that were not already chroot(2)'ed.
1683  *	2: allowed for all processes.
1684  */
1685 
/* tunable policy described in the comment above; default: mode 1 */
static int chroot_allow_open_directories = 1;

SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0, "");
1690 
1691 /*
1692  * chroot to the specified namecache entry.  We obtain the vp from the
1693  * namecache data.  The passed ncp must be locked and referenced and will
1694  * remain locked and referenced on return.
1695  */
int
kern_chroot(struct nchandle *nch)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct vnode *vp;
	int error;

	/*
	 * Only privileged user can chroot
	 */
	error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, 0);
	if (error)
		return (error);

	/*
	 * Disallow open directory descriptors (fchdir() breakouts).
	 */
	if (chroot_allow_open_directories == 0 ||
	   (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
		if ((error = chroot_refuse_vdir_fds(fdp)) != 0)
			return (error);
	}
	if ((vp = nch->ncp->nc_vp) == NULL)
		return (ENOENT);

	if ((error = vget(vp, LK_SHARED)) != 0)
		return (error);

	/*
	 * Check the validity of vp as a directory to change to and
	 * associate it with rdir/jdir.
	 */
	error = checkvp_chdir(vp, td);
	vn_unlock(vp);			/* leave reference intact */
	if (error == 0) {
		vrele(fdp->fd_rdir);
		fdp->fd_rdir = vp;	/* reference inherited by fd_rdir */
		cache_drop(&fdp->fd_nrdir);
		cache_copy(nch, &fdp->fd_nrdir);
		/* first chroot for this process also establishes the jail dir */
		if (fdp->fd_jdir == NULL) {
			fdp->fd_jdir = vp;
			vref(fdp->fd_jdir);
			cache_copy(nch, &fdp->fd_njdir);
		}
	} else {
		vrele(vp);
	}
	return (error);
}
1747 
1748 /*
1749  * chroot_args(char *path)
1750  *
1751  * Change notion of root (``/'') directory.
1752  *
1753  * MPALMOSTSAFE
1754  */
1755 int
1756 sys_chroot(struct chroot_args *uap)
1757 {
1758 	struct thread *td __debugvar = curthread;
1759 	struct nlookupdata nd;
1760 	int error;
1761 
1762 	KKASSERT(td->td_proc);
1763 	get_mplock();
1764 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
1765 	if (error == 0) {
1766 		nd.nl_flags |= NLC_EXEC;
1767 		error = nlookup(&nd);
1768 		if (error == 0)
1769 			error = kern_chroot(&nd.nl_nch);
1770 	}
1771 	nlookup_done(&nd);
1772 	rel_mplock();
1773 	return(error);
1774 }
1775 
/*
 * Re-root the entire kernel (not just the calling process) at the given
 * path via vfs_cache_setroot().  Privileged operation.
 */
int
sys_chroot_kernel(struct chroot_kernel_args *uap)
{
	struct thread *td = curthread;
	struct nlookupdata nd;
	struct nchandle *nch;
	struct vnode *vp;
	int error;

	get_mplock();
	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error)
		goto error_nond;

	error = nlookup(&nd);
	if (error)
		goto error_out;

	nch = &nd.nl_nch;

	error = priv_check_cred(td->td_ucred, PRIV_VFS_CHROOT, 0);
	if (error)
		goto error_out;

	/* existence check only; cache_vref() below obtains the real ref */
	if ((vp = nch->ncp->nc_vp) == NULL) {
		error = ENOENT;
		goto error_out;
	}

	if ((error = cache_vref(nch, nd.nl_cred, &vp)) != 0)
		goto error_out;

	kprintf("chroot_kernel: set new rootnch/rootvnode to %s\n", uap->path);
	/* vp ref and nch hold are handed off to the new root */
	vfs_cache_setroot(vp, cache_hold(nch));

error_out:
	nlookup_done(&nd);
error_nond:
	rel_mplock();
	return(error);
}
1817 
1818 /*
1819  * Common routine for chroot and chdir.  Given a locked, referenced vnode,
1820  * determine whether it is legal to chdir to the vnode.  The vnode's state
1821  * is not changed by this call.
1822  */
1823 int
1824 checkvp_chdir(struct vnode *vp, struct thread *td)
1825 {
1826 	int error;
1827 
1828 	if (vp->v_type != VDIR)
1829 		error = ENOTDIR;
1830 	else
1831 		error = VOP_EACCESS(vp, VEXEC, td->td_ucred);
1832 	return (error);
1833 }
1834 
1835 /*
1836  * MPSAFE
1837  */
1838 int
1839 kern_open(struct nlookupdata *nd, int oflags, int mode, int *res)
1840 {
1841 	struct thread *td = curthread;
1842 	struct proc *p = td->td_proc;
1843 	struct lwp *lp = td->td_lwp;
1844 	struct filedesc *fdp = p->p_fd;
1845 	int cmode, flags;
1846 	struct file *nfp;
1847 	struct file *fp;
1848 	struct vnode *vp;
1849 	int type, indx, error;
1850 	struct flock lf;
1851 
1852 	if ((oflags & O_ACCMODE) == O_ACCMODE)
1853 		return (EINVAL);
1854 	flags = FFLAGS(oflags);
1855 	error = falloc(lp, &nfp, NULL);
1856 	if (error)
1857 		return (error);
1858 	fp = nfp;
1859 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1860 
1861 	/*
1862 	 * XXX p_dupfd is a real mess.  It allows a device to return a
1863 	 * file descriptor to be duplicated rather then doing the open
1864 	 * itself.
1865 	 */
1866 	lp->lwp_dupfd = -1;
1867 
1868 	/*
1869 	 * Call vn_open() to do the lookup and assign the vnode to the
1870 	 * file pointer.  vn_open() does not change the ref count on fp
1871 	 * and the vnode, on success, will be inherited by the file pointer
1872 	 * and unlocked.
1873 	 */
1874 	nd->nl_flags |= NLC_LOCKVP;
1875 	error = vn_open(nd, fp, flags, cmode);
1876 	nlookup_done(nd);
1877 	if (error) {
1878 		/*
1879 		 * handle special fdopen() case.  bleh.  dupfdopen() is
1880 		 * responsible for dropping the old contents of ofiles[indx]
1881 		 * if it succeeds.
1882 		 *
1883 		 * Note that fsetfd() will add a ref to fp which represents
1884 		 * the fd_files[] assignment.  We must still drop our
1885 		 * reference.
1886 		 */
1887 		if ((error == ENODEV || error == ENXIO) && lp->lwp_dupfd >= 0) {
1888 			if (fdalloc(p, 0, &indx) == 0) {
1889 				error = dupfdopen(fdp, indx, lp->lwp_dupfd, flags, error);
1890 				if (error == 0) {
1891 					*res = indx;
1892 					fdrop(fp);	/* our ref */
1893 					return (0);
1894 				}
1895 				fsetfd(fdp, NULL, indx);
1896 			}
1897 		}
1898 		fdrop(fp);	/* our ref */
1899 		if (error == ERESTART)
1900 			error = EINTR;
1901 		return (error);
1902 	}
1903 
1904 	/*
1905 	 * ref the vnode for ourselves so it can't be ripped out from under
1906 	 * is.  XXX need an ND flag to request that the vnode be returned
1907 	 * anyway.
1908 	 *
1909 	 * Reserve a file descriptor but do not assign it until the open
1910 	 * succeeds.
1911 	 */
1912 	vp = (struct vnode *)fp->f_data;
1913 	vref(vp);
1914 	if ((error = fdalloc(p, 0, &indx)) != 0) {
1915 		fdrop(fp);
1916 		vrele(vp);
1917 		return (error);
1918 	}
1919 
1920 	/*
1921 	 * If no error occurs the vp will have been assigned to the file
1922 	 * pointer.
1923 	 */
1924 	lp->lwp_dupfd = 0;
1925 
1926 	if (flags & (O_EXLOCK | O_SHLOCK)) {
1927 		lf.l_whence = SEEK_SET;
1928 		lf.l_start = 0;
1929 		lf.l_len = 0;
1930 		if (flags & O_EXLOCK)
1931 			lf.l_type = F_WRLCK;
1932 		else
1933 			lf.l_type = F_RDLCK;
1934 		if (flags & FNONBLOCK)
1935 			type = 0;
1936 		else
1937 			type = F_WAIT;
1938 
1939 		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
1940 			/*
1941 			 * lock request failed.  Clean up the reserved
1942 			 * descriptor.
1943 			 */
1944 			vrele(vp);
1945 			fsetfd(fdp, NULL, indx);
1946 			fdrop(fp);
1947 			return (error);
1948 		}
1949 		fp->f_flag |= FHASLOCK;
1950 	}
1951 #if 0
1952 	/*
1953 	 * Assert that all regular file vnodes were created with a object.
1954 	 */
1955 	KASSERT(vp->v_type != VREG || vp->v_object != NULL,
1956 		("open: regular file has no backing object after vn_open"));
1957 #endif
1958 
1959 	vrele(vp);
1960 
1961 	/*
1962 	 * release our private reference, leaving the one associated with the
1963 	 * descriptor table intact.
1964 	 */
1965 	fsetfd(fdp, fp, indx);
1966 	fdrop(fp);
1967 	*res = indx;
1968 	return (0);
1969 }
1970 
1971 /*
1972  * open_args(char *path, int flags, int mode)
1973  *
1974  * Check permissions, allocate an open file structure,
1975  * and call the device open routine if any.
1976  *
1977  * MPALMOSTSAFE
1978  */
1979 int
1980 sys_open(struct open_args *uap)
1981 {
1982 	struct nlookupdata nd;
1983 	int error;
1984 
1985 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
1986 	if (error == 0) {
1987 		error = kern_open(&nd, uap->flags,
1988 				    uap->mode, &uap->sysmsg_result);
1989 	}
1990 	nlookup_done(&nd);
1991 	return (error);
1992 }
1993 
1994 /*
1995  * openat_args(int fd, char *path, int flags, int mode)
1996  *
1997  * MPALMOSTSAFE
1998  */
1999 int
2000 sys_openat(struct openat_args *uap)
2001 {
2002 	struct nlookupdata nd;
2003 	int error;
2004 	struct file *fp;
2005 
2006 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2007 	if (error == 0) {
2008 		error = kern_open(&nd, uap->flags, uap->mode,
2009 					&uap->sysmsg_result);
2010 	}
2011 	nlookup_done_at(&nd, fp);
2012 	return (error);
2013 }
2014 
/*
 * Create a special file (device node, whiteout, fifo-less node, or
 * HAMMER special directory) at the path resolved by *nd.  The S_IFMT
 * bits of mode select the node type; each type has its own privilege
 * check.
 */
int
kern_mknod(struct nlookupdata *nd, int mode, int rmajor, int rminor)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct vattr vattr;
	int error;
	int whiteout = 0;

	KKASSERT(p);

	VATTR_NULL(&vattr);
	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
	vattr.va_rmajor = rmajor;
	vattr.va_rminor = rminor;

	switch (mode & S_IFMT) {
	case S_IFMT:	/* used by badsect to flag bad sectors */
		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_BAD, 0);
		vattr.va_type = VBAD;
		break;
	case S_IFCHR:
		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
		vattr.va_type = VCHR;
		break;
	case S_IFBLK:
		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
		vattr.va_type = VBLK;
		break;
	case S_IFWHT:
		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_WHT, 0);
		whiteout = 1;
		break;
	case S_IFDIR:	/* special directories support for HAMMER */
		error = priv_check_cred(td->td_ucred, PRIV_VFS_MKNOD_DIR, 0);
		vattr.va_type = VDIR;
		break;
	default:
		error = EINVAL;
		break;
	}

	if (error)
		return (error);

	/* hint the buffer cache that an inode is about to be created */
	bwillinode(1);
	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);
	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);

	if (whiteout) {
		error = VOP_NWHITEOUT(&nd->nl_nch, nd->nl_dvp,
				      nd->nl_cred, NAMEI_CREATE);
	} else {
		vp = NULL;
		error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp,
				   &vp, nd->nl_cred, &vattr);
		if (error == 0)
			vput(vp);
	}
	return (error);
}
2082 
2083 /*
2084  * mknod_args(char *path, int mode, int dev)
2085  *
2086  * Create a special file.
2087  *
2088  * MPALMOSTSAFE
2089  */
2090 int
2091 sys_mknod(struct mknod_args *uap)
2092 {
2093 	struct nlookupdata nd;
2094 	int error;
2095 
2096 	get_mplock();
2097 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2098 	if (error == 0) {
2099 		error = kern_mknod(&nd, uap->mode,
2100 				   umajor(uap->dev), uminor(uap->dev));
2101 	}
2102 	nlookup_done(&nd);
2103 	rel_mplock();
2104 	return (error);
2105 }
2106 
2107 /*
2108  * mknodat_args(int fd, char *path, mode_t mode, dev_t dev)
2109  *
2110  * Create a special file.  The path is relative to the directory associated
2111  * with fd.
2112  *
2113  * MPALMOSTSAFE
2114  */
2115 int
2116 sys_mknodat(struct mknodat_args *uap)
2117 {
2118 	struct nlookupdata nd;
2119 	struct file *fp;
2120 	int error;
2121 
2122 	get_mplock();
2123 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2124 	if (error == 0) {
2125 		error = kern_mknod(&nd, uap->mode,
2126 				   umajor(uap->dev), uminor(uap->dev));
2127 	}
2128 	nlookup_done_at(&nd, fp);
2129 	rel_mplock();
2130 	return (error);
2131 }
2132 
/*
 * Create a named pipe (VFIFO node) at the path resolved by *nd with the
 * given creation mode (masked by the process umask).
 */
int
kern_mkfifo(struct nlookupdata *nd, int mode)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vattr vattr;
	struct vnode *vp;
	int error;

	/* hint the buffer cache that an inode is about to be created */
	bwillinode(1);

	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);
	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);

	VATTR_NULL(&vattr);
	vattr.va_type = VFIFO;
	vattr.va_mode = (mode & ALLPERMS) &~ p->p_fd->fd_cmask;
	vp = NULL;
	error = VOP_NMKNOD(&nd->nl_nch, nd->nl_dvp, &vp, nd->nl_cred, &vattr);
	if (error == 0)
		vput(vp);
	return (error);
}
2161 
2162 /*
2163  * mkfifo_args(char *path, int mode)
2164  *
2165  * Create a named pipe.
2166  *
2167  * MPALMOSTSAFE
2168  */
2169 int
2170 sys_mkfifo(struct mkfifo_args *uap)
2171 {
2172 	struct nlookupdata nd;
2173 	int error;
2174 
2175 	get_mplock();
2176 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2177 	if (error == 0)
2178 		error = kern_mkfifo(&nd, uap->mode);
2179 	nlookup_done(&nd);
2180 	rel_mplock();
2181 	return (error);
2182 }
2183 
2184 /*
2185  * mkfifoat_args(int fd, char *path, mode_t mode)
2186  *
2187  * Create a named pipe.  The path is relative to the directory associated
2188  * with fd.
2189  *
2190  * MPALMOSTSAFE
2191  */
2192 int
2193 sys_mkfifoat(struct mkfifoat_args *uap)
2194 {
2195 	struct nlookupdata nd;
2196 	struct file *fp;
2197 	int error;
2198 
2199 	get_mplock();
2200 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2201 	if (error == 0)
2202 		error = kern_mkfifo(&nd, uap->mode);
2203 	nlookup_done_at(&nd, fp);
2204 	rel_mplock();
2205 	return (error);
2206 }
2207 
/*
 * Tunables restricting hard link creation, enforced by can_hardlink().
 * Both default to off (no restriction).
 */
static int hardlink_check_uid = 0;
SYSCTL_INT(_security, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
    &hardlink_check_uid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "users");
static int hardlink_check_gid = 0;
SYSCTL_INT(_security, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
    &hardlink_check_gid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "groups");
2218 
2219 static int
2220 can_hardlink(struct vnode *vp, struct thread *td, struct ucred *cred)
2221 {
2222 	struct vattr va;
2223 	int error;
2224 
2225 	/*
2226 	 * Shortcut if disabled
2227 	 */
2228 	if (hardlink_check_uid == 0 && hardlink_check_gid == 0)
2229 		return (0);
2230 
2231 	/*
2232 	 * Privileged user can always hardlink
2233 	 */
2234 	if (priv_check_cred(cred, PRIV_VFS_LINK, 0) == 0)
2235 		return (0);
2236 
2237 	/*
2238 	 * Otherwise only if the originating file is owned by the
2239 	 * same user or group.  Note that any group is allowed if
2240 	 * the file is owned by the caller.
2241 	 */
2242 	error = VOP_GETATTR(vp, &va);
2243 	if (error != 0)
2244 		return (error);
2245 
2246 	if (hardlink_check_uid) {
2247 		if (cred->cr_uid != va.va_uid)
2248 			return (EPERM);
2249 	}
2250 
2251 	if (hardlink_check_gid) {
2252 		if (cred->cr_uid != va.va_uid && !groupmember(va.va_gid, cred))
2253 			return (EPERM);
2254 	}
2255 
2256 	return (0);
2257 }
2258 
/*
 * Create a hard link: *nd resolves the existing source file, *linknd
 * the new name.  The source is looked up and locked first, then
 * unlocked while the target is resolved to avoid deadlock.
 */
int
kern_link(struct nlookupdata *nd, struct nlookupdata *linknd)
{
	struct thread *td = curthread;
	struct vnode *vp;
	int error;

	/*
	 * Lookup the source and obtained a locked vnode.
	 *
	 * You may only hardlink a file which you have write permission
	 * on or which you own.
	 *
	 * XXX relookup on vget failure / race ?
	 */
	bwillinode(1);
	nd->nl_flags |= NLC_WRITE | NLC_OWN | NLC_HLINK;
	if ((error = nlookup(nd)) != 0)
		return (error);
	vp = nd->nl_nch.ncp->nc_vp;
	KKASSERT(vp != NULL);
	/* directories may not be hardlinked */
	if (vp->v_type == VDIR)
		return (EPERM);		/* POSIX */
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	if ((error = vget(vp, LK_EXCLUSIVE)) != 0)
		return (error);

	/*
	 * Unlock the source so we can lookup the target without deadlocking
	 * (XXX vp is locked already, possible other deadlock?).  The target
	 * must not exist.
	 */
	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
	nd->nl_flags &= ~NLC_NCPISLOCKED;
	cache_unlock(&nd->nl_nch);
	vn_unlock(vp);

	linknd->nl_flags |= NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(linknd)) != 0) {
		vrele(vp);
		return (error);
	}
	if (linknd->nl_nch.ncp->nc_vp) {
		vrele(vp);
		return (EEXIST);
	}
	/* relock the source for the VOP call */
	if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) != 0) {
		vrele(vp);
		return (error);
	}

	/*
	 * Finally run the new API VOP.
	 */
	error = can_hardlink(vp, td, td->td_ucred);
	if (error == 0) {
		error = VOP_NLINK(&linknd->nl_nch, linknd->nl_dvp,
				  vp, linknd->nl_cred);
	}
	vput(vp);
	return (error);
}
2322 
2323 /*
2324  * link_args(char *path, char *link)
2325  *
2326  * Make a hard file link.
2327  *
2328  * MPALMOSTSAFE
2329  */
2330 int
2331 sys_link(struct link_args *uap)
2332 {
2333 	struct nlookupdata nd, linknd;
2334 	int error;
2335 
2336 	get_mplock();
2337 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2338 	if (error == 0) {
2339 		error = nlookup_init(&linknd, uap->link, UIO_USERSPACE, 0);
2340 		if (error == 0)
2341 			error = kern_link(&nd, &linknd);
2342 		nlookup_done(&linknd);
2343 	}
2344 	nlookup_done(&nd);
2345 	rel_mplock();
2346 	return (error);
2347 }
2348 
2349 int
2350 kern_symlink(struct nlookupdata *nd, char *path, int mode)
2351 {
2352 	struct vattr vattr;
2353 	struct vnode *vp;
2354 	struct vnode *dvp;
2355 	int error;
2356 
2357 	bwillinode(1);
2358 	nd->nl_flags |= NLC_CREATE | NLC_REFDVP;
2359 	if ((error = nlookup(nd)) != 0)
2360 		return (error);
2361 	if (nd->nl_nch.ncp->nc_vp)
2362 		return (EEXIST);
2363 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2364 		return (error);
2365 	dvp = nd->nl_dvp;
2366 	VATTR_NULL(&vattr);
2367 	vattr.va_mode = mode;
2368 	error = VOP_NSYMLINK(&nd->nl_nch, dvp, &vp, nd->nl_cred, &vattr, path);
2369 	if (error == 0)
2370 		vput(vp);
2371 	return (error);
2372 }
2373 
2374 /*
2375  * symlink(char *path, char *link)
2376  *
2377  * Make a symbolic link.
2378  *
2379  * MPALMOSTSAFE
2380  */
2381 int
2382 sys_symlink(struct symlink_args *uap)
2383 {
2384 	struct thread *td = curthread;
2385 	struct nlookupdata nd;
2386 	char *path;
2387 	int error;
2388 	int mode;
2389 
2390 	path = objcache_get(namei_oc, M_WAITOK);
2391 	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
2392 	if (error == 0) {
2393 		get_mplock();
2394 		error = nlookup_init(&nd, uap->link, UIO_USERSPACE, 0);
2395 		if (error == 0) {
2396 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2397 			error = kern_symlink(&nd, path, mode);
2398 		}
2399 		nlookup_done(&nd);
2400 		rel_mplock();
2401 	}
2402 	objcache_put(namei_oc, path);
2403 	return (error);
2404 }
2405 
2406 /*
2407  * symlinkat_args(char *path1, int fd, char *path2)
2408  *
2409  * Make a symbolic link.  The path2 argument is relative to the directory
2410  * associated with fd.
2411  *
2412  * MPALMOSTSAFE
2413  */
2414 int
2415 sys_symlinkat(struct symlinkat_args *uap)
2416 {
2417 	struct thread *td = curthread;
2418 	struct nlookupdata nd;
2419 	struct file *fp;
2420 	char *path1;
2421 	int error;
2422 	int mode;
2423 
2424 	path1 = objcache_get(namei_oc, M_WAITOK);
2425 	error = copyinstr(uap->path1, path1, MAXPATHLEN, NULL);
2426 	if (error == 0) {
2427 		get_mplock();
2428 		error = nlookup_init_at(&nd, &fp, uap->fd, uap->path2,
2429 		    UIO_USERSPACE, 0);
2430 		if (error == 0) {
2431 			mode = ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask;
2432 			error = kern_symlink(&nd, path1, mode);
2433 		}
2434 		nlookup_done_at(&nd, fp);
2435 		rel_mplock();
2436 	}
2437 	objcache_put(namei_oc, path1);
2438 	return (error);
2439 }
2440 
2441 /*
2442  * undelete_args(char *path)
2443  *
2444  * Delete a whiteout from the filesystem.
2445  *
2446  * MPALMOSTSAFE
2447  */
2448 int
2449 sys_undelete(struct undelete_args *uap)
2450 {
2451 	struct nlookupdata nd;
2452 	int error;
2453 
2454 	get_mplock();
2455 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2456 	bwillinode(1);
2457 	nd.nl_flags |= NLC_DELETE | NLC_REFDVP;
2458 	if (error == 0)
2459 		error = nlookup(&nd);
2460 	if (error == 0)
2461 		error = ncp_writechk(&nd.nl_nch);
2462 	if (error == 0) {
2463 		error = VOP_NWHITEOUT(&nd.nl_nch, nd.nl_dvp, nd.nl_cred,
2464 				      NAMEI_DELETE);
2465 	}
2466 	nlookup_done(&nd);
2467 	rel_mplock();
2468 	return (error);
2469 }
2470 
2471 int
2472 kern_unlink(struct nlookupdata *nd)
2473 {
2474 	int error;
2475 
2476 	bwillinode(1);
2477 	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
2478 	if ((error = nlookup(nd)) != 0)
2479 		return (error);
2480 	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
2481 		return (error);
2482 	error = VOP_NREMOVE(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
2483 	return (error);
2484 }
2485 
2486 /*
2487  * unlink_args(char *path)
2488  *
2489  * Delete a name from the filesystem.
2490  *
2491  * MPALMOSTSAFE
2492  */
2493 int
2494 sys_unlink(struct unlink_args *uap)
2495 {
2496 	struct nlookupdata nd;
2497 	int error;
2498 
2499 	get_mplock();
2500 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2501 	if (error == 0)
2502 		error = kern_unlink(&nd);
2503 	nlookup_done(&nd);
2504 	rel_mplock();
2505 	return (error);
2506 }
2507 
2508 
2509 /*
2510  * unlinkat_args(int fd, char *path, int flags)
2511  *
2512  * Delete the file or directory entry pointed to by fd/path.
2513  *
2514  * MPALMOSTSAFE
2515  */
2516 int
2517 sys_unlinkat(struct unlinkat_args *uap)
2518 {
2519 	struct nlookupdata nd;
2520 	struct file *fp;
2521 	int error;
2522 
2523 	if (uap->flags & ~AT_REMOVEDIR)
2524 		return (EINVAL);
2525 
2526 	get_mplock();
2527 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2528 	if (error == 0) {
2529 		if (uap->flags & AT_REMOVEDIR)
2530 			error = kern_rmdir(&nd);
2531 		else
2532 			error = kern_unlink(&nd);
2533 	}
2534 	nlookup_done_at(&nd, fp);
2535 	rel_mplock();
2536 	return (error);
2537 }
2538 
2539 /*
2540  * MPALMOSTSAFE
2541  */
2542 int
2543 kern_lseek(int fd, off_t offset, int whence, off_t *res)
2544 {
2545 	struct thread *td = curthread;
2546 	struct proc *p = td->td_proc;
2547 	struct file *fp;
2548 	struct vnode *vp;
2549 	struct vattr vattr;
2550 	off_t new_offset;
2551 	int error;
2552 
2553 	fp = holdfp(p->p_fd, fd, -1);
2554 	if (fp == NULL)
2555 		return (EBADF);
2556 	if (fp->f_type != DTYPE_VNODE) {
2557 		error = ESPIPE;
2558 		goto done;
2559 	}
2560 	vp = (struct vnode *)fp->f_data;
2561 
2562 	switch (whence) {
2563 	case L_INCR:
2564 		spin_lock(&fp->f_spin);
2565 		new_offset = fp->f_offset + offset;
2566 		error = 0;
2567 		break;
2568 	case L_XTND:
2569 		get_mplock();
2570 		error = VOP_GETATTR(vp, &vattr);
2571 		rel_mplock();
2572 		spin_lock(&fp->f_spin);
2573 		new_offset = offset + vattr.va_size;
2574 		break;
2575 	case L_SET:
2576 		new_offset = offset;
2577 		error = 0;
2578 		spin_lock(&fp->f_spin);
2579 		break;
2580 	default:
2581 		new_offset = 0;
2582 		error = EINVAL;
2583 		spin_lock(&fp->f_spin);
2584 		break;
2585 	}
2586 
2587 	/*
2588 	 * Validate the seek position.  Negative offsets are not allowed
2589 	 * for regular files or directories.
2590 	 *
2591 	 * Normally we would also not want to allow negative offsets for
2592 	 * character and block-special devices.  However kvm addresses
2593 	 * on 64 bit architectures might appear to be negative and must
2594 	 * be allowed.
2595 	 */
2596 	if (error == 0) {
2597 		if (new_offset < 0 &&
2598 		    (vp->v_type == VREG || vp->v_type == VDIR)) {
2599 			error = EINVAL;
2600 		} else {
2601 			fp->f_offset = new_offset;
2602 		}
2603 	}
2604 	*res = fp->f_offset;
2605 	spin_unlock(&fp->f_spin);
2606 done:
2607 	fdrop(fp);
2608 	return (error);
2609 }
2610 
2611 /*
2612  * lseek_args(int fd, int pad, off_t offset, int whence)
2613  *
2614  * Reposition read/write file offset.
2615  *
2616  * MPSAFE
2617  */
2618 int
2619 sys_lseek(struct lseek_args *uap)
2620 {
2621 	int error;
2622 
2623 	error = kern_lseek(uap->fd, uap->offset, uap->whence,
2624 			   &uap->sysmsg_offset);
2625 
2626 	return (error);
2627 }
2628 
2629 /*
2630  * Check if current process can access given file.  amode is a bitmask of *_OK
2631  * access bits.  flags is a bitmask of AT_* flags.
2632  */
int
kern_access(struct nlookupdata *nd, int amode, int flags)
{
	struct vnode *vp;
	int error, mode;

	/* AT_EACCESS is the only flag understood here. */
	if (flags & ~AT_EACCESS)
		return (EINVAL);
	if ((error = nlookup(nd)) != 0)
		return (error);
retry:
	/* Returns vp locked and referenced; released via vput() below. */
	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
	if (error)
		return (error);

	/* Flags == 0 means only check for existence. */
	if (amode) {
		/* Translate the user-visible *_OK bits into V* bits. */
		mode = 0;
		if (amode & R_OK)
			mode |= VREAD;
		if (amode & W_OK)
			mode |= VWRITE;
		if (amode & X_OK)
			mode |= VEXEC;
		/* Write access additionally requires a writable vnode. */
		if ((mode & VWRITE) == 0 ||
		    (error = vn_writechk(vp, &nd->nl_nch)) == 0)
			error = VOP_ACCESS_FLAGS(vp, mode, flags, nd->nl_cred);

		/*
		 * If the file handle is stale we have to re-resolve the
		 * entry.  This is a hack at the moment.
		 */
		if (error == ESTALE) {
			vput(vp);
			cache_setunresolved(&nd->nl_nch);
			error = cache_resolve(&nd->nl_nch, nd->nl_cred);
			if (error == 0) {
				/* vp already released; retry reacquires it */
				vp = NULL;
				goto retry;
			}
			return(error);
		}
	}
	vput(vp);
	return (error);
}
2679 
2680 /*
2681  * access_args(char *path, int flags)
2682  *
2683  * Check access permissions.
2684  *
2685  * MPALMOSTSAFE
2686  */
2687 int
2688 sys_access(struct access_args *uap)
2689 {
2690 	struct nlookupdata nd;
2691 	int error;
2692 
2693 	get_mplock();
2694 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2695 	if (error == 0)
2696 		error = kern_access(&nd, uap->flags, 0);
2697 	nlookup_done(&nd);
2698 	rel_mplock();
2699 	return (error);
2700 }
2701 
2702 
2703 /*
2704  * faccessat_args(int fd, char *path, int amode, int flags)
2705  *
2706  * Check access permissions.
2707  *
2708  * MPALMOSTSAFE
2709  */
2710 int
2711 sys_faccessat(struct faccessat_args *uap)
2712 {
2713 	struct nlookupdata nd;
2714 	struct file *fp;
2715 	int error;
2716 
2717 	get_mplock();
2718 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE,
2719 				NLC_FOLLOW);
2720 	if (error == 0)
2721 		error = kern_access(&nd, uap->amode, uap->flags);
2722 	nlookup_done_at(&nd, fp);
2723 	rel_mplock();
2724 	return (error);
2725 }
2726 
2727 
2728 /*
2729  * MPSAFE
2730  */
2731 int
2732 kern_stat(struct nlookupdata *nd, struct stat *st)
2733 {
2734 	int error;
2735 	struct vnode *vp;
2736 	thread_t td;
2737 
2738 	if ((error = nlookup(nd)) != 0)
2739 		return (error);
2740 again:
2741 	if ((vp = nd->nl_nch.ncp->nc_vp) == NULL)
2742 		return (ENOENT);
2743 
2744 	td = curthread;
2745 	if ((error = vget(vp, LK_SHARED)) != 0)
2746 		return (error);
2747 	error = vn_stat(vp, st, nd->nl_cred);
2748 
2749 	/*
2750 	 * If the file handle is stale we have to re-resolve the entry.  This
2751 	 * is a hack at the moment.
2752 	 */
2753 	if (error == ESTALE) {
2754 		vput(vp);
2755 		cache_setunresolved(&nd->nl_nch);
2756 		error = cache_resolve(&nd->nl_nch, nd->nl_cred);
2757 		if (error == 0)
2758 			goto again;
2759 	} else {
2760 		vput(vp);
2761 	}
2762 	return (error);
2763 }
2764 
2765 /*
2766  * stat_args(char *path, struct stat *ub)
2767  *
2768  * Get file status; this version follows links.
2769  *
2770  * MPSAFE
2771  */
2772 int
2773 sys_stat(struct stat_args *uap)
2774 {
2775 	struct nlookupdata nd;
2776 	struct stat st;
2777 	int error;
2778 
2779 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2780 	if (error == 0) {
2781 		error = kern_stat(&nd, &st);
2782 		if (error == 0)
2783 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2784 	}
2785 	nlookup_done(&nd);
2786 	return (error);
2787 }
2788 
2789 /*
2790  * lstat_args(char *path, struct stat *ub)
2791  *
2792  * Get file status; this version does not follow links.
2793  *
2794  * MPALMOSTSAFE
2795  */
2796 int
2797 sys_lstat(struct lstat_args *uap)
2798 {
2799 	struct nlookupdata nd;
2800 	struct stat st;
2801 	int error;
2802 
2803 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2804 	if (error == 0) {
2805 		error = kern_stat(&nd, &st);
2806 		if (error == 0)
2807 			error = copyout(&st, uap->ub, sizeof(*uap->ub));
2808 	}
2809 	nlookup_done(&nd);
2810 	return (error);
2811 }
2812 
2813 /*
2814  * fstatat_args(int fd, char *path, struct stat *sb, int flags)
2815  *
2816  * Get status of file pointed to by fd/path.
2817  *
2818  * MPALMOSTSAFE
2819  */
2820 int
2821 sys_fstatat(struct fstatat_args *uap)
2822 {
2823 	struct nlookupdata nd;
2824 	struct stat st;
2825 	int error;
2826 	int flags;
2827 	struct file *fp;
2828 
2829 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
2830 		return (EINVAL);
2831 
2832 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
2833 
2834 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
2835 				UIO_USERSPACE, flags);
2836 	if (error == 0) {
2837 		error = kern_stat(&nd, &st);
2838 		if (error == 0)
2839 			error = copyout(&st, uap->sb, sizeof(*uap->sb));
2840 	}
2841 	nlookup_done_at(&nd, fp);
2842 	return (error);
2843 }
2844 
2845 /*
2846  * pathconf_Args(char *path, int name)
2847  *
2848  * Get configurable pathname variables.
2849  *
2850  * MPALMOSTSAFE
2851  */
2852 int
2853 sys_pathconf(struct pathconf_args *uap)
2854 {
2855 	struct nlookupdata nd;
2856 	struct vnode *vp;
2857 	int error;
2858 
2859 	vp = NULL;
2860 	get_mplock();
2861 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
2862 	if (error == 0)
2863 		error = nlookup(&nd);
2864 	if (error == 0)
2865 		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
2866 	nlookup_done(&nd);
2867 	if (error == 0) {
2868 		error = VOP_PATHCONF(vp, uap->name, &uap->sysmsg_reg);
2869 		vput(vp);
2870 	}
2871 	rel_mplock();
2872 	return (error);
2873 }
2874 
2875 /*
2876  * XXX: daver
2877  * kern_readlink isn't properly split yet.  There is a copyin burried
2878  * in VOP_READLINK().
2879  */
2880 int
2881 kern_readlink(struct nlookupdata *nd, char *buf, int count, int *res)
2882 {
2883 	struct thread *td = curthread;
2884 	struct vnode *vp;
2885 	struct iovec aiov;
2886 	struct uio auio;
2887 	int error;
2888 
2889 	if ((error = nlookup(nd)) != 0)
2890 		return (error);
2891 	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &vp);
2892 	if (error)
2893 		return (error);
2894 	if (vp->v_type != VLNK) {
2895 		error = EINVAL;
2896 	} else {
2897 		aiov.iov_base = buf;
2898 		aiov.iov_len = count;
2899 		auio.uio_iov = &aiov;
2900 		auio.uio_iovcnt = 1;
2901 		auio.uio_offset = 0;
2902 		auio.uio_rw = UIO_READ;
2903 		auio.uio_segflg = UIO_USERSPACE;
2904 		auio.uio_td = td;
2905 		auio.uio_resid = count;
2906 		error = VOP_READLINK(vp, &auio, td->td_ucred);
2907 	}
2908 	vput(vp);
2909 	*res = count - auio.uio_resid;
2910 	return (error);
2911 }
2912 
2913 /*
2914  * readlink_args(char *path, char *buf, int count)
2915  *
2916  * Return target name of a symbolic link.
2917  *
2918  * MPALMOSTSAFE
2919  */
2920 int
2921 sys_readlink(struct readlink_args *uap)
2922 {
2923 	struct nlookupdata nd;
2924 	int error;
2925 
2926 	get_mplock();
2927 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
2928 	if (error == 0) {
2929 		error = kern_readlink(&nd, uap->buf, uap->count,
2930 					&uap->sysmsg_result);
2931 	}
2932 	nlookup_done(&nd);
2933 	rel_mplock();
2934 	return (error);
2935 }
2936 
2937 /*
2938  * readlinkat_args(int fd, char *path, char *buf, size_t bufsize)
2939  *
2940  * Return target name of a symbolic link.  The path is relative to the
2941  * directory associated with fd.
2942  *
2943  * MPALMOSTSAFE
2944  */
2945 int
2946 sys_readlinkat(struct readlinkat_args *uap)
2947 {
2948 	struct nlookupdata nd;
2949 	struct file *fp;
2950 	int error;
2951 
2952 	get_mplock();
2953 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
2954 	if (error == 0) {
2955 		error = kern_readlink(&nd, uap->buf, uap->bufsize,
2956 					&uap->sysmsg_result);
2957 	}
2958 	nlookup_done_at(&nd, fp);
2959 	rel_mplock();
2960 	return (error);
2961 }
2962 
2963 static int
2964 setfflags(struct vnode *vp, int flags)
2965 {
2966 	struct thread *td = curthread;
2967 	int error;
2968 	struct vattr vattr;
2969 
2970 	/*
2971 	 * Prevent non-root users from setting flags on devices.  When
2972 	 * a device is reused, users can retain ownership of the device
2973 	 * if they are allowed to set flags and programs assume that
2974 	 * chown can't fail when done as root.
2975 	 */
2976 	if ((vp->v_type == VCHR || vp->v_type == VBLK) &&
2977 	    ((error = priv_check_cred(td->td_ucred, PRIV_VFS_CHFLAGS_DEV, 0)) != 0))
2978 		return (error);
2979 
2980 	/*
2981 	 * note: vget is required for any operation that might mod the vnode
2982 	 * so VINACTIVE is properly cleared.
2983 	 */
2984 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
2985 		VATTR_NULL(&vattr);
2986 		vattr.va_flags = flags;
2987 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2988 		vput(vp);
2989 	}
2990 	return (error);
2991 }
2992 
2993 /*
2994  * chflags(char *path, int flags)
2995  *
2996  * Change flags of a file given a path name.
2997  *
2998  * MPALMOSTSAFE
2999  */
3000 int
3001 sys_chflags(struct chflags_args *uap)
3002 {
3003 	struct nlookupdata nd;
3004 	struct vnode *vp;
3005 	int error;
3006 
3007 	vp = NULL;
3008 	get_mplock();
3009 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3010 	if (error == 0)
3011 		error = nlookup(&nd);
3012 	if (error == 0)
3013 		error = ncp_writechk(&nd.nl_nch);
3014 	if (error == 0)
3015 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3016 	nlookup_done(&nd);
3017 	if (error == 0) {
3018 		error = setfflags(vp, uap->flags);
3019 		vrele(vp);
3020 	}
3021 	rel_mplock();
3022 	return (error);
3023 }
3024 
3025 /*
3026  * lchflags(char *path, int flags)
3027  *
3028  * Change flags of a file given a path name, but don't follow symlinks.
3029  *
3030  * MPALMOSTSAFE
3031  */
3032 int
3033 sys_lchflags(struct lchflags_args *uap)
3034 {
3035 	struct nlookupdata nd;
3036 	struct vnode *vp;
3037 	int error;
3038 
3039 	vp = NULL;
3040 	get_mplock();
3041 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3042 	if (error == 0)
3043 		error = nlookup(&nd);
3044 	if (error == 0)
3045 		error = ncp_writechk(&nd.nl_nch);
3046 	if (error == 0)
3047 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
3048 	nlookup_done(&nd);
3049 	if (error == 0) {
3050 		error = setfflags(vp, uap->flags);
3051 		vrele(vp);
3052 	}
3053 	rel_mplock();
3054 	return (error);
3055 }
3056 
3057 /*
3058  * fchflags_args(int fd, int flags)
3059  *
3060  * Change flags of a file given a file descriptor.
3061  *
3062  * MPALMOSTSAFE
3063  */
3064 int
3065 sys_fchflags(struct fchflags_args *uap)
3066 {
3067 	struct thread *td = curthread;
3068 	struct proc *p = td->td_proc;
3069 	struct file *fp;
3070 	int error;
3071 
3072 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3073 		return (error);
3074 	get_mplock();
3075 	if (fp->f_nchandle.ncp)
3076 		error = ncp_writechk(&fp->f_nchandle);
3077 	if (error == 0)
3078 		error = setfflags((struct vnode *) fp->f_data, uap->flags);
3079 	rel_mplock();
3080 	fdrop(fp);
3081 	return (error);
3082 }
3083 
3084 static int
3085 setfmode(struct vnode *vp, int mode)
3086 {
3087 	struct thread *td = curthread;
3088 	int error;
3089 	struct vattr vattr;
3090 
3091 	/*
3092 	 * note: vget is required for any operation that might mod the vnode
3093 	 * so VINACTIVE is properly cleared.
3094 	 */
3095 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3096 		VATTR_NULL(&vattr);
3097 		vattr.va_mode = mode & ALLPERMS;
3098 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3099 		vput(vp);
3100 	}
3101 	return error;
3102 }
3103 
3104 int
3105 kern_chmod(struct nlookupdata *nd, int mode)
3106 {
3107 	struct vnode *vp;
3108 	int error;
3109 
3110 	if ((error = nlookup(nd)) != 0)
3111 		return (error);
3112 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3113 		return (error);
3114 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3115 		error = setfmode(vp, mode);
3116 	vrele(vp);
3117 	return (error);
3118 }
3119 
3120 /*
3121  * chmod_args(char *path, int mode)
3122  *
3123  * Change mode of a file given path name.
3124  *
3125  * MPALMOSTSAFE
3126  */
3127 int
3128 sys_chmod(struct chmod_args *uap)
3129 {
3130 	struct nlookupdata nd;
3131 	int error;
3132 
3133 	get_mplock();
3134 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3135 	if (error == 0)
3136 		error = kern_chmod(&nd, uap->mode);
3137 	nlookup_done(&nd);
3138 	rel_mplock();
3139 	return (error);
3140 }
3141 
3142 /*
3143  * lchmod_args(char *path, int mode)
3144  *
3145  * Change mode of a file given path name (don't follow links.)
3146  *
3147  * MPALMOSTSAFE
3148  */
3149 int
3150 sys_lchmod(struct lchmod_args *uap)
3151 {
3152 	struct nlookupdata nd;
3153 	int error;
3154 
3155 	get_mplock();
3156 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3157 	if (error == 0)
3158 		error = kern_chmod(&nd, uap->mode);
3159 	nlookup_done(&nd);
3160 	rel_mplock();
3161 	return (error);
3162 }
3163 
3164 /*
3165  * fchmod_args(int fd, int mode)
3166  *
3167  * Change mode of a file given a file descriptor.
3168  *
3169  * MPALMOSTSAFE
3170  */
3171 int
3172 sys_fchmod(struct fchmod_args *uap)
3173 {
3174 	struct thread *td = curthread;
3175 	struct proc *p = td->td_proc;
3176 	struct file *fp;
3177 	int error;
3178 
3179 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3180 		return (error);
3181 	get_mplock();
3182 	if (fp->f_nchandle.ncp)
3183 		error = ncp_writechk(&fp->f_nchandle);
3184 	if (error == 0)
3185 		error = setfmode((struct vnode *)fp->f_data, uap->mode);
3186 	rel_mplock();
3187 	fdrop(fp);
3188 	return (error);
3189 }
3190 
3191 /*
3192  * fchmodat_args(char *path, int mode)
3193  *
3194  * Change mode of a file pointed to by fd/path.
3195  *
3196  * MPALMOSTSAFE
3197  */
3198 int
3199 sys_fchmodat(struct fchmodat_args *uap)
3200 {
3201 	struct nlookupdata nd;
3202 	struct file *fp;
3203 	int error;
3204 	int flags;
3205 
3206 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3207 		return (EINVAL);
3208 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3209 
3210 	get_mplock();
3211 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3212 				UIO_USERSPACE, flags);
3213 	if (error == 0)
3214 		error = kern_chmod(&nd, uap->mode);
3215 	nlookup_done_at(&nd, fp);
3216 	rel_mplock();
3217 	return (error);
3218 }
3219 
3220 static int
3221 setfown(struct vnode *vp, uid_t uid, gid_t gid)
3222 {
3223 	struct thread *td = curthread;
3224 	int error;
3225 	struct vattr vattr;
3226 
3227 	/*
3228 	 * note: vget is required for any operation that might mod the vnode
3229 	 * so VINACTIVE is properly cleared.
3230 	 */
3231 	if ((error = vget(vp, LK_EXCLUSIVE)) == 0) {
3232 		VATTR_NULL(&vattr);
3233 		vattr.va_uid = uid;
3234 		vattr.va_gid = gid;
3235 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3236 		vput(vp);
3237 	}
3238 	return error;
3239 }
3240 
3241 int
3242 kern_chown(struct nlookupdata *nd, int uid, int gid)
3243 {
3244 	struct vnode *vp;
3245 	int error;
3246 
3247 	if ((error = nlookup(nd)) != 0)
3248 		return (error);
3249 	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
3250 		return (error);
3251 	if ((error = ncp_writechk(&nd->nl_nch)) == 0)
3252 		error = setfown(vp, uid, gid);
3253 	vrele(vp);
3254 	return (error);
3255 }
3256 
3257 /*
3258  * chown(char *path, int uid, int gid)
3259  *
3260  * Set ownership given a path name.
3261  *
3262  * MPALMOSTSAFE
3263  */
3264 int
3265 sys_chown(struct chown_args *uap)
3266 {
3267 	struct nlookupdata nd;
3268 	int error;
3269 
3270 	get_mplock();
3271 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3272 	if (error == 0)
3273 		error = kern_chown(&nd, uap->uid, uap->gid);
3274 	nlookup_done(&nd);
3275 	rel_mplock();
3276 	return (error);
3277 }
3278 
3279 /*
3280  * lchown_args(char *path, int uid, int gid)
3281  *
3282  * Set ownership given a path name, do not cross symlinks.
3283  *
3284  * MPALMOSTSAFE
3285  */
3286 int
3287 sys_lchown(struct lchown_args *uap)
3288 {
3289 	struct nlookupdata nd;
3290 	int error;
3291 
3292 	get_mplock();
3293 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3294 	if (error == 0)
3295 		error = kern_chown(&nd, uap->uid, uap->gid);
3296 	nlookup_done(&nd);
3297 	rel_mplock();
3298 	return (error);
3299 }
3300 
3301 /*
3302  * fchown_args(int fd, int uid, int gid)
3303  *
3304  * Set ownership given a file descriptor.
3305  *
3306  * MPALMOSTSAFE
3307  */
3308 int
3309 sys_fchown(struct fchown_args *uap)
3310 {
3311 	struct thread *td = curthread;
3312 	struct proc *p = td->td_proc;
3313 	struct file *fp;
3314 	int error;
3315 
3316 	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
3317 		return (error);
3318 	get_mplock();
3319 	if (fp->f_nchandle.ncp)
3320 		error = ncp_writechk(&fp->f_nchandle);
3321 	if (error == 0)
3322 		error = setfown((struct vnode *)fp->f_data, uap->uid, uap->gid);
3323 	rel_mplock();
3324 	fdrop(fp);
3325 	return (error);
3326 }
3327 
3328 /*
3329  * fchownat(int fd, char *path, int uid, int gid, int flags)
3330  *
3331  * Set ownership of file pointed to by fd/path.
3332  *
3333  * MPALMOSTSAFE
3334  */
3335 int
3336 sys_fchownat(struct fchownat_args *uap)
3337 {
3338 	struct nlookupdata nd;
3339 	struct file *fp;
3340 	int error;
3341 	int flags;
3342 
3343 	if (uap->flags & ~AT_SYMLINK_NOFOLLOW)
3344 		return (EINVAL);
3345 	flags = (uap->flags & AT_SYMLINK_NOFOLLOW) ? 0 : NLC_FOLLOW;
3346 
3347 	get_mplock();
3348 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path,
3349 				UIO_USERSPACE, flags);
3350 	if (error == 0)
3351 		error = kern_chown(&nd, uap->uid, uap->gid);
3352 	nlookup_done_at(&nd, fp);
3353 	rel_mplock();
3354 	return (error);
3355 }
3356 
3357 
3358 static int
3359 getutimes(const struct timeval *tvp, struct timespec *tsp)
3360 {
3361 	struct timeval tv[2];
3362 
3363 	if (tvp == NULL) {
3364 		microtime(&tv[0]);
3365 		TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]);
3366 		tsp[1] = tsp[0];
3367 	} else {
3368 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3369 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3370 	}
3371 	return 0;
3372 }
3373 
3374 static int
3375 setutimes(struct vnode *vp, struct vattr *vattr,
3376 	  const struct timespec *ts, int nullflag)
3377 {
3378 	struct thread *td = curthread;
3379 	int error;
3380 
3381 	VATTR_NULL(vattr);
3382 	vattr->va_atime = ts[0];
3383 	vattr->va_mtime = ts[1];
3384 	if (nullflag)
3385 		vattr->va_vaflags |= VA_UTIMES_NULL;
3386 	error = VOP_SETATTR(vp, vattr, td->td_ucred);
3387 
3388 	return error;
3389 }
3390 
int
kern_utimes(struct nlookupdata *nd, struct timeval *tptr)
{
	struct timespec ts[2];
	struct vnode *vp;
	struct vattr vattr;
	int error;

	/* Convert the user's timevals (or "now") into timespecs. */
	if ((error = getutimes(tptr, ts)) != 0)
		return (error);

	/*
	 * NOTE: utimes() succeeds for the owner even if the file
	 * is not user-writable.
	 */
	nd->nl_flags |= NLC_OWN | NLC_WRITE;

	if ((error = nlookup(nd)) != 0)
		return (error);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	/* cache_vref() returns the vnode referenced but unlocked. */
	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
		return (error);

	/*
	 * note: vget is required for any operation that might mod the vnode
	 * so VINACTIVE is properly cleared.
	 */
	if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
		error = vget(vp, LK_EXCLUSIVE);
		if (error == 0) {
			error = setutimes(vp, &vattr, ts, (tptr == NULL));
			vput(vp);	/* drops vget's lock + ref */
		}
	}
	vrele(vp);	/* drops cache_vref's reference */
	return (error);
}
3429 
3430 /*
3431  * utimes_args(char *path, struct timeval *tptr)
3432  *
3433  * Set the access and modification times of a file.
3434  *
3435  * MPALMOSTSAFE
3436  */
3437 int
3438 sys_utimes(struct utimes_args *uap)
3439 {
3440 	struct timeval tv[2];
3441 	struct nlookupdata nd;
3442 	int error;
3443 
3444 	if (uap->tptr) {
3445  		error = copyin(uap->tptr, tv, sizeof(tv));
3446 		if (error)
3447 			return (error);
3448 	}
3449 	get_mplock();
3450 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3451 	if (error == 0)
3452 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3453 	nlookup_done(&nd);
3454 	rel_mplock();
3455 	return (error);
3456 }
3457 
3458 /*
3459  * lutimes_args(char *path, struct timeval *tptr)
3460  *
3461  * Set the access and modification times of a file.
3462  *
3463  * MPALMOSTSAFE
3464  */
3465 int
3466 sys_lutimes(struct lutimes_args *uap)
3467 {
3468 	struct timeval tv[2];
3469 	struct nlookupdata nd;
3470 	int error;
3471 
3472 	if (uap->tptr) {
3473 		error = copyin(uap->tptr, tv, sizeof(tv));
3474 		if (error)
3475 			return (error);
3476 	}
3477 	get_mplock();
3478 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3479 	if (error == 0)
3480 		error = kern_utimes(&nd, uap->tptr ? tv : NULL);
3481 	nlookup_done(&nd);
3482 	rel_mplock();
3483 	return (error);
3484 }
3485 
3486 /*
3487  * Set utimes on a file descriptor.  The creds used to open the
3488  * file are used to determine whether the operation is allowed
3489  * or not.
3490  */
int
kern_futimes(int fd, struct timeval *tptr)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct timespec ts[2];
	struct file *fp;
	struct vnode *vp;
	struct vattr vattr;
	int error;

	/* Convert the user's timevals (or "now") into timespecs. */
	error = getutimes(tptr, ts);
	if (error)
		return (error);
	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	/* Check write permission via the namecache handle when present. */
	if (fp->f_nchandle.ncp)
		error = ncp_writechk(&fp->f_nchandle);
	if (error == 0) {
		vp = fp->f_data;
		/* vget locks + refs; vput below releases both. */
		error = vget(vp, LK_EXCLUSIVE);
		if (error == 0) {
			error = VOP_GETATTR(vp, &vattr);
			if (error == 0) {
				/*
				 * Permission check against the creds the
				 * file was opened with, not the caller's
				 * current creds.
				 */
				error = naccess_va(&vattr, NLC_OWN | NLC_WRITE,
						   fp->f_cred);
			}
			if (error == 0) {
				error = setutimes(vp, &vattr, ts,
						  (tptr == NULL));
			}
			vput(vp);
		}
	}
	fdrop(fp);
	return (error);
}
3528 
3529 /*
3530  * futimes_args(int fd, struct timeval *tptr)
3531  *
3532  * Set the access and modification times of a file.
3533  *
3534  * MPALMOSTSAFE
3535  */
3536 int
3537 sys_futimes(struct futimes_args *uap)
3538 {
3539 	struct timeval tv[2];
3540 	int error;
3541 
3542 	if (uap->tptr) {
3543 		error = copyin(uap->tptr, tv, sizeof(tv));
3544 		if (error)
3545 			return (error);
3546 	}
3547 	get_mplock();
3548 	error = kern_futimes(uap->fd, uap->tptr ? tv : NULL);
3549 	rel_mplock();
3550 
3551 	return (error);
3552 }
3553 
int
kern_truncate(struct nlookupdata *nd, off_t length)
{
	struct vnode *vp;
	struct vattr vattr;
	int error;

	if (length < 0)
		return(EINVAL);
	nd->nl_flags |= NLC_WRITE | NLC_TRUNCATE;
	if ((error = nlookup(nd)) != 0)
		return (error);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	/* cache_vref() returns the vnode referenced but unlocked. */
	if ((error = cache_vref(&nd->nl_nch, nd->nl_cred, &vp)) != 0)
		return (error);
	if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY)) != 0) {
		vrele(vp);
		return (error);
	}
	if (vp->v_type == VDIR) {
		error = EISDIR;
	} else if ((error = vn_writechk(vp, &nd->nl_nch)) == 0) {
		/* Truncation is implemented as a size-only setattr. */
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, nd->nl_cred);
	}
	/* vput releases both the vn_lock and the cache_vref reference. */
	vput(vp);
	return (error);
}
3584 
3585 /*
3586  * truncate(char *path, int pad, off_t length)
3587  *
3588  * Truncate a file given its path name.
3589  *
3590  * MPALMOSTSAFE
3591  */
3592 int
3593 sys_truncate(struct truncate_args *uap)
3594 {
3595 	struct nlookupdata nd;
3596 	int error;
3597 
3598 	get_mplock();
3599 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
3600 	if (error == 0)
3601 		error = kern_truncate(&nd, uap->length);
3602 	nlookup_done(&nd);
3603 	rel_mplock();
3604 	return error;
3605 }
3606 
int
kern_ftruncate(int fd, off_t length)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vattr vattr;
	struct vnode *vp;
	struct file *fp;
	int error;

	if (length < 0)
		return(EINVAL);
	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	/* Check write permission via the namecache handle when present. */
	if (fp->f_nchandle.ncp) {
		error = ncp_writechk(&fp->f_nchandle);
		if (error)
			goto done;
	}
	/* The descriptor itself must have been opened for writing. */
	if ((fp->f_flag & FWRITE) == 0) {
		error = EINVAL;
		goto done;
	}
	if (fp->f_flag & FAPPENDONLY) {	/* inode was set append-only */
		error = EINVAL;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR) {
		error = EISDIR;
	} else if ((error = vn_writechk(vp, NULL)) == 0) {
		/* Truncation is implemented as a size-only setattr. */
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
	vn_unlock(vp);
done:
	fdrop(fp);
	return (error);
}
3648 
3649 /*
3650  * ftruncate_args(int fd, int pad, off_t length)
3651  *
3652  * Truncate a file given a file descriptor.
3653  *
3654  * MPALMOSTSAFE
3655  */
3656 int
3657 sys_ftruncate(struct ftruncate_args *uap)
3658 {
3659 	int error;
3660 
3661 	get_mplock();
3662 	error = kern_ftruncate(uap->fd, uap->length);
3663 	rel_mplock();
3664 
3665 	return (error);
3666 }
3667 
3668 /*
3669  * fsync(int fd)
3670  *
3671  * Sync an open file.
3672  *
3673  * MPALMOSTSAFE
3674  */
int
sys_fsync(struct fsync_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct file *fp;
	vm_object_t obj;
	int error;

	/* hold the descriptor's file/vnode for the duration of the sync */
	if ((error = holdvnode(p->p_fd, uap->fd, &fp)) != 0)
		return (error);
	get_mplock();
	vp = (struct vnode *)fp->f_data;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* flush dirty VM pages into the buffer cache first */
	if ((obj = vp->v_object) != NULL)
		vm_object_page_clean(obj, 0, 0, 0);
	/* then ask the filesystem to synchronously flush the vnode */
	error = VOP_FSYNC(vp, MNT_WAIT, VOP_FSYNC_SYSCALL);
	/* finally flush any remaining dirty buffers (mounted vnodes only) */
	if (error == 0 && vp->v_mount)
		error = buf_fsync(vp);
	vn_unlock(vp);
	rel_mplock();
	fdrop(fp);

	return (error);
}
3701 
/*
 * kern_rename() - worker for the rename-family syscalls.
 *
 * fromnd and tond are initialized (but not yet resolved) lookups for
 * the source and target paths.  The caller remains responsible for
 * calling nlookup_done() on both regardless of the return value.
 */
int
kern_rename(struct nlookupdata *fromnd, struct nlookupdata *tond)
{
	struct nchandle fnchd;
	struct nchandle tnchd;
	struct namecache *ncp;
	struct vnode *fdvp;
	struct vnode *tdvp;
	struct mount *mp;
	int error;

	bwillinode(1);
	fromnd->nl_flags |= NLC_REFDVP | NLC_RENAME_SRC;
	if ((error = nlookup(fromnd)) != 0)
		return (error);
	/* hold the source's parent directory for the relock recheck below */
	if ((fnchd.ncp = fromnd->nl_nch.ncp->nc_parent) == NULL)
		return (ENOENT);
	fnchd.mount = fromnd->nl_nch.mount;
	cache_hold(&fnchd);

	/*
	 * unlock the source nch so we can lookup the target nch without
	 * deadlocking.  The target may or may not exist so we do not check
	 * for a target vp like kern_mkdir() and other creation functions do.
	 *
	 * The source and target directories are ref'd and rechecked after
	 * everything is relocked to determine if the source or target file
	 * has been renamed.
	 */
	KKASSERT(fromnd->nl_flags & NLC_NCPISLOCKED);
	fromnd->nl_flags &= ~NLC_NCPISLOCKED;
	cache_unlock(&fromnd->nl_nch);

	tond->nl_flags |= NLC_RENAME_DST | NLC_REFDVP;
	if ((error = nlookup(tond)) != 0) {
		cache_drop(&fnchd);
		return (error);
	}
	/* hold the target's parent directory as well */
	if ((tnchd.ncp = tond->nl_nch.ncp->nc_parent) == NULL) {
		cache_drop(&fnchd);
		return (ENOENT);
	}
	tnchd.mount = tond->nl_nch.mount;
	cache_hold(&tnchd);

	/*
	 * If the source and target are the same there is nothing to do
	 */
	if (fromnd->nl_nch.ncp == tond->nl_nch.ncp) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (0);
	}

	/*
	 * Mount points cannot be renamed or overwritten
	 */
	if ((fromnd->nl_nch.ncp->nc_flag | tond->nl_nch.ncp->nc_flag) &
	    NCF_ISMOUNTPT
	) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EINVAL);
	}

	/*
	 * Relock the source ncp.  cache_relock() will deal with any
	 * deadlocks against the already-locked tond and will also
	 * make sure both are resolved.
	 *
	 * NOTE AFTER RELOCKING: The source or target ncp may have become
	 * invalid while they were unlocked, nc_vp and nc_mount could
	 * be NULL.
	 */
	cache_relock(&fromnd->nl_nch, fromnd->nl_cred,
		     &tond->nl_nch, tond->nl_cred);
	fromnd->nl_flags |= NLC_NCPISLOCKED;

	/*
	 * make sure the parent directories linkages are the same
	 */
	if (fnchd.ncp != fromnd->nl_nch.ncp->nc_parent ||
	    tnchd.ncp != tond->nl_nch.ncp->nc_parent) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (ENOENT);
	}

	/*
	 * Both the source and target must be within the same filesystem and
	 * in the same filesystem as their parent directories within the
	 * namecache topology.
	 *
	 * NOTE: fromnd's nc_mount or nc_vp could be NULL.
	 */
	mp = fnchd.mount;
	if (mp != tnchd.mount || mp != fromnd->nl_nch.mount ||
	    mp != tond->nl_nch.mount) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (EXDEV);
	}

	/*
	 * Make sure the mount point is writable
	 */
	if ((error = ncp_writechk(&tond->nl_nch)) != 0) {
		cache_drop(&fnchd);
		cache_drop(&tnchd);
		return (error);
	}

	/*
	 * If the target exists and either the source or target is a directory,
	 * then both must be directories.
	 *
	 * Due to relocking of the source, fromnd->nl_nch.ncp->nc_vp might
	 * have become NULL.
	 */
	if (tond->nl_nch.ncp->nc_vp) {
		if (fromnd->nl_nch.ncp->nc_vp == NULL) {
			error = ENOENT;
		} else if (fromnd->nl_nch.ncp->nc_vp->v_type == VDIR) {
			if (tond->nl_nch.ncp->nc_vp->v_type != VDIR)
				error = ENOTDIR;
		} else if (tond->nl_nch.ncp->nc_vp->v_type == VDIR) {
			error = EISDIR;
		}
	}

	/*
	 * You cannot rename a source into itself or a subdirectory of itself.
	 * We check this by traversing the target directory upwards looking
	 * for a match against the source.
	 *
	 * XXX MPSAFE
	 */
	if (error == 0) {
		for (ncp = tnchd.ncp; ncp; ncp = ncp->nc_parent) {
			if (fromnd->nl_nch.ncp == ncp) {
				error = EINVAL;
				break;
			}
		}
	}

	cache_drop(&fnchd);
	cache_drop(&tnchd);

	/*
	 * Even though the namespaces are different, they may still represent
	 * hardlinks to the same file.  The filesystem might have a hard time
	 * with this so we issue a NREMOVE of the source instead of a NRENAME
	 * when we detect the situation.
	 */
	if (error == 0) {
		fdvp = fromnd->nl_dvp;
		tdvp = tond->nl_dvp;
		if (fdvp == NULL || tdvp == NULL) {
			error = EPERM;
		} else if (fromnd->nl_nch.ncp->nc_vp == tond->nl_nch.ncp->nc_vp) {
			error = VOP_NREMOVE(&fromnd->nl_nch, fdvp,
					    fromnd->nl_cred);
		} else {
			error = VOP_NRENAME(&fromnd->nl_nch, &tond->nl_nch,
					    fdvp, tdvp, tond->nl_cred);
		}
	}
	return (error);
}
3872 
3873 /*
3874  * rename_args(char *from, char *to)
3875  *
3876  * Rename files.  Source and destination must either both be directories,
3877  * or both not be directories.  If target is a directory, it must be empty.
3878  *
3879  * MPALMOSTSAFE
3880  */
3881 int
3882 sys_rename(struct rename_args *uap)
3883 {
3884 	struct nlookupdata fromnd, tond;
3885 	int error;
3886 
3887 	get_mplock();
3888 	error = nlookup_init(&fromnd, uap->from, UIO_USERSPACE, 0);
3889 	if (error == 0) {
3890 		error = nlookup_init(&tond, uap->to, UIO_USERSPACE, 0);
3891 		if (error == 0)
3892 			error = kern_rename(&fromnd, &tond);
3893 		nlookup_done(&tond);
3894 	}
3895 	nlookup_done(&fromnd);
3896 	rel_mplock();
3897 	return (error);
3898 }
3899 
3900 /*
3901  * renameat_args(int oldfd, char *old, int newfd, char *new)
3902  *
3903  * Rename files using paths relative to the directories associated with
3904  * oldfd and newfd.  Source and destination must either both be directories,
3905  * or both not be directories.  If target is a directory, it must be empty.
3906  *
3907  * MPALMOSTSAFE
3908  */
3909 int
3910 sys_renameat(struct renameat_args *uap)
3911 {
3912 	struct nlookupdata oldnd, newnd;
3913 	struct file *oldfp, *newfp;
3914 	int error;
3915 
3916 	get_mplock();
3917 	error = nlookup_init_at(&oldnd, &oldfp, uap->oldfd, uap->old,
3918 	    UIO_USERSPACE, 0);
3919 	if (error == 0) {
3920 		error = nlookup_init_at(&newnd, &newfp, uap->newfd, uap->new,
3921 		    UIO_USERSPACE, 0);
3922 		if (error == 0)
3923 			error = kern_rename(&oldnd, &newnd);
3924 		nlookup_done_at(&newnd, newfp);
3925 	}
3926 	nlookup_done_at(&oldnd, oldfp);
3927 	rel_mplock();
3928 	return (error);
3929 }
3930 
/*
 * kern_mkdir() - worker for the mkdir-family syscalls.
 *
 * Creates a directory at the path described by nd with the given mode
 * (masked by the process umask).  The caller is responsible for
 * nlookup_done() on nd regardless of the return value.
 */
int
kern_mkdir(struct nlookupdata *nd, int mode)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct vattr vattr;
	int error;

	bwillinode(1);
	/* resolve as a to-be-created directory, hold the parent dvp */
	nd->nl_flags |= NLC_WILLBEDIR | NLC_CREATE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);

	/* a resolved vnode means the name already exists */
	if (nd->nl_nch.ncp->nc_vp)
		return (EEXIST);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	VATTR_NULL(&vattr);
	vattr.va_type = VDIR;
	vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_fd->fd_cmask;

	vp = NULL;
	error = VOP_NMKDIR(&nd->nl_nch, nd->nl_dvp, &vp, td->td_ucred, &vattr);
	/* VOP_NMKDIR returns the new directory locked+ref'd on success */
	if (error == 0)
		vput(vp);
	return (error);
}
3959 
3960 /*
3961  * mkdir_args(char *path, int mode)
3962  *
3963  * Make a directory file.
3964  *
3965  * MPALMOSTSAFE
3966  */
3967 int
3968 sys_mkdir(struct mkdir_args *uap)
3969 {
3970 	struct nlookupdata nd;
3971 	int error;
3972 
3973 	get_mplock();
3974 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
3975 	if (error == 0)
3976 		error = kern_mkdir(&nd, uap->mode);
3977 	nlookup_done(&nd);
3978 	rel_mplock();
3979 	return (error);
3980 }
3981 
3982 /*
3983  * mkdirat_args(int fd, char *path, mode_t mode)
3984  *
3985  * Make a directory file.  The path is relative to the directory associated
3986  * with fd.
3987  *
3988  * MPALMOSTSAFE
3989  */
3990 int
3991 sys_mkdirat(struct mkdirat_args *uap)
3992 {
3993 	struct nlookupdata nd;
3994 	struct file *fp;
3995 	int error;
3996 
3997 	get_mplock();
3998 	error = nlookup_init_at(&nd, &fp, uap->fd, uap->path, UIO_USERSPACE, 0);
3999 	if (error == 0)
4000 		error = kern_mkdir(&nd, uap->mode);
4001 	nlookup_done_at(&nd, fp);
4002 	rel_mplock();
4003 	return (error);
4004 }
4005 
/*
 * kern_rmdir() - worker for the rmdir syscall.
 *
 * Removes the directory described by nd.  The caller is responsible
 * for nlookup_done() on nd regardless of the return value.
 */
int
kern_rmdir(struct nlookupdata *nd)
{
	int error;

	bwillinode(1);
	nd->nl_flags |= NLC_DELETE | NLC_REFDVP;
	if ((error = nlookup(nd)) != 0)
		return (error);

	/*
	 * Do not allow directories representing mount points to be
	 * deleted, even if empty.  Check write perms on mount point
	 * in case the vnode is aliased (aka nullfs).
	 */
	if (nd->nl_nch.ncp->nc_flag & (NCF_ISMOUNTPT))
		return (EINVAL);
	if ((error = ncp_writechk(&nd->nl_nch)) != 0)
		return (error);
	error = VOP_NRMDIR(&nd->nl_nch, nd->nl_dvp, nd->nl_cred);
	return (error);
}
4028 
4029 /*
4030  * rmdir_args(char *path)
4031  *
4032  * Remove a directory file.
4033  *
4034  * MPALMOSTSAFE
4035  */
4036 int
4037 sys_rmdir(struct rmdir_args *uap)
4038 {
4039 	struct nlookupdata nd;
4040 	int error;
4041 
4042 	get_mplock();
4043 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, 0);
4044 	if (error == 0)
4045 		error = kern_rmdir(&nd);
4046 	nlookup_done(&nd);
4047 	rel_mplock();
4048 	return (error);
4049 }
4050 
/*
 * kern_getdirentries() - worker for getdirentries()/getdents().
 *
 * Reads up to count bytes of directory entries from descriptor fd into
 * buf (in the address space indicated by direction).  On success *res
 * receives the number of bytes transferred and, if basep is non-NULL,
 * *basep receives the seek offset the read started at.
 */
int
kern_getdirentries(int fd, char *buf, u_int count, long *basep, int *res,
    enum uio_seg direction)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct vnode *vp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	off_t loff;
	int error, eofflag;

	if ((error = holdvnode(p->p_fd, fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		error = EBADF;
		goto done;
	}
	vp = (struct vnode *)fp->f_data;
unionread:
	if (vp->v_type != VDIR) {
		error = EINVAL;
		goto done;
	}
	/* build a single-iovec uio describing the caller's buffer */
	aiov.iov_base = buf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = direction;
	auio.uio_td = td;
	auio.uio_resid = count;
	/* remember the starting offset for *basep before the read advances it */
	loff = auio.uio_offset = fp->f_offset;
	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
	fp->f_offset = auio.uio_offset;
	if (error)
		goto done;
	/*
	 * Nothing was transferred: if this is a union mount, retry the
	 * read against the lower layer (union_dircheckp returns -1 to
	 * request a retry with vp replaced).
	 */
	if (count == auio.uio_resid) {
		if (union_dircheckp) {
			error = union_dircheckp(td, &vp, fp);
			if (error == -1)
				goto unionread;
			if (error)
				goto done;
		}
#if 0
		if ((vp->v_flag & VROOT) &&
		    (vp->v_mount->mnt_flag & MNT_UNION)) {
			struct vnode *tvp = vp;
			vp = vp->v_mount->mnt_vnodecovered;
			vref(vp);
			fp->f_data = vp;
			fp->f_offset = 0;
			vrele(tvp);
			goto unionread;
		}
#endif
	}

	/*
	 * WARNING!  *basep may not be wide enough to accomodate the
	 * seek offset.   XXX should we hack this to return the upper 32 bits
	 * for offsets greater then 4G?
	 */
	if (basep) {
		*basep = (long)loff;
	}
	*res = count - auio.uio_resid;
done:
	fdrop(fp);
	return (error);
}
4124 
4125 /*
 * getdirentries_args(int fd, char *buf, u_int count, long *basep)
4127  *
4128  * Read a block of directory entries in a file system independent format.
4129  *
4130  * MPALMOSTSAFE
4131  */
4132 int
4133 sys_getdirentries(struct getdirentries_args *uap)
4134 {
4135 	long base;
4136 	int error;
4137 
4138 	get_mplock();
4139 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, &base,
4140 				   &uap->sysmsg_result, UIO_USERSPACE);
4141 	rel_mplock();
4142 
4143 	if (error == 0 && uap->basep)
4144 		error = copyout(&base, uap->basep, sizeof(*uap->basep));
4145 	return (error);
4146 }
4147 
4148 /*
4149  * getdents_args(int fd, char *buf, size_t count)
4150  *
4151  * MPALMOSTSAFE
4152  */
4153 int
4154 sys_getdents(struct getdents_args *uap)
4155 {
4156 	int error;
4157 
4158 	get_mplock();
4159 	error = kern_getdirentries(uap->fd, uap->buf, uap->count, NULL,
4160 				   &uap->sysmsg_result, UIO_USERSPACE);
4161 	rel_mplock();
4162 
4163 	return (error);
4164 }
4165 
4166 /*
4167  * Set the mode mask for creation of filesystem nodes.
4168  *
4169  * umask(int newmask)
4170  *
4171  * MPSAFE
4172  */
4173 int
4174 sys_umask(struct umask_args *uap)
4175 {
4176 	struct thread *td = curthread;
4177 	struct proc *p = td->td_proc;
4178 	struct filedesc *fdp;
4179 
4180 	fdp = p->p_fd;
4181 	uap->sysmsg_result = fdp->fd_cmask;
4182 	fdp->fd_cmask = uap->newmask & ALLPERMS;
4183 	return (0);
4184 }
4185 
4186 /*
4187  * revoke(char *path)
4188  *
4189  * Void all references to file by ripping underlying filesystem
4190  * away from vnode.
4191  *
4192  * MPALMOSTSAFE
4193  */
4194 int
4195 sys_revoke(struct revoke_args *uap)
4196 {
4197 	struct nlookupdata nd;
4198 	struct vattr vattr;
4199 	struct vnode *vp;
4200 	struct ucred *cred;
4201 	int error;
4202 
4203 	vp = NULL;
4204 	get_mplock();
4205 	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4206 	if (error == 0)
4207 		error = nlookup(&nd);
4208 	if (error == 0)
4209 		error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
4210 	cred = crhold(nd.nl_cred);
4211 	nlookup_done(&nd);
4212 	if (error == 0) {
4213 		if (error == 0)
4214 			error = VOP_GETATTR(vp, &vattr);
4215 		if (error == 0 && cred->cr_uid != vattr.va_uid)
4216 			error = priv_check_cred(cred, PRIV_VFS_REVOKE, 0);
4217 		if (error == 0 && (vp->v_type == VCHR || vp->v_type == VBLK)) {
4218 			if (vcount(vp) > 0)
4219 				error = vrevoke(vp, cred);
4220 		} else if (error == 0) {
4221 			error = vrevoke(vp, cred);
4222 		}
4223 		vrele(vp);
4224 	}
4225 	if (cred)
4226 		crfree(cred);
4227 	rel_mplock();
4228 	return (error);
4229 }
4230 
4231 /*
4232  * getfh_args(char *fname, fhandle_t *fhp)
4233  *
4234  * Get (NFS) file handle
4235  *
4236  * NOTE: We use the fsid of the covering mount, even if it is a nullfs
4237  * mount.  This allows nullfs mounts to be explicitly exported.
4238  *
4239  * WARNING: nullfs mounts of HAMMER PFS ROOTs are safe.
4240  *
4241  * 	    nullfs mounts of subdirectories are not safe.  That is, it will
4242  *	    work, but you do not really have protection against access to
4243  *	    the related parent directories.
4244  *
4245  * MPALMOSTSAFE
4246  */
int
sys_getfh(struct getfh_args *uap)
{
	struct thread *td = curthread;
	struct nlookupdata nd;
	fhandle_t fh;
	struct vnode *vp;
	struct mount *mp;
	int error;

	/*
	 * Must be super user
	 */
	if ((error = priv_check(td, PRIV_ROOT)) != 0)
		return (error);

	vp = NULL;
	get_mplock();
	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	/*
	 * Capture the covering mount before tearing down the lookup; the
	 * held+locked vp keeps the mount from going away (see the block
	 * comment above about nullfs mounts).
	 */
	mp = nd.nl_nch.mount;
	nlookup_done(&nd);
	if (error == 0) {
		/* fsid from the covering mount + fs-specific fid from the vnode */
		bzero(&fh, sizeof(fh));
		fh.fh_fsid = mp->mnt_stat.f_fsid;
		error = VFS_VPTOFH(vp, &fh.fh_fid);
		vput(vp);
		if (error == 0)
			error = copyout(&fh, uap->fhp, sizeof(fh));
	}
	rel_mplock();
	return (error);
}
4283 
4284 /*
4285  * fhopen_args(const struct fhandle *u_fhp, int flags)
4286  *
4287  * syscall for the rpc.lockd to use to translate a NFS file handle into
4288  * an open descriptor.
4289  *
4290  * warning: do not remove the priv_check() call or this becomes one giant
4291  * security hole.
4292  *
4293  * MPALMOSTSAFE
4294  */
int
sys_fhopen(struct fhopen_args *uap)
{
	struct thread *td = curthread;
	struct filedesc *fdp = td->td_proc->p_fd;
	struct mount *mp;
	struct vnode *vp;
	struct fhandle fhp;
	struct vattr vat;
	struct vattr *vap = &vat;
	struct flock lf;
	int fmode, mode, error, type;
	struct file *nfp;
	struct file *fp;
	int indx;

	/*
	 * Must be super user
	 */
	error = priv_check(td, PRIV_ROOT);
	if (error)
		return (error);

	fmode = FFLAGS(uap->flags);

	/*
	 * Why not allow a non-read/write open for our lockd?
	 */
	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
		return (EINVAL);
	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
	if (error)
		return(error);

	/*
	 * Find the mount point
	 */
	get_mplock();
	mp = vfs_getvfs(&fhp.fh_fsid);
	if (mp == NULL) {
		error = ESTALE;
		goto  done;
	}
	/* now give me my vnode, it gets returned to me locked */
	error = VFS_FHTOVP(mp, NULL, &fhp.fh_fid, &vp);
	if (error)
		goto done;
 	/*
	 * from now on we have to make sure not
	 * to forget about the vnode
	 * any error that causes an abort must vput(vp)
	 * just set error = err and 'goto bad;'.
	 */

	/*
	 * from vn_open
	 */
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	/* accumulate the VOP_ACCESS mode from the requested open flags */
	mode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		error = vn_writechk(vp, NULL);
		if (error)
			goto bad;
		mode |= VWRITE;
	}
	if (fmode & FREAD)
		mode |= VREAD;
	if (mode) {
		error = VOP_ACCESS(vp, mode, td->td_ucred);
		if (error)
			goto bad;
	}
	if (fmode & O_TRUNC) {
		vn_unlock(vp);				/* XXX */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
		VATTR_NULL(vap);
		vap->va_size = 0;
		error = VOP_SETATTR(vp, vap, td->td_ucred);
		if (error)
			goto bad;
	}

	/*
	 * VOP_OPEN needs the file pointer so it can potentially override
	 * it.
	 *
	 * WARNING! no f_nchandle will be associated when fhopen()ing a
	 * directory.  XXX
	 */
	if ((error = falloc(td->td_lwp, &nfp, &indx)) != 0)
		goto bad;
	fp = nfp;

	error = VOP_OPEN(vp, fmode, td->td_ucred, fp);
	if (error) {
		/*
		 * setting f_ops this way prevents VOP_CLOSE from being
		 * called or fdrop() releasing the vp from v_data.   Since
		 * the VOP_OPEN failed we don't want to VOP_CLOSE.
		 */
		fp->f_ops = &badfileops;
		fp->f_data = NULL;
		goto bad_drop;
	}

	/*
	 * The fp is given its own reference, we still have our ref and lock.
	 *
	 * Assert that all regular files must be created with a VM object.
	 *
	 * NOTE(review): this path branches to bad_drop without assigning
	 * error, which at this point is still 0 — the syscall would report
	 * success despite having torn down the descriptor.  Looks like it
	 * should set an error (e.g. EINVAL) first; confirm against later
	 * revisions before changing.
	 */
	if (vp->v_type == VREG && vp->v_object == NULL) {
		kprintf("fhopen: regular file did not have VM object: %p\n", vp);
		goto bad_drop;
	}

	/*
	 * The open was successful.  Handle any locking requirements.
	 */
	if (fmode & (O_EXLOCK | O_SHLOCK)) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		if (fmode & O_EXLOCK)
			lf.l_type = F_WRLCK;
		else
			lf.l_type = F_RDLCK;
		if (fmode & FNONBLOCK)
			type = 0;
		else
			type = F_WAIT;
		vn_unlock(vp);
		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) {
			/*
			 * release our private reference.
			 */
			fsetfd(fdp, NULL, indx);
			fdrop(fp);
			vrele(vp);
			goto done;
		}
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		fp->f_flag |= FHASLOCK;
	}

	/*
	 * Clean up.  Associate the file pointer with the previously
	 * reserved descriptor and return it.
	 */
	vput(vp);
	rel_mplock();
	fsetfd(fdp, fp, indx);
	fdrop(fp);
	uap->sysmsg_result = indx;
	return (0);

bad_drop:
	fsetfd(fdp, NULL, indx);
	fdrop(fp);
bad:
	vput(vp);
done:
	rel_mplock();
	return (error);
}
4470 
4471 /*
4472  * fhstat_args(struct fhandle *u_fhp, struct stat *sb)
4473  *
4474  * MPALMOSTSAFE
4475  */
int
sys_fhstat(struct fhstat_args *uap)
{
	struct thread *td = curthread;
	struct stat sb;
	fhandle_t fh;
	struct mount *mp;
	struct vnode *vp;
	int error;

	/*
	 * Must be super user
	 */
	error = priv_check(td, PRIV_ROOT);
	if (error)
		return (error);

	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
	if (error)
		return (error);

	get_mplock();
	/* map the handle's fsid back to a mount, then to a locked vnode */
	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
		error = ESTALE;
	if (error == 0) {
		if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) == 0) {
			error = vn_stat(vp, &sb, td->td_ucred);
			vput(vp);
		}
	}
	rel_mplock();
	/* copy the stat result out only after dropping the MP lock */
	if (error == 0)
		error = copyout(&sb, uap->sb, sizeof(sb));
	return (error);
}
4511 
4512 /*
4513  * fhstatfs_args(struct fhandle *u_fhp, struct statfs *buf)
4514  *
4515  * MPALMOSTSAFE
4516  */
int
sys_fhstatfs(struct fhstatfs_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct statfs *sp;
	struct mount *mp;
	struct vnode *vp;
	struct statfs sb;
	char *fullpath, *freepath;
	fhandle_t fh;
	int error;

	/*
	 * Must be super user
	 */
	if ((error = priv_check(td, PRIV_ROOT)))
		return (error);

	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
		return (error);

	get_mplock();

	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
		error = ESTALE;
		goto done;
	}
	/* chrooted processes may not see mounts outside their root */
	if (p != NULL && !chroot_visible_mnt(mp, p)) {
		error = ESTALE;
		goto done;
	}

	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)) != 0)
		goto done;
	mp = vp->v_mount;
	sp = &mp->mnt_stat;
	vput(vp);
	if ((error = VFS_STATFS(mp, sp, td->td_ucred)) != 0)
		goto done;

	/* report the mount point path as seen from the process's root */
	error = mount_path(p, mp, &fullpath, &freepath);
	if (error)
		goto done;
	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
	strlcpy(sp->f_mntonname, fullpath, sizeof(sp->f_mntonname));
	kfree(freepath, M_TEMP);

	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	/* hide the filesystem id from non-root callers */
	if (priv_check(td, PRIV_ROOT)) {
		bcopy(sp, &sb, sizeof(sb));
		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
		sp = &sb;
	}
	error = copyout(sp, uap->buf, sizeof(*sp));
done:
	rel_mplock();
	return (error);
}
4576 
4577 /*
4578  * fhstatvfs_args(struct fhandle *u_fhp, struct statvfs *buf)
4579  *
4580  * MPALMOSTSAFE
4581  */
int
sys_fhstatvfs(struct fhstatvfs_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct statvfs *sp;
	struct mount *mp;
	struct vnode *vp;
	fhandle_t fh;
	int error;

	/*
	 * Must be super user
	 */
	if ((error = priv_check(td, PRIV_ROOT)))
		return (error);

	if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
		return (error);

	get_mplock();

	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
		error = ESTALE;
		goto done;
	}
	/* chrooted processes may not see mounts outside their root */
	if (p != NULL && !chroot_visible_mnt(mp, p)) {
		error = ESTALE;
		goto done;
	}

	if ((error = VFS_FHTOVP(mp, NULL, &fh.fh_fid, &vp)))
		goto done;
	mp = vp->v_mount;
	sp = &mp->mnt_vstat;
	vput(vp);
	if ((error = VFS_STATVFS(mp, sp, td->td_ucred)) != 0)
		goto done;

	/* translate mount flags into the statvfs f_flag representation */
	sp->f_flag = 0;
	if (mp->mnt_flag & MNT_RDONLY)
		sp->f_flag |= ST_RDONLY;
	if (mp->mnt_flag & MNT_NOSUID)
		sp->f_flag |= ST_NOSUID;
	error = copyout(sp, uap->buf, sizeof(*sp));
done:
	rel_mplock();
	return (error);
}
4631 
4632 
4633 /*
4634  * Syscall to push extended attribute configuration information into the
4635  * VFS.  Accepts a path, which it converts to a mountpoint, as well as
4636  * a command (int cmd), and attribute name and misc data.  For now, the
4637  * attribute name is left in userspace for consumption by the VFS_op.
4638  * It will probably be changed to be copied into sysspace by the
4639  * syscall in the future, once issues with various consumers of the
4640  * attribute code have raised their hands.
4641  *
4642  * Currently this is used only by UFS Extended Attributes.
4643  *
4644  * MPALMOSTSAFE
4645  */
4646 int
4647 sys_extattrctl(struct extattrctl_args *uap)
4648 {
4649 	struct nlookupdata nd;
4650 	struct vnode *vp;
4651 	char attrname[EXTATTR_MAXNAMELEN];
4652 	int error;
4653 	size_t size;
4654 
4655 	get_mplock();
4656 
4657 	attrname[0] = 0;
4658 	vp = NULL;
4659 	error = 0;
4660 
4661 	if (error == 0 && uap->filename) {
4662 		error = nlookup_init(&nd, uap->filename, UIO_USERSPACE,
4663 				     NLC_FOLLOW);
4664 		if (error == 0)
4665 			error = nlookup(&nd);
4666 		if (error == 0)
4667 			error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp);
4668 		nlookup_done(&nd);
4669 	}
4670 
4671 	if (error == 0 && uap->attrname) {
4672 		error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
4673 				  &size);
4674 	}
4675 
4676 	if (error == 0) {
4677 		error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
4678 		if (error == 0)
4679 			error = nlookup(&nd);
4680 		if (error == 0)
4681 			error = ncp_writechk(&nd.nl_nch);
4682 		if (error == 0) {
4683 			error = VFS_EXTATTRCTL(nd.nl_nch.mount, uap->cmd, vp,
4684 					       uap->attrnamespace,
4685 					       uap->attrname, nd.nl_cred);
4686 		}
4687 		nlookup_done(&nd);
4688 	}
4689 
4690 	rel_mplock();
4691 
4692 	return (error);
4693 }
4694 
4695 /*
 * Syscall to set a named extended attribute on a file or directory.
4697  *
4698  * MPALMOSTSAFE
4699  */
int
sys_extattr_set_file(struct extattr_set_file_args *uap)
{
	char attrname[EXTATTR_MAXNAMELEN];
	struct nlookupdata nd;
	struct vnode *vp;
	struct uio auio;
	struct iovec aiov;
	int error;

	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
	if (error)
		return (error);

	vp = NULL;
	get_mplock();

	/* resolve the path, check writability, get a locked vnode */
	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = ncp_writechk(&nd.nl_nch);
	if (error == 0)
		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	if (error) {
		nlookup_done(&nd);
		rel_mplock();
		return (error);
	}

	/* describe the userland attribute data as a single-iovec write uio */
	bzero(&auio, sizeof(auio));
	aiov.iov_base = uap->data;
	aiov.iov_len = uap->nbytes;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = uap->nbytes;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = curthread;

	error = VOP_SETEXTATTR(vp, uap->attrnamespace, attrname,
			       &auio, nd.nl_cred);

	vput(vp);
	nlookup_done(&nd);
	rel_mplock();
	return (error);
}
4748 
4749 /*
4750  * Syscall to get a named extended attribute on a file or directory.
4751  *
4752  * MPALMOSTSAFE
4753  */
int
sys_extattr_get_file(struct extattr_get_file_args *uap)
{
	char attrname[EXTATTR_MAXNAMELEN];
	struct nlookupdata nd;
	struct uio auio;
	struct iovec aiov;
	struct vnode *vp;
	int error;

	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
	if (error)
		return (error);

	vp = NULL;
	get_mplock();

	/* resolve the path and get a locked vnode (no write check needed) */
	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
	if (error) {
		nlookup_done(&nd);
		rel_mplock();
		return (error);
	}

	/* describe the userland buffer as a single-iovec read uio */
	bzero(&auio, sizeof(auio));
	aiov.iov_base = uap->data;
	aiov.iov_len = uap->nbytes;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = uap->nbytes;
	auio.uio_rw = UIO_READ;
	auio.uio_td = curthread;

	error = VOP_GETEXTATTR(vp, uap->attrnamespace, attrname,
				&auio, nd.nl_cred);
	/* syscall result is the number of bytes actually transferred */
	uap->sysmsg_result = uap->nbytes - auio.uio_resid;

	vput(vp);
	nlookup_done(&nd);
	rel_mplock();
	return(error);
}
4801 
4802 /*
4803  * Syscall to delete a named extended attribute from a file or directory.
4804  * Accepts attribute name.  The real work happens in VOP_SETEXTATTR().
4805  *
4806  * MPALMOSTSAFE
4807  */
int
sys_extattr_delete_file(struct extattr_delete_file_args *uap)
{
	char attrname[EXTATTR_MAXNAMELEN];
	struct nlookupdata nd;
	struct vnode *vp;
	int error;

	error = copyin(uap->attrname, attrname, EXTATTR_MAXNAMELEN);
	if (error)
		return(error);

	get_mplock();
	error = nlookup_init(&nd, uap->path, UIO_USERSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = ncp_writechk(&nd.nl_nch);
	if (error == 0) {
		error = cache_vget(&nd.nl_nch, nd.nl_cred, LK_EXCLUSIVE, &vp);
		if (error == 0) {
			/* a NULL uio asks VOP_SETEXTATTR to delete the attr */
			error = VOP_SETEXTATTR(vp, uap->attrnamespace,
					       attrname, NULL, nd.nl_cred);
			vput(vp);
		}
	}
	nlookup_done(&nd);
	rel_mplock();
	return(error);
}
4838 
4839 /*
4840  * Determine if the mount is visible to the process.
4841  */
/*
 * Returns non-zero if mount mp is visible to process p (i.e. reachable
 * from the process's root directory in the namecache topology).
 */
static int
chroot_visible_mnt(struct mount *mp, struct proc *p)
{
	struct nchandle nch;

	/*
	 * Traverse from the mount point upwards.  If we hit the process
	 * root then the mount point is visible to the process.
	 */
	nch = mp->mnt_ncmountpt;
	while (nch.ncp) {
		if (nch.mount == p->p_fd->fd_nrdir.mount &&
		    nch.ncp == p->p_fd->fd_nrdir.ncp) {
			return(1);
		}
		/* at a mount root, cross onto the underlying mount point */
		if (nch.ncp == nch.mount->mnt_ncmountpt.ncp) {
			nch = nch.mount->mnt_ncmounton;
		} else {
			nch.ncp = nch.ncp->nc_parent;
		}
	}

	/*
	 * If the mount point is not visible to the process, but the
	 * process root is in a subdirectory of the mount, return
	 * TRUE anyway.
	 */
	if (p->p_fd->fd_nrdir.mount == mp)
		return(1);

	return(0);
}
4874 
4875