xref: /freebsd/sys/kern/vfs_syscalls.c (revision 78ae60b4)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 #include <sys/cdefs.h>
38 #include "opt_capsicum.h"
39 #include "opt_ktrace.h"
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #ifdef COMPAT_FREEBSD11
44 #include <sys/abi_compat.h>
45 #endif
46 #include <sys/bio.h>
47 #include <sys/buf.h>
48 #include <sys/capsicum.h>
49 #include <sys/disk.h>
50 #include <sys/malloc.h>
51 #include <sys/mount.h>
52 #include <sys/mutex.h>
53 #include <sys/sysproto.h>
54 #include <sys/namei.h>
55 #include <sys/filedesc.h>
56 #include <sys/kernel.h>
57 #include <sys/fcntl.h>
58 #include <sys/file.h>
59 #include <sys/filio.h>
60 #include <sys/limits.h>
61 #include <sys/linker.h>
62 #include <sys/rwlock.h>
63 #include <sys/sdt.h>
64 #include <sys/stat.h>
65 #include <sys/sx.h>
66 #include <sys/unistd.h>
67 #include <sys/vnode.h>
68 #include <sys/priv.h>
69 #include <sys/proc.h>
70 #include <sys/dirent.h>
71 #include <sys/jail.h>
72 #include <sys/syscallsubr.h>
73 #include <sys/sysctl.h>
74 #ifdef KTRACE
75 #include <sys/ktrace.h>
76 #endif
77 
78 #include <machine/stdarg.h>
79 
80 #include <security/audit/audit.h>
81 #include <security/mac/mac_framework.h>
82 
83 #include <vm/vm.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vnode_pager.h>
87 #include <vm/uma.h>
88 
89 #include <fs/devfs/devfs.h>
90 
91 MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
92 
93 static int kern_chflagsat(struct thread *td, int fd, const char *path,
94     enum uio_seg pathseg, u_long flags, int atflag);
95 static int setfflags(struct thread *td, struct vnode *, u_long);
96 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
97 static int getutimens(const struct timespec *, enum uio_seg,
98     struct timespec *, int *);
99 static int setutimes(struct thread *td, struct vnode *,
100     const struct timespec *, int, int);
101 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
102     struct thread *td);
103 static int kern_fhlinkat(struct thread *td, int fd, const char *path,
104     enum uio_seg pathseg, fhandle_t *fhp);
105 static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg,
106     size_t count, struct thread *td);
107 static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd,
108     const char *path, enum uio_seg segflag);
109 
110 uint64_t
111 at2cnpflags(u_int at_flags, u_int mask)
112 {
113 	uint64_t res;
114 
115 	MPASS((at_flags & (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)) !=
116 	    (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW));
117 
118 	res = 0;
119 	at_flags &= mask;
120 	if ((at_flags & AT_RESOLVE_BENEATH) != 0)
121 		res |= RBENEATH;
122 	if ((at_flags & AT_SYMLINK_FOLLOW) != 0)
123 		res |= FOLLOW;
124 	/* NOFOLLOW is pseudo flag */
125 	if ((mask & AT_SYMLINK_NOFOLLOW) != 0) {
126 		res |= (at_flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW :
127 		    FOLLOW;
128 	}
129 	if ((mask & AT_EMPTY_PATH) != 0 && (at_flags & AT_EMPTY_PATH) != 0)
130 		res |= EMPTYPATH;
131 	return (res);
132 }
133 
134 int
135 kern_sync(struct thread *td)
136 {
137 	struct mount *mp, *nmp;
138 	int save;
139 
140 	mtx_lock(&mountlist_mtx);
141 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
142 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
143 			nmp = TAILQ_NEXT(mp, mnt_list);
144 			continue;
145 		}
146 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
147 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
148 			save = curthread_pflags_set(TDP_SYNCIO);
149 			vfs_periodic(mp, MNT_NOWAIT);
150 			VFS_SYNC(mp, MNT_NOWAIT);
151 			curthread_pflags_restore(save);
152 			vn_finished_write(mp);
153 		}
154 		mtx_lock(&mountlist_mtx);
155 		nmp = TAILQ_NEXT(mp, mnt_list);
156 		vfs_unbusy(mp);
157 	}
158 	mtx_unlock(&mountlist_mtx);
159 	return (0);
160 }
161 
162 /*
163  * Sync each mounted filesystem.
164  */
165 #ifndef _SYS_SYSPROTO_H_
166 struct sync_args {
167 	int     dummy;
168 };
169 #endif
170 /* ARGSUSED */
int
sys_sync(struct thread *td, struct sync_args *uap)
{
	int error;

	/* 'uap' is not consulted; all the work is done by kern_sync(). */
	error = kern_sync(td);
	return (error);
}
177 
178 /*
179  * Change filesystem quotas.
180  */
181 #ifndef _SYS_SYSPROTO_H_
182 struct quotactl_args {
183 	char *path;
184 	int cmd;
185 	int uid;
186 	caddr_t arg;
187 };
188 #endif
189 int
190 sys_quotactl(struct thread *td, struct quotactl_args *uap)
191 {
192 	struct mount *mp;
193 	struct nameidata nd;
194 	int error;
195 	bool mp_busy;
196 
197 	AUDIT_ARG_CMD(uap->cmd);
198 	AUDIT_ARG_UID(uap->uid);
199 	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
200 		return (EPERM);
201 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
202 	    uap->path);
203 	if ((error = namei(&nd)) != 0)
204 		return (error);
205 	NDFREE_PNBUF(&nd);
206 	mp = nd.ni_vp->v_mount;
207 	vfs_ref(mp);
208 	vput(nd.ni_vp);
209 	error = vfs_busy(mp, 0);
210 	if (error != 0) {
211 		vfs_rel(mp);
212 		return (error);
213 	}
214 	mp_busy = true;
215 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, &mp_busy);
216 
217 	/*
218 	 * Since quota on/off operations typically need to open quota
219 	 * files, the implementation may need to unbusy the mount point
220 	 * before calling into namei.  Otherwise, unmount might be
221 	 * started between two vfs_busy() invocations (first is ours,
222 	 * second is from mount point cross-walk code in lookup()),
223 	 * causing deadlock.
224 	 *
225 	 * Avoid unbusying mp if the implementation indicates it has
226 	 * already done so.
227 	 */
228 	if (mp_busy)
229 		vfs_unbusy(mp);
230 	vfs_rel(mp);
231 	return (error);
232 }
233 
234 /*
235  * Used by statfs conversion routines to scale the block size up if
236  * necessary so that all of the block counts are <= 'max_size'.  Note
237  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
238  * value of 'n'.
239  */
240 void
241 statfs_scale_blocks(struct statfs *sf, long max_size)
242 {
243 	uint64_t count;
244 	int shift;
245 
246 	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
247 
248 	/*
249 	 * Attempt to scale the block counts to give a more accurate
250 	 * overview to userland of the ratio of free space to used
251 	 * space.  To do this, find the largest block count and compute
252 	 * a divisor that lets it fit into a signed integer <= max_size.
253 	 */
254 	if (sf->f_bavail < 0)
255 		count = -sf->f_bavail;
256 	else
257 		count = sf->f_bavail;
258 	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
259 	if (count <= max_size)
260 		return;
261 
262 	count >>= flsl(max_size);
263 	shift = 0;
264 	while (count > 0) {
265 		shift++;
266 		count >>=1;
267 	}
268 
269 	sf->f_bsize <<= shift;
270 	sf->f_blocks >>= shift;
271 	sf->f_bfree >>= shift;
272 	sf->f_bavail >>= shift;
273 }
274 
275 static int
276 kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
277 {
278 	int error;
279 
280 	if (mp == NULL)
281 		return (EBADF);
282 	error = vfs_busy(mp, 0);
283 	vfs_rel(mp);
284 	if (error != 0)
285 		return (error);
286 #ifdef MAC
287 	error = mac_mount_check_stat(td->td_ucred, mp);
288 	if (error != 0)
289 		goto out;
290 #endif
291 	error = VFS_STATFS(mp, buf);
292 	if (error != 0)
293 		goto out;
294 	if (priv_check_cred_vfs_generation(td->td_ucred)) {
295 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
296 		prison_enforce_statfs(td->td_ucred, mp, buf);
297 	}
298 out:
299 	vfs_unbusy(mp);
300 	return (error);
301 }
302 
303 /*
304  * Get filesystem statistics.
305  */
306 #ifndef _SYS_SYSPROTO_H_
307 struct statfs_args {
308 	char *path;
309 	struct statfs *buf;
310 };
311 #endif
312 int
313 sys_statfs(struct thread *td, struct statfs_args *uap)
314 {
315 	struct statfs *sfp;
316 	int error;
317 
318 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
319 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
320 	if (error == 0)
321 		error = copyout(sfp, uap->buf, sizeof(struct statfs));
322 	free(sfp, M_STATFS);
323 	return (error);
324 }
325 
326 int
327 kern_statfs(struct thread *td, const char *path, enum uio_seg pathseg,
328     struct statfs *buf)
329 {
330 	struct mount *mp;
331 	struct nameidata nd;
332 	int error;
333 
334 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path);
335 	error = namei(&nd);
336 	if (error != 0)
337 		return (error);
338 	NDFREE_PNBUF(&nd);
339 	mp = vfs_ref_from_vp(nd.ni_vp);
340 	vrele(nd.ni_vp);
341 	return (kern_do_statfs(td, mp, buf));
342 }
343 
344 /*
345  * Get filesystem statistics.
346  */
347 #ifndef _SYS_SYSPROTO_H_
348 struct fstatfs_args {
349 	int fd;
350 	struct statfs *buf;
351 };
352 #endif
353 int
354 sys_fstatfs(struct thread *td, struct fstatfs_args *uap)
355 {
356 	struct statfs *sfp;
357 	int error;
358 
359 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
360 	error = kern_fstatfs(td, uap->fd, sfp);
361 	if (error == 0)
362 		error = copyout(sfp, uap->buf, sizeof(struct statfs));
363 	free(sfp, M_STATFS);
364 	return (error);
365 }
366 
367 int
368 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
369 {
370 	struct file *fp;
371 	struct mount *mp;
372 	struct vnode *vp;
373 	int error;
374 
375 	AUDIT_ARG_FD(fd);
376 	error = getvnode_path(td, fd, &cap_fstatfs_rights, &fp);
377 	if (error != 0)
378 		return (error);
379 	vp = fp->f_vnode;
380 #ifdef AUDIT
381 	if (AUDITING_TD(td)) {
382 		vn_lock(vp, LK_SHARED | LK_RETRY);
383 		AUDIT_ARG_VNODE1(vp);
384 		VOP_UNLOCK(vp);
385 	}
386 #endif
387 	mp = vfs_ref_from_vp(vp);
388 	fdrop(fp, td);
389 	return (kern_do_statfs(td, mp, buf));
390 }
391 
392 /*
393  * Get statistics on all filesystems.
394  */
395 #ifndef _SYS_SYSPROTO_H_
396 struct getfsstat_args {
397 	struct statfs *buf;
398 	long bufsize;
399 	int mode;
400 };
401 #endif
402 int
403 sys_getfsstat(struct thread *td, struct getfsstat_args *uap)
404 {
405 	size_t count;
406 	int error;
407 
408 	if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX)
409 		return (EINVAL);
410 	error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count,
411 	    UIO_USERSPACE, uap->mode);
412 	if (error == 0)
413 		td->td_retval[0] = count;
414 	return (error);
415 }
416 
417 /*
418  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
419  *	The caller is responsible for freeing memory which will be allocated
420  *	in '*buf'.
421  */
422 int
423 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
424     size_t *countp, enum uio_seg bufseg, int mode)
425 {
426 	struct mount *mp, *nmp;
427 	struct statfs *sfsp, *sp, *sptmp, *tofree;
428 	size_t count, maxcount;
429 	int error;
430 
431 	switch (mode) {
432 	case MNT_WAIT:
433 	case MNT_NOWAIT:
434 		break;
435 	default:
436 		if (bufseg == UIO_SYSSPACE)
437 			*buf = NULL;
438 		return (EINVAL);
439 	}
440 restart:
441 	maxcount = bufsize / sizeof(struct statfs);
442 	if (bufsize == 0) {
443 		sfsp = NULL;
444 		tofree = NULL;
445 	} else if (bufseg == UIO_USERSPACE) {
446 		sfsp = *buf;
447 		tofree = NULL;
448 	} else /* if (bufseg == UIO_SYSSPACE) */ {
449 		count = 0;
450 		mtx_lock(&mountlist_mtx);
451 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
452 			count++;
453 		}
454 		mtx_unlock(&mountlist_mtx);
455 		if (maxcount > count)
456 			maxcount = count;
457 		tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs),
458 		    M_STATFS, M_WAITOK);
459 	}
460 
461 	count = 0;
462 
463 	/*
464 	 * If there is no target buffer they only want the count.
465 	 *
466 	 * This could be TAILQ_FOREACH but it is open-coded to match the original
467 	 * code below.
468 	 */
469 	if (sfsp == NULL) {
470 		mtx_lock(&mountlist_mtx);
471 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
472 			if (prison_canseemount(td->td_ucred, mp) != 0) {
473 				nmp = TAILQ_NEXT(mp, mnt_list);
474 				continue;
475 			}
476 #ifdef MAC
477 			if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
478 				nmp = TAILQ_NEXT(mp, mnt_list);
479 				continue;
480 			}
481 #endif
482 			count++;
483 			nmp = TAILQ_NEXT(mp, mnt_list);
484 		}
485 		mtx_unlock(&mountlist_mtx);
486 		*countp = count;
487 		return (0);
488 	}
489 
490 	/*
491 	 * They want the entire thing.
492 	 *
493 	 * Short-circuit the corner case of no room for anything, avoids
494 	 * relocking below.
495 	 */
496 	if (maxcount < 1) {
497 		goto out;
498 	}
499 
500 	mtx_lock(&mountlist_mtx);
501 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
502 		if (prison_canseemount(td->td_ucred, mp) != 0) {
503 			nmp = TAILQ_NEXT(mp, mnt_list);
504 			continue;
505 		}
506 #ifdef MAC
507 		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
508 			nmp = TAILQ_NEXT(mp, mnt_list);
509 			continue;
510 		}
511 #endif
512 		if (mode == MNT_WAIT) {
513 			if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
514 				/*
515 				 * If vfs_busy() failed, and MBF_NOWAIT
516 				 * wasn't passed, then the mp is gone.
517 				 * Furthermore, because of MBF_MNTLSTLOCK,
518 				 * the mountlist_mtx was dropped.  We have
519 				 * no other choice than to start over.
520 				 */
521 				mtx_unlock(&mountlist_mtx);
522 				free(tofree, M_STATFS);
523 				goto restart;
524 			}
525 		} else {
526 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
527 				nmp = TAILQ_NEXT(mp, mnt_list);
528 				continue;
529 			}
530 		}
531 		sp = &mp->mnt_stat;
532 		/*
533 		 * If MNT_NOWAIT is specified, do not refresh
534 		 * the fsstat cache.
535 		 */
536 		if (mode != MNT_NOWAIT) {
537 			error = VFS_STATFS(mp, sp);
538 			if (error != 0) {
539 				mtx_lock(&mountlist_mtx);
540 				nmp = TAILQ_NEXT(mp, mnt_list);
541 				vfs_unbusy(mp);
542 				continue;
543 			}
544 		}
545 		if (priv_check_cred_vfs_generation(td->td_ucred)) {
546 			sptmp = malloc(sizeof(struct statfs), M_STATFS,
547 			    M_WAITOK);
548 			*sptmp = *sp;
549 			sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0;
550 			prison_enforce_statfs(td->td_ucred, mp, sptmp);
551 			sp = sptmp;
552 		} else
553 			sptmp = NULL;
554 		if (bufseg == UIO_SYSSPACE) {
555 			bcopy(sp, sfsp, sizeof(*sp));
556 			free(sptmp, M_STATFS);
557 		} else /* if (bufseg == UIO_USERSPACE) */ {
558 			error = copyout(sp, sfsp, sizeof(*sp));
559 			free(sptmp, M_STATFS);
560 			if (error != 0) {
561 				vfs_unbusy(mp);
562 				return (error);
563 			}
564 		}
565 		sfsp++;
566 		count++;
567 
568 		if (count == maxcount) {
569 			vfs_unbusy(mp);
570 			goto out;
571 		}
572 
573 		mtx_lock(&mountlist_mtx);
574 		nmp = TAILQ_NEXT(mp, mnt_list);
575 		vfs_unbusy(mp);
576 	}
577 	mtx_unlock(&mountlist_mtx);
578 out:
579 	*countp = count;
580 	return (0);
581 }
582 
583 #ifdef COMPAT_FREEBSD4
584 /*
585  * Get old format filesystem statistics.
586  */
587 static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *);
588 
589 #ifndef _SYS_SYSPROTO_H_
590 struct freebsd4_statfs_args {
591 	char *path;
592 	struct ostatfs *buf;
593 };
594 #endif
595 int
596 freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap)
597 {
598 	struct ostatfs osb;
599 	struct statfs *sfp;
600 	int error;
601 
602 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
603 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
604 	if (error == 0) {
605 		freebsd4_cvtstatfs(sfp, &osb);
606 		error = copyout(&osb, uap->buf, sizeof(osb));
607 	}
608 	free(sfp, M_STATFS);
609 	return (error);
610 }
611 
612 /*
613  * Get filesystem statistics.
614  */
615 #ifndef _SYS_SYSPROTO_H_
616 struct freebsd4_fstatfs_args {
617 	int fd;
618 	struct ostatfs *buf;
619 };
620 #endif
621 int
622 freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap)
623 {
624 	struct ostatfs osb;
625 	struct statfs *sfp;
626 	int error;
627 
628 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
629 	error = kern_fstatfs(td, uap->fd, sfp);
630 	if (error == 0) {
631 		freebsd4_cvtstatfs(sfp, &osb);
632 		error = copyout(&osb, uap->buf, sizeof(osb));
633 	}
634 	free(sfp, M_STATFS);
635 	return (error);
636 }
637 
638 /*
639  * Get statistics on all filesystems.
640  */
641 #ifndef _SYS_SYSPROTO_H_
642 struct freebsd4_getfsstat_args {
643 	struct ostatfs *buf;
644 	long bufsize;
645 	int mode;
646 };
647 #endif
648 int
649 freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap)
650 {
651 	struct statfs *buf, *sp;
652 	struct ostatfs osb;
653 	size_t count, size;
654 	int error;
655 
656 	if (uap->bufsize < 0)
657 		return (EINVAL);
658 	count = uap->bufsize / sizeof(struct ostatfs);
659 	if (count > SIZE_MAX / sizeof(struct statfs))
660 		return (EINVAL);
661 	size = count * sizeof(struct statfs);
662 	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
663 	    uap->mode);
664 	if (error == 0)
665 		td->td_retval[0] = count;
666 	if (size != 0) {
667 		sp = buf;
668 		while (count != 0 && error == 0) {
669 			freebsd4_cvtstatfs(sp, &osb);
670 			error = copyout(&osb, uap->buf, sizeof(osb));
671 			sp++;
672 			uap->buf++;
673 			count--;
674 		}
675 		free(buf, M_STATFS);
676 	}
677 	return (error);
678 }
679 
680 /*
681  * Implement fstatfs() for (NFS) file handles.
682  */
683 #ifndef _SYS_SYSPROTO_H_
684 struct freebsd4_fhstatfs_args {
685 	struct fhandle *u_fhp;
686 	struct ostatfs *buf;
687 };
688 #endif
689 int
690 freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap)
691 {
692 	struct ostatfs osb;
693 	struct statfs *sfp;
694 	fhandle_t fh;
695 	int error;
696 
697 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
698 	if (error != 0)
699 		return (error);
700 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
701 	error = kern_fhstatfs(td, fh, sfp);
702 	if (error == 0) {
703 		freebsd4_cvtstatfs(sfp, &osb);
704 		error = copyout(&osb, uap->buf, sizeof(osb));
705 	}
706 	free(sfp, M_STATFS);
707 	return (error);
708 }
709 
710 /*
711  * Convert a new format statfs structure to an old format statfs structure.
712  */
713 static void
714 freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp)
715 {
716 
717 	statfs_scale_blocks(nsp, LONG_MAX);
718 	bzero(osp, sizeof(*osp));
719 	osp->f_bsize = nsp->f_bsize;
720 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
721 	osp->f_blocks = nsp->f_blocks;
722 	osp->f_bfree = nsp->f_bfree;
723 	osp->f_bavail = nsp->f_bavail;
724 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
725 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
726 	osp->f_owner = nsp->f_owner;
727 	osp->f_type = nsp->f_type;
728 	osp->f_flags = nsp->f_flags;
729 	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
730 	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
731 	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
732 	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
733 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
734 	    MIN(MFSNAMELEN, OMFSNAMELEN));
735 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
736 	    MIN(MNAMELEN, OMNAMELEN));
737 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
738 	    MIN(MNAMELEN, OMNAMELEN));
739 	osp->f_fsid = nsp->f_fsid;
740 }
741 #endif /* COMPAT_FREEBSD4 */
742 
743 #if defined(COMPAT_FREEBSD11)
744 /*
745  * Get old format filesystem statistics.
746  */
747 static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *);
748 
749 int
750 freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap)
751 {
752 	struct freebsd11_statfs osb;
753 	struct statfs *sfp;
754 	int error;
755 
756 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
757 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
758 	if (error == 0) {
759 		freebsd11_cvtstatfs(sfp, &osb);
760 		error = copyout(&osb, uap->buf, sizeof(osb));
761 	}
762 	free(sfp, M_STATFS);
763 	return (error);
764 }
765 
766 /*
767  * Get filesystem statistics.
768  */
769 int
770 freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap)
771 {
772 	struct freebsd11_statfs osb;
773 	struct statfs *sfp;
774 	int error;
775 
776 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
777 	error = kern_fstatfs(td, uap->fd, sfp);
778 	if (error == 0) {
779 		freebsd11_cvtstatfs(sfp, &osb);
780 		error = copyout(&osb, uap->buf, sizeof(osb));
781 	}
782 	free(sfp, M_STATFS);
783 	return (error);
784 }
785 
786 /*
787  * Get statistics on all filesystems.
788  */
789 int
790 freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap)
791 {
792 	return (kern_freebsd11_getfsstat(td, uap->buf, uap->bufsize, uap->mode));
793 }
794 
795 int
796 kern_freebsd11_getfsstat(struct thread *td, struct freebsd11_statfs * ubuf,
797     long bufsize, int mode)
798 {
799 	struct freebsd11_statfs osb;
800 	struct statfs *buf, *sp;
801 	size_t count, size;
802 	int error;
803 
804 	if (bufsize < 0)
805 		return (EINVAL);
806 
807 	count = bufsize / sizeof(struct ostatfs);
808 	size = count * sizeof(struct statfs);
809 	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, mode);
810 	if (error == 0)
811 		td->td_retval[0] = count;
812 	if (size > 0) {
813 		sp = buf;
814 		while (count > 0 && error == 0) {
815 			freebsd11_cvtstatfs(sp, &osb);
816 			error = copyout(&osb, ubuf, sizeof(osb));
817 			sp++;
818 			ubuf++;
819 			count--;
820 		}
821 		free(buf, M_STATFS);
822 	}
823 	return (error);
824 }
825 
826 /*
827  * Implement fstatfs() for (NFS) file handles.
828  */
829 int
830 freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap)
831 {
832 	struct freebsd11_statfs osb;
833 	struct statfs *sfp;
834 	fhandle_t fh;
835 	int error;
836 
837 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
838 	if (error)
839 		return (error);
840 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
841 	error = kern_fhstatfs(td, fh, sfp);
842 	if (error == 0) {
843 		freebsd11_cvtstatfs(sfp, &osb);
844 		error = copyout(&osb, uap->buf, sizeof(osb));
845 	}
846 	free(sfp, M_STATFS);
847 	return (error);
848 }
849 
850 /*
851  * Convert a new format statfs structure to an old format statfs structure.
852  */
853 static void
854 freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp)
855 {
856 
857 	bzero(osp, sizeof(*osp));
858 	osp->f_version = FREEBSD11_STATFS_VERSION;
859 	osp->f_type = nsp->f_type;
860 	osp->f_flags = nsp->f_flags;
861 	osp->f_bsize = nsp->f_bsize;
862 	osp->f_iosize = nsp->f_iosize;
863 	osp->f_blocks = nsp->f_blocks;
864 	osp->f_bfree = nsp->f_bfree;
865 	osp->f_bavail = nsp->f_bavail;
866 	osp->f_files = nsp->f_files;
867 	osp->f_ffree = nsp->f_ffree;
868 	osp->f_syncwrites = nsp->f_syncwrites;
869 	osp->f_asyncwrites = nsp->f_asyncwrites;
870 	osp->f_syncreads = nsp->f_syncreads;
871 	osp->f_asyncreads = nsp->f_asyncreads;
872 	osp->f_namemax = nsp->f_namemax;
873 	osp->f_owner = nsp->f_owner;
874 	osp->f_fsid = nsp->f_fsid;
875 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
876 	    MIN(MFSNAMELEN, sizeof(osp->f_fstypename)));
877 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
878 	    MIN(MNAMELEN, sizeof(osp->f_mntonname)));
879 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
880 	    MIN(MNAMELEN, sizeof(osp->f_mntfromname)));
881 }
882 #endif /* COMPAT_FREEBSD11 */
883 
884 /*
885  * Change current working directory to a given file descriptor.
886  */
887 #ifndef _SYS_SYSPROTO_H_
888 struct fchdir_args {
889 	int	fd;
890 };
891 #endif
892 int
893 sys_fchdir(struct thread *td, struct fchdir_args *uap)
894 {
895 	struct vnode *vp, *tdp;
896 	struct mount *mp;
897 	struct file *fp;
898 	int error;
899 
900 	AUDIT_ARG_FD(uap->fd);
901 	error = getvnode_path(td, uap->fd, &cap_fchdir_rights,
902 	    &fp);
903 	if (error != 0)
904 		return (error);
905 	vp = fp->f_vnode;
906 	vrefact(vp);
907 	fdrop(fp, td);
908 	vn_lock(vp, LK_SHARED | LK_RETRY);
909 	AUDIT_ARG_VNODE1(vp);
910 	error = change_dir(vp, td);
911 	while (!error && (mp = vp->v_mountedhere) != NULL) {
912 		if (vfs_busy(mp, 0))
913 			continue;
914 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
915 		vfs_unbusy(mp);
916 		if (error != 0)
917 			break;
918 		vput(vp);
919 		vp = tdp;
920 	}
921 	if (error != 0) {
922 		vput(vp);
923 		return (error);
924 	}
925 	VOP_UNLOCK(vp);
926 	pwd_chdir(td, vp);
927 	return (0);
928 }
929 
930 /*
931  * Change current working directory (``.'').
932  */
933 #ifndef _SYS_SYSPROTO_H_
934 struct chdir_args {
935 	char	*path;
936 };
937 #endif
938 int
939 sys_chdir(struct thread *td, struct chdir_args *uap)
940 {
941 
942 	return (kern_chdir(td, uap->path, UIO_USERSPACE));
943 }
944 
945 int
946 kern_chdir(struct thread *td, const char *path, enum uio_seg pathseg)
947 {
948 	struct nameidata nd;
949 	int error;
950 
951 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
952 	    pathseg, path);
953 	if ((error = namei(&nd)) != 0)
954 		return (error);
955 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
956 		vput(nd.ni_vp);
957 		NDFREE_PNBUF(&nd);
958 		return (error);
959 	}
960 	VOP_UNLOCK(nd.ni_vp);
961 	NDFREE_PNBUF(&nd);
962 	pwd_chdir(td, nd.ni_vp);
963 	return (0);
964 }
965 
966 static int unprivileged_chroot = 0;
967 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_chroot, CTLFLAG_RW,
968     &unprivileged_chroot, 0,
969     "Unprivileged processes can use chroot(2)");
970 /*
971  * Change notion of root (``/'') directory.
972  */
973 #ifndef _SYS_SYSPROTO_H_
974 struct chroot_args {
975 	char	*path;
976 };
977 #endif
978 int
979 sys_chroot(struct thread *td, struct chroot_args *uap)
980 {
981 	struct nameidata nd;
982 	struct proc *p;
983 	int error;
984 
985 	error = priv_check(td, PRIV_VFS_CHROOT);
986 	if (error != 0) {
987 		p = td->td_proc;
988 		PROC_LOCK(p);
989 		if (unprivileged_chroot == 0 ||
990 		    (p->p_flag2 & P2_NO_NEW_PRIVS) == 0) {
991 			PROC_UNLOCK(p);
992 			return (error);
993 		}
994 		PROC_UNLOCK(p);
995 	}
996 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
997 	    UIO_USERSPACE, uap->path);
998 	error = namei(&nd);
999 	if (error != 0)
1000 		return (error);
1001 	NDFREE_PNBUF(&nd);
1002 	error = change_dir(nd.ni_vp, td);
1003 	if (error != 0)
1004 		goto e_vunlock;
1005 #ifdef MAC
1006 	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
1007 	if (error != 0)
1008 		goto e_vunlock;
1009 #endif
1010 	VOP_UNLOCK(nd.ni_vp);
1011 	error = pwd_chroot(td, nd.ni_vp);
1012 	vrele(nd.ni_vp);
1013 	return (error);
1014 e_vunlock:
1015 	vput(nd.ni_vp);
1016 	return (error);
1017 }
1018 
1019 /*
1020  * Common routine for chroot and chdir.  Callers must provide a locked vnode
1021  * instance.
1022  */
1023 int
1024 change_dir(struct vnode *vp, struct thread *td)
1025 {
1026 #ifdef MAC
1027 	int error;
1028 #endif
1029 
1030 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
1031 	if (vp->v_type != VDIR)
1032 		return (ENOTDIR);
1033 #ifdef MAC
1034 	error = mac_vnode_check_chdir(td->td_ucred, vp);
1035 	if (error != 0)
1036 		return (error);
1037 #endif
1038 	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
1039 }
1040 
1041 static __inline void
1042 flags_to_rights(int flags, cap_rights_t *rightsp)
1043 {
1044 	if (flags & O_EXEC) {
1045 		cap_rights_set_one(rightsp, CAP_FEXECVE);
1046 		if (flags & O_PATH)
1047 			return;
1048 	} else {
1049 		switch ((flags & O_ACCMODE)) {
1050 		case O_RDONLY:
1051 			cap_rights_set_one(rightsp, CAP_READ);
1052 			break;
1053 		case O_RDWR:
1054 			cap_rights_set_one(rightsp, CAP_READ);
1055 			/* FALLTHROUGH */
1056 		case O_WRONLY:
1057 			cap_rights_set_one(rightsp, CAP_WRITE);
1058 			if (!(flags & (O_APPEND | O_TRUNC)))
1059 				cap_rights_set_one(rightsp, CAP_SEEK);
1060 			break;
1061 		}
1062 	}
1063 
1064 	if (flags & O_CREAT)
1065 		cap_rights_set_one(rightsp, CAP_CREATE);
1066 
1067 	if (flags & O_TRUNC)
1068 		cap_rights_set_one(rightsp, CAP_FTRUNCATE);
1069 
1070 	if (flags & (O_SYNC | O_FSYNC))
1071 		cap_rights_set_one(rightsp, CAP_FSYNC);
1072 
1073 	if (flags & (O_EXLOCK | O_SHLOCK))
1074 		cap_rights_set_one(rightsp, CAP_FLOCK);
1075 }
1076 
1077 /*
1078  * Check permissions, allocate an open file structure, and call the device
1079  * open routine if any.
1080  */
1081 #ifndef _SYS_SYSPROTO_H_
1082 struct open_args {
1083 	char	*path;
1084 	int	flags;
1085 	int	mode;
1086 };
1087 #endif
1088 int
1089 sys_open(struct thread *td, struct open_args *uap)
1090 {
1091 
1092 	return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1093 	    uap->flags, uap->mode));
1094 }
1095 
1096 #ifndef _SYS_SYSPROTO_H_
1097 struct openat_args {
1098 	int	fd;
1099 	char	*path;
1100 	int	flag;
1101 	int	mode;
1102 };
1103 #endif
1104 int
1105 sys_openat(struct thread *td, struct openat_args *uap)
1106 {
1107 
1108 	AUDIT_ARG_FD(uap->fd);
1109 	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1110 	    uap->mode));
1111 }
1112 
/*
 * Common implementation of open(2)/openat(2).
 *
 * Validates the access-mode bits in flags, allocates a struct file, opens
 * the object named by path (looked up relative to fd) with vn_open_cred()
 * and installs the file in the descriptor table.
 *
 * On success returns 0 and stores the new descriptor in td->td_retval[0].
 * On failure returns an errno value; the allocated file is released with
 * falloc_abort() and no descriptor is installed.
 */
int
kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
    int flags, int mode)
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp;
	struct pwddesc *pdp;
	struct file *fp;
	struct vnode *vp;
	struct nameidata nd;
	cap_rights_t rights;
	int cmode, error, indx;

	/* indx stays -1 until a descriptor has actually been installed. */
	indx = -1;
	fdp = p->p_fd;
	pdp = p->p_pd;

	AUDIT_ARG_FFLAGS(flags);
	AUDIT_ARG_MODE(mode);
	cap_rights_init_one(&rights, CAP_LOOKUP);
	flags_to_rights(flags, &rights);

	/*
	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
	 * may be specified.  On the other hand, for O_PATH any mode
	 * except O_EXEC is ignored.
	 */
	if ((flags & O_PATH) != 0) {
		flags &= ~(O_CREAT | O_ACCMODE);
	} else if ((flags & O_EXEC) != 0) {
		if (flags & O_ACCMODE)
			return (EINVAL);
	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
		return (EINVAL);
	} else {
		flags = FFLAGS(flags);
	}

	/*
	 * Allocate a file structure. The descriptor to reference it
	 * is allocated and used by finstall_refed() below.
	 */
	error = falloc_noinstall(td, &fp);
	if (error != 0)
		return (error);
	/* Set the flags early so the finit in devfs can pick them up. */
	fp->f_flag = flags & FMASK;
	/* Creation mode: apply the process umask and strip the sticky bit. */
	cmode = ((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT;
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | WANTIOCTLCAPS,
	    pathseg, path, fd, &rights);
	td->td_dupfd = -1;		/* XXX check for fdopen */
	error = vn_open_cred(&nd, &flags, cmode, VN_OPEN_WANTIOCTLCAPS,
	    td->td_ucred, fp);
	if (error != 0) {
		/*
		 * If the vn_open replaced the method vector, something
		 * wonderous happened deep below and we just pass it up
		 * pretending we know what we do.
		 */
		if (error == ENXIO && fp->f_ops != &badfileops) {
			MPASS((flags & O_PATH) == 0);
			goto success;
		}

		/*
		 * Handle special fdopen() case. bleh.
		 *
		 * Don't do this for relative (capability) lookups; we don't
		 * understand exactly what would happen, and we don't think
		 * that it ever should.
		 */
		if ((nd.ni_resflags & NIRES_STRICTREL) == 0 &&
		    (error == ENODEV || error == ENXIO) &&
		    td->td_dupfd >= 0) {
			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
			    &indx);
			if (error == 0)
				goto success;
		}

		goto bad;
	}
	td->td_dupfd = 0;
	NDFREE_PNBUF(&nd);
	vp = nd.ni_vp;

	/*
	 * Store the vnode, for any f_type. Typically, the vnode use
	 * count is decremented by direct call to vn_closefile() for
	 * files that switched type in the cdevsw fdopen() method.
	 */
	fp->f_vnode = vp;

	/*
	 * If the file wasn't claimed by devfs bind it to the normal
	 * vnode operations here.
	 */
	if (fp->f_ops == &badfileops) {
		KASSERT(vp->v_type != VFIFO || (flags & O_PATH) != 0,
		    ("Unexpected fifo fp %p vp %p", fp, vp));
		if ((flags & O_PATH) != 0) {
			finit(fp, (flags & FMASK) | (fp->f_flag & FKQALLOWED),
			    DTYPE_VNODE, NULL, &path_fileops);
		} else {
			finit_vnode(fp, flags, NULL, &vnops);
		}
	}

	/* The vnode is unlocked before the optional truncation below. */
	VOP_UNLOCK(vp);
	if (flags & O_TRUNC) {
		error = fo_truncate(fp, 0, td->td_ucred, td);
		if (error != 0)
			goto bad;
	}
success:
	/*
	 * If we haven't already installed the FD (for dupfdopen), do so now.
	 */
	if (indx == -1) {
		struct filecaps *fcaps;

#ifdef CAPABILITIES
		if ((nd.ni_resflags & NIRES_STRICTREL) != 0)
			fcaps = &nd.ni_filecaps;
		else
#endif
			fcaps = NULL;
		error = finstall_refed(td, fp, &indx, flags, fcaps);
		/* On success finstall_refed() consumes fcaps. */
		if (error != 0) {
			goto bad;
		}
	} else {
		/*
		 * dupfdopen() already produced the descriptor, so the file
		 * allocated above is not needed and is released here.
		 */
		NDFREE_IOCTLCAPS(&nd);
		falloc_abort(td, fp);
	}

	td->td_retval[0] = indx;
	return (0);
bad:
	/* Error exit: nothing was installed, so drop the file and caps. */
	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
	NDFREE_IOCTLCAPS(&nd);
	falloc_abort(td, fp);
	return (error);
}
1258 
1259 #ifdef COMPAT_43
1260 /*
1261  * Create a file.
1262  */
1263 #ifndef _SYS_SYSPROTO_H_
1264 struct ocreat_args {
1265 	char	*path;
1266 	int	mode;
1267 };
1268 #endif
1269 int
1270 ocreat(struct thread *td, struct ocreat_args *uap)
1271 {
1272 
1273 	return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1274 	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1275 }
1276 #endif /* COMPAT_43 */
1277 
1278 /*
1279  * Create a special file.
1280  */
1281 #ifndef _SYS_SYSPROTO_H_
1282 struct mknodat_args {
1283 	int	fd;
1284 	char	*path;
1285 	mode_t	mode;
1286 	dev_t	dev;
1287 };
1288 #endif
1289 int
1290 sys_mknodat(struct thread *td, struct mknodat_args *uap)
1291 {
1292 
1293 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1294 	    uap->dev));
1295 }
1296 
1297 #if defined(COMPAT_FREEBSD11)
1298 int
1299 freebsd11_mknod(struct thread *td,
1300     struct freebsd11_mknod_args *uap)
1301 {
1302 
1303 	return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1304 	    uap->mode, uap->dev));
1305 }
1306 
1307 int
1308 freebsd11_mknodat(struct thread *td,
1309     struct freebsd11_mknodat_args *uap)
1310 {
1311 
1312 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1313 	    uap->dev));
1314 }
1315 #endif /* COMPAT_FREEBSD11 */
1316 
/*
 * Create a character device, block device or whiteout entry named by path,
 * looked up relative to fd.
 *
 * A FIFO request with dev == 0 is delegated to kern_mkfifoat(); every other
 * file type is rejected with EINVAL.  Device nodes require
 * PRIV_VFS_MKNOD_DEV and a dev other than VNOVAL; whiteouts require
 * PRIV_VFS_MKNOD_WHT.  Returns 0 on success, EEXIST if the name already
 * exists, or another errno value.
 */
int
kern_mknodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
    int mode, dev_t dev)
{
	struct vnode *vp;
	struct mount *mp;
	struct vattr vattr;
	struct nameidata nd;
	int error, whiteout = 0;

	AUDIT_ARG_MODE(mode);
	AUDIT_ARG_DEV(dev);
	/* Validate the requested type and check privilege before any lookup. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
	case S_IFBLK:
		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
		if (error == 0 && dev == VNOVAL)
			error = EINVAL;
		break;
	case S_IFWHT:
		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
		break;
	case S_IFIFO:
		if (dev == 0)
			return (kern_mkfifoat(td, fd, path, pathseg, mode));
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error != 0)
		return (error);
	NDPREINIT(&nd);
restart:
	/* The lookup is redone from here whenever the operation must retry. */
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | AUDITVNODE1 | NOCACHE,
	    pathseg, path, fd, &cap_mknodat_rights);
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp != NULL) {
		/* The name already exists: release everything and fail. */
		NDFREE_PNBUF(&nd);
		if (vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		return (EEXIST);
	} else {
		VATTR_NULL(&vattr);
		vattr.va_mode = (mode & ALLPERMS) &
		    ~td->td_proc->p_pd->pd_cmask;
		vattr.va_rdev = dev;
		whiteout = 0;

		switch (mode & S_IFMT) {
		case S_IFCHR:
			vattr.va_type = VCHR;
			break;
		case S_IFBLK:
			vattr.va_type = VBLK;
			break;
		case S_IFWHT:
			whiteout = 1;
			break;
		default:
			/* Other types were rejected by the first switch. */
			panic("kern_mknod: invalid mode");
		}
	}
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/*
		 * Could not start the write without sleeping: drop the
		 * directory, wait, and redo the lookup.
		 */
		NDFREE_PNBUF(&nd);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH)) != 0)
			return (error);
		goto restart;
	}
#ifdef MAC
	if (error == 0 && !whiteout)
		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
		    &nd.ni_cnd, &vattr);
#endif
	if (error == 0) {
		if (whiteout)
			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
		else {
			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
						&nd.ni_cnd, &vattr);
		}
	}
	/* A new vnode is only handed over when VOP_MKNOD() succeeded. */
	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 && !whiteout ? &nd.ni_vp : NULL,
	    true);
	vn_finished_write(mp);
	NDFREE_PNBUF(&nd);
	if (error == ERELOOKUP)
		goto restart;
	return (error);
}
1414 
1415 /*
1416  * Create a named pipe.
1417  */
1418 #ifndef _SYS_SYSPROTO_H_
1419 struct mkfifo_args {
1420 	char	*path;
1421 	int	mode;
1422 };
1423 #endif
1424 int
1425 sys_mkfifo(struct thread *td, struct mkfifo_args *uap)
1426 {
1427 
1428 	return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1429 	    uap->mode));
1430 }
1431 
1432 #ifndef _SYS_SYSPROTO_H_
1433 struct mkfifoat_args {
1434 	int	fd;
1435 	char	*path;
1436 	mode_t	mode;
1437 };
1438 #endif
1439 int
1440 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1441 {
1442 
1443 	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1444 	    uap->mode));
1445 }
1446 
/*
 * Create a FIFO named by path, looked up relative to fd, with permission
 * bits taken from mode and filtered through the process umask.
 *
 * Returns 0 on success, EEXIST if the name already exists, or another
 * errno value.  The lookup is restarted when the write cannot be started
 * immediately or when the filesystem returns ERELOOKUP.
 */
int
kern_mkfifoat(struct thread *td, int fd, const char *path,
    enum uio_seg pathseg, int mode)
{
	struct mount *mp;
	struct vattr vattr;
	struct nameidata nd;
	int error;

	AUDIT_ARG_MODE(mode);
	NDPREINIT(&nd);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | AUDITVNODE1 | NOCACHE,
	    pathseg, path, fd, &cap_mkfifoat_rights);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		/* The name already exists: release everything and fail. */
		NDFREE_PNBUF(&nd);
		if (nd.ni_vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(nd.ni_vp);
		return (EEXIST);
	}
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/* Drop the directory, wait for write access, then retry. */
		NDFREE_PNBUF(&nd);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VFIFO;
	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask;
#ifdef MAC
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error != 0)
		goto out;
#endif
	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
#ifdef MAC
out:
#endif
	/* The new vnode is only handed over when creation succeeded. */
	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
	vn_finished_write(mp);
	NDFREE_PNBUF(&nd);
	if (error == ERELOOKUP)
		goto restart;
	return (error);
}
1500 
1501 /*
1502  * Make a hard file link.
1503  */
1504 #ifndef _SYS_SYSPROTO_H_
1505 struct link_args {
1506 	char	*path;
1507 	char	*link;
1508 };
1509 #endif
1510 int
1511 sys_link(struct thread *td, struct link_args *uap)
1512 {
1513 
1514 	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link,
1515 	    UIO_USERSPACE, AT_SYMLINK_FOLLOW));
1516 }
1517 
1518 #ifndef _SYS_SYSPROTO_H_
1519 struct linkat_args {
1520 	int	fd1;
1521 	char	*path1;
1522 	int	fd2;
1523 	char	*path2;
1524 	int	flag;
1525 };
1526 #endif
1527 int
1528 sys_linkat(struct thread *td, struct linkat_args *uap)
1529 {
1530 
1531 	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1532 	    UIO_USERSPACE, uap->flag));
1533 }
1534 
/*
 * Read-write integer sysctls, registered under the _security_bsd node, that
 * are consulted by can_hardlink().  When hardlink_check_uid is non-zero, a
 * caller whose cr_uid differs from the file's owner needs PRIV_VFS_LINK;
 * when hardlink_check_gid is non-zero, a caller that is not a member of the
 * file's group needs PRIV_VFS_LINK.  Both default to 0.
 */
int hardlink_check_uid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
    &hardlink_check_uid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "users");
static int hardlink_check_gid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
    &hardlink_check_gid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "groups");
1545 
1546 static int
1547 can_hardlink(struct vnode *vp, struct ucred *cred)
1548 {
1549 	struct vattr va;
1550 	int error;
1551 
1552 	if (!hardlink_check_uid && !hardlink_check_gid)
1553 		return (0);
1554 
1555 	error = VOP_GETATTR(vp, &va, cred);
1556 	if (error != 0)
1557 		return (error);
1558 
1559 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1560 		error = priv_check_cred(cred, PRIV_VFS_LINK);
1561 		if (error != 0)
1562 			return (error);
1563 	}
1564 
1565 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1566 		error = priv_check_cred(cred, PRIV_VFS_LINK);
1567 		if (error != 0)
1568 			return (error);
1569 	}
1570 
1571 	return (0);
1572 }
1573 
1574 int
1575 kern_linkat(struct thread *td, int fd1, int fd2, const char *path1,
1576     const char *path2, enum uio_seg segflag, int flag)
1577 {
1578 	struct nameidata nd;
1579 	int error;
1580 
1581 	if ((flag & ~(AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH |
1582 	    AT_EMPTY_PATH)) != 0)
1583 		return (EINVAL);
1584 
1585 	NDPREINIT(&nd);
1586 	do {
1587 		bwillwrite();
1588 		NDINIT_ATRIGHTS(&nd, LOOKUP, AUDITVNODE1 | at2cnpflags(flag,
1589 		    AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH),
1590 		    segflag, path1, fd1, &cap_linkat_source_rights);
1591 		if ((error = namei(&nd)) != 0)
1592 			return (error);
1593 		NDFREE_PNBUF(&nd);
1594 		if ((nd.ni_resflags & NIRES_EMPTYPATH) != 0) {
1595 			error = priv_check(td, PRIV_VFS_FHOPEN);
1596 			if (error != 0) {
1597 				vrele(nd.ni_vp);
1598 				return (error);
1599 			}
1600 		}
1601 		error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag);
1602 	} while (error ==  EAGAIN || error == ERELOOKUP);
1603 	return (error);
1604 }
1605 
/*
 * Create a hard link to vp under the name path, looked up relative to fd.
 *
 * Every return path visible here disposes of the caller's reference on vp
 * (vrele(), vput() or VOP_VPUT_PAIR()), so the caller must not use vp
 * afterwards.  Returns EPERM for directories, EEXIST if the target name
 * exists, EXDEV for a cross-mount link, and EAGAIN when the caller should
 * redo the whole operation (kern_linkat() loops on EAGAIN and ERELOOKUP).
 */
static int
kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path,
    enum uio_seg segflag)
{
	struct nameidata nd;
	struct mount *mp;
	int error;

	if (vp->v_type == VDIR) {
		vrele(vp);
		return (EPERM);		/* POSIX */
	}
	NDINIT_ATRIGHTS(&nd, CREATE,
	    LOCKPARENT | AUDITVNODE2 | NOCACHE, segflag, path, fd,
	    &cap_linkat_target_rights);
	if ((error = namei(&nd)) == 0) {
		if (nd.ni_vp != NULL) {
			/* The target name already exists. */
			NDFREE_PNBUF(&nd);
			if (nd.ni_dvp == nd.ni_vp)
				vrele(nd.ni_dvp);
			else
				vput(nd.ni_dvp);
			vrele(nd.ni_vp);
			vrele(vp);
			return (EEXIST);
		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
			/*
			 * Cross-device link.  No need to recheck
			 * vp->v_type, since it cannot change, except
			 * to VBAD.
			 */
			NDFREE_PNBUF(&nd);
			vput(nd.ni_dvp);
			vrele(vp);
			return (EXDEV);
		} else if (vn_lock(vp, LK_EXCLUSIVE) == 0) {
			/* Policy checks are made with vp locked. */
			error = can_hardlink(vp, td->td_ucred);
#ifdef MAC
			if (error == 0)
				error = mac_vnode_check_link(td->td_ucred,
				    nd.ni_dvp, vp, &nd.ni_cnd);
#endif
			if (error != 0) {
				vput(vp);
				vput(nd.ni_dvp);
				NDFREE_PNBUF(&nd);
				return (error);
			}
			error = vn_start_write(vp, &mp, V_NOWAIT);
			if (error != 0) {
				/*
				 * Release both vnodes before sleeping for
				 * write access, then ask for a retry.
				 */
				vput(vp);
				vput(nd.ni_dvp);
				NDFREE_PNBUF(&nd);
				error = vn_start_write(NULL, &mp,
				    V_XSLEEP | V_PCATCH);
				if (error != 0)
					return (error);
				return (EAGAIN);
			}
			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
			VOP_VPUT_PAIR(nd.ni_dvp, &vp, true);
			vn_finished_write(mp);
			NDFREE_PNBUF(&nd);
			/* vp was handed to VOP_VPUT_PAIR(); skip the vrele(). */
			vp = NULL;
		} else {
			/* Locking vp failed; have the caller start over. */
			vput(nd.ni_dvp);
			NDFREE_PNBUF(&nd);
			vrele(vp);
			return (EAGAIN);
		}
	}
	if (vp != NULL)
		vrele(vp);
	return (error);
}
1681 
1682 /*
1683  * Make a symbolic link.
1684  */
1685 #ifndef _SYS_SYSPROTO_H_
1686 struct symlink_args {
1687 	char	*path;
1688 	char	*link;
1689 };
1690 #endif
1691 int
1692 sys_symlink(struct thread *td, struct symlink_args *uap)
1693 {
1694 
1695 	return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link,
1696 	    UIO_USERSPACE));
1697 }
1698 
1699 #ifndef _SYS_SYSPROTO_H_
1700 struct symlinkat_args {
1701 	char	*path;
1702 	int	fd;
1703 	char	*path2;
1704 };
1705 #endif
1706 int
1707 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1708 {
1709 
1710 	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1711 	    UIO_USERSPACE));
1712 }
1713 
/*
 * Create a symbolic link named path2 (looked up relative to fd) whose
 * contents are the string path1.
 *
 * When segflg is not UIO_SYSSPACE, path1 is first copied into a buffer
 * allocated from namei_zone; that buffer is freed at "out".  Returns 0 on
 * success, EEXIST if path2 already exists, or another errno value.
 */
int
kern_symlinkat(struct thread *td, const char *path1, int fd, const char *path2,
    enum uio_seg segflg)
{
	struct mount *mp;
	struct vattr vattr;
	const char *syspath;
	char *tmppath;
	struct nameidata nd;
	int error;

	if (segflg == UIO_SYSSPACE) {
		syspath = path1;
	} else {
		tmppath = uma_zalloc(namei_zone, M_WAITOK);
		if ((error = copyinstr(path1, tmppath, MAXPATHLEN, NULL)) != 0)
			goto out;
		syspath = tmppath;
	}
	AUDIT_ARG_TEXT(syspath);
	NDPREINIT(&nd);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | AUDITVNODE1 | NOCACHE, segflg,
	    path2, fd, &cap_symlinkat_rights);
	if ((error = namei(&nd)) != 0)
		goto out;
	if (nd.ni_vp) {
		/* The name already exists: release everything and fail. */
		NDFREE_PNBUF(&nd);
		if (nd.ni_vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(nd.ni_vp);
		nd.ni_vp = NULL;
		error = EEXIST;
		goto out;
	}
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/* Drop the directory, wait for write access, then retry. */
		NDFREE_PNBUF(&nd);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH)) != 0)
			goto out;
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_pd->pd_cmask;
#ifdef MAC
	vattr.va_type = VLNK;
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error != 0)
		goto out2;
#endif
	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
#ifdef MAC
out2:
#endif
	/* The new vnode is only handed over when creation succeeded. */
	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
	vn_finished_write(mp);
	NDFREE_PNBUF(&nd);
	if (error == ERELOOKUP)
		goto restart;
out:
	/* tmppath was only allocated for non-kernel path1 strings. */
	if (segflg != UIO_SYSSPACE)
		uma_zfree(namei_zone, tmppath);
	return (error);
}
1782 
1783 /*
1784  * Delete a whiteout from the filesystem.
1785  */
1786 #ifndef _SYS_SYSPROTO_H_
1787 struct undelete_args {
1788 	char *path;
1789 };
1790 #endif
/*
 * undelete(2) system call: remove the whiteout entry named by uap->path.
 *
 * Fails with EEXIST when the lookup finds a vnode at that name or when the
 * name is not marked ISWHITEOUT.  The lookup is restarted when write access
 * cannot be started immediately or when VOP_WHITEOUT() returns ERELOOKUP.
 */
int
sys_undelete(struct thread *td, struct undelete_args *uap)
{
	struct mount *mp;
	struct nameidata nd;
	int error;

	NDPREINIT(&nd);
restart:
	bwillwrite();
	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
	    UIO_USERSPACE, uap->path);
	error = namei(&nd);
	if (error != 0)
		return (error);

	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
		/* Not a whiteout: release whatever the lookup returned. */
		NDFREE_PNBUF(&nd);
		if (nd.ni_vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (nd.ni_vp)
			vrele(nd.ni_vp);
		return (EEXIST);
	}
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/* Drop the directory, wait for write access, then retry. */
		NDFREE_PNBUF(&nd);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH)) != 0)
			return (error);
		goto restart;
	}
	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
	NDFREE_PNBUF(&nd);
	vput(nd.ni_dvp);
	vn_finished_write(mp);
	if (error == ERELOOKUP)
		goto restart;
	return (error);
}
1832 
1833 /*
1834  * Delete a name from the filesystem.
1835  */
1836 #ifndef _SYS_SYSPROTO_H_
1837 struct unlink_args {
1838 	char	*path;
1839 };
1840 #endif
1841 int
1842 sys_unlink(struct thread *td, struct unlink_args *uap)
1843 {
1844 
1845 	return (kern_funlinkat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE,
1846 	    0, 0));
1847 }
1848 
1849 static int
1850 kern_funlinkat_ex(struct thread *td, int dfd, const char *path, int fd,
1851     int flag, enum uio_seg pathseg, ino_t oldinum)
1852 {
1853 
1854 	if ((flag & ~(AT_REMOVEDIR | AT_RESOLVE_BENEATH)) != 0)
1855 		return (EINVAL);
1856 
1857 	if ((flag & AT_REMOVEDIR) != 0)
1858 		return (kern_frmdirat(td, dfd, path, fd, UIO_USERSPACE, 0));
1859 
1860 	return (kern_funlinkat(td, dfd, path, fd, UIO_USERSPACE, 0, 0));
1861 }
1862 
1863 #ifndef _SYS_SYSPROTO_H_
1864 struct unlinkat_args {
1865 	int	fd;
1866 	char	*path;
1867 	int	flag;
1868 };
1869 #endif
1870 int
1871 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1872 {
1873 
1874 	return (kern_funlinkat_ex(td, uap->fd, uap->path, FD_NONE, uap->flag,
1875 	    UIO_USERSPACE, 0));
1876 }
1877 
1878 #ifndef _SYS_SYSPROTO_H_
1879 struct funlinkat_args {
1880 	int		dfd;
1881 	const char	*path;
1882 	int		fd;
1883 	int		flag;
1884 };
1885 #endif
1886 int
1887 sys_funlinkat(struct thread *td, struct funlinkat_args *uap)
1888 {
1889 
1890 	return (kern_funlinkat_ex(td, uap->dfd, uap->path, uap->fd, uap->flag,
1891 	    UIO_USERSPACE, 0));
1892 }
1893 
/*
 * Remove the name path, looked up relative to dfd.
 *
 * fd:      when not FD_NONE, the file it refers to must have the same vnode
 *          as the one found by the lookup; otherwise the call fails with
 *          EBADF (descriptor's vnode is doomed) or EDEADLK.
 * flag:    only AT_RESOLVE_BENEATH is translated into lookup flags.
 * oldinum: when non-zero, the vnode's inode number must match it, or the
 *          call fails with EIDRM.  When zero, directories are refused with
 *          EPERM.
 *
 * Returns 0 on success or an errno value.  An EINVAL from namei() is
 * reported as EPERM, and a vnode flagged VV_ROOT yields EBUSY.
 */
int
kern_funlinkat(struct thread *td, int dfd, const char *path, int fd,
    enum uio_seg pathseg, int flag, ino_t oldinum)
{
	struct mount *mp;
	struct file *fp;
	struct vnode *vp;
	struct nameidata nd;
	struct stat sb;
	int error;

	fp = NULL;
	if (fd != FD_NONE) {
		error = getvnode_path(td, fd, &cap_no_rights, &fp);
		if (error != 0)
			return (error);
	}

	NDPREINIT(&nd);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 |
	    at2cnpflags(flag, AT_RESOLVE_BENEATH),
	    pathseg, path, dfd, &cap_unlinkat_rights);
	if ((error = namei(&nd)) != 0) {
		if (error == EINVAL)
			error = EPERM;
		goto fdout;
	}
	vp = nd.ni_vp;
	/* error is 0 here unless one of the checks below sets it. */
	if (vp->v_type == VDIR && oldinum == 0) {
		error = EPERM;		/* POSIX */
	} else if (oldinum != 0 &&
	    ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED)) == 0) &&
	    sb.st_ino != oldinum) {
		error = EIDRM;	/* Identifier removed */
	} else if (fp != NULL && fp->f_vnode != vp) {
		if (VN_IS_DOOMED(fp->f_vnode))
			error = EBADF;
		else
			error = EDEADLK;
	} else {
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 *
		 * XXX: can this only be a VDIR case?
		 */
		if (vp->v_vflag & VV_ROOT)
			error = EBUSY;
	}
	if (error == 0) {
		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
			/*
			 * Release both vnodes before sleeping for write
			 * access, then redo the lookup.
			 */
			NDFREE_PNBUF(&nd);
			vput(nd.ni_dvp);
			if (vp == nd.ni_dvp)
				vrele(vp);
			else
				vput(vp);
			if ((error = vn_start_write(NULL, &mp,
			    V_XSLEEP | V_PCATCH)) != 0) {
				goto fdout;
			}
			goto restart;
		}
#ifdef MAC
		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
		    &nd.ni_cnd);
		if (error != 0)
			goto out;
#endif
		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
#ifdef MAC
out:
#endif
		vn_finished_write(mp);
	}
	NDFREE_PNBUF(&nd);
	vput(nd.ni_dvp);
	if (vp == nd.ni_dvp)
		vrele(vp);
	else
		vput(vp);
	if (error == ERELOOKUP)
		goto restart;
fdout:
	/* Drop the reference taken by getvnode_path(), if any. */
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}
1984 
1985 /*
1986  * Reposition read/write file offset.
1987  */
1988 #ifndef _SYS_SYSPROTO_H_
1989 struct lseek_args {
1990 	int	fd;
1991 	int	pad;
1992 	off_t	offset;
1993 	int	whence;
1994 };
1995 #endif
1996 int
1997 sys_lseek(struct thread *td, struct lseek_args *uap)
1998 {
1999 
2000 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
2001 }
2002 
2003 int
2004 kern_lseek(struct thread *td, int fd, off_t offset, int whence)
2005 {
2006 	struct file *fp;
2007 	int error;
2008 
2009 	AUDIT_ARG_FD(fd);
2010 	error = fget(td, fd, &cap_seek_rights, &fp);
2011 	if (error != 0)
2012 		return (error);
2013 	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
2014 	    fo_seek(fp, offset, whence, td) : ESPIPE;
2015 	fdrop(fp, td);
2016 	return (error);
2017 }
2018 
2019 #if defined(COMPAT_43)
2020 /*
2021  * Reposition read/write file offset.
2022  */
2023 #ifndef _SYS_SYSPROTO_H_
2024 struct olseek_args {
2025 	int	fd;
2026 	long	offset;
2027 	int	whence;
2028 };
2029 #endif
2030 int
2031 olseek(struct thread *td, struct olseek_args *uap)
2032 {
2033 
2034 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
2035 }
2036 #endif /* COMPAT_43 */
2037 
2038 #if defined(COMPAT_FREEBSD6)
2039 /* Version with the 'pad' argument */
2040 int
2041 freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap)
2042 {
2043 
2044 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
2045 }
2046 #endif
2047 
2048 /*
2049  * Check access permissions using passed credentials.
2050  */
2051 static int
2052 vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
2053      struct thread *td)
2054 {
2055 	accmode_t accmode;
2056 	int error;
2057 
2058 	/* Flags == 0 means only check for existence. */
2059 	if (user_flags == 0)
2060 		return (0);
2061 
2062 	accmode = 0;
2063 	if (user_flags & R_OK)
2064 		accmode |= VREAD;
2065 	if (user_flags & W_OK)
2066 		accmode |= VWRITE;
2067 	if (user_flags & X_OK)
2068 		accmode |= VEXEC;
2069 #ifdef MAC
2070 	error = mac_vnode_check_access(cred, vp, accmode);
2071 	if (error != 0)
2072 		return (error);
2073 #endif
2074 	if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2075 		error = VOP_ACCESS(vp, accmode, cred, td);
2076 	return (error);
2077 }
2078 
2079 /*
2080  * Check access permissions using "real" credentials.
2081  */
2082 #ifndef _SYS_SYSPROTO_H_
2083 struct access_args {
2084 	char	*path;
2085 	int	amode;
2086 };
2087 #endif
2088 int
2089 sys_access(struct thread *td, struct access_args *uap)
2090 {
2091 
2092 	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2093 	    0, uap->amode));
2094 }
2095 
2096 #ifndef _SYS_SYSPROTO_H_
2097 struct faccessat_args {
2098 	int	dirfd;
2099 	char	*path;
2100 	int	amode;
2101 	int	flag;
2102 }
2103 #endif
2104 int
2105 sys_faccessat(struct thread *td, struct faccessat_args *uap)
2106 {
2107 
2108 	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2109 	    uap->amode));
2110 }
2111 
/*
 * Common implementation of access(2), eaccess(2) and faccessat(2).
 *
 * Checks whether the file named by path (looked up relative to fd) may be
 * accessed as requested by amode (F_OK, or a combination of R_OK, W_OK and
 * X_OK).  Unless AT_EACCESS is set in flag, the check is made with the real
 * rather than the effective user and group IDs.
 *
 * Returns 0 if access is permitted, EINVAL for unknown flag or amode bits,
 * or the error from namei() or vn_access().
 */
int
kern_accessat(struct thread *td, int fd, const char *path,
    enum uio_seg pathseg, int flag, int amode)
{
	struct ucred *cred, *usecred;
	struct vnode *vp;
	struct nameidata nd;
	int error;

	if ((flag & ~(AT_EACCESS | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0)
		return (EINVAL);
	if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0)
		return (EINVAL);

	/*
	 * Create and modify a temporary credential instead of one that
	 * is potentially shared (if we need one).
	 */
	cred = td->td_ucred;
	if ((flag & AT_EACCESS) == 0 &&
	    ((cred->cr_uid != cred->cr_ruid ||
	    cred->cr_rgid != cred->cr_groups[0]))) {
		/*
		 * Real and effective IDs differ: install a private copy
		 * carrying the real IDs on the thread for the lookup.
		 */
		usecred = crdup(cred);
		usecred->cr_uid = cred->cr_ruid;
		usecred->cr_groups[0] = cred->cr_rgid;
		td->td_ucred = usecred;
	} else
		usecred = cred;
	AUDIT_ARG_VALUE(amode);
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
	    AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH |
	    AT_EMPTY_PATH), pathseg, path, fd, &cap_fstat_rights);
	if ((error = namei(&nd)) != 0)
		goto out;
	vp = nd.ni_vp;

	error = vn_access(vp, amode, usecred, td);
	NDFREE_PNBUF(&nd);
	vput(vp);
out:
	/* Restore the thread's original credential and free the copy. */
	if (usecred != cred) {
		td->td_ucred = cred;
		crfree(usecred);
	}
	return (error);
}
2158 
2159 /*
2160  * Check access permissions using "effective" credentials.
2161  */
2162 #ifndef _SYS_SYSPROTO_H_
2163 struct eaccess_args {
2164 	char	*path;
2165 	int	amode;
2166 };
2167 #endif
2168 int
2169 sys_eaccess(struct thread *td, struct eaccess_args *uap)
2170 {
2171 
2172 	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2173 	    AT_EACCESS, uap->amode));
2174 }
2175 
2176 #if defined(COMPAT_43)
2177 /*
2178  * Get file status; this version follows links.
2179  */
2180 #ifndef _SYS_SYSPROTO_H_
2181 struct ostat_args {
2182 	char	*path;
2183 	struct ostat *ub;
2184 };
2185 #endif
2186 int
2187 ostat(struct thread *td, struct ostat_args *uap)
2188 {
2189 	struct stat sb;
2190 	struct ostat osb;
2191 	int error;
2192 
2193 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb);
2194 	if (error != 0)
2195 		return (error);
2196 	cvtstat(&sb, &osb);
2197 	return (copyout(&osb, uap->ub, sizeof (osb)));
2198 }
2199 
2200 /*
2201  * Get file status; this version does not follow links.
2202  */
2203 #ifndef _SYS_SYSPROTO_H_
2204 struct olstat_args {
2205 	char	*path;
2206 	struct ostat *ub;
2207 };
2208 #endif
2209 int
2210 olstat(struct thread *td, struct olstat_args *uap)
2211 {
2212 	struct stat sb;
2213 	struct ostat osb;
2214 	int error;
2215 
2216 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2217 	    UIO_USERSPACE, &sb);
2218 	if (error != 0)
2219 		return (error);
2220 	cvtstat(&sb, &osb);
2221 	return (copyout(&osb, uap->ub, sizeof (osb)));
2222 }
2223 
/*
 * Convert from a new (struct stat) to an old (struct ostat) stat structure.
 * The destination is zeroed first and then filled field by field from st.
 * XXX: many values are blindly truncated; only st_size is clamped, to
 * INT32_MAX.
 */
void
cvtstat(struct stat *st, struct ostat *ost)
{

	bzero(ost, sizeof(*ost));
	ost->st_dev = st->st_dev;
	ost->st_ino = st->st_ino;
	ost->st_mode = st->st_mode;
	ost->st_nlink = st->st_nlink;
	ost->st_uid = st->st_uid;
	ost->st_gid = st->st_gid;
	ost->st_rdev = st->st_rdev;
	ost->st_size = MIN(st->st_size, INT32_MAX);
	ost->st_atim = st->st_atim;
	ost->st_mtim = st->st_mtim;
	ost->st_ctim = st->st_ctim;
	ost->st_blksize = st->st_blksize;
	ost->st_blocks = st->st_blocks;
	ost->st_flags = st->st_flags;
	ost->st_gen = st->st_gen;
}
2249 #endif /* COMPAT_43 */
2250 
2251 #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
/*
 * Read-write integer sysctl, registered under the _vfs node, that selects
 * what freebsd11_cvtstat() does when a value does not fit the old field:
 * 1 fails the conversion with EOVERFLOW, 2 clamps the inode number and link
 * count to the maximum of the old field, and any other value lets the
 * truncated value through.
 */
int ino64_trunc_error;
SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW,
    &ino64_trunc_error, 0,
    "Error on truncation of device, file or inode number, or link count");
2256 
/*
 * Convert a struct stat into the FreeBSD 11 layout.
 *
 * st_dev, st_ino, st_nlink and st_rdev are assigned and then compared with
 * the source to detect truncation; the reaction depends on
 * ino64_trunc_error (see the switch statements).  Returns 0 on success or
 * EOVERFLOW when ino64_trunc_error is 1 and a value was truncated.
 */
int
freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost)
{

	ost->st_dev = st->st_dev;
	if (ost->st_dev != st->st_dev) {
		switch (ino64_trunc_error) {
		default:
			/*
			 * Since dev_t is almost raw, don't clamp to the
			 * maximum for case 2, but ignore the error.
			 */
			break;
		case 1:
			return (EOVERFLOW);
		}
	}
	ost->st_ino = st->st_ino;
	if (ost->st_ino != st->st_ino) {
		switch (ino64_trunc_error) {
		default:
		case 0:
			/* Keep the truncated value. */
			break;
		case 1:
			return (EOVERFLOW);
		case 2:
			/* Clamp to the largest representable value. */
			ost->st_ino = UINT32_MAX;
			break;
		}
	}
	ost->st_mode = st->st_mode;
	ost->st_nlink = st->st_nlink;
	if (ost->st_nlink != st->st_nlink) {
		switch (ino64_trunc_error) {
		default:
		case 0:
			/* Keep the truncated value. */
			break;
		case 1:
			return (EOVERFLOW);
		case 2:
			/* Clamp to the largest representable value. */
			ost->st_nlink = UINT16_MAX;
			break;
		}
	}
	ost->st_uid = st->st_uid;
	ost->st_gid = st->st_gid;
	ost->st_rdev = st->st_rdev;
	if (ost->st_rdev != st->st_rdev) {
		/* As for st_dev: fail for mode 1, otherwise ignore. */
		switch (ino64_trunc_error) {
		default:
			break;
		case 1:
			return (EOVERFLOW);
		}
	}
	ost->st_atim = st->st_atim;
	ost->st_mtim = st->st_mtim;
	ost->st_ctim = st->st_ctim;
	ost->st_size = st->st_size;
	ost->st_blocks = st->st_blocks;
	ost->st_blksize = st->st_blksize;
	ost->st_flags = st->st_flags;
	ost->st_gen = st->st_gen;
	ost->st_lspare = 0;
	ost->st_birthtim = st->st_birthtim;
	/* Zero every byte of *ost that follows st_birthtim. */
	bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim),
	    sizeof(*ost) - offsetof(struct freebsd11_stat,
	    st_birthtim) - sizeof(ost->st_birthtim));
	return (0);
}
2327 
2328 int
2329 freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap)
2330 {
2331 	struct stat sb;
2332 	struct freebsd11_stat osb;
2333 	int error;
2334 
2335 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb);
2336 	if (error != 0)
2337 		return (error);
2338 	error = freebsd11_cvtstat(&sb, &osb);
2339 	if (error == 0)
2340 		error = copyout(&osb, uap->ub, sizeof(osb));
2341 	return (error);
2342 }
2343 
2344 int
2345 freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap)
2346 {
2347 	struct stat sb;
2348 	struct freebsd11_stat osb;
2349 	int error;
2350 
2351 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2352 	    UIO_USERSPACE, &sb);
2353 	if (error != 0)
2354 		return (error);
2355 	error = freebsd11_cvtstat(&sb, &osb);
2356 	if (error == 0)
2357 		error = copyout(&osb, uap->ub, sizeof(osb));
2358 	return (error);
2359 }
2360 
2361 int
2362 freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap)
2363 {
2364 	struct fhandle fh;
2365 	struct stat sb;
2366 	struct freebsd11_stat osb;
2367 	int error;
2368 
2369 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
2370 	if (error != 0)
2371 		return (error);
2372 	error = kern_fhstat(td, fh, &sb);
2373 	if (error != 0)
2374 		return (error);
2375 	error = freebsd11_cvtstat(&sb, &osb);
2376 	if (error == 0)
2377 		error = copyout(&osb, uap->sb, sizeof(osb));
2378 	return (error);
2379 }
2380 
2381 int
2382 freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap)
2383 {
2384 	struct stat sb;
2385 	struct freebsd11_stat osb;
2386 	int error;
2387 
2388 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2389 	    UIO_USERSPACE, &sb);
2390 	if (error != 0)
2391 		return (error);
2392 	error = freebsd11_cvtstat(&sb, &osb);
2393 	if (error == 0)
2394 		error = copyout(&osb, uap->buf, sizeof(osb));
2395 	return (error);
2396 }
2397 #endif	/* COMPAT_FREEBSD11 */
2398 
2399 /*
2400  * Get file status
2401  */
#ifndef _SYS_SYSPROTO_H_
/*
 * Argument layout of fstatat(); only compiled when _SYS_SYSPROTO_H_ is
 * not defined.  The definition must end in ';' or the function that
 * follows fails to parse.
 */
struct fstatat_args {
	int	fd;
	char	*path;
	struct stat	*buf;
	int	flag;
};
#endif
2410 int
2411 sys_fstatat(struct thread *td, struct fstatat_args *uap)
2412 {
2413 	struct stat sb;
2414 	int error;
2415 
2416 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2417 	    UIO_USERSPACE, &sb);
2418 	if (error == 0)
2419 		error = copyout(&sb, uap->buf, sizeof (sb));
2420 	return (error);
2421 }
2422 
2423 int
2424 kern_statat(struct thread *td, int flag, int fd, const char *path,
2425     enum uio_seg pathseg, struct stat *sbp)
2426 {
2427 	struct nameidata nd;
2428 	int error;
2429 
2430 	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
2431 	    AT_EMPTY_PATH)) != 0)
2432 		return (EINVAL);
2433 
2434 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_RESOLVE_BENEATH |
2435 	    AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH) | LOCKSHARED | LOCKLEAF |
2436 	    AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights);
2437 
2438 	if ((error = namei(&nd)) != 0) {
2439 		if (error == ENOTDIR &&
2440 		    (nd.ni_resflags & NIRES_EMPTYPATH) != 0)
2441 			error = kern_fstat(td, fd, sbp);
2442 		return (error);
2443 	}
2444 	error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED);
2445 	NDFREE_PNBUF(&nd);
2446 	vput(nd.ni_vp);
2447 #ifdef __STAT_TIME_T_EXT
2448 	sbp->st_atim_ext = 0;
2449 	sbp->st_mtim_ext = 0;
2450 	sbp->st_ctim_ext = 0;
2451 	sbp->st_btim_ext = 0;
2452 #endif
2453 #ifdef KTRACE
2454 	if (KTRPOINT(td, KTR_STRUCT))
2455 		ktrstat_error(sbp, error);
2456 #endif
2457 	return (error);
2458 }
2459 
2460 #if defined(COMPAT_FREEBSD11)
2461 /*
2462  * Implementation of the NetBSD [l]stat() functions.
2463  */
2464 int
2465 freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb)
2466 {
2467 	struct freebsd11_stat sb11;
2468 	int error;
2469 
2470 	error = freebsd11_cvtstat(sb, &sb11);
2471 	if (error != 0)
2472 		return (error);
2473 
2474 	bzero(nsb, sizeof(*nsb));
2475 	CP(sb11, *nsb, st_dev);
2476 	CP(sb11, *nsb, st_ino);
2477 	CP(sb11, *nsb, st_mode);
2478 	CP(sb11, *nsb, st_nlink);
2479 	CP(sb11, *nsb, st_uid);
2480 	CP(sb11, *nsb, st_gid);
2481 	CP(sb11, *nsb, st_rdev);
2482 	CP(sb11, *nsb, st_atim);
2483 	CP(sb11, *nsb, st_mtim);
2484 	CP(sb11, *nsb, st_ctim);
2485 	CP(sb11, *nsb, st_size);
2486 	CP(sb11, *nsb, st_blocks);
2487 	CP(sb11, *nsb, st_blksize);
2488 	CP(sb11, *nsb, st_flags);
2489 	CP(sb11, *nsb, st_gen);
2490 	CP(sb11, *nsb, st_birthtim);
2491 	return (0);
2492 }
2493 
2494 #ifndef _SYS_SYSPROTO_H_
2495 struct freebsd11_nstat_args {
2496 	char	*path;
2497 	struct nstat *ub;
2498 };
2499 #endif
2500 int
2501 freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap)
2502 {
2503 	struct stat sb;
2504 	struct nstat nsb;
2505 	int error;
2506 
2507 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb);
2508 	if (error != 0)
2509 		return (error);
2510 	error = freebsd11_cvtnstat(&sb, &nsb);
2511 	if (error == 0)
2512 		error = copyout(&nsb, uap->ub, sizeof (nsb));
2513 	return (error);
2514 }
2515 
2516 /*
2517  * NetBSD lstat.  Get file status; this version does not follow links.
2518  */
2519 #ifndef _SYS_SYSPROTO_H_
2520 struct freebsd11_nlstat_args {
2521 	char	*path;
2522 	struct nstat *ub;
2523 };
2524 #endif
2525 int
2526 freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap)
2527 {
2528 	struct stat sb;
2529 	struct nstat nsb;
2530 	int error;
2531 
2532 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2533 	    UIO_USERSPACE, &sb);
2534 	if (error != 0)
2535 		return (error);
2536 	error = freebsd11_cvtnstat(&sb, &nsb);
2537 	if (error == 0)
2538 		error = copyout(&nsb, uap->ub, sizeof (nsb));
2539 	return (error);
2540 }
2541 #endif /* COMPAT_FREEBSD11 */
2542 
2543 /*
2544  * Get configurable pathname variables.
2545  */
2546 #ifndef _SYS_SYSPROTO_H_
2547 struct pathconf_args {
2548 	char	*path;
2549 	int	name;
2550 };
2551 #endif
2552 int
2553 sys_pathconf(struct thread *td, struct pathconf_args *uap)
2554 {
2555 	long value;
2556 	int error;
2557 
2558 	error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW,
2559 	    &value);
2560 	if (error == 0)
2561 		td->td_retval[0] = value;
2562 	return (error);
2563 }
2564 
2565 #ifndef _SYS_SYSPROTO_H_
2566 struct lpathconf_args {
2567 	char	*path;
2568 	int	name;
2569 };
2570 #endif
2571 int
2572 sys_lpathconf(struct thread *td, struct lpathconf_args *uap)
2573 {
2574 	long value;
2575 	int error;
2576 
2577 	error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2578 	    NOFOLLOW, &value);
2579 	if (error == 0)
2580 		td->td_retval[0] = value;
2581 	return (error);
2582 }
2583 
2584 int
2585 kern_pathconf(struct thread *td, const char *path, enum uio_seg pathseg,
2586     int name, u_long flags, long *valuep)
2587 {
2588 	struct nameidata nd;
2589 	int error;
2590 
2591 	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2592 	       pathseg, path);
2593 	if ((error = namei(&nd)) != 0)
2594 		return (error);
2595 	NDFREE_PNBUF(&nd);
2596 
2597 	error = VOP_PATHCONF(nd.ni_vp, name, valuep);
2598 	vput(nd.ni_vp);
2599 	return (error);
2600 }
2601 
2602 /*
2603  * Return target name of a symbolic link.
2604  */
2605 #ifndef _SYS_SYSPROTO_H_
2606 struct readlink_args {
2607 	char	*path;
2608 	char	*buf;
2609 	size_t	count;
2610 };
2611 #endif
2612 int
2613 sys_readlink(struct thread *td, struct readlink_args *uap)
2614 {
2615 
2616 	return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2617 	    uap->buf, UIO_USERSPACE, uap->count));
2618 }
2619 #ifndef _SYS_SYSPROTO_H_
2620 struct readlinkat_args {
2621 	int	fd;
2622 	char	*path;
2623 	char	*buf;
2624 	size_t	bufsize;
2625 };
2626 #endif
2627 int
2628 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2629 {
2630 
2631 	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2632 	    uap->buf, UIO_USERSPACE, uap->bufsize));
2633 }
2634 
2635 int
2636 kern_readlinkat(struct thread *td, int fd, const char *path,
2637     enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count)
2638 {
2639 	struct vnode *vp;
2640 	struct nameidata nd;
2641 	int error;
2642 
2643 	if (count > IOSIZE_MAX)
2644 		return (EINVAL);
2645 
2646 	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
2647 	    EMPTYPATH, pathseg, path, fd);
2648 
2649 	if ((error = namei(&nd)) != 0)
2650 		return (error);
2651 	NDFREE_PNBUF(&nd);
2652 	vp = nd.ni_vp;
2653 
2654 	error = kern_readlink_vp(vp, buf, bufseg, count, td);
2655 	vput(vp);
2656 
2657 	return (error);
2658 }
2659 
2660 /*
2661  * Helper function to readlink from a vnode
2662  */
2663 static int
2664 kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count,
2665     struct thread *td)
2666 {
2667 	struct iovec aiov;
2668 	struct uio auio;
2669 	int error;
2670 
2671 	ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked");
2672 #ifdef MAC
2673 	error = mac_vnode_check_readlink(td->td_ucred, vp);
2674 	if (error != 0)
2675 		return (error);
2676 #endif
2677 	if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0)
2678 		return (EINVAL);
2679 
2680 	aiov.iov_base = buf;
2681 	aiov.iov_len = count;
2682 	auio.uio_iov = &aiov;
2683 	auio.uio_iovcnt = 1;
2684 	auio.uio_offset = 0;
2685 	auio.uio_rw = UIO_READ;
2686 	auio.uio_segflg = bufseg;
2687 	auio.uio_td = td;
2688 	auio.uio_resid = count;
2689 	error = VOP_READLINK(vp, &auio, td->td_ucred);
2690 	td->td_retval[0] = count - auio.uio_resid;
2691 	return (error);
2692 }
2693 
2694 /*
2695  * Common implementation code for chflags() and fchflags().
2696  */
2697 static int
2698 setfflags(struct thread *td, struct vnode *vp, u_long flags)
2699 {
2700 	struct mount *mp;
2701 	struct vattr vattr;
2702 	int error;
2703 
2704 	/* We can't support the value matching VNOVAL. */
2705 	if (flags == VNOVAL)
2706 		return (EOPNOTSUPP);
2707 
2708 	/*
2709 	 * Prevent non-root users from setting flags on devices.  When
2710 	 * a device is reused, users can retain ownership of the device
2711 	 * if they are allowed to set flags and programs assume that
2712 	 * chown can't fail when done as root.
2713 	 */
2714 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2715 		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2716 		if (error != 0)
2717 			return (error);
2718 	}
2719 
2720 	if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0)
2721 		return (error);
2722 	VATTR_NULL(&vattr);
2723 	vattr.va_flags = flags;
2724 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2725 #ifdef MAC
2726 	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2727 	if (error == 0)
2728 #endif
2729 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2730 	VOP_UNLOCK(vp);
2731 	vn_finished_write(mp);
2732 	return (error);
2733 }
2734 
2735 /*
2736  * Change flags of a file given a path name.
2737  */
2738 #ifndef _SYS_SYSPROTO_H_
2739 struct chflags_args {
2740 	const char *path;
2741 	u_long	flags;
2742 };
2743 #endif
2744 int
2745 sys_chflags(struct thread *td, struct chflags_args *uap)
2746 {
2747 
2748 	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2749 	    uap->flags, 0));
2750 }
2751 
#ifndef _SYS_SYSPROTO_H_
/*
 * Argument layout of chflagsat(); only compiled when _SYS_SYSPROTO_H_ is
 * not defined.  The definition must end in ';' or the function that
 * follows fails to parse.
 */
struct chflagsat_args {
	int	fd;
	const char *path;
	u_long	flags;
	int	atflag;
};
#endif
2760 int
2761 sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2762 {
2763 
2764 	return (kern_chflagsat(td, uap->fd, uap->path, UIO_USERSPACE,
2765 	    uap->flags, uap->atflag));
2766 }
2767 
2768 /*
2769  * Same as chflags() but doesn't follow symlinks.
2770  */
2771 #ifndef _SYS_SYSPROTO_H_
2772 struct lchflags_args {
2773 	const char *path;
2774 	u_long flags;
2775 };
2776 #endif
2777 int
2778 sys_lchflags(struct thread *td, struct lchflags_args *uap)
2779 {
2780 
2781 	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2782 	    uap->flags, AT_SYMLINK_NOFOLLOW));
2783 }
2784 
2785 static int
2786 kern_chflagsat(struct thread *td, int fd, const char *path,
2787     enum uio_seg pathseg, u_long flags, int atflag)
2788 {
2789 	struct nameidata nd;
2790 	int error;
2791 
2792 	if ((atflag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
2793 	    AT_EMPTY_PATH)) != 0)
2794 		return (EINVAL);
2795 
2796 	AUDIT_ARG_FFLAGS(flags);
2797 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(atflag, AT_SYMLINK_NOFOLLOW |
2798 	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
2799 	    fd, &cap_fchflags_rights);
2800 	if ((error = namei(&nd)) != 0)
2801 		return (error);
2802 	NDFREE_PNBUF(&nd);
2803 	error = setfflags(td, nd.ni_vp, flags);
2804 	vrele(nd.ni_vp);
2805 	return (error);
2806 }
2807 
2808 /*
2809  * Change flags of a file given a file descriptor.
2810  */
2811 #ifndef _SYS_SYSPROTO_H_
2812 struct fchflags_args {
2813 	int	fd;
2814 	u_long	flags;
2815 };
2816 #endif
2817 int
2818 sys_fchflags(struct thread *td, struct fchflags_args *uap)
2819 {
2820 	struct file *fp;
2821 	int error;
2822 
2823 	AUDIT_ARG_FD(uap->fd);
2824 	AUDIT_ARG_FFLAGS(uap->flags);
2825 	error = getvnode(td, uap->fd, &cap_fchflags_rights,
2826 	    &fp);
2827 	if (error != 0)
2828 		return (error);
2829 #ifdef AUDIT
2830 	if (AUDITING_TD(td)) {
2831 		vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2832 		AUDIT_ARG_VNODE1(fp->f_vnode);
2833 		VOP_UNLOCK(fp->f_vnode);
2834 	}
2835 #endif
2836 	error = setfflags(td, fp->f_vnode, uap->flags);
2837 	fdrop(fp, td);
2838 	return (error);
2839 }
2840 
2841 /*
2842  * Common implementation code for chmod(), lchmod() and fchmod().
2843  */
2844 int
2845 setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode)
2846 {
2847 	struct mount *mp;
2848 	struct vattr vattr;
2849 	int error;
2850 
2851 	if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0)
2852 		return (error);
2853 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2854 	VATTR_NULL(&vattr);
2855 	vattr.va_mode = mode & ALLPERMS;
2856 #ifdef MAC
2857 	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2858 	if (error == 0)
2859 #endif
2860 		error = VOP_SETATTR(vp, &vattr, cred);
2861 	VOP_UNLOCK(vp);
2862 	vn_finished_write(mp);
2863 	return (error);
2864 }
2865 
2866 /*
2867  * Change mode of a file given path name.
2868  */
2869 #ifndef _SYS_SYSPROTO_H_
2870 struct chmod_args {
2871 	char	*path;
2872 	int	mode;
2873 };
2874 #endif
2875 int
2876 sys_chmod(struct thread *td, struct chmod_args *uap)
2877 {
2878 
2879 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2880 	    uap->mode, 0));
2881 }
2882 
#ifndef _SYS_SYSPROTO_H_
/*
 * Argument layout of fchmodat(); only compiled when _SYS_SYSPROTO_H_ is
 * not defined.  The descriptor member is named fd because that is the
 * name sys_fchmodat() reads (uap->fd), and the definition must end in ';'
 * or the function that follows fails to parse.
 */
struct fchmodat_args {
	int	fd;
	char	*path;
	mode_t	mode;
	int	flag;
};
#endif
2891 int
2892 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2893 {
2894 
2895 	return (kern_fchmodat(td, uap->fd, uap->path, UIO_USERSPACE,
2896 	    uap->mode, uap->flag));
2897 }
2898 
2899 /*
2900  * Change mode of a file given path name (don't follow links.)
2901  */
2902 #ifndef _SYS_SYSPROTO_H_
2903 struct lchmod_args {
2904 	char	*path;
2905 	int	mode;
2906 };
2907 #endif
2908 int
2909 sys_lchmod(struct thread *td, struct lchmod_args *uap)
2910 {
2911 
2912 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2913 	    uap->mode, AT_SYMLINK_NOFOLLOW));
2914 }
2915 
2916 int
2917 kern_fchmodat(struct thread *td, int fd, const char *path,
2918     enum uio_seg pathseg, mode_t mode, int flag)
2919 {
2920 	struct nameidata nd;
2921 	int error;
2922 
2923 	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
2924 	    AT_EMPTY_PATH)) != 0)
2925 		return (EINVAL);
2926 
2927 	AUDIT_ARG_MODE(mode);
2928 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
2929 	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
2930 	    fd, &cap_fchmod_rights);
2931 	if ((error = namei(&nd)) != 0)
2932 		return (error);
2933 	NDFREE_PNBUF(&nd);
2934 	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2935 	vrele(nd.ni_vp);
2936 	return (error);
2937 }
2938 
2939 /*
2940  * Change mode of a file given a file descriptor.
2941  */
2942 #ifndef _SYS_SYSPROTO_H_
2943 struct fchmod_args {
2944 	int	fd;
2945 	int	mode;
2946 };
2947 #endif
2948 int
2949 sys_fchmod(struct thread *td, struct fchmod_args *uap)
2950 {
2951 	struct file *fp;
2952 	int error;
2953 
2954 	AUDIT_ARG_FD(uap->fd);
2955 	AUDIT_ARG_MODE(uap->mode);
2956 
2957 	error = fget(td, uap->fd, &cap_fchmod_rights, &fp);
2958 	if (error != 0)
2959 		return (error);
2960 	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2961 	fdrop(fp, td);
2962 	return (error);
2963 }
2964 
2965 /*
2966  * Common implementation for chown(), lchown(), and fchown()
2967  */
2968 int
2969 setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
2970     gid_t gid)
2971 {
2972 	struct mount *mp;
2973 	struct vattr vattr;
2974 	int error;
2975 
2976 	if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0)
2977 		return (error);
2978 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2979 	VATTR_NULL(&vattr);
2980 	vattr.va_uid = uid;
2981 	vattr.va_gid = gid;
2982 #ifdef MAC
2983 	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2984 	    vattr.va_gid);
2985 	if (error == 0)
2986 #endif
2987 		error = VOP_SETATTR(vp, &vattr, cred);
2988 	VOP_UNLOCK(vp);
2989 	vn_finished_write(mp);
2990 	return (error);
2991 }
2992 
2993 /*
2994  * Set ownership given a path name.
2995  */
2996 #ifndef _SYS_SYSPROTO_H_
2997 struct chown_args {
2998 	char	*path;
2999 	int	uid;
3000 	int	gid;
3001 };
3002 #endif
3003 int
3004 sys_chown(struct thread *td, struct chown_args *uap)
3005 {
3006 
3007 	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid,
3008 	    uap->gid, 0));
3009 }
3010 
3011 #ifndef _SYS_SYSPROTO_H_
3012 struct fchownat_args {
3013 	int fd;
3014 	const char * path;
3015 	uid_t uid;
3016 	gid_t gid;
3017 	int flag;
3018 };
3019 #endif
3020 int
3021 sys_fchownat(struct thread *td, struct fchownat_args *uap)
3022 {
3023 
3024 	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
3025 	    uap->gid, uap->flag));
3026 }
3027 
3028 int
3029 kern_fchownat(struct thread *td, int fd, const char *path,
3030     enum uio_seg pathseg, int uid, int gid, int flag)
3031 {
3032 	struct nameidata nd;
3033 	int error;
3034 
3035 	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
3036 	    AT_EMPTY_PATH)) != 0)
3037 		return (EINVAL);
3038 
3039 	AUDIT_ARG_OWNER(uid, gid);
3040 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
3041 	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
3042 	    fd, &cap_fchown_rights);
3043 
3044 	if ((error = namei(&nd)) != 0)
3045 		return (error);
3046 	NDFREE_PNBUF(&nd);
3047 	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3048 	vrele(nd.ni_vp);
3049 	return (error);
3050 }
3051 
3052 /*
3053  * Set ownership given a path name, do not cross symlinks.
3054  */
3055 #ifndef _SYS_SYSPROTO_H_
3056 struct lchown_args {
3057 	char	*path;
3058 	int	uid;
3059 	int	gid;
3060 };
3061 #endif
3062 int
3063 sys_lchown(struct thread *td, struct lchown_args *uap)
3064 {
3065 
3066 	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3067 	    uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW));
3068 }
3069 
3070 /*
3071  * Set ownership given a file descriptor.
3072  */
3073 #ifndef _SYS_SYSPROTO_H_
3074 struct fchown_args {
3075 	int	fd;
3076 	int	uid;
3077 	int	gid;
3078 };
3079 #endif
3080 int
3081 sys_fchown(struct thread *td, struct fchown_args *uap)
3082 {
3083 	struct file *fp;
3084 	int error;
3085 
3086 	AUDIT_ARG_FD(uap->fd);
3087 	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3088 	error = fget(td, uap->fd, &cap_fchown_rights, &fp);
3089 	if (error != 0)
3090 		return (error);
3091 	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3092 	fdrop(fp, td);
3093 	return (error);
3094 }
3095 
3096 /*
3097  * Common implementation code for utimes(), lutimes(), and futimes().
3098  */
3099 static int
3100 getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg,
3101     struct timespec *tsp)
3102 {
3103 	struct timeval tv[2];
3104 	const struct timeval *tvp;
3105 	int error;
3106 
3107 	if (usrtvp == NULL) {
3108 		vfs_timestamp(&tsp[0]);
3109 		tsp[1] = tsp[0];
3110 	} else {
3111 		if (tvpseg == UIO_SYSSPACE) {
3112 			tvp = usrtvp;
3113 		} else {
3114 			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3115 				return (error);
3116 			tvp = tv;
3117 		}
3118 
3119 		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3120 		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3121 			return (EINVAL);
3122 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3123 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3124 	}
3125 	return (0);
3126 }
3127 
3128 /*
3129  * Common implementation code for futimens(), utimensat().
3130  */
3131 #define	UTIMENS_NULL	0x1
3132 #define	UTIMENS_EXIT	0x2
3133 static int
3134 getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
3135     struct timespec *tsp, int *retflags)
3136 {
3137 	struct timespec tsnow;
3138 	int error;
3139 
3140 	vfs_timestamp(&tsnow);
3141 	*retflags = 0;
3142 	if (usrtsp == NULL) {
3143 		tsp[0] = tsnow;
3144 		tsp[1] = tsnow;
3145 		*retflags |= UTIMENS_NULL;
3146 		return (0);
3147 	}
3148 	if (tspseg == UIO_SYSSPACE) {
3149 		tsp[0] = usrtsp[0];
3150 		tsp[1] = usrtsp[1];
3151 	} else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
3152 		return (error);
3153 	if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
3154 		*retflags |= UTIMENS_EXIT;
3155 	if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
3156 		*retflags |= UTIMENS_NULL;
3157 	if (tsp[0].tv_nsec == UTIME_OMIT)
3158 		tsp[0].tv_sec = VNOVAL;
3159 	else if (tsp[0].tv_nsec == UTIME_NOW)
3160 		tsp[0] = tsnow;
3161 	else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
3162 		return (EINVAL);
3163 	if (tsp[1].tv_nsec == UTIME_OMIT)
3164 		tsp[1].tv_sec = VNOVAL;
3165 	else if (tsp[1].tv_nsec == UTIME_NOW)
3166 		tsp[1] = tsnow;
3167 	else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
3168 		return (EINVAL);
3169 
3170 	return (0);
3171 }
3172 
3173 /*
3174  * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
3175  * and utimensat().
3176  */
3177 static int
3178 setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts,
3179     int numtimes, int nullflag)
3180 {
3181 	struct mount *mp;
3182 	struct vattr vattr;
3183 	int error;
3184 	bool setbirthtime;
3185 
3186 	setbirthtime = false;
3187 	vattr.va_birthtime.tv_sec = VNOVAL;
3188 	vattr.va_birthtime.tv_nsec = 0;
3189 
3190 	if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0)
3191 		return (error);
3192 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3193 	if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred) == 0 &&
3194 	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3195 		setbirthtime = true;
3196 	VATTR_NULL(&vattr);
3197 	vattr.va_atime = ts[0];
3198 	vattr.va_mtime = ts[1];
3199 	if (setbirthtime)
3200 		vattr.va_birthtime = ts[1];
3201 	if (numtimes > 2)
3202 		vattr.va_birthtime = ts[2];
3203 	if (nullflag)
3204 		vattr.va_vaflags |= VA_UTIMES_NULL;
3205 #ifdef MAC
3206 	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3207 	    vattr.va_mtime);
3208 #endif
3209 	if (error == 0)
3210 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3211 	VOP_UNLOCK(vp);
3212 	vn_finished_write(mp);
3213 	return (error);
3214 }
3215 
3216 /*
3217  * Set the access and modification times of a file.
3218  */
3219 #ifndef _SYS_SYSPROTO_H_
3220 struct utimes_args {
3221 	char	*path;
3222 	struct	timeval *tptr;
3223 };
3224 #endif
3225 int
3226 sys_utimes(struct thread *td, struct utimes_args *uap)
3227 {
3228 
3229 	return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3230 	    uap->tptr, UIO_USERSPACE));
3231 }
3232 
3233 #ifndef _SYS_SYSPROTO_H_
3234 struct futimesat_args {
3235 	int fd;
3236 	const char * path;
3237 	const struct timeval * times;
3238 };
3239 #endif
3240 int
3241 sys_futimesat(struct thread *td, struct futimesat_args *uap)
3242 {
3243 
3244 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3245 	    uap->times, UIO_USERSPACE));
3246 }
3247 
3248 int
3249 kern_utimesat(struct thread *td, int fd, const char *path,
3250     enum uio_seg pathseg, const struct timeval *tptr, enum uio_seg tptrseg)
3251 {
3252 	struct nameidata nd;
3253 	struct timespec ts[2];
3254 	int error;
3255 
3256 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3257 		return (error);
3258 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3259 	    &cap_futimes_rights);
3260 
3261 	if ((error = namei(&nd)) != 0)
3262 		return (error);
3263 	NDFREE_PNBUF(&nd);
3264 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3265 	vrele(nd.ni_vp);
3266 	return (error);
3267 }
3268 
3269 /*
3270  * Set the access and modification times of a file.
3271  */
3272 #ifndef _SYS_SYSPROTO_H_
3273 struct lutimes_args {
3274 	char	*path;
3275 	struct	timeval *tptr;
3276 };
3277 #endif
3278 int
3279 sys_lutimes(struct thread *td, struct lutimes_args *uap)
3280 {
3281 
3282 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3283 	    UIO_USERSPACE));
3284 }
3285 
3286 int
3287 kern_lutimes(struct thread *td, const char *path, enum uio_seg pathseg,
3288     const struct timeval *tptr, enum uio_seg tptrseg)
3289 {
3290 	struct timespec ts[2];
3291 	struct nameidata nd;
3292 	int error;
3293 
3294 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3295 		return (error);
3296 	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path);
3297 	if ((error = namei(&nd)) != 0)
3298 		return (error);
3299 	NDFREE_PNBUF(&nd);
3300 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3301 	vrele(nd.ni_vp);
3302 	return (error);
3303 }
3304 
3305 /*
3306  * Set the access and modification times of a file.
3307  */
3308 #ifndef _SYS_SYSPROTO_H_
3309 struct futimes_args {
3310 	int	fd;
3311 	struct	timeval *tptr;
3312 };
3313 #endif
3314 int
3315 sys_futimes(struct thread *td, struct futimes_args *uap)
3316 {
3317 
3318 	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3319 }
3320 
3321 int
3322 kern_futimes(struct thread *td, int fd, const struct timeval *tptr,
3323     enum uio_seg tptrseg)
3324 {
3325 	struct timespec ts[2];
3326 	struct file *fp;
3327 	int error;
3328 
3329 	AUDIT_ARG_FD(fd);
3330 	error = getutimes(tptr, tptrseg, ts);
3331 	if (error != 0)
3332 		return (error);
3333 	error = getvnode(td, fd, &cap_futimes_rights, &fp);
3334 	if (error != 0)
3335 		return (error);
3336 #ifdef AUDIT
3337 	if (AUDITING_TD(td)) {
3338 		vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3339 		AUDIT_ARG_VNODE1(fp->f_vnode);
3340 		VOP_UNLOCK(fp->f_vnode);
3341 	}
3342 #endif
3343 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3344 	fdrop(fp, td);
3345 	return (error);
3346 }
3347 
3348 int
3349 sys_futimens(struct thread *td, struct futimens_args *uap)
3350 {
3351 
3352 	return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
3353 }
3354 
3355 int
3356 kern_futimens(struct thread *td, int fd, const struct timespec *tptr,
3357     enum uio_seg tptrseg)
3358 {
3359 	struct timespec ts[2];
3360 	struct file *fp;
3361 	int error, flags;
3362 
3363 	AUDIT_ARG_FD(fd);
3364 	error = getutimens(tptr, tptrseg, ts, &flags);
3365 	if (error != 0)
3366 		return (error);
3367 	if (flags & UTIMENS_EXIT)
3368 		return (0);
3369 	error = getvnode(td, fd, &cap_futimes_rights, &fp);
3370 	if (error != 0)
3371 		return (error);
3372 #ifdef AUDIT
3373 	if (AUDITING_TD(td)) {
3374 		vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3375 		AUDIT_ARG_VNODE1(fp->f_vnode);
3376 		VOP_UNLOCK(fp->f_vnode);
3377 	}
3378 #endif
3379 	error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
3380 	fdrop(fp, td);
3381 	return (error);
3382 }
3383 
3384 int
3385 sys_utimensat(struct thread *td, struct utimensat_args *uap)
3386 {
3387 
3388 	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
3389 	    uap->times, UIO_USERSPACE, uap->flag));
3390 }
3391 
3392 int
3393 kern_utimensat(struct thread *td, int fd, const char *path,
3394     enum uio_seg pathseg, const struct timespec *tptr, enum uio_seg tptrseg,
3395     int flag)
3396 {
3397 	struct nameidata nd;
3398 	struct timespec ts[2];
3399 	int error, flags;
3400 
3401 	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
3402 	    AT_EMPTY_PATH)) != 0)
3403 		return (EINVAL);
3404 
3405 	if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
3406 		return (error);
3407 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
3408 	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1,
3409 	    pathseg, path, fd, &cap_futimes_rights);
3410 	if ((error = namei(&nd)) != 0)
3411 		return (error);
3412 	/*
3413 	 * We are allowed to call namei() regardless of 2xUTIME_OMIT.
3414 	 * POSIX states:
3415 	 * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
3416 	 * "Search permission is denied by a component of the path prefix."
3417 	 */
3418 	NDFREE_PNBUF(&nd);
3419 	if ((flags & UTIMENS_EXIT) == 0)
3420 		error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
3421 	vrele(nd.ni_vp);
3422 	return (error);
3423 }
3424 
3425 /*
3426  * Truncate a file given its path name.
3427  */
3428 #ifndef _SYS_SYSPROTO_H_
3429 struct truncate_args {
3430 	char	*path;
3431 	int	pad;
3432 	off_t	length;
3433 };
3434 #endif
3435 int
3436 sys_truncate(struct thread *td, struct truncate_args *uap)
3437 {
3438 
3439 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3440 }
3441 
3442 int
3443 kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg,
3444     off_t length)
3445 {
3446 	struct mount *mp;
3447 	struct vnode *vp;
3448 	void *rl_cookie;
3449 	struct nameidata nd;
3450 	int error;
3451 
3452 	if (length < 0)
3453 		return (EINVAL);
3454 	NDPREINIT(&nd);
3455 retry:
3456 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path);
3457 	if ((error = namei(&nd)) != 0)
3458 		return (error);
3459 	vp = nd.ni_vp;
3460 	NDFREE_PNBUF(&nd);
3461 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3462 	if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0) {
3463 		vn_rangelock_unlock(vp, rl_cookie);
3464 		vrele(vp);
3465 		return (error);
3466 	}
3467 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3468 	if (vp->v_type == VDIR) {
3469 		error = EISDIR;
3470 		goto out;
3471 	}
3472 #ifdef MAC
3473 	error = mac_vnode_check_write(td->td_ucred, NOCRED, vp);
3474 	if (error != 0)
3475 		goto out;
3476 #endif
3477 	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
3478 	if (error != 0)
3479 		goto out;
3480 
3481 	error = vn_truncate_locked(vp, length, false, td->td_ucred);
3482 out:
3483 	VOP_UNLOCK(vp);
3484 	vn_finished_write(mp);
3485 	vn_rangelock_unlock(vp, rl_cookie);
3486 	vrele(vp);
3487 	if (error == ERELOOKUP)
3488 		goto retry;
3489 	return (error);
3490 }
3491 
3492 #if defined(COMPAT_43)
3493 /*
3494  * Truncate a file given its path name.
3495  */
3496 #ifndef _SYS_SYSPROTO_H_
3497 struct otruncate_args {
3498 	char	*path;
3499 	long	length;
3500 };
3501 #endif
3502 int
3503 otruncate(struct thread *td, struct otruncate_args *uap)
3504 {
3505 
3506 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3507 }
3508 #endif /* COMPAT_43 */
3509 
3510 #if defined(COMPAT_FREEBSD6)
3511 /* Versions with the pad argument */
3512 int
3513 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3514 {
3515 
3516 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3517 }
3518 
3519 int
3520 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3521 {
3522 
3523 	return (kern_ftruncate(td, uap->fd, uap->length));
3524 }
3525 #endif
3526 
3527 int
3528 kern_fsync(struct thread *td, int fd, bool fullsync)
3529 {
3530 	struct vnode *vp;
3531 	struct mount *mp;
3532 	struct file *fp;
3533 	int error;
3534 
3535 	AUDIT_ARG_FD(fd);
3536 	error = getvnode(td, fd, &cap_fsync_rights, &fp);
3537 	if (error != 0)
3538 		return (error);
3539 	vp = fp->f_vnode;
3540 #if 0
3541 	if (!fullsync)
3542 		/* XXXKIB: compete outstanding aio writes */;
3543 #endif
3544 retry:
3545 	error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
3546 	if (error != 0)
3547 		goto drop;
3548 	vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
3549 	AUDIT_ARG_VNODE1(vp);
3550 	vnode_pager_clean_async(vp);
3551 	error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td);
3552 	VOP_UNLOCK(vp);
3553 	vn_finished_write(mp);
3554 	if (error == ERELOOKUP)
3555 		goto retry;
3556 drop:
3557 	fdrop(fp, td);
3558 	return (error);
3559 }
3560 
3561 /*
3562  * Sync an open file.
3563  */
3564 #ifndef _SYS_SYSPROTO_H_
3565 struct fsync_args {
3566 	int	fd;
3567 };
3568 #endif
3569 int
3570 sys_fsync(struct thread *td, struct fsync_args *uap)
3571 {
3572 
3573 	return (kern_fsync(td, uap->fd, true));
3574 }
3575 
3576 int
3577 sys_fdatasync(struct thread *td, struct fdatasync_args *uap)
3578 {
3579 
3580 	return (kern_fsync(td, uap->fd, false));
3581 }
3582 
3583 /*
3584  * Rename files.  Source and destination must either both be directories, or
3585  * both not be directories.  If target is a directory, it must be empty.
3586  */
3587 #ifndef _SYS_SYSPROTO_H_
3588 struct rename_args {
3589 	char	*from;
3590 	char	*to;
3591 };
3592 #endif
3593 int
3594 sys_rename(struct thread *td, struct rename_args *uap)
3595 {
3596 
3597 	return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD,
3598 	    uap->to, UIO_USERSPACE));
3599 }
3600 
3601 #ifndef _SYS_SYSPROTO_H_
3602 struct renameat_args {
3603 	int	oldfd;
3604 	char	*old;
3605 	int	newfd;
3606 	char	*new;
3607 };
3608 #endif
3609 int
3610 sys_renameat(struct thread *td, struct renameat_args *uap)
3611 {
3612 
3613 	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3614 	    UIO_USERSPACE));
3615 }
3616 
3617 #ifdef MAC
3618 static int
3619 kern_renameat_mac(struct thread *td, int oldfd, const char *old, int newfd,
3620     const char *new, enum uio_seg pathseg, struct nameidata *fromnd)
3621 {
3622 	int error;
3623 
3624 	NDINIT_ATRIGHTS(fromnd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
3625 	    pathseg, old, oldfd, &cap_renameat_source_rights);
3626 	if ((error = namei(fromnd)) != 0)
3627 		return (error);
3628 	error = mac_vnode_check_rename_from(td->td_ucred, fromnd->ni_dvp,
3629 	    fromnd->ni_vp, &fromnd->ni_cnd);
3630 	VOP_UNLOCK(fromnd->ni_dvp);
3631 	if (fromnd->ni_dvp != fromnd->ni_vp)
3632 		VOP_UNLOCK(fromnd->ni_vp);
3633 	if (error != 0) {
3634 		NDFREE_PNBUF(fromnd);
3635 		vrele(fromnd->ni_dvp);
3636 		vrele(fromnd->ni_vp);
3637 	}
3638 	return (error);
3639 }
3640 #endif
3641 
/*
 * Common implementation of rename(2) and renameat(2).
 *
 * Looks up the source name 'old' relative to 'oldfd' and the target name
 * 'new' relative to 'newfd' (both paths live in the address space given
 * by 'pathseg'), validates the pair and hands the operation to
 * VOP_RENAME().
 *
 * Returns 0 on success or an errno value.  ERESTART is used internally
 * to mean "source and target are the same vnode" and is reported to the
 * caller as success.  ERELOOKUP makes the function start over from the
 * source lookup.
 */
int
kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
    const char *new, enum uio_seg pathseg)
{
	struct mount *mp = NULL;
	struct vnode *tvp, *fvp, *tdvp;
	struct nameidata fromnd, tond;
	uint64_t tondflags;
	int error;

again:
	bwillwrite();
#ifdef MAC
	/*
	 * With the MAC rename_from check enabled, the source lookup and the
	 * check itself are done by kern_renameat_mac(); otherwise the plain
	 * lookup below is used.
	 */
	if (mac_vnode_check_rename_from_enabled()) {
		error = kern_renameat_mac(td, oldfd, old, newfd, new, pathseg,
		    &fromnd);
		if (error != 0)
			return (error);
	} else {
#endif
	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | AUDITVNODE1,
	    pathseg, old, oldfd, &cap_renameat_source_rights);
	if ((error = namei(&fromnd)) != 0)
		return (error);
#ifdef MAC
	}
#endif
	fvp = fromnd.ni_vp;
	/* Target lookup: parent and (if it exists) leaf come back locked. */
	tondflags = LOCKPARENT | LOCKLEAF | NOCACHE | AUDITVNODE2;
	if (fromnd.ni_vp->v_type == VDIR)
		tondflags |= WILLBEDIR;
	NDINIT_ATRIGHTS(&tond, RENAME, tondflags, pathseg, new, newfd,
	    &cap_renameat_target_rights);
	if ((error = namei(&tond)) != 0) {
		/* Translate error code for rename("dir1", "dir2/."). */
		if (error == EISDIR && fvp->v_type == VDIR)
			error = EINVAL;
		/* Only the source side holds resources at this point. */
		NDFREE_PNBUF(&fromnd);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;
	error = vn_start_write(fvp, &mp, V_NOWAIT);
	if (error != 0) {
		/*
		 * The write could not be started without waiting.  Release
		 * everything obtained by both lookups, call vn_start_write()
		 * again with V_XSLEEP | V_PCATCH and, unless that fails,
		 * redo the whole operation from scratch.
		 */
		NDFREE_PNBUF(&fromnd);
		NDFREE_PNBUF(&tond);
		if (tvp != NULL)
			vput(tvp);
		/*
		 * If the target directory and target are the same vnode, the
		 * vput() above already unlocked it; only drop the reference.
		 */
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH);
		if (error != 0)
			return (error);
		goto again;
	}
	if (tvp != NULL) {
		/* An existing target must be of the same kind as the source. */
		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
			error = EISDIR;
			goto out;
		}
#ifdef CAPABILITIES
		if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) {
			/*
			 * If the target already exists we require CAP_UNLINKAT
			 * from 'newfd', when newfd was used for the lookup.
			 */
			error = cap_check(&tond.ni_filecaps.fc_rights,
			    &cap_unlinkat_rights);
			if (error != 0)
				goto out;
		}
#endif
	}
	/* The source vnode is itself the directory the target lives in. */
	if (fvp == tdvp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * If the source is the same as the destination (that is, if they
	 * are links to the same vnode), then there is nothing to do.
	 * ERESTART is the internal marker for this case; it takes the
	 * cleanup branch below and is turned into 0 at out1.
	 */
	if (fvp == tvp)
		error = ERESTART;
#ifdef MAC
	else
		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
#endif
out:
	if (error == 0) {
		/*
		 * NOTE(review): no vnode is released here after VOP_RENAME();
		 * presumably the VOP disposes of the locks and references
		 * itself -- confirm against VOP_RENAME(9).
		 */
		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
		NDFREE_PNBUF(&fromnd);
		NDFREE_PNBUF(&tond);
	} else {
		/* VOP_RENAME() was not called: release everything by hand. */
		NDFREE_PNBUF(&fromnd);
		NDFREE_PNBUF(&tond);
		if (tvp != NULL)
			vput(tvp);
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
	}
	vn_finished_write(mp);
out1:
	/* Same-vnode rename: report success. */
	if (error == ERESTART)
		return (0);
	/* Restart the whole operation from the source lookup. */
	if (error == ERELOOKUP)
		goto again;
	return (error);
}
3765 
3766 /*
3767  * Make a directory file.
3768  */
3769 #ifndef _SYS_SYSPROTO_H_
3770 struct mkdir_args {
3771 	char	*path;
3772 	int	mode;
3773 };
3774 #endif
3775 int
3776 sys_mkdir(struct thread *td, struct mkdir_args *uap)
3777 {
3778 
3779 	return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3780 	    uap->mode));
3781 }
3782 
3783 #ifndef _SYS_SYSPROTO_H_
3784 struct mkdirat_args {
3785 	int	fd;
3786 	char	*path;
3787 	mode_t	mode;
3788 };
3789 #endif
3790 int
3791 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3792 {
3793 
3794 	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3795 }
3796 
3797 int
3798 kern_mkdirat(struct thread *td, int fd, const char *path, enum uio_seg segflg,
3799     int mode)
3800 {
3801 	struct mount *mp;
3802 	struct vattr vattr;
3803 	struct nameidata nd;
3804 	int error;
3805 
3806 	AUDIT_ARG_MODE(mode);
3807 	NDPREINIT(&nd);
3808 restart:
3809 	bwillwrite();
3810 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | AUDITVNODE1 |
3811 	    NC_NOMAKEENTRY | NC_KEEPPOSENTRY | FAILIFEXISTS | WILLBEDIR,
3812 	    segflg, path, fd, &cap_mkdirat_rights);
3813 	if ((error = namei(&nd)) != 0)
3814 		return (error);
3815 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3816 		NDFREE_PNBUF(&nd);
3817 		vput(nd.ni_dvp);
3818 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH)) != 0)
3819 			return (error);
3820 		goto restart;
3821 	}
3822 	VATTR_NULL(&vattr);
3823 	vattr.va_type = VDIR;
3824 	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_pd->pd_cmask;
3825 #ifdef MAC
3826 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3827 	    &vattr);
3828 	if (error != 0)
3829 		goto out;
3830 #endif
3831 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3832 #ifdef MAC
3833 out:
3834 #endif
3835 	NDFREE_PNBUF(&nd);
3836 	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
3837 	vn_finished_write(mp);
3838 	if (error == ERELOOKUP)
3839 		goto restart;
3840 	return (error);
3841 }
3842 
3843 /*
3844  * Remove a directory file.
3845  */
3846 #ifndef _SYS_SYSPROTO_H_
3847 struct rmdir_args {
3848 	char	*path;
3849 };
3850 #endif
3851 int
3852 sys_rmdir(struct thread *td, struct rmdir_args *uap)
3853 {
3854 
3855 	return (kern_frmdirat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE,
3856 	    0));
3857 }
3858 
/*
 * Remove the directory named by 'path' (in address space 'pathseg'),
 * resolved relative to 'dfd'.
 *
 * If 'fd' is not FD_NONE, it must be a descriptor for the very vnode the
 * path resolves to; otherwise the call fails with EDEADLK, or with EBADF
 * when the descriptor's vnode is doomed.  Of 'flag', only
 * AT_RESOLVE_BENEATH is translated into lookup flags.
 *
 * Fails with ENOTDIR if the name is not a directory, EINVAL for ".",
 * and EBUSY for the root of a mounted filesystem.  The lookup is redone
 * when the write cannot be started without waiting and on ERELOOKUP.
 * Returns 0 or an errno value.
 */
int
kern_frmdirat(struct thread *td, int dfd, const char *path, int fd,
    enum uio_seg pathseg, int flag)
{
	struct mount *mp;
	struct vnode *vp;
	struct file *fp;
	struct nameidata nd;
	cap_rights_t rights;
	int error;

	fp = NULL;
	if (fd != FD_NONE) {
		/* Hold the file for the duration; dropped at fdout. */
		error = getvnode(td, fd, cap_rights_init_one(&rights,
		    CAP_LOOKUP), &fp);
		if (error != 0)
			return (error);
	}

	NDPREINIT(&nd);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 |
	    at2cnpflags(flag, AT_RESOLVE_BENEATH),
	    pathseg, path, dfd, &cap_unlinkat_rights);
	if ((error = namei(&nd)) != 0)
		goto fdout;
	vp = nd.ni_vp;
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}
	/*
	 * No rmdir "." please.
	 */
	if (nd.ni_dvp == vp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * The root of a mounted filesystem cannot be deleted.
	 */
	if (vp->v_vflag & VV_ROOT) {
		error = EBUSY;
		goto out;
	}

	/* The descriptor, when given, must refer to the looked-up vnode. */
	if (fp != NULL && fp->f_vnode != vp) {
		if (VN_IS_DOOMED(fp->f_vnode))
			error = EBADF;
		else
			error = EDEADLK;
		goto out;
	}

#ifdef MAC
	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
	    &nd.ni_cnd);
	if (error != 0)
		goto out;
#endif
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/*
		 * The write could not be started without waiting.  Release
		 * the lookup results, call vn_start_write() again with
		 * V_XSLEEP | V_PCATCH and, unless that fails, look up anew.
		 */
		NDFREE_PNBUF(&nd);
		vput(vp);
		/* Same vnode: vput() above already unlocked it. */
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH)) != 0)
			goto fdout;
		goto restart;
	}
	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
	vn_finished_write(mp);
out:
	/* Common release of the path buffer, the leaf and the parent. */
	NDFREE_PNBUF(&nd);
	vput(vp);
	if (nd.ni_dvp == vp)
		vrele(nd.ni_dvp);
	else
		vput(nd.ni_dvp);
	if (error == ERELOOKUP)
		goto restart;
fdout:
	if (fp != NULL)
		fdrop(fp, td);
	return (error);
}
3948 
3949 #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
3950 int
3951 freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count,
3952     long *basep, void (*func)(struct freebsd11_dirent *))
3953 {
3954 	struct freebsd11_dirent dstdp;
3955 	struct dirent *dp, *edp;
3956 	char *dirbuf;
3957 	off_t base;
3958 	ssize_t resid, ucount;
3959 	int error;
3960 
3961 	/* XXX arbitrary sanity limit on `count'. */
3962 	count = min(count, 64 * 1024);
3963 
3964 	dirbuf = malloc(count, M_TEMP, M_WAITOK);
3965 
3966 	error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid,
3967 	    UIO_SYSSPACE);
3968 	if (error != 0)
3969 		goto done;
3970 	if (basep != NULL)
3971 		*basep = base;
3972 
3973 	ucount = 0;
3974 	for (dp = (struct dirent *)dirbuf,
3975 	    edp = (struct dirent *)&dirbuf[count - resid];
3976 	    ucount < count && dp < edp; ) {
3977 		if (dp->d_reclen == 0)
3978 			break;
3979 		MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0));
3980 		if (dp->d_namlen >= sizeof(dstdp.d_name))
3981 			continue;
3982 		dstdp.d_type = dp->d_type;
3983 		dstdp.d_namlen = dp->d_namlen;
3984 		dstdp.d_fileno = dp->d_fileno;		/* truncate */
3985 		if (dstdp.d_fileno != dp->d_fileno) {
3986 			switch (ino64_trunc_error) {
3987 			default:
3988 			case 0:
3989 				break;
3990 			case 1:
3991 				error = EOVERFLOW;
3992 				goto done;
3993 			case 2:
3994 				dstdp.d_fileno = UINT32_MAX;
3995 				break;
3996 			}
3997 		}
3998 		dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) +
3999 		    ((dp->d_namlen + 1 + 3) &~ 3);
4000 		bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen);
4001 		bzero(dstdp.d_name + dstdp.d_namlen,
4002 		    dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) -
4003 		    dstdp.d_namlen);
4004 		MPASS(dstdp.d_reclen <= dp->d_reclen);
4005 		MPASS(ucount + dstdp.d_reclen <= count);
4006 		if (func != NULL)
4007 			func(&dstdp);
4008 		error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen);
4009 		if (error != 0)
4010 			break;
4011 		dp = (struct dirent *)((char *)dp + dp->d_reclen);
4012 		ucount += dstdp.d_reclen;
4013 	}
4014 
4015 done:
4016 	free(dirbuf, M_TEMP);
4017 	if (error == 0)
4018 		td->td_retval[0] = ucount;
4019 	return (error);
4020 }
4021 #endif /* COMPAT */
4022 
4023 #ifdef COMPAT_43
4024 static void
4025 ogetdirentries_cvt(struct freebsd11_dirent *dp)
4026 {
4027 #if (BYTE_ORDER == LITTLE_ENDIAN)
4028 	/*
4029 	 * The expected low byte of dp->d_namlen is our dp->d_type.
4030 	 * The high MBZ byte of dp->d_namlen is our dp->d_namlen.
4031 	 */
4032 	dp->d_type = dp->d_namlen;
4033 	dp->d_namlen = 0;
4034 #else
4035 	/*
4036 	 * The dp->d_type is the high byte of the expected dp->d_namlen,
4037 	 * so must be zero'ed.
4038 	 */
4039 	dp->d_type = 0;
4040 #endif
4041 }
4042 
4043 /*
4044  * Read a block of directory entries in a filesystem independent format.
4045  */
4046 #ifndef _SYS_SYSPROTO_H_
4047 struct ogetdirentries_args {
4048 	int	fd;
4049 	char	*buf;
4050 	u_int	count;
4051 	long	*basep;
4052 };
4053 #endif
4054 int
4055 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
4056 {
4057 	long loff;
4058 	int error;
4059 
4060 	error = kern_ogetdirentries(td, uap, &loff);
4061 	if (error == 0)
4062 		error = copyout(&loff, uap->basep, sizeof(long));
4063 	return (error);
4064 }
4065 
4066 int
4067 kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
4068     long *ploff)
4069 {
4070 	long base;
4071 	int error;
4072 
4073 	/* XXX arbitrary sanity limit on `count'. */
4074 	if (uap->count > 64 * 1024)
4075 		return (EINVAL);
4076 
4077 	error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
4078 	    &base, ogetdirentries_cvt);
4079 
4080 	if (error == 0 && uap->basep != NULL)
4081 		error = copyout(&base, uap->basep, sizeof(long));
4082 
4083 	return (error);
4084 }
4085 #endif /* COMPAT_43 */
4086 
4087 #if defined(COMPAT_FREEBSD11)
4088 #ifndef _SYS_SYSPROTO_H_
4089 struct freebsd11_getdirentries_args {
4090 	int	fd;
4091 	char	*buf;
4092 	u_int	count;
4093 	long	*basep;
4094 };
4095 #endif
4096 int
4097 freebsd11_getdirentries(struct thread *td,
4098     struct freebsd11_getdirentries_args *uap)
4099 {
4100 	long base;
4101 	int error;
4102 
4103 	error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
4104 	    &base, NULL);
4105 
4106 	if (error == 0 && uap->basep != NULL)
4107 		error = copyout(&base, uap->basep, sizeof(long));
4108 	return (error);
4109 }
4110 
4111 int
4112 freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap)
4113 {
4114 	struct freebsd11_getdirentries_args ap;
4115 
4116 	ap.fd = uap->fd;
4117 	ap.buf = uap->buf;
4118 	ap.count = uap->count;
4119 	ap.basep = NULL;
4120 	return (freebsd11_getdirentries(td, &ap));
4121 }
4122 #endif /* COMPAT_FREEBSD11 */
4123 
4124 /*
4125  * Read a block of directory entries in a filesystem independent format.
4126  */
4127 int
4128 sys_getdirentries(struct thread *td, struct getdirentries_args *uap)
4129 {
4130 	off_t base;
4131 	int error;
4132 
4133 	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
4134 	    NULL, UIO_USERSPACE);
4135 	if (error != 0)
4136 		return (error);
4137 	if (uap->basep != NULL)
4138 		error = copyout(&base, uap->basep, sizeof(off_t));
4139 	return (error);
4140 }
4141 
/*
 * Read directory entries from the directory open on descriptor 'fd' into
 * 'buf' ('count' bytes, in the address space given by 'bufseg').
 *
 * On success '*basep' is set to the file offset at which the read
 * started, '*residp' (when not NULL) to the number of bytes of 'buf'
 * left unused, and td->td_retval[0] to the number of bytes produced.
 *
 * Fails with EINVAL if 'count' exceeds IOSIZE_MAX or the vnode is not a
 * directory, EBADF if the file is not open for reading, and ENOENT if
 * the directory vnode is flagged VV_UNLINKED.  Returns 0 or an errno
 * value.
 */
int
kern_getdirentries(struct thread *td, int fd, char *buf, size_t count,
    off_t *basep, ssize_t *residp, enum uio_seg bufseg)
{
	struct vnode *vp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	off_t loff;
	int error, eofflag;
	off_t foffset;

	AUDIT_ARG_FD(fd);
	if (count > IOSIZE_MAX)
		return (EINVAL);
	auio.uio_resid = count;
	error = getvnode(td, fd, &cap_read_rights, &fp);
	if (error != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	vp = fp->f_vnode;
	/* Paired with foffset_unlock() at 'fail', which stores the offset. */
	foffset = foffset_lock(fp, 0);
unionread:
	if (vp->v_type != VDIR) {
		error = EINVAL;
		goto fail;
	}
	if (__predict_false((vp->v_vflag & VV_UNLINKED) != 0)) {
		error = ENOENT;
		goto fail;
	}
	/* Describe the caller's buffer as a single-segment read request. */
	aiov.iov_base = buf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = bufseg;
	auio.uio_td = td;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	AUDIT_ARG_VNODE1(vp);
	/* Remember where this read starts; reported through *basep. */
	loff = auio.uio_offset = foffset;
#ifdef MAC
	error = mac_vnode_check_readdir(td->td_ucred, vp);
	if (error == 0)
#endif
		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
		    NULL);
	foffset = auio.uio_offset;
	if (error != 0) {
		VOP_UNLOCK(vp);
		goto fail;
	}
	/*
	 * Nothing was read from the root of an MNT_UNION mount: switch the
	 * file over to the covered vnode and read that one from offset 0.
	 */
	if (count == auio.uio_resid &&
	    (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;

		vp = vp->v_mount->mnt_vnodecovered;
		VREF(vp);
		fp->f_vnode = vp;
		foffset = 0;
		vput(tvp);
		goto unionread;
	}
	VOP_UNLOCK(vp);
	*basep = loff;
	if (residp != NULL)
		*residp = auio.uio_resid;
	td->td_retval[0] = count - auio.uio_resid;
fail:
	/* Also reached on success: release the offset and the file. */
	foffset_unlock(fp, foffset, 0);
	fdrop(fp, td);
	return (error);
}
4219 
4220 /*
4221  * Set the mode mask for creation of filesystem nodes.
4222  */
4223 #ifndef _SYS_SYSPROTO_H_
4224 struct umask_args {
4225 	int	newmask;
4226 };
4227 #endif
4228 int
4229 sys_umask(struct thread *td, struct umask_args *uap)
4230 {
4231 	struct pwddesc *pdp;
4232 
4233 	pdp = td->td_proc->p_pd;
4234 	PWDDESC_XLOCK(pdp);
4235 	td->td_retval[0] = pdp->pd_cmask;
4236 	pdp->pd_cmask = uap->newmask & ALLPERMS;
4237 	PWDDESC_XUNLOCK(pdp);
4238 	return (0);
4239 }
4240 
4241 /*
4242  * Void all references to file by ripping underlying filesystem away from
4243  * vnode.
4244  */
4245 #ifndef _SYS_SYSPROTO_H_
4246 struct revoke_args {
4247 	char	*path;
4248 };
4249 #endif
4250 int
4251 sys_revoke(struct thread *td, struct revoke_args *uap)
4252 {
4253 	struct vnode *vp;
4254 	struct vattr vattr;
4255 	struct nameidata nd;
4256 	int error;
4257 
4258 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4259 	    uap->path);
4260 	if ((error = namei(&nd)) != 0)
4261 		return (error);
4262 	vp = nd.ni_vp;
4263 	NDFREE_PNBUF(&nd);
4264 	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4265 		error = EINVAL;
4266 		goto out;
4267 	}
4268 #ifdef MAC
4269 	error = mac_vnode_check_revoke(td->td_ucred, vp);
4270 	if (error != 0)
4271 		goto out;
4272 #endif
4273 	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4274 	if (error != 0)
4275 		goto out;
4276 	if (td->td_ucred->cr_uid != vattr.va_uid) {
4277 		error = priv_check(td, PRIV_VFS_ADMIN);
4278 		if (error != 0)
4279 			goto out;
4280 	}
4281 	if (devfs_usecount(vp) > 0)
4282 		VOP_REVOKE(vp, REVOKEALL);
4283 out:
4284 	vput(vp);
4285 	return (error);
4286 }
4287 
4288 /*
4289  * This variant of getvnode() allows O_PATH files.  Caller should
4290  * ensure that returned file and vnode are only used for compatible
4291  * semantics.
4292  */
4293 int
4294 getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp,
4295     struct file **fpp)
4296 {
4297 	struct file *fp;
4298 	int error;
4299 
4300 	error = fget_unlocked(td, fd, rightsp, &fp);
4301 	if (error != 0)
4302 		return (error);
4303 
4304 	/*
4305 	 * The file could be not of the vnode type, or it may be not
4306 	 * yet fully initialized, in which case the f_vnode pointer
4307 	 * may be set, but f_ops is still badfileops.  E.g.,
4308 	 * devfs_open() transiently create such situation to
4309 	 * facilitate csw d_fdopen().
4310 	 *
4311 	 * Dupfdopen() handling in kern_openat() installs the
4312 	 * half-baked file into the process descriptor table, allowing
4313 	 * other thread to dereference it. Guard against the race by
4314 	 * checking f_ops.
4315 	 */
4316 	if (__predict_false(fp->f_vnode == NULL || fp->f_ops == &badfileops)) {
4317 		fdrop(fp, td);
4318 		*fpp = NULL;
4319 		return (EINVAL);
4320 	}
4321 
4322 	*fpp = fp;
4323 	return (0);
4324 }
4325 
4326 /*
4327  * Convert a user file descriptor to a kernel file entry and check
4328  * that, if it is a capability, the correct rights are present.
4329  * A reference on the file entry is held upon returning.
4330  */
4331 int
4332 getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
4333 {
4334 	int error;
4335 
4336 	error = getvnode_path(td, fd, rightsp, fpp);
4337 	if (__predict_false(error != 0))
4338 		return (error);
4339 
4340 	/*
4341 	 * Filter out O_PATH file descriptors, most getvnode() callers
4342 	 * do not call fo_ methods.
4343 	 */
4344 	if (__predict_false((*fpp)->f_ops == &path_fileops)) {
4345 		fdrop(*fpp, td);
4346 		*fpp = NULL;
4347 		error = EBADF;
4348 	}
4349 
4350 	return (error);
4351 }
4352 
4353 /*
4354  * Get an (NFS) file handle.
4355  */
4356 #ifndef _SYS_SYSPROTO_H_
4357 struct lgetfh_args {
4358 	char *fname;
4359 	fhandle_t *fhp;
4360 };
4361 #endif
4362 int
4363 sys_lgetfh(struct thread *td, struct lgetfh_args *uap)
4364 {
4365 
4366 	return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname,
4367 	    UIO_USERSPACE, uap->fhp, UIO_USERSPACE));
4368 }
4369 
4370 #ifndef _SYS_SYSPROTO_H_
4371 struct getfh_args {
4372 	char *fname;
4373 	fhandle_t *fhp;
4374 };
4375 #endif
4376 int
4377 sys_getfh(struct thread *td, struct getfh_args *uap)
4378 {
4379 
4380 	return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE,
4381 	    uap->fhp, UIO_USERSPACE));
4382 }
4383 
4384 /*
4385  * syscall for the rpc.lockd to use to translate an open descriptor into
4386  * a NFS file handle.
4387  *
4388  * warning: do not remove the priv_check() call or this becomes one giant
4389  * security hole.
4390  */
4391 #ifndef _SYS_SYSPROTO_H_
4392 struct getfhat_args {
4393 	int fd;
4394 	char *path;
4395 	fhandle_t *fhp;
4396 	int flags;
4397 };
4398 #endif
4399 int
4400 sys_getfhat(struct thread *td, struct getfhat_args *uap)
4401 {
4402 
4403 	return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE,
4404 	    uap->fhp, UIO_USERSPACE));
4405 }
4406 
4407 int
4408 kern_getfhat(struct thread *td, int flags, int fd, const char *path,
4409     enum uio_seg pathseg, fhandle_t *fhp, enum uio_seg fhseg)
4410 {
4411 	struct nameidata nd;
4412 	fhandle_t fh;
4413 	struct vnode *vp;
4414 	int error;
4415 
4416 	if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0)
4417 		return (EINVAL);
4418 	error = priv_check(td, PRIV_VFS_GETFH);
4419 	if (error != 0)
4420 		return (error);
4421 	NDINIT_AT(&nd, LOOKUP, at2cnpflags(flags, AT_SYMLINK_NOFOLLOW |
4422 	    AT_RESOLVE_BENEATH) | LOCKLEAF | AUDITVNODE1, pathseg, path,
4423 	    fd);
4424 	error = namei(&nd);
4425 	if (error != 0)
4426 		return (error);
4427 	NDFREE_PNBUF(&nd);
4428 	vp = nd.ni_vp;
4429 	bzero(&fh, sizeof(fh));
4430 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4431 	error = VOP_VPTOFH(vp, &fh.fh_fid);
4432 	vput(vp);
4433 	if (error == 0) {
4434 		if (fhseg == UIO_USERSPACE)
4435 			error = copyout(&fh, fhp, sizeof (fh));
4436 		else
4437 			memcpy(fhp, &fh, sizeof(fh));
4438 	}
4439 	return (error);
4440 }
4441 
4442 #ifndef _SYS_SYSPROTO_H_
4443 struct fhlink_args {
4444 	fhandle_t *fhp;
4445 	const char *to;
4446 };
4447 #endif
4448 int
4449 sys_fhlink(struct thread *td, struct fhlink_args *uap)
4450 {
4451 
4452 	return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp));
4453 }
4454 
4455 #ifndef _SYS_SYSPROTO_H_
4456 struct fhlinkat_args {
4457 	fhandle_t *fhp;
4458 	int tofd;
4459 	const char *to;
4460 };
4461 #endif
4462 int
4463 sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap)
4464 {
4465 
4466 	return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp));
4467 }
4468 
4469 static int
4470 kern_fhlinkat(struct thread *td, int fd, const char *path,
4471     enum uio_seg pathseg, fhandle_t *fhp)
4472 {
4473 	fhandle_t fh;
4474 	struct mount *mp;
4475 	struct vnode *vp;
4476 	int error;
4477 
4478 	error = priv_check(td, PRIV_VFS_GETFH);
4479 	if (error != 0)
4480 		return (error);
4481 	error = copyin(fhp, &fh, sizeof(fh));
4482 	if (error != 0)
4483 		return (error);
4484 	do {
4485 		bwillwrite();
4486 		if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4487 			return (ESTALE);
4488 		error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
4489 		vfs_unbusy(mp);
4490 		if (error != 0)
4491 			return (error);
4492 		VOP_UNLOCK(vp);
4493 		error = kern_linkat_vp(td, vp, fd, path, pathseg);
4494 	} while (error == EAGAIN || error == ERELOOKUP);
4495 	return (error);
4496 }
4497 
4498 #ifndef _SYS_SYSPROTO_H_
4499 struct fhreadlink_args {
4500 	fhandle_t *fhp;
4501 	char *buf;
4502 	size_t bufsize;
4503 };
4504 #endif
4505 int
4506 sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap)
4507 {
4508 	fhandle_t fh;
4509 	struct mount *mp;
4510 	struct vnode *vp;
4511 	int error;
4512 
4513 	error = priv_check(td, PRIV_VFS_GETFH);
4514 	if (error != 0)
4515 		return (error);
4516 	if (uap->bufsize > IOSIZE_MAX)
4517 		return (EINVAL);
4518 	error = copyin(uap->fhp, &fh, sizeof(fh));
4519 	if (error != 0)
4520 		return (error);
4521 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4522 		return (ESTALE);
4523 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
4524 	vfs_unbusy(mp);
4525 	if (error != 0)
4526 		return (error);
4527 	error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td);
4528 	vput(vp);
4529 	return (error);
4530 }
4531 
/*
 * fhopen(2): syscall for the rpc.lockd to use to translate a NFS file
 * handle into an open descriptor.
 *
 * warning: the priv_check() call made by kern_fhopen() must not be
 * removed, or this becomes one giant security hole.
 */
#ifndef _SYS_SYSPROTO_H_
struct fhopen_args {
	const struct fhandle *u_fhp;
	int flags;
};
#endif
int
sys_fhopen(struct thread *td, struct fhopen_args *uap)
{
	int error;

	error = kern_fhopen(td, uap->u_fhp, uap->flags);
	return (error);
}
4550 
/*
 * Open the file identified by the file handle at user address 'u_fhp'
 * with the open(2) style 'flags'.
 *
 * The caller must hold PRIV_VFS_FHOPEN.  EINVAL is returned when the
 * flags request neither reading nor writing, or include O_CREAT; ESTALE
 * when the filesystem named by the handle cannot be found.
 *
 * td->td_retval[0] is set to the new descriptor index once a file has
 * been allocated; it is left at -1 on the error paths that reach 'bad'.
 * Returns 0 or an errno value.
 */
int
kern_fhopen(struct thread *td, const struct fhandle *u_fhp, int flags)
{
	struct mount *mp;
	struct vnode *vp;
	struct fhandle fhp;
	struct file *fp;
	int fmode, error;
	int indx;

	error = priv_check(td, PRIV_VFS_FHOPEN);
	if (error != 0)
		return (error);
	indx = -1;
	fmode = FFLAGS(flags);
	/* why not allow a non-read/write open for our lockd? */
	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
		return (EINVAL);
	error = copyin(u_fhp, &fhp, sizeof(fhp));
	if (error != 0)
		return(error);
	/* find the mount point */
	mp = vfs_busyfs(&fhp.fh_fsid);
	if (mp == NULL)
		return (ESTALE);
	/* now give me my vnode, it gets returned to me locked */
	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
	vfs_unbusy(mp);
	if (error != 0)
		return (error);

	error = falloc_noinstall(td, &fp);
	if (error != 0) {
		vput(vp);
		return (error);
	}
	/*
	 * An extra reference on `fp' has been held for us by
	 * falloc_noinstall().
	 */

#ifdef INVARIANTS
	/* Lets the KASSERT below detect td_dupfd being set during open. */
	td->td_dupfd = -1;
#endif
	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
	if (error != 0) {
		KASSERT(fp->f_ops == &badfileops,
		    ("VOP_OPEN in fhopen() set f_ops"));
		KASSERT(td->td_dupfd < 0,
		    ("fhopen() encountered fdopen()"));

		/* The file never took over the vnode; release it here. */
		vput(vp);
		goto bad;
	}
#ifdef INVARIANTS
	td->td_dupfd = 0;
#endif
	/* Attach the vnode to the file; from here on only unlock it. */
	fp->f_vnode = vp;
	finit_vnode(fp, fmode, NULL, &vnops);
	VOP_UNLOCK(vp);
	if ((fmode & O_TRUNC) != 0) {
		error = fo_truncate(fp, 0, td->td_ucred, td);
		if (error != 0)
			goto bad;
	}

	error = finstall(td, fp, &indx, fmode, NULL);
bad:
	/* Also reached on success: drop our reference on the file. */
	fdrop(fp, td);
	td->td_retval[0] = indx;
	return (error);
}
4623 
4624 /*
4625  * Stat an (NFS) file handle.
4626  */
4627 #ifndef _SYS_SYSPROTO_H_
4628 struct fhstat_args {
4629 	struct fhandle *u_fhp;
4630 	struct stat *sb;
4631 };
4632 #endif
4633 int
4634 sys_fhstat(struct thread *td, struct fhstat_args *uap)
4635 {
4636 	struct stat sb;
4637 	struct fhandle fh;
4638 	int error;
4639 
4640 	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4641 	if (error != 0)
4642 		return (error);
4643 	error = kern_fhstat(td, fh, &sb);
4644 	if (error == 0)
4645 		error = copyout(&sb, uap->sb, sizeof(sb));
4646 	return (error);
4647 }
4648 
4649 int
4650 kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4651 {
4652 	struct mount *mp;
4653 	struct vnode *vp;
4654 	int error;
4655 
4656 	error = priv_check(td, PRIV_VFS_FHSTAT);
4657 	if (error != 0)
4658 		return (error);
4659 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4660 		return (ESTALE);
4661 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4662 	vfs_unbusy(mp);
4663 	if (error != 0)
4664 		return (error);
4665 	error = VOP_STAT(vp, sb, td->td_ucred, NOCRED);
4666 	vput(vp);
4667 	return (error);
4668 }
4669 
4670 /*
4671  * Implement fstatfs() for (NFS) file handles.
4672  */
4673 #ifndef _SYS_SYSPROTO_H_
4674 struct fhstatfs_args {
4675 	struct fhandle *u_fhp;
4676 	struct statfs *buf;
4677 };
4678 #endif
4679 int
4680 sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap)
4681 {
4682 	struct statfs *sfp;
4683 	fhandle_t fh;
4684 	int error;
4685 
4686 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4687 	if (error != 0)
4688 		return (error);
4689 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
4690 	error = kern_fhstatfs(td, fh, sfp);
4691 	if (error == 0)
4692 		error = copyout(sfp, uap->buf, sizeof(*sfp));
4693 	free(sfp, M_STATFS);
4694 	return (error);
4695 }
4696 
4697 int
4698 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4699 {
4700 	struct mount *mp;
4701 	struct vnode *vp;
4702 	int error;
4703 
4704 	error = priv_check(td, PRIV_VFS_FHSTATFS);
4705 	if (error != 0)
4706 		return (error);
4707 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4708 		return (ESTALE);
4709 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4710 	if (error != 0) {
4711 		vfs_unbusy(mp);
4712 		return (error);
4713 	}
4714 	vput(vp);
4715 	error = prison_canseemount(td->td_ucred, mp);
4716 	if (error != 0)
4717 		goto out;
4718 #ifdef MAC
4719 	error = mac_mount_check_stat(td->td_ucred, mp);
4720 	if (error != 0)
4721 		goto out;
4722 #endif
4723 	error = VFS_STATFS(mp, buf);
4724 out:
4725 	vfs_unbusy(mp);
4726 	return (error);
4727 }
4728 
4729 /*
4730  * Unlike madvise(2), we do not make a best effort to remember every
4731  * possible caching hint.  Instead, we remember the last setting with
4732  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4733  * region of any current setting.
4734  */
4735 int
4736 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4737     int advice)
4738 {
4739 	struct fadvise_info *fa, *new;
4740 	struct file *fp;
4741 	struct vnode *vp;
4742 	off_t end;
4743 	int error;
4744 
4745 	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4746 		return (EINVAL);
4747 	AUDIT_ARG_VALUE(advice);
4748 	switch (advice) {
4749 	case POSIX_FADV_SEQUENTIAL:
4750 	case POSIX_FADV_RANDOM:
4751 	case POSIX_FADV_NOREUSE:
4752 		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4753 		break;
4754 	case POSIX_FADV_NORMAL:
4755 	case POSIX_FADV_WILLNEED:
4756 	case POSIX_FADV_DONTNEED:
4757 		new = NULL;
4758 		break;
4759 	default:
4760 		return (EINVAL);
4761 	}
4762 	/* XXX: CAP_POSIX_FADVISE? */
4763 	AUDIT_ARG_FD(fd);
4764 	error = fget(td, fd, &cap_no_rights, &fp);
4765 	if (error != 0)
4766 		goto out;
4767 	AUDIT_ARG_FILE(td->td_proc, fp);
4768 	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4769 		error = ESPIPE;
4770 		goto out;
4771 	}
4772 	if (fp->f_type != DTYPE_VNODE) {
4773 		error = ENODEV;
4774 		goto out;
4775 	}
4776 	vp = fp->f_vnode;
4777 	if (vp->v_type != VREG) {
4778 		error = ENODEV;
4779 		goto out;
4780 	}
4781 	if (len == 0)
4782 		end = OFF_MAX;
4783 	else
4784 		end = offset + len - 1;
4785 	switch (advice) {
4786 	case POSIX_FADV_SEQUENTIAL:
4787 	case POSIX_FADV_RANDOM:
4788 	case POSIX_FADV_NOREUSE:
4789 		/*
4790 		 * Try to merge any existing non-standard region with
4791 		 * this new region if possible, otherwise create a new
4792 		 * non-standard region for this request.
4793 		 */
4794 		mtx_pool_lock(mtxpool_sleep, fp);
4795 		fa = fp->f_advice;
4796 		if (fa != NULL && fa->fa_advice == advice &&
4797 		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4798 		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4799 		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4800 			if (offset < fa->fa_start)
4801 				fa->fa_start = offset;
4802 			if (end > fa->fa_end)
4803 				fa->fa_end = end;
4804 		} else {
4805 			new->fa_advice = advice;
4806 			new->fa_start = offset;
4807 			new->fa_end = end;
4808 			fp->f_advice = new;
4809 			new = fa;
4810 		}
4811 		mtx_pool_unlock(mtxpool_sleep, fp);
4812 		break;
4813 	case POSIX_FADV_NORMAL:
4814 		/*
4815 		 * If a the "normal" region overlaps with an existing
4816 		 * non-standard region, trim or remove the
4817 		 * non-standard region.
4818 		 */
4819 		mtx_pool_lock(mtxpool_sleep, fp);
4820 		fa = fp->f_advice;
4821 		if (fa != NULL) {
4822 			if (offset <= fa->fa_start && end >= fa->fa_end) {
4823 				new = fa;
4824 				fp->f_advice = NULL;
4825 			} else if (offset <= fa->fa_start &&
4826 			    end >= fa->fa_start)
4827 				fa->fa_start = end + 1;
4828 			else if (offset <= fa->fa_end && end >= fa->fa_end)
4829 				fa->fa_end = offset - 1;
4830 			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4831 				/*
4832 				 * If the "normal" region is a middle
4833 				 * portion of the existing
4834 				 * non-standard region, just remove
4835 				 * the whole thing rather than picking
4836 				 * one side or the other to
4837 				 * preserve.
4838 				 */
4839 				new = fa;
4840 				fp->f_advice = NULL;
4841 			}
4842 		}
4843 		mtx_pool_unlock(mtxpool_sleep, fp);
4844 		break;
4845 	case POSIX_FADV_WILLNEED:
4846 	case POSIX_FADV_DONTNEED:
4847 		error = VOP_ADVISE(vp, offset, end, advice);
4848 		break;
4849 	}
4850 out:
4851 	if (fp != NULL)
4852 		fdrop(fp, td);
4853 	free(new, M_FADVISE);
4854 	return (error);
4855 }
4856 
4857 int
4858 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4859 {
4860 	int error;
4861 
4862 	error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
4863 	    uap->advice);
4864 	return (kern_posix_error(td, error));
4865 }
4866 
4867 int
4868 kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
4869     off_t *outoffp, size_t len, unsigned int flags)
4870 {
4871 	struct file *infp, *outfp;
4872 	struct vnode *invp, *outvp;
4873 	int error;
4874 	size_t retlen;
4875 	void *rl_rcookie, *rl_wcookie;
4876 	off_t savinoff, savoutoff;
4877 
4878 	infp = outfp = NULL;
4879 	rl_rcookie = rl_wcookie = NULL;
4880 	savinoff = -1;
4881 	error = 0;
4882 	retlen = 0;
4883 
4884 	if (flags != 0) {
4885 		error = EINVAL;
4886 		goto out;
4887 	}
4888 	if (len > SSIZE_MAX)
4889 		/*
4890 		 * Although the len argument is size_t, the return argument
4891 		 * is ssize_t (which is signed).  Therefore a size that won't
4892 		 * fit in ssize_t can't be returned.
4893 		 */
4894 		len = SSIZE_MAX;
4895 
4896 	/* Get the file structures for the file descriptors. */
4897 	error = fget_read(td, infd,
4898 	    inoffp != NULL ? &cap_pread_rights : &cap_read_rights, &infp);
4899 	if (error != 0)
4900 		goto out;
4901 	if (infp->f_ops == &badfileops) {
4902 		error = EBADF;
4903 		goto out;
4904 	}
4905 	if (infp->f_vnode == NULL) {
4906 		error = EINVAL;
4907 		goto out;
4908 	}
4909 	error = fget_write(td, outfd,
4910 	    outoffp != NULL ? &cap_pwrite_rights : &cap_write_rights, &outfp);
4911 	if (error != 0)
4912 		goto out;
4913 	if (outfp->f_ops == &badfileops) {
4914 		error = EBADF;
4915 		goto out;
4916 	}
4917 	if (outfp->f_vnode == NULL) {
4918 		error = EINVAL;
4919 		goto out;
4920 	}
4921 
4922 	/* Set the offset pointers to the correct place. */
4923 	if (inoffp == NULL)
4924 		inoffp = &infp->f_offset;
4925 	if (outoffp == NULL)
4926 		outoffp = &outfp->f_offset;
4927 	savinoff = *inoffp;
4928 	savoutoff = *outoffp;
4929 
4930 	invp = infp->f_vnode;
4931 	outvp = outfp->f_vnode;
4932 	/* Sanity check the f_flag bits. */
4933 	if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE ||
4934 	    (infp->f_flag & FREAD) == 0) {
4935 		error = EBADF;
4936 		goto out;
4937 	}
4938 
4939 	/* If len == 0, just return 0. */
4940 	if (len == 0)
4941 		goto out;
4942 
4943 	/*
4944 	 * If infp and outfp refer to the same file, the byte ranges cannot
4945 	 * overlap.
4946 	 */
4947 	if (invp == outvp && ((savinoff <= savoutoff && savinoff + len >
4948 	    savoutoff) || (savinoff > savoutoff && savoutoff + len >
4949 	    savinoff))) {
4950 		error = EINVAL;
4951 		goto out;
4952 	}
4953 
4954 	/* Range lock the byte ranges for both invp and outvp. */
4955 	for (;;) {
4956 		rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp +
4957 		    len);
4958 		rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp +
4959 		    len);
4960 		if (rl_rcookie != NULL)
4961 			break;
4962 		vn_rangelock_unlock(outvp, rl_wcookie);
4963 		rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len);
4964 		vn_rangelock_unlock(invp, rl_rcookie);
4965 	}
4966 
4967 	retlen = len;
4968 	error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
4969 	    flags, infp->f_cred, outfp->f_cred, td);
4970 out:
4971 	if (rl_rcookie != NULL)
4972 		vn_rangelock_unlock(invp, rl_rcookie);
4973 	if (rl_wcookie != NULL)
4974 		vn_rangelock_unlock(outvp, rl_wcookie);
4975 	if (savinoff != -1 && (error == EINTR || error == ERESTART)) {
4976 		*inoffp = savinoff;
4977 		*outoffp = savoutoff;
4978 	}
4979 	if (outfp != NULL)
4980 		fdrop(outfp, td);
4981 	if (infp != NULL)
4982 		fdrop(infp, td);
4983 	td->td_retval[0] = retlen;
4984 	return (error);
4985 }
4986 
4987 int
4988 sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap)
4989 {
4990 	off_t inoff, outoff, *inoffp, *outoffp;
4991 	int error;
4992 
4993 	inoffp = outoffp = NULL;
4994 	if (uap->inoffp != NULL) {
4995 		error = copyin(uap->inoffp, &inoff, sizeof(off_t));
4996 		if (error != 0)
4997 			return (error);
4998 		inoffp = &inoff;
4999 	}
5000 	if (uap->outoffp != NULL) {
5001 		error = copyin(uap->outoffp, &outoff, sizeof(off_t));
5002 		if (error != 0)
5003 			return (error);
5004 		outoffp = &outoff;
5005 	}
5006 	error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd,
5007 	    outoffp, uap->len, uap->flags);
5008 	if (error == 0 && uap->inoffp != NULL)
5009 		error = copyout(inoffp, uap->inoffp, sizeof(off_t));
5010 	if (error == 0 && uap->outoffp != NULL)
5011 		error = copyout(outoffp, uap->outoffp, sizeof(off_t));
5012 	return (error);
5013 }
5014