xref: /openbsd/sys/kern/kern_descrip.c (revision 404b540a)
1 /*	$OpenBSD: kern_descrip.c,v 1.82 2009/07/09 22:29:56 thib Exp $	*/
2 /*	$NetBSD: kern_descrip.c,v 1.42 1996/03/30 22:24:38 christos Exp $	*/
3 
4 /*
5  * Copyright (c) 1982, 1986, 1989, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * (c) UNIX System Laboratories, Inc.
8  * All or some portions of this file are derived from material licensed
9  * to the University of California by American Telephone and Telegraph
10  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
11  * the permission of UNIX System Laboratories, Inc.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
38  */
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/filedesc.h>
43 #include <sys/kernel.h>
44 #include <sys/vnode.h>
45 #include <sys/proc.h>
46 #include <sys/file.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/stat.h>
50 #include <sys/ioctl.h>
51 #include <sys/fcntl.h>
52 #include <sys/malloc.h>
53 #include <sys/syslog.h>
54 #include <sys/ucred.h>
55 #include <sys/unistd.h>
56 #include <sys/resourcevar.h>
57 #include <sys/conf.h>
58 #include <sys/mount.h>
59 #include <sys/syscallargs.h>
60 #include <sys/event.h>
61 #include <sys/pool.h>
62 
63 #include <uvm/uvm_extern.h>
64 
65 #include <sys/pipe.h>
66 
67 /*
68  * Descriptor management.
69  */
70 struct filelist filehead;	/* head of list of open files */
71 int nfiles;			/* actual number of open files */
72 
73 static __inline void fd_used(struct filedesc *, int);
74 static __inline void fd_unused(struct filedesc *, int);
75 static __inline int find_next_zero(u_int *, int, u_int);
76 int finishdup(struct proc *, struct file *, int, int, register_t *);
77 int find_last_set(struct filedesc *, int);
78 
79 struct pool file_pool;
80 struct pool fdesc_pool;
81 
82 void
83 filedesc_init(void)
84 {
85 	pool_init(&file_pool, sizeof(struct file), 0, 0, 0, "filepl",
86 		&pool_allocator_nointr);
87 	pool_init(&fdesc_pool, sizeof(struct filedesc0), 0, 0, 0, "fdescpl",
88 		&pool_allocator_nointr);
89 	LIST_INIT(&filehead);
90 }
91 
92 static __inline int
93 find_next_zero (u_int *bitmap, int want, u_int bits)
94 {
95 	int i, off, maxoff;
96 	u_int sub;
97 
98 	if (want > bits)
99 		return -1;
100 
101 	off = want >> NDENTRYSHIFT;
102 	i = want & NDENTRYMASK;
103 	if (i) {
104 		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
105 		if (sub != ~0)
106 			goto found;
107 		off++;
108 	}
109 
110 	maxoff = NDLOSLOTS(bits);
111 	while (off < maxoff) {
112 		if ((sub = bitmap[off]) != ~0)
113 			goto found;
114 		off++;
115 	}
116 
117 	return -1;
118 
119  found:
120 	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
121 }
122 
123 int
124 find_last_set(struct filedesc *fd, int last)
125 {
126 	int off, i;
127 	struct file **ofiles = fd->fd_ofiles;
128 	u_int *bitmap = fd->fd_lomap;
129 
130 	off = (last - 1) >> NDENTRYSHIFT;
131 
132 	while (off >= 0 && !bitmap[off])
133 		off--;
134 	if (off < 0)
135 		return 0;
136 
137 	i = ((off + 1) << NDENTRYSHIFT) - 1;
138 	if (i >= last)
139 		i = last - 1;
140 
141 	while (i > 0 && ofiles[i] == NULL)
142 		i--;
143 	return i;
144 }
145 
146 static __inline void
147 fd_used(struct filedesc *fdp, int fd)
148 {
149 	u_int off = fd >> NDENTRYSHIFT;
150 
151 	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
152 	if (fdp->fd_lomap[off] == ~0)
153 		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
154 
155 	if (fd > fdp->fd_lastfile)
156 		fdp->fd_lastfile = fd;
157 }
158 
159 static __inline void
160 fd_unused(struct filedesc *fdp, int fd)
161 {
162 	u_int off = fd >> NDENTRYSHIFT;
163 
164 	if (fd < fdp->fd_freefile)
165 		fdp->fd_freefile = fd;
166 
167 	if (fdp->fd_lomap[off] == ~0)
168 		fdp->fd_himap[off >> NDENTRYSHIFT] &= ~(1 << (off & NDENTRYMASK));
169 	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
170 
171 #ifdef DIAGNOSTIC
172 	if (fd > fdp->fd_lastfile)
173 		panic("fd_unused: fd_lastfile inconsistent");
174 #endif
175 	if (fd == fdp->fd_lastfile)
176 		fdp->fd_lastfile = find_last_set(fdp, fd);
177 }
178 
179 struct file *
180 fd_getfile(struct filedesc *fdp, int fd)
181 {
182 	struct file *fp;
183 
184 	if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL)
185 		return (NULL);
186 
187 	if (!FILE_IS_USABLE(fp))
188 		return (NULL);
189 
190 	return (fp);
191 }
192 
193 /*
194  * System calls on descriptors.
195  */
196 
197 /*
198  * Duplicate a file descriptor.
199  */
200 /* ARGSUSED */
201 int
202 sys_dup(struct proc *p, void *v, register_t *retval)
203 {
204 	struct sys_dup_args /* {
205 		syscallarg(int) fd;
206 	} */ *uap = v;
207 	struct filedesc *fdp = p->p_fd;
208 	int old = SCARG(uap, fd);
209 	struct file *fp;
210 	int new;
211 	int error;
212 
213 restart:
214 	if ((fp = fd_getfile(fdp, old)) == NULL)
215 		return (EBADF);
216 	FREF(fp);
217 	fdplock(fdp);
218 	if ((error = fdalloc(p, 0, &new)) != 0) {
219 		FRELE(fp);
220 		if (error == ENOSPC) {
221 			fdexpand(p);
222 			fdpunlock(fdp);
223 			goto restart;
224 		}
225 		goto out;
226 	}
227 	error = finishdup(p, fp, old, new, retval);
228 
229 out:
230 	fdpunlock(fdp);
231 	return (error);
232 }
233 
234 /*
235  * Duplicate a file descriptor to a particular value.
236  */
237 /* ARGSUSED */
238 int
239 sys_dup2(struct proc *p, void *v, register_t *retval)
240 {
241 	struct sys_dup2_args /* {
242 		syscallarg(int) from;
243 		syscallarg(int) to;
244 	} */ *uap = v;
245 	int old = SCARG(uap, from), new = SCARG(uap, to);
246 	struct filedesc *fdp = p->p_fd;
247 	struct file *fp;
248 	int i, error;
249 
250 restart:
251 	if ((fp = fd_getfile(fdp, old)) == NULL)
252 		return (EBADF);
253 	if ((u_int)new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
254 	    (u_int)new >= maxfiles)
255 		return (EBADF);
256 	if (old == new) {
257 		/*
258 		 * NOTE! This doesn't clear the close-on-exec flag. This might
259 		 * or might not be the intended behavior from the start, but
260 		 * this is what everyone else does.
261 		 */
262 		*retval = new;
263 		return (0);
264 	}
265 	FREF(fp);
266 	fdplock(fdp);
267 	if (new >= fdp->fd_nfiles) {
268 		if ((error = fdalloc(p, new, &i)) != 0) {
269 			FRELE(fp);
270 			if (error == ENOSPC) {
271 				fdexpand(p);
272 				fdpunlock(fdp);
273 				goto restart;
274 			}
275 			goto out;
276 		}
277 		if (new != i)
278 			panic("dup2: fdalloc");
279 	}
280 	/* finishdup() does FRELE */
281 	error = finishdup(p, fp, old, new, retval);
282 
283 out:
284 	fdpunlock(fdp);
285 	return (error);
286 }
287 
288 /*
289  * The file control system call.
290  */
291 /* ARGSUSED */
292 int
293 sys_fcntl(struct proc *p, void *v, register_t *retval)
294 {
295 	struct sys_fcntl_args /* {
296 		syscallarg(int) fd;
297 		syscallarg(int) cmd;
298 		syscallarg(void *) arg;
299 	} */ *uap = v;
300 	int fd = SCARG(uap, fd);
301 	struct filedesc *fdp = p->p_fd;
302 	struct file *fp;
303 	struct vnode *vp;
304 	int i, tmp, newmin, flg = F_POSIX;
305 	struct flock fl;
306 	int error = 0;
307 
308 restart:
309 	if ((fp = fd_getfile(fdp, fd)) == NULL)
310 		return (EBADF);
311 	FREF(fp);
312 	switch (SCARG(uap, cmd)) {
313 
314 	case F_DUPFD:
315 		newmin = (long)SCARG(uap, arg);
316 		if ((u_int)newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
317 		    (u_int)newmin >= maxfiles) {
318 			error = EINVAL;
319 			break;
320 		}
321 		fdplock(fdp);
322 		if ((error = fdalloc(p, newmin, &i)) != 0) {
323 			if (error == ENOSPC) {
324 				fdexpand(p);
325 				FRELE(fp);
326 				fdpunlock(fdp);
327 				goto restart;
328 			}
329 		}
330 		/* finishdup will FRELE for us. */
331 		if (!error)
332 			error = finishdup(p, fp, fd, i, retval);
333 		else
334 			FRELE(fp);
335 
336 		fdpunlock(fdp);
337 		return (error);
338 
339 	case F_GETFD:
340 		*retval = fdp->fd_ofileflags[fd] & UF_EXCLOSE ? 1 : 0;
341 		break;
342 
343 	case F_SETFD:
344 		if ((long)SCARG(uap, arg) & 1)
345 			fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
346 		else
347 			fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
348 		break;
349 
350 	case F_GETFL:
351 		*retval = OFLAGS(fp->f_flag);
352 		break;
353 
354 	case F_SETFL:
355 		fp->f_flag &= ~FCNTLFLAGS;
356 		fp->f_flag |= FFLAGS((long)SCARG(uap, arg)) & FCNTLFLAGS;
357 		tmp = fp->f_flag & FNONBLOCK;
358 		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
359 		if (error)
360 			break;
361 		tmp = fp->f_flag & FASYNC;
362 		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
363 		if (!error)
364 			break;
365 		fp->f_flag &= ~FNONBLOCK;
366 		tmp = 0;
367 		(void) (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
368 		break;
369 
370 	case F_GETOWN:
371 		if (fp->f_type == DTYPE_SOCKET) {
372 			*retval = ((struct socket *)fp->f_data)->so_pgid;
373 			break;
374 		}
375 		error = (*fp->f_ops->fo_ioctl)
376 			(fp, TIOCGPGRP, (caddr_t)&tmp, p);
377 		*retval = -tmp;
378 		break;
379 
380 	case F_SETOWN:
381 		if (fp->f_type == DTYPE_SOCKET) {
382 			struct socket *so = (struct socket *)fp->f_data;
383 
384 			so->so_pgid = (long)SCARG(uap, arg);
385 			so->so_siguid = p->p_cred->p_ruid;
386 			so->so_sigeuid = p->p_ucred->cr_uid;
387 			break;
388 		}
389 		if ((long)SCARG(uap, arg) <= 0) {
390 			SCARG(uap, arg) = (void *)(-(long)SCARG(uap, arg));
391 		} else {
392 			struct proc *p1 = pfind((long)SCARG(uap, arg));
393 			if (p1 == 0) {
394 				error = ESRCH;
395 				break;
396 			}
397 			SCARG(uap, arg) = (void *)(long)p1->p_pgrp->pg_id;
398 		}
399 		error = ((*fp->f_ops->fo_ioctl)
400 			(fp, TIOCSPGRP, (caddr_t)&SCARG(uap, arg), p));
401 		break;
402 
403 	case F_SETLKW:
404 		flg |= F_WAIT;
405 		/* FALLTHROUGH */
406 
407 	case F_SETLK:
408 		if (fp->f_type != DTYPE_VNODE) {
409 			error = EBADF;
410 			break;
411 		}
412 		vp = (struct vnode *)fp->f_data;
413 		/* Copy in the lock structure */
414 		error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl,
415 		    sizeof (fl));
416 		if (error)
417 			break;
418 		if (fl.l_whence == SEEK_CUR) {
419 			if (fl.l_start == 0 && fl.l_len < 0) {
420 				/* lockf(3) compliance hack */
421 				fl.l_len = -fl.l_len;
422 				fl.l_start = fp->f_offset - fl.l_len;
423 			} else
424 				fl.l_start += fp->f_offset;
425 		}
426 		switch (fl.l_type) {
427 
428 		case F_RDLCK:
429 			if ((fp->f_flag & FREAD) == 0) {
430 				error = EBADF;
431 				goto out;
432 			}
433 			atomic_setbits_int(&fdp->fd_flags, FD_ADVLOCK);
434 			error = VOP_ADVLOCK(vp, fdp, F_SETLK, &fl, flg);
435 			break;
436 
437 		case F_WRLCK:
438 			if ((fp->f_flag & FWRITE) == 0) {
439 				error = EBADF;
440 				goto out;
441 			}
442 			atomic_setbits_int(&fdp->fd_flags, FD_ADVLOCK);
443 			error = VOP_ADVLOCK(vp, fdp, F_SETLK, &fl, flg);
444 			break;
445 
446 		case F_UNLCK:
447 			error = VOP_ADVLOCK(vp, fdp, F_UNLCK, &fl, F_POSIX);
448 			goto out;
449 
450 		default:
451 			error = EINVAL;
452 			goto out;
453 		}
454 
455 		if (fp != fd_getfile(fdp, fd)) {
456 			/*
457 			 * We have lost the race with close() or dup2();
458 			 * unlock, pretend that we've won the race and that
459 			 * lock had been removed by close()
460 			 */
461 			fl.l_whence = SEEK_SET;
462 			fl.l_start = 0;
463 			fl.l_len = 0;
464 			VOP_ADVLOCK(vp, fdp, F_UNLCK, &fl, F_POSIX);
465 			fl.l_type = F_UNLCK;
466 		}
467 		goto out;
468 
469 
470 	case F_GETLK:
471 		if (fp->f_type != DTYPE_VNODE) {
472 			error = EBADF;
473 			break;
474 		}
475 		vp = (struct vnode *)fp->f_data;
476 		/* Copy in the lock structure */
477 		error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl,
478 		    sizeof (fl));
479 		if (error)
480 			break;
481 		if (fl.l_whence == SEEK_CUR) {
482 			if (fl.l_start == 0 && fl.l_len < 0) {
483 				/* lockf(3) compliance hack */
484 				fl.l_len = -fl.l_len;
485 				fl.l_start = fp->f_offset - fl.l_len;
486 			} else
487 				fl.l_start += fp->f_offset;
488 		}
489 		if (fl.l_type != F_RDLCK &&
490 		    fl.l_type != F_WRLCK &&
491 		    fl.l_type != F_UNLCK &&
492 		    fl.l_type != 0) {
493 			error = EINVAL;
494 			break;
495 		}
496 		error = VOP_ADVLOCK(vp, fdp, F_GETLK, &fl, F_POSIX);
497 		if (error)
498 			break;
499 		error = (copyout((caddr_t)&fl, (caddr_t)SCARG(uap, arg),
500 		    sizeof (fl)));
501 		break;
502 
503 	default:
504 		error = EINVAL;
505 		break;
506 	}
507 out:
508 	FRELE(fp);
509 	return (error);
510 }
511 
512 /*
513  * Common code for dup, dup2, and fcntl(F_DUPFD).
514  */
515 int
516 finishdup(struct proc *p, struct file *fp, int old, int new, register_t *retval)
517 {
518 	struct file *oldfp;
519 	struct filedesc *fdp = p->p_fd;
520 
521 	if (fp->f_count == LONG_MAX-2) {
522 		FRELE(fp);
523 		return (EDEADLK);
524 	}
525 
526 	/*
527 	 * Don't fd_getfile here. We want to closef LARVAL files and
528 	 * closef can deal with that.
529 	 */
530 	oldfp = fdp->fd_ofiles[new];
531 	if (oldfp != NULL)
532 		FREF(oldfp);
533 
534 	fdp->fd_ofiles[new] = fp;
535 	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
536 	fp->f_count++;
537 	FRELE(fp);
538 	if (oldfp == NULL)
539 		fd_used(fdp, new);
540 	*retval = new;
541 
542 	if (oldfp != NULL) {
543 		if (new < fdp->fd_knlistsize)
544 			knote_fdclose(p, new);
545 		closef(oldfp, p);
546 	}
547 
548 	return (0);
549 }
550 
551 void
552 fdremove(struct filedesc *fdp, int fd)
553 {
554 	fdp->fd_ofiles[fd] = NULL;
555 	fd_unused(fdp, fd);
556 }
557 
558 int
559 fdrelease(struct proc *p, int fd)
560 {
561 	struct filedesc *fdp = p->p_fd;
562 	struct file **fpp, *fp;
563 
564 	/*
565 	 * Don't fd_getfile here. We want to closef LARVAL files and closef
566 	 * can deal with that.
567 	 */
568 	fpp = &fdp->fd_ofiles[fd];
569 	fp = *fpp;
570 	if (fp == NULL)
571 		return (EBADF);
572 	FREF(fp);
573 	*fpp = NULL;
574 	fdp->fd_ofileflags[fd] = 0;
575 	fd_unused(fdp, fd);
576 	if (fd < fdp->fd_knlistsize)
577 		knote_fdclose(p, fd);
578 	return (closef(fp, p));
579 }
580 
581 /*
582  * Close a file descriptor.
583  */
584 /* ARGSUSED */
585 int
586 sys_close(struct proc *p, void *v, register_t *retval)
587 {
588 	struct sys_close_args /* {
589 		syscallarg(int) fd;
590 	} */ *uap = v;
591 	int fd = SCARG(uap, fd), error;
592 	struct filedesc *fdp = p->p_fd;
593 
594 	if (fd_getfile(fdp, fd) == NULL)
595 		return (EBADF);
596 	fdplock(fdp);
597 	error = fdrelease(p, fd);
598 	fdpunlock(fdp);
599 
600 	return (error);
601 }
602 
603 /*
604  * Return status information about a file descriptor.
605  */
606 /* ARGSUSED */
607 int
608 sys_fstat(struct proc *p, void *v, register_t *retval)
609 {
610 	struct sys_fstat_args /* {
611 		syscallarg(int) fd;
612 		syscallarg(struct stat *) sb;
613 	} */ *uap = v;
614 	int fd = SCARG(uap, fd);
615 	struct filedesc *fdp = p->p_fd;
616 	struct file *fp;
617 	struct stat ub;
618 	int error;
619 
620 	if ((fp = fd_getfile(fdp, fd)) == NULL)
621 		return (EBADF);
622 	FREF(fp);
623 	error = (*fp->f_ops->fo_stat)(fp, &ub, p);
624 	FRELE(fp);
625 	if (error == 0) {
626 		/*
627 		 * Don't let non-root see generation numbers
628 		 * (for NFS security)
629 		 */
630 		if (suser(p, 0))
631 			ub.st_gen = 0;
632 		error = copyout((caddr_t)&ub, (caddr_t)SCARG(uap, sb),
633 		    sizeof (ub));
634 	}
635 	return (error);
636 }
637 
638 /*
639  * Return pathconf information about a file descriptor.
640  */
641 /* ARGSUSED */
642 int
643 sys_fpathconf(struct proc *p, void *v, register_t *retval)
644 {
645 	struct sys_fpathconf_args /* {
646 		syscallarg(int) fd;
647 		syscallarg(int) name;
648 	} */ *uap = v;
649 	int fd = SCARG(uap, fd);
650 	struct filedesc *fdp = p->p_fd;
651 	struct file *fp;
652 	struct vnode *vp;
653 	int error;
654 
655 	if ((fp = fd_getfile(fdp, fd)) == NULL)
656 		return (EBADF);
657 	FREF(fp);
658 	switch (fp->f_type) {
659 	case DTYPE_PIPE:
660 	case DTYPE_SOCKET:
661 		if (SCARG(uap, name) != _PC_PIPE_BUF) {
662 			error = EINVAL;
663 			break;
664 		}
665 		*retval = PIPE_BUF;
666 		error = 0;
667 		break;
668 
669 	case DTYPE_VNODE:
670 		vp = (struct vnode *)fp->f_data;
671 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
672 		error = VOP_PATHCONF(vp, SCARG(uap, name), retval);
673 		VOP_UNLOCK(vp, 0, p);
674 		break;
675 
676 	default:
677 		error = EOPNOTSUPP;
678 		break;
679 	}
680 	FRELE(fp);
681 	return (error);
682 }
683 
684 /*
685  * Allocate a file descriptor for the process.
686  */
687 int
688 fdalloc(struct proc *p, int want, int *result)
689 {
690 	struct filedesc *fdp = p->p_fd;
691 	int lim, last, i;
692 	u_int new, off;
693 
694 	/*
695 	 * Search for a free descriptor starting at the higher
696 	 * of want or fd_freefile.  If that fails, consider
697 	 * expanding the ofile array.
698 	 */
699 restart:
700 	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
701 	last = min(fdp->fd_nfiles, lim);
702 	if ((i = want) < fdp->fd_freefile)
703 		i = fdp->fd_freefile;
704 	off = i >> NDENTRYSHIFT;
705 	new = find_next_zero(fdp->fd_himap, off,
706 	    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
707 	if (new != -1) {
708 		i = find_next_zero(&fdp->fd_lomap[new],
709 				   new > off ? 0 : i & NDENTRYMASK,
710 				   NDENTRIES);
711 		if (i == -1) {
712 			/*
713 			 * Free file descriptor in this block was
714 			 * below want, try again with higher want.
715 			 */
716 			want = (new + 1) << NDENTRYSHIFT;
717 			goto restart;
718 		}
719 		i += (new << NDENTRYSHIFT);
720 		if (i < last) {
721 			fd_used(fdp, i);
722 			if (want <= fdp->fd_freefile)
723 				fdp->fd_freefile = i;
724 			*result = i;
725 			return (0);
726 		}
727 	}
728 	if (fdp->fd_nfiles >= lim)
729 		return (EMFILE);
730 
731 	return (ENOSPC);
732 }
733 
734 void
735 fdexpand(struct proc *p)
736 {
737 	struct filedesc *fdp = p->p_fd;
738 	int nfiles, i;
739 	struct file **newofile;
740 	char *newofileflags;
741 	u_int *newhimap, *newlomap;
742 
743 	/*
744 	 * No space in current array.
745 	 */
746 	if (fdp->fd_nfiles < NDEXTENT)
747 		nfiles = NDEXTENT;
748 	else
749 		nfiles = 2 * fdp->fd_nfiles;
750 
751 	newofile = malloc(nfiles * OFILESIZE, M_FILEDESC, M_WAITOK);
752 	newofileflags = (char *) &newofile[nfiles];
753 
754 	/*
755 	 * Copy the existing ofile and ofileflags arrays
756 	 * and zero the new portion of each array.
757 	 */
758 	bcopy(fdp->fd_ofiles, newofile,
759 		(i = sizeof(struct file *) * fdp->fd_nfiles));
760 	bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i);
761 	bcopy(fdp->fd_ofileflags, newofileflags,
762 		(i = sizeof(char) * fdp->fd_nfiles));
763 	bzero(newofileflags + i, nfiles * sizeof(char) - i);
764 
765 	if (fdp->fd_nfiles > NDFILE)
766 		free(fdp->fd_ofiles, M_FILEDESC);
767 
768 	if (NDHISLOTS(nfiles) > NDHISLOTS(fdp->fd_nfiles)) {
769 		newhimap = malloc(NDHISLOTS(nfiles) * sizeof(u_int),
770 		    M_FILEDESC, M_WAITOK);
771 		newlomap = malloc(NDLOSLOTS(nfiles) * sizeof(u_int),
772 		    M_FILEDESC, M_WAITOK);
773 
774 		bcopy(fdp->fd_himap, newhimap,
775 		    (i = NDHISLOTS(fdp->fd_nfiles) * sizeof(u_int)));
776 		bzero((char *)newhimap + i,
777 		    NDHISLOTS(nfiles) * sizeof(u_int) - i);
778 
779 		bcopy(fdp->fd_lomap, newlomap,
780 		    (i = NDLOSLOTS(fdp->fd_nfiles) * sizeof(u_int)));
781 		bzero((char *)newlomap + i,
782 		    NDLOSLOTS(nfiles) * sizeof(u_int) - i);
783 
784 		if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
785 			free(fdp->fd_himap, M_FILEDESC);
786 			free(fdp->fd_lomap, M_FILEDESC);
787 		}
788 		fdp->fd_himap = newhimap;
789 		fdp->fd_lomap = newlomap;
790 	}
791 	fdp->fd_ofiles = newofile;
792 	fdp->fd_ofileflags = newofileflags;
793 	fdp->fd_nfiles = nfiles;
794 }
795 
796 /*
797  * Create a new open file structure and allocate
798  * a file descriptor for the process that refers to it.
799  */
800 int
801 falloc(struct proc *p, struct file **resultfp, int *resultfd)
802 {
803 	struct file *fp, *fq;
804 	int error, i;
805 
806 restart:
807 	if ((error = fdalloc(p, 0, &i)) != 0) {
808 		if (error == ENOSPC) {
809 			fdexpand(p);
810 			goto restart;
811 		}
812 		return (error);
813 	}
814 	if (nfiles >= maxfiles) {
815 		fd_unused(p->p_fd, i);
816 		tablefull("file");
817 		return (ENFILE);
818 	}
819 	/*
820 	 * Allocate a new file descriptor.
821 	 * If the process has file descriptor zero open, add to the list
822 	 * of open files at that point, otherwise put it at the front of
823 	 * the list of open files.
824 	 */
825 	nfiles++;
826 	fp = pool_get(&file_pool, PR_WAITOK|PR_ZERO);
827 	fp->f_iflags = FIF_LARVAL;
828 	if ((fq = p->p_fd->fd_ofiles[0]) != NULL) {
829 		LIST_INSERT_AFTER(fq, fp, f_list);
830 	} else {
831 		LIST_INSERT_HEAD(&filehead, fp, f_list);
832 	}
833 	p->p_fd->fd_ofiles[i] = fp;
834 	fp->f_count = 1;
835 	fp->f_cred = p->p_ucred;
836 	crhold(fp->f_cred);
837 	if (resultfp)
838 		*resultfp = fp;
839 	if (resultfd)
840 		*resultfd = i;
841 	FREF(fp);
842 	return (0);
843 }
844 
845 /*
846  * Build a new filedesc structure.
847  */
848 struct filedesc *
849 fdinit(struct proc *p)
850 {
851 	struct filedesc0 *newfdp;
852 	extern int cmask;
853 
854 	newfdp = pool_get(&fdesc_pool, PR_WAITOK|PR_ZERO);
855 	if (p != NULL) {
856 		struct filedesc *fdp = p->p_fd;
857 
858 		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
859 		vref(newfdp->fd_fd.fd_cdir);
860 		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
861 		if (newfdp->fd_fd.fd_rdir)
862 			vref(newfdp->fd_fd.fd_rdir);
863 	}
864 	rw_init(&newfdp->fd_fd.fd_lock, "fdlock");
865 
866 	/* Create the file descriptor table. */
867 	newfdp->fd_fd.fd_refcnt = 1;
868 	newfdp->fd_fd.fd_cmask = cmask;
869 	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
870 	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
871 	newfdp->fd_fd.fd_nfiles = NDFILE;
872 	newfdp->fd_fd.fd_himap = newfdp->fd_dhimap;
873 	newfdp->fd_fd.fd_lomap = newfdp->fd_dlomap;
874 	newfdp->fd_fd.fd_knlistsize = -1;
875 
876 	newfdp->fd_fd.fd_freefile = 0;
877 	newfdp->fd_fd.fd_lastfile = 0;
878 
879 	return (&newfdp->fd_fd);
880 }
881 
882 /*
883  * Share a filedesc structure.
884  */
885 struct filedesc *
886 fdshare(struct proc *p)
887 {
888 	p->p_fd->fd_refcnt++;
889 	return (p->p_fd);
890 }
891 
892 /*
893  * Copy a filedesc structure.
894  */
895 struct filedesc *
896 fdcopy(struct proc *p)
897 {
898 	struct filedesc *newfdp, *fdp = p->p_fd;
899 	struct file **fpp;
900 	int i;
901 
902 	newfdp = pool_get(&fdesc_pool, PR_WAITOK);
903 	bcopy(fdp, newfdp, sizeof(struct filedesc));
904 	if (newfdp->fd_cdir)
905 		vref(newfdp->fd_cdir);
906 	if (newfdp->fd_rdir)
907 		vref(newfdp->fd_rdir);
908 	newfdp->fd_refcnt = 1;
909 
910 	/*
911 	 * If the number of open files fits in the internal arrays
912 	 * of the open file structure, use them, otherwise allocate
913 	 * additional memory for the number of descriptors currently
914 	 * in use.
915 	 */
916 	if (newfdp->fd_lastfile < NDFILE) {
917 		newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles;
918 		newfdp->fd_ofileflags =
919 		    ((struct filedesc0 *) newfdp)->fd_dfileflags;
920 		i = NDFILE;
921 	} else {
922 		/*
923 		 * Compute the smallest multiple of NDEXTENT needed
924 		 * for the file descriptors currently in use,
925 		 * allowing the table to shrink.
926 		 */
927 		i = newfdp->fd_nfiles;
928 		while (i >= 2 * NDEXTENT && i > newfdp->fd_lastfile * 2)
929 			i /= 2;
930 		newfdp->fd_ofiles = malloc(i * OFILESIZE, M_FILEDESC, M_WAITOK);
931 		newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i];
932 	}
933 	if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
934 		newfdp->fd_himap =
935 			((struct filedesc0 *) newfdp)->fd_dhimap;
936 		newfdp->fd_lomap =
937 			((struct filedesc0 *) newfdp)->fd_dlomap;
938 	} else {
939 		newfdp->fd_himap = malloc(NDHISLOTS(i) * sizeof(u_int),
940 		    M_FILEDESC, M_WAITOK);
941 		newfdp->fd_lomap = malloc(NDLOSLOTS(i) * sizeof(u_int),
942 		    M_FILEDESC, M_WAITOK);
943 	}
944 	newfdp->fd_nfiles = i;
945 	bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **));
946 	bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char));
947 	bcopy(fdp->fd_himap, newfdp->fd_himap, NDHISLOTS(i) * sizeof(u_int));
948 	bcopy(fdp->fd_lomap, newfdp->fd_lomap, NDLOSLOTS(i) * sizeof(u_int));
949 
950 	/*
951 	 * kq descriptors cannot be copied.
952 	 */
953 	if (newfdp->fd_knlistsize != -1) {
954 		fpp = newfdp->fd_ofiles;
955 		for (i = 0; i <= newfdp->fd_lastfile; i++, fpp++)
956 			if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE)
957 				fdremove(newfdp, i);
958 		newfdp->fd_knlist = NULL;
959 		newfdp->fd_knlistsize = -1;
960 		newfdp->fd_knhash = NULL;
961 		newfdp->fd_knhashmask = 0;
962 	}
963 
964 	fpp = newfdp->fd_ofiles;
965 	for (i = 0; i <= newfdp->fd_lastfile; i++, fpp++)
966 		if (*fpp != NULL) {
967 			/*
968 			 * XXX Gruesome hack. If count gets too high, fail
969 			 * to copy an fd, since fdcopy()'s callers do not
970 			 * permit it to indicate failure yet.
971 			 */
972 			if ((*fpp)->f_count == LONG_MAX-2)
973 				fdremove(newfdp, i);
974 			else
975 				(*fpp)->f_count++;
976 		}
977 	return (newfdp);
978 }
979 
980 /*
981  * Release a filedesc structure.
982  */
983 void
984 fdfree(struct proc *p)
985 {
986 	struct filedesc *fdp = p->p_fd;
987 	struct file **fpp, *fp;
988 	int i;
989 
990 	if (--fdp->fd_refcnt > 0)
991 		return;
992 	fpp = fdp->fd_ofiles;
993 	for (i = fdp->fd_lastfile; i >= 0; i--, fpp++) {
994 		fp = *fpp;
995 		if (fp != NULL) {
996 			FREF(fp);
997 			*fpp = NULL;
998 			(void) closef(fp, p);
999 		}
1000 	}
1001 	p->p_fd = NULL;
1002 	if (fdp->fd_nfiles > NDFILE)
1003 		free(fdp->fd_ofiles, M_FILEDESC);
1004 	if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1005 		free(fdp->fd_himap, M_FILEDESC);
1006 		free(fdp->fd_lomap, M_FILEDESC);
1007 	}
1008 	if (fdp->fd_cdir)
1009 		vrele(fdp->fd_cdir);
1010 	if (fdp->fd_rdir)
1011 		vrele(fdp->fd_rdir);
1012 	if (fdp->fd_knlist)
1013 		free(fdp->fd_knlist, M_TEMP);
1014 	if (fdp->fd_knhash)
1015 		free(fdp->fd_knhash, M_TEMP);
1016 	pool_put(&fdesc_pool, fdp);
1017 }
1018 
1019 /*
1020  * Internal form of close.
1021  * Decrement reference count on file structure.
1022  * Note: p may be NULL when closing a file
1023  * that was being passed in a message.
1024  *
1025  * The fp must have its usecount bumped and will be FRELEd here.
1026  */
1027 int
1028 closef(struct file *fp, struct proc *p)
1029 {
1030 	struct filedesc *fdp;
1031 	int references_left;
1032 	int error;
1033 
1034 	if (fp == NULL)
1035 		return (0);
1036 
1037 	/*
1038 	 * Some files passed to this function could be accessed
1039 	 * without a FILE_IS_USABLE check (and in some cases it's perfectly
1040 	 * legal), we must beware of files where someone already won the
1041 	 * race to FIF_WANTCLOSE.
1042 	 */
1043 	if ((fp->f_iflags & FIF_WANTCLOSE) != 0 ||
1044 	    --fp->f_count > 0) {
1045 		references_left = 1;
1046 	} else {
1047 		references_left = 0;
1048 #ifdef DIAGNOSTIC
1049 		if (fp->f_count < 0)
1050 			panic("closef: count < 0");
1051 #endif
1052 
1053 		/* Wait for the last usecount to drain. */
1054 		fp->f_iflags |= FIF_WANTCLOSE;
1055 		while (fp->f_usecount > 1)
1056 			tsleep(&fp->f_usecount, PRIBIO, "closef", 0);
1057 	}
1058 
1059 	/*
1060 	 * POSIX record locking dictates that any close releases ALL
1061 	 * locks owned by this process.  This is handled by setting
1062 	 * a flag in the unlock to free ONLY locks obeying POSIX
1063 	 * semantics, and not to free BSD-style file locks.
1064 	 * If the descriptor was in a message, POSIX-style locks
1065 	 * aren't passed with the descriptor.
1066 	 */
1067 	if (p && ((fdp = p->p_fd) != NULL) &&
1068 	    (fdp->fd_flags & FD_ADVLOCK) &&
1069 	    fp->f_type == DTYPE_VNODE) {
1070 		struct vnode *vp = fp->f_data;
1071 		struct flock lf;
1072 
1073 		lf.l_whence = SEEK_SET;
1074 		lf.l_start = 0;
1075 		lf.l_len = 0;
1076 		lf.l_type = F_UNLCK;
1077 		(void) VOP_ADVLOCK(vp, fdp, F_UNLCK, &lf, F_POSIX);
1078 	}
1079 
1080 	if (references_left) {
1081 		FRELE(fp);
1082 		return (0);
1083 	}
1084 
1085 	if (fp->f_ops)
1086 		error = (*fp->f_ops->fo_close)(fp, p);
1087 	else
1088 		error = 0;
1089 
1090 	/* Free fp */
1091 	LIST_REMOVE(fp, f_list);
1092 	crfree(fp->f_cred);
1093 #ifdef DIAGNOSTIC
1094 	if (fp->f_count != 0 || fp->f_usecount != 1)
1095 		panic("closef: count: %d/%d", fp->f_count, fp->f_usecount);
1096 #endif
1097 	nfiles--;
1098 	pool_put(&file_pool, fp);
1099 
1100 	return (error);
1101 }
1102 
1103 /*
1104  * Apply an advisory lock on a file descriptor.
1105  *
1106  * Just attempt to get a record lock of the requested type on
1107  * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
1108  */
1109 /* ARGSUSED */
1110 int
1111 sys_flock(struct proc *p, void *v, register_t *retval)
1112 {
1113 	struct sys_flock_args /* {
1114 		syscallarg(int) fd;
1115 		syscallarg(int) how;
1116 	} */ *uap = v;
1117 	int fd = SCARG(uap, fd);
1118 	int how = SCARG(uap, how);
1119 	struct filedesc *fdp = p->p_fd;
1120 	struct file *fp;
1121 	struct vnode *vp;
1122 	struct flock lf;
1123 	int error;
1124 
1125 	if ((fp = fd_getfile(fdp, fd)) == NULL)
1126 		return (EBADF);
1127 	if (fp->f_type != DTYPE_VNODE)
1128 		return (EOPNOTSUPP);
1129 	FREF(fp);
1130 	vp = (struct vnode *)fp->f_data;
1131 	lf.l_whence = SEEK_SET;
1132 	lf.l_start = 0;
1133 	lf.l_len = 0;
1134 	if (how & LOCK_UN) {
1135 		lf.l_type = F_UNLCK;
1136 		fp->f_flag &= ~FHASLOCK;
1137 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
1138 		goto out;
1139 	}
1140 	if (how & LOCK_EX)
1141 		lf.l_type = F_WRLCK;
1142 	else if (how & LOCK_SH)
1143 		lf.l_type = F_RDLCK;
1144 	else {
1145 		error = EINVAL;
1146 		goto out;
1147 	}
1148 	fp->f_flag |= FHASLOCK;
1149 	if (how & LOCK_NB)
1150 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK);
1151 	else
1152 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT);
1153 out:
1154 	FRELE(fp);
1155 	return (error);
1156 }
1157 
1158 /*
1159  * File Descriptor pseudo-device driver (/dev/fd/).
1160  *
1161  * Opening minor device N dup()s the file (if any) connected to file
1162  * descriptor N belonging to the calling process.  Note that this driver
1163  * consists of only the ``open()'' routine, because all subsequent
1164  * references to this file will be direct to the other driver.
1165  */
1166 /* ARGSUSED */
1167 int
1168 filedescopen(dev_t dev, int mode, int type, struct proc *p)
1169 {
1170 
1171 	/*
1172 	 * XXX Kludge: set curproc->p_dupfd to contain the value of the
1173 	 * the file descriptor being sought for duplication. The error
1174 	 * return ensures that the vnode for this device will be released
1175 	 * by vn_open. Open will detect this special error and take the
1176 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1177 	 * will simply report the error.
1178 	 */
1179 	p->p_dupfd = minor(dev);
1180 	return (ENODEV);
1181 }
1182 
1183 /*
1184  * Duplicate the specified descriptor to a free descriptor.
1185  */
1186 int
1187 dupfdopen(struct filedesc *fdp, int indx, int dfd, int mode, int error)
1188 {
1189 	struct file *wfp;
1190 
1191 	/*
1192 	 * Assume that the filename was user-specified; applications do
1193 	 * not tend to open /dev/fd/# when they can just call dup()
1194 	 */
1195 	if ((curproc->p_flag & (P_SUGIDEXEC | P_SUGID))) {
1196 		if (curproc->p_descfd == 255)
1197 			return (EPERM);
1198 		if (curproc->p_descfd != curproc->p_dupfd)
1199 			return (EPERM);
1200 	}
1201 
1202 	/*
1203 	 * If the to-be-dup'd fd number is greater than the allowed number
1204 	 * of file descriptors, or the fd to be dup'd has already been
1205 	 * closed, reject. Note, there is no need to check for new == old
1206 	 * because fd_getfile will return NULL if the file at indx is
1207 	 * newly created by falloc (FIF_LARVAL).
1208 	 */
1209 	if ((wfp = fd_getfile(fdp, dfd)) == NULL)
1210 		return (EBADF);
1211 
1212 	/*
1213 	 * There are two cases of interest here.
1214 	 *
1215 	 * For ENODEV simply dup (dfd) to file descriptor
1216 	 * (indx) and return.
1217 	 *
1218 	 * For ENXIO steal away the file structure from (dfd) and
1219 	 * store it in (indx).  (dfd) is effectively closed by
1220 	 * this operation.
1221 	 *
1222 	 * Any other error code is just returned.
1223 	 */
1224 	switch (error) {
1225 	case ENODEV:
1226 		/*
1227 		 * Check that the mode the file is being opened for is a
1228 		 * subset of the mode of the existing descriptor.
1229 		 */
1230 		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag)
1231 			return (EACCES);
1232 		if (wfp->f_count == LONG_MAX-2)
1233 			return (EDEADLK);
1234 		fdp->fd_ofiles[indx] = wfp;
1235 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
1236 		wfp->f_count++;
1237 		fd_used(fdp, indx);
1238 		return (0);
1239 
1240 	case ENXIO:
1241 		/*
1242 		 * Steal away the file pointer from dfd, and stuff it into indx.
1243 		 */
1244 		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
1245 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
1246 		fdp->fd_ofiles[dfd] = NULL;
1247 		fdp->fd_ofileflags[dfd] = 0;
1248 		/*
1249 		 * Complete the clean up of the filedesc structure by
1250 		 * recomputing the various hints.
1251 		 */
1252 		fd_used(fdp, indx);
1253 		fd_unused(fdp, dfd);
1254 		return (0);
1255 
1256 	default:
1257 		return (error);
1258 	}
1259 	/* NOTREACHED */
1260 }
1261 
1262 /*
1263  * Close any files on exec?
1264  */
1265 void
1266 fdcloseexec(struct proc *p)
1267 {
1268 	struct filedesc *fdp = p->p_fd;
1269 	int fd;
1270 
1271 	for (fd = 0; fd <= fdp->fd_lastfile; fd++)
1272 		if (fdp->fd_ofileflags[fd] & UF_EXCLOSE)
1273 			(void) fdrelease(p, fd);
1274 }
1275 
1276 int
1277 sys_closefrom(struct proc *p, void *v, register_t *retval)
1278 {
1279 	struct sys_closefrom_args *uap = v;
1280 	struct filedesc *fdp = p->p_fd;
1281 	u_int startfd, i;
1282 
1283 	startfd = SCARG(uap, fd);
1284 	fdplock(fdp);
1285 
1286 	if (startfd > fdp->fd_lastfile) {
1287 		fdpunlock(fdp);
1288 		return (EBADF);
1289 	}
1290 
1291 	for (i = startfd; i <= fdp->fd_lastfile; i++)
1292 		fdrelease(p, i);
1293 
1294 	fdpunlock(fdp);
1295 	return (0);
1296 }
1297