xref: /dragonfly/sys/kern/sys_generic.c (revision 984263bc)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
40  */
41 
42 #include "opt_ktrace.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/sysproto.h>
47 #include <sys/filedesc.h>
48 #include <sys/filio.h>
49 #include <sys/fcntl.h>
50 #include <sys/file.h>
51 #include <sys/proc.h>
52 #include <sys/signalvar.h>
53 #include <sys/socketvar.h>
54 #include <sys/uio.h>
55 #include <sys/kernel.h>
56 #include <sys/malloc.h>
57 #include <sys/poll.h>
58 #include <sys/resourcevar.h>
59 #include <sys/sysctl.h>
60 #include <sys/sysent.h>
61 #include <sys/buf.h>
62 #ifdef KTRACE
63 #include <sys/ktrace.h>
64 #endif
65 #include <vm/vm.h>
66 #include <vm/vm_page.h>
67 
68 #include <machine/limits.h>
69 
70 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
71 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
72 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
73 
74 static int	pollscan __P((struct proc *, struct pollfd *, u_int));
75 static int	selscan __P((struct proc *, fd_mask **, fd_mask **, int));
76 static int	dofileread __P((struct proc *, struct file *, int, void *,
77 		    size_t, off_t, int));
78 static int	dofilewrite __P((struct proc *, struct file *, int,
79 		    const void *, size_t, off_t, int));
80 
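/*
 * Look up a file descriptor in the given descriptor table and check that
 * the file is open for the access requested in flag (FREAD and/or FWRITE).
 * On success the file is returned with an additional reference held; the
 * caller must release it with fdrop().
 */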
81 struct file*
82 holdfp(fdp, fd, flag)
83 	struct filedesc* fdp;
84 	int fd, flag;
85 {
86 	struct file* fp;
87 
88 	if (((u_int)fd) >= fdp->fd_nfiles ||
89 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
90 	    (fp->f_flag & flag) == 0) {
91 		return (NULL);
92 	}
93 	fhold(fp);
94 	return (fp);
95 }
96 
97 /*
98  * Read system call.
99  */
100 #ifndef _SYS_SYSPROTO_H_
101 struct read_args {
102 	int	fd;
103 	void	*buf;
104 	size_t	nbyte;
105 };
106 #endif
107 int
108 read(p, uap)
109 	struct proc *p;
110 	register struct read_args *uap;
111 {
112 	register struct file *fp;
113 	int error;
114 
115 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
116 		return (EBADF);
117 	error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
118 	fdrop(fp, p);
119 	return(error);
120 }
121 
122 /*
123  * Pread system call
124  */
125 #ifndef _SYS_SYSPROTO_H_
126 struct pread_args {
127 	int	fd;
128 	void	*buf;
129 	size_t	nbyte;
130 	int	pad;
131 	off_t	offset;
132 };
133 #endif
134 int
135 pread(p, uap)
136 	struct proc *p;
137 	register struct pread_args *uap;
138 {
139 	register struct file *fp;
140 	int error;
141 
142 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
143 		return (EBADF);
144 	if (fp->f_type != DTYPE_VNODE) {
145 		error = ESPIPE;
146 	} else {
147 	    error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte,
148 		uap->offset, FOF_OFFSET);
149 	}
150 	fdrop(fp, p);
151 	return(error);
152 }
153 
154 /*
155  * Code common for read and pread
156  */
157 int
158 dofileread(p, fp, fd, buf, nbyte, offset, flags)
159 	struct proc *p;
160 	struct file *fp;
161 	int fd, flags;
162 	void *buf;
163 	size_t nbyte;
164 	off_t offset;
165 {
166 	struct uio auio;
167 	struct iovec aiov;
168 	long cnt, error = 0;
169 #ifdef KTRACE
170 	struct iovec ktriov;
171 	struct uio ktruio;
172 	int didktr = 0;
173 #endif
174 
175 	aiov.iov_base = (caddr_t)buf;
176 	aiov.iov_len = nbyte;
177 	auio.uio_iov = &aiov;
178 	auio.uio_iovcnt = 1;
179 	auio.uio_offset = offset;
180 	if (nbyte > INT_MAX)
181 		return (EINVAL);
182 	auio.uio_resid = nbyte;
183 	auio.uio_rw = UIO_READ;
184 	auio.uio_segflg = UIO_USERSPACE;
185 	auio.uio_procp = p;
186 #ifdef KTRACE
187 	/*
188 	 * if tracing, save a copy of iovec
189 	 */
190 	if (KTRPOINT(p, KTR_GENIO)) {
191 		ktriov = aiov;
192 		ktruio = auio;
193 		didktr = 1;
194 	}
195 #endif
196 	cnt = nbyte;
197 
198 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, p))) {
199 		if (auio.uio_resid != cnt && (error == ERESTART ||
200 		    error == EINTR || error == EWOULDBLOCK))
201 			error = 0;
202 	}
203 	cnt -= auio.uio_resid;
204 #ifdef KTRACE
205 	if (didktr && error == 0) {
206 		ktruio.uio_iov = &ktriov;
207 		ktruio.uio_resid = cnt;
208 		ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
209 	}
210 #endif
211 	p->p_retval[0] = cnt;
212 	return (error);
213 }
214 
215 /*
216  * Scatter read system call.
217  */
218 #ifndef _SYS_SYSPROTO_H_
219 struct readv_args {
220 	int	fd;
221 	struct	iovec *iovp;
222 	u_int	iovcnt;
223 };
224 #endif
225 int
226 readv(p, uap)
227 	struct proc *p;
228 	register struct readv_args *uap;
229 {
230 	register struct file *fp;
231 	register struct filedesc *fdp = p->p_fd;
232 	struct uio auio;
233 	register struct iovec *iov;
234 	struct iovec *needfree;
235 	struct iovec aiov[UIO_SMALLIOV];
236 	long i, cnt, error = 0;
237 	u_int iovlen;
238 #ifdef KTRACE
239 	struct iovec *ktriov = NULL;
240 	struct uio ktruio;
241 #endif
242 
243 	if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL)
244 		return (EBADF);
245 	/* note: can't use iovlen until iovcnt is validated */
246 	iovlen = uap->iovcnt * sizeof (struct iovec);
247 	if (uap->iovcnt > UIO_SMALLIOV) {
248 		if (uap->iovcnt > UIO_MAXIOV) {
249 			needfree = NULL;
			error = EINVAL;
			goto done;
		}
250 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
251 		needfree = iov;
252 	} else {
253 		iov = aiov;
254 		needfree = NULL;
255 	}
256 	auio.uio_iov = iov;
257 	auio.uio_iovcnt = uap->iovcnt;
258 	auio.uio_rw = UIO_READ;
259 	auio.uio_segflg = UIO_USERSPACE;
260 	auio.uio_procp = p;
261 	auio.uio_offset = -1;
262 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
263 		goto done;
264 	auio.uio_resid = 0;
265 	for (i = 0; i < uap->iovcnt; i++) {
266 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
267 			error = EINVAL;
268 			goto done;
269 		}
270 		auio.uio_resid += iov->iov_len;
271 		iov++;
272 	}
273 #ifdef KTRACE
274 	/*
275 	 * if tracing, save a copy of iovec
276 	 */
277 	if (KTRPOINT(p, KTR_GENIO))  {
278 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
279 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
280 		ktruio = auio;
281 	}
282 #endif
283 	cnt = auio.uio_resid;
284 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, p))) {
285 		if (auio.uio_resid != cnt && (error == ERESTART ||
286 		    error == EINTR || error == EWOULDBLOCK))
287 			error = 0;
288 	}
289 	cnt -= auio.uio_resid;
290 #ifdef KTRACE
291 	if (ktriov != NULL) {
292 		if (error == 0) {
293 			ktruio.uio_iov = ktriov;
294 			ktruio.uio_resid = cnt;
295 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktruio,
296 			    error);
297 		}
298 		FREE(ktriov, M_TEMP);
299 	}
300 #endif
301 	p->p_retval[0] = cnt;
302 done:
303 	fdrop(fp, p);
304 	if (needfree)
305 		FREE(needfree, M_IOV);
306 	return (error);
307 }
308 
309 /*
310  * Write system call
311  */
312 #ifndef _SYS_SYSPROTO_H_
313 struct write_args {
314 	int	fd;
315 	const void *buf;
316 	size_t	nbyte;
317 };
318 #endif
319 int
320 write(p, uap)
321 	struct proc *p;
322 	register struct write_args *uap;
323 {
324 	register struct file *fp;
325 	int error;
326 
327 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
328 		return (EBADF);
329 	error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0);
330 	fdrop(fp, p);
331 	return(error);
332 }
333 
334 /*
335  * Pwrite system call
336  */
337 #ifndef _SYS_SYSPROTO_H_
338 struct pwrite_args {
339 	int	fd;
340 	const void *buf;
341 	size_t	nbyte;
342 	int	pad;
343 	off_t	offset;
344 };
345 #endif
346 int
347 pwrite(p, uap)
348 	struct proc *p;
349 	register struct pwrite_args *uap;
350 {
351 	register struct file *fp;
352 	int error;
353 
354 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
355 		return (EBADF);
356 	if (fp->f_type != DTYPE_VNODE) {
357 		error = ESPIPE;
358 	} else {
359 	    error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte,
360 		uap->offset, FOF_OFFSET);
361 	}
362 	fdrop(fp, p);
363 	return(error);
364 }
365 
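/*
 * Code common for write and pwrite
 */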
366 static int
367 dofilewrite(p, fp, fd, buf, nbyte, offset, flags)
368 	struct proc *p;
369 	struct file *fp;
370 	int fd, flags;
371 	const void *buf;
372 	size_t nbyte;
373 	off_t offset;
374 {
375 	struct uio auio;
376 	struct iovec aiov;
377 	long cnt, error = 0;
378 #ifdef KTRACE
379 	struct iovec ktriov;
380 	struct uio ktruio;
381 	int didktr = 0;
382 #endif
383 
384 	aiov.iov_base = (void *)(uintptr_t)buf;
385 	aiov.iov_len = nbyte;
386 	auio.uio_iov = &aiov;
387 	auio.uio_iovcnt = 1;
388 	auio.uio_offset = offset;
389 	if (nbyte > INT_MAX)
390 		return (EINVAL);
391 	auio.uio_resid = nbyte;
392 	auio.uio_rw = UIO_WRITE;
393 	auio.uio_segflg = UIO_USERSPACE;
394 	auio.uio_procp = p;
395 #ifdef KTRACE
396 	/*
397 	 * if tracing, save a copy of iovec and uio
398 	 */
399 	if (KTRPOINT(p, KTR_GENIO)) {
400 		ktriov = aiov;
401 		ktruio = auio;
402 		didktr = 1;
403 	}
404 #endif
405 	cnt = nbyte;
406 	if (fp->f_type == DTYPE_VNODE)
407 		bwillwrite();
408 	if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
409 		if (auio.uio_resid != cnt && (error == ERESTART ||
410 		    error == EINTR || error == EWOULDBLOCK))
411 			error = 0;
412 		if (error == EPIPE)
413 			psignal(p, SIGPIPE);
414 	}
415 	cnt -= auio.uio_resid;
416 #ifdef KTRACE
417 	if (didktr && error == 0) {
418 		ktruio.uio_iov = &ktriov;
419 		ktruio.uio_resid = cnt;
420 		ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
421 	}
422 #endif
423 	p->p_retval[0] = cnt;
424 	return (error);
425 }
426 
427 /*
428  * Gather write system call
429  */
430 #ifndef _SYS_SYSPROTO_H_
431 struct writev_args {
432 	int	fd;
433 	struct	iovec *iovp;
434 	u_int	iovcnt;
435 };
436 #endif
437 int
438 writev(p, uap)
439 	struct proc *p;
440 	register struct writev_args *uap;
441 {
442 	register struct file *fp;
443 	register struct filedesc *fdp = p->p_fd;
444 	struct uio auio;
445 	register struct iovec *iov;
446 	struct iovec *needfree;
447 	struct iovec aiov[UIO_SMALLIOV];
448 	long i, cnt, error = 0;
449 	u_int iovlen;
450 #ifdef KTRACE
451 	struct iovec *ktriov = NULL;
452 	struct uio ktruio;
453 #endif
454 
455 	if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL)
456 		return (EBADF);
457 	/* note: can't use iovlen until iovcnt is validated */
458 	iovlen = uap->iovcnt * sizeof (struct iovec);
459 	if (uap->iovcnt > UIO_SMALLIOV) {
460 		if (uap->iovcnt > UIO_MAXIOV) {
461 			needfree = NULL;
462 			error = EINVAL;
463 			goto done;
464 		}
465 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
466 		needfree = iov;
467 	} else {
468 		iov = aiov;
469 		needfree = NULL;
470 	}
471 	auio.uio_iov = iov;
472 	auio.uio_iovcnt = uap->iovcnt;
473 	auio.uio_rw = UIO_WRITE;
474 	auio.uio_segflg = UIO_USERSPACE;
475 	auio.uio_procp = p;
476 	auio.uio_offset = -1;
477 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
478 		goto done;
479 	auio.uio_resid = 0;
480 	for (i = 0; i < uap->iovcnt; i++) {
481 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
482 			error = EINVAL;
483 			goto done;
484 		}
485 		auio.uio_resid += iov->iov_len;
486 		iov++;
487 	}
488 #ifdef KTRACE
489 	/*
490 	 * if tracing, save a copy of iovec and uio
491 	 */
492 	if (KTRPOINT(p, KTR_GENIO))  {
493 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
494 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
495 		ktruio = auio;
496 	}
497 #endif
498 	cnt = auio.uio_resid;
499 	if (fp->f_type == DTYPE_VNODE)
500 		bwillwrite();
501 	if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
502 		if (auio.uio_resid != cnt && (error == ERESTART ||
503 		    error == EINTR || error == EWOULDBLOCK))
504 			error = 0;
505 		if (error == EPIPE)
506 			psignal(p, SIGPIPE);
507 	}
508 	cnt -= auio.uio_resid;
509 #ifdef KTRACE
510 	if (ktriov != NULL) {
511 		if (error == 0) {
512 			ktruio.uio_iov = ktriov;
513 			ktruio.uio_resid = cnt;
514 			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktruio,
515 			    error);
516 		}
517 		FREE(ktriov, M_TEMP);
518 	}
519 #endif
520 	p->p_retval[0] = cnt;
521 done:
522 	fdrop(fp, p);
523 	if (needfree)
524 		FREE(needfree, M_IOV);
525 	return (error);
526 }
527 
528 /*
529  * Ioctl system call
530  */
531 #ifndef _SYS_SYSPROTO_H_
532 struct ioctl_args {
533 	int	fd;
534 	u_long	com;
535 	caddr_t	data;
536 };
537 #endif
538 /* ARGSUSED */
539 int
540 ioctl(p, uap)
541 	struct proc *p;
542 	register struct ioctl_args *uap;
543 {
544 	register struct file *fp;
545 	register struct filedesc *fdp;
546 	register u_long com;
547 	int error;
548 	register u_int size;
549 	caddr_t data, memp;
550 	int tmp;
551 #define STK_PARAMS	128
552 	union {
553 	    char stkbuf[STK_PARAMS];
554 	    long align;
555 	} ubuf;
556 
557 	fdp = p->p_fd;
558 	if ((u_int)uap->fd >= fdp->fd_nfiles ||
559 	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
560 		return (EBADF);
561 
562 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
563 		return (EBADF);
564 
565 	switch (com = uap->com) {
566 	case FIONCLEX:
567 		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
568 		return (0);
569 	case FIOCLEX:
570 		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
571 		return (0);
572 	}
573 
574 	/*
575 	 * Interpret high order word to find amount of data to be
576 	 * copied to/from the user's address space.
577 	 */
578 	size = IOCPARM_LEN(com);
579 	if (size > IOCPARM_MAX)
580 		return (ENOTTY);
581 
582 	fhold(fp);
583 
584 	memp = NULL;
585 	if (size > sizeof (ubuf.stkbuf)) {
586 		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
587 		data = memp;
588 	} else {
589 		data = ubuf.stkbuf;
590 	}
591 	if (com&IOC_IN) {
592 		if (size) {
593 			error = copyin(uap->data, data, (u_int)size);
594 			if (error) {
595 				if (memp)
596 					free(memp, M_IOCTLOPS);
597 				fdrop(fp, p);
598 				return (error);
599 			}
600 		} else {
601 			*(caddr_t *)data = uap->data;
602 		}
603 	} else if ((com&IOC_OUT) && size) {
604 		/*
605 		 * Zero the buffer so the user always
606 		 * gets back something deterministic.
607 		 */
608 		bzero(data, size);
609 	} else if (com&IOC_VOID) {
610 		*(caddr_t *)data = uap->data;
611 	}
612 
613 	switch (com) {
614 
615 	case FIONBIO:
616 		if ((tmp = *(int *)data))
617 			fp->f_flag |= FNONBLOCK;
618 		else
619 			fp->f_flag &= ~FNONBLOCK;
620 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
621 		break;
622 
623 	case FIOASYNC:
624 		if ((tmp = *(int *)data))
625 			fp->f_flag |= FASYNC;
626 		else
627 			fp->f_flag &= ~FASYNC;
628 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
629 		break;
630 
631 	default:
632 		error = fo_ioctl(fp, com, data, p);
633 		/*
634 		 * Copy any data to user, size was
635 		 * already set and checked above.
636 		 */
637 		if (error == 0 && (com&IOC_OUT) && size)
638 			error = copyout(data, uap->data, (u_int)size);
639 		break;
640 	}
641 	if (memp)
642 		free(memp, M_IOCTLOPS);
643 	fdrop(fp, p);
644 	return (error);
645 }
646 
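/*
 * selwait is the common sleep channel that select() and poll() block on.
 * A collision is noted in nselcoll when more than one process waits on
 * the same selinfo at once.
 */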
647 static int	nselcoll;	/* Select collisions since boot */
648 int	selwait;
649 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
650 
651 /*
652  * Select system call.
653  */
654 #ifndef _SYS_SYSPROTO_H_
655 struct select_args {
656 	int	nd;
657 	fd_set	*in, *ou, *ex;
658 	struct	timeval *tv;
659 };
660 #endif
661 int
662 select(p, uap)
663 	register struct proc *p;
664 	register struct select_args *uap;
665 {
666 	/*
667 	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
668 	 * infds with the new FD_SETSIZE of 1024, and more than enough for
669 	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
670 	 * of 256.
671 	 */
672 	fd_mask s_selbits[howmany(2048, NFDBITS)];
673 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
674 	struct timeval atv, rtv, ttv;
675 	int s, ncoll, error, timo;
676 	u_int nbufbytes, ncpbytes, nfdbits;
677 
678 	if (uap->nd < 0)
679 		return (EINVAL);
680 	if (uap->nd > p->p_fd->fd_nfiles)
681 		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
682 
683 	/*
684 	 * Allocate just enough bits for the non-null fd_sets.  Use the
685 	 * preallocated auto buffer if possible.
686 	 */
687 	nfdbits = roundup(uap->nd, NFDBITS);
688 	ncpbytes = nfdbits / NBBY;
689 	nbufbytes = 0;
690 	if (uap->in != NULL)
691 		nbufbytes += 2 * ncpbytes;
692 	if (uap->ou != NULL)
693 		nbufbytes += 2 * ncpbytes;
694 	if (uap->ex != NULL)
695 		nbufbytes += 2 * ncpbytes;
696 	if (nbufbytes <= sizeof s_selbits)
697 		selbits = &s_selbits[0];
698 	else
699 		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
700 
701 	/*
702 	 * Assign pointers into the bit buffers and fetch the input bits.
703 	 * Put the output buffers together so that they can be bzeroed
704 	 * together.
705 	 */
706 	sbp = selbits;
707 #define	getbits(name, x) \
708 	do {								\
709 		if (uap->name == NULL)					\
710 			ibits[x] = NULL;				\
711 		else {							\
712 			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
713 			obits[x] = sbp;					\
714 			sbp += ncpbytes / sizeof *sbp;			\
715 			error = copyin(uap->name, ibits[x], ncpbytes);	\
716 			if (error != 0)					\
717 				goto done;				\
718 		}							\
719 	} while (0)
720 	getbits(in, 0);
721 	getbits(ou, 1);
722 	getbits(ex, 2);
723 #undef	getbits
724 	if (nbufbytes != 0)
725 		bzero(selbits, nbufbytes / 2);
726 
727 	if (uap->tv) {
728 		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
729 			sizeof (atv));
730 		if (error)
731 			goto done;
732 		if (itimerfix(&atv)) {
733 			error = EINVAL;
734 			goto done;
735 		}
736 		getmicrouptime(&rtv);
737 		timevaladd(&atv, &rtv);
738 	} else {
739 		atv.tv_sec = 0;
740 		atv.tv_usec = 0;
741 	}
742 	timo = 0;
743 retry:
744 	ncoll = nselcoll;
745 	p->p_flag |= P_SELECT;
746 	error = selscan(p, ibits, obits, uap->nd);
747 	if (error || p->p_retval[0])
748 		goto done;
749 	if (atv.tv_sec || atv.tv_usec) {
750 		getmicrouptime(&rtv);
751 		if (timevalcmp(&rtv, &atv, >=))
752 			goto done;
753 		ttv = atv;
754 		timevalsub(&ttv, &rtv);
755 		timo = ttv.tv_sec > 24 * 60 * 60 ?
756 		    24 * 60 * 60 * hz : tvtohz(&ttv);
757 	}
758 	s = splhigh();
759 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
760 		splx(s);
761 		goto retry;
762 	}
763 	p->p_flag &= ~P_SELECT;
764 
765 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
766 
767 	splx(s);
768 	if (error == 0)
769 		goto retry;
770 done:
771 	p->p_flag &= ~P_SELECT;
772 	/* select is not restarted after signals... */
773 	if (error == ERESTART)
774 		error = EINTR;
775 	if (error == EWOULDBLOCK)
776 		error = 0;
777 #define	putbits(name, x) \
778 	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
779 		error = error2;
780 	if (error == 0) {
781 		int error2;
782 
783 		putbits(in, 0);
784 		putbits(ou, 1);
785 		putbits(ex, 2);
786 #undef putbits
787 	}
788 	if (selbits != &s_selbits[0])
789 		free(selbits, M_SELECT);
790 	return (error);
791 }
792 
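/*
 * Scan the descriptors named by the select input bit masks.  For each bit
 * set, poll the file for the events corresponding to that mask (read,
 * write, exceptional condition) and mark ready descriptors in the output
 * masks.  The count of ready descriptors is returned in p_retval[0].
 */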
793 static int
794 selscan(p, ibits, obits, nfd)
795 	struct proc *p;
796 	fd_mask **ibits, **obits;
797 	int nfd;
798 {
799 	struct filedesc *fdp = p->p_fd;
800 	int msk, i, fd;
801 	fd_mask bits;
802 	struct file *fp;
803 	int n = 0;
804 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
805 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
806 
807 	for (msk = 0; msk < 3; msk++) {
808 		if (ibits[msk] == NULL)
809 			continue;
810 		for (i = 0; i < nfd; i += NFDBITS) {
811 			bits = ibits[msk][i/NFDBITS];
812 			/* ffs(int mask) not portable, fd_mask is long */
813 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
814 				if (!(bits & 1))
815 					continue;
816 				fp = fdp->fd_ofiles[fd];
817 				if (fp == NULL)
818 					return (EBADF);
819 				if (fo_poll(fp, flag[msk], fp->f_cred, p)) {
820 					obits[msk][(fd)/NFDBITS] |=
821 					    ((fd_mask)1 << ((fd) % NFDBITS));
822 					n++;
823 				}
824 			}
825 		}
826 	}
827 	p->p_retval[0] = n;
828 	return (0);
829 }
830 
831 /*
832  * Poll system call.
833  */
834 #ifndef _SYS_SYSPROTO_H_
835 struct poll_args {
836 	struct pollfd *fds;
837 	u_int	nfds;
838 	int	timeout;
839 };
840 #endif
841 int
842 poll(p, uap)
843 	struct proc *p;
844 	struct poll_args *uap;
845 {
846 	caddr_t bits;
847 	char smallbits[32 * sizeof(struct pollfd)];
848 	struct timeval atv, rtv, ttv;
849 	int s, ncoll, error = 0, timo;
850 	u_int nfds;
851 	size_t ni;
852 
853 	nfds = SCARG(uap, nfds);
854 	/*
855 	 * This is kinda bogus.  We have fd limits, but that is not
856 	 * really related to the size of the pollfd array.  Make sure
857 	 * we let the process use at least FD_SETSIZE entries and at
858 	 * least enough for the current limits.  We want to be reasonably
859 	 * safe, but not overly restrictive.
860 	 */
861 	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
862 		return (EINVAL);
863 	ni = nfds * sizeof(struct pollfd);
864 	if (ni > sizeof(smallbits))
865 		bits = malloc(ni, M_TEMP, M_WAITOK);
866 	else
867 		bits = smallbits;
868 	error = copyin(SCARG(uap, fds), bits, ni);
869 	if (error)
870 		goto done;
871 	if (SCARG(uap, timeout) != INFTIM) {
872 		atv.tv_sec = SCARG(uap, timeout) / 1000;
873 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
874 		if (itimerfix(&atv)) {
875 			error = EINVAL;
876 			goto done;
877 		}
878 		getmicrouptime(&rtv);
879 		timevaladd(&atv, &rtv);
880 	} else {
881 		atv.tv_sec = 0;
882 		atv.tv_usec = 0;
883 	}
884 	timo = 0;
885 retry:
886 	ncoll = nselcoll;
887 	p->p_flag |= P_SELECT;
888 	error = pollscan(p, (struct pollfd *)bits, nfds);
889 	if (error || p->p_retval[0])
890 		goto done;
891 	if (atv.tv_sec || atv.tv_usec) {
892 		getmicrouptime(&rtv);
893 		if (timevalcmp(&rtv, &atv, >=))
894 			goto done;
895 		ttv = atv;
896 		timevalsub(&ttv, &rtv);
897 		timo = ttv.tv_sec > 24 * 60 * 60 ?
898 		    24 * 60 * 60 * hz : tvtohz(&ttv);
899 	}
900 	s = splhigh();
901 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
902 		splx(s);
903 		goto retry;
904 	}
905 	p->p_flag &= ~P_SELECT;
906 	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo);
907 	splx(s);
908 	if (error == 0)
909 		goto retry;
910 done:
911 	p->p_flag &= ~P_SELECT;
912 	/* poll is not restarted after signals... */
913 	if (error == ERESTART)
914 		error = EINTR;
915 	if (error == EWOULDBLOCK)
916 		error = 0;
917 	if (error == 0) {
918 		error = copyout(bits, SCARG(uap, fds), ni);
919 		if (error)
920 			goto out;
921 	}
922 out:
923 	if (ni > sizeof(smallbits))
924 		free(bits, M_TEMP);
925 	return (error);
926 }
927 
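/*
 * Scan the pollfd array, polling each open descriptor for the events in
 * fds->events and recording the result in fds->revents.  Descriptors that
 * are out of range or not open are marked POLLNVAL; negative descriptors
 * are skipped.  The count of descriptors with non-zero revents is
 * returned in p_retval[0].
 */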
928 static int
929 pollscan(p, fds, nfd)
930 	struct proc *p;
931 	struct pollfd *fds;
932 	u_int nfd;
933 {
934 	register struct filedesc *fdp = p->p_fd;
935 	int i;
936 	struct file *fp;
937 	int n = 0;
938 
939 	for (i = 0; i < nfd; i++, fds++) {
940 		if (fds->fd >= fdp->fd_nfiles) {
941 			fds->revents = POLLNVAL;
942 			n++;
943 		} else if (fds->fd < 0) {
944 			fds->revents = 0;
945 		} else {
946 			fp = fdp->fd_ofiles[fds->fd];
947 			if (fp == NULL) {
948 				fds->revents = POLLNVAL;
949 				n++;
950 			} else {
951 				/*
952 				 * Note: backend also returns POLLHUP and
953 				 * POLLERR if appropriate.
954 				 */
955 				fds->revents = fo_poll(fp, fds->events,
956 				    fp->f_cred, p);
957 				if (fds->revents != 0)
958 					n++;
959 			}
960 		}
961 	}
962 	p->p_retval[0] = n;
963 	return (0);
964 }
965 
966 /*
967  * OpenBSD poll system call.
968  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
969  */
970 #ifndef _SYS_SYSPROTO_H_
971 struct openbsd_poll_args {
972 	struct pollfd *fds;
973 	u_int	nfds;
974 	int	timeout;
975 };
976 #endif
977 int
978 openbsd_poll(p, uap)
979 	register struct proc *p;
980 	register struct openbsd_poll_args *uap;
981 {
982 	return (poll(p, (struct poll_args *)uap));
983 }
984 
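/*
 * Poll routine for devices that are always ready for I/O; returns the
 * subset of the requested events that can be satisfied immediately.
 */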
985 /*ARGSUSED*/
986 int
987 seltrue(dev, events, p)
988 	dev_t dev;
989 	int events;
990 	struct proc *p;
991 {
992 
993 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
994 }
995 
996 /*
997  * Record a select request.
998  */
999 void
1000 selrecord(selector, sip)
1001 	struct proc *selector;
1002 	struct selinfo *sip;
1003 {
1004 	struct proc *p;
1005 	pid_t mypid;
1006 
1007 	mypid = selector->p_pid;
1008 	if (sip->si_pid == mypid)
1009 		return;
1010 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
1011 	    p->p_wchan == (caddr_t)&selwait)
1012 		sip->si_flags |= SI_COLL;
1013 	else
1014 		sip->si_pid = mypid;
1015 }
1016 
1017 /*
1018  * Do a wakeup when a selectable event occurs.
1019  */
1020 void
1021 selwakeup(sip)
1022 	register struct selinfo *sip;
1023 {
1024 	register struct proc *p;
1025 	int s;
1026 
1027 	if (sip->si_pid == 0)
1028 		return;
1029 	if (sip->si_flags & SI_COLL) {
1030 		nselcoll++;
1031 		sip->si_flags &= ~SI_COLL;
1032 		wakeup((caddr_t)&selwait);
1033 	}
1034 	p = pfind(sip->si_pid);
1035 	sip->si_pid = 0;
1036 	if (p != NULL) {
1037 		s = splhigh();
1038 		if (p->p_wchan == (caddr_t)&selwait) {
1039 			if (p->p_stat == SSLEEP)
1040 				setrunnable(p);
1041 			else
1042 				unsleep(p);
1043 		} else if (p->p_flag & P_SELECT)
1044 			p->p_flag &= ~P_SELECT;
1045 		splx(s);
1046 	}
1047 }
1048