xref: /netbsd/sys/kern/sys_generic.c (revision bf9ec67e)
1 /*	$NetBSD: sys_generic.c,v 1.62 2002/03/22 18:58:59 jdolecek Exp $	*/
2 
3 /*
4  * Copyright (c) 1982, 1986, 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
41  */
42 
43 #include <sys/cdefs.h>
44 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.62 2002/03/22 18:58:59 jdolecek Exp $");
45 
46 #include "opt_ktrace.h"
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/filedesc.h>
51 #include <sys/ioctl.h>
52 #include <sys/file.h>
53 #include <sys/proc.h>
54 #include <sys/socketvar.h>
55 #include <sys/signalvar.h>
56 #include <sys/uio.h>
57 #include <sys/kernel.h>
58 #include <sys/stat.h>
59 #include <sys/malloc.h>
60 #include <sys/poll.h>
61 #ifdef KTRACE
62 #include <sys/ktrace.h>
63 #endif
64 
65 #include <sys/mount.h>
66 #include <sys/syscallargs.h>
67 
68 int selscan __P((struct proc *, fd_mask *, fd_mask *, int, register_t *));
69 int pollscan __P((struct proc *, struct pollfd *, int, register_t *));
70 
71 /*
72  * Read system call.
73  */
74 /* ARGSUSED */
75 int
76 sys_read(struct proc *p, void *v, register_t *retval)
77 {
78 	struct sys_read_args /* {
79 		syscallarg(int)		fd;
80 		syscallarg(void *)	buf;
81 		syscallarg(size_t)	nbyte;
82 	} */ *uap = v;
83 	int		fd;
84 	struct file	*fp;
85 	struct filedesc	*fdp;
86 
87 	fd = SCARG(uap, fd);
88 	fdp = p->p_fd;
89 
90 	if ((fp = fd_getfile(fdp, fd)) == NULL)
91 		return (EBADF);
92 
93 	if ((fp->f_flag & FREAD) == 0)
94 		return (EBADF);
95 
96 	FILE_USE(fp);
97 
98 	/* dofileread() will unuse the descriptor for us */
99 	return (dofileread(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
100 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
101 }
102 
/*
 * Shared back end for the read system call: perform a single-segment
 * uio read on an already-referenced file and return the byte count
 * through *retval.  The caller has done fd_getfile()/FILE_USE(); the
 * use count is dropped here on every return path.
 */
int
dofileread(struct proc *p, int fd, struct file *fp, void *buf, size_t nbyte,
	off_t *offset, int flags, register_t *retval)
{
	struct uio	auio;
	struct iovec	aiov;
	long		cnt, error;
#ifdef KTRACE
	struct iovec	ktriov;
#endif
	error = 0;

	/* Describe the user buffer with a single-segment uio. */
	aiov.iov_base = (caddr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Reads return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	if (error)
		/* A partial transfer cut short by a signal counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_READ, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FILE_UNUSE(fp, p);
	return (error);
}
157 
158 /*
159  * Scatter read system call.
160  */
161 int
162 sys_readv(struct proc *p, void *v, register_t *retval)
163 {
164 	struct sys_readv_args /* {
165 		syscallarg(int)				fd;
166 		syscallarg(const struct iovec *)	iovp;
167 		syscallarg(int)				iovcnt;
168 	} */ *uap = v;
169 	int		fd;
170 	struct file	*fp;
171 	struct filedesc	*fdp;
172 
173 	fd = SCARG(uap, fd);
174 	fdp = p->p_fd;
175 
176 	if ((fp = fd_getfile(fdp, fd)) == NULL)
177 		return (EBADF);
178 
179 	if ((fp->f_flag & FREAD) == 0)
180 		return (EBADF);
181 
182 	FILE_USE(fp);
183 
184 	/* dofilereadv() will unuse the descriptor for us */
185 	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
186 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
187 }
188 
/*
 * Shared back end for scatter-read system calls: copy the iovec array
 * in from user space, validate the lengths, and perform a uio-based
 * read on fp, returning the byte count through *retval.  The caller
 * has already done fd_getfile()/FILE_USE(); the use count is dropped
 * here on every return path.
 */
int
dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
	int iovcnt, off_t *offset, int flags, register_t *retval)
{
	struct uio	auio;
	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
	long		i, cnt, error;
	u_int		iovlen;
#ifdef KTRACE
	struct iovec	*ktriov;
#endif

	error = 0;
#ifdef KTRACE
	ktriov = NULL;
#endif
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);
	if ((u_int)iovcnt > UIO_SMALLIOV) {
		if ((u_int)iovcnt > IOV_MAX) {
			error = EINVAL;
			goto out;
		}
		iov = malloc(iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else if ((u_int)iovcnt > 0) {
		/* Small requests are served from the on-stack array. */
		iov = aiov;
		needfree = NULL;
	} else {
		error = EINVAL;
		goto out;
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	error = copyin(iovp, iov, iovlen);
	if (error)
		goto done;
	/* Sum the segment lengths, guarding against ssize_t overflow. */
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))  {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
	if (error)
		/* A partial transfer cut short by a signal counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;
#ifdef KTRACE
	/* Trace on success; free the saved copy whenever it was made. */
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt, error);
		free(ktriov, M_TEMP);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV);
 out:
	FILE_UNUSE(fp, p);
	return (error);
}
275 
276 /*
277  * Write system call
278  */
279 int
280 sys_write(struct proc *p, void *v, register_t *retval)
281 {
282 	struct sys_write_args /* {
283 		syscallarg(int)			fd;
284 		syscallarg(const void *)	buf;
285 		syscallarg(size_t)		nbyte;
286 	} */ *uap = v;
287 	int		fd;
288 	struct file	*fp;
289 	struct filedesc	*fdp;
290 
291 	fd = SCARG(uap, fd);
292 	fdp = p->p_fd;
293 
294 	if ((fp = fd_getfile(fdp, fd)) == NULL)
295 		return (EBADF);
296 
297 	if ((fp->f_flag & FWRITE) == 0)
298 		return (EBADF);
299 
300 	FILE_USE(fp);
301 
302 	/* dofilewrite() will unuse the descriptor for us */
303 	return (dofilewrite(p, fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
304 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
305 }
306 
/*
 * Shared back end for the write system call: perform a single-segment
 * uio write on an already-referenced file and return the byte count
 * through *retval.  The caller has done fd_getfile()/FILE_USE(); the
 * use count is dropped here on every return path.
 */
int
dofilewrite(struct proc *p, int fd, struct file *fp, const void *buf,
	size_t nbyte, off_t *offset, int flags, register_t *retval)
{
	struct uio	auio;
	struct iovec	aiov;
	long		cnt, error;
#ifdef KTRACE
	struct iovec	ktriov;
#endif

	error = 0;
	/* Describe the user buffer with a single-segment uio. */
	aiov.iov_base = (caddr_t)buf;		/* XXX kills const */
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;

	/*
	 * Writes return ssize_t because -1 is returned on error.  Therefore
	 * we must restrict the length to SSIZE_MAX to avoid garbage return
	 * values.
	 */
	if (auio.uio_resid > SSIZE_MAX) {
		error = EINVAL;
		goto out;
	}

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO))
		ktriov = aiov;
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
	if (error) {
		/* A partial transfer cut short by a signal counts as success. */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* A failed write to a broken pipe also raises SIGPIPE. */
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;	/* bytes actually transferred */
#ifdef KTRACE
	if (KTRPOINT(p, KTR_GENIO) && error == 0)
		ktrgenio(p, fd, UIO_WRITE, &ktriov, cnt, error);
#endif
	*retval = cnt;
 out:
	FILE_UNUSE(fp, p);
	return (error);
}
364 
365 /*
366  * Gather write system call
367  */
368 int
369 sys_writev(struct proc *p, void *v, register_t *retval)
370 {
371 	struct sys_writev_args /* {
372 		syscallarg(int)				fd;
373 		syscallarg(const struct iovec *)	iovp;
374 		syscallarg(int)				iovcnt;
375 	} */ *uap = v;
376 	int		fd;
377 	struct file	*fp;
378 	struct filedesc	*fdp;
379 
380 	fd = SCARG(uap, fd);
381 	fdp = p->p_fd;
382 
383 	if ((fp = fd_getfile(fdp, fd)) == NULL)
384 		return (EBADF);
385 
386 	if ((fp->f_flag & FWRITE) == 0)
387 		return (EBADF);
388 
389 	FILE_USE(fp);
390 
391 	/* dofilewritev() will unuse the descriptor for us */
392 	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt),
393 	    &fp->f_offset, FOF_UPDATE_OFFSET, retval));
394 }
395 
396 int
397 dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
398 	int iovcnt, off_t *offset, int flags, register_t *retval)
399 {
400 	struct uio	auio;
401 	struct iovec	*iov, *needfree, aiov[UIO_SMALLIOV];
402 	long		i, cnt, error;
403 	u_int		iovlen;
404 #ifdef KTRACE
405 	struct iovec	*ktriov;
406 #endif
407 
408 	error = 0;
409 #ifdef KTRACE
410 	ktriov = NULL;
411 #endif
412 	/* note: can't use iovlen until iovcnt is validated */
413 	iovlen = iovcnt * sizeof(struct iovec);
414 	if ((u_int)iovcnt > UIO_SMALLIOV) {
415 		if ((u_int)iovcnt > IOV_MAX) {
416 			error = EINVAL;
417 			goto out;
418 		}
419 		iov = malloc(iovlen, M_IOV, M_WAITOK);
420 		needfree = iov;
421 	} else if ((u_int)iovcnt > 0) {
422 		iov = aiov;
423 		needfree = NULL;
424 	} else {
425 		error = EINVAL;
426 		goto out;
427 	}
428 
429 	auio.uio_iov = iov;
430 	auio.uio_iovcnt = iovcnt;
431 	auio.uio_rw = UIO_WRITE;
432 	auio.uio_segflg = UIO_USERSPACE;
433 	auio.uio_procp = p;
434 	error = copyin(iovp, iov, iovlen);
435 	if (error)
436 		goto done;
437 	auio.uio_resid = 0;
438 	for (i = 0; i < iovcnt; i++) {
439 		auio.uio_resid += iov->iov_len;
440 		/*
441 		 * Writes return ssize_t because -1 is returned on error.
442 		 * Therefore we must restrict the length to SSIZE_MAX to
443 		 * avoid garbage return values.
444 		 */
445 		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
446 			error = EINVAL;
447 			goto done;
448 		}
449 		iov++;
450 	}
451 #ifdef KTRACE
452 	/*
453 	 * if tracing, save a copy of iovec
454 	 */
455 	if (KTRPOINT(p, KTR_GENIO))  {
456 		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
457 		memcpy((caddr_t)ktriov, (caddr_t)auio.uio_iov, iovlen);
458 	}
459 #endif
460 	cnt = auio.uio_resid;
461 	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
462 	if (error) {
463 		if (auio.uio_resid != cnt && (error == ERESTART ||
464 		    error == EINTR || error == EWOULDBLOCK))
465 			error = 0;
466 		if (error == EPIPE)
467 			psignal(p, SIGPIPE);
468 	}
469 	cnt -= auio.uio_resid;
470 #ifdef KTRACE
471 	if (KTRPOINT(p, KTR_GENIO))
472 		if (error == 0) {
473 			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt, error);
474 		free(ktriov, M_TEMP);
475 	}
476 #endif
477 	*retval = cnt;
478  done:
479 	if (needfree)
480 		free(needfree, M_IOV);
481  out:
482 	FILE_UNUSE(fp, p);
483 	return (error);
484 }
485 
/*
 * Ioctl system call: decode the command word, marshal the argument
 * between user space and a kernel buffer according to the IOC_IN /
 * IOC_OUT / IOC_VOID encoding, and dispatch to the file's fo_ioctl
 * routine (with a few generic commands handled inline).
 */
/* ARGSUSED */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int)		fd;
		syscallarg(u_long)	com;
		syscallarg(caddr_t)	data;
	} */ *uap = v;
	struct file	*fp;
	struct filedesc	*fdp;
	u_long		com;
	int		error;
	u_int		size;
	caddr_t		data, memp;
	int		tmp;
	/* On-stack argument buffer; larger arguments are malloc'ed below. */
#define	STK_PARAMS	128
	u_long		stkbuf[STK_PARAMS/sizeof(u_long)];

	error = 0;
	fdp = p->p_fd;

	if ((fp = fd_getfile(fdp, SCARG(uap, fd))) == NULL)
		return (EBADF);

	FILE_USE(fp);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		error = EBADF;
		goto out;
	}

	/* Close-on-exec lives in the descriptor table; handle it here. */
	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		goto out;

	case FIOCLEX:
		fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				goto out;
			}
		} else
			/* Zero-size IOC_IN: the pointer itself is the arg. */
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Mirror non-blocking mode in f_flag, then pass it down. */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		/* Likewise for async-notification mode. */
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	case FIOSETOWN:
		tmp = *(int *)data;
		if (fp->f_type == DTYPE_SOCKET) {
			/* Sockets store the owner pgid directly. */
			((struct socket *)fp->f_data)->so_pgid = tmp;
			error = 0;
			break;
		}
		if (tmp <= 0) {
			/* Non-positive values are negated for TIOCSPGRP. */
			tmp = -tmp;
		} else {
			struct proc *p1 = pfind(tmp);
			if (p1 == 0) {
				error = ESRCH;
				break;
			}
			/* A positive pid is mapped to its process group. */
			tmp = p1->p_pgrp->pg_id;
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			error = 0;
			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
			break;
		}
		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
		if (error == 0)
			/* Negate the pgrp id for the FIOGETOWN convention. */
			*(int *)data = -*(int *)data;
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, SCARG(uap, data), size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
 out:
	FILE_UNUSE(fp, p);
	switch (error) {
	case -1:
		/* Old-style driver returned -1: log it, then... */
		printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
		    "pid=%d comm=%s\n",
		    (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
		    (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
		    p->p_pid, p->p_comm);
		/* FALLTHROUGH */
	case EPASSTHROUGH:
		/* ...map "command not handled" to the standard ENOTTY. */
		error = ENOTTY;
		/* FALLTHROUGH */
	default:
		return (error);
	}
}
644 
645 int	selwait, nselcoll;
646 
/*
 * Select system call.  Copies in up to three descriptor bitmaps, scans
 * them with selscan(), and sleeps on selwait until a descriptor is
 * ready, a signal arrives, or the timeout expires.  The nselcoll /
 * P_SELECT protocol detects selwakeup() calls that race with the scan.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int)			nd;
		syscallarg(fd_set *)		in;
		syscallarg(fd_set *)		ou;
		syscallarg(fd_set *)		ex;
		syscallarg(struct timeval *)	tv;
	} */ *uap = v;
	caddr_t		bits;
	/* Room for six bitmaps: three input sets and three result sets. */
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	struct		timeval atv;
	int		s, ncoll, error, timo;
	size_t		ni;

	error = 0;
	if (SCARG(uap, nd) < 0)
		return (EINVAL);
	if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		SCARG(uap, nd) = p->p_fd->fd_nfiles;
	}
	/* Bytes per bitmap, rounded up to whole fd_mask words. */
	ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits))
		bits = malloc(ni * 6, M_TEMP, M_WAITOK);
	else
		bits = smallbits;

	/* Copy in each supplied set; an omitted set is treated as empty. */
#define	getbits(name, x)						\
	if (SCARG(uap, name)) {						\
		error = copyin(SCARG(uap, name), bits + ni * x, ni);	\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (caddr_t)&atv,
			sizeof(atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		/* Convert the relative timeout into an absolute deadline. */
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;	/* no timeout: sleep until an event */
 retry:
	ncoll = nselcoll;	/* snapshot the collision counter */
	p->p_flag |= P_SELECT;
	error = selscan(p, (fd_mask *)(bits + ni * 0),
			   (fd_mask *)(bits + ni * 3), SCARG(uap, nd), retval);
	if (error || *retval)
		goto done;
	if (SCARG(uap, tv)) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	/*
	 * If a selwakeup()/collision happened after the scan started,
	 * rescan immediately instead of sleeping.
	 */
	s = splsched();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
	splx(s);
	if (error == 0)
		goto retry;
 done:
	p->p_flag &= ~P_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {

		/* Copy the result bitmaps back over the user's sets. */
#define	putbits(name, x)						\
		if (SCARG(uap, name)) {					\
			error = copyout(bits + ni * x, SCARG(uap, name), ni); \
			if (error)					\
				goto out;				\
		}
		putbits(in, 3);
		putbits(ou, 4);
		putbits(ex, 5);
#undef putbits
	}
 out:
	if (ni * 6 > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}
756 
757 int
758 selscan(struct proc *p, fd_mask *ibitp, fd_mask *obitp, int nfd,
759 	register_t *retval)
760 {
761 	struct filedesc	*fdp;
762 	int		msk, i, j, fd, n;
763 	fd_mask		ibits, obits;
764 	struct file	*fp;
765 	static int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
766 			       POLLWRNORM | POLLHUP | POLLERR,
767 			       POLLRDBAND };
768 
769 	fdp = p->p_fd;
770 	n = 0;
771 	for (msk = 0; msk < 3; msk++) {
772 		for (i = 0; i < nfd; i += NFDBITS) {
773 			ibits = *ibitp++;
774 			obits = 0;
775 			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
776 				ibits &= ~(1 << j);
777 				if ((fp = fd_getfile(fdp, fd)) == NULL)
778 					return (EBADF);
779 				FILE_USE(fp);
780 				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
781 					obits |= (1 << j);
782 					n++;
783 				}
784 				FILE_UNUSE(fp, p);
785 			}
786 			*obitp++ = obits;
787 		}
788 	}
789 	*retval = n;
790 	return (0);
791 }
792 
/*
 * Poll system call.  Copies in the pollfd array, scans it with
 * pollscan(), and sleeps on selwait until a descriptor is ready, a
 * signal arrives, or the timeout expires.  Uses the same nselcoll /
 * P_SELECT race-detection protocol as sys_select().
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *)	fds;
		syscallarg(u_int)		nfds;
		syscallarg(int)			timeout;
	} */ *uap = v;
	caddr_t		bits;
	/* On-stack space for modest arrays; larger ones are malloc'ed. */
	char		smallbits[32 * sizeof(struct pollfd)];
	struct timeval	atv;
	int		s, ncoll, error, timo;
	size_t		ni;

	error = 0;
	if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		SCARG(uap, nfds) = p->p_fd->fd_nfiles;
	}
	ni = SCARG(uap, nfds) * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;

	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done;

	if (SCARG(uap, timeout) != INFTIM) {
		/* Millisecond timeout -> timeval -> absolute deadline. */
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		s = splclock();
		timeradd(&atv, &time, &atv);
		splx(s);
	} else
		timo = 0;	/* INFTIM: sleep until an event */
 retry:
	ncoll = nselcoll;	/* snapshot the collision counter */
	p->p_flag |= P_SELECT;
	error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds), retval);
	if (error || *retval)
		goto done;
	if (SCARG(uap, timeout) != INFTIM) {
		/*
		 * We have to recalculate the timeout on every retry.
		 */
		timo = hzto(&atv);
		if (timo <= 0)
			goto done;
	}
	/*
	 * If a selwakeup()/collision happened after the scan started,
	 * rescan immediately instead of sleeping.
	 */
	s = splsched();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo);
	splx(s);
	if (error == 0)
		goto retry;
 done:
	p->p_flag &= ~P_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* Hand the updated array (revents filled in) back. */
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
 out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}
878 
879 int
880 pollscan(struct proc *p, struct pollfd *fds, int nfd, register_t *retval)
881 {
882 	struct filedesc	*fdp;
883 	int		i, n;
884 	struct file	*fp;
885 
886 	fdp = p->p_fd;
887 	n = 0;
888 	for (i = 0; i < nfd; i++, fds++) {
889 		if (fds->fd >= fdp->fd_nfiles) {
890 			fds->revents = POLLNVAL;
891 			n++;
892 		} else if (fds->fd < 0) {
893 			fds->revents = 0;
894 		} else {
895 			if ((fp = fd_getfile(fdp, fds->fd)) == NULL) {
896 				fds->revents = POLLNVAL;
897 				n++;
898 			} else {
899 				FILE_USE(fp);
900 				fds->revents = (*fp->f_ops->fo_poll)(fp,
901 				    fds->events | POLLERR | POLLHUP, p);
902 				if (fds->revents != 0)
903 					n++;
904 				FILE_UNUSE(fp, p);
905 			}
906 		}
907 	}
908 	*retval = n;
909 	return (0);
910 }
911 
912 /*ARGSUSED*/
913 int
914 seltrue(dev_t dev, int events, struct proc *p)
915 {
916 
917 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
918 }
919 
920 /*
921  * Record a select request.
922  */
923 void
924 selrecord(struct proc *selector, struct selinfo *sip)
925 {
926 	struct proc	*p;
927 	pid_t		mypid;
928 
929 	mypid = selector->p_pid;
930 	if (sip->si_pid == mypid)
931 		return;
932 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
933 	    p->p_wchan == (caddr_t)&selwait)
934 		sip->si_flags |= SI_COLL;
935 	else {
936 		sip->si_flags &= ~SI_COLL;
937 		sip->si_pid = mypid;
938 	}
939 }
940 
/*
 * Do a wakeup when a selectable event occurs.  Wakes the process
 * recorded in the selinfo and, on a collision, every process sleeping
 * on selwait.
 */
void
selwakeup(sip)
	struct selinfo *sip;
{
	struct proc *p;
	int s;

	if (sip->si_pid == 0)
		return;		/* nobody recorded an interest */
	if (sip->si_flags & SI_COLL) {
		/* Collision: several selectors; wake everyone in select. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;	/* one-shot: clear the recorded pid */
	if (p != NULL) {
		SCHED_LOCK(s);
		if (p->p_wchan == (caddr_t)&selwait) {
			/* Sleeping in select/poll: make it runnable. */
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p);
		} else if (p->p_flag & P_SELECT)
			/* Not asleep yet: force a rescan instead. */
			p->p_flag &= ~P_SELECT;
		SCHED_UNLOCK(s);
	}
}
972