xref: /dragonfly/sys/kern/sys_generic.c (revision 38a690d7)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
40  * $DragonFly: src/sys/kern/sys_generic.c,v 1.10 2003/07/30 00:19:14 dillon Exp $
41  */
42 
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/sysproto.h>
48 #include <sys/filedesc.h>
49 #include <sys/filio.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/proc.h>
53 #include <sys/signalvar.h>
54 #include <sys/socketvar.h>
55 #include <sys/uio.h>
56 #include <sys/kernel.h>
57 #include <sys/malloc.h>
58 #include <sys/poll.h>
59 #include <sys/resourcevar.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
62 #include <sys/buf.h>
63 #ifdef KTRACE
64 #include <sys/ktrace.h>
65 #endif
66 #include <vm/vm.h>
67 #include <vm/vm_page.h>
68 #include <sys/file2.h>
69 
70 #include <machine/limits.h>
71 
72 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
73 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
74 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
75 
76 static int	pollscan __P((struct proc *, struct pollfd *, u_int, int *));
77 static int	selscan __P((struct proc *, fd_mask **, fd_mask **,
78 			int, int *));
79 static int	dofileread __P((struct file *, int, void *,
80 			size_t, off_t, int, int *));
81 static int	dofilewrite __P((struct file *, int,
82 			const void *, size_t, off_t, int, int *));
83 
84 struct file*
85 holdfp(fdp, fd, flag)
86 	struct filedesc* fdp;
87 	int fd, flag;
88 {
89 	struct file* fp;
90 
91 	if (((u_int)fd) >= fdp->fd_nfiles ||
92 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
93 	    (fp->f_flag & flag) == 0) {
94 		return (NULL);
95 	}
96 	fhold(fp);
97 	return (fp);
98 }
99 
100 /*
101  * Read system call.
102  */
103 int
104 read(struct read_args *uap)
105 {
106 	struct thread *td = curthread;
107 	struct proc *p = td->td_proc;
108 	struct file *fp;
109 	int error;
110 
111 	KKASSERT(p);
112 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
113 		return (EBADF);
114 	error = dofileread(fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0,
115 			&uap->sysmsg_result);
116 	fdrop(fp, td);
117 	return(error);
118 }
119 
120 /*
121  * Pread system call
122  */
123 int
124 pread(struct pread_args *uap)
125 {
126 	struct thread *td = curthread;
127 	struct proc *p = td->td_proc;
128 	struct file *fp;
129 	int error;
130 
131 	KKASSERT(p);
132 	if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
133 		return (EBADF);
134 	if (fp->f_type != DTYPE_VNODE) {
135 		error = ESPIPE;
136 	} else {
137 	    error = dofileread(fp, uap->fd, uap->buf, uap->nbyte,
138 		uap->offset, FOF_OFFSET, &uap->sysmsg_result);
139 	}
140 	fdrop(fp, td);
141 	return(error);
142 }
143 
144 /*
145  * Code common for read and pread
146  */
147 int
148 dofileread(fp, fd, buf, nbyte, offset, flags, res)
149 	struct file *fp;
150 	int fd, flags;
151 	void *buf;
152 	size_t nbyte;
153 	off_t offset;
154 	int *res;
155 {
156 	struct thread *td = curthread;
157 	struct proc *p = td->td_proc;
158 	struct uio auio;
159 	struct iovec aiov;
160 	long cnt, error = 0;
161 #ifdef KTRACE
162 	struct iovec ktriov;
163 	struct uio ktruio;
164 	int didktr = 0;
165 #endif
166 
167 	aiov.iov_base = (caddr_t)buf;
168 	aiov.iov_len = nbyte;
169 	auio.uio_iov = &aiov;
170 	auio.uio_iovcnt = 1;
171 	auio.uio_offset = offset;
172 	if (nbyte > INT_MAX)
173 		return (EINVAL);
174 	auio.uio_resid = nbyte;
175 	auio.uio_rw = UIO_READ;
176 	auio.uio_segflg = UIO_USERSPACE;
177 	auio.uio_td = td;
178 #ifdef KTRACE
179 	/*
180 	 * if tracing, save a copy of iovec
181 	 */
182 	if (KTRPOINT(td, KTR_GENIO)) {
183 		ktriov = aiov;
184 		ktruio = auio;
185 		didktr = 1;
186 	}
187 #endif
188 	cnt = nbyte;
189 
190 	if ((error = fo_read(fp, &auio, fp->f_cred, flags, td))) {
191 		if (auio.uio_resid != cnt && (error == ERESTART ||
192 		    error == EINTR || error == EWOULDBLOCK))
193 			error = 0;
194 	}
195 	cnt -= auio.uio_resid;
196 #ifdef KTRACE
197 	if (didktr && error == 0) {
198 		ktruio.uio_iov = &ktriov;
199 		ktruio.uio_resid = cnt;
200 		ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
201 	}
202 #endif
203 	*res = cnt;
204 	return (error);
205 }
206 
207 /*
208  * Scatter read system call.
209  */
210 int
211 readv(struct readv_args *uap)
212 {
213 	struct thread *td = curthread;
214 	struct proc *p = td->td_proc;
215 	struct file *fp;
216 	struct filedesc *fdp = p->p_fd;
217 	struct uio auio;
218 	struct iovec *iov;
219 	struct iovec *needfree;
220 	struct iovec aiov[UIO_SMALLIOV];
221 	long i, cnt, error = 0;
222 	u_int iovlen;
223 #ifdef KTRACE
224 	struct iovec *ktriov = NULL;
225 	struct uio ktruio;
226 #endif
227 
228 	if ((fp = holdfp(fdp, uap->fd, FREAD)) == NULL)
229 		return (EBADF);
230 	/* note: can't use iovlen until iovcnt is validated */
231 	iovlen = uap->iovcnt * sizeof (struct iovec);
232 	if (uap->iovcnt > UIO_SMALLIOV) {
233 		if (uap->iovcnt > UIO_MAXIOV)
234 			return (EINVAL);
235 		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
236 		needfree = iov;
237 	} else {
238 		iov = aiov;
239 		needfree = NULL;
240 	}
241 	auio.uio_iov = iov;
242 	auio.uio_iovcnt = uap->iovcnt;
243 	auio.uio_rw = UIO_READ;
244 	auio.uio_segflg = UIO_USERSPACE;
245 	auio.uio_td = td;
246 	auio.uio_offset = -1;
247 	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
248 		goto done;
249 	auio.uio_resid = 0;
250 	for (i = 0; i < uap->iovcnt; i++) {
251 		if (iov->iov_len > INT_MAX - auio.uio_resid) {
252 			error = EINVAL;
253 			goto done;
254 		}
255 		auio.uio_resid += iov->iov_len;
256 		iov++;
257 	}
258 #ifdef KTRACE
259 	/*
260 	 * if tracing, save a copy of iovec
261 	 */
262 	if (KTRPOINT(td, KTR_GENIO))  {
263 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
264 		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
265 		ktruio = auio;
266 	}
267 #endif
268 	cnt = auio.uio_resid;
269 	if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) {
270 		if (auio.uio_resid != cnt && (error == ERESTART ||
271 		    error == EINTR || error == EWOULDBLOCK))
272 			error = 0;
273 	}
274 	cnt -= auio.uio_resid;
275 #ifdef KTRACE
276 	if (ktriov != NULL) {
277 		if (error == 0) {
278 			ktruio.uio_iov = ktriov;
279 			ktruio.uio_resid = cnt;
280 			ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktruio,
281 			    error);
282 		}
283 		FREE(ktriov, M_TEMP);
284 	}
285 #endif
286 	uap->sysmsg_result = cnt;
287 done:
288 	fdrop(fp, td);
289 	if (needfree)
290 		FREE(needfree, M_IOV);
291 	return (error);
292 }
293 
294 /*
295  * Write system call
296  */
297 int
298 write(struct write_args *uap)
299 {
300 	struct thread *td = curthread;
301 	struct proc *p = td->td_proc;
302 	struct file *fp;
303 	int error;
304 
305 	KKASSERT(p);
306 
307 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
308 		return (EBADF);
309 	error = dofilewrite(fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0,
310 			&uap->sysmsg_result);
311 	fdrop(fp, td);
312 	return(error);
313 }
314 
315 /*
316  * Pwrite system call
317  */
318 int
319 pwrite(struct pwrite_args *uap)
320 {
321 	struct thread *td = curthread;
322 	struct proc *p = td->td_proc;
323 	struct file *fp;
324 	int error;
325 
326 	KKASSERT(p);
327 	if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
328 		return (EBADF);
329 	if (fp->f_type != DTYPE_VNODE) {
330 		error = ESPIPE;
331 	} else {
332 	    error = dofilewrite(fp, uap->fd, uap->buf, uap->nbyte,
333 		uap->offset, FOF_OFFSET, &uap->sysmsg_result);
334 	}
335 	fdrop(fp, td);
336 	return(error);
337 }
338 
/*
 * Common back end for write() and pwrite().
 *
 * Transfers up to `nbyte` bytes from the user buffer `buf` to `fp`,
 * at `offset` when FOF_OFFSET is set in `flags`, otherwise at the
 * file's current position.  The byte count actually written is
 * returned through `res`.  EPIPE additionally raises SIGPIPE on the
 * process, matching POSIX write() semantics.
 */
static int
dofilewrite(
	struct file *fp,
	int fd,
	const void *buf,
	size_t nbyte,
	off_t offset,
	int flags,
	int *res
) {
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct uio auio;
	struct iovec aiov;
	long cnt, error = 0;
#ifdef KTRACE
	struct iovec ktriov;
	struct uio ktruio;
	int didktr = 0;
#endif

	/* cast away const; the write path does not modify the buffer */
	aiov.iov_base = (void *)(uintptr_t)buf;
	aiov.iov_len = nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = offset;
	if (nbyte > INT_MAX)
		return (EINVAL);
	auio.uio_resid = nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		ktriov = aiov;
		ktruio = auio;
		didktr = 1;
	}
#endif
	cnt = nbyte;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();	/* let the buffer cache throttle writers */
	if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) {
		/* a partially completed transfer is reported as success */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* POSIX: a broken pipe also delivers SIGPIPE */
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (didktr && error == 0) {
		ktruio.uio_iov = &ktriov;
		ktruio.uio_resid = cnt;
		ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
	}
#endif
	*res = cnt;
	return (error);
}
402 
/*
 * Gather write system call.
 *
 * Writes from the uap->iovcnt user buffers described by uap->iovp at
 * the file's current offset.  Fails with EINVAL when iovcnt exceeds
 * UIO_MAXIOV or the total transfer size would overflow an int.
 */
int
writev(struct writev_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct filedesc *fdp;
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree;
	struct iovec aiov[UIO_SMALLIOV];	/* in-stack fast path */
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	KKASSERT(p);
	fdp = p->p_fd;

	if ((fp = holdfp(fdp, uap->fd, FWRITE)) == NULL)
		return (EBADF);
	/* note: can't use iovlen until iovcnt is validated */
	iovlen = uap->iovcnt * sizeof (struct iovec);
	if (uap->iovcnt > UIO_SMALLIOV) {
		if (uap->iovcnt > UIO_MAXIOV) {
			/* exit via done: so the fp reference is dropped */
			needfree = NULL;
			error = EINVAL;
			goto done;
		}
		MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
		needfree = iov;
	} else {
		iov = aiov;
		needfree = NULL;
	}
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_offset = -1;
	if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
		goto done;
	/* total up the transfer size, rejecting int overflow */
	auio.uio_resid = 0;
	for (i = 0; i < uap->iovcnt; i++) {
		if (iov->iov_len > INT_MAX - auio.uio_resid) {
			error = EINVAL;
			goto done;
		}
		auio.uio_resid += iov->iov_len;
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = auio;
	}
#endif
	cnt = auio.uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();	/* let the buffer cache throttle writers */
	if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) {
		/* a partially completed transfer is reported as success */
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* POSIX: a broken pipe also delivers SIGPIPE */
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
	cnt -= auio.uio_resid;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = cnt;
			ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktruio,
			    error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	uap->sysmsg_result = cnt;
done:
	fdrop(fp, td);
	if (needfree)
		FREE(needfree, M_IOV);
	return (error);
}
499 
/*
 * Ioctl system call.
 *
 * Copies in up to IOCPARM_MAX bytes of argument data (size and
 * direction are encoded in the high bits of the command word),
 * dispatches the command to the file's ioctl backend, and copies any
 * output data back to userspace.  FIONCLEX/FIOCLEX and the f_flag
 * bookkeeping for FIONBIO/FIOASYNC are handled here directly.
 */
/* ARGSUSED */
int
ioctl(struct ioctl_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* small-argument fast path: stack buffer, long-aligned via union */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	KKASSERT(p);
	fdp = p->p_fd;
	if ((u_int)uap->fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
		return (EBADF);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
		return (EBADF);

	/* close-on-exec flag changes need no backend involvement */
	switch (com = uap->com) {
	case FIONCLEX:
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		return (0);
	case FIOCLEX:
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return (ENOTTY);

	/* hold fp across the backend call; every exit below must fdrop */
	fhold(fp);

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				return (error);
			}
		} else {
			/* zero-size IOC_IN: the pointer itself is the arg */
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		/* keep f_flag's FNONBLOCK in sync with the backend */
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
		break;

	case FIOASYNC:
		/* keep f_flag's FASYNC in sync with the backend */
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
	return (error);
}
612 
613 static int	nselcoll;	/* Select collisions since boot */
614 int	selwait;
615 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
616 
/*
 * Select system call.
 *
 * Copies in up to three descriptor sets, scans them via selscan(), and
 * sleeps on the global `selwait` channel until an event, timeout, or
 * signal.  The nselcoll counter / P_SELECT flag pair detects a wakeup
 * racing with the scan, forcing a rescan instead of losing the event.
 */
int
select(struct select_args *uap)
{
	struct proc *p = curproc;

	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int s, ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	if (uap->nd > p->p_fd->fd_nfiles)
		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  The input halves live in the upper half of the
	 * buffer, the (zeroed) output halves in the lower half.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done;				\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* convert a relative timeout to an absolute uptime deadline */
	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	/* snapshot the collision counter before scanning */
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = selscan(p, ibits, obits, uap->nd, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* clamp a single sleep to 24 hours worth of ticks */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	s = splhigh();
	/* a wakeup raced with the scan above: rescan instead of sleeping */
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;

	error = tsleep((caddr_t)&selwait, PCATCH, "select", timo);

	splx(s);
	if (error == 0)
		goto retry;
done:
	p->p_flag &= ~P_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	/* copy the result sets back to the user only on success */
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);
	return (error);
}
751 
/*
 * Scan the three descriptor bit sets for ready descriptors.
 *
 * ibits/obits each hold up to three fd_mask vectors (read, write,
 * except); a NULL entry means the caller did not supply that set.  For
 * every bit set in ibits, poll the descriptor and set the matching bit
 * in obits when ready.  *res receives the ready count.  Returns EBADF
 * when a set bit names a closed descriptor.
 */
static int
selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res)
{
	struct thread *td = p->p_thread;
	struct filedesc *fdp = p->p_fd;
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = fdp->fd_ofiles[fd];
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	*res = n;
	return (0);
}
787 
/*
 * Poll system call.
 *
 * Copies in the pollfd array, scans it via pollscan(), and sleeps on
 * the global `selwait` channel until an event, timeout, or signal,
 * using the same nselcoll / P_SELECT race-detection scheme as select().
 */
int
poll(struct poll_args *uap)
{
	caddr_t bits;
	/* in-stack fast path for small pollfd arrays */
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int s, ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct proc *p = curproc;

	nfds = SCARG(uap, nfds);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done;
	/* convert a relative millisecond timeout to an uptime deadline */
	if (SCARG(uap, timeout) != INFTIM) {
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	/* snapshot the collision counter before scanning */
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = pollscan(p, (struct pollfd *)bits, nfds, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* clamp a single sleep to 24 hours worth of ticks */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	s = splhigh();
	/* a wakeup raced with the scan above: rescan instead of sleeping */
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo);
	splx(s);
	if (error == 0)
		goto retry;
done:
	p->p_flag &= ~P_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		/* copy revents back to the user's pollfd array */
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;	/* redundant; falls through anyway */
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}
876 
877 static int
878 pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res)
879 {
880 	struct thread *td = p->p_thread;
881 	struct filedesc *fdp = p->p_fd;
882 	int i;
883 	struct file *fp;
884 	int n = 0;
885 
886 	for (i = 0; i < nfd; i++, fds++) {
887 		if (fds->fd >= fdp->fd_nfiles) {
888 			fds->revents = POLLNVAL;
889 			n++;
890 		} else if (fds->fd < 0) {
891 			fds->revents = 0;
892 		} else {
893 			fp = fdp->fd_ofiles[fds->fd];
894 			if (fp == NULL) {
895 				fds->revents = POLLNVAL;
896 				n++;
897 			} else {
898 				/*
899 				 * Note: backend also returns POLLHUP and
900 				 * POLLERR if appropriate.
901 				 */
902 				fds->revents = fo_poll(fp, fds->events,
903 				    fp->f_cred, td);
904 				if (fds->revents != 0)
905 					n++;
906 			}
907 		}
908 	}
909 	*res = n;
910 	return (0);
911 }
912 
/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops;
 * here the argument structure is forwarded directly to the native poll().
 */
int
openbsd_poll(struct openbsd_poll_args *uap)
{
	/* argument structures are layout-compatible */
	return (poll((struct poll_args *)uap));
}
922 
923 /*ARGSUSED*/
924 int
925 seltrue(dev_t dev, int events, struct thread *td)
926 {
927 	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
928 }
929 
/*
 * Record a select request.  A global wait must be used since a process/thread
 * might go away after recording its request.
 *
 * A selinfo remembers only a single pid.  When a second process selects
 * the same object while the recorded one is still waiting, mark the
 * selinfo SI_COLL so selwakeup() broadcasts on the global channel
 * instead of waking just the recorded process.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct proc *p;
	pid_t mypid;

	if ((p = selector->td_proc) == NULL)
		panic("selrecord: thread needs a process");

	mypid = p->p_pid;
	if (sip->si_pid == mypid)
		return;		/* already recorded for this process */
	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
	    p->p_wchan == (caddr_t)&selwait) {
		/* another process is already waiting: note the collision */
		sip->si_flags |= SI_COLL;
	} else {
		sip->si_pid = mypid;
	}
}
953 
/*
 * Do a wakeup when a selectable event occurs.
 *
 * On a collision (more than one selector recorded) broadcast on the
 * global selwait channel; otherwise wake only the recorded process, or
 * clear its P_SELECT flag so an in-progress select()/poll() scan will
 * rescan instead of sleeping.
 */
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;
	int s;

	if (sip->si_pid == 0)
		return;		/* nobody recorded */
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);	/* YYY fixable */
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p != NULL) {
		s = splhigh();
		if (p->p_wchan == (caddr_t)&selwait) {
			/* sleeping in select/poll: make it runnable */
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p->p_thread);
		} else if (p->p_flag & P_SELECT)
			p->p_flag &= ~P_SELECT;
		splx(s);
	}
}
984 
985