xref: /dragonfly/sys/kern/sys_generic.c (revision 1bf4b486)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
40  * $DragonFly: src/sys/kern/sys_generic.c,v 1.22 2005/06/22 01:33:21 dillon Exp $
41  */
42 
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/sysproto.h>
48 #include <sys/filedesc.h>
49 #include <sys/filio.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/proc.h>
53 #include <sys/signalvar.h>
54 #include <sys/socketvar.h>
55 #include <sys/uio.h>
56 #include <sys/kernel.h>
57 #include <sys/kern_syscall.h>
58 #include <sys/malloc.h>
59 #include <sys/mapped_ioctl.h>
60 #include <sys/poll.h>
61 #include <sys/queue.h>
62 #include <sys/resourcevar.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysent.h>
65 #include <sys/buf.h>
66 #ifdef KTRACE
67 #include <sys/ktrace.h>
68 #endif
69 #include <vm/vm.h>
70 #include <vm/vm_page.h>
71 #include <sys/file2.h>
72 
73 #include <machine/limits.h>
74 
/* malloc types for the transient buffers used by ioctl, select and iov */
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

/* forward declarations for the select/poll scan helpers defined below */
static int	pollscan (struct proc *, struct pollfd *, u_int, int *);
static int	selscan (struct proc *, fd_mask **, fd_mask **,
			int, int *);
84 struct file*
85 holdfp(fdp, fd, flag)
86 	struct filedesc* fdp;
87 	int fd, flag;
88 {
89 	struct file* fp;
90 
91 	if (((u_int)fd) >= fdp->fd_nfiles ||
92 	    (fp = fdp->fd_files[fd].fp) == NULL ||
93 	    (fp->f_flag & flag) == 0) {
94 		return (NULL);
95 	}
96 	fhold(fp);
97 	return (fp);
98 }
99 
100 /*
101  * Read system call.
102  */
103 int
104 read(struct read_args *uap)
105 {
106 	struct thread *td = curthread;
107 	struct uio auio;
108 	struct iovec aiov;
109 	int error;
110 
111 	aiov.iov_base = uap->buf;
112 	aiov.iov_len = uap->nbyte;
113 	auio.uio_iov = &aiov;
114 	auio.uio_iovcnt = 1;
115 	auio.uio_offset = -1;
116 	auio.uio_resid = uap->nbyte;
117 	auio.uio_rw = UIO_READ;
118 	auio.uio_segflg = UIO_USERSPACE;
119 	auio.uio_td = td;
120 
121 	error = kern_readv(uap->fd, &auio, 0, &uap->sysmsg_result);
122 
123 	return(error);
124 }
125 
126 /*
127  * Pread system call
128  */
129 int
130 pread(struct pread_args *uap)
131 {
132 	struct thread *td = curthread;
133 	struct uio auio;
134 	struct iovec aiov;
135 	int error;
136 
137 	aiov.iov_base = uap->buf;
138 	aiov.iov_len = uap->nbyte;
139 	auio.uio_iov = &aiov;
140 	auio.uio_iovcnt = 1;
141 	auio.uio_offset = uap->offset;
142 	auio.uio_resid = uap->nbyte;
143 	auio.uio_rw = UIO_READ;
144 	auio.uio_segflg = UIO_USERSPACE;
145 	auio.uio_td = td;
146 
147 	error = kern_readv(uap->fd, &auio, FOF_OFFSET, &uap->sysmsg_result);
148 
149 	return(error);
150 }
151 
152 int
153 readv(struct readv_args *uap)
154 {
155 	struct thread *td = curthread;
156 	struct uio auio;
157 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
158 	int error;
159 
160 	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
161 	    &auio.uio_resid);
162 	if (error)
163 		return (error);
164 	auio.uio_iov = iov;
165 	auio.uio_iovcnt = uap->iovcnt;
166 	auio.uio_offset = -1;
167 	auio.uio_rw = UIO_READ;
168 	auio.uio_segflg = UIO_USERSPACE;
169 	auio.uio_td = td;
170 
171 	error = kern_readv(uap->fd, &auio, 0, &uap->sysmsg_result);
172 
173 	iovec_free(&iov, aiov);
174 	return (error);
175 }
176 
/*
 * Common backend for read(), pread() and readv().
 *
 * Validates the descriptor (must be open with FREAD), dispatches the
 * transfer to the fileops read routine, and on success stores the
 * number of bytes actually transferred in *res.  flags may contain
 * FOF_OFFSET, in which case auio->uio_offset supplies the position;
 * positional I/O is only permitted on vnode-backed files (else ESPIPE).
 */
int
kern_readv(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	int len, error;	/* NOTE(review): kern_writev declares these 'long' */
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	KKASSERT(p);

	fp = holdfp(fdp, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	/* positional reads only make sense on vnode-backed descriptors */
	if (flags & FOF_OFFSET && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
		goto done;
	}
	if (auio->uio_resid < 0) {
		error = EINVAL;
		goto done;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec (fo_read consumes the uio,
	 * so the snapshot is needed for the trace record afterwards)
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags, td);
	if (error) {
		/*
		 * A transfer interrupted by a signal or a blocking
		 * condition after partial progress still succeeds with
		 * a short count.
		 */
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;
done:
	fdrop(fp, td);
	return (error);
}
238 
239 /*
240  * Write system call
241  */
242 int
243 write(struct write_args *uap)
244 {
245 	struct thread *td = curthread;
246 	struct uio auio;
247 	struct iovec aiov;
248 	int error;
249 
250 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
251 	aiov.iov_len = uap->nbyte;
252 	auio.uio_iov = &aiov;
253 	auio.uio_iovcnt = 1;
254 	auio.uio_offset = -1;
255 	auio.uio_resid = uap->nbyte;
256 	auio.uio_rw = UIO_WRITE;
257 	auio.uio_segflg = UIO_USERSPACE;
258 	auio.uio_td = td;
259 
260 	error = kern_writev(uap->fd, &auio, 0, &uap->sysmsg_result);
261 
262 	return(error);
263 }
264 
265 /*
266  * Pwrite system call
267  */
268 int
269 pwrite(struct pwrite_args *uap)
270 {
271 	struct thread *td = curthread;
272 	struct uio auio;
273 	struct iovec aiov;
274 	int error;
275 
276 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
277 	aiov.iov_len = uap->nbyte;
278 	auio.uio_iov = &aiov;
279 	auio.uio_iovcnt = 1;
280 	auio.uio_offset = uap->offset;
281 	auio.uio_resid = uap->nbyte;
282 	auio.uio_rw = UIO_WRITE;
283 	auio.uio_segflg = UIO_USERSPACE;
284 	auio.uio_td = td;
285 
286 	error = kern_writev(uap->fd, &auio, FOF_OFFSET, &uap->sysmsg_result);
287 
288 	return(error);
289 }
290 
291 int
292 writev(struct writev_args *uap)
293 {
294 	struct thread *td = curthread;
295 	struct uio auio;
296 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
297 	int error;
298 
299 	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
300 	    &auio.uio_resid);
301 	if (error)
302 		return (error);
303 	auio.uio_iov = iov;
304 	auio.uio_iovcnt = uap->iovcnt;
305 	auio.uio_offset = -1;
306 	auio.uio_rw = UIO_WRITE;
307 	auio.uio_segflg = UIO_USERSPACE;
308 	auio.uio_td = td;
309 
310 	error = kern_writev(uap->fd, &auio, 0, &uap->sysmsg_result);
311 
312 	iovec_free(&iov, aiov);
313 	return (error);
314 }
315 
/*
 * Gather write system call
 *
 * Common backend for write(), pwrite() and writev().  Validates the
 * descriptor (must be open with FWRITE), dispatches the transfer to
 * the fileops write routine, and on success stores the number of
 * bytes actually transferred in *res.  flags may contain FOF_OFFSET
 * for positional writes (vnode-backed files only, else ESPIPE).
 */
int
kern_writev(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	long len, error;	/* NOTE(review): kern_readv declares these 'int' */
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	KKASSERT(p);

	fp = holdfp(fdp, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	/* positional writes only make sense on vnode-backed descriptors */
	if ((flags & FOF_OFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
		goto done;
	}
	if (auio->uio_resid < 0) {
		error = EINVAL;
		goto done;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio (fo_write consumes
	 * the uio, so the snapshot is needed for the trace afterwards)
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	/* let the buffer cache throttle before a vnode write */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	error = fo_write(fp, auio, fp->f_cred, flags, td);
	if (error) {
		/*
		 * A transfer interrupted by a signal or a blocking
		 * condition after partial progress still succeeds with
		 * a short count; a broken pipe additionally raises
		 * SIGPIPE per POSIX.
		 */
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;
done:
	fdrop(fp, td);
	return (error);
}
384 
/*
 * Ioctl system call
 *
 * Thin wrapper: all work, including translation-map handling, lives in
 * mapped_ioctl().  A NULL map means "no command translation".
 */
/* ARGSUSED */
int
ioctl(struct ioctl_args *uap)
{
	return(mapped_ioctl(uap->fd, uap->com, uap->data, NULL));
}
394 
/*
 * One registered subsystem within an ioctl translation map: the
 * subsystem's name, its array of command ranges (terminated by an
 * all-zero sentinel), and the linkage into the map's mapping list.
 */
struct ioctl_map_entry {
	const char *subsys;			/* identifying name, for diagnostics */
	struct ioctl_map_range *cmd_ranges;	/* sentinel-terminated range table */
	LIST_ENTRY(ioctl_map_entry) entries;	/* linkage in ioctl_map.mapping */
};
400 
/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
 */
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct filedesc *fdp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;	/* pre-translation command, for wrap/map funcs */
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* small argument blocks are staged here instead of malloc'd */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	KKASSERT(p);
	fdp = p->p_fd;
	if ((u_int)fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_files[fd].fp) == NULL)
		return(EBADF);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
		return(EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		/*
		 * Scan each registered subsystem's range table; a table
		 * is terminated by an entry with all fields zero/NULL.
		 */
		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}

		/* no entry matched in any subsystem: unsupported ioctl */
		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0
		     && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			printf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
			       map->sys, fd, maskcmd,
			       (int)((maskcmd >> 8) & 0xff),
			       (int)(maskcmd & 0xff));
			return(EINVAL);
		}

		/*
		 * If it's a non-range one to one mapping, maptocmd should be
		 * correct. If it's a ranged one to one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, com);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				printf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
				       map->sys, fd, maskcmd,
				       (int)((maskcmd >> 8) & 0xff),
				       (int)(maskcmd & 0xff));
				return(EINVAL);
			}
		}
	}

	/*
	 * Descriptor-table commands are handled here directly, without
	 * involving the fileops layer (and without holding fp).
	 */
	switch (com) {
	case FIONCLEX:
		fdp->fd_files[fd].fileflags &= ~UF_EXCLOSE;
		return(0);
	case FIOCLEX:
		fdp->fd_files[fd].fileflags |= UF_EXCLOSE;
		return(0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return(ENOTTY);

	fhold(fp);

	/* stage the argument block: stack buffer if it fits, else malloc */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if ((com & IOC_IN) != 0) {
		if (size != 0) {
			error = copyin(uspc_data, data, (u_int)size);
			if (error) {
				if (memp != NULL)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				return(error);
			}
		} else {
			/* zero size: the user pointer itself is the argument */
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if ((com & IOC_VOID) != 0) {
		*(caddr_t *)data = uspc_data;
	}

	switch (com) {

	case FIONBIO:
		/* mirror the request into f_flag, then notify the backend */
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
		break;

	default:
		/*
		 *  If there is a override function,
		 *  call it instead of directly routing the call
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, td);
		else
			error = fo_ioctl(fp, com, data, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (u_int)size);
		break;
	}
	if (memp != NULL)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
	return(error);
}
583 
584 int
585 mapped_ioctl_register_handler(struct ioctl_map_handler *he)
586 {
587 	struct ioctl_map_entry *ne;
588 
589 	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
590 		 he->subsys != NULL && *he->subsys != '\0');
591 
592 	ne = malloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK);
593 
594 	ne->subsys = he->subsys;
595 	ne->cmd_ranges = he->cmd_ranges;
596 
597 	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);
598 
599 	return(0);
600 }
601 
602 int
603 mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
604 {
605 	struct ioctl_map_entry *ne;
606 
607 	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);
608 
609 	LIST_FOREACH(ne, &he->map->mapping, entries) {
610 		if (ne->cmd_ranges != he->cmd_ranges)
611 			continue;
612 		LIST_REMOVE(ne, entries);
613 		free(ne, M_IOCTLMAP);
614 		return(0);
615 	}
616 	return(EINVAL);
617 }
618 
static int	nselcoll;	/* Select collisions since boot */
int	selwait;		/* global wait channel for all select/poll sleepers */
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
622 
/*
 * Select system call.
 *
 * Copies in up to three descriptor bit sets (and an optional timeout),
 * scans them via selscan(), and sleeps on the global selwait channel
 * until a descriptor is ready, the timeout expires, or a signal
 * arrives.  The ready count is returned in sysmsg_result and the
 * output bit sets are copied back to user space.
 */
int
select(struct select_args *uap)
{
	struct proc *p = curproc;

	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	if (uap->nd > p->p_fd->fd_nfiles)
		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;	/* input copy + output copy */
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  (Output halves occupy the first nbufbytes/2 bytes,
	 * input halves the second.)
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done;				\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		/* convert the relative timeout to an absolute uptime */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		/* zero atv means "no timeout": sleep until woken */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = selscan(p, ibits, obits, uap->nd, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* clamp tick conversion to 24h to avoid overflow */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	/*
	 * Re-check the race window: if P_SELECT was cleared or a
	 * collision occurred while scanning, rescan instead of sleeping.
	 */
	crit_enter();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	p->p_flag &= ~P_SELECT;

	error = tsleep((caddr_t)&selwait, PCATCH, "select", timo);

	crit_exit();
	if (error == 0)
		goto retry;
done:
	p->p_flag &= ~P_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);
	return (error);
}
757 
/*
 * Scan the three descriptor bit sets for ready descriptors.
 *
 * For every bit set in ibits[msk], poll the file with the event flag
 * corresponding to that set (read/write/except); ready descriptors get
 * their bit set in obits[msk].  The ready count is stored in *res.
 * Returns EBADF if a set bit names a closed descriptor, else 0.
 */
static int
selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res)
{
	struct thread *td = p->p_thread;
	struct filedesc *fdp = p->p_fd;
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		/* walk one fd_mask word at a time */
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = fdp->fd_files[fd].fp;
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	*res = n;
	return (0);
}
793 
794 /*
795  * Poll system call.
796  */
797 int
798 poll(struct poll_args *uap)
799 {
800 	struct pollfd *bits;
801 	struct pollfd smallbits[32];
802 	struct timeval atv, rtv, ttv;
803 	int ncoll, error = 0, timo;
804 	u_int nfds;
805 	size_t ni;
806 	struct proc *p = curproc;
807 
808 	nfds = uap->nfds;
809 	/*
810 	 * This is kinda bogus.  We have fd limits, but that is not
811 	 * really related to the size of the pollfd array.  Make sure
812 	 * we let the process use at least FD_SETSIZE entries and at
813 	 * least enough for the current limits.  We want to be reasonably
814 	 * safe, but not overly restrictive.
815 	 */
816 	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
817 		return (EINVAL);
818 	ni = nfds * sizeof(struct pollfd);
819 	if (ni > sizeof(smallbits))
820 		bits = malloc(ni, M_TEMP, M_WAITOK);
821 	else
822 		bits = smallbits;
823 	error = copyin(uap->fds, bits, ni);
824 	if (error)
825 		goto done;
826 	if (uap->timeout != INFTIM) {
827 		atv.tv_sec = uap->timeout / 1000;
828 		atv.tv_usec = (uap->timeout % 1000) * 1000;
829 		if (itimerfix(&atv)) {
830 			error = EINVAL;
831 			goto done;
832 		}
833 		getmicrouptime(&rtv);
834 		timevaladd(&atv, &rtv);
835 	} else {
836 		atv.tv_sec = 0;
837 		atv.tv_usec = 0;
838 	}
839 	timo = 0;
840 retry:
841 	ncoll = nselcoll;
842 	p->p_flag |= P_SELECT;
843 	error = pollscan(p, bits, nfds, &uap->sysmsg_result);
844 	if (error || uap->sysmsg_result)
845 		goto done;
846 	if (atv.tv_sec || atv.tv_usec) {
847 		getmicrouptime(&rtv);
848 		if (timevalcmp(&rtv, &atv, >=))
849 			goto done;
850 		ttv = atv;
851 		timevalsub(&ttv, &rtv);
852 		timo = ttv.tv_sec > 24 * 60 * 60 ?
853 		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
854 	}
855 	crit_enter();
856 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
857 		crit_exit();
858 		goto retry;
859 	}
860 	p->p_flag &= ~P_SELECT;
861 	error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo);
862 	crit_exit();
863 	if (error == 0)
864 		goto retry;
865 done:
866 	p->p_flag &= ~P_SELECT;
867 	/* poll is not restarted after signals... */
868 	if (error == ERESTART)
869 		error = EINTR;
870 	if (error == EWOULDBLOCK)
871 		error = 0;
872 	if (error == 0) {
873 		error = copyout(bits, uap->fds, ni);
874 		if (error)
875 			goto out;
876 	}
877 out:
878 	if (ni > sizeof(smallbits))
879 		free(bits, M_TEMP);
880 	return (error);
881 }
882 
/*
 * Scan a pollfd array and fill in each entry's revents field.
 *
 * Negative fds are ignored (revents = 0); out-of-range or closed
 * descriptors yield POLLNVAL and count as "ready".  *res receives the
 * number of entries with non-zero revents.  Always returns 0.
 */
static int
pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res)
{
	struct thread *td = p->p_thread;
	struct filedesc *fdp = p->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_files[fds->fd].fp;
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	*res = n;
	return (0);
}
918 
/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 *
 * The argument structures are assumed layout-compatible, so the native
 * poll() handler is invoked directly on the casted args.
 */
int
openbsd_poll(struct openbsd_poll_args *uap)
{
	return (poll((struct poll_args *)uap));
}
928 
/*ARGSUSED*/
/*
 * Degenerate poll backend for objects that are always ready for
 * normal, non-priority I/O: echo back the requested read/write
 * readiness events.  dev and td are unused.
 */
int
seltrue(dev_t dev, int events, struct thread *td)
{
	int always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & always_ready);
}
935 
/*
 * Record a select request.  A global wait must be used since a process/thread
 * might go away after recording its request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct proc *p;
	pid_t mypid;

	if ((p = selector->td_proc) == NULL)
		panic("selrecord: thread needs a process");

	mypid = p->p_pid;
	/* we are already the recorded waiter: nothing to do */
	if (sip->si_pid == mypid)
		return;
	/*
	 * If another process is already recorded and is currently
	 * sleeping on the selwait channel, mark a collision so
	 * selwakeup() broadcasts instead of waking one pid.
	 * NOTE: 'p' is reused here to hold the pfind() result.
	 */
	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
	    p->p_wchan == (caddr_t)&selwait) {
		sip->si_flags |= SI_COLL;
	} else {
		sip->si_pid = mypid;
	}
}
959 
/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;

	/* no waiter ever recorded */
	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		/* multiple waiters collided: wake the whole channel */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);	/* YYY fixable */
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p != NULL) {
		crit_enter();
		if (p->p_wchan == (caddr_t)&selwait) {
			/* wake only the single recorded waiter */
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p->p_thread);
		} else if (p->p_flag & P_SELECT)
			/* not asleep yet: clearing P_SELECT forces a rescan */
			p->p_flag &= ~P_SELECT;
		crit_exit();
	}
}
989 
990