xref: /dragonfly/sys/kern/sys_generic.c (revision 71126e33)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
39  * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
40  * $DragonFly: src/sys/kern/sys_generic.c,v 1.18 2004/09/13 23:41:18 drhodus Exp $
41  */
42 
43 #include "opt_ktrace.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/sysproto.h>
48 #include <sys/filedesc.h>
49 #include <sys/filio.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/proc.h>
53 #include <sys/signalvar.h>
54 #include <sys/socketvar.h>
55 #include <sys/uio.h>
56 #include <sys/kernel.h>
57 #include <sys/kern_syscall.h>
58 #include <sys/malloc.h>
59 #include <sys/mapped_ioctl.h>
60 #include <sys/poll.h>
61 #include <sys/queue.h>
62 #include <sys/resourcevar.h>
63 #include <sys/sysctl.h>
64 #include <sys/sysent.h>
65 #include <sys/buf.h>
66 #ifdef KTRACE
67 #include <sys/ktrace.h>
68 #endif
69 #include <vm/vm.h>
70 #include <vm/vm_page.h>
71 #include <sys/file2.h>
72 
73 #include <machine/limits.h>
74 
75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
76 static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
78 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
79 
80 static int	pollscan (struct proc *, struct pollfd *, u_int, int *);
81 static int	selscan (struct proc *, fd_mask **, fd_mask **,
82 			int, int *);
83 
84 struct file*
85 holdfp(fdp, fd, flag)
86 	struct filedesc* fdp;
87 	int fd, flag;
88 {
89 	struct file* fp;
90 
91 	if (((u_int)fd) >= fdp->fd_nfiles ||
92 	    (fp = fdp->fd_ofiles[fd]) == NULL ||
93 	    (fp->f_flag & flag) == 0) {
94 		return (NULL);
95 	}
96 	fhold(fp);
97 	return (fp);
98 }
99 
100 /*
101  * Read system call.
102  */
103 int
104 read(struct read_args *uap)
105 {
106 	struct thread *td = curthread;
107 	struct uio auio;
108 	struct iovec aiov;
109 	int error;
110 
111 	aiov.iov_base = uap->buf;
112 	aiov.iov_len = uap->nbyte;
113 	auio.uio_iov = &aiov;
114 	auio.uio_iovcnt = 1;
115 	auio.uio_offset = -1;
116 	auio.uio_resid = uap->nbyte;
117 	auio.uio_rw = UIO_READ;
118 	auio.uio_segflg = UIO_USERSPACE;
119 	auio.uio_td = td;
120 
121 	error = kern_readv(uap->fd, &auio, 0, &uap->sysmsg_result);
122 
123 	return(error);
124 }
125 
126 /*
127  * Pread system call
128  */
129 int
130 pread(struct pread_args *uap)
131 {
132 	struct thread *td = curthread;
133 	struct uio auio;
134 	struct iovec aiov;
135 	int error;
136 
137 	aiov.iov_base = uap->buf;
138 	aiov.iov_len = uap->nbyte;
139 	auio.uio_iov = &aiov;
140 	auio.uio_iovcnt = 1;
141 	auio.uio_offset = uap->offset;
142 	auio.uio_resid = uap->nbyte;
143 	auio.uio_rw = UIO_READ;
144 	auio.uio_segflg = UIO_USERSPACE;
145 	auio.uio_td = td;
146 
147 	error = kern_readv(uap->fd, &auio, FOF_OFFSET, &uap->sysmsg_result);
148 
149 	return(error);
150 }
151 
152 int
153 readv(struct readv_args *uap)
154 {
155 	struct thread *td = curthread;
156 	struct uio auio;
157 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
158 	int error;
159 
160 	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
161 	    &auio.uio_resid);
162 	if (error)
163 		return (error);
164 	auio.uio_iov = iov;
165 	auio.uio_iovcnt = uap->iovcnt;
166 	auio.uio_offset = -1;
167 	auio.uio_rw = UIO_READ;
168 	auio.uio_segflg = UIO_USERSPACE;
169 	auio.uio_td = td;
170 
171 	error = kern_readv(uap->fd, &auio, 0, &uap->sysmsg_result);
172 
173 	iovec_free(&iov, aiov);
174 	return (error);
175 }
176 
177 int
178 kern_readv(int fd, struct uio *auio, int flags, int *res)
179 {
180 	struct thread *td = curthread;
181 	struct proc *p = td->td_proc;
182 	struct file *fp;
183 	struct filedesc *fdp = p->p_fd;
184 	int len, error;
185 #ifdef KTRACE
186 	struct iovec *ktriov = NULL;
187 	struct uio ktruio;
188 #endif
189 
190 	KKASSERT(p);
191 
192 	fp = holdfp(fdp, fd, FREAD);
193 	if (fp == NULL)
194 		return (EBADF);
195 	if (flags & FOF_OFFSET && fp->f_type != DTYPE_VNODE) {
196 		error = ESPIPE;
197 		goto done;
198 	}
199 	if (auio->uio_resid < 0) {
200 		error = EINVAL;
201 		goto done;
202 	}
203 #ifdef KTRACE
204 	/*
205 	 * if tracing, save a copy of iovec
206 	 */
207 	if (KTRPOINT(td, KTR_GENIO))  {
208 		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);
209 
210 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
211 		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
212 		ktruio = *auio;
213 	}
214 #endif
215 	len = auio->uio_resid;
216 	error = fo_read(fp, auio, fp->f_cred, flags, td);
217 	if (error) {
218 		if (auio->uio_resid != len && (error == ERESTART ||
219 		    error == EINTR || error == EWOULDBLOCK))
220 			error = 0;
221 	}
222 #ifdef KTRACE
223 	if (ktriov != NULL) {
224 		if (error == 0) {
225 			ktruio.uio_iov = ktriov;
226 			ktruio.uio_resid = len - auio->uio_resid;
227 			ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
228 		}
229 		FREE(ktriov, M_TEMP);
230 	}
231 #endif
232 	if (error == 0)
233 		*res = len - auio->uio_resid;
234 done:
235 	fdrop(fp, td);
236 	return (error);
237 }
238 
239 /*
240  * Write system call
241  */
242 int
243 write(struct write_args *uap)
244 {
245 	struct thread *td = curthread;
246 	struct uio auio;
247 	struct iovec aiov;
248 	int error;
249 
250 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
251 	aiov.iov_len = uap->nbyte;
252 	auio.uio_iov = &aiov;
253 	auio.uio_iovcnt = 1;
254 	auio.uio_offset = -1;
255 	auio.uio_resid = uap->nbyte;
256 	auio.uio_rw = UIO_WRITE;
257 	auio.uio_segflg = UIO_USERSPACE;
258 	auio.uio_td = td;
259 
260 	error = kern_writev(uap->fd, &auio, 0, &uap->sysmsg_result);
261 
262 	return(error);
263 }
264 
265 /*
266  * Pwrite system call
267  */
268 int
269 pwrite(struct pwrite_args *uap)
270 {
271 	struct thread *td = curthread;
272 	struct uio auio;
273 	struct iovec aiov;
274 	int error;
275 
276 	aiov.iov_base = (void *)(uintptr_t)uap->buf;
277 	aiov.iov_len = uap->nbyte;
278 	auio.uio_iov = &aiov;
279 	auio.uio_iovcnt = 1;
280 	auio.uio_offset = uap->offset;
281 	auio.uio_resid = uap->nbyte;
282 	auio.uio_rw = UIO_WRITE;
283 	auio.uio_segflg = UIO_USERSPACE;
284 	auio.uio_td = td;
285 
286 	error = kern_writev(uap->fd, &auio, FOF_OFFSET, &uap->sysmsg_result);
287 
288 	return(error);
289 }
290 
291 int
292 writev(struct writev_args *uap)
293 {
294 	struct thread *td = curthread;
295 	struct uio auio;
296 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
297 	int error;
298 
299 	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
300 	    &auio.uio_resid);
301 	if (error)
302 		return (error);
303 	auio.uio_iov = iov;
304 	auio.uio_iovcnt = uap->iovcnt;
305 	auio.uio_offset = -1;
306 	auio.uio_rw = UIO_WRITE;
307 	auio.uio_segflg = UIO_USERSPACE;
308 	auio.uio_td = td;
309 
310 	error = kern_writev(uap->fd, &auio, 0, &uap->sysmsg_result);
311 
312 	iovec_free(&iov, aiov);
313 	return (error);
314 }
315 
316 /*
317  * Gather write system call
318  */
319 int
320 kern_writev(int fd, struct uio *auio, int flags, int *res)
321 {
322 	struct thread *td = curthread;
323 	struct proc *p = td->td_proc;
324 	struct file *fp;
325 	struct filedesc *fdp = p->p_fd;
326 	long len, error;
327 #ifdef KTRACE
328 	struct iovec *ktriov = NULL;
329 	struct uio ktruio;
330 #endif
331 
332 	KKASSERT(p);
333 
334 	fp = holdfp(fdp, fd, FWRITE);
335 	if (fp == NULL)
336 		return (EBADF);
337 	if ((flags & FOF_OFFSET) && fp->f_type != DTYPE_VNODE) {
338 		error = ESPIPE;
339 		goto done;
340 	}
341 	if (auio->uio_resid < 0) {
342 		error = EINVAL;
343 		goto done;
344 	}
345 #ifdef KTRACE
346 	/*
347 	 * if tracing, save a copy of iovec and uio
348 	 */
349 	if (KTRPOINT(td, KTR_GENIO))  {
350 		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);
351 
352 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
353 		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
354 		ktruio = *auio;
355 	}
356 #endif
357 	len = auio->uio_resid;
358 	if (fp->f_type == DTYPE_VNODE)
359 		bwillwrite();
360 	error = fo_write(fp, auio, fp->f_cred, flags, td);
361 	if (error) {
362 		if (auio->uio_resid != len && (error == ERESTART ||
363 		    error == EINTR || error == EWOULDBLOCK))
364 			error = 0;
365 		if (error == EPIPE)
366 			psignal(p, SIGPIPE);
367 	}
368 #ifdef KTRACE
369 	if (ktriov != NULL) {
370 		if (error == 0) {
371 			ktruio.uio_iov = ktriov;
372 			ktruio.uio_resid = len - auio->uio_resid;
373 			ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
374 		}
375 		FREE(ktriov, M_TEMP);
376 	}
377 #endif
378 	if (error == 0)
379 		*res = len - auio->uio_resid;
380 done:
381 	fdrop(fp, td);
382 	return (error);
383 }
384 
385 /*
386  * Ioctl system call
387  */
388 /* ARGSUSED */
389 int
390 ioctl(struct ioctl_args *uap)
391 {
392 	return(mapped_ioctl(uap->fd, uap->com, uap->data, NULL));
393 }
394 
/*
 * One registered handler on an ioctl_map's "mapping" list.  Entries are
 * created by mapped_ioctl_register_handler() and searched by
 * mapped_ioctl().
 */
struct ioctl_map_entry {
	/* Subsystem name, copied from the registering handler. */
	const char *subsys;
	/*
	 * Array of command ranges.  mapped_ioctl() walks it until it
	 * reaches an element whose start, maptocmd and func are all zero.
	 */
	struct ioctl_map_range *cmd_ranges;
	/* Linkage on the owning map's list. */
	LIST_ENTRY(ioctl_map_entry) entries;
};
400 
401 int
402 mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map)
403 {
404 	struct thread *td = curthread;
405 	struct proc *p = td->td_proc;
406 	struct file *fp;
407 	struct filedesc *fdp;
408 	struct ioctl_map_range *iomc = NULL;
409 	int error;
410 	u_int size;
411 	u_long ocom = com;
412 	caddr_t data, memp;
413 	int tmp;
414 #define STK_PARAMS	128
415 	union {
416 	    char stkbuf[STK_PARAMS];
417 	    long align;
418 	} ubuf;
419 
420 	KKASSERT(p);
421 	fdp = p->p_fd;
422 	if ((u_int)fd >= fdp->fd_nfiles ||
423 	    (fp = fdp->fd_ofiles[fd]) == NULL)
424 		return(EBADF);
425 
426 	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
427 		return(EBADF);
428 
429 	if (map != NULL) {	/* obey translation map */
430 		u_long maskcmd;
431 		struct ioctl_map_entry *e;
432 
433 		maskcmd = com & map->mask;
434 
435 		LIST_FOREACH(e, &map->mapping, entries) {
436 			for (iomc = e->cmd_ranges; iomc->start != 0 ||
437 			     iomc->maptocmd != 0 || iomc->func != NULL;
438 			     iomc++) {
439 				if (maskcmd >= iomc->start &&
440 				    maskcmd <= iomc->end)
441 					break;
442 			}
443 
444 			/* Did we find a match? */
445 			if (iomc->start != 0 || iomc->maptocmd != 0 ||
446 			    iomc->func != NULL)
447 				break;
448 		}
449 
450 		if (iomc == NULL ||
451 		    (iomc->start == 0 && iomc->maptocmd == 0
452 		     && iomc->func == NULL)) {
453 			printf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
454 			       map->sys, fd, maskcmd,
455 			       (int)((maskcmd >> 8) & 0xff),
456 			       (int)(maskcmd & 0xff));
457 			return(EINVAL);
458 		}
459 
460 		com = iomc->maptocmd;
461 	}
462 
463 	switch (com) {
464 	case FIONCLEX:
465 		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
466 		return(0);
467 	case FIOCLEX:
468 		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
469 		return(0);
470 	}
471 
472 	/*
473 	 * Interpret high order word to find amount of data to be
474 	 * copied to/from the user's address space.
475 	 */
476 	size = IOCPARM_LEN(com);
477 	if (size > IOCPARM_MAX)
478 		return(ENOTTY);
479 
480 	fhold(fp);
481 
482 	memp = NULL;
483 	if (size > sizeof (ubuf.stkbuf)) {
484 		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
485 		data = memp;
486 	} else {
487 		data = ubuf.stkbuf;
488 	}
489 	if ((com & IOC_IN) != 0) {
490 		if (size != 0) {
491 			error = copyin(uspc_data, data, (u_int)size);
492 			if (error) {
493 				if (memp != NULL)
494 					free(memp, M_IOCTLOPS);
495 				fdrop(fp, td);
496 				return(error);
497 			}
498 		} else {
499 			*(caddr_t *)data = uspc_data;
500 		}
501 	} else if ((com & IOC_OUT) != 0 && size) {
502 		/*
503 		 * Zero the buffer so the user always
504 		 * gets back something deterministic.
505 		 */
506 		bzero(data, size);
507 	} else if ((com & IOC_VOID) != 0) {
508 		*(caddr_t *)data = uspc_data;
509 	}
510 
511 	switch (com) {
512 
513 	case FIONBIO:
514 		if ((tmp = *(int *)data))
515 			fp->f_flag |= FNONBLOCK;
516 		else
517 			fp->f_flag &= ~FNONBLOCK;
518 		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
519 		break;
520 
521 	case FIOASYNC:
522 		if ((tmp = *(int *)data))
523 			fp->f_flag |= FASYNC;
524 		else
525 			fp->f_flag &= ~FASYNC;
526 		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
527 		break;
528 
529 	default:
530 		/*
531 		 *  If there is a override function,
532 		 *  call it instead of directly routing the call
533 		 */
534 		if (map != NULL && iomc->func != NULL)
535 			error = iomc->func(fp, com, ocom, data, td);
536 		else
537 			error = fo_ioctl(fp, com, data, td);
538 		/*
539 		 * Copy any data to user, size was
540 		 * already set and checked above.
541 		 */
542 		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
543 			error = copyout(data, uspc_data, (u_int)size);
544 		break;
545 	}
546 	if (memp != NULL)
547 		free(memp, M_IOCTLOPS);
548 	fdrop(fp, td);
549 	return(error);
550 }
551 
552 int
553 mapped_ioctl_register_handler(struct ioctl_map_handler *he)
554 {
555 	struct ioctl_map_entry *ne;
556 
557 	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
558 		 he->subsys != NULL && *he->subsys != '\0');
559 
560 	ne = malloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK);
561 
562 	ne->subsys = he->subsys;
563 	ne->cmd_ranges = he->cmd_ranges;
564 
565 	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);
566 
567 	return(0);
568 }
569 
570 int
571 mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
572 {
573 	struct ioctl_map_entry *ne;
574 
575 	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);
576 
577 	LIST_FOREACH(ne, &he->map->mapping, entries) {
578 		if (ne->cmd_ranges != he->cmd_ranges)
579 			continue;
580 		LIST_REMOVE(ne, entries);
581 		free(ne, M_IOCTLMAP);
582 		return(0);
583 	}
584 	return(EINVAL);
585 }
586 
/*
 * nselcoll is incremented by selwakeup() whenever it finds SI_COLL set;
 * select() and poll() snapshot it before scanning and rescan if it has
 * changed by the time they are about to sleep.
 */
static int	nselcoll;	/* Select collisions since boot */
/*
 * Only the address of selwait is used: it is the wait channel that
 * select() and poll() tsleep() on and that selwakeup() wakes.
 */
int	selwait;
/* Read-only export of the collision counter under the _kern node. */
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
590 
/*
 * Select system call.
 *
 * Copies in the non-NULL descriptor sets among uap->in, uap->ou and
 * uap->ex, scans them with selscan() and, while nothing is ready and
 * the optional timeout has not passed, sleeps on &selwait and scans
 * again.  The number of ready descriptors is stored in
 * uap->sysmsg_result by selscan(); the result sets are copied back to
 * the user only when the call succeeds.
 *
 * Returns 0 or an errno value: EINVAL for a negative uap->nd or a
 * timeout rejected by itimerfix(), a copyin()/copyout() error, an
 * error from selscan(), or EINTR (ERESTART is converted to EINTR).
 */
int
select(struct select_args *uap)
{
	struct proc *p = curproc;

	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int s, ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	/* Descriptors beyond the table size are silently ignored. */
	if (uap->nd > p->p_fd->fd_nfiles)
		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 *
	 * Each non-null set needs two copies of ncpbytes: one for the
	 * input bits and one for the output bits.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 *
	 * Layout: the first half of selbits holds the output sets, the
	 * second half the matching input sets.  A NULL user set leaves
	 * ibits[x] NULL and obits[x] unassigned.  A failed copyin()
	 * jumps straight to "done".
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done;				\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	/* Clear all output sets (the first half of the buffer) at once. */
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/*
	 * With a timeout, atv becomes the deadline: the requested interval
	 * added to the time read with getmicrouptime().  Without one, atv
	 * is zeroed, which the loop below treats as "no deadline".
	 */
	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	/*
	 * Snapshot the collision counter and mark the process as scanning
	 * before looking at the descriptors; both are re-checked below
	 * before sleeping.
	 */
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = selscan(p, ibits, obits, uap->nd, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		/* Deadline reached: return with nothing ready. */
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Sleep for the remaining time, capped at 24 hours. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	s = splhigh();
	/*
	 * selwakeup() clears P_SELECT when it finds the process not yet
	 * sleeping on selwait, and bumps nselcoll on a collision.  Either
	 * means an event may have been missed since the scan, so scan
	 * again instead of sleeping.
	 */
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;

	error = tsleep((caddr_t)&selwait, PCATCH, "select", timo);

	splx(s);
	/* A zero return from tsleep() means rescan; anything else ends the wait. */
	if (error == 0)
		goto retry;
done:
	p->p_flag &= ~P_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	/* EWOULDBLOCK is reported to the caller as success. */
	if (error == EWOULDBLOCK)
		error = 0;
	/* Copy one output set back; the last failing copyout() wins. */
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	/* Release the bit buffer if it did not fit on the stack. */
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);
	return (error);
}
725 
726 static int
727 selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res)
728 {
729 	struct thread *td = p->p_thread;
730 	struct filedesc *fdp = p->p_fd;
731 	int msk, i, fd;
732 	fd_mask bits;
733 	struct file *fp;
734 	int n = 0;
735 	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
736 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
737 
738 	for (msk = 0; msk < 3; msk++) {
739 		if (ibits[msk] == NULL)
740 			continue;
741 		for (i = 0; i < nfd; i += NFDBITS) {
742 			bits = ibits[msk][i/NFDBITS];
743 			/* ffs(int mask) not portable, fd_mask is long */
744 			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
745 				if (!(bits & 1))
746 					continue;
747 				fp = fdp->fd_ofiles[fd];
748 				if (fp == NULL)
749 					return (EBADF);
750 				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
751 					obits[msk][(fd)/NFDBITS] |=
752 					    ((fd_mask)1 << ((fd) % NFDBITS));
753 					n++;
754 				}
755 			}
756 		}
757 	}
758 	*res = n;
759 	return (0);
760 }
761 
762 /*
763  * Poll system call.
764  */
765 int
766 poll(struct poll_args *uap)
767 {
768 	struct pollfd *bits;
769 	struct pollfd smallbits[32];
770 	struct timeval atv, rtv, ttv;
771 	int s, ncoll, error = 0, timo;
772 	u_int nfds;
773 	size_t ni;
774 	struct proc *p = curproc;
775 
776 	nfds = SCARG(uap, nfds);
777 	/*
778 	 * This is kinda bogus.  We have fd limits, but that is not
779 	 * really related to the size of the pollfd array.  Make sure
780 	 * we let the process use at least FD_SETSIZE entries and at
781 	 * least enough for the current limits.  We want to be reasonably
782 	 * safe, but not overly restrictive.
783 	 */
784 	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
785 		return (EINVAL);
786 	ni = nfds * sizeof(struct pollfd);
787 	if (ni > sizeof(smallbits))
788 		bits = malloc(ni, M_TEMP, M_WAITOK);
789 	else
790 		bits = smallbits;
791 	error = copyin(SCARG(uap, fds), bits, ni);
792 	if (error)
793 		goto done;
794 	if (SCARG(uap, timeout) != INFTIM) {
795 		atv.tv_sec = SCARG(uap, timeout) / 1000;
796 		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
797 		if (itimerfix(&atv)) {
798 			error = EINVAL;
799 			goto done;
800 		}
801 		getmicrouptime(&rtv);
802 		timevaladd(&atv, &rtv);
803 	} else {
804 		atv.tv_sec = 0;
805 		atv.tv_usec = 0;
806 	}
807 	timo = 0;
808 retry:
809 	ncoll = nselcoll;
810 	p->p_flag |= P_SELECT;
811 	error = pollscan(p, bits, nfds, &uap->sysmsg_result);
812 	if (error || uap->sysmsg_result)
813 		goto done;
814 	if (atv.tv_sec || atv.tv_usec) {
815 		getmicrouptime(&rtv);
816 		if (timevalcmp(&rtv, &atv, >=))
817 			goto done;
818 		ttv = atv;
819 		timevalsub(&ttv, &rtv);
820 		timo = ttv.tv_sec > 24 * 60 * 60 ?
821 		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
822 	}
823 	s = splhigh();
824 	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
825 		splx(s);
826 		goto retry;
827 	}
828 	p->p_flag &= ~P_SELECT;
829 	error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo);
830 	splx(s);
831 	if (error == 0)
832 		goto retry;
833 done:
834 	p->p_flag &= ~P_SELECT;
835 	/* poll is not restarted after signals... */
836 	if (error == ERESTART)
837 		error = EINTR;
838 	if (error == EWOULDBLOCK)
839 		error = 0;
840 	if (error == 0) {
841 		error = copyout(bits, SCARG(uap, fds), ni);
842 		if (error)
843 			goto out;
844 	}
845 out:
846 	if (ni > sizeof(smallbits))
847 		free(bits, M_TEMP);
848 	return (error);
849 }
850 
851 static int
852 pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res)
853 {
854 	struct thread *td = p->p_thread;
855 	struct filedesc *fdp = p->p_fd;
856 	int i;
857 	struct file *fp;
858 	int n = 0;
859 
860 	for (i = 0; i < nfd; i++, fds++) {
861 		if (fds->fd >= fdp->fd_nfiles) {
862 			fds->revents = POLLNVAL;
863 			n++;
864 		} else if (fds->fd < 0) {
865 			fds->revents = 0;
866 		} else {
867 			fp = fdp->fd_ofiles[fds->fd];
868 			if (fp == NULL) {
869 				fds->revents = POLLNVAL;
870 				n++;
871 			} else {
872 				/*
873 				 * Note: backend also returns POLLHUP and
874 				 * POLLERR if appropriate.
875 				 */
876 				fds->revents = fo_poll(fp, fds->events,
877 				    fp->f_cred, td);
878 				if (fds->revents != 0)
879 					n++;
880 			}
881 		}
882 	}
883 	*res = n;
884 	return (0);
885 }
886 
/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 *
 * The argument structure is reinterpreted as a struct poll_args and
 * handed to poll() unchanged.
 */
int
openbsd_poll(struct openbsd_poll_args *uap)
{
	struct poll_args *pa = (struct poll_args *)uap;

	return (poll(pa));
}
896 
/*
 * Poll routine that always reports readiness: returns the subset of
 * the requested events made up of POLLIN, POLLOUT, POLLRDNORM and
 * POLLWRNORM.  dev and td are not used.
 */
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, struct thread *td)
{
	const int always_ready = POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM;

	return (events & always_ready);
}
903 
904 /*
905  * Record a select request.  A global wait must be used since a process/thread
906  * might go away after recording its request.
907  */
908 void
909 selrecord(struct thread *selector, struct selinfo *sip)
910 {
911 	struct proc *p;
912 	pid_t mypid;
913 
914 	if ((p = selector->td_proc) == NULL)
915 		panic("selrecord: thread needs a process");
916 
917 	mypid = p->p_pid;
918 	if (sip->si_pid == mypid)
919 		return;
920 	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
921 	    p->p_wchan == (caddr_t)&selwait) {
922 		sip->si_flags |= SI_COLL;
923 	} else {
924 		sip->si_pid = mypid;
925 	}
926 }
927 
928 /*
929  * Do a wakeup when a selectable event occurs.
930  */
931 void
932 selwakeup(struct selinfo *sip)
933 {
934 	struct proc *p;
935 	int s;
936 
937 	if (sip->si_pid == 0)
938 		return;
939 	if (sip->si_flags & SI_COLL) {
940 		nselcoll++;
941 		sip->si_flags &= ~SI_COLL;
942 		wakeup((caddr_t)&selwait);	/* YYY fixable */
943 	}
944 	p = pfind(sip->si_pid);
945 	sip->si_pid = 0;
946 	if (p != NULL) {
947 		s = splhigh();
948 		if (p->p_wchan == (caddr_t)&selwait) {
949 			if (p->p_stat == SSLEEP)
950 				setrunnable(p);
951 			else
952 				unsleep(p->p_thread);
953 		} else if (p->p_flag & P_SELECT)
954 			p->p_flag &= ~P_SELECT;
955 		splx(s);
956 	}
957 }
958 
959