/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
 * $DragonFly: src/sys/kern/sys_generic.c,v 1.16 2004/01/07 11:04:18 dillon Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/buf.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <sys/file2.h>

#include <machine/limits.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollscan (struct proc *, struct pollfd *, u_int, int *);
static int	selscan (struct proc *, fd_mask **, fd_mask **,
			int, int *);

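/*
 * Look up the file backing descriptor fd in the given filedesc and
 * check that it is open for the requested access (FREAD or FWRITE).
 * On success the file is fhold()'d and returned; the caller must
 * release it with fdrop().  NULL is returned for a bad or unsuitable
 * descriptor.
 */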
struct file *
holdfp(struct filedesc *fdp, int fd, int flag)
{
	struct file *fp;

	if (((u_int)fd) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL ||
	    (fp->f_flag & flag) == 0) {
		return (NULL);
	}
	fhold(fp);
	return (fp);
}

/*
 * Read system call.
 */
int
read(struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_readv(uap->fd, &auio, 0, &uap->sysmsg_result);

	return(error);
}

/*
 * Pread system call
 */
int
pread(struct pread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_readv(uap->fd, &auio, FOF_OFFSET, &uap->sysmsg_result);

	return(error);
}

int
readv(struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
	    &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_readv(uap->fd, &auio, 0, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return (error);
}

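/*
 * Common backend for the read, pread and readv (scatter read) system
 * calls.  The uio describes the destination buffers; FOF_OFFSET in
 * flags selects positioned I/O.  On success the number of bytes
 * transferred is returned in *res.
 */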
int
kern_readv(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	int len, error;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	KKASSERT(p);

	fp = holdfp(fdp, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if ((flags & FOF_OFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
		goto done;
	}
	if (auio->uio_resid < 0) {
		error = EINVAL;
		goto done;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags, td);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;
done:
	fdrop(fp, td);
	return (error);
}

/*
 * Write system call
 */
int
write(struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_writev(uap->fd, &auio, 0, &uap->sysmsg_result);

	return(error);
}

/*
 * Pwrite system call
 */
int
pwrite(struct pwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_writev(uap->fd, &auio, FOF_OFFSET, &uap->sysmsg_result);

	return(error);
}

int
writev(struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
	    &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_writev(uap->fd, &auio, 0, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return (error);
}

/*
 * Common backend for the write, pwrite and writev (gather write)
 * system calls.
 */
int
kern_writev(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	long len, error;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

	KKASSERT(p);

	fp = holdfp(fdp, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	if ((flags & FOF_OFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
		goto done;
	}
	if (auio->uio_resid < 0) {
		error = EINVAL;
		goto done;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	error = fo_write(fp, auio, fp->f_cred, flags, td);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			psignal(p, SIGPIPE);
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error);
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;
done:
	fdrop(fp, td);
	return (error);
}

/*
 * Ioctl system call
 */
/* ARGSUSED */
int
ioctl(struct ioctl_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	KKASSERT(p);
	fdp = p->p_fd;
	if ((u_int)uap->fd >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
		return (EBADF);

	if ((fp->f_flag & (FREAD | FWRITE)) == 0)
		return (EBADF);

	switch (com = uap->com) {
	case FIONCLEX:
		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
		return (0);
	case FIOCLEX:
		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
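	/*
	 * Command word encoding (see <sys/ioccom.h>): the low 16 bits
	 * carry the group and command number, IOCPARM_LEN extracts the
	 * parameter size from the next 13 bits, and the top bits hold
	 * the IOC_VOID/IOC_OUT/IOC_IN direction flags tested below.
	 * For example, FIONBIO is _IOW('f', 126, int), so its parameter
	 * length is sizeof(int) and IOC_IN is set.
	 */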
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return (ENOTTY);

	fhold(fp);

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if (com&IOC_IN) {
		if (size) {
			error = copyin(uap->data, data, (u_int)size);
			if (error) {
				if (memp)
					free(memp, M_IOCTLOPS);
				fdrop(fp, td);
				return (error);
			}
		} else {
			*(caddr_t *)data = uap->data;
		}
	} else if ((com&IOC_OUT) && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if (com&IOC_VOID) {
		*(caddr_t *)data = uap->data;
	}

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td);
		break;

	default:
		error = fo_ioctl(fp, com, data, td);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com&IOC_OUT) && size)
			error = copyout(data, uap->data, (u_int)size);
		break;
	}
	if (memp)
		free(memp, M_IOCTLOPS);
	fdrop(fp, td);
	return (error);
}

static int	nselcoll;	/* Select collisions since boot */
int	selwait;
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
int
select(struct select_args *uap)
{
	struct proc *p = curproc;

	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
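	/*
	 * In concrete terms: s_selbits below holds 2048 bits (256 bytes).
	 * Each non-NULL set needs 2 * ncpbytes (an input copy plus the
	 * output bits), so with nd == 1024 a single set needs exactly
	 * 2 * 128 == 256 bytes, which just fits, and with the old
	 * FD_SETSIZE of 256 all three sets need only 3 * 2 * 32 == 192
	 * bytes.  Anything larger falls back to malloc() below.
	 */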
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int s, ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	if (uap->nd > p->p_fd->fd_nfiles)
		uap->nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
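	/*
	 * Resulting layout for each non-NULL set:
	 *
	 *	selbits:               obits[0] obits[1] obits[2]
	 *	selbits + nbufbytes/2: ibits[0] ibits[1] ibits[2]
	 *
	 * The output halves are packed into the lower half of the buffer
	 * so the single bzero() below clears them all; the copied-in user
	 * bits occupy the upper half.
	 */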
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done;				\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
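	/*
	 * Scan/sleep loop.  P_SELECT is set and nselcoll sampled before
	 * each scan; if a selwakeup() occurs while we scan it clears
	 * P_SELECT or bumps nselcoll, and the check under splhigh() below
	 * then forces a rescan instead of a sleep, so the wakeup is not
	 * lost.
	 */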
retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = selscan(p, ibits, obits, uap->nd, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;

	error = tsleep((caddr_t)&selwait, PCATCH, "select", timo);

	splx(s);
	if (error == 0)
		goto retry;
done:
	p->p_flag &= ~P_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);
	return (error);
}

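/*
 * Scan the three descriptor bit arrays for select().  Each descriptor
 * whose input bit is set is polled via fo_poll() with the event class
 * that corresponds to its set (read/write/exception); descriptors
 * reported ready get their output bit set and are counted into *res.
 */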
static int
selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res)
{
	struct thread *td = p->p_thread;
	struct filedesc *fdp = p->p_fd;
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = fdp->fd_ofiles[fd];
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred, td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	*res = n;
	return (0);
}

/*
 * Poll system call.
 */
int
poll(struct poll_args *uap)
{
	caddr_t bits;
	char smallbits[32 * sizeof(struct pollfd)];
	struct timeval atv, rtv, ttv;
	int s, ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct proc *p = curproc;

	nfds = SCARG(uap, nfds);
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(SCARG(uap, fds), bits, ni);
	if (error)
		goto done;
	if (SCARG(uap, timeout) != INFTIM) {
		atv.tv_sec = SCARG(uap, timeout) / 1000;
		atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	p->p_flag |= P_SELECT;
	error = pollscan(p, (struct pollfd *)bits, nfds, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	p->p_flag &= ~P_SELECT;
	error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo);
	splx(s);
	if (error == 0)
		goto retry;
done:
	p->p_flag &= ~P_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, SCARG(uap, fds), ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
	return (error);
}

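/*
 * Scan the pollfd array for poll().  Each valid descriptor is polled
 * via fo_poll() with the caller's requested events and revents is
 * filled in (POLLNVAL for bad descriptors); *res receives the number
 * of descriptors with non-zero revents.
 */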
static int
pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res)
{
	struct thread *td = p->p_thread;
	struct filedesc *fdp = p->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    fp->f_cred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	*res = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
int
openbsd_poll(struct openbsd_poll_args *uap)
{
	return (poll((struct poll_args *)uap));
}

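/*
 * Generic poll routine for devices that are always ready; it simply
 * reports the requested read/write events as satisfied.
 */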
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, struct thread *td)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  A global wait must be used since a process/thread
 * might go away after recording its request.
 */
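/*
 * A selinfo remembers only one waiter (si_pid).  If another process is
 * already recorded and still sleeping on selwait, SI_COLL is set
 * instead and selwakeup() falls back to waking everything sleeping on
 * selwait.
 */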
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct proc *p;
	pid_t mypid;

	if ((p = selector->td_proc) == NULL)
		panic("selrecord: thread needs a process");

	mypid = p->p_pid;
	if (sip->si_pid == mypid)
		return;
	if (sip->si_pid && (p = pfind(sip->si_pid)) &&
	    p->p_wchan == (caddr_t)&selwait) {
		sip->si_flags |= SI_COLL;
	} else {
		sip->si_pid = mypid;
	}
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;
	int s;

	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);	/* YYY fixable */
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p != NULL) {
		s = splhigh();
		if (p->p_wchan == (caddr_t)&selwait) {
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p->p_thread);
		} else if (p->p_flag & P_SELECT)
			p->p_flag &= ~P_SELECT;
		splx(s);
	}
}