/* xref: /freebsd/sys/kern/sys_generic.c (revision 7bd6fde3) */
/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int	pollscan(struct thread *, struct pollfd *, u_int);
static int	selscan(struct thread *, fd_mask **, fd_mask **, int);
static int	dofileread(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static int	dofilewrite(struct thread *, int, struct file *, struct uio *,
		    off_t, int);
static void	doselwakeup(struct selinfo *, int);

/*
 * Read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};
#endif
/*
 * MPSAFE
 */
int
read(td, uap)
	struct thread *td;
	struct read_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_readv(td, uap->fd, &auio);
	return(error);
}

/*
 * Positioned read system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pread(td, uap)
	struct thread *td;
	struct pread_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_preadv(td, uap->fd, &auio, uap->offset);
	return(error);
}

/*
 * Scatter read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
readv(struct thread *td, struct readv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_readv(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Scatter positioned read system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
preadv(struct thread *td, struct preadv_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_preadv(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_preadv(td, fd, auio, offset)
	struct thread *td;
	int fd;
	struct uio *auio;
	off_t offset;
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &fp);
	if (error)
		return (error);
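	/*
	 * Positioned reads require a seekable backing object, and a
	 * negative offset is rejected unless the vnode is a character
	 * device, which may give such offsets a meaning of its own.
	 */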
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 */
static int
dofileread(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	/* Finish zero length reads right here */
	if (auio->uio_resid == 0) {
		td->td_retval[0] = 0;
		return(0);
	}
	auio->uio_rw = UIO_READ;
	auio->uio_offset = offset;
	auio->uio_td = td;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
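	/*
	 * If the read is interrupted or would block after some data has
	 * already been transferred, report the partial transfer as a
	 * short read rather than an error.
	 */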
	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_READ, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Write system call
 */
#ifndef _SYS_SYSPROTO_H_
struct write_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
};
#endif
/*
 * MPSAFE
 */
int
write(td, uap)
	struct thread *td;
	struct write_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_writev(td, uap->fd, &auio);
	return(error);
}

/*
 * Positioned write system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
	int	fd;
	const void *buf;
	size_t	nbyte;
	int	pad;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pwrite(td, uap)
	struct thread *td;
	struct pwrite_args *uap;
{
	struct uio auio;
	struct iovec aiov;
	int error;

	if (uap->nbyte > INT_MAX)
		return (EINVAL);
	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = uap->nbyte;
	auio.uio_segflg = UIO_USERSPACE;
	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
	return(error);
}

/*
 * Gather write system call
 */
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
};
#endif
/*
 * MPSAFE
 */
int
writev(struct thread *td, struct writev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_writev(td, uap->fd, auio);
	free(auio, M_IOV);
	return (error);
}

int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &fp);
	if (error)
		return (error);
	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
	fdrop(fp, td);
	return (error);
}

/*
 * Gather positioned write system call
 */
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
	int	fd;
	struct	iovec *iovp;
	u_int	iovcnt;
	off_t	offset;
};
#endif
/*
 * MPSAFE
 */
int
pwritev(struct thread *td, struct pwritev_args *uap)
{
	struct uio *auio;
	int error;

	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
	if (error)
		return (error);
	error = kern_pwritev(td, uap->fd, auio, uap->offset);
	free(auio, M_IOV);
	return (error);
}

int
kern_pwritev(td, fd, auio, offset)
	struct thread *td;
	struct uio *auio;
	int fd;
	off_t offset;
{
	struct file *fp;
	int error;

	error = fget_write(td, fd, &fp);
	if (error)
		return (error);
	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
		error = ESPIPE;
	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
		error = EINVAL;
	else
		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
	fdrop(fp, td);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 */
static int
dofilewrite(td, fd, fp, auio, offset, flags)
	struct thread *td;
	int fd;
	struct file *fp;
	struct uio *auio;
	off_t offset;
	int flags;
{
	ssize_t cnt;
	int error;
#ifdef KTRACE
	struct uio *ktruio = NULL;
#endif

	auio->uio_rw = UIO_WRITE;
	auio->uio_td = td;
	auio->uio_offset = offset;
#ifdef KTRACE
	if (KTRPOINT(td, KTR_GENIO))
		ktruio = cloneuio(auio);
#endif
	cnt = auio->uio_resid;
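	/*
	 * For vnode-backed files, bwillwrite() may block the caller here,
	 * before the write path takes any file system locks, when too many
	 * dirty buffers are already outstanding.
	 */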
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
		if (auio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
			PROC_LOCK(td->td_proc);
			psignal(td->td_proc, SIGPIPE);
			PROC_UNLOCK(td->td_proc);
		}
	}
	cnt -= auio->uio_resid;
#ifdef KTRACE
	if (ktruio != NULL) {
		ktruio->uio_resid = cnt;
		ktrgenio(fd, UIO_WRITE, ktruio, error);
	}
#endif
	td->td_retval[0] = cnt;
	return (error);
}

/*
 * Ioctl system call
 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
	int	fd;
	u_long	com;
	caddr_t	data;
};
#endif
/*
 * MPSAFE
 */
/* ARGSUSED */
int
ioctl(struct thread *td, struct ioctl_args *uap)
{
	u_long com;
	int arg, error;
	u_int size;
	caddr_t data;

	if (uap->com > 0xffffffff) {
		printf(
		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
		uap->com &= 0xffffffff;
	}
	com = uap->com;

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if ((size > IOCPARM_MAX) ||
	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
	    ((com & IOC_OUT) && size == 0) ||
#else
	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
		return (ENOTTY);

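	/*
	 * Stage the argument: commands that move data get a kernel buffer;
	 * IOC_VOID commands with an int-sized argument pass the value
	 * itself in 'arg'; otherwise the handler is given the address of
	 * the user-supplied pointer.
	 */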
	if (size > 0) {
		if (!(com & IOC_VOID))
			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		else {
			/* Integer argument. */
			arg = (intptr_t)uap->data;
			data = (void *)&arg;
			size = 0;
		}
	} else
		data = (void *)&uap->data;
	if (com & IOC_IN) {
		error = copyin(uap->data, data, (u_int)size);
		if (error) {
			if (size > 0)
				free(data, M_IOCTLOPS);
			return (error);
		}
	} else if (com & IOC_OUT) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	}

	error = kern_ioctl(td, uap->fd, com, data);

	if (error == 0 && (com & IOC_OUT))
		error = copyout(data, uap->data, (u_int)size);

	if (size > 0)
		free(data, M_IOCTLOPS);
	return (error);
}

int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
	struct file *fp;
	struct filedesc *fdp;
	int error;
	int tmp;

	if ((error = fget(td, fd, &fp)) != 0)
		return (error);
	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	fdp = td->td_proc->p_fd;
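	/*
	 * A few generic ioctls are handled here.  The close-on-exec flag
	 * lives in the file descriptor table, so FIOCLEX and FIONCLEX never
	 * reach the file object.  FIONBIO and FIOASYNC update the per-file
	 * flags and then still fall through to fo_ioctl() so the object can
	 * adjust its own state.
	 */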
	switch (com) {
	case FIONCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		goto out;
	case FIOCLEX:
		FILEDESC_LOCK_FAST(fdp);
		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
		FILEDESC_UNLOCK_FAST(fdp);
		goto out;
	case FIONBIO:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	case FIOASYNC:
		FILE_LOCK(fp);
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		FILE_UNLOCK(fp);
		data = (void *)&tmp;
		break;
	}

	error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
	fdrop(fp, td);
	return (error);
}

/*
 * sellock and selwait are initialized in selectinit() via SYSINIT.
 */
struct mtx	sellock;
struct cv	selwait;
u_int		nselcoll;	/* Select collisions since boot */
SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct select_args {
	int	nd;
	fd_set	*in, *ou, *ex;
	struct	timeval *tv;
};
#endif
/*
 * MPSAFE
 */
int
select(td, uap)
	register struct thread *td;
	register struct select_args *uap;
{
	struct timeval tv, *tvp;
	int error;

	if (uap->tv != NULL) {
		error = copyin(uap->tv, &tv, sizeof(tv));
		if (error)
			return (error);
		tvp = &tv;
	} else
		tvp = NULL;

	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
}

int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
	struct filedesc *fdp;
	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int error, timo;
	u_int ncoll, nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	fdp = td->td_proc->p_fd;

	FILEDESC_LOCK_FAST(fdp);

	if (nd > td->td_proc->p_fd->fd_nfiles)
		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
	FILEDESC_UNLOCK_FAST(fdp);

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (fd_in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (fd_ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done_nosellock;			\
		}							\
	} while (0)
	getbits(fd_in, 0);
	getbits(fd_ou, 1);
	getbits(fd_ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tvp != NULL) {
		atv = *tvp;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
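	/*
	 * Set TDF_SELECT and sample the collision count before dropping
	 * sellock for the scan.  selwakeup() clears TDF_SELECT or bumps
	 * nselcoll, so a change in either tells us to rescan rather than
	 * sleep through a wakeup that has already happened.
	 */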
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = selscan(td, ibits, obits, nd);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
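		/*
		 * Convert the remaining time to ticks, capped at 24 hours'
		 * worth so the tick count stays well within an int.
		 */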
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}

	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of
	 * collisions and rescan the file descriptors if
	 * necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(fd_in, 0);
		putbits(fd_ou, 1);
		putbits(fd_ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		free(selbits, M_SELECT);

	return (error);
}

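/*
 * Scan the descriptors named in the three input bit masks (read, write,
 * except), poll each one with fo_poll() for the corresponding events, and
 * set the matching bit in the output masks for every descriptor found
 * ready.  The number of ready descriptors is returned in td->td_retval[0].
 */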
static int
selscan(td, ibits, obits, nfd)
	struct thread *td;
	fd_mask **ibits, **obits;
	int nfd;
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
	struct filedesc *fdp = td->td_proc->p_fd;

	FILEDESC_LOCK(fdp);
	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				if ((fp = fget_locked(fdp, fd)) == NULL) {
					FILEDESC_UNLOCK(fdp);
					return (EBADF);
				}
				if (fo_poll(fp, flag[msk], td->td_ucred,
				    td)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * Poll system call.
 */
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
poll(td, uap)
	struct thread *td;
	struct poll_args *uap;
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int error = 0, timo;
	u_int ncoll, nfds;
	size_t ni;

	nfds = uap->nfds;

	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	PROC_LOCK(td->td_proc);
	if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
	    (nfds > FD_SETSIZE)) {
		PROC_UNLOCK(td->td_proc);
		error = EINVAL;
		goto done2;
	}
	PROC_UNLOCK(td->td_proc);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = malloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done_nosellock;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done_nosellock;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	TAILQ_INIT(&td->td_selq);
	mtx_lock(&sellock);
retry:
	ncoll = nselcoll;
	mtx_lock_spin(&sched_lock);
	td->td_flags |= TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

	error = pollscan(td, bits, nfds);
	mtx_lock(&sellock);
	if (error || td->td_retval[0])
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz(&ttv);
	}
	/*
	 * An event of interest may occur while we do not hold
	 * sellock, so check TDF_SELECT and the number of collisions
	 * and rescan the file descriptors if necessary.
	 */
	mtx_lock_spin(&sched_lock);
	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
		mtx_unlock_spin(&sched_lock);
		goto retry;
	}
	mtx_unlock_spin(&sched_lock);

	if (timo > 0)
		error = cv_timedwait_sig(&selwait, &sellock, timo);
	else
		error = cv_wait_sig(&selwait, &sellock);

	if (error == 0)
		goto retry;

done:
	clear_selinfo_list(td);
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	mtx_unlock(&sellock);

done_nosellock:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		free(bits, M_TEMP);
done2:
	return (error);
}

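/*
 * Scan the pollfd array once: query each open descriptor with fo_poll(),
 * mark out-of-range or closed descriptors with POLLNVAL, and return the
 * number of entries with non-zero revents in td->td_retval[0].
 */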
static int
pollscan(td, fds, nfd)
	struct thread *td;
	struct pollfd *fds;
	u_int nfd;
{
	register struct filedesc *fdp = td->td_proc->p_fd;
	int i;
	struct file *fp;
	int n = 0;

	FILEDESC_LOCK(fdp);
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= fdp->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = fdp->fd_ofiles[fds->fd];
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
				    td->td_ucred, td);
				if (fds->revents != 0)
					n++;
			}
		}
	}
	FILEDESC_UNLOCK(fdp);
	td->td_retval[0] = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
	struct pollfd *fds;
	u_int	nfds;
	int	timeout;
};
#endif
/*
 * MPSAFE
 */
int
openbsd_poll(td, uap)
	register struct thread *td;
	register struct openbsd_poll_args *uap;
{
	return (poll(td, (struct poll_args *)uap));
}

/*
 * Remove the references to the thread from all of the objects
 * we were polling.
 *
 * This code assumes that the underlying owner of the selinfo
 * structure will hold sellock before it changes it, and that
 * it will unlink itself from our list if it goes away.
 */
void
clear_selinfo_list(td)
	struct thread *td;
{
	struct selinfo *si;

	mtx_assert(&sellock, MA_OWNED);
	TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
		si->si_thread = NULL;
	TAILQ_INIT(&td->td_selq);
}

/*
 * Record a select request.
 */
void
selrecord(selector, sip)
	struct thread *selector;
	struct selinfo *sip;
{

	mtx_lock(&sellock);
	/*
	 * If the selinfo's thread pointer is NULL then take ownership of it.
	 *
	 * If the thread pointer is not NULL and it points to another
	 * thread, then we have a collision.
	 *
	 * If the thread pointer is not NULL and points back to us then
	 * leave it alone, as we've already pointed it at us and added it
	 * to our list.
	 */
	if (sip->si_thread == NULL) {
		sip->si_thread = selector;
		TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
	} else if (sip->si_thread != selector) {
		sip->si_flags |= SI_COLL;
	}

	mtx_unlock(&sellock);
}

/* Wake up a selecting thread. */
void
selwakeup(sip)
	struct selinfo *sip;
{
	doselwakeup(sip, -1);
}

/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
	struct selinfo *sip;
	int pri;
{
	doselwakeup(sip, pri);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
static void
doselwakeup(sip, pri)
	struct selinfo *sip;
	int pri;
{
	struct thread *td;

	mtx_lock(&sellock);
	td = sip->si_thread;
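	/*
	 * SI_COLL means more than one thread selected on this object at
	 * the same time.  Only one of them is remembered in si_thread, so
	 * broadcast and let every selecting thread rescan its descriptors.
	 */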
	if ((sip->si_flags & SI_COLL) != 0) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		cv_broadcastpri(&selwait, pri);
	}
	if (td == NULL) {
		mtx_unlock(&sellock);
		return;
	}
	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
	sip->si_thread = NULL;
	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_SELECT;
	mtx_unlock_spin(&sched_lock);
	sleepq_remove(td, &selwait);
	mtx_unlock(&sellock);
}

static void selectinit(void *);
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)

/* ARGSUSED*/
static void
selectinit(dummy)
	void *dummy;
{
	cv_init(&selwait, "select");
	mtx_init(&sellock, "sellck", NULL, MTX_DEF);
}