/*	$OpenBSD: sys_generic.c,v 1.98 2015/05/10 22:35:38 millert Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/sched.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

#include <uvm/uvm_extern.h>

int selscan(struct proc *, fd_set *, fd_set *, int, int, register_t *);
void pollscan(struct proc *, struct pollfd *, u_int, register_t *);
int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    const struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, const struct timespec *,
    const sigset_t *, register_t *);

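/*
 * Summary of this file: the generic file-descriptor system calls.
 * read(2)/readv(2) and write(2)/writev(2) share the dofilereadv() and
 * dofilewritev() back ends below; ioctl(2), select(2)/pselect(2) and
 * poll(2)/ppoll(2) (built on the selwait/selrecord/selwakeup machinery)
 * and utrace(2) follow.
 */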
/*
 * Read system call.
 */
/* ARGSUSED */
int
sys_read(struct proc *p, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	iov.iov_base = SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);

	FREF(fp);

	/* dofilereadv() will FRELE the descriptor for us */
	return (dofilereadv(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
}

/*
 * Scatter read system call.
 */
int
sys_readv(struct proc *p, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);
	FREF(fp);

	/* dofilereadv() will FRELE the descriptor for us */
	return (dofilereadv(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
	    &fp->f_offset, retval));
}

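/*
 * Common back end for read(2) and readv(2).  "userspace" says whether
 * iovp points at a user-supplied iovec array (readv) that must be copied
 * in and bounded by IOV_MAX, or at a kernel iovec built by sys_read.
 * On return the descriptor passed in by the caller has been FRELE'd.
 */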
int
dofilereadv(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
    int iovcnt, int userspace, off_t *offset, register_t *retval)
{
	struct iovec aiov[UIO_SMALLIOV];
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree = NULL;
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);

	/*
	 * If the iovec array exists in userspace, it needs to be copied in;
	 * otherwise, it can be used directly.
	 */
	if (userspace) {
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
		} else if ((u_int)iovcnt > 0) {
			iov = aiov;
			needfree = NULL;
		} else {
			error = EINVAL;
			goto out;
		}
		if ((error = copyin(iovp, iov, iovlen)))
			goto done;
	} else {
		iov = (struct iovec *)iovp;		/* de-constify */
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Reads return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, auio.uio_iov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred);
	if (error)
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	cnt -= auio.uio_resid;

	fp->f_rxfer++;
	fp->f_rbytes += cnt;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV, iovlen);
 out:
	FRELE(fp, p);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(struct proc *p, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	iov.iov_base = (void *)SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);

	FREF(fp);

	/* dofilewritev() will FRELE the descriptor for us */
	return (dofilewritev(p, fd, fp, &iov, 1, 0, &fp->f_offset, retval));
}

/*
 * Gather write system call
 */
int
sys_writev(struct proc *p, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	int fd = SCARG(uap, fd);
	struct file *fp;
	struct filedesc *fdp = p->p_fd;

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);
	FREF(fp);

	/* dofilewritev() will FRELE the descriptor for us */
	return (dofilewritev(p, fd, fp, SCARG(uap, iovp), SCARG(uap, iovcnt), 1,
	    &fp->f_offset, retval));
}

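/*
 * Common back end for write(2) and writev(2).  Mirrors dofilereadv(),
 * but also raises SIGPIPE in the calling thread when the write fails
 * with EPIPE.  The descriptor is FRELE'd before returning.
 */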
int
dofilewritev(struct proc *p, int fd, struct file *fp, const struct iovec *iovp,
    int iovcnt, int userspace, off_t *offset, register_t *retval)
{
	struct iovec aiov[UIO_SMALLIOV];
	struct uio auio;
	struct iovec *iov;
	struct iovec *needfree = NULL;
	long i, cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	/* note: can't use iovlen until iovcnt is validated */
	iovlen = iovcnt * sizeof(struct iovec);

	/*
	 * If the iovec array exists in userspace, it needs to be copied in;
	 * otherwise, it can be used directly.
	 */
	if (userspace) {
		if ((u_int)iovcnt > UIO_SMALLIOV) {
			if ((u_int)iovcnt > IOV_MAX) {
				error = EINVAL;
				goto out;
			}
			iov = needfree = malloc(iovlen, M_IOV, M_WAITOK);
		} else if ((u_int)iovcnt > 0) {
			iov = aiov;
			needfree = NULL;
		} else {
			error = EINVAL;
			goto out;
		}
		if ((error = copyin(iovp, iov, iovlen)))
			goto done;
	} else {
		iov = (struct iovec *)iovp;		/* de-constify */
	}

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_procp = p;
	auio.uio_resid = 0;
	for (i = 0; i < iovcnt; i++) {
		auio.uio_resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) {
			error = EINVAL;
			goto done;
		}
		iov++;
	}
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, auio.uio_iov, iovlen);
	}
#endif
	cnt = auio.uio_resid;
	error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred);
	if (error) {
		if (auio.uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			ptsignal(p, SIGPIPE, STHREAD);
	}
	cnt -= auio.uio_resid;

	fp->f_wxfer++;
	fp->f_wbytes += cnt;
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
 done:
	if (needfree)
		free(needfree, M_IOV, iovlen);
 out:
	FRELE(fp, p);
	return (error);
}

/*
 * Ioctl system call
 */
/* ARGSUSED */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp;
	u_long com;
	int error;
	u_int size;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	fdp = p->p_fd;
	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	switch (com = SCARG(uap, com)) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		return (0);
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
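	/*
	 * Rough sketch of the command-word layout, as defined in
	 * <sys/ioccom.h> (exact bit positions should be checked there):
	 * the low bits hold the command group and number, the bits
	 * extracted by IOCPARM_LEN() hold the parameter size, and the
	 * top bits hold the direction flags IOC_VOID, IOC_OUT and IOC_IN
	 * that are tested below.
	 */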
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX)
		return (ENOTTY);
	FREF(fp);
	memp = NULL;
	if (size > sizeof (stkbuf)) {
		memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, (u_int)size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	case FIOSETOWN:
		tmp = *(int *)data;
		if (fp->f_type == DTYPE_SOCKET) {
			struct socket *so = (struct socket *)fp->f_data;

			so->so_pgid = tmp;
			so->so_siguid = p->p_ucred->cr_ruid;
			so->so_sigeuid = p->p_ucred->cr_uid;
			error = 0;
			break;
		}
		if (tmp <= 0) {
			tmp = -tmp;
		} else {
			struct process *pr = prfind(tmp);
			if (pr == NULL) {
				error = ESRCH;
				break;
			}
			tmp = pr->ps_pgrp->pg_id;
		}
		error = (*fp->f_ops->fo_ioctl)
			(fp, TIOCSPGRP, (caddr_t)&tmp, p);
		break;

	case FIOGETOWN:
		if (fp->f_type == DTYPE_SOCKET) {
			error = 0;
			*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
			break;
		}
		error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
		*(int *)data = -*(int *)data;
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), (u_int)size);
out:
	FRELE(fp, p);
	if (memp)
		free(memp, M_IOCTLOPS, size);
	return (error);
}

int	selwait, nselcoll;

/*
 * Select system call.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int error;

	if (SCARG(uap, tv) != NULL) {
		struct timeval tv;
		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
			return (error);
		if ((error = itimerfix(&tv)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimeval(p, &tv);
#endif
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		tsp = &ts;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, NULL, retval));
}

int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
		if ((error = timespecfix(&ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}

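/*
 * Common back end for select(2) and pselect(2).  Six bit arrays are kept:
 * three input sets copied in from the user and three output sets filled in
 * by selscan().  For small descriptor counts the on-stack "bits" buffer is
 * used; otherwise a single allocation of 6 * ni bytes is carved up below.
 * A NULL tsp means wait forever, and a non-NULL sigmask is installed with
 * dosigsuspend() for the duration of the wait.
 */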
int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
{
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	struct timespec ats, rts, tts;
	int s, ncoll, error = 0, timo;
	u_int ni;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

#define	getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (tsp) {
		getnanouptime(&rts);
		timespecadd(tsp, &rts, &ats);
	} else {
		ats.tv_sec = 0;
		ats.tv_nsec = 0;
	}
	timo = 0;

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

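	/*
	 * Scan/sleep loop.  The nselcoll counter and the P_SELECT flag
	 * guard against a lost wakeup: selwakeup() clears P_SELECT on the
	 * recorded process and bumps nselcoll when several processes
	 * collided on one selinfo, so if either changed between the scan
	 * and the splhigh() check below, the scan is redone rather than
	 * going to sleep on stale state.
	 */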
retry:
	ncoll = nselcoll;
	atomic_setbits_int(&p->p_flag, P_SELECT);
	error = selscan(p, pibits[0], pobits[0], nd, ni, retval);
	if (error || *retval)
		goto done;
	if (tsp) {
		getnanouptime(&rts);
		if (timespeccmp(&rts, &ats, >=))
			goto done;
		timespecsub(&ats, &rts, &tts);
		timo = tts.tv_sec > 24 * 60 * 60 ?
			24 * 60 * 60 * hz : tstohz(&tts);
	}
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	atomic_clearbits_int(&p->p_flag, P_SELECT);
	error = tsleep(&selwait, PSOCK | PCATCH, "select", timo);
	splx(s);
	if (error == 0)
		goto retry;
done:
	atomic_clearbits_int(&p->p_flag, P_SELECT);
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);
	return (error);
}

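/*
 * Walk the three input fd_sets (read, write, except) and ask each file's
 * fo_poll routine whether the corresponding condition is ready, using the
 * flag[] table to translate set index 0/1/2 into POLLIN, POLLOUT|POLLNOHUP
 * and POLLPRI.  Ready descriptors are marked in the output sets and
 * counted into *retval.
 */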
int
selscan(struct proc *p, fd_set *ibits, fd_set *obits, int nfd, int ni,
    register_t *retval)
{
	caddr_t cibits = (caddr_t)ibits, cobits = (caddr_t)obits;
	struct filedesc *fdp = p->p_fd;
	int msk, i, j, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	static const int flag[3] = { POLLIN, POLLOUT|POLLNOHUP, POLLPRI };

	for (msk = 0; msk < 3; msk++) {
		fd_set *pibits = (fd_set *)&cibits[msk*ni];
		fd_set *pobits = (fd_set *)&cobits[msk*ni];

		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits->fds_bits[i/NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);
				if ((fp = fd_getfile(fdp, fd)) == NULL)
					return (EBADF);
				FREF(fp);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk], p)) {
					FD_SET(fd, pobits);
					n++;
				}
				FRELE(fp, p);
			}
		}
	}
	*retval = n;
	return (0);
}

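/*
 * Generic poll routine for drivers that are always ready: report back
 * whichever read/write events the caller asked for.
 */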
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, struct proc *p)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

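/*
 * Counterpart for drivers that are never ready: no events are reported.
 */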
int
selfalse(dev_t dev, int events, struct proc *p)
{

	return (0);
}

/*
 * Record a select request.
 */
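/*
 * Only a single selecting pid is remembered per selinfo.  If a second
 * process selects on the same object while the first is still sleeping
 * in select/poll, SI_COLL is set instead, and selwakeup() falls back to
 * waking every sleeper on &selwait so nobody misses the event.
 */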
void
selrecord(struct proc *selector, struct selinfo *sip)
{
	struct proc *p;
	pid_t mypid;

	mypid = selector->p_pid;
	if (sip->si_selpid == mypid)
		return;
	if (sip->si_selpid && (p = pfind(sip->si_selpid)) &&
	    p->p_wchan == (caddr_t)&selwait)
		sip->si_flags |= SI_COLL;
	else
		sip->si_selpid = mypid;
}

/*
 * Do a wakeup when a selectable event occurs.
 */
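/*
 * Besides notifying any kqueue filters attached to the selinfo, this wakes
 * the recorded selecting process directly if it is still sleeping on
 * &selwait, or just clears its P_SELECT flag so a scan in progress will
 * retry instead of sleeping.
 */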
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;
	int s;

	KNOTE(&sip->si_note, 0);
	if (sip->si_selpid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup(&selwait);
	}
	p = pfind(sip->si_selpid);
	sip->si_selpid = 0;
	if (p != NULL) {
		SCHED_LOCK(s);
		if (p->p_wchan == (caddr_t)&selwait) {
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(p);
		} else if (p->p_flag & P_SELECT)
			atomic_clearbits_int(&p->p_flag, P_SELECT);
		SCHED_UNLOCK(s);
	}
}

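/*
 * Poll each entry of the pollfd array.  Negative fds are skipped with no
 * events, invalid descriptors report POLLNVAL, and every entry with a
 * non-zero revents counts toward *retval.
 */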
void
pollscan(struct proc *p, struct pollfd *pl, u_int nfd, register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	u_int i;
	int n = 0;

	for (i = 0; i < nfd; i++, pl++) {
		/* Check the file descriptor. */
		if (pl->fd < 0) {
			pl->revents = 0;
			continue;
		}
		if ((fp = fd_getfile(fdp, pl->fd)) == NULL) {
			pl->revents = POLLNVAL;
			n++;
			continue;
		}
		FREF(fp);
		pl->revents = (*fp->f_ops->fo_poll)(fp, pl->events, p);
		FRELE(fp, p);
		if (pl->revents != 0)
			n++;
	}
	*retval = n;
}

/*
 * Only copyout the revents field.
 */
int
pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
{
	int error = 0;
	u_int i = 0;

	while (!error && i++ < nfds) {
		error = copyout(&pl->revents, &upl->revents,
		    sizeof(upl->revents));
		pl++;
		upl++;
	}

	return (error);
}

/*
 * We are using the same mechanism as select only we encode/decode args
 * differently.
 */
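/*
 * poll(2) takes its timeout as milliseconds (INFTIM, i.e. -1, means wait
 * forever) and sys_poll converts it to the timespec doppoll() expects,
 * e.g. msec = 1500 becomes ts = { 1, 500000000 }.  ppoll(2) instead takes
 * a timespec and an optional signal mask directly, like pselect(2).
 */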
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int msec = SCARG(uap, timeout);

	if (msec != INFTIM) {
		if (msec < 0)
			return (EINVAL);
		ts.tv_sec = msec / 1000;
		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
		tsp = &ts;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
	    retval));
}

int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
		if ((error = timespecfix(&ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}

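/*
 * Common back end for poll(2) and ppoll(2).  The pollfd array is copied
 * in (a small on-stack array is used when it fits), scanned by pollscan(),
 * and the same retry/sleep protocol as dopselect() is applied; only the
 * revents fields are copied back out via pollout().
 */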
int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    const struct timespec *tsp, const sigset_t *sigmask, register_t *retval)
{
	size_t sz;
	struct pollfd pfds[4], *pl = pfds;
	struct timespec ats, rts, tts;
	int timo, ncoll, i, s, error;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

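	/*
	 * Note: POLLNOHUP is a kernel-internal event flag (selscan() ORs
	 * it into its write-set check); it is masked out of the
	 * user-supplied events here so userland cannot request it through
	 * poll(2) directly.
	 */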
	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLLNOHUP;
		pl[i].revents = 0;
	}

	if (tsp != NULL) {
		getnanouptime(&rts);
		timespecadd(tsp, &rts, &ats);
	} else {
		ats.tv_sec = 0;
		ats.tv_nsec = 0;
	}
	timo = 0;

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

retry:
	ncoll = nselcoll;
	atomic_setbits_int(&p->p_flag, P_SELECT);
	pollscan(p, pl, nfds, retval);
	if (*retval)
		goto done;
	if (tsp != NULL) {
		getnanouptime(&rts);
		if (timespeccmp(&rts, &ats, >=))
			goto done;
		timespecsub(&ats, &rts, &tts);
		timo = tts.tv_sec > 24 * 60 * 60 ?
			24 * 60 * 60 * hz : tstohz(&tts);
	}
	s = splhigh();
	if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
		splx(s);
		goto retry;
	}
	atomic_clearbits_int(&p->p_flag, P_SELECT);
	error = tsleep(&selwait, PSOCK | PCATCH, "poll", timo);
	splx(s);
	if (error == 0)
		goto retry;

done:
	atomic_clearbits_int(&p->p_flag, P_SELECT);
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 *       ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case ERESTART:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);
	return (error);
}

/*
 * utrace system call
 */
/* ARGSUSED */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	return (0);
#endif
}
1027