/*	$OpenBSD: sys_generic.c,v 1.160 2024/12/30 02:46:00 guenther Exp $	*/
/*	$NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $	*/

/*
 * Copyright (c) 1996 Theo de Raadt
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/time.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/eventvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/pledge.h>

#include <sys/mount.h>
#include <sys/syscallargs.h>

/*
 * Debug values:
 *  1 - print implementation errors, things that should not happen.
 *  2 - print ppoll(2) information, somewhat verbose
 *  3 - print pselect(2) and ppoll(2) information, very verbose
 */
/* #define KQPOLL_DEBUG */
#ifdef KQPOLL_DEBUG
int kqpoll_debug = 1;
#define DPRINTFN(v, x...) if (kqpoll_debug > v) {			\
	printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid);	\
	printf(x);							\
}
#else
#define DPRINTFN(v, x...)	do {} while (0)
#endif

int pselregister(struct proc *, fd_set **, fd_set **, int, int *, int *);
int pselcollect(struct proc *, struct kevent *, fd_set **, int *);
void ppollregister(struct proc *, struct pollfd *, int, int *, int *);
int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);

int pollout(struct pollfd *, struct pollfd *, u_int);
int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
    struct timespec *, const sigset_t *, register_t *);
int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
    const sigset_t *, register_t *);

int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		return (EINVAL);
	}
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}

void
iovec_free(struct iovec *iov, unsigned int iovcnt)
{
	if (iovcnt > UIO_SMALLIOV)
		free(iov, M_IOV, iovcnt * sizeof(*iov));
}

/*
 * Read system call.
 */
int
sys_read(struct proc *p, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Scatter read system call.
 */
int
sys_readv(struct proc *p, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
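	/*
	 * If the transfer was interrupted after some data had already
	 * been moved, report the partial transfer rather than the error.
	 */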
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(struct proc *p, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = (void *)SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Gather write system call
 */
int
sys_writev(struct proc *p, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;
#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
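	/*
	 * As with reads, a transfer interrupted after some data had
	 * already been moved is reported as a partial success.  An
	 * EPIPE failure additionally raises SIGPIPE on the writing
	 * thread.
	 */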
	if (error) {
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		if (error == EPIPE)
			ptsignal(p, SIGPIPE, STHREAD);
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Ioctl system call
 */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	u_long com = SCARG(uap, com);
	int error = 0;
	u_int size = 0;
	caddr_t data, memp = NULL;
	int tmp;
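	/*
	 * Argument data up to STK_PARAMS bytes is staged in this stack
	 * buffer; larger ioctl arguments fall back to malloc(9).
	 */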
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	if (fp->f_type == DTYPE_SOCKET) {
		struct socket *so = fp->f_data;

		if (so->so_state & SS_DNS) {
			error = EINVAL;
			goto out;
		}
	}

	error = pledge_ioctl(p, com, fp);
	if (error)
		goto out;

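	/*
	 * Close-on-exec is a property of the descriptor table entry
	 * rather than of the file itself, so handle FIONCLEX and
	 * FIOCLEX here without consulting the file's ioctl routine.
	 */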
	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	if (size > sizeof (stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FASYNC);
		else
			atomic_clearbits_int(&fp->f_flag, FASYNC);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), size);
out:
	FRELE(fp, p);
	free(memp, M_IOCTLOPS, size);
	return (error);
}

/*
 * Select system call.
 */
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int error;

	if (SCARG(uap, tv) != NULL) {
		struct timeval tv;
		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimeval(p, &tv);
#endif
		if (tv.tv_sec < 0 || !timerisvalid(&tv))
			return (EINVAL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		tsp = &ts;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, NULL, retval));
}

int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}

int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
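	/* Inline storage for three input and three output fd_sets. */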
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, nfiles, ncollected = 0, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);

	nfiles = READ_ONCE(p->p_fd->fd_nfiles);
	if (nd > nfiles)
		nd = nfiles;

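	/* Bytes per fd_set, rounded up to whole fd_mask words. */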
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init(nd);

#define	getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

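	/* Temporarily install the caller-supplied signal mask. */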
	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready && error == 0; i++)
			error = pselcollect(p, &kev[i], pobits, &ncollected);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
#define	putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqpoll_done(nd);

	return (error);
}

/*
 * Convert fd_set into kqueue events and register them on the
 * per-thread queue.
 */
int
pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
    int *nregistered, int *ncollected)
{
	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
	static const int evff[] = { 0, 0, NOTE_OOB };
	int msk, i, j, fd, nevents = 0, error = 0;
	struct kevent kev;
	fd_mask bits;

	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits[msk]->fds_bits[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				bits &= ~(1 << j);

				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
				    fd, msk, p->p_kq_serial);
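				/*
				 * The thread's kqueue serial is stashed
				 * in udata so that pselcollect() can
				 * verify the event belongs to this scan.
				 */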
				EV_SET(&kev, fd, evf[msk],
				    EV_ADD|EV_ENABLE|__EV_SELECT,
				    evff[msk], 0, (void *)(p->p_kq_serial));
				error = kqueue_register(p->p_kq, &kev, 0, p);
				switch (error) {
				case 0:
					nevents++;
				/* FALLTHROUGH */
				case EOPNOTSUPP:/* No underlying kqfilter */
				case EINVAL:	/* Unimplemented filter */
				case EPERM:	/* Specific to FIFO and
						 * __EV_SELECT */
					error = 0;
					break;
				case ENXIO:	/* Device has been detached */
				default:
					goto bad;
				}
			}
		}
	}

	*nregistered = nevents;
	return (0);
bad:
	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
	    kev.filter, error);
	return (error);
}

/*
 * Convert given kqueue event into corresponding select(2) bit.
 */
int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
    int *ncollected)
{
	if ((unsigned long)kevp->udata != p->p_kq_serial) {
		panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
		    __func__, kevp, (int)kevp->ident,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}

	if (kevp->flags & EV_ERROR) {
		DPRINTFN(2, "select fd %d filt %d error %d\n",
		    (int)kevp->ident, kevp->filter, (int)kevp->data);
		return (kevp->data);
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		FD_SET(kevp->ident, pobits[0]);
		break;
	case EVFILT_WRITE:
		FD_SET(kevp->ident, pobits[1]);
		break;
	case EVFILT_EXCEPT:
		FD_SET(kevp->ident, pobits[2]);
		break;
	default:
		KASSERT(0);
	}
	(*ncollected)++;

	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
	return (0);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	knote_locked(&sip->si_note, NOTE_SUBMIT);
	KERNEL_UNLOCK();
}

/*
 * Only copyout the revents field.
 */
int
pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
{
	int error = 0;
	u_int i = 0;

	while (!error && i++ < nfds) {
		error = copyout(&pl->revents, &upl->revents,
		    sizeof(upl->revents));
		pl++;
		upl++;
	}

	return (error);
}

/*
 * We use the same mechanism as select; only the arguments are
 * encoded/decoded differently.
 */
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;

	struct timespec ts, *tsp = NULL;
	int msec = SCARG(uap, timeout);

	if (msec != INFTIM) {
		if (msec < 0)
			return (EINVAL);
		ts.tv_sec = msec / 1000;
		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
		tsp = &ts;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
	    retval));
}

int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;

	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}

int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	struct pollfd pfds[4], *pl = pfds;
	int error, ncollected = 0, nevents = 0;
	size_t sz;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	kqpoll_init(nfds);

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	ppollregister(p, pl, nfds, &nevents, &ncollected);

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}

		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready; i++)
			ncollected += ppollcollect(p, &kev[i], pl, nfds);

		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 * ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case EINTR:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);

	kqpoll_done(nfds);

	return (error);
}

int
ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
    struct pollfd *pl, unsigned int pollid)
{
	int i, error, nevents = 0;

	KASSERT(pl->revents == 0);

	for (i = 0; i < nkev; i++, kevp++) {
again:
		error = kqueue_register(p->p_kq, kevp, pollid, p);
		switch (error) {
		case 0:
			nevents++;
			break;
		case EOPNOTSUPP:/* No underlying kqfilter */
		case EINVAL:	/* Unimplemented filter */
			break;
		case EBADF:	/* Bad file descriptor */
			pl->revents |= POLLNVAL;
			break;
		case EPERM:	/* Specific to FIFO */
			KASSERT(kevp->filter == EVFILT_WRITE);
			if (nkev == 1) {
				/*
				 * If this is the only filter make sure
				 * POLLHUP is passed to userland.
				 */
				kevp->filter = EVFILT_EXCEPT;
				goto again;
			}
			break;
		default:
			DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
			    " %lu filt %d ERROR=%d\n",
			    ((unsigned long)kevp->udata - p->p_kq_serial),
			    pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
			    error);
			/* FALLTHROUGH */
		case ENXIO:	/* Device has been detached */
			pl->revents |= POLLERR;
			break;
		}
	}

	return (nevents);
}

/*
 * Convert pollfd into kqueue events and register them on the
 * per-thread queue.
 *
 * At most 3 events can correspond to a single pollfd.
 */
void
ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered,
    int *ncollected)
{
	int i, nkev, nevt, forcehup;
	struct kevent kev[3], *kevp;

	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLL_NOHUP;
		pl[i].revents = 0;

		if (pl[i].fd < 0)
			continue;

		/*
		 * POLLHUP checking is implicit in the event filters.
		 * However, the checking must happen even if no events
		 * are requested.
		 */
		forcehup = ((pl[i].events & ~POLLHUP) == 0);

		DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
		    i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);

		nevt = 0;
		nkev = 0;
		kevp = kev;
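		/*
		 * The pollfd index is encoded in udata relative to the
		 * thread's kqueue serial, letting ppollcollect() map
		 * each event back to its array slot.
		 */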
		if (pl[i].events & (POLLIN | POLLRDNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_READ,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
			int evff = forcehup ? 0 : NOTE_OOB;

			EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
			    EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}

		if (nkev == 0)
			continue;

		*nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i);

		if (pl[i].revents != 0)
			(*ncollected)++;
	}

	DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered,
	    *ncollected);
}

/*
 * Convert given kqueue event into corresponding poll(2) revents bit.
 */
int
ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds)
{
	static struct timeval poll_errintvl = { 5, 0 };
	static struct timeval poll_lasterr;
	int already_seen;
	unsigned long i;

	/* Extract poll array index */
	i = (unsigned long)kevp->udata - p->p_kq_serial;

	if (i >= nfds) {
		panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
		    __func__, kevp, nfds,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}
	if ((int)kevp->ident != pl[i].fd) {
		panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
		    __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
		    p->p_kq_serial);
	}

	/*
	 * A given descriptor may already have generated an error
	 * against another filter during kqueue_register().
	 *
	 * Make sure to set the appropriate flags but do not
	 * increment `*retval' more than once.
	 */
	already_seen = (pl[i].revents != 0);

	/* POLLNVAL preempts other events. */
	if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) {
		pl[i].revents = POLLNVAL;
		goto done;
	} else if (pl[i].revents & POLLNVAL) {
		goto done;
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		if (kevp->flags & __EV_HUP)
			pl[i].revents |= POLLHUP;
		if (pl[i].events & (POLLIN | POLLRDNORM))
			pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
		break;
	case EVFILT_WRITE:
		/* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
		if (kevp->flags & __EV_HUP) {
			pl[i].revents |= POLLHUP;
		} else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
		}
		break;
	case EVFILT_EXCEPT:
		if (kevp->flags & __EV_HUP) {
			if (pl[i].events != 0 && pl[i].events != POLLOUT)
				DPRINTFN(0, "weird events %x\n", pl[i].events);
			pl[i].revents |= POLLHUP;
			break;
		}
		if (pl[i].events & (POLLPRI | POLLRDBAND))
			pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
		break;
	default:
		KASSERT(0);
	}

done:
	DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
	    i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
	    kevp->filter);

	/*
	 * Make noise about unclaimed events as they might indicate a bug
	 * and can result in spurious-looking wakeups of poll(2).
	 *
	 * Live-locking within the system call should not happen because
	 * the scan loop in doppoll() has an upper limit for the number
	 * of events to process.
	 */
	if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) {
		printf("%s[%d]: poll index %lu fd %d events 0x%x "
		    "filter %d/0x%x unclaimed\n",
		    p->p_p->ps_comm, p->p_tid, i, pl[i].fd,
		    pl[i].events, kevp->filter, kevp->flags);
	}

	if (!already_seen && (pl[i].revents != 0))
		return (1);

	return (0);
}

/*
 * utrace system call
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	return (0);
#endif
}