1 /* $OpenBSD: sys_generic.c,v 1.157 2024/04/10 10:05:26 claudio Exp $ */
2 /* $NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $ */
3
4 /*
5 * Copyright (c) 1996 Theo de Raadt
6 * Copyright (c) 1982, 1986, 1989, 1993
7 * The Regents of the University of California. All rights reserved.
8 * (c) UNIX System Laboratories, Inc.
9 * All or some portions of this file are derived from material licensed
10 * to the University of California by American Telephone and Telegraph
11 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12 * the permission of UNIX System Laboratories, Inc.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
39 */
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/filedesc.h>
44 #include <sys/ioctl.h>
45 #include <sys/fcntl.h>
46 #include <sys/vnode.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/resourcevar.h>
50 #include <sys/socketvar.h>
51 #include <sys/signalvar.h>
52 #include <sys/uio.h>
53 #include <sys/time.h>
54 #include <sys/malloc.h>
55 #include <sys/poll.h>
56 #include <sys/eventvar.h>
57 #ifdef KTRACE
58 #include <sys/ktrace.h>
59 #endif
60 #include <sys/pledge.h>
61
62 #include <sys/mount.h>
63 #include <sys/syscallargs.h>
64
65 /*
66 * Debug values:
67 * 1 - print implementation errors, things that should not happen.
68 * 2 - print ppoll(2) information, somewhat verbose
69 * 3 - print pselect(2) and ppoll(2) information, very verbose
70 */
71 int kqpoll_debug = 0;
72 #define DPRINTFN(v, x...) if (kqpoll_debug > v) { \
73 printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid); \
74 printf(x); \
75 }
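
/*
 * Levels sketch: the macro fires only when kqpoll_debug is strictly
 * greater than its level argument, so e.g. setting kqpoll_debug to 2
 * (by patching the initializer in a debug kernel) makes the DPRINTFN(0)
 * and DPRINTFN(1) call sites print while the very verbose DPRINTFN(2)
 * sites stay silent.
 */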
76
77 int pselregister(struct proc *, fd_set **, fd_set **, int, int *, int *);
78 int pselcollect(struct proc *, struct kevent *, fd_set **, int *);
79 void ppollregister(struct proc *, struct pollfd *, int, int *, int *);
80 int ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);
81
82 int pollout(struct pollfd *, struct pollfd *, u_int);
83 int dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
84 struct timespec *, const sigset_t *, register_t *);
85 int doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
86 const sigset_t *, register_t *);
87
88 int
89 iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
90 unsigned int iovcnt, size_t *residp)
91 {
92 #ifdef KTRACE
93 struct proc *p = curproc;
94 #endif
95 struct iovec *iov;
96 int error, i;
97 size_t resid = 0;
98
99 if (iovcnt > UIO_SMALLIOV) {
100 if (iovcnt > IOV_MAX)
101 return (EINVAL);
102 iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
103 } else if (iovcnt > 0) {
104 iov = aiov;
105 } else {
106 return (EINVAL);
107 }
108 *iovp = iov;
109
110 if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
111 return (error);
112
113 #ifdef KTRACE
114 if (KTRPOINT(p, KTR_STRUCT))
115 ktriovec(p, iov, iovcnt);
116 #endif
117
118 for (i = 0; i < iovcnt; i++) {
119 resid += iov->iov_len;
120 /*
121 * Writes return ssize_t because -1 is returned on error.
122 * Therefore we must restrict the length to SSIZE_MAX to
123 * avoid garbage return values. Note that the addition is
124 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
125 */
126 if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
127 return (EINVAL);
128 iov++;
129 }
130
131 if (residp != NULL)
132 *residp = resid;
133
134 return (0);
135 }
136
137 void
138 iovec_free(struct iovec *iov, unsigned int iovcnt)
139 {
140 if (iovcnt > UIO_SMALLIOV)
141 free(iov, M_IOV, iovcnt * sizeof(*iov));
142 }
143
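/*
 * Rough userland sketch (illustration only, not part of this file) of the
 * scatter/gather request that iovec_copyin() above validates: the iovec
 * count may not exceed IOV_MAX and the summed lengths may not exceed
 * SSIZE_MAX, so two buffers of SSIZE_MAX/2 + 1 bytes each are rejected
 * with EINVAL even though each element is individually in range.  The
 * names fd, hdr, hdrlen, body and bodylen below are placeholders.
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = hdrlen  },
 *		{ .iov_base = body, .iov_len = bodylen },
 *	};
 *	ssize_t n = writev(fd, iov, 2);
 */
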
144 /*
145 * Read system call.
146 */
147 int
148 sys_read(struct proc *p, void *v, register_t *retval)
149 {
150 struct sys_read_args /* {
151 syscallarg(int) fd;
152 syscallarg(void *) buf;
153 syscallarg(size_t) nbyte;
154 } */ *uap = v;
155 struct iovec iov;
156 struct uio auio;
157
158 iov.iov_base = SCARG(uap, buf);
159 iov.iov_len = SCARG(uap, nbyte);
160 if (iov.iov_len > SSIZE_MAX)
161 return (EINVAL);
162
163 auio.uio_iov = &iov;
164 auio.uio_iovcnt = 1;
165 auio.uio_resid = iov.iov_len;
166
167 return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
168 }
169
170 /*
171 * Scatter read system call.
172 */
173 int
174 sys_readv(struct proc *p, void *v, register_t *retval)
175 {
176 struct sys_readv_args /* {
177 syscallarg(int) fd;
178 syscallarg(const struct iovec *) iovp;
179 syscallarg(int) iovcnt;
180 } */ *uap = v;
181 struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
182 int error, iovcnt = SCARG(uap, iovcnt);
183 struct uio auio;
184 size_t resid;
185
186 error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
187 if (error)
188 goto done;
189
190 auio.uio_iov = iov;
191 auio.uio_iovcnt = iovcnt;
192 auio.uio_resid = resid;
193
194 error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
195 done:
196 iovec_free(iov, iovcnt);
197 return (error);
198 }
199
200 int
201 dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
202 register_t *retval)
203 {
204 struct filedesc *fdp = p->p_fd;
205 struct file *fp;
206 long cnt, error = 0;
207 u_int iovlen;
208 #ifdef KTRACE
209 struct iovec *ktriov = NULL;
210 #endif
211
212 KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
213 iovlen = uio->uio_iovcnt * sizeof(struct iovec);
214
215 if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
216 return (EBADF);
217
218 /* Checks for positioned read. */
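/*
 * FO_POSITION is passed by the positioned entry points (pread(2)/
 * preadv(2) style callers): seeking is meaningless on non-vnode
 * descriptors, FIFOs and ttys, hence ESPIPE, and a negative offset is
 * only tolerated on character devices.
 */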
219 if (flags & FO_POSITION) {
220 struct vnode *vp = fp->f_data;
221
222 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
223 (vp->v_flag & VISTTY)) {
224 error = ESPIPE;
225 goto done;
226 }
227
228 if (uio->uio_offset < 0 && vp->v_type != VCHR) {
229 error = EINVAL;
230 goto done;
231 }
232 }
233
234 uio->uio_rw = UIO_READ;
235 uio->uio_segflg = UIO_USERSPACE;
236 uio->uio_procp = p;
237 #ifdef KTRACE
238 /*
239 * If tracing, save a copy of the iovec.
240 */
241 if (KTRPOINT(p, KTR_GENIO)) {
242 ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
243 memcpy(ktriov, uio->uio_iov, iovlen);
244 }
245 #endif
246 cnt = uio->uio_resid;
247 error = (*fp->f_ops->fo_read)(fp, uio, flags);
248 if (error) {
249 if (uio->uio_resid != cnt && (error == ERESTART ||
250 error == EINTR || error == EWOULDBLOCK))
251 error = 0;
252 }
253 cnt -= uio->uio_resid;
254
255 mtx_enter(&fp->f_mtx);
256 fp->f_rxfer++;
257 fp->f_rbytes += cnt;
258 mtx_leave(&fp->f_mtx);
259 #ifdef KTRACE
260 if (ktriov != NULL) {
261 if (error == 0)
262 ktrgenio(p, fd, UIO_READ, ktriov, cnt);
263 free(ktriov, M_TEMP, iovlen);
264 }
265 #endif
266 *retval = cnt;
267 done:
268 FRELE(fp, p);
269 return (error);
270 }
271
272 /*
273 * Write system call
274 */
275 int
276 sys_write(struct proc *p, void *v, register_t *retval)
277 {
278 struct sys_write_args /* {
279 syscallarg(int) fd;
280 syscallarg(const void *) buf;
281 syscallarg(size_t) nbyte;
282 } */ *uap = v;
283 struct iovec iov;
284 struct uio auio;
285
286 iov.iov_base = (void *)SCARG(uap, buf);
287 iov.iov_len = SCARG(uap, nbyte);
288 if (iov.iov_len > SSIZE_MAX)
289 return (EINVAL);
290
291 auio.uio_iov = &iov;
292 auio.uio_iovcnt = 1;
293 auio.uio_resid = iov.iov_len;
294
295 return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
296 }
297
298 /*
299 * Gather write system call
300 */
301 int
302 sys_writev(struct proc *p, void *v, register_t *retval)
303 {
304 struct sys_writev_args /* {
305 syscallarg(int) fd;
306 syscallarg(const struct iovec *) iovp;
307 syscallarg(int) iovcnt;
308 } */ *uap = v;
309 struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
310 int error, iovcnt = SCARG(uap, iovcnt);
311 struct uio auio;
312 size_t resid;
313
314 error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
315 if (error)
316 goto done;
317
318 auio.uio_iov = iov;
319 auio.uio_iovcnt = iovcnt;
320 auio.uio_resid = resid;
321
322 error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
323 done:
324 iovec_free(iov, iovcnt);
325 return (error);
326 }
327
328 int
329 dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
330 register_t *retval)
331 {
332 struct filedesc *fdp = p->p_fd;
333 struct file *fp;
334 long cnt, error = 0;
335 u_int iovlen;
336 #ifdef KTRACE
337 struct iovec *ktriov = NULL;
338 #endif
339
340 KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
341 iovlen = uio->uio_iovcnt * sizeof(struct iovec);
342
343 if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
344 return (EBADF);
345
346 /* Checks for positioned write. */
347 if (flags & FO_POSITION) {
348 struct vnode *vp = fp->f_data;
349
350 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
351 (vp->v_flag & VISTTY)) {
352 error = ESPIPE;
353 goto done;
354 }
355
356 if (uio->uio_offset < 0 && vp->v_type != VCHR) {
357 error = EINVAL;
358 goto done;
359 }
360 }
361
362 uio->uio_rw = UIO_WRITE;
363 uio->uio_segflg = UIO_USERSPACE;
364 uio->uio_procp = p;
365 #ifdef KTRACE
366 /*
367 * If tracing, save a copy of the iovec.
368 */
369 if (KTRPOINT(p, KTR_GENIO)) {
370 ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
371 memcpy(ktriov, uio->uio_iov, iovlen);
372 }
373 #endif
374 cnt = uio->uio_resid;
375 error = (*fp->f_ops->fo_write)(fp, uio, flags);
376 if (error) {
377 if (uio->uio_resid != cnt && (error == ERESTART ||
378 error == EINTR || error == EWOULDBLOCK))
379 error = 0;
380 if (error == EPIPE) {
381 KERNEL_LOCK();
382 ptsignal(p, SIGPIPE, STHREAD);
383 KERNEL_UNLOCK();
384 }
385 }
386 cnt -= uio->uio_resid;
387
388 mtx_enter(&fp->f_mtx);
389 fp->f_wxfer++;
390 fp->f_wbytes += cnt;
391 mtx_leave(&fp->f_mtx);
392 #ifdef KTRACE
393 if (ktriov != NULL) {
394 if (error == 0)
395 ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
396 free(ktriov, M_TEMP, iovlen);
397 }
398 #endif
399 *retval = cnt;
400 done:
401 FRELE(fp, p);
402 return (error);
403 }
404
405 /*
406 * Ioctl system call
407 */
408 int
409 sys_ioctl(struct proc *p, void *v, register_t *retval)
410 {
411 struct sys_ioctl_args /* {
412 syscallarg(int) fd;
413 syscallarg(u_long) com;
414 syscallarg(void *) data;
415 } */ *uap = v;
416 struct file *fp;
417 struct filedesc *fdp = p->p_fd;
418 u_long com = SCARG(uap, com);
419 int error = 0;
420 u_int size = 0;
421 caddr_t data, memp = NULL;
422 int tmp;
423 #define STK_PARAMS 128
424 long long stkbuf[STK_PARAMS / sizeof(long long)];
425
426 if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
427 return (EBADF);
428
429 if (fp->f_type == DTYPE_SOCKET) {
430 struct socket *so = fp->f_data;
431
432 if (so->so_state & SS_DNS) {
433 error = EINVAL;
434 goto out;
435 }
436 }
437
438 error = pledge_ioctl(p, com, fp);
439 if (error)
440 goto out;
441
442 switch (com) {
443 case FIONCLEX:
444 case FIOCLEX:
445 fdplock(fdp);
446 if (com == FIONCLEX)
447 fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
448 else
449 fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
450 fdpunlock(fdp);
451 goto out;
452 }
453
454 /*
455 * Interpret high order word to find amount of data to be
456 * copied to/from the user's address space.
457 */
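/*
 * Concretely (see <sys/ioccom.h>): a command defined with _IOW(), such as
 * FIONBIO, carries IOC_IN in its high-order bits and sizeof(int) in the
 * IOCPARM_LEN field, so the code below copies an int in from userland;
 * _IOR() commands set IOC_OUT and the result is copied back out after
 * fo_ioctl; bare _IO() commands set IOC_VOID and the raw argument value
 * is handed to the driver as-is.
 */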
458 size = IOCPARM_LEN(com);
459 if (size > IOCPARM_MAX) {
460 error = ENOTTY;
461 goto out;
462 }
463 if (size > sizeof (stkbuf)) {
464 memp = malloc(size, M_IOCTLOPS, M_WAITOK);
465 data = memp;
466 } else
467 data = (caddr_t)stkbuf;
468 if (com&IOC_IN) {
469 if (size) {
470 error = copyin(SCARG(uap, data), data, size);
471 if (error) {
472 goto out;
473 }
474 } else
475 *(caddr_t *)data = SCARG(uap, data);
476 } else if ((com&IOC_OUT) && size)
477 /*
478 * Zero the buffer so the user always
479 * gets back something deterministic.
480 */
481 memset(data, 0, size);
482 else if (com&IOC_VOID)
483 *(caddr_t *)data = SCARG(uap, data);
484
485 switch (com) {
486
487 case FIONBIO:
488 if ((tmp = *(int *)data) != 0)
489 atomic_setbits_int(&fp->f_flag, FNONBLOCK);
490 else
491 atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
492 error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
493 break;
494
495 case FIOASYNC:
496 if ((tmp = *(int *)data) != 0)
497 atomic_setbits_int(&fp->f_flag, FASYNC);
498 else
499 atomic_clearbits_int(&fp->f_flag, FASYNC);
500 error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
501 break;
502
503 default:
504 error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
505 break;
506 }
507 /*
508 * Copy any data to user, size was
509 * already set and checked above.
510 */
511 if (error == 0 && (com&IOC_OUT) && size)
512 error = copyout(data, SCARG(uap, data), size);
513 out:
514 FRELE(fp, p);
515 free(memp, M_IOCTLOPS, size);
516 return (error);
517 }
518
519 /*
520 * Select system call.
521 */
522 int
523 sys_select(struct proc *p, void *v, register_t *retval)
524 {
525 struct sys_select_args /* {
526 syscallarg(int) nd;
527 syscallarg(fd_set *) in;
528 syscallarg(fd_set *) ou;
529 syscallarg(fd_set *) ex;
530 syscallarg(struct timeval *) tv;
531 } */ *uap = v;
532
533 struct timespec ts, *tsp = NULL;
534 int error;
535
536 if (SCARG(uap, tv) != NULL) {
537 struct timeval tv;
538 if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
539 return (error);
540 #ifdef KTRACE
541 if (KTRPOINT(p, KTR_STRUCT))
542 ktrreltimeval(p, &tv);
543 #endif
544 if (tv.tv_sec < 0 || !timerisvalid(&tv))
545 return (EINVAL);
546 TIMEVAL_TO_TIMESPEC(&tv, &ts);
547 tsp = &ts;
548 }
549
550 return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
551 SCARG(uap, ex), tsp, NULL, retval));
552 }
553
554 int
555 sys_pselect(struct proc *p, void *v, register_t *retval)
556 {
557 struct sys_pselect_args /* {
558 syscallarg(int) nd;
559 syscallarg(fd_set *) in;
560 syscallarg(fd_set *) ou;
561 syscallarg(fd_set *) ex;
562 syscallarg(const struct timespec *) ts;
563 syscallarg(const sigset_t *) mask;
564 } */ *uap = v;
565
566 struct timespec ts, *tsp = NULL;
567 sigset_t ss, *ssp = NULL;
568 int error;
569
570 if (SCARG(uap, ts) != NULL) {
571 if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
572 return (error);
573 #ifdef KTRACE
574 if (KTRPOINT(p, KTR_STRUCT))
575 ktrreltimespec(p, &ts);
576 #endif
577 if (ts.tv_sec < 0 || !timespecisvalid(&ts))
578 return (EINVAL);
579 tsp = &ts;
580 }
581 if (SCARG(uap, mask) != NULL) {
582 if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
583 return (error);
584 ssp = &ss;
585 }
586
587 return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
588 SCARG(uap, ex), tsp, ssp, retval));
589 }
590
591 int
592 dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
593 struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
594 {
595 struct kqueue_scan_state scan;
596 struct timespec zerots = {};
597 fd_mask bits[6];
598 fd_set *pibits[3], *pobits[3];
599 int error, nfiles, ncollected = 0, nevents = 0;
600 u_int ni;
601
602 if (nd < 0)
603 return (EINVAL);
604
605 nfiles = READ_ONCE(p->p_fd->fd_nfiles);
606 if (nd > nfiles)
607 nd = nfiles;
608
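/*
 * Sizing sketch: with 32-bit fd_masks (NFDBITS == 32), any nd <= 32 lets
 * each of the six input/output sets live in one fd_mask of the on-stack
 * bits[] array; e.g. nd == 100 gives ni = howmany(100, NFDBITS) *
 * sizeof(fd_mask) = 16 bytes per set, so one 6 * ni byte buffer is
 * allocated and carved into the pibits/pobits pointers below.
 */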
609 ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
610 if (ni > sizeof(bits[0])) {
611 caddr_t mbits;
612
613 mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
614 pibits[0] = (fd_set *)&mbits[ni * 0];
615 pibits[1] = (fd_set *)&mbits[ni * 1];
616 pibits[2] = (fd_set *)&mbits[ni * 2];
617 pobits[0] = (fd_set *)&mbits[ni * 3];
618 pobits[1] = (fd_set *)&mbits[ni * 4];
619 pobits[2] = (fd_set *)&mbits[ni * 5];
620 } else {
621 memset(bits, 0, sizeof(bits));
622 pibits[0] = (fd_set *)&bits[0];
623 pibits[1] = (fd_set *)&bits[1];
624 pibits[2] = (fd_set *)&bits[2];
625 pobits[0] = (fd_set *)&bits[3];
626 pobits[1] = (fd_set *)&bits[4];
627 pobits[2] = (fd_set *)&bits[5];
628 }
629
630 kqpoll_init(nd);
631
632 #define getbits(name, x) \
633 if (name && (error = copyin(name, pibits[x], ni))) \
634 goto done;
635 getbits(in, 0);
636 getbits(ou, 1);
637 getbits(ex, 2);
638 #undef getbits
639 #ifdef KTRACE
640 if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
641 if (in) ktrfdset(p, pibits[0], ni);
642 if (ou) ktrfdset(p, pibits[1], ni);
643 if (ex) ktrfdset(p, pibits[2], ni);
644 }
645 #endif
646
647 if (sigmask)
648 dosigsuspend(p, *sigmask &~ sigcantmask);
649
650 /* Register kqueue events */
651 error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
652 if (error != 0)
653 goto done;
654
655 /*
656 * The poll/select family of syscalls has been designed to
657 * block when file descriptors are not available, even if
658 * there's nothing to wait for.
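 * For instance, the classic sub-second sleep idiom
 * select(0, NULL, NULL, NULL, &tv) relies on this: nothing gets
 * registered below, so the thread simply naps on &nowake for the
 * requested time (clamped to MAXTSLP) and then returns 0.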
659 */
660 if (nevents == 0 && ncollected == 0) {
661 uint64_t nsecs = INFSLP;
662
663 if (timeout != NULL) {
664 if (!timespecisset(timeout))
665 goto done;
666 nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
667 }
668 error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
669 /* select is not restarted after signals... */
670 if (error == ERESTART)
671 error = EINTR;
672 if (error == EWOULDBLOCK)
673 error = 0;
674 goto done;
675 }
676
677 /* Do not block if registering found pending events. */
678 if (ncollected > 0)
679 timeout = &zerots;
680
681 /* Collect at most `nevents' possibly waiting in kqueue_scan() */
682 kqueue_scan_setup(&scan, p->p_kq);
683 while (nevents > 0) {
684 struct kevent kev[KQ_NEVENTS];
685 int i, ready, count;
686
687 /* Maximum number of events per iteration */
688 count = MIN(nitems(kev), nevents);
689 ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
690
691 /* Convert back events that are ready. */
692 for (i = 0; i < ready && error == 0; i++)
693 error = pselcollect(p, &kev[i], pobits, &ncollected);
694 /*
695 * Stop if there was an error or if we had enough
696 * space to collect all events that were ready.
697 */
698 if (error || ready < count)
699 break;
700
701 nevents -= ready;
702 }
703 kqueue_scan_finish(&scan);
704 *retval = ncollected;
705 done:
706 #define putbits(name, x) \
707 if (name && (error2 = copyout(pobits[x], name, ni))) \
708 error = error2;
709 if (error == 0) {
710 int error2;
711
712 putbits(in, 0);
713 putbits(ou, 1);
714 putbits(ex, 2);
715 #undef putbits
716 #ifdef KTRACE
717 if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
718 if (in) ktrfdset(p, pobits[0], ni);
719 if (ou) ktrfdset(p, pobits[1], ni);
720 if (ex) ktrfdset(p, pobits[2], ni);
721 }
722 #endif
723 }
724
725 if (pibits[0] != (fd_set *)&bits[0])
726 free(pibits[0], M_TEMP, 6 * ni);
727
728 kqpoll_done(nd);
729
730 return (error);
731 }
732
733 /*
734 * Convert fd_set into kqueue events and register them on the
735 * per-thread queue.
736 */
737 int
738 pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
739 int *nregistered, int *ncollected)
740 {
741 static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
742 static const int evff[] = { 0, 0, NOTE_OOB };
743 int msk, i, j, fd, nevents = 0, error = 0;
744 struct kevent kev;
745 fd_mask bits;
746
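/*
 * The ffs() walk below visits each set bit of the fd_set word exactly
 * once: e.g. for bits == 0x12, ffs() first returns 2 (fd = i + 1), that
 * bit is cleared, ffs() then returns 5 (fd = i + 4), and the loop stops
 * once bits is empty or the decoded fd reaches nfd.
 */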
747 for (msk = 0; msk < 3; msk++) {
748 for (i = 0; i < nfd; i += NFDBITS) {
749 bits = pibits[msk]->fds_bits[i / NFDBITS];
750 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
751 bits &= ~(1 << j);
752
753 DPRINTFN(2, "select fd %d mask %d serial %lu\n",
754 fd, msk, p->p_kq_serial);
755 EV_SET(&kev, fd, evf[msk],
756 EV_ADD|EV_ENABLE|__EV_SELECT,
757 evff[msk], 0, (void *)(p->p_kq_serial));
758 error = kqueue_register(p->p_kq, &kev, 0, p);
759 switch (error) {
760 case 0:
761 nevents++;
762 /* FALLTHROUGH */
763 case EOPNOTSUPP:/* No underlying kqfilter */
764 case EINVAL: /* Unimplemented filter */
765 case EPERM: /* Specific to FIFO and
766 * __EV_SELECT */
767 error = 0;
768 break;
769 case ENXIO: /* Device has been detached */
770 default:
771 goto bad;
772 }
773 }
774 }
775 }
776
777 *nregistered = nevents;
778 return (0);
779 bad:
780 DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
781 kev.filter, error);
782 return (error);
783 }
784
785 /*
786 * Convert given kqueue event into corresponding select(2) bit.
787 */
788 int
789 pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
790 int *ncollected)
791 {
792 if ((unsigned long)kevp->udata != p->p_kq_serial) {
793 panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
794 __func__, kevp, (int)kevp->ident,
795 (unsigned long)kevp->udata, p->p_kq_serial);
796 }
797
798 if (kevp->flags & EV_ERROR) {
799 DPRINTFN(2, "select fd %d filt %d error %d\n",
800 (int)kevp->ident, kevp->filter, (int)kevp->data);
801 return (kevp->data);
802 }
803
804 switch (kevp->filter) {
805 case EVFILT_READ:
806 FD_SET(kevp->ident, pobits[0]);
807 break;
808 case EVFILT_WRITE:
809 FD_SET(kevp->ident, pobits[1]);
810 break;
811 case EVFILT_EXCEPT:
812 FD_SET(kevp->ident, pobits[2]);
813 break;
814 default:
815 KASSERT(0);
816 }
817 (*ncollected)++;
818
819 DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
820 return (0);
821 }
822
823 /*
824 * Do a wakeup when a selectable event occurs.
825 */
826 void
827 selwakeup(struct selinfo *sip)
828 {
829 KERNEL_LOCK();
830 knote_locked(&sip->si_note, NOTE_SUBMIT);
831 KERNEL_UNLOCK();
832 }
833
834 /*
835 * Only copyout the revents field.
836 */
837 int
838 pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
839 {
840 int error = 0;
841 u_int i = 0;
842
843 while (!error && i++ < nfds) {
844 error = copyout(&pl->revents, &upl->revents,
845 sizeof(upl->revents));
846 pl++;
847 upl++;
848 }
849
850 return (error);
851 }
852
853 /*
854 * We use the same mechanism as select(2); only the way the arguments
855 * are encoded and decoded differs.
856 */
857 int
858 sys_poll(struct proc *p, void *v, register_t *retval)
859 {
860 struct sys_poll_args /* {
861 syscallarg(struct pollfd *) fds;
862 syscallarg(u_int) nfds;
863 syscallarg(int) timeout;
864 } */ *uap = v;
865
866 struct timespec ts, *tsp = NULL;
867 int msec = SCARG(uap, timeout);
868
869 if (msec != INFTIM) {
870 if (msec < 0)
871 return (EINVAL);
872 ts.tv_sec = msec / 1000;
873 ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
874 tsp = &ts;
875 }
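/*
 * For example, a timeout argument of 1500 becomes ts = { 1, 500000000 },
 * while INFTIM (-1) leaves tsp NULL so that doppoll() blocks with no
 * deadline.
 */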
876
877 return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
878 retval));
879 }
880
881 int
882 sys_ppoll(struct proc *p, void *v, register_t *retval)
883 {
884 struct sys_ppoll_args /* {
885 syscallarg(struct pollfd *) fds;
886 syscallarg(u_int) nfds;
887 syscallarg(const struct timespec *) ts;
888 syscallarg(const sigset_t *) mask;
889 } */ *uap = v;
890
891 int error;
892 struct timespec ts, *tsp = NULL;
893 sigset_t ss, *ssp = NULL;
894
895 if (SCARG(uap, ts) != NULL) {
896 if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
897 return (error);
898 #ifdef KTRACE
899 if (KTRPOINT(p, KTR_STRUCT))
900 ktrreltimespec(p, &ts);
901 #endif
902 if (ts.tv_sec < 0 || !timespecisvalid(&ts))
903 return (EINVAL);
904 tsp = &ts;
905 }
906
907 if (SCARG(uap, mask) != NULL) {
908 if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
909 return (error);
910 ssp = &ss;
911 }
912
913 return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
914 retval));
915 }
916
917 int
918 doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
919 struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
920 {
921 struct kqueue_scan_state scan;
922 struct timespec zerots = {};
923 struct pollfd pfds[4], *pl = pfds;
924 int error, ncollected = 0, nevents = 0;
925 size_t sz;
926
927 /* Standards say no more than OPEN_MAX; this is possibly better. */
928 if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
929 return (EINVAL);
930
931 /* optimize for the default case, of a small nfds value */
932 if (nfds > nitems(pfds)) {
933 pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
934 M_WAITOK | M_CANFAIL);
935 if (pl == NULL)
936 return (EINVAL);
937 }
938
939 kqpoll_init(nfds);
940
941 sz = nfds * sizeof(*pl);
942
943 if ((error = copyin(fds, pl, sz)) != 0)
944 goto bad;
945
946 if (sigmask)
947 dosigsuspend(p, *sigmask &~ sigcantmask);
948
949 /* Register kqueue events */
950 ppollregister(p, pl, nfds, &nevents, &ncollected);
951
952 /*
953 * The poll/select family of syscalls has been designed to
954 * block when file descriptors are not available, even if
955 * there's nothing to wait for.
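 * A poll(NULL, 0, msec) call, another common portable-sleep idiom,
 * takes this path: nothing is registered and the thread just sleeps
 * for roughly msec milliseconds (clamped to MAXTSLP) below.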
956 */
957 if (nevents == 0 && ncollected == 0) {
958 uint64_t nsecs = INFSLP;
959
960 if (timeout != NULL) {
961 if (!timespecisset(timeout))
962 goto done;
963 nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
964 }
965
966 error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
967 if (error == ERESTART)
968 error = EINTR;
969 if (error == EWOULDBLOCK)
970 error = 0;
971 goto done;
972 }
973
974 /* Do not block if registering found pending events. */
975 if (ncollected > 0)
976 timeout = &zerots;
977
978 /* Collect at most `nevents' possibly waiting in kqueue_scan() */
979 kqueue_scan_setup(&scan, p->p_kq);
980 while (nevents > 0) {
981 struct kevent kev[KQ_NEVENTS];
982 int i, ready, count;
983
984 /* Maximum number of events per iteration */
985 count = MIN(nitems(kev), nevents);
986 ready = kqueue_scan(&scan, count, kev, timeout, p, &error);
987
988 /* Convert back events that are ready. */
989 for (i = 0; i < ready; i++)
990 ncollected += ppollcollect(p, &kev[i], pl, nfds);
991
992 /*
993 * Stop if there was an error or if we had enough
994 * space to collect all events that were ready.
995 */
996 if (error || ready < count)
997 break;
998
999 nevents -= ready;
1000 }
1001 kqueue_scan_finish(&scan);
1002 *retval = ncollected;
1003 done:
1004 /*
1005 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
1006 * ignored (since the whole point is to see what would block).
1007 */
1008 switch (error) {
1009 case EINTR:
1010 error = pollout(pl, fds, nfds);
1011 if (error == 0)
1012 error = EINTR;
1013 break;
1014 case EWOULDBLOCK:
1015 case 0:
1016 error = pollout(pl, fds, nfds);
1017 break;
1018 }
1019 #ifdef KTRACE
1020 if (KTRPOINT(p, KTR_STRUCT))
1021 ktrpollfd(p, pl, nfds);
1022 #endif /* KTRACE */
1023 bad:
1024 if (pl != pfds)
1025 free(pl, M_TEMP, sz);
1026
1027 kqpoll_done(nfds);
1028
1029 return (error);
1030 }
1031
1032 int
1033 ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
1034 struct pollfd *pl, unsigned int pollid)
1035 {
1036 int i, error, nevents = 0;
1037
1038 KASSERT(pl->revents == 0);
1039
1040 for (i = 0; i < nkev; i++, kevp++) {
1041 again:
1042 error = kqueue_register(p->p_kq, kevp, pollid, p);
1043 switch (error) {
1044 case 0:
1045 nevents++;
1046 break;
1047 case EOPNOTSUPP:/* No underlying kqfilter */
1048 case EINVAL: /* Unimplemented filter */
1049 break;
1050 case EBADF: /* Bad file descriptor */
1051 pl->revents |= POLLNVAL;
1052 break;
1053 case EPERM: /* Specific to FIFO */
1054 KASSERT(kevp->filter == EVFILT_WRITE);
1055 if (nkev == 1) {
1056 /*
1057 * If this is the only filter make sure
1058 * POLLHUP is passed to userland.
1059 */
1060 kevp->filter = EVFILT_EXCEPT;
1061 goto again;
1062 }
1063 break;
1064 default:
1065 DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
1066 " %lu filt %d ERROR=%d\n",
1067 ((unsigned long)kevp->udata - p->p_kq_serial),
1068 pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
1069 error);
1070 /* FALLTHROUGH */
1071 case ENXIO: /* Device has been detached */
1072 pl->revents |= POLLERR;
1073 break;
1074 }
1075 }
1076
1077 return (nevents);
1078 }
1079
1080 /*
1081 * Convert pollfd into kqueue events and register them on the
1082 * per-thread queue.
1083 *
1084 * At most 3 events can correspond to a single pollfd.
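 *
 * For example, a pollfd asking for POLLIN|POLLOUT|POLLPRI is turned into
 * an EVFILT_READ, an EVFILT_WRITE and an EVFILT_EXCEPT (NOTE_OOB) kevent,
 * while a pollfd with no requested events still gets one EVFILT_EXCEPT
 * kevent so that POLLHUP can be reported.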
1085 */
1086 void
1087 ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered,
1088 int *ncollected)
1089 {
1090 int i, nkev, nevt, forcehup;
1091 struct kevent kev[3], *kevp;
1092
1093 for (i = 0; i < nfds; i++) {
1094 pl[i].events &= ~POLL_NOHUP;
1095 pl[i].revents = 0;
1096
1097 if (pl[i].fd < 0)
1098 continue;
1099
1100 /*
1101 * POLLHUP checking is implicit in the event filters.
1102 * However, the checking must be even if no events are
1103 * requested.
1104 */
1105 forcehup = ((pl[i].events & ~POLLHUP) == 0);
1106
1107 DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
1108 i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);
1109
1110 nevt = 0;
1111 nkev = 0;
1112 kevp = kev;
1113 if (pl[i].events & (POLLIN | POLLRDNORM)) {
1114 EV_SET(kevp, pl[i].fd, EVFILT_READ,
1115 EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
1116 (void *)(p->p_kq_serial + i));
1117 nkev++;
1118 kevp++;
1119 }
1120 if (pl[i].events & (POLLOUT | POLLWRNORM)) {
1121 EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
1122 EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
1123 (void *)(p->p_kq_serial + i));
1124 nkev++;
1125 kevp++;
1126 }
1127 if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
1128 int evff = forcehup ? 0 : NOTE_OOB;
1129
1130 EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
1131 EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
1132 (void *)(p->p_kq_serial + i));
1133 nkev++;
1134 kevp++;
1135 }
1136
1137 if (nkev == 0)
1138 continue;
1139
1140 *nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i);
1141
1142 if (pl[i].revents != 0)
1143 (*ncollected)++;
1144 }
1145
1146 DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered,
1147 *ncollected);
1148 }
1149
1150 /*
1151 * Convert given kqueue event into corresponding poll(2) revents bit.
1152 */
1153 int
1154 ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl, u_int nfds)
1155 {
1156 static struct timeval poll_errintvl = { 5, 0 };
1157 static struct timeval poll_lasterr;
1158 int already_seen;
1159 unsigned long i;
1160
1161 /* Extract poll array index */
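/*
 * ppollregister() stored p_kq_serial + (array index) in each kevent's
 * udata, so subtracting the serial recovers the index; e.g. with
 * p_kq_serial == 0x100, the kevent registered for the third pollfd
 * carries udata 0x102 and yields i == 2.  Anything outside [0, nfds)
 * indicates a bug, hence the panics below.
 */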
1162 i = (unsigned long)kevp->udata - p->p_kq_serial;
1163
1164 if (i >= nfds) {
1165 panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
1166 __func__, kevp, nfds,
1167 (unsigned long)kevp->udata, p->p_kq_serial);
1168 }
1169 if ((int)kevp->ident != pl[i].fd) {
1170 panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
1171 __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
1172 p->p_kq_serial);
1173 }
1174
1175 /*
1176 * A given descriptor may already have generated an error
1177 * against another filter during kqueue_register().
1178 *
1179 * Make sure to set the appropriate flags but do not
1180 * increment `*retval' more than once.
1181 */
1182 already_seen = (pl[i].revents != 0);
1183
1184 /* POLLNVAL preempts other events. */
1185 if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) {
1186 pl[i].revents = POLLNVAL;
1187 goto done;
1188 } else if (pl[i].revents & POLLNVAL) {
1189 goto done;
1190 }
1191
1192 switch (kevp->filter) {
1193 case EVFILT_READ:
1194 if (kevp->flags & __EV_HUP)
1195 pl[i].revents |= POLLHUP;
1196 if (pl[i].events & (POLLIN | POLLRDNORM))
1197 pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
1198 break;
1199 case EVFILT_WRITE:
1200 /* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
1201 if (kevp->flags & __EV_HUP) {
1202 pl[i].revents |= POLLHUP;
1203 } else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
1204 pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
1205 }
1206 break;
1207 case EVFILT_EXCEPT:
1208 if (kevp->flags & __EV_HUP) {
1209 if (pl[i].events != 0 && pl[i].events != POLLOUT)
1210 DPRINTFN(0, "weird events %x\n", pl[i].events);
1211 pl[i].revents |= POLLHUP;
1212 break;
1213 }
1214 if (pl[i].events & (POLLPRI | POLLRDBAND))
1215 pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
1216 break;
1217 default:
1218 KASSERT(0);
1219 }
1220
1221 done:
1222 DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
1223 i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
1224 kevp->filter);
1225
1226 /*
1227 * Make noise about unclaimed events as they might indicate a bug
1228 * and can result in spurious-looking wakeups of poll(2).
1229 *
1230 * Live-locking within the system call should not happen because
1231 * the scan loop in doppoll() has an upper limit for the number
1232 * of events to process.
1233 */
1234 if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) {
1235 printf("%s[%d]: poll index %lu fd %d events 0x%x "
1236 "filter %d/0x%x unclaimed\n",
1237 p->p_p->ps_comm, p->p_tid, i, pl[i].fd,
1238 pl[i].events, kevp->filter, kevp->flags);
1239 }
1240
1241 if (!already_seen && (pl[i].revents != 0))
1242 return (1);
1243
1244 return (0);
1245 }
1246
1247 /*
1248 * utrace system call
1249 */
1250 int
1251 sys_utrace(struct proc *curp, void *v, register_t *retval)
1252 {
1253 #ifdef KTRACE
1254 struct sys_utrace_args /* {
1255 syscallarg(const char *) label;
1256 syscallarg(const void *) addr;
1257 syscallarg(size_t) len;
1258 } */ *uap = v;
1259
1260 return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
1261 SCARG(uap, len)));
1262 #else
1263 return (0);
1264 #endif
1265 }
1266