/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/event.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/malloc.h>
#include <sys/mapped_ioctl.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/socketops.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/buf.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>

#include <sys/file2.h>
#include <sys/mplock2.h>
#include <sys/spinlock2.h>

#include <machine/limits.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

typedef struct kfd_set {
	fd_mask	fds_bits[2];
} kfd_set;

enum select_copyin_states {
	COPYIN_READ, COPYIN_WRITE, COPYIN_EXCEPT, COPYIN_DONE
};

struct select_kevent_copyin_args {
	kfd_set		*read_set;
	kfd_set		*write_set;
	kfd_set		*except_set;
	int		active_set;	/* One of select_copyin_states */
	struct lwp	*lwp;		/* Pointer to our lwp */
	int		num_fds;	/* Number of file descriptors (syscall arg) */
	int		proc_fds;	/* Processed fd's (wraps) */
	int		error;		/* Returned to userland */
};

struct poll_kevent_copyin_args {
	struct lwp	*lwp;
	struct pollfd	*fds;
	int		nfds;
	int		pfds;
	int		error;
};

static struct lwkt_token mioctl_token = LWKT_TOKEN_INITIALIZER(mioctl_token);

static int	doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
			 struct timespec *ts, int *res);
static int	dopoll(int nfds, struct pollfd *fds, struct timespec *ts,
		       int *res, int flags);
static int	dofileread(int, struct file *, struct uio *, int, size_t *);
static int	dofilewrite(int, struct file *, struct uio *, int, size_t *);
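/*
 * Overview: the select() and poll() syscalls below are implemented on
 * top of the kqueue subsystem.  doselect() and dopoll() hand the
 * descriptor sets to kern_kevent() together with copyin/copyout
 * callbacks which translate fd_set bits and pollfd entries to and from
 * kevents.  The lwp's lwp_kqueue_serial value is encoded into each
 * kevent's udata field so that stale events registered by a previous
 * call can be detected and deleted on a later pass.
 */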
/*
 * Read system call.
 *
 * MPSAFE
 */
int
sys_read(struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);
	return(error);
}

/*
 * Positioned (Pread) read system call
 *
 * MPSAFE
 */
int
sys_extpread(struct extpread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);
	return(error);
}

/*
 * Scatter read system call.
 *
 * MPSAFE
 */
int
sys_readv(struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}

/*
 * Scatter positioned read system call.
 *
 * MPSAFE
 */
int
sys_extpreadv(struct extpreadv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * MPSAFE
 */
int
kern_preadv(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofileread(fd, fp, auio, flags, res);
	}
	fdrop(fp);
	return(error);
}
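/*
 * Usage sketch for the positioned read path (userland, illustrative
 * only; fd and buf are hypothetical).  Per the O_FOFFSET handling
 * above, an offset of -1 makes extpread() behave like read(), i.e.
 * the descriptor's current file offset is used and updated:
 *
 *	char buf[512];
 *	ssize_t n;
 *
 *	n = extpread(fd, buf, sizeof(buf), 0, 4096);	// at offset 4096
 *	n = extpread(fd, buf, sizeof(buf), 0, -1);	// at current offset
 */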
/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	int error;
	size_t len;
#ifdef KTRACE
	struct thread *td = curthread;
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error);
		}
		kfree(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Write system call
 *
 * MPSAFE
 */
int
sys_write(struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	return(error);
}

/*
 * Pwrite system call
 *
 * MPSAFE
 */
int
sys_extpwrite(struct extpwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	if ((ssize_t)uap->nbyte < 0)
		return(EINVAL);

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;
	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);
	return(error);
}

/*
 * Gather write system call.
 *
 * MPSAFE
 */
int
sys_writev(struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return (error);
}
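/*
 * Usage sketch for the gather write path (userland, illustrative only;
 * fd, hdr and body are hypothetical).  writev() writes the buffers in
 * order as a single write:
 *
 *	struct iovec iov[2];
 *
 *	iov[0].iov_base = hdr;
 *	iov[0].iov_len = hdrlen;
 *	iov[1].iov_base = body;
 *	iov[1].iov_len = bodylen;
 *	nwritten = writev(fd, iov, 2);
 */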
/*
 * Gather positioned write system call
 *
 * MPSAFE
 */
int
sys_extpwritev(struct extpwritev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * MPSAFE
 */
int
kern_pwritev(int fd, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}

	fdrop(fp);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	int error;
	size_t len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
			lwpsignal(lp->lwp_proc, lp, SIGPIPE);
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			ktrgenio(lp, fd, UIO_WRITE, &ktruio, error);
		}
		kfree(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Ioctl system call
 *
 * MPSAFE
 */
int
sys_ioctl(struct ioctl_args *uap)
{
	int error;

	error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, &uap->sysmsg);
	return (error);
}

struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};
/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
 *
 * MPSAFE
 */
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map,
	     struct sysmsg *msg)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct ucred *cred;
	struct file *fp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	union {
		char stkbuf[STK_PARAMS];
		long align;
	} ubuf;

	KKASSERT(p);
	cred = td->td_ucred;
	memp = NULL;

	fp = holdfp(p->p_fd, fd, FREAD|FWRITE);
	if (fp == NULL)
		return(EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		lwkt_gettoken(&mioctl_token);
		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}
		lwkt_reltoken(&mioctl_token);

		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0 &&
		     iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
				map->sys, fd, maskcmd,
				(int)((maskcmd >> 8) & 0xff),
				(int)(maskcmd & 0xff));
			error = EINVAL;
			goto done;
		}

		/*
		 * If it's a non-range one to one mapping, maptocmd should be
		 * correct.  If it's a ranged one to one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl.  Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, com);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
					map->sys, fd, maskcmd,
					(int)((maskcmd >> 8) & 0xff),
					(int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}

	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
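	/*
	 * (Command word layout, per <sys/ioccom.h>: the low 8 bits hold
	 * the command number, bits 8-15 the group character, the next 13
	 * bits the parameter length extracted by IOCPARM_LEN(), and the
	 * top bits the IOC_VOID/IOC_OUT/IOC_IN direction flags tested
	 * below.)
	 */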
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	if ((com & IOC_VOID) == 0 && size > sizeof(ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		memp = NULL;
		data = ubuf.stkbuf;
	}
	if (com & IOC_VOID) {
		*(caddr_t *)data = uspc_data;
	} else if (com & IOC_IN) {
		if (size != 0) {
			error = copyin(uspc_data, data, (size_t)size);
			if (error)
				goto done;
		} else {
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, (size_t)size);
	}

	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clear_int(&fp->f_flag, FNONBLOCK);
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			atomic_set_int(&fp->f_flag, FASYNC);
		else
			atomic_clear_int(&fp->f_flag, FASYNC);
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg);
		break;

	default:
		/*
		 * If there is an override function,
		 * call it instead of directly routing the call.
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred, msg);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (size_t)size);
		break;
	}
done:
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
	fdrop(fp);
	return(error);
}

/*
 * MPSAFE
 */
int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP,
		     M_WAITOK | M_ZERO);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	lwkt_gettoken(&mioctl_token);
	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);
	lwkt_reltoken(&mioctl_token);

	return(0);
}

/*
 * MPSAFE
 */
int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;
	int error = EINVAL;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	lwkt_gettoken(&mioctl_token);
	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges == he->cmd_ranges) {
			LIST_REMOVE(ne, entries);
			kfree(ne, M_IOCTLMAP);
			error = 0;
			break;
		}
	}
	lwkt_reltoken(&mioctl_token);
	return(error);
}
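/*
 * Registration sketch (hypothetical subsystem; the foo_* and FOO_* /
 * NATIVE_* names are invented for illustration).  An emulation layer
 * describes its command translations with an ioctl_map_range table
 * terminated by an all-zero sentinel, which is what the matching loop
 * in mapped_ioctl() stops on, and registers the table once at init:
 *
 *	static struct ioctl_map_range foo_cmds[] = {
 *		{ .start = FOO_GETVER, .end = FOO_GETVER,
 *		  .maptocmd = NATIVE_GETVER, .maptoend = NATIVE_GETVER },
 *		{ 0 }
 *	};
 *	static struct ioctl_map_handler foo_handler = {
 *		.map = &foo_ioctl_map,
 *		.subsys = "foo",
 *		.cmd_ranges = foo_cmds
 *	};
 *
 *	mapped_ioctl_register_handler(&foo_handler);
 */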
static int	nselcoll;	/* Select collisions since boot */
int	selwait;
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
static int	nseldebug;
SYSCTL_INT(_kern, OID_AUTO, nseldebug, CTLFLAG_RW, &nseldebug, 0, "");

/*
 * Select system call.
 *
 * MPSAFE
 */
int
sys_select(struct select_args *uap)
{
	struct timeval ktv;
	struct timespec *ktsp, kts;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &ktv, sizeof (ktv));
		if (error)
			return (error);
		TIMEVAL_TO_TIMESPEC(&ktv, &kts);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Do real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &uap->sysmsg_result);

	return (error);
}
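/*
 * Usage sketch (userland, illustrative only; sock is hypothetical):
 * the classic select() loop waiting up to five seconds for a
 * descriptor to become readable:
 *
 *	fd_set rset;
 *	struct timeval tv = { 5, 0 };
 *
 *	FD_ZERO(&rset);
 *	FD_SET(sock, &rset);
 *	if (select(sock + 1, &rset, NULL, NULL, &tv) > 0 &&
 *	    FD_ISSET(sock, &rset)) {
 *		... sock is readable ...
 *	}
 */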
/*
 * Pselect system call.
 */
int
sys_pselect(struct pselect_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec *ktsp, kts;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof (kts));
		if (error)
			return (error);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lwkt_gettoken(&lp->lwp_proc->p_token);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	/*
	 * Do real job.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
			 &uap->sysmsg_result);

	if (uap->sigmask != NULL) {
		lwkt_gettoken(&lp->lwp_proc->p_token);
		/* doselect() responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flags |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run.  Restore previous mask
			 * immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
		}
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	return (error);
}

static int
select_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct select_kevent_copyin_args *skap = NULL;
	struct kevent *kev;
	int fd;
	kfd_set *fdp = NULL;
	short filter = 0;
	u_int fflags = 0;

	skap = (struct select_kevent_copyin_args *)arg;

	if (*events == maxevents)
		return (0);

	while (skap->active_set < COPYIN_DONE) {
		switch (skap->active_set) {
		case COPYIN_READ:
			/*
			 * Register descriptors for the read filter
			 */
			fdp = skap->read_set;
			filter = EVFILT_READ;
			fflags = NOTE_OLDAPI;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_WRITE:
			/*
			 * Register descriptors for the write filter
			 */
			fdp = skap->write_set;
			filter = EVFILT_WRITE;
			fflags = NOTE_OLDAPI;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_EXCEPT:
			/*
			 * Register descriptors for the exception filter
			 */
			fdp = skap->except_set;
			filter = EVFILT_EXCEPT;
			fflags = NOTE_OLDAPI | NOTE_OOB;
			if (fdp)
				break;
			++skap->active_set;
			skap->proc_fds = 0;
			/* fall through */
		case COPYIN_DONE:
			/*
			 * Nothing left to register
			 */
			return(0);
			/* NOT REACHED */
		}

		while (skap->proc_fds < skap->num_fds) {
			fd = skap->proc_fds;
			if (FD_ISSET(fd, fdp)) {
				kev = &kevp[*events];
				EV_SET(kev, fd, filter,
				       EV_ADD|EV_ENABLE,
				       fflags, 0,
				       (void *)(uintptr_t)
					skap->lwp->lwp_kqueue_serial);
				FD_CLR(fd, fdp);
				++*events;

				if (nseldebug)
					kprintf("select fd %d filter %d serial %d\n",
						fd, filter,
						skap->lwp->lwp_kqueue_serial);
			}
			++skap->proc_fds;
			if (*events == maxevents)
				return (0);
		}
		skap->active_set++;
		skap->proc_fds = 0;
	}

	return (0);
}
static int
select_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct select_kevent_copyin_args *skap;
	struct kevent kev;
	int i = 0;

	skap = (struct select_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		/*
		 * Filter out and delete spurious events
		 */
		if ((u_int)(uintptr_t)kevp[i].udata !=
		    skap->lwp->lwp_kqueue_serial) {
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			kqueue_register(&skap->lwp->lwp_kqueue, &kev);
			if (nseldebug)
				kprintf("select fd %ju mismatched serial %d\n",
					(uintmax_t)kevp[i].ident,
					skap->lwp->lwp_kqueue_serial);
			continue;
		}

		/*
		 * Handle errors
		 */
		if (kevp[i].flags & EV_ERROR) {
			int error = kevp[i].data;

			switch (error) {
			case EBADF:
				/*
				 * A bad file descriptor is considered a
				 * fatal error for select, bail out.
				 */
				skap->error = error;
				*res = -1;
				return error;

			default:
				/*
				 * Select silently swallows any unknown errors
				 * for descriptors in the read or write sets.
				 *
				 * ALWAYS filter out EOPNOTSUPP errors from
				 * filters (at least until all filters support
				 * EVFILT_EXCEPT).
				 *
				 * We also filter out ENODEV since dev_dkqfilter
				 * returns ENODEV if EOPNOTSUPP is returned in
				 * an inner call.
				 *
				 * XXX: fix this
				 */
				if (kevp[i].filter != EVFILT_READ &&
				    kevp[i].filter != EVFILT_WRITE &&
				    error != EOPNOTSUPP &&
				    error != ENODEV) {
					skap->error = error;
					*res = -1;
					return error;
				}
				break;
			}
			if (nseldebug)
				kprintf("select fd %ju filter %d error %d\n",
					(uintmax_t)kevp[i].ident,
					kevp[i].filter, error);
			continue;
		}

		switch (kevp[i].filter) {
		case EVFILT_READ:
			FD_SET(kevp[i].ident, skap->read_set);
			break;
		case EVFILT_WRITE:
			FD_SET(kevp[i].ident, skap->write_set);
			break;
		case EVFILT_EXCEPT:
			FD_SET(kevp[i].ident, skap->except_set);
			break;
		}

		++*res;
	}

	return (0);
}

/*
 * Copy select bits in from userland.  Allocate kernel memory if the
 * set is large.
 */
static int
getbits(int bytes, fd_set *in_set, kfd_set **out_set, kfd_set *tmp_set)
{
	int error;

	if (in_set) {
		if (bytes < sizeof(*tmp_set))
			*out_set = tmp_set;
		else
			*out_set = kmalloc(bytes, M_SELECT, M_WAITOK);
		error = copyin(in_set, *out_set, bytes);
	} else {
		*out_set = NULL;
		error = 0;
	}
	return (error);
}

/*
 * Copy returned select bits back out to userland.
 */
static int
putbits(int bytes, kfd_set *in_set, fd_set *out_set)
{
	int error;

	if (in_set) {
		error = copyout(in_set, out_set, bytes);
	} else {
		error = 0;
	}
	return (error);
}

static int
dotimeout_only(struct timespec *ts)
{
	return(nanosleep1(ts, NULL));
}
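/*
 * Note that doselect() and dopoll() below route a zero descriptor
 * count with a valid timeout through dotimeout_only(), turning the
 * call into a pure sleep.  This supports the traditional userland
 * idiom of using select() as a portable sub-second delay:
 *
 *	struct timeval tv = { 0, 500000 };	// 500 msec
 *
 *	select(0, NULL, NULL, NULL, &tv);
 */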
/*
 * Common code for sys_select() and sys_pselect().
 *
 * in, out and ex are userland pointers.  ts must point to validated
 * kernel-side timeout value or NULL for infinite timeout.  res must
 * point to syscall return value.
 */
static int
doselect(int nd, fd_set *read, fd_set *write, fd_set *except,
	 struct timespec *ts, int *res)
{
	struct proc *p = curproc;
	struct select_kevent_copyin_args *kap, ka;
	int bytes, error;
	kfd_set read_tmp;
	kfd_set write_tmp;
	kfd_set except_tmp;

	*res = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd == 0 && ts)
		return (dotimeout_only(ts));

	if (nd > p->p_fd->fd_nfiles)		/* limit kmalloc */
		nd = p->p_fd->fd_nfiles;

	kap = &ka;
	kap->lwp = curthread->td_lwp;
	kap->num_fds = nd;
	kap->proc_fds = 0;
	kap->error = 0;
	kap->active_set = COPYIN_READ;

	/*
	 * Calculate bytes based on the number of __fd_mask[] array entries
	 * multiplied by the size of __fd_mask.
	 */
	bytes = howmany(nd, __NFDBITS) * sizeof(__fd_mask);

	/* kap->read_set = NULL; not needed */
	kap->write_set = NULL;
	kap->except_set = NULL;

	error = getbits(bytes, read, &kap->read_set, &read_tmp);
	if (error == 0)
		error = getbits(bytes, write, &kap->write_set, &write_tmp);
	if (error == 0)
		error = getbits(bytes, except, &kap->except_set, &except_tmp);
	if (error)
		goto done;

	/*
	 * NOTE: Make sure the max events passed to kern_kevent() is
	 *	 effectively unlimited; passing 0x7FFFFFFF accomplishes this.
	 *
	 *	 (*res) continues to increment as returned events are
	 *	 loaded in.
	 */
	error = kern_kevent(&kap->lwp->lwp_kqueue, 0x7FFFFFFF, res, kap,
			    select_copyin, select_copyout, ts, 0);
	if (error == 0)
		error = putbits(bytes, kap->read_set, read);
	if (error == 0)
		error = putbits(bytes, kap->write_set, write);
	if (error == 0)
		error = putbits(bytes, kap->except_set, except);

	/*
	 * An error from an individual event that should be passed
	 * back to userland (EBADF)
	 */
	if (kap->error)
		error = kap->error;

	/*
	 * Clean up.
	 */
done:
	if (kap->read_set && kap->read_set != &read_tmp)
		kfree(kap->read_set, M_SELECT);
	if (kap->write_set && kap->write_set != &write_tmp)
		kfree(kap->write_set, M_SELECT);
	if (kap->except_set && kap->except_set != &except_tmp)
		kfree(kap->except_set, M_SELECT);

	kap->lwp->lwp_kqueue_serial += kap->num_fds;

	return (error);
}

/*
 * Poll system call.
 *
 * MPSAFE
 */
int
sys_poll(struct poll_args *uap)
{
	struct timespec ts, *tsp;
	int error;

	if (uap->timeout != INFTIM) {
		if (uap->timeout < 0)
			return (EINVAL);
		ts.tv_sec = uap->timeout / 1000;
		ts.tv_nsec = (uap->timeout % 1000) * 1000 * 1000;
		tsp = &ts;
	} else {
		tsp = NULL;
	}

	error = dopoll(uap->nfds, uap->fds, tsp, &uap->sysmsg_result, 0);

	return (error);
}
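/*
 * Usage sketch (userland, illustrative only; sock is hypothetical):
 * the millisecond timeout above is converted to a timespec, with
 * INFTIM (-1) selecting an infinite (NULL) timeout:
 *
 *	struct pollfd pfd;
 *
 *	pfd.fd = sock;
 *	pfd.events = POLLIN;
 *	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN)) {
 *		... sock is readable ...
 *	}
 */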
/*
 * Ppoll system call.
 *
 * MPSAFE
 */
int
sys_ppoll(struct ppoll_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec *ktsp, kts;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof (kts));
		if (error)
			return (error);
		ktsp = &kts;
	} else {
		ktsp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lwkt_gettoken(&lp->lwp_proc->p_token);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	error = dopoll(uap->nfds, uap->fds, ktsp, &uap->sysmsg_result,
		       ktsp != NULL ? KEVENT_TIMEOUT_PRECISE : 0);

	if (uap->sigmask != NULL) {
		lwkt_gettoken(&lp->lwp_proc->p_token);
		/* dopoll() responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flags |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run.  Restore previous mask
			 * immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
		}
		lwkt_reltoken(&lp->lwp_proc->p_token);
	}

	return (error);
}

static int
poll_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent *kev;
	int kev_count;

	pkap = (struct poll_kevent_copyin_args *)arg;

	while (pkap->pfds < pkap->nfds) {
		pfd = &pkap->fds[pkap->pfds];

		/* Clear return events */
		pfd->revents = 0;

		/* Do not check if fd is equal to -1 */
		if (pfd->fd == -1) {
			++pkap->pfds;
			continue;
		}

		kev_count = 0;
		if (pfd->events & (POLLIN | POLLRDNORM))
			kev_count++;
		if (pfd->events & (POLLOUT | POLLWRNORM))
			kev_count++;
		if (pfd->events & (POLLPRI | POLLRDBAND))
			kev_count++;

		if (*events + kev_count > maxevents)
			return (0);

		/*
		 * NOTE: A combined serial number and poll array index is
		 *	 stored in kev->udata.
		 */
		kev = &kevp[*events];
		if (pfd->events & (POLLIN | POLLRDNORM)) {
			EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI, 0, (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}
		if (pfd->events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kev++, pfd->fd, EVFILT_WRITE, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI, 0, (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}
		if (pfd->events & (POLLPRI | POLLRDBAND)) {
			EV_SET(kev++, pfd->fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE,
			       NOTE_OLDAPI | NOTE_OOB, 0,
			       (void *)(uintptr_t)
				(pkap->lwp->lwp_kqueue_serial + pkap->pfds));
		}

		if (nseldebug) {
			kprintf("poll index %d/%d fd %d events %08x serial %d\n",
				pkap->pfds, pkap->nfds - 1, pfd->fd,
				pfd->events, pkap->lwp->lwp_kqueue_serial);
		}

		++pkap->pfds;
		(*events) += kev_count;
	}

	return (0);
}
static int
poll_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	struct poll_kevent_copyin_args *pkap;
	struct pollfd *pfd;
	struct kevent kev;
	int count_res;
	int i;
	u_int pi;

	pkap = (struct poll_kevent_copyin_args *)arg;

	for (i = 0; i < count; ++i) {
		/*
		 * Extract the poll array index and delete spurious events.
		 * We can easily tell if the serial number is incorrect
		 * by checking whether the extracted index is out of range.
		 */
		pi = (u_int)(uintptr_t)kevp[i].udata -
		     (u_int)pkap->lwp->lwp_kqueue_serial;

		if (pi >= pkap->nfds) {
			kev = kevp[i];
			kev.flags = EV_DISABLE|EV_DELETE;
			kqueue_register(&pkap->lwp->lwp_kqueue, &kev);
			if (nseldebug)
				kprintf("poll index %d out of range against serial %d\n",
					pi, pkap->lwp->lwp_kqueue_serial);
			continue;
		}
		pfd = &pkap->fds[pi];
		if (kevp[i].ident == pfd->fd) {
			/*
			 * A single descriptor may generate an error against
			 * more than one filter, make sure to set the
			 * appropriate flags but do not increment (*res)
			 * more than once.
			 */
			count_res = (pfd->revents == 0);
			if (kevp[i].flags & EV_ERROR) {
				switch(kevp[i].data) {
				case EBADF:
				case POLLNVAL:
					/* Bad file descriptor */
					if (count_res)
						++*res;
					pfd->revents |= POLLNVAL;
					break;
				default:
					/*
					 * Poll silently swallows any unknown
					 * errors except in the case of POLLPRI
					 * (OOB/urgent data).
					 *
					 * ALWAYS filter out EOPNOTSUPP errors
					 * from filters, common applications
					 * set POLLPRI|POLLRDBAND and most
					 * filters do not support EVFILT_EXCEPT.
					 *
					 * We also filter out ENODEV since
					 * dev_dkqfilter returns ENODEV if
					 * EOPNOTSUPP is returned in an
					 * inner call.
					 *
					 * XXX: fix this
					 */
					if (kevp[i].filter != EVFILT_READ &&
					    kevp[i].filter != EVFILT_WRITE &&
					    kevp[i].data != EOPNOTSUPP &&
					    kevp[i].data != ENODEV) {
						if (count_res)
							++*res;
						pfd->revents |= POLLERR;
					}
					break;
				}
				if (nseldebug) {
					kprintf("poll index %d fd %d "
						"filter %d error %jd\n",
						pi, pfd->fd,
						kevp[i].filter,
						(intmax_t)kevp[i].data);
				}
				continue;
			}

			switch (kevp[i].filter) {
			case EVFILT_READ:
#if 0
				/*
				 * NODATA on the read side can indicate a
				 * half-closed situation and not necessarily
				 * a disconnect, so depend on the user
				 * issuing a read() and getting 0 bytes back.
				 */
				if (kevp[i].flags & EV_NODATA)
					pfd->revents |= POLLHUP;
#endif
				if ((kevp[i].flags & EV_EOF) &&
				    kevp[i].fflags != 0)
					pfd->revents |= POLLERR;
				if (pfd->events & POLLIN)
					pfd->revents |= POLLIN;
				if (pfd->events & POLLRDNORM)
					pfd->revents |= POLLRDNORM;
				break;
			case EVFILT_WRITE:
				/*
				 * As per the OpenGroup POLLHUP is mutually
				 * exclusive with the writability flags.  I
				 * consider this a bit broken but...
				 *
				 * In this case a disconnect is implied even
				 * for a half-closed (write side) situation.
				 */
				if (kevp[i].flags & EV_EOF) {
					pfd->revents |= POLLHUP;
					if (kevp[i].fflags != 0)
						pfd->revents |= POLLERR;
				} else {
					if (pfd->events & POLLOUT)
						pfd->revents |= POLLOUT;
					if (pfd->events & POLLWRNORM)
						pfd->revents |= POLLWRNORM;
				}
				break;
			case EVFILT_EXCEPT:
				/*
				 * EV_NODATA should never be tagged for this
				 * filter.
				 */
				if (pfd->events & POLLPRI)
					pfd->revents |= POLLPRI;
				if (pfd->events & POLLRDBAND)
					pfd->revents |= POLLRDBAND;
				break;
			}

			if (nseldebug) {
				kprintf("poll index %d/%d fd %d revents %08x\n",
					pi, pkap->nfds, pfd->fd, pfd->revents);
			}

			if (count_res && pfd->revents)
				++*res;
		} else {
			if (nseldebug) {
				kprintf("poll index %d mismatch %ju/%d\n",
					pi, (uintmax_t)kevp[i].ident, pfd->fd);
			}
		}
	}

	return (0);
}
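/*
 * Summary of the translation performed by poll_copyout() above:
 * EVFILT_READ maps to POLLIN/POLLRDNORM, EVFILT_WRITE maps to
 * POLLOUT/POLLWRNORM (or POLLHUP when EV_EOF is set, since the
 * OpenGroup requires POLLHUP to be mutually exclusive with the
 * writability flags), and EVFILT_EXCEPT maps to POLLPRI/POLLRDBAND.
 * EV_ERROR events become POLLNVAL for bad descriptors and POLLERR
 * otherwise.
 */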
static int
dopoll(int nfds, struct pollfd *fds, struct timespec *ts, int *res, int flags)
{
	struct poll_kevent_copyin_args ka;
	struct pollfd sfds[64];
	int bytes;
	int error;

	*res = 0;
	if (nfds < 0)
		return (EINVAL);

	if (nfds == 0 && ts)
		return (dotimeout_only(ts));

	/*
	 * This is a bit arbitrary but we need to limit internal kmallocs.
	 */
	if (nfds > maxfilesperproc * 2)
		nfds = maxfilesperproc * 2;
	bytes = sizeof(struct pollfd) * nfds;

	ka.lwp = curthread->td_lwp;
	ka.nfds = nfds;
	ka.pfds = 0;
	ka.error = 0;

	if (ka.nfds < 64)
		ka.fds = sfds;
	else
		ka.fds = kmalloc(bytes, M_SELECT, M_WAITOK);

	error = copyin(fds, ka.fds, bytes);
	if (error == 0)
		error = kern_kevent(&ka.lwp->lwp_kqueue, 0x7FFFFFFF, res, &ka,
				    poll_copyin, poll_copyout, ts, flags);

	if (error == 0)
		error = copyout(ka.fds, fds, bytes);

	if (ka.fds != sfds)
		kfree(ka.fds, M_SELECT);

	ka.lwp->lwp_kqueue_serial += nfds;

	return (error);
}

static int
socket_wait_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
{
	return (0);
}

static int
socket_wait_copyout(void *arg, struct kevent *kevp, int count, int *res)
{
	++*res;
	return (0);
}

extern	struct fileops socketops;

/*
 * NOTE: Callers of socket_wait() must already have a reference on the
 *	 socket.
 */
int
socket_wait(struct socket *so, struct timespec *ts, int *res)
{
	struct thread *td = curthread;
	struct file *fp;
	struct kqueue kq;
	struct kevent kev;
	int error, fd;

	if ((error = falloc(td->td_lwp, &fp, &fd)) != 0)
		return (error);

	fp->f_type = DTYPE_SOCKET;
	fp->f_flag = FREAD | FWRITE;
	fp->f_ops = &socketops;
	fp->f_data = so;
	fsetfd(td->td_lwp->lwp_proc->p_fd, fp, fd);
	fsetfdflags(td->td_proc->p_fd, fd, UF_EXCLOSE);

	bzero(&kq, sizeof(kq));
	kqueue_init(&kq, td->td_lwp->lwp_proc->p_fd);
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, NULL);
	if ((error = kqueue_register(&kq, &kev)) != 0) {
		fdrop(fp);
		return (error);
	}

	error = kern_kevent(&kq, 1, res, NULL, socket_wait_copyin,
			    socket_wait_copyout, ts, 0);

	EV_SET(&kev, fd, EVFILT_READ, EV_DELETE|EV_DISABLE, 0, 0, NULL);
	kqueue_register(&kq, &kev);
	fp->f_ops = &badfileops;
	fdrop(fp);

	return (error);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 *
 * MPSAFE
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}

/*ARGSUSED*/
int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}