1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $ 40 * $DragonFly: src/sys/kern/sys_generic.c,v 1.49 2008/05/05 22:09:44 dillon Exp $ 41 */ 42 43 #include "opt_ktrace.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/sysproto.h> 48 #include <sys/filedesc.h> 49 #include <sys/filio.h> 50 #include <sys/fcntl.h> 51 #include <sys/file.h> 52 #include <sys/proc.h> 53 #include <sys/signalvar.h> 54 #include <sys/socketvar.h> 55 #include <sys/uio.h> 56 #include <sys/kernel.h> 57 #include <sys/kern_syscall.h> 58 #include <sys/malloc.h> 59 #include <sys/mapped_ioctl.h> 60 #include <sys/poll.h> 61 #include <sys/queue.h> 62 #include <sys/resourcevar.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysent.h> 65 #include <sys/buf.h> 66 #ifdef KTRACE 67 #include <sys/ktrace.h> 68 #endif 69 #include <vm/vm.h> 70 #include <vm/vm_page.h> 71 72 #include <sys/file2.h> 73 #include <sys/mplock2.h> 74 75 #include <machine/limits.h> 76 77 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 78 static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer"); 79 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 80 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 81 82 static int doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, 83 struct timeval *tv, int *res); 84 static int pollscan (struct proc *, struct pollfd *, 
u_int, int *); 85 static int selscan (struct proc *, fd_mask **, fd_mask **, 86 int, int *); 87 static int dofileread(int, struct file *, struct uio *, int, size_t *); 88 static int dofilewrite(int, struct file *, struct uio *, int, size_t *); 89 90 /* 91 * Read system call. 92 * 93 * MPSAFE 94 */ 95 int 96 sys_read(struct read_args *uap) 97 { 98 struct thread *td = curthread; 99 struct uio auio; 100 struct iovec aiov; 101 int error; 102 103 if ((ssize_t)uap->nbyte < 0) 104 error = EINVAL; 105 106 aiov.iov_base = uap->buf; 107 aiov.iov_len = uap->nbyte; 108 auio.uio_iov = &aiov; 109 auio.uio_iovcnt = 1; 110 auio.uio_offset = -1; 111 auio.uio_resid = uap->nbyte; 112 auio.uio_rw = UIO_READ; 113 auio.uio_segflg = UIO_USERSPACE; 114 auio.uio_td = td; 115 116 error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult); 117 return(error); 118 } 119 120 /* 121 * Positioned (Pread) read system call 122 * 123 * MPSAFE 124 */ 125 int 126 sys_extpread(struct extpread_args *uap) 127 { 128 struct thread *td = curthread; 129 struct uio auio; 130 struct iovec aiov; 131 int error; 132 int flags; 133 134 if ((ssize_t)uap->nbyte < 0) 135 return(EINVAL); 136 137 aiov.iov_base = uap->buf; 138 aiov.iov_len = uap->nbyte; 139 auio.uio_iov = &aiov; 140 auio.uio_iovcnt = 1; 141 auio.uio_offset = uap->offset; 142 auio.uio_resid = uap->nbyte; 143 auio.uio_rw = UIO_READ; 144 auio.uio_segflg = UIO_USERSPACE; 145 auio.uio_td = td; 146 147 flags = uap->flags & O_FMASK; 148 if (uap->offset != (off_t)-1) 149 flags |= O_FOFFSET; 150 151 error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult); 152 return(error); 153 } 154 155 /* 156 * Scatter read system call. 
157 * 158 * MPSAFE 159 */ 160 int 161 sys_readv(struct readv_args *uap) 162 { 163 struct thread *td = curthread; 164 struct uio auio; 165 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 166 int error; 167 168 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 169 &auio.uio_resid); 170 if (error) 171 return (error); 172 auio.uio_iov = iov; 173 auio.uio_iovcnt = uap->iovcnt; 174 auio.uio_offset = -1; 175 auio.uio_rw = UIO_READ; 176 auio.uio_segflg = UIO_USERSPACE; 177 auio.uio_td = td; 178 179 error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult); 180 181 iovec_free(&iov, aiov); 182 return (error); 183 } 184 185 186 /* 187 * Scatter positioned read system call. 188 * 189 * MPSAFE 190 */ 191 int 192 sys_extpreadv(struct extpreadv_args *uap) 193 { 194 struct thread *td = curthread; 195 struct uio auio; 196 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 197 int error; 198 int flags; 199 200 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 201 &auio.uio_resid); 202 if (error) 203 return (error); 204 auio.uio_iov = iov; 205 auio.uio_iovcnt = uap->iovcnt; 206 auio.uio_offset = uap->offset; 207 auio.uio_rw = UIO_READ; 208 auio.uio_segflg = UIO_USERSPACE; 209 auio.uio_td = td; 210 211 flags = uap->flags & O_FMASK; 212 if (uap->offset != (off_t)-1) 213 flags |= O_FOFFSET; 214 215 error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult); 216 217 iovec_free(&iov, aiov); 218 return(error); 219 } 220 221 /* 222 * MPSAFE 223 */ 224 int 225 kern_preadv(int fd, struct uio *auio, int flags, size_t *res) 226 { 227 struct thread *td = curthread; 228 struct proc *p = td->td_proc; 229 struct file *fp; 230 int error; 231 232 KKASSERT(p); 233 234 fp = holdfp(p->p_fd, fd, FREAD); 235 if (fp == NULL) 236 return (EBADF); 237 if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) { 238 error = ESPIPE; 239 } else { 240 error = dofileread(fd, fp, auio, flags, res); 241 } 242 fdrop(fp); 243 return(error); 244 } 245 246 /* 247 * Common code for readv and preadv that reads 
data in 248 * from a file using the passed in uio, offset, and flags. 249 * 250 * MPALMOSTSAFE - ktrace needs help 251 */ 252 static int 253 dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res) 254 { 255 struct thread *td = curthread; 256 int error; 257 size_t len; 258 #ifdef KTRACE 259 struct iovec *ktriov = NULL; 260 struct uio ktruio; 261 #endif 262 263 #ifdef KTRACE 264 /* 265 * if tracing, save a copy of iovec 266 */ 267 if (KTRPOINT(td, KTR_GENIO)) { 268 int iovlen = auio->uio_iovcnt * sizeof(struct iovec); 269 270 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 271 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 272 ktruio = *auio; 273 } 274 #endif 275 len = auio->uio_resid; 276 error = fo_read(fp, auio, fp->f_cred, flags); 277 if (error) { 278 if (auio->uio_resid != len && (error == ERESTART || 279 error == EINTR || error == EWOULDBLOCK)) 280 error = 0; 281 } 282 #ifdef KTRACE 283 if (ktriov != NULL) { 284 if (error == 0) { 285 ktruio.uio_iov = ktriov; 286 ktruio.uio_resid = len - auio->uio_resid; 287 get_mplock(); 288 ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error); 289 rel_mplock(); 290 } 291 FREE(ktriov, M_TEMP); 292 } 293 #endif 294 if (error == 0) 295 *res = len - auio->uio_resid; 296 297 return(error); 298 } 299 300 /* 301 * Write system call 302 * 303 * MPSAFE 304 */ 305 int 306 sys_write(struct write_args *uap) 307 { 308 struct thread *td = curthread; 309 struct uio auio; 310 struct iovec aiov; 311 int error; 312 313 if ((ssize_t)uap->nbyte < 0) 314 error = EINVAL; 315 316 aiov.iov_base = (void *)(uintptr_t)uap->buf; 317 aiov.iov_len = uap->nbyte; 318 auio.uio_iov = &aiov; 319 auio.uio_iovcnt = 1; 320 auio.uio_offset = -1; 321 auio.uio_resid = uap->nbyte; 322 auio.uio_rw = UIO_WRITE; 323 auio.uio_segflg = UIO_USERSPACE; 324 auio.uio_td = td; 325 326 error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult); 327 328 return(error); 329 } 330 331 /* 332 * Pwrite system call 333 * 334 * MPSAFE 335 */ 
336 int 337 sys_extpwrite(struct extpwrite_args *uap) 338 { 339 struct thread *td = curthread; 340 struct uio auio; 341 struct iovec aiov; 342 int error; 343 int flags; 344 345 if ((ssize_t)uap->nbyte < 0) 346 error = EINVAL; 347 348 aiov.iov_base = (void *)(uintptr_t)uap->buf; 349 aiov.iov_len = uap->nbyte; 350 auio.uio_iov = &aiov; 351 auio.uio_iovcnt = 1; 352 auio.uio_offset = uap->offset; 353 auio.uio_resid = uap->nbyte; 354 auio.uio_rw = UIO_WRITE; 355 auio.uio_segflg = UIO_USERSPACE; 356 auio.uio_td = td; 357 358 flags = uap->flags & O_FMASK; 359 if (uap->offset != (off_t)-1) 360 flags |= O_FOFFSET; 361 error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult); 362 return(error); 363 } 364 365 /* 366 * MPSAFE 367 */ 368 int 369 sys_writev(struct writev_args *uap) 370 { 371 struct thread *td = curthread; 372 struct uio auio; 373 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 374 int error; 375 376 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 377 &auio.uio_resid); 378 if (error) 379 return (error); 380 auio.uio_iov = iov; 381 auio.uio_iovcnt = uap->iovcnt; 382 auio.uio_offset = -1; 383 auio.uio_rw = UIO_WRITE; 384 auio.uio_segflg = UIO_USERSPACE; 385 auio.uio_td = td; 386 387 error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult); 388 389 iovec_free(&iov, aiov); 390 return (error); 391 } 392 393 394 /* 395 * Gather positioned write system call 396 * 397 * MPSAFE 398 */ 399 int 400 sys_extpwritev(struct extpwritev_args *uap) 401 { 402 struct thread *td = curthread; 403 struct uio auio; 404 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 405 int error; 406 int flags; 407 408 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 409 &auio.uio_resid); 410 if (error) 411 return (error); 412 auio.uio_iov = iov; 413 auio.uio_iovcnt = uap->iovcnt; 414 auio.uio_offset = uap->offset; 415 auio.uio_rw = UIO_WRITE; 416 auio.uio_segflg = UIO_USERSPACE; 417 auio.uio_td = td; 418 419 flags = uap->flags & O_FMASK; 420 if (uap->offset != (off_t)-1) 
421 flags |= O_FOFFSET; 422 423 error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult); 424 425 iovec_free(&iov, aiov); 426 return(error); 427 } 428 429 /* 430 * MPSAFE 431 */ 432 int 433 kern_pwritev(int fd, struct uio *auio, int flags, size_t *res) 434 { 435 struct thread *td = curthread; 436 struct proc *p = td->td_proc; 437 struct file *fp; 438 int error; 439 440 KKASSERT(p); 441 442 fp = holdfp(p->p_fd, fd, FWRITE); 443 if (fp == NULL) 444 return (EBADF); 445 else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) { 446 error = ESPIPE; 447 } else { 448 error = dofilewrite(fd, fp, auio, flags, res); 449 } 450 451 fdrop(fp); 452 return (error); 453 } 454 455 /* 456 * Common code for writev and pwritev that writes data to 457 * a file using the passed in uio, offset, and flags. 458 * 459 * MPALMOSTSAFE - ktrace needs help 460 */ 461 static int 462 dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res) 463 { 464 struct thread *td = curthread; 465 struct lwp *lp = td->td_lwp; 466 int error; 467 size_t len; 468 #ifdef KTRACE 469 struct iovec *ktriov = NULL; 470 struct uio ktruio; 471 #endif 472 473 #ifdef KTRACE 474 /* 475 * if tracing, save a copy of iovec and uio 476 */ 477 if (KTRPOINT(td, KTR_GENIO)) { 478 int iovlen = auio->uio_iovcnt * sizeof(struct iovec); 479 480 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 481 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 482 ktruio = *auio; 483 } 484 #endif 485 len = auio->uio_resid; 486 error = fo_write(fp, auio, fp->f_cred, flags); 487 if (error) { 488 if (auio->uio_resid != len && (error == ERESTART || 489 error == EINTR || error == EWOULDBLOCK)) 490 error = 0; 491 /* Socket layer is responsible for issuing SIGPIPE. 
*/ 492 if (error == EPIPE) { 493 get_mplock(); 494 lwpsignal(lp->lwp_proc, lp, SIGPIPE); 495 rel_mplock(); 496 } 497 } 498 #ifdef KTRACE 499 if (ktriov != NULL) { 500 if (error == 0) { 501 ktruio.uio_iov = ktriov; 502 ktruio.uio_resid = len - auio->uio_resid; 503 get_mplock(); 504 ktrgenio(lp, fd, UIO_WRITE, &ktruio, error); 505 rel_mplock(); 506 } 507 FREE(ktriov, M_TEMP); 508 } 509 #endif 510 if (error == 0) 511 *res = len - auio->uio_resid; 512 513 return(error); 514 } 515 516 /* 517 * Ioctl system call 518 * 519 * MPALMOSTSAFE 520 */ 521 int 522 sys_ioctl(struct ioctl_args *uap) 523 { 524 int error; 525 526 get_mplock(); 527 error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, &uap->sysmsg); 528 rel_mplock(); 529 return (error); 530 } 531 532 struct ioctl_map_entry { 533 const char *subsys; 534 struct ioctl_map_range *cmd_ranges; 535 LIST_ENTRY(ioctl_map_entry) entries; 536 }; 537 538 /* 539 * The true heart of all ioctl syscall handlers (native, emulation). 540 * If map != NULL, it will be searched for a matching entry for com, 541 * and appropriate conversions/conversion functions will be utilized. 
*
 * fd         descriptor the ioctl is issued on; it must be holdable
 *            for FREAD|FWRITE or EBADF is returned.
 * com        ioctl command as supplied by the caller.
 * uspc_data  user-space argument pointer.
 * map        optional translation map, NULL for native ioctls.
 * msg        passed through unchanged to fo_ioctl().
 *
 * Returns EINVAL when a map is given but no usable translation exists,
 * ENOTTY when the parameter length encoded in the command exceeds
 * IOCPARM_MAX, and otherwise the result of the descriptor-flag helpers,
 * copyin()/copyout(), the wrapper function or fo_ioctl().
 */
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map,
	     struct sysmsg *msg)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct ucred *cred;
	struct file *fp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;	/* untranslated command, kept for the hooks */
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Small arguments are staged on the stack; 'align' forces alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	KKASSERT(p);
	cred = td->td_ucred;

	fp = holdfp(p->p_fd, fd, FREAD|FWRITE);
	if (fp == NULL)
		return(EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		/*
		 * Walk every registered handler.  Each cmd_ranges array is
		 * terminated by an entry whose start, maptocmd, wrapfunc and
		 * mapfunc are all zero/NULL; the inner loop stops either on
		 * that terminator or on the first range containing maskcmd.
		 */
		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}

		/*
		 * iomc is still NULL when the handler list was empty and
		 * points at a terminator when no range matched.
		 */
		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0
		     && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
			       map->sys, fd, maskcmd,
			       (int)((maskcmd >> 8) & 0xff),
			       (int)(maskcmd & 0xff));
			error = EINVAL;
			goto done;
		}

		/*
		 * If it's a non-range one to one mapping, maptocmd should be
		 * correct. If it's a ranged one to one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, com);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
				       map->sys, fd, maskcmd,
				       (int)((maskcmd >> 8) & 0xff),
				       (int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}

	/*
	 * The close-on-exec commands act on the descriptor table entry only
	 * and are handled before any argument data is touched.
	 */
	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	/* Use the stack buffer when the argument fits, else allocate. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if ((com & IOC_IN) != 0) {
		if (size != 0) {
			error = copyin(uspc_data, data, (size_t)size);
			if (error) {
				/* 'done' does not free memp, so do it here. */
				if (memp != NULL)
					kfree(memp, M_IOCTLOPS);
				goto done;
			}
		} else {
			/* Zero-length input: pass the user pointer itself. */
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, (size_t)size);
	} else if ((com & IOC_VOID) != 0) {
		*(caddr_t *)data = uspc_data;
	}

	switch (com) {
	case FIONBIO:
		/* Handled entirely here by toggling FNONBLOCK on the file. */
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = 0;
		break;

	case FIOASYNC:
		/* Toggle FASYNC, then also notify the backend. */
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg);
		break;

	default:
		/*
		 *  If there is a override function,
		 *  call it instead of directly routing the call
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred, msg);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (size_t)size);
		break;
	}
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
done:
	fdrop(fp);
	return(error);
}

/*
 * Register a translation handler: allocate a list entry that records
 * he->subsys and he->cmd_ranges and insert it at the head of
 * he->map->mapping.  Always returns 0.
 */
int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);

	return(0);
}

/*
 * Unregister a translation handler: remove and free the first entry on
 * he->map->mapping whose cmd_ranges pointer equals he->cmd_ranges.
 * Returns 0 when an entry was removed and EINVAL when none matched.
 */
int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges != he->cmd_ranges)
			continue;
		/* Returning right after the removal keeps the walk safe. */
		LIST_REMOVE(ne, entries);
		kfree(ne, M_IOCTLMAP);
		return(0);
	}
	return(EINVAL);
}

static int	nselcoll;	/* Select collisions since
boot */ 757 int selwait; 758 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 759 760 /* 761 * Select system call. 762 * 763 * MPALMOSTSAFE 764 */ 765 int 766 sys_select(struct select_args *uap) 767 { 768 struct timeval ktv; 769 struct timeval *ktvp; 770 int error; 771 772 /* 773 * Get timeout if any. 774 */ 775 if (uap->tv != NULL) { 776 error = copyin(uap->tv, &ktv, sizeof (ktv)); 777 if (error) 778 return (error); 779 error = itimerfix(&ktv); 780 if (error) 781 return (error); 782 ktvp = &ktv; 783 } else { 784 ktvp = NULL; 785 } 786 787 /* 788 * Do real work. 789 */ 790 get_mplock(); 791 error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp, 792 &uap->sysmsg_result); 793 rel_mplock(); 794 795 return (error); 796 } 797 798 799 /* 800 * Pselect system call. 801 * 802 * MPALMOSTSAFE 803 */ 804 int 805 sys_pselect(struct pselect_args *uap) 806 { 807 struct thread *td = curthread; 808 struct lwp *lp = td->td_lwp; 809 struct timespec kts; 810 struct timeval ktv; 811 struct timeval *ktvp; 812 sigset_t sigmask; 813 int error; 814 815 /* 816 * Get timeout if any and convert it. 817 * Round up during conversion to avoid timeout going off early. 818 */ 819 if (uap->ts != NULL) { 820 error = copyin(uap->ts, &kts, sizeof (kts)); 821 if (error) 822 return (error); 823 ktv.tv_sec = kts.tv_sec; 824 ktv.tv_usec = (kts.tv_nsec + 999) / 1000; 825 error = itimerfix(&ktv); 826 if (error) 827 return (error); 828 ktvp = &ktv; 829 } else { 830 ktvp = NULL; 831 } 832 833 /* 834 * Install temporary signal mask if any provided. 835 */ 836 if (uap->sigmask != NULL) { 837 error = copyin(uap->sigmask, &sigmask, sizeof(sigmask)); 838 if (error) 839 return (error); 840 get_mplock(); 841 lp->lwp_oldsigmask = lp->lwp_sigmask; 842 SIG_CANTMASK(sigmask); 843 lp->lwp_sigmask = sigmask; 844 } else { 845 get_mplock(); 846 } 847 848 /* 849 * Do real job. 
850 */ 851 error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp, 852 &uap->sysmsg_result); 853 854 if (uap->sigmask != NULL) { 855 /* doselect() responsible for turning ERESTART into EINTR */ 856 KKASSERT(error != ERESTART); 857 if (error == EINTR) { 858 /* 859 * We can't restore the previous signal mask now 860 * because it could block the signal that interrupted 861 * us. So make a note to restore it after executing 862 * the handler. 863 */ 864 lp->lwp_flag |= LWP_OLDMASK; 865 } else { 866 /* 867 * No handler to run. Restore previous mask immediately. 868 */ 869 lp->lwp_sigmask = lp->lwp_oldsigmask; 870 } 871 } 872 rel_mplock(); 873 874 return (error); 875 } 876 877 /* 878 * Common code for sys_select() and sys_pselect(). 879 * 880 * in, out and ex are userland pointers. tv must point to validated 881 * kernel-side timeout value or NULL for infinite timeout. res must 882 * point to syscall return value. 883 */ 884 static int 885 doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tv, 886 int *res) 887 { 888 struct lwp *lp = curthread->td_lwp; 889 struct proc *p = curproc; 890 891 /* 892 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 893 * infds with the new FD_SETSIZE of 1024, and more than enough for 894 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 895 * of 256. 896 */ 897 fd_mask s_selbits[howmany(2048, NFDBITS)]; 898 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 899 struct timeval atv, rtv, ttv; 900 int ncoll, error, timo; 901 u_int nbufbytes, ncpbytes, nfdbits; 902 903 if (nd < 0) 904 return (EINVAL); 905 if (nd > p->p_fd->fd_nfiles) 906 nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 907 908 /* 909 * Allocate just enough bits for the non-null fd_sets. Use the 910 * preallocated auto buffer if possible. 
911 */ 912 nfdbits = roundup(nd, NFDBITS); 913 ncpbytes = nfdbits / NBBY; 914 nbufbytes = 0; 915 if (in != NULL) 916 nbufbytes += 2 * ncpbytes; 917 if (ou != NULL) 918 nbufbytes += 2 * ncpbytes; 919 if (ex != NULL) 920 nbufbytes += 2 * ncpbytes; 921 if (nbufbytes <= sizeof s_selbits) 922 selbits = &s_selbits[0]; 923 else 924 selbits = kmalloc(nbufbytes, M_SELECT, M_WAITOK); 925 926 /* 927 * Assign pointers into the bit buffers and fetch the input bits. 928 * Put the output buffers together so that they can be bzeroed 929 * together. 930 */ 931 sbp = selbits; 932 #define getbits(name, x) \ 933 do { \ 934 if (name == NULL) \ 935 ibits[x] = NULL; \ 936 else { \ 937 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 938 obits[x] = sbp; \ 939 sbp += ncpbytes / sizeof *sbp; \ 940 error = copyin(name, ibits[x], ncpbytes); \ 941 if (error != 0) \ 942 goto done; \ 943 } \ 944 } while (0) 945 getbits(in, 0); 946 getbits(ou, 1); 947 getbits(ex, 2); 948 #undef getbits 949 if (nbufbytes != 0) 950 bzero(selbits, nbufbytes / 2); 951 952 if (tv != NULL) { 953 atv = *tv; 954 getmicrouptime(&rtv); 955 timevaladd(&atv, &rtv); 956 } else { 957 atv.tv_sec = 0; 958 atv.tv_usec = 0; 959 } 960 timo = 0; 961 retry: 962 ncoll = nselcoll; 963 lp->lwp_flag |= LWP_SELECT; 964 error = selscan(p, ibits, obits, nd, res); 965 if (error || *res) 966 goto done; 967 if (atv.tv_sec || atv.tv_usec) { 968 getmicrouptime(&rtv); 969 if (timevalcmp(&rtv, &atv, >=)) 970 goto done; 971 ttv = atv; 972 timevalsub(&ttv, &rtv); 973 timo = ttv.tv_sec > 24 * 60 * 60 ? 
974 24 * 60 * 60 * hz : tvtohz_high(&ttv); 975 } 976 crit_enter(); 977 tsleep_interlock(&selwait, PCATCH); 978 if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) { 979 crit_exit(); 980 goto retry; 981 } 982 lp->lwp_flag &= ~LWP_SELECT; 983 error = tsleep(&selwait, PCATCH | PINTERLOCKED, "select", timo); 984 crit_exit(); 985 986 if (error == 0) 987 goto retry; 988 done: 989 lp->lwp_flag &= ~LWP_SELECT; 990 /* select is not restarted after signals... */ 991 if (error == ERESTART) 992 error = EINTR; 993 if (error == EWOULDBLOCK) 994 error = 0; 995 #define putbits(name, x) \ 996 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ 997 error = error2; 998 if (error == 0) { 999 int error2; 1000 1001 putbits(in, 0); 1002 putbits(ou, 1); 1003 putbits(ex, 2); 1004 #undef putbits 1005 } 1006 if (selbits != &s_selbits[0]) 1007 kfree(selbits, M_SELECT); 1008 return (error); 1009 } 1010 1011 static int 1012 selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res) 1013 { 1014 int msk, i, fd; 1015 fd_mask bits; 1016 struct file *fp; 1017 int n = 0; 1018 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ 1019 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; 1020 1021 for (msk = 0; msk < 3; msk++) { 1022 if (ibits[msk] == NULL) 1023 continue; 1024 for (i = 0; i < nfd; i += NFDBITS) { 1025 bits = ibits[msk][i/NFDBITS]; 1026 /* ffs(int mask) not portable, fd_mask is long */ 1027 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 1028 if (!(bits & 1)) 1029 continue; 1030 fp = holdfp(p->p_fd, fd, -1); 1031 if (fp == NULL) 1032 return (EBADF); 1033 if (fo_poll(fp, flag[msk], fp->f_cred)) { 1034 obits[msk][(fd)/NFDBITS] |= 1035 ((fd_mask)1 << ((fd) % NFDBITS)); 1036 n++; 1037 } 1038 fdrop(fp); 1039 } 1040 } 1041 } 1042 *res = n; 1043 return (0); 1044 } 1045 1046 /* 1047 * Poll system call. 
*
 * Copies in uap->nfds pollfd entries from uap->fds, repeatedly scans
 * them with pollscan() until an event is found, an error occurs or the
 * timeout (uap->timeout in milliseconds, INFTIM for none) expires, and
 * copies the array back out on success.  The number of entries with
 * events is stored in uap->sysmsg_result by pollscan().
 *
 * Returns EINVAL when nfds exceeds both the RLIMIT_NOFILE soft limit
 * and FD_SETSIZE or when itimerfix() rejects the timeout, a
 * copyin()/copyout() error for a bad array pointer, and EINTR when the
 * sleep is interrupted.
 *
 * MPALMOSTSAFE
 */
int
sys_poll(struct poll_args *uap)
{
	struct pollfd *bits;
	struct pollfd smallbits[32];	/* avoids kmalloc for small arrays */
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = curproc;

	nfds = uap->nfds;
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmalloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done2;
	/*
	 * atv becomes the absolute uptime at which the wait expires;
	 * a zero atv means "no timeout".
	 */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done2;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
	get_mplock();
retry:
	/*
	 * Snapshot the collision counter and mark ourselves as scanning.
	 * If LWP_SELECT has been cleared or nselcoll has changed by the
	 * time we are about to sleep, the scan is repeated instead.
	 */
	ncoll = nselcoll;
	lp->lwp_flag |= LWP_SELECT;
	error = pollscan(p, bits, nfds, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done1;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done1;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Sleep for at most 24 hours worth of ticks at a time. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	crit_enter();
	tsleep_interlock(&selwait, PCATCH);
	if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	lp->lwp_flag &= ~LWP_SELECT;
	error = tsleep(&selwait, PCATCH | PINTERLOCKED, "poll", timo);
	crit_exit();

	if (error == 0)
		goto retry;
done1:
	rel_mplock();
done2:
	/* done2 is also reached before the MP lock was ever taken. */
	lp->lwp_flag &= ~LWP_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	/* Only the kmalloc'd array needs freeing, not smallbits. */
	if (ni > sizeof(smallbits))
		kfree(bits, M_TEMP);
	return (error);
}

/*
 * Fill in revents for each of the nfd entries in fds.  A descriptor
 * number at or beyond the process's fd_nfiles, or one that cannot be
 * held, is reported as POLLNVAL; a negative descriptor gets revents 0;
 * everything else gets the result of fo_poll().  *res receives the
 * number of entries with non-zero revents.  Always returns 0.
 */
static int
pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res)
{
	int i;
	struct file *fp;
	int n = 0;

	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= p->p_fd->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			/* Negative descriptors are ignored, not counted. */
			fds->revents = 0;
		} else {
			fp = holdfp(p->p_fd, fds->fd, -1);
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
							fp->f_cred);
				if (fds->revents != 0)
					n++;
				fdrop(fp);
			}
		}
	}
	*res = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
*
 * Implemented by passing the argument structure, cast to
 * struct poll_args, straight to sys_poll().
 *
 * MPSAFE
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}

/*
 * Poll helper that reports every requested normal read/write event as
 * ready: returns the subset of 'events' made up of POLLIN, POLLOUT,
 * POLLRDNORM and POLLWRNORM.  'dev' is not used.
 */
/*ARGSUSED*/
int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  A global wait must be used since a process/thread
 * might go away after recording its request.
 *
 * sip remembers a single selector as a (pid, tid) pair.  If another lwp
 * is already recorded and is currently sleeping on selwait, the selinfo
 * is flagged SI_COLL instead of being overwritten.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (selector->td_lwp == NULL)
		panic("selrecord: thread needs a process");

	/* Already recorded for this very lwp: nothing to do. */
	if (sip->si_pid == selector->td_proc->p_pid &&
	    sip->si_tid == selector->td_lwp->lwp_tid)
		return;
	/* Look up the previously recorded lwp, if it still exists. */
	if (sip->si_pid && (p = pfind(sip->si_pid)))
		lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp != NULL && lp->lwp_wchan == (caddr_t)&selwait) {
		sip->si_flags |= SI_COLL;
	} else {
		sip->si_pid = selector->td_proc->p_pid;
		sip->si_tid = selector->td_lwp->lwp_tid;
	}
}

/*
 * Do a wakeup when a selectable event occurs.
 *
 * If a collision was recorded, nselcoll is bumped and every sleeper on
 * selwait is woken.  The recorded (pid, tid) is then consumed (si_pid
 * is reset to 0) and, if that lwp still exists, it is either told to
 * rescan (LWP_SELECT cleared) or made runnable.
 */
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	/* Nobody recorded: nothing to wake. */
	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);	/* YYY fixable */
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p == NULL)
		return;
	lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp == NULL)
		return;

	/*
	 * This is a temporary hack until the code can be rewritten.
	 * Check LWP_SELECT before assuming we can setrunnable().
	 * Otherwise we might catch the lwp before it actually goes to
	 * sleep.
	 */
	crit_enter();
	if (lp->lwp_flag & LWP_SELECT) {
		/* Still scanning: clearing the flag makes it rescan. */
		lp->lwp_flag &= ~LWP_SELECT;
	} else if (lp->lwp_wchan == (caddr_t)&selwait) {
		/*
		 * Flag the process to break the tsleep when
		 * setrunnable is called, but only call setrunnable
		 * here if the process is not in a stopped state.
		 */
		lp->lwp_flag |= LWP_BREAKTSLEEP;
		if (p->p_stat != SSTOP)
			setrunnable(lp);
	}
	crit_exit();
}