1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $ 40 * $DragonFly: src/sys/kern/sys_generic.c,v 1.49 2008/05/05 22:09:44 dillon Exp $ 41 */ 42 43 #include "opt_ktrace.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/sysproto.h> 48 #include <sys/filedesc.h> 49 #include <sys/filio.h> 50 #include <sys/fcntl.h> 51 #include <sys/file.h> 52 #include <sys/proc.h> 53 #include <sys/signalvar.h> 54 #include <sys/socketvar.h> 55 #include <sys/uio.h> 56 #include <sys/kernel.h> 57 #include <sys/kern_syscall.h> 58 #include <sys/malloc.h> 59 #include <sys/mapped_ioctl.h> 60 #include <sys/poll.h> 61 #include <sys/queue.h> 62 #include <sys/resourcevar.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysent.h> 65 #include <sys/buf.h> 66 #ifdef KTRACE 67 #include <sys/ktrace.h> 68 #endif 69 #include <vm/vm.h> 70 #include <vm/vm_page.h> 71 #include <sys/file2.h> 72 73 #include <machine/limits.h> 74 75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 76 static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer"); 77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 78 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 79 80 static int doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, 81 struct timeval *tv, int *res); 82 static int pollscan (struct proc *, struct pollfd *, u_int, int *); 83 static int selscan (struct proc *, fd_mask **, fd_mask **, 84 int, int *); 85 static int dofileread(int, struct file *, struct uio *, int, size_t *); 86 static int dofilewrite(int, struct file *, struct uio *, int, size_t *); 87 88 /* 89 * Read system call. 90 * 91 * MPSAFE 92 */ 93 int 94 sys_read(struct read_args *uap) 95 { 96 struct thread *td = curthread; 97 struct uio auio; 98 struct iovec aiov; 99 int error; 100 101 if ((ssize_t)uap->nbyte < 0) 102 error = EINVAL; 103 104 aiov.iov_base = uap->buf; 105 aiov.iov_len = uap->nbyte; 106 auio.uio_iov = &aiov; 107 auio.uio_iovcnt = 1; 108 auio.uio_offset = -1; 109 auio.uio_resid = uap->nbyte; 110 auio.uio_rw = UIO_READ; 111 auio.uio_segflg = UIO_USERSPACE; 112 auio.uio_td = td; 113 114 error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult); 115 return(error); 116 } 117 118 /* 119 * Positioned (Pread) read system call 120 * 121 * MPSAFE 122 */ 123 int 124 sys_extpread(struct extpread_args *uap) 125 { 126 struct thread *td = curthread; 127 struct uio auio; 128 struct iovec aiov; 129 int error; 130 int flags; 131 132 if ((ssize_t)uap->nbyte < 0) 133 return(EINVAL); 134 135 aiov.iov_base = uap->buf; 136 aiov.iov_len = uap->nbyte; 137 auio.uio_iov = &aiov; 138 auio.uio_iovcnt = 1; 139 auio.uio_offset = uap->offset; 140 auio.uio_resid = uap->nbyte; 141 auio.uio_rw = UIO_READ; 142 auio.uio_segflg = UIO_USERSPACE; 143 auio.uio_td = td; 144 145 flags = uap->flags & O_FMASK; 146 if (uap->offset != (off_t)-1) 147 flags |= O_FOFFSET; 148 149 error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult); 150 return(error); 151 } 152 153 /* 154 * Scatter read system call. 155 * 156 * MPSAFE 157 */ 158 int 159 sys_readv(struct readv_args *uap) 160 { 161 struct thread *td = curthread; 162 struct uio auio; 163 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 164 int error; 165 166 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 167 &auio.uio_resid); 168 if (error) 169 return (error); 170 auio.uio_iov = iov; 171 auio.uio_iovcnt = uap->iovcnt; 172 auio.uio_offset = -1; 173 auio.uio_rw = UIO_READ; 174 auio.uio_segflg = UIO_USERSPACE; 175 auio.uio_td = td; 176 177 error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult); 178 179 iovec_free(&iov, aiov); 180 return (error); 181 } 182 183 184 /* 185 * Scatter positioned read system call. 186 * 187 * MPSAFE 188 */ 189 int 190 sys_extpreadv(struct extpreadv_args *uap) 191 { 192 struct thread *td = curthread; 193 struct uio auio; 194 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 195 int error; 196 int flags; 197 198 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 199 &auio.uio_resid); 200 if (error) 201 return (error); 202 auio.uio_iov = iov; 203 auio.uio_iovcnt = uap->iovcnt; 204 auio.uio_offset = uap->offset; 205 auio.uio_rw = UIO_READ; 206 auio.uio_segflg = UIO_USERSPACE; 207 auio.uio_td = td; 208 209 flags = uap->flags & O_FMASK; 210 if (uap->offset != (off_t)-1) 211 flags |= O_FOFFSET; 212 213 error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult); 214 215 iovec_free(&iov, aiov); 216 return(error); 217 } 218 219 /* 220 * MPSAFE 221 */ 222 int 223 kern_preadv(int fd, struct uio *auio, int flags, size_t *res) 224 { 225 struct thread *td = curthread; 226 struct proc *p = td->td_proc; 227 struct file *fp; 228 int error; 229 230 KKASSERT(p); 231 232 fp = holdfp(p->p_fd, fd, FREAD); 233 if (fp == NULL) 234 return (EBADF); 235 if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) { 236 error = ESPIPE; 237 } else { 238 error = dofileread(fd, fp, auio, flags, res); 239 } 240 fdrop(fp); 241 return(error); 242 } 243 244 /* 245 * Common code for readv and preadv that reads data in 246 * from a file using the passed in uio, offset, and flags. 247 * 248 * MPALMOSTSAFE - ktrace needs help 249 */ 250 static int 251 dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res) 252 { 253 struct thread *td = curthread; 254 int error; 255 size_t len; 256 #ifdef KTRACE 257 struct iovec *ktriov = NULL; 258 struct uio ktruio; 259 #endif 260 261 #ifdef KTRACE 262 /* 263 * if tracing, save a copy of iovec 264 */ 265 if (KTRPOINT(td, KTR_GENIO)) { 266 int iovlen = auio->uio_iovcnt * sizeof(struct iovec); 267 268 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 269 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 270 ktruio = *auio; 271 } 272 #endif 273 len = auio->uio_resid; 274 error = fo_read(fp, auio, fp->f_cred, flags); 275 if (error) { 276 if (auio->uio_resid != len && (error == ERESTART || 277 error == EINTR || error == EWOULDBLOCK)) 278 error = 0; 279 } 280 #ifdef KTRACE 281 if (ktriov != NULL) { 282 if (error == 0) { 283 ktruio.uio_iov = ktriov; 284 ktruio.uio_resid = len - auio->uio_resid; 285 get_mplock(); 286 ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error); 287 rel_mplock(); 288 } 289 FREE(ktriov, M_TEMP); 290 } 291 #endif 292 if (error == 0) 293 *res = len - auio->uio_resid; 294 295 return(error); 296 } 297 298 /* 299 * Write system call 300 * 301 * MPSAFE 302 */ 303 int 304 sys_write(struct write_args *uap) 305 { 306 struct thread *td = curthread; 307 struct uio auio; 308 struct iovec aiov; 309 int error; 310 311 if ((ssize_t)uap->nbyte < 0) 312 error = EINVAL; 313 314 aiov.iov_base = (void *)(uintptr_t)uap->buf; 315 aiov.iov_len = uap->nbyte; 316 auio.uio_iov = &aiov; 317 auio.uio_iovcnt = 1; 318 auio.uio_offset = -1; 319 auio.uio_resid = uap->nbyte; 320 auio.uio_rw = UIO_WRITE; 321 auio.uio_segflg = UIO_USERSPACE; 322 auio.uio_td = td; 323 324 error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult); 325 326 return(error); 327 } 328 329 /* 330 * Pwrite system call 331 * 332 * MPSAFE 333 */ 334 int 335 sys_extpwrite(struct extpwrite_args *uap) 336 { 337 struct thread *td = curthread; 338 struct uio auio; 339 struct iovec aiov; 340 int error; 341 int flags; 342 343 if ((ssize_t)uap->nbyte < 0) 344 error = EINVAL; 345 346 aiov.iov_base = (void *)(uintptr_t)uap->buf; 347 aiov.iov_len = uap->nbyte; 348 auio.uio_iov = &aiov; 349 auio.uio_iovcnt = 1; 350 auio.uio_offset = uap->offset; 351 auio.uio_resid = uap->nbyte; 352 auio.uio_rw = UIO_WRITE; 353 auio.uio_segflg = UIO_USERSPACE; 354 auio.uio_td = td; 355 356 flags = uap->flags & O_FMASK; 357 if (uap->offset != (off_t)-1) 358 flags |= O_FOFFSET; 359 error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult); 360 return(error); 361 } 362 363 /* 364 * MPSAFE 365 */ 366 int 367 sys_writev(struct writev_args *uap) 368 { 369 struct thread *td = curthread; 370 struct uio auio; 371 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 372 int error; 373 374 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 375 &auio.uio_resid); 376 if (error) 377 return (error); 378 auio.uio_iov = iov; 379 auio.uio_iovcnt = uap->iovcnt; 380 auio.uio_offset = -1; 381 auio.uio_rw = UIO_WRITE; 382 auio.uio_segflg = UIO_USERSPACE; 383 auio.uio_td = td; 384 385 error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult); 386 387 iovec_free(&iov, aiov); 388 return (error); 389 } 390 391 392 /* 393 * Gather positioned write system call 394 * 395 * MPSAFE 396 */ 397 int 398 sys_extpwritev(struct extpwritev_args *uap) 399 { 400 struct thread *td = curthread; 401 struct uio auio; 402 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 403 int error; 404 int flags; 405 406 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 407 &auio.uio_resid); 408 if (error) 409 return (error); 410 auio.uio_iov = iov; 411 auio.uio_iovcnt = uap->iovcnt; 412 auio.uio_offset = uap->offset; 413 auio.uio_rw = UIO_WRITE; 414 auio.uio_segflg = UIO_USERSPACE; 415 auio.uio_td = td; 416 417 flags = uap->flags & O_FMASK; 418 if (uap->offset != (off_t)-1) 419 flags |= O_FOFFSET; 420 421 error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult); 422 423 iovec_free(&iov, aiov); 424 return(error); 425 } 426 427 /* 428 * MPSAFE 429 */ 430 int 431 kern_pwritev(int fd, struct uio *auio, int flags, size_t *res) 432 { 433 struct thread *td = curthread; 434 struct proc *p = td->td_proc; 435 struct file *fp; 436 int error; 437 438 KKASSERT(p); 439 440 fp = holdfp(p->p_fd, fd, FWRITE); 441 if (fp == NULL) 442 return (EBADF); 443 else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) { 444 error = ESPIPE; 445 } else { 446 error = dofilewrite(fd, fp, auio, flags, res); 447 } 448 449 fdrop(fp); 450 return (error); 451 } 452 453 /* 454 * Common code for writev and pwritev that writes data to 455 * a file using the passed in uio, offset, and flags. 456 * 457 * MPALMOSTSAFE - ktrace needs help 458 */ 459 static int 460 dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res) 461 { 462 struct thread *td = curthread; 463 struct lwp *lp = td->td_lwp; 464 int error; 465 size_t len; 466 #ifdef KTRACE 467 struct iovec *ktriov = NULL; 468 struct uio ktruio; 469 #endif 470 471 #ifdef KTRACE 472 /* 473 * if tracing, save a copy of iovec and uio 474 */ 475 if (KTRPOINT(td, KTR_GENIO)) { 476 int iovlen = auio->uio_iovcnt * sizeof(struct iovec); 477 478 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 479 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 480 ktruio = *auio; 481 } 482 #endif 483 len = auio->uio_resid; 484 error = fo_write(fp, auio, fp->f_cred, flags); 485 if (error) { 486 if (auio->uio_resid != len && (error == ERESTART || 487 error == EINTR || error == EWOULDBLOCK)) 488 error = 0; 489 /* Socket layer is responsible for issuing SIGPIPE. */ 490 if (error == EPIPE) { 491 get_mplock(); 492 lwpsignal(lp->lwp_proc, lp, SIGPIPE); 493 rel_mplock(); 494 } 495 } 496 #ifdef KTRACE 497 if (ktriov != NULL) { 498 if (error == 0) { 499 ktruio.uio_iov = ktriov; 500 ktruio.uio_resid = len - auio->uio_resid; 501 get_mplock(); 502 ktrgenio(lp, fd, UIO_WRITE, &ktruio, error); 503 rel_mplock(); 504 } 505 FREE(ktriov, M_TEMP); 506 } 507 #endif 508 if (error == 0) 509 *res = len - auio->uio_resid; 510 511 return(error); 512 } 513 514 /* 515 * Ioctl system call 516 */ 517 /* ARGSUSED */ 518 int 519 sys_ioctl(struct ioctl_args *uap) 520 { 521 return(mapped_ioctl(uap->fd, uap->com, uap->data, NULL, &uap->sysmsg)); 522 } 523 524 struct ioctl_map_entry { 525 const char *subsys; 526 struct ioctl_map_range *cmd_ranges; 527 LIST_ENTRY(ioctl_map_entry) entries; 528 }; 529 530 /* 531 * The true heart of all ioctl syscall handlers (native, emulation). 532 * If map != NULL, it will be searched for a matching entry for com, 533 * and appropriate conversions/conversion functions will be utilized. 534 */ 535 int 536 mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map, 537 struct sysmsg *msg) 538 { 539 struct thread *td = curthread; 540 struct proc *p = td->td_proc; 541 struct ucred *cred; 542 struct file *fp; 543 struct ioctl_map_range *iomc = NULL; 544 int error; 545 u_int size; 546 u_long ocom = com; 547 caddr_t data, memp; 548 int tmp; 549 #define STK_PARAMS 128 550 union { 551 char stkbuf[STK_PARAMS]; 552 long align; 553 } ubuf; 554 555 KKASSERT(p); 556 cred = p->p_ucred; 557 558 fp = holdfp(p->p_fd, fd, FREAD|FWRITE); 559 if (fp == NULL) 560 return(EBADF); 561 562 if (map != NULL) { /* obey translation map */ 563 u_long maskcmd; 564 struct ioctl_map_entry *e; 565 566 maskcmd = com & map->mask; 567 568 LIST_FOREACH(e, &map->mapping, entries) { 569 for (iomc = e->cmd_ranges; iomc->start != 0 || 570 iomc->maptocmd != 0 || iomc->wrapfunc != NULL || 571 iomc->mapfunc != NULL; 572 iomc++) { 573 if (maskcmd >= iomc->start && 574 maskcmd <= iomc->end) 575 break; 576 } 577 578 /* Did we find a match? */ 579 if (iomc->start != 0 || iomc->maptocmd != 0 || 580 iomc->wrapfunc != NULL || iomc->mapfunc != NULL) 581 break; 582 } 583 584 if (iomc == NULL || 585 (iomc->start == 0 && iomc->maptocmd == 0 586 && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) { 587 kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n", 588 map->sys, fd, maskcmd, 589 (int)((maskcmd >> 8) & 0xff), 590 (int)(maskcmd & 0xff)); 591 error = EINVAL; 592 goto done; 593 } 594 595 /* 596 * If it's a non-range one to one mapping, maptocmd should be 597 * correct. If it's a ranged one to one mapping, we pass the 598 * original value of com, and for a range mapped to a different 599 * range, we always need a mapping function to translate the 600 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff 601 */ 602 if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) { 603 com = iomc->maptocmd; 604 } else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) { 605 if (iomc->mapfunc != NULL) 606 com = iomc->mapfunc(iomc->start, iomc->end, 607 iomc->start, iomc->end, 608 com, com); 609 } else { 610 if (iomc->mapfunc != NULL) { 611 com = iomc->mapfunc(iomc->start, iomc->end, 612 iomc->maptocmd, iomc->maptoend, 613 com, ocom); 614 } else { 615 kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n", 616 map->sys, fd, maskcmd, 617 (int)((maskcmd >> 8) & 0xff), 618 (int)(maskcmd & 0xff)); 619 error = EINVAL; 620 goto done; 621 } 622 } 623 } 624 625 switch (com) { 626 case FIONCLEX: 627 error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE); 628 goto done; 629 case FIOCLEX: 630 error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE); 631 goto done; 632 } 633 634 /* 635 * Interpret high order word to find amount of data to be 636 * copied to/from the user's address space. 637 */ 638 size = IOCPARM_LEN(com); 639 if (size > IOCPARM_MAX) { 640 error = ENOTTY; 641 goto done; 642 } 643 644 memp = NULL; 645 if (size > sizeof (ubuf.stkbuf)) { 646 memp = kmalloc(size, M_IOCTLOPS, M_WAITOK); 647 data = memp; 648 } else { 649 data = ubuf.stkbuf; 650 } 651 if ((com & IOC_IN) != 0) { 652 if (size != 0) { 653 error = copyin(uspc_data, data, (size_t)size); 654 if (error) { 655 if (memp != NULL) 656 kfree(memp, M_IOCTLOPS); 657 goto done; 658 } 659 } else { 660 *(caddr_t *)data = uspc_data; 661 } 662 } else if ((com & IOC_OUT) != 0 && size) { 663 /* 664 * Zero the buffer so the user always 665 * gets back something deterministic. 666 */ 667 bzero(data, (size_t)size); 668 } else if ((com & IOC_VOID) != 0) { 669 *(caddr_t *)data = uspc_data; 670 } 671 672 switch (com) { 673 case FIONBIO: 674 if ((tmp = *(int *)data)) 675 fp->f_flag |= FNONBLOCK; 676 else 677 fp->f_flag &= ~FNONBLOCK; 678 error = 0; 679 break; 680 681 case FIOASYNC: 682 if ((tmp = *(int *)data)) 683 fp->f_flag |= FASYNC; 684 else 685 fp->f_flag &= ~FASYNC; 686 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg); 687 break; 688 689 default: 690 /* 691 * If there is a override function, 692 * call it instead of directly routing the call 693 */ 694 if (map != NULL && iomc->wrapfunc != NULL) 695 error = iomc->wrapfunc(fp, com, ocom, data, cred); 696 else 697 error = fo_ioctl(fp, com, data, cred, msg); 698 /* 699 * Copy any data to user, size was 700 * already set and checked above. 701 */ 702 if (error == 0 && (com & IOC_OUT) != 0 && size != 0) 703 error = copyout(data, uspc_data, (size_t)size); 704 break; 705 } 706 if (memp != NULL) 707 kfree(memp, M_IOCTLOPS); 708 done: 709 fdrop(fp); 710 return(error); 711 } 712 713 int 714 mapped_ioctl_register_handler(struct ioctl_map_handler *he) 715 { 716 struct ioctl_map_entry *ne; 717 718 KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL && 719 he->subsys != NULL && *he->subsys != '\0'); 720 721 ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK); 722 723 ne->subsys = he->subsys; 724 ne->cmd_ranges = he->cmd_ranges; 725 726 LIST_INSERT_HEAD(&he->map->mapping, ne, entries); 727 728 return(0); 729 } 730 731 int 732 mapped_ioctl_unregister_handler(struct ioctl_map_handler *he) 733 { 734 struct ioctl_map_entry *ne; 735 736 KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL); 737 738 LIST_FOREACH(ne, &he->map->mapping, entries) { 739 if (ne->cmd_ranges != he->cmd_ranges) 740 continue; 741 LIST_REMOVE(ne, entries); 742 kfree(ne, M_IOCTLMAP); 743 return(0); 744 } 745 return(EINVAL); 746 } 747 748 static int nselcoll; /* Select collisions since boot */ 749 int selwait; 750 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 751 752 /* 753 * Select system call. 754 */ 755 int 756 sys_select(struct select_args *uap) 757 { 758 struct timeval ktv; 759 struct timeval *ktvp; 760 int error; 761 762 /* 763 * Get timeout if any. 764 */ 765 if (uap->tv != NULL) { 766 error = copyin(uap->tv, &ktv, sizeof (ktv)); 767 if (error) 768 return (error); 769 error = itimerfix(&ktv); 770 if (error) 771 return (error); 772 ktvp = &ktv; 773 } else { 774 ktvp = NULL; 775 } 776 777 /* 778 * Do real work. 779 */ 780 error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp, 781 &uap->sysmsg_result); 782 783 return (error); 784 } 785 786 787 /* 788 * Pselect system call. 789 */ 790 int 791 sys_pselect(struct pselect_args *uap) 792 { 793 struct thread *td = curthread; 794 struct lwp *lp = td->td_lwp; 795 struct timespec kts; 796 struct timeval ktv; 797 struct timeval *ktvp; 798 sigset_t sigmask; 799 int error; 800 801 /* 802 * Get timeout if any and convert it. 803 * Round up during conversion to avoid timeout going off early. 804 */ 805 if (uap->ts != NULL) { 806 error = copyin(uap->ts, &kts, sizeof (kts)); 807 if (error) 808 return (error); 809 ktv.tv_sec = kts.tv_sec; 810 ktv.tv_usec = (kts.tv_nsec + 999) / 1000; 811 error = itimerfix(&ktv); 812 if (error) 813 return (error); 814 ktvp = &ktv; 815 } else { 816 ktvp = NULL; 817 } 818 819 /* 820 * Install temporary signal mask if any provided. 821 */ 822 if (uap->sigmask != NULL) { 823 error = copyin(uap->sigmask, &sigmask, sizeof(sigmask)); 824 if (error) 825 return (error); 826 lp->lwp_oldsigmask = lp->lwp_sigmask; 827 SIG_CANTMASK(sigmask); 828 lp->lwp_sigmask = sigmask; 829 } 830 831 /* 832 * Do real job. 833 */ 834 error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp, 835 &uap->sysmsg_result); 836 837 if (uap->sigmask != NULL) { 838 /* doselect() responsible for turning ERESTART into EINTR */ 839 KKASSERT(error != ERESTART); 840 if (error == EINTR) { 841 /* 842 * We can't restore the previous signal mask now 843 * because it could block the signal that interrupted 844 * us. So make a note to restore it after executing 845 * the handler. 846 */ 847 lp->lwp_flag |= LWP_OLDMASK; 848 } else { 849 /* 850 * No handler to run. Restore previous mask immediately. 851 */ 852 lp->lwp_sigmask = lp->lwp_oldsigmask; 853 } 854 } 855 856 return (error); 857 } 858 859 /* 860 * Common code for sys_select() and sys_pselect(). 861 * 862 * in, out and ex are userland pointers. tv must point to validated 863 * kernel-side timeout value or NULL for infinite timeout. res must 864 * point to syscall return value. 865 */ 866 static int 867 doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tv, 868 int *res) 869 { 870 struct lwp *lp = curthread->td_lwp; 871 struct proc *p = curproc; 872 873 /* 874 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 875 * infds with the new FD_SETSIZE of 1024, and more than enough for 876 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 877 * of 256. 878 */ 879 fd_mask s_selbits[howmany(2048, NFDBITS)]; 880 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 881 struct timeval atv, rtv, ttv; 882 int ncoll, error, timo; 883 u_int nbufbytes, ncpbytes, nfdbits; 884 885 if (nd < 0) 886 return (EINVAL); 887 if (nd > p->p_fd->fd_nfiles) 888 nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 889 890 /* 891 * Allocate just enough bits for the non-null fd_sets. Use the 892 * preallocated auto buffer if possible. 893 */ 894 nfdbits = roundup(nd, NFDBITS); 895 ncpbytes = nfdbits / NBBY; 896 nbufbytes = 0; 897 if (in != NULL) 898 nbufbytes += 2 * ncpbytes; 899 if (ou != NULL) 900 nbufbytes += 2 * ncpbytes; 901 if (ex != NULL) 902 nbufbytes += 2 * ncpbytes; 903 if (nbufbytes <= sizeof s_selbits) 904 selbits = &s_selbits[0]; 905 else 906 selbits = kmalloc(nbufbytes, M_SELECT, M_WAITOK); 907 908 /* 909 * Assign pointers into the bit buffers and fetch the input bits. 910 * Put the output buffers together so that they can be bzeroed 911 * together. 912 */ 913 sbp = selbits; 914 #define getbits(name, x) \ 915 do { \ 916 if (name == NULL) \ 917 ibits[x] = NULL; \ 918 else { \ 919 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 920 obits[x] = sbp; \ 921 sbp += ncpbytes / sizeof *sbp; \ 922 error = copyin(name, ibits[x], ncpbytes); \ 923 if (error != 0) \ 924 goto done; \ 925 } \ 926 } while (0) 927 getbits(in, 0); 928 getbits(ou, 1); 929 getbits(ex, 2); 930 #undef getbits 931 if (nbufbytes != 0) 932 bzero(selbits, nbufbytes / 2); 933 934 if (tv != NULL) { 935 atv = *tv; 936 getmicrouptime(&rtv); 937 timevaladd(&atv, &rtv); 938 } else { 939 atv.tv_sec = 0; 940 atv.tv_usec = 0; 941 } 942 timo = 0; 943 retry: 944 ncoll = nselcoll; 945 lp->lwp_flag |= LWP_SELECT; 946 error = selscan(p, ibits, obits, nd, res); 947 if (error || *res) 948 goto done; 949 if (atv.tv_sec || atv.tv_usec) { 950 getmicrouptime(&rtv); 951 if (timevalcmp(&rtv, &atv, >=)) 952 goto done; 953 ttv = atv; 954 timevalsub(&ttv, &rtv); 955 timo = ttv.tv_sec > 24 * 60 * 60 ? 956 24 * 60 * 60 * hz : tvtohz_high(&ttv); 957 } 958 crit_enter(); 959 if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) { 960 crit_exit(); 961 goto retry; 962 } 963 lp->lwp_flag &= ~LWP_SELECT; 964 965 error = tsleep((caddr_t)&selwait, PCATCH, "select", timo); 966 967 crit_exit(); 968 if (error == 0) 969 goto retry; 970 done: 971 lp->lwp_flag &= ~LWP_SELECT; 972 /* select is not restarted after signals... */ 973 if (error == ERESTART) 974 error = EINTR; 975 if (error == EWOULDBLOCK) 976 error = 0; 977 #define putbits(name, x) \ 978 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ 979 error = error2; 980 if (error == 0) { 981 int error2; 982 983 putbits(in, 0); 984 putbits(ou, 1); 985 putbits(ex, 2); 986 #undef putbits 987 } 988 if (selbits != &s_selbits[0]) 989 kfree(selbits, M_SELECT); 990 return (error); 991 } 992 993 static int 994 selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res) 995 { 996 int msk, i, fd; 997 fd_mask bits; 998 struct file *fp; 999 int n = 0; 1000 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ 1001 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; 1002 1003 for (msk = 0; msk < 3; msk++) { 1004 if (ibits[msk] == NULL) 1005 continue; 1006 for (i = 0; i < nfd; i += NFDBITS) { 1007 bits = ibits[msk][i/NFDBITS]; 1008 /* ffs(int mask) not portable, fd_mask is long */ 1009 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 1010 if (!(bits & 1)) 1011 continue; 1012 fp = holdfp(p->p_fd, fd, -1); 1013 if (fp == NULL) 1014 return (EBADF); 1015 if (fo_poll(fp, flag[msk], fp->f_cred)) { 1016 obits[msk][(fd)/NFDBITS] |= 1017 ((fd_mask)1 << ((fd) % NFDBITS)); 1018 n++; 1019 } 1020 fdrop(fp); 1021 } 1022 } 1023 } 1024 *res = n; 1025 return (0); 1026 } 1027 1028 /* 1029 * Poll system call. 1030 */ 1031 int 1032 sys_poll(struct poll_args *uap) 1033 { 1034 struct pollfd *bits; 1035 struct pollfd smallbits[32]; 1036 struct timeval atv, rtv, ttv; 1037 int ncoll, error = 0, timo; 1038 u_int nfds; 1039 size_t ni; 1040 struct lwp *lp = curthread->td_lwp; 1041 struct proc *p = curproc; 1042 1043 nfds = uap->nfds; 1044 /* 1045 * This is kinda bogus. We have fd limits, but that is not 1046 * really related to the size of the pollfd array. Make sure 1047 * we let the process use at least FD_SETSIZE entries and at 1048 * least enough for the current limits. We want to be reasonably 1049 * safe, but not overly restrictive. 1050 */ 1051 if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE) 1052 return (EINVAL); 1053 ni = nfds * sizeof(struct pollfd); 1054 if (ni > sizeof(smallbits)) 1055 bits = kmalloc(ni, M_TEMP, M_WAITOK); 1056 else 1057 bits = smallbits; 1058 error = copyin(uap->fds, bits, ni); 1059 if (error) 1060 goto done; 1061 if (uap->timeout != INFTIM) { 1062 atv.tv_sec = uap->timeout / 1000; 1063 atv.tv_usec = (uap->timeout % 1000) * 1000; 1064 if (itimerfix(&atv)) { 1065 error = EINVAL; 1066 goto done; 1067 } 1068 getmicrouptime(&rtv); 1069 timevaladd(&atv, &rtv); 1070 } else { 1071 atv.tv_sec = 0; 1072 atv.tv_usec = 0; 1073 } 1074 timo = 0; 1075 retry: 1076 ncoll = nselcoll; 1077 lp->lwp_flag |= LWP_SELECT; 1078 error = pollscan(p, bits, nfds, &uap->sysmsg_result); 1079 if (error || uap->sysmsg_result) 1080 goto done; 1081 if (atv.tv_sec || atv.tv_usec) { 1082 getmicrouptime(&rtv); 1083 if (timevalcmp(&rtv, &atv, >=)) 1084 goto done; 1085 ttv = atv; 1086 timevalsub(&ttv, &rtv); 1087 timo = ttv.tv_sec > 24 * 60 * 60 ? 1088 24 * 60 * 60 * hz : tvtohz_high(&ttv); 1089 } 1090 crit_enter(); 1091 if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) { 1092 crit_exit(); 1093 goto retry; 1094 } 1095 lp->lwp_flag &= ~LWP_SELECT; 1096 error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo); 1097 crit_exit(); 1098 if (error == 0) 1099 goto retry; 1100 done: 1101 lp->lwp_flag &= ~LWP_SELECT; 1102 /* poll is not restarted after signals... */ 1103 if (error == ERESTART) 1104 error = EINTR; 1105 if (error == EWOULDBLOCK) 1106 error = 0; 1107 if (error == 0) { 1108 error = copyout(bits, uap->fds, ni); 1109 if (error) 1110 goto out; 1111 } 1112 out: 1113 if (ni > sizeof(smallbits)) 1114 kfree(bits, M_TEMP); 1115 return (error); 1116 } 1117 1118 static int 1119 pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res) 1120 { 1121 int i; 1122 struct file *fp; 1123 int n = 0; 1124 1125 for (i = 0; i < nfd; i++, fds++) { 1126 if (fds->fd >= p->p_fd->fd_nfiles) { 1127 fds->revents = POLLNVAL; 1128 n++; 1129 } else if (fds->fd < 0) { 1130 fds->revents = 0; 1131 } else { 1132 fp = holdfp(p->p_fd, fds->fd, -1); 1133 if (fp == NULL) { 1134 fds->revents = POLLNVAL; 1135 n++; 1136 } else { 1137 /* 1138 * Note: backend also returns POLLHUP and 1139 * POLLERR if appropriate. 1140 */ 1141 fds->revents = fo_poll(fp, fds->events, 1142 fp->f_cred); 1143 if (fds->revents != 0) 1144 n++; 1145 fdrop(fp); 1146 } 1147 } 1148 } 1149 *res = n; 1150 return (0); 1151 } 1152 1153 /* 1154 * OpenBSD poll system call. 1155 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 1156 */ 1157 int 1158 sys_openbsd_poll(struct openbsd_poll_args *uap) 1159 { 1160 return (sys_poll((struct poll_args *)uap)); 1161 } 1162 1163 /*ARGSUSED*/ 1164 int 1165 seltrue(cdev_t dev, int events) 1166 { 1167 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 1168 } 1169 1170 /* 1171 * Record a select request. A global wait must be used since a process/thread 1172 * might go away after recording its request. 1173 */ 1174 void 1175 selrecord(struct thread *selector, struct selinfo *sip) 1176 { 1177 struct proc *p; 1178 struct lwp *lp = NULL; 1179 1180 if (selector->td_lwp == NULL) 1181 panic("selrecord: thread needs a process"); 1182 1183 if (sip->si_pid == selector->td_proc->p_pid && 1184 sip->si_tid == selector->td_lwp->lwp_tid) 1185 return; 1186 if (sip->si_pid && (p = pfind(sip->si_pid))) 1187 lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid); 1188 if (lp != NULL && lp->lwp_wchan == (caddr_t)&selwait) { 1189 sip->si_flags |= SI_COLL; 1190 } else { 1191 sip->si_pid = selector->td_proc->p_pid; 1192 sip->si_tid = selector->td_lwp->lwp_tid; 1193 } 1194 } 1195 1196 /* 1197 * Do a wakeup when a selectable event occurs. 1198 */ 1199 void 1200 selwakeup(struct selinfo *sip) 1201 { 1202 struct proc *p; 1203 struct lwp *lp = NULL; 1204 1205 if (sip->si_pid == 0) 1206 return; 1207 if (sip->si_flags & SI_COLL) { 1208 nselcoll++; 1209 sip->si_flags &= ~SI_COLL; 1210 wakeup((caddr_t)&selwait); /* YYY fixable */ 1211 } 1212 p = pfind(sip->si_pid); 1213 sip->si_pid = 0; 1214 if (p == NULL) 1215 return; 1216 lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid); 1217 if (lp == NULL) 1218 return; 1219 1220 crit_enter(); 1221 if (lp->lwp_wchan == (caddr_t)&selwait) { 1222 /* 1223 * Flag the process to break the tsleep when 1224 * setrunnable is called, but only call setrunnable 1225 * here if the process is not in a stopped state. 1226 */ 1227 lp->lwp_flag |= LWP_BREAKTSLEEP; 1228 if (p->p_stat != SSTOP) 1229 setrunnable(lp); 1230 } else if (lp->lwp_flag & LWP_SELECT) { 1231 lp->lwp_flag &= ~LWP_SELECT; 1232 } 1233 crit_exit(); 1234 } 1235 1236