/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $ 40 * $DragonFly: src/sys/kern/sys_generic.c,v 1.49 2008/05/05 22:09:44 dillon Exp $ 41 */ 42 43 #include "opt_ktrace.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/sysproto.h> 48 #include <sys/filedesc.h> 49 #include <sys/filio.h> 50 #include <sys/fcntl.h> 51 #include <sys/file.h> 52 #include <sys/proc.h> 53 #include <sys/signalvar.h> 54 #include <sys/socketvar.h> 55 #include <sys/uio.h> 56 #include <sys/kernel.h> 57 #include <sys/kern_syscall.h> 58 #include <sys/malloc.h> 59 #include <sys/mapped_ioctl.h> 60 #include <sys/poll.h> 61 #include <sys/queue.h> 62 #include <sys/resourcevar.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysent.h> 65 #include <sys/buf.h> 66 #ifdef KTRACE 67 #include <sys/ktrace.h> 68 #endif 69 #include <vm/vm.h> 70 #include <vm/vm_page.h> 71 #include <sys/file2.h> 72 73 #include <machine/limits.h> 74 75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 76 static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer"); 77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 78 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 79 80 static int doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, 81 struct timeval *tv, int *res); 82 static int pollscan (struct proc *, struct pollfd *, u_int, int *); 83 static int 
selscan (struct proc *, fd_mask **, fd_mask **, 84 int, int *); 85 static int dofileread(int, struct file *, struct uio *, int, int *); 86 static int dofilewrite(int, struct file *, struct uio *, int, int *); 87 88 /* 89 * Read system call. 90 * 91 * MPSAFE 92 */ 93 int 94 sys_read(struct read_args *uap) 95 { 96 struct thread *td = curthread; 97 struct uio auio; 98 struct iovec aiov; 99 int error; 100 101 aiov.iov_base = uap->buf; 102 aiov.iov_len = uap->nbyte; 103 auio.uio_iov = &aiov; 104 auio.uio_iovcnt = 1; 105 auio.uio_offset = -1; 106 auio.uio_resid = uap->nbyte; 107 auio.uio_rw = UIO_READ; 108 auio.uio_segflg = UIO_USERSPACE; 109 auio.uio_td = td; 110 111 if (auio.uio_resid < 0) 112 error = EINVAL; 113 else 114 error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_result); 115 return(error); 116 } 117 118 /* 119 * Positioned (Pread) read system call 120 * 121 * MPSAFE 122 */ 123 int 124 sys_extpread(struct extpread_args *uap) 125 { 126 struct thread *td = curthread; 127 struct uio auio; 128 struct iovec aiov; 129 int error; 130 int flags; 131 132 aiov.iov_base = uap->buf; 133 aiov.iov_len = uap->nbyte; 134 auio.uio_iov = &aiov; 135 auio.uio_iovcnt = 1; 136 auio.uio_offset = uap->offset; 137 auio.uio_resid = uap->nbyte; 138 auio.uio_rw = UIO_READ; 139 auio.uio_segflg = UIO_USERSPACE; 140 auio.uio_td = td; 141 142 flags = uap->flags & O_FMASK; 143 if (uap->offset != (off_t)-1) 144 flags |= O_FOFFSET; 145 146 if (auio.uio_resid < 0) 147 error = EINVAL; 148 else 149 error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_result); 150 return(error); 151 } 152 153 /* 154 * Scatter read system call. 
155 * 156 * MPSAFE 157 */ 158 int 159 sys_readv(struct readv_args *uap) 160 { 161 struct thread *td = curthread; 162 struct uio auio; 163 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 164 int error; 165 166 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 167 &auio.uio_resid); 168 if (error) 169 return (error); 170 auio.uio_iov = iov; 171 auio.uio_iovcnt = uap->iovcnt; 172 auio.uio_offset = -1; 173 auio.uio_rw = UIO_READ; 174 auio.uio_segflg = UIO_USERSPACE; 175 auio.uio_td = td; 176 177 error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_result); 178 179 iovec_free(&iov, aiov); 180 return (error); 181 } 182 183 184 /* 185 * Scatter positioned read system call. 186 * 187 * MPSAFE 188 */ 189 int 190 sys_extpreadv(struct extpreadv_args *uap) 191 { 192 struct thread *td = curthread; 193 struct uio auio; 194 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 195 int error; 196 int flags; 197 198 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 199 &auio.uio_resid); 200 if (error) 201 return (error); 202 auio.uio_iov = iov; 203 auio.uio_iovcnt = uap->iovcnt; 204 auio.uio_offset = uap->offset; 205 auio.uio_rw = UIO_READ; 206 auio.uio_segflg = UIO_USERSPACE; 207 auio.uio_td = td; 208 209 flags = uap->flags & O_FMASK; 210 if (uap->offset != (off_t)-1) 211 flags |= O_FOFFSET; 212 213 error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_result); 214 215 iovec_free(&iov, aiov); 216 return(error); 217 } 218 219 /* 220 * MPSAFE 221 */ 222 int 223 kern_preadv(int fd, struct uio *auio, int flags, int *res) 224 { 225 struct thread *td = curthread; 226 struct proc *p = td->td_proc; 227 struct file *fp; 228 int error; 229 230 KKASSERT(p); 231 232 fp = holdfp(p->p_fd, fd, FREAD); 233 if (fp == NULL) 234 return (EBADF); 235 if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) { 236 error = ESPIPE; 237 } else if (auio->uio_resid < 0) { 238 error = EINVAL; 239 } else { 240 error = dofileread(fd, fp, auio, flags, res); 241 } 242 fdrop(fp); 243 return(error); 244 } 245 246 /* 
247 * Common code for readv and preadv that reads data in 248 * from a file using the passed in uio, offset, and flags. 249 * 250 * MPALMOSTSAFE - ktrace needs help 251 */ 252 static int 253 dofileread(int fd, struct file *fp, struct uio *auio, int flags, int *res) 254 { 255 struct thread *td = curthread; 256 int error; 257 int len; 258 #ifdef KTRACE 259 struct iovec *ktriov = NULL; 260 struct uio ktruio; 261 #endif 262 263 #ifdef KTRACE 264 /* 265 * if tracing, save a copy of iovec 266 */ 267 if (KTRPOINT(td, KTR_GENIO)) { 268 int iovlen = auio->uio_iovcnt * sizeof(struct iovec); 269 270 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 271 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 272 ktruio = *auio; 273 } 274 #endif 275 len = auio->uio_resid; 276 error = fo_read(fp, auio, fp->f_cred, flags); 277 if (error) { 278 if (auio->uio_resid != len && (error == ERESTART || 279 error == EINTR || error == EWOULDBLOCK)) 280 error = 0; 281 } 282 #ifdef KTRACE 283 if (ktriov != NULL) { 284 if (error == 0) { 285 ktruio.uio_iov = ktriov; 286 ktruio.uio_resid = len - auio->uio_resid; 287 get_mplock(); 288 ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error); 289 rel_mplock(); 290 } 291 FREE(ktriov, M_TEMP); 292 } 293 #endif 294 if (error == 0) 295 *res = len - auio->uio_resid; 296 297 return(error); 298 } 299 300 /* 301 * Write system call 302 * 303 * MPSAFE 304 */ 305 int 306 sys_write(struct write_args *uap) 307 { 308 struct thread *td = curthread; 309 struct uio auio; 310 struct iovec aiov; 311 int error; 312 313 aiov.iov_base = (void *)(uintptr_t)uap->buf; 314 aiov.iov_len = uap->nbyte; 315 auio.uio_iov = &aiov; 316 auio.uio_iovcnt = 1; 317 auio.uio_offset = -1; 318 auio.uio_resid = uap->nbyte; 319 auio.uio_rw = UIO_WRITE; 320 auio.uio_segflg = UIO_USERSPACE; 321 auio.uio_td = td; 322 323 if (auio.uio_resid < 0) 324 error = EINVAL; 325 else 326 error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_result); 327 328 return(error); 329 } 330 331 /* 332 * 
Pwrite system call 333 * 334 * MPSAFE 335 */ 336 int 337 sys_extpwrite(struct extpwrite_args *uap) 338 { 339 struct thread *td = curthread; 340 struct uio auio; 341 struct iovec aiov; 342 int error; 343 int flags; 344 345 aiov.iov_base = (void *)(uintptr_t)uap->buf; 346 aiov.iov_len = uap->nbyte; 347 auio.uio_iov = &aiov; 348 auio.uio_iovcnt = 1; 349 auio.uio_offset = uap->offset; 350 auio.uio_resid = uap->nbyte; 351 auio.uio_rw = UIO_WRITE; 352 auio.uio_segflg = UIO_USERSPACE; 353 auio.uio_td = td; 354 355 flags = uap->flags & O_FMASK; 356 if (uap->offset != (off_t)-1) 357 flags |= O_FOFFSET; 358 359 if (auio.uio_resid < 0) 360 error = EINVAL; 361 else 362 error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_result); 363 364 return(error); 365 } 366 367 /* 368 * MPSAFE 369 */ 370 int 371 sys_writev(struct writev_args *uap) 372 { 373 struct thread *td = curthread; 374 struct uio auio; 375 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 376 int error; 377 378 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 379 &auio.uio_resid); 380 if (error) 381 return (error); 382 auio.uio_iov = iov; 383 auio.uio_iovcnt = uap->iovcnt; 384 auio.uio_offset = -1; 385 auio.uio_rw = UIO_WRITE; 386 auio.uio_segflg = UIO_USERSPACE; 387 auio.uio_td = td; 388 389 error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_result); 390 391 iovec_free(&iov, aiov); 392 return (error); 393 } 394 395 396 /* 397 * Gather positioned write system call 398 * 399 * MPSAFE 400 */ 401 int 402 sys_extpwritev(struct extpwritev_args *uap) 403 { 404 struct thread *td = curthread; 405 struct uio auio; 406 struct iovec aiov[UIO_SMALLIOV], *iov = NULL; 407 int error; 408 int flags; 409 410 error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt, 411 &auio.uio_resid); 412 if (error) 413 return (error); 414 auio.uio_iov = iov; 415 auio.uio_iovcnt = uap->iovcnt; 416 auio.uio_offset = uap->offset; 417 auio.uio_rw = UIO_WRITE; 418 auio.uio_segflg = UIO_USERSPACE; 419 auio.uio_td = td; 420 421 flags = 
uap->flags & O_FMASK; 422 if (uap->offset != (off_t)-1) 423 flags |= O_FOFFSET; 424 425 error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_result); 426 427 iovec_free(&iov, aiov); 428 return(error); 429 } 430 431 /* 432 * MPSAFE 433 */ 434 int 435 kern_pwritev(int fd, struct uio *auio, int flags, int *res) 436 { 437 struct thread *td = curthread; 438 struct proc *p = td->td_proc; 439 struct file *fp; 440 int error; 441 442 KKASSERT(p); 443 444 fp = holdfp(p->p_fd, fd, FWRITE); 445 if (fp == NULL) 446 return (EBADF); 447 else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) { 448 error = ESPIPE; 449 } else { 450 error = dofilewrite(fd, fp, auio, flags, res); 451 } 452 453 fdrop(fp); 454 return (error); 455 } 456 457 /* 458 * Common code for writev and pwritev that writes data to 459 * a file using the passed in uio, offset, and flags. 460 * 461 * MPALMOSTSAFE - ktrace needs help 462 */ 463 static int 464 dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, int *res) 465 { 466 struct thread *td = curthread; 467 struct lwp *lp = td->td_lwp; 468 int error; 469 int len; 470 #ifdef KTRACE 471 struct iovec *ktriov = NULL; 472 struct uio ktruio; 473 #endif 474 475 #ifdef KTRACE 476 /* 477 * if tracing, save a copy of iovec and uio 478 */ 479 if (KTRPOINT(td, KTR_GENIO)) { 480 int iovlen = auio->uio_iovcnt * sizeof(struct iovec); 481 482 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); 483 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen); 484 ktruio = *auio; 485 } 486 #endif 487 len = auio->uio_resid; 488 error = fo_write(fp, auio, fp->f_cred, flags); 489 if (error) { 490 if (auio->uio_resid != len && (error == ERESTART || 491 error == EINTR || error == EWOULDBLOCK)) 492 error = 0; 493 /* Socket layer is responsible for issuing SIGPIPE. 
*/ 494 if (error == EPIPE) { 495 get_mplock(); 496 lwpsignal(lp->lwp_proc, lp, SIGPIPE); 497 rel_mplock(); 498 } 499 } 500 #ifdef KTRACE 501 if (ktriov != NULL) { 502 if (error == 0) { 503 ktruio.uio_iov = ktriov; 504 ktruio.uio_resid = len - auio->uio_resid; 505 get_mplock(); 506 ktrgenio(lp, fd, UIO_WRITE, &ktruio, error); 507 rel_mplock(); 508 } 509 FREE(ktriov, M_TEMP); 510 } 511 #endif 512 if (error == 0) 513 *res = len - auio->uio_resid; 514 515 return(error); 516 } 517 518 /* 519 * Ioctl system call 520 */ 521 /* ARGSUSED */ 522 int 523 sys_ioctl(struct ioctl_args *uap) 524 { 525 return(mapped_ioctl(uap->fd, uap->com, uap->data, NULL)); 526 } 527 528 struct ioctl_map_entry { 529 const char *subsys; 530 struct ioctl_map_range *cmd_ranges; 531 LIST_ENTRY(ioctl_map_entry) entries; 532 }; 533 534 /* 535 * The true heart of all ioctl syscall handlers (native, emulation). 536 * If map != NULL, it will be searched for a matching entry for com, 537 * and appropriate conversions/conversion functions will be utilized. 
538 */ 539 int 540 mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map) 541 { 542 struct thread *td = curthread; 543 struct proc *p = td->td_proc; 544 struct ucred *cred; 545 struct file *fp; 546 struct ioctl_map_range *iomc = NULL; 547 int error; 548 u_int size; 549 u_long ocom = com; 550 caddr_t data, memp; 551 int tmp; 552 #define STK_PARAMS 128 553 union { 554 char stkbuf[STK_PARAMS]; 555 long align; 556 } ubuf; 557 558 KKASSERT(p); 559 cred = p->p_ucred; 560 561 fp = holdfp(p->p_fd, fd, FREAD|FWRITE); 562 if (fp == NULL) 563 return(EBADF); 564 565 if (map != NULL) { /* obey translation map */ 566 u_long maskcmd; 567 struct ioctl_map_entry *e; 568 569 maskcmd = com & map->mask; 570 571 LIST_FOREACH(e, &map->mapping, entries) { 572 for (iomc = e->cmd_ranges; iomc->start != 0 || 573 iomc->maptocmd != 0 || iomc->wrapfunc != NULL || 574 iomc->mapfunc != NULL; 575 iomc++) { 576 if (maskcmd >= iomc->start && 577 maskcmd <= iomc->end) 578 break; 579 } 580 581 /* Did we find a match? */ 582 if (iomc->start != 0 || iomc->maptocmd != 0 || 583 iomc->wrapfunc != NULL || iomc->mapfunc != NULL) 584 break; 585 } 586 587 if (iomc == NULL || 588 (iomc->start == 0 && iomc->maptocmd == 0 589 && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) { 590 kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n", 591 map->sys, fd, maskcmd, 592 (int)((maskcmd >> 8) & 0xff), 593 (int)(maskcmd & 0xff)); 594 error = EINVAL; 595 goto done; 596 } 597 598 /* 599 * If it's a non-range one to one mapping, maptocmd should be 600 * correct. If it's a ranged one to one mapping, we pass the 601 * original value of com, and for a range mapped to a different 602 * range, we always need a mapping function to translate the 603 * ioctl to our native ioctl. Ex. 
6500-65ff <-> 9500-95ff 604 */ 605 if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) { 606 com = iomc->maptocmd; 607 } else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) { 608 if (iomc->mapfunc != NULL) 609 com = iomc->mapfunc(iomc->start, iomc->end, 610 iomc->start, iomc->end, 611 com, com); 612 } else { 613 if (iomc->mapfunc != NULL) { 614 com = iomc->mapfunc(iomc->start, iomc->end, 615 iomc->maptocmd, iomc->maptoend, 616 com, ocom); 617 } else { 618 kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n", 619 map->sys, fd, maskcmd, 620 (int)((maskcmd >> 8) & 0xff), 621 (int)(maskcmd & 0xff)); 622 error = EINVAL; 623 goto done; 624 } 625 } 626 } 627 628 switch (com) { 629 case FIONCLEX: 630 error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE); 631 goto done; 632 case FIOCLEX: 633 error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE); 634 goto done; 635 } 636 637 /* 638 * Interpret high order word to find amount of data to be 639 * copied to/from the user's address space. 640 */ 641 size = IOCPARM_LEN(com); 642 if (size > IOCPARM_MAX) { 643 error = ENOTTY; 644 goto done; 645 } 646 647 memp = NULL; 648 if (size > sizeof (ubuf.stkbuf)) { 649 memp = kmalloc(size, M_IOCTLOPS, M_WAITOK); 650 data = memp; 651 } else { 652 data = ubuf.stkbuf; 653 } 654 if ((com & IOC_IN) != 0) { 655 if (size != 0) { 656 error = copyin(uspc_data, data, (u_int)size); 657 if (error) { 658 if (memp != NULL) 659 kfree(memp, M_IOCTLOPS); 660 goto done; 661 } 662 } else { 663 *(caddr_t *)data = uspc_data; 664 } 665 } else if ((com & IOC_OUT) != 0 && size) { 666 /* 667 * Zero the buffer so the user always 668 * gets back something deterministic. 
669 */ 670 bzero(data, size); 671 } else if ((com & IOC_VOID) != 0) { 672 *(caddr_t *)data = uspc_data; 673 } 674 675 switch (com) { 676 case FIONBIO: 677 if ((tmp = *(int *)data)) 678 fp->f_flag |= FNONBLOCK; 679 else 680 fp->f_flag &= ~FNONBLOCK; 681 error = 0; 682 break; 683 684 case FIOASYNC: 685 if ((tmp = *(int *)data)) 686 fp->f_flag |= FASYNC; 687 else 688 fp->f_flag &= ~FASYNC; 689 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred); 690 break; 691 692 default: 693 /* 694 * If there is a override function, 695 * call it instead of directly routing the call 696 */ 697 if (map != NULL && iomc->wrapfunc != NULL) 698 error = iomc->wrapfunc(fp, com, ocom, data, cred); 699 else 700 error = fo_ioctl(fp, com, data, cred); 701 /* 702 * Copy any data to user, size was 703 * already set and checked above. 704 */ 705 if (error == 0 && (com & IOC_OUT) != 0 && size != 0) 706 error = copyout(data, uspc_data, (u_int)size); 707 break; 708 } 709 if (memp != NULL) 710 kfree(memp, M_IOCTLOPS); 711 done: 712 fdrop(fp); 713 return(error); 714 } 715 716 int 717 mapped_ioctl_register_handler(struct ioctl_map_handler *he) 718 { 719 struct ioctl_map_entry *ne; 720 721 KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL && 722 he->subsys != NULL && *he->subsys != '\0'); 723 724 ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK); 725 726 ne->subsys = he->subsys; 727 ne->cmd_ranges = he->cmd_ranges; 728 729 LIST_INSERT_HEAD(&he->map->mapping, ne, entries); 730 731 return(0); 732 } 733 734 int 735 mapped_ioctl_unregister_handler(struct ioctl_map_handler *he) 736 { 737 struct ioctl_map_entry *ne; 738 739 KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL); 740 741 LIST_FOREACH(ne, &he->map->mapping, entries) { 742 if (ne->cmd_ranges != he->cmd_ranges) 743 continue; 744 LIST_REMOVE(ne, entries); 745 kfree(ne, M_IOCTLMAP); 746 return(0); 747 } 748 return(EINVAL); 749 } 750 751 static int nselcoll; /* Select collisions since boot */ 752 int 
selwait; 753 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); 754 755 /* 756 * Select system call. 757 */ 758 int 759 sys_select(struct select_args *uap) 760 { 761 struct timeval ktv; 762 struct timeval *ktvp; 763 int error; 764 765 /* 766 * Get timeout if any. 767 */ 768 if (uap->tv != NULL) { 769 error = copyin(uap->tv, &ktv, sizeof (ktv)); 770 if (error) 771 return (error); 772 error = itimerfix(&ktv); 773 if (error) 774 return (error); 775 ktvp = &ktv; 776 } else { 777 ktvp = NULL; 778 } 779 780 /* 781 * Do real work. 782 */ 783 error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp, 784 &uap->sysmsg_result); 785 786 return (error); 787 } 788 789 790 /* 791 * Pselect system call. 792 */ 793 int 794 sys_pselect(struct pselect_args *uap) 795 { 796 struct thread *td = curthread; 797 struct lwp *lp = td->td_lwp; 798 struct timespec kts; 799 struct timeval ktv; 800 struct timeval *ktvp; 801 sigset_t sigmask; 802 int error; 803 804 /* 805 * Get timeout if any and convert it. 806 * Round up during conversion to avoid timeout going off early. 807 */ 808 if (uap->ts != NULL) { 809 error = copyin(uap->ts, &kts, sizeof (kts)); 810 if (error) 811 return (error); 812 ktv.tv_sec = kts.tv_sec; 813 ktv.tv_usec = (kts.tv_nsec + 999) / 1000; 814 error = itimerfix(&ktv); 815 if (error) 816 return (error); 817 ktvp = &ktv; 818 } else { 819 ktvp = NULL; 820 } 821 822 /* 823 * Install temporary signal mask if any provided. 824 */ 825 if (uap->sigmask != NULL) { 826 error = copyin(uap->sigmask, &sigmask, sizeof(sigmask)); 827 if (error) 828 return (error); 829 lp->lwp_oldsigmask = lp->lwp_sigmask; 830 SIG_CANTMASK(sigmask); 831 lp->lwp_sigmask = sigmask; 832 } 833 834 /* 835 * Do real job. 
836 */ 837 error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp, 838 &uap->sysmsg_result); 839 840 if (uap->sigmask != NULL) { 841 /* doselect() responsible for turning ERESTART into EINTR */ 842 KKASSERT(error != ERESTART); 843 if (error == EINTR) { 844 /* 845 * We can't restore the previous signal mask now 846 * because it could block the signal that interrupted 847 * us. So make a note to restore it after executing 848 * the handler. 849 */ 850 lp->lwp_flag |= LWP_OLDMASK; 851 } else { 852 /* 853 * No handler to run. Restore previous mask immediately. 854 */ 855 lp->lwp_sigmask = lp->lwp_oldsigmask; 856 } 857 } 858 859 return (error); 860 } 861 862 /* 863 * Common code for sys_select() and sys_pselect(). 864 * 865 * in, out and ex are userland pointers. tv must point to validated 866 * kernel-side timeout value or NULL for infinite timeout. res must 867 * point to syscall return value. 868 */ 869 static int 870 doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tv, 871 int *res) 872 { 873 struct lwp *lp = curthread->td_lwp; 874 struct proc *p = curproc; 875 876 /* 877 * The magic 2048 here is chosen to be just enough for FD_SETSIZE 878 * infds with the new FD_SETSIZE of 1024, and more than enough for 879 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE 880 * of 256. 881 */ 882 fd_mask s_selbits[howmany(2048, NFDBITS)]; 883 fd_mask *ibits[3], *obits[3], *selbits, *sbp; 884 struct timeval atv, rtv, ttv; 885 int ncoll, error, timo; 886 u_int nbufbytes, ncpbytes, nfdbits; 887 888 if (nd < 0) 889 return (EINVAL); 890 if (nd > p->p_fd->fd_nfiles) 891 nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */ 892 893 /* 894 * Allocate just enough bits for the non-null fd_sets. Use the 895 * preallocated auto buffer if possible. 
896 */ 897 nfdbits = roundup(nd, NFDBITS); 898 ncpbytes = nfdbits / NBBY; 899 nbufbytes = 0; 900 if (in != NULL) 901 nbufbytes += 2 * ncpbytes; 902 if (ou != NULL) 903 nbufbytes += 2 * ncpbytes; 904 if (ex != NULL) 905 nbufbytes += 2 * ncpbytes; 906 if (nbufbytes <= sizeof s_selbits) 907 selbits = &s_selbits[0]; 908 else 909 selbits = kmalloc(nbufbytes, M_SELECT, M_WAITOK); 910 911 /* 912 * Assign pointers into the bit buffers and fetch the input bits. 913 * Put the output buffers together so that they can be bzeroed 914 * together. 915 */ 916 sbp = selbits; 917 #define getbits(name, x) \ 918 do { \ 919 if (name == NULL) \ 920 ibits[x] = NULL; \ 921 else { \ 922 ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ 923 obits[x] = sbp; \ 924 sbp += ncpbytes / sizeof *sbp; \ 925 error = copyin(name, ibits[x], ncpbytes); \ 926 if (error != 0) \ 927 goto done; \ 928 } \ 929 } while (0) 930 getbits(in, 0); 931 getbits(ou, 1); 932 getbits(ex, 2); 933 #undef getbits 934 if (nbufbytes != 0) 935 bzero(selbits, nbufbytes / 2); 936 937 if (tv != NULL) { 938 atv = *tv; 939 getmicrouptime(&rtv); 940 timevaladd(&atv, &rtv); 941 } else { 942 atv.tv_sec = 0; 943 atv.tv_usec = 0; 944 } 945 timo = 0; 946 retry: 947 ncoll = nselcoll; 948 lp->lwp_flag |= LWP_SELECT; 949 error = selscan(p, ibits, obits, nd, res); 950 if (error || *res) 951 goto done; 952 if (atv.tv_sec || atv.tv_usec) { 953 getmicrouptime(&rtv); 954 if (timevalcmp(&rtv, &atv, >=)) 955 goto done; 956 ttv = atv; 957 timevalsub(&ttv, &rtv); 958 timo = ttv.tv_sec > 24 * 60 * 60 ? 959 24 * 60 * 60 * hz : tvtohz_high(&ttv); 960 } 961 crit_enter(); 962 if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) { 963 crit_exit(); 964 goto retry; 965 } 966 lp->lwp_flag &= ~LWP_SELECT; 967 968 error = tsleep((caddr_t)&selwait, PCATCH, "select", timo); 969 970 crit_exit(); 971 if (error == 0) 972 goto retry; 973 done: 974 lp->lwp_flag &= ~LWP_SELECT; 975 /* select is not restarted after signals... 
*/ 976 if (error == ERESTART) 977 error = EINTR; 978 if (error == EWOULDBLOCK) 979 error = 0; 980 #define putbits(name, x) \ 981 if (name && (error2 = copyout(obits[x], name, ncpbytes))) \ 982 error = error2; 983 if (error == 0) { 984 int error2; 985 986 putbits(in, 0); 987 putbits(ou, 1); 988 putbits(ex, 2); 989 #undef putbits 990 } 991 if (selbits != &s_selbits[0]) 992 kfree(selbits, M_SELECT); 993 return (error); 994 } 995 996 static int 997 selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res) 998 { 999 int msk, i, fd; 1000 fd_mask bits; 1001 struct file *fp; 1002 int n = 0; 1003 /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ 1004 static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; 1005 1006 for (msk = 0; msk < 3; msk++) { 1007 if (ibits[msk] == NULL) 1008 continue; 1009 for (i = 0; i < nfd; i += NFDBITS) { 1010 bits = ibits[msk][i/NFDBITS]; 1011 /* ffs(int mask) not portable, fd_mask is long */ 1012 for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { 1013 if (!(bits & 1)) 1014 continue; 1015 fp = holdfp(p->p_fd, fd, -1); 1016 if (fp == NULL) 1017 return (EBADF); 1018 if (fo_poll(fp, flag[msk], fp->f_cred)) { 1019 obits[msk][(fd)/NFDBITS] |= 1020 ((fd_mask)1 << ((fd) % NFDBITS)); 1021 n++; 1022 } 1023 fdrop(fp); 1024 } 1025 } 1026 } 1027 *res = n; 1028 return (0); 1029 } 1030 1031 /* 1032 * Poll system call. 1033 */ 1034 int 1035 sys_poll(struct poll_args *uap) 1036 { 1037 struct pollfd *bits; 1038 struct pollfd smallbits[32]; 1039 struct timeval atv, rtv, ttv; 1040 int ncoll, error = 0, timo; 1041 u_int nfds; 1042 size_t ni; 1043 struct lwp *lp = curthread->td_lwp; 1044 struct proc *p = curproc; 1045 1046 nfds = uap->nfds; 1047 /* 1048 * This is kinda bogus. We have fd limits, but that is not 1049 * really related to the size of the pollfd array. Make sure 1050 * we let the process use at least FD_SETSIZE entries and at 1051 * least enough for the current limits. 
We want to be reasonably 1052 * safe, but not overly restrictive. 1053 */ 1054 if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE) 1055 return (EINVAL); 1056 ni = nfds * sizeof(struct pollfd); 1057 if (ni > sizeof(smallbits)) 1058 bits = kmalloc(ni, M_TEMP, M_WAITOK); 1059 else 1060 bits = smallbits; 1061 error = copyin(uap->fds, bits, ni); 1062 if (error) 1063 goto done; 1064 if (uap->timeout != INFTIM) { 1065 atv.tv_sec = uap->timeout / 1000; 1066 atv.tv_usec = (uap->timeout % 1000) * 1000; 1067 if (itimerfix(&atv)) { 1068 error = EINVAL; 1069 goto done; 1070 } 1071 getmicrouptime(&rtv); 1072 timevaladd(&atv, &rtv); 1073 } else { 1074 atv.tv_sec = 0; 1075 atv.tv_usec = 0; 1076 } 1077 timo = 0; 1078 retry: 1079 ncoll = nselcoll; 1080 lp->lwp_flag |= LWP_SELECT; 1081 error = pollscan(p, bits, nfds, &uap->sysmsg_result); 1082 if (error || uap->sysmsg_result) 1083 goto done; 1084 if (atv.tv_sec || atv.tv_usec) { 1085 getmicrouptime(&rtv); 1086 if (timevalcmp(&rtv, &atv, >=)) 1087 goto done; 1088 ttv = atv; 1089 timevalsub(&ttv, &rtv); 1090 timo = ttv.tv_sec > 24 * 60 * 60 ? 1091 24 * 60 * 60 * hz : tvtohz_high(&ttv); 1092 } 1093 crit_enter(); 1094 if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) { 1095 crit_exit(); 1096 goto retry; 1097 } 1098 lp->lwp_flag &= ~LWP_SELECT; 1099 error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo); 1100 crit_exit(); 1101 if (error == 0) 1102 goto retry; 1103 done: 1104 lp->lwp_flag &= ~LWP_SELECT; 1105 /* poll is not restarted after signals... 
*/ 1106 if (error == ERESTART) 1107 error = EINTR; 1108 if (error == EWOULDBLOCK) 1109 error = 0; 1110 if (error == 0) { 1111 error = copyout(bits, uap->fds, ni); 1112 if (error) 1113 goto out; 1114 } 1115 out: 1116 if (ni > sizeof(smallbits)) 1117 kfree(bits, M_TEMP); 1118 return (error); 1119 } 1120 1121 static int 1122 pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res) 1123 { 1124 int i; 1125 struct file *fp; 1126 int n = 0; 1127 1128 for (i = 0; i < nfd; i++, fds++) { 1129 if (fds->fd >= p->p_fd->fd_nfiles) { 1130 fds->revents = POLLNVAL; 1131 n++; 1132 } else if (fds->fd < 0) { 1133 fds->revents = 0; 1134 } else { 1135 fp = holdfp(p->p_fd, fds->fd, -1); 1136 if (fp == NULL) { 1137 fds->revents = POLLNVAL; 1138 n++; 1139 } else { 1140 /* 1141 * Note: backend also returns POLLHUP and 1142 * POLLERR if appropriate. 1143 */ 1144 fds->revents = fo_poll(fp, fds->events, 1145 fp->f_cred); 1146 if (fds->revents != 0) 1147 n++; 1148 fdrop(fp); 1149 } 1150 } 1151 } 1152 *res = n; 1153 return (0); 1154 } 1155 1156 /* 1157 * OpenBSD poll system call. 1158 * XXX this isn't quite a true representation.. OpenBSD uses select ops. 1159 */ 1160 int 1161 sys_openbsd_poll(struct openbsd_poll_args *uap) 1162 { 1163 return (sys_poll((struct poll_args *)uap)); 1164 } 1165 1166 /*ARGSUSED*/ 1167 int 1168 seltrue(cdev_t dev, int events) 1169 { 1170 return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); 1171 } 1172 1173 /* 1174 * Record a select request. A global wait must be used since a process/thread 1175 * might go away after recording its request. 
1176 */ 1177 void 1178 selrecord(struct thread *selector, struct selinfo *sip) 1179 { 1180 struct proc *p; 1181 struct lwp *lp = NULL; 1182 1183 if (selector->td_lwp == NULL) 1184 panic("selrecord: thread needs a process"); 1185 1186 if (sip->si_pid == selector->td_proc->p_pid && 1187 sip->si_tid == selector->td_lwp->lwp_tid) 1188 return; 1189 if (sip->si_pid && (p = pfind(sip->si_pid))) 1190 lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid); 1191 if (lp != NULL && lp->lwp_wchan == (caddr_t)&selwait) { 1192 sip->si_flags |= SI_COLL; 1193 } else { 1194 sip->si_pid = selector->td_proc->p_pid; 1195 sip->si_tid = selector->td_lwp->lwp_tid; 1196 } 1197 } 1198 1199 /* 1200 * Do a wakeup when a selectable event occurs. 1201 */ 1202 void 1203 selwakeup(struct selinfo *sip) 1204 { 1205 struct proc *p; 1206 struct lwp *lp = NULL; 1207 1208 if (sip->si_pid == 0) 1209 return; 1210 if (sip->si_flags & SI_COLL) { 1211 nselcoll++; 1212 sip->si_flags &= ~SI_COLL; 1213 wakeup((caddr_t)&selwait); /* YYY fixable */ 1214 } 1215 p = pfind(sip->si_pid); 1216 sip->si_pid = 0; 1217 if (p == NULL) 1218 return; 1219 lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid); 1220 if (lp == NULL) 1221 return; 1222 1223 crit_enter(); 1224 if (lp->lwp_wchan == (caddr_t)&selwait) { 1225 /* 1226 * Flag the process to break the tsleep when 1227 * setrunnable is called, but only call setrunnable 1228 * here if the process is not in a stopped state. 1229 */ 1230 lp->lwp_flag |= LWP_BREAKTSLEEP; 1231 if (p->p_stat != SSTOP) 1232 setrunnable(lp); 1233 } else if (lp->lwp_flag & LWP_SELECT) { 1234 lp->lwp_flag &= ~LWP_SELECT; 1235 } 1236 crit_exit(); 1237 } 1238 1239