1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $ 40 * $DragonFly: src/sys/kern/sys_generic.c,v 1.47 2008/01/10 22:30:27 nth Exp $ 41 */ 42 43 #include "opt_ktrace.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/sysproto.h> 48 #include <sys/filedesc.h> 49 #include <sys/filio.h> 50 #include <sys/fcntl.h> 51 #include <sys/file.h> 52 #include <sys/proc.h> 53 #include <sys/signalvar.h> 54 #include <sys/socketvar.h> 55 #include <sys/uio.h> 56 #include <sys/kernel.h> 57 #include <sys/kern_syscall.h> 58 #include <sys/malloc.h> 59 #include <sys/mapped_ioctl.h> 60 #include <sys/poll.h> 61 #include <sys/queue.h> 62 #include <sys/resourcevar.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysent.h> 65 #include <sys/buf.h> 66 #ifdef KTRACE 67 #include <sys/ktrace.h> 68 #endif 69 #include <vm/vm.h> 70 #include <vm/vm_page.h> 71 #include <sys/file2.h> 72 73 #include <machine/limits.h> 74 75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 76 static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer"); 77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 78 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 79 80 static int doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, 81 struct timeval *tv, int *res); 82 static int pollscan (struct proc *, struct pollfd *, u_int, int *); 83 static int selscan 
(struct proc *, fd_mask **, fd_mask **,
			int, int *);
static int	dofileread(int, struct file *, struct uio *, int, int *);
static int	dofilewrite(int, struct file *, struct uio *, int, int *);

/*
 * Read system call.
 *
 * Builds a single-element iovec/uio describing the user buffer and
 * delegates the transfer to kern_preadv() with no positioning flags
 * (uio_offset of -1 means "use the file's current seek position").
 *
 * MPSAFE
 */
int
sys_read(struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	/* A resid that went negative means nbyte was too large. */
	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_result);
	return(error);
}

/*
 * Positioned (Pread) read system call.
 *
 * Like sys_read() but reads at uap->offset; an offset of (off_t)-1
 * falls back to the file's current seek position (O_FOFFSET is only
 * set when an explicit offset was supplied).  Extra flags from
 * userland are masked down to the O_FMASK set.
 *
 * MPSAFE
 */
int
sys_extpread(struct extpread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;	/* caller supplied an explicit offset */

	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_result);
	return(error);
}

/*
 * Scatter read system call.
 *
 * MPSAFE
 */
int
sys_readv(struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	/*
	 * Copy the user iovec array in; small counts use the on-stack
	 * aiov[] and iovec_free() below releases any heap copy.
	 */
	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return (error);
}


/*
 * Scatter positioned read system call.
 *
 * Same offset/flags convention as sys_extpread(), with an iovec
 * array copied in from userland.
 *
 * MPSAFE
 */
int
sys_extpreadv(struct extpreadv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * Common entry for all read paths: validates the descriptor and the
 * uio, then hands off to dofileread().  On success *res receives the
 * byte count actually transferred.
 *
 * Returns EBADF if fd is not open for reading, ESPIPE if a positioned
 * read is attempted on a non-vnode (non-seekable) file, EINVAL on a
 * negative resid.
 *
 * MPSAFE
 */
int
kern_preadv(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	/* holdfp() returns a referenced struct file or NULL. */
	fp = holdfp(p->p_fd, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else if (auio->uio_resid < 0) {
		error = EINVAL;
	} else {
		error = dofileread(fd, fp, auio, flags, res);
	}
	fdrop(fp);	/* release the reference taken by holdfp() */
	return(error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * The caller is responsible for holding the file reference (see
 * kern_preadv()).  A partial transfer interrupted by a signal or a
 * would-block condition is reported as success with the short count.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofileread(int fd, struct file *fp, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int error;
	int len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec array up front: the
	 * transfer below advances/consumes the live uio, and the trace
	 * record needs the original description.
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags);
	if (error) {
		/*
		 * If some data was transferred before the interruption,
		 * report the short read as success rather than the error.
		 */
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			/* ktrgenio() is not MP-safe; take the big lock. */
			get_mplock();
			ktrgenio(p, fd, UIO_READ, &ktruio, error);
			rel_mplock();
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	/* Bytes transferred = original resid minus what is left. */
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Write system call.
 *
 * Mirror image of sys_read(): one iovec, current seek position.
 *
 * MPSAFE
 */
int
sys_write(struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_result);

	return(error);
}

/*
 * Pwrite system call.
 *
 * Positioned write; same offset/flags convention as sys_extpread().
 *
 * MPSAFE
 */
int
sys_extpwrite(struct extpwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;	/* caller supplied an explicit offset */

	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_result);

	return(error);
}

/*
 * Gather write system call.
 *
 * MPSAFE
 */
int
sys_writev(struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	/* Copy the user iovec array in; small counts use aiov[]. */
	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return (error);
}


/*
 * Gather positioned write system call.
 *
 * MPSAFE
 */
int
sys_extpwritev(struct extpwritev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * Common entry for all write paths: validates the descriptor, then
 * hands off to dofilewrite().  On success *res receives the byte
 * count actually transferred.
 *
 * Returns EBADF if fd is not open for writing, ESPIPE if a positioned
 * write is attempted on a non-vnode (non-seekable) file.
 *
 * MPSAFE
 */
int
kern_pwritev(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	/* holdfp() returns a referenced struct file or NULL. */
	fp = holdfp(p->p_fd, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}

	fdrop(fp);	/* release the reference taken by holdfp() */
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * Like dofileread() but additionally throttles vnode writes via
 * bwillwrite() and delivers SIGPIPE on EPIPE.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct proc *p = td->td_proc;
	int error;
	int len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * If tracing, save a copy of the iovec and uio before the
	 * transfer consumes them (see dofileread()).
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	/* Throttle if the buffer cache is backed up with dirty buffers. */
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		/* Report an interrupted partial transfer as a short write. */
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE.
		 */
		if (error == EPIPE) {
			/* lwpsignal() is not MP-safe here; take the big lock. */
			get_mplock();
			lwpsignal(p, lp, SIGPIPE);
			rel_mplock();
		}
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			get_mplock();
			ktrgenio(p, fd, UIO_WRITE, &ktruio, error);
			rel_mplock();
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	/* Bytes transferred = original resid minus what is left. */
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Ioctl system call.
 *
 * Thin wrapper: the native path is mapped_ioctl() with no
 * translation map.
 */
/* ARGSUSED */
int
sys_ioctl(struct ioctl_args *uap)
{
	return(mapped_ioctl(uap->fd, uap->com, uap->data, NULL));
}

/*
 * One registered handler on an ioctl translation map's mapping list
 * (see mapped_ioctl_register_handler()).
 */
struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};

/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
 */
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct ucred *cred;
	struct file *fp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;	/* original cmd, kept for mapfunc/wrapfunc */
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* Small argument staging buffer; union forces long alignment. */
	union {
	    char stkbuf[STK_PARAMS];
	    long align;
	} ubuf;

	KKASSERT(p);
	cred = p->p_ucred;

	fp = holdfp(p->p_fd, fd, FREAD|FWRITE);
	if (fp == NULL)
		return(EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		/*
		 * Scan every registered handler's range table.  A table
		 * is terminated by an all-zero/NULL sentinel entry, which
		 * is why the loop conditions below test all four fields.
		 */
		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}

		/* No entry, or we stopped on a sentinel: unimplemented cmd. */
		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0
		     && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
			       map->sys, fd, maskcmd,
			       (int)((maskcmd >> 8) & 0xff),
			       (int)(maskcmd & 0xff));
			error = EINVAL;
			goto done;
		}

		/*
		 * If it's a non-range one to one mapping, maptocmd should be
		 * correct. If it's a ranged one to one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, com);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
				       map->sys, fd, maskcmd,
				       (int)((maskcmd >> 8) & 0xff),
				       (int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}

	/* Descriptor-table commands handled without touching the file ops. */
	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	/* Stage the argument in ubuf, or on the heap if it doesn't fit. */
	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if ((com & IOC_IN) != 0) {
		if (size != 0) {
			error = copyin(uspc_data, data, (u_int)size);
			if (error) {
				if (memp != NULL)
					kfree(memp, M_IOCTLOPS);
				goto done;
			}
		} else {
			/* Zero-size IOC_IN: pass the user pointer itself. */
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if ((com & IOC_VOID) != 0) {
		*(caddr_t *)data = uspc_data;
	}

	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred);
		break;

	default:
		/*
		 * If there is a override function,
		 * call it instead of directly routing the call
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (u_int)size);
		break;
	}
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
done:
	fdrop(fp);
	return(error);
}

/*
 * Add a subsystem's command-range table to an ioctl translation map.
 * The handler's cmd_ranges pointer doubles as its identity for
 * unregistration below.
 */
int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);

	return(0);
}

/*
 * Remove a previously registered handler, matched by its cmd_ranges
 * pointer.  Returns EINVAL if no matching entry is on the list.
 */
int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges != he->cmd_ranges)
			continue;
		LIST_REMOVE(ne, entries);
		kfree(ne, M_IOCTLMAP);
		return(0);
	}
	return(EINVAL);
}

static int	nselcoll;	/* Select collisions since boot */
int
	selwait;	/* global select/poll sleep channel */
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 *
 * Copies in and validates the optional timeout, then delegates to
 * doselect() which does the scanning and sleeping.
 */
int
sys_select(struct select_args *uap)
{
	struct timeval ktv;
	struct timeval *ktvp;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &ktv, sizeof (ktv));
		if (error)
			return (error);
		error = itimerfix(&ktv);
		if (error)
			return (error);
		ktvp = &ktv;
	} else {
		ktvp = NULL;	/* NULL timeout = wait forever */
	}

	/*
	 * Do real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp,
			 &uap->sysmsg_result);

	return (error);
}


/*
 * Pselect system call.
 *
 * Like sys_select() but takes a timespec timeout and an optional
 * temporary signal mask which is installed for the duration of the
 * wait and restored afterwards (or deferred to the signal handler
 * return if we were interrupted -- see below).
 */
int
sys_pselect(struct pselect_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec kts;
	struct timeval ktv;
	struct timeval *ktvp;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any and convert it.
	 * Round up during conversion to avoid timeout going off early.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof (kts));
		if (error)
			return (error);
		ktv.tv_sec = kts.tv_sec;
		ktv.tv_usec = (kts.tv_nsec + 999) / 1000;
		error = itimerfix(&ktv);
		if (error)
			return (error);
		ktvp = &ktv;
	} else {
		ktvp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);	/* never block unmaskable signals */
		lp->lwp_sigmask = sigmask;
	}

	/*
	 * Do real job.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp,
			 &uap->sysmsg_result);

	if (uap->sigmask != NULL) {
		/* doselect() responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flag |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run. Restore previous mask immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
		}
	}

	return (error);
}

/*
 * Common code for sys_select() and sys_pselect().
 *
 * in, out and ex are userland pointers. tv must point to validated
 * kernel-side timeout value or NULL for infinite timeout. res must
 * point to syscall return value.
 */
static int
doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tv,
	 int *res)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = curproc;

	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles)
		nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	/* Each present set needs an input copy AND an output copy. */
	if (in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = kmalloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.  Outputs occupy the first half of the buffer, the
	 * copied-in inputs the second half.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done;				\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	/* Convert the relative timeout into an absolute uptime deadline. */
	if (tv != NULL) {
		atv = *tv;
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	lp->lwp_flag |= LWP_SELECT;
	error = selscan(p, ibits, obits, nd, res);
	if (error || *res)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;	/* deadline already passed */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24h worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	/*
	 * Close the race between the scan and the sleep: if an event
	 * cleared LWP_SELECT or a collision bumped nselcoll while we
	 * were scanning, rescan instead of sleeping.
	 */
	crit_enter();
	if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	lp->lwp_flag &= ~LWP_SELECT;

	error = tsleep((caddr_t)&selwait, PCATCH, "select", timo);

	crit_exit();
	if (error == 0)
		goto retry;	/* woken by an event; rescan */
done:
	lp->lwp_flag &= ~LWP_SELECT;
	/* select is not restarted after signals...
	 */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* timeout expired: report zero ready fds */
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		kfree(selbits, M_SELECT);
	return (error);
}

/*
 * Scan the three requested-event bitmaps, polling each set fd and
 * setting the corresponding output bit when the fd is ready.
 * *res receives the number of ready descriptors.
 */
static int
selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res)
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = holdfp(p->p_fd, fd, -1);
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
				fdrop(fp);
			}
		}
	}
	*res = n;
	return (0);
}

/*
 * Poll system call.
 *
 * Same overall retry/sleep structure as doselect(), operating on an
 * array of struct pollfd instead of fd_set bitmaps.
 */
int
sys_poll(struct poll_args *uap)
{
	struct pollfd *bits;
	struct pollfd smallbits[32];	/* avoids kmalloc for small arrays */
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = curproc;

	nfds = uap->nfds;
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.
We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmalloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done;
	/* Convert the ms timeout into an absolute uptime deadline. */
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	lp->lwp_flag |= LWP_SELECT;
	error = pollscan(p, bits, nfds, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;	/* deadline already passed */
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* Clamp the sleep to 24h worth of ticks. */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	/*
	 * Close the race between the scan and the sleep, exactly as in
	 * doselect(): rescan if an event or collision intervened.
	 */
	crit_enter();
	if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	lp->lwp_flag &= ~LWP_SELECT;
	error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo);
	crit_exit();
	if (error == 0)
		goto retry;	/* woken by an event; rescan */
done:
	lp->lwp_flag &= ~LWP_SELECT;
	/* poll is not restarted after signals...
	 */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;	/* timeout expired: report zero ready fds */
	if (error == 0) {
		/* Write revents back to userland. */
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;	/* NOTE: falls through to out: anyway */
	}
out:
	if (ni > sizeof(smallbits))
		kfree(bits, M_TEMP);
	return (error);
}

/*
 * Poll each descriptor in the pollfd array once, filling in revents.
 * Invalid descriptors get POLLNVAL (and count as "ready" per POSIX);
 * negative fds are ignored with revents cleared.  *res receives the
 * count of entries with non-zero revents.
 */
static int
pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res)
{
	int i;
	struct file *fp;
	int n = 0;

	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= p->p_fd->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = holdfp(p->p_fd, fds->fd, -1);
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
						       fp->f_cred);
				if (fds->revents != 0)
					n++;
				fdrop(fp);
			}
		}
	}
	*res = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}

/*
 * Degenerate poll backend for devices that are always ready for
 * normal read/write.
 */
/*ARGSUSED*/
int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  A global wait must be used since a process/thread
 * might go away after recording its request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (selector->td_lwp == NULL)
		panic("selrecord: thread needs a process");

	/* Already recorded for this very lwp -- nothing to do. */
	if (sip->si_pid == selector->td_proc->p_pid &&
	    sip->si_tid == selector->td_lwp->lwp_tid)
		return;
	if (sip->si_pid && (p = pfind(sip->si_pid)))
		lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp != NULL && lp->lwp_wchan == (caddr_t)&selwait) {
		/*
		 * Another lwp is already sleeping on this selinfo; mark
		 * a collision so selwakeup() broadcasts on &selwait.
		 */
		sip->si_flags |= SI_COLL;
	} else {
		/* Record ourselves as the (single) waiter. */
		sip->si_pid = selector->td_proc->p_pid;
		sip->si_tid = selector->td_lwp->lwp_tid;
	}
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (sip->si_pid == 0)
		return;	/* nobody recorded */
	if (sip->si_flags & SI_COLL) {
		/* Multiple waiters collided; wake everybody on &selwait. */
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);	/* YYY fixable */
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;	/* consume the recorded request */
	if (p == NULL)
		return;
	lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp == NULL)
		return;

	crit_enter();
	if (lp->lwp_wchan == (caddr_t)&selwait) {
		/*
		 * Flag the process to break the tsleep when
		 * setrunnable is called, but only call setrunnable
		 * here if the process is not in a stopped state.
		 */
		lp->lwp_flag |= LWP_BREAKTSLEEP;
		if (p->p_stat != SSTOP)
			setrunnable(lp);
	} else if (lp->lwp_flag & LWP_SELECT) {
		/* Not asleep yet: clearing the flag forces a rescan. */
		lp->lwp_flag &= ~LWP_SELECT;
	}
	crit_exit();
}