1 /* 2 * Copyright (c) 1982, 1986, 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 39 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $ 40 * $DragonFly: src/sys/kern/sys_generic.c,v 1.43 2007/02/18 16:13:27 corecode Exp $ 41 */ 42 43 #include "opt_ktrace.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/sysproto.h> 48 #include <sys/filedesc.h> 49 #include <sys/filio.h> 50 #include <sys/fcntl.h> 51 #include <sys/file.h> 52 #include <sys/proc.h> 53 #include <sys/signalvar.h> 54 #include <sys/socketvar.h> 55 #include <sys/uio.h> 56 #include <sys/kernel.h> 57 #include <sys/kern_syscall.h> 58 #include <sys/malloc.h> 59 #include <sys/mapped_ioctl.h> 60 #include <sys/poll.h> 61 #include <sys/queue.h> 62 #include <sys/resourcevar.h> 63 #include <sys/sysctl.h> 64 #include <sys/sysent.h> 65 #include <sys/buf.h> 66 #ifdef KTRACE 67 #include <sys/ktrace.h> 68 #endif 69 #include <vm/vm.h> 70 #include <vm/vm_page.h> 71 #include <sys/file2.h> 72 73 #include <machine/limits.h> 74 75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); 76 static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer"); 77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); 78 MALLOC_DEFINE(M_IOV, "iov", "large iov's"); 79 80 static int pollscan (struct proc *, struct pollfd *, u_int, int *); 81 static int selscan (struct proc *, fd_mask **, fd_mask **, 82 int, int *); 83 static int dofileread(int, struct 
file *, struct uio *, int, int *);
static int	dofilewrite(int, struct file *, struct uio *, int, int *);

/*
 * Read system call.
 *
 * Builds a single-element uio over the user buffer and hands it to
 * kern_preadv() with no positioning flags (uio_offset == -1 means
 * "use the file's current offset").
 *
 * MPSAFE
 */
int
sys_read(struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	/* reject a nbyte large enough to go negative in uio_resid */
	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_result);
	return(error);
}

/*
 * Positioned (Pread) read system call.
 *
 * Like sys_read() but takes an explicit file offset.  An offset of
 * (off_t)-1 means "current position" and suppresses O_FOFFSET.
 *
 * MPSAFE
 */
int
sys_extpread(struct extpread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	/* only user-settable flags pass through; O_FOFFSET is kernel-derived */
	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_result);
	return(error);
}

/*
 * Scatter read system call.
 *
 * Copies the user's iovec array in (using the stack array for small
 * counts, a kmalloc'd one otherwise) and reads at the current offset.
 *
 * MPSAFE
 */
int
sys_readv(struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_result);

	/* frees only if iovec_copyin() allocated (iov != aiov) */
	iovec_free(&iov, aiov);
	return (error);
}


/*
 * Scatter positioned read system call.
 *
 * readv() with an explicit offset; see sys_extpread() for the
 * offset/flags conventions.
 *
 * MPSAFE
 */
int
sys_extpreadv(struct extpreadv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * Common kernel entry for all read-side syscalls.
 *
 * Holds a reference on the file (requiring FREAD), validates that a
 * positioned read only targets a vnode (ESPIPE otherwise) and that the
 * residual count is non-negative, then delegates to dofileread().
 * On success *res receives the byte count transferred.
 *
 * MPSAFE
 */
int
kern_preadv(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else if (auio->uio_resid < 0) {
		error = EINVAL;
	} else {
		error = dofileread(fd, fp, auio, flags, res);
	}
	fdrop(fp);
	return(error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * A partial transfer interrupted by ERESTART/EINTR/EWOULDBLOCK is
 * reported as success for the bytes already moved.
 *
 * MPALMOSTSAFE - ktrace needs help (the mplock is taken around
 * ktrgenio() below).
 */
static int
dofileread(int fd, struct file *fp, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int error;
	int len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec before fo_read() consumes it
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags);
	if (error) {
		/* partial transfer before interruption counts as success */
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			get_mplock();
			ktrgenio(p, fd, UIO_READ, &ktruio, error);
			rel_mplock();
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Write system call.
 *
 * Mirror image of sys_read(); see kern_pwritev() for the back end.
 *
 * MPSAFE
 */
int
sys_write(struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_result);

	return(error);
}

/*
 * Pwrite system call.
 *
 * Positioned write; an offset of (off_t)-1 means "current position"
 * and suppresses O_FOFFSET (same convention as sys_extpread()).
 *
 * MPSAFE
 */
int
sys_extpwrite(struct extpwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_result);

	return(error);
}

/*
 * Gather write system call.
 *
 * Copies the user iovec array in and writes at the current offset.
 *
 * MPSAFE
 */
int
sys_writev(struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return (error);
}


/*
 * Gather positioned write system call.
 *
 * MPSAFE
 */
int
sys_extpwritev(struct extpwritev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * Common kernel entry for all write-side syscalls.
 *
 * Holds a reference on the file (requiring FWRITE), rejects a
 * positioned write on a non-vnode with ESPIPE, then delegates to
 * dofilewrite().  Note: unlike kern_preadv(), the uio_resid < 0
 * check is done by the callers rather than here.
 *
 * MPSAFE
 */
int
kern_pwritev(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}

	fdrop(fp);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * Delivers SIGPIPE on EPIPE (the socket layer only returns the error),
 * and reports a partial transfer interrupted by ERESTART/EINTR/
 * EWOULDBLOCK as success for the bytes already moved.
 *
 * MPALMOSTSAFE - ktrace needs help (mplock taken around ktrgenio()
 * and ksignal() below).
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int error;
	int len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio before fo_write()
	 * consumes them
	 */
	if (KTRPOINT(td, KTR_GENIO)) {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();		/* throttle before dirtying buffers */
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE) {
			get_mplock();
			ksignal(p, SIGPIPE);
			rel_mplock();
		}
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			get_mplock();
			ktrgenio(p, fd, UIO_WRITE, &ktruio, error);
			rel_mplock();
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Ioctl system call.
 *
 * Thin wrapper: the real work, including emulation-map translation,
 * lives in mapped_ioctl() (called here with a NULL map).
 */
/* ARGSUSED */
int
sys_ioctl(struct ioctl_args *uap)
{
	return(mapped_ioctl(uap->fd, uap->com, uap->data, NULL));
}

/*
 * One registered subsystem in an ioctl translation map; cmd_ranges
 * points at a sentinel-terminated array of command ranges.
 */
struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};

/*
 * The true heart of all ioctl syscall handlers (native, emulation).
 * If map != NULL, it will be searched for a matching entry for com,
 * and appropriate conversions/conversion functions will be utilized.
 */
int
mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct ucred *cred;
	struct file *fp;
	struct ioctl_map_range *iomc = NULL;
	int error;
	u_int size;
	u_long ocom = com;	/* preserve the pre-translation command */
	caddr_t data, memp;
	int tmp;
#define STK_PARAMS	128
	/* small-argument fast path: stack buffer, long-aligned */
	union {
		char stkbuf[STK_PARAMS];
		long align;
	} ubuf;

	KKASSERT(p);
	cred = p->p_ucred;

	fp = holdfp(p->p_fd, fd, FREAD|FWRITE);
	if (fp == NULL)
		return(EBADF);

	if (map != NULL) {	/* obey translation map */
		u_long maskcmd;
		struct ioctl_map_entry *e;

		maskcmd = com & map->mask;

		/*
		 * Search every registered subsystem's range list.  A
		 * range array is terminated by an all-zero sentinel
		 * entry, which is why the loop condition tests all
		 * four fields.
		 */
		LIST_FOREACH(e, &map->mapping, entries) {
			for (iomc = e->cmd_ranges; iomc->start != 0 ||
			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
			     iomc->mapfunc != NULL;
			     iomc++) {
				if (maskcmd >= iomc->start &&
				    maskcmd <= iomc->end)
					break;
			}

			/* Did we find a match? */
			if (iomc->start != 0 || iomc->maptocmd != 0 ||
			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
				break;
		}

		if (iomc == NULL ||
		    (iomc->start == 0 && iomc->maptocmd == 0
		     && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
			kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
			       map->sys, fd, maskcmd,
			       (int)((maskcmd >> 8) & 0xff),
			       (int)(maskcmd & 0xff));
			error = EINVAL;
			goto done;
		}

		/*
		 * If it's a non-range one to one mapping, maptocmd should be
		 * correct. If it's a ranged one to one mapping, we pass the
		 * original value of com, and for a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, com);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
				       map->sys, fd, maskcmd,
				       (int)((maskcmd >> 8) & 0xff),
				       (int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}

	/* close-on-exec flag changes need no argument buffer */
	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if ((com & IOC_IN) != 0) {
		if (size != 0) {
			error = copyin(uspc_data, data, (u_int)size);
			if (error) {
				if (memp != NULL)
					kfree(memp, M_IOCTLOPS);
				goto done;
			}
		} else {
			/* no in-band data: pass the user pointer itself */
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if ((com & IOC_VOID) != 0) {
		*(caddr_t *)data = uspc_data;
	}

	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred);
		break;

	default:
		/*
		 * If there is a override function,
		 * call it instead of directly routing the call
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (u_int)size);
		break;
	}
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
done:
	fdrop(fp);
	return(error);
}

/*
 * Register a subsystem's command-range table in an ioctl translation
 * map.  The handler's subsys string and cmd_ranges array are
 * referenced, not copied, so they must outlive the registration.
 */
int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);

	return(0);
}

/*
 * Remove a previously registered handler, matched by its cmd_ranges
 * pointer.  Returns EINVAL if no matching entry is found.
 */
int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges != he->cmd_ranges)
			continue;
		LIST_REMOVE(ne, entries);
		kfree(ne, M_IOCTLMAP);
		return(0);
	}
	return(EINVAL);
}

static int	nselcoll;	/* Select collisions since boot */
int
selwait;	/* global select/poll sleep channel */
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 *
 * Packs the three user fd_sets into one buffer (inputs in the top
 * half, zeroed outputs in the bottom half), scans with selscan(),
 * and sleeps on the global selwait channel until ready, timed out,
 * or interrupted.  The LWP_SELECT flag and nselcoll counter detect
 * wakeups that race the scan, forcing a rescan.
 */
int
sys_select(struct select_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = curproc;

	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (uap->nd < 0)
		return (EINVAL);
	if (uap->nd > p->p_fd->fd_nfiles)
		uap->nd = p->p_fd->fd_nfiles;	/* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(uap->nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (uap->in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (uap->ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = kmalloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (uap->name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(uap->name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done;				\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (uap->tv) {
		error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
			sizeof (atv));
		if (error)
			goto done;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		/* convert relative timeout to absolute deadline */
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	lp->lwp_flag |= LWP_SELECT;
	error = selscan(p, ibits, obits, uap->nd, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* clamp the tick conversion to one day to avoid overflow */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	/*
	 * An event (or collision) between the scan and here clears
	 * LWP_SELECT or bumps nselcoll; in that case rescan instead
	 * of sleeping and possibly missing the wakeup.
	 */
	crit_enter();
	if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	lp->lwp_flag &= ~LWP_SELECT;

	error = tsleep((caddr_t)&selwait, PCATCH, "select", timo);

	crit_exit();
	if (error == 0)
		goto retry;
done:
	lp->lwp_flag &= ~LWP_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		kfree(selbits, M_SELECT);
	return (error);
}

/*
 * Poll each fd whose bit is set in any of the three input sets and
 * record readiness in the corresponding output set.  *res receives
 * the number of ready descriptors; EBADF is returned if a set bit
 * names a descriptor that cannot be held.
 */
static int
selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res)
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = holdfp(p->p_fd, fd, -1);
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
				fdrop(fp);
			}
		}
	}
	*res = n;
	return (0);
}

/*
 * Poll system call.
 *
 * Same retry/sleep protocol as sys_select(), but with a pollfd array
 * and a millisecond timeout (INFTIM means wait forever).
 */
int
sys_poll(struct poll_args *uap)
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = curproc;

	nfds = uap->nfds;
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmalloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done;
	if (uap->timeout != INFTIM) {
		/* millisecond timeout -> absolute timeval deadline */
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	lp->lwp_flag |= LWP_SELECT;
	error = pollscan(p, bits, nfds, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		/* clamp tick conversion to one day to avoid overflow */
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	/* rescan rather than sleep if a wakeup/collision raced the scan */
	crit_enter();
	if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	lp->lwp_flag &= ~LWP_SELECT;
	error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo);
	crit_exit();
	if (error == 0)
		goto retry;
done:
	lp->lwp_flag &= ~LWP_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		kfree(bits, M_TEMP);
	return (error);
}

/*
 * Scan a pollfd array, filling in revents for each entry and counting
 * ready descriptors into *res.  Out-of-range or stale descriptors get
 * POLLNVAL (and count as "ready" per poll() semantics); negative fds
 * are skipped with revents cleared.
 */
static int
pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res)
{
	int i;
	struct file *fp;
	int n = 0;

	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= p->p_fd->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = holdfp(p->p_fd, fds->fd, -1);
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
							fp->f_cred);
				if (fds->revents != 0)
					n++;
				fdrop(fp);
			}
		}
	}
	*res = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}

/*
 * Generic always-ready poll backend for devices that never block.
 */
/*ARGSUSED*/
int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  A global wait must be used since a process/thread
 * might go away after recording its request.
 *
 * If another thread is already recorded here and currently sleeping in
 * select, mark the selinfo collided (SI_COLL) so selwakeup() broadcasts;
 * otherwise claim the slot for the calling thread.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (selector->td_lwp == NULL)
		panic("selrecord: thread needs a process");

	/* already recorded by this very lwp: nothing to do */
	if (sip->si_pid == selector->td_proc->p_pid &&
	    sip->si_tid == selector->td_lwp->lwp_tid)
		return;
	if (sip->si_pid && (p = pfind(sip->si_pid))) {
		FOREACH_LWP_IN_PROC(lp, p) {
			if (sip->si_tid == lp->lwp_tid)
				break;
		}
	}
	if (lp != NULL && lp->lwp_wchan == (caddr_t)&selwait) {
		sip->si_flags |= SI_COLL;
	} else {
		/*
		 * NOTE(review): only si_pid is claimed here; si_tid is
		 * left as-is -- confirm whether the recording lwp's tid
		 * should also be stored for the si_tid match above.
		 */
		sip->si_pid = selector->td_proc->p_pid;
	}
}

/*
 * Do a wakeup when a selectable event occurs.
 *
 * On a collision every selecting thread is woken via the global
 * selwait channel; otherwise only the single recorded lwp is made
 * runnable (or has its pending select flag cleared if it has not
 * gone to sleep yet).
 */
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);	/* YYY fixable */
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p == NULL)
		return;
	FOREACH_LWP_IN_PROC(lp, p) {
		if (lp->lwp_tid == sip->si_tid)
			break;
	}
	if (lp == NULL)
		return;

	crit_enter();
	if (lp->lwp_wchan == (caddr_t)&selwait) {
		/*
		 * Flag the process to break the tsleep when
		 * setrunnable is called, but only call setrunnable
		 * here if the process is not in a stopped state.
		 */
		lp->lwp_flag |= LWP_BREAKTSLEEP;
		if (p->p_stat != SSTOP)
			setrunnable(lp);
	} else if (lp->lwp_flag & LWP_SELECT) {
		lp->lwp_flag &= ~LWP_SELECT;
	}
	crit_exit();
}