1 /* $NetBSD: sys_generic.c,v 1.125 2011/01/18 19:52:23 matt Exp $ */ 2 3 /*- 4 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32 /* 33 * Copyright (c) 1982, 1986, 1989, 1993 34 * The Regents of the University of California. All rights reserved. 35 * (c) UNIX System Laboratories, Inc. 36 * All or some portions of this file are derived from material licensed 37 * to the University of California by American Telephone and Telegraph 38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 39 * the permission of UNIX System Laboratories, Inc. 40 * 41 * Redistribution and use in source and binary forms, with or without 42 * modification, are permitted provided that the following conditions 43 * are met: 44 * 1. Redistributions of source code must retain the above copyright 45 * notice, this list of conditions and the following disclaimer. 46 * 2. Redistributions in binary form must reproduce the above copyright 47 * notice, this list of conditions and the following disclaimer in the 48 * documentation and/or other materials provided with the distribution. 49 * 3. Neither the name of the University nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 63 * SUCH DAMAGE. 64 * 65 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95 66 */ 67 68 /* 69 * System calls relating to files. 70 */ 71 72 #include <sys/cdefs.h> 73 __KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.125 2011/01/18 19:52:23 matt Exp $"); 74 75 #include <sys/param.h> 76 #include <sys/systm.h> 77 #include <sys/filedesc.h> 78 #include <sys/ioctl.h> 79 #include <sys/file.h> 80 #include <sys/proc.h> 81 #include <sys/socketvar.h> 82 #include <sys/signalvar.h> 83 #include <sys/uio.h> 84 #include <sys/kernel.h> 85 #include <sys/stat.h> 86 #include <sys/kmem.h> 87 #include <sys/poll.h> 88 #include <sys/vnode.h> 89 #include <sys/mount.h> 90 #include <sys/syscallargs.h> 91 #include <sys/ktrace.h> 92 #include <sys/atomic.h> 93 #include <sys/disklabel.h> 94 95 #include <uvm/uvm_extern.h> 96 97 /* 98 * Read system call. 99 */ 100 /* ARGSUSED */ 101 int 102 sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval) 103 { 104 /* { 105 syscallarg(int) fd; 106 syscallarg(void *) buf; 107 syscallarg(size_t) nbyte; 108 } */ 109 file_t *fp; 110 int fd; 111 112 fd = SCARG(uap, fd); 113 114 if ((fp = fd_getfile(fd)) == NULL) 115 return (EBADF); 116 117 if ((fp->f_flag & FREAD) == 0) { 118 fd_putfile(fd); 119 return (EBADF); 120 } 121 122 /* dofileread() will unuse the descriptor for us */ 123 return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte), 124 &fp->f_offset, FOF_UPDATE_OFFSET, retval)); 125 } 126 127 int 128 dofileread(int fd, struct file *fp, void *buf, size_t nbyte, 129 off_t *offset, int flags, register_t *retval) 130 { 131 struct iovec aiov; 132 struct uio auio; 133 size_t cnt; 134 int error; 135 lwp_t *l; 136 137 l = curlwp; 138 139 aiov.iov_base = (void *)buf; 140 aiov.iov_len = nbyte; 141 auio.uio_iov = &aiov; 142 auio.uio_iovcnt = 1; 143 auio.uio_resid = nbyte; 144 auio.uio_rw = UIO_READ; 145 auio.uio_vmspace = l->l_proc->p_vmspace; 146 147 /* 148 * Reads return ssize_t because -1 is returned on error. Therefore 149 * we must restrict the length to SSIZE_MAX to avoid garbage return 150 * values. 151 */ 152 if (auio.uio_resid > SSIZE_MAX) { 153 error = EINVAL; 154 goto out; 155 } 156 157 cnt = auio.uio_resid; 158 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags); 159 if (error) 160 if (auio.uio_resid != cnt && (error == ERESTART || 161 error == EINTR || error == EWOULDBLOCK)) 162 error = 0; 163 cnt -= auio.uio_resid; 164 ktrgenio(fd, UIO_READ, buf, cnt, error); 165 *retval = cnt; 166 out: 167 fd_putfile(fd); 168 return (error); 169 } 170 171 /* 172 * Scatter read system call. 173 */ 174 int 175 sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval) 176 { 177 /* { 178 syscallarg(int) fd; 179 syscallarg(const struct iovec *) iovp; 180 syscallarg(int) iovcnt; 181 } */ 182 183 return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp), 184 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval); 185 } 186 187 int 188 do_filereadv(int fd, const struct iovec *iovp, int iovcnt, 189 off_t *offset, int flags, register_t *retval) 190 { 191 struct uio auio; 192 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; 193 int i, error; 194 size_t cnt; 195 u_int iovlen; 196 struct file *fp; 197 struct iovec *ktriov = NULL; 198 199 if (iovcnt == 0) 200 return EINVAL; 201 202 if ((fp = fd_getfile(fd)) == NULL) 203 return EBADF; 204 205 if ((fp->f_flag & FREAD) == 0) { 206 fd_putfile(fd); 207 return EBADF; 208 } 209 210 if (offset == NULL) 211 offset = &fp->f_offset; 212 else { 213 struct vnode *vp = fp->f_data; 214 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { 215 error = ESPIPE; 216 goto out; 217 } 218 /* 219 * Test that the device is seekable ? 220 * XXX This works because no file systems actually 221 * XXX take any action on the seek operation. 222 */ 223 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred); 224 if (error != 0) 225 goto out; 226 } 227 228 iovlen = iovcnt * sizeof(struct iovec); 229 if (flags & FOF_IOV_SYSSPACE) 230 iov = __UNCONST(iovp); 231 else { 232 iov = aiov; 233 if ((u_int)iovcnt > UIO_SMALLIOV) { 234 if ((u_int)iovcnt > IOV_MAX) { 235 error = EINVAL; 236 goto out; 237 } 238 iov = kmem_alloc(iovlen, KM_SLEEP); 239 if (iov == NULL) { 240 error = ENOMEM; 241 goto out; 242 } 243 needfree = iov; 244 } 245 error = copyin(iovp, iov, iovlen); 246 if (error) 247 goto done; 248 } 249 250 auio.uio_iov = iov; 251 auio.uio_iovcnt = iovcnt; 252 auio.uio_rw = UIO_READ; 253 auio.uio_vmspace = curproc->p_vmspace; 254 255 auio.uio_resid = 0; 256 for (i = 0; i < iovcnt; i++, iov++) { 257 auio.uio_resid += iov->iov_len; 258 /* 259 * Reads return ssize_t because -1 is returned on error. 260 * Therefore we must restrict the length to SSIZE_MAX to 261 * avoid garbage return values. 262 */ 263 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) { 264 error = EINVAL; 265 goto done; 266 } 267 } 268 269 /* 270 * if tracing, save a copy of iovec 271 */ 272 if (ktrpoint(KTR_GENIO)) { 273 ktriov = kmem_alloc(iovlen, KM_SLEEP); 274 if (ktriov != NULL) 275 memcpy(ktriov, auio.uio_iov, iovlen); 276 } 277 278 cnt = auio.uio_resid; 279 error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags); 280 if (error) 281 if (auio.uio_resid != cnt && (error == ERESTART || 282 error == EINTR || error == EWOULDBLOCK)) 283 error = 0; 284 cnt -= auio.uio_resid; 285 *retval = cnt; 286 287 if (ktriov != NULL) { 288 ktrgeniov(fd, UIO_READ, ktriov, cnt, error); 289 kmem_free(ktriov, iovlen); 290 } 291 292 done: 293 if (needfree) 294 kmem_free(needfree, iovlen); 295 out: 296 fd_putfile(fd); 297 return (error); 298 } 299 300 /* 301 * Write system call 302 */ 303 int 304 sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval) 305 { 306 /* { 307 syscallarg(int) fd; 308 syscallarg(const void *) buf; 309 syscallarg(size_t) nbyte; 310 } */ 311 file_t *fp; 312 int fd; 313 314 fd = SCARG(uap, fd); 315 316 if ((fp = fd_getfile(fd)) == NULL) 317 return (EBADF); 318 319 if ((fp->f_flag & FWRITE) == 0) { 320 fd_putfile(fd); 321 return (EBADF); 322 } 323 324 /* dofilewrite() will unuse the descriptor for us */ 325 return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte), 326 &fp->f_offset, FOF_UPDATE_OFFSET, retval)); 327 } 328 329 int 330 dofilewrite(int fd, struct file *fp, const void *buf, 331 size_t nbyte, off_t *offset, int flags, register_t *retval) 332 { 333 struct iovec aiov; 334 struct uio auio; 335 size_t cnt; 336 int error; 337 338 aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */ 339 aiov.iov_len = nbyte; 340 auio.uio_iov = &aiov; 341 auio.uio_iovcnt = 1; 342 auio.uio_resid = nbyte; 343 auio.uio_rw = UIO_WRITE; 344 auio.uio_vmspace = curproc->p_vmspace; 345 346 /* 347 * Writes return ssize_t because -1 is returned on error. Therefore 348 * we must restrict the length to SSIZE_MAX to avoid garbage return 349 * values. 350 */ 351 if (auio.uio_resid > SSIZE_MAX) { 352 error = EINVAL; 353 goto out; 354 } 355 356 cnt = auio.uio_resid; 357 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags); 358 if (error) { 359 if (auio.uio_resid != cnt && (error == ERESTART || 360 error == EINTR || error == EWOULDBLOCK)) 361 error = 0; 362 if (error == EPIPE) { 363 mutex_enter(proc_lock); 364 psignal(curproc, SIGPIPE); 365 mutex_exit(proc_lock); 366 } 367 } 368 cnt -= auio.uio_resid; 369 ktrgenio(fd, UIO_WRITE, buf, cnt, error); 370 *retval = cnt; 371 out: 372 fd_putfile(fd); 373 return (error); 374 } 375 376 /* 377 * Gather write system call 378 */ 379 int 380 sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval) 381 { 382 /* { 383 syscallarg(int) fd; 384 syscallarg(const struct iovec *) iovp; 385 syscallarg(int) iovcnt; 386 } */ 387 388 return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp), 389 SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval); 390 } 391 392 int 393 do_filewritev(int fd, const struct iovec *iovp, int iovcnt, 394 off_t *offset, int flags, register_t *retval) 395 { 396 struct uio auio; 397 struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; 398 int i, error; 399 size_t cnt; 400 u_int iovlen; 401 struct file *fp; 402 struct iovec *ktriov = NULL; 403 404 if (iovcnt == 0) 405 return EINVAL; 406 407 if ((fp = fd_getfile(fd)) == NULL) 408 return EBADF; 409 410 if ((fp->f_flag & FWRITE) == 0) { 411 fd_putfile(fd); 412 return EBADF; 413 } 414 415 if (offset == NULL) 416 offset = &fp->f_offset; 417 else { 418 struct vnode *vp = fp->f_data; 419 if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { 420 error = ESPIPE; 421 goto out; 422 } 423 /* 424 * Test that the device is seekable ? 425 * XXX This works because no file systems actually 426 * XXX take any action on the seek operation. 427 */ 428 error = VOP_SEEK(vp, fp->f_offset, *offset, fp->f_cred); 429 if (error != 0) 430 goto out; 431 } 432 433 iovlen = iovcnt * sizeof(struct iovec); 434 if (flags & FOF_IOV_SYSSPACE) 435 iov = __UNCONST(iovp); 436 else { 437 iov = aiov; 438 if ((u_int)iovcnt > UIO_SMALLIOV) { 439 if ((u_int)iovcnt > IOV_MAX) { 440 error = EINVAL; 441 goto out; 442 } 443 iov = kmem_alloc(iovlen, KM_SLEEP); 444 if (iov == NULL) { 445 error = ENOMEM; 446 goto out; 447 } 448 needfree = iov; 449 } 450 error = copyin(iovp, iov, iovlen); 451 if (error) 452 goto done; 453 } 454 455 auio.uio_iov = iov; 456 auio.uio_iovcnt = iovcnt; 457 auio.uio_rw = UIO_WRITE; 458 auio.uio_vmspace = curproc->p_vmspace; 459 460 auio.uio_resid = 0; 461 for (i = 0; i < iovcnt; i++, iov++) { 462 auio.uio_resid += iov->iov_len; 463 /* 464 * Writes return ssize_t because -1 is returned on error. 465 * Therefore we must restrict the length to SSIZE_MAX to 466 * avoid garbage return values. 467 */ 468 if (iov->iov_len > SSIZE_MAX || auio.uio_resid > SSIZE_MAX) { 469 error = EINVAL; 470 goto done; 471 } 472 } 473 474 /* 475 * if tracing, save a copy of iovec 476 */ 477 if (ktrpoint(KTR_GENIO)) { 478 ktriov = kmem_alloc(iovlen, KM_SLEEP); 479 if (ktriov != NULL) 480 memcpy(ktriov, auio.uio_iov, iovlen); 481 } 482 483 cnt = auio.uio_resid; 484 error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags); 485 if (error) { 486 if (auio.uio_resid != cnt && (error == ERESTART || 487 error == EINTR || error == EWOULDBLOCK)) 488 error = 0; 489 if (error == EPIPE) { 490 mutex_enter(proc_lock); 491 psignal(curproc, SIGPIPE); 492 mutex_exit(proc_lock); 493 } 494 } 495 cnt -= auio.uio_resid; 496 *retval = cnt; 497 498 if (ktriov != NULL) { 499 ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error); 500 kmem_free(ktriov, iovlen); 501 } 502 503 done: 504 if (needfree) 505 kmem_free(needfree, iovlen); 506 out: 507 fd_putfile(fd); 508 return (error); 509 } 510 511 /* 512 * Ioctl system call 513 */ 514 /* ARGSUSED */ 515 int 516 sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval) 517 { 518 /* { 519 syscallarg(int) fd; 520 syscallarg(u_long) com; 521 syscallarg(void *) data; 522 } */ 523 struct file *fp; 524 proc_t *p; 525 struct filedesc *fdp; 526 u_long com; 527 int error; 528 size_t size, alloc_size; 529 void *data, *memp; 530 #define STK_PARAMS 128 531 u_long stkbuf[STK_PARAMS/sizeof(u_long)]; 532 fdfile_t *ff; 533 534 memp = NULL; 535 alloc_size = 0; 536 error = 0; 537 p = l->l_proc; 538 fdp = p->p_fd; 539 540 if ((fp = fd_getfile(SCARG(uap, fd))) == NULL) 541 return (EBADF); 542 543 if ((fp->f_flag & (FREAD | FWRITE)) == 0) { 544 error = EBADF; 545 com = 0; 546 goto out; 547 } 548 549 ff = fdp->fd_dt->dt_ff[SCARG(uap, fd)]; 550 switch (com = SCARG(uap, com)) { 551 case FIONCLEX: 552 ff->ff_exclose = false; 553 goto out; 554 555 case FIOCLEX: 556 ff->ff_exclose = true; 557 fdp->fd_exclose = true; 558 goto out; 559 } 560 561 /* 562 * Interpret high order word to find amount of data to be 563 * copied to/from the user's address space. 564 */ 565 size = IOCPARM_LEN(com); 566 alloc_size = size; 567 568 /* 569 * The disklabel is now padded to a multiple of 8 bytes however the old 570 * disklabel on 32bit platforms wasn't. This leaves a difference in 571 * size of 4 bytes between the two but are otherwise identical. 572 * To deal with this, we allocate enough space for the new disklabel 573 * but only copyin/out the smaller amount. 574 */ 575 if (IOCGROUP(com) == 'd') { 576 u_long ncom = com ^ (DIOCGDINFO ^ DIOCGDINFO32); 577 switch (ncom) { 578 case DIOCGDINFO: 579 case DIOCWDINFO: 580 case DIOCSDINFO: 581 case DIOCGDEFLABEL: 582 com = ncom; 583 if (IOCPARM_LEN(DIOCGDINFO32) < IOCPARM_LEN(DIOCGDINFO)) 584 alloc_size = IOCPARM_LEN(DIOCGDINFO); 585 break; 586 } 587 } 588 if (size > IOCPARM_MAX) { 589 error = ENOTTY; 590 goto out; 591 } 592 memp = NULL; 593 if ((com >> IOCPARM_SHIFT) == 0) { 594 /* UNIX-style ioctl. */ 595 data = SCARG(uap, data); 596 } else { 597 if (alloc_size > sizeof(stkbuf)) { 598 memp = kmem_alloc(alloc_size, KM_SLEEP); 599 data = memp; 600 } else { 601 data = (void *)stkbuf; 602 } 603 if (com&IOC_IN) { 604 if (size) { 605 error = copyin(SCARG(uap, data), data, size); 606 if (error) { 607 goto out; 608 } 609 /* 610 * The data between size and alloc_size has 611 * not been overwritten. It shouldn't matter 612 * but let's clear that anyway. 613 */ 614 if (__predict_false(size < alloc_size)) { 615 memset((char *)data+size, 0, 616 alloc_size - size); 617 } 618 ktrgenio(SCARG(uap, fd), UIO_WRITE, 619 SCARG(uap, data), size, 0); 620 } else { 621 *(void **)data = SCARG(uap, data); 622 } 623 } else if ((com&IOC_OUT) && size) { 624 /* 625 * Zero the buffer so the user always 626 * gets back something deterministic. 627 */ 628 memset(data, 0, size); 629 } else if (com&IOC_VOID) { 630 *(void **)data = SCARG(uap, data); 631 } 632 } 633 634 switch (com) { 635 636 case FIONBIO: 637 /* XXX Code block is not atomic */ 638 if (*(int *)data != 0) 639 atomic_or_uint(&fp->f_flag, FNONBLOCK); 640 else 641 atomic_and_uint(&fp->f_flag, ~FNONBLOCK); 642 error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data); 643 break; 644 645 case FIOASYNC: 646 /* XXX Code block is not atomic */ 647 if (*(int *)data != 0) 648 atomic_or_uint(&fp->f_flag, FASYNC); 649 else 650 atomic_and_uint(&fp->f_flag, ~FASYNC); 651 error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data); 652 break; 653 654 default: 655 error = (*fp->f_ops->fo_ioctl)(fp, com, data); 656 /* 657 * Copy any data to user, size was 658 * already set and checked above. 659 */ 660 if (error == 0 && (com&IOC_OUT) && size) { 661 error = copyout(data, SCARG(uap, data), size); 662 ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data), 663 size, error); 664 } 665 break; 666 } 667 out: 668 if (memp) 669 kmem_free(memp, alloc_size); 670 fd_putfile(SCARG(uap, fd)); 671 switch (error) { 672 case -1: 673 printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: " 674 "pid=%d comm=%s\n", 675 (com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "", 676 (char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com), 677 p->p_pid, p->p_comm); 678 /* FALLTHROUGH */ 679 case EPASSTHROUGH: 680 error = ENOTTY; 681 /* FALLTHROUGH */ 682 default: 683 return (error); 684 } 685 } 686