/*
 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com> All rights reserved.
 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Terrence R. Lambert
 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Julian R. Elischer,
 *							All rights reserved.
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
29 */ 30 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/kernel.h> 34 #include <sys/sysctl.h> 35 #include <sys/module.h> 36 #include <sys/malloc.h> 37 #include <sys/conf.h> 38 #include <sys/bio.h> 39 #include <sys/buf.h> 40 #include <sys/vnode.h> 41 #include <sys/queue.h> 42 #include <sys/device.h> 43 #include <sys/tree.h> 44 #include <sys/syslink_rpc.h> 45 #include <sys/proc.h> 46 #include <sys/dsched.h> 47 #include <sys/devfs.h> 48 49 #include <machine/stdarg.h> 50 51 #include <sys/thread2.h> 52 #include <sys/mplock2.h> 53 54 static int mpsafe_writes; 55 static int mplock_writes; 56 static int mpsafe_reads; 57 static int mplock_reads; 58 static int mpsafe_strategies; 59 static int mplock_strategies; 60 61 SYSCTL_INT(_kern, OID_AUTO, mpsafe_writes, CTLFLAG_RD, &mpsafe_writes, 62 0, "mpsafe writes"); 63 SYSCTL_INT(_kern, OID_AUTO, mplock_writes, CTLFLAG_RD, &mplock_writes, 64 0, "non-mpsafe writes"); 65 SYSCTL_INT(_kern, OID_AUTO, mpsafe_reads, CTLFLAG_RD, &mpsafe_reads, 66 0, "mpsafe reads"); 67 SYSCTL_INT(_kern, OID_AUTO, mplock_reads, CTLFLAG_RD, &mplock_reads, 68 0, "non-mpsafe reads"); 69 SYSCTL_INT(_kern, OID_AUTO, mpsafe_strategies, CTLFLAG_RD, &mpsafe_strategies, 70 0, "mpsafe strategies"); 71 SYSCTL_INT(_kern, OID_AUTO, mplock_strategies, CTLFLAG_RD, &mplock_strategies, 72 0, "non-mpsafe strategies"); 73 74 /* 75 * system link descriptors identify the command in the 76 * arguments structure. 
77 */ 78 #define DDESCNAME(name) __CONCAT(__CONCAT(dev_,name),_desc) 79 80 #define DEVOP_DESC_INIT(name) \ 81 struct syslink_desc DDESCNAME(name) = { \ 82 __offsetof(struct dev_ops, __CONCAT(d_, name)), \ 83 #name } 84 85 DEVOP_DESC_INIT(default); 86 DEVOP_DESC_INIT(open); 87 DEVOP_DESC_INIT(close); 88 DEVOP_DESC_INIT(read); 89 DEVOP_DESC_INIT(write); 90 DEVOP_DESC_INIT(ioctl); 91 DEVOP_DESC_INIT(dump); 92 DEVOP_DESC_INIT(psize); 93 DEVOP_DESC_INIT(mmap); 94 DEVOP_DESC_INIT(mmap_single); 95 DEVOP_DESC_INIT(strategy); 96 DEVOP_DESC_INIT(kqfilter); 97 DEVOP_DESC_INIT(revoke); 98 DEVOP_DESC_INIT(clone); 99 100 /* 101 * Misc default ops 102 */ 103 struct dev_ops dead_dev_ops; 104 105 static d_open_t noopen; 106 static d_close_t noclose; 107 static d_read_t noread; 108 static d_write_t nowrite; 109 static d_ioctl_t noioctl; 110 static d_mmap_t nommap; 111 static d_mmap_single_t nommap_single; 112 static d_strategy_t nostrategy; 113 static d_dump_t nodump; 114 static d_psize_t nopsize; 115 static d_kqfilter_t nokqfilter; 116 static d_clone_t noclone; 117 static d_revoke_t norevoke; 118 119 struct dev_ops default_dev_ops = { 120 { "null" }, 121 .d_default = NULL, /* must be NULL */ 122 .d_open = noopen, 123 .d_close = noclose, 124 .d_read = noread, 125 .d_write = nowrite, 126 .d_ioctl = noioctl, 127 .d_mmap = nommap, 128 .d_mmap_single = nommap_single, 129 .d_strategy = nostrategy, 130 .d_dump = nodump, 131 .d_psize = nopsize, 132 .d_kqfilter = nokqfilter, 133 .d_revoke = norevoke, 134 .d_clone = noclone 135 }; 136 137 static __inline 138 int 139 dev_needmplock(cdev_t dev) 140 { 141 return((dev->si_ops->head.flags & D_MPSAFE) == 0); 142 } 143 144 /************************************************************************ 145 * GENERAL DEVICE API FUNCTIONS * 146 ************************************************************************ 147 * 148 * The MPSAFEness of these depends on dev->si_ops->head.flags 149 */ 150 int 151 dev_dopen(cdev_t dev, int oflags, int devtype, struct 
ucred *cred, struct file *fp) 152 { 153 struct dev_open_args ap; 154 int needmplock = dev_needmplock(dev); 155 int error; 156 157 ap.a_head.a_desc = &dev_open_desc; 158 ap.a_head.a_dev = dev; 159 ap.a_oflags = oflags; 160 ap.a_devtype = devtype; 161 ap.a_cred = cred; 162 ap.a_fp = fp; 163 164 if (needmplock) 165 get_mplock(); 166 error = dev->si_ops->d_open(&ap); 167 if (needmplock) 168 rel_mplock(); 169 return (error); 170 } 171 172 int 173 dev_dclose(cdev_t dev, int fflag, int devtype, struct file *fp) 174 { 175 struct dev_close_args ap; 176 int needmplock = dev_needmplock(dev); 177 int error; 178 179 ap.a_head.a_desc = &dev_close_desc; 180 ap.a_head.a_dev = dev; 181 ap.a_fflag = fflag; 182 ap.a_devtype = devtype; 183 ap.a_fp = fp; 184 185 if (needmplock) 186 get_mplock(); 187 error = dev->si_ops->d_close(&ap); 188 if (needmplock) 189 rel_mplock(); 190 return (error); 191 } 192 193 int 194 dev_dread(cdev_t dev, struct uio *uio, int ioflag, struct file *fp) 195 { 196 struct dev_read_args ap; 197 int needmplock = dev_needmplock(dev); 198 int error; 199 200 ap.a_head.a_desc = &dev_read_desc; 201 ap.a_head.a_dev = dev; 202 ap.a_uio = uio; 203 ap.a_ioflag = ioflag; 204 ap.a_fp = fp; 205 206 if (needmplock) { 207 get_mplock(); 208 ++mplock_reads; 209 } else { 210 ++mpsafe_reads; 211 } 212 error = dev->si_ops->d_read(&ap); 213 if (needmplock) 214 rel_mplock(); 215 if (error == 0) 216 dev->si_lastread = time_uptime; 217 return (error); 218 } 219 220 int 221 dev_dwrite(cdev_t dev, struct uio *uio, int ioflag, struct file *fp) 222 { 223 struct dev_write_args ap; 224 int needmplock = dev_needmplock(dev); 225 int error; 226 227 dev->si_lastwrite = time_uptime; 228 ap.a_head.a_desc = &dev_write_desc; 229 ap.a_head.a_dev = dev; 230 ap.a_uio = uio; 231 ap.a_ioflag = ioflag; 232 ap.a_fp = fp; 233 234 if (needmplock) { 235 get_mplock(); 236 ++mplock_writes; 237 } else { 238 ++mpsafe_writes; 239 } 240 error = dev->si_ops->d_write(&ap); 241 if (needmplock) 242 rel_mplock(); 243 
return (error); 244 } 245 246 int 247 dev_dioctl(cdev_t dev, u_long cmd, caddr_t data, int fflag, struct ucred *cred, 248 struct sysmsg *msg, struct file *fp) 249 { 250 struct dev_ioctl_args ap; 251 int needmplock = dev_needmplock(dev); 252 int error; 253 254 ap.a_head.a_desc = &dev_ioctl_desc; 255 ap.a_head.a_dev = dev; 256 ap.a_cmd = cmd; 257 ap.a_data = data; 258 ap.a_fflag = fflag; 259 ap.a_cred = cred; 260 ap.a_sysmsg = msg; 261 ap.a_fp = fp; 262 263 if (needmplock) 264 get_mplock(); 265 error = dev->si_ops->d_ioctl(&ap); 266 if (needmplock) 267 rel_mplock(); 268 return (error); 269 } 270 271 int 272 dev_dmmap(cdev_t dev, vm_offset_t offset, int nprot, struct file *fp) 273 { 274 struct dev_mmap_args ap; 275 int needmplock = dev_needmplock(dev); 276 int error; 277 278 ap.a_head.a_desc = &dev_mmap_desc; 279 ap.a_head.a_dev = dev; 280 ap.a_offset = offset; 281 ap.a_nprot = nprot; 282 ap.a_fp = fp; 283 284 if (needmplock) 285 get_mplock(); 286 error = dev->si_ops->d_mmap(&ap); 287 if (needmplock) 288 rel_mplock(); 289 290 if (error == 0) 291 return(ap.a_result); 292 return(-1); 293 } 294 295 int 296 dev_dmmap_single(cdev_t dev, vm_ooffset_t *offset, vm_size_t size, 297 struct vm_object **object, int nprot, struct file *fp) 298 { 299 struct dev_mmap_single_args ap; 300 int needmplock = dev_needmplock(dev); 301 int error; 302 303 ap.a_head.a_desc = &dev_mmap_single_desc; 304 ap.a_head.a_dev = dev; 305 ap.a_offset = offset; 306 ap.a_size = size; 307 ap.a_object = object; 308 ap.a_nprot = nprot; 309 ap.a_fp = fp; 310 311 if (needmplock) 312 get_mplock(); 313 error = dev->si_ops->d_mmap_single(&ap); 314 if (needmplock) 315 rel_mplock(); 316 317 return(error); 318 } 319 320 int 321 dev_dclone(cdev_t dev) 322 { 323 struct dev_clone_args ap; 324 int needmplock = dev_needmplock(dev); 325 int error; 326 327 ap.a_head.a_desc = &dev_clone_desc; 328 ap.a_head.a_dev = dev; 329 330 if (needmplock) 331 get_mplock(); 332 error = dev->si_ops->d_clone(&ap); 333 if (needmplock) 334 
rel_mplock(); 335 return (error); 336 } 337 338 int 339 dev_drevoke(cdev_t dev) 340 { 341 struct dev_revoke_args ap; 342 int needmplock = dev_needmplock(dev); 343 int error; 344 345 ap.a_head.a_desc = &dev_revoke_desc; 346 ap.a_head.a_dev = dev; 347 348 if (needmplock) 349 get_mplock(); 350 error = dev->si_ops->d_revoke(&ap); 351 if (needmplock) 352 rel_mplock(); 353 354 return (error); 355 } 356 357 /* 358 * Core device strategy call, used to issue I/O on a device. There are 359 * two versions, a non-chained version and a chained version. The chained 360 * version reuses a BIO set up by vn_strategy(). The only difference is 361 * that, for now, we do not push a new tracking structure when chaining 362 * from vn_strategy. XXX this will ultimately have to change. 363 */ 364 void 365 dev_dstrategy(cdev_t dev, struct bio *bio) 366 { 367 struct dev_strategy_args ap; 368 struct bio_track *track; 369 int needmplock = dev_needmplock(dev); 370 371 ap.a_head.a_desc = &dev_strategy_desc; 372 ap.a_head.a_dev = dev; 373 ap.a_bio = bio; 374 375 KKASSERT(bio->bio_track == NULL); 376 KKASSERT(bio->bio_buf->b_cmd != BUF_CMD_DONE); 377 if (bio->bio_buf->b_cmd == BUF_CMD_READ) 378 track = &dev->si_track_read; 379 else 380 track = &dev->si_track_write; 381 bio_track_ref(track); 382 bio->bio_track = track; 383 dsched_buf_enter(bio->bio_buf); /* might stack */ 384 385 KKASSERT((bio->bio_flags & BIO_DONE) == 0); 386 if (needmplock) { 387 get_mplock(); 388 ++mplock_strategies; 389 } else { 390 ++mpsafe_strategies; 391 } 392 (void)dev->si_ops->d_strategy(&ap); 393 if (needmplock) 394 rel_mplock(); 395 } 396 397 void 398 dev_dstrategy_chain(cdev_t dev, struct bio *bio) 399 { 400 struct dev_strategy_args ap; 401 int needmplock = dev_needmplock(dev); 402 403 ap.a_head.a_desc = &dev_strategy_desc; 404 ap.a_head.a_dev = dev; 405 ap.a_bio = bio; 406 407 KKASSERT(bio->bio_track != NULL); 408 KKASSERT((bio->bio_flags & BIO_DONE) == 0); 409 if (needmplock) 410 get_mplock(); 411 
(void)dev->si_ops->d_strategy(&ap); 412 if (needmplock) 413 rel_mplock(); 414 } 415 416 /* 417 * note: the disk layer is expected to set count, blkno, and secsize before 418 * forwarding the message. 419 */ 420 int 421 dev_ddump(cdev_t dev, void *virtual, vm_offset_t physical, off_t offset, 422 size_t length) 423 { 424 struct dev_dump_args ap; 425 int needmplock = dev_needmplock(dev); 426 int error; 427 428 ap.a_head.a_desc = &dev_dump_desc; 429 ap.a_head.a_dev = dev; 430 ap.a_count = 0; 431 ap.a_blkno = 0; 432 ap.a_secsize = 0; 433 ap.a_virtual = virtual; 434 ap.a_physical = physical; 435 ap.a_offset = offset; 436 ap.a_length = length; 437 438 if (needmplock) 439 get_mplock(); 440 error = dev->si_ops->d_dump(&ap); 441 if (needmplock) 442 rel_mplock(); 443 return (error); 444 } 445 446 int64_t 447 dev_dpsize(cdev_t dev) 448 { 449 struct dev_psize_args ap; 450 int needmplock = dev_needmplock(dev); 451 int error; 452 453 ap.a_head.a_desc = &dev_psize_desc; 454 ap.a_head.a_dev = dev; 455 456 if (needmplock) 457 get_mplock(); 458 error = dev->si_ops->d_psize(&ap); 459 if (needmplock) 460 rel_mplock(); 461 462 if (error == 0) 463 return (ap.a_result); 464 return(-1); 465 } 466 467 /* 468 * Pass-thru to the device kqfilter. 469 * 470 * NOTE: We explicitly preset a_result to 0 so d_kqfilter() functions 471 * which return 0 do not have to bother setting a_result. 
472 */ 473 int 474 dev_dkqfilter(cdev_t dev, struct knote *kn, struct file *fp) 475 { 476 struct dev_kqfilter_args ap; 477 int needmplock = dev_needmplock(dev); 478 int error; 479 480 ap.a_head.a_desc = &dev_kqfilter_desc; 481 ap.a_head.a_dev = dev; 482 ap.a_kn = kn; 483 ap.a_result = 0; 484 ap.a_fp = fp; 485 486 if (needmplock) 487 get_mplock(); 488 error = dev->si_ops->d_kqfilter(&ap); 489 if (needmplock) 490 rel_mplock(); 491 492 if (error == 0) 493 return(ap.a_result); 494 return(ENODEV); 495 } 496 497 /************************************************************************ 498 * DEVICE HELPER FUNCTIONS * 499 ************************************************************************/ 500 501 /* 502 * MPSAFE 503 */ 504 int 505 dev_drefs(cdev_t dev) 506 { 507 return(dev->si_sysref.refcnt); 508 } 509 510 /* 511 * MPSAFE 512 */ 513 const char * 514 dev_dname(cdev_t dev) 515 { 516 return(dev->si_ops->head.name); 517 } 518 519 /* 520 * MPSAFE 521 */ 522 int 523 dev_dflags(cdev_t dev) 524 { 525 return(dev->si_ops->head.flags); 526 } 527 528 /* 529 * MPSAFE 530 */ 531 int 532 dev_dmaj(cdev_t dev) 533 { 534 return(dev->si_ops->head.maj); 535 } 536 537 /* 538 * Used when forwarding a request through layers. The caller adjusts 539 * ap->a_head.a_dev and then calls this function. 540 */ 541 int 542 dev_doperate(struct dev_generic_args *ap) 543 { 544 int (*func)(struct dev_generic_args *); 545 int needmplock = dev_needmplock(ap->a_dev); 546 int error; 547 548 func = *(void **)((char *)ap->a_dev->si_ops + ap->a_desc->sd_offset); 549 550 if (needmplock) 551 get_mplock(); 552 error = func(ap); 553 if (needmplock) 554 rel_mplock(); 555 556 return (error); 557 } 558 559 /* 560 * Used by the console intercept code only. Issue an operation through 561 * a foreign ops structure allowing the ops structure associated 562 * with the device to remain intact. 
563 */ 564 int 565 dev_doperate_ops(struct dev_ops *ops, struct dev_generic_args *ap) 566 { 567 int (*func)(struct dev_generic_args *); 568 int needmplock = ((ops->head.flags & D_MPSAFE) == 0); 569 int error; 570 571 func = *(void **)((char *)ops + ap->a_desc->sd_offset); 572 573 if (needmplock) 574 get_mplock(); 575 error = func(ap); 576 if (needmplock) 577 rel_mplock(); 578 579 return (error); 580 } 581 582 /* 583 * Convert a template dev_ops into the real thing by filling in 584 * uninitialized fields. 585 */ 586 void 587 compile_dev_ops(struct dev_ops *ops) 588 { 589 int offset; 590 591 for (offset = offsetof(struct dev_ops, dev_ops_first_field); 592 offset <= offsetof(struct dev_ops, dev_ops_last_field); 593 offset += sizeof(void *) 594 ) { 595 void **func_p = (void **)((char *)ops + offset); 596 void **def_p = (void **)((char *)&default_dev_ops + offset); 597 if (*func_p == NULL) { 598 if (ops->d_default) 599 *func_p = ops->d_default; 600 else 601 *func_p = *def_p; 602 } 603 } 604 } 605 606 /************************************************************************ 607 * MAJOR/MINOR SPACE FUNCTION * 608 ************************************************************************/ 609 610 /* 611 * This makes a dev_ops entry visible to userland (e.g /dev/<blah>). 612 * 613 * Disk devices typically register their major, e.g. 'ad0', and then call 614 * into the disk label management code which overloads its own onto e.g. 'ad0' 615 * to support all the various slice and partition combinations. 616 * 617 * The mask/match supplied in this call are a full 32 bits and the same 618 * mask and match must be specified in a later dev_ops_remove() call to 619 * match this add. However, the match value for the minor number should never 620 * have any bits set in the major number's bit range (8-15). The mask value 621 * may be conveniently specified as -1 without creating any major number 622 * interference. 
623 */ 624 625 static 626 int 627 rb_dev_ops_compare(struct dev_ops_maj *a, struct dev_ops_maj *b) 628 { 629 if (a->maj < b->maj) 630 return(-1); 631 else if (a->maj > b->maj) 632 return(1); 633 return(0); 634 } 635 636 RB_GENERATE2(dev_ops_rb_tree, dev_ops_maj, rbnode, rb_dev_ops_compare, int, maj); 637 638 struct dev_ops_rb_tree dev_ops_rbhead = RB_INITIALIZER(dev_ops_rbhead); 639 640 int 641 dev_ops_remove_all(struct dev_ops *ops) 642 { 643 return devfs_destroy_dev_by_ops(ops, -1); 644 } 645 646 int 647 dev_ops_remove_minor(struct dev_ops *ops, int minor) 648 { 649 return devfs_destroy_dev_by_ops(ops, minor); 650 } 651 652 struct dev_ops * 653 dev_ops_intercept(cdev_t dev, struct dev_ops *iops) 654 { 655 struct dev_ops *oops = dev->si_ops; 656 657 compile_dev_ops(iops); 658 iops->head.maj = oops->head.maj; 659 iops->head.data = oops->head.data; 660 iops->head.flags = oops->head.flags; 661 dev->si_ops = iops; 662 dev->si_flags |= SI_INTERCEPTED; 663 664 return (oops); 665 } 666 667 void 668 dev_ops_restore(cdev_t dev, struct dev_ops *oops) 669 { 670 struct dev_ops *iops = dev->si_ops; 671 672 dev->si_ops = oops; 673 dev->si_flags &= ~SI_INTERCEPTED; 674 iops->head.maj = 0; 675 iops->head.data = NULL; 676 iops->head.flags = 0; 677 } 678 679 /************************************************************************ 680 * DEFAULT DEV OPS FUNCTIONS * 681 ************************************************************************/ 682 683 684 /* 685 * Unsupported devswitch functions (e.g. for writing to read-only device). 686 * XXX may belong elsewhere. 
687 */ 688 static int 689 norevoke(struct dev_revoke_args *ap) 690 { 691 /* take no action */ 692 return(0); 693 } 694 695 static int 696 noclone(struct dev_clone_args *ap) 697 { 698 /* take no action */ 699 return (0); /* allow the clone */ 700 } 701 702 static int 703 noopen(struct dev_open_args *ap) 704 { 705 return (ENODEV); 706 } 707 708 static int 709 noclose(struct dev_close_args *ap) 710 { 711 return (ENODEV); 712 } 713 714 static int 715 noread(struct dev_read_args *ap) 716 { 717 return (ENODEV); 718 } 719 720 static int 721 nowrite(struct dev_write_args *ap) 722 { 723 return (ENODEV); 724 } 725 726 static int 727 noioctl(struct dev_ioctl_args *ap) 728 { 729 return (ENODEV); 730 } 731 732 static int 733 nokqfilter(struct dev_kqfilter_args *ap) 734 { 735 return (ENODEV); 736 } 737 738 static int 739 nommap(struct dev_mmap_args *ap) 740 { 741 return (ENODEV); 742 } 743 744 static int 745 nommap_single(struct dev_mmap_single_args *ap) 746 { 747 return (ENODEV); 748 } 749 750 static int 751 nostrategy(struct dev_strategy_args *ap) 752 { 753 struct bio *bio = ap->a_bio; 754 755 bio->bio_buf->b_flags |= B_ERROR; 756 bio->bio_buf->b_error = EOPNOTSUPP; 757 biodone(bio); 758 return(0); 759 } 760 761 static int 762 nopsize(struct dev_psize_args *ap) 763 { 764 ap->a_result = 0; 765 return(0); 766 } 767 768 static int 769 nodump(struct dev_dump_args *ap) 770 { 771 return (ENODEV); 772 } 773 774 /* 775 * XXX this is probably bogus. Any device that uses it isn't checking the 776 * minor number. 777 */ 778 int 779 nullopen(struct dev_open_args *ap) 780 { 781 return (0); 782 } 783 784 int 785 nullclose(struct dev_close_args *ap) 786 { 787 return (0); 788 } 789 790