/*
 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com> All rights reserved.
 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Terrence R. Lambert
 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Julian R. Elischer,
 * All rights reserved.
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/queue.h>
#include <sys/device.h>
#include <sys/tree.h>
#include <sys/syslink_rpc.h>
#include <sys/proc.h>
#include <machine/stdarg.h>
#include <sys/devfs.h>
#include <sys/dsched.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

/*
 * Counters distinguishing how many read, write, and strategy operations
 * were dispatched to MPSAFE devices versus devices that still require
 * the giant MP lock.  Exported read-only through the sysctls below.
 */
static int mpsafe_writes;
static int mplock_writes;
static int mpsafe_reads;
static int mplock_reads;
static int mpsafe_strategies;
static int mplock_strategies;

SYSCTL_INT(_kern, OID_AUTO, mpsafe_writes, CTLFLAG_RD, &mpsafe_writes,
	   0, "mpsafe writes");
SYSCTL_INT(_kern, OID_AUTO, mplock_writes, CTLFLAG_RD, &mplock_writes,
	   0, "non-mpsafe writes");
SYSCTL_INT(_kern, OID_AUTO, mpsafe_reads, CTLFLAG_RD, &mpsafe_reads,
	   0, "mpsafe reads");
SYSCTL_INT(_kern, OID_AUTO, mplock_reads, CTLFLAG_RD, &mplock_reads,
	   0, "non-mpsafe reads");
SYSCTL_INT(_kern, OID_AUTO, mpsafe_strategies, CTLFLAG_RD, &mpsafe_strategies,
	   0, "mpsafe strategies");
SYSCTL_INT(_kern, OID_AUTO, mplock_strategies, CTLFLAG_RD, &mplock_strategies,
	   0, "non-mpsafe strategies");

/*
 * system link descriptors identify the command in the
 * arguments structure.
 */
#define DDESCNAME(name)	__CONCAT(__CONCAT(dev_,name),_desc)

/*
 * Build the syslink descriptor for op 'name': the byte offset of the
 * d_<name> member within struct dev_ops plus the op's string name.
 * dev_doperate()/dev_doperate_ops() use sd_offset to fetch the function
 * pointer out of an ops vector at runtime.
 */
#define DEVOP_DESC_INIT(name)						\
	    struct syslink_desc DDESCNAME(name) = {			\
		__offsetof(struct dev_ops, __CONCAT(d_, name)),	\
	    #name }

DEVOP_DESC_INIT(default);
DEVOP_DESC_INIT(open);
DEVOP_DESC_INIT(close);
DEVOP_DESC_INIT(read);
DEVOP_DESC_INIT(write);
DEVOP_DESC_INIT(ioctl);
DEVOP_DESC_INIT(dump);
DEVOP_DESC_INIT(psize);
DEVOP_DESC_INIT(mmap);
DEVOP_DESC_INIT(mmap_single);
DEVOP_DESC_INIT(strategy);
DEVOP_DESC_INIT(kqfilter);
DEVOP_DESC_INIT(revoke);
DEVOP_DESC_INIT(clone);

/*
 * Misc default ops
 */
struct dev_ops dead_dev_ops;

/*
 * Fallback ops vector.  compile_dev_ops() copies entries from here into
 * any template whose corresponding field is NULL and which has no
 * d_default handler.
 */
struct dev_ops default_dev_ops = {
	{ "null" },
	.d_default = NULL,	/* must be NULL */
	.d_open = noopen,
	.d_close = noclose,
	.d_read = noread,
	.d_write = nowrite,
	.d_ioctl = noioctl,
	.d_mmap = nommap,
	.d_mmap_single = nommap_single,
	.d_strategy = nostrategy,
	.d_dump = nodump,
	.d_psize = nopsize,
	.d_kqfilter = nokqfilter,
	.d_revoke = norevoke,
	.d_clone = noclone
};

/*
 * Return non-zero if calls into this device's ops vector must be
 * bracketed by the giant MP lock (i.e. the ops are not flagged D_MPSAFE).
 */
static __inline
int
dev_needmplock(cdev_t dev)
{
	return((dev->si_ops->head.flags & D_MPSAFE) == 0);
}

/************************************************************************
 *			GENERAL DEVICE API FUNCTIONS			*
 ************************************************************************
 *
 * The MPSAFEness of these depends on dev->si_ops->head.flags
 */

/*
 * Issue the device's d_open op.  The MP lock is held across the call
 * for non-MPSAFE devices.
 */
int
dev_dopen(cdev_t dev, int oflags, int devtype, struct ucred *cred,
	  struct file *fp)
{
	struct dev_open_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_open_desc;
	ap.a_head.a_dev = dev;
	ap.a_oflags = oflags;
	ap.a_devtype = devtype;
	ap.a_cred = cred;
	ap.a_fp = fp;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_open(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

/*
 * Issue the device's d_close op.
 */
int
dev_dclose(cdev_t dev, int fflag, int devtype, struct file *fp)
{
	struct dev_close_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_close_desc;
	ap.a_head.a_dev = dev;
	ap.a_fflag = fflag;
	ap.a_devtype = devtype;
	ap.a_fp = fp;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_close(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

/*
 * Issue the device's d_read op.  Updates the mpsafe/mplock read
 * statistics and, on success only, stamps si_lastread.
 */
int
dev_dread(cdev_t dev, struct uio *uio, int ioflag, struct file *fp)
{
	struct dev_read_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_read_desc;
	ap.a_head.a_dev = dev;
	ap.a_uio = uio;
	ap.a_ioflag = ioflag;
	ap.a_fp = fp;

	if (needmplock) {
		get_mplock();
		++mplock_reads;
	} else {
		++mpsafe_reads;
	}
	error = dev->si_ops->d_read(&ap);
	if (needmplock)
		rel_mplock();
	if (error == 0)
		dev->si_lastread = time_uptime;
	return (error);
}

/*
 * Issue the device's d_write op.  Note the asymmetry with dev_dread():
 * si_lastwrite is stamped unconditionally before the op is issued.
 */
int
dev_dwrite(cdev_t dev, struct uio *uio, int ioflag, struct file *fp)
{
	struct dev_write_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	dev->si_lastwrite = time_uptime;
	ap.a_head.a_desc = &dev_write_desc;
	ap.a_head.a_dev = dev;
	ap.a_uio = uio;
	ap.a_ioflag = ioflag;
	ap.a_fp = fp;

	if (needmplock) {
		get_mplock();
		++mplock_writes;
	} else {
		++mpsafe_writes;
	}
	error = dev->si_ops->d_write(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

/*
 * Issue the device's d_ioctl op.
 */
int
dev_dioctl(cdev_t dev, u_long cmd, caddr_t data, int fflag, struct ucred *cred,
	   struct sysmsg *msg, struct file *fp)
{
	struct dev_ioctl_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_ioctl_desc;
	ap.a_head.a_dev = dev;
	ap.a_cmd = cmd;
	ap.a_data = data;
	ap.a_fflag = fflag;
	ap.a_cred = cred;
	ap.a_sysmsg = msg;
	ap.a_fp = fp;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_ioctl(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

/*
 * Issue the device's d_mmap op.  Returns the op's a_result on success;
 * any error from the op is collapsed to -1 (the error code itself is
 * discarded by this interface).
 */
int
dev_dmmap(cdev_t dev, vm_offset_t offset, int nprot, struct file *fp)
{
	struct dev_mmap_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_mmap_desc;
	ap.a_head.a_dev = dev;
	ap.a_offset = offset;
	ap.a_nprot = nprot;
	ap.a_fp = fp;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_mmap(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return(ap.a_result);
	return(-1);
}

/*
 * Issue the device's d_mmap_single op.  Unlike dev_dmmap() this returns
 * the op's error code directly; results are passed back through the
 * offset and object pointers.
 */
int
dev_dmmap_single(cdev_t dev, vm_ooffset_t *offset, vm_size_t size,
		 struct vm_object **object, int nprot, struct file *fp)
{
	struct dev_mmap_single_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_mmap_single_desc;
	ap.a_head.a_dev = dev;
	ap.a_offset = offset;
	ap.a_size = size;
	ap.a_object = object;
	ap.a_nprot = nprot;
	ap.a_fp = fp;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_mmap_single(&ap);
	if (needmplock)
		rel_mplock();

	return(error);
}

/*
 * Issue the device's d_clone op.
 */
int
dev_dclone(cdev_t dev)
{
	struct dev_clone_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_clone_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_clone(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

/*
 * Issue the device's d_revoke op.
 */
int
dev_drevoke(cdev_t dev)
{
	struct dev_revoke_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_revoke_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_revoke(&ap);
	if (needmplock)
		rel_mplock();

	return (error);
}

/*
 * Core device strategy call, used to issue I/O on a device.  There are
 * two versions, a non-chained version and a chained version.  The chained
 * version reuses a BIO set up by vn_strategy().  The only difference is
 * that, for now, we do not push a new tracking structure when chaining
 * from vn_strategy.  XXX this will ultimately have to change.
 */
void
dev_dstrategy(cdev_t dev, struct bio *bio)
{
	struct dev_strategy_args ap;
	struct bio_track *track;
	int needmplock = dev_needmplock(dev);

	ap.a_head.a_desc = &dev_strategy_desc;
	ap.a_head.a_dev = dev;
	ap.a_bio = bio;

	/*
	 * The bio must not already be tracked or completed; associate it
	 * with the device's read or write track based on the buf command
	 * and take a track reference before issuing the op.
	 */
	KKASSERT(bio->bio_track == NULL);
	KKASSERT(bio->bio_buf->b_cmd != BUF_CMD_DONE);
	if (bio->bio_buf->b_cmd == BUF_CMD_READ)
		track = &dev->si_track_read;
	else
		track = &dev->si_track_write;
	bio_track_ref(track);
	bio->bio_track = track;

	/* Register the buf with the I/O scheduler if not already done. */
	if (dsched_is_clear_buf_priv(bio->bio_buf))
		dsched_new_buf(bio->bio_buf);

	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
	if (needmplock) {
		get_mplock();
		++mplock_strategies;
	} else {
		++mpsafe_strategies;
	}
	(void)dev->si_ops->d_strategy(&ap);
	if (needmplock)
		rel_mplock();
}

/*
 * Chained strategy call: the bio is expected to already carry a track
 * reference (set up by vn_strategy()), so no new tracking structure is
 * pushed here.  Does not update the strategy statistics counters.
 */
void
dev_dstrategy_chain(cdev_t dev, struct bio *bio)
{
	struct dev_strategy_args ap;
	int needmplock = dev_needmplock(dev);

	ap.a_head.a_desc = &dev_strategy_desc;
	ap.a_head.a_dev = dev;
	ap.a_bio = bio;

	KKASSERT(bio->bio_track != NULL);
	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
	if (needmplock)
		get_mplock();
	(void)dev->si_ops->d_strategy(&ap);
	if (needmplock)
		rel_mplock();
}

/*
 * note: the disk layer is expected to set count, blkno, and secsize before
 * forwarding the message.
406 */ 407 int 408 dev_ddump(cdev_t dev, void *virtual, vm_offset_t physical, off_t offset, 409 size_t length) 410 { 411 struct dev_dump_args ap; 412 int needmplock = dev_needmplock(dev); 413 int error; 414 415 ap.a_head.a_desc = &dev_dump_desc; 416 ap.a_head.a_dev = dev; 417 ap.a_count = 0; 418 ap.a_blkno = 0; 419 ap.a_secsize = 0; 420 ap.a_virtual = virtual; 421 ap.a_physical = physical; 422 ap.a_offset = offset; 423 ap.a_length = length; 424 425 if (needmplock) 426 get_mplock(); 427 error = dev->si_ops->d_dump(&ap); 428 if (needmplock) 429 rel_mplock(); 430 return (error); 431 } 432 433 int64_t 434 dev_dpsize(cdev_t dev) 435 { 436 struct dev_psize_args ap; 437 int needmplock = dev_needmplock(dev); 438 int error; 439 440 ap.a_head.a_desc = &dev_psize_desc; 441 ap.a_head.a_dev = dev; 442 443 if (needmplock) 444 get_mplock(); 445 error = dev->si_ops->d_psize(&ap); 446 if (needmplock) 447 rel_mplock(); 448 449 if (error == 0) 450 return (ap.a_result); 451 return(-1); 452 } 453 454 /* 455 * Pass-thru to the device kqfilter. 456 * 457 * NOTE: We explicitly preset a_result to 0 so d_kqfilter() functions 458 * which return 0 do not have to bother setting a_result. 
 */
/*
 * Issue the device's d_kqfilter op.  Returns the op's a_result (preset
 * to 0, see NOTE above) on success; any error from the op is mapped to
 * ENODEV, discarding the op's own error code.
 */
int
dev_dkqfilter(cdev_t dev, struct knote *kn, struct file *fp)
{
	struct dev_kqfilter_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_kqfilter_desc;
	ap.a_head.a_dev = dev;
	ap.a_kn = kn;
	ap.a_result = 0;
	ap.a_fp = fp;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_kqfilter(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return(ap.a_result);
	return(ENODEV);
}

/************************************************************************
 *			DEVICE HELPER FUNCTIONS				*
 ************************************************************************/

/*
 * Return the device's current sysref reference count.
 *
 * MPSAFE
 */
int
dev_drefs(cdev_t dev)
{
	return(dev->si_sysref.refcnt);
}

/*
 * Return the name string from the device's ops vector head.
 *
 * MPSAFE
 */
const char *
dev_dname(cdev_t dev)
{
	return(dev->si_ops->head.name);
}

/*
 * Return the flags from the device's ops vector head (e.g. D_MPSAFE).
 *
 * MPSAFE
 */
int
dev_dflags(cdev_t dev)
{
	return(dev->si_ops->head.flags);
}

/*
 * Return the major number from the device's ops vector head.
 *
 * MPSAFE
 */
int
dev_dmaj(cdev_t dev)
{
	return(dev->si_ops->head.maj);
}

/*
 * Used when forwarding a request through layers.  The caller adjusts
 * ap->a_head.a_dev and then calls this function.
 */
int
dev_doperate(struct dev_generic_args *ap)
{
	int (*func)(struct dev_generic_args *);
	int needmplock = dev_needmplock(ap->a_dev);
	int error;

	/*
	 * Fetch the op's function pointer out of the device's ops vector
	 * using the byte offset recorded in the syslink descriptor
	 * (see DEVOP_DESC_INIT).
	 */
	func = *(void **)((char *)ap->a_dev->si_ops + ap->a_desc->sd_offset);

	if (needmplock)
		get_mplock();
	error = func(ap);
	if (needmplock)
		rel_mplock();

	return (error);
}

/*
 * Used by the console intercept code only.  Issue an operation through
 * a foreign ops structure allowing the ops structure associated
 * with the device to remain intact.
 */
/*
 * Like dev_doperate(), but dispatch through an explicitly supplied ops
 * vector instead of the device's own.  The MPSAFE decision is likewise
 * made from the supplied vector's flags.
 */
int
dev_doperate_ops(struct dev_ops *ops, struct dev_generic_args *ap)
{
	int (*func)(struct dev_generic_args *);
	int needmplock = ((ops->head.flags & D_MPSAFE) == 0);
	int error;

	/* Fetch the function pointer at the descriptor's recorded offset. */
	func = *(void **)((char *)ops + ap->a_desc->sd_offset);

	if (needmplock)
		get_mplock();
	error = func(ap);
	if (needmplock)
		rel_mplock();

	return (error);
}

/*
 * Convert a template dev_ops into the real thing by filling in
 * uninitialized fields.
 *
 * Walks the function-pointer fields between dev_ops_first_field and
 * dev_ops_last_field; each NULL entry is replaced by the template's
 * d_default handler if one is set, otherwise by the corresponding
 * entry from default_dev_ops.
 */
void
compile_dev_ops(struct dev_ops *ops)
{
	int offset;

	for (offset = offsetof(struct dev_ops, dev_ops_first_field);
	     offset <= offsetof(struct dev_ops, dev_ops_last_field);
	     offset += sizeof(void *)
	) {
		void **func_p = (void **)((char *)ops + offset);
		void **def_p = (void **)((char *)&default_dev_ops + offset);
		if (*func_p == NULL) {
			if (ops->d_default)
				*func_p = ops->d_default;
			else
				*func_p = *def_p;
		}
	}
}

/************************************************************************
 *			MAJOR/MINOR SPACE FUNCTION			*
 ************************************************************************/

/*
 * This makes a dev_ops entry visible to userland (e.g /dev/<blah>).
 *
 * Disk devices typically register their major, e.g. 'ad0', and then call
 * into the disk label management code which overloads its own onto e.g. 'ad0'
 * to support all the various slice and partition combinations.
 *
 * The mask/match supplied in this call are a full 32 bits and the same
 * mask and match must be specified in a later dev_ops_remove() call to
 * match this add.  However, the match value for the minor number should never
 * have any bits set in the major number's bit range (8-15).  The mask value
 * may be conveniently specified as -1 without creating any major number
 * interference.
610 */ 611 612 static 613 int 614 rb_dev_ops_compare(struct dev_ops_maj *a, struct dev_ops_maj *b) 615 { 616 if (a->maj < b->maj) 617 return(-1); 618 else if (a->maj > b->maj) 619 return(1); 620 return(0); 621 } 622 623 RB_GENERATE2(dev_ops_rb_tree, dev_ops_maj, rbnode, rb_dev_ops_compare, int, maj); 624 625 struct dev_ops_rb_tree dev_ops_rbhead = RB_INITIALIZER(dev_ops_rbhead); 626 627 int 628 dev_ops_remove_all(struct dev_ops *ops) 629 { 630 return devfs_destroy_dev_by_ops(ops, -1); 631 } 632 633 int 634 dev_ops_remove_minor(struct dev_ops *ops, int minor) 635 { 636 return devfs_destroy_dev_by_ops(ops, minor); 637 } 638 639 struct dev_ops * 640 dev_ops_intercept(cdev_t dev, struct dev_ops *iops) 641 { 642 struct dev_ops *oops = dev->si_ops; 643 644 compile_dev_ops(iops); 645 iops->head.maj = oops->head.maj; 646 iops->head.data = oops->head.data; 647 iops->head.flags = oops->head.flags; 648 dev->si_ops = iops; 649 dev->si_flags |= SI_INTERCEPTED; 650 651 return (oops); 652 } 653 654 void 655 dev_ops_restore(cdev_t dev, struct dev_ops *oops) 656 { 657 struct dev_ops *iops = dev->si_ops; 658 659 dev->si_ops = oops; 660 dev->si_flags &= ~SI_INTERCEPTED; 661 iops->head.maj = 0; 662 iops->head.data = NULL; 663 iops->head.flags = 0; 664 } 665 666 /************************************************************************ 667 * DEFAULT DEV OPS FUNCTIONS * 668 ************************************************************************/ 669 670 671 /* 672 * Unsupported devswitch functions (e.g. for writing to read-only device). 673 * XXX may belong elsewhere. 
674 */ 675 int 676 norevoke(struct dev_revoke_args *ap) 677 { 678 /* take no action */ 679 return(0); 680 } 681 682 int 683 noclone(struct dev_clone_args *ap) 684 { 685 /* take no action */ 686 return (0); /* allow the clone */ 687 } 688 689 int 690 noopen(struct dev_open_args *ap) 691 { 692 return (ENODEV); 693 } 694 695 int 696 noclose(struct dev_close_args *ap) 697 { 698 return (ENODEV); 699 } 700 701 int 702 noread(struct dev_read_args *ap) 703 { 704 return (ENODEV); 705 } 706 707 int 708 nowrite(struct dev_write_args *ap) 709 { 710 return (ENODEV); 711 } 712 713 int 714 noioctl(struct dev_ioctl_args *ap) 715 { 716 return (ENODEV); 717 } 718 719 int 720 nokqfilter(struct dev_kqfilter_args *ap) 721 { 722 return (ENODEV); 723 } 724 725 int 726 nommap(struct dev_mmap_args *ap) 727 { 728 return (ENODEV); 729 } 730 731 int 732 nommap_single(struct dev_mmap_single_args *ap) 733 { 734 return (ENODEV); 735 } 736 737 int 738 nostrategy(struct dev_strategy_args *ap) 739 { 740 struct bio *bio = ap->a_bio; 741 742 bio->bio_buf->b_flags |= B_ERROR; 743 bio->bio_buf->b_error = EOPNOTSUPP; 744 biodone(bio); 745 return(0); 746 } 747 748 int 749 nopsize(struct dev_psize_args *ap) 750 { 751 ap->a_result = 0; 752 return(0); 753 } 754 755 int 756 nodump(struct dev_dump_args *ap) 757 { 758 return (ENODEV); 759 } 760 761 /* 762 * XXX this is probably bogus. Any device that uses it isn't checking the 763 * minor number. 764 */ 765 int 766 nullopen(struct dev_open_args *ap) 767 { 768 return (0); 769 } 770 771 int 772 nullclose(struct dev_close_args *ap) 773 { 774 return (0); 775 } 776 777