1 /* 2 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com> All rights reserved. 3 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Terrence R. Lambert 4 * cdevsw from kern/kern_conf.c Copyright (c) 1995 Julian R. Elishcer, 5 * All rights reserved. 6 * Copyright (c) 1982, 1986, 1991, 1993 7 * The Regents of the University of California. All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/queue.h>
#include <sys/device.h>
#include <sys/tree.h>
#include <sys/syslink_rpc.h>
#include <sys/proc.h>
#include <machine/stdarg.h>
#include <sys/devfs.h>
#include <sys/dsched.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

/*
 * Statistics: counts of read/write/strategy calls dispatched to drivers
 * flagged D_MPSAFE vs. drivers that still require the MP lock.  Exported
 * read-only through the kern.* sysctls below.
 */
static int mpsafe_writes;
static int mplock_writes;
static int mpsafe_reads;
static int mplock_reads;
static int mpsafe_strategies;
static int mplock_strategies;

SYSCTL_INT(_kern, OID_AUTO, mpsafe_writes, CTLFLAG_RD, &mpsafe_writes,
	   0, "mpsafe writes");
SYSCTL_INT(_kern, OID_AUTO, mplock_writes, CTLFLAG_RD, &mplock_writes,
	   0, "non-mpsafe writes");
SYSCTL_INT(_kern, OID_AUTO, mpsafe_reads, CTLFLAG_RD, &mpsafe_reads,
	   0, "mpsafe reads");
SYSCTL_INT(_kern, OID_AUTO, mplock_reads, CTLFLAG_RD, &mplock_reads,
	   0, "non-mpsafe reads");
SYSCTL_INT(_kern, OID_AUTO, mpsafe_strategies, CTLFLAG_RD, &mpsafe_strategies,
	   0, "mpsafe strategies");
SYSCTL_INT(_kern, OID_AUTO, mplock_strategies, CTLFLAG_RD, &mplock_strategies,
	   0, "non-mpsafe strategies");

/*
 * system link descriptors identify the command in the
 * arguments structure.  Each descriptor records the byte offset of its
 * function pointer within struct dev_ops plus the op's name; the offset
 * is what dev_doperate()/dev_doperate_ops() use for dispatch.
 */
#define DDESCNAME(name) __CONCAT(__CONCAT(dev_,name),_desc)

#define DEVOP_DESC_INIT(name)						\
	    struct syslink_desc DDESCNAME(name) = {			\
		__offsetof(struct dev_ops, __CONCAT(d_, name)),		\
	    #name }

DEVOP_DESC_INIT(default);
DEVOP_DESC_INIT(open);
DEVOP_DESC_INIT(close);
DEVOP_DESC_INIT(read);
DEVOP_DESC_INIT(write);
DEVOP_DESC_INIT(ioctl);
DEVOP_DESC_INIT(dump);
DEVOP_DESC_INIT(psize);
DEVOP_DESC_INIT(mmap);
DEVOP_DESC_INIT(mmap_single);
DEVOP_DESC_INIT(strategy);
DEVOP_DESC_INIT(kqfilter);
DEVOP_DESC_INIT(revoke);
DEVOP_DESC_INIT(clone);

/*
 * Misc default ops.  default_dev_ops supplies the fallback entries that
 * compile_dev_ops() copies into templates with NULL slots.
 */
struct dev_ops dead_dev_ops;

struct dev_ops default_dev_ops = {
	{ "null" },
	.d_default = NULL,	/* must be NULL */
	.d_open = noopen,
	.d_close = noclose,
	.d_read = noread,
	.d_write = nowrite,
	.d_ioctl = noioctl,
	.d_mmap = nommap,
	.d_mmap_single = nommap_single,
	.d_strategy = nostrategy,
	.d_dump = nodump,
	.d_psize = nopsize,
	.d_kqfilter = nokqfilter,
	.d_revoke = norevoke,
	.d_clone = noclone
};

/*
 * Returns non-zero when the device's driver has not declared itself
 * MPSAFE and therefore must be called with the MP lock held.
 */
static __inline
int
dev_needmplock(cdev_t dev)
{
	return((dev->si_ops->head.flags & D_MPSAFE) == 0);
}

/************************************************************************
 *			GENERAL DEVICE API FUNCTIONS			*
 ************************************************************************
 *
 * The MPSAFEness of these depends on dev->si_ops->head.flags
 */

/*
 * dev_dopen() - dispatch a d_open request to the device's driver,
 * wrapping the call with the MP lock when required.
 */
int
dev_dopen(cdev_t dev, int oflags, int devtype, struct ucred *cred)
{
	struct dev_open_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_open_desc;
	ap.a_head.a_dev = dev;
	ap.a_oflags = oflags;
	ap.a_devtype = devtype;
	ap.a_cred = cred;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_open(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

/*
 * dev_dclose() - dispatch a d_close request to the device's driver.
 */
int
dev_dclose(cdev_t dev, int fflag, int devtype)
{
	struct dev_close_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_close_desc;
	ap.a_head.a_dev = dev;
	ap.a_fflag = fflag;
	ap.a_devtype = devtype;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_close(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

/*
 * dev_dread() - dispatch a d_read request.  Updates the mpsafe/mplock
 * read statistics and, on success, stamps si_lastread.
 */
int
dev_dread(cdev_t dev, struct uio *uio, int ioflag)
{
	struct dev_read_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_read_desc;
	ap.a_head.a_dev = dev;
	ap.a_uio = uio;
	ap.a_ioflag = ioflag;

	if (needmplock) {
		get_mplock();
		++mplock_reads;
	} else {
		++mpsafe_reads;
	}
	error = dev->si_ops->d_read(&ap);
	if (needmplock)
		rel_mplock();
	/* note: si_lastread is only updated on successful reads */
	if (error == 0)
		dev->si_lastread = time_uptime;
	return (error);
}

/*
 * dev_dwrite() - dispatch a d_write request.  Updates the mpsafe/mplock
 * write statistics.  Note the asymmetry with dev_dread(): si_lastwrite
 * is stamped unconditionally before the driver is called.
 */
int
dev_dwrite(cdev_t dev, struct uio *uio, int ioflag)
{
	struct dev_write_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	dev->si_lastwrite = time_uptime;
	ap.a_head.a_desc = &dev_write_desc;
	ap.a_head.a_dev = dev;
	ap.a_uio = uio;
	ap.a_ioflag = ioflag;

	if (needmplock) {
		get_mplock();
		++mplock_writes;
	} else {
		++mpsafe_writes;
	}
	error = dev->si_ops->d_write(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

/*
 * dev_dioctl() - dispatch a d_ioctl request to the device's driver.
 */
int
dev_dioctl(cdev_t dev, u_long cmd, caddr_t data, int fflag, struct ucred *cred,
	   struct sysmsg *msg)
{
	struct dev_ioctl_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_ioctl_desc;
	ap.a_head.a_dev = dev;
	ap.a_cmd = cmd;
	ap.a_data = data;
	ap.a_fflag = fflag;
	ap.a_cred = cred;
	ap.a_sysmsg = msg;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_ioctl(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

/*
 * dev_dmmap() - dispatch a d_mmap request.  Returns the mapping result
 * from the driver on success, or -1 on error.
 */
int
dev_dmmap(cdev_t dev, vm_offset_t offset, int nprot)
{
	struct dev_mmap_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_mmap_desc;
	ap.a_head.a_dev = dev;
	ap.a_offset = offset;
	ap.a_nprot = nprot;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_mmap(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return(ap.a_result);
	return(-1);
}

/*
 * dev_dmmap_single() - dispatch a d_mmap_single request, returning the
 * driver's error code directly.
 */
int
dev_dmmap_single(cdev_t dev, vm_ooffset_t *offset, vm_size_t size,
		 struct vm_object **object, int nprot)
{
	struct dev_mmap_single_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_mmap_single_desc;
	ap.a_head.a_dev = dev;
	ap.a_offset = offset;
	ap.a_size = size;
	ap.a_object = object;
	ap.a_nprot = nprot;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_mmap_single(&ap);
	if (needmplock)
		rel_mplock();

	return(error);
}

/*
 * dev_dclone() - dispatch a d_clone request to the device's driver.
 */
int
dev_dclone(cdev_t dev)
{
	struct dev_clone_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_clone_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_clone(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

/*
 * dev_drevoke() - dispatch a d_revoke request to the device's driver.
 */
int
dev_drevoke(cdev_t dev)
{
	struct dev_revoke_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_revoke_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_revoke(&ap);
	if (needmplock)
		rel_mplock();

	return (error);
}

/*
 * Core device strategy call, used to issue I/O on a device.  There are
 * two versions, a non-chained version and a chained version.  The chained
 * version reuses a BIO set up by vn_strategy().
 * The only difference is
 * that, for now, we do not push a new tracking structure when chaining
 * from vn_strategy.  XXX this will ultimately have to change.
 */
void
dev_dstrategy(cdev_t dev, struct bio *bio)
{
	struct dev_strategy_args ap;
	struct bio_track *track;
	int needmplock = dev_needmplock(dev);

	ap.a_head.a_desc = &dev_strategy_desc;
	ap.a_head.a_dev = dev;
	ap.a_bio = bio;

	/* the bio must not already be tracked or completed */
	KKASSERT(bio->bio_track == NULL);
	KKASSERT(bio->bio_buf->b_cmd != BUF_CMD_DONE);
	/* attach the bio to the device's read or write tracker */
	if (bio->bio_buf->b_cmd == BUF_CMD_READ)
		track = &dev->si_track_read;
	else
		track = &dev->si_track_write;
	bio_track_ref(track);
	bio->bio_track = track;

	/* register the buf with the I/O scheduler if not yet tagged */
	if (dsched_is_clear_buf_priv(bio->bio_buf))
		dsched_new_buf(bio->bio_buf);

	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
	if (needmplock) {
		get_mplock();
		++mplock_strategies;
	} else {
		++mpsafe_strategies;
	}
	(void)dev->si_ops->d_strategy(&ap);
	if (needmplock)
		rel_mplock();
}

/*
 * dev_dstrategy_chain() - chained variant: reuses the bio_track set up
 * by the original vn_strategy() call instead of pushing a new one.
 */
void
dev_dstrategy_chain(cdev_t dev, struct bio *bio)
{
	struct dev_strategy_args ap;
	int needmplock = dev_needmplock(dev);

	ap.a_head.a_desc = &dev_strategy_desc;
	ap.a_head.a_dev = dev;
	ap.a_bio = bio;

	/* chained bios must already carry a track and not be done */
	KKASSERT(bio->bio_track != NULL);
	KKASSERT((bio->bio_flags & BIO_DONE) == 0);
	if (needmplock)
		get_mplock();
	(void)dev->si_ops->d_strategy(&ap);
	if (needmplock)
		rel_mplock();
}

/*
 * note: the disk layer is expected to set count, blkno, and secsize before
 * forwarding the message.
 */
/*
 * dev_ddump() - dispatch a d_dump (crash dump) request.  count, blkno
 * and secsize are zeroed here; the disk layer fills them in before the
 * message reaches the driver (see note above).
 */
int
dev_ddump(cdev_t dev, void *virtual, vm_offset_t physical, off_t offset,
    size_t length)
{
	struct dev_dump_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_dump_desc;
	ap.a_head.a_dev = dev;
	ap.a_count = 0;
	ap.a_blkno = 0;
	ap.a_secsize = 0;
	ap.a_virtual = virtual;
	ap.a_physical = physical;
	ap.a_offset = offset;
	ap.a_length = length;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_dump(&ap);
	if (needmplock)
		rel_mplock();
	return (error);
}

/*
 * dev_dpsize() - query the device's partition size.  Returns the
 * driver-reported size on success or -1 on error.
 */
int64_t
dev_dpsize(cdev_t dev)
{
	struct dev_psize_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_psize_desc;
	ap.a_head.a_dev = dev;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_psize(&ap);
	if (needmplock)
		rel_mplock();

	if (error == 0)
		return (ap.a_result);
	return(-1);
}

/*
 * Pass-thru to the device kqfilter.
 *
 * NOTE: We explicitly preset a_result to 0 so d_kqfilter() functions
 * which return 0 do not have to bother setting a_result.
 */
int
dev_dkqfilter(cdev_t dev, struct knote *kn)
{
	struct dev_kqfilter_args ap;
	int needmplock = dev_needmplock(dev);
	int error;

	ap.a_head.a_desc = &dev_kqfilter_desc;
	ap.a_head.a_dev = dev;
	ap.a_kn = kn;
	ap.a_result = 0;

	if (needmplock)
		get_mplock();
	error = dev->si_ops->d_kqfilter(&ap);
	if (needmplock)
		rel_mplock();

	/* driver errors are collapsed to ENODEV for the caller */
	if (error == 0)
		return(ap.a_result);
	return(ENODEV);
}

/************************************************************************
 *			DEVICE HELPER FUNCTIONS				*
 ************************************************************************/

/*
 * MPSAFE - current sysref reference count on the device.
 */
int
dev_drefs(cdev_t dev)
{
	return(dev->si_sysref.refcnt);
}

/*
 * MPSAFE - device name registered in its ops header.
 */
const char *
dev_dname(cdev_t dev)
{
	return(dev->si_ops->head.name);
}

/*
 * MPSAFE - D_* flags from the device's ops header.
 */
int
dev_dflags(cdev_t dev)
{
	return(dev->si_ops->head.flags);
}

/*
 * MPSAFE - major number from the device's ops header.
 */
int
dev_dmaj(cdev_t dev)
{
	return(dev->si_ops->head.maj);
}

/*
 * Used when forwarding a request through layers.  The caller adjusts
 * ap->a_head.a_dev and then calls this function.
 */
int
dev_doperate(struct dev_generic_args *ap)
{
	int (*func)(struct dev_generic_args *);
	int needmplock = dev_needmplock(ap->a_dev);
	int error;

	/* locate the op via the descriptor's byte offset into dev_ops */
	func = *(void **)((char *)ap->a_dev->si_ops + ap->a_desc->sd_offset);

	if (needmplock)
		get_mplock();
	error = func(ap);
	if (needmplock)
		rel_mplock();

	return (error);
}

/*
 * Used by the console intercept code only.  Issue an operation through
 * a foreign ops structure allowing the ops structure associated
 * with the device to remain intact.
 */
int
dev_doperate_ops(struct dev_ops *ops, struct dev_generic_args *ap)
{
	int (*func)(struct dev_generic_args *);
	/* the foreign ops' own flags decide whether the MP lock is needed */
	int needmplock = ((ops->head.flags & D_MPSAFE) == 0);
	int error;

	func = *(void **)((char *)ops + ap->a_desc->sd_offset);

	if (needmplock)
		get_mplock();
	error = func(ap);
	if (needmplock)
		rel_mplock();

	return (error);
}

/*
 * Convert a template dev_ops into the real thing by filling in
 * uninitialized fields.
 */
void
compile_dev_ops(struct dev_ops *ops)
{
	int offset;

	/*
	 * Walk the function-pointer region of dev_ops one pointer at a
	 * time.  NULL slots fall back to d_default when set, otherwise
	 * to the corresponding default_dev_ops entry.
	 */
	for (offset = offsetof(struct dev_ops, dev_ops_first_field);
	     offset <= offsetof(struct dev_ops, dev_ops_last_field);
	     offset += sizeof(void *)
	) {
		void **func_p = (void **)((char *)ops + offset);
		void **def_p = (void **)((char *)&default_dev_ops + offset);
		if (*func_p == NULL) {
			if (ops->d_default)
				*func_p = ops->d_default;
			else
				*func_p = *def_p;
		}
	}
}

/************************************************************************
 *			MAJOR/MINOR SPACE FUNCTION			*
 ************************************************************************/

/*
 * This makes a dev_ops entry visible to userland (e.g /dev/<blah>).
 *
 * Disk devices typically register their major, e.g. 'ad0', and then call
 * into the disk label management code which overloads its own onto e.g. 'ad0'
 * to support all the various slice and partition combinations.
 *
 * The mask/match supplied in this call are a full 32 bits and the same
 * mask and match must be specified in a later dev_ops_remove() call to
 * match this add.  However, the match value for the minor number should never
 * have any bits set in the major number's bit range (8-15).  The mask value
 * may be conveniently specified as -1 without creating any major number
 * interference.
 */

/*
 * Red-black tree comparator: orders dev_ops_maj nodes by major number.
 */
static
int
rb_dev_ops_compare(struct dev_ops_maj *a, struct dev_ops_maj *b)
{
	if (a->maj < b->maj)
		return(-1);
	else if (a->maj > b->maj)
		return(1);
	return(0);
}

RB_GENERATE2(dev_ops_rb_tree, dev_ops_maj, rbnode, rb_dev_ops_compare, int, maj);

struct dev_ops_rb_tree dev_ops_rbhead = RB_INITIALIZER(dev_ops_rbhead);

/*
 * Remove all devfs devices created with the given ops (minor wildcard -1).
 */
int
dev_ops_remove_all(struct dev_ops *ops)
{
	return devfs_destroy_dev_by_ops(ops, -1);
}

/*
 * Remove only the devfs devices created with the given ops and minor.
 */
int
dev_ops_remove_minor(struct dev_ops *ops, int minor)
{
	return devfs_destroy_dev_by_ops(ops, minor);
}

/*
 * Swap iops in as the device's ops vector, inheriting the header fields
 * from the previous ops.  Returns the previous ops so the caller can
 * undo the intercept with dev_ops_restore().
 */
struct dev_ops *
dev_ops_intercept(cdev_t dev, struct dev_ops *iops)
{
	struct dev_ops *oops = dev->si_ops;

	compile_dev_ops(iops);
	iops->head.maj = oops->head.maj;
	iops->head.data = oops->head.data;
	iops->head.flags = oops->head.flags;
	dev->si_ops = iops;
	dev->si_flags |= SI_INTERCEPTED;

	return (oops);
}

/*
 * Undo a dev_ops_intercept(), clearing the header fields that the
 * intercept copied into the intercept ops.
 */
void
dev_ops_restore(cdev_t dev, struct dev_ops *oops)
{
	struct dev_ops *iops = dev->si_ops;

	dev->si_ops = oops;
	dev->si_flags &= ~SI_INTERCEPTED;
	iops->head.maj = 0;
	iops->head.data = NULL;
	iops->head.flags = 0;
}

/************************************************************************
 *			DEFAULT DEV OPS FUNCTIONS			*
 ************************************************************************/


/*
 * Unsupported devswitch functions (e.g. for writing to read-only device).
 * XXX may belong elsewhere.
666 */ 667 int 668 norevoke(struct dev_revoke_args *ap) 669 { 670 /* take no action */ 671 return(0); 672 } 673 674 int 675 noclone(struct dev_clone_args *ap) 676 { 677 /* take no action */ 678 return (0); /* allow the clone */ 679 } 680 681 int 682 noopen(struct dev_open_args *ap) 683 { 684 return (ENODEV); 685 } 686 687 int 688 noclose(struct dev_close_args *ap) 689 { 690 return (ENODEV); 691 } 692 693 int 694 noread(struct dev_read_args *ap) 695 { 696 return (ENODEV); 697 } 698 699 int 700 nowrite(struct dev_write_args *ap) 701 { 702 return (ENODEV); 703 } 704 705 int 706 noioctl(struct dev_ioctl_args *ap) 707 { 708 return (ENODEV); 709 } 710 711 int 712 nokqfilter(struct dev_kqfilter_args *ap) 713 { 714 return (ENODEV); 715 } 716 717 int 718 nommap(struct dev_mmap_args *ap) 719 { 720 return (ENODEV); 721 } 722 723 int 724 nommap_single(struct dev_mmap_single_args *ap) 725 { 726 return (ENODEV); 727 } 728 729 int 730 nostrategy(struct dev_strategy_args *ap) 731 { 732 struct bio *bio = ap->a_bio; 733 734 bio->bio_buf->b_flags |= B_ERROR; 735 bio->bio_buf->b_error = EOPNOTSUPP; 736 biodone(bio); 737 return(0); 738 } 739 740 int 741 nopsize(struct dev_psize_args *ap) 742 { 743 ap->a_result = 0; 744 return(0); 745 } 746 747 int 748 nodump(struct dev_dump_args *ap) 749 { 750 return (ENODEV); 751 } 752 753 /* 754 * XXX this is probably bogus. Any device that uses it isn't checking the 755 * minor number. 756 */ 757 int 758 nullopen(struct dev_open_args *ap) 759 { 760 return (0); 761 } 762 763 int 764 nullclose(struct dev_close_args *ap) 765 { 766 return (0); 767 } 768 769