1 /* 2 * Copyright (c) 2003,2004,2009 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * and Alex Hornung <ahornung@gmail.com> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * ---------------------------------------------------------------------------- 36 * "THE BEER-WARE LICENSE" (Revision 42): 37 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you 38 * can do whatever you want with this stuff. If we meet some day, and you think 39 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp 40 * ---------------------------------------------------------------------------- 41 * 42 * Copyright (c) 1982, 1986, 1988, 1993 43 * The Regents of the University of California. All rights reserved. 44 * (c) UNIX System Laboratories, Inc. 45 * All or some portions of this file are derived from material licensed 46 * to the University of California by American Telephone and Telegraph 47 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 48 * the permission of UNIX System Laboratories, Inc. 49 * 50 * Redistribution and use in source and binary forms, with or without 51 * modification, are permitted provided that the following conditions 52 * are met: 53 * 1. Redistributions of source code must retain the above copyright 54 * notice, this list of conditions and the following disclaimer. 55 * 2. Redistributions in binary form must reproduce the above copyright 56 * notice, this list of conditions and the following disclaimer in the 57 * documentation and/or other materials provided with the distribution. 58 * 3. Neither the name of the University nor the names of its contributors 59 * may be used to endorse or promote products derived from this software 60 * without specific prior written permission. 61 * 62 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 65 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 72 * SUCH DAMAGE. 73 * 74 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 75 * $FreeBSD: src/sys/kern/subr_disk.c,v 1.20.2.6 2001/10/05 07:14:57 peter Exp $ 76 * $FreeBSD: src/sys/ufs/ufs/ufs_disksubr.c,v 1.44.2.3 2001/03/05 05:42:19 obrien Exp $ 77 */ 78 79 #include <sys/param.h> 80 #include <sys/systm.h> 81 #include <sys/kernel.h> 82 #include <sys/proc.h> 83 #include <sys/sysctl.h> 84 #include <sys/buf.h> 85 #include <sys/conf.h> 86 #include <sys/disklabel.h> 87 #include <sys/disklabel32.h> 88 #include <sys/disklabel64.h> 89 #include <sys/diskslice.h> 90 #include <sys/diskmbr.h> 91 #include <sys/disk.h> 92 #include <sys/kerneldump.h> 93 #include <sys/malloc.h> 94 #include <machine/md_var.h> 95 #include <sys/ctype.h> 96 #include <sys/syslog.h> 97 #include <sys/device.h> 98 #include <sys/msgport.h> 99 #include <sys/devfs.h> 100 #include <sys/thread.h> 101 #include <sys/dsched.h> 102 #include <sys/queue.h> 103 #include <sys/lock.h> 104 #include <sys/udev.h> 105 #include <sys/uuid.h> 106 107 #include <sys/buf2.h> 108 #include <sys/msgport2.h> 109 #include <sys/thread2.h> 110 111 static MALLOC_DEFINE(M_DISK, "disk", "disk data"); 112 static int disk_debug_enable = 0; 113 114 static void disk_msg_autofree_reply(lwkt_port_t, lwkt_msg_t); 115 static void disk_msg_core(void *); 116 static int disk_probe_slice(struct disk *dp, cdev_t dev, int slice, int reprobe); 117 static void disk_probe(struct disk *dp, int reprobe); 118 static void _setdiskinfo(struct disk *disk, struct disk_info *info); 119 static void bioqwritereorder(struct bio_queue_head *bioq); 120 static void disk_cleanserial(char *serno); 121 static int disk_debug(int, char *, ...) __printflike(2, 3); 122 static cdev_t _disk_create_named(const char *name, int unit, struct disk *dp, 123 struct dev_ops *raw_ops, int clone); 124 125 static d_open_t diskopen; 126 static d_close_t diskclose; 127 static d_ioctl_t diskioctl; 128 static d_strategy_t diskstrategy; 129 static d_psize_t diskpsize; 130 static d_dump_t diskdump; 131 132 static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist); 133 static struct lwkt_token disklist_token; 134 static struct lwkt_token ds_token; 135 136 static struct dev_ops disk1_ops = { 137 { "disk", 0, D_DISK | D_MPSAFE | D_TRACKCLOSE }, 138 .d_open = diskopen, 139 .d_close = diskclose, 140 .d_read = physread, 141 .d_write = physwrite, 142 .d_ioctl = diskioctl, 143 .d_strategy = diskstrategy, 144 .d_dump = diskdump, 145 .d_psize = diskpsize, 146 }; 147 148 static struct dev_ops disk2_ops = { 149 { "disk", 0, D_DISK | D_MPSAFE | D_TRACKCLOSE | D_NOEMERGPGR }, 150 .d_open = diskopen, 151 .d_close = diskclose, 152 .d_read = physread, 153 .d_write = physwrite, 154 .d_ioctl = diskioctl, 155 .d_strategy = diskstrategy, 156 .d_dump = diskdump, 157 .d_psize = diskpsize, 158 }; 159 160 static struct objcache *disk_msg_cache; 161 162 struct objcache_malloc_args disk_msg_malloc_args = { 163 sizeof(struct disk_msg), M_DISK }; 164 165 static struct lwkt_port disk_dispose_port; 166 static struct lwkt_port disk_msg_port; 167 168 static int 169 disk_debug(int level, char *fmt, ...) 170 { 171 __va_list ap; 172 173 __va_start(ap, fmt); 174 if (level <= disk_debug_enable) 175 kvprintf(fmt, ap); 176 __va_end(ap); 177 178 return 0; 179 } 180 181 static int 182 disk_probe_slice(struct disk *dp, cdev_t dev, int slice, int reprobe) 183 { 184 struct disk_info *info = &dp->d_info; 185 struct diskslice *sp = &dp->d_slice->dss_slices[slice]; 186 disklabel_ops_t ops; 187 struct dev_ops *dops; 188 struct partinfo part; 189 const char *msg; 190 char uuid_buf[128]; 191 cdev_t ndev; 192 int sno; 193 u_int i; 194 195 disk_debug(2, "disk_probe_slice (begin): %s (%s)\n", 196 dev->si_name, dp->d_cdev->si_name); 197 198 sno = slice ? slice - 1 : 0; 199 dops = (dp->d_rawdev->si_ops->head.flags & D_NOEMERGPGR) ? 200 &disk2_ops : &disk1_ops; 201 202 ops = &disklabel32_ops; 203 msg = ops->op_readdisklabel(dev, sp, &sp->ds_label, info); 204 if (msg && !strcmp(msg, "no disk label")) { 205 ops = &disklabel64_ops; 206 msg = ops->op_readdisklabel(dev, sp, &sp->ds_label, info); 207 } 208 209 if (msg == NULL) { 210 if (slice != WHOLE_DISK_SLICE) 211 ops->op_adjust_label_reserved(dp->d_slice, slice, sp); 212 else 213 sp->ds_reserved = 0; 214 215 sp->ds_ops = ops; 216 for (i = 0; i < ops->op_getnumparts(sp->ds_label); i++) { 217 ops->op_loadpartinfo(sp->ds_label, i, &part); 218 if (part.fstype) { 219 if (reprobe && 220 (ndev = devfs_find_device_by_name("%s%c", 221 dev->si_name, 'a' + i)) 222 ) { 223 /* 224 * Device already exists and 225 * is still valid. 226 */ 227 ndev->si_flags |= SI_REPROBE_TEST; 228 229 /* 230 * Destroy old UUID alias 231 */ 232 destroy_dev_alias(ndev, "part-by-uuid/*"); 233 234 /* Create UUID alias */ 235 if (!kuuid_is_nil(&part.storage_uuid)) { 236 snprintf_uuid(uuid_buf, 237 sizeof(uuid_buf), 238 &part.storage_uuid); 239 make_dev_alias(ndev, 240 "part-by-uuid/%s", 241 uuid_buf); 242 udev_dict_set_cstr(ndev, "uuid", uuid_buf); 243 } 244 } else { 245 ndev = make_dev_covering(dops, 246 dp->d_rawdev->si_ops, 247 dkmakeminor(dkunit(dp->d_cdev), 248 slice, i), 249 UID_ROOT, GID_OPERATOR, 0640, 250 "%s%c", dev->si_name, 'a'+ i); 251 ndev->si_parent = dev; 252 ndev->si_iosize_max = dev->si_iosize_max; 253 ndev->si_disk = dp; 254 udev_dict_set_cstr(ndev, "subsystem", "disk"); 255 /* Inherit parent's disk type */ 256 if (dp->d_disktype) { 257 udev_dict_set_cstr(ndev, "disk-type", 258 __DECONST(char *, dp->d_disktype)); 259 } 260 261 /* Create serno alias */ 262 if (dp->d_info.d_serialno) { 263 make_dev_alias(ndev, 264 "serno/%s.s%d%c", 265 dp->d_info.d_serialno, 266 sno, 'a' + i); 267 } 268 269 /* Create UUID alias */ 270 if (!kuuid_is_nil(&part.storage_uuid)) { 271 snprintf_uuid(uuid_buf, 272 sizeof(uuid_buf), 273 &part.storage_uuid); 274 make_dev_alias(ndev, 275 "part-by-uuid/%s", 276 uuid_buf); 277 udev_dict_set_cstr(ndev, "uuid", uuid_buf); 278 } 279 ndev->si_flags |= SI_REPROBE_TEST; 280 } 281 } 282 } 283 } else if (info->d_dsflags & DSO_COMPATLABEL) { 284 msg = NULL; 285 if (sp->ds_size >= 0x100000000ULL) 286 ops = &disklabel64_ops; 287 else 288 ops = &disklabel32_ops; 289 sp->ds_label = ops->op_clone_label(info, sp); 290 } else { 291 if (sp->ds_type == DOSPTYP_386BSD || /* XXX */ 292 sp->ds_type == DOSPTYP_NETBSD || 293 sp->ds_type == DOSPTYP_OPENBSD || 294 sp->ds_type == DOSPTYP_DFLYBSD) { 295 log(LOG_WARNING, "%s: cannot find label (%s)\n", 296 dev->si_name, msg); 297 } 298 299 if (sp->ds_label.opaque != NULL && sp->ds_ops != NULL) { 300 /* Clear out old label - it's not around anymore */ 301 disk_debug(2, 302 "disk_probe_slice: clear out old diskabel on %s\n", 303 dev->si_name); 304 305 sp->ds_ops->op_freedisklabel(&sp->ds_label); 306 sp->ds_ops = NULL; 307 } 308 } 309 310 if (msg == NULL) { 311 sp->ds_wlabel = FALSE; 312 } 313 314 return (msg ? EINVAL : 0); 315 } 316 317 /* 318 * This routine is only called for newly minted drives or to reprobe 319 * a drive with no open slices. disk_probe_slice() is called directly 320 * when reprobing partition changes within slices. 321 */ 322 static void 323 disk_probe(struct disk *dp, int reprobe) 324 { 325 struct disk_info *info = &dp->d_info; 326 cdev_t dev = dp->d_cdev; 327 cdev_t ndev; 328 int error, i, sno; 329 struct diskslices *osp; 330 struct diskslice *sp; 331 struct dev_ops *dops; 332 char uuid_buf[128]; 333 334 KKASSERT (info->d_media_blksize != 0); 335 336 osp = dp->d_slice; 337 dp->d_slice = dsmakeslicestruct(BASE_SLICE, info); 338 disk_debug(1, "disk_probe (begin): %s\n", dp->d_cdev->si_name); 339 340 error = mbrinit(dev, info, &(dp->d_slice)); 341 if (error) { 342 dsgone(&osp); 343 return; 344 } 345 346 dops = (dp->d_rawdev->si_ops->head.flags & D_NOEMERGPGR) ? 347 &disk2_ops : &disk1_ops; 348 349 for (i = 0; i < dp->d_slice->dss_nslices; i++) { 350 /* 351 * Ignore the whole-disk slice, it has already been created. 352 */ 353 if (i == WHOLE_DISK_SLICE) 354 continue; 355 356 #if 1 357 /* 358 * Ignore the compatibility slice s0 if it's a device mapper 359 * volume. 360 */ 361 if ((i == COMPATIBILITY_SLICE) && 362 (info->d_dsflags & DSO_DEVICEMAPPER)) 363 continue; 364 #endif 365 366 sp = &dp->d_slice->dss_slices[i]; 367 368 /* 369 * Handle s0. s0 is a compatibility slice if there are no 370 * other slices and it has not otherwise been set up, else 371 * we ignore it. 372 */ 373 if (i == COMPATIBILITY_SLICE) { 374 sno = 0; 375 if (sp->ds_type == 0 && 376 dp->d_slice->dss_nslices == BASE_SLICE) { 377 sp->ds_size = info->d_media_blocks; 378 sp->ds_reserved = 0; 379 } 380 } else { 381 sno = i - 1; 382 sp->ds_reserved = 0; 383 } 384 385 /* 386 * Ignore 0-length slices 387 */ 388 if (sp->ds_size == 0) 389 continue; 390 391 if (reprobe && 392 (ndev = devfs_find_device_by_name("%ss%d", 393 dev->si_name, sno))) { 394 /* 395 * Device already exists and is still valid 396 */ 397 ndev->si_flags |= SI_REPROBE_TEST; 398 399 /* 400 * Destroy old UUID alias 401 */ 402 destroy_dev_alias(ndev, "slice-by-uuid/*"); 403 404 /* Create UUID alias */ 405 if (!kuuid_is_nil(&sp->ds_stor_uuid)) { 406 snprintf_uuid(uuid_buf, sizeof(uuid_buf), 407 &sp->ds_stor_uuid); 408 make_dev_alias(ndev, "slice-by-uuid/%s", 409 uuid_buf); 410 } 411 } else { 412 /* 413 * Else create new device 414 */ 415 ndev = make_dev_covering(dops, dp->d_rawdev->si_ops, 416 dkmakewholeslice(dkunit(dev), i), 417 UID_ROOT, GID_OPERATOR, 0640, 418 (info->d_dsflags & DSO_DEVICEMAPPER)? 419 "%s.s%d" : "%ss%d", dev->si_name, sno); 420 ndev->si_parent = dev; 421 ndev->si_iosize_max = dev->si_iosize_max; 422 udev_dict_set_cstr(ndev, "subsystem", "disk"); 423 /* Inherit parent's disk type */ 424 if (dp->d_disktype) { 425 udev_dict_set_cstr(ndev, "disk-type", 426 __DECONST(char *, dp->d_disktype)); 427 } 428 429 /* Create serno alias */ 430 if (dp->d_info.d_serialno) { 431 make_dev_alias(ndev, "serno/%s.s%d", 432 dp->d_info.d_serialno, sno); 433 } 434 435 /* Create UUID alias */ 436 if (!kuuid_is_nil(&sp->ds_stor_uuid)) { 437 snprintf_uuid(uuid_buf, sizeof(uuid_buf), 438 &sp->ds_stor_uuid); 439 make_dev_alias(ndev, "slice-by-uuid/%s", 440 uuid_buf); 441 } 442 443 ndev->si_disk = dp; 444 ndev->si_flags |= SI_REPROBE_TEST; 445 } 446 sp->ds_dev = ndev; 447 448 /* 449 * Probe appropriate slices for a disklabel 450 * 451 * XXX slice type 1 used by our gpt probe code. 452 * XXX slice type 0 used by mbr compat slice. 453 */ 454 if (sp->ds_type == DOSPTYP_386BSD || 455 sp->ds_type == DOSPTYP_NETBSD || 456 sp->ds_type == DOSPTYP_OPENBSD || 457 sp->ds_type == DOSPTYP_DFLYBSD || 458 sp->ds_type == 0 || 459 sp->ds_type == 1) { 460 if (dp->d_slice->dss_first_bsd_slice == 0) 461 dp->d_slice->dss_first_bsd_slice = i; 462 disk_probe_slice(dp, ndev, i, reprobe); 463 } 464 } 465 dsgone(&osp); 466 disk_debug(1, "disk_probe (end): %s\n", dp->d_cdev->si_name); 467 } 468 469 470 static void 471 disk_msg_core(void *arg) 472 { 473 struct disk *dp; 474 struct diskslice *sp; 475 disk_msg_t msg; 476 int run; 477 478 lwkt_gettoken(&disklist_token); 479 lwkt_initport_thread(&disk_msg_port, curthread); 480 wakeup(curthread); /* synchronous startup */ 481 lwkt_reltoken(&disklist_token); 482 483 lwkt_gettoken(&ds_token); 484 run = 1; 485 486 while (run) { 487 msg = (disk_msg_t)lwkt_waitport(&disk_msg_port, 0); 488 489 switch (msg->hdr.u.ms_result) { 490 case DISK_DISK_PROBE: 491 dp = (struct disk *)msg->load; 492 disk_debug(1, 493 "DISK_DISK_PROBE: %s\n", 494 dp->d_cdev->si_name); 495 disk_iocom_update(dp); 496 disk_probe(dp, 0); 497 break; 498 case DISK_DISK_DESTROY: 499 dp = (struct disk *)msg->load; 500 disk_debug(1, 501 "DISK_DISK_DESTROY: %s\n", 502 dp->d_cdev->si_name); 503 disk_iocom_uninit(dp); 504 505 /* 506 * Interlock against struct disk enumerations. 507 * Wait for enumerations to complete then remove 508 * the dp from the list before tearing it down. 509 * This avoids numerous races. 510 */ 511 lwkt_gettoken(&disklist_token); 512 while (dp->d_refs) 513 tsleep(&dp->d_refs, 0, "diskdel", hz / 10); 514 LIST_REMOVE(dp, d_list); 515 516 dsched_disk_destroy(dp); 517 devfs_destroy_related(dp->d_cdev); 518 destroy_dev(dp->d_cdev); 519 destroy_only_dev(dp->d_rawdev); 520 521 lwkt_reltoken(&disklist_token); 522 523 if (dp->d_info.d_serialno) { 524 kfree(dp->d_info.d_serialno, M_TEMP); 525 dp->d_info.d_serialno = NULL; 526 } 527 break; 528 case DISK_UNPROBE: 529 dp = (struct disk *)msg->load; 530 disk_debug(1, 531 "DISK_DISK_UNPROBE: %s\n", 532 dp->d_cdev->si_name); 533 devfs_destroy_related(dp->d_cdev); 534 break; 535 case DISK_SLICE_REPROBE: 536 dp = (struct disk *)msg->load; 537 sp = (struct diskslice *)msg->load2; 538 devfs_clr_related_flag(sp->ds_dev, 539 SI_REPROBE_TEST); 540 disk_debug(1, 541 "DISK_SLICE_REPROBE: %s\n", 542 sp->ds_dev->si_name); 543 disk_probe_slice(dp, sp->ds_dev, 544 dkslice(sp->ds_dev), 1); 545 devfs_destroy_related_without_flag( 546 sp->ds_dev, SI_REPROBE_TEST); 547 break; 548 case DISK_DISK_REPROBE: 549 dp = (struct disk *)msg->load; 550 devfs_clr_related_flag(dp->d_cdev, SI_REPROBE_TEST); 551 disk_debug(1, 552 "DISK_DISK_REPROBE: %s\n", 553 dp->d_cdev->si_name); 554 disk_probe(dp, 1); 555 devfs_destroy_related_without_flag( 556 dp->d_cdev, SI_REPROBE_TEST); 557 break; 558 case DISK_SYNC: 559 disk_debug(1, "DISK_SYNC\n"); 560 break; 561 default: 562 devfs_debug(DEVFS_DEBUG_WARNING, 563 "disk_msg_core: unknown message " 564 "received at core\n"); 565 break; 566 } 567 lwkt_replymsg(&msg->hdr, 0); 568 } 569 lwkt_reltoken(&ds_token); 570 lwkt_exit(); 571 } 572 573 574 /* 575 * Acts as a message drain. Any message that is replied to here gets 576 * destroyed and the memory freed. 577 */ 578 static void 579 disk_msg_autofree_reply(lwkt_port_t port, lwkt_msg_t msg) 580 { 581 objcache_put(disk_msg_cache, msg); 582 } 583 584 585 void 586 disk_msg_send(uint32_t cmd, void *load, void *load2) 587 { 588 disk_msg_t disk_msg; 589 lwkt_port_t port = &disk_msg_port; 590 591 disk_msg = objcache_get(disk_msg_cache, M_WAITOK); 592 593 lwkt_initmsg(&disk_msg->hdr, &disk_dispose_port, 0); 594 595 disk_msg->hdr.u.ms_result = cmd; 596 disk_msg->load = load; 597 disk_msg->load2 = load2; 598 KKASSERT(port); 599 lwkt_sendmsg(port, &disk_msg->hdr); 600 } 601 602 void 603 disk_msg_send_sync(uint32_t cmd, void *load, void *load2) 604 { 605 struct lwkt_port rep_port; 606 disk_msg_t disk_msg; 607 lwkt_port_t port; 608 609 disk_msg = objcache_get(disk_msg_cache, M_WAITOK); 610 port = &disk_msg_port; 611 612 /* XXX could probably use curthread's built-in msgport */ 613 lwkt_initport_thread(&rep_port, curthread); 614 lwkt_initmsg(&disk_msg->hdr, &rep_port, 0); 615 616 disk_msg->hdr.u.ms_result = cmd; 617 disk_msg->load = load; 618 disk_msg->load2 = load2; 619 620 lwkt_domsg(port, &disk_msg->hdr, 0); 621 objcache_put(disk_msg_cache, disk_msg); 622 } 623 624 /* 625 * Create a raw device for the dev_ops template (which is returned). Also 626 * create a slice and unit managed disk and overload the user visible 627 * device space with it. 628 * 629 * NOTE: The returned raw device is NOT a slice and unit managed device. 630 * It is an actual raw device representing the raw disk as specified by 631 * the passed dev_ops. The disk layer not only returns such a raw device, 632 * it also uses it internally when passing (modified) commands through. 633 */ 634 cdev_t 635 disk_create(int unit, struct disk *dp, struct dev_ops *raw_ops) 636 { 637 return _disk_create_named(NULL, unit, dp, raw_ops, 0); 638 } 639 640 cdev_t 641 disk_create_clone(int unit, struct disk *dp, 642 struct dev_ops *raw_ops) 643 { 644 return _disk_create_named(NULL, unit, dp, raw_ops, 1); 645 } 646 647 cdev_t 648 disk_create_named(const char *name, int unit, struct disk *dp, 649 struct dev_ops *raw_ops) 650 { 651 return _disk_create_named(name, unit, dp, raw_ops, 0); 652 } 653 654 cdev_t 655 disk_create_named_clone(const char *name, int unit, struct disk *dp, 656 struct dev_ops *raw_ops) 657 { 658 return _disk_create_named(name, unit, dp, raw_ops, 1); 659 } 660 661 static cdev_t 662 _disk_create_named(const char *name, int unit, struct disk *dp, 663 struct dev_ops *raw_ops, int clone) 664 { 665 cdev_t rawdev; 666 struct dev_ops *dops; 667 668 disk_debug(1, "disk_create (begin): %s%d\n", name, unit); 669 670 if (name) { 671 rawdev = make_only_dev(raw_ops, dkmakewholedisk(unit), 672 UID_ROOT, GID_OPERATOR, 0640, "%s", name); 673 } else { 674 rawdev = make_only_dev(raw_ops, dkmakewholedisk(unit), 675 UID_ROOT, GID_OPERATOR, 0640, 676 "%s%d", raw_ops->head.name, unit); 677 } 678 679 bzero(dp, sizeof(*dp)); 680 681 dops = (raw_ops->head.flags & D_NOEMERGPGR) ? &disk2_ops : &disk1_ops; 682 683 dp->d_rawdev = rawdev; 684 dp->d_raw_ops = raw_ops; 685 dp->d_dev_ops = dops; 686 687 if (name) { 688 if (clone) { 689 dp->d_cdev = make_only_dev_covering( 690 dops, dp->d_rawdev->si_ops, 691 dkmakewholedisk(unit), 692 UID_ROOT, GID_OPERATOR, 0640, 693 "%s", name); 694 } else { 695 dp->d_cdev = make_dev_covering( 696 dops, dp->d_rawdev->si_ops, 697 dkmakewholedisk(unit), 698 UID_ROOT, GID_OPERATOR, 0640, 699 "%s", name); 700 } 701 } else { 702 if (clone) { 703 dp->d_cdev = make_only_dev_covering( 704 dops, dp->d_rawdev->si_ops, 705 dkmakewholedisk(unit), 706 UID_ROOT, GID_OPERATOR, 0640, 707 "%s%d", raw_ops->head.name, unit); 708 } else { 709 dp->d_cdev = make_dev_covering( 710 dops, dp->d_rawdev->si_ops, 711 dkmakewholedisk(unit), 712 UID_ROOT, GID_OPERATOR, 0640, 713 "%s%d", raw_ops->head.name, unit); 714 } 715 } 716 717 udev_dict_set_cstr(dp->d_cdev, "subsystem", "disk"); 718 dp->d_cdev->si_disk = dp; 719 720 if (name) 721 dsched_disk_create(dp, name, unit); 722 else 723 dsched_disk_create(dp, raw_ops->head.name, unit); 724 725 lwkt_gettoken(&disklist_token); 726 LIST_INSERT_HEAD(&disklist, dp, d_list); 727 lwkt_reltoken(&disklist_token); 728 729 disk_iocom_init(dp); 730 731 disk_debug(1, "disk_create (end): %s%d\n", 732 (name != NULL)?(name):(raw_ops->head.name), unit); 733 734 return (dp->d_rawdev); 735 } 736 737 int 738 disk_setdisktype(struct disk *disk, const char *type) 739 { 740 int error; 741 742 KKASSERT(disk != NULL); 743 744 disk->d_disktype = type; 745 error = udev_dict_set_cstr(disk->d_cdev, "disk-type", 746 __DECONST(char *, type)); 747 return error; 748 } 749 750 int 751 disk_getopencount(struct disk *disk) 752 { 753 return disk->d_opencount; 754 } 755 756 static void 757 _setdiskinfo(struct disk *disk, struct disk_info *info) 758 { 759 char *oldserialno; 760 761 oldserialno = disk->d_info.d_serialno; 762 bcopy(info, &disk->d_info, sizeof(disk->d_info)); 763 info = &disk->d_info; 764 765 disk_debug(1, "_setdiskinfo: %s\n", disk->d_cdev->si_name); 766 767 /* 768 * The serial number is duplicated so the caller can throw 769 * their copy away. 770 */ 771 if (info->d_serialno && info->d_serialno[0] && 772 (info->d_serialno[0] != ' ' || strlen(info->d_serialno) > 1)) { 773 info->d_serialno = kstrdup(info->d_serialno, M_TEMP); 774 disk_cleanserial(info->d_serialno); 775 if (disk->d_cdev) { 776 make_dev_alias(disk->d_cdev, "serno/%s", 777 info->d_serialno); 778 } 779 } else { 780 info->d_serialno = NULL; 781 } 782 if (oldserialno) 783 kfree(oldserialno, M_TEMP); 784 785 dsched_disk_update(disk, info); 786 787 /* 788 * The caller may set d_media_size or d_media_blocks and we 789 * calculate the other. 790 */ 791 KKASSERT(info->d_media_size == 0 || info->d_media_blocks == 0); 792 if (info->d_media_size == 0 && info->d_media_blocks) { 793 info->d_media_size = (u_int64_t)info->d_media_blocks * 794 info->d_media_blksize; 795 } else if (info->d_media_size && info->d_media_blocks == 0 && 796 info->d_media_blksize) { 797 info->d_media_blocks = info->d_media_size / 798 info->d_media_blksize; 799 } 800 801 /* 802 * The si_* fields for rawdev are not set until after the 803 * disk_create() call, so someone using the cooked version 804 * of the raw device (i.e. da0s0) will not get the right 805 * si_iosize_max unless we fix it up here. 806 */ 807 if (disk->d_cdev && disk->d_rawdev && 808 disk->d_cdev->si_iosize_max == 0) { 809 disk->d_cdev->si_iosize_max = disk->d_rawdev->si_iosize_max; 810 disk->d_cdev->si_bsize_phys = disk->d_rawdev->si_bsize_phys; 811 disk->d_cdev->si_bsize_best = disk->d_rawdev->si_bsize_best; 812 } 813 814 /* Add the serial number to the udev_dictionary */ 815 if (info->d_serialno) 816 udev_dict_set_cstr(disk->d_cdev, "serno", info->d_serialno); 817 } 818 819 /* 820 * Disk drivers must call this routine when media parameters are available 821 * or have changed. 822 */ 823 void 824 disk_setdiskinfo(struct disk *disk, struct disk_info *info) 825 { 826 _setdiskinfo(disk, info); 827 disk_msg_send(DISK_DISK_PROBE, disk, NULL); 828 disk_debug(1, "disk_setdiskinfo: sent probe for %s\n", 829 disk->d_cdev->si_name); 830 } 831 832 void 833 disk_setdiskinfo_sync(struct disk *disk, struct disk_info *info) 834 { 835 _setdiskinfo(disk, info); 836 disk_msg_send_sync(DISK_DISK_PROBE, disk, NULL); 837 disk_debug(1, "disk_setdiskinfo_sync: sent probe for %s\n", 838 disk->d_cdev->si_name); 839 } 840 841 /* 842 * This routine is called when an adapter detaches. The higher level 843 * managed disk device is destroyed while the lower level raw device is 844 * released. 845 */ 846 void 847 disk_destroy(struct disk *disk) 848 { 849 disk_msg_send_sync(DISK_DISK_DESTROY, disk, NULL); 850 return; 851 } 852 853 int 854 disk_dumpcheck(cdev_t dev, u_int64_t *size, 855 u_int64_t *blkno, u_int32_t *secsize) 856 { 857 struct partinfo pinfo; 858 int error; 859 860 if (size) 861 *size = 0; /* avoid gcc warnings */ 862 if (secsize) 863 *secsize = 512; /* avoid gcc warnings */ 864 bzero(&pinfo, sizeof(pinfo)); 865 866 error = dev_dioctl(dev, DIOCGPART, (void *)&pinfo, 0, 867 proc0.p_ucred, NULL, NULL); 868 if (error) 869 return (error); 870 871 if (pinfo.media_blksize == 0) 872 return (ENXIO); 873 874 if (blkno) /* XXX: make sure this reserved stuff is right */ 875 *blkno = pinfo.reserved_blocks + 876 pinfo.media_offset / pinfo.media_blksize; 877 if (secsize) 878 *secsize = pinfo.media_blksize; 879 if (size) 880 *size = (pinfo.media_blocks - pinfo.reserved_blocks); 881 882 return (0); 883 } 884 885 int 886 disk_dumpconf(cdev_t dev, u_int onoff) 887 { 888 struct dumperinfo di; 889 u_int64_t size, blkno; 890 u_int32_t secsize; 891 int error; 892 893 if (!onoff) 894 return set_dumper(NULL); 895 896 error = disk_dumpcheck(dev, &size, &blkno, &secsize); 897 898 if (error) 899 return ENXIO; 900 901 bzero(&di, sizeof(struct dumperinfo)); 902 di.dumper = diskdump; 903 di.priv = dev; 904 di.blocksize = secsize; 905 di.maxiosize = dev->si_iosize_max; 906 di.mediaoffset = blkno * DEV_BSIZE; 907 di.mediasize = size * DEV_BSIZE; 908 909 return set_dumper(&di); 910 } 911 912 void 913 disk_unprobe(struct disk *disk) 914 { 915 if (disk == NULL) 916 return; 917 918 disk_msg_send_sync(DISK_UNPROBE, disk, NULL); 919 } 920 921 void 922 disk_invalidate (struct disk *disk) 923 { 924 dsgone(&disk->d_slice); 925 } 926 927 /* 928 * Enumerate disks, pass a marker and an initial NULL dp to initialize, 929 * then loop with the previously returned dp. 930 * 931 * The returned dp will be referenced, preventing its destruction. When 932 * you pass the returned dp back into the loop the ref is dropped. 933 * 934 * WARNING: If terminating your loop early you must call 935 * disk_enumerate_stop(). 936 */ 937 struct disk * 938 disk_enumerate(struct disk *marker, struct disk *dp) 939 { 940 lwkt_gettoken(&disklist_token); 941 if (dp) { 942 --dp->d_refs; 943 dp = LIST_NEXT(marker, d_list); 944 LIST_REMOVE(marker, d_list); 945 } else { 946 bzero(marker, sizeof(*marker)); 947 marker->d_flags = DISKFLAG_MARKER; 948 dp = LIST_FIRST(&disklist); 949 } 950 while (dp) { 951 if ((dp->d_flags & DISKFLAG_MARKER) == 0) 952 break; 953 dp = LIST_NEXT(dp, d_list); 954 } 955 if (dp) { 956 ++dp->d_refs; 957 LIST_INSERT_AFTER(dp, marker, d_list); 958 } 959 lwkt_reltoken(&disklist_token); 960 return (dp); 961 } 962 963 /* 964 * Terminate an enumeration early. Do not call this function if the 965 * enumeration ended normally. dp can be NULL, indicating that you 966 * wish to retain the ref count on dp. 967 * 968 * This function removes the marker. 969 */ 970 void 971 disk_enumerate_stop(struct disk *marker, struct disk *dp) 972 { 973 lwkt_gettoken(&disklist_token); 974 LIST_REMOVE(marker, d_list); 975 if (dp) 976 --dp->d_refs; 977 lwkt_reltoken(&disklist_token); 978 } 979 980 static 981 int 982 sysctl_disks(SYSCTL_HANDLER_ARGS) 983 { 984 struct disk marker; 985 struct disk *dp; 986 int error, first; 987 988 first = 1; 989 error = 0; 990 dp = NULL; 991 992 while ((dp = disk_enumerate(&marker, dp))) { 993 if (!first) { 994 error = SYSCTL_OUT(req, " ", 1); 995 if (error) { 996 disk_enumerate_stop(&marker, dp); 997 break; 998 } 999 } else { 1000 first = 0; 1001 } 1002 error = SYSCTL_OUT(req, dp->d_rawdev->si_name, 1003 strlen(dp->d_rawdev->si_name)); 1004 if (error) { 1005 disk_enumerate_stop(&marker, dp); 1006 break; 1007 } 1008 } 1009 if (error == 0) 1010 error = SYSCTL_OUT(req, "", 1); 1011 return error; 1012 } 1013 1014 SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, 1015 sysctl_disks, "A", "names of available disks"); 1016 1017 /* 1018 * Open a disk device or partition. 1019 */ 1020 static 1021 int 1022 diskopen(struct dev_open_args *ap) 1023 { 1024 cdev_t dev = ap->a_head.a_dev; 1025 struct disk *dp; 1026 int error; 1027 1028 /* 1029 * dp can't be NULL here XXX. 1030 * 1031 * d_slice will be NULL if setdiskinfo() has not been called yet. 1032 * setdiskinfo() is typically called whether the disk is present 1033 * or not (e.g. CD), but the base disk device is created first 1034 * and there may be a race. 1035 */ 1036 dp = dev->si_disk; 1037 if (dp == NULL || dp->d_slice == NULL) 1038 return (ENXIO); 1039 error = 0; 1040 1041 /* 1042 * Deal with open races 1043 */ 1044 lwkt_gettoken(&ds_token); 1045 while (dp->d_flags & DISKFLAG_LOCK) { 1046 dp->d_flags |= DISKFLAG_WANTED; 1047 error = tsleep(dp, PCATCH, "diskopen", hz); 1048 if (error) { 1049 lwkt_reltoken(&ds_token); 1050 return (error); 1051 } 1052 } 1053 dp->d_flags |= DISKFLAG_LOCK; 1054 1055 /* 1056 * Open the underlying raw device. 1057 */ 1058 if (!dsisopen(dp->d_slice)) { 1059 #if 0 1060 if (!pdev->si_iosize_max) 1061 pdev->si_iosize_max = dev->si_iosize_max; 1062 #endif 1063 error = dev_dopen(dp->d_rawdev, ap->a_oflags, 1064 ap->a_devtype, ap->a_cred, NULL); 1065 } 1066 1067 if (error) 1068 goto out; 1069 error = dsopen(dev, ap->a_devtype, dp->d_info.d_dsflags, 1070 &dp->d_slice, &dp->d_info); 1071 if (!dsisopen(dp->d_slice)) { 1072 dev_dclose(dp->d_rawdev, ap->a_oflags, ap->a_devtype, NULL); 1073 } 1074 out: 1075 dp->d_flags &= ~DISKFLAG_LOCK; 1076 if (dp->d_flags & DISKFLAG_WANTED) { 1077 dp->d_flags &= ~DISKFLAG_WANTED; 1078 wakeup(dp); 1079 } 1080 lwkt_reltoken(&ds_token); 1081 1082 KKASSERT(dp->d_opencount >= 0); 1083 /* If the open was successful, bump open count */ 1084 if (error == 0) 1085 atomic_add_int(&dp->d_opencount, 1); 1086 1087 return(error); 1088 } 1089 1090 /* 1091 * Close a disk device or partition 1092 */ 1093 static 1094 int 1095 diskclose(struct dev_close_args *ap) 1096 { 1097 cdev_t dev = ap->a_head.a_dev; 1098 struct disk *dp; 1099 int error; 1100 int lcount; 1101 1102 error = 0; 1103 dp = dev->si_disk; 1104 1105 /* 1106 * The cdev_t represents the disk/slice/part. The shared 1107 * dp structure governs all cdevs associated with the disk. 1108 * 1109 * As a safety only close the underlying raw device on the last 1110 * close the disk device if our tracking of the slices/partitions 1111 * also indicates nothing is open. 1112 */ 1113 KKASSERT(dp->d_opencount >= 1); 1114 lcount = atomic_fetchadd_int(&dp->d_opencount, -1); 1115 1116 lwkt_gettoken(&ds_token); 1117 dsclose(dev, ap->a_devtype, dp->d_slice); 1118 if (lcount <= 1 && !dsisopen(dp->d_slice)) { 1119 error = dev_dclose(dp->d_rawdev, ap->a_fflag, ap->a_devtype, NULL); 1120 } 1121 lwkt_reltoken(&ds_token); 1122 1123 return (error); 1124 } 1125 1126 /* 1127 * First execute the ioctl on the disk device, and if it isn't supported 1128 * try running it on the backing device. 1129 */ 1130 static 1131 int 1132 diskioctl(struct dev_ioctl_args *ap) 1133 { 1134 cdev_t dev = ap->a_head.a_dev; 1135 struct disk *dp; 1136 int error; 1137 u_int u; 1138 1139 dp = dev->si_disk; 1140 if (dp == NULL) 1141 return (ENXIO); 1142 1143 devfs_debug(DEVFS_DEBUG_DEBUG, 1144 "diskioctl: cmd is: %lx (name: %s)\n", 1145 ap->a_cmd, dev->si_name); 1146 devfs_debug(DEVFS_DEBUG_DEBUG, 1147 "diskioctl: &dp->d_slice is: %p, %p\n", 1148 &dp->d_slice, dp->d_slice); 1149 1150 if (ap->a_cmd == DIOCGKERNELDUMP) { 1151 u = *(u_int *)ap->a_data; 1152 return disk_dumpconf(dev, u); 1153 } 1154 1155 if (ap->a_cmd == DIOCRECLUSTER && dev == dp->d_cdev) { 1156 error = disk_iocom_ioctl(dp, ap->a_cmd, ap->a_data); 1157 return error; 1158 } 1159 1160 if (&dp->d_slice == NULL || dp->d_slice == NULL || 1161 ((dp->d_info.d_dsflags & DSO_DEVICEMAPPER) && 1162 dkslice(dev) == WHOLE_DISK_SLICE)) { 1163 error = ENOIOCTL; 1164 } else { 1165 lwkt_gettoken(&ds_token); 1166 error = dsioctl(dev, ap->a_cmd, ap->a_data, ap->a_fflag, 1167 &dp->d_slice, &dp->d_info); 1168 lwkt_reltoken(&ds_token); 1169 } 1170 1171 if (error == ENOIOCTL) { 1172 error = dev_dioctl(dp->d_rawdev, ap->a_cmd, ap->a_data, 1173 ap->a_fflag, ap->a_cred, NULL, NULL); 1174 } 1175 return (error); 1176 } 1177 1178 /* 1179 * Execute strategy routine 1180 */ 1181 static 1182 int 1183 diskstrategy(struct dev_strategy_args *ap) 1184 { 1185 cdev_t dev = ap->a_head.a_dev; 1186 struct bio *bio = ap->a_bio; 1187 struct bio *nbio; 1188 struct disk *dp; 1189 1190 dp = dev->si_disk; 1191 1192 if (dp == NULL) { 1193 bio->bio_buf->b_error = ENXIO; 1194 bio->bio_buf->b_flags |= B_ERROR; 1195 biodone(bio); 1196 return(0); 1197 } 1198 KKASSERT(dev->si_disk == dp); 1199 1200 /* 1201 * The dscheck() function will also transform the slice relative 1202 * block number i.e. bio->bio_offset into a block number that can be 1203 * passed directly to the underlying raw device. If dscheck() 1204 * returns NULL it will have handled the bio for us (e.g. EOF 1205 * or error due to being beyond the device size). 1206 */ 1207 if ((nbio = dscheck(dev, bio, dp->d_slice)) != NULL) { 1208 dev_dstrategy(dp->d_rawdev, nbio); 1209 } else { 1210 biodone(bio); 1211 } 1212 return(0); 1213 } 1214 1215 /* 1216 * Return the partition size in ?blocks? 1217 */ 1218 static 1219 int 1220 diskpsize(struct dev_psize_args *ap) 1221 { 1222 cdev_t dev = ap->a_head.a_dev; 1223 struct disk *dp; 1224 1225 dp = dev->si_disk; 1226 if (dp == NULL) 1227 return(ENODEV); 1228 1229 ap->a_result = dssize(dev, &dp->d_slice); 1230 1231 if ((ap->a_result == -1) && 1232 (dp->d_info.d_dsflags & DSO_RAWPSIZE)) { 1233 ap->a_head.a_dev = dp->d_rawdev; 1234 return dev_doperate(&ap->a_head); 1235 } 1236 return(0); 1237 } 1238 1239 static int 1240 diskdump(struct dev_dump_args *ap) 1241 { 1242 cdev_t dev = ap->a_head.a_dev; 1243 struct disk *dp = dev->si_disk; 1244 u_int64_t size, offset; 1245 int error; 1246 1247 error = disk_dumpcheck(dev, &size, &ap->a_blkno, &ap->a_secsize); 1248 /* XXX: this should probably go in disk_dumpcheck somehow */ 1249 if (ap->a_length != 0) { 1250 size *= DEV_BSIZE; 1251 offset = ap->a_blkno * DEV_BSIZE; 1252 if ((ap->a_offset < offset) || 1253 (ap->a_offset + ap->a_length - offset > size)) { 1254 kprintf("Attempt to write outside dump " 1255 "device boundaries.\n"); 1256 error = ENOSPC; 1257 } 1258 } 1259 1260 if (error == 0) { 1261 ap->a_head.a_dev = dp->d_rawdev; 1262 error = dev_doperate(&ap->a_head); 1263 } 1264 1265 return(error); 1266 } 1267 1268 1269 SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD, 1270 0, sizeof(struct diskslices), "sizeof(struct diskslices)"); 1271 1272 SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD, 1273 0, sizeof(struct disk), "sizeof(struct disk)"); 1274 1275 /* 1276 * Reorder interval for burst write allowance and minor write 1277 * allowance. 1278 * 1279 * We always want to trickle some writes in to make use of the 1280 * disk's zone cache. Bursting occurs on a longer interval and only 1281 * runningbufspace is well over the hirunningspace limit. 1282 */ 1283 int bioq_reorder_burst_interval = 60; /* should be multiple of minor */ 1284 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_interval, 1285 CTLFLAG_RW, &bioq_reorder_burst_interval, 0, ""); 1286 int bioq_reorder_minor_interval = 5; 1287 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_interval, 1288 CTLFLAG_RW, &bioq_reorder_minor_interval, 0, ""); 1289 1290 int bioq_reorder_burst_bytes = 3000000; 1291 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_bytes, 1292 CTLFLAG_RW, &bioq_reorder_burst_bytes, 0, ""); 1293 int bioq_reorder_minor_bytes = 262144; 1294 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_bytes, 1295 CTLFLAG_RW, &bioq_reorder_minor_bytes, 0, ""); 1296 1297 1298 /* 1299 * Order I/Os. Generally speaking this code is designed to make better 1300 * use of drive zone caches. A drive zone cache can typically track linear 1301 * reads or writes for around 16 zones simultaniously. 1302 * 1303 * Read prioritization issues: It is possible for hundreds of megabytes worth 1304 * of writes to be queued asynchronously. This creates a huge bottleneck 1305 * for reads which reduce read bandwidth to a trickle. 1306 * 1307 * To solve this problem we generally reorder reads before writes. 1308 * 1309 * However, a large number of random reads can also starve writes and 1310 * make poor use of the drive zone cache so we allow writes to trickle 1311 * in every N reads. 1312 */ 1313 void 1314 bioqdisksort(struct bio_queue_head *bioq, struct bio *bio) 1315 { 1316 #if 0 1317 /* 1318 * The BIO wants to be ordered. Adding to the tail also 1319 * causes transition to be set to NULL, forcing the ordering 1320 * of all prior I/O's. 1321 */ 1322 if (bio->bio_buf->b_flags & B_ORDERED) { 1323 bioq_insert_tail(bioq, bio); 1324 return; 1325 } 1326 #endif 1327 1328 switch(bio->bio_buf->b_cmd) { 1329 case BUF_CMD_READ: 1330 if (bioq->transition) { 1331 /* 1332 * Insert before the first write. Bleedover writes 1333 * based on reorder intervals to prevent starvation. 1334 */ 1335 TAILQ_INSERT_BEFORE(bioq->transition, bio, bio_act); 1336 ++bioq->reorder; 1337 if (bioq->reorder % bioq_reorder_minor_interval == 0) { 1338 bioqwritereorder(bioq); 1339 if (bioq->reorder >= 1340 bioq_reorder_burst_interval) { 1341 bioq->reorder = 0; 1342 } 1343 } 1344 } else { 1345 /* 1346 * No writes queued (or ordering was forced), 1347 * insert at tail. 1348 */ 1349 TAILQ_INSERT_TAIL(&bioq->queue, bio, bio_act); 1350 } 1351 break; 1352 case BUF_CMD_WRITE: 1353 /* 1354 * Writes are always appended. If no writes were previously 1355 * queued or an ordered tail insertion occured the transition 1356 * field will be NULL. 1357 */ 1358 TAILQ_INSERT_TAIL(&bioq->queue, bio, bio_act); 1359 if (bioq->transition == NULL) 1360 bioq->transition = bio; 1361 break; 1362 default: 1363 /* 1364 * All other request types are forced to be ordered. 1365 */ 1366 bioq_insert_tail(bioq, bio); 1367 break; 1368 } 1369 } 1370 1371 /* 1372 * Move the read-write transition point to prevent reads from 1373 * completely starving our writes. This brings a number of writes into 1374 * the fold every N reads. 1375 * 1376 * We bring a few linear writes into the fold on a minor interval 1377 * and we bring a non-linear burst of writes into the fold on a major 1378 * interval. Bursting only occurs if runningbufspace is really high 1379 * (typically from syncs, fsyncs, or HAMMER flushes). 1380 */ 1381 static 1382 void 1383 bioqwritereorder(struct bio_queue_head *bioq) 1384 { 1385 struct bio *bio; 1386 off_t next_offset; 1387 size_t left; 1388 size_t n; 1389 int check_off; 1390 1391 if (bioq->reorder < bioq_reorder_burst_interval || 1392 !buf_runningbufspace_severe()) { 1393 left = (size_t)bioq_reorder_minor_bytes; 1394 check_off = 1; 1395 } else { 1396 left = (size_t)bioq_reorder_burst_bytes; 1397 check_off = 0; 1398 } 1399 1400 next_offset = bioq->transition->bio_offset; 1401 while ((bio = bioq->transition) != NULL && 1402 (check_off == 0 || next_offset == bio->bio_offset) 1403 ) { 1404 n = bio->bio_buf->b_bcount; 1405 next_offset = bio->bio_offset + n; 1406 bioq->transition = TAILQ_NEXT(bio, bio_act); 1407 if (left < n) 1408 break; 1409 left -= n; 1410 } 1411 } 1412 1413 /* 1414 * Bounds checking against the media size, used for the raw partition. 1415 * secsize, mediasize and b_blkno must all be the same units. 1416 * Possibly this has to be DEV_BSIZE (512). 1417 */ 1418 int 1419 bounds_check_with_mediasize(struct bio *bio, int secsize, uint64_t mediasize) 1420 { 1421 struct buf *bp = bio->bio_buf; 1422 int64_t sz; 1423 1424 sz = howmany(bp->b_bcount, secsize); 1425 1426 if (bio->bio_offset/DEV_BSIZE + sz > mediasize) { 1427 sz = mediasize - bio->bio_offset/DEV_BSIZE; 1428 if (sz == 0) { 1429 /* If exactly at end of disk, return EOF. */ 1430 bp->b_resid = bp->b_bcount; 1431 return 0; 1432 } 1433 if (sz < 0) { 1434 /* If past end of disk, return EINVAL. */ 1435 bp->b_error = EINVAL; 1436 return 0; 1437 } 1438 /* Otherwise, truncate request. */ 1439 bp->b_bcount = sz * secsize; 1440 } 1441 1442 return 1; 1443 } 1444 1445 /* 1446 * Disk error is the preface to plaintive error messages 1447 * about failing disk transfers. It prints messages of the form 1448 1449 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 1450 1451 * if the offset of the error in the transfer and a disk label 1452 * are both available. blkdone should be -1 if the position of the error 1453 * is unknown; the disklabel pointer may be null from drivers that have not 1454 * been converted to use them. The message is printed with kprintf 1455 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. 1456 * The message should be completed (with at least a newline) with kprintf 1457 * or log(-1, ...), respectively. There is no trailing space. 1458 */ 1459 void 1460 diskerr(struct bio *bio, cdev_t dev, const char *what, int pri, int donecnt) 1461 { 1462 struct buf *bp = bio->bio_buf; 1463 const char *term; 1464 1465 switch(bp->b_cmd) { 1466 case BUF_CMD_READ: 1467 term = "read"; 1468 break; 1469 case BUF_CMD_WRITE: 1470 term = "write"; 1471 break; 1472 default: 1473 term = "access"; 1474 break; 1475 } 1476 kprintf("%s: %s %sing ", dev->si_name, what, term); 1477 kprintf("offset %012llx for %d", 1478 (long long)bio->bio_offset, 1479 bp->b_bcount); 1480 1481 if (donecnt) 1482 kprintf(" (%d bytes completed)", donecnt); 1483 } 1484 1485 /* 1486 * Locate a disk device 1487 */ 1488 cdev_t 1489 disk_locate(const char *devname) 1490 { 1491 return devfs_find_device_by_name("%s", devname); 1492 } 1493 1494 void 1495 disk_config(void *arg) 1496 { 1497 disk_msg_send_sync(DISK_SYNC, NULL, NULL); 1498 } 1499 1500 static void 1501 disk_init(void) 1502 { 1503 struct thread* td_core; 1504 1505 disk_msg_cache = objcache_create("disk-msg-cache", 0, 0, 1506 NULL, NULL, NULL, 1507 objcache_malloc_alloc, 1508 objcache_malloc_free, 1509 &disk_msg_malloc_args); 1510 1511 lwkt_token_init(&disklist_token, "disks"); 1512 lwkt_token_init(&ds_token, "ds"); 1513 1514 /* 1515 * Initialize the reply-only port which acts as a message drain 1516 */ 1517 lwkt_initport_replyonly(&disk_dispose_port, disk_msg_autofree_reply); 1518 1519 lwkt_gettoken(&disklist_token); 1520 lwkt_create(disk_msg_core, /*args*/NULL, &td_core, NULL, 1521 0, -1, "disk_msg_core"); 1522 tsleep(td_core, 0, "diskcore", 0); 1523 lwkt_reltoken(&disklist_token); 1524 } 1525 1526 static void 1527 disk_uninit(void) 1528 { 1529 objcache_destroy(disk_msg_cache); 1530 } 1531 1532 /* 1533 * Clean out illegal characters in serial numbers. 1534 */ 1535 static void 1536 disk_cleanserial(char *serno) 1537 { 1538 char c; 1539 1540 while ((c = *serno) != 0) { 1541 if (c >= 'a' && c <= 'z') 1542 ; 1543 else if (c >= 'A' && c <= 'Z') 1544 ; 1545 else if (c >= '0' && c <= '9') 1546 ; 1547 else if (c == '-' || c == '@' || c == '+' || c == '.') 1548 ; 1549 else 1550 c = '_'; 1551 *serno++= c; 1552 } 1553 } 1554 1555 TUNABLE_INT("kern.disk_debug", &disk_debug_enable); 1556 SYSCTL_INT(_kern, OID_AUTO, disk_debug, CTLFLAG_RW, &disk_debug_enable, 1557 0, "Enable subr_disk debugging"); 1558 1559 SYSINIT(disk_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, disk_init, NULL); 1560 SYSUNINIT(disk_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, disk_uninit, NULL); 1561