1 /* 2 * Copyright (c) 2003,2004,2009 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * and Alex Hornung <ahornung@gmail.com> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * ---------------------------------------------------------------------------- 36 * "THE BEER-WARE LICENSE" (Revision 42): 37 * <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you 38 * can do whatever you want with this stuff. If we meet some day, and you think 39 * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp 40 * ---------------------------------------------------------------------------- 41 * 42 * Copyright (c) 1982, 1986, 1988, 1993 43 * The Regents of the University of California. All rights reserved. 44 * (c) UNIX System Laboratories, Inc. 45 * All or some portions of this file are derived from material licensed 46 * to the University of California by American Telephone and Telegraph 47 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 48 * the permission of UNIX System Laboratories, Inc. 49 * 50 * Redistribution and use in source and binary forms, with or without 51 * modification, are permitted provided that the following conditions 52 * are met: 53 * 1. Redistributions of source code must retain the above copyright 54 * notice, this list of conditions and the following disclaimer. 55 * 2. Redistributions in binary form must reproduce the above copyright 56 * notice, this list of conditions and the following disclaimer in the 57 * documentation and/or other materials provided with the distribution. 58 * 3. Neither the name of the University nor the names of its contributors 59 * may be used to endorse or promote products derived from this software 60 * without specific prior written permission. 61 * 62 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 65 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 72 * SUCH DAMAGE. 73 * 74 * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 75 * $FreeBSD: src/sys/kern/subr_disk.c,v 1.20.2.6 2001/10/05 07:14:57 peter Exp $ 76 * $FreeBSD: src/sys/ufs/ufs/ufs_disksubr.c,v 1.44.2.3 2001/03/05 05:42:19 obrien Exp $ 77 */ 78 79 #include <sys/param.h> 80 #include <sys/systm.h> 81 #include <sys/kernel.h> 82 #include <sys/proc.h> 83 #include <sys/sysctl.h> 84 #include <sys/buf.h> 85 #include <sys/conf.h> 86 #include <sys/disklabel.h> 87 #include <sys/disklabel32.h> 88 #include <sys/disklabel64.h> 89 #include <sys/diskslice.h> 90 #include <sys/diskmbr.h> 91 #include <sys/disk.h> 92 #include <sys/kerneldump.h> 93 #include <sys/malloc.h> 94 #include <machine/md_var.h> 95 #include <sys/ctype.h> 96 #include <sys/syslog.h> 97 #include <sys/device.h> 98 #include <sys/msgport.h> 99 #include <sys/devfs.h> 100 #include <sys/thread.h> 101 #include <sys/dsched.h> 102 #include <sys/queue.h> 103 #include <sys/lock.h> 104 #include <sys/udev.h> 105 #include <sys/uuid.h> 106 107 #include <sys/buf2.h> 108 #include <sys/msgport2.h> 109 110 static MALLOC_DEFINE(M_DISK, "disk", "disk data"); 111 static int disk_debug_enable = 0; 112 113 static void disk_msg_autofree_reply(lwkt_port_t, lwkt_msg_t); 114 static void disk_msg_core(void *); 115 static int disk_probe_slice(struct disk *dp, cdev_t dev, int slice, int reprobe); 116 static void disk_probe(struct disk *dp, int reprobe); 117 static void _setdiskinfo(struct disk *disk, struct disk_info *info); 118 static void bioqwritereorder(struct bio_queue_head *bioq); 119 static void disk_cleanserial(char *serno); 120 static int disk_debug(int, char *, ...) __printflike(2, 3); 121 static cdev_t _disk_create_named(const char *name, int unit, struct disk *dp, 122 struct dev_ops *raw_ops, int clone); 123 124 static d_open_t diskopen; 125 static d_close_t diskclose; 126 static d_ioctl_t diskioctl; 127 static d_strategy_t diskstrategy; 128 static d_psize_t diskpsize; 129 static d_dump_t diskdump; 130 131 static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist); 132 static struct lwkt_token disklist_token; 133 static struct lwkt_token ds_token; 134 135 static struct dev_ops disk1_ops = { 136 { "disk", 0, D_DISK | D_MPSAFE | D_TRACKCLOSE | D_KVABIO }, 137 .d_open = diskopen, 138 .d_close = diskclose, 139 .d_read = physread, 140 .d_write = physwrite, 141 .d_ioctl = diskioctl, 142 .d_strategy = diskstrategy, 143 .d_dump = diskdump, 144 .d_psize = diskpsize, 145 }; 146 147 static struct dev_ops disk2_ops = { 148 { "disk", 0, D_DISK | D_MPSAFE | D_TRACKCLOSE | D_KVABIO | 149 D_NOEMERGPGR }, 150 .d_open = diskopen, 151 .d_close = diskclose, 152 .d_read = physread, 153 .d_write = physwrite, 154 .d_ioctl = diskioctl, 155 .d_strategy = diskstrategy, 156 .d_dump = diskdump, 157 .d_psize = diskpsize, 158 }; 159 160 static struct objcache *disk_msg_cache; 161 162 struct objcache_malloc_args disk_msg_malloc_args = { 163 sizeof(struct disk_msg), M_DISK }; 164 165 static struct lwkt_port disk_dispose_port; 166 static struct lwkt_port disk_msg_port; 167 168 static int 169 disk_debug(int level, char *fmt, ...) 170 { 171 __va_list ap; 172 173 __va_start(ap, fmt); 174 if (level <= disk_debug_enable) 175 kvprintf(fmt, ap); 176 __va_end(ap); 177 178 return 0; 179 } 180 181 static int 182 disk_probe_slice(struct disk *dp, cdev_t dev, int slice, int reprobe) 183 { 184 struct disk_info *info = &dp->d_info; 185 struct diskslice *sp = &dp->d_slice->dss_slices[slice]; 186 disklabel_ops_t ops; 187 struct dev_ops *dops; 188 struct partinfo part; 189 const char *msg; 190 char uuid_buf[128]; 191 cdev_t ndev; 192 int sno; 193 u_int i; 194 195 disk_debug(2, "disk_probe_slice (begin): %s (%s)\n", 196 dev->si_name, dp->d_cdev->si_name); 197 198 sno = slice ? slice - 1 : 0; 199 dops = (dp->d_rawdev->si_ops->head.flags & D_NOEMERGPGR) ? 200 &disk2_ops : &disk1_ops; 201 202 ops = &disklabel32_ops; 203 msg = ops->op_readdisklabel(dev, sp, &sp->ds_label, info); 204 if (msg && !strcmp(msg, "no disk label")) { 205 ops = &disklabel64_ops; 206 msg = ops->op_readdisklabel(dev, sp, &sp->ds_label, info); 207 } 208 209 if (msg == NULL) { 210 char packname[DISKLABEL_MAXPACKNAME]; 211 212 if (slice != WHOLE_DISK_SLICE) 213 ops->op_adjust_label_reserved(dp->d_slice, slice, sp); 214 else 215 sp->ds_reserved = 0; 216 217 ops->op_getpackname(sp->ds_label, packname, sizeof(packname)); 218 219 destroy_dev_alias(dev, "by-label/*"); 220 if (packname[0]) 221 make_dev_alias(dev, "by-label/%s", packname); 222 223 sp->ds_ops = ops; 224 for (i = 0; i < ops->op_getnumparts(sp->ds_label); i++) { 225 ops->op_loadpartinfo(sp->ds_label, i, &part); 226 227 if (part.fstype) { 228 if (reprobe && 229 (ndev = devfs_find_device_by_name("%s%c", 230 dev->si_name, 'a' + i)) 231 ) { 232 /* 233 * Device already exists and 234 * is still valid. 235 */ 236 ndev->si_flags |= SI_REPROBE_TEST; 237 238 /* 239 * Destroy old UUID alias 240 */ 241 destroy_dev_alias(ndev, 242 "part-by-uuid/*"); 243 destroy_dev_alias(ndev, 244 "part-by-label/*"); 245 246 /* Create UUID alias */ 247 if (!kuuid_is_nil(&part.storage_uuid)) { 248 snprintf_uuid(uuid_buf, 249 sizeof(uuid_buf), 250 &part.storage_uuid); 251 make_dev_alias(ndev, 252 "part-by-uuid/%s", 253 uuid_buf); 254 udev_dict_set_cstr(ndev, "uuid", uuid_buf); 255 } 256 if (packname[0]) { 257 make_dev_alias(ndev, 258 "part-by-label/%s.%c", 259 packname, 'a' + i); 260 } 261 } else { 262 ndev = make_dev_covering(dops, 263 dp->d_rawdev->si_ops, 264 dkmakeminor(dkunit(dp->d_cdev), 265 slice, i), 266 UID_ROOT, GID_OPERATOR, 0640, 267 "%s%c", dev->si_name, 'a'+ i); 268 ndev->si_parent = dev; 269 ndev->si_iosize_max = dev->si_iosize_max; 270 ndev->si_disk = dp; 271 udev_dict_set_cstr(ndev, "subsystem", "disk"); 272 /* Inherit parent's disk type */ 273 if (dp->d_disktype) { 274 udev_dict_set_cstr(ndev, "disk-type", 275 __DECONST(char *, dp->d_disktype)); 276 } 277 278 /* Create serno alias */ 279 if (dp->d_info.d_serialno) { 280 make_dev_alias(ndev, 281 "serno/%s.s%d%c", 282 dp->d_info.d_serialno, 283 sno, 'a' + i); 284 } 285 286 /* Create UUID alias */ 287 if (!kuuid_is_nil(&part.storage_uuid)) { 288 snprintf_uuid(uuid_buf, 289 sizeof(uuid_buf), 290 &part.storage_uuid); 291 make_dev_alias(ndev, 292 "part-by-uuid/%s", 293 uuid_buf); 294 udev_dict_set_cstr(ndev, "uuid", uuid_buf); 295 } 296 if (packname[0]) { 297 make_dev_alias(ndev, 298 "part-by-label/%s.%c", 299 packname, 'a' + i); 300 } 301 ndev->si_flags |= SI_REPROBE_TEST; 302 } 303 } 304 } 305 } else if (info->d_dsflags & DSO_COMPATLABEL) { 306 msg = NULL; 307 if (sp->ds_size >= 0x100000000ULL) 308 ops = &disklabel64_ops; 309 else 310 ops = &disklabel32_ops; 311 sp->ds_label = ops->op_clone_label(info, sp); 312 } else { 313 if (sp->ds_type == DOSPTYP_386BSD || /* XXX */ 314 sp->ds_type == DOSPTYP_NETBSD || 315 sp->ds_type == DOSPTYP_OPENBSD || 316 sp->ds_type == DOSPTYP_DFLYBSD) { 317 log(LOG_WARNING, "%s: cannot find label (%s)\n", 318 dev->si_name, msg); 319 } 320 321 if (sp->ds_label.opaque != NULL && sp->ds_ops != NULL) { 322 /* Clear out old label - it's not around anymore */ 323 disk_debug(2, 324 "disk_probe_slice: clear out old diskabel on %s\n", 325 dev->si_name); 326 327 sp->ds_ops->op_freedisklabel(&sp->ds_label); 328 sp->ds_ops = NULL; 329 } 330 } 331 332 if (msg == NULL) { 333 sp->ds_wlabel = FALSE; 334 } 335 336 return (msg ? EINVAL : 0); 337 } 338 339 /* 340 * This routine is only called for newly minted drives or to reprobe 341 * a drive with no open slices. disk_probe_slice() is called directly 342 * when reprobing partition changes within slices. 343 */ 344 static void 345 disk_probe(struct disk *dp, int reprobe) 346 { 347 struct disk_info *info = &dp->d_info; 348 cdev_t dev = dp->d_cdev; 349 cdev_t ndev; 350 int error, i, sno; 351 struct diskslices *osp; 352 struct diskslice *sp; 353 struct dev_ops *dops; 354 char uuid_buf[128]; 355 356 /* 357 * d_media_blksize can be 0 for non-disk storage devices such 358 * as audio CDs. 359 */ 360 if (info->d_media_blksize == 0) 361 return; 362 363 osp = dp->d_slice; 364 dp->d_slice = dsmakeslicestruct(BASE_SLICE, info); 365 disk_debug(1, "disk_probe (begin): %s\n", dp->d_cdev->si_name); 366 367 error = mbrinit(dev, info, &(dp->d_slice)); 368 if (error) { 369 dsgone(&osp); 370 return; 371 } 372 373 dops = (dp->d_rawdev->si_ops->head.flags & D_NOEMERGPGR) ? 374 &disk2_ops : &disk1_ops; 375 376 for (i = 0; i < dp->d_slice->dss_nslices; i++) { 377 /* 378 * Ignore the whole-disk slice, it has already been created. 379 */ 380 if (i == WHOLE_DISK_SLICE) 381 continue; 382 383 #if 1 384 /* 385 * Ignore the compatibility slice s0 if it's a device mapper 386 * volume. 387 */ 388 if ((i == COMPATIBILITY_SLICE) && 389 (info->d_dsflags & DSO_DEVICEMAPPER)) 390 continue; 391 #endif 392 393 sp = &dp->d_slice->dss_slices[i]; 394 395 /* 396 * Handle s0. s0 is a compatibility slice if there are no 397 * other slices and it has not otherwise been set up, else 398 * we ignore it. 399 */ 400 if (i == COMPATIBILITY_SLICE) { 401 sno = 0; 402 if (sp->ds_type == 0 && 403 dp->d_slice->dss_nslices == BASE_SLICE) { 404 sp->ds_size = info->d_media_blocks; 405 sp->ds_reserved = 0; 406 } 407 } else { 408 sno = i - 1; 409 sp->ds_reserved = 0; 410 } 411 412 /* 413 * Ignore 0-length slices 414 */ 415 if (sp->ds_size == 0) 416 continue; 417 418 if (reprobe && 419 (ndev = devfs_find_device_by_name("%ss%d", 420 dev->si_name, sno))) { 421 /* 422 * Device already exists and is still valid 423 */ 424 ndev->si_flags |= SI_REPROBE_TEST; 425 426 /* 427 * Destroy old UUID alias 428 */ 429 destroy_dev_alias(ndev, "slice-by-uuid/*"); 430 431 /* Create UUID alias */ 432 if (!kuuid_is_nil(&sp->ds_stor_uuid)) { 433 snprintf_uuid(uuid_buf, sizeof(uuid_buf), 434 &sp->ds_stor_uuid); 435 make_dev_alias(ndev, "slice-by-uuid/%s", 436 uuid_buf); 437 } 438 } else { 439 /* 440 * Else create new device 441 */ 442 ndev = make_dev_covering(dops, dp->d_rawdev->si_ops, 443 dkmakewholeslice(dkunit(dev), i), 444 UID_ROOT, GID_OPERATOR, 0640, 445 (info->d_dsflags & DSO_DEVICEMAPPER)? 446 "%s.s%d" : "%ss%d", dev->si_name, sno); 447 ndev->si_parent = dev; 448 ndev->si_iosize_max = dev->si_iosize_max; 449 udev_dict_set_cstr(ndev, "subsystem", "disk"); 450 /* Inherit parent's disk type */ 451 if (dp->d_disktype) { 452 udev_dict_set_cstr(ndev, "disk-type", 453 __DECONST(char *, dp->d_disktype)); 454 } 455 456 /* Create serno alias */ 457 if (dp->d_info.d_serialno) { 458 make_dev_alias(ndev, "serno/%s.s%d", 459 dp->d_info.d_serialno, sno); 460 } 461 462 /* Create UUID alias */ 463 if (!kuuid_is_nil(&sp->ds_stor_uuid)) { 464 snprintf_uuid(uuid_buf, sizeof(uuid_buf), 465 &sp->ds_stor_uuid); 466 make_dev_alias(ndev, "slice-by-uuid/%s", 467 uuid_buf); 468 } 469 470 ndev->si_disk = dp; 471 ndev->si_flags |= SI_REPROBE_TEST; 472 } 473 sp->ds_dev = ndev; 474 475 /* 476 * Probe appropriate slices for a disklabel 477 * 478 * XXX slice type 1 used by our gpt probe code. 479 * XXX slice type 0 used by mbr compat slice. 480 */ 481 if (sp->ds_type == DOSPTYP_386BSD || 482 sp->ds_type == DOSPTYP_NETBSD || 483 sp->ds_type == DOSPTYP_OPENBSD || 484 sp->ds_type == DOSPTYP_DFLYBSD || 485 sp->ds_type == 0 || 486 sp->ds_type == 1) { 487 if (dp->d_slice->dss_first_bsd_slice == 0) 488 dp->d_slice->dss_first_bsd_slice = i; 489 disk_probe_slice(dp, ndev, i, reprobe); 490 } 491 } 492 dsgone(&osp); 493 disk_debug(1, "disk_probe (end): %s\n", dp->d_cdev->si_name); 494 } 495 496 497 static void 498 disk_msg_core(void *arg) 499 { 500 struct disk *dp; 501 struct diskslice *sp; 502 disk_msg_t msg; 503 int run; 504 505 lwkt_gettoken(&disklist_token); 506 lwkt_initport_thread(&disk_msg_port, curthread); 507 wakeup(curthread); /* synchronous startup */ 508 lwkt_reltoken(&disklist_token); 509 510 lwkt_gettoken(&ds_token); 511 run = 1; 512 513 while (run) { 514 msg = (disk_msg_t)lwkt_waitport(&disk_msg_port, 0); 515 516 switch (msg->hdr.u.ms_result) { 517 case DISK_DISK_PROBE: 518 dp = (struct disk *)msg->load; 519 disk_debug(1, 520 "DISK_DISK_PROBE: %s\n", 521 dp->d_cdev->si_name); 522 disk_iocom_update(dp); 523 disk_probe(dp, 0); 524 break; 525 case DISK_DISK_DESTROY: 526 dp = (struct disk *)msg->load; 527 disk_debug(1, 528 "DISK_DISK_DESTROY: %s\n", 529 dp->d_cdev->si_name); 530 disk_iocom_uninit(dp); 531 532 /* 533 * Interlock against struct disk enumerations. 534 * Wait for enumerations to complete then remove 535 * the dp from the list before tearing it down. 536 * This avoids numerous races. 537 */ 538 lwkt_gettoken(&disklist_token); 539 while (dp->d_refs) 540 tsleep(&dp->d_refs, 0, "diskdel", hz / 10); 541 LIST_REMOVE(dp, d_list); 542 543 dsched_disk_destroy(dp); 544 devfs_destroy_related(dp->d_cdev); 545 destroy_dev(dp->d_cdev); 546 destroy_only_dev(dp->d_rawdev); 547 548 lwkt_reltoken(&disklist_token); 549 550 if (dp->d_info.d_serialno) { 551 kfree(dp->d_info.d_serialno, M_TEMP); 552 dp->d_info.d_serialno = NULL; 553 } 554 break; 555 case DISK_UNPROBE: 556 dp = (struct disk *)msg->load; 557 disk_debug(1, 558 "DISK_DISK_UNPROBE: %s\n", 559 dp->d_cdev->si_name); 560 devfs_destroy_related(dp->d_cdev); 561 break; 562 case DISK_SLICE_REPROBE: 563 dp = (struct disk *)msg->load; 564 sp = (struct diskslice *)msg->load2; 565 devfs_clr_related_flag(sp->ds_dev, 566 SI_REPROBE_TEST); 567 disk_debug(1, 568 "DISK_SLICE_REPROBE: %s\n", 569 sp->ds_dev->si_name); 570 disk_probe_slice(dp, sp->ds_dev, 571 dkslice(sp->ds_dev), 1); 572 devfs_destroy_related_without_flag( 573 sp->ds_dev, SI_REPROBE_TEST); 574 break; 575 case DISK_DISK_REPROBE: 576 dp = (struct disk *)msg->load; 577 devfs_clr_related_flag(dp->d_cdev, SI_REPROBE_TEST); 578 disk_debug(1, 579 "DISK_DISK_REPROBE: %s\n", 580 dp->d_cdev->si_name); 581 disk_probe(dp, 1); 582 devfs_destroy_related_without_flag( 583 dp->d_cdev, SI_REPROBE_TEST); 584 break; 585 case DISK_SYNC: 586 disk_debug(1, "DISK_SYNC\n"); 587 break; 588 default: 589 devfs_debug(DEVFS_DEBUG_WARNING, 590 "disk_msg_core: unknown message " 591 "received at core\n"); 592 break; 593 } 594 lwkt_replymsg(&msg->hdr, 0); 595 } 596 lwkt_reltoken(&ds_token); 597 lwkt_exit(); 598 } 599 600 601 /* 602 * Acts as a message drain. Any message that is replied to here gets 603 * destroyed and the memory freed. 604 */ 605 static void 606 disk_msg_autofree_reply(lwkt_port_t port, lwkt_msg_t msg) 607 { 608 objcache_put(disk_msg_cache, msg); 609 } 610 611 612 void 613 disk_msg_send(uint32_t cmd, void *load, void *load2) 614 { 615 disk_msg_t disk_msg; 616 lwkt_port_t port = &disk_msg_port; 617 618 disk_msg = objcache_get(disk_msg_cache, M_WAITOK); 619 620 lwkt_initmsg(&disk_msg->hdr, &disk_dispose_port, 0); 621 622 disk_msg->hdr.u.ms_result = cmd; 623 disk_msg->load = load; 624 disk_msg->load2 = load2; 625 KKASSERT(port); 626 lwkt_sendmsg(port, &disk_msg->hdr); 627 } 628 629 void 630 disk_msg_send_sync(uint32_t cmd, void *load, void *load2) 631 { 632 struct lwkt_port rep_port; 633 disk_msg_t disk_msg; 634 lwkt_port_t port; 635 636 disk_msg = objcache_get(disk_msg_cache, M_WAITOK); 637 port = &disk_msg_port; 638 639 /* XXX could probably use curthread's built-in msgport */ 640 lwkt_initport_thread(&rep_port, curthread); 641 lwkt_initmsg(&disk_msg->hdr, &rep_port, 0); 642 643 disk_msg->hdr.u.ms_result = cmd; 644 disk_msg->load = load; 645 disk_msg->load2 = load2; 646 647 lwkt_domsg(port, &disk_msg->hdr, 0); 648 objcache_put(disk_msg_cache, disk_msg); 649 } 650 651 /* 652 * Create a raw device for the dev_ops template (which is returned). Also 653 * create a slice and unit managed disk and overload the user visible 654 * device space with it. 655 * 656 * NOTE: The returned raw device is NOT a slice and unit managed device. 657 * It is an actual raw device representing the raw disk as specified by 658 * the passed dev_ops. The disk layer not only returns such a raw device, 659 * it also uses it internally when passing (modified) commands through. 660 */ 661 cdev_t 662 disk_create(int unit, struct disk *dp, struct dev_ops *raw_ops) 663 { 664 return _disk_create_named(NULL, unit, dp, raw_ops, 0); 665 } 666 667 cdev_t 668 disk_create_clone(int unit, struct disk *dp, 669 struct dev_ops *raw_ops) 670 { 671 return _disk_create_named(NULL, unit, dp, raw_ops, 1); 672 } 673 674 cdev_t 675 disk_create_named(const char *name, int unit, struct disk *dp, 676 struct dev_ops *raw_ops) 677 { 678 return _disk_create_named(name, unit, dp, raw_ops, 0); 679 } 680 681 cdev_t 682 disk_create_named_clone(const char *name, int unit, struct disk *dp, 683 struct dev_ops *raw_ops) 684 { 685 return _disk_create_named(name, unit, dp, raw_ops, 1); 686 } 687 688 static cdev_t 689 _disk_create_named(const char *name, int unit, struct disk *dp, 690 struct dev_ops *raw_ops, int clone) 691 { 692 cdev_t rawdev; 693 struct dev_ops *dops; 694 695 disk_debug(1, "disk_create (begin): %s%d\n", name, unit); 696 697 if (name) { 698 rawdev = make_only_dev(raw_ops, dkmakewholedisk(unit), 699 UID_ROOT, GID_OPERATOR, 0640, "%s", name); 700 } else { 701 rawdev = make_only_dev(raw_ops, dkmakewholedisk(unit), 702 UID_ROOT, GID_OPERATOR, 0640, 703 "%s%d", raw_ops->head.name, unit); 704 } 705 706 bzero(dp, sizeof(*dp)); 707 708 dops = (raw_ops->head.flags & D_NOEMERGPGR) ? &disk2_ops : &disk1_ops; 709 710 dp->d_rawdev = rawdev; 711 dp->d_raw_ops = raw_ops; 712 dp->d_dev_ops = dops; 713 714 if (name) { 715 if (clone) { 716 dp->d_cdev = make_only_dev_covering( 717 dops, dp->d_rawdev->si_ops, 718 dkmakewholedisk(unit), 719 UID_ROOT, GID_OPERATOR, 0640, 720 "%s", name); 721 } else { 722 dp->d_cdev = make_dev_covering( 723 dops, dp->d_rawdev->si_ops, 724 dkmakewholedisk(unit), 725 UID_ROOT, GID_OPERATOR, 0640, 726 "%s", name); 727 } 728 } else { 729 if (clone) { 730 dp->d_cdev = make_only_dev_covering( 731 dops, dp->d_rawdev->si_ops, 732 dkmakewholedisk(unit), 733 UID_ROOT, GID_OPERATOR, 0640, 734 "%s%d", raw_ops->head.name, unit); 735 } else { 736 dp->d_cdev = make_dev_covering( 737 dops, dp->d_rawdev->si_ops, 738 dkmakewholedisk(unit), 739 UID_ROOT, GID_OPERATOR, 0640, 740 "%s%d", raw_ops->head.name, unit); 741 } 742 } 743 744 udev_dict_set_cstr(dp->d_cdev, "subsystem", "disk"); 745 dp->d_cdev->si_disk = dp; 746 747 if (name) 748 dsched_disk_create(dp, name, unit); 749 else 750 dsched_disk_create(dp, raw_ops->head.name, unit); 751 752 lwkt_gettoken(&disklist_token); 753 LIST_INSERT_HEAD(&disklist, dp, d_list); 754 lwkt_reltoken(&disklist_token); 755 756 disk_iocom_init(dp); 757 758 disk_debug(1, "disk_create (end): %s%d\n", 759 (name != NULL)?(name):(raw_ops->head.name), unit); 760 761 return (dp->d_rawdev); 762 } 763 764 int 765 disk_setdisktype(struct disk *disk, const char *type) 766 { 767 int error; 768 769 KKASSERT(disk != NULL); 770 771 disk->d_disktype = type; 772 error = udev_dict_set_cstr(disk->d_cdev, "disk-type", 773 __DECONST(char *, type)); 774 return error; 775 } 776 777 int 778 disk_getopencount(struct disk *disk) 779 { 780 return disk->d_opencount; 781 } 782 783 static void 784 _setdiskinfo(struct disk *disk, struct disk_info *info) 785 { 786 char *oldserialno; 787 788 oldserialno = disk->d_info.d_serialno; 789 bcopy(info, &disk->d_info, sizeof(disk->d_info)); 790 info = &disk->d_info; 791 792 disk_debug(1, "_setdiskinfo: %s\n", disk->d_cdev->si_name); 793 794 /* 795 * The serial number is duplicated so the caller can throw 796 * their copy away. 797 */ 798 if (info->d_serialno && info->d_serialno[0] && 799 (info->d_serialno[0] != ' ' || strlen(info->d_serialno) > 1)) { 800 info->d_serialno = kstrdup(info->d_serialno, M_TEMP); 801 disk_cleanserial(info->d_serialno); 802 if (disk->d_cdev) { 803 make_dev_alias(disk->d_cdev, "serno/%s", 804 info->d_serialno); 805 } 806 } else { 807 info->d_serialno = NULL; 808 } 809 if (oldserialno) 810 kfree(oldserialno, M_TEMP); 811 812 dsched_disk_update(disk, info); 813 814 /* 815 * The caller may set d_media_size or d_media_blocks and we 816 * calculate the other. 817 */ 818 KKASSERT(info->d_media_size == 0 || info->d_media_blocks == 0); 819 if (info->d_media_size == 0 && info->d_media_blocks) { 820 info->d_media_size = (u_int64_t)info->d_media_blocks * 821 info->d_media_blksize; 822 } else if (info->d_media_size && info->d_media_blocks == 0 && 823 info->d_media_blksize) { 824 info->d_media_blocks = info->d_media_size / 825 info->d_media_blksize; 826 } 827 828 /* 829 * The si_* fields for rawdev are not set until after the 830 * disk_create() call, so someone using the cooked version 831 * of the raw device (i.e. da0s0) will not get the right 832 * si_iosize_max unless we fix it up here. 833 */ 834 if (disk->d_cdev && disk->d_rawdev && 835 disk->d_cdev->si_iosize_max == 0) { 836 disk->d_cdev->si_iosize_max = disk->d_rawdev->si_iosize_max; 837 disk->d_cdev->si_bsize_phys = disk->d_rawdev->si_bsize_phys; 838 disk->d_cdev->si_bsize_best = disk->d_rawdev->si_bsize_best; 839 } 840 841 /* Add the serial number to the udev_dictionary */ 842 if (info->d_serialno) 843 udev_dict_set_cstr(disk->d_cdev, "serno", info->d_serialno); 844 } 845 846 /* 847 * Disk drivers must call this routine when media parameters are available 848 * or have changed. 849 */ 850 void 851 disk_setdiskinfo(struct disk *disk, struct disk_info *info) 852 { 853 _setdiskinfo(disk, info); 854 disk_msg_send(DISK_DISK_PROBE, disk, NULL); 855 disk_debug(1, "disk_setdiskinfo: sent probe for %s\n", 856 disk->d_cdev->si_name); 857 } 858 859 void 860 disk_setdiskinfo_sync(struct disk *disk, struct disk_info *info) 861 { 862 _setdiskinfo(disk, info); 863 disk_msg_send_sync(DISK_DISK_PROBE, disk, NULL); 864 disk_debug(1, "disk_setdiskinfo_sync: sent probe for %s\n", 865 disk->d_cdev->si_name); 866 } 867 868 /* 869 * This routine is called when an adapter detaches. The higher level 870 * managed disk device is destroyed while the lower level raw device is 871 * released. 872 */ 873 void 874 disk_destroy(struct disk *disk) 875 { 876 disk_msg_send_sync(DISK_DISK_DESTROY, disk, NULL); 877 return; 878 } 879 880 int 881 disk_dumpcheck(cdev_t dev, u_int64_t *size, 882 u_int64_t *blkno, u_int32_t *secsize) 883 { 884 struct partinfo pinfo; 885 int error; 886 887 if (size) 888 *size = 0; /* avoid gcc warnings */ 889 if (secsize) 890 *secsize = 512; /* avoid gcc warnings */ 891 bzero(&pinfo, sizeof(pinfo)); 892 893 error = dev_dioctl(dev, DIOCGPART, (void *)&pinfo, 0, 894 proc0.p_ucred, NULL, NULL); 895 if (error) 896 return (error); 897 898 if (pinfo.media_blksize == 0) 899 return (ENXIO); 900 901 if (blkno) /* XXX: make sure this reserved stuff is right */ 902 *blkno = pinfo.reserved_blocks + 903 pinfo.media_offset / pinfo.media_blksize; 904 if (secsize) 905 *secsize = pinfo.media_blksize; 906 if (size) 907 *size = (pinfo.media_blocks - pinfo.reserved_blocks); 908 909 return (0); 910 } 911 912 int 913 disk_dumpconf(cdev_t dev, u_int onoff) 914 { 915 struct dumperinfo di; 916 u_int64_t size, blkno; 917 u_int32_t secsize; 918 int error; 919 920 if (!onoff) 921 return set_dumper(NULL); 922 923 error = disk_dumpcheck(dev, &size, &blkno, &secsize); 924 925 if (error) 926 return ENXIO; 927 928 bzero(&di, sizeof(struct dumperinfo)); 929 di.dumper = diskdump; 930 di.priv = dev; 931 di.blocksize = secsize; 932 di.maxiosize = dev->si_iosize_max; 933 di.mediaoffset = blkno * DEV_BSIZE; 934 di.mediasize = size * DEV_BSIZE; 935 936 return set_dumper(&di); 937 } 938 939 void 940 disk_unprobe(struct disk *disk) 941 { 942 if (disk == NULL) 943 return; 944 945 disk_msg_send_sync(DISK_UNPROBE, disk, NULL); 946 } 947 948 void 949 disk_invalidate (struct disk *disk) 950 { 951 dsgone(&disk->d_slice); 952 } 953 954 /* 955 * Enumerate disks, pass a marker and an initial NULL dp to initialize, 956 * then loop with the previously returned dp. 957 * 958 * The returned dp will be referenced, preventing its destruction. When 959 * you pass the returned dp back into the loop the ref is dropped. 960 * 961 * WARNING: If terminating your loop early you must call 962 * disk_enumerate_stop(). 963 */ 964 struct disk * 965 disk_enumerate(struct disk *marker, struct disk *dp) 966 { 967 lwkt_gettoken(&disklist_token); 968 if (dp) { 969 --dp->d_refs; 970 dp = LIST_NEXT(marker, d_list); 971 LIST_REMOVE(marker, d_list); 972 } else { 973 bzero(marker, sizeof(*marker)); 974 marker->d_flags = DISKFLAG_MARKER; 975 dp = LIST_FIRST(&disklist); 976 } 977 while (dp) { 978 if ((dp->d_flags & DISKFLAG_MARKER) == 0) 979 break; 980 dp = LIST_NEXT(dp, d_list); 981 } 982 if (dp) { 983 ++dp->d_refs; 984 LIST_INSERT_AFTER(dp, marker, d_list); 985 } 986 lwkt_reltoken(&disklist_token); 987 return (dp); 988 } 989 990 /* 991 * Terminate an enumeration early. Do not call this function if the 992 * enumeration ended normally. dp can be NULL, indicating that you 993 * wish to retain the ref count on dp. 994 * 995 * This function removes the marker. 996 */ 997 void 998 disk_enumerate_stop(struct disk *marker, struct disk *dp) 999 { 1000 lwkt_gettoken(&disklist_token); 1001 LIST_REMOVE(marker, d_list); 1002 if (dp) 1003 --dp->d_refs; 1004 lwkt_reltoken(&disklist_token); 1005 } 1006 1007 static 1008 int 1009 sysctl_disks(SYSCTL_HANDLER_ARGS) 1010 { 1011 struct disk marker; 1012 struct disk *dp; 1013 int error, first; 1014 1015 first = 1; 1016 error = 0; 1017 dp = NULL; 1018 1019 while ((dp = disk_enumerate(&marker, dp))) { 1020 if (!first) { 1021 error = SYSCTL_OUT(req, " ", 1); 1022 if (error) { 1023 disk_enumerate_stop(&marker, dp); 1024 break; 1025 } 1026 } else { 1027 first = 0; 1028 } 1029 error = SYSCTL_OUT(req, dp->d_rawdev->si_name, 1030 strlen(dp->d_rawdev->si_name)); 1031 if (error) { 1032 disk_enumerate_stop(&marker, dp); 1033 break; 1034 } 1035 } 1036 if (error == 0) 1037 error = SYSCTL_OUT(req, "", 1); 1038 return error; 1039 } 1040 1041 SYSCTL_PROC(_kern, OID_AUTO, disks, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, 1042 sysctl_disks, "A", "names of available disks"); 1043 1044 /* 1045 * Open a disk device or partition. 1046 */ 1047 static 1048 int 1049 diskopen(struct dev_open_args *ap) 1050 { 1051 cdev_t dev = ap->a_head.a_dev; 1052 struct disk *dp; 1053 int error; 1054 1055 /* 1056 * dp can't be NULL here XXX. 1057 * 1058 * d_slice will be NULL if setdiskinfo() has not been called yet. 1059 * setdiskinfo() is typically called whether the disk is present 1060 * or not (e.g. CD), but the base disk device is created first 1061 * and there may be a race. 1062 */ 1063 dp = dev->si_disk; 1064 if (dp == NULL || dp->d_slice == NULL) 1065 return (ENXIO); 1066 error = 0; 1067 1068 /* 1069 * Deal with open races 1070 */ 1071 lwkt_gettoken(&ds_token); 1072 while (dp->d_flags & DISKFLAG_LOCK) { 1073 dp->d_flags |= DISKFLAG_WANTED; 1074 error = tsleep(dp, PCATCH, "diskopen", hz); 1075 if (error) { 1076 lwkt_reltoken(&ds_token); 1077 return (error); 1078 } 1079 } 1080 dp->d_flags |= DISKFLAG_LOCK; 1081 1082 /* 1083 * Open the underlying raw device. 1084 */ 1085 if (!dsisopen(dp->d_slice)) { 1086 #if 0 1087 if (!pdev->si_iosize_max) 1088 pdev->si_iosize_max = dev->si_iosize_max; 1089 #endif 1090 error = dev_dopen(dp->d_rawdev, ap->a_oflags, 1091 ap->a_devtype, ap->a_cred, NULL); 1092 } 1093 1094 if (error) 1095 goto out; 1096 error = dsopen(dev, ap->a_devtype, dp->d_info.d_dsflags, 1097 &dp->d_slice, &dp->d_info); 1098 if (!dsisopen(dp->d_slice)) { 1099 dev_dclose(dp->d_rawdev, ap->a_oflags, ap->a_devtype, NULL); 1100 } 1101 out: 1102 dp->d_flags &= ~DISKFLAG_LOCK; 1103 if (dp->d_flags & DISKFLAG_WANTED) { 1104 dp->d_flags &= ~DISKFLAG_WANTED; 1105 wakeup(dp); 1106 } 1107 lwkt_reltoken(&ds_token); 1108 1109 KKASSERT(dp->d_opencount >= 0); 1110 /* If the open was successful, bump open count */ 1111 if (error == 0) 1112 atomic_add_int(&dp->d_opencount, 1); 1113 1114 return(error); 1115 } 1116 1117 /* 1118 * Close a disk device or partition 1119 */ 1120 static 1121 int 1122 diskclose(struct dev_close_args *ap) 1123 { 1124 cdev_t dev = ap->a_head.a_dev; 1125 struct disk *dp; 1126 int error; 1127 int lcount; 1128 1129 error = 0; 1130 dp = dev->si_disk; 1131 1132 /* 1133 * The cdev_t represents the disk/slice/part. The shared 1134 * dp structure governs all cdevs associated with the disk. 1135 * 1136 * As a safety only close the underlying raw device on the last 1137 * close the disk device if our tracking of the slices/partitions 1138 * also indicates nothing is open. 1139 */ 1140 KKASSERT(dp->d_opencount >= 1); 1141 lcount = atomic_fetchadd_int(&dp->d_opencount, -1); 1142 1143 lwkt_gettoken(&ds_token); 1144 dsclose(dev, ap->a_devtype, dp->d_slice); 1145 if (lcount <= 1 && !dsisopen(dp->d_slice)) { 1146 error = dev_dclose(dp->d_rawdev, ap->a_fflag, ap->a_devtype, NULL); 1147 } 1148 lwkt_reltoken(&ds_token); 1149 1150 return (error); 1151 } 1152 1153 /* 1154 * First execute the ioctl on the disk device, and if it isn't supported 1155 * try running it on the backing device. 1156 */ 1157 static 1158 int 1159 diskioctl(struct dev_ioctl_args *ap) 1160 { 1161 cdev_t dev = ap->a_head.a_dev; 1162 struct disk *dp; 1163 int error; 1164 u_int u; 1165 1166 dp = dev->si_disk; 1167 if (dp == NULL) 1168 return (ENXIO); 1169 1170 devfs_debug(DEVFS_DEBUG_DEBUG, 1171 "diskioctl: cmd is: %lx (name: %s)\n", 1172 ap->a_cmd, dev->si_name); 1173 devfs_debug(DEVFS_DEBUG_DEBUG, 1174 "diskioctl: &dp->d_slice is: %p, %p\n", 1175 &dp->d_slice, dp->d_slice); 1176 1177 if (ap->a_cmd == DIOCGKERNELDUMP) { 1178 u = *(u_int *)ap->a_data; 1179 return disk_dumpconf(dev, u); 1180 } 1181 1182 if (ap->a_cmd == DIOCRECLUSTER && dev == dp->d_cdev) { 1183 error = disk_iocom_ioctl(dp, ap->a_cmd, ap->a_data); 1184 return error; 1185 } 1186 1187 if (&dp->d_slice == NULL || dp->d_slice == NULL || 1188 ((dp->d_info.d_dsflags & DSO_DEVICEMAPPER) && 1189 dkslice(dev) == WHOLE_DISK_SLICE)) { 1190 error = ENOIOCTL; 1191 } else { 1192 lwkt_gettoken(&ds_token); 1193 error = dsioctl(dev, ap->a_cmd, ap->a_data, ap->a_fflag, 1194 &dp->d_slice, &dp->d_info); 1195 lwkt_reltoken(&ds_token); 1196 } 1197 1198 if (error == ENOIOCTL) { 1199 error = dev_dioctl(dp->d_rawdev, ap->a_cmd, ap->a_data, 1200 ap->a_fflag, ap->a_cred, NULL, NULL); 1201 } 1202 return (error); 1203 } 1204 1205 /* 1206 * Execute strategy routine 1207 * 1208 * WARNING! We are using the KVABIO API and must not access memory 1209 * through bp->b_data without first calling bkvasync(bp). 1210 */ 1211 static 1212 int 1213 diskstrategy(struct dev_strategy_args *ap) 1214 { 1215 cdev_t dev = ap->a_head.a_dev; 1216 struct bio *bio = ap->a_bio; 1217 struct bio *nbio; 1218 struct disk *dp; 1219 1220 dp = dev->si_disk; 1221 1222 if (dp == NULL) { 1223 bio->bio_buf->b_error = ENXIO; 1224 bio->bio_buf->b_flags |= B_ERROR; 1225 biodone(bio); 1226 return(0); 1227 } 1228 KKASSERT(dev->si_disk == dp); 1229 1230 /* 1231 * The dscheck() function will also transform the slice relative 1232 * block number i.e. bio->bio_offset into a block number that can be 1233 * passed directly to the underlying raw device. If dscheck() 1234 * returns NULL it will have handled the bio for us (e.g. EOF 1235 * or error due to being beyond the device size). 1236 */ 1237 if ((nbio = dscheck(dev, bio, dp->d_slice)) != NULL) { 1238 dev_dstrategy(dp->d_rawdev, nbio); 1239 } else { 1240 biodone(bio); 1241 } 1242 return(0); 1243 } 1244 1245 /* 1246 * Return the partition size in ?blocks? 1247 */ 1248 static 1249 int 1250 diskpsize(struct dev_psize_args *ap) 1251 { 1252 cdev_t dev = ap->a_head.a_dev; 1253 struct disk *dp; 1254 1255 dp = dev->si_disk; 1256 if (dp == NULL) 1257 return(ENODEV); 1258 1259 ap->a_result = dssize(dev, &dp->d_slice); 1260 1261 if ((ap->a_result == -1) && 1262 (dp->d_info.d_dsflags & DSO_RAWPSIZE)) { 1263 ap->a_head.a_dev = dp->d_rawdev; 1264 return dev_doperate(&ap->a_head); 1265 } 1266 return(0); 1267 } 1268 1269 static int 1270 diskdump(struct dev_dump_args *ap) 1271 { 1272 cdev_t dev = ap->a_head.a_dev; 1273 struct disk *dp = dev->si_disk; 1274 u_int64_t size, offset; 1275 int error; 1276 1277 error = disk_dumpcheck(dev, &size, &ap->a_blkno, &ap->a_secsize); 1278 /* XXX: this should probably go in disk_dumpcheck somehow */ 1279 if (ap->a_length != 0) { 1280 size *= DEV_BSIZE; 1281 offset = ap->a_blkno * DEV_BSIZE; 1282 if ((ap->a_offset < offset) || 1283 (ap->a_offset + ap->a_length - offset > size)) { 1284 kprintf("Attempt to write outside dump " 1285 "device boundaries.\n"); 1286 error = ENOSPC; 1287 } 1288 } 1289 1290 if (error == 0) { 1291 ap->a_head.a_dev = dp->d_rawdev; 1292 error = dev_doperate(&ap->a_head); 1293 } 1294 1295 return(error); 1296 } 1297 1298 1299 SYSCTL_INT(_debug_sizeof, OID_AUTO, diskslices, CTLFLAG_RD, 1300 0, sizeof(struct diskslices), "sizeof(struct diskslices)"); 1301 1302 SYSCTL_INT(_debug_sizeof, OID_AUTO, disk, CTLFLAG_RD, 1303 0, sizeof(struct disk), "sizeof(struct disk)"); 1304 1305 /* 1306 * Reorder interval for burst write allowance and minor write 1307 * allowance. 1308 * 1309 * We always want to trickle some writes in to make use of the 1310 * disk's zone cache. Bursting occurs on a longer interval and only 1311 * runningbufspace is well over the hirunningspace limit. 1312 */ 1313 int bioq_reorder_burst_interval = 60; /* should be multiple of minor */ 1314 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_interval, 1315 CTLFLAG_RW, &bioq_reorder_burst_interval, 0, ""); 1316 int bioq_reorder_minor_interval = 5; 1317 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_interval, 1318 CTLFLAG_RW, &bioq_reorder_minor_interval, 0, ""); 1319 1320 int bioq_reorder_burst_bytes = 3000000; 1321 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_burst_bytes, 1322 CTLFLAG_RW, &bioq_reorder_burst_bytes, 0, ""); 1323 int bioq_reorder_minor_bytes = 262144; 1324 SYSCTL_INT(_kern, OID_AUTO, bioq_reorder_minor_bytes, 1325 CTLFLAG_RW, &bioq_reorder_minor_bytes, 0, ""); 1326 1327 1328 /* 1329 * Order I/Os. Generally speaking this code is designed to make better 1330 * use of drive zone caches. A drive zone cache can typically track linear 1331 * reads or writes for around 16 zones simultaniously. 1332 * 1333 * Read prioritization issues: It is possible for hundreds of megabytes worth 1334 * of writes to be queued asynchronously. This creates a huge bottleneck 1335 * for reads which reduce read bandwidth to a trickle. 1336 * 1337 * To solve this problem we generally reorder reads before writes. 1338 * 1339 * However, a large number of random reads can also starve writes and 1340 * make poor use of the drive zone cache so we allow writes to trickle 1341 * in every N reads. 1342 */ 1343 void 1344 bioqdisksort(struct bio_queue_head *bioq, struct bio *bio) 1345 { 1346 #if 0 1347 /* 1348 * The BIO wants to be ordered. Adding to the tail also 1349 * causes transition to be set to NULL, forcing the ordering 1350 * of all prior I/O's. 1351 */ 1352 if (bio->bio_buf->b_flags & B_ORDERED) { 1353 bioq_insert_tail(bioq, bio); 1354 return; 1355 } 1356 #endif 1357 1358 switch(bio->bio_buf->b_cmd) { 1359 case BUF_CMD_READ: 1360 if (bioq->transition) { 1361 /* 1362 * Insert before the first write. Bleedover writes 1363 * based on reorder intervals to prevent starvation. 1364 */ 1365 TAILQ_INSERT_BEFORE(bioq->transition, bio, bio_act); 1366 ++bioq->reorder; 1367 if (bioq->reorder % bioq_reorder_minor_interval == 0) { 1368 bioqwritereorder(bioq); 1369 if (bioq->reorder >= 1370 bioq_reorder_burst_interval) { 1371 bioq->reorder = 0; 1372 } 1373 } 1374 } else { 1375 /* 1376 * No writes queued (or ordering was forced), 1377 * insert at tail. 1378 */ 1379 TAILQ_INSERT_TAIL(&bioq->queue, bio, bio_act); 1380 } 1381 break; 1382 case BUF_CMD_WRITE: 1383 /* 1384 * Writes are always appended. If no writes were previously 1385 * queued or an ordered tail insertion occured the transition 1386 * field will be NULL. 1387 */ 1388 TAILQ_INSERT_TAIL(&bioq->queue, bio, bio_act); 1389 if (bioq->transition == NULL) 1390 bioq->transition = bio; 1391 break; 1392 default: 1393 /* 1394 * All other request types are forced to be ordered. 1395 */ 1396 bioq_insert_tail(bioq, bio); 1397 break; 1398 } 1399 } 1400 1401 /* 1402 * Move the read-write transition point to prevent reads from 1403 * completely starving our writes. This brings a number of writes into 1404 * the fold every N reads. 1405 * 1406 * We bring a few linear writes into the fold on a minor interval 1407 * and we bring a non-linear burst of writes into the fold on a major 1408 * interval. Bursting only occurs if runningbufspace is really high 1409 * (typically from syncs, fsyncs, or HAMMER flushes). 1410 */ 1411 static 1412 void 1413 bioqwritereorder(struct bio_queue_head *bioq) 1414 { 1415 struct bio *bio; 1416 off_t next_offset; 1417 size_t left; 1418 size_t n; 1419 int check_off; 1420 1421 if (bioq->reorder < bioq_reorder_burst_interval || 1422 !buf_runningbufspace_severe()) { 1423 left = (size_t)bioq_reorder_minor_bytes; 1424 check_off = 1; 1425 } else { 1426 left = (size_t)bioq_reorder_burst_bytes; 1427 check_off = 0; 1428 } 1429 1430 next_offset = bioq->transition->bio_offset; 1431 while ((bio = bioq->transition) != NULL && 1432 (check_off == 0 || next_offset == bio->bio_offset) 1433 ) { 1434 n = bio->bio_buf->b_bcount; 1435 next_offset = bio->bio_offset + n; 1436 bioq->transition = TAILQ_NEXT(bio, bio_act); 1437 if (left < n) 1438 break; 1439 left -= n; 1440 } 1441 } 1442 1443 /* 1444 * Bounds checking against the media size, used for the raw partition. 1445 * secsize, mediasize and b_blkno must all be the same units. 1446 * Possibly this has to be DEV_BSIZE (512). 1447 */ 1448 int 1449 bounds_check_with_mediasize(struct bio *bio, int secsize, uint64_t mediasize) 1450 { 1451 struct buf *bp = bio->bio_buf; 1452 int64_t sz; 1453 1454 sz = howmany(bp->b_bcount, secsize); 1455 1456 if (bio->bio_offset/DEV_BSIZE + sz > mediasize) { 1457 sz = mediasize - bio->bio_offset/DEV_BSIZE; 1458 if (sz == 0) { 1459 /* If exactly at end of disk, return EOF. */ 1460 bp->b_resid = bp->b_bcount; 1461 return 0; 1462 } 1463 if (sz < 0) { 1464 /* If past end of disk, return EINVAL. */ 1465 bp->b_error = EINVAL; 1466 return 0; 1467 } 1468 /* Otherwise, truncate request. */ 1469 bp->b_bcount = sz * secsize; 1470 } 1471 1472 return 1; 1473 } 1474 1475 /* 1476 * Disk error is the preface to plaintive error messages 1477 * about failing disk transfers. It prints messages of the form 1478 1479 hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) 1480 1481 * if the offset of the error in the transfer and a disk label 1482 * are both available. blkdone should be -1 if the position of the error 1483 * is unknown; the disklabel pointer may be null from drivers that have not 1484 * been converted to use them. The message is printed with kprintf 1485 * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. 1486 * The message should be completed (with at least a newline) with kprintf 1487 * or log(-1, ...), respectively. There is no trailing space. 1488 */ 1489 void 1490 diskerr(struct bio *bio, cdev_t dev, const char *what, int pri, int donecnt) 1491 { 1492 struct buf *bp = bio->bio_buf; 1493 const char *term; 1494 1495 switch(bp->b_cmd) { 1496 case BUF_CMD_READ: 1497 term = "read"; 1498 break; 1499 case BUF_CMD_WRITE: 1500 term = "write"; 1501 break; 1502 default: 1503 term = "access"; 1504 break; 1505 } 1506 kprintf("%s: %s %sing ", dev->si_name, what, term); 1507 kprintf("offset %012llx for %d", 1508 (long long)bio->bio_offset, 1509 bp->b_bcount); 1510 1511 if (donecnt) 1512 kprintf(" (%d bytes completed)", donecnt); 1513 } 1514 1515 /* 1516 * Locate a disk device 1517 */ 1518 cdev_t 1519 disk_locate(const char *devname) 1520 { 1521 return devfs_find_device_by_name("%s", devname); 1522 } 1523 1524 void 1525 disk_config(void *arg) 1526 { 1527 disk_msg_send_sync(DISK_SYNC, NULL, NULL); 1528 } 1529 1530 static void 1531 disk_init(void) 1532 { 1533 struct thread* td_core; 1534 1535 disk_msg_cache = objcache_create("disk-msg-cache", 0, 0, 1536 NULL, NULL, NULL, 1537 objcache_malloc_alloc, 1538 objcache_malloc_free, 1539 &disk_msg_malloc_args); 1540 1541 lwkt_token_init(&disklist_token, "disks"); 1542 lwkt_token_init(&ds_token, "ds"); 1543 1544 /* 1545 * Initialize the reply-only port which acts as a message drain 1546 */ 1547 lwkt_initport_replyonly(&disk_dispose_port, disk_msg_autofree_reply); 1548 1549 lwkt_gettoken(&disklist_token); 1550 lwkt_create(disk_msg_core, /*args*/NULL, &td_core, NULL, 1551 0, -1, "disk_msg_core"); 1552 tsleep(td_core, 0, "diskcore", 0); 1553 lwkt_reltoken(&disklist_token); 1554 } 1555 1556 static void 1557 disk_uninit(void) 1558 { 1559 objcache_destroy(disk_msg_cache); 1560 } 1561 1562 /* 1563 * Clean out illegal characters in serial numbers. 1564 */ 1565 static void 1566 disk_cleanserial(char *serno) 1567 { 1568 char c; 1569 1570 while ((c = *serno) != 0) { 1571 if (c >= 'a' && c <= 'z') 1572 ; 1573 else if (c >= 'A' && c <= 'Z') 1574 ; 1575 else if (c >= '0' && c <= '9') 1576 ; 1577 else if (c == '-' || c == '@' || c == '+' || c == '.') 1578 ; 1579 else 1580 c = '_'; 1581 *serno++= c; 1582 } 1583 } 1584 1585 TUNABLE_INT("kern.disk_debug", &disk_debug_enable); 1586 SYSCTL_INT(_kern, OID_AUTO, disk_debug, CTLFLAG_RW, &disk_debug_enable, 1587 0, "Enable subr_disk debugging"); 1588 1589 SYSINIT(disk_register, SI_SUB_PRE_DRIVERS, SI_ORDER_FIRST, disk_init, NULL); 1590 SYSUNINIT(disk_register, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, disk_uninit, NULL); 1591