1 /* 2 * Copyright (c) 2012 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 /* 35 * This module allows disk devices to be created and associated with a 36 * communications pipe or socket. You open the device and issue an 37 * ioctl() to install a new disk along with its communications descriptor. 38 * 39 * All further communication occurs via the descriptor using the DMSG 40 * LNK_CONN, LNK_SPAN, and BLOCK protocols. The descriptor can be a 41 * direct connection to a remote machine's disk (in-kernenl), to a remote 42 * cluster controller, to the local cluster controller, etc. 43 * 44 * /dev/xdisk is the control device, issue ioctl()s to create the /dev/xa%d 45 * devices. These devices look like raw disks to the system. 46 * 47 * TODO: 48 * Handle circuit disconnects, leave bio's pending 49 * Restart bio's on circuit reconnect. 50 */ 51 #include <sys/param.h> 52 #include <sys/systm.h> 53 #include <sys/buf.h> 54 #include <sys/conf.h> 55 #include <sys/device.h> 56 #include <sys/devicestat.h> 57 #include <sys/disk.h> 58 #include <sys/kernel.h> 59 #include <sys/malloc.h> 60 #include <sys/sysctl.h> 61 #include <sys/proc.h> 62 #include <sys/queue.h> 63 #include <sys/udev.h> 64 #include <sys/uuid.h> 65 #include <sys/kern_syscall.h> 66 67 #include <sys/dmsg.h> 68 #include <sys/xdiskioctl.h> 69 70 #include <sys/buf2.h> 71 #include <sys/thread2.h> 72 73 struct xa_softc; 74 75 struct xa_tag { 76 TAILQ_ENTRY(xa_tag) entry; 77 struct xa_softc *xa; 78 dmsg_blk_error_t status; 79 kdmsg_state_t *state; 80 kdmsg_circuit_t *circ; 81 struct bio *bio; 82 int running; /* transaction running */ 83 int waitseq; /* streaming reply */ 84 int done; /* final (transaction closed) */ 85 }; 86 87 typedef struct xa_tag xa_tag_t; 88 89 struct xa_softc { 90 TAILQ_ENTRY(xa_softc) entry; 91 cdev_t dev; 92 kdmsg_iocom_t iocom; 93 struct xdisk_attach_ioctl xaioc; 94 struct disk_info info; 95 struct disk disk; 96 uuid_t pfs_fsid; 97 int unit; 98 int serializing; 99 int attached; 100 int opencnt; 101 uint64_t keyid; 102 xa_tag_t *opentag; 103 TAILQ_HEAD(, bio) bioq; 104 TAILQ_HEAD(, xa_tag) tag_freeq; 105 TAILQ_HEAD(, xa_tag) tag_pendq; 106 TAILQ_HEAD(, kdmsg_circuit) circq; 107 struct lwkt_token tok; 108 }; 109 110 typedef struct xa_softc xa_softc_t; 111 112 #define MAXTAGS 64 /* no real limit */ 113 114 static int xdisk_attach(struct xdisk_attach_ioctl *xaioc); 115 static int xdisk_detach(struct xdisk_attach_ioctl *xaioc); 116 static void xa_exit(kdmsg_iocom_t *iocom); 117 static void xa_terminate_check(struct xa_softc *xa); 118 static int xa_rcvdmsg(kdmsg_msg_t *msg); 119 static void xa_autodmsg(kdmsg_msg_t *msg); 120 121 static xa_tag_t *xa_setup_cmd(xa_softc_t *xa, struct bio *bio); 122 static void xa_start(xa_tag_t *tag, kdmsg_msg_t *msg); 123 static uint32_t xa_wait(xa_tag_t *tag, int seq); 124 static void xa_done(xa_tag_t *tag, int wasbio); 125 static int xa_sync_completion(kdmsg_state_t *state, kdmsg_msg_t *msg); 126 static int xa_bio_completion(kdmsg_state_t *state, kdmsg_msg_t *msg); 127 static void xa_restart_deferred(xa_softc_t *xa); 128 129 MALLOC_DEFINE(M_XDISK, "Networked disk client", "Network Disks"); 130 131 /* 132 * Control device, issue ioctls to create xa devices. 133 */ 134 static d_open_t xdisk_open; 135 static d_close_t xdisk_close; 136 static d_ioctl_t xdisk_ioctl; 137 138 static struct dev_ops xdisk_ops = { 139 { "xdisk", 0, D_MPSAFE | D_TRACKCLOSE }, 140 .d_open = xdisk_open, 141 .d_close = xdisk_close, 142 .d_ioctl = xdisk_ioctl 143 }; 144 145 /* 146 * XA disk devices 147 */ 148 static d_open_t xa_open; 149 static d_close_t xa_close; 150 static d_ioctl_t xa_ioctl; 151 static d_strategy_t xa_strategy; 152 static d_psize_t xa_size; 153 154 static struct dev_ops xa_ops = { 155 { "xa", 0, D_DISK | D_CANFREE | D_MPSAFE | D_TRACKCLOSE }, 156 .d_open = xa_open, 157 .d_close = xa_close, 158 .d_ioctl = xa_ioctl, 159 .d_read = physread, 160 .d_write = physwrite, 161 .d_strategy = xa_strategy, 162 .d_psize = xa_size 163 }; 164 165 static struct lwkt_token xdisk_token = LWKT_TOKEN_INITIALIZER(xdisk_token); 166 static int xdisk_opencount; 167 static cdev_t xdisk_dev; 168 static TAILQ_HEAD(, xa_softc) xa_queue; 169 170 /* 171 * Module initialization 172 */ 173 static int 174 xdisk_modevent(module_t mod, int type, void *data) 175 { 176 switch (type) { 177 case MOD_LOAD: 178 TAILQ_INIT(&xa_queue); 179 xdisk_dev = make_dev(&xdisk_ops, 0, 180 UID_ROOT, GID_WHEEL, 0600, "xdisk"); 181 break; 182 case MOD_UNLOAD: 183 case MOD_SHUTDOWN: 184 if (xdisk_opencount || TAILQ_FIRST(&xa_queue)) 185 return (EBUSY); 186 if (xdisk_dev) { 187 destroy_dev(xdisk_dev); 188 xdisk_dev = NULL; 189 } 190 dev_ops_remove_all(&xdisk_ops); 191 dev_ops_remove_all(&xa_ops); 192 break; 193 default: 194 break; 195 } 196 return 0; 197 } 198 199 DEV_MODULE(xdisk, xdisk_modevent, 0); 200 201 /* 202 * Control device 203 */ 204 static int 205 xdisk_open(struct dev_open_args *ap) 206 { 207 lwkt_gettoken(&xdisk_token); 208 ++xdisk_opencount; 209 lwkt_reltoken(&xdisk_token); 210 return(0); 211 } 212 213 static int 214 xdisk_close(struct dev_close_args *ap) 215 { 216 lwkt_gettoken(&xdisk_token); 217 --xdisk_opencount; 218 lwkt_reltoken(&xdisk_token); 219 return(0); 220 } 221 222 static int 223 xdisk_ioctl(struct dev_ioctl_args *ap) 224 { 225 int error; 226 227 switch(ap->a_cmd) { 228 case XDISKIOCATTACH: 229 error = xdisk_attach((void *)ap->a_data); 230 break; 231 case XDISKIOCDETACH: 232 error = xdisk_detach((void *)ap->a_data); 233 break; 234 default: 235 error = ENOTTY; 236 break; 237 } 238 return error; 239 } 240 241 /************************************************************************ 242 * DMSG INTERFACE * 243 ************************************************************************/ 244 245 static int 246 xdisk_attach(struct xdisk_attach_ioctl *xaioc) 247 { 248 xa_softc_t *xa; 249 xa_tag_t *tag; 250 struct file *fp; 251 int unit; 252 int n; 253 char devname[64]; 254 cdev_t dev; 255 256 /* 257 * Normalize ioctl params 258 */ 259 fp = holdfp(curproc->p_fd, xaioc->fd, -1); 260 if (fp == NULL) 261 return EINVAL; 262 if (xaioc->cl_label[sizeof(xaioc->cl_label) - 1] != 0) 263 return EINVAL; 264 if (xaioc->fs_label[sizeof(xaioc->fs_label) - 1] != 0) 265 return EINVAL; 266 if (xaioc->blksize < DEV_BSIZE || xaioc->blksize > MAXBSIZE) 267 return EINVAL; 268 269 /* 270 * See if the serial number is already present. If we are 271 * racing a termination the disk subsystem may still have 272 * duplicate entries not yet removed so we wait a bit and 273 * retry. 274 */ 275 lwkt_gettoken(&xdisk_token); 276 again: 277 TAILQ_FOREACH(xa, &xa_queue, entry) { 278 if (strcmp(xa->iocom.auto_lnk_conn.fs_label, 279 xaioc->fs_label) == 0) { 280 if (xa->serializing) { 281 tsleep(xa, 0, "xadelay", hz / 10); 282 goto again; 283 } 284 xa->serializing = 1; 285 kdmsg_iocom_uninit(&xa->iocom); 286 break; 287 } 288 } 289 290 /* 291 * Create a new xa if not already present 292 */ 293 if (xa == NULL) { 294 unit = 0; 295 for (;;) { 296 TAILQ_FOREACH(xa, &xa_queue, entry) { 297 if (xa->unit == unit) 298 break; 299 } 300 if (xa == NULL) 301 break; 302 ++unit; 303 } 304 xa = kmalloc(sizeof(*xa), M_XDISK, M_WAITOK|M_ZERO); 305 xa->unit = unit; 306 xa->serializing = 1; 307 lwkt_token_init(&xa->tok, "xa"); 308 TAILQ_INIT(&xa->circq); 309 TAILQ_INIT(&xa->bioq); 310 TAILQ_INIT(&xa->tag_freeq); 311 TAILQ_INIT(&xa->tag_pendq); 312 for (n = 0; n < MAXTAGS; ++n) { 313 tag = kmalloc(sizeof(*tag), M_XDISK, M_WAITOK|M_ZERO); 314 tag->xa = xa; 315 TAILQ_INSERT_TAIL(&xa->tag_freeq, tag, entry); 316 } 317 TAILQ_INSERT_TAIL(&xa_queue, xa, entry); 318 } 319 320 /* 321 * (xa) is now serializing. 322 */ 323 xa->xaioc = *xaioc; 324 xa->attached = 1; 325 lwkt_reltoken(&xdisk_token); 326 327 /* 328 * Create device 329 */ 330 if (xa->dev == NULL) { 331 dev = disk_create(unit, &xa->disk, &xa_ops); 332 dev->si_drv1 = xa; 333 xa->dev = dev; 334 } 335 336 xa->info.d_media_blksize = xaioc->blksize; 337 xa->info.d_media_blocks = xaioc->bytes / xaioc->blksize; 338 xa->info.d_dsflags = DSO_MBRQUIET | DSO_RAWPSIZE; 339 xa->info.d_secpertrack = 32; 340 xa->info.d_nheads = 64; 341 xa->info.d_secpercyl = xa->info.d_secpertrack * xa->info.d_nheads; 342 xa->info.d_ncylinders = 0; 343 if (xa->xaioc.fs_label[0]) 344 xa->info.d_serialno = xa->xaioc.fs_label; 345 346 /* 347 * Set up messaging connection 348 */ 349 ksnprintf(devname, sizeof(devname), "xa%d", unit); 350 kdmsg_iocom_init(&xa->iocom, xa, 351 KDMSG_IOCOMF_AUTOCONN | 352 KDMSG_IOCOMF_AUTOSPAN | 353 KDMSG_IOCOMF_AUTOCIRC | 354 KDMSG_IOCOMF_AUTOFORGE, 355 M_XDISK, xa_rcvdmsg); 356 xa->iocom.exit_func = xa_exit; 357 358 kdmsg_iocom_reconnect(&xa->iocom, fp, devname); 359 360 /* 361 * Setup our LNK_CONN advertisement for autoinitiate. 362 * 363 * Our filter is setup to only accept PEER_BLOCK/SERVER 364 * advertisements. 365 */ 366 xa->iocom.auto_lnk_conn.pfs_type = DMSG_PFSTYPE_CLIENT; 367 xa->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1; 368 xa->iocom.auto_lnk_conn.peer_type = DMSG_PEER_BLOCK; 369 xa->iocom.auto_lnk_conn.peer_mask = 1LLU << DMSG_PEER_BLOCK; 370 xa->iocom.auto_lnk_conn.pfs_mask = 1LLU << DMSG_PFSTYPE_SERVER; 371 ksnprintf(xa->iocom.auto_lnk_conn.cl_label, 372 sizeof(xa->iocom.auto_lnk_conn.cl_label), 373 "%s", xaioc->cl_label); 374 375 /* 376 * We need a unique pfs_fsid to avoid confusion. 377 * We supply a rendezvous fs_label using the serial number. 378 */ 379 kern_uuidgen(&xa->pfs_fsid, 1); 380 xa->iocom.auto_lnk_conn.pfs_fsid = xa->pfs_fsid; 381 ksnprintf(xa->iocom.auto_lnk_conn.fs_label, 382 sizeof(xa->iocom.auto_lnk_conn.fs_label), 383 "%s", xaioc->fs_label); 384 385 /* 386 * Setup our LNK_SPAN advertisement for autoinitiate 387 */ 388 xa->iocom.auto_lnk_span.pfs_type = DMSG_PFSTYPE_CLIENT; 389 xa->iocom.auto_lnk_span.proto_version = DMSG_SPAN_PROTO_1; 390 xa->iocom.auto_lnk_span.peer_type = DMSG_PEER_BLOCK; 391 ksnprintf(xa->iocom.auto_lnk_span.cl_label, 392 sizeof(xa->iocom.auto_lnk_span.cl_label), 393 "%s", xa->xaioc.cl_label); 394 395 kdmsg_iocom_autoinitiate(&xa->iocom, xa_autodmsg); 396 disk_setdiskinfo_sync(&xa->disk, &xa->info); 397 398 lwkt_gettoken(&xdisk_token); 399 xa->serializing = 0; 400 xa_terminate_check(xa); 401 lwkt_reltoken(&xdisk_token); 402 403 return(0); 404 } 405 406 static int 407 xdisk_detach(struct xdisk_attach_ioctl *xaioc) 408 { 409 struct xa_softc *xa; 410 411 lwkt_gettoken(&xdisk_token); 412 for (;;) { 413 TAILQ_FOREACH(xa, &xa_queue, entry) { 414 if (strcmp(xa->iocom.auto_lnk_conn.fs_label, 415 xaioc->fs_label) == 0) { 416 break; 417 } 418 } 419 if (xa == NULL || xa->serializing == 0) { 420 xa->serializing = 1; 421 break; 422 } 423 tsleep(xa, 0, "xadet", hz / 10); 424 } 425 if (xa) { 426 kdmsg_iocom_uninit(&xa->iocom); 427 xa->serializing = 0; 428 } 429 lwkt_reltoken(&xdisk_token); 430 return(0); 431 } 432 433 /* 434 * Called from iocom core transmit thread upon disconnect. 435 */ 436 static 437 void 438 xa_exit(kdmsg_iocom_t *iocom) 439 { 440 struct xa_softc *xa = iocom->handle; 441 442 lwkt_gettoken(&xa->tok); 443 lwkt_gettoken(&xdisk_token); 444 445 /* 446 * We must wait for any I/O's to complete to ensure that all 447 * state structure references are cleaned up before returning. 448 */ 449 xa->attached = -1; /* force deferral or failure */ 450 while (TAILQ_FIRST(&xa->tag_pendq)) { 451 tsleep(xa, 0, "xabiow", hz / 10); 452 } 453 454 /* 455 * All serializing code checks for de-initialization so only 456 * do it if we aren't already serializing. 457 */ 458 if (xa->serializing == 0) { 459 xa->serializing = 1; 460 kdmsg_iocom_uninit(iocom); 461 xa->serializing = 0; 462 } 463 464 /* 465 * If the drive is not in use and no longer attach it can be 466 * destroyed. 467 */ 468 xa->attached = 0; 469 xa_terminate_check(xa); 470 lwkt_reltoken(&xdisk_token); 471 lwkt_reltoken(&xa->tok); 472 } 473 474 /* 475 * Determine if we can destroy the xa_softc. 476 * 477 * Called with xdisk_token held. 478 */ 479 static 480 void 481 xa_terminate_check(struct xa_softc *xa) 482 { 483 xa_tag_t *tag; 484 struct bio *bio; 485 486 if (xa->opencnt || xa->attached || xa->serializing) 487 return; 488 xa->serializing = 1; 489 kdmsg_iocom_uninit(&xa->iocom); 490 491 /* 492 * When destroying an xa make sure all pending I/O (typically 493 * from the disk probe) is done. 494 * 495 * XXX what about new I/O initiated prior to disk_destroy(). 496 */ 497 while ((tag = TAILQ_FIRST(&xa->tag_pendq)) != NULL) { 498 TAILQ_REMOVE(&xa->tag_pendq, tag, entry); 499 if ((bio = tag->bio) != NULL) { 500 tag->bio = NULL; 501 bio->bio_buf->b_error = ENXIO; 502 bio->bio_buf->b_flags |= B_ERROR; 503 biodone(bio); 504 } 505 TAILQ_INSERT_TAIL(&xa->tag_freeq, tag, entry); 506 } 507 if (xa->dev) { 508 disk_destroy(&xa->disk); 509 xa->dev->si_drv1 = NULL; 510 xa->dev = NULL; 511 } 512 KKASSERT(xa->opencnt == 0 && xa->attached == 0); 513 while ((tag = TAILQ_FIRST(&xa->tag_freeq)) != NULL) { 514 TAILQ_REMOVE(&xa->tag_freeq, tag, entry); 515 tag->xa = NULL; 516 kfree(tag, M_XDISK); 517 } 518 KKASSERT(TAILQ_EMPTY(&xa->tag_pendq)); 519 TAILQ_REMOVE(&xa_queue, xa, entry); /* XXX */ 520 kfree(xa, M_XDISK); 521 } 522 523 /* 524 * Shim to catch and record virtual circuit events. 525 */ 526 static void 527 xa_autodmsg(kdmsg_msg_t *msg) 528 { 529 xa_softc_t *xa = msg->iocom->handle; 530 531 kdmsg_circuit_t *circ; 532 kdmsg_circuit_t *cscan; 533 uint32_t xcmd; 534 535 /* 536 * Because this is just a shim we don't have a state callback for 537 * the transactions we are sniffing, so make things easier by 538 * calculating the original command along with the current message's 539 * flags. This is because transactions are made up of numerous 540 * messages and only the first typically specifies the actual command. 541 */ 542 if (msg->state) { 543 xcmd = msg->state->icmd | 544 (msg->any.head.cmd & (DMSGF_CREATE | 545 DMSGF_DELETE | 546 DMSGF_REPLY)); 547 } else { 548 xcmd = msg->any.head.cmd; 549 } 550 551 /* 552 * Add or remove a circuit, sorted by weight (lower numbers are 553 * better). 554 */ 555 switch(xcmd) { 556 case DMSG_LNK_CIRC | DMSGF_CREATE | DMSGF_REPLY: 557 /* 558 * Track established circuits 559 */ 560 circ = msg->state->any.circ; 561 lwkt_gettoken(&xa->tok); 562 if (circ->recorded == 0) { 563 TAILQ_FOREACH(cscan, &xa->circq, entry) { 564 if (circ->weight < cscan->weight) 565 break; 566 } 567 if (cscan) 568 TAILQ_INSERT_BEFORE(cscan, circ, entry); 569 else 570 TAILQ_INSERT_TAIL(&xa->circq, circ, entry); 571 circ->recorded = 1; 572 } 573 574 /* 575 * Restart any deferred I/O. 576 */ 577 xa_restart_deferred(xa); 578 lwkt_reltoken(&xa->tok); 579 break; 580 case DMSG_LNK_CIRC | DMSGF_DELETE | DMSGF_REPLY: 581 /* 582 * Losing virtual circuit. Remove the circ from contention. 583 */ 584 circ = msg->state->any.circ; 585 lwkt_gettoken(&xa->tok); 586 if (circ->recorded) { 587 TAILQ_REMOVE(&xa->circq, circ, entry); 588 circ->recorded = 0; 589 } 590 xa_restart_deferred(xa); 591 lwkt_reltoken(&xa->tok); 592 break; 593 default: 594 break; 595 } 596 } 597 598 static int 599 xa_rcvdmsg(kdmsg_msg_t *msg) 600 { 601 switch(msg->any.head.cmd & DMSGF_TRANSMASK) { 602 case DMSG_DBG_SHELL: 603 /* 604 * Execute shell command (not supported atm). 605 * 606 * This is a one-way packet but if not (e.g. if part of 607 * a streaming transaction), we will have already closed 608 * our end. 609 */ 610 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP); 611 break; 612 case DMSG_DBG_SHELL | DMSGF_REPLY: 613 /* 614 * Receive one or more replies to a shell command that we 615 * sent. 616 * 617 * This is a one-way packet but if not (e.g. if part of 618 * a streaming transaction), we will have already closed 619 * our end. 620 */ 621 if (msg->aux_data) { 622 msg->aux_data[msg->aux_size - 1] = 0; 623 kprintf("xdisk: DEBUGMSG: %s\n", msg->aux_data); 624 } 625 break; 626 default: 627 /* 628 * Unsupported LNK message received. We only need to 629 * reply if it's a transaction in order to close our end. 630 * Ignore any one-way messages are any further messages 631 * associated with the transaction. 632 * 633 * NOTE: This case also includes DMSG_LNK_ERROR messages 634 * which might be one-way, replying to those would 635 * cause an infinite ping-pong. 636 */ 637 if (msg->any.head.cmd & DMSGF_CREATE) 638 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP); 639 break; 640 } 641 return(0); 642 } 643 644 645 /************************************************************************ 646 * XA DEVICE INTERFACE * 647 ************************************************************************/ 648 649 static int 650 xa_open(struct dev_open_args *ap) 651 { 652 cdev_t dev = ap->a_head.a_dev; 653 xa_softc_t *xa; 654 xa_tag_t *tag; 655 kdmsg_msg_t *msg; 656 int error; 657 658 dev->si_bsize_phys = 512; 659 dev->si_bsize_best = 32768; 660 661 /* 662 * Interlock open with opencnt, wait for attachment operations 663 * to finish. 664 */ 665 lwkt_gettoken(&xdisk_token); 666 again: 667 xa = dev->si_drv1; 668 if (xa == NULL) { 669 lwkt_reltoken(&xdisk_token); 670 return ENXIO; /* raced destruction */ 671 } 672 if (xa->serializing) { 673 tsleep(xa, 0, "xarace", hz / 10); 674 goto again; 675 } 676 if (xa->attached == 0) { 677 lwkt_reltoken(&xdisk_token); 678 return ENXIO; /* raced destruction */ 679 } 680 681 /* 682 * Serialize initial open 683 */ 684 if (xa->opencnt++ > 0) { 685 lwkt_reltoken(&xdisk_token); 686 return(0); 687 } 688 xa->serializing = 1; 689 lwkt_reltoken(&xdisk_token); 690 691 tag = xa_setup_cmd(xa, NULL); 692 if (tag == NULL) { 693 lwkt_gettoken(&xdisk_token); 694 KKASSERT(xa->opencnt > 0); 695 --xa->opencnt; 696 xa->serializing = 0; 697 xa_terminate_check(xa); 698 lwkt_reltoken(&xdisk_token); 699 return(ENXIO); 700 } 701 msg = kdmsg_msg_alloc(&xa->iocom, tag->circ, 702 DMSG_BLK_OPEN | DMSGF_CREATE, 703 xa_sync_completion, tag); 704 msg->any.blk_open.modes = DMSG_BLKOPEN_RD | DMSG_BLKOPEN_WR; 705 xa_start(tag, msg); 706 if (xa_wait(tag, 0) == 0) { 707 xa->keyid = tag->status.keyid; 708 xa->opentag = tag; /* leave tag open */ 709 xa->serializing = 0; 710 error = 0; 711 } else { 712 xa_done(tag, 0); 713 lwkt_gettoken(&xdisk_token); 714 KKASSERT(xa->opencnt > 0); 715 --xa->opencnt; 716 xa->serializing = 0; 717 xa_terminate_check(xa); 718 lwkt_reltoken(&xdisk_token); 719 error = ENXIO; 720 } 721 return (error); 722 } 723 724 static int 725 xa_close(struct dev_close_args *ap) 726 { 727 cdev_t dev = ap->a_head.a_dev; 728 xa_softc_t *xa; 729 xa_tag_t *tag; 730 731 xa = dev->si_drv1; 732 if (xa == NULL) 733 return ENXIO; /* raced destruction */ 734 735 lwkt_gettoken(&xa->tok); 736 if ((tag = xa->opentag) != NULL) { 737 xa->opentag = NULL; 738 kdmsg_state_reply(tag->state, 0); 739 while (tag->done == 0) 740 xa_wait(tag, tag->waitseq); 741 xa_done(tag, 0); 742 } 743 lwkt_reltoken(&xa->tok); 744 745 lwkt_gettoken(&xdisk_token); 746 KKASSERT(xa->opencnt > 0); 747 --xa->opencnt; 748 xa_terminate_check(xa); 749 lwkt_reltoken(&xdisk_token); 750 751 return(0); 752 } 753 754 static int 755 xa_strategy(struct dev_strategy_args *ap) 756 { 757 xa_softc_t *xa = ap->a_head.a_dev->si_drv1; 758 xa_tag_t *tag; 759 struct bio *bio = ap->a_bio; 760 761 /* 762 * Allow potentially temporary link failures to fail the I/Os 763 * only if the device is not open. That is, we allow the disk 764 * probe code prior to mount to fail. 765 */ 766 if (xa->attached == 0 && xa->opencnt == 0) { 767 bio->bio_buf->b_error = ENXIO; 768 bio->bio_buf->b_flags |= B_ERROR; 769 biodone(bio); 770 return(0); 771 } 772 773 tag = xa_setup_cmd(xa, bio); 774 if (tag) 775 xa_start(tag, NULL); 776 return(0); 777 } 778 779 static int 780 xa_ioctl(struct dev_ioctl_args *ap) 781 { 782 return(ENOTTY); 783 } 784 785 static int 786 xa_size(struct dev_psize_args *ap) 787 { 788 struct xa_softc *xa; 789 790 if ((xa = ap->a_head.a_dev->si_drv1) == NULL) 791 return (ENXIO); 792 ap->a_result = xa->info.d_media_blocks; 793 return (0); 794 } 795 796 /************************************************************************ 797 * XA BLOCK PROTOCOL STATE MACHINE * 798 ************************************************************************ 799 * 800 * Implement tag/msg setup and related functions. 801 */ 802 static xa_tag_t * 803 xa_setup_cmd(xa_softc_t *xa, struct bio *bio) 804 { 805 kdmsg_circuit_t *circ; 806 xa_tag_t *tag; 807 808 /* 809 * Only get a tag if we have a valid virtual circuit to the server. 810 */ 811 lwkt_gettoken(&xa->tok); 812 TAILQ_FOREACH(circ, &xa->circq, entry) { 813 if (circ->lost == 0) 814 break; 815 } 816 if (circ == NULL || xa->attached <= 0) { 817 tag = NULL; 818 } else if ((tag = TAILQ_FIRST(&xa->tag_freeq)) != NULL) { 819 TAILQ_REMOVE(&xa->tag_freeq, tag, entry); 820 tag->bio = bio; 821 tag->circ = circ; 822 kdmsg_circ_hold(circ); 823 TAILQ_INSERT_TAIL(&xa->tag_pendq, tag, entry); 824 } 825 826 /* 827 * If we can't dispatch now and this is a bio, queue it for later. 828 */ 829 if (tag == NULL && bio) { 830 TAILQ_INSERT_TAIL(&xa->bioq, bio, bio_act); 831 } 832 lwkt_reltoken(&xa->tok); 833 834 return (tag); 835 } 836 837 static void 838 xa_start(xa_tag_t *tag, kdmsg_msg_t *msg) 839 { 840 xa_softc_t *xa = tag->xa; 841 842 if (msg == NULL) { 843 struct bio *bio; 844 struct buf *bp; 845 846 KKASSERT(tag->bio); 847 bio = tag->bio; 848 bp = bio->bio_buf; 849 850 switch(bp->b_cmd) { 851 case BUF_CMD_READ: 852 msg = kdmsg_msg_alloc(&xa->iocom, tag->circ, 853 DMSG_BLK_READ | 854 DMSGF_CREATE | DMSGF_DELETE, 855 xa_bio_completion, tag); 856 msg->any.blk_read.keyid = xa->keyid; 857 msg->any.blk_read.offset = bio->bio_offset; 858 msg->any.blk_read.bytes = bp->b_bcount; 859 break; 860 case BUF_CMD_WRITE: 861 msg = kdmsg_msg_alloc(&xa->iocom, tag->circ, 862 DMSG_BLK_WRITE | 863 DMSGF_CREATE | DMSGF_DELETE, 864 xa_bio_completion, tag); 865 msg->any.blk_write.keyid = xa->keyid; 866 msg->any.blk_write.offset = bio->bio_offset; 867 msg->any.blk_write.bytes = bp->b_bcount; 868 msg->aux_data = bp->b_data; 869 msg->aux_size = bp->b_bcount; 870 break; 871 case BUF_CMD_FLUSH: 872 msg = kdmsg_msg_alloc(&xa->iocom, tag->circ, 873 DMSG_BLK_FLUSH | 874 DMSGF_CREATE | DMSGF_DELETE, 875 xa_bio_completion, tag); 876 msg->any.blk_flush.keyid = xa->keyid; 877 msg->any.blk_flush.offset = bio->bio_offset; 878 msg->any.blk_flush.bytes = bp->b_bcount; 879 break; 880 case BUF_CMD_FREEBLKS: 881 msg = kdmsg_msg_alloc(&xa->iocom, tag->circ, 882 DMSG_BLK_FREEBLKS | 883 DMSGF_CREATE | DMSGF_DELETE, 884 xa_bio_completion, tag); 885 msg->any.blk_freeblks.keyid = xa->keyid; 886 msg->any.blk_freeblks.offset = bio->bio_offset; 887 msg->any.blk_freeblks.bytes = bp->b_bcount; 888 break; 889 default: 890 bp->b_flags |= B_ERROR; 891 bp->b_error = EIO; 892 biodone(bio); 893 tag->bio = NULL; 894 break; 895 } 896 } 897 898 tag->done = 0; 899 tag->waitseq = 0; 900 if (msg) { 901 tag->state = msg->state; 902 kdmsg_msg_write(msg); 903 } else { 904 xa_done(tag, 1); 905 } 906 } 907 908 static uint32_t 909 xa_wait(xa_tag_t *tag, int seq) 910 { 911 xa_softc_t *xa = tag->xa; 912 913 lwkt_gettoken(&xa->tok); 914 while (tag->waitseq == seq) 915 tsleep(tag, 0, "xawait", 0); 916 lwkt_reltoken(&xa->tok); 917 return (tag->status.head.error); 918 } 919 920 static void 921 xa_done(xa_tag_t *tag, int wasbio) 922 { 923 xa_softc_t *xa = tag->xa; 924 struct bio *bio; 925 926 KKASSERT(tag->bio == NULL); 927 tag->done = 1; 928 tag->state = NULL; 929 930 lwkt_gettoken(&xa->tok); 931 if (wasbio && (bio = TAILQ_FIRST(&xa->bioq)) != NULL) { 932 TAILQ_REMOVE(&xa->bioq, bio, bio_act); 933 tag->bio = bio; 934 lwkt_reltoken(&xa->tok); 935 xa_start(tag, NULL); 936 } else { 937 if (tag->circ) { 938 kdmsg_circ_drop(tag->circ); 939 tag->circ = NULL; 940 } 941 TAILQ_REMOVE(&xa->tag_pendq, tag, entry); 942 TAILQ_INSERT_TAIL(&xa->tag_freeq, tag, entry); 943 lwkt_reltoken(&xa->tok); 944 } 945 } 946 947 static int 948 xa_sync_completion(kdmsg_state_t *state, kdmsg_msg_t *msg) 949 { 950 xa_tag_t *tag = state->any.any; 951 xa_softc_t *xa = tag->xa; 952 953 switch(msg->any.head.cmd & DMSGF_CMDSWMASK) { 954 case DMSG_LNK_ERROR | DMSGF_REPLY: 955 bzero(&tag->status, sizeof(tag->status)); 956 tag->status.head = msg->any.head; 957 break; 958 case DMSG_BLK_ERROR | DMSGF_REPLY: 959 tag->status = msg->any.blk_error; 960 break; 961 } 962 lwkt_gettoken(&xa->tok); 963 if (msg->any.head.cmd & DMSGF_DELETE) { /* receive termination */ 964 if (xa->opentag == tag) { 965 xa->opentag = NULL; /* XXX */ 966 kdmsg_state_reply(tag->state, 0); 967 xa_done(tag, 0); 968 lwkt_reltoken(&xa->tok); 969 return(0); 970 } else { 971 tag->done = 1; 972 } 973 } 974 ++tag->waitseq; 975 lwkt_reltoken(&xa->tok); 976 977 wakeup(tag); 978 979 return (0); 980 } 981 982 static int 983 xa_bio_completion(kdmsg_state_t *state, kdmsg_msg_t *msg) 984 { 985 xa_tag_t *tag = state->any.any; 986 xa_softc_t *xa = tag->xa; 987 struct bio *bio; 988 struct buf *bp; 989 990 /* 991 * Get the bio from the tag. If no bio is present we just do 992 * 'done' handling. 993 */ 994 if ((bio = tag->bio) == NULL) 995 goto handle_done; 996 bp = bio->bio_buf; 997 998 /* 999 * Process return status 1000 */ 1001 switch(msg->any.head.cmd & DMSGF_CMDSWMASK) { 1002 case DMSG_LNK_ERROR | DMSGF_REPLY: 1003 bzero(&tag->status, sizeof(tag->status)); 1004 tag->status.head = msg->any.head; 1005 if (tag->status.head.error) 1006 tag->status.resid = bp->b_bcount; 1007 else 1008 tag->status.resid = 0; 1009 break; 1010 case DMSG_BLK_ERROR | DMSGF_REPLY: 1011 tag->status = msg->any.blk_error; 1012 break; 1013 } 1014 1015 /* 1016 * Potentially move the bio back onto the pending queue if the 1017 * device is open and the error is related to losing the virtual 1018 * circuit. 1019 */ 1020 if (tag->status.head.error && 1021 (msg->any.head.cmd & DMSGF_DELETE) && xa->opencnt) { 1022 if (tag->status.head.error == DMSG_ERR_LOSTLINK || 1023 tag->status.head.error == DMSG_ERR_CANTCIRC) { 1024 goto handle_repend; 1025 } 1026 } 1027 1028 /* 1029 * Process bio completion 1030 * 1031 * For reads any returned data is zero-extended if necessary, so 1032 * the server can short-cut any all-zeros reads if it desires. 1033 */ 1034 switch(bp->b_cmd) { 1035 case BUF_CMD_READ: 1036 if (msg->aux_data && msg->aux_size) { 1037 if (msg->aux_size < bp->b_bcount) { 1038 bcopy(msg->aux_data, bp->b_data, msg->aux_size); 1039 bzero(bp->b_data + msg->aux_size, 1040 bp->b_bcount - msg->aux_size); 1041 } else { 1042 bcopy(msg->aux_data, bp->b_data, bp->b_bcount); 1043 } 1044 } else { 1045 bzero(bp->b_data, bp->b_bcount); 1046 } 1047 /* fall through */ 1048 case BUF_CMD_WRITE: 1049 case BUF_CMD_FLUSH: 1050 case BUF_CMD_FREEBLKS: 1051 default: 1052 if (tag->status.resid > bp->b_bcount) 1053 tag->status.resid = bp->b_bcount; 1054 bp->b_resid = tag->status.resid; 1055 if ((bp->b_error = tag->status.head.error) != 0) { 1056 bp->b_flags |= B_ERROR; 1057 } else { 1058 bp->b_resid = 0; 1059 } 1060 biodone(bio); 1061 tag->bio = NULL; 1062 break; 1063 } 1064 1065 /* 1066 * Handle completion of the transaction. If the bioq is not empty 1067 * we can initiate another bio on the same tag. 1068 * 1069 * NOTE: Most of our transactions will be single-message 1070 * CREATE+DELETEs, so we won't have to terminate the 1071 * transaction separately, here. But just in case they 1072 * aren't be sure to terminate the transaction. 1073 */ 1074 handle_done: 1075 if (msg->any.head.cmd & DMSGF_DELETE) { 1076 xa_done(tag, 1); 1077 if ((state->txcmd & DMSGF_DELETE) == 0) 1078 kdmsg_msg_reply(msg, 0); 1079 } 1080 return (0); 1081 1082 /* 1083 * Handle the case where the transaction failed due to a 1084 * connectivity issue. The tag is put away with wasbio=0 1085 * and we restart the bio. 1086 * 1087 * Setting circ->lost causes xa_setup_cmd() to skip the circuit. 1088 * Other circuits might still be live. Once a circuit gets messed 1089 * up it will (eventually) be deleted so we can simply leave (lost) 1090 * set forever after. 1091 */ 1092 handle_repend: 1093 lwkt_gettoken(&xa->tok); 1094 kprintf("BIO CIRC FAILURE, REPEND BIO %p\n", bio); 1095 tag->circ->lost = 1; 1096 tag->bio = NULL; 1097 xa_done(tag, 0); 1098 if ((state->txcmd & DMSGF_DELETE) == 0) 1099 kdmsg_msg_reply(msg, 0); 1100 1101 /* 1102 * Restart or requeue the bio 1103 */ 1104 tag = xa_setup_cmd(xa, bio); 1105 if (tag) 1106 xa_start(tag, NULL); 1107 lwkt_reltoken(&xa->tok); 1108 return (0); 1109 } 1110 1111 /* 1112 * Restart as much deferred I/O as we can. 1113 * 1114 * Called with xa->tok held 1115 */ 1116 static 1117 void 1118 xa_restart_deferred(xa_softc_t *xa) 1119 { 1120 struct bio *bio; 1121 xa_tag_t *tag; 1122 1123 while ((bio = TAILQ_FIRST(&xa->bioq)) != NULL) { 1124 tag = xa_setup_cmd(xa, NULL); 1125 if (tag == NULL) 1126 break; 1127 TAILQ_REMOVE(&xa->bioq, bio, bio_act); 1128 tag->bio = bio; 1129 xa_start(tag, NULL); 1130 } 1131 } 1132