/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Copyright 2020 Joyent, Inc.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <sys/limits.h>
#include <sys/uio.h>
#ifndef __FreeBSD__
#include <sys/dkio.h>
#endif

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define BLOCKIF_SIG     0xb109b109

#ifdef __FreeBSD__
#define BLOCKIF_NUMTHR  8
#else
/* Enlarge to keep pace with the virtio-block ring size */
#define BLOCKIF_NUMTHR  16
#endif
#define BLOCKIF_MAXREQ  (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)

enum blockop {
    BOP_READ,
    BOP_WRITE,
#ifndef __FreeBSD__
    BOP_WRITE_SYNC,
#endif
    BOP_FLUSH,
    BOP_DELETE
};

enum blockstat {
    BST_FREE,
    BST_BLOCK,
    BST_PEND,
    BST_BUSY,
    BST_DONE
};

struct blockif_elem {
    TAILQ_ENTRY(blockif_elem) be_link;
    struct blockif_req  *be_req;
    enum blockop        be_op;
    enum blockstat      be_status;
    pthread_t           be_tid;
    off_t               be_block;
};

#ifndef __FreeBSD__
enum blockif_wce {
    WCE_NONE = 0,
    WCE_IOCTL,
    WCE_FCNTL
};
#endif

struct blockif_ctxt {
    int                 bc_magic;
    int                 bc_fd;
    int                 bc_ischr;
    int                 bc_isgeom;
    int                 bc_candelete;
#ifndef __FreeBSD__
    enum blockif_wce    bc_wce;
#endif
    int                 bc_rdonly;
    off_t               bc_size;
    int                 bc_sectsz;
    int                 bc_psectsz;
    int                 bc_psectoff;
    int                 bc_closing;
    pthread_t           bc_btid[BLOCKIF_NUMTHR];
    pthread_mutex_t     bc_mtx;
    pthread_cond_t      bc_cond;
    blockif_resize_cb   *bc_resize_cb;
    void                *bc_resize_cb_arg;
    struct mevent       *bc_resize_event;

    /* Request elements and free/pending/busy queues */
    TAILQ_HEAD(, blockif_elem) bc_freeq;
    TAILQ_HEAD(, blockif_elem) bc_pendq;
    TAILQ_HEAD(, blockif_elem) bc_busyq;
    struct blockif_elem bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

struct blockif_sig_elem {
    pthread_mutex_t         bse_mtx;
    pthread_cond_t          bse_cond;
    int                     bse_pending;
    struct blockif_sig_elem *bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

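/*
 * Grab a free request element, record the I/O described by 'breq', and mark
 * it BST_PEND, or BST_BLOCK if it must wait behind an earlier pending/busy
 * request whose range ends at this request's starting offset.  Returns
 * non-zero if a worker thread should be woken to service the new request.
 */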
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
    struct blockif_elem *be, *tbe;
    off_t off;
    int i;

    be = TAILQ_FIRST(&bc->bc_freeq);
    assert(be != NULL);
    assert(be->be_status == BST_FREE);
    TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
    be->be_req = breq;
    be->be_op = op;
    switch (op) {
    case BOP_READ:
    case BOP_WRITE:
#ifndef __FreeBSD__
    case BOP_WRITE_SYNC:
#endif
    case BOP_DELETE:
        off = breq->br_offset;
        for (i = 0; i < breq->br_iovcnt; i++)
            off += breq->br_iov[i].iov_len;
        break;
    default:
        off = OFF_MAX;
    }
    be->be_block = off;
    TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
        if (tbe->be_block == breq->br_offset)
            break;
    }
    if (tbe == NULL) {
        TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
            if (tbe->be_block == breq->br_offset)
                break;
        }
    }
    if (tbe == NULL)
        be->be_status = BST_PEND;
    else
        be->be_status = BST_BLOCK;
    TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
    return (be->be_status == BST_PEND);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
    struct blockif_elem *be;

    TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
        if (be->be_status == BST_PEND)
            break;
        assert(be->be_status == BST_BLOCK);
    }
    if (be == NULL)
        return (0);
    TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
    be->be_status = BST_BUSY;
    be->be_tid = t;
    TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
    *bep = be;
    return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
    struct blockif_elem *tbe;

    if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
        TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
    else
        TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
    TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
        if (tbe->be_req->br_offset == be->be_block)
            tbe->be_status = BST_PEND;
    }
    be->be_tid = 0;
    be->be_status = BST_FREE;
    be->be_req = NULL;
    TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

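/*
 * Execute a single dequeued request on the calling worker thread.  'buf' is
 * a MAXPHYS-sized bounce buffer (allocated only for GEOM backends) used to
 * stage multi-iovec reads and writes; single-iovec requests and non-GEOM
 * backends go directly through preadv()/pwritev().
 */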
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
    struct blockif_req *br;
#ifdef __FreeBSD__
    off_t arg[2];
#endif
    ssize_t clen, len, off, boff, voff;
    int i, err;

    br = be->be_req;
    if (br->br_iovcnt <= 1)
        buf = NULL;
    err = 0;
    switch (be->be_op) {
    case BOP_READ:
        if (buf == NULL) {
            if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
                br->br_offset)) < 0)
                err = errno;
            else
                br->br_resid -= len;
            break;
        }
        i = 0;
        off = voff = 0;
        while (br->br_resid > 0) {
            len = MIN(br->br_resid, MAXPHYS);
            if (pread(bc->bc_fd, buf, len, br->br_offset + off) < 0) {
                err = errno;
                break;
            }
            boff = 0;
            do {
                clen = MIN(len - boff,
                    br->br_iov[i].iov_len - voff);
                memcpy(br->br_iov[i].iov_base + voff,
                    buf + boff, clen);
                if (clen < br->br_iov[i].iov_len - voff)
                    voff += clen;
                else {
                    i++;
                    voff = 0;
                }
                boff += clen;
            } while (boff < len);
            off += len;
            br->br_resid -= len;
        }
        break;
    case BOP_WRITE:
        if (bc->bc_rdonly) {
            err = EROFS;
            break;
        }
        if (buf == NULL) {
            if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
                br->br_offset)) < 0)
                err = errno;
            else
                br->br_resid -= len;
            break;
        }
        i = 0;
        off = voff = 0;
        while (br->br_resid > 0) {
            len = MIN(br->br_resid, MAXPHYS);
            boff = 0;
            do {
                clen = MIN(len - boff,
                    br->br_iov[i].iov_len - voff);
                memcpy(buf + boff,
                    br->br_iov[i].iov_base + voff, clen);
                if (clen < br->br_iov[i].iov_len - voff)
                    voff += clen;
                else {
                    i++;
                    voff = 0;
                }
                boff += clen;
            } while (boff < len);
            if (pwrite(bc->bc_fd, buf, len, br->br_offset + off) < 0) {
                err = errno;
                break;
            }
            off += len;
            br->br_resid -= len;
        }
        break;
    case BOP_FLUSH:
#ifdef __FreeBSD__
        if (bc->bc_ischr) {
            if (ioctl(bc->bc_fd, DIOCGFLUSH))
                err = errno;
        } else if (fsync(bc->bc_fd))
            err = errno;
#else
        /*
         * This fsync() should be adequate to flush the cache of a file
         * or device.  In VFS, the VOP_SYNC operation is converted to
         * the appropriate ioctl in both sdev (for real devices) and
         * zfs (for zvols).
         */
        if (fsync(bc->bc_fd))
            err = errno;
#endif
        break;
    case BOP_DELETE:
        if (!bc->bc_candelete)
            err = EOPNOTSUPP;
        else if (bc->bc_rdonly)
            err = EROFS;
#ifdef __FreeBSD__
        else if (bc->bc_ischr) {
            arg[0] = br->br_offset;
            arg[1] = br->br_resid;
            if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
                err = errno;
            else
                br->br_resid = 0;
        } else {
            struct spacectl_range range;

            range.r_offset = br->br_offset;
            range.r_len = br->br_resid;

            while (range.r_len > 0) {
                if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
                    &range, 0, &range) != 0) {
                    err = errno;
                    break;
                }
            }
            if (err == 0)
                br->br_resid = 0;
        }
#else
        else if (bc->bc_ischr) {
            dkioc_free_list_t dfl = {
                .dfl_num_exts = 1,
                .dfl_offset = 0,
                .dfl_flags = 0,
                .dfl_exts = {
                    {
                        .dfle_start = br->br_offset,
                        .dfle_length = br->br_resid
                    }
                }
            };

            if (ioctl(bc->bc_fd, DKIOCFREE, &dfl))
                err = errno;
            else
                br->br_resid = 0;
        } else {
            struct flock fl = {
                .l_whence = 0,
                .l_type = F_WRLCK,
                .l_start = br->br_offset,
                .l_len = br->br_resid
            };

            if (fcntl(bc->bc_fd, F_FREESP, &fl))
                err = errno;
            else
                br->br_resid = 0;
        }
#endif
        break;
    default:
        err = EINVAL;
        break;
    }

    be->be_status = BST_DONE;

    (*br->br_callback)(br, err);
}

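/*
 * Worker thread body.  Each of the BLOCKIF_NUMTHR threads pulls pending
 * requests off the queue, processes them with blockif_proc(), and sleeps
 * on bc_cond when no work is available.  The loop exits once bc_closing is
 * set by blockif_close().
 */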
static void *
blockif_thr(void *arg)
{
    struct blockif_ctxt *bc;
    struct blockif_elem *be;
    pthread_t t;
    uint8_t *buf;

    bc = arg;
    if (bc->bc_isgeom)
        buf = malloc(MAXPHYS);
    else
        buf = NULL;
    t = pthread_self();

    pthread_mutex_lock(&bc->bc_mtx);
    for (;;) {
        while (blockif_dequeue(bc, t, &be)) {
            pthread_mutex_unlock(&bc->bc_mtx);
            blockif_proc(bc, be, buf);
            pthread_mutex_lock(&bc->bc_mtx);
            blockif_complete(bc, be);
        }
        /* Check ctxt status here to see if exit requested */
        if (bc->bc_closing)
            break;
        pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
    }
    pthread_mutex_unlock(&bc->bc_mtx);

    if (buf)
        free(buf);
    pthread_exit(NULL);
    return (NULL);
}

#ifdef __FreeBSD__
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
#else
static void
blockif_sigcont_handler(int signal)
#endif
{
    struct blockif_sig_elem *bse;

    for (;;) {
        /*
         * Process the entire list even if not intended for
         * this thread.
         */
        do {
            bse = blockif_bse_head;
            if (bse == NULL)
                return;
        } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
            (uintptr_t)bse,
            (uintptr_t)bse->bse_next));

        pthread_mutex_lock(&bse->bse_mtx);
        bse->bse_pending = 0;
        pthread_cond_signal(&bse->bse_cond);
        pthread_mutex_unlock(&bse->bse_mtx);
    }
}

static void
blockif_init(void)
{
#ifdef __FreeBSD__
    mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
    (void) signal(SIGCONT, SIG_IGN);
#else
    (void) sigset(SIGCONT, blockif_sigcont_handler);
#endif
}

int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
    char *cp, *path;

    if (opts == NULL)
        return (0);

    cp = strchr(opts, ',');
    if (cp == NULL) {
        set_config_value_node(nvl, "path", opts);
        return (0);
    }
    path = strndup(opts, cp - opts);
    set_config_value_node(nvl, "path", path);
    free(path);
    return (pci_parse_legacy_config(nvl, cp + 1));
}

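/*
 * Open the backing file or device named by the "path" config value,
 * determine its size, logical/physical sector sizes and delete (TRIM)
 * capability, apply the Capsicum sandbox limits where enabled, and start
 * the worker threads.  Returns NULL on failure.
 */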
struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
    char tname[MAXCOMLEN + 1];
#ifdef __FreeBSD__
    char name[MAXPATHLEN];
#endif
    const char *path, *pssval, *ssval;
    char *cp;
    struct blockif_ctxt *bc;
    struct stat sbuf;
#ifdef __FreeBSD__
    struct diocgattr_arg arg;
#else
    enum blockif_wce wce = WCE_NONE;
#endif
    off_t size, psectsz, psectoff;
    int extra, fd, i, sectsz;
    int ro, candelete, geom, ssopt, pssopt;
    int nodelete;

#ifndef WITHOUT_CAPSICUM
    cap_rights_t rights;
    cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
#endif

    pthread_once(&blockif_once, blockif_init);

    fd = -1;
    extra = 0;
    ssopt = 0;
#ifndef __FreeBSD__
    pssopt = 0;
#endif
    ro = 0;
    nodelete = 0;

    if (get_config_bool_node_default(nvl, "nocache", false))
        extra |= O_DIRECT;
    if (get_config_bool_node_default(nvl, "nodelete", false))
        nodelete = 1;
    if (get_config_bool_node_default(nvl, "sync", false) ||
        get_config_bool_node_default(nvl, "direct", false))
        extra |= O_SYNC;
    if (get_config_bool_node_default(nvl, "ro", false))
        ro = 1;
    ssval = get_config_value_node(nvl, "sectorsize");
    if (ssval != NULL) {
        ssopt = strtol(ssval, &cp, 10);
        if (cp == ssval) {
            EPRINTLN("Invalid sector size \"%s\"", ssval);
            goto err;
        }
        if (*cp == '\0') {
            pssopt = ssopt;
        } else if (*cp == '/') {
            pssval = cp + 1;
            pssopt = strtol(pssval, &cp, 10);
            if (cp == pssval || *cp != '\0') {
                EPRINTLN("Invalid sector size \"%s\"", ssval);
                goto err;
            }
        } else {
            EPRINTLN("Invalid sector size \"%s\"", ssval);
            goto err;
        }
    }

    path = get_config_value_node(nvl, "path");
    if (path == NULL) {
        EPRINTLN("Missing \"path\" for block device.");
        goto err;
    }

    fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
    if (fd < 0 && !ro) {
        /* The r/w open failed; retry as read-only */
        fd = open(path, O_RDONLY | extra);
        ro = 1;
    }

    if (fd < 0) {
        warn("Could not open backing file: %s", path);
        goto err;
    }

    if (fstat(fd, &sbuf) < 0) {
        warn("Could not stat backing file %s", path);
        goto err;
    }

#ifndef WITHOUT_CAPSICUM
    cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
        CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
    if (ro)
        cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

    if (caph_rights_limit(fd, &rights) == -1)
        errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

    /*
     * Deal with raw devices
     */
    size = sbuf.st_size;
    sectsz = DEV_BSIZE;
    psectsz = psectoff = 0;
    candelete = geom = 0;
#ifdef __FreeBSD__
    if (S_ISCHR(sbuf.st_mode)) {
        if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
            ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
            perror("Could not fetch dev blk/sector size");
            goto err;
        }
        assert(size != 0);
        assert(sectsz != 0);
        if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
            ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
        strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
        arg.len = sizeof(arg.value.i);
        if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
            candelete = arg.value.i;
        if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
            geom = 1;
    } else {
        psectsz = sbuf.st_blksize;
        /* Avoid fallback implementation */
        candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
    }
#else
    psectsz = sbuf.st_blksize;
    if (S_ISCHR(sbuf.st_mode)) {
        struct dk_minfo_ext dkmext;
        int wce_val;

        /* Look for a more accurate physical block/media size */
        if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) {
            psectsz = dkmext.dki_pbsize;
            size = dkmext.dki_lbsize * dkmext.dki_capacity;
        }
        /* See if a configurable write cache is present and working */
        if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) {
            /*
             * If WCE is already active, disable it until the
             * specific device driver calls for its return.  If it
             * is not active, toggle it on and off to verify that
             * such actions are possible.
             */
            if (wce_val != 0) {
                wce_val = 0;
                /*
                 * Inability to disable the cache is a threat
                 * to data durability.
                 */
                assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0);
                wce = WCE_IOCTL;
            } else {
                int r1, r2;

                wce_val = 1;
                r1 = ioctl(fd, DKIOCSETWCE, &wce_val);
                wce_val = 0;
                r2 = ioctl(fd, DKIOCSETWCE, &wce_val);

                if (r1 == 0 && r2 == 0) {
                    wce = WCE_IOCTL;
                } else {
                    /*
                     * If the cache toggle was not
                     * successful, ensure that the cache
                     * was not left enabled.
                     */
                    assert(r1 != 0);
                }
            }
        }

        if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete))
            candelete = 0;

    } else {
        int flags;

        if ((flags = fcntl(fd, F_GETFL)) >= 0) {
            flags |= O_DSYNC;
            if (fcntl(fd, F_SETFL, flags) != -1) {
                wce = WCE_FCNTL;
            }
        }

        /*
         * We don't have a way to discover if a file supports the
         * FREESP fcntl cmd (other than trying it).  However,
         * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd.
         * NFSv4 also forwards the FREESP request
         * to the server, so we always enable it for file based
         * volumes.  Anyone trying to run volumes on an unsupported
         * configuration is on their own, and should be prepared
         * for the requests to fail.
         */
        if (nodelete == 0)
            candelete = 1;
    }
#endif

#ifndef WITHOUT_CAPSICUM
    if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
        errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

    if (ssopt != 0) {
        if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
            ssopt > pssopt) {
            EPRINTLN("Invalid sector size %d/%d",
                ssopt, pssopt);
            goto err;
        }

        /*
         * Some backend drivers (e.g. cd0, ada0) require that the I/O
         * size be a multiple of the device's sector size.
         *
         * Validate that the emulated sector size complies with this
         * requirement.
         */
        if (S_ISCHR(sbuf.st_mode)) {
            if (ssopt < sectsz || (ssopt % sectsz) != 0) {
                EPRINTLN("Sector size %d incompatible "
                    "with underlying device sector size %d",
                    ssopt, sectsz);
                goto err;
            }
        }

        sectsz = ssopt;
        psectsz = pssopt;
        psectoff = 0;
    }

    bc = calloc(1, sizeof(struct blockif_ctxt));
    if (bc == NULL) {
        perror("calloc");
        goto err;
    }

    bc->bc_magic = BLOCKIF_SIG;
    bc->bc_fd = fd;
    bc->bc_ischr = S_ISCHR(sbuf.st_mode);
    bc->bc_isgeom = geom;
    bc->bc_candelete = candelete;
#ifndef __FreeBSD__
    bc->bc_wce = wce;
#endif
    bc->bc_rdonly = ro;
    bc->bc_size = size;
    bc->bc_sectsz = sectsz;
    bc->bc_psectsz = psectsz;
    bc->bc_psectoff = psectoff;
    pthread_mutex_init(&bc->bc_mtx, NULL);
    pthread_cond_init(&bc->bc_cond, NULL);
    TAILQ_INIT(&bc->bc_freeq);
    TAILQ_INIT(&bc->bc_pendq);
    TAILQ_INIT(&bc->bc_busyq);
    for (i = 0; i < BLOCKIF_MAXREQ; i++) {
        bc->bc_reqs[i].be_status = BST_FREE;
        TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
    }

    for (i = 0; i < BLOCKIF_NUMTHR; i++) {
        pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
        snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
        pthread_set_name_np(bc->bc_btid[i], tname);
    }

    return (bc);
err:
    if (fd >= 0)
        close(fd);
    return (NULL);
}

static void
blockif_resized(int fd, enum ev_type type, void *arg)
{
    struct blockif_ctxt *bc;
    struct stat sb;
    off_t mediasize;

    if (fstat(fd, &sb) != 0)
        return;

#ifdef __FreeBSD__
    if (S_ISCHR(sb.st_mode)) {
        if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
            EPRINTLN("blockif_resized: get mediasize failed: %s",
                strerror(errno));
            return;
        }
    } else
        mediasize = sb.st_size;
#else
    mediasize = sb.st_size;
    if (S_ISCHR(sb.st_mode)) {
        struct dk_minfo dkm;

        if (ioctl(fd, DKIOCGMEDIAINFO, &dkm) == 0)
            mediasize = dkm.dki_lbsize * dkm.dki_capacity;
    }
#endif

    bc = arg;
    pthread_mutex_lock(&bc->bc_mtx);
    if (mediasize != bc->bc_size) {
        bc->bc_size = mediasize;
        bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
    }
    pthread_mutex_unlock(&bc->bc_mtx);
}

int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
    struct stat sb;
    int err;
#ifndef __FreeBSD__
    err = 0;
#endif

    if (cb == NULL)
        return (EINVAL);

    pthread_mutex_lock(&bc->bc_mtx);
    if (bc->bc_resize_cb != NULL) {
        err = EBUSY;
        goto out;
    }

    assert(bc->bc_closing == 0);

    if (fstat(bc->bc_fd, &sb) != 0) {
        err = errno;
        goto out;
    }

    bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
        EVFF_ATTRIB, blockif_resized, bc);
    if (bc->bc_resize_event == NULL) {
        err = ENXIO;
        goto out;
    }

    bc->bc_resize_cb = cb;
    bc->bc_resize_cb_arg = cb_arg;
out:
    pthread_mutex_unlock(&bc->bc_mtx);

    return (err);
}

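/*
 * Common entry point for the blockif_read/write/flush/delete wrappers:
 * queue the request if a free element is available, waking a worker when
 * the request is immediately runnable, or fail with E2BIG if the caller
 * has exceeded the advertised queue depth.
 */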
static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
    int err;

    err = 0;

    pthread_mutex_lock(&bc->bc_mtx);
    if (!TAILQ_EMPTY(&bc->bc_freeq)) {
        /*
         * Enqueue and inform the block i/o thread
         * that there is work available
         */
        if (blockif_enqueue(bc, breq, op))
            pthread_cond_signal(&bc->bc_cond);
    } else {
        /*
         * Callers are not allowed to enqueue more than
         * the specified blockif queue limit.  Return an
         * error to indicate that the queue length has been
         * exceeded.
         */
        err = E2BIG;
    }
    pthread_mutex_unlock(&bc->bc_mtx);

    return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

    assert(bc->bc_magic == BLOCKIF_SIG);
    return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

    assert(bc->bc_magic == BLOCKIF_SIG);
    return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

    assert(bc->bc_magic == BLOCKIF_SIG);
    return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{

    assert(bc->bc_magic == BLOCKIF_SIG);
    return (blockif_request(bc, breq, BOP_DELETE));
}

int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
    struct blockif_elem *be;

    assert(bc->bc_magic == BLOCKIF_SIG);

    pthread_mutex_lock(&bc->bc_mtx);
    /*
     * Check pending requests.
     */
    TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
        if (be->be_req == breq)
            break;
    }
    if (be != NULL) {
        /*
         * Found it.
         */
        blockif_complete(bc, be);
        pthread_mutex_unlock(&bc->bc_mtx);

        return (0);
    }

    /*
     * Check in-flight requests.
     */
    TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
        if (be->be_req == breq)
            break;
    }
    if (be == NULL) {
        /*
         * Didn't find it.
         */
        pthread_mutex_unlock(&bc->bc_mtx);
        return (EINVAL);
    }

    /*
     * Interrupt the processing thread to force it to return
     * prematurely via its normal callback path.
     */
    while (be->be_status == BST_BUSY) {
        struct blockif_sig_elem bse, *old_head;

        pthread_mutex_init(&bse.bse_mtx, NULL);
        pthread_cond_init(&bse.bse_cond, NULL);

        bse.bse_pending = 1;

        do {
            old_head = blockif_bse_head;
            bse.bse_next = old_head;
        } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
            (uintptr_t)old_head,
            (uintptr_t)&bse));

        pthread_kill(be->be_tid, SIGCONT);

        pthread_mutex_lock(&bse.bse_mtx);
        while (bse.bse_pending)
            pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
        pthread_mutex_unlock(&bse.bse_mtx);
    }

    pthread_mutex_unlock(&bc->bc_mtx);

    /*
     * The processing thread has been interrupted.  Since it's not
     * clear if the callback has been invoked yet, return EBUSY.
     */
    return (EBUSY);
}

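/*
 * Tear down a blockif context: flag the worker threads to exit, wake and
 * join them, disable any registered resize event, and release the file
 * descriptor and memory.
 */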
int
blockif_close(struct blockif_ctxt *bc)
{
    void *jval;
    int i;

    assert(bc->bc_magic == BLOCKIF_SIG);

    /*
     * Stop the block i/o thread
     */
    pthread_mutex_lock(&bc->bc_mtx);
    bc->bc_closing = 1;
    if (bc->bc_resize_event != NULL)
        mevent_disable(bc->bc_resize_event);
    pthread_mutex_unlock(&bc->bc_mtx);
    pthread_cond_broadcast(&bc->bc_cond);
    for (i = 0; i < BLOCKIF_NUMTHR; i++)
        pthread_join(bc->bc_btid[i], &jval);

    /* XXX Cancel queued i/o's ??? */

    /*
     * Release resources
     */
    bc->bc_magic = 0;
    close(bc->bc_fd);
    free(bc);

    return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
    off_t sectors;      /* total sectors of the block dev */
    off_t hcyl;         /* cylinders times heads */
    uint16_t secpt;     /* sectors per track */
    uint8_t heads;

    assert(bc->bc_magic == BLOCKIF_SIG);

    sectors = bc->bc_size / bc->bc_sectsz;

    /* Clamp the size to the largest possible with CHS */
    if (sectors > 65535UL*16*255)
        sectors = 65535UL*16*255;

    if (sectors >= 65536UL*16*63) {
        secpt = 255;
        heads = 16;
        hcyl = sectors / secpt;
    } else {
        secpt = 17;
        hcyl = sectors / secpt;
        heads = (hcyl + 1023) / 1024;

        if (heads < 4)
            heads = 4;

        if (hcyl >= (heads * 1024) || heads > 16) {
            secpt = 31;
            heads = 16;
            hcyl = sectors / secpt;
        }
        if (hcyl >= (heads * 1024)) {
            secpt = 63;
            heads = 16;
            hcyl = sectors / secpt;
        }
    }

    *c = hcyl / heads;
    *h = heads;
    *s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{

    assert(bc->bc_magic == BLOCKIF_SIG);
    return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{

    assert(bc->bc_magic == BLOCKIF_SIG);
    return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{

    assert(bc->bc_magic == BLOCKIF_SIG);
    *size = bc->bc_psectsz;
    *off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{

    assert(bc->bc_magic == BLOCKIF_SIG);
    return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{

    assert(bc->bc_magic == BLOCKIF_SIG);
    return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{

    assert(bc->bc_magic == BLOCKIF_SIG);
    return (bc->bc_candelete);
}

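/*
 * illumos only: enable or disable the backing store's write cache using
 * whichever mechanism was discovered at open time (DKIOCSETWCE for
 * character devices, O_DSYNC toggling via fcntl() for plain files).  After
 * a successful disable, any lingering dirty data is fsync()ed out.
 */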
#ifndef __FreeBSD__
int
blockif_set_wce(struct blockif_ctxt *bc, int wc_enable)
{
    int res = 0, flags;
    int clean_val = (wc_enable != 0) ? 1 : 0;

    (void) pthread_mutex_lock(&bc->bc_mtx);
    switch (bc->bc_wce) {
    case WCE_IOCTL:
        res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val);
        break;
    case WCE_FCNTL:
        if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) {
            if (wc_enable == 0) {
                flags |= O_DSYNC;
            } else {
                flags &= ~O_DSYNC;
            }
            if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) {
                res = -1;
            }
        } else {
            res = -1;
        }
        break;
    default:
        break;
    }

    /*
     * After a successful disable of the write cache, ensure that any
     * lingering data in the cache is synced out.
     */
    if (res == 0 && wc_enable == 0) {
        res = fsync(bc->bc_fd);
    }
    (void) pthread_mutex_unlock(&bc->bc_mtx);

    return (res);
}
#endif /* __FreeBSD__ */