1 /* 2 * Block driver for RAW files (posix) 3 * 4 * Copyright (c) 2006 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 
23 */ 24 #include "qemu/osdep.h" 25 #include "qapi/error.h" 26 #include "qemu/cutils.h" 27 #include "qemu/error-report.h" 28 #include "block/block_int.h" 29 #include "qemu/module.h" 30 #include "trace.h" 31 #include "block/thread-pool.h" 32 #include "qemu/iov.h" 33 #include "block/raw-aio.h" 34 #include "qapi/qmp/qdict.h" 35 #include "qapi/qmp/qstring.h" 36 37 #include "scsi/pr-manager.h" 38 #include "scsi/constants.h" 39 40 #if defined(__APPLE__) && (__MACH__) 41 #include <paths.h> 42 #include <sys/param.h> 43 #include <IOKit/IOKitLib.h> 44 #include <IOKit/IOBSD.h> 45 #include <IOKit/storage/IOMediaBSDClient.h> 46 #include <IOKit/storage/IOMedia.h> 47 #include <IOKit/storage/IOCDMedia.h> 48 //#include <IOKit/storage/IOCDTypes.h> 49 #include <IOKit/storage/IODVDMedia.h> 50 #include <CoreFoundation/CoreFoundation.h> 51 #endif 52 53 #ifdef __sun__ 54 #define _POSIX_PTHREAD_SEMANTICS 1 55 #include <sys/dkio.h> 56 #endif 57 #ifdef __linux__ 58 #include <sys/ioctl.h> 59 #include <sys/param.h> 60 #include <linux/cdrom.h> 61 #include <linux/fd.h> 62 #include <linux/fs.h> 63 #include <linux/hdreg.h> 64 #include <scsi/sg.h> 65 #ifdef __s390__ 66 #include <asm/dasd.h> 67 #endif 68 #ifndef FS_NOCOW_FL 69 #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ 70 #endif 71 #endif 72 #if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) 73 #include <linux/falloc.h> 74 #endif 75 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 76 #include <sys/disk.h> 77 #include <sys/cdio.h> 78 #endif 79 80 #ifdef __OpenBSD__ 81 #include <sys/ioctl.h> 82 #include <sys/disklabel.h> 83 #include <sys/dkio.h> 84 #endif 85 86 #ifdef __NetBSD__ 87 #include <sys/ioctl.h> 88 #include <sys/disklabel.h> 89 #include <sys/dkio.h> 90 #include <sys/disk.h> 91 #endif 92 93 #ifdef __DragonFly__ 94 #include <sys/ioctl.h> 95 #include <sys/diskslice.h> 96 #endif 97 98 #ifdef CONFIG_XFS 99 #include <xfs/xfs.h> 100 #endif 101 102 //#define DEBUG_BLOCK 103 104 #ifdef DEBUG_BLOCK 105 # 
define DEBUG_BLOCK_PRINT 1 106 #else 107 # define DEBUG_BLOCK_PRINT 0 108 #endif 109 #define DPRINTF(fmt, ...) \ 110 do { \ 111 if (DEBUG_BLOCK_PRINT) { \ 112 printf(fmt, ## __VA_ARGS__); \ 113 } \ 114 } while (0) 115 116 /* OS X does not have O_DSYNC */ 117 #ifndef O_DSYNC 118 #ifdef O_SYNC 119 #define O_DSYNC O_SYNC 120 #elif defined(O_FSYNC) 121 #define O_DSYNC O_FSYNC 122 #endif 123 #endif 124 125 /* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ 126 #ifndef O_DIRECT 127 #define O_DIRECT O_DSYNC 128 #endif 129 130 #define FTYPE_FILE 0 131 #define FTYPE_CD 1 132 133 #define MAX_BLOCKSIZE 4096 134 135 /* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes, 136 * leaving a few more bytes for its future use. */ 137 #define RAW_LOCK_PERM_BASE 100 138 #define RAW_LOCK_SHARED_BASE 200 139 140 typedef struct BDRVRawState { 141 int fd; 142 int lock_fd; 143 bool use_lock; 144 int type; 145 int open_flags; 146 size_t buf_align; 147 148 /* The current permissions. 
*/ 149 uint64_t perm; 150 uint64_t shared_perm; 151 152 #ifdef CONFIG_XFS 153 bool is_xfs:1; 154 #endif 155 bool has_discard:1; 156 bool has_write_zeroes:1; 157 bool discard_zeroes:1; 158 bool use_linux_aio:1; 159 bool page_cache_inconsistent:1; 160 bool has_fallocate; 161 bool needs_alignment; 162 163 PRManager *pr_mgr; 164 } BDRVRawState; 165 166 typedef struct BDRVRawReopenState { 167 int fd; 168 int open_flags; 169 } BDRVRawReopenState; 170 171 static int fd_open(BlockDriverState *bs); 172 static int64_t raw_getlength(BlockDriverState *bs); 173 174 typedef struct RawPosixAIOData { 175 BlockDriverState *bs; 176 int aio_fildes; 177 union { 178 struct iovec *aio_iov; 179 void *aio_ioctl_buf; 180 }; 181 int aio_niov; 182 uint64_t aio_nbytes; 183 #define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ 184 off_t aio_offset; 185 int aio_type; 186 } RawPosixAIOData; 187 188 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 189 static int cdrom_reopen(BlockDriverState *bs); 190 #endif 191 192 #if defined(__NetBSD__) 193 static int raw_normalize_devicepath(const char **filename) 194 { 195 static char namebuf[PATH_MAX]; 196 const char *dp, *fname; 197 struct stat sb; 198 199 fname = *filename; 200 dp = strrchr(fname, '/'); 201 if (lstat(fname, &sb) < 0) { 202 fprintf(stderr, "%s: stat failed: %s\n", 203 fname, strerror(errno)); 204 return -errno; 205 } 206 207 if (!S_ISBLK(sb.st_mode)) { 208 return 0; 209 } 210 211 if (dp == NULL) { 212 snprintf(namebuf, PATH_MAX, "r%s", fname); 213 } else { 214 snprintf(namebuf, PATH_MAX, "%.*s/r%s", 215 (int)(dp - fname), fname, dp + 1); 216 } 217 fprintf(stderr, "%s is a block device", fname); 218 *filename = namebuf; 219 fprintf(stderr, ", using %s\n", *filename); 220 221 return 0; 222 } 223 #else 224 static int raw_normalize_devicepath(const char **filename) 225 { 226 return 0; 227 } 228 #endif 229 230 /* 231 * Get logical block size via ioctl. On success store it in @sector_size_p. 
232 */ 233 static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) 234 { 235 unsigned int sector_size; 236 bool success = false; 237 int i; 238 239 errno = ENOTSUP; 240 static const unsigned long ioctl_list[] = { 241 #ifdef BLKSSZGET 242 BLKSSZGET, 243 #endif 244 #ifdef DKIOCGETBLOCKSIZE 245 DKIOCGETBLOCKSIZE, 246 #endif 247 #ifdef DIOCGSECTORSIZE 248 DIOCGSECTORSIZE, 249 #endif 250 }; 251 252 /* Try a few ioctls to get the right size */ 253 for (i = 0; i < (int)ARRAY_SIZE(ioctl_list); i++) { 254 if (ioctl(fd, ioctl_list[i], §or_size) >= 0) { 255 *sector_size_p = sector_size; 256 success = true; 257 } 258 } 259 260 return success ? 0 : -errno; 261 } 262 263 /** 264 * Get physical block size of @fd. 265 * On success, store it in @blk_size and return 0. 266 * On failure, return -errno. 267 */ 268 static int probe_physical_blocksize(int fd, unsigned int *blk_size) 269 { 270 #ifdef BLKPBSZGET 271 if (ioctl(fd, BLKPBSZGET, blk_size) < 0) { 272 return -errno; 273 } 274 return 0; 275 #else 276 return -ENOTSUP; 277 #endif 278 } 279 280 /* Check if read is allowed with given memory buffer and length. 281 * 282 * This function is used to check O_DIRECT memory buffer and request alignment. 283 */ 284 static bool raw_is_io_aligned(int fd, void *buf, size_t len) 285 { 286 ssize_t ret = pread(fd, buf, len, 0); 287 288 if (ret >= 0) { 289 return true; 290 } 291 292 #ifdef __linux__ 293 /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads. Ignore 294 * other errors (e.g. real I/O error), which could happen on a failed 295 * drive, since we only care about probing alignment. 296 */ 297 if (errno != EINVAL) { 298 return true; 299 } 300 #endif 301 302 return false; 303 } 304 305 static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) 306 { 307 BDRVRawState *s = bs->opaque; 308 char *buf; 309 size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); 310 311 /* For SCSI generic devices the alignment is not really used. 
312 With buffered I/O, we don't have any restrictions. */ 313 if (bdrv_is_sg(bs) || !s->needs_alignment) { 314 bs->bl.request_alignment = 1; 315 s->buf_align = 1; 316 return; 317 } 318 319 bs->bl.request_alignment = 0; 320 s->buf_align = 0; 321 /* Let's try to use the logical blocksize for the alignment. */ 322 if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) { 323 bs->bl.request_alignment = 0; 324 } 325 #ifdef CONFIG_XFS 326 if (s->is_xfs) { 327 struct dioattr da; 328 if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { 329 bs->bl.request_alignment = da.d_miniosz; 330 /* The kernel returns wrong information for d_mem */ 331 /* s->buf_align = da.d_mem; */ 332 } 333 } 334 #endif 335 336 /* If we could not get the sizes so far, we can only guess them */ 337 if (!s->buf_align) { 338 size_t align; 339 buf = qemu_memalign(max_align, 2 * max_align); 340 for (align = 512; align <= max_align; align <<= 1) { 341 if (raw_is_io_aligned(fd, buf + align, max_align)) { 342 s->buf_align = align; 343 break; 344 } 345 } 346 qemu_vfree(buf); 347 } 348 349 if (!bs->bl.request_alignment) { 350 size_t align; 351 buf = qemu_memalign(s->buf_align, max_align); 352 for (align = 512; align <= max_align; align <<= 1) { 353 if (raw_is_io_aligned(fd, buf, align)) { 354 bs->bl.request_alignment = align; 355 break; 356 } 357 } 358 qemu_vfree(buf); 359 } 360 361 if (!s->buf_align || !bs->bl.request_alignment) { 362 error_setg(errp, "Could not find working O_DIRECT alignment"); 363 error_append_hint(errp, "Try cache.direct=off\n"); 364 } 365 } 366 367 static void raw_parse_flags(int bdrv_flags, int *open_flags) 368 { 369 assert(open_flags != NULL); 370 371 *open_flags |= O_BINARY; 372 *open_flags &= ~O_ACCMODE; 373 if (bdrv_flags & BDRV_O_RDWR) { 374 *open_flags |= O_RDWR; 375 } else { 376 *open_flags |= O_RDONLY; 377 } 378 379 /* Use O_DSYNC for write-through caching, no flags for write-back caching, 380 * and O_DIRECT for no caching. 
*/ 381 if ((bdrv_flags & BDRV_O_NOCACHE)) { 382 *open_flags |= O_DIRECT; 383 } 384 } 385 386 static void raw_parse_filename(const char *filename, QDict *options, 387 Error **errp) 388 { 389 bdrv_parse_filename_strip_prefix(filename, "file:", options); 390 } 391 392 static QemuOptsList raw_runtime_opts = { 393 .name = "raw", 394 .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), 395 .desc = { 396 { 397 .name = "filename", 398 .type = QEMU_OPT_STRING, 399 .help = "File name of the image", 400 }, 401 { 402 .name = "aio", 403 .type = QEMU_OPT_STRING, 404 .help = "host AIO implementation (threads, native)", 405 }, 406 { 407 .name = "locking", 408 .type = QEMU_OPT_STRING, 409 .help = "file locking mode (on/off/auto, default: auto)", 410 }, 411 { 412 .name = "pr-manager", 413 .type = QEMU_OPT_STRING, 414 .help = "id of persistent reservation manager object (default: none)", 415 }, 416 { /* end of list */ } 417 }, 418 }; 419 420 static int raw_open_common(BlockDriverState *bs, QDict *options, 421 int bdrv_flags, int open_flags, Error **errp) 422 { 423 BDRVRawState *s = bs->opaque; 424 QemuOpts *opts; 425 Error *local_err = NULL; 426 const char *filename = NULL; 427 const char *str; 428 BlockdevAioOptions aio, aio_default; 429 int fd, ret; 430 struct stat st; 431 OnOffAuto locking; 432 433 opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); 434 qemu_opts_absorb_qdict(opts, options, &local_err); 435 if (local_err) { 436 error_propagate(errp, local_err); 437 ret = -EINVAL; 438 goto fail; 439 } 440 441 filename = qemu_opt_get(opts, "filename"); 442 443 ret = raw_normalize_devicepath(&filename); 444 if (ret != 0) { 445 error_setg_errno(errp, -ret, "Could not normalize device path"); 446 goto fail; 447 } 448 449 aio_default = (bdrv_flags & BDRV_O_NATIVE_AIO) 450 ? 
BLOCKDEV_AIO_OPTIONS_NATIVE 451 : BLOCKDEV_AIO_OPTIONS_THREADS; 452 aio = qapi_enum_parse(&BlockdevAioOptions_lookup, 453 qemu_opt_get(opts, "aio"), 454 aio_default, &local_err); 455 if (local_err) { 456 error_propagate(errp, local_err); 457 ret = -EINVAL; 458 goto fail; 459 } 460 s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); 461 462 locking = qapi_enum_parse(&OnOffAuto_lookup, 463 qemu_opt_get(opts, "locking"), 464 ON_OFF_AUTO_AUTO, &local_err); 465 if (local_err) { 466 error_propagate(errp, local_err); 467 ret = -EINVAL; 468 goto fail; 469 } 470 switch (locking) { 471 case ON_OFF_AUTO_ON: 472 s->use_lock = true; 473 if (!qemu_has_ofd_lock()) { 474 fprintf(stderr, 475 "File lock requested but OFD locking syscall is " 476 "unavailable, falling back to POSIX file locks.\n" 477 "Due to the implementation, locks can be lost " 478 "unexpectedly.\n"); 479 } 480 break; 481 case ON_OFF_AUTO_OFF: 482 s->use_lock = false; 483 break; 484 case ON_OFF_AUTO_AUTO: 485 s->use_lock = qemu_has_ofd_lock(); 486 break; 487 default: 488 abort(); 489 } 490 491 str = qemu_opt_get(opts, "pr-manager"); 492 if (str) { 493 s->pr_mgr = pr_manager_lookup(str, &local_err); 494 if (local_err) { 495 error_propagate(errp, local_err); 496 ret = -EINVAL; 497 goto fail; 498 } 499 } 500 501 s->open_flags = open_flags; 502 raw_parse_flags(bdrv_flags, &s->open_flags); 503 504 s->fd = -1; 505 fd = qemu_open(filename, s->open_flags, 0644); 506 if (fd < 0) { 507 ret = -errno; 508 error_setg_errno(errp, errno, "Could not open '%s'", filename); 509 if (ret == -EROFS) { 510 ret = -EACCES; 511 } 512 goto fail; 513 } 514 s->fd = fd; 515 516 s->lock_fd = -1; 517 if (s->use_lock) { 518 fd = qemu_open(filename, s->open_flags); 519 if (fd < 0) { 520 ret = -errno; 521 error_setg_errno(errp, errno, "Could not open '%s' for locking", 522 filename); 523 qemu_close(s->fd); 524 goto fail; 525 } 526 s->lock_fd = fd; 527 } 528 s->perm = 0; 529 s->shared_perm = BLK_PERM_ALL; 530 531 #ifdef CONFIG_LINUX_AIO 532 /* 
Currently Linux does AIO only for files opened with O_DIRECT */ 533 if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) { 534 error_setg(errp, "aio=native was specified, but it requires " 535 "cache.direct=on, which was not specified."); 536 ret = -EINVAL; 537 goto fail; 538 } 539 #else 540 if (s->use_linux_aio) { 541 error_setg(errp, "aio=native was specified, but is not supported " 542 "in this build."); 543 ret = -EINVAL; 544 goto fail; 545 } 546 #endif /* !defined(CONFIG_LINUX_AIO) */ 547 548 s->has_discard = true; 549 s->has_write_zeroes = true; 550 bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP; 551 if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { 552 s->needs_alignment = true; 553 } 554 555 if (fstat(s->fd, &st) < 0) { 556 ret = -errno; 557 error_setg_errno(errp, errno, "Could not stat file"); 558 goto fail; 559 } 560 if (S_ISREG(st.st_mode)) { 561 s->discard_zeroes = true; 562 s->has_fallocate = true; 563 } 564 if (S_ISBLK(st.st_mode)) { 565 #ifdef BLKDISCARDZEROES 566 unsigned int arg; 567 if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { 568 s->discard_zeroes = true; 569 } 570 #endif 571 #ifdef __linux__ 572 /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do 573 * not rely on the contents of discarded blocks unless using O_DIRECT. 574 * Same for BLKZEROOUT. 575 */ 576 if (!(bs->open_flags & BDRV_O_NOCACHE)) { 577 s->discard_zeroes = false; 578 s->has_write_zeroes = false; 579 } 580 #endif 581 } 582 #ifdef __FreeBSD__ 583 if (S_ISCHR(st.st_mode)) { 584 /* 585 * The file is a char device (disk), which on FreeBSD isn't behind 586 * a pager, so force all requests to be aligned. This is needed 587 * so QEMU makes sure all IO operations on the device are aligned 588 * to sector size, or else FreeBSD will reject them with EINVAL. 
589 */ 590 s->needs_alignment = true; 591 } 592 #endif 593 594 #ifdef CONFIG_XFS 595 if (platform_test_xfs_fd(s->fd)) { 596 s->is_xfs = true; 597 } 598 #endif 599 600 ret = 0; 601 fail: 602 if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { 603 unlink(filename); 604 } 605 qemu_opts_del(opts); 606 return ret; 607 } 608 609 static int raw_open(BlockDriverState *bs, QDict *options, int flags, 610 Error **errp) 611 { 612 BDRVRawState *s = bs->opaque; 613 614 s->type = FTYPE_FILE; 615 return raw_open_common(bs, options, flags, 0, errp); 616 } 617 618 typedef enum { 619 RAW_PL_PREPARE, 620 RAW_PL_COMMIT, 621 RAW_PL_ABORT, 622 } RawPermLockOp; 623 624 #define PERM_FOREACH(i) \ 625 for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++) 626 627 /* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the 628 * file; if @unlock == true, also unlock the unneeded bytes. 629 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared. 630 */ 631 static int raw_apply_lock_bytes(BDRVRawState *s, 632 uint64_t perm_lock_bits, 633 uint64_t shared_perm_lock_bits, 634 bool unlock, Error **errp) 635 { 636 int ret; 637 int i; 638 639 PERM_FOREACH(i) { 640 int off = RAW_LOCK_PERM_BASE + i; 641 if (perm_lock_bits & (1ULL << i)) { 642 ret = qemu_lock_fd(s->lock_fd, off, 1, false); 643 if (ret) { 644 error_setg(errp, "Failed to lock byte %d", off); 645 return ret; 646 } 647 } else if (unlock) { 648 ret = qemu_unlock_fd(s->lock_fd, off, 1); 649 if (ret) { 650 error_setg(errp, "Failed to unlock byte %d", off); 651 return ret; 652 } 653 } 654 } 655 PERM_FOREACH(i) { 656 int off = RAW_LOCK_SHARED_BASE + i; 657 if (shared_perm_lock_bits & (1ULL << i)) { 658 ret = qemu_lock_fd(s->lock_fd, off, 1, false); 659 if (ret) { 660 error_setg(errp, "Failed to lock byte %d", off); 661 return ret; 662 } 663 } else if (unlock) { 664 ret = qemu_unlock_fd(s->lock_fd, off, 1); 665 if (ret) { 666 error_setg(errp, "Failed to unlock byte %d", off); 667 return ret; 668 } 669 } 670 } 671 
return 0; 672 } 673 674 /* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */ 675 static int raw_check_lock_bytes(BDRVRawState *s, 676 uint64_t perm, uint64_t shared_perm, 677 Error **errp) 678 { 679 int ret; 680 int i; 681 682 PERM_FOREACH(i) { 683 int off = RAW_LOCK_SHARED_BASE + i; 684 uint64_t p = 1ULL << i; 685 if (perm & p) { 686 ret = qemu_lock_fd_test(s->lock_fd, off, 1, true); 687 if (ret) { 688 char *perm_name = bdrv_perm_names(p); 689 error_setg(errp, 690 "Failed to get \"%s\" lock", 691 perm_name); 692 g_free(perm_name); 693 error_append_hint(errp, 694 "Is another process using the image?\n"); 695 return ret; 696 } 697 } 698 } 699 PERM_FOREACH(i) { 700 int off = RAW_LOCK_PERM_BASE + i; 701 uint64_t p = 1ULL << i; 702 if (!(shared_perm & p)) { 703 ret = qemu_lock_fd_test(s->lock_fd, off, 1, true); 704 if (ret) { 705 char *perm_name = bdrv_perm_names(p); 706 error_setg(errp, 707 "Failed to get shared \"%s\" lock", 708 perm_name); 709 g_free(perm_name); 710 error_append_hint(errp, 711 "Is another process using the image?\n"); 712 return ret; 713 } 714 } 715 } 716 return 0; 717 } 718 719 static int raw_handle_perm_lock(BlockDriverState *bs, 720 RawPermLockOp op, 721 uint64_t new_perm, uint64_t new_shared, 722 Error **errp) 723 { 724 BDRVRawState *s = bs->opaque; 725 int ret = 0; 726 Error *local_err = NULL; 727 728 if (!s->use_lock) { 729 return 0; 730 } 731 732 if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) { 733 return 0; 734 } 735 736 assert(s->lock_fd > 0); 737 738 switch (op) { 739 case RAW_PL_PREPARE: 740 ret = raw_apply_lock_bytes(s, s->perm | new_perm, 741 ~s->shared_perm | ~new_shared, 742 false, errp); 743 if (!ret) { 744 ret = raw_check_lock_bytes(s, new_perm, new_shared, errp); 745 if (!ret) { 746 return 0; 747 } 748 } 749 op = RAW_PL_ABORT; 750 /* fall through to unlock bytes. 
*/ 751 case RAW_PL_ABORT: 752 raw_apply_lock_bytes(s, s->perm, ~s->shared_perm, true, &local_err); 753 if (local_err) { 754 /* Theoretically the above call only unlocks bytes and it cannot 755 * fail. Something weird happened, report it. 756 */ 757 error_report_err(local_err); 758 } 759 break; 760 case RAW_PL_COMMIT: 761 raw_apply_lock_bytes(s, new_perm, ~new_shared, true, &local_err); 762 if (local_err) { 763 /* Theoretically the above call only unlocks bytes and it cannot 764 * fail. Something weird happened, report it. 765 */ 766 error_report_err(local_err); 767 } 768 break; 769 } 770 return ret; 771 } 772 773 static int raw_reopen_prepare(BDRVReopenState *state, 774 BlockReopenQueue *queue, Error **errp) 775 { 776 BDRVRawState *s; 777 BDRVRawReopenState *rs; 778 int ret = 0; 779 Error *local_err = NULL; 780 781 assert(state != NULL); 782 assert(state->bs != NULL); 783 784 s = state->bs->opaque; 785 786 state->opaque = g_new0(BDRVRawReopenState, 1); 787 rs = state->opaque; 788 789 if (s->type == FTYPE_CD) { 790 rs->open_flags |= O_NONBLOCK; 791 } 792 793 raw_parse_flags(state->flags, &rs->open_flags); 794 795 rs->fd = -1; 796 797 int fcntl_flags = O_APPEND | O_NONBLOCK; 798 #ifdef O_NOATIME 799 fcntl_flags |= O_NOATIME; 800 #endif 801 802 #ifdef O_ASYNC 803 /* Not all operating systems have O_ASYNC, and those that don't 804 * will not let us track the state into rs->open_flags (typically 805 * you achieve the same effect with an ioctl, for example I_SETSIG 806 * on Solaris). But we do not use O_ASYNC, so that's fine. 
807 */ 808 assert((s->open_flags & O_ASYNC) == 0); 809 #endif 810 811 if ((rs->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { 812 /* dup the original fd */ 813 rs->fd = qemu_dup(s->fd); 814 if (rs->fd >= 0) { 815 ret = fcntl_setfl(rs->fd, rs->open_flags); 816 if (ret) { 817 qemu_close(rs->fd); 818 rs->fd = -1; 819 } 820 } 821 } 822 823 /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */ 824 if (rs->fd == -1) { 825 const char *normalized_filename = state->bs->filename; 826 ret = raw_normalize_devicepath(&normalized_filename); 827 if (ret < 0) { 828 error_setg_errno(errp, -ret, "Could not normalize device path"); 829 } else { 830 assert(!(rs->open_flags & O_CREAT)); 831 rs->fd = qemu_open(normalized_filename, rs->open_flags); 832 if (rs->fd == -1) { 833 error_setg_errno(errp, errno, "Could not reopen file"); 834 ret = -1; 835 } 836 } 837 } 838 839 /* Fail already reopen_prepare() if we can't get a working O_DIRECT 840 * alignment with the new fd. */ 841 if (rs->fd != -1) { 842 raw_probe_alignment(state->bs, rs->fd, &local_err); 843 if (local_err) { 844 qemu_close(rs->fd); 845 rs->fd = -1; 846 error_propagate(errp, local_err); 847 ret = -EINVAL; 848 } 849 } 850 851 return ret; 852 } 853 854 static void raw_reopen_commit(BDRVReopenState *state) 855 { 856 BDRVRawReopenState *rs = state->opaque; 857 BDRVRawState *s = state->bs->opaque; 858 859 s->open_flags = rs->open_flags; 860 861 qemu_close(s->fd); 862 s->fd = rs->fd; 863 864 g_free(state->opaque); 865 state->opaque = NULL; 866 } 867 868 869 static void raw_reopen_abort(BDRVReopenState *state) 870 { 871 BDRVRawReopenState *rs = state->opaque; 872 873 /* nothing to do if NULL, we didn't get far enough */ 874 if (rs == NULL) { 875 return; 876 } 877 878 if (rs->fd >= 0) { 879 qemu_close(rs->fd); 880 rs->fd = -1; 881 } 882 g_free(state->opaque); 883 state->opaque = NULL; 884 } 885 886 static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd) 887 { 888 #ifdef BLKSECTGET 889 
int max_bytes = 0; 890 short max_sectors = 0; 891 if (bs->sg && ioctl(fd, BLKSECTGET, &max_bytes) == 0) { 892 return max_bytes; 893 } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) { 894 return max_sectors << BDRV_SECTOR_BITS; 895 } else { 896 return -errno; 897 } 898 #else 899 return -ENOSYS; 900 #endif 901 } 902 903 static int hdev_get_max_segments(const struct stat *st) 904 { 905 #ifdef CONFIG_LINUX 906 char buf[32]; 907 const char *end; 908 char *sysfspath; 909 int ret; 910 int fd = -1; 911 long max_segments; 912 913 sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments", 914 major(st->st_rdev), minor(st->st_rdev)); 915 fd = open(sysfspath, O_RDONLY); 916 if (fd == -1) { 917 ret = -errno; 918 goto out; 919 } 920 do { 921 ret = read(fd, buf, sizeof(buf) - 1); 922 } while (ret == -1 && errno == EINTR); 923 if (ret < 0) { 924 ret = -errno; 925 goto out; 926 } else if (ret == 0) { 927 ret = -EIO; 928 goto out; 929 } 930 buf[ret] = 0; 931 /* The file is ended with '\n', pass 'end' to accept that. 
*/ 932 ret = qemu_strtol(buf, &end, 10, &max_segments); 933 if (ret == 0 && end && *end == '\n') { 934 ret = max_segments; 935 } 936 937 out: 938 if (fd != -1) { 939 close(fd); 940 } 941 g_free(sysfspath); 942 return ret; 943 #else 944 return -ENOTSUP; 945 #endif 946 } 947 948 static void raw_refresh_limits(BlockDriverState *bs, Error **errp) 949 { 950 BDRVRawState *s = bs->opaque; 951 struct stat st; 952 953 if (!fstat(s->fd, &st)) { 954 if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { 955 int ret = hdev_get_max_transfer_length(bs, s->fd); 956 if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) { 957 bs->bl.max_transfer = pow2floor(ret); 958 } 959 ret = hdev_get_max_segments(&st); 960 if (ret > 0) { 961 bs->bl.max_transfer = MIN(bs->bl.max_transfer, 962 ret * getpagesize()); 963 } 964 } 965 } 966 967 raw_probe_alignment(bs, s->fd, errp); 968 bs->bl.min_mem_alignment = s->buf_align; 969 bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize()); 970 } 971 972 static int check_for_dasd(int fd) 973 { 974 #ifdef BIODASDINFO2 975 struct dasd_information2_t info = {0}; 976 977 return ioctl(fd, BIODASDINFO2, &info); 978 #else 979 return -1; 980 #endif 981 } 982 983 /** 984 * Try to get @bs's logical and physical block size. 985 * On success, store them in @bsz and return zero. 986 * On failure, return negative errno. 987 */ 988 static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) 989 { 990 BDRVRawState *s = bs->opaque; 991 int ret; 992 993 /* If DASD, get blocksizes */ 994 if (check_for_dasd(s->fd) < 0) { 995 return -ENOTSUP; 996 } 997 ret = probe_logical_blocksize(s->fd, &bsz->log); 998 if (ret < 0) { 999 return ret; 1000 } 1001 return probe_physical_blocksize(s->fd, &bsz->phys); 1002 } 1003 1004 /** 1005 * Try to get @bs's geometry: cyls, heads, sectors. 1006 * On success, store them in @geo and return 0. 1007 * On failure return -errno. 
1008 * (Allows block driver to assign default geometry values that guest sees) 1009 */ 1010 #ifdef __linux__ 1011 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) 1012 { 1013 BDRVRawState *s = bs->opaque; 1014 struct hd_geometry ioctl_geo = {0}; 1015 1016 /* If DASD, get its geometry */ 1017 if (check_for_dasd(s->fd) < 0) { 1018 return -ENOTSUP; 1019 } 1020 if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) { 1021 return -errno; 1022 } 1023 /* HDIO_GETGEO may return success even though geo contains zeros 1024 (e.g. certain multipath setups) */ 1025 if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) { 1026 return -ENOTSUP; 1027 } 1028 /* Do not return a geometry for partition */ 1029 if (ioctl_geo.start != 0) { 1030 return -ENOTSUP; 1031 } 1032 geo->heads = ioctl_geo.heads; 1033 geo->sectors = ioctl_geo.sectors; 1034 geo->cylinders = ioctl_geo.cylinders; 1035 1036 return 0; 1037 } 1038 #else /* __linux__ */ 1039 static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) 1040 { 1041 return -ENOTSUP; 1042 } 1043 #endif 1044 1045 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) 1046 { 1047 int ret; 1048 1049 ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); 1050 if (ret == -1) { 1051 return -errno; 1052 } 1053 1054 return 0; 1055 } 1056 1057 static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) 1058 { 1059 BDRVRawState *s = aiocb->bs->opaque; 1060 int ret; 1061 1062 if (s->page_cache_inconsistent) { 1063 return -EIO; 1064 } 1065 1066 ret = qemu_fdatasync(aiocb->aio_fildes); 1067 if (ret == -1) { 1068 /* There is no clear definition of the semantics of a failing fsync(), 1069 * so we may have to assume the worst. The sad truth is that this 1070 * assumption is correct for Linux. Some pages are now probably marked 1071 * clean in the page cache even though they are inconsistent with the 1072 * on-disk contents. 
The next fdatasync() call would succeed, but no 1073 * further writeback attempt will be made. We can't get back to a state 1074 * in which we know what is on disk (we would have to rewrite 1075 * everything that was touched since the last fdatasync() at least), so 1076 * make bdrv_flush() fail permanently. Given that the behaviour isn't 1077 * really defined, I have little hope that other OSes are doing better. 1078 * 1079 * Obviously, this doesn't affect O_DIRECT, which bypasses the page 1080 * cache. */ 1081 if ((s->open_flags & O_DIRECT) == 0) { 1082 s->page_cache_inconsistent = true; 1083 } 1084 return -errno; 1085 } 1086 return 0; 1087 } 1088 1089 #ifdef CONFIG_PREADV 1090 1091 static bool preadv_present = true; 1092 1093 static ssize_t 1094 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1095 { 1096 return preadv(fd, iov, nr_iov, offset); 1097 } 1098 1099 static ssize_t 1100 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1101 { 1102 return pwritev(fd, iov, nr_iov, offset); 1103 } 1104 1105 #else 1106 1107 static bool preadv_present = false; 1108 1109 static ssize_t 1110 qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1111 { 1112 return -ENOSYS; 1113 } 1114 1115 static ssize_t 1116 qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) 1117 { 1118 return -ENOSYS; 1119 } 1120 1121 #endif 1122 1123 static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) 1124 { 1125 ssize_t len; 1126 1127 do { 1128 if (aiocb->aio_type & QEMU_AIO_WRITE) 1129 len = qemu_pwritev(aiocb->aio_fildes, 1130 aiocb->aio_iov, 1131 aiocb->aio_niov, 1132 aiocb->aio_offset); 1133 else 1134 len = qemu_preadv(aiocb->aio_fildes, 1135 aiocb->aio_iov, 1136 aiocb->aio_niov, 1137 aiocb->aio_offset); 1138 } while (len == -1 && errno == EINTR); 1139 1140 if (len == -1) { 1141 return -errno; 1142 } 1143 return len; 1144 } 1145 1146 /* 1147 * Read/writes the data to/from a given linear buffer. 
1148 * 1149 * Returns the number of bytes handles or -errno in case of an error. Short 1150 * reads are only returned if the end of the file is reached. 1151 */ 1152 static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) 1153 { 1154 ssize_t offset = 0; 1155 ssize_t len; 1156 1157 while (offset < aiocb->aio_nbytes) { 1158 if (aiocb->aio_type & QEMU_AIO_WRITE) { 1159 len = pwrite(aiocb->aio_fildes, 1160 (const char *)buf + offset, 1161 aiocb->aio_nbytes - offset, 1162 aiocb->aio_offset + offset); 1163 } else { 1164 len = pread(aiocb->aio_fildes, 1165 buf + offset, 1166 aiocb->aio_nbytes - offset, 1167 aiocb->aio_offset + offset); 1168 } 1169 if (len == -1 && errno == EINTR) { 1170 continue; 1171 } else if (len == -1 && errno == EINVAL && 1172 (aiocb->bs->open_flags & BDRV_O_NOCACHE) && 1173 !(aiocb->aio_type & QEMU_AIO_WRITE) && 1174 offset > 0) { 1175 /* O_DIRECT pread() may fail with EINVAL when offset is unaligned 1176 * after a short read. Assume that O_DIRECT short reads only occur 1177 * at EOF. Therefore this is a short read, not an I/O error. 1178 */ 1179 break; 1180 } else if (len == -1) { 1181 offset = -errno; 1182 break; 1183 } else if (len == 0) { 1184 break; 1185 } 1186 offset += len; 1187 } 1188 1189 return offset; 1190 } 1191 1192 static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) 1193 { 1194 ssize_t nbytes; 1195 char *buf; 1196 1197 if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) { 1198 /* 1199 * If there is just a single buffer, and it is properly aligned 1200 * we can just use plain pread/pwrite without any problems. 1201 */ 1202 if (aiocb->aio_niov == 1) { 1203 return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); 1204 } 1205 /* 1206 * We have more than one iovec, and all are properly aligned. 1207 * 1208 * Try preadv/pwritev first and fall back to linearizing the 1209 * buffer if it's not supported. 
1210 */ 1211 if (preadv_present) { 1212 nbytes = handle_aiocb_rw_vector(aiocb); 1213 if (nbytes == aiocb->aio_nbytes || 1214 (nbytes < 0 && nbytes != -ENOSYS)) { 1215 return nbytes; 1216 } 1217 preadv_present = false; 1218 } 1219 1220 /* 1221 * XXX(hch): short read/write. no easy way to handle the reminder 1222 * using these interfaces. For now retry using plain 1223 * pread/pwrite? 1224 */ 1225 } 1226 1227 /* 1228 * Ok, we have to do it the hard way, copy all segments into 1229 * a single aligned buffer. 1230 */ 1231 buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); 1232 if (buf == NULL) { 1233 return -ENOMEM; 1234 } 1235 1236 if (aiocb->aio_type & QEMU_AIO_WRITE) { 1237 char *p = buf; 1238 int i; 1239 1240 for (i = 0; i < aiocb->aio_niov; ++i) { 1241 memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); 1242 p += aiocb->aio_iov[i].iov_len; 1243 } 1244 assert(p - buf == aiocb->aio_nbytes); 1245 } 1246 1247 nbytes = handle_aiocb_rw_linear(aiocb, buf); 1248 if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { 1249 char *p = buf; 1250 size_t count = aiocb->aio_nbytes, copy; 1251 int i; 1252 1253 for (i = 0; i < aiocb->aio_niov && count; ++i) { 1254 copy = count; 1255 if (copy > aiocb->aio_iov[i].iov_len) { 1256 copy = aiocb->aio_iov[i].iov_len; 1257 } 1258 memcpy(aiocb->aio_iov[i].iov_base, p, copy); 1259 assert(count >= copy); 1260 p += copy; 1261 count -= copy; 1262 } 1263 assert(count == 0); 1264 } 1265 qemu_vfree(buf); 1266 1267 return nbytes; 1268 } 1269 1270 #ifdef CONFIG_XFS 1271 static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) 1272 { 1273 struct xfs_flock64 fl; 1274 int err; 1275 1276 memset(&fl, 0, sizeof(fl)); 1277 fl.l_whence = SEEK_SET; 1278 fl.l_start = offset; 1279 fl.l_len = bytes; 1280 1281 if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { 1282 err = errno; 1283 DPRINTF("cannot write zero range (%s)\n", strerror(errno)); 1284 return -err; 1285 } 1286 1287 return 0; 1288 } 1289 1290 static int 
xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) 1291 { 1292 struct xfs_flock64 fl; 1293 int err; 1294 1295 memset(&fl, 0, sizeof(fl)); 1296 fl.l_whence = SEEK_SET; 1297 fl.l_start = offset; 1298 fl.l_len = bytes; 1299 1300 if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { 1301 err = errno; 1302 DPRINTF("cannot punch hole (%s)\n", strerror(errno)); 1303 return -err; 1304 } 1305 1306 return 0; 1307 } 1308 #endif 1309 1310 static int translate_err(int err) 1311 { 1312 if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || 1313 err == -ENOTTY) { 1314 err = -ENOTSUP; 1315 } 1316 return err; 1317 } 1318 1319 #ifdef CONFIG_FALLOCATE 1320 static int do_fallocate(int fd, int mode, off_t offset, off_t len) 1321 { 1322 do { 1323 if (fallocate(fd, mode, offset, len) == 0) { 1324 return 0; 1325 } 1326 } while (errno == EINTR); 1327 return translate_err(-errno); 1328 } 1329 #endif 1330 1331 static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) 1332 { 1333 int ret = -ENOTSUP; 1334 BDRVRawState *s = aiocb->bs->opaque; 1335 1336 if (!s->has_write_zeroes) { 1337 return -ENOTSUP; 1338 } 1339 1340 #ifdef BLKZEROOUT 1341 do { 1342 uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; 1343 if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { 1344 return 0; 1345 } 1346 } while (errno == EINTR); 1347 1348 ret = translate_err(-errno); 1349 #endif 1350 1351 if (ret == -ENOTSUP) { 1352 s->has_write_zeroes = false; 1353 } 1354 return ret; 1355 } 1356 1357 static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) 1358 { 1359 #if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS) 1360 BDRVRawState *s = aiocb->bs->opaque; 1361 #endif 1362 #ifdef CONFIG_FALLOCATE 1363 int64_t len; 1364 #endif 1365 1366 if (aiocb->aio_type & QEMU_AIO_BLKDEV) { 1367 return handle_aiocb_write_zeroes_block(aiocb); 1368 } 1369 1370 #ifdef CONFIG_XFS 1371 if (s->is_xfs) { 1372 return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); 1373 } 1374 #endif 
1375 1376 #ifdef CONFIG_FALLOCATE_ZERO_RANGE 1377 if (s->has_write_zeroes) { 1378 int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE, 1379 aiocb->aio_offset, aiocb->aio_nbytes); 1380 if (ret == 0 || ret != -ENOTSUP) { 1381 return ret; 1382 } 1383 s->has_write_zeroes = false; 1384 } 1385 #endif 1386 1387 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE 1388 if (s->has_discard && s->has_fallocate) { 1389 int ret = do_fallocate(s->fd, 1390 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1391 aiocb->aio_offset, aiocb->aio_nbytes); 1392 if (ret == 0) { 1393 ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); 1394 if (ret == 0 || ret != -ENOTSUP) { 1395 return ret; 1396 } 1397 s->has_fallocate = false; 1398 } else if (ret != -ENOTSUP) { 1399 return ret; 1400 } else { 1401 s->has_discard = false; 1402 } 1403 } 1404 #endif 1405 1406 #ifdef CONFIG_FALLOCATE 1407 /* Last resort: we are trying to extend the file with zeroed data. This 1408 * can be done via fallocate(fd, 0) */ 1409 len = bdrv_getlength(aiocb->bs); 1410 if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) { 1411 int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); 1412 if (ret == 0 || ret != -ENOTSUP) { 1413 return ret; 1414 } 1415 s->has_fallocate = false; 1416 } 1417 #endif 1418 1419 return -ENOTSUP; 1420 } 1421 1422 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) 1423 { 1424 int ret = -EOPNOTSUPP; 1425 BDRVRawState *s = aiocb->bs->opaque; 1426 1427 if (!s->has_discard) { 1428 return -ENOTSUP; 1429 } 1430 1431 if (aiocb->aio_type & QEMU_AIO_BLKDEV) { 1432 #ifdef BLKDISCARD 1433 do { 1434 uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; 1435 if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) { 1436 return 0; 1437 } 1438 } while (errno == EINTR); 1439 1440 ret = -errno; 1441 #endif 1442 } else { 1443 #ifdef CONFIG_XFS 1444 if (s->is_xfs) { 1445 return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); 1446 } 1447 #endif 1448 1449 #ifdef 
CONFIG_FALLOCATE_PUNCH_HOLE 1450 ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 1451 aiocb->aio_offset, aiocb->aio_nbytes); 1452 #endif 1453 } 1454 1455 ret = translate_err(ret); 1456 if (ret == -ENOTSUP) { 1457 s->has_discard = false; 1458 } 1459 return ret; 1460 } 1461 1462 static int aio_worker(void *arg) 1463 { 1464 RawPosixAIOData *aiocb = arg; 1465 ssize_t ret = 0; 1466 1467 switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { 1468 case QEMU_AIO_READ: 1469 ret = handle_aiocb_rw(aiocb); 1470 if (ret >= 0 && ret < aiocb->aio_nbytes) { 1471 iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, 1472 0, aiocb->aio_nbytes - ret); 1473 1474 ret = aiocb->aio_nbytes; 1475 } 1476 if (ret == aiocb->aio_nbytes) { 1477 ret = 0; 1478 } else if (ret >= 0 && ret < aiocb->aio_nbytes) { 1479 ret = -EINVAL; 1480 } 1481 break; 1482 case QEMU_AIO_WRITE: 1483 ret = handle_aiocb_rw(aiocb); 1484 if (ret == aiocb->aio_nbytes) { 1485 ret = 0; 1486 } else if (ret >= 0 && ret < aiocb->aio_nbytes) { 1487 ret = -EINVAL; 1488 } 1489 break; 1490 case QEMU_AIO_FLUSH: 1491 ret = handle_aiocb_flush(aiocb); 1492 break; 1493 case QEMU_AIO_IOCTL: 1494 ret = handle_aiocb_ioctl(aiocb); 1495 break; 1496 case QEMU_AIO_DISCARD: 1497 ret = handle_aiocb_discard(aiocb); 1498 break; 1499 case QEMU_AIO_WRITE_ZEROES: 1500 ret = handle_aiocb_write_zeroes(aiocb); 1501 break; 1502 default: 1503 fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); 1504 ret = -EINVAL; 1505 break; 1506 } 1507 1508 g_free(aiocb); 1509 return ret; 1510 } 1511 1512 static int paio_submit_co(BlockDriverState *bs, int fd, 1513 int64_t offset, QEMUIOVector *qiov, 1514 int bytes, int type) 1515 { 1516 RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); 1517 ThreadPool *pool; 1518 1519 acb->bs = bs; 1520 acb->aio_type = type; 1521 acb->aio_fildes = fd; 1522 1523 acb->aio_nbytes = bytes; 1524 acb->aio_offset = offset; 1525 1526 if (qiov) { 1527 acb->aio_iov = qiov->iov; 1528 acb->aio_niov = qiov->niov; 1529 
        assert(qiov->size == bytes);
    }

    trace_paio_submit_co(offset, bytes, type);
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    return thread_pool_submit_co(pool, aio_worker, acb);
}

/*
 * Callback-style counterpart of paio_submit_co(): fills a freshly allocated
 * RawPosixAIOData from the arguments and queues aio_worker() on the thread
 * pool of @bs's AioContext, passing @cb and @opaque on to
 * thread_pool_submit_aio().
 *
 * NOTE(review): aio_iov/aio_niov are not assigned when @qiov is NULL.
 */
static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
        int64_t offset, QEMUIOVector *qiov, int bytes,
        BlockCompletionFunc *cb, void *opaque, int type)
{
    RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
    ThreadPool *pool;

    acb->bs = bs;
    acb->aio_type = type;
    acb->aio_fildes = fd;

    acb->aio_nbytes = bytes;
    acb->aio_offset = offset;

    if (qiov) {
        acb->aio_iov = qiov->iov;
        acb->aio_niov = qiov->niov;
        assert(qiov->size == acb->aio_nbytes);
    }

    trace_paio_submit(acb, opaque, offset, bytes, type);
    pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
    return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
}

/*
 * Common read/write path; @type is QEMU_AIO_READ or QEMU_AIO_WRITE as passed
 * by raw_co_preadv()/raw_co_pwritev().  Returns -EIO if fd_open() fails.
 */
static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
                                   uint64_t bytes, QEMUIOVector *qiov, int type)
{
    BDRVRawState *s = bs->opaque;

    if (fd_open(bs) < 0)
        return -EIO;

    /*
     * Check if the underlying device requires requests to be aligned,
     * and if the request we are trying to submit is aligned or not.
     * If this is the case tell the low-level driver that it needs
     * to copy the buffer.
     */
    if (s->needs_alignment) {
        if (!bdrv_qiov_is_aligned(bs, qiov)) {
            type |= QEMU_AIO_MISALIGNED;
#ifdef CONFIG_LINUX_AIO
        } else if (s->use_linux_aio) {
            /* Aligned request and Linux AIO enabled: bypass the thread pool. */
            LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
            assert(qiov->size == bytes);
            return laio_co_submit(bs, aio, s->fd, offset, qiov, type);
#endif
        }
    }

    return paio_submit_co(bs, s->fd, offset, qiov, bytes, type);
}

/* Read entry point: forwards to raw_co_prw() with QEMU_AIO_READ; @flags is
 * not used. */
static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
                                      uint64_t bytes, QEMUIOVector *qiov,
                                      int flags)
{
    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_READ);
}

/* Write entry point: forwards to raw_co_prw() with QEMU_AIO_WRITE; no request
 * flags are accepted (asserted). */
static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                       uint64_t bytes, QEMUIOVector *qiov,
                                       int flags)
{
    assert(flags == 0);
    return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
}

/* Forwards to laio_io_plug() when Linux AIO is compiled in and enabled for
 * this node; otherwise does nothing. */
static void raw_aio_plug(BlockDriverState *bs)
{
#ifdef CONFIG_LINUX_AIO
    BDRVRawState *s = bs->opaque;
    if (s->use_linux_aio) {
        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
        laio_io_plug(bs, aio);
    }
#endif
}

/* Forwards to laio_io_unplug() when Linux AIO is compiled in and enabled for
 * this node; otherwise does nothing. */
static void raw_aio_unplug(BlockDriverState *bs)
{
#ifdef CONFIG_LINUX_AIO
    BDRVRawState *s = bs->opaque;
    if (s->use_linux_aio) {
        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
        laio_io_unplug(bs, aio);
    }
#endif
}

/* Submits a QEMU_AIO_FLUSH request to the thread pool; returns NULL when
 * fd_open() fails. */
static BlockAIOCB *raw_aio_flush(BlockDriverState *bs,
        BlockCompletionFunc *cb, void *opaque)
{
    BDRVRawState *s = bs->opaque;

    if (fd_open(bs) < 0)
        return NULL;

    return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
}

/* Closes the data and lock file descriptors if they are open and marks them
 * as closed (-1). */
static void raw_close(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;

    if (s->fd >= 0) {
        qemu_close(s->fd);
        s->fd = -1;
    }
    if (s->lock_fd >= 0) {
qemu_close(s->lock_fd); 1649 s->lock_fd = -1; 1650 } 1651 } 1652 1653 /** 1654 * Truncates the given regular file @fd to @offset and, when growing, fills the 1655 * new space according to @prealloc. 1656 * 1657 * Returns: 0 on success, -errno on failure. 1658 */ 1659 static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc, 1660 Error **errp) 1661 { 1662 int result = 0; 1663 int64_t current_length = 0; 1664 char *buf = NULL; 1665 struct stat st; 1666 1667 if (fstat(fd, &st) < 0) { 1668 result = -errno; 1669 error_setg_errno(errp, -result, "Could not stat file"); 1670 return result; 1671 } 1672 1673 current_length = st.st_size; 1674 if (current_length > offset && prealloc != PREALLOC_MODE_OFF) { 1675 error_setg(errp, "Cannot use preallocation for shrinking files"); 1676 return -ENOTSUP; 1677 } 1678 1679 switch (prealloc) { 1680 #ifdef CONFIG_POSIX_FALLOCATE 1681 case PREALLOC_MODE_FALLOC: 1682 /* 1683 * Truncating before posix_fallocate() makes it about twice slower on 1684 * file systems that do not support fallocate(), trying to check if a 1685 * block is allocated before allocating it, so don't do that here. 1686 */ 1687 result = -posix_fallocate(fd, current_length, offset - current_length); 1688 if (result != 0) { 1689 /* posix_fallocate() doesn't set errno. */ 1690 error_setg_errno(errp, -result, 1691 "Could not preallocate new data"); 1692 } 1693 goto out; 1694 #endif 1695 case PREALLOC_MODE_FULL: 1696 { 1697 int64_t num = 0, left = offset - current_length; 1698 1699 /* 1700 * Knowing the final size from the beginning could allow the file 1701 * system driver to do less allocations and possibly avoid 1702 * fragmentation of the file. 
1703 */ 1704 if (ftruncate(fd, offset) != 0) { 1705 result = -errno; 1706 error_setg_errno(errp, -result, "Could not resize file"); 1707 goto out; 1708 } 1709 1710 buf = g_malloc0(65536); 1711 1712 result = lseek(fd, current_length, SEEK_SET); 1713 if (result < 0) { 1714 result = -errno; 1715 error_setg_errno(errp, -result, 1716 "Failed to seek to the old end of file"); 1717 goto out; 1718 } 1719 1720 while (left > 0) { 1721 num = MIN(left, 65536); 1722 result = write(fd, buf, num); 1723 if (result < 0) { 1724 result = -errno; 1725 error_setg_errno(errp, -result, 1726 "Could not write zeros for preallocation"); 1727 goto out; 1728 } 1729 left -= result; 1730 } 1731 if (result >= 0) { 1732 result = fsync(fd); 1733 if (result < 0) { 1734 result = -errno; 1735 error_setg_errno(errp, -result, 1736 "Could not flush file to disk"); 1737 goto out; 1738 } 1739 } 1740 goto out; 1741 } 1742 case PREALLOC_MODE_OFF: 1743 if (ftruncate(fd, offset) != 0) { 1744 result = -errno; 1745 error_setg_errno(errp, -result, "Could not resize file"); 1746 } 1747 return result; 1748 default: 1749 result = -ENOTSUP; 1750 error_setg(errp, "Unsupported preallocation mode: %s", 1751 PreallocMode_str(prealloc)); 1752 return result; 1753 } 1754 1755 out: 1756 if (result < 0) { 1757 if (ftruncate(fd, current_length) < 0) { 1758 error_report("Failed to restore old file length: %s", 1759 strerror(errno)); 1760 } 1761 } 1762 1763 g_free(buf); 1764 return result; 1765 } 1766 1767 static int raw_truncate(BlockDriverState *bs, int64_t offset, 1768 PreallocMode prealloc, Error **errp) 1769 { 1770 BDRVRawState *s = bs->opaque; 1771 struct stat st; 1772 int ret; 1773 1774 if (fstat(s->fd, &st)) { 1775 ret = -errno; 1776 error_setg_errno(errp, -ret, "Failed to fstat() the file"); 1777 return ret; 1778 } 1779 1780 if (S_ISREG(st.st_mode)) { 1781 return raw_regular_truncate(s->fd, offset, prealloc, errp); 1782 } 1783 1784 if (prealloc != PREALLOC_MODE_OFF) { 1785 error_setg(errp, "Preallocation mode '%s' 
unsupported for this " 1786 "non-regular file", PreallocMode_str(prealloc)); 1787 return -ENOTSUP; 1788 } 1789 1790 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1791 if (offset > raw_getlength(bs)) { 1792 error_setg(errp, "Cannot grow device files"); 1793 return -EINVAL; 1794 } 1795 } else { 1796 error_setg(errp, "Resizing this file is not supported"); 1797 return -ENOTSUP; 1798 } 1799 1800 return 0; 1801 } 1802 1803 #ifdef __OpenBSD__ 1804 static int64_t raw_getlength(BlockDriverState *bs) 1805 { 1806 BDRVRawState *s = bs->opaque; 1807 int fd = s->fd; 1808 struct stat st; 1809 1810 if (fstat(fd, &st)) 1811 return -errno; 1812 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1813 struct disklabel dl; 1814 1815 if (ioctl(fd, DIOCGDINFO, &dl)) 1816 return -errno; 1817 return (uint64_t)dl.d_secsize * 1818 dl.d_partitions[DISKPART(st.st_rdev)].p_size; 1819 } else 1820 return st.st_size; 1821 } 1822 #elif defined(__NetBSD__) 1823 static int64_t raw_getlength(BlockDriverState *bs) 1824 { 1825 BDRVRawState *s = bs->opaque; 1826 int fd = s->fd; 1827 struct stat st; 1828 1829 if (fstat(fd, &st)) 1830 return -errno; 1831 if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { 1832 struct dkwedge_info dkw; 1833 1834 if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) { 1835 return dkw.dkw_size * 512; 1836 } else { 1837 struct disklabel dl; 1838 1839 if (ioctl(fd, DIOCGDINFO, &dl)) 1840 return -errno; 1841 return (uint64_t)dl.d_secsize * 1842 dl.d_partitions[DISKPART(st.st_rdev)].p_size; 1843 } 1844 } else 1845 return st.st_size; 1846 } 1847 #elif defined(__sun__) 1848 static int64_t raw_getlength(BlockDriverState *bs) 1849 { 1850 BDRVRawState *s = bs->opaque; 1851 struct dk_minfo minfo; 1852 int ret; 1853 int64_t size; 1854 1855 ret = fd_open(bs); 1856 if (ret < 0) { 1857 return ret; 1858 } 1859 1860 /* 1861 * Use the DKIOCGMEDIAINFO ioctl to read the size. 
1862 */ 1863 ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo); 1864 if (ret != -1) { 1865 return minfo.dki_lbsize * minfo.dki_capacity; 1866 } 1867 1868 /* 1869 * There are reports that lseek on some devices fails, but 1870 * irc discussion said that contingency on contingency was overkill. 1871 */ 1872 size = lseek(s->fd, 0, SEEK_END); 1873 if (size < 0) { 1874 return -errno; 1875 } 1876 return size; 1877 } 1878 #elif defined(CONFIG_BSD) 1879 static int64_t raw_getlength(BlockDriverState *bs) 1880 { 1881 BDRVRawState *s = bs->opaque; 1882 int fd = s->fd; 1883 int64_t size; 1884 struct stat sb; 1885 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 1886 int reopened = 0; 1887 #endif 1888 int ret; 1889 1890 ret = fd_open(bs); 1891 if (ret < 0) 1892 return ret; 1893 1894 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 1895 again: 1896 #endif 1897 if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) { 1898 #ifdef DIOCGMEDIASIZE 1899 if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) 1900 #elif defined(DIOCGPART) 1901 { 1902 struct partinfo pi; 1903 if (ioctl(fd, DIOCGPART, &pi) == 0) 1904 size = pi.media_size; 1905 else 1906 size = 0; 1907 } 1908 if (size == 0) 1909 #endif 1910 #if defined(__APPLE__) && defined(__MACH__) 1911 { 1912 uint64_t sectors = 0; 1913 uint32_t sector_size = 0; 1914 1915 if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0 1916 && ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) { 1917 size = sectors * sector_size; 1918 } else { 1919 size = lseek(fd, 0LL, SEEK_END); 1920 if (size < 0) { 1921 return -errno; 1922 } 1923 } 1924 } 1925 #else 1926 size = lseek(fd, 0LL, SEEK_END); 1927 if (size < 0) { 1928 return -errno; 1929 } 1930 #endif 1931 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 1932 switch(s->type) { 1933 case FTYPE_CD: 1934 /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */ 1935 if (size == 2048LL * (unsigned)-1) 1936 size = 0; 1937 /* XXX no disc? maybe we need to reopen... 
             */
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
                /* Retry the size query once after reopening the drive. */
                reopened = 1;
                goto again;
            }
        }
#endif
    } else {
        size = lseek(fd, 0, SEEK_END);
        if (size < 0) {
            return -errno;
        }
    }
    return size;
}
#else
/* Fallback for all other hosts: the length is the offset that
 * lseek(SEEK_END) reports.  Returns the size in bytes or a negative errno. */
static int64_t raw_getlength(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int ret;
    int64_t size;

    ret = fd_open(bs);
    if (ret < 0) {
        return ret;
    }

    size = lseek(s->fd, 0, SEEK_END);
    if (size < 0) {
        return -errno;
    }
    return size;
}
#endif

/* Returns st_blocks * 512 as reported by fstat() for the image file, or
 * -errno if fstat() fails. */
static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
{
    struct stat st;
    BDRVRawState *s = bs->opaque;

    if (fstat(s->fd, &st) < 0) {
        return -errno;
    }
    return (int64_t)st.st_blocks * 512;
}

/*
 * Creates (or truncates) the image file @filename and sizes it according to
 * the size, nocow and preallocation options in @opts.
 *
 * Returns 0 on success, a negative errno on failure (with @errp set).
 */
static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
{
    int fd;
    int result = 0;
    int64_t total_size = 0;
    bool nocow = false;
    PreallocMode prealloc;
    char *buf = NULL;
    Error *local_err = NULL;

    /* Skip an optional "file:" protocol prefix. */
    strstart(filename, "file:", &filename);

    /* Read out options */
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                          BDRV_SECTOR_SIZE);
    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
                               PREALLOC_MODE_OFF, &local_err);
    g_free(buf);
    if (local_err) {
        error_propagate(errp, local_err);
        result = -EINVAL;
        goto out;
    }

    fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY,
                   0644);
    if (fd < 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not create file");
        goto out;
    }

    if (nocow) {
#ifdef __linux__
        /* Set NOCOW flag to solve performance issue on fs like btrfs.
         * This is an optimisation.
         * The FS_IOC_SETFLAGS ioctl return value
         * will be ignored since any failure of this operation should not
         * block the remaining work.
         */
        int attr;
        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
            attr |= FS_NOCOW_FL;
            ioctl(fd, FS_IOC_SETFLAGS, &attr);
        }
#endif
    }

    result = raw_regular_truncate(fd, total_size, prealloc, errp);
    if (result < 0) {
        goto out_close;
    }

out_close:
    /* A close() failure is only reported if nothing failed earlier. */
    if (qemu_close(fd) != 0 && result == 0) {
        result = -errno;
        error_setg_errno(errp, -result, "Could not close the new file");
    }
out:
    return result;
}

/*
 * Find allocation range in @bs around offset @start.
 * May change underlying file descriptor's file offset.
 * If @start is not in a hole, store @start in @data, and the
 * beginning of the next hole in @hole, and return 0.
 * If @start is in a non-trailing hole, store @start in @hole and the
 * beginning of the next non-hole in @data, and return 0.
 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
 * If we can't find out, return a negative errno other than -ENXIO.
 */
static int find_allocation(BlockDriverState *bs, off_t start,
                           off_t *data, off_t *hole)
{
#if defined SEEK_HOLE && defined SEEK_DATA
    BDRVRawState *s = bs->opaque;
    off_t offs;

    /*
     * SEEK_DATA cases:
     * D1. offs == start: start is in data
     * D2. offs > start: start is in a hole, next data at offs
     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
     *                              or start is beyond EOF
     *     If the latter happens, the file has been truncated behind
     *     our back since we opened it.  All bets are off then.
     *     Treating like a trailing hole is simplest.
     * D4.
     *     offs < 0, errno != ENXIO: we learned nothing
     */
    offs = lseek(s->fd, start, SEEK_DATA);
    if (offs < 0) {
        return -errno;          /* D3 or D4 */
    }
    assert(offs >= start);

    if (offs > start) {
        /* D2: in hole, next data at offs */
        *hole = start;
        *data = offs;
        return 0;
    }

    /* D1: in data, end not yet known */

    /*
     * SEEK_HOLE cases:
     * H1. offs == start: start is in a hole
     *     If this happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H2. offs > start: either start is in data, next hole at offs,
     *                   or start is in trailing hole, EOF at offs
     *     Linux treats trailing holes like any other hole: offs ==
     *     start.  Solaris seeks to EOF instead: offs > start (blech).
     *     If that happens here, a hole has been dug behind our back
     *     since the previous lseek().
     * H3. offs < 0, errno = ENXIO: start is beyond EOF
     *     If this happens, the file has been truncated behind our
     *     back since we opened it.  Treat it like a trailing hole.
     * H4. offs < 0, errno != ENXIO: we learned nothing
     *     Pretend we know nothing at all, i.e. "forget" about D1.
     */
    offs = lseek(s->fd, start, SEEK_HOLE);
    if (offs < 0) {
        return -errno;          /* D1 and (H3 or H4) */
    }
    assert(offs >= start);

    if (offs > start) {
        /*
         * D1 and H2: either in data, next hole at offs, or it was in
         * data but is now in a trailing hole.  In the latter case,
         * all bets are off.  Treating it as if there was data all
         * the way to EOF is safe, so simply do that.
         */
        *data = start;
        *hole = offs;
        return 0;
    }

    /* D1 and H1 */
    return -EBUSY;
#else
    /* Built without SEEK_HOLE/SEEK_DATA: allocation cannot be queried. */
    return -ENOTSUP;
#endif
}

/*
 * Returns the allocation status of the specified sectors.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
                                                    int64_t sector_num,
                                                    int nb_sectors, int *pnum,
                                                    BlockDriverState **file)
{
    off_t start, data = 0, hole = 0;
    int64_t total_size;
    int ret;

    ret = fd_open(bs);
    if (ret < 0) {
        return ret;
    }

    start = sector_num * BDRV_SECTOR_SIZE;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        return total_size;
    } else if (start >= total_size) {
        *pnum = 0;
        return 0;
    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
        /* Clamp the request to the end of the image. */
        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
    }

    ret = find_allocation(bs, start, &data, &hole);
    if (ret == -ENXIO) {
        /* Trailing hole */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_ZERO;
    } else if (ret < 0) {
        /* No info available, so pretend there are no holes */
        *pnum = nb_sectors;
        ret = BDRV_BLOCK_DATA;
    } else if (data == start) {
        /* On a data extent, compute sectors to the end of the extent,
         * possibly including a partial sector at EOF. */
        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
        ret = BDRV_BLOCK_DATA;
    } else {
        /* On a hole, compute sectors to the beginning of the next extent.
*/ 2185 assert(hole == start); 2186 *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); 2187 ret = BDRV_BLOCK_ZERO; 2188 } 2189 *file = bs; 2190 return ret | BDRV_BLOCK_OFFSET_VALID | start; 2191 } 2192 2193 static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs, 2194 int64_t offset, int bytes, 2195 BlockCompletionFunc *cb, void *opaque) 2196 { 2197 BDRVRawState *s = bs->opaque; 2198 2199 return paio_submit(bs, s->fd, offset, NULL, bytes, 2200 cb, opaque, QEMU_AIO_DISCARD); 2201 } 2202 2203 static int coroutine_fn raw_co_pwrite_zeroes( 2204 BlockDriverState *bs, int64_t offset, 2205 int bytes, BdrvRequestFlags flags) 2206 { 2207 BDRVRawState *s = bs->opaque; 2208 2209 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 2210 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2211 QEMU_AIO_WRITE_ZEROES); 2212 } else if (s->discard_zeroes) { 2213 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2214 QEMU_AIO_DISCARD); 2215 } 2216 return -ENOTSUP; 2217 } 2218 2219 static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 2220 { 2221 BDRVRawState *s = bs->opaque; 2222 2223 bdi->unallocated_blocks_are_zero = s->discard_zeroes; 2224 bdi->can_write_zeroes_with_unmap = s->discard_zeroes; 2225 return 0; 2226 } 2227 2228 static QemuOptsList raw_create_opts = { 2229 .name = "raw-create-opts", 2230 .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), 2231 .desc = { 2232 { 2233 .name = BLOCK_OPT_SIZE, 2234 .type = QEMU_OPT_SIZE, 2235 .help = "Virtual disk size" 2236 }, 2237 { 2238 .name = BLOCK_OPT_NOCOW, 2239 .type = QEMU_OPT_BOOL, 2240 .help = "Turn off copy-on-write (valid only on btrfs)" 2241 }, 2242 { 2243 .name = BLOCK_OPT_PREALLOC, 2244 .type = QEMU_OPT_STRING, 2245 .help = "Preallocation mode (allowed values: off, falloc, full)" 2246 }, 2247 { /* end of list */ } 2248 } 2249 }; 2250 2251 static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared, 2252 Error **errp) 2253 { 2254 return raw_handle_perm_lock(bs, RAW_PL_PREPARE, 
                                perm, shared, errp);
}

/* Commits a permission change (RAW_PL_COMMIT) and records the new
 * permissions in the driver state. */
static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
{
    BDRVRawState *s = bs->opaque;
    raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
    s->perm = perm;
    s->shared_perm = shared;
}

/* Aborts a pending permission change (RAW_PL_ABORT). */
static void raw_abort_perm_update(BlockDriverState *bs)
{
    raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
}

/* Driver table for the "file" protocol. */
BlockDriver bdrv_file = {
    .format_name = "file",
    .protocol_name = "file",
    .instance_size = sizeof(BDRVRawState),
    .bdrv_needs_filename = true,
    .bdrv_probe = NULL, /* no probe for protocols */
    .bdrv_parse_filename = raw_parse_filename,
    .bdrv_file_open = raw_open,
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit = raw_reopen_commit,
    .bdrv_reopen_abort = raw_reopen_abort,
    .bdrv_close = raw_close,
    .bdrv_create = raw_create,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_co_get_block_status = raw_co_get_block_status,
    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,

    .bdrv_co_preadv         = raw_co_preadv,
    .bdrv_co_pwritev        = raw_co_pwritev,
    .bdrv_aio_flush = raw_aio_flush,
    .bdrv_aio_pdiscard = raw_aio_pdiscard,
    .bdrv_refresh_limits = raw_refresh_limits,
    .bdrv_io_plug = raw_aio_plug,
    .bdrv_io_unplug = raw_aio_unplug,

    .bdrv_truncate = raw_truncate,
    .bdrv_getlength = raw_getlength,
    .bdrv_get_info = raw_get_info,
    .bdrv_get_allocated_file_size
                        = raw_get_allocated_file_size,
    .bdrv_check_perm = raw_check_perm,
    .bdrv_set_perm   = raw_set_perm,
    .bdrv_abort_perm_update = raw_abort_perm_update,
    .create_opts = &raw_create_opts,
};

/***********************************************/
/* host device */

#if defined(__APPLE__) && defined(__MACH__)
/* Forward declaration; the definition follows FindEjectableOpticalMedia(). */
static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
                                CFIndex maxPathSize, int flags);
static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator) 2313 { 2314 kern_return_t kernResult = KERN_FAILURE; 2315 mach_port_t masterPort; 2316 CFMutableDictionaryRef classesToMatch; 2317 const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass}; 2318 char *mediaType = NULL; 2319 2320 kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); 2321 if ( KERN_SUCCESS != kernResult ) { 2322 printf( "IOMasterPort returned %d\n", kernResult ); 2323 } 2324 2325 int index; 2326 for (index = 0; index < ARRAY_SIZE(matching_array); index++) { 2327 classesToMatch = IOServiceMatching(matching_array[index]); 2328 if (classesToMatch == NULL) { 2329 error_report("IOServiceMatching returned NULL for %s", 2330 matching_array[index]); 2331 continue; 2332 } 2333 CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey), 2334 kCFBooleanTrue); 2335 kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch, 2336 mediaIterator); 2337 if (kernResult != KERN_SUCCESS) { 2338 error_report("Note: IOServiceGetMatchingServices returned %d", 2339 kernResult); 2340 continue; 2341 } 2342 2343 /* If a match was found, leave the loop */ 2344 if (*mediaIterator != 0) { 2345 DPRINTF("Matching using %s\n", matching_array[index]); 2346 mediaType = g_strdup(matching_array[index]); 2347 break; 2348 } 2349 } 2350 return mediaType; 2351 } 2352 2353 kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, 2354 CFIndex maxPathSize, int flags) 2355 { 2356 io_object_t nextMedia; 2357 kern_return_t kernResult = KERN_FAILURE; 2358 *bsdPath = '\0'; 2359 nextMedia = IOIteratorNext( mediaIterator ); 2360 if ( nextMedia ) 2361 { 2362 CFTypeRef bsdPathAsCFString; 2363 bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 ); 2364 if ( bsdPathAsCFString ) { 2365 size_t devPathLength; 2366 strcpy( bsdPath, _PATH_DEV ); 2367 if (flags & BDRV_O_NOCACHE) { 2368 strcat(bsdPath, "r"); 2369 } 2370 devPathLength = strlen( 
bsdPath ); 2371 if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) { 2372 kernResult = KERN_SUCCESS; 2373 } 2374 CFRelease( bsdPathAsCFString ); 2375 } 2376 IOObjectRelease( nextMedia ); 2377 } 2378 2379 return kernResult; 2380 } 2381 2382 /* Sets up a real cdrom for use in QEMU */ 2383 static bool setup_cdrom(char *bsd_path, Error **errp) 2384 { 2385 int index, num_of_test_partitions = 2, fd; 2386 char test_partition[MAXPATHLEN]; 2387 bool partition_found = false; 2388 2389 /* look for a working partition */ 2390 for (index = 0; index < num_of_test_partitions; index++) { 2391 snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path, 2392 index); 2393 fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE); 2394 if (fd >= 0) { 2395 partition_found = true; 2396 qemu_close(fd); 2397 break; 2398 } 2399 } 2400 2401 /* if a working partition on the device was not found */ 2402 if (partition_found == false) { 2403 error_setg(errp, "Failed to find a working partition on disc"); 2404 } else { 2405 DPRINTF("Using %s as optical disc\n", test_partition); 2406 pstrcpy(bsd_path, MAXPATHLEN, test_partition); 2407 } 2408 return partition_found; 2409 } 2410 2411 /* Prints directions on mounting and unmounting a device */ 2412 static void print_unmounting_directions(const char *file_name) 2413 { 2414 error_report("If device %s is mounted on the desktop, unmount" 2415 " it first before using it in QEMU", file_name); 2416 error_report("Command to unmount device: diskutil unmountDisk %s", 2417 file_name); 2418 error_report("Command to mount device: diskutil mountDisk %s", file_name); 2419 } 2420 2421 #endif /* defined(__APPLE__) && defined(__MACH__) */ 2422 2423 static int hdev_probe_device(const char *filename) 2424 { 2425 struct stat st; 2426 2427 /* allow a dedicated CD-ROM driver to match with a higher priority */ 2428 if (strstart(filename, "/dev/cdrom", NULL)) 2429 return 50; 2430 2431 
if (stat(filename, &st) >= 0 && 2432 (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) { 2433 return 100; 2434 } 2435 2436 return 0; 2437 } 2438 2439 static int check_hdev_writable(BDRVRawState *s) 2440 { 2441 #if defined(BLKROGET) 2442 /* Linux block devices can be configured "read-only" using blockdev(8). 2443 * This is independent of device node permissions and therefore open(2) 2444 * with O_RDWR succeeds. Actual writes fail with EPERM. 2445 * 2446 * bdrv_open() is supposed to fail if the disk is read-only. Explicitly 2447 * check for read-only block devices so that Linux block devices behave 2448 * properly. 2449 */ 2450 struct stat st; 2451 int readonly = 0; 2452 2453 if (fstat(s->fd, &st)) { 2454 return -errno; 2455 } 2456 2457 if (!S_ISBLK(st.st_mode)) { 2458 return 0; 2459 } 2460 2461 if (ioctl(s->fd, BLKROGET, &readonly) < 0) { 2462 return -errno; 2463 } 2464 2465 if (readonly) { 2466 return -EACCES; 2467 } 2468 #endif /* defined(BLKROGET) */ 2469 return 0; 2470 } 2471 2472 static void hdev_parse_filename(const char *filename, QDict *options, 2473 Error **errp) 2474 { 2475 bdrv_parse_filename_strip_prefix(filename, "host_device:", options); 2476 } 2477 2478 static bool hdev_is_sg(BlockDriverState *bs) 2479 { 2480 2481 #if defined(__linux__) 2482 2483 BDRVRawState *s = bs->opaque; 2484 struct stat st; 2485 struct sg_scsi_id scsiid; 2486 int sg_version; 2487 int ret; 2488 2489 if (stat(bs->filename, &st) < 0 || !S_ISCHR(st.st_mode)) { 2490 return false; 2491 } 2492 2493 ret = ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version); 2494 if (ret < 0) { 2495 return false; 2496 } 2497 2498 ret = ioctl(s->fd, SG_GET_SCSI_ID, &scsiid); 2499 if (ret >= 0) { 2500 DPRINTF("SG device found: type=%d, version=%d\n", 2501 scsiid.scsi_type, sg_version); 2502 return true; 2503 } 2504 2505 #endif 2506 2507 return false; 2508 } 2509 2510 static int hdev_open(BlockDriverState *bs, QDict *options, int flags, 2511 Error **errp) 2512 { 2513 BDRVRawState *s = bs->opaque; 2514 Error 
*local_err = NULL; 2515 int ret; 2516 2517 #if defined(__APPLE__) && defined(__MACH__) 2518 /* 2519 * Caution: while qdict_get_str() is fine, getting non-string types 2520 * would require more care. When @options come from -blockdev or 2521 * blockdev_add, its members are typed according to the QAPI 2522 * schema, but when they come from -drive, they're all QString. 2523 */ 2524 const char *filename = qdict_get_str(options, "filename"); 2525 char bsd_path[MAXPATHLEN] = ""; 2526 bool error_occurred = false; 2527 2528 /* If using a real cdrom */ 2529 if (strcmp(filename, "/dev/cdrom") == 0) { 2530 char *mediaType = NULL; 2531 kern_return_t ret_val; 2532 io_iterator_t mediaIterator = 0; 2533 2534 mediaType = FindEjectableOpticalMedia(&mediaIterator); 2535 if (mediaType == NULL) { 2536 error_setg(errp, "Please make sure your CD/DVD is in the optical" 2537 " drive"); 2538 error_occurred = true; 2539 goto hdev_open_Mac_error; 2540 } 2541 2542 ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags); 2543 if (ret_val != KERN_SUCCESS) { 2544 error_setg(errp, "Could not get BSD path for optical drive"); 2545 error_occurred = true; 2546 goto hdev_open_Mac_error; 2547 } 2548 2549 /* If a real optical drive was not found */ 2550 if (bsd_path[0] == '\0') { 2551 error_setg(errp, "Failed to obtain bsd path for optical drive"); 2552 error_occurred = true; 2553 goto hdev_open_Mac_error; 2554 } 2555 2556 /* If using a cdrom disc and finding a partition on the disc failed */ 2557 if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 && 2558 setup_cdrom(bsd_path, errp) == false) { 2559 print_unmounting_directions(bsd_path); 2560 error_occurred = true; 2561 goto hdev_open_Mac_error; 2562 } 2563 2564 qdict_put_str(options, "filename", bsd_path); 2565 2566 hdev_open_Mac_error: 2567 g_free(mediaType); 2568 if (mediaIterator) { 2569 IOObjectRelease(mediaIterator); 2570 } 2571 if (error_occurred) { 2572 return -ENOENT; 2573 } 2574 } 2575 #endif /* defined(__APPLE__) && 
defined(__MACH__) */ 2576 2577 s->type = FTYPE_FILE; 2578 2579 ret = raw_open_common(bs, options, flags, 0, &local_err); 2580 if (ret < 0) { 2581 error_propagate(errp, local_err); 2582 #if defined(__APPLE__) && defined(__MACH__) 2583 if (*bsd_path) { 2584 filename = bsd_path; 2585 } 2586 /* if a physical device experienced an error while being opened */ 2587 if (strncmp(filename, "/dev/", 5) == 0) { 2588 print_unmounting_directions(filename); 2589 } 2590 #endif /* defined(__APPLE__) && defined(__MACH__) */ 2591 return ret; 2592 } 2593 2594 /* Since this does ioctl the device must be already opened */ 2595 bs->sg = hdev_is_sg(bs); 2596 2597 if (flags & BDRV_O_RDWR) { 2598 ret = check_hdev_writable(s); 2599 if (ret < 0) { 2600 raw_close(bs); 2601 error_setg_errno(errp, -ret, "The device is not writable"); 2602 return ret; 2603 } 2604 } 2605 2606 return ret; 2607 } 2608 2609 #if defined(__linux__) 2610 2611 static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, 2612 unsigned long int req, void *buf, 2613 BlockCompletionFunc *cb, void *opaque) 2614 { 2615 BDRVRawState *s = bs->opaque; 2616 RawPosixAIOData *acb; 2617 ThreadPool *pool; 2618 2619 if (fd_open(bs) < 0) 2620 return NULL; 2621 2622 if (req == SG_IO && s->pr_mgr) { 2623 struct sg_io_hdr *io_hdr = buf; 2624 if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT || 2625 io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) { 2626 return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs), 2627 s->fd, io_hdr, cb, opaque); 2628 } 2629 } 2630 2631 acb = g_new(RawPosixAIOData, 1); 2632 acb->bs = bs; 2633 acb->aio_type = QEMU_AIO_IOCTL; 2634 acb->aio_fildes = s->fd; 2635 acb->aio_offset = 0; 2636 acb->aio_ioctl_buf = buf; 2637 acb->aio_ioctl_cmd = req; 2638 pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); 2639 return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); 2640 } 2641 #endif /* linux */ 2642 2643 static int fd_open(BlockDriverState *bs) 2644 { 2645 BDRVRawState *s = bs->opaque; 2646 2647 /* this is just to 
ensure s->fd is sane (its called by io ops) */ 2648 if (s->fd >= 0) 2649 return 0; 2650 return -EIO; 2651 } 2652 2653 static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs, 2654 int64_t offset, int bytes, 2655 BlockCompletionFunc *cb, void *opaque) 2656 { 2657 BDRVRawState *s = bs->opaque; 2658 2659 if (fd_open(bs) < 0) { 2660 return NULL; 2661 } 2662 return paio_submit(bs, s->fd, offset, NULL, bytes, 2663 cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); 2664 } 2665 2666 static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs, 2667 int64_t offset, int bytes, BdrvRequestFlags flags) 2668 { 2669 BDRVRawState *s = bs->opaque; 2670 int rc; 2671 2672 rc = fd_open(bs); 2673 if (rc < 0) { 2674 return rc; 2675 } 2676 if (!(flags & BDRV_REQ_MAY_UNMAP)) { 2677 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2678 QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); 2679 } else if (s->discard_zeroes) { 2680 return paio_submit_co(bs, s->fd, offset, NULL, bytes, 2681 QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); 2682 } 2683 return -ENOTSUP; 2684 } 2685 2686 static int hdev_create(const char *filename, QemuOpts *opts, 2687 Error **errp) 2688 { 2689 int fd; 2690 int ret = 0; 2691 struct stat stat_buf; 2692 int64_t total_size = 0; 2693 bool has_prefix; 2694 2695 /* This function is used by both protocol block drivers and therefore either 2696 * of these prefixes may be given. 2697 * The return value has to be stored somewhere, otherwise this is an error 2698 * due to -Werror=unused-value. 
*/ 2699 has_prefix = 2700 strstart(filename, "host_device:", &filename) || 2701 strstart(filename, "host_cdrom:" , &filename); 2702 2703 (void)has_prefix; 2704 2705 ret = raw_normalize_devicepath(&filename); 2706 if (ret < 0) { 2707 error_setg_errno(errp, -ret, "Could not normalize device path"); 2708 return ret; 2709 } 2710 2711 /* Read out options */ 2712 total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), 2713 BDRV_SECTOR_SIZE); 2714 2715 fd = qemu_open(filename, O_WRONLY | O_BINARY); 2716 if (fd < 0) { 2717 ret = -errno; 2718 error_setg_errno(errp, -ret, "Could not open device"); 2719 return ret; 2720 } 2721 2722 if (fstat(fd, &stat_buf) < 0) { 2723 ret = -errno; 2724 error_setg_errno(errp, -ret, "Could not stat device"); 2725 } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { 2726 error_setg(errp, 2727 "The given file is neither a block nor a character device"); 2728 ret = -ENODEV; 2729 } else if (lseek(fd, 0, SEEK_END) < total_size) { 2730 error_setg(errp, "Device is too small"); 2731 ret = -ENOSPC; 2732 } 2733 2734 if (!ret && total_size) { 2735 uint8_t buf[BDRV_SECTOR_SIZE] = { 0 }; 2736 int64_t zero_size = MIN(BDRV_SECTOR_SIZE, total_size); 2737 if (lseek(fd, 0, SEEK_SET) == -1) { 2738 ret = -errno; 2739 } else { 2740 ret = qemu_write_full(fd, buf, zero_size); 2741 ret = ret == zero_size ? 
0 : -errno; 2742 } 2743 } 2744 qemu_close(fd); 2745 return ret; 2746 } 2747 2748 static BlockDriver bdrv_host_device = { 2749 .format_name = "host_device", 2750 .protocol_name = "host_device", 2751 .instance_size = sizeof(BDRVRawState), 2752 .bdrv_needs_filename = true, 2753 .bdrv_probe_device = hdev_probe_device, 2754 .bdrv_parse_filename = hdev_parse_filename, 2755 .bdrv_file_open = hdev_open, 2756 .bdrv_close = raw_close, 2757 .bdrv_reopen_prepare = raw_reopen_prepare, 2758 .bdrv_reopen_commit = raw_reopen_commit, 2759 .bdrv_reopen_abort = raw_reopen_abort, 2760 .bdrv_create = hdev_create, 2761 .create_opts = &raw_create_opts, 2762 .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes, 2763 2764 .bdrv_co_preadv = raw_co_preadv, 2765 .bdrv_co_pwritev = raw_co_pwritev, 2766 .bdrv_aio_flush = raw_aio_flush, 2767 .bdrv_aio_pdiscard = hdev_aio_pdiscard, 2768 .bdrv_refresh_limits = raw_refresh_limits, 2769 .bdrv_io_plug = raw_aio_plug, 2770 .bdrv_io_unplug = raw_aio_unplug, 2771 2772 .bdrv_truncate = raw_truncate, 2773 .bdrv_getlength = raw_getlength, 2774 .bdrv_get_info = raw_get_info, 2775 .bdrv_get_allocated_file_size 2776 = raw_get_allocated_file_size, 2777 .bdrv_check_perm = raw_check_perm, 2778 .bdrv_set_perm = raw_set_perm, 2779 .bdrv_abort_perm_update = raw_abort_perm_update, 2780 .bdrv_probe_blocksizes = hdev_probe_blocksizes, 2781 .bdrv_probe_geometry = hdev_probe_geometry, 2782 2783 /* generic scsi device */ 2784 #ifdef __linux__ 2785 .bdrv_aio_ioctl = hdev_aio_ioctl, 2786 #endif 2787 }; 2788 2789 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 2790 static void cdrom_parse_filename(const char *filename, QDict *options, 2791 Error **errp) 2792 { 2793 bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options); 2794 } 2795 #endif 2796 2797 #ifdef __linux__ 2798 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, 2799 Error **errp) 2800 { 2801 BDRVRawState *s = bs->opaque; 2802 2803 s->type = FTYPE_CD; 2804 2805 
/* open will not fail even if no CD is inserted, so add O_NONBLOCK */ 2806 return raw_open_common(bs, options, flags, O_NONBLOCK, errp); 2807 } 2808 2809 static int cdrom_probe_device(const char *filename) 2810 { 2811 int fd, ret; 2812 int prio = 0; 2813 struct stat st; 2814 2815 fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); 2816 if (fd < 0) { 2817 goto out; 2818 } 2819 ret = fstat(fd, &st); 2820 if (ret == -1 || !S_ISBLK(st.st_mode)) { 2821 goto outc; 2822 } 2823 2824 /* Attempt to detect via a CDROM specific ioctl */ 2825 ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); 2826 if (ret >= 0) 2827 prio = 100; 2828 2829 outc: 2830 qemu_close(fd); 2831 out: 2832 return prio; 2833 } 2834 2835 static bool cdrom_is_inserted(BlockDriverState *bs) 2836 { 2837 BDRVRawState *s = bs->opaque; 2838 int ret; 2839 2840 ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); 2841 return ret == CDS_DISC_OK; 2842 } 2843 2844 static void cdrom_eject(BlockDriverState *bs, bool eject_flag) 2845 { 2846 BDRVRawState *s = bs->opaque; 2847 2848 if (eject_flag) { 2849 if (ioctl(s->fd, CDROMEJECT, NULL) < 0) 2850 perror("CDROMEJECT"); 2851 } else { 2852 if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) 2853 perror("CDROMEJECT"); 2854 } 2855 } 2856 2857 static void cdrom_lock_medium(BlockDriverState *bs, bool locked) 2858 { 2859 BDRVRawState *s = bs->opaque; 2860 2861 if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) { 2862 /* 2863 * Note: an error can happen if the distribution automatically 2864 * mounts the CD-ROM 2865 */ 2866 /* perror("CDROM_LOCKDOOR"); */ 2867 } 2868 } 2869 2870 static BlockDriver bdrv_host_cdrom = { 2871 .format_name = "host_cdrom", 2872 .protocol_name = "host_cdrom", 2873 .instance_size = sizeof(BDRVRawState), 2874 .bdrv_needs_filename = true, 2875 .bdrv_probe_device = cdrom_probe_device, 2876 .bdrv_parse_filename = cdrom_parse_filename, 2877 .bdrv_file_open = cdrom_open, 2878 .bdrv_close = raw_close, 2879 .bdrv_reopen_prepare = raw_reopen_prepare, 2880 .bdrv_reopen_commit = 
raw_reopen_commit, 2881 .bdrv_reopen_abort = raw_reopen_abort, 2882 .bdrv_create = hdev_create, 2883 .create_opts = &raw_create_opts, 2884 2885 2886 .bdrv_co_preadv = raw_co_preadv, 2887 .bdrv_co_pwritev = raw_co_pwritev, 2888 .bdrv_aio_flush = raw_aio_flush, 2889 .bdrv_refresh_limits = raw_refresh_limits, 2890 .bdrv_io_plug = raw_aio_plug, 2891 .bdrv_io_unplug = raw_aio_unplug, 2892 2893 .bdrv_truncate = raw_truncate, 2894 .bdrv_getlength = raw_getlength, 2895 .has_variable_length = true, 2896 .bdrv_get_allocated_file_size 2897 = raw_get_allocated_file_size, 2898 2899 /* removable device support */ 2900 .bdrv_is_inserted = cdrom_is_inserted, 2901 .bdrv_eject = cdrom_eject, 2902 .bdrv_lock_medium = cdrom_lock_medium, 2903 2904 /* generic scsi device */ 2905 .bdrv_aio_ioctl = hdev_aio_ioctl, 2906 }; 2907 #endif /* __linux__ */ 2908 2909 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) 2910 static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, 2911 Error **errp) 2912 { 2913 BDRVRawState *s = bs->opaque; 2914 Error *local_err = NULL; 2915 int ret; 2916 2917 s->type = FTYPE_CD; 2918 2919 ret = raw_open_common(bs, options, flags, 0, &local_err); 2920 if (ret) { 2921 error_propagate(errp, local_err); 2922 return ret; 2923 } 2924 2925 /* make sure the door isn't locked at this time */ 2926 ioctl(s->fd, CDIOCALLOW); 2927 return 0; 2928 } 2929 2930 static int cdrom_probe_device(const char *filename) 2931 { 2932 if (strstart(filename, "/dev/cd", NULL) || 2933 strstart(filename, "/dev/acd", NULL)) 2934 return 100; 2935 return 0; 2936 } 2937 2938 static int cdrom_reopen(BlockDriverState *bs) 2939 { 2940 BDRVRawState *s = bs->opaque; 2941 int fd; 2942 2943 /* 2944 * Force reread of possibly changed/newly loaded disc, 2945 * FreeBSD seems to not notice sometimes... 
2946 */ 2947 if (s->fd >= 0) 2948 qemu_close(s->fd); 2949 fd = qemu_open(bs->filename, s->open_flags, 0644); 2950 if (fd < 0) { 2951 s->fd = -1; 2952 return -EIO; 2953 } 2954 s->fd = fd; 2955 2956 /* make sure the door isn't locked at this time */ 2957 ioctl(s->fd, CDIOCALLOW); 2958 return 0; 2959 } 2960 2961 static bool cdrom_is_inserted(BlockDriverState *bs) 2962 { 2963 return raw_getlength(bs) > 0; 2964 } 2965 2966 static void cdrom_eject(BlockDriverState *bs, bool eject_flag) 2967 { 2968 BDRVRawState *s = bs->opaque; 2969 2970 if (s->fd < 0) 2971 return; 2972 2973 (void) ioctl(s->fd, CDIOCALLOW); 2974 2975 if (eject_flag) { 2976 if (ioctl(s->fd, CDIOCEJECT) < 0) 2977 perror("CDIOCEJECT"); 2978 } else { 2979 if (ioctl(s->fd, CDIOCCLOSE) < 0) 2980 perror("CDIOCCLOSE"); 2981 } 2982 2983 cdrom_reopen(bs); 2984 } 2985 2986 static void cdrom_lock_medium(BlockDriverState *bs, bool locked) 2987 { 2988 BDRVRawState *s = bs->opaque; 2989 2990 if (s->fd < 0) 2991 return; 2992 if (ioctl(s->fd, (locked ? 
CDIOCPREVENT : CDIOCALLOW)) < 0) { 2993 /* 2994 * Note: an error can happen if the distribution automatically 2995 * mounts the CD-ROM 2996 */ 2997 /* perror("CDROM_LOCKDOOR"); */ 2998 } 2999 } 3000 3001 static BlockDriver bdrv_host_cdrom = { 3002 .format_name = "host_cdrom", 3003 .protocol_name = "host_cdrom", 3004 .instance_size = sizeof(BDRVRawState), 3005 .bdrv_needs_filename = true, 3006 .bdrv_probe_device = cdrom_probe_device, 3007 .bdrv_parse_filename = cdrom_parse_filename, 3008 .bdrv_file_open = cdrom_open, 3009 .bdrv_close = raw_close, 3010 .bdrv_reopen_prepare = raw_reopen_prepare, 3011 .bdrv_reopen_commit = raw_reopen_commit, 3012 .bdrv_reopen_abort = raw_reopen_abort, 3013 .bdrv_create = hdev_create, 3014 .create_opts = &raw_create_opts, 3015 3016 .bdrv_co_preadv = raw_co_preadv, 3017 .bdrv_co_pwritev = raw_co_pwritev, 3018 .bdrv_aio_flush = raw_aio_flush, 3019 .bdrv_refresh_limits = raw_refresh_limits, 3020 .bdrv_io_plug = raw_aio_plug, 3021 .bdrv_io_unplug = raw_aio_unplug, 3022 3023 .bdrv_truncate = raw_truncate, 3024 .bdrv_getlength = raw_getlength, 3025 .has_variable_length = true, 3026 .bdrv_get_allocated_file_size 3027 = raw_get_allocated_file_size, 3028 3029 /* removable device support */ 3030 .bdrv_is_inserted = cdrom_is_inserted, 3031 .bdrv_eject = cdrom_eject, 3032 .bdrv_lock_medium = cdrom_lock_medium, 3033 }; 3034 #endif /* __FreeBSD__ */ 3035 3036 static void bdrv_file_init(void) 3037 { 3038 /* 3039 * Register all the drivers. Note that order is important, the driver 3040 * registered last will get probed first. 3041 */ 3042 bdrv_register(&bdrv_file); 3043 bdrv_register(&bdrv_host_device); 3044 #ifdef __linux__ 3045 bdrv_register(&bdrv_host_cdrom); 3046 #endif 3047 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) 3048 bdrv_register(&bdrv_host_cdrom); 3049 #endif 3050 } 3051 3052 block_init(bdrv_file_init); 3053