/*	$OpenBSD: vioqcow2.c,v 1.24 2023/09/14 15:25:43 dv Exp $	*/

/*
 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/stat.h>

#include <dev/pci/pcireg.h>

#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "virtio.h"

#define QCOW2_COMPRESSED	0x4000000000000000ull
#define QCOW2_INPLACE		0x8000000000000000ull

#define QCOW2_DIRTY		(1 << 0)
#define QCOW2_CORRUPT		(1 << 1)

enum {
	ICFEATURE_DIRTY		= 1 << 0,
	ICFEATURE_CORRUPT	= 1 << 1,
};

enum {
	ACFEATURE_BITEXT	= 1 << 0,
};

struct qcheader {
	char magic[4];
	uint32_t version;
	uint64_t backingoff;
	uint32_t backingsz;
	uint32_t clustershift;
	uint64_t disksz;
	uint32_t cryptmethod;
	uint32_t l1sz;
	uint64_t l1off;
	uint64_t refoff;
	uint32_t refsz;
	uint32_t snapcount;
	uint64_t snapsz;
	/* v3 additions */
	uint64_t incompatfeatures;
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;	/* Bits = 1 << reforder */
	uint32_t headersz;
} __packed;

struct qcdisk {
	pthread_rwlock_t lock;
	struct qcdisk *base;
	struct qcheader header;

	int fd;
	uint64_t *l1;
	off_t end;
	off_t clustersz;
	off_t disksz;		/* In bytes */
	uint32_t cryptmethod;

	uint32_t l1sz;
	off_t l1off;

	off_t refoff;
	off_t refsz;

	uint32_t nsnap;
	off_t snapoff;

	/* v3 features */
	uint64_t incompatfeatures;
	uint64_t autoclearfeatures;
	uint32_t refssz;
	uint32_t headersz;
};

extern char *__progname;

static off_t xlate(struct qcdisk *, off_t, int *);
static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static void inc_refs(struct qcdisk *, off_t, int);
static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static int qc2_open(struct qcdisk *, int *, size_t);
static ssize_t qc2_pread(void *, char *, size_t, off_t);
static ssize_t qc2_preadv(void *, struct iovec *, int, off_t);
static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
static ssize_t qc2_pwritev(void *, struct iovec *, int, off_t);
static void qc2_close(void *, int);

/*
 * Initializes a qcow2 disk image backing file from an fd. Stores the
 * number of bytes in *szp, returning -1 for error, 0 for success.
 *
 * May open snapshot base images.
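 *
 * A rough caller-side sketch (illustrative only, not lifted from vmd;
 * the fds[] array and buf are assumed). fds[0] is the qcow2 image
 * itself and fds[1..nfd-1] are its base images, ordered outward from
 * the image toward the root base:
 *
 *	struct virtio_backing vb;
 *	off_t sz;
 *	char buf[512];
 *
 *	if (virtio_qcow2_init(&vb, &sz, fds, nfd) == -1)
 *		return (-1);
 *	if (vb.pread(vb.p, buf, sizeof(buf), 0) != (ssize_t)sizeof(buf))
 *		log_warnx("read failed");
 *	vb.close(vb.p, 0);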
 */
int
virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
{
	struct qcdisk *diskp;

	diskp = malloc(sizeof(struct qcdisk));
	if (diskp == NULL)
		return -1;
	if (qc2_open(diskp, fd, nfd) == -1) {
		log_warnx("could not open qcow2 disk");
		free(diskp);
		return -1;
	}
	file->p = diskp;
	file->pread = qc2_pread;
	file->preadv = qc2_preadv;
	file->pwrite = qc2_pwrite;
	file->pwritev = qc2_pwritev;
	file->close = qc2_close;
	*szp = diskp->disksz;
	return 0;
}

/*
 * Return the path to the base image given a disk image.
 * Called from vmctl.
 */
ssize_t
virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
{
	char dpathbuf[PATH_MAX];
	char expanded[PATH_MAX];
	struct qcheader header;
	uint64_t backingoff;
	uint32_t backingsz;
	char *s = NULL;

	if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
		log_warnx("short read on header");
		return -1;
	}
	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
		log_warnx("invalid magic numbers");
		return -1;
	}
	backingoff = be64toh(header.backingoff);
	backingsz = be32toh(header.backingsz);
	if (backingsz == 0)
		return 0;

	if (backingsz >= npath - 1) {
		log_warnx("snapshot path too long");
		return -1;
	}
	if (pread(fd, path, backingsz, backingoff) != backingsz) {
		log_warnx("could not read snapshot base name");
		return -1;
	}
	path[backingsz] = '\0';

	/*
	 * Relative paths should be interpreted relative to the disk image,
	 * rather than relative to the directory vmd happens to be running in,
	 * since this is the only useful interpretation.
	 */
	if (path[0] == '/') {
		if (realpath(path, expanded) == NULL ||
		    strlcpy(path, expanded, npath) >= npath) {
			log_warnx("unable to resolve %s", path);
			return -1;
		}
	} else {
		if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
		    sizeof(dpathbuf)) {
			log_warnx("path too long: %s", dpath);
			return -1;
		}
		s = dirname(dpathbuf);
		if (snprintf(expanded, sizeof(expanded),
		    "%s/%s", s, path) >= (int)sizeof(expanded)) {
			log_warnx("path too long: %s/%s", s, path);
			return -1;
		}
		if (npath < PATH_MAX ||
		    realpath(expanded, path) == NULL) {
			log_warnx("unable to resolve %s", path);
			return -1;
		}
	}

	return strlen(path);
}

static int
qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
{
	char basepath[PATH_MAX];
	struct stat st;
	struct qcheader header;
	uint64_t backingoff;
	uint32_t backingsz;
	off_t i;
	int version, fd;

	pthread_rwlock_init(&disk->lock, NULL);
	fd = fds[0];
	disk->fd = fd;
	disk->base = NULL;
	disk->l1 = NULL;

	if (pread(fd, &header, sizeof(header), 0) != sizeof(header))
		fatalx("short read on header");
	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
		fatalx("invalid magic numbers");

	disk->clustersz = (1ull << be32toh(header.clustershift));
	disk->disksz = be64toh(header.disksz);
	disk->cryptmethod = be32toh(header.cryptmethod);
	disk->l1sz = be32toh(header.l1sz);
	disk->l1off = be64toh(header.l1off);
	disk->refsz = be32toh(header.refsz);
	disk->refoff = be64toh(header.refoff);
	disk->nsnap = be32toh(header.snapcount);
	disk->snapoff = be64toh(header.snapsz);

	/*
	 * The additional features here are defined as 0 in the v2
	 * format, so as long as we clear the buffer before parsing, we
	 * don't need to check versions here.
	 */
	disk->incompatfeatures = be64toh(header.incompatfeatures);
	disk->autoclearfeatures = be64toh(header.autoclearfeatures);
	disk->refssz = be32toh(header.refsz);
	disk->headersz = be32toh(header.headersz);

	/*
	 * We only know about the dirty or corrupt bits here.
	 */
	if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT))
		fatalx("unsupported features %llx",
		    disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
	if (be32toh(header.reforder) != 4)
		fatalx("unsupported refcount size\n");

	disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
	if (!disk->l1)
		fatal("%s: could not allocate l1 table", __func__);
	if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
	    != 8 * disk->l1sz)
		fatalx("%s: unable to read qcow2 L1 table", __func__);
	for (i = 0; i < disk->l1sz; i++)
		disk->l1[i] = be64toh(disk->l1[i]);
	version = be32toh(header.version);
	if (version != 2 && version != 3)
		fatalx("%s: unknown qcow2 version %d", __func__, version);

	backingoff = be64toh(header.backingoff);
	backingsz = be32toh(header.backingsz);
	if (backingsz != 0) {
		if (backingsz >= sizeof(basepath) - 1) {
			fatalx("%s: snapshot path too long", __func__);
		}
		if (pread(fd, basepath, backingsz, backingoff) != backingsz) {
			fatalx("%s: could not read snapshot base name",
			    __func__);
		}
		basepath[backingsz] = 0;
		if (nfd <= 1) {
			fatalx("%s: missing base image %s", __func__,
			    basepath);
		}

		disk->base = calloc(1, sizeof(struct qcdisk));
		if (!disk->base)
			fatal("%s: could not open %s", __func__, basepath);
		if (qc2_open(disk->base, fds + 1, nfd - 1) == -1)
			fatalx("%s: could not open %s", __func__, basepath);
		if (disk->base->clustersz != disk->clustersz)
			fatalx("%s: all disk parts must share clustersize",
			    __func__);
	}
	if (fstat(fd, &st) == -1)
		fatal("%s: unable to stat disk", __func__);

	disk->end = st.st_size;

	log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d",
	    __func__, version, disk->disksz, disk->end, disk->nsnap);

	return 0;
}

static ssize_t
qc2_preadv(void *p, struct iovec *iov, int cnt, off_t offset)
{
	int i;
	off_t pos = offset;
	ssize_t sz = 0, total = 0;

	for (i = 0; i < cnt; i++, iov++) {
		sz = qc2_pread(p, iov->iov_base, iov->iov_len, pos);
		if (sz == -1)
			return (sz);
		total += sz;
		pos += sz;
	}

	return (total);
}

static ssize_t
qc2_pread(void *p, char *buf, size_t len, off_t off)
{
	struct qcdisk *disk, *d;
	off_t phys_off, end, cluster_off;
	ssize_t sz, rem;

	disk = p;
	end = off + len;
	if (off < 0 || end > disk->disksz)
		return -1;

	/* handle head chunk separately */
	rem = len;
	while (off != end) {
		for (d = disk; d; d = d->base)
			if ((phys_off = xlate(d, off, NULL)) > 0)
				break;
		/* Break out into chunks. This handles
		 * three cases:
		 *
		 *    |----+====|========|====+-----|
		 *
		 * Either we are at the start of the read,
		 * and the cluster has some leading bytes.
		 * This means that we are reading the tail
		 * of the cluster, and our size is:
		 *
		 *    clustersz - (off % clustersz).
		 *
		 * Otherwise, we're reading the middle section.
		 * We're already aligned here, so we can just
		 * read the whole cluster size.
		 * Or we're at the tail, at which point we just
		 * want to read the remaining bytes.
		 */
		cluster_off = off % disk->clustersz;
		sz = disk->clustersz - cluster_off;
		if (sz > rem)
			sz = rem;
		/*
		 * If we're within the disk, but don't have backing bytes,
		 * just read back zeros.
		 */
		if (!d)
			bzero(buf, sz);
		else if (pread(d->fd, buf, sz, phys_off) != sz)
			return -1;
		off += sz;
		buf += sz;
		rem -= sz;
	}
	return len;
}

static ssize_t
qc2_pwritev(void *p, struct iovec *iov, int cnt, off_t offset)
{
	int i;
	off_t pos = offset;
	ssize_t sz = 0, total = 0;

	for (i = 0; i < cnt; i++, iov++) {
		sz = qc2_pwrite(p, iov->iov_base, iov->iov_len, pos);
		if (sz == -1)
			return (sz);
		total += sz;
		pos += sz;
	}

	return (total);
}

static ssize_t
qc2_pwrite(void *p, char *buf, size_t len, off_t off)
{
	struct qcdisk *disk, *d;
	off_t phys_off, cluster_off, end;
	ssize_t sz, rem;
	int inplace;

	d = p;
	disk = p;
	inplace = 1;
	end = off + len;
	if (off < 0 || end > disk->disksz)
		return -1;
	rem = len;
	while (off != end) {
		/* See the read code for a summary of the computation */
		cluster_off = off % disk->clustersz;
		sz = disk->clustersz - cluster_off;
		if (sz > rem)
			sz = rem;

		phys_off = xlate(disk, off, &inplace);
		if (phys_off == -1)
			return -1;
		/*
		 * If we couldn't find the cluster in the writable disk,
		 * see if it exists in the base image. If it does, we
		 * need to copy it before the write. The copy happens
		 * in the '!inplace' if clause below the search.
		 */
		if (phys_off == 0)
			for (d = disk->base; d; d = d->base)
				if ((phys_off = xlate(d, off, NULL)) > 0)
					break;
		if (!inplace || phys_off == 0)
			phys_off = mkcluster(disk, d, off, phys_off);
		if (phys_off == -1)
			return -1;
		if (phys_off < disk->clustersz)
			fatalx("%s: writing reserved cluster", __func__);
		if (pwrite(disk->fd, buf, sz, phys_off) != sz)
			return -1;
		off += sz;
		buf += sz;
		rem -= sz;
	}
	return len;
}

static void
qc2_close(void *p, int stayopen)
{
	struct qcdisk *disk;

	disk = p;
	if (disk->base)
		qc2_close(disk->base, stayopen);
	if (!stayopen)
		close(disk->fd);
	free(disk->l1);
	free(disk);
}

/*
 * Translates a virtual offset into an on-disk offset.
 * Returns:
 *	-1 on error
 *	 0 on 'not found'
 *	>0 on found
 */
static off_t
xlate(struct qcdisk *disk, off_t off, int *inplace)
{
	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
	uint64_t buf;

	/*
	 * Clear out inplace flag -- xlate misses should not
	 * be flagged as updatable in place. We will still
	 * return 0 from them, but this leaves fewer surprises
	 * in the API.
	 */
	if (inplace)
		*inplace = 0;
	pthread_rwlock_rdlock(&disk->lock);
	if (off < 0)
		goto err;

	l2sz = disk->clustersz / 8;
	l1off = (off / disk->clustersz) / l2sz;
	if (l1off >= disk->l1sz)
		goto err;

	l2tab = disk->l1[l1off];
	l2tab &= ~QCOW2_INPLACE;
	if (l2tab == 0) {
		pthread_rwlock_unlock(&disk->lock);
		return 0;
	}
	l2off = (off / disk->clustersz) % l2sz;
	pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
	cluster = be64toh(buf);
	/*
	 * cluster may be 0; none of the operations below affect the
	 * return value in that case.
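	 *
	 * For reference (matching the masks defined at the top of this
	 * file and the qcow2 on-disk format): bit 63 of an L2 entry is
	 * the "copied"/in-place flag (QCOW2_INPLACE), bit 62 marks a
	 * compressed cluster (QCOW2_COMPRESSED), and the remaining bits
	 * hold the cluster's byte offset in the image file.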
	 */
	if (inplace)
		*inplace = !!(cluster & QCOW2_INPLACE);
	if (cluster & QCOW2_COMPRESSED)
		fatalx("%s: compressed clusters unsupported", __func__);
	pthread_rwlock_unlock(&disk->lock);
	clusteroff = 0;
	cluster &= ~QCOW2_INPLACE;
	if (cluster)
		clusteroff = off % disk->clustersz;
	return cluster + clusteroff;
err:
	pthread_rwlock_unlock(&disk->lock);
	return -1;
}

/*
 * Allocates a new cluster on disk, creating a new L2 table
 * if needed. The cluster starts off with a refcount of one,
 * and the writable bit set.
 *
 * Returns -1 on error, or the physical address corresponding
 * to the requested write offset.
 */
static off_t
mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
{
	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
	uint64_t buf;

	pthread_rwlock_wrlock(&disk->lock);

	cluster = -1;
	/* L1 entries always exist */
	l2sz = disk->clustersz / 8;
	l1off = off / (disk->clustersz * l2sz);
	if (l1off >= disk->l1sz)
		fatalx("l1 offset outside disk");

	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);

	l2tab = disk->l1[l1off];
	l2off = (off / disk->clustersz) % l2sz;
	/* We may need to create or clone an L2 entry to map the block */
	if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
		orig = l2tab & ~QCOW2_INPLACE;
		l2tab = disk->end;
		disk->end += disk->clustersz;
		if (ftruncate(disk->fd, disk->end) == -1)
			fatal("%s: ftruncate failed", __func__);

		/*
		 * If we translated, found an L2 entry, but it needed to
		 * be copied, copy it.
		 */
		if (orig != 0)
			copy_cluster(disk, disk, l2tab, orig);
		/* Update l1 -- we flush it later */
		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
		inc_refs(disk, l2tab, 1);
	}
	l2tab &= ~QCOW2_INPLACE;

	/* Grow the disk */
	if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
		fatal("%s: could not grow disk", __func__);
	if (src_phys > 0)
		copy_cluster(disk, base, disk->end, src_phys);
	cluster = disk->end;
	disk->end += disk->clustersz;
	buf = htobe64(cluster | QCOW2_INPLACE);
	if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
		fatalx("%s: could not write cluster", __func__);

	/* TODO: lazily sync: currently VMD doesn't close things */
	buf = htobe64(disk->l1[l1off]);
	if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
		fatalx("%s: could not write l1", __func__);
	inc_refs(disk, cluster, 1);

	pthread_rwlock_unlock(&disk->lock);
	clusteroff = off % disk->clustersz;
	if (cluster + clusteroff < disk->clustersz)
		fatalx("write would clobber header");
	return cluster + clusteroff;
}

/*
 * Copies a cluster containing src to dst. Src and dst need not be aligned.
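 * Both offsets are rounded down to their containing cluster boundary, so
 * the whole enclosing cluster is duplicated: e.g. with a 64 KiB cluster
 * size, a src of 0x12345 reads from 0x10000 (the values here are
 * illustrative only).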
 */
static void
copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
{
	char *scratch;

	scratch = malloc(disk->clustersz);
	if (!scratch)
		fatal("out of memory");
	src &= ~(disk->clustersz - 1);
	dst &= ~(disk->clustersz - 1);
	if (pread(base->fd, scratch, disk->clustersz, src) == -1)
		fatal("%s: could not read cluster", __func__);
	if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
		fatal("%s: could not write cluster", __func__);
	free(scratch);
}

static void
inc_refs(struct qcdisk *disk, off_t off, int newcluster)
{
	off_t l1off, l1idx, l2idx, l2cluster;
	size_t nper;
	uint16_t refs;
	uint64_t buf;

	off &= ~QCOW2_INPLACE;
	nper = disk->clustersz / 2;
	l1idx = (off / disk->clustersz) / nper;
	l2idx = (off / disk->clustersz) % nper;
	l1off = disk->refoff + 8 * l1idx;
	if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
		fatal("could not read refs");

	l2cluster = be64toh(buf);
	if (l2cluster == 0) {
		l2cluster = disk->end;
		disk->end += disk->clustersz;
		if (ftruncate(disk->fd, disk->end) < 0)
			fatal("%s: failed to allocate ref block", __func__);
		buf = htobe64(l2cluster);
		if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8)
			fatal("%s: failed to write ref block", __func__);
	}

	refs = 1;
	if (!newcluster) {
		if (pread(disk->fd, &refs, sizeof(refs),
		    l2cluster + 2 * l2idx) != 2)
			fatal("could not read ref cluster");
		refs = be16toh(refs) + 1;
	}
	refs = htobe16(refs);
	if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2)
		fatal("%s: could not write ref block", __func__);
}

/*
 * virtio_qcow2_create
 *
 * Create an empty qcow2 image file with the specified path and size.
 *
 * Parameters:
 *  imgfile_path: path to the image file to create
 *  base_path   : path to the base (backing) image, or NULL for none
 *  disksz      : size of the image file to create (in bytes)
 *
 * Return:
 *  EEXIST: The requested image file already exists
 *  0     : Image file successfully created
 *  Exxxx : Various other Exxxx errno codes due to other I/O errors
 */
int
virtio_qcow2_create(const char *imgfile_path,
    const char *base_path, uint64_t disksz)
{
	struct qcheader hdr, basehdr;
	int fd, ret;
	ssize_t base_len;
	uint64_t l1sz, refsz, initsz, clustersz;
	uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
	uint16_t refs;

	if (base_path) {
		fd = open(base_path, O_RDONLY);
		if (fd == -1)
			err(1, "unable to open base image %s", base_path);
		if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
			errx(1, "failure to read base image header");
		close(fd);
		if (strncmp(basehdr.magic,
		    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
			errx(1, "base image is not a qcow2 file");
		if (!disksz)
			disksz = betoh64(basehdr.disksz);
		else if (disksz != betoh64(basehdr.disksz))
			errx(1, "base size does not match requested size");
	}
	if (!base_path && !disksz)
		errx(1, "missing disk size");

	clustersz = (1<<16);
	l1off = ALIGNSZ(sizeof(hdr), clustersz);

	l1entrysz = clustersz * clustersz / 8;
	l1sz = (disksz + l1entrysz - 1) / l1entrysz;

	refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
	refentrysz = clustersz * clustersz * clustersz / 2;
	refsz = (disksz + refentrysz - 1) / refentrysz;

	initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
	base_len = base_path ?
	    strlen(base_path) : 0;

	memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
	hdr.version = htobe32(3);
	hdr.backingoff = htobe64(base_path ? sizeof(hdr) : 0);
	hdr.backingsz = htobe32(base_len);
	hdr.clustershift = htobe32(16);
	hdr.disksz = htobe64(disksz);
	hdr.cryptmethod = htobe32(0);
	hdr.l1sz = htobe32(l1sz);
	hdr.l1off = htobe64(l1off);
	hdr.refoff = htobe64(refoff);
	hdr.refsz = htobe32(refsz);
	hdr.snapcount = htobe32(0);
	hdr.snapsz = htobe64(0);
	hdr.incompatfeatures = htobe64(0);
	hdr.compatfeatures = htobe64(0);
	hdr.autoclearfeatures = htobe64(0);
	hdr.reforder = htobe32(4);
	hdr.headersz = htobe32(sizeof(hdr));

	/* Refuse to overwrite an existing image */
	fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
	    S_IRUSR | S_IWUSR);
	if (fd == -1)
		return (errno);

	/* Write out the header */
	if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		goto error;

	/* Add the base image */
	if (base_path && write(fd, base_path, base_len) != base_len)
		goto error;

	/* Extend to desired size, and add one refcount cluster */
	if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
		goto error;

	/*
	 * Paranoia: if our disk image takes more than one cluster
	 * to refcount the initial image, fail.
	 */
	if (initsz/clustersz > clustersz/2) {
		errno = ERANGE;
		goto error;
	}

	/* Add a refcount block, and refcount ourselves. */
	v = htobe64(initsz);
	if (pwrite(fd, &v, 8, refoff) != 8)
		goto error;
	for (i = 0; i < initsz/clustersz + 1; i++) {
		refs = htobe16(1);
		if (pwrite(fd, &refs, 2, initsz + 2*i) != 2)
			goto error;
	}

	ret = close(fd);
	return (ret);
error:
	ret = errno;
	close(fd);
	unlink(imgfile_path);
	return (ret);
}
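
/*
 * A worked example of the layout arithmetic in virtio_qcow2_create,
 * assuming a 4 GiB image with no base path. The values follow from the
 * fixed 64 KiB cluster size and are illustrative only, not read back
 * from an image:
 *
 *	clustersz = 65536
 *	l1off     = 0x10000	ALIGNSZ(sizeof(hdr), clustersz)
 *	l1entrysz = 512 MiB	clustersz * clustersz / 8
 *	l1sz      = 8		ceil(4 GiB / 512 MiB)
 *	refoff    = 0x20000	ALIGNSZ(l1off + 8 * l1sz, clustersz)
 *	refsz     = 1
 *	initsz    = 0x30000	ALIGNSZ(refoff + refsz * clustersz, clustersz)
 *
 * The file is then truncated to initsz + clustersz = 0x40000, the
 * refcount table entry at refoff points at the refcount block at
 * initsz, and clusters 0-3 (header, L1 table, refcount table, refcount
 * block) each get a refcount of 1.
 */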