1 /* $OpenBSD: vioqcow2.c,v 1.16 2021/06/16 16:55:02 dv Exp $ */ 2 3 /* 4 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 #include <sys/stat.h> 21 22 #include <dev/pci/pcireg.h> 23 #include <machine/vmmvar.h> 24 25 #include <assert.h> 26 #include <err.h> 27 #include <errno.h> 28 #include <fcntl.h> 29 #include <libgen.h> 30 #include <stdlib.h> 31 #include <string.h> 32 #include <unistd.h> 33 34 #include "virtio.h" 35 36 #define QCOW2_COMPRESSED 0x4000000000000000ull 37 #define QCOW2_INPLACE 0x8000000000000000ull 38 39 #define QCOW2_DIRTY (1 << 0) 40 #define QCOW2_CORRUPT (1 << 1) 41 42 enum { 43 ICFEATURE_DIRTY = 1 << 0, 44 ICFEATURE_CORRUPT = 1 << 1, 45 }; 46 47 enum { 48 ACFEATURE_BITEXT = 1 << 0, 49 }; 50 51 struct qcheader { 52 char magic[4]; 53 uint32_t version; 54 uint64_t backingoff; 55 uint32_t backingsz; 56 uint32_t clustershift; 57 uint64_t disksz; 58 uint32_t cryptmethod; 59 uint32_t l1sz; 60 uint64_t l1off; 61 uint64_t refoff; 62 uint32_t refsz; 63 uint32_t snapcount; 64 uint64_t snapsz; 65 /* v3 additions */ 66 uint64_t incompatfeatures; 67 uint64_t compatfeatures; 68 uint64_t autoclearfeatures; 69 uint32_t reforder; /* Bits = 1 << reforder */ 70 uint32_t headersz; 71 } __packed; 72 73 struct qcdisk { 74 pthread_rwlock_t lock; 75 struct qcdisk *base; 76 struct qcheader header; 77 78 int fd; 79 uint64_t *l1; 80 off_t end; 81 off_t clustersz; 82 off_t disksz; /* In bytes */ 83 uint32_t cryptmethod; 84 85 uint32_t l1sz; 86 off_t l1off; 87 88 off_t refoff; 89 off_t refsz; 90 91 uint32_t nsnap; 92 off_t snapoff; 93 94 /* v3 features */ 95 uint64_t incompatfeatures; 96 uint64_t autoclearfeatures; 97 uint32_t refssz; 98 uint32_t headersz; 99 }; 100 101 extern char *__progname; 102 103 static off_t xlate(struct qcdisk *, off_t, int *); 104 static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t); 105 static void inc_refs(struct qcdisk *, off_t, int); 106 static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t); 107 static int qc2_open(struct qcdisk *, int *, size_t); 108 static ssize_t qc2_pread(void *, char *, size_t, off_t); 109 static ssize_t qc2_pwrite(void *, char *, size_t, off_t); 110 static void qc2_close(void *, int); 111 112 /* 113 * Initializes a raw disk image backing file from an fd. 114 * Stores the number of 512 byte sectors in *szp, 115 * returning -1 for error, 0 for success. 116 * 117 * May open snapshot base images. 118 */ 119 int 120 virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd) 121 { 122 struct qcdisk *diskp; 123 124 diskp = malloc(sizeof(struct qcdisk)); 125 if (diskp == NULL) 126 return -1; 127 if (qc2_open(diskp, fd, nfd) == -1) { 128 log_warnx("could not open qcow2 disk"); 129 return -1; 130 } 131 file->p = diskp; 132 file->pread = qc2_pread; 133 file->pwrite = qc2_pwrite; 134 file->close = qc2_close; 135 *szp = diskp->disksz; 136 return 0; 137 } 138 139 /* 140 * Return the path to the base image given a disk image. 141 * Called from vmctl. 142 */ 143 ssize_t 144 virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath) 145 { 146 char dpathbuf[PATH_MAX]; 147 char expanded[PATH_MAX]; 148 struct qcheader header; 149 uint64_t backingoff; 150 uint32_t backingsz; 151 char *s = NULL; 152 153 if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) { 154 log_warnx("short read on header"); 155 return -1; 156 } 157 if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) { 158 log_warnx("invalid magic numbers"); 159 return -1; 160 } 161 backingoff = be64toh(header.backingoff); 162 backingsz = be32toh(header.backingsz); 163 if (backingsz == 0) 164 return 0; 165 166 if (backingsz >= npath - 1) { 167 log_warnx("snapshot path too long"); 168 return -1; 169 } 170 if (pread(fd, path, backingsz, backingoff) != backingsz) { 171 log_warnx("could not read snapshot base name"); 172 return -1; 173 } 174 path[backingsz] = '\0'; 175 176 /* 177 * Relative paths should be interpreted relative to the disk image, 178 * rather than relative to the directory vmd happens to be running in, 179 * since this is the only userful interpretation. 180 */ 181 if (path[0] == '/') { 182 if (realpath(path, expanded) == NULL || 183 strlcpy(path, expanded, npath) >= npath) { 184 log_warnx("unable to resolve %s", path); 185 return -1; 186 } 187 } else { 188 if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >= 189 sizeof(dpathbuf)) { 190 log_warnx("path too long: %s", dpath); 191 return -1; 192 } 193 s = dirname(dpathbuf); 194 if (snprintf(expanded, sizeof(expanded), 195 "%s/%s", s, path) >= (int)sizeof(expanded)) { 196 log_warnx("path too long: %s/%s", s, path); 197 return -1; 198 } 199 if (npath < PATH_MAX || 200 realpath(expanded, path) == NULL) { 201 log_warnx("unable to resolve %s", path); 202 return -1; 203 } 204 } 205 206 return strlen(path); 207 } 208 209 static int 210 qc2_open(struct qcdisk *disk, int *fds, size_t nfd) 211 { 212 char basepath[PATH_MAX]; 213 struct stat st; 214 struct qcheader header; 215 uint64_t backingoff; 216 uint32_t backingsz; 217 off_t i; 218 int version, fd; 219 220 pthread_rwlock_init(&disk->lock, NULL); 221 fd = fds[0]; 222 disk->fd = fd; 223 disk->base = NULL; 224 disk->l1 = NULL; 225 226 if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) 227 fatalx("short read on header"); 228 if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) 229 fatalx("invalid magic numbers"); 230 231 disk->clustersz = (1ull << be32toh(header.clustershift)); 232 disk->disksz = be64toh(header.disksz); 233 disk->cryptmethod = be32toh(header.cryptmethod); 234 disk->l1sz = be32toh(header.l1sz); 235 disk->l1off = be64toh(header.l1off); 236 disk->refsz = be32toh(header.refsz); 237 disk->refoff = be64toh(header.refoff); 238 disk->nsnap = be32toh(header.snapcount); 239 disk->snapoff = be64toh(header.snapsz); 240 241 /* 242 * The additional features here are defined as 0 in the v2 format, 243 * so as long as we clear the buffer before parsing, we don't need 244 * to check versions here. 245 */ 246 disk->incompatfeatures = be64toh(header.incompatfeatures); 247 disk->autoclearfeatures = be64toh(header.autoclearfeatures); 248 disk->refssz = be32toh(header.refsz); 249 disk->headersz = be32toh(header.headersz); 250 251 /* 252 * We only know about the dirty or corrupt bits here. 253 */ 254 if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)) 255 fatalx("unsupported features %llx", 256 disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT)); 257 if (be32toh(header.reforder) != 4) 258 fatalx("unsupported refcount size\n"); 259 260 disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1)); 261 if (!disk->l1) 262 fatal("%s: could not allocate l1 table", __func__); 263 if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off) 264 != 8 * disk->l1sz) 265 fatalx("%s: unable to read qcow2 L1 table", __func__); 266 for (i = 0; i < disk->l1sz; i++) 267 disk->l1[i] = be64toh(disk->l1[i]); 268 version = be32toh(header.version); 269 if (version != 2 && version != 3) 270 fatalx("%s: unknown qcow2 version %d", __func__, version); 271 272 backingoff = be64toh(header.backingoff); 273 backingsz = be32toh(header.backingsz); 274 if (backingsz != 0) { 275 if (backingsz >= sizeof(basepath) - 1) { 276 fatalx("%s: snapshot path too long", __func__); 277 } 278 if (pread(fd, basepath, backingsz, backingoff) != backingsz) { 279 fatalx("%s: could not read snapshot base name", 280 __func__); 281 } 282 basepath[backingsz] = 0; 283 if (nfd <= 1) { 284 fatalx("%s: missing base image %s", __func__, 285 basepath); 286 } 287 288 289 disk->base = calloc(1, sizeof(struct qcdisk)); 290 if (!disk->base) 291 fatal("%s: could not open %s", __func__, basepath); 292 if (qc2_open(disk->base, fds + 1, nfd - 1) == -1) 293 fatalx("%s: could not open %s", __func__, basepath); 294 if (disk->base->clustersz != disk->clustersz) 295 fatalx("%s: all disk parts must share clustersize", 296 __func__); 297 } 298 if (fstat(fd, &st) == -1) 299 fatal("%s: unable to stat disk", __func__); 300 301 disk->end = st.st_size; 302 303 log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d", 304 __func__, version, disk->disksz, disk->end, disk->nsnap); 305 306 return 0; 307 } 308 309 static ssize_t 310 qc2_pread(void *p, char *buf, size_t len, off_t off) 311 { 312 struct qcdisk *disk, *d; 313 off_t phys_off, end, cluster_off; 314 ssize_t sz, rem; 315 316 disk = p; 317 end = off + len; 318 if (off < 0 || end > disk->disksz) 319 return -1; 320 321 /* handle head chunk separately */ 322 rem = len; 323 while (off != end) { 324 for (d = disk; d; d = d->base) 325 if ((phys_off = xlate(d, off, NULL)) > 0) 326 break; 327 /* Break out into chunks. This handles 328 * three cases: 329 * 330 * |----+====|========|====+-----| 331 * 332 * Either we are at the start of the read, 333 * and the cluster has some leading bytes. 334 * This means that we are reading the tail 335 * of the cluster, and our size is: 336 * 337 * clustersz - (off % clustersz). 338 * 339 * Otherwise, we're reading the middle section. 340 * We're already aligned here, so we can just 341 * read the whole cluster size. Or we're at the 342 * tail, at which point we just want to read the 343 * remaining bytes. 344 */ 345 cluster_off = off % disk->clustersz; 346 sz = disk->clustersz - cluster_off; 347 if (sz > rem) 348 sz = rem; 349 /* 350 * If we're within the disk, but don't have backing bytes, 351 * just read back zeros. 352 */ 353 if (!d) 354 bzero(buf, sz); 355 else if (pread(d->fd, buf, sz, phys_off) != sz) 356 return -1; 357 off += sz; 358 buf += sz; 359 rem -= sz; 360 } 361 return len; 362 } 363 364 ssize_t 365 qc2_pwrite(void *p, char *buf, size_t len, off_t off) 366 { 367 struct qcdisk *disk, *d; 368 off_t phys_off, cluster_off, end; 369 ssize_t sz, rem; 370 int inplace; 371 372 d = p; 373 disk = p; 374 inplace = 1; 375 end = off + len; 376 if (off < 0 || end > disk->disksz) 377 return -1; 378 rem = len; 379 while (off != end) { 380 /* See the read code for a summary of the computation */ 381 cluster_off = off % disk->clustersz; 382 sz = disk->clustersz - cluster_off; 383 if (sz > rem) 384 sz = rem; 385 386 phys_off = xlate(disk, off, &inplace); 387 if (phys_off == -1) 388 return -1; 389 /* 390 * If we couldn't find the cluster in the writable disk, 391 * see if it exists in the base image. If it does, we 392 * need to copy it before the write. The copy happens 393 * in the '!inplace' if clause below te search. 394 */ 395 if (phys_off == 0) 396 for (d = disk->base; d; d = d->base) 397 if ((phys_off = xlate(d, off, NULL)) > 0) 398 break; 399 if (!inplace || phys_off == 0) 400 phys_off = mkcluster(disk, d, off, phys_off); 401 if (phys_off == -1) 402 return -1; 403 if (phys_off < disk->clustersz) 404 fatalx("%s: writing reserved cluster", __func__); 405 if (pwrite(disk->fd, buf, sz, phys_off) != sz) 406 return -1; 407 off += sz; 408 buf += sz; 409 rem -= sz; 410 } 411 return len; 412 } 413 414 static void 415 qc2_close(void *p, int stayopen) 416 { 417 struct qcdisk *disk; 418 419 disk = p; 420 if (disk->base) 421 qc2_close(disk->base, stayopen); 422 if (!stayopen) 423 close(disk->fd); 424 free(disk->l1); 425 free(disk); 426 } 427 428 /* 429 * Translates a virtual offset into an on-disk offset. 430 * Returns: 431 * -1 on error 432 * 0 on 'not found' 433 * >0 on found 434 */ 435 static off_t 436 xlate(struct qcdisk *disk, off_t off, int *inplace) 437 { 438 off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff; 439 uint64_t buf; 440 441 442 /* 443 * Clear out inplace flag -- xlate misses should not 444 * be flagged as updatable in place. We will still 445 * return 0 from them, but this leaves less surprises 446 * in the API. 447 */ 448 if (inplace) 449 *inplace = 0; 450 pthread_rwlock_rdlock(&disk->lock); 451 if (off < 0) 452 goto err; 453 454 l2sz = disk->clustersz / 8; 455 l1off = (off / disk->clustersz) / l2sz; 456 if (l1off >= disk->l1sz) 457 goto err; 458 459 l2tab = disk->l1[l1off]; 460 l2tab &= ~QCOW2_INPLACE; 461 if (l2tab == 0) { 462 pthread_rwlock_unlock(&disk->lock); 463 return 0; 464 } 465 l2off = (off / disk->clustersz) % l2sz; 466 pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8); 467 cluster = be64toh(buf); 468 /* 469 * cluster may be 0, but all future operations don't affect 470 * the return value. 471 */ 472 if (inplace) 473 *inplace = !!(cluster & QCOW2_INPLACE); 474 if (cluster & QCOW2_COMPRESSED) 475 fatalx("%s: compressed clusters unsupported", __func__); 476 pthread_rwlock_unlock(&disk->lock); 477 clusteroff = 0; 478 cluster &= ~QCOW2_INPLACE; 479 if (cluster) 480 clusteroff = off % disk->clustersz; 481 return cluster + clusteroff; 482 err: 483 pthread_rwlock_unlock(&disk->lock); 484 return -1; 485 } 486 487 /* 488 * Allocates a new cluster on disk, creating a new L2 table 489 * if needed. The cluster starts off with a refs of one, 490 * and the writable bit set. 491 * 492 * Returns -1 on error, and the physical address within the 493 * cluster of the write offset if it exists. 494 */ 495 static off_t 496 mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys) 497 { 498 off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig; 499 uint64_t buf; 500 int fd; 501 502 pthread_rwlock_wrlock(&disk->lock); 503 504 cluster = -1; 505 fd = disk->fd; 506 /* L1 entries always exist */ 507 l2sz = disk->clustersz / 8; 508 l1off = off / (disk->clustersz * l2sz); 509 if (l1off >= disk->l1sz) 510 fatalx("l1 offset outside disk"); 511 512 disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1); 513 514 l2tab = disk->l1[l1off]; 515 l2off = (off / disk->clustersz) % l2sz; 516 /* We may need to create or clone an L2 entry to map the block */ 517 if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) { 518 orig = l2tab & ~QCOW2_INPLACE; 519 l2tab = disk->end; 520 disk->end += disk->clustersz; 521 if (ftruncate(disk->fd, disk->end) == -1) 522 fatal("%s: ftruncate failed", __func__); 523 524 /* 525 * If we translated, found a L2 entry, but it needed to 526 * be copied, copy it. 527 */ 528 if (orig != 0) 529 copy_cluster(disk, disk, l2tab, orig); 530 /* Update l1 -- we flush it later */ 531 disk->l1[l1off] = l2tab | QCOW2_INPLACE; 532 inc_refs(disk, l2tab, 1); 533 } 534 l2tab &= ~QCOW2_INPLACE; 535 536 /* Grow the disk */ 537 if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0) 538 fatalx("%s: could not grow disk", __func__); 539 if (src_phys > 0) 540 copy_cluster(disk, base, disk->end, src_phys); 541 cluster = disk->end; 542 disk->end += disk->clustersz; 543 buf = htobe64(cluster | QCOW2_INPLACE); 544 if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8) 545 fatalx("%s: could not write cluster", __func__); 546 547 /* TODO: lazily sync: currently VMD doesn't close things */ 548 buf = htobe64(disk->l1[l1off]); 549 if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8) 550 fatalx("%s: could not write l1", __func__); 551 inc_refs(disk, cluster, 1); 552 553 pthread_rwlock_unlock(&disk->lock); 554 clusteroff = off % disk->clustersz; 555 if (cluster + clusteroff < disk->clustersz) 556 fatalx("write would clobber header"); 557 return cluster + clusteroff; 558 } 559 560 /* Copies a cluster containing src to dst. Src and dst need not be aligned. */ 561 static void 562 copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src) 563 { 564 char *scratch; 565 566 scratch = malloc(disk->clustersz); 567 if (!scratch) 568 fatal("out of memory"); 569 src &= ~(disk->clustersz - 1); 570 dst &= ~(disk->clustersz - 1); 571 if (pread(base->fd, scratch, disk->clustersz, src) == -1) 572 fatal("%s: could not read cluster", __func__); 573 if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1) 574 fatal("%s: could not write cluster", __func__); 575 free(scratch); 576 } 577 578 static void 579 inc_refs(struct qcdisk *disk, off_t off, int newcluster) 580 { 581 off_t l1off, l1idx, l2idx, l2cluster; 582 size_t nper; 583 uint16_t refs; 584 uint64_t buf; 585 586 off &= ~QCOW2_INPLACE; 587 nper = disk->clustersz / 2; 588 l1idx = (off / disk->clustersz) / nper; 589 l2idx = (off / disk->clustersz) % nper; 590 l1off = disk->refoff + 8 * l1idx; 591 if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8) 592 fatal("could not read refs"); 593 594 l2cluster = be64toh(buf); 595 if (l2cluster == 0) { 596 l2cluster = disk->end; 597 disk->end += disk->clustersz; 598 if (ftruncate(disk->fd, disk->end) < 0) 599 fatal("%s: failed to allocate ref block", __func__); 600 buf = htobe64(l2cluster); 601 if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8) 602 fatal("%s: failed to write ref block", __func__); 603 } 604 605 refs = 1; 606 if (!newcluster) { 607 if (pread(disk->fd, &refs, sizeof(refs), 608 l2cluster + 2 * l2idx) != 2) 609 fatal("could not read ref cluster"); 610 refs = be16toh(refs) + 1; 611 } 612 refs = htobe16(refs); 613 if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2) 614 fatal("%s: could not write ref block", __func__); 615 } 616 617 /* 618 * virtio_qcow2_create 619 * 620 * Create an empty qcow2 imagefile with the specified path and size. 621 * 622 * Parameters: 623 * imgfile_path: path to the image file to create 624 * imgsize : size of the image file to create (in MB) 625 * 626 * Return: 627 * EEXIST: The requested image file already exists 628 * 0 : Image file successfully created 629 * Exxxx : Various other Exxxx errno codes due to other I/O errors 630 */ 631 int 632 virtio_qcow2_create(const char *imgfile_path, 633 const char *base_path, long imgsize) 634 { 635 struct qcheader hdr, basehdr; 636 int fd, ret; 637 ssize_t base_len; 638 uint64_t l1sz, refsz, disksz, initsz, clustersz; 639 uint64_t l1off, refoff, v, i, l1entrysz, refentrysz; 640 uint16_t refs; 641 642 disksz = 1024 * 1024 * imgsize; 643 644 if (base_path) { 645 fd = open(base_path, O_RDONLY); 646 if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr)) 647 err(1, "failure to read base image header"); 648 close(fd); 649 if (strncmp(basehdr.magic, 650 VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) 651 errx(1, "base image is not a qcow2 file"); 652 if (!disksz) 653 disksz = betoh64(basehdr.disksz); 654 else if (disksz != betoh64(basehdr.disksz)) 655 errx(1, "base size does not match requested size"); 656 } 657 if (!base_path && !disksz) 658 errx(1, "missing disk size"); 659 660 clustersz = (1<<16); 661 l1off = ALIGNSZ(sizeof(hdr), clustersz); 662 663 l1entrysz = clustersz * clustersz / 8; 664 l1sz = (disksz + l1entrysz - 1) / l1entrysz; 665 666 refoff = ALIGNSZ(l1off + 8*l1sz, clustersz); 667 refentrysz = clustersz * clustersz * clustersz / 2; 668 refsz = (disksz + refentrysz - 1) / refentrysz; 669 670 initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz); 671 base_len = base_path ? strlen(base_path) : 0; 672 673 memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)); 674 hdr.version = htobe32(3); 675 hdr.backingoff = htobe64(base_path ? sizeof(hdr) : 0); 676 hdr.backingsz = htobe32(base_len); 677 hdr.clustershift = htobe32(16); 678 hdr.disksz = htobe64(disksz); 679 hdr.cryptmethod = htobe32(0); 680 hdr.l1sz = htobe32(l1sz); 681 hdr.l1off = htobe64(l1off); 682 hdr.refoff = htobe64(refoff); 683 hdr.refsz = htobe32(refsz); 684 hdr.snapcount = htobe32(0); 685 hdr.snapsz = htobe64(0); 686 hdr.incompatfeatures = htobe64(0); 687 hdr.compatfeatures = htobe64(0); 688 hdr.autoclearfeatures = htobe64(0); 689 hdr.reforder = htobe32(4); 690 hdr.headersz = htobe32(sizeof(hdr)); 691 692 /* Refuse to overwrite an existing image */ 693 fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL, 694 S_IRUSR | S_IWUSR); 695 if (fd == -1) 696 return (errno); 697 698 /* Write out the header */ 699 if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) 700 goto error; 701 702 /* Add the base image */ 703 if (base_path && write(fd, base_path, base_len) != base_len) 704 goto error; 705 706 /* Extend to desired size, and add one refcount cluster */ 707 if (ftruncate(fd, (off_t)initsz + clustersz) == -1) 708 goto error; 709 710 /* 711 * Paranoia: if our disk image takes more than one cluster 712 * to refcount the initial image, fail. 713 */ 714 if (initsz/clustersz > clustersz/2) { 715 errno = ERANGE; 716 goto error; 717 } 718 719 /* Add a refcount block, and refcount ourselves. */ 720 v = htobe64(initsz); 721 if (pwrite(fd, &v, 8, refoff) != 8) 722 goto error; 723 for (i = 0; i < initsz/clustersz + 1; i++) { 724 refs = htobe16(1); 725 if (pwrite(fd, &refs, 2, initsz + 2*i) != 2) 726 goto error; 727 } 728 729 ret = close(fd); 730 return (ret); 731 error: 732 ret = errno; 733 close(fd); 734 unlink(imgfile_path); 735 return (errno); 736 } 737