1 /* $OpenBSD: vioqcow2.c,v 1.25 2024/09/26 01:45:13 jsg Exp $ */
2
3 /*
4 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18
19 #include <sys/types.h>
20 #include <sys/stat.h>
21
22 #include <err.h>
23 #include <errno.h>
24 #include <fcntl.h>
25 #include <libgen.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <unistd.h>
29
30 #include "virtio.h"
31
/*
 * Flag bits kept in the two most significant bits of table entries.
 * xlate() tests both on L2 entries and strips QCOW2_INPLACE from L1
 * entries as well.
 */
#define QCOW2_COMPRESSED	(1ull << 62)
#define QCOW2_INPLACE		(1ull << 63)

/*
 * The only incompatible-feature bits qc2_open() accepts; any other
 * bit set in the header makes the open fail.
 */
#define QCOW2_DIRTY		0x1
#define QCOW2_CORRUPT		0x2

/* Same bit values as QCOW2_DIRTY/QCOW2_CORRUPT, in enum form. */
enum {
	ICFEATURE_DIRTY		= 0x1,
	ICFEATURE_CORRUPT	= 0x2,
};

/* NOTE(review): presumably an autoclear-feature bit -- confirm. */
enum {
	ACFEATURE_BITEXT	= 0x1,
};
46
/*
 * On-disk image header.  The multi-byte fields are stored big-endian;
 * every reader in this file converts them with be32toh()/be64toh().
 * The field order is the file layout and must not be changed.
 */
struct qcheader {
	char		magic[4];	/* compared against VM_MAGIC_QCOW */
	uint32_t	version;	/* qc2_open() accepts 2 and 3 */
	uint64_t	backingoff;	/* file offset of the base image path */
	uint32_t	backingsz;	/* length of that path; 0 = no base */
	uint32_t	clustershift;	/* cluster size is 1 << clustershift */
	uint64_t	disksz;		/* virtual disk size in bytes */
	uint32_t	cryptmethod;
	uint32_t	l1sz;		/* number of 8-byte L1 entries */
	uint64_t	l1off;		/* file offset of the L1 table */
	uint64_t	refoff;		/* file offset of the refcount table */
	uint32_t	refsz;		/* refcount table size, in clusters */
	uint32_t	snapcount;
	uint64_t	snapsz;		/* loaded into qcdisk.snapoff */
	/* v3 additions */
	uint64_t	incompatfeatures;
	uint64_t	compatfeatures;
	uint64_t	autoclearfeatures;
	uint32_t	reforder;	/* Bits = 1 << reforder */
	uint32_t	headersz;
} __packed;
68
69 struct qcdisk {
70 pthread_rwlock_t lock;
71 struct qcdisk *base;
72 struct qcheader header;
73
74 int fd;
75 uint64_t *l1;
76 off_t end;
77 off_t clustersz;
78 off_t disksz; /* In bytes */
79 uint32_t cryptmethod;
80
81 uint32_t l1sz;
82 off_t l1off;
83
84 off_t refoff;
85 off_t refsz;
86
87 uint32_t nsnap;
88 off_t snapoff;
89
90 /* v3 features */
91 uint64_t incompatfeatures;
92 uint64_t autoclearfeatures;
93 uint32_t refssz;
94 uint32_t headersz;
95 };
96
extern char *__progname;

/* Address translation and cluster allocation. */
static off_t	xlate(struct qcdisk *disk, off_t off, int *inplace);
static void	copy_cluster(struct qcdisk *disk, struct qcdisk *base,
		    off_t dst, off_t src);
static void	inc_refs(struct qcdisk *disk, off_t off, int newcluster);
static off_t	mkcluster(struct qcdisk *disk, struct qcdisk *base,
		    off_t off, off_t src_phys);

/* Backend entry points installed by virtio_qcow2_init(). */
static int	qc2_open(struct qcdisk *disk, int *fds, size_t nfd);
static ssize_t	qc2_pread(void *p, char *buf, size_t len, off_t off);
static ssize_t	qc2_preadv(void *p, struct iovec *iov, int cnt, off_t offset);
static ssize_t	qc2_pwrite(void *p, char *buf, size_t len, off_t off);
static ssize_t	qc2_pwritev(void *p, struct iovec *iov, int cnt, off_t offset);
static void	qc2_close(void *p, int stayopen);
109
110 /*
111 * Initializes a raw disk image backing file from an fd. Stores the
112 * number of bytes in *szp, returning -1 for error, 0 for success.
113 *
114 * May open snapshot base images.
115 */
116 int
virtio_qcow2_init(struct virtio_backing * file,off_t * szp,int * fd,size_t nfd)117 virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
118 {
119 struct qcdisk *diskp;
120
121 diskp = malloc(sizeof(struct qcdisk));
122 if (diskp == NULL)
123 return -1;
124 if (qc2_open(diskp, fd, nfd) == -1) {
125 log_warnx("could not open qcow2 disk");
126 return -1;
127 }
128 file->p = diskp;
129 file->pread = qc2_pread;
130 file->preadv = qc2_preadv;
131 file->pwrite = qc2_pwrite;
132 file->pwritev = qc2_pwritev;
133 file->close = qc2_close;
134 *szp = diskp->disksz;
135 return 0;
136 }
137
138 /*
139 * Return the path to the base image given a disk image.
140 * Called from vmctl.
141 */
142 ssize_t
virtio_qcow2_get_base(int fd,char * path,size_t npath,const char * dpath)143 virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
144 {
145 char dpathbuf[PATH_MAX];
146 char expanded[PATH_MAX];
147 struct qcheader header;
148 uint64_t backingoff;
149 uint32_t backingsz;
150 char *s = NULL;
151
152 if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
153 log_warnx("short read on header");
154 return -1;
155 }
156 if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
157 log_warnx("invalid magic numbers");
158 return -1;
159 }
160 backingoff = be64toh(header.backingoff);
161 backingsz = be32toh(header.backingsz);
162 if (backingsz == 0)
163 return 0;
164
165 if (backingsz >= npath - 1) {
166 log_warnx("snapshot path too long");
167 return -1;
168 }
169 if (pread(fd, path, backingsz, backingoff) != backingsz) {
170 log_warnx("could not read snapshot base name");
171 return -1;
172 }
173 path[backingsz] = '\0';
174
175 /*
176 * Relative paths should be interpreted relative to the disk image,
177 * rather than relative to the directory vmd happens to be running in,
178 * since this is the only useful interpretation.
179 */
180 if (path[0] == '/') {
181 if (realpath(path, expanded) == NULL ||
182 strlcpy(path, expanded, npath) >= npath) {
183 log_warnx("unable to resolve %s", path);
184 return -1;
185 }
186 } else {
187 if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
188 sizeof(dpathbuf)) {
189 log_warnx("path too long: %s", dpath);
190 return -1;
191 }
192 s = dirname(dpathbuf);
193 if (snprintf(expanded, sizeof(expanded),
194 "%s/%s", s, path) >= (int)sizeof(expanded)) {
195 log_warnx("path too long: %s/%s", s, path);
196 return -1;
197 }
198 if (npath < PATH_MAX ||
199 realpath(expanded, path) == NULL) {
200 log_warnx("unable to resolve %s", path);
201 return -1;
202 }
203 }
204
205 return strlen(path);
206 }
207
208 static int
qc2_open(struct qcdisk * disk,int * fds,size_t nfd)209 qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
210 {
211 char basepath[PATH_MAX];
212 struct stat st;
213 struct qcheader header;
214 uint64_t backingoff;
215 uint32_t backingsz;
216 off_t i;
217 int version, fd;
218
219 pthread_rwlock_init(&disk->lock, NULL);
220 fd = fds[0];
221 disk->fd = fd;
222 disk->base = NULL;
223 disk->l1 = NULL;
224
225 if (pread(fd, &header, sizeof(header), 0) != sizeof(header))
226 fatalx("short read on header");
227 if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
228 fatalx("invalid magic numbers");
229
230 disk->clustersz = (1ull << be32toh(header.clustershift));
231 disk->disksz = be64toh(header.disksz);
232 disk->cryptmethod = be32toh(header.cryptmethod);
233 disk->l1sz = be32toh(header.l1sz);
234 disk->l1off = be64toh(header.l1off);
235 disk->refsz = be32toh(header.refsz);
236 disk->refoff = be64toh(header.refoff);
237 disk->nsnap = be32toh(header.snapcount);
238 disk->snapoff = be64toh(header.snapsz);
239
240 /*
241 * The additional features here are defined as 0 in the v2 format,
242 * so as long as we clear the buffer before parsing, we don't need
243 * to check versions here.
244 */
245 disk->incompatfeatures = be64toh(header.incompatfeatures);
246 disk->autoclearfeatures = be64toh(header.autoclearfeatures);
247 disk->refssz = be32toh(header.refsz);
248 disk->headersz = be32toh(header.headersz);
249
250 /*
251 * We only know about the dirty or corrupt bits here.
252 */
253 if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT))
254 fatalx("unsupported features %llx",
255 disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
256 if (be32toh(header.reforder) != 4)
257 fatalx("unsupported refcount size\n");
258
259 disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
260 if (!disk->l1)
261 fatal("%s: could not allocate l1 table", __func__);
262 if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
263 != 8 * disk->l1sz)
264 fatalx("%s: unable to read qcow2 L1 table", __func__);
265 for (i = 0; i < disk->l1sz; i++)
266 disk->l1[i] = be64toh(disk->l1[i]);
267 version = be32toh(header.version);
268 if (version != 2 && version != 3)
269 fatalx("%s: unknown qcow2 version %d", __func__, version);
270
271 backingoff = be64toh(header.backingoff);
272 backingsz = be32toh(header.backingsz);
273 if (backingsz != 0) {
274 if (backingsz >= sizeof(basepath) - 1) {
275 fatalx("%s: snapshot path too long", __func__);
276 }
277 if (pread(fd, basepath, backingsz, backingoff) != backingsz) {
278 fatalx("%s: could not read snapshot base name",
279 __func__);
280 }
281 basepath[backingsz] = 0;
282 if (nfd <= 1) {
283 fatalx("%s: missing base image %s", __func__,
284 basepath);
285 }
286
287
288 disk->base = calloc(1, sizeof(struct qcdisk));
289 if (!disk->base)
290 fatal("%s: could not open %s", __func__, basepath);
291 if (qc2_open(disk->base, fds + 1, nfd - 1) == -1)
292 fatalx("%s: could not open %s", __func__, basepath);
293 if (disk->base->clustersz != disk->clustersz)
294 fatalx("%s: all disk parts must share clustersize",
295 __func__);
296 }
297 if (fstat(fd, &st) == -1)
298 fatal("%s: unable to stat disk", __func__);
299
300 disk->end = st.st_size;
301
302 log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d",
303 __func__, version, disk->disksz, disk->end, disk->nsnap);
304
305 return 0;
306 }
307
/*
 * Scatter read: fill each of the cnt buffers in iov in turn, starting
 * at disk offset 'offset' and continuing where the previous buffer
 * ended.  Returns the number of bytes read in total, or -1 as soon as
 * one of the underlying reads fails.
 */
static ssize_t
qc2_preadv(void *p, struct iovec *iov, int cnt, off_t offset)
{
	ssize_t got, done = 0;
	int n;

	for (n = 0; n < cnt; n++) {
		got = qc2_pread(p, iov[n].iov_base, iov[n].iov_len,
		    offset + done);
		if (got == -1)
			return (got);
		done += got;
	}

	return (done);
}
325
326 static ssize_t
qc2_pread(void * p,char * buf,size_t len,off_t off)327 qc2_pread(void *p, char *buf, size_t len, off_t off)
328 {
329 struct qcdisk *disk, *d;
330 off_t phys_off, end, cluster_off;
331 ssize_t sz, rem;
332
333 disk = p;
334 end = off + len;
335 if (off < 0 || end > disk->disksz)
336 return -1;
337
338 /* handle head chunk separately */
339 rem = len;
340 while (off != end) {
341 for (d = disk; d; d = d->base)
342 if ((phys_off = xlate(d, off, NULL)) > 0)
343 break;
344 /* Break out into chunks. This handles
345 * three cases:
346 *
347 * |----+====|========|====+-----|
348 *
349 * Either we are at the start of the read,
350 * and the cluster has some leading bytes.
351 * This means that we are reading the tail
352 * of the cluster, and our size is:
353 *
354 * clustersz - (off % clustersz).
355 *
356 * Otherwise, we're reading the middle section.
357 * We're already aligned here, so we can just
358 * read the whole cluster size. Or we're at the
359 * tail, at which point we just want to read the
360 * remaining bytes.
361 */
362 cluster_off = off % disk->clustersz;
363 sz = disk->clustersz - cluster_off;
364 if (sz > rem)
365 sz = rem;
366 /*
367 * If we're within the disk, but don't have backing bytes,
368 * just read back zeros.
369 */
370 if (!d)
371 bzero(buf, sz);
372 else if (pread(d->fd, buf, sz, phys_off) != sz)
373 return -1;
374 off += sz;
375 buf += sz;
376 rem -= sz;
377 }
378 return len;
379 }
380
/*
 * Gather write: write each of the cnt buffers in iov in turn, starting
 * at disk offset 'offset' and continuing where the previous buffer
 * ended.  Returns the number of bytes written in total, or -1 as soon
 * as one of the underlying writes fails.
 */
static ssize_t
qc2_pwritev(void *p, struct iovec *iov, int cnt, off_t offset)
{
	ssize_t put, done = 0;
	int n;

	for (n = 0; n < cnt; n++) {
		put = qc2_pwrite(p, iov[n].iov_base, iov[n].iov_len,
		    offset + done);
		if (put == -1)
			return (put);
		done += put;
	}

	return (done);
}
398
399 static ssize_t
qc2_pwrite(void * p,char * buf,size_t len,off_t off)400 qc2_pwrite(void *p, char *buf, size_t len, off_t off)
401 {
402 struct qcdisk *disk, *d;
403 off_t phys_off, cluster_off, end;
404 ssize_t sz, rem;
405 int inplace;
406
407 d = p;
408 disk = p;
409 inplace = 1;
410 end = off + len;
411 if (off < 0 || end > disk->disksz)
412 return -1;
413 rem = len;
414 while (off != end) {
415 /* See the read code for a summary of the computation */
416 cluster_off = off % disk->clustersz;
417 sz = disk->clustersz - cluster_off;
418 if (sz > rem)
419 sz = rem;
420
421 phys_off = xlate(disk, off, &inplace);
422 if (phys_off == -1)
423 return -1;
424 /*
425 * If we couldn't find the cluster in the writable disk,
426 * see if it exists in the base image. If it does, we
427 * need to copy it before the write. The copy happens
428 * in the '!inplace' if clause below te search.
429 */
430 if (phys_off == 0)
431 for (d = disk->base; d; d = d->base)
432 if ((phys_off = xlate(d, off, NULL)) > 0)
433 break;
434 if (!inplace || phys_off == 0)
435 phys_off = mkcluster(disk, d, off, phys_off);
436 if (phys_off == -1)
437 return -1;
438 if (phys_off < disk->clustersz)
439 fatalx("%s: writing reserved cluster", __func__);
440 if (pwrite(disk->fd, buf, sz, phys_off) != sz)
441 return -1;
442 off += sz;
443 buf += sz;
444 rem -= sz;
445 }
446 return len;
447 }
448
449 static void
qc2_close(void * p,int stayopen)450 qc2_close(void *p, int stayopen)
451 {
452 struct qcdisk *disk;
453
454 disk = p;
455 if (disk->base)
456 qc2_close(disk->base, stayopen);
457 if (!stayopen)
458 close(disk->fd);
459 free(disk->l1);
460 free(disk);
461 }
462
463 /*
464 * Translates a virtual offset into an on-disk offset.
465 * Returns:
466 * -1 on error
467 * 0 on 'not found'
468 * >0 on found
469 */
470 static off_t
xlate(struct qcdisk * disk,off_t off,int * inplace)471 xlate(struct qcdisk *disk, off_t off, int *inplace)
472 {
473 off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
474 uint64_t buf;
475
476
477 /*
478 * Clear out inplace flag -- xlate misses should not
479 * be flagged as updatable in place. We will still
480 * return 0 from them, but this leaves less surprises
481 * in the API.
482 */
483 if (inplace)
484 *inplace = 0;
485 pthread_rwlock_rdlock(&disk->lock);
486 if (off < 0)
487 goto err;
488
489 l2sz = disk->clustersz / 8;
490 l1off = (off / disk->clustersz) / l2sz;
491 if (l1off >= disk->l1sz)
492 goto err;
493
494 l2tab = disk->l1[l1off];
495 l2tab &= ~QCOW2_INPLACE;
496 if (l2tab == 0) {
497 pthread_rwlock_unlock(&disk->lock);
498 return 0;
499 }
500 l2off = (off / disk->clustersz) % l2sz;
501 pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
502 cluster = be64toh(buf);
503 /*
504 * cluster may be 0, but all future operations don't affect
505 * the return value.
506 */
507 if (inplace)
508 *inplace = !!(cluster & QCOW2_INPLACE);
509 if (cluster & QCOW2_COMPRESSED)
510 fatalx("%s: compressed clusters unsupported", __func__);
511 pthread_rwlock_unlock(&disk->lock);
512 clusteroff = 0;
513 cluster &= ~QCOW2_INPLACE;
514 if (cluster)
515 clusteroff = off % disk->clustersz;
516 return cluster + clusteroff;
517 err:
518 pthread_rwlock_unlock(&disk->lock);
519 return -1;
520 }
521
522 /*
523 * Allocates a new cluster on disk, creating a new L2 table
524 * if needed. The cluster starts off with a refs of one,
525 * and the writable bit set.
526 *
527 * Returns -1 on error, and the physical address within the
528 * cluster of the write offset if it exists.
529 */
530 static off_t
mkcluster(struct qcdisk * disk,struct qcdisk * base,off_t off,off_t src_phys)531 mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
532 {
533 off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
534 uint64_t buf;
535
536 pthread_rwlock_wrlock(&disk->lock);
537
538 cluster = -1;
539 /* L1 entries always exist */
540 l2sz = disk->clustersz / 8;
541 l1off = off / (disk->clustersz * l2sz);
542 if (l1off >= disk->l1sz)
543 fatalx("l1 offset outside disk");
544
545 disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);
546
547 l2tab = disk->l1[l1off];
548 l2off = (off / disk->clustersz) % l2sz;
549 /* We may need to create or clone an L2 entry to map the block */
550 if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
551 orig = l2tab & ~QCOW2_INPLACE;
552 l2tab = disk->end;
553 disk->end += disk->clustersz;
554 if (ftruncate(disk->fd, disk->end) == -1)
555 fatal("%s: ftruncate failed", __func__);
556
557 /*
558 * If we translated, found a L2 entry, but it needed to
559 * be copied, copy it.
560 */
561 if (orig != 0)
562 copy_cluster(disk, disk, l2tab, orig);
563 /* Update l1 -- we flush it later */
564 disk->l1[l1off] = l2tab | QCOW2_INPLACE;
565 inc_refs(disk, l2tab, 1);
566 }
567 l2tab &= ~QCOW2_INPLACE;
568
569 /* Grow the disk */
570 if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
571 fatal("%s: could not grow disk", __func__);
572 if (src_phys > 0)
573 copy_cluster(disk, base, disk->end, src_phys);
574 cluster = disk->end;
575 disk->end += disk->clustersz;
576 buf = htobe64(cluster | QCOW2_INPLACE);
577 if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
578 fatalx("%s: could not write cluster", __func__);
579
580 /* TODO: lazily sync: currently VMD doesn't close things */
581 buf = htobe64(disk->l1[l1off]);
582 if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
583 fatalx("%s: could not write l1", __func__);
584 inc_refs(disk, cluster, 1);
585
586 pthread_rwlock_unlock(&disk->lock);
587 clusteroff = off % disk->clustersz;
588 if (cluster + clusteroff < disk->clustersz)
589 fatalx("write would clobber header");
590 return cluster + clusteroff;
591 }
592
593 /* Copies a cluster containing src to dst. Src and dst need not be aligned. */
594 static void
copy_cluster(struct qcdisk * disk,struct qcdisk * base,off_t dst,off_t src)595 copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
596 {
597 char *scratch;
598
599 scratch = malloc(disk->clustersz);
600 if (!scratch)
601 fatal("out of memory");
602 src &= ~(disk->clustersz - 1);
603 dst &= ~(disk->clustersz - 1);
604 if (pread(base->fd, scratch, disk->clustersz, src) == -1)
605 fatal("%s: could not read cluster", __func__);
606 if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
607 fatal("%s: could not write cluster", __func__);
608 free(scratch);
609 }
610
611 static void
inc_refs(struct qcdisk * disk,off_t off,int newcluster)612 inc_refs(struct qcdisk *disk, off_t off, int newcluster)
613 {
614 off_t l1off, l1idx, l2idx, l2cluster;
615 size_t nper;
616 uint16_t refs;
617 uint64_t buf;
618
619 off &= ~QCOW2_INPLACE;
620 nper = disk->clustersz / 2;
621 l1idx = (off / disk->clustersz) / nper;
622 l2idx = (off / disk->clustersz) % nper;
623 l1off = disk->refoff + 8 * l1idx;
624 if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
625 fatal("could not read refs");
626
627 l2cluster = be64toh(buf);
628 if (l2cluster == 0) {
629 l2cluster = disk->end;
630 disk->end += disk->clustersz;
631 if (ftruncate(disk->fd, disk->end) < 0)
632 fatal("%s: failed to allocate ref block", __func__);
633 buf = htobe64(l2cluster);
634 if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8)
635 fatal("%s: failed to write ref block", __func__);
636 }
637
638 refs = 1;
639 if (!newcluster) {
640 if (pread(disk->fd, &refs, sizeof(refs),
641 l2cluster + 2 * l2idx) != 2)
642 fatal("could not read ref cluster");
643 refs = be16toh(refs) + 1;
644 }
645 refs = htobe16(refs);
646 if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2)
647 fatal("%s: could not write ref block", __func__);
648 }
649
650 /*
651 * virtio_qcow2_create
652 *
653 * Create an empty qcow2 imagefile with the specified path and size.
654 *
655 * Parameters:
656 * imgfile_path: path to the image file to create
657 * imgsize : size of the image file to create (in bytes)
658 *
659 * Return:
660 * EEXIST: The requested image file already exists
661 * 0 : Image file successfully created
662 * Exxxx : Various other Exxxx errno codes due to other I/O errors
663 */
664 int
virtio_qcow2_create(const char * imgfile_path,const char * base_path,uint64_t disksz)665 virtio_qcow2_create(const char *imgfile_path,
666 const char *base_path, uint64_t disksz)
667 {
668 struct qcheader hdr, basehdr;
669 int fd, ret;
670 ssize_t base_len;
671 uint64_t l1sz, refsz, initsz, clustersz;
672 uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
673 uint16_t refs;
674
675 if (base_path) {
676 fd = open(base_path, O_RDONLY);
677 if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
678 errx(1, "failure to read base image header");
679 close(fd);
680 if (strncmp(basehdr.magic,
681 VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
682 errx(1, "base image is not a qcow2 file");
683 if (!disksz)
684 disksz = betoh64(basehdr.disksz);
685 else if (disksz != betoh64(basehdr.disksz))
686 errx(1, "base size does not match requested size");
687 }
688 if (!base_path && !disksz)
689 errx(1, "missing disk size");
690
691 clustersz = (1<<16);
692 l1off = ALIGNSZ(sizeof(hdr), clustersz);
693
694 l1entrysz = clustersz * clustersz / 8;
695 l1sz = (disksz + l1entrysz - 1) / l1entrysz;
696
697 refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
698 refentrysz = clustersz * clustersz * clustersz / 2;
699 refsz = (disksz + refentrysz - 1) / refentrysz;
700
701 initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
702 base_len = base_path ? strlen(base_path) : 0;
703
704 memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
705 hdr.version = htobe32(3);
706 hdr.backingoff = htobe64(base_path ? sizeof(hdr) : 0);
707 hdr.backingsz = htobe32(base_len);
708 hdr.clustershift = htobe32(16);
709 hdr.disksz = htobe64(disksz);
710 hdr.cryptmethod = htobe32(0);
711 hdr.l1sz = htobe32(l1sz);
712 hdr.l1off = htobe64(l1off);
713 hdr.refoff = htobe64(refoff);
714 hdr.refsz = htobe32(refsz);
715 hdr.snapcount = htobe32(0);
716 hdr.snapsz = htobe64(0);
717 hdr.incompatfeatures = htobe64(0);
718 hdr.compatfeatures = htobe64(0);
719 hdr.autoclearfeatures = htobe64(0);
720 hdr.reforder = htobe32(4);
721 hdr.headersz = htobe32(sizeof(hdr));
722
723 /* Refuse to overwrite an existing image */
724 fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
725 S_IRUSR | S_IWUSR);
726 if (fd == -1)
727 return (errno);
728
729 /* Write out the header */
730 if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
731 goto error;
732
733 /* Add the base image */
734 if (base_path && write(fd, base_path, base_len) != base_len)
735 goto error;
736
737 /* Extend to desired size, and add one refcount cluster */
738 if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
739 goto error;
740
741 /*
742 * Paranoia: if our disk image takes more than one cluster
743 * to refcount the initial image, fail.
744 */
745 if (initsz/clustersz > clustersz/2) {
746 errno = ERANGE;
747 goto error;
748 }
749
750 /* Add a refcount block, and refcount ourselves. */
751 v = htobe64(initsz);
752 if (pwrite(fd, &v, 8, refoff) != 8)
753 goto error;
754 for (i = 0; i < initsz/clustersz + 1; i++) {
755 refs = htobe16(1);
756 if (pwrite(fd, &refs, 2, initsz + 2*i) != 2)
757 goto error;
758 }
759
760 ret = close(fd);
761 return (ret);
762 error:
763 ret = errno;
764 close(fd);
765 unlink(imgfile_path);
766 return (errno);
767 }
768