xref: /openbsd/usr.sbin/vmd/vioqcow2.c (revision 76d0caae)
1 /*	$OpenBSD: vioqcow2.c,v 1.16 2021/06/16 16:55:02 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/stat.h>
21 
22 #include <dev/pci/pcireg.h>
23 #include <machine/vmmvar.h>
24 
25 #include <assert.h>
26 #include <err.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <libgen.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <unistd.h>
33 
34 #include "virtio.h"
35 
/*
 * Flag bits carried in the top of the 64-bit L1/L2 table entries.
 * xlate() refuses entries marked QCOW2_COMPRESSED; QCOW2_INPLACE marks an
 * entry whose target may be overwritten without being copied first.
 */
#define QCOW2_COMPRESSED	(1ull << 62)
#define QCOW2_INPLACE		(1ull << 63)

/* Incompatible-feature bits that qc2_open() tolerates in the header. */
#define QCOW2_DIRTY		0x1
#define QCOW2_CORRUPT		0x2

/* Same two incompatible-feature bits, spelled as enumerators. */
enum {
	ICFEATURE_DIRTY		= 0x1,
	ICFEATURE_CORRUPT	= 0x2,
};

/* Autoclear-feature bit, judging by the name; unused in the code shown. */
enum {
	ACFEATURE_BITEXT	= 0x1,
};
50 
/*
 * On-disk image header, transferred verbatim with pread()/write().
 * Multi-byte fields are big-endian in the file; the code that consumes
 * them converts with be32toh()/be64toh().
 */
struct qcheader {
	char magic[4];			/* compared against VM_MAGIC_QCOW */
	uint32_t version;		/* qc2_open() accepts 2 and 3 */
	uint64_t backingoff;		/* file offset of the base image path */
	uint32_t backingsz;		/* length of that path; 0 = no base */
	uint32_t clustershift;		/* cluster size is 1 << clustershift */
	uint64_t disksz;		/* virtual disk size in bytes */
	uint32_t cryptmethod;
	uint32_t l1sz;			/* number of 8-byte L1 entries */
	uint64_t l1off;			/* file offset of the L1 table */
	uint64_t refoff;		/* file offset of the refcount table */
	uint32_t refsz;			/* clusters used by the refcount table */
	uint32_t snapcount;
	uint64_t snapsz;		/* loaded into qcdisk.snapoff */

	/* v3 additions */
	uint64_t incompatfeatures;
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;		/* Bits = 1 << reforder */
	uint32_t headersz;
} __packed;
72 
73 struct qcdisk {
74 	pthread_rwlock_t lock;
75 	struct qcdisk *base;
76 	struct qcheader header;
77 
78 	int       fd;
79 	uint64_t *l1;
80 	off_t     end;
81 	off_t	  clustersz;
82 	off_t	  disksz; /* In bytes */
83 	uint32_t  cryptmethod;
84 
85 	uint32_t l1sz;
86 	off_t	 l1off;
87 
88 	off_t	 refoff;
89 	off_t	 refsz;
90 
91 	uint32_t nsnap;
92 	off_t	 snapoff;
93 
94 	/* v3 features */
95 	uint64_t incompatfeatures;
96 	uint64_t autoclearfeatures;
97 	uint32_t refssz;
98 	uint32_t headersz;
99 };
100 
extern char *__progname;

/* File-local helpers and the callbacks installed by virtio_qcow2_init(). */
static off_t	xlate(struct qcdisk *disk, off_t off, int *inplace);
static void	copy_cluster(struct qcdisk *disk, struct qcdisk *base,
		    off_t dst, off_t src);
static void	inc_refs(struct qcdisk *disk, off_t off, int newcluster);
static off_t	mkcluster(struct qcdisk *disk, struct qcdisk *base,
		    off_t off, off_t src_phys);
static int	qc2_open(struct qcdisk *disk, int *fds, size_t nfd);
static ssize_t	qc2_pread(void *p, char *buf, size_t len, off_t off);
static ssize_t	qc2_pwrite(void *p, char *buf, size_t len, off_t off);
static void	qc2_close(void *p, int stayopen);
111 
112 /*
113  * Initializes a raw disk image backing file from an fd.
114  * Stores the number of 512 byte sectors in *szp,
115  * returning -1 for error, 0 for success.
116  *
117  * May open snapshot base images.
118  */
119 int
120 virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
121 {
122 	struct qcdisk *diskp;
123 
124 	diskp = malloc(sizeof(struct qcdisk));
125 	if (diskp == NULL)
126 		return -1;
127 	if (qc2_open(diskp, fd, nfd) == -1) {
128 		log_warnx("could not open qcow2 disk");
129 		return -1;
130 	}
131 	file->p = diskp;
132 	file->pread = qc2_pread;
133 	file->pwrite = qc2_pwrite;
134 	file->close = qc2_close;
135 	*szp = diskp->disksz;
136 	return 0;
137 }
138 
139 /*
140  * Return the path to the base image given a disk image.
141  * Called from vmctl.
142  */
143 ssize_t
144 virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
145 {
146 	char dpathbuf[PATH_MAX];
147 	char expanded[PATH_MAX];
148 	struct qcheader header;
149 	uint64_t backingoff;
150 	uint32_t backingsz;
151 	char *s = NULL;
152 
153 	if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
154 		log_warnx("short read on header");
155 		return -1;
156 	}
157 	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
158 		log_warnx("invalid magic numbers");
159 		return -1;
160 	}
161 	backingoff = be64toh(header.backingoff);
162 	backingsz = be32toh(header.backingsz);
163 	if (backingsz == 0)
164 		return 0;
165 
166 	if (backingsz >= npath - 1) {
167 		log_warnx("snapshot path too long");
168 		return -1;
169 	}
170 	if (pread(fd, path, backingsz, backingoff) != backingsz) {
171 		log_warnx("could not read snapshot base name");
172 		return -1;
173 	}
174 	path[backingsz] = '\0';
175 
176 	/*
177 	 * Relative paths should be interpreted relative to the disk image,
178 	 * rather than relative to the directory vmd happens to be running in,
179 	 * since this is the only userful interpretation.
180 	 */
181 	if (path[0] == '/') {
182 		if (realpath(path, expanded) == NULL ||
183 		    strlcpy(path, expanded, npath) >= npath) {
184 			log_warnx("unable to resolve %s", path);
185 			return -1;
186 		}
187 	} else {
188 		if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
189 		    sizeof(dpathbuf)) {
190 			log_warnx("path too long: %s", dpath);
191 			return -1;
192 		}
193 		s = dirname(dpathbuf);
194 		if (snprintf(expanded, sizeof(expanded),
195 		    "%s/%s", s, path) >= (int)sizeof(expanded)) {
196 			log_warnx("path too long: %s/%s", s, path);
197 			return -1;
198 		}
199 		if (npath < PATH_MAX ||
200 		    realpath(expanded, path) == NULL) {
201 			log_warnx("unable to resolve %s", path);
202 			return -1;
203 		}
204 	}
205 
206 	return strlen(path);
207 }
208 
209 static int
210 qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
211 {
212 	char basepath[PATH_MAX];
213 	struct stat st;
214 	struct qcheader header;
215 	uint64_t backingoff;
216 	uint32_t backingsz;
217 	off_t i;
218 	int version, fd;
219 
220 	pthread_rwlock_init(&disk->lock, NULL);
221 	fd = fds[0];
222 	disk->fd = fd;
223 	disk->base = NULL;
224 	disk->l1 = NULL;
225 
226 	if (pread(fd, &header, sizeof(header), 0) != sizeof(header))
227 		fatalx("short read on header");
228 	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
229 		fatalx("invalid magic numbers");
230 
231 	disk->clustersz		= (1ull << be32toh(header.clustershift));
232 	disk->disksz		= be64toh(header.disksz);
233 	disk->cryptmethod	= be32toh(header.cryptmethod);
234 	disk->l1sz		= be32toh(header.l1sz);
235 	disk->l1off		= be64toh(header.l1off);
236 	disk->refsz		= be32toh(header.refsz);
237 	disk->refoff		= be64toh(header.refoff);
238 	disk->nsnap		= be32toh(header.snapcount);
239 	disk->snapoff		= be64toh(header.snapsz);
240 
241 	/*
242 	 * The additional features here are defined as 0 in the v2 format,
243 	 * so as long as we clear the buffer before parsing, we don't need
244 	 * to check versions here.
245 	 */
246 	disk->incompatfeatures = be64toh(header.incompatfeatures);
247 	disk->autoclearfeatures = be64toh(header.autoclearfeatures);
248 	disk->refssz = be32toh(header.refsz);
249 	disk->headersz = be32toh(header.headersz);
250 
251 	/*
252 	 * We only know about the dirty or corrupt bits here.
253 	 */
254 	if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT))
255 		fatalx("unsupported features %llx",
256 		    disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
257 	if (be32toh(header.reforder) != 4)
258 		fatalx("unsupported refcount size\n");
259 
260 	disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
261 	if (!disk->l1)
262 		fatal("%s: could not allocate l1 table", __func__);
263 	if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
264 	    != 8 * disk->l1sz)
265 		fatalx("%s: unable to read qcow2 L1 table", __func__);
266 	for (i = 0; i < disk->l1sz; i++)
267 		disk->l1[i] = be64toh(disk->l1[i]);
268 	version = be32toh(header.version);
269 	if (version != 2 && version != 3)
270 		fatalx("%s: unknown qcow2 version %d", __func__, version);
271 
272 	backingoff = be64toh(header.backingoff);
273 	backingsz = be32toh(header.backingsz);
274 	if (backingsz != 0) {
275 		if (backingsz >= sizeof(basepath) - 1) {
276 			fatalx("%s: snapshot path too long", __func__);
277 		}
278 		if (pread(fd, basepath, backingsz, backingoff) != backingsz) {
279 			fatalx("%s: could not read snapshot base name",
280 			    __func__);
281 		}
282 		basepath[backingsz] = 0;
283 		if (nfd <= 1) {
284 			fatalx("%s: missing base image %s", __func__,
285 			    basepath);
286 		}
287 
288 
289 		disk->base = calloc(1, sizeof(struct qcdisk));
290 		if (!disk->base)
291 			fatal("%s: could not open %s", __func__, basepath);
292 		if (qc2_open(disk->base, fds + 1, nfd - 1) == -1)
293 			fatalx("%s: could not open %s", __func__, basepath);
294 		if (disk->base->clustersz != disk->clustersz)
295 			fatalx("%s: all disk parts must share clustersize",
296 			    __func__);
297 	}
298 	if (fstat(fd, &st) == -1)
299 		fatal("%s: unable to stat disk", __func__);
300 
301 	disk->end = st.st_size;
302 
303 	log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d",
304 	    __func__, version, disk->disksz, disk->end, disk->nsnap);
305 
306 	return 0;
307 }
308 
309 static ssize_t
310 qc2_pread(void *p, char *buf, size_t len, off_t off)
311 {
312 	struct qcdisk *disk, *d;
313 	off_t phys_off, end, cluster_off;
314 	ssize_t sz, rem;
315 
316 	disk = p;
317 	end = off + len;
318 	if (off < 0 || end > disk->disksz)
319 		return -1;
320 
321 	/* handle head chunk separately */
322 	rem = len;
323 	while (off != end) {
324 		for (d = disk; d; d = d->base)
325 			if ((phys_off = xlate(d, off, NULL)) > 0)
326 				break;
327 		/* Break out into chunks. This handles
328 		 * three cases:
329 		 *
330 		 *    |----+====|========|====+-----|
331 		 *
332 		 * Either we are at the start of the read,
333 		 * and the cluster has some leading bytes.
334 		 * This means that we are reading the tail
335 		 * of the cluster, and our size is:
336 		 *
337 		 * 	clustersz - (off % clustersz).
338 		 *
339 		 * Otherwise, we're reading the middle section.
340 		 * We're already aligned here, so we can just
341 		 * read the whole cluster size. Or we're at the
342 		 * tail, at which point we just want to read the
343 		 * remaining bytes.
344 		 */
345 		cluster_off = off % disk->clustersz;
346 		sz = disk->clustersz - cluster_off;
347 		if (sz > rem)
348 			sz = rem;
349 		/*
350 		 * If we're within the disk, but don't have backing bytes,
351 		 * just read back zeros.
352 		 */
353 		if (!d)
354 			bzero(buf, sz);
355 		else if (pread(d->fd, buf, sz, phys_off) != sz)
356 			return -1;
357 		off += sz;
358 		buf += sz;
359 		rem -= sz;
360 	}
361 	return len;
362 }
363 
364 ssize_t
365 qc2_pwrite(void *p, char *buf, size_t len, off_t off)
366 {
367 	struct qcdisk *disk, *d;
368 	off_t phys_off, cluster_off, end;
369 	ssize_t sz, rem;
370 	int inplace;
371 
372 	d = p;
373 	disk = p;
374 	inplace = 1;
375 	end = off + len;
376 	if (off < 0 || end > disk->disksz)
377 		return -1;
378 	rem = len;
379 	while (off != end) {
380 		/* See the read code for a summary of the computation */
381 		cluster_off = off % disk->clustersz;
382 		sz = disk->clustersz - cluster_off;
383 		if (sz > rem)
384 			sz = rem;
385 
386 		phys_off = xlate(disk, off, &inplace);
387 		if (phys_off == -1)
388 			return -1;
389 		/*
390 		 * If we couldn't find the cluster in the writable disk,
391 		 * see if it exists in the base image. If it does, we
392 		 * need to copy it before the write. The copy happens
393 		 * in the '!inplace' if clause below te search.
394 		 */
395 		if (phys_off == 0)
396 			for (d = disk->base; d; d = d->base)
397 				if ((phys_off = xlate(d, off, NULL)) > 0)
398 					break;
399 		if (!inplace || phys_off == 0)
400 			phys_off = mkcluster(disk, d, off, phys_off);
401 		if (phys_off == -1)
402 			return -1;
403 		if (phys_off < disk->clustersz)
404 			fatalx("%s: writing reserved cluster", __func__);
405 		if (pwrite(disk->fd, buf, sz, phys_off) != sz)
406 			return -1;
407 		off += sz;
408 		buf += sz;
409 		rem -= sz;
410 	}
411 	return len;
412 }
413 
414 static void
415 qc2_close(void *p, int stayopen)
416 {
417 	struct qcdisk *disk;
418 
419 	disk = p;
420 	if (disk->base)
421 		qc2_close(disk->base, stayopen);
422 	if (!stayopen)
423 		close(disk->fd);
424 	free(disk->l1);
425 	free(disk);
426 }
427 
428 /*
429  * Translates a virtual offset into an on-disk offset.
430  * Returns:
431  * 	-1 on error
432  * 	 0 on 'not found'
433  * 	>0 on found
434  */
435 static off_t
436 xlate(struct qcdisk *disk, off_t off, int *inplace)
437 {
438 	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
439 	uint64_t buf;
440 
441 
442 	/*
443 	 * Clear out inplace flag -- xlate misses should not
444 	 * be flagged as updatable in place. We will still
445 	 * return 0 from them, but this leaves less surprises
446 	 * in the API.
447 	 */
448 	if (inplace)
449 		*inplace = 0;
450 	pthread_rwlock_rdlock(&disk->lock);
451 	if (off < 0)
452 		goto err;
453 
454 	l2sz = disk->clustersz / 8;
455 	l1off = (off / disk->clustersz) / l2sz;
456 	if (l1off >= disk->l1sz)
457 		goto err;
458 
459 	l2tab = disk->l1[l1off];
460 	l2tab &= ~QCOW2_INPLACE;
461 	if (l2tab == 0) {
462 		pthread_rwlock_unlock(&disk->lock);
463 		return 0;
464 	}
465 	l2off = (off / disk->clustersz) % l2sz;
466 	pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
467 	cluster = be64toh(buf);
468 	/*
469 	 * cluster may be 0, but all future operations don't affect
470 	 * the return value.
471 	 */
472 	if (inplace)
473 		*inplace = !!(cluster & QCOW2_INPLACE);
474 	if (cluster & QCOW2_COMPRESSED)
475 		fatalx("%s: compressed clusters unsupported", __func__);
476 	pthread_rwlock_unlock(&disk->lock);
477 	clusteroff = 0;
478 	cluster &= ~QCOW2_INPLACE;
479 	if (cluster)
480 		clusteroff = off % disk->clustersz;
481 	return cluster + clusteroff;
482 err:
483 	pthread_rwlock_unlock(&disk->lock);
484 	return -1;
485 }
486 
487 /*
488  * Allocates a new cluster on disk, creating a new L2 table
489  * if needed. The cluster starts off with a refs of one,
490  * and the writable bit set.
491  *
492  * Returns -1 on error, and the physical address within the
493  * cluster of the write offset if it exists.
494  */
495 static off_t
496 mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
497 {
498 	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
499 	uint64_t buf;
500 	int fd;
501 
502 	pthread_rwlock_wrlock(&disk->lock);
503 
504 	cluster = -1;
505 	fd = disk->fd;
506 	/* L1 entries always exist */
507 	l2sz = disk->clustersz / 8;
508 	l1off = off / (disk->clustersz * l2sz);
509 	if (l1off >= disk->l1sz)
510 		fatalx("l1 offset outside disk");
511 
512 	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);
513 
514 	l2tab = disk->l1[l1off];
515 	l2off = (off / disk->clustersz) % l2sz;
516 	/* We may need to create or clone an L2 entry to map the block */
517 	if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
518 		orig = l2tab & ~QCOW2_INPLACE;
519 		l2tab = disk->end;
520 		disk->end += disk->clustersz;
521 		if (ftruncate(disk->fd, disk->end) == -1)
522 			fatal("%s: ftruncate failed", __func__);
523 
524 		/*
525 		 * If we translated, found a L2 entry, but it needed to
526 		 * be copied, copy it.
527 		 */
528 		if (orig != 0)
529 			copy_cluster(disk, disk, l2tab, orig);
530 		/* Update l1 -- we flush it later */
531 		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
532 		inc_refs(disk, l2tab, 1);
533 	}
534 	l2tab &= ~QCOW2_INPLACE;
535 
536 	/* Grow the disk */
537 	if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
538 		fatalx("%s: could not grow disk", __func__);
539 	if (src_phys > 0)
540 		copy_cluster(disk, base, disk->end, src_phys);
541 	cluster = disk->end;
542 	disk->end += disk->clustersz;
543 	buf = htobe64(cluster | QCOW2_INPLACE);
544 	if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
545 		fatalx("%s: could not write cluster", __func__);
546 
547 	/* TODO: lazily sync: currently VMD doesn't close things */
548 	buf = htobe64(disk->l1[l1off]);
549 	if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
550 		fatalx("%s: could not write l1", __func__);
551 	inc_refs(disk, cluster, 1);
552 
553 	pthread_rwlock_unlock(&disk->lock);
554 	clusteroff = off % disk->clustersz;
555 	if (cluster + clusteroff < disk->clustersz)
556 		fatalx("write would clobber header");
557 	return cluster + clusteroff;
558 }
559 
560 /* Copies a cluster containing src to dst. Src and dst need not be aligned. */
561 static void
562 copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
563 {
564 	char *scratch;
565 
566 	scratch = malloc(disk->clustersz);
567 	if (!scratch)
568 		fatal("out of memory");
569 	src &= ~(disk->clustersz - 1);
570 	dst &= ~(disk->clustersz - 1);
571 	if (pread(base->fd, scratch, disk->clustersz, src) == -1)
572 		fatal("%s: could not read cluster", __func__);
573 	if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
574 		fatal("%s: could not write cluster", __func__);
575 	free(scratch);
576 }
577 
578 static void
579 inc_refs(struct qcdisk *disk, off_t off, int newcluster)
580 {
581 	off_t l1off, l1idx, l2idx, l2cluster;
582 	size_t nper;
583 	uint16_t refs;
584 	uint64_t buf;
585 
586 	off &= ~QCOW2_INPLACE;
587 	nper = disk->clustersz / 2;
588 	l1idx = (off / disk->clustersz) / nper;
589 	l2idx = (off / disk->clustersz) % nper;
590 	l1off = disk->refoff + 8 * l1idx;
591 	if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
592 		fatal("could not read refs");
593 
594 	l2cluster = be64toh(buf);
595 	if (l2cluster == 0) {
596 		l2cluster = disk->end;
597 		disk->end += disk->clustersz;
598 		if (ftruncate(disk->fd, disk->end) < 0)
599 			fatal("%s: failed to allocate ref block", __func__);
600 		buf = htobe64(l2cluster);
601 		if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8)
602 			fatal("%s: failed to write ref block", __func__);
603 	}
604 
605 	refs = 1;
606 	if (!newcluster) {
607 		if (pread(disk->fd, &refs, sizeof(refs),
608 		    l2cluster + 2 * l2idx) != 2)
609 			fatal("could not read ref cluster");
610 		refs = be16toh(refs) + 1;
611 	}
612 	refs = htobe16(refs);
613 	if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2)
614 		fatal("%s: could not write ref block", __func__);
615 }
616 
617 /*
618  * virtio_qcow2_create
619  *
620  * Create an empty qcow2 imagefile with the specified path and size.
621  *
622  * Parameters:
623  *  imgfile_path: path to the image file to create
624  *  imgsize     : size of the image file to create (in MB)
625  *
626  * Return:
627  *  EEXIST: The requested image file already exists
628  *  0     : Image file successfully created
629  *  Exxxx : Various other Exxxx errno codes due to other I/O errors
630  */
631 int
632 virtio_qcow2_create(const char *imgfile_path,
633     const char *base_path, long imgsize)
634 {
635 	struct qcheader hdr, basehdr;
636 	int fd, ret;
637 	ssize_t base_len;
638 	uint64_t l1sz, refsz, disksz, initsz, clustersz;
639 	uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
640 	uint16_t refs;
641 
642 	disksz = 1024 * 1024 * imgsize;
643 
644 	if (base_path) {
645 		fd = open(base_path, O_RDONLY);
646 		if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
647 			err(1, "failure to read base image header");
648 		close(fd);
649 		if (strncmp(basehdr.magic,
650 		    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
651 			errx(1, "base image is not a qcow2 file");
652 		if (!disksz)
653 			disksz = betoh64(basehdr.disksz);
654 		else if (disksz != betoh64(basehdr.disksz))
655 			errx(1, "base size does not match requested size");
656 	}
657 	if (!base_path && !disksz)
658 		errx(1, "missing disk size");
659 
660 	clustersz = (1<<16);
661 	l1off = ALIGNSZ(sizeof(hdr), clustersz);
662 
663 	l1entrysz = clustersz * clustersz / 8;
664 	l1sz = (disksz + l1entrysz - 1) / l1entrysz;
665 
666 	refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
667 	refentrysz = clustersz * clustersz * clustersz / 2;
668 	refsz = (disksz + refentrysz - 1) / refentrysz;
669 
670 	initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
671 	base_len = base_path ? strlen(base_path) : 0;
672 
673 	memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
674 	hdr.version		= htobe32(3);
675 	hdr.backingoff		= htobe64(base_path ? sizeof(hdr) : 0);
676 	hdr.backingsz		= htobe32(base_len);
677 	hdr.clustershift	= htobe32(16);
678 	hdr.disksz		= htobe64(disksz);
679 	hdr.cryptmethod		= htobe32(0);
680 	hdr.l1sz		= htobe32(l1sz);
681 	hdr.l1off		= htobe64(l1off);
682 	hdr.refoff		= htobe64(refoff);
683 	hdr.refsz		= htobe32(refsz);
684 	hdr.snapcount		= htobe32(0);
685 	hdr.snapsz		= htobe64(0);
686 	hdr.incompatfeatures	= htobe64(0);
687 	hdr.compatfeatures	= htobe64(0);
688 	hdr.autoclearfeatures	= htobe64(0);
689 	hdr.reforder		= htobe32(4);
690 	hdr.headersz		= htobe32(sizeof(hdr));
691 
692 	/* Refuse to overwrite an existing image */
693 	fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
694 	    S_IRUSR | S_IWUSR);
695 	if (fd == -1)
696 		return (errno);
697 
698 	/* Write out the header */
699 	if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
700 		goto error;
701 
702 	/* Add the base image */
703 	if (base_path && write(fd, base_path, base_len) != base_len)
704 		goto error;
705 
706 	/* Extend to desired size, and add one refcount cluster */
707 	if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
708 		goto error;
709 
710 	/*
711 	 * Paranoia: if our disk image takes more than one cluster
712 	 * to refcount the initial image, fail.
713 	 */
714 	if (initsz/clustersz > clustersz/2) {
715 		errno = ERANGE;
716 		goto error;
717 	}
718 
719 	/* Add a refcount block, and refcount ourselves. */
720 	v = htobe64(initsz);
721 	if (pwrite(fd, &v, 8, refoff) != 8)
722 		goto error;
723 	for (i = 0; i < initsz/clustersz + 1; i++) {
724 		refs = htobe16(1);
725 		if (pwrite(fd, &refs, 2, initsz + 2*i) != 2)
726 			goto error;
727 	}
728 
729 	ret = close(fd);
730 	return (ret);
731 error:
732 	ret = errno;
733 	close(fd);
734 	unlink(imgfile_path);
735 	return (errno);
736 }
737