xref: /openbsd/usr.sbin/vmd/vioqcow2.c (revision 55cc5ba3)
1 /*	$OpenBSD: vioqcow2.c,v 1.14 2020/10/19 19:06:49 naddy Exp $	*/
2 
3 /*
4  * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/stat.h>
21 
22 #include <machine/vmmvar.h>
23 #include <dev/pci/pcireg.h>
24 
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29 #include <assert.h>
30 #include <libgen.h>
31 #include <err.h>
32 #include <errno.h>
33 
34 #include "vmd.h"
35 #include "vmm.h"
36 #include "virtio.h"
37 
/*
 * Flag bits stored in the top of L1/L2 table entries.
 * xlate() refuses entries with QCOW2_COMPRESSED set; entries with
 * QCOW2_INPLACE set are reported as writable in place, and mkcluster()
 * sets the bit on every entry it creates.
 */
#define QCOW2_COMPRESSED	0x4000000000000000ull
#define QCOW2_INPLACE		0x8000000000000000ull

/* Incompatible-feature header bits that qc2_open() tolerates. */
#define QCOW2_DIRTY		(1 << 0)
#define QCOW2_CORRUPT		(1 << 1)

/*
 * Same bit values as QCOW2_DIRTY / QCOW2_CORRUPT above.
 * NOTE(review): not referenced by the code in this file -- confirm
 * whether these are still needed.
 */
enum {
	ICFEATURE_DIRTY		= 1 << 0,
	ICFEATURE_CORRUPT	= 1 << 1,
};

/*
 * NOTE(review): presumably a bit of the header's autoclear feature
 * field; not referenced by the code in this file.
 */
enum {
	ACFEATURE_BITEXT	= 1 << 0,
};
52 
/*
 * On-disk qcow2 header, read from and written to offset 0 of the image.
 * Multi-byte fields are stored big-endian; users convert them with
 * be32toh()/be64toh() and htobe32()/htobe64().
 */
struct qcheader {
	char magic[4];		/* compared against VM_MAGIC_QCOW */
	uint32_t version;	/* qc2_open() accepts 2 or 3 */
	uint64_t backingoff;	/* file offset of the base image path */
	uint32_t backingsz;	/* byte length of that path, 0 if none */
	uint32_t clustershift;	/* cluster size is 1 << clustershift */
	uint64_t disksz;	/* virtual disk size in bytes */
	uint32_t cryptmethod;	/* virtio_qcow2_create() writes 0 */
	uint32_t l1sz;		/* number of 8 byte L1 entries */
	uint64_t l1off;		/* file offset of the L1 table */
	uint64_t refoff;	/* file offset of the refcount table */
	uint32_t refsz;		/* refcount table size, in clusters */
	uint32_t snapcount;	/* virtio_qcow2_create() writes 0 */
	uint64_t snapsz;	/* kept as qcdisk.snapoff by qc2_open() */
	/* v3 additions */
	uint64_t incompatfeatures;
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;	/* Bits = 1 << reforder */
	uint32_t headersz;	/* virtio_qcow2_create() writes sizeof(hdr) */
} __packed;
74 
/*
 * In-memory state for one open qcow2 image. Images with a base image
 * form a singly linked chain through 'base'. Sizes and offsets are the
 * header values converted to host byte order by qc2_open().
 */
struct qcdisk {
	/* read-locked by xlate(), write-locked by mkcluster() */
	pthread_rwlock_t lock;
	struct qcdisk *base;	/* next image in the chain, NULL if none */
	/* NOTE(review): never assigned by the code visible in this file */
	struct qcheader header;

	int       fd;		/* descriptor of the image file */
	uint64_t *l1;		/* L1 table, entries in host byte order */
	off_t     end;		/* file size; new clusters are placed here */
	off_t	  clustersz;	/* cluster size in bytes */
	off_t	  disksz; /* In bytes */
	uint32_t  cryptmethod;

	uint32_t l1sz;		/* number of entries in l1 */
	off_t	 l1off;		/* file offset of the on-disk L1 table */

	off_t	 refoff;	/* file offset of the refcount table */
	off_t	 refsz;		/* header refsz value */

	uint32_t nsnap;		/* header snapcount value */
	off_t	 snapoff;	/* header snapsz value */

	/* v3 features */
	uint64_t incompatfeatures;
	uint64_t autoclearfeatures;
	uint32_t refssz;
	uint32_t headersz;
};
102 
extern char *__progname;

/*
 * File-local helpers. qc2_pread, qc2_pwrite and qc2_close are installed
 * as the backing callbacks by virtio_qcow2_init(); the others implement
 * offset translation and cluster allocation on their behalf.
 */
static off_t xlate(struct qcdisk *, off_t, int *);
static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static void inc_refs(struct qcdisk *, off_t, int);
static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static int qc2_open(struct qcdisk *, int *, size_t);
static ssize_t qc2_pread(void *, char *, size_t, off_t);
static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
static void qc2_close(void *, int);
113 
114 /*
115  * Initializes a raw disk image backing file from an fd.
116  * Stores the number of 512 byte sectors in *szp,
117  * returning -1 for error, 0 for success.
118  *
119  * May open snapshot base images.
120  */
121 int
122 virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
123 {
124 	struct qcdisk *diskp;
125 
126 	diskp = malloc(sizeof(struct qcdisk));
127 	if (diskp == NULL)
128 		return -1;
129 	if (qc2_open(diskp, fd, nfd) == -1) {
130 		log_warnx("could not open qcow2 disk");
131 		return -1;
132 	}
133 	file->p = diskp;
134 	file->pread = qc2_pread;
135 	file->pwrite = qc2_pwrite;
136 	file->close = qc2_close;
137 	*szp = diskp->disksz;
138 	return 0;
139 }
140 
141 /*
142  * Return the path to the base image given a disk image.
143  * Called from vmctl.
144  */
145 ssize_t
146 virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
147 {
148 	char dpathbuf[PATH_MAX];
149 	char expanded[PATH_MAX];
150 	struct qcheader header;
151 	uint64_t backingoff;
152 	uint32_t backingsz;
153 	char *s = NULL;
154 
155 	if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
156 		log_warnx("short read on header");
157 		return -1;
158 	}
159 	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
160 		log_warnx("invalid magic numbers");
161 		return -1;
162 	}
163 	backingoff = be64toh(header.backingoff);
164 	backingsz = be32toh(header.backingsz);
165 	if (backingsz == 0)
166 		return 0;
167 
168 	if (backingsz >= npath - 1) {
169 		log_warnx("snapshot path too long");
170 		return -1;
171 	}
172 	if (pread(fd, path, backingsz, backingoff) != backingsz) {
173 		log_warnx("could not read snapshot base name");
174 		return -1;
175 	}
176 	path[backingsz] = '\0';
177 
178 	/*
179 	 * Relative paths should be interpreted relative to the disk image,
180 	 * rather than relative to the directory vmd happens to be running in,
181 	 * since this is the only userful interpretation.
182 	 */
183 	if (path[0] == '/') {
184 		if (realpath(path, expanded) == NULL ||
185 		    strlcpy(path, expanded, npath) >= npath) {
186 			log_warnx("unable to resolve %s", path);
187 			return -1;
188 		}
189 	} else {
190 		if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
191 		    sizeof(dpathbuf)) {
192 			log_warnx("path too long: %s", dpath);
193 			return -1;
194 		}
195 		s = dirname(dpathbuf);
196 		if (snprintf(expanded, sizeof(expanded),
197 		    "%s/%s", s, path) >= (int)sizeof(expanded)) {
198 			log_warnx("path too long: %s/%s", s, path);
199 			return -1;
200 		}
201 		if (npath < PATH_MAX ||
202 		    realpath(expanded, path) == NULL) {
203 			log_warnx("unable to resolve %s", path);
204 			return -1;
205 		}
206 	}
207 
208 	return strlen(path);
209 }
210 
211 static int
212 qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
213 {
214 	char basepath[PATH_MAX];
215 	struct stat st;
216 	struct qcheader header;
217 	uint64_t backingoff;
218 	uint32_t backingsz;
219 	off_t i;
220 	int version, fd;
221 
222 	pthread_rwlock_init(&disk->lock, NULL);
223 	fd = fds[0];
224 	disk->fd = fd;
225 	disk->base = NULL;
226 	disk->l1 = NULL;
227 
228 	if (pread(fd, &header, sizeof(header), 0) != sizeof(header))
229 		fatalx("short read on header");
230 	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
231 		fatalx("invalid magic numbers");
232 
233 	disk->clustersz		= (1ull << be32toh(header.clustershift));
234 	disk->disksz		= be64toh(header.disksz);
235 	disk->cryptmethod	= be32toh(header.cryptmethod);
236 	disk->l1sz		= be32toh(header.l1sz);
237 	disk->l1off		= be64toh(header.l1off);
238 	disk->refsz		= be32toh(header.refsz);
239 	disk->refoff		= be64toh(header.refoff);
240 	disk->nsnap		= be32toh(header.snapcount);
241 	disk->snapoff		= be64toh(header.snapsz);
242 
243 	/*
244 	 * The additional features here are defined as 0 in the v2 format,
245 	 * so as long as we clear the buffer before parsing, we don't need
246 	 * to check versions here.
247 	 */
248 	disk->incompatfeatures = be64toh(header.incompatfeatures);
249 	disk->autoclearfeatures = be64toh(header.autoclearfeatures);
250 	disk->refssz = be32toh(header.refsz);
251 	disk->headersz = be32toh(header.headersz);
252 
253 	/*
254 	 * We only know about the dirty or corrupt bits here.
255 	 */
256 	if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT))
257 		fatalx("unsupported features %llx",
258 		    disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
259 	if (be32toh(header.reforder) != 4)
260 		fatalx("unsupported refcount size\n");
261 
262 	disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
263 	if (!disk->l1)
264 		fatal("%s: could not allocate l1 table", __func__);
265 	if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
266 	    != 8 * disk->l1sz)
267 		fatalx("%s: unable to read qcow2 L1 table", __func__);
268 	for (i = 0; i < disk->l1sz; i++)
269 		disk->l1[i] = be64toh(disk->l1[i]);
270 	version = be32toh(header.version);
271 	if (version != 2 && version != 3)
272 		fatalx("%s: unknown qcow2 version %d", __func__, version);
273 
274 	backingoff = be64toh(header.backingoff);
275 	backingsz = be32toh(header.backingsz);
276 	if (backingsz != 0) {
277 		if (backingsz >= sizeof(basepath) - 1) {
278 			fatalx("%s: snapshot path too long", __func__);
279 		}
280 		if (pread(fd, basepath, backingsz, backingoff) != backingsz) {
281 			fatalx("%s: could not read snapshot base name",
282 			    __func__);
283 		}
284 		basepath[backingsz] = 0;
285 		if (nfd <= 1) {
286 			fatalx("%s: missing base image %s", __func__,
287 			    basepath);
288 		}
289 
290 
291 		disk->base = calloc(1, sizeof(struct qcdisk));
292 		if (!disk->base)
293 			fatal("%s: could not open %s", __func__, basepath);
294 		if (qc2_open(disk->base, fds + 1, nfd - 1) == -1)
295 			fatalx("%s: could not open %s", __func__, basepath);
296 		if (disk->base->clustersz != disk->clustersz)
297 			fatalx("%s: all disk parts must share clustersize",
298 			    __func__);
299 	}
300 	if (fstat(fd, &st) == -1)
301 		fatal("%s: unable to stat disk", __func__);
302 
303 	disk->end = st.st_size;
304 
305 	log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d",
306 	    __func__, version, disk->disksz, disk->end, disk->nsnap);
307 
308 	return 0;
309 }
310 
311 static ssize_t
312 qc2_pread(void *p, char *buf, size_t len, off_t off)
313 {
314 	struct qcdisk *disk, *d;
315 	off_t phys_off, end, cluster_off;
316 	ssize_t sz, rem;
317 
318 	disk = p;
319 	end = off + len;
320 	if (off < 0 || end > disk->disksz)
321 		return -1;
322 
323 	/* handle head chunk separately */
324 	rem = len;
325 	while (off != end) {
326 		for (d = disk; d; d = d->base)
327 			if ((phys_off = xlate(d, off, NULL)) > 0)
328 				break;
329 		/* Break out into chunks. This handles
330 		 * three cases:
331 		 *
332 		 *    |----+====|========|====+-----|
333 		 *
334 		 * Either we are at the start of the read,
335 		 * and the cluster has some leading bytes.
336 		 * This means that we are reading the tail
337 		 * of the cluster, and our size is:
338 		 *
339 		 * 	clustersz - (off % clustersz).
340 		 *
341 		 * Otherwise, we're reading the middle section.
342 		 * We're already aligned here, so we can just
343 		 * read the whole cluster size. Or we're at the
344 		 * tail, at which point we just want to read the
345 		 * remaining bytes.
346 		 */
347 		cluster_off = off % disk->clustersz;
348 		sz = disk->clustersz - cluster_off;
349 		if (sz > rem)
350 			sz = rem;
351 		/*
352 		 * If we're within the disk, but don't have backing bytes,
353 		 * just read back zeros.
354 		 */
355 		if (!d)
356 			bzero(buf, sz);
357 		else if (pread(d->fd, buf, sz, phys_off) != sz)
358 			return -1;
359 		off += sz;
360 		buf += sz;
361 		rem -= sz;
362 	}
363 	return len;
364 }
365 
366 ssize_t
367 qc2_pwrite(void *p, char *buf, size_t len, off_t off)
368 {
369 	struct qcdisk *disk, *d;
370 	off_t phys_off, cluster_off, end;
371 	ssize_t sz, rem;
372 	int inplace;
373 
374 	d = p;
375 	disk = p;
376 	inplace = 1;
377 	end = off + len;
378 	if (off < 0 || end > disk->disksz)
379 		return -1;
380 	rem = len;
381 	while (off != end) {
382 		/* See the read code for a summary of the computation */
383 		cluster_off = off % disk->clustersz;
384 		sz = disk->clustersz - cluster_off;
385 		if (sz > rem)
386 			sz = rem;
387 
388 		phys_off = xlate(disk, off, &inplace);
389 		if (phys_off == -1)
390 			return -1;
391 		/*
392 		 * If we couldn't find the cluster in the writable disk,
393 		 * see if it exists in the base image. If it does, we
394 		 * need to copy it before the write. The copy happens
395 		 * in the '!inplace' if clause below te search.
396 		 */
397 		if (phys_off == 0)
398 			for (d = disk->base; d; d = d->base)
399 				if ((phys_off = xlate(d, off, NULL)) > 0)
400 					break;
401 		if (!inplace || phys_off == 0)
402 			phys_off = mkcluster(disk, d, off, phys_off);
403 		if (phys_off == -1)
404 			return -1;
405 		if (phys_off < disk->clustersz)
406 			fatalx("%s: writing reserved cluster", __func__);
407 		if (pwrite(disk->fd, buf, sz, phys_off) != sz)
408 			return -1;
409 		off += sz;
410 		buf += sz;
411 		rem -= sz;
412 	}
413 	return len;
414 }
415 
416 static void
417 qc2_close(void *p, int stayopen)
418 {
419 	struct qcdisk *disk;
420 
421 	disk = p;
422 	if (disk->base)
423 		qc2_close(disk->base, stayopen);
424 	if (!stayopen)
425 		close(disk->fd);
426 	free(disk->l1);
427 	free(disk);
428 }
429 
430 /*
431  * Translates a virtual offset into an on-disk offset.
432  * Returns:
433  * 	-1 on error
434  * 	 0 on 'not found'
435  * 	>0 on found
436  */
437 static off_t
438 xlate(struct qcdisk *disk, off_t off, int *inplace)
439 {
440 	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
441 	uint64_t buf;
442 
443 
444 	/*
445 	 * Clear out inplace flag -- xlate misses should not
446 	 * be flagged as updatable in place. We will still
447 	 * return 0 from them, but this leaves less surprises
448 	 * in the API.
449 	 */
450 	if (inplace)
451 		*inplace = 0;
452 	pthread_rwlock_rdlock(&disk->lock);
453 	if (off < 0)
454 		goto err;
455 
456 	l2sz = disk->clustersz / 8;
457 	l1off = (off / disk->clustersz) / l2sz;
458 	if (l1off >= disk->l1sz)
459 		goto err;
460 
461 	l2tab = disk->l1[l1off];
462 	l2tab &= ~QCOW2_INPLACE;
463 	if (l2tab == 0) {
464 		pthread_rwlock_unlock(&disk->lock);
465 		return 0;
466 	}
467 	l2off = (off / disk->clustersz) % l2sz;
468 	pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
469 	cluster = be64toh(buf);
470 	/*
471 	 * cluster may be 0, but all future operations don't affect
472 	 * the return value.
473 	 */
474 	if (inplace)
475 		*inplace = !!(cluster & QCOW2_INPLACE);
476 	if (cluster & QCOW2_COMPRESSED)
477 		fatalx("%s: compressed clusters unsupported", __func__);
478 	pthread_rwlock_unlock(&disk->lock);
479 	clusteroff = 0;
480 	cluster &= ~QCOW2_INPLACE;
481 	if (cluster)
482 		clusteroff = off % disk->clustersz;
483 	return cluster + clusteroff;
484 err:
485 	pthread_rwlock_unlock(&disk->lock);
486 	return -1;
487 }
488 
/*
 * Allocates a new cluster on disk, creating a new L2 table
 * if needed. The cluster starts off with a refs of one,
 * and the writable bit set.
 *
 * Parameters:
 *  disk    : writable image that receives the new cluster
 *  base    : image holding the cluster's current contents; only used
 *            when src_phys > 0
 *  off     : virtual disk offset being written
 *  src_phys: on-disk offset in 'base' of the existing data to copy into
 *            the new cluster; values <= 0 mean there is nothing to copy
 *
 * Returns the physical address within the new cluster that corresponds
 * to 'off'. Every failure detected here is fatal.
 */
static off_t
mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
{
	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
	uint64_t buf;
	int fd;

	pthread_rwlock_wrlock(&disk->lock);

	cluster = -1;
	/* NOTE(review): fd is assigned but not used below. */
	fd = disk->fd;
	/* L1 entries always exist */
	l2sz = disk->clustersz / 8;
	l1off = off / (disk->clustersz * l2sz);
	if (l1off >= disk->l1sz)
		fatalx("l1 offset outside disk");

	/* Round the allocation point up to a cluster boundary. */
	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);

	l2tab = disk->l1[l1off];
	l2off = (off / disk->clustersz) % l2sz;
	/* We may need to create or clone an L2 entry to map the block */
	if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
		orig = l2tab & ~QCOW2_INPLACE;
		/* The new L2 table takes the next cluster at the end. */
		l2tab = disk->end;
		disk->end += disk->clustersz;
		if (ftruncate(disk->fd, disk->end) == -1)
			fatal("%s: ftruncate failed", __func__);

		/*
		 * If we translated, found a L2 entry, but it needed to
		 * be copied, copy it.
		 */
		if (orig != 0)
			copy_cluster(disk, disk, l2tab, orig);
		/* Update l1 -- we flush it later */
		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
		/* inc_refs() may itself move disk->end forward. */
		inc_refs(disk, l2tab, 1);
	}
	l2tab &= ~QCOW2_INPLACE;

	/* Grow the disk */
	if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
		fatalx("%s: could not grow disk", __func__);
	/* Seed the new cluster with the existing data, if there is any. */
	if (src_phys > 0)
		copy_cluster(disk, base, disk->end, src_phys);
	cluster = disk->end;
	disk->end += disk->clustersz;
	/* Point the L2 entry for 'off' at the new cluster. */
	buf = htobe64(cluster | QCOW2_INPLACE);
	if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
		fatalx("%s: could not write cluster", __func__);

	/* TODO: lazily sync: currently VMD doesn't close things */
	buf = htobe64(disk->l1[l1off]);
	if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
		fatalx("%s: could not write l1", __func__);
	inc_refs(disk, cluster, 1);

	pthread_rwlock_unlock(&disk->lock);
	clusteroff = off % disk->clustersz;
	/* The first cluster of the file holds the header. */
	if (cluster + clusteroff < disk->clustersz)
		fatalx("write would clobber header");
	return cluster + clusteroff;
}
561 
562 /* Copies a cluster containing src to dst. Src and dst need not be aligned. */
563 static void
564 copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
565 {
566 	char *scratch;
567 
568 	scratch = malloc(disk->clustersz);
569 	if (!scratch)
570 		fatal("out of memory");
571 	src &= ~(disk->clustersz - 1);
572 	dst &= ~(disk->clustersz - 1);
573 	if (pread(base->fd, scratch, disk->clustersz, src) == -1)
574 		fatal("%s: could not read cluster", __func__);
575 	if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
576 		fatal("%s: could not write cluster", __func__);
577 	free(scratch);
578 }
579 
580 static void
581 inc_refs(struct qcdisk *disk, off_t off, int newcluster)
582 {
583 	off_t l1off, l1idx, l2idx, l2cluster;
584 	size_t nper;
585 	uint16_t refs;
586 	uint64_t buf;
587 
588 	off &= ~QCOW2_INPLACE;
589 	nper = disk->clustersz / 2;
590 	l1idx = (off / disk->clustersz) / nper;
591 	l2idx = (off / disk->clustersz) % nper;
592 	l1off = disk->refoff + 8 * l1idx;
593 	if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
594 		fatal("could not read refs");
595 
596 	l2cluster = be64toh(buf);
597 	if (l2cluster == 0) {
598 		l2cluster = disk->end;
599 		disk->end += disk->clustersz;
600 		if (ftruncate(disk->fd, disk->end) < 0)
601 			fatal("%s: failed to allocate ref block", __func__);
602 		buf = htobe64(l2cluster);
603 		if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8)
604 			fatal("%s: failed to write ref block", __func__);
605 	}
606 
607 	refs = 1;
608 	if (!newcluster) {
609 		if (pread(disk->fd, &refs, sizeof(refs),
610 		    l2cluster + 2 * l2idx) != 2)
611 			fatal("could not read ref cluster");
612 		refs = be16toh(refs) + 1;
613 	}
614 	refs = htobe16(refs);
615 	if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2)
616 		fatal("%s: could not write ref block", __func__);
617 }
618 
619 /*
620  * virtio_qcow2_create
621  *
622  * Create an empty qcow2 imagefile with the specified path and size.
623  *
624  * Parameters:
625  *  imgfile_path: path to the image file to create
626  *  imgsize     : size of the image file to create (in MB)
627  *
628  * Return:
629  *  EEXIST: The requested image file already exists
630  *  0     : Image file successfully created
631  *  Exxxx : Various other Exxxx errno codes due to other I/O errors
632  */
633 int
634 virtio_qcow2_create(const char *imgfile_path,
635     const char *base_path, long imgsize)
636 {
637 	struct qcheader {
638 		char magic[4];
639 		uint32_t version;
640 		uint64_t backingoff;
641 		uint32_t backingsz;
642 		uint32_t clustershift;
643 		uint64_t disksz;
644 		uint32_t cryptmethod;
645 		uint32_t l1sz;
646 		uint64_t l1off;
647 		uint64_t refoff;
648 		uint32_t refsz;
649 		uint32_t snapcount;
650 		uint64_t snapsz;
651 		/* v3 additions */
652 		uint64_t incompatfeatures;
653 		uint64_t compatfeatures;
654 		uint64_t autoclearfeatures;
655 		uint32_t reforder;
656 		uint32_t headersz;
657 	} __packed hdr, basehdr;
658 	int fd, ret;
659 	ssize_t base_len;
660 	uint64_t l1sz, refsz, disksz, initsz, clustersz;
661 	uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
662 	uint16_t refs;
663 
664 	disksz = 1024 * 1024 * imgsize;
665 
666 	if (base_path) {
667 		fd = open(base_path, O_RDONLY);
668 		if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
669 			err(1, "failure to read base image header");
670 		close(fd);
671 		if (strncmp(basehdr.magic,
672 		    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
673 			errx(1, "base image is not a qcow2 file");
674 		if (!disksz)
675 			disksz = betoh64(basehdr.disksz);
676 		else if (disksz != betoh64(basehdr.disksz))
677 			errx(1, "base size does not match requested size");
678 	}
679 	if (!base_path && !disksz)
680 		errx(1, "missing disk size");
681 
682 	clustersz = (1<<16);
683 	l1off = ALIGNSZ(sizeof(hdr), clustersz);
684 
685 	l1entrysz = clustersz * clustersz / 8;
686 	l1sz = (disksz + l1entrysz - 1) / l1entrysz;
687 
688 	refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
689 	refentrysz = clustersz * clustersz * clustersz / 2;
690 	refsz = (disksz + refentrysz - 1) / refentrysz;
691 
692 	initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
693 	base_len = base_path ? strlen(base_path) : 0;
694 
695 	memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
696 	hdr.version		= htobe32(3);
697 	hdr.backingoff		= htobe64(base_path ? sizeof(hdr) : 0);
698 	hdr.backingsz		= htobe32(base_len);
699 	hdr.clustershift	= htobe32(16);
700 	hdr.disksz		= htobe64(disksz);
701 	hdr.cryptmethod		= htobe32(0);
702 	hdr.l1sz		= htobe32(l1sz);
703 	hdr.l1off		= htobe64(l1off);
704 	hdr.refoff		= htobe64(refoff);
705 	hdr.refsz		= htobe32(refsz);
706 	hdr.snapcount		= htobe32(0);
707 	hdr.snapsz		= htobe64(0);
708 	hdr.incompatfeatures	= htobe64(0);
709 	hdr.compatfeatures	= htobe64(0);
710 	hdr.autoclearfeatures	= htobe64(0);
711 	hdr.reforder		= htobe32(4);
712 	hdr.headersz		= htobe32(sizeof(hdr));
713 
714 	/* Refuse to overwrite an existing image */
715 	fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
716 	    S_IRUSR | S_IWUSR);
717 	if (fd == -1)
718 		return (errno);
719 
720 	/* Write out the header */
721 	if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
722 		goto error;
723 
724 	/* Add the base image */
725 	if (base_path && write(fd, base_path, base_len) != base_len)
726 		goto error;
727 
728 	/* Extend to desired size, and add one refcount cluster */
729 	if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
730 		goto error;
731 
732 	/*
733 	 * Paranoia: if our disk image takes more than one cluster
734 	 * to refcount the initial image, fail.
735 	 */
736 	if (initsz/clustersz > clustersz/2) {
737 		errno = ERANGE;
738 		goto error;
739 	}
740 
741 	/* Add a refcount block, and refcount ourselves. */
742 	v = htobe64(initsz);
743 	if (pwrite(fd, &v, 8, refoff) != 8)
744 		goto error;
745 	for (i = 0; i < initsz/clustersz + 1; i++) {
746 		refs = htobe16(1);
747 		if (pwrite(fd, &refs, 2, initsz + 2*i) != 2)
748 			goto error;
749 	}
750 
751 	ret = close(fd);
752 	return (ret);
753 error:
754 	ret = errno;
755 	close(fd);
756 	unlink(imgfile_path);
757 	return (errno);
758 }
759