xref: /openbsd/usr.sbin/vmd/vioqcow2.c (revision 3bef86f7)
1 /*	$OpenBSD: vioqcow2.c,v 1.24 2023/09/14 15:25:43 dv Exp $	*/
2 
3 /*
4  * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/stat.h>
21 
22 #include <dev/pci/pcireg.h>
23 
24 #include <assert.h>
25 #include <err.h>
26 #include <errno.h>
27 #include <fcntl.h>
28 #include <libgen.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <unistd.h>
32 
33 #include "virtio.h"
34 
/*
 * Flag bits stored in the top of the 64-bit L1/L2 table entries.
 * xlate() refuses L2 entries that have QCOW2_COMPRESSED set and reports
 * QCOW2_INPLACE to its caller through the 'inplace' out-parameter;
 * mkcluster() sets QCOW2_INPLACE on every entry it writes.
 */
#define QCOW2_COMPRESSED	0x4000000000000000ull
#define QCOW2_INPLACE		0x8000000000000000ull

/*
 * Bits of the header's incompatfeatures field that qc2_open() tolerates;
 * any other bit set there is fatal.
 */
#define QCOW2_DIRTY		(1 << 0)
#define QCOW2_CORRUPT		(1 << 1)

/* Same bit values as QCOW2_DIRTY / QCOW2_CORRUPT, in enum form. */
enum {
	ICFEATURE_DIRTY		= 1 << 0,
	ICFEATURE_CORRUPT	= 1 << 1,
};

/*
 * NOTE(review): presumably a bit of the header's autoclearfeatures
 * field; it is not referenced by the code visible in this file.
 */
enum {
	ACFEATURE_BITEXT	= 1 << 0,
};
49 
/*
 * On-disk qcow2 header as read from offset 0 of the image file.
 * Multi-byte fields are converted with be32toh()/be64toh() before use
 * and written with htobe32()/htobe64() by virtio_qcow2_create().
 */
struct qcheader {
	char magic[4];		/* compared against VM_MAGIC_QCOW */
	uint32_t version;	/* qc2_open() accepts 2 and 3 */
	uint64_t backingoff;	/* file offset of the base image path */
	uint32_t backingsz;	/* length of that path; 0 means no base */
	uint32_t clustershift;	/* cluster size is 1 << clustershift */
	uint64_t disksz;	/* size of the virtual disk */
	uint32_t cryptmethod;
	uint32_t l1sz;		/* number of 8-byte L1 table entries */
	uint64_t l1off;		/* file offset of the L1 table */
	uint64_t refoff;	/* file offset of the refcount table */
	uint32_t refsz;
	uint32_t snapcount;
	uint64_t snapsz;	/* NOTE(review): loaded into qcdisk.snapoff */
	/* v3 additions */
	uint64_t incompatfeatures;
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;	/* Bits = 1 << reforder */
	uint32_t headersz;
} __packed;
71 
/*
 * In-memory state for one open qcow2 image. An image with a backing
 * file links to the state of that file through 'base'.
 */
struct qcdisk {
	/* Held for reading by xlate() and for writing by mkcluster(). */
	pthread_rwlock_t lock;
	/* Backing image, or NULL when there is none. */
	struct qcdisk *base;
	/* NOTE(review): not assigned anywhere in the code visible here. */
	struct qcheader header;

	int       fd;
	/* L1 table, converted to host byte order by qc2_open(). */
	uint64_t *l1;
	/* File size at open time; advanced as clusters are appended. */
	off_t     end;
	off_t	  clustersz;
	off_t	  disksz; /* In bytes */
	uint32_t  cryptmethod;

	/* Entry count and file offset of the L1 table. */
	uint32_t l1sz;
	off_t	 l1off;

	/* File offset of the refcount table, and the header's refsz. */
	off_t	 refoff;
	off_t	 refsz;

	/* Loaded from the header's snapcount and snapsz fields. */
	uint32_t nsnap;
	off_t	 snapoff;

	/* v3 features */
	uint64_t incompatfeatures;
	uint64_t autoclearfeatures;
	/*
	 * NOTE(review): qc2_open() loads this from header.refsz, not
	 * header.reforder -- confirm which one was intended.
	 */
	uint32_t refssz;
	uint32_t headersz;
};
99 
/* NOTE(review): not referenced by the code visible in this file. */
extern char *__progname;

/* Forward declarations for the file-local helpers defined below. */
static off_t xlate(struct qcdisk *, off_t, int *);
static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static void inc_refs(struct qcdisk *, off_t, int);
static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static int qc2_open(struct qcdisk *, int *, size_t);
static ssize_t qc2_pread(void *, char *, size_t, off_t);
static ssize_t qc2_preadv(void *, struct iovec *, int, off_t);
static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
static ssize_t qc2_pwritev(void *, struct iovec *, int, off_t);
static void qc2_close(void *, int);
112 
113 /*
114  * Initializes a raw disk image backing file from an fd. Stores the
115  * number of bytes in *szp, returning -1 for error, 0 for success.
116  *
117  * May open snapshot base images.
118  */
119 int
120 virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
121 {
122 	struct qcdisk *diskp;
123 
124 	diskp = malloc(sizeof(struct qcdisk));
125 	if (diskp == NULL)
126 		return -1;
127 	if (qc2_open(diskp, fd, nfd) == -1) {
128 		log_warnx("could not open qcow2 disk");
129 		return -1;
130 	}
131 	file->p = diskp;
132 	file->pread = qc2_pread;
133 	file->preadv = qc2_preadv;
134 	file->pwrite = qc2_pwrite;
135 	file->pwritev = qc2_pwritev;
136 	file->close = qc2_close;
137 	*szp = diskp->disksz;
138 	return 0;
139 }
140 
141 /*
142  * Return the path to the base image given a disk image.
143  * Called from vmctl.
144  */
145 ssize_t
146 virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
147 {
148 	char dpathbuf[PATH_MAX];
149 	char expanded[PATH_MAX];
150 	struct qcheader header;
151 	uint64_t backingoff;
152 	uint32_t backingsz;
153 	char *s = NULL;
154 
155 	if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
156 		log_warnx("short read on header");
157 		return -1;
158 	}
159 	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
160 		log_warnx("invalid magic numbers");
161 		return -1;
162 	}
163 	backingoff = be64toh(header.backingoff);
164 	backingsz = be32toh(header.backingsz);
165 	if (backingsz == 0)
166 		return 0;
167 
168 	if (backingsz >= npath - 1) {
169 		log_warnx("snapshot path too long");
170 		return -1;
171 	}
172 	if (pread(fd, path, backingsz, backingoff) != backingsz) {
173 		log_warnx("could not read snapshot base name");
174 		return -1;
175 	}
176 	path[backingsz] = '\0';
177 
178 	/*
179 	 * Relative paths should be interpreted relative to the disk image,
180 	 * rather than relative to the directory vmd happens to be running in,
181 	 * since this is the only useful interpretation.
182 	 */
183 	if (path[0] == '/') {
184 		if (realpath(path, expanded) == NULL ||
185 		    strlcpy(path, expanded, npath) >= npath) {
186 			log_warnx("unable to resolve %s", path);
187 			return -1;
188 		}
189 	} else {
190 		if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
191 		    sizeof(dpathbuf)) {
192 			log_warnx("path too long: %s", dpath);
193 			return -1;
194 		}
195 		s = dirname(dpathbuf);
196 		if (snprintf(expanded, sizeof(expanded),
197 		    "%s/%s", s, path) >= (int)sizeof(expanded)) {
198 			log_warnx("path too long: %s/%s", s, path);
199 			return -1;
200 		}
201 		if (npath < PATH_MAX ||
202 		    realpath(expanded, path) == NULL) {
203 			log_warnx("unable to resolve %s", path);
204 			return -1;
205 		}
206 	}
207 
208 	return strlen(path);
209 }
210 
211 static int
212 qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
213 {
214 	char basepath[PATH_MAX];
215 	struct stat st;
216 	struct qcheader header;
217 	uint64_t backingoff;
218 	uint32_t backingsz;
219 	off_t i;
220 	int version, fd;
221 
222 	pthread_rwlock_init(&disk->lock, NULL);
223 	fd = fds[0];
224 	disk->fd = fd;
225 	disk->base = NULL;
226 	disk->l1 = NULL;
227 
228 	if (pread(fd, &header, sizeof(header), 0) != sizeof(header))
229 		fatalx("short read on header");
230 	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
231 		fatalx("invalid magic numbers");
232 
233 	disk->clustersz		= (1ull << be32toh(header.clustershift));
234 	disk->disksz		= be64toh(header.disksz);
235 	disk->cryptmethod	= be32toh(header.cryptmethod);
236 	disk->l1sz		= be32toh(header.l1sz);
237 	disk->l1off		= be64toh(header.l1off);
238 	disk->refsz		= be32toh(header.refsz);
239 	disk->refoff		= be64toh(header.refoff);
240 	disk->nsnap		= be32toh(header.snapcount);
241 	disk->snapoff		= be64toh(header.snapsz);
242 
243 	/*
244 	 * The additional features here are defined as 0 in the v2 format,
245 	 * so as long as we clear the buffer before parsing, we don't need
246 	 * to check versions here.
247 	 */
248 	disk->incompatfeatures = be64toh(header.incompatfeatures);
249 	disk->autoclearfeatures = be64toh(header.autoclearfeatures);
250 	disk->refssz = be32toh(header.refsz);
251 	disk->headersz = be32toh(header.headersz);
252 
253 	/*
254 	 * We only know about the dirty or corrupt bits here.
255 	 */
256 	if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT))
257 		fatalx("unsupported features %llx",
258 		    disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
259 	if (be32toh(header.reforder) != 4)
260 		fatalx("unsupported refcount size\n");
261 
262 	disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
263 	if (!disk->l1)
264 		fatal("%s: could not allocate l1 table", __func__);
265 	if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
266 	    != 8 * disk->l1sz)
267 		fatalx("%s: unable to read qcow2 L1 table", __func__);
268 	for (i = 0; i < disk->l1sz; i++)
269 		disk->l1[i] = be64toh(disk->l1[i]);
270 	version = be32toh(header.version);
271 	if (version != 2 && version != 3)
272 		fatalx("%s: unknown qcow2 version %d", __func__, version);
273 
274 	backingoff = be64toh(header.backingoff);
275 	backingsz = be32toh(header.backingsz);
276 	if (backingsz != 0) {
277 		if (backingsz >= sizeof(basepath) - 1) {
278 			fatalx("%s: snapshot path too long", __func__);
279 		}
280 		if (pread(fd, basepath, backingsz, backingoff) != backingsz) {
281 			fatalx("%s: could not read snapshot base name",
282 			    __func__);
283 		}
284 		basepath[backingsz] = 0;
285 		if (nfd <= 1) {
286 			fatalx("%s: missing base image %s", __func__,
287 			    basepath);
288 		}
289 
290 
291 		disk->base = calloc(1, sizeof(struct qcdisk));
292 		if (!disk->base)
293 			fatal("%s: could not open %s", __func__, basepath);
294 		if (qc2_open(disk->base, fds + 1, nfd - 1) == -1)
295 			fatalx("%s: could not open %s", __func__, basepath);
296 		if (disk->base->clustersz != disk->clustersz)
297 			fatalx("%s: all disk parts must share clustersize",
298 			    __func__);
299 	}
300 	if (fstat(fd, &st) == -1)
301 		fatal("%s: unable to stat disk", __func__);
302 
303 	disk->end = st.st_size;
304 
305 	log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d",
306 	    __func__, version, disk->disksz, disk->end, disk->nsnap);
307 
308 	return 0;
309 }
310 
/*
 * Vectored read: fills each of the cnt buffers in iov in turn from
 * consecutive disk offsets starting at offset, using qc2_pread().
 * Returns the total number of bytes read, or -1 as soon as one of the
 * individual reads fails.
 */
static ssize_t
qc2_preadv(void *p, struct iovec *iov, int cnt, off_t offset)
{
	struct iovec *vec;
	ssize_t done = 0, n = 0;
	int idx = 0;

	while (idx < cnt) {
		vec = &iov[idx++];
		/* The next buffer continues where the previous one ended. */
		n = qc2_pread(p, vec->iov_base, vec->iov_len, offset + done);
		if (n == -1)
			return (n);
		done += n;
	}

	return (done);
}
328 
329 static ssize_t
330 qc2_pread(void *p, char *buf, size_t len, off_t off)
331 {
332 	struct qcdisk *disk, *d;
333 	off_t phys_off, end, cluster_off;
334 	ssize_t sz, rem;
335 
336 	disk = p;
337 	end = off + len;
338 	if (off < 0 || end > disk->disksz)
339 		return -1;
340 
341 	/* handle head chunk separately */
342 	rem = len;
343 	while (off != end) {
344 		for (d = disk; d; d = d->base)
345 			if ((phys_off = xlate(d, off, NULL)) > 0)
346 				break;
347 		/* Break out into chunks. This handles
348 		 * three cases:
349 		 *
350 		 *    |----+====|========|====+-----|
351 		 *
352 		 * Either we are at the start of the read,
353 		 * and the cluster has some leading bytes.
354 		 * This means that we are reading the tail
355 		 * of the cluster, and our size is:
356 		 *
357 		 * 	clustersz - (off % clustersz).
358 		 *
359 		 * Otherwise, we're reading the middle section.
360 		 * We're already aligned here, so we can just
361 		 * read the whole cluster size. Or we're at the
362 		 * tail, at which point we just want to read the
363 		 * remaining bytes.
364 		 */
365 		cluster_off = off % disk->clustersz;
366 		sz = disk->clustersz - cluster_off;
367 		if (sz > rem)
368 			sz = rem;
369 		/*
370 		 * If we're within the disk, but don't have backing bytes,
371 		 * just read back zeros.
372 		 */
373 		if (!d)
374 			bzero(buf, sz);
375 		else if (pread(d->fd, buf, sz, phys_off) != sz)
376 			return -1;
377 		off += sz;
378 		buf += sz;
379 		rem -= sz;
380 	}
381 	return len;
382 }
383 
/*
 * Vectored write: writes each of the cnt buffers in iov in turn to
 * consecutive disk offsets starting at offset, using qc2_pwrite().
 * Returns the total number of bytes written, or -1 as soon as one of
 * the individual writes fails.
 */
static ssize_t
qc2_pwritev(void *p, struct iovec *iov, int cnt, off_t offset)
{
	struct iovec *vec;
	ssize_t done = 0, n = 0;
	int idx = 0;

	while (idx < cnt) {
		vec = &iov[idx++];
		/* The next buffer continues where the previous one ended. */
		n = qc2_pwrite(p, vec->iov_base, vec->iov_len, offset + done);
		if (n == -1)
			return (n);
		done += n;
	}

	return (done);
}
401 
402 static ssize_t
403 qc2_pwrite(void *p, char *buf, size_t len, off_t off)
404 {
405 	struct qcdisk *disk, *d;
406 	off_t phys_off, cluster_off, end;
407 	ssize_t sz, rem;
408 	int inplace;
409 
410 	d = p;
411 	disk = p;
412 	inplace = 1;
413 	end = off + len;
414 	if (off < 0 || end > disk->disksz)
415 		return -1;
416 	rem = len;
417 	while (off != end) {
418 		/* See the read code for a summary of the computation */
419 		cluster_off = off % disk->clustersz;
420 		sz = disk->clustersz - cluster_off;
421 		if (sz > rem)
422 			sz = rem;
423 
424 		phys_off = xlate(disk, off, &inplace);
425 		if (phys_off == -1)
426 			return -1;
427 		/*
428 		 * If we couldn't find the cluster in the writable disk,
429 		 * see if it exists in the base image. If it does, we
430 		 * need to copy it before the write. The copy happens
431 		 * in the '!inplace' if clause below te search.
432 		 */
433 		if (phys_off == 0)
434 			for (d = disk->base; d; d = d->base)
435 				if ((phys_off = xlate(d, off, NULL)) > 0)
436 					break;
437 		if (!inplace || phys_off == 0)
438 			phys_off = mkcluster(disk, d, off, phys_off);
439 		if (phys_off == -1)
440 			return -1;
441 		if (phys_off < disk->clustersz)
442 			fatalx("%s: writing reserved cluster", __func__);
443 		if (pwrite(disk->fd, buf, sz, phys_off) != sz)
444 			return -1;
445 		off += sz;
446 		buf += sz;
447 		rem -= sz;
448 	}
449 	return len;
450 }
451 
452 static void
453 qc2_close(void *p, int stayopen)
454 {
455 	struct qcdisk *disk;
456 
457 	disk = p;
458 	if (disk->base)
459 		qc2_close(disk->base, stayopen);
460 	if (!stayopen)
461 		close(disk->fd);
462 	free(disk->l1);
463 	free(disk);
464 }
465 
466 /*
467  * Translates a virtual offset into an on-disk offset.
468  * Returns:
469  * 	-1 on error
470  * 	 0 on 'not found'
471  * 	>0 on found
472  */
473 static off_t
474 xlate(struct qcdisk *disk, off_t off, int *inplace)
475 {
476 	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
477 	uint64_t buf;
478 
479 
480 	/*
481 	 * Clear out inplace flag -- xlate misses should not
482 	 * be flagged as updatable in place. We will still
483 	 * return 0 from them, but this leaves less surprises
484 	 * in the API.
485 	 */
486 	if (inplace)
487 		*inplace = 0;
488 	pthread_rwlock_rdlock(&disk->lock);
489 	if (off < 0)
490 		goto err;
491 
492 	l2sz = disk->clustersz / 8;
493 	l1off = (off / disk->clustersz) / l2sz;
494 	if (l1off >= disk->l1sz)
495 		goto err;
496 
497 	l2tab = disk->l1[l1off];
498 	l2tab &= ~QCOW2_INPLACE;
499 	if (l2tab == 0) {
500 		pthread_rwlock_unlock(&disk->lock);
501 		return 0;
502 	}
503 	l2off = (off / disk->clustersz) % l2sz;
504 	pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
505 	cluster = be64toh(buf);
506 	/*
507 	 * cluster may be 0, but all future operations don't affect
508 	 * the return value.
509 	 */
510 	if (inplace)
511 		*inplace = !!(cluster & QCOW2_INPLACE);
512 	if (cluster & QCOW2_COMPRESSED)
513 		fatalx("%s: compressed clusters unsupported", __func__);
514 	pthread_rwlock_unlock(&disk->lock);
515 	clusteroff = 0;
516 	cluster &= ~QCOW2_INPLACE;
517 	if (cluster)
518 		clusteroff = off % disk->clustersz;
519 	return cluster + clusteroff;
520 err:
521 	pthread_rwlock_unlock(&disk->lock);
522 	return -1;
523 }
524 
525 /*
526  * Allocates a new cluster on disk, creating a new L2 table
527  * if needed. The cluster starts off with a refs of one,
528  * and the writable bit set.
529  *
530  * Returns -1 on error, and the physical address within the
531  * cluster of the write offset if it exists.
532  */
533 static off_t
534 mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
535 {
536 	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
537 	uint64_t buf;
538 
539 	pthread_rwlock_wrlock(&disk->lock);
540 
541 	cluster = -1;
542 	/* L1 entries always exist */
543 	l2sz = disk->clustersz / 8;
544 	l1off = off / (disk->clustersz * l2sz);
545 	if (l1off >= disk->l1sz)
546 		fatalx("l1 offset outside disk");
547 
548 	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);
549 
550 	l2tab = disk->l1[l1off];
551 	l2off = (off / disk->clustersz) % l2sz;
552 	/* We may need to create or clone an L2 entry to map the block */
553 	if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
554 		orig = l2tab & ~QCOW2_INPLACE;
555 		l2tab = disk->end;
556 		disk->end += disk->clustersz;
557 		if (ftruncate(disk->fd, disk->end) == -1)
558 			fatal("%s: ftruncate failed", __func__);
559 
560 		/*
561 		 * If we translated, found a L2 entry, but it needed to
562 		 * be copied, copy it.
563 		 */
564 		if (orig != 0)
565 			copy_cluster(disk, disk, l2tab, orig);
566 		/* Update l1 -- we flush it later */
567 		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
568 		inc_refs(disk, l2tab, 1);
569 	}
570 	l2tab &= ~QCOW2_INPLACE;
571 
572 	/* Grow the disk */
573 	if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
574 		fatal("%s: could not grow disk", __func__);
575 	if (src_phys > 0)
576 		copy_cluster(disk, base, disk->end, src_phys);
577 	cluster = disk->end;
578 	disk->end += disk->clustersz;
579 	buf = htobe64(cluster | QCOW2_INPLACE);
580 	if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
581 		fatalx("%s: could not write cluster", __func__);
582 
583 	/* TODO: lazily sync: currently VMD doesn't close things */
584 	buf = htobe64(disk->l1[l1off]);
585 	if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
586 		fatalx("%s: could not write l1", __func__);
587 	inc_refs(disk, cluster, 1);
588 
589 	pthread_rwlock_unlock(&disk->lock);
590 	clusteroff = off % disk->clustersz;
591 	if (cluster + clusteroff < disk->clustersz)
592 		fatalx("write would clobber header");
593 	return cluster + clusteroff;
594 }
595 
596 /* Copies a cluster containing src to dst. Src and dst need not be aligned. */
597 static void
598 copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
599 {
600 	char *scratch;
601 
602 	scratch = malloc(disk->clustersz);
603 	if (!scratch)
604 		fatal("out of memory");
605 	src &= ~(disk->clustersz - 1);
606 	dst &= ~(disk->clustersz - 1);
607 	if (pread(base->fd, scratch, disk->clustersz, src) == -1)
608 		fatal("%s: could not read cluster", __func__);
609 	if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
610 		fatal("%s: could not write cluster", __func__);
611 	free(scratch);
612 }
613 
614 static void
615 inc_refs(struct qcdisk *disk, off_t off, int newcluster)
616 {
617 	off_t l1off, l1idx, l2idx, l2cluster;
618 	size_t nper;
619 	uint16_t refs;
620 	uint64_t buf;
621 
622 	off &= ~QCOW2_INPLACE;
623 	nper = disk->clustersz / 2;
624 	l1idx = (off / disk->clustersz) / nper;
625 	l2idx = (off / disk->clustersz) % nper;
626 	l1off = disk->refoff + 8 * l1idx;
627 	if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
628 		fatal("could not read refs");
629 
630 	l2cluster = be64toh(buf);
631 	if (l2cluster == 0) {
632 		l2cluster = disk->end;
633 		disk->end += disk->clustersz;
634 		if (ftruncate(disk->fd, disk->end) < 0)
635 			fatal("%s: failed to allocate ref block", __func__);
636 		buf = htobe64(l2cluster);
637 		if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8)
638 			fatal("%s: failed to write ref block", __func__);
639 	}
640 
641 	refs = 1;
642 	if (!newcluster) {
643 		if (pread(disk->fd, &refs, sizeof(refs),
644 		    l2cluster + 2 * l2idx) != 2)
645 			fatal("could not read ref cluster");
646 		refs = be16toh(refs) + 1;
647 	}
648 	refs = htobe16(refs);
649 	if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2)
650 		fatal("%s: could not write ref block", __func__);
651 }
652 
653 /*
654  * virtio_qcow2_create
655  *
656  * Create an empty qcow2 imagefile with the specified path and size.
657  *
658  * Parameters:
659  *  imgfile_path: path to the image file to create
660  *  imgsize     : size of the image file to create (in bytes)
661  *
662  * Return:
663  *  EEXIST: The requested image file already exists
664  *  0     : Image file successfully created
665  *  Exxxx : Various other Exxxx errno codes due to other I/O errors
666  */
667 int
668 virtio_qcow2_create(const char *imgfile_path,
669     const char *base_path, uint64_t disksz)
670 {
671 	struct qcheader hdr, basehdr;
672 	int fd, ret;
673 	ssize_t base_len;
674 	uint64_t l1sz, refsz, initsz, clustersz;
675 	uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
676 	uint16_t refs;
677 
678 	if (base_path) {
679 		fd = open(base_path, O_RDONLY);
680 		if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
681 			errx(1, "failure to read base image header");
682 		close(fd);
683 		if (strncmp(basehdr.magic,
684 		    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
685 			errx(1, "base image is not a qcow2 file");
686 		if (!disksz)
687 			disksz = betoh64(basehdr.disksz);
688 		else if (disksz != betoh64(basehdr.disksz))
689 			errx(1, "base size does not match requested size");
690 	}
691 	if (!base_path && !disksz)
692 		errx(1, "missing disk size");
693 
694 	clustersz = (1<<16);
695 	l1off = ALIGNSZ(sizeof(hdr), clustersz);
696 
697 	l1entrysz = clustersz * clustersz / 8;
698 	l1sz = (disksz + l1entrysz - 1) / l1entrysz;
699 
700 	refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
701 	refentrysz = clustersz * clustersz * clustersz / 2;
702 	refsz = (disksz + refentrysz - 1) / refentrysz;
703 
704 	initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
705 	base_len = base_path ? strlen(base_path) : 0;
706 
707 	memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
708 	hdr.version		= htobe32(3);
709 	hdr.backingoff		= htobe64(base_path ? sizeof(hdr) : 0);
710 	hdr.backingsz		= htobe32(base_len);
711 	hdr.clustershift	= htobe32(16);
712 	hdr.disksz		= htobe64(disksz);
713 	hdr.cryptmethod		= htobe32(0);
714 	hdr.l1sz		= htobe32(l1sz);
715 	hdr.l1off		= htobe64(l1off);
716 	hdr.refoff		= htobe64(refoff);
717 	hdr.refsz		= htobe32(refsz);
718 	hdr.snapcount		= htobe32(0);
719 	hdr.snapsz		= htobe64(0);
720 	hdr.incompatfeatures	= htobe64(0);
721 	hdr.compatfeatures	= htobe64(0);
722 	hdr.autoclearfeatures	= htobe64(0);
723 	hdr.reforder		= htobe32(4);
724 	hdr.headersz		= htobe32(sizeof(hdr));
725 
726 	/* Refuse to overwrite an existing image */
727 	fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
728 	    S_IRUSR | S_IWUSR);
729 	if (fd == -1)
730 		return (errno);
731 
732 	/* Write out the header */
733 	if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
734 		goto error;
735 
736 	/* Add the base image */
737 	if (base_path && write(fd, base_path, base_len) != base_len)
738 		goto error;
739 
740 	/* Extend to desired size, and add one refcount cluster */
741 	if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
742 		goto error;
743 
744 	/*
745 	 * Paranoia: if our disk image takes more than one cluster
746 	 * to refcount the initial image, fail.
747 	 */
748 	if (initsz/clustersz > clustersz/2) {
749 		errno = ERANGE;
750 		goto error;
751 	}
752 
753 	/* Add a refcount block, and refcount ourselves. */
754 	v = htobe64(initsz);
755 	if (pwrite(fd, &v, 8, refoff) != 8)
756 		goto error;
757 	for (i = 0; i < initsz/clustersz + 1; i++) {
758 		refs = htobe16(1);
759 		if (pwrite(fd, &refs, 2, initsz + 2*i) != 2)
760 			goto error;
761 	}
762 
763 	ret = close(fd);
764 	return (ret);
765 error:
766 	ret = errno;
767 	close(fd);
768 	unlink(imgfile_path);
769 	return (errno);
770 }
771