xref: /openbsd/usr.sbin/vmd/vioqcow2.c (revision 65bbee46)
1 /*	$OpenBSD: vioqcow2.c,v 1.25 2024/09/26 01:45:13 jsg Exp $	*/
2 
3 /*
4  * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/stat.h>
21 
22 #include <err.h>
23 #include <errno.h>
24 #include <fcntl.h>
25 #include <libgen.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <unistd.h>
29 
30 #include "virtio.h"
31 
/*
 * Flag bits carried in the top of 64-bit table entries.
 * xlate() aborts on entries with QCOW2_COMPRESSED set; QCOW2_INPLACE is
 * what xlate() reports through its 'inplace' argument and what mkcluster()
 * sets on the entries it writes.
 */
#define QCOW2_COMPRESSED	0x4000000000000000ull
#define QCOW2_INPLACE		0x8000000000000000ull

/* The only incompatible-feature bits qc2_open() accepts in a header. */
#define QCOW2_DIRTY		(1 << 0)
#define QCOW2_CORRUPT		(1 << 1)

/*
 * Enum spelling of the same two bit values as QCOW2_DIRTY and
 * QCOW2_CORRUPT.
 */
enum {
	ICFEATURE_DIRTY		= 1 << 0,
	ICFEATURE_CORRUPT	= 1 << 1,
};

/*
 * NOTE(review): presumably an autoclear-feature bit, judging by the
 * name -- confirm against the qcow2 specification.
 */
enum {
	ACFEATURE_BITEXT	= 1 << 0,
};
46 
/*
 * Image header exactly as it is laid out at offset 0 of the file.
 * Multi-byte fields are stored big-endian; the code converts them with
 * be32toh()/be64toh() on read and htobe32()/htobe64() on write.
 */
struct qcheader {
	char magic[4];		/* compared against VM_MAGIC_QCOW */
	uint32_t version;	/* qc2_open() accepts 2 and 3 */
	uint64_t backingoff;	/* file offset of the base image path */
	uint32_t backingsz;	/* length of that path; 0 means no base */
	uint32_t clustershift;	/* cluster size = 1 << clustershift */
	uint64_t disksz;	/* virtual disk size in bytes */
	uint32_t cryptmethod;
	uint32_t l1sz;		/* number of 8-byte L1 entries */
	uint64_t l1off;		/* file offset of the L1 table */
	uint64_t refoff;	/* file offset of the refcount table */
	uint32_t refsz;
	uint32_t snapcount;
	uint64_t snapsz;	/* qc2_open() stores this as 'snapoff' */
	/* v3 additions */
	uint64_t incompatfeatures;
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;	/* Bits = 1 << reforder */
	uint32_t headersz;
} __packed;
68 
/*
 * In-memory state for one open image. Images that have a base image
 * form a singly linked chain through 'base'.
 */
struct qcdisk {
	pthread_rwlock_t lock;	/* read-locked by xlate(), write-locked by mkcluster() */
	struct qcdisk *base;	/* next image in the chain, or NULL */
	struct qcheader header;

	int       fd;		/* descriptor of this image file */
	uint64_t *l1;		/* L1 table, converted to host byte order */
	off_t     end;		/* file size; advanced as clusters are appended */
	off_t	  clustersz;	/* 1 << header.clustershift */
	off_t	  disksz; /* In bytes */
	uint32_t  cryptmethod;

	uint32_t l1sz;		/* number of entries in l1 */
	off_t	 l1off;		/* file offset of the on-disk L1 table */

	off_t	 refoff;	/* file offset of the refcount table */
	off_t	 refsz;

	uint32_t nsnap;		/* header.snapcount */
	off_t	 snapoff;	/* loaded from header.snapsz */

	/* v3 features */
	uint64_t incompatfeatures;
	uint64_t autoclearfeatures;
	uint32_t refssz;	/* qc2_open() loads this from header.refsz */
	uint32_t headersz;
};
96 
extern char *__progname;

/* Cluster lookup, allocation and refcount helpers. */
static off_t xlate(struct qcdisk *, off_t, int *);
static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static void inc_refs(struct qcdisk *, off_t, int);
static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
/* Image setup plus the callbacks installed by virtio_qcow2_init(). */
static int qc2_open(struct qcdisk *, int *, size_t);
static ssize_t qc2_pread(void *, char *, size_t, off_t);
static ssize_t qc2_preadv(void *, struct iovec *, int, off_t);
static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
static ssize_t qc2_pwritev(void *, struct iovec *, int, off_t);
static void qc2_close(void *, int);
109 
110 /*
111  * Initializes a raw disk image backing file from an fd. Stores the
112  * number of bytes in *szp, returning -1 for error, 0 for success.
113  *
114  * May open snapshot base images.
115  */
116 int
virtio_qcow2_init(struct virtio_backing * file,off_t * szp,int * fd,size_t nfd)117 virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
118 {
119 	struct qcdisk *diskp;
120 
121 	diskp = malloc(sizeof(struct qcdisk));
122 	if (diskp == NULL)
123 		return -1;
124 	if (qc2_open(diskp, fd, nfd) == -1) {
125 		log_warnx("could not open qcow2 disk");
126 		return -1;
127 	}
128 	file->p = diskp;
129 	file->pread = qc2_pread;
130 	file->preadv = qc2_preadv;
131 	file->pwrite = qc2_pwrite;
132 	file->pwritev = qc2_pwritev;
133 	file->close = qc2_close;
134 	*szp = diskp->disksz;
135 	return 0;
136 }
137 
138 /*
139  * Return the path to the base image given a disk image.
140  * Called from vmctl.
141  */
142 ssize_t
virtio_qcow2_get_base(int fd,char * path,size_t npath,const char * dpath)143 virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
144 {
145 	char dpathbuf[PATH_MAX];
146 	char expanded[PATH_MAX];
147 	struct qcheader header;
148 	uint64_t backingoff;
149 	uint32_t backingsz;
150 	char *s = NULL;
151 
152 	if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
153 		log_warnx("short read on header");
154 		return -1;
155 	}
156 	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
157 		log_warnx("invalid magic numbers");
158 		return -1;
159 	}
160 	backingoff = be64toh(header.backingoff);
161 	backingsz = be32toh(header.backingsz);
162 	if (backingsz == 0)
163 		return 0;
164 
165 	if (backingsz >= npath - 1) {
166 		log_warnx("snapshot path too long");
167 		return -1;
168 	}
169 	if (pread(fd, path, backingsz, backingoff) != backingsz) {
170 		log_warnx("could not read snapshot base name");
171 		return -1;
172 	}
173 	path[backingsz] = '\0';
174 
175 	/*
176 	 * Relative paths should be interpreted relative to the disk image,
177 	 * rather than relative to the directory vmd happens to be running in,
178 	 * since this is the only useful interpretation.
179 	 */
180 	if (path[0] == '/') {
181 		if (realpath(path, expanded) == NULL ||
182 		    strlcpy(path, expanded, npath) >= npath) {
183 			log_warnx("unable to resolve %s", path);
184 			return -1;
185 		}
186 	} else {
187 		if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
188 		    sizeof(dpathbuf)) {
189 			log_warnx("path too long: %s", dpath);
190 			return -1;
191 		}
192 		s = dirname(dpathbuf);
193 		if (snprintf(expanded, sizeof(expanded),
194 		    "%s/%s", s, path) >= (int)sizeof(expanded)) {
195 			log_warnx("path too long: %s/%s", s, path);
196 			return -1;
197 		}
198 		if (npath < PATH_MAX ||
199 		    realpath(expanded, path) == NULL) {
200 			log_warnx("unable to resolve %s", path);
201 			return -1;
202 		}
203 	}
204 
205 	return strlen(path);
206 }
207 
208 static int
qc2_open(struct qcdisk * disk,int * fds,size_t nfd)209 qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
210 {
211 	char basepath[PATH_MAX];
212 	struct stat st;
213 	struct qcheader header;
214 	uint64_t backingoff;
215 	uint32_t backingsz;
216 	off_t i;
217 	int version, fd;
218 
219 	pthread_rwlock_init(&disk->lock, NULL);
220 	fd = fds[0];
221 	disk->fd = fd;
222 	disk->base = NULL;
223 	disk->l1 = NULL;
224 
225 	if (pread(fd, &header, sizeof(header), 0) != sizeof(header))
226 		fatalx("short read on header");
227 	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
228 		fatalx("invalid magic numbers");
229 
230 	disk->clustersz		= (1ull << be32toh(header.clustershift));
231 	disk->disksz		= be64toh(header.disksz);
232 	disk->cryptmethod	= be32toh(header.cryptmethod);
233 	disk->l1sz		= be32toh(header.l1sz);
234 	disk->l1off		= be64toh(header.l1off);
235 	disk->refsz		= be32toh(header.refsz);
236 	disk->refoff		= be64toh(header.refoff);
237 	disk->nsnap		= be32toh(header.snapcount);
238 	disk->snapoff		= be64toh(header.snapsz);
239 
240 	/*
241 	 * The additional features here are defined as 0 in the v2 format,
242 	 * so as long as we clear the buffer before parsing, we don't need
243 	 * to check versions here.
244 	 */
245 	disk->incompatfeatures = be64toh(header.incompatfeatures);
246 	disk->autoclearfeatures = be64toh(header.autoclearfeatures);
247 	disk->refssz = be32toh(header.refsz);
248 	disk->headersz = be32toh(header.headersz);
249 
250 	/*
251 	 * We only know about the dirty or corrupt bits here.
252 	 */
253 	if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT))
254 		fatalx("unsupported features %llx",
255 		    disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
256 	if (be32toh(header.reforder) != 4)
257 		fatalx("unsupported refcount size\n");
258 
259 	disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
260 	if (!disk->l1)
261 		fatal("%s: could not allocate l1 table", __func__);
262 	if (pread(disk->fd, disk->l1, 8 * disk->l1sz, disk->l1off)
263 	    != 8 * disk->l1sz)
264 		fatalx("%s: unable to read qcow2 L1 table", __func__);
265 	for (i = 0; i < disk->l1sz; i++)
266 		disk->l1[i] = be64toh(disk->l1[i]);
267 	version = be32toh(header.version);
268 	if (version != 2 && version != 3)
269 		fatalx("%s: unknown qcow2 version %d", __func__, version);
270 
271 	backingoff = be64toh(header.backingoff);
272 	backingsz = be32toh(header.backingsz);
273 	if (backingsz != 0) {
274 		if (backingsz >= sizeof(basepath) - 1) {
275 			fatalx("%s: snapshot path too long", __func__);
276 		}
277 		if (pread(fd, basepath, backingsz, backingoff) != backingsz) {
278 			fatalx("%s: could not read snapshot base name",
279 			    __func__);
280 		}
281 		basepath[backingsz] = 0;
282 		if (nfd <= 1) {
283 			fatalx("%s: missing base image %s", __func__,
284 			    basepath);
285 		}
286 
287 
288 		disk->base = calloc(1, sizeof(struct qcdisk));
289 		if (!disk->base)
290 			fatal("%s: could not open %s", __func__, basepath);
291 		if (qc2_open(disk->base, fds + 1, nfd - 1) == -1)
292 			fatalx("%s: could not open %s", __func__, basepath);
293 		if (disk->base->clustersz != disk->clustersz)
294 			fatalx("%s: all disk parts must share clustersize",
295 			    __func__);
296 	}
297 	if (fstat(fd, &st) == -1)
298 		fatal("%s: unable to stat disk", __func__);
299 
300 	disk->end = st.st_size;
301 
302 	log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d",
303 	    __func__, version, disk->disksz, disk->end, disk->nsnap);
304 
305 	return 0;
306 }
307 
/*
 * Scatter read: fills each of the cnt buffers in iov in turn with
 * qc2_pread(), starting at virtual disk offset 'offset' and continuing
 * where the previous buffer ended.
 *
 * Returns the total number of bytes read, or -1 as soon as any single
 * read fails.
 */
static ssize_t
qc2_preadv(void *p, struct iovec *iov, int cnt, off_t offset)
{
	ssize_t got, sum;
	off_t cursor;
	int idx;

	sum = 0;
	cursor = offset;
	idx = 0;
	while (idx < cnt) {
		got = qc2_pread(p, iov[idx].iov_base, iov[idx].iov_len,
		    cursor);
		if (got == -1)
			return (-1);
		cursor += got;
		sum += got;
		idx++;
	}

	return (sum);
}
325 
326 static ssize_t
qc2_pread(void * p,char * buf,size_t len,off_t off)327 qc2_pread(void *p, char *buf, size_t len, off_t off)
328 {
329 	struct qcdisk *disk, *d;
330 	off_t phys_off, end, cluster_off;
331 	ssize_t sz, rem;
332 
333 	disk = p;
334 	end = off + len;
335 	if (off < 0 || end > disk->disksz)
336 		return -1;
337 
338 	/* handle head chunk separately */
339 	rem = len;
340 	while (off != end) {
341 		for (d = disk; d; d = d->base)
342 			if ((phys_off = xlate(d, off, NULL)) > 0)
343 				break;
344 		/* Break out into chunks. This handles
345 		 * three cases:
346 		 *
347 		 *    |----+====|========|====+-----|
348 		 *
349 		 * Either we are at the start of the read,
350 		 * and the cluster has some leading bytes.
351 		 * This means that we are reading the tail
352 		 * of the cluster, and our size is:
353 		 *
354 		 * 	clustersz - (off % clustersz).
355 		 *
356 		 * Otherwise, we're reading the middle section.
357 		 * We're already aligned here, so we can just
358 		 * read the whole cluster size. Or we're at the
359 		 * tail, at which point we just want to read the
360 		 * remaining bytes.
361 		 */
362 		cluster_off = off % disk->clustersz;
363 		sz = disk->clustersz - cluster_off;
364 		if (sz > rem)
365 			sz = rem;
366 		/*
367 		 * If we're within the disk, but don't have backing bytes,
368 		 * just read back zeros.
369 		 */
370 		if (!d)
371 			bzero(buf, sz);
372 		else if (pread(d->fd, buf, sz, phys_off) != sz)
373 			return -1;
374 		off += sz;
375 		buf += sz;
376 		rem -= sz;
377 	}
378 	return len;
379 }
380 
/*
 * Gather write: writes each of the cnt buffers in iov in turn with
 * qc2_pwrite(), starting at virtual disk offset 'offset' and continuing
 * where the previous buffer ended.
 *
 * Returns the total number of bytes written, or -1 as soon as any
 * single write fails.
 */
static ssize_t
qc2_pwritev(void *p, struct iovec *iov, int cnt, off_t offset)
{
	ssize_t put, sum;
	off_t cursor;
	int idx;

	sum = 0;
	cursor = offset;
	idx = 0;
	while (idx < cnt) {
		put = qc2_pwrite(p, iov[idx].iov_base, iov[idx].iov_len,
		    cursor);
		if (put == -1)
			return (-1);
		cursor += put;
		sum += put;
		idx++;
	}

	return (sum);
}
398 
399 static ssize_t
qc2_pwrite(void * p,char * buf,size_t len,off_t off)400 qc2_pwrite(void *p, char *buf, size_t len, off_t off)
401 {
402 	struct qcdisk *disk, *d;
403 	off_t phys_off, cluster_off, end;
404 	ssize_t sz, rem;
405 	int inplace;
406 
407 	d = p;
408 	disk = p;
409 	inplace = 1;
410 	end = off + len;
411 	if (off < 0 || end > disk->disksz)
412 		return -1;
413 	rem = len;
414 	while (off != end) {
415 		/* See the read code for a summary of the computation */
416 		cluster_off = off % disk->clustersz;
417 		sz = disk->clustersz - cluster_off;
418 		if (sz > rem)
419 			sz = rem;
420 
421 		phys_off = xlate(disk, off, &inplace);
422 		if (phys_off == -1)
423 			return -1;
424 		/*
425 		 * If we couldn't find the cluster in the writable disk,
426 		 * see if it exists in the base image. If it does, we
427 		 * need to copy it before the write. The copy happens
428 		 * in the '!inplace' if clause below te search.
429 		 */
430 		if (phys_off == 0)
431 			for (d = disk->base; d; d = d->base)
432 				if ((phys_off = xlate(d, off, NULL)) > 0)
433 					break;
434 		if (!inplace || phys_off == 0)
435 			phys_off = mkcluster(disk, d, off, phys_off);
436 		if (phys_off == -1)
437 			return -1;
438 		if (phys_off < disk->clustersz)
439 			fatalx("%s: writing reserved cluster", __func__);
440 		if (pwrite(disk->fd, buf, sz, phys_off) != sz)
441 			return -1;
442 		off += sz;
443 		buf += sz;
444 		rem -= sz;
445 	}
446 	return len;
447 }
448 
449 static void
qc2_close(void * p,int stayopen)450 qc2_close(void *p, int stayopen)
451 {
452 	struct qcdisk *disk;
453 
454 	disk = p;
455 	if (disk->base)
456 		qc2_close(disk->base, stayopen);
457 	if (!stayopen)
458 		close(disk->fd);
459 	free(disk->l1);
460 	free(disk);
461 }
462 
463 /*
464  * Translates a virtual offset into an on-disk offset.
465  * Returns:
466  * 	-1 on error
467  * 	 0 on 'not found'
468  * 	>0 on found
469  */
470 static off_t
xlate(struct qcdisk * disk,off_t off,int * inplace)471 xlate(struct qcdisk *disk, off_t off, int *inplace)
472 {
473 	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
474 	uint64_t buf;
475 
476 
477 	/*
478 	 * Clear out inplace flag -- xlate misses should not
479 	 * be flagged as updatable in place. We will still
480 	 * return 0 from them, but this leaves less surprises
481 	 * in the API.
482 	 */
483 	if (inplace)
484 		*inplace = 0;
485 	pthread_rwlock_rdlock(&disk->lock);
486 	if (off < 0)
487 		goto err;
488 
489 	l2sz = disk->clustersz / 8;
490 	l1off = (off / disk->clustersz) / l2sz;
491 	if (l1off >= disk->l1sz)
492 		goto err;
493 
494 	l2tab = disk->l1[l1off];
495 	l2tab &= ~QCOW2_INPLACE;
496 	if (l2tab == 0) {
497 		pthread_rwlock_unlock(&disk->lock);
498 		return 0;
499 	}
500 	l2off = (off / disk->clustersz) % l2sz;
501 	pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8);
502 	cluster = be64toh(buf);
503 	/*
504 	 * cluster may be 0, but all future operations don't affect
505 	 * the return value.
506 	 */
507 	if (inplace)
508 		*inplace = !!(cluster & QCOW2_INPLACE);
509 	if (cluster & QCOW2_COMPRESSED)
510 		fatalx("%s: compressed clusters unsupported", __func__);
511 	pthread_rwlock_unlock(&disk->lock);
512 	clusteroff = 0;
513 	cluster &= ~QCOW2_INPLACE;
514 	if (cluster)
515 		clusteroff = off % disk->clustersz;
516 	return cluster + clusteroff;
517 err:
518 	pthread_rwlock_unlock(&disk->lock);
519 	return -1;
520 }
521 
522 /*
523  * Allocates a new cluster on disk, creating a new L2 table
524  * if needed. The cluster starts off with a refs of one,
525  * and the writable bit set.
526  *
527  * Returns -1 on error, and the physical address within the
528  * cluster of the write offset if it exists.
529  */
530 static off_t
mkcluster(struct qcdisk * disk,struct qcdisk * base,off_t off,off_t src_phys)531 mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
532 {
533 	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
534 	uint64_t buf;
535 
536 	pthread_rwlock_wrlock(&disk->lock);
537 
538 	cluster = -1;
539 	/* L1 entries always exist */
540 	l2sz = disk->clustersz / 8;
541 	l1off = off / (disk->clustersz * l2sz);
542 	if (l1off >= disk->l1sz)
543 		fatalx("l1 offset outside disk");
544 
545 	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);
546 
547 	l2tab = disk->l1[l1off];
548 	l2off = (off / disk->clustersz) % l2sz;
549 	/* We may need to create or clone an L2 entry to map the block */
550 	if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
551 		orig = l2tab & ~QCOW2_INPLACE;
552 		l2tab = disk->end;
553 		disk->end += disk->clustersz;
554 		if (ftruncate(disk->fd, disk->end) == -1)
555 			fatal("%s: ftruncate failed", __func__);
556 
557 		/*
558 		 * If we translated, found a L2 entry, but it needed to
559 		 * be copied, copy it.
560 		 */
561 		if (orig != 0)
562 			copy_cluster(disk, disk, l2tab, orig);
563 		/* Update l1 -- we flush it later */
564 		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
565 		inc_refs(disk, l2tab, 1);
566 	}
567 	l2tab &= ~QCOW2_INPLACE;
568 
569 	/* Grow the disk */
570 	if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
571 		fatal("%s: could not grow disk", __func__);
572 	if (src_phys > 0)
573 		copy_cluster(disk, base, disk->end, src_phys);
574 	cluster = disk->end;
575 	disk->end += disk->clustersz;
576 	buf = htobe64(cluster | QCOW2_INPLACE);
577 	if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
578 		fatalx("%s: could not write cluster", __func__);
579 
580 	/* TODO: lazily sync: currently VMD doesn't close things */
581 	buf = htobe64(disk->l1[l1off]);
582 	if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
583 		fatalx("%s: could not write l1", __func__);
584 	inc_refs(disk, cluster, 1);
585 
586 	pthread_rwlock_unlock(&disk->lock);
587 	clusteroff = off % disk->clustersz;
588 	if (cluster + clusteroff < disk->clustersz)
589 		fatalx("write would clobber header");
590 	return cluster + clusteroff;
591 }
592 
593 /* Copies a cluster containing src to dst. Src and dst need not be aligned. */
594 static void
copy_cluster(struct qcdisk * disk,struct qcdisk * base,off_t dst,off_t src)595 copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
596 {
597 	char *scratch;
598 
599 	scratch = malloc(disk->clustersz);
600 	if (!scratch)
601 		fatal("out of memory");
602 	src &= ~(disk->clustersz - 1);
603 	dst &= ~(disk->clustersz - 1);
604 	if (pread(base->fd, scratch, disk->clustersz, src) == -1)
605 		fatal("%s: could not read cluster", __func__);
606 	if (pwrite(disk->fd, scratch, disk->clustersz, dst) == -1)
607 		fatal("%s: could not write cluster", __func__);
608 	free(scratch);
609 }
610 
611 static void
inc_refs(struct qcdisk * disk,off_t off,int newcluster)612 inc_refs(struct qcdisk *disk, off_t off, int newcluster)
613 {
614 	off_t l1off, l1idx, l2idx, l2cluster;
615 	size_t nper;
616 	uint16_t refs;
617 	uint64_t buf;
618 
619 	off &= ~QCOW2_INPLACE;
620 	nper = disk->clustersz / 2;
621 	l1idx = (off / disk->clustersz) / nper;
622 	l2idx = (off / disk->clustersz) % nper;
623 	l1off = disk->refoff + 8 * l1idx;
624 	if (pread(disk->fd, &buf, sizeof(buf), l1off) != 8)
625 		fatal("could not read refs");
626 
627 	l2cluster = be64toh(buf);
628 	if (l2cluster == 0) {
629 		l2cluster = disk->end;
630 		disk->end += disk->clustersz;
631 		if (ftruncate(disk->fd, disk->end) < 0)
632 			fatal("%s: failed to allocate ref block", __func__);
633 		buf = htobe64(l2cluster);
634 		if (pwrite(disk->fd, &buf, sizeof(buf), l1off) != 8)
635 			fatal("%s: failed to write ref block", __func__);
636 	}
637 
638 	refs = 1;
639 	if (!newcluster) {
640 		if (pread(disk->fd, &refs, sizeof(refs),
641 		    l2cluster + 2 * l2idx) != 2)
642 			fatal("could not read ref cluster");
643 		refs = be16toh(refs) + 1;
644 	}
645 	refs = htobe16(refs);
646 	if (pwrite(disk->fd, &refs, sizeof(refs), l2cluster + 2 * l2idx) != 2)
647 		fatal("%s: could not write ref block", __func__);
648 }
649 
650 /*
651  * virtio_qcow2_create
652  *
653  * Create an empty qcow2 imagefile with the specified path and size.
654  *
655  * Parameters:
656  *  imgfile_path: path to the image file to create
657  *  imgsize     : size of the image file to create (in bytes)
658  *
659  * Return:
660  *  EEXIST: The requested image file already exists
661  *  0     : Image file successfully created
662  *  Exxxx : Various other Exxxx errno codes due to other I/O errors
663  */
664 int
virtio_qcow2_create(const char * imgfile_path,const char * base_path,uint64_t disksz)665 virtio_qcow2_create(const char *imgfile_path,
666     const char *base_path, uint64_t disksz)
667 {
668 	struct qcheader hdr, basehdr;
669 	int fd, ret;
670 	ssize_t base_len;
671 	uint64_t l1sz, refsz, initsz, clustersz;
672 	uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
673 	uint16_t refs;
674 
675 	if (base_path) {
676 		fd = open(base_path, O_RDONLY);
677 		if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
678 			errx(1, "failure to read base image header");
679 		close(fd);
680 		if (strncmp(basehdr.magic,
681 		    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
682 			errx(1, "base image is not a qcow2 file");
683 		if (!disksz)
684 			disksz = betoh64(basehdr.disksz);
685 		else if (disksz != betoh64(basehdr.disksz))
686 			errx(1, "base size does not match requested size");
687 	}
688 	if (!base_path && !disksz)
689 		errx(1, "missing disk size");
690 
691 	clustersz = (1<<16);
692 	l1off = ALIGNSZ(sizeof(hdr), clustersz);
693 
694 	l1entrysz = clustersz * clustersz / 8;
695 	l1sz = (disksz + l1entrysz - 1) / l1entrysz;
696 
697 	refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
698 	refentrysz = clustersz * clustersz * clustersz / 2;
699 	refsz = (disksz + refentrysz - 1) / refentrysz;
700 
701 	initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
702 	base_len = base_path ? strlen(base_path) : 0;
703 
704 	memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
705 	hdr.version		= htobe32(3);
706 	hdr.backingoff		= htobe64(base_path ? sizeof(hdr) : 0);
707 	hdr.backingsz		= htobe32(base_len);
708 	hdr.clustershift	= htobe32(16);
709 	hdr.disksz		= htobe64(disksz);
710 	hdr.cryptmethod		= htobe32(0);
711 	hdr.l1sz		= htobe32(l1sz);
712 	hdr.l1off		= htobe64(l1off);
713 	hdr.refoff		= htobe64(refoff);
714 	hdr.refsz		= htobe32(refsz);
715 	hdr.snapcount		= htobe32(0);
716 	hdr.snapsz		= htobe64(0);
717 	hdr.incompatfeatures	= htobe64(0);
718 	hdr.compatfeatures	= htobe64(0);
719 	hdr.autoclearfeatures	= htobe64(0);
720 	hdr.reforder		= htobe32(4);
721 	hdr.headersz		= htobe32(sizeof(hdr));
722 
723 	/* Refuse to overwrite an existing image */
724 	fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
725 	    S_IRUSR | S_IWUSR);
726 	if (fd == -1)
727 		return (errno);
728 
729 	/* Write out the header */
730 	if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
731 		goto error;
732 
733 	/* Add the base image */
734 	if (base_path && write(fd, base_path, base_len) != base_len)
735 		goto error;
736 
737 	/* Extend to desired size, and add one refcount cluster */
738 	if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
739 		goto error;
740 
741 	/*
742 	 * Paranoia: if our disk image takes more than one cluster
743 	 * to refcount the initial image, fail.
744 	 */
745 	if (initsz/clustersz > clustersz/2) {
746 		errno = ERANGE;
747 		goto error;
748 	}
749 
750 	/* Add a refcount block, and refcount ourselves. */
751 	v = htobe64(initsz);
752 	if (pwrite(fd, &v, 8, refoff) != 8)
753 		goto error;
754 	for (i = 0; i < initsz/clustersz + 1; i++) {
755 		refs = htobe16(1);
756 		if (pwrite(fd, &refs, 2, initsz + 2*i) != 2)
757 			goto error;
758 	}
759 
760 	ret = close(fd);
761 	return (ret);
762 error:
763 	ret = errno;
764 	close(fd);
765 	unlink(imgfile_path);
766 	return (errno);
767 }
768