xref: /qemu/block/qcow2.c (revision bfa3ab61)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "qemu-common.h"
25 #include "block/block_int.h"
26 #include "qemu/module.h"
27 #include <zlib.h>
28 #include "qemu/aes.h"
29 #include "block/qcow2.h"
30 #include "qemu/error-report.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qapi/qmp/qbool.h"
33 #include "qapi/util.h"
34 #include "qapi/qmp/types.h"
35 #include "qapi-event.h"
36 #include "trace.h"
37 #include "qemu/option_int.h"
38 
39 /*
40   Differences with QCOW:
41 
42   - Support for multiple incremental snapshots.
43   - Memory management by reference counts.
44   - Clusters with a reference count of one have the QCOW_OFLAG_COPIED
45     bit set to optimize write performance.
46   - The size of compressed clusters is stored in sectors to reduce bit
47     usage in the cluster offsets.
48   - Support for storing additional data (such as the VM state) in the
49     snapshots.
50   - If a backing store is used, the cluster size is not constrained
51     (could be backported to QCOW).
52   - L2 tables always have a size of one cluster.
53 */
54 
55 
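/*
 * On disk, a header extension consists of this descriptor followed by
 * ext.len bytes of data; the next extension starts at the following 8-byte
 * boundary (see the rounding of 'offset' in qcow2_read_extensions()).
 */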
56 typedef struct {
57     uint32_t magic;
58     uint32_t len;
59 } QEMU_PACKED QCowExtension;
60 
61 #define  QCOW2_EXT_MAGIC_END 0
62 #define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
63 #define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
64 
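/*
 * Format probe: returns a score of 100 (certain match) if the buffer starts
 * with a qcow2 header of version 2 or newer, and 0 otherwise.
 */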
65 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
66 {
67     const QCowHeader *cow_header = (const void *)buf;
68 
69     if (buf_size >= sizeof(QCowHeader) &&
70         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
71         be32_to_cpu(cow_header->version) >= 2)
72         return 100;
73     else
74         return 0;
75 }
76 
77 
78 /*
79  * Read the qcow2 header extensions and fill bs accordingly; start reading
80  * at start_offset and finish upon an end-of-extensions magic of value 0 or
81  * when end_offset is reached.  Unknown magic values are skipped (future
82  * extensions this version knows nothing about) but saved for header rewrites.
83  * Returns 0 upon success, non-0 otherwise.
84  */
85 static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
86                                  uint64_t end_offset, void **p_feature_table,
87                                  Error **errp)
88 {
89     BDRVQcowState *s = bs->opaque;
90     QCowExtension ext;
91     uint64_t offset;
92     int ret;
93 
94 #ifdef DEBUG_EXT
95     printf("qcow2_read_extensions: start=%" PRIu64 " end=%" PRIu64 "\n", start_offset, end_offset);
96 #endif
97     offset = start_offset;
98     while (offset < end_offset) {
99 
100 #ifdef DEBUG_EXT
101         /* Sanity check */
102         if (offset > s->cluster_size)
103             printf("qcow2_read_extension: suspicious offset %" PRIu64 "\n", offset);
104 
105         printf("attempting to read extended header at offset %" PRIu64 "\n", offset);
106 #endif
107 
108         ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext));
109         if (ret < 0) {
110             error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
111                              "pread fail from offset %" PRIu64, offset);
112             return 1;
113         }
114         be32_to_cpus(&ext.magic);
115         be32_to_cpus(&ext.len);
116         offset += sizeof(ext);
117 #ifdef DEBUG_EXT
118         printf("ext.magic = 0x%x\n", ext.magic);
119 #endif
120         if (offset > end_offset || ext.len > end_offset - offset) {
121             error_setg(errp, "Header extension too large");
122             return -EINVAL;
123         }
124 
125         switch (ext.magic) {
126         case QCOW2_EXT_MAGIC_END:
127             return 0;
128 
129         case QCOW2_EXT_MAGIC_BACKING_FORMAT:
130             if (ext.len >= sizeof(bs->backing_format)) {
131                 error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32
132                            " too large (>=%zu)", ext.len,
133                            sizeof(bs->backing_format));
134                 return 2;
135             }
136             ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len);
137             if (ret < 0) {
138                 error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
139                                  "Could not read format name");
140                 return 3;
141             }
142             bs->backing_format[ext.len] = '\0';
143             s->image_backing_format = g_strdup(bs->backing_format);
144 #ifdef DEBUG_EXT
145             printf("Qcow2: Got format extension %s\n", bs->backing_format);
146 #endif
147             break;
148 
149         case QCOW2_EXT_MAGIC_FEATURE_TABLE:
150             if (p_feature_table != NULL) {
151                 void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
152                 ret = bdrv_pread(bs->file, offset , feature_table, ext.len);
153                 if (ret < 0) {
154                     error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
155                                      "Could not read table");
156                     return ret;
157                 }
158 
159                 *p_feature_table = feature_table;
160             }
161             break;
162 
163         default:
164             /* unknown magic - save it in case we need to rewrite the header */
165             {
166                 Qcow2UnknownHeaderExtension *uext;
167 
168                 uext = g_malloc0(sizeof(*uext)  + ext.len);
169                 uext->magic = ext.magic;
170                 uext->len = ext.len;
171                 QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
172 
173                 ret = bdrv_pread(bs->file, offset , uext->data, uext->len);
174                 if (ret < 0) {
175                     error_setg_errno(errp, -ret, "ERROR: unknown extension: "
176                                      "Could not read data");
177                     return ret;
178                 }
179             }
180             break;
181         }
182 
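        /* The next extension starts at the next 8-byte boundary */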
183         offset += ((ext.len + 7) & ~7);
184     }
185 
186     return 0;
187 }
188 
189 static void cleanup_unknown_header_ext(BlockDriverState *bs)
190 {
191     BDRVQcowState *s = bs->opaque;
192     Qcow2UnknownHeaderExtension *uext, *next;
193 
194     QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
195         QLIST_REMOVE(uext, next);
196         g_free(uext);
197     }
198 }
199 
200 static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs,
201     Error **errp, const char *fmt, ...)
202 {
203     char msg[64];
204     va_list ap;
205 
206     va_start(ap, fmt);
207     vsnprintf(msg, sizeof(msg), fmt, ap);
208     va_end(ap);
209 
210     error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE,
211                bdrv_get_device_or_node_name(bs), "qcow2", msg);
212 }
213 
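/*
 * Builds a comma-separated list of the incompatible feature bits set in
 * 'mask', using the names from the image's feature table extension where
 * available, and reports it via report_unsupported().
 */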
214 static void report_unsupported_feature(BlockDriverState *bs,
215     Error **errp, Qcow2Feature *table, uint64_t mask)
216 {
217     char *features = g_strdup("");
218     char *old;
219 
220     while (table && table->name[0] != '\0') {
221         if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
222             if (mask & (1ULL << table->bit)) {
223                 old = features;
224                 features = g_strdup_printf("%s%s%.46s", old, *old ? ", " : "",
225                                            table->name);
226                 g_free(old);
227                 mask &= ~(1ULL << table->bit);
228             }
229         }
230         table++;
231     }
232 
233     if (mask) {
234         old = features;
235         features = g_strdup_printf("%s%sUnknown incompatible feature: %" PRIx64,
236                                    old, *old ? ", " : "", mask);
237         g_free(old);
238     }
239 
240     report_unsupported(bs, errp, "%s", features);
241     g_free(features);
242 }
243 
244 /*
245  * Sets the dirty bit and flushes afterwards if necessary.
246  *
247  * The incompatible_features bit is only set if the image file header was
248  * updated successfully.  Therefore it is not required to check the return
249  * value of this function.
250  */
251 int qcow2_mark_dirty(BlockDriverState *bs)
252 {
253     BDRVQcowState *s = bs->opaque;
254     uint64_t val;
255     int ret;
256 
257     assert(s->qcow_version >= 3);
258 
259     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
260         return 0; /* already dirty */
261     }
262 
263     val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
264     ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features),
265                       &val, sizeof(val));
266     if (ret < 0) {
267         return ret;
268     }
269     ret = bdrv_flush(bs->file);
270     if (ret < 0) {
271         return ret;
272     }
273 
274     /* Only treat image as dirty if the header was updated successfully */
275     s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
276     return 0;
277 }
278 
279 /*
280  * Clears the dirty bit and flushes the image beforehand if necessary.  Only
281  * call this function when there are no pending requests; it does not guard
282  * against concurrent requests dirtying the image.
283  */
284 static int qcow2_mark_clean(BlockDriverState *bs)
285 {
286     BDRVQcowState *s = bs->opaque;
287 
288     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
289         int ret;
290 
291         s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
292 
293         ret = bdrv_flush(bs);
294         if (ret < 0) {
295             return ret;
296         }
297 
298         return qcow2_update_header(bs);
299     }
300     return 0;
301 }
302 
303 /*
304  * Marks the image as corrupt.
305  */
306 int qcow2_mark_corrupt(BlockDriverState *bs)
307 {
308     BDRVQcowState *s = bs->opaque;
309 
310     s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
311     return qcow2_update_header(bs);
312 }
313 
314 /*
315  * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
316  * before if necessary.
317  */
318 int qcow2_mark_consistent(BlockDriverState *bs)
319 {
320     BDRVQcowState *s = bs->opaque;
321 
322     if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
323         int ret = bdrv_flush(bs);
324         if (ret < 0) {
325             return ret;
326         }
327 
328         s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
329         return qcow2_update_header(bs);
330     }
331     return 0;
332 }
333 
334 static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
335                        BdrvCheckMode fix)
336 {
337     int ret = qcow2_check_refcounts(bs, result, fix);
338     if (ret < 0) {
339         return ret;
340     }
341 
342     if (fix && result->check_errors == 0 && result->corruptions == 0) {
343         ret = qcow2_mark_clean(bs);
344         if (ret < 0) {
345             return ret;
346         }
347         return qcow2_mark_consistent(bs);
348     }
349     return ret;
350 }
351 
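/*
 * Checks that a table of 'entries' entries of 'entry_len' bytes each, placed
 * at 'offset', stays within INT64_MAX and that 'offset' is cluster aligned.
 * Returns 0 if the table is acceptable, -EINVAL otherwise.
 */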
352 static int validate_table_offset(BlockDriverState *bs, uint64_t offset,
353                                  uint64_t entries, size_t entry_len)
354 {
355     BDRVQcowState *s = bs->opaque;
356     uint64_t size;
357 
358     /* Use signed INT64_MAX as the maximum even for uint64_t header fields,
359      * because values will be passed to qemu functions taking int64_t. */
360     if (entries > INT64_MAX / entry_len) {
361         return -EINVAL;
362     }
363 
364     size = entries * entry_len;
365 
366     if (INT64_MAX - size < offset) {
367         return -EINVAL;
368     }
369 
370     /* Tables must be cluster aligned */
371     if (offset & (s->cluster_size - 1)) {
372         return -EINVAL;
373     }
374 
375     return 0;
376 }
377 
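/*
 * Runtime options of the qcow2 driver; they apply when an existing image is
 * opened, not when it is created.  On the command line they can be given as
 * -drive suboptions, e.g. (an illustrative sketch, not an exhaustive list):
 *
 *   -drive file=disk.qcow2,format=qcow2,lazy-refcounts=on,overlap-check=cached
 */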
378 static QemuOptsList qcow2_runtime_opts = {
379     .name = "qcow2",
380     .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
381     .desc = {
382         {
383             .name = QCOW2_OPT_LAZY_REFCOUNTS,
384             .type = QEMU_OPT_BOOL,
385             .help = "Postpone refcount updates",
386         },
387         {
388             .name = QCOW2_OPT_DISCARD_REQUEST,
389             .type = QEMU_OPT_BOOL,
390             .help = "Pass guest discard requests to the layer below",
391         },
392         {
393             .name = QCOW2_OPT_DISCARD_SNAPSHOT,
394             .type = QEMU_OPT_BOOL,
395             .help = "Generate discard requests when snapshot related space "
396                     "is freed",
397         },
398         {
399             .name = QCOW2_OPT_DISCARD_OTHER,
400             .type = QEMU_OPT_BOOL,
401             .help = "Generate discard requests when other clusters are freed",
402         },
403         {
404             .name = QCOW2_OPT_OVERLAP,
405             .type = QEMU_OPT_STRING,
406             .help = "Selects which overlap checks to perform from a range of "
407                     "templates (none, constant, cached, all)",
408         },
409         {
410             .name = QCOW2_OPT_OVERLAP_TEMPLATE,
411             .type = QEMU_OPT_STRING,
412             .help = "Selects which overlap checks to perform from a range of "
413                     "templates (none, constant, cached, all)",
414         },
415         {
416             .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
417             .type = QEMU_OPT_BOOL,
418             .help = "Check for unintended writes into the main qcow2 header",
419         },
420         {
421             .name = QCOW2_OPT_OVERLAP_ACTIVE_L1,
422             .type = QEMU_OPT_BOOL,
423             .help = "Check for unintended writes into the active L1 table",
424         },
425         {
426             .name = QCOW2_OPT_OVERLAP_ACTIVE_L2,
427             .type = QEMU_OPT_BOOL,
428             .help = "Check for unintended writes into an active L2 table",
429         },
430         {
431             .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
432             .type = QEMU_OPT_BOOL,
433             .help = "Check for unintended writes into the refcount table",
434         },
435         {
436             .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
437             .type = QEMU_OPT_BOOL,
438             .help = "Check for unintended writes into a refcount block",
439         },
440         {
441             .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
442             .type = QEMU_OPT_BOOL,
443             .help = "Check for unintended writes into the snapshot table",
444         },
445         {
446             .name = QCOW2_OPT_OVERLAP_INACTIVE_L1,
447             .type = QEMU_OPT_BOOL,
448             .help = "Check for unintended writes into an inactive L1 table",
449         },
450         {
451             .name = QCOW2_OPT_OVERLAP_INACTIVE_L2,
452             .type = QEMU_OPT_BOOL,
453             .help = "Check for unintended writes into an inactive L2 table",
454         },
455         {
456             .name = QCOW2_OPT_CACHE_SIZE,
457             .type = QEMU_OPT_SIZE,
458             .help = "Maximum combined metadata (L2 tables and refcount blocks) "
459                     "cache size",
460         },
461         {
462             .name = QCOW2_OPT_L2_CACHE_SIZE,
463             .type = QEMU_OPT_SIZE,
464             .help = "Maximum L2 table cache size",
465         },
466         {
467             .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
468             .type = QEMU_OPT_SIZE,
469             .help = "Maximum refcount block cache size",
470         },
471         { /* end of list */ }
472     },
473 };
474 
475 static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
476     [QCOW2_OL_MAIN_HEADER_BITNR]    = QCOW2_OPT_OVERLAP_MAIN_HEADER,
477     [QCOW2_OL_ACTIVE_L1_BITNR]      = QCOW2_OPT_OVERLAP_ACTIVE_L1,
478     [QCOW2_OL_ACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_ACTIVE_L2,
479     [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
480     [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
481     [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
482     [QCOW2_OL_INACTIVE_L1_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L1,
483     [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2,
484 };
485 
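/*
 * Derives the sizes (in bytes) of the L2 table cache and the refcount block
 * cache from the QCOW2_OPT_CACHE_SIZE, QCOW2_OPT_L2_CACHE_SIZE and
 * QCOW2_OPT_REFCOUNT_CACHE_SIZE options; sizes that are not set explicitly
 * are derived from the defaults and DEFAULT_L2_REFCOUNT_SIZE_RATIO.
 */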
486 static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
487                              uint64_t *l2_cache_size,
488                              uint64_t *refcount_cache_size, Error **errp)
489 {
490     BDRVQcowState *s = bs->opaque;
491     uint64_t combined_cache_size;
492     bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
493 
494     combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
495     l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
496     refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
497 
498     combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
499     *l2_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE, 0);
500     *refcount_cache_size = qemu_opt_get_size(opts,
501                                              QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
502 
503     if (combined_cache_size_set) {
504         if (l2_cache_size_set && refcount_cache_size_set) {
505             error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
506                        " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
507                        "at the same time");
508             return;
509         } else if (*l2_cache_size > combined_cache_size) {
510             error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
511                        QCOW2_OPT_CACHE_SIZE);
512             return;
513         } else if (*refcount_cache_size > combined_cache_size) {
514             error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
515                        QCOW2_OPT_CACHE_SIZE);
516             return;
517         }
518 
519         if (l2_cache_size_set) {
520             *refcount_cache_size = combined_cache_size - *l2_cache_size;
521         } else if (refcount_cache_size_set) {
522             *l2_cache_size = combined_cache_size - *refcount_cache_size;
523         } else {
524             *refcount_cache_size = combined_cache_size
525                                  / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1);
526             *l2_cache_size = combined_cache_size - *refcount_cache_size;
527         }
528     } else {
529         if (!l2_cache_size_set && !refcount_cache_size_set) {
530             *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE,
531                                  (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
532                                  * s->cluster_size);
533             *refcount_cache_size = *l2_cache_size
534                                  / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
535         } else if (!l2_cache_size_set) {
536             *l2_cache_size = *refcount_cache_size
537                            * DEFAULT_L2_REFCOUNT_SIZE_RATIO;
538         } else if (!refcount_cache_size_set) {
539             *refcount_cache_size = *l2_cache_size
540                                  / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
541         }
542     }
543 }
544 
545 static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
546                       Error **errp)
547 {
548     BDRVQcowState *s = bs->opaque;
549     unsigned int len, i;
550     int ret = 0;
551     QCowHeader header;
552     QemuOpts *opts = NULL;
553     Error *local_err = NULL;
554     uint64_t ext_end;
555     uint64_t l1_vm_state_index;
556     const char *opt_overlap_check, *opt_overlap_check_template;
557     int overlap_check_template = 0;
558     uint64_t l2_cache_size, refcount_cache_size;
559 
560     ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
561     if (ret < 0) {
562         error_setg_errno(errp, -ret, "Could not read qcow2 header");
563         goto fail;
564     }
565     be32_to_cpus(&header.magic);
566     be32_to_cpus(&header.version);
567     be64_to_cpus(&header.backing_file_offset);
568     be32_to_cpus(&header.backing_file_size);
569     be64_to_cpus(&header.size);
570     be32_to_cpus(&header.cluster_bits);
571     be32_to_cpus(&header.crypt_method);
572     be64_to_cpus(&header.l1_table_offset);
573     be32_to_cpus(&header.l1_size);
574     be64_to_cpus(&header.refcount_table_offset);
575     be32_to_cpus(&header.refcount_table_clusters);
576     be64_to_cpus(&header.snapshots_offset);
577     be32_to_cpus(&header.nb_snapshots);
578 
579     if (header.magic != QCOW_MAGIC) {
580         error_setg(errp, "Image is not in qcow2 format");
581         ret = -EINVAL;
582         goto fail;
583     }
584     if (header.version < 2 || header.version > 3) {
585         report_unsupported(bs, errp, "QCOW version %" PRIu32, header.version);
586         ret = -ENOTSUP;
587         goto fail;
588     }
589 
590     s->qcow_version = header.version;
591 
592     /* Initialise cluster size */
593     if (header.cluster_bits < MIN_CLUSTER_BITS ||
594         header.cluster_bits > MAX_CLUSTER_BITS) {
595         error_setg(errp, "Unsupported cluster size: 2^%" PRIu32,
596                    header.cluster_bits);
597         ret = -EINVAL;
598         goto fail;
599     }
600 
601     s->cluster_bits = header.cluster_bits;
602     s->cluster_size = 1 << s->cluster_bits;
603     s->cluster_sectors = 1 << (s->cluster_bits - 9);
604 
605     /* Initialise version 3 header fields */
606     if (header.version == 2) {
607         header.incompatible_features    = 0;
608         header.compatible_features      = 0;
609         header.autoclear_features       = 0;
610         header.refcount_order           = 4;
611         header.header_length            = 72;
612     } else {
613         be64_to_cpus(&header.incompatible_features);
614         be64_to_cpus(&header.compatible_features);
615         be64_to_cpus(&header.autoclear_features);
616         be32_to_cpus(&header.refcount_order);
617         be32_to_cpus(&header.header_length);
618 
619         if (header.header_length < 104) {
620             error_setg(errp, "qcow2 header too short");
621             ret = -EINVAL;
622             goto fail;
623         }
624     }
625 
626     if (header.header_length > s->cluster_size) {
627         error_setg(errp, "qcow2 header exceeds cluster size");
628         ret = -EINVAL;
629         goto fail;
630     }
631 
632     if (header.header_length > sizeof(header)) {
633         s->unknown_header_fields_size = header.header_length - sizeof(header);
634         s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
635         ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields,
636                          s->unknown_header_fields_size);
637         if (ret < 0) {
638             error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
639                              "fields");
640             goto fail;
641         }
642     }
643 
644     if (header.backing_file_offset > s->cluster_size) {
645         error_setg(errp, "Invalid backing file offset");
646         ret = -EINVAL;
647         goto fail;
648     }
649 
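    /* Header extensions occupy the space between the end of the header and
     * the start of the backing file name, or the end of the first cluster if
     * there is no backing file. */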
650     if (header.backing_file_offset) {
651         ext_end = header.backing_file_offset;
652     } else {
653         ext_end = 1 << header.cluster_bits;
654     }
655 
656     /* Handle feature bits */
657     s->incompatible_features    = header.incompatible_features;
658     s->compatible_features      = header.compatible_features;
659     s->autoclear_features       = header.autoclear_features;
660 
661     if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
662         void *feature_table = NULL;
663         qcow2_read_extensions(bs, header.header_length, ext_end,
664                               &feature_table, NULL);
665         report_unsupported_feature(bs, errp, feature_table,
666                                    s->incompatible_features &
667                                    ~QCOW2_INCOMPAT_MASK);
668         ret = -ENOTSUP;
669         g_free(feature_table);
670         goto fail;
671     }
672 
673     if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
674         /* Corrupt images may not be written to unless they are being repaired
675          */
676         if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
677             error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
678                        "read/write");
679             ret = -EACCES;
680             goto fail;
681         }
682     }
683 
684     /* Check support for various header values */
685     if (header.refcount_order > 6) {
686         error_setg(errp, "Reference count entry width too large; may not "
687                    "exceed 64 bits");
688         ret = -EINVAL;
689         goto fail;
690     }
691     s->refcount_order = header.refcount_order;
692     s->refcount_bits = 1 << s->refcount_order;
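    /* refcount_max is 2^refcount_bits - 1; it is computed in two steps so
     * that a refcount width of 64 bits does not shift by 64, which would be
     * undefined behaviour. */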
693     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
694     s->refcount_max += s->refcount_max - 1;
695 
696     if (header.crypt_method > QCOW_CRYPT_AES) {
697         error_setg(errp, "Unsupported encryption method: %" PRIu32,
698                    header.crypt_method);
699         ret = -EINVAL;
700         goto fail;
701     }
702     s->crypt_method_header = header.crypt_method;
703     if (s->crypt_method_header) {
704         bs->encrypted = 1;
705     }
706 
707     s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
708     s->l2_size = 1 << s->l2_bits;
709     /* 2^(s->refcount_order - 3) is the refcount width in bytes */
710     s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
711     s->refcount_block_size = 1 << s->refcount_block_bits;
712     bs->total_sectors = header.size / 512;
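    /* In a compressed cluster descriptor, the low csize_shift bits hold the
     * host offset and the bits above them encode the size of the compressed
     * data in 512-byte sectors (see "Differences with QCOW" above). */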
713     s->csize_shift = (62 - (s->cluster_bits - 8));
714     s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
715     s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
716 
717     s->refcount_table_offset = header.refcount_table_offset;
718     s->refcount_table_size =
719         header.refcount_table_clusters << (s->cluster_bits - 3);
720 
721     if (header.refcount_table_clusters > qcow2_max_refcount_clusters(s)) {
722         error_setg(errp, "Reference count table too large");
723         ret = -EINVAL;
724         goto fail;
725     }
726 
727     ret = validate_table_offset(bs, s->refcount_table_offset,
728                                 s->refcount_table_size, sizeof(uint64_t));
729     if (ret < 0) {
730         error_setg(errp, "Invalid reference count table offset");
731         goto fail;
732     }
733 
734     /* Snapshot table offset/length */
735     if (header.nb_snapshots > QCOW_MAX_SNAPSHOTS) {
736         error_setg(errp, "Too many snapshots");
737         ret = -EINVAL;
738         goto fail;
739     }
740 
741     ret = validate_table_offset(bs, header.snapshots_offset,
742                                 header.nb_snapshots,
743                                 sizeof(QCowSnapshotHeader));
744     if (ret < 0) {
745         error_setg(errp, "Invalid snapshot table offset");
746         goto fail;
747     }
748 
749     /* read the level 1 table */
750     if (header.l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
751         error_setg(errp, "Active L1 table too large");
752         ret = -EFBIG;
753         goto fail;
754     }
755     s->l1_size = header.l1_size;
756 
757     l1_vm_state_index = size_to_l1(s, header.size);
758     if (l1_vm_state_index > INT_MAX) {
759         error_setg(errp, "Image is too big");
760         ret = -EFBIG;
761         goto fail;
762     }
763     s->l1_vm_state_index = l1_vm_state_index;
764 
765     /* the L1 table must contain at least enough entries to map
766        header.size bytes */
767     if (s->l1_size < s->l1_vm_state_index) {
768         error_setg(errp, "L1 table is too small");
769         ret = -EINVAL;
770         goto fail;
771     }
772 
773     ret = validate_table_offset(bs, header.l1_table_offset,
774                                 header.l1_size, sizeof(uint64_t));
775     if (ret < 0) {
776         error_setg(errp, "Invalid L1 table offset");
777         goto fail;
778     }
779     s->l1_table_offset = header.l1_table_offset;
780 
781 
782     if (s->l1_size > 0) {
783         s->l1_table = qemu_try_blockalign(bs->file,
784             align_offset(s->l1_size * sizeof(uint64_t), 512));
785         if (s->l1_table == NULL) {
786             error_setg(errp, "Could not allocate L1 table");
787             ret = -ENOMEM;
788             goto fail;
789         }
790         ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table,
791                          s->l1_size * sizeof(uint64_t));
792         if (ret < 0) {
793             error_setg_errno(errp, -ret, "Could not read L1 table");
794             goto fail;
795         }
796         for(i = 0;i < s->l1_size; i++) {
797             be64_to_cpus(&s->l1_table[i]);
798         }
799     }
800 
801     /* get L2 table/refcount block cache size from command line options */
802     opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
803     qemu_opts_absorb_qdict(opts, options, &local_err);
804     if (local_err) {
805         error_propagate(errp, local_err);
806         ret = -EINVAL;
807         goto fail;
808     }
809 
810     read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size,
811                      &local_err);
812     if (local_err) {
813         error_propagate(errp, local_err);
814         ret = -EINVAL;
815         goto fail;
816     }
817 
818     l2_cache_size /= s->cluster_size;
819     if (l2_cache_size < MIN_L2_CACHE_SIZE) {
820         l2_cache_size = MIN_L2_CACHE_SIZE;
821     }
822     if (l2_cache_size > INT_MAX) {
823         error_setg(errp, "L2 cache size too big");
824         ret = -EINVAL;
825         goto fail;
826     }
827 
828     refcount_cache_size /= s->cluster_size;
829     if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
830         refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
831     }
832     if (refcount_cache_size > INT_MAX) {
833         error_setg(errp, "Refcount cache size too big");
834         ret = -EINVAL;
835         goto fail;
836     }
837 
838     /* alloc L2 table/refcount block cache */
839     s->l2_table_cache = qcow2_cache_create(bs, l2_cache_size);
840     s->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size);
841     if (s->l2_table_cache == NULL || s->refcount_block_cache == NULL) {
842         error_setg(errp, "Could not allocate metadata caches");
843         ret = -ENOMEM;
844         goto fail;
845     }
846 
847     s->cluster_cache = g_malloc(s->cluster_size);
848     /* one more sector for decompressed data alignment */
849     s->cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS
850                                                     * s->cluster_size + 512);
851     if (s->cluster_data == NULL) {
852         error_setg(errp, "Could not allocate temporary cluster buffer");
853         ret = -ENOMEM;
854         goto fail;
855     }
856 
857     s->cluster_cache_offset = -1;
858     s->flags = flags;
859 
860     ret = qcow2_refcount_init(bs);
861     if (ret != 0) {
862         error_setg_errno(errp, -ret, "Could not initialize refcount handling");
863         goto fail;
864     }
865 
866     QLIST_INIT(&s->cluster_allocs);
867     QTAILQ_INIT(&s->discards);
868 
869     /* read qcow2 extensions */
870     if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
871         &local_err)) {
872         error_propagate(errp, local_err);
873         ret = -EINVAL;
874         goto fail;
875     }
876 
877     /* read the backing file name */
878     if (header.backing_file_offset != 0) {
879         len = header.backing_file_size;
880         if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
881             len >= sizeof(bs->backing_file)) {
882             error_setg(errp, "Backing file name too long");
883             ret = -EINVAL;
884             goto fail;
885         }
886         ret = bdrv_pread(bs->file, header.backing_file_offset,
887                          bs->backing_file, len);
888         if (ret < 0) {
889             error_setg_errno(errp, -ret, "Could not read backing file name");
890             goto fail;
891         }
892         bs->backing_file[len] = '\0';
893         s->image_backing_file = g_strdup(bs->backing_file);
894     }
895 
896     /* Internal snapshots */
897     s->snapshots_offset = header.snapshots_offset;
898     s->nb_snapshots = header.nb_snapshots;
899 
900     ret = qcow2_read_snapshots(bs);
901     if (ret < 0) {
902         error_setg_errno(errp, -ret, "Could not read snapshots");
903         goto fail;
904     }
905 
906     /* Clear unknown autoclear feature bits */
907     if (!bs->read_only && !(flags & BDRV_O_INCOMING) && s->autoclear_features) {
908         s->autoclear_features = 0;
909         ret = qcow2_update_header(bs);
910         if (ret < 0) {
911             error_setg_errno(errp, -ret, "Could not update qcow2 header");
912             goto fail;
913         }
914     }
915 
916     /* Initialise locks */
917     qemu_co_mutex_init(&s->lock);
918 
919     /* Repair image if dirty */
920     if (!(flags & (BDRV_O_CHECK | BDRV_O_INCOMING)) && !bs->read_only &&
921         (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
922         BdrvCheckResult result = {0};
923 
924         ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
925         if (ret < 0) {
926             error_setg_errno(errp, -ret, "Could not repair dirty image");
927             goto fail;
928         }
929     }
930 
931     /* Enable lazy_refcounts according to image and command line options */
932     s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
933         (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
934 
935     s->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
936     s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
937     s->discard_passthrough[QCOW2_DISCARD_REQUEST] =
938         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
939                           flags & BDRV_O_UNMAP);
940     s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
941         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
942     s->discard_passthrough[QCOW2_DISCARD_OTHER] =
943         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
944 
945     opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
946     opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
947     if (opt_overlap_check_template && opt_overlap_check &&
948         strcmp(opt_overlap_check_template, opt_overlap_check))
949     {
950         error_setg(errp, "Conflicting values for qcow2 options '"
951                    QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
952                    "' ('%s')", opt_overlap_check, opt_overlap_check_template);
953         ret = -EINVAL;
954         goto fail;
955     }
956     if (!opt_overlap_check) {
957         opt_overlap_check = opt_overlap_check_template ?: "cached";
958     }
959 
960     if (!strcmp(opt_overlap_check, "none")) {
961         overlap_check_template = 0;
962     } else if (!strcmp(opt_overlap_check, "constant")) {
963         overlap_check_template = QCOW2_OL_CONSTANT;
964     } else if (!strcmp(opt_overlap_check, "cached")) {
965         overlap_check_template = QCOW2_OL_CACHED;
966     } else if (!strcmp(opt_overlap_check, "all")) {
967         overlap_check_template = QCOW2_OL_ALL;
968     } else {
969         error_setg(errp, "Unsupported value '%s' for qcow2 option "
970                    "'overlap-check'. Allowed are any of the following: "
971                    "none, constant, cached, all", opt_overlap_check);
972         ret = -EINVAL;
973         goto fail;
974     }
975 
976     s->overlap_check = 0;
977     for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
978         /* overlap-check defines a template bitmask, but every flag may be
979          * overwritten through the associated boolean option */
980         s->overlap_check |=
981             qemu_opt_get_bool(opts, overlap_bool_option_names[i],
982                               overlap_check_template & (1 << i)) << i;
983     }
984 
985     qemu_opts_del(opts);
986     opts = NULL;
987 
988     if (s->use_lazy_refcounts && s->qcow_version < 3) {
989         error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
990                    "qemu 1.1 compatibility level");
991         ret = -EINVAL;
992         goto fail;
993     }
994 
995 #ifdef DEBUG_ALLOC
996     {
997         BdrvCheckResult result = {0};
998         qcow2_check_refcounts(bs, &result, 0);
999     }
1000 #endif
1001     return ret;
1002 
1003  fail:
1004     qemu_opts_del(opts);
1005     g_free(s->unknown_header_fields);
1006     cleanup_unknown_header_ext(bs);
1007     qcow2_free_snapshots(bs);
1008     qcow2_refcount_close(bs);
1009     qemu_vfree(s->l1_table);
1010     /* else pre-write overlap checks in cache_destroy may crash */
1011     s->l1_table = NULL;
1012     if (s->l2_table_cache) {
1013         qcow2_cache_destroy(bs, s->l2_table_cache);
1014     }
1015     if (s->refcount_block_cache) {
1016         qcow2_cache_destroy(bs, s->refcount_block_cache);
1017     }
1018     g_free(s->cluster_cache);
1019     qemu_vfree(s->cluster_data);
1020     return ret;
1021 }
1022 
1023 static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
1024 {
1025     BDRVQcowState *s = bs->opaque;
1026 
1027     bs->bl.write_zeroes_alignment = s->cluster_sectors;
1028 }
1029 
1030 static int qcow2_set_key(BlockDriverState *bs, const char *key)
1031 {
1032     BDRVQcowState *s = bs->opaque;
1033     uint8_t keybuf[16];
1034     int len, i;
1035 
1036     memset(keybuf, 0, 16);
1037     len = strlen(key);
1038     if (len > 16)
1039         len = 16;
1040     /* XXX: we could compress the chars to 7 bits to increase
1041        entropy */
1042     for(i = 0;i < len;i++) {
1043         keybuf[i] = key[i];
1044     }
1045     assert(bs->encrypted);
1046     s->crypt_method = s->crypt_method_header;
1047 
1048     if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
1049         return -1;
1050     if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
1051         return -1;
1052 #if 0
1053     /* test */
1054     {
1055         uint8_t in[16];
1056         uint8_t out[16];
1057         uint8_t tmp[16];
1058         for(i=0;i<16;i++)
1059             in[i] = i;
1060         AES_encrypt(in, tmp, &s->aes_encrypt_key);
1061         AES_decrypt(tmp, out, &s->aes_decrypt_key);
1062         for(i = 0; i < 16; i++)
1063             printf(" %02x", tmp[i]);
1064         printf("\n");
1065         for(i = 0; i < 16; i++)
1066             printf(" %02x", out[i]);
1067         printf("\n");
1068     }
1069 #endif
1070     return 0;
1071 }
1072 
1073 /* We have no actual commit/abort logic for qcow2, but we need to write out any
1074  * unwritten data if we reopen read-only. */
1075 static int qcow2_reopen_prepare(BDRVReopenState *state,
1076                                 BlockReopenQueue *queue, Error **errp)
1077 {
1078     int ret;
1079 
1080     if ((state->flags & BDRV_O_RDWR) == 0) {
1081         ret = bdrv_flush(state->bs);
1082         if (ret < 0) {
1083             return ret;
1084         }
1085 
1086         ret = qcow2_mark_clean(state->bs);
1087         if (ret < 0) {
1088             return ret;
1089         }
1090     }
1091 
1092     return 0;
1093 }
1094 
1095 static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
1096         int64_t sector_num, int nb_sectors, int *pnum)
1097 {
1098     BDRVQcowState *s = bs->opaque;
1099     uint64_t cluster_offset;
1100     int index_in_cluster, ret;
1101     int64_t status = 0;
1102 
1103     *pnum = nb_sectors;
1104     qemu_co_mutex_lock(&s->lock);
1105     ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
1106     qemu_co_mutex_unlock(&s->lock);
1107     if (ret < 0) {
1108         return ret;
1109     }
1110 
1111     if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
1112         !s->crypt_method) {
1113         index_in_cluster = sector_num & (s->cluster_sectors - 1);
1114         cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
1115         status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset;
1116     }
1117     if (ret == QCOW2_CLUSTER_ZERO) {
1118         status |= BDRV_BLOCK_ZERO;
1119     } else if (ret != QCOW2_CLUSTER_UNALLOCATED) {
1120         status |= BDRV_BLOCK_DATA;
1121     }
1122     return status;
1123 }
1124 
1125 /* handle reading after the end of the backing file */
1126 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
1127                   int64_t sector_num, int nb_sectors)
1128 {
1129     int n1;
1130     if ((sector_num + nb_sectors) <= bs->total_sectors)
1131         return nb_sectors;
1132     if (sector_num >= bs->total_sectors)
1133         n1 = 0;
1134     else
1135         n1 = bs->total_sectors - sector_num;
1136 
1137     qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1));
1138 
1139     return n1;
1140 }
1141 
1142 static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
1143                           int remaining_sectors, QEMUIOVector *qiov)
1144 {
1145     BDRVQcowState *s = bs->opaque;
1146     int index_in_cluster, n1;
1147     int ret;
1148     int cur_nr_sectors; /* number of sectors in current iteration */
1149     uint64_t cluster_offset = 0;
1150     uint64_t bytes_done = 0;
1151     QEMUIOVector hd_qiov;
1152     uint8_t *cluster_data = NULL;
1153 
1154     qemu_iovec_init(&hd_qiov, qiov->niov);
1155 
1156     qemu_co_mutex_lock(&s->lock);
1157 
1158     while (remaining_sectors != 0) {
1159 
1160         /* prepare next request */
1161         cur_nr_sectors = remaining_sectors;
1162         if (s->crypt_method) {
1163             cur_nr_sectors = MIN(cur_nr_sectors,
1164                 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
1165         }
1166 
1167         ret = qcow2_get_cluster_offset(bs, sector_num << 9,
1168             &cur_nr_sectors, &cluster_offset);
1169         if (ret < 0) {
1170             goto fail;
1171         }
1172 
1173         index_in_cluster = sector_num & (s->cluster_sectors - 1);
1174 
1175         qemu_iovec_reset(&hd_qiov);
1176         qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
1177             cur_nr_sectors * 512);
1178 
1179         switch (ret) {
1180         case QCOW2_CLUSTER_UNALLOCATED:
1181 
1182             if (bs->backing_hd) {
1183                 /* read from the base image */
1184                 n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov,
1185                     sector_num, cur_nr_sectors);
1186                 if (n1 > 0) {
1187                     QEMUIOVector local_qiov;
1188 
1189                     qemu_iovec_init(&local_qiov, hd_qiov.niov);
1190                     qemu_iovec_concat(&local_qiov, &hd_qiov, 0,
1191                                       n1 * BDRV_SECTOR_SIZE);
1192 
1193                     BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
1194                     qemu_co_mutex_unlock(&s->lock);
1195                     ret = bdrv_co_readv(bs->backing_hd, sector_num,
1196                                         n1, &local_qiov);
1197                     qemu_co_mutex_lock(&s->lock);
1198 
1199                     qemu_iovec_destroy(&local_qiov);
1200 
1201                     if (ret < 0) {
1202                         goto fail;
1203                     }
1204                 }
1205             } else {
1206                 /* Note: in this case, no need to wait */
1207                 qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
1208             }
1209             break;
1210 
1211         case QCOW2_CLUSTER_ZERO:
1212             qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors);
1213             break;
1214 
1215         case QCOW2_CLUSTER_COMPRESSED:
1216             /* TODO: add AIO support for compressed blocks? */
1217             ret = qcow2_decompress_cluster(bs, cluster_offset);
1218             if (ret < 0) {
1219                 goto fail;
1220             }
1221 
1222             qemu_iovec_from_buf(&hd_qiov, 0,
1223                 s->cluster_cache + index_in_cluster * 512,
1224                 512 * cur_nr_sectors);
1225             break;
1226 
1227         case QCOW2_CLUSTER_NORMAL:
1228             if ((cluster_offset & 511) != 0) {
1229                 ret = -EIO;
1230                 goto fail;
1231             }
1232 
1233             if (bs->encrypted) {
1234                 assert(s->crypt_method);
1235 
1236                 /*
1237                  * For encrypted images, read everything into a temporary
1238                  * contiguous buffer on which the AES functions can work.
1239                  */
1240                 if (!cluster_data) {
1241                     cluster_data =
1242                         qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS
1243                                                       * s->cluster_size);
1244                     if (cluster_data == NULL) {
1245                         ret = -ENOMEM;
1246                         goto fail;
1247                     }
1248                 }
1249 
1250                 assert(cur_nr_sectors <=
1251                     QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
1252                 qemu_iovec_reset(&hd_qiov);
1253                 qemu_iovec_add(&hd_qiov, cluster_data,
1254                     512 * cur_nr_sectors);
1255             }
1256 
1257             BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
1258             qemu_co_mutex_unlock(&s->lock);
1259             ret = bdrv_co_readv(bs->file,
1260                                 (cluster_offset >> 9) + index_in_cluster,
1261                                 cur_nr_sectors, &hd_qiov);
1262             qemu_co_mutex_lock(&s->lock);
1263             if (ret < 0) {
1264                 goto fail;
1265             }
1266             if (bs->encrypted) {
1267                 assert(s->crypt_method);
1268                 qcow2_encrypt_sectors(s, sector_num,  cluster_data,
1269                     cluster_data, cur_nr_sectors, 0, &s->aes_decrypt_key);
1270                 qemu_iovec_from_buf(qiov, bytes_done,
1271                     cluster_data, 512 * cur_nr_sectors);
1272             }
1273             break;
1274 
1275         default:
1276             g_assert_not_reached();
1277             ret = -EIO;
1278             goto fail;
1279         }
1280 
1281         remaining_sectors -= cur_nr_sectors;
1282         sector_num += cur_nr_sectors;
1283         bytes_done += cur_nr_sectors * 512;
1284     }
1285     ret = 0;
1286 
1287 fail:
1288     qemu_co_mutex_unlock(&s->lock);
1289 
1290     qemu_iovec_destroy(&hd_qiov);
1291     qemu_vfree(cluster_data);
1292 
1293     return ret;
1294 }
1295 
1296 static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
1297                            int64_t sector_num,
1298                            int remaining_sectors,
1299                            QEMUIOVector *qiov)
1300 {
1301     BDRVQcowState *s = bs->opaque;
1302     int index_in_cluster;
1303     int ret;
1304     int cur_nr_sectors; /* number of sectors in current iteration */
1305     uint64_t cluster_offset;
1306     QEMUIOVector hd_qiov;
1307     uint64_t bytes_done = 0;
1308     uint8_t *cluster_data = NULL;
1309     QCowL2Meta *l2meta = NULL;
1310 
1311     trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
1312                                  remaining_sectors);
1313 
1314     qemu_iovec_init(&hd_qiov, qiov->niov);
1315 
1316     s->cluster_cache_offset = -1; /* disable compressed cache */
1317 
1318     qemu_co_mutex_lock(&s->lock);
1319 
1320     while (remaining_sectors != 0) {
1321 
1322         l2meta = NULL;
1323 
1324         trace_qcow2_writev_start_part(qemu_coroutine_self());
1325         index_in_cluster = sector_num & (s->cluster_sectors - 1);
1326         cur_nr_sectors = remaining_sectors;
1327         if (bs->encrypted &&
1328             cur_nr_sectors >
1329             QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) {
1330             cur_nr_sectors =
1331                 QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster;
1332         }
1333 
1334         ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
1335             &cur_nr_sectors, &cluster_offset, &l2meta);
1336         if (ret < 0) {
1337             goto fail;
1338         }
1339 
1340         assert((cluster_offset & 511) == 0);
1341 
1342         qemu_iovec_reset(&hd_qiov);
1343         qemu_iovec_concat(&hd_qiov, qiov, bytes_done,
1344             cur_nr_sectors * 512);
1345 
1346         if (bs->encrypted) {
1347             assert(s->crypt_method);
1348             if (!cluster_data) {
1349                 cluster_data = qemu_try_blockalign(bs->file,
1350                                                    QCOW_MAX_CRYPT_CLUSTERS
1351                                                    * s->cluster_size);
1352                 if (cluster_data == NULL) {
1353                     ret = -ENOMEM;
1354                     goto fail;
1355                 }
1356             }
1357 
1358             assert(hd_qiov.size <=
1359                    QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
1360             qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size);
1361 
1362             qcow2_encrypt_sectors(s, sector_num, cluster_data,
1363                 cluster_data, cur_nr_sectors, 1, &s->aes_encrypt_key);
1364 
1365             qemu_iovec_reset(&hd_qiov);
1366             qemu_iovec_add(&hd_qiov, cluster_data,
1367                 cur_nr_sectors * 512);
1368         }
1369 
1370         ret = qcow2_pre_write_overlap_check(bs, 0,
1371                 cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE,
1372                 cur_nr_sectors * BDRV_SECTOR_SIZE);
1373         if (ret < 0) {
1374             goto fail;
1375         }
1376 
1377         qemu_co_mutex_unlock(&s->lock);
1378         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
1379         trace_qcow2_writev_data(qemu_coroutine_self(),
1380                                 (cluster_offset >> 9) + index_in_cluster);
1381         ret = bdrv_co_writev(bs->file,
1382                              (cluster_offset >> 9) + index_in_cluster,
1383                              cur_nr_sectors, &hd_qiov);
1384         qemu_co_mutex_lock(&s->lock);
1385         if (ret < 0) {
1386             goto fail;
1387         }
1388 
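        /* Now that the data is on disk, update the L2 tables of all clusters
         * allocated for this request and wake up any requests waiting for
         * those allocations. */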
1389         while (l2meta != NULL) {
1390             QCowL2Meta *next;
1391 
1392             ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
1393             if (ret < 0) {
1394                 goto fail;
1395             }
1396 
1397             /* Take the request off the list of running requests */
1398             if (l2meta->nb_clusters != 0) {
1399                 QLIST_REMOVE(l2meta, next_in_flight);
1400             }
1401 
1402             qemu_co_queue_restart_all(&l2meta->dependent_requests);
1403 
1404             next = l2meta->next;
1405             g_free(l2meta);
1406             l2meta = next;
1407         }
1408 
1409         remaining_sectors -= cur_nr_sectors;
1410         sector_num += cur_nr_sectors;
1411         bytes_done += cur_nr_sectors * 512;
1412         trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors);
1413     }
1414     ret = 0;
1415 
1416 fail:
1417     qemu_co_mutex_unlock(&s->lock);
1418 
1419     while (l2meta != NULL) {
1420         QCowL2Meta *next;
1421 
1422         if (l2meta->nb_clusters != 0) {
1423             QLIST_REMOVE(l2meta, next_in_flight);
1424         }
1425         qemu_co_queue_restart_all(&l2meta->dependent_requests);
1426 
1427         next = l2meta->next;
1428         g_free(l2meta);
1429         l2meta = next;
1430     }
1431 
1432     qemu_iovec_destroy(&hd_qiov);
1433     qemu_vfree(cluster_data);
1434     trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
1435 
1436     return ret;
1437 }
1438 
1439 static void qcow2_close(BlockDriverState *bs)
1440 {
1441     BDRVQcowState *s = bs->opaque;
1442     qemu_vfree(s->l1_table);
1443     /* else pre-write overlap checks in cache_destroy may crash */
1444     s->l1_table = NULL;
1445 
1446     if (!(bs->open_flags & BDRV_O_INCOMING)) {
1447         int ret1, ret2;
1448 
1449         ret1 = qcow2_cache_flush(bs, s->l2_table_cache);
1450         ret2 = qcow2_cache_flush(bs, s->refcount_block_cache);
1451 
1452         if (ret1) {
1453             error_report("Failed to flush the L2 table cache: %s",
1454                          strerror(-ret1));
1455         }
1456         if (ret2) {
1457             error_report("Failed to flush the refcount block cache: %s",
1458                          strerror(-ret2));
1459         }
1460 
1461         if (!ret1 && !ret2) {
1462             qcow2_mark_clean(bs);
1463         }
1464     }
1465 
1466     qcow2_cache_destroy(bs, s->l2_table_cache);
1467     qcow2_cache_destroy(bs, s->refcount_block_cache);
1468 
1469     g_free(s->unknown_header_fields);
1470     cleanup_unknown_header_ext(bs);
1471 
1472     g_free(s->image_backing_file);
1473     g_free(s->image_backing_format);
1474 
1475     g_free(s->cluster_cache);
1476     qemu_vfree(s->cluster_data);
1477     qcow2_refcount_close(bs);
1478     qcow2_free_snapshots(bs);
1479 }
1480 
1481 static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp)
1482 {
1483     BDRVQcowState *s = bs->opaque;
1484     int flags = s->flags;
1485     AES_KEY aes_encrypt_key;
1486     AES_KEY aes_decrypt_key;
1487     uint32_t crypt_method = 0;
1488     QDict *options;
1489     Error *local_err = NULL;
1490     int ret;
1491 
1492     /*
1493      * Backing files are read-only, which makes all of their metadata immutable;
1494      * that means we don't have to worry about reopening them here.
1495      */
1496 
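    /* The encryption key is only known at runtime (set via qcow2_set_key())
     * and cannot be restored by qcow2_open(), so preserve the encryption
     * state across the close/reopen cycle. */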
1497     if (bs->encrypted) {
1498         assert(s->crypt_method);
1499         crypt_method = s->crypt_method;
1500         memcpy(&aes_encrypt_key, &s->aes_encrypt_key, sizeof(aes_encrypt_key));
1501         memcpy(&aes_decrypt_key, &s->aes_decrypt_key, sizeof(aes_decrypt_key));
1502     }
1503 
1504     qcow2_close(bs);
1505 
1506     bdrv_invalidate_cache(bs->file, &local_err);
1507     if (local_err) {
1508         error_propagate(errp, local_err);
1509         return;
1510     }
1511 
1512     memset(s, 0, sizeof(BDRVQcowState));
1513     options = qdict_clone_shallow(bs->options);
1514 
1515     ret = qcow2_open(bs, options, flags, &local_err);
1516     QDECREF(options);
1517     if (local_err) {
1518         error_setg(errp, "Could not reopen qcow2 layer: %s",
1519                    error_get_pretty(local_err));
1520         error_free(local_err);
1521         return;
1522     } else if (ret < 0) {
1523         error_setg_errno(errp, -ret, "Could not reopen qcow2 layer");
1524         return;
1525     }
1526 
1527     if (bs->encrypted) {
1528         s->crypt_method = crypt_method;
1529         memcpy(&s->aes_encrypt_key, &aes_encrypt_key, sizeof(aes_encrypt_key));
1530         memcpy(&s->aes_decrypt_key, &aes_decrypt_key, sizeof(aes_decrypt_key));
1531     }
1532 }
1533 
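/*
 * Appends a header extension with the given magic, length and payload to buf.
 * Returns the number of bytes used in buf (the payload is padded up to a
 * multiple of eight bytes), or -ENOSPC if it does not fit into buflen bytes.
 */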
1534 static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
1535     size_t len, size_t buflen)
1536 {
1537     QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
1538     size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
1539 
1540     if (buflen < ext_len) {
1541         return -ENOSPC;
1542     }
1543 
1544     *ext_backing_fmt = (QCowExtension) {
1545         .magic  = cpu_to_be32(magic),
1546         .len    = cpu_to_be32(len),
1547     };
1548     memcpy(buf + sizeof(QCowExtension), s, len);
1549 
1550     return ext_len;
1551 }
1552 
1553 /*
1554  * Updates the qcow2 header, including the variable length parts of it, i.e.
1555  * the backing file name and all extensions. qcow2 was not designed to allow
1556  * such changes, so if we run out of space (we can only use the first cluster)
1557  * this function may fail.
1558  *
1559  * Returns 0 on success, -errno in error cases.
1560  */
1561 int qcow2_update_header(BlockDriverState *bs)
1562 {
1563     BDRVQcowState *s = bs->opaque;
1564     QCowHeader *header;
1565     char *buf;
1566     size_t buflen = s->cluster_size;
1567     int ret;
1568     uint64_t total_size;
1569     uint32_t refcount_table_clusters;
1570     size_t header_length;
1571     Qcow2UnknownHeaderExtension *uext;
1572 
1573     buf = qemu_blockalign(bs, buflen);
1574 
1575     /* Header structure */
1576     header = (QCowHeader*) buf;
1577 
1578     if (buflen < sizeof(*header)) {
1579         ret = -ENOSPC;
1580         goto fail;
1581     }
1582 
1583     header_length = sizeof(*header) + s->unknown_header_fields_size;
1584     total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
1585     refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
1586 
1587     *header = (QCowHeader) {
1588         /* Version 2 fields */
1589         .magic                  = cpu_to_be32(QCOW_MAGIC),
1590         .version                = cpu_to_be32(s->qcow_version),
1591         .backing_file_offset    = 0,
1592         .backing_file_size      = 0,
1593         .cluster_bits           = cpu_to_be32(s->cluster_bits),
1594         .size                   = cpu_to_be64(total_size),
1595         .crypt_method           = cpu_to_be32(s->crypt_method_header),
1596         .l1_size                = cpu_to_be32(s->l1_size),
1597         .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
1598         .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
1599         .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
1600         .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
1601         .snapshots_offset       = cpu_to_be64(s->snapshots_offset),
1602 
1603         /* Version 3 fields */
1604         .incompatible_features  = cpu_to_be64(s->incompatible_features),
1605         .compatible_features    = cpu_to_be64(s->compatible_features),
1606         .autoclear_features     = cpu_to_be64(s->autoclear_features),
1607         .refcount_order         = cpu_to_be32(s->refcount_order),
1608         .header_length          = cpu_to_be32(header_length),
1609     };
1610 
1611     /* For older versions, write a shorter header */
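    /* (A version 2 header ends right before the version 3 feature fields,
     *  i.e. at offsetof(QCowHeader, incompatible_features) == 72 bytes.) */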
1612     switch (s->qcow_version) {
1613     case 2:
1614         ret = offsetof(QCowHeader, incompatible_features);
1615         break;
1616     case 3:
1617         ret = sizeof(*header);
1618         break;
1619     default:
1620         ret = -EINVAL;
1621         goto fail;
1622     }
1623 
1624     buf += ret;
1625     buflen -= ret;
1626     memset(buf, 0, buflen);
1627 
1628     /* Preserve any unknown field in the header */
1629     if (s->unknown_header_fields_size) {
1630         if (buflen < s->unknown_header_fields_size) {
1631             ret = -ENOSPC;
1632             goto fail;
1633         }
1634 
1635         memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
1636         buf += s->unknown_header_fields_size;
1637         buflen -= s->unknown_header_fields_size;
1638     }
1639 
1640     /* Backing file format header extension */
1641     if (s->image_backing_format) {
1642         ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
1643                              s->image_backing_format,
1644                              strlen(s->image_backing_format),
1645                              buflen);
1646         if (ret < 0) {
1647             goto fail;
1648         }
1649 
1650         buf += ret;
1651         buflen -= ret;
1652     }
1653 
1654     /* Feature table */
1655     Qcow2Feature features[] = {
1656         {
1657             .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
1658             .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
1659             .name = "dirty bit",
1660         },
1661         {
1662             .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
1663             .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
1664             .name = "corrupt bit",
1665         },
1666         {
1667             .type = QCOW2_FEAT_TYPE_COMPATIBLE,
1668             .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
1669             .name = "lazy refcounts",
1670         },
1671     };
1672 
1673     ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
1674                          features, sizeof(features), buflen);
1675     if (ret < 0) {
1676         goto fail;
1677     }
1678     buf += ret;
1679     buflen -= ret;
1680 
1681     /* Keep unknown header extensions */
1682     QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
1683         ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
1684         if (ret < 0) {
1685             goto fail;
1686         }
1687 
1688         buf += ret;
1689         buflen -= ret;
1690     }
1691 
1692     /* End of header extensions */
1693     ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
1694     if (ret < 0) {
1695         goto fail;
1696     }
1697 
1698     buf += ret;
1699     buflen -= ret;
1700 
1701     /* Backing file name */
1702     if (s->image_backing_file) {
1703         size_t backing_file_len = strlen(s->image_backing_file);
1704 
1705         if (buflen < backing_file_len) {
1706             ret = -ENOSPC;
1707             goto fail;
1708         }
1709 
1710         /* Using strncpy is ok here, since buf does not need to be NUL-terminated. */
1711         strncpy(buf, s->image_backing_file, buflen);
1712 
1713         header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
1714         header->backing_file_size   = cpu_to_be32(backing_file_len);
1715     }
1716 
1717     /* Write the new header */
1718     ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size);
1719     if (ret < 0) {
1720         goto fail;
1721     }
1722 
1723     ret = 0;
1724 fail:
1725     qemu_vfree(header);
1726     return ret;
1727 }
1728 
1729 static int qcow2_change_backing_file(BlockDriverState *bs,
1730     const char *backing_file, const char *backing_fmt)
1731 {
1732     BDRVQcowState *s = bs->opaque;
1733 
1734     pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
1735     pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
1736 
1737     g_free(s->image_backing_file);
1738     g_free(s->image_backing_format);
1739 
1740     s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL;
1741     s->image_backing_format = backing_fmt ? g_strdup(bs->backing_format) : NULL;
1742 
1743     return qcow2_update_header(bs);
1744 }
1745 
1746 static int preallocate(BlockDriverState *bs)
1747 {
1748     uint64_t nb_sectors;
1749     uint64_t offset;
1750     uint64_t host_offset = 0;
1751     int num;
1752     int ret;
1753     QCowL2Meta *meta;
1754 
1755     nb_sectors = bdrv_nb_sectors(bs);
1756     offset = 0;
1757 
1758     while (nb_sectors) {
1759         num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS);
1760         ret = qcow2_alloc_cluster_offset(bs, offset, &num,
1761                                          &host_offset, &meta);
1762         if (ret < 0) {
1763             return ret;
1764         }
1765 
1766         while (meta) {
1767             QCowL2Meta *next = meta->next;
1768 
1769             ret = qcow2_alloc_cluster_link_l2(bs, meta);
1770             if (ret < 0) {
1771                 qcow2_free_any_clusters(bs, meta->alloc_offset,
1772                                         meta->nb_clusters, QCOW2_DISCARD_NEVER);
1773                 return ret;
1774             }
1775 
1776             /* There are no dependent requests, but we need to remove our
1777              * request from the list of in-flight requests */
1778             QLIST_REMOVE(meta, next_in_flight);
1779 
1780             g_free(meta);
1781             meta = next;
1782         }
1783 
1784         /* TODO Preallocate data if requested */
1785 
1786         nb_sectors -= num;
1787         offset += num << BDRV_SECTOR_BITS;
1788     }
1789 
1790     /*
1791      * It is expected that the image file is large enough to actually contain
1792      * all of the allocated clusters (otherwise we get failing reads after
1793      * EOF). Extend the image to the last allocated sector.
1794      */
1795     if (host_offset != 0) {
1796         uint8_t buf[BDRV_SECTOR_SIZE];
1797         memset(buf, 0, BDRV_SECTOR_SIZE);
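        /* Writing a single zeroed sector at the very end of the last allocated
         * cluster run makes the protocol layer grow the file up to that point */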
1798         ret = bdrv_write(bs->file, (host_offset >> BDRV_SECTOR_BITS) + num - 1,
1799                          buf, 1);
1800         if (ret < 0) {
1801             return ret;
1802         }
1803     }
1804 
1805     return 0;
1806 }
1807 
1808 static int qcow2_create2(const char *filename, int64_t total_size,
1809                          const char *backing_file, const char *backing_format,
1810                          int flags, size_t cluster_size, PreallocMode prealloc,
1811                          QemuOpts *opts, int version, int refcount_order,
1812                          Error **errp)
1813 {
1814     /* Calculate cluster_bits */
1815     int cluster_bits;
1816     cluster_bits = ctz32(cluster_size);
1817     if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
1818         (1 << cluster_bits) != cluster_size)
1819     {
1820         error_setg(errp, "Cluster size must be a power of two between %d and "
1821                    "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
1822         return -EINVAL;
1823     }
1824 
1825     /*
1826      * Open the image file and write a minimal qcow2 header.
1827      *
1828      * We keep things simple and start with a zero-sized image. We also
1829      * do without refcount blocks or an L1 table for now. We'll fix the
1830      * inconsistency later.
1831      *
1832      * We do need a refcount table because growing the refcount table means
1833      * allocating two new refcount blocks - the second of which would be at
1834      * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
1835      * size for any qcow2 image.
1836      */
1837     BlockDriverState* bs;
1838     QCowHeader *header;
1839     uint64_t* refcount_table;
1840     Error *local_err = NULL;
1841     int ret;
1842 
1843     if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
1844         /* Note: The following calculation does not need to be exact; if it is a
1845          * bit off, either some bytes will be "leaked" (which is fine) or we
1846          * will need to increase the file size by some bytes (which is fine,
1847          * too, as long as the bulk is allocated here). Therefore, using
1848          * floating point arithmetic is fine. */
1849         int64_t meta_size = 0;
1850         uint64_t nreftablee, nrefblocke, nl1e, nl2e;
1851         int64_t aligned_total_size = align_offset(total_size, cluster_size);
1852         int refblock_bits, refblock_size;
1853         /* refcount entry size in bytes */
1854         double rces = (1 << refcount_order) / 8.;
1855 
1856         /* see qcow2_open() */
1857         refblock_bits = cluster_bits - (refcount_order - 3);
1858         refblock_size = 1 << refblock_bits;
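        /* e.g. with the default 64k clusters (cluster_bits == 16) and
         * refcount_order == 4 (16-bit refcounts): rces == 2 bytes,
         * refblock_bits == 15 and refblock_size == 32768 entries per block */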
1859 
1860         /* header: 1 cluster */
1861         meta_size += cluster_size;
1862 
1863         /* total size of L2 tables */
1864         nl2e = aligned_total_size / cluster_size;
1865         nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t));
1866         meta_size += nl2e * sizeof(uint64_t);
1867 
1868         /* total size of L1 tables */
1869         nl1e = nl2e * sizeof(uint64_t) / cluster_size;
1870         nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t));
1871         meta_size += nl1e * sizeof(uint64_t);
1872 
1873         /* total size of refcount blocks
1874          *
1875          * note: every host cluster is reference-counted, including metadata
1876          * (even refcount blocks are recursively included).
1877          * Let:
1878          *   a = total_size (this is the guest disk size)
1879          *   m = meta size not including refcount blocks and refcount tables
1880          *   c = cluster size
1881          *   y1 = number of refcount block entries
1882          *   y2 = meta size including everything
1883          *   rces = refcount entry size in bytes
1884          * then,
1885          *   y1 = (y2 + a)/c
1886          *   y2 = y1 * rces + y1 * rces * sizeof(u64) / c + m
1887          * we can get y1:
1888          *   y1 = (a + m) / (c - rces - rces * sizeof(u64) / c)
1889          */
1890         nrefblocke = (aligned_total_size + meta_size + cluster_size)
1891                    / (cluster_size - rces - rces * sizeof(uint64_t)
1892                                                  / cluster_size);
1893         meta_size += DIV_ROUND_UP(nrefblocke, refblock_size) * cluster_size;
1894 
1895         /* total size of refcount tables */
1896         nreftablee = nrefblocke / refblock_size;
1897         nreftablee = align_offset(nreftablee, cluster_size / sizeof(uint64_t));
1898         meta_size += nreftablee * sizeof(uint64_t);
1899 
1900         qemu_opt_set_number(opts, BLOCK_OPT_SIZE,
1901                             aligned_total_size + meta_size, &error_abort);
1902         qemu_opt_set(opts, BLOCK_OPT_PREALLOC, PreallocMode_lookup[prealloc],
1903                      &error_abort);
1904     }
1905 
1906     ret = bdrv_create_file(filename, opts, &local_err);
1907     if (ret < 0) {
1908         error_propagate(errp, local_err);
1909         return ret;
1910     }
1911 
1912     bs = NULL;
1913     ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
1914                     NULL, &local_err);
1915     if (ret < 0) {
1916         error_propagate(errp, local_err);
1917         return ret;
1918     }
1919 
1920     /* Write the header */
1921     QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
1922     header = g_malloc0(cluster_size);
1923     *header = (QCowHeader) {
1924         .magic                      = cpu_to_be32(QCOW_MAGIC),
1925         .version                    = cpu_to_be32(version),
1926         .cluster_bits               = cpu_to_be32(cluster_bits),
1927         .size                       = cpu_to_be64(0),
1928         .l1_table_offset            = cpu_to_be64(0),
1929         .l1_size                    = cpu_to_be32(0),
1930         .refcount_table_offset      = cpu_to_be64(cluster_size),
1931         .refcount_table_clusters    = cpu_to_be32(1),
1932         .refcount_order             = cpu_to_be32(refcount_order),
1933         .header_length              = cpu_to_be32(sizeof(*header)),
1934     };
1935 
1936     if (flags & BLOCK_FLAG_ENCRYPT) {
1937         header->crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
1938     } else {
1939         header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1940     }
1941 
1942     if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) {
1943         header->compatible_features |=
1944             cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
1945     }
1946 
1947     ret = bdrv_pwrite(bs, 0, header, cluster_size);
1948     g_free(header);
1949     if (ret < 0) {
1950         error_setg_errno(errp, -ret, "Could not write qcow2 header");
1951         goto out;
1952     }
1953 
1954     /* Write a refcount table with one refcount block */
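    /* The two-cluster buffer covers cluster 1 (the refcount table, whose first
     * entry points at offset 2 * cluster_size) and cluster 2 (the still empty
     * first refcount block). */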
1955     refcount_table = g_malloc0(2 * cluster_size);
1956     refcount_table[0] = cpu_to_be64(2 * cluster_size);
1957     ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size);
1958     g_free(refcount_table);
1959 
1960     if (ret < 0) {
1961         error_setg_errno(errp, -ret, "Could not write refcount table");
1962         goto out;
1963     }
1964 
1965     bdrv_unref(bs);
1966     bs = NULL;
1967 
1968     /*
1969      * And now open the image and make it consistent first (i.e. increase the
1970      * refcounts of the clusters that are occupied by the header and the refcount
1971      * table)
1972      */
1973     ret = bdrv_open(&bs, filename, NULL, NULL,
1974                     BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH,
1975                     &bdrv_qcow2, &local_err);
1976     if (ret < 0) {
1977         error_propagate(errp, local_err);
1978         goto out;
1979     }
1980 
1981     ret = qcow2_alloc_clusters(bs, 3 * cluster_size);
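    /* The very first allocation must return offset 0; it covers the three
     * clusters occupied by the header, the refcount table and the first
     * refcount block. */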
1982     if (ret < 0) {
1983         error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
1984                          "header and refcount table");
1985         goto out;
1986 
1987     } else if (ret != 0) {
1988         error_report("Huh, first cluster in empty image is already in use?");
1989         abort();
1990     }
1991 
1992     /* Okay, now that we have a valid image, let's give it the right size */
1993     ret = bdrv_truncate(bs, total_size);
1994     if (ret < 0) {
1995         error_setg_errno(errp, -ret, "Could not resize image");
1996         goto out;
1997     }
1998 
1999     /* Want a backing file? There you go. */
2000     if (backing_file) {
2001         ret = bdrv_change_backing_file(bs, backing_file, backing_format);
2002         if (ret < 0) {
2003             error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
2004                              "with format '%s'", backing_file, backing_format);
2005             goto out;
2006         }
2007     }
2008 
2009     /* And if we're supposed to preallocate metadata, do that now */
2010     if (prealloc != PREALLOC_MODE_OFF) {
2011         BDRVQcowState *s = bs->opaque;
2012         qemu_co_mutex_lock(&s->lock);
2013         ret = preallocate(bs);
2014         qemu_co_mutex_unlock(&s->lock);
2015         if (ret < 0) {
2016             error_setg_errno(errp, -ret, "Could not preallocate metadata");
2017             goto out;
2018         }
2019     }
2020 
2021     bdrv_unref(bs);
2022     bs = NULL;
2023 
2024     /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */
2025     ret = bdrv_open(&bs, filename, NULL, NULL,
2026                     BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING,
2027                     &bdrv_qcow2, &local_err);
2028     if (local_err) {
2029         error_propagate(errp, local_err);
2030         goto out;
2031     }
2032 
2033     ret = 0;
2034 out:
2035     if (bs) {
2036         bdrv_unref(bs);
2037     }
2038     return ret;
2039 }
2040 
2041 static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
2042 {
2043     char *backing_file = NULL;
2044     char *backing_fmt = NULL;
2045     char *buf = NULL;
2046     uint64_t size = 0;
2047     int flags = 0;
2048     size_t cluster_size = DEFAULT_CLUSTER_SIZE;
2049     PreallocMode prealloc;
2050     int version = 3;
2051     uint64_t refcount_bits = 16;
2052     int refcount_order;
2053     Error *local_err = NULL;
2054     int ret;
2055 
2056     /* Read out options */
2057     size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
2058                     BDRV_SECTOR_SIZE);
2059     backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
2060     backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT);
2061     if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) {
2062         flags |= BLOCK_FLAG_ENCRYPT;
2063     }
2064     cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
2065                                          DEFAULT_CLUSTER_SIZE);
2066     buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
2067     prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
2068                                PREALLOC_MODE_MAX, PREALLOC_MODE_OFF,
2069                                &local_err);
2070     if (local_err) {
2071         error_propagate(errp, local_err);
2072         ret = -EINVAL;
2073         goto finish;
2074     }
2075     g_free(buf);
2076     buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL);
2077     if (!buf) {
2078         /* keep the default */
2079     } else if (!strcmp(buf, "0.10")) {
2080         version = 2;
2081     } else if (!strcmp(buf, "1.1")) {
2082         version = 3;
2083     } else {
2084         error_setg(errp, "Invalid compatibility level: '%s'", buf);
2085         ret = -EINVAL;
2086         goto finish;
2087     }
2088 
2089     if (qemu_opt_get_bool_del(opts, BLOCK_OPT_LAZY_REFCOUNTS, false)) {
2090         flags |= BLOCK_FLAG_LAZY_REFCOUNTS;
2091     }
2092 
2093     if (backing_file && prealloc != PREALLOC_MODE_OFF) {
2094         error_setg(errp, "Backing file and preallocation cannot be used at "
2095                    "the same time");
2096         ret = -EINVAL;
2097         goto finish;
2098     }
2099 
2100     if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) {
2101         error_setg(errp, "Lazy refcounts only supported with compatibility "
2102                    "level 1.1 and above (use compat=1.1 or greater)");
2103         ret = -EINVAL;
2104         goto finish;
2105     }
2106 
2107     refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS,
2108                                             refcount_bits);
2109     if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) {
2110         error_setg(errp, "Refcount width must be a power of two and may not "
2111                    "exceed 64 bits");
2112         ret = -EINVAL;
2113         goto finish;
2114     }
2115 
2116     if (version < 3 && refcount_bits != 16) {
2117         error_setg(errp, "Different refcount widths than 16 bits require "
2118                    "compatibility level 1.1 or above (use compat=1.1 or "
2119                    "greater)");
2120         ret = -EINVAL;
2121         goto finish;
2122     }
2123 
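    /* refcount_order is the log2 of the refcount width; e.g. the default of
     * 16 bits yields refcount_order == 4 */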
2124     refcount_order = ctz32(refcount_bits);
2125 
2126     ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags,
2127                         cluster_size, prealloc, opts, version, refcount_order,
2128                         &local_err);
2129     if (local_err) {
2130         error_propagate(errp, local_err);
2131     }
2132 
2133 finish:
2134     g_free(backing_file);
2135     g_free(backing_fmt);
2136     g_free(buf);
2137     return ret;
2138 }
2139 
2140 static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
2141     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
2142 {
2143     int ret;
2144     BDRVQcowState *s = bs->opaque;
2145 
2146     /* Misaligned zero writes will be emulated by the block layer (-ENOTSUP) */
2147     if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) {
2148         return -ENOTSUP;
2149     }
2150 
2151     /* Whatever is left can use real zero clusters */
2152     qemu_co_mutex_lock(&s->lock);
2153     ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
2154         nb_sectors);
2155     qemu_co_mutex_unlock(&s->lock);
2156 
2157     return ret;
2158 }
2159 
2160 static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
2161     int64_t sector_num, int nb_sectors)
2162 {
2163     int ret;
2164     BDRVQcowState *s = bs->opaque;
2165 
2166     qemu_co_mutex_lock(&s->lock);
2167     ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
2168         nb_sectors, QCOW2_DISCARD_REQUEST, false);
2169     qemu_co_mutex_unlock(&s->lock);
2170     return ret;
2171 }
2172 
2173 static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
2174 {
2175     BDRVQcowState *s = bs->opaque;
2176     int64_t new_l1_size;
2177     int ret;
2178 
2179     if (offset & 511) {
2180         error_report("The new size must be a multiple of 512");
2181         return -EINVAL;
2182     }
2183 
2184     /* cannot proceed if image has snapshots */
2185     if (s->nb_snapshots) {
2186         error_report("Can't resize an image which has snapshots");
2187         return -ENOTSUP;
2188     }
2189 
2190     /* shrinking is currently not supported */
2191     if (offset < bs->total_sectors * 512) {
2192         error_report("qcow2 doesn't support shrinking images yet");
2193         return -ENOTSUP;
2194     }
2195 
2196     new_l1_size = size_to_l1(s, offset);
2197     ret = qcow2_grow_l1_table(bs, new_l1_size, true);
2198     if (ret < 0) {
2199         return ret;
2200     }
2201 
2202     /* write updated header.size */
2203     offset = cpu_to_be64(offset);
2204     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
2205                            &offset, sizeof(uint64_t));
2206     if (ret < 0) {
2207         return ret;
2208     }
2209 
2210     s->l1_vm_state_index = new_l1_size;
2211     return 0;
2212 }
2213 
2214 /* XXX: put compressed sectors first, then all the cluster aligned
2215    tables to avoid losing bytes in alignment */
2216 static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
2217                                   const uint8_t *buf, int nb_sectors)
2218 {
2219     BDRVQcowState *s = bs->opaque;
2220     z_stream strm;
2221     int ret, out_len;
2222     uint8_t *out_buf;
2223     uint64_t cluster_offset;
2224 
2225     if (nb_sectors == 0) {
2226         /* align end of file to a sector boundary to ease reading with
2227            sector based I/Os */
2228         cluster_offset = bdrv_getlength(bs->file);
2229         return bdrv_truncate(bs->file, cluster_offset);
2230     }
2231 
2232     if (nb_sectors != s->cluster_sectors) {
2233         ret = -EINVAL;
2234 
2235         /* Zero-pad last write if image size is not cluster aligned */
2236         if (sector_num + nb_sectors == bs->total_sectors &&
2237             nb_sectors < s->cluster_sectors) {
2238             uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size);
2239             memset(pad_buf, 0, s->cluster_size);
2240             memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE);
2241             ret = qcow2_write_compressed(bs, sector_num,
2242                                          pad_buf, s->cluster_sectors);
2243             qemu_vfree(pad_buf);
2244         }
2245         return ret;
2246     }
2247 
2248     out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
2249 
2250     /* default compression level, small window, no zlib header */
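    /* (windowBits == -12 selects a raw deflate stream without zlib
     *  header/trailer and a 4 KB window; memLevel 9 uses the maximum amount
     *  of internal state memory) */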
2251     memset(&strm, 0, sizeof(strm));
2252     ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
2253                        Z_DEFLATED, -12,
2254                        9, Z_DEFAULT_STRATEGY);
2255     if (ret != 0) {
2256         ret = -EINVAL;
2257         goto fail;
2258     }
2259 
2260     strm.avail_in = s->cluster_size;
2261     strm.next_in = (uint8_t *)buf;
2262     strm.avail_out = s->cluster_size;
2263     strm.next_out = out_buf;
2264 
2265     ret = deflate(&strm, Z_FINISH);
2266     if (ret != Z_STREAM_END && ret != Z_OK) {
2267         deflateEnd(&strm);
2268         ret = -EINVAL;
2269         goto fail;
2270     }
2271     out_len = strm.next_out - out_buf;
2272 
2273     deflateEnd(&strm);
2274 
2275     if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
2276         /* could not compress: write normal cluster */
2277         ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
2278         if (ret < 0) {
2279             goto fail;
2280         }
2281     } else {
2282         cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
2283             sector_num << 9, out_len);
2284         if (!cluster_offset) {
2285             ret = -EIO;
2286             goto fail;
2287         }
2288         cluster_offset &= s->cluster_offset_mask;
2289 
2290         ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len);
2291         if (ret < 0) {
2292             goto fail;
2293         }
2294 
2295         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
2296         ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
2297         if (ret < 0) {
2298             goto fail;
2299         }
2300     }
2301 
2302     ret = 0;
2303 fail:
2304     g_free(out_buf);
2305     return ret;
2306 }
2307 
2308 static int make_completely_empty(BlockDriverState *bs)
2309 {
2310     BDRVQcowState *s = bs->opaque;
2311     int ret, l1_clusters;
2312     int64_t offset;
2313     uint64_t *new_reftable = NULL;
2314     uint64_t rt_entry, l1_size2;
2315     struct {
2316         uint64_t l1_offset;
2317         uint64_t reftable_offset;
2318         uint32_t reftable_clusters;
2319     } QEMU_PACKED l1_ofs_rt_ofs_cls;
2320 
2321     ret = qcow2_cache_empty(bs, s->l2_table_cache);
2322     if (ret < 0) {
2323         goto fail;
2324     }
2325 
2326     ret = qcow2_cache_empty(bs, s->refcount_block_cache);
2327     if (ret < 0) {
2328         goto fail;
2329     }
2330 
2331     /* Refcounts will be broken utterly */
2332     ret = qcow2_mark_dirty(bs);
2333     if (ret < 0) {
2334         goto fail;
2335     }
2336 
2337     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
2338 
2339     l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
2340     l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t);
2341 
2342     /* After this call, neither the in-memory nor the on-disk refcount
2343      * information accurately describe the actual references */
2344 
2345     ret = bdrv_write_zeroes(bs->file, s->l1_table_offset / BDRV_SECTOR_SIZE,
2346                             l1_clusters * s->cluster_sectors, 0);
2347     if (ret < 0) {
2348         goto fail_broken_refcounts;
2349     }
2350     memset(s->l1_table, 0, l1_size2);
2351 
2352     BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
2353 
2354     /* Overwrite enough clusters at the beginning of the image to place
2355      * the refcount table, a refcount block and the L1 table in; this may
2356      * overwrite parts of the existing refcount and L1 table, which is not
2357      * an issue because the dirty flag is set, complete data loss is in fact
2358      * desired and partial data loss is consequently fine as well */
2359     ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE,
2360                             (2 + l1_clusters) * s->cluster_size /
2361                             BDRV_SECTOR_SIZE, 0);
2362     /* This call (even if it failed overall) may have overwritten on-disk
2363      * refcount structures; in that case, the in-memory refcount information
2364      * will probably differ from the on-disk information which makes the BDS
2365      * unusable */
2366     if (ret < 0) {
2367         goto fail_broken_refcounts;
2368     }
2369 
2370     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
2371     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
2372 
2373     /* "Create" an empty reftable (one cluster) directly after the image
2374      * header and an empty L1 table three clusters after the image header;
2375      * the cluster between those two will be used as the first refblock */
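    /* Resulting layout (in clusters): 0 = image header, 1 = refcount table,
     * 2 = first refcount block, 3 and following = L1 table */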
2376     cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size);
2377     cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size);
2378     cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1);
2379     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
2380                            &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls));
2381     if (ret < 0) {
2382         goto fail_broken_refcounts;
2383     }
2384 
2385     s->l1_table_offset = 3 * s->cluster_size;
2386 
2387     new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
2388     if (!new_reftable) {
2389         ret = -ENOMEM;
2390         goto fail_broken_refcounts;
2391     }
2392 
2393     s->refcount_table_offset = s->cluster_size;
2394     s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
2395 
2396     g_free(s->refcount_table);
2397     s->refcount_table = new_reftable;
2398     new_reftable = NULL;
2399 
2400     /* Now the in-memory refcount information again corresponds to the on-disk
2401      * information (reftable is empty and no refblocks (the refblock cache is
2402      * empty)); however, this means some clusters (e.g. the image header) are
2403      * referenced but not refcounted, while the normal qcow2 code assumes that
2404      * the in-memory information is always correct */
2405 
2406     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
2407 
2408     /* Enter the first refblock into the reftable */
2409     rt_entry = cpu_to_be64(2 * s->cluster_size);
2410     ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
2411                            &rt_entry, sizeof(rt_entry));
2412     if (ret < 0) {
2413         goto fail_broken_refcounts;
2414     }
2415     s->refcount_table[0] = 2 * s->cluster_size;
2416 
2417     s->free_cluster_index = 0;
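    /* Allocate the header, reftable, refblock and L1 clusters from the start
     * of the image again; all of them must be covered by the single refcount
     * block installed above (see the assertion below). */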
2418     assert(3 + l1_clusters <= s->refcount_block_size);
2419     offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2);
2420     if (offset < 0) {
2421         ret = offset;
2422         goto fail_broken_refcounts;
2423     } else if (offset > 0) {
2424         error_report("First cluster in emptied image is in use");
2425         abort();
2426     }
2427 
2428     /* Now finally the in-memory information corresponds to the on-disk
2429      * structures and is correct */
2430     ret = qcow2_mark_clean(bs);
2431     if (ret < 0) {
2432         goto fail;
2433     }
2434 
2435     ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size);
2436     if (ret < 0) {
2437         goto fail;
2438     }
2439 
2440     return 0;
2441 
2442 fail_broken_refcounts:
2443     /* The BDS is unusable at this point. If we wanted to make it usable, we
2444      * would have to call qcow2_refcount_close(), qcow2_refcount_init(),
2445      * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init()
2446      * again. However, because the functions which could have caused this error
2447      * path to be taken are used by those functions as well, it's very likely
2448      * that that sequence will fail as well. Therefore, just eject the BDS. */
2449     bs->drv = NULL;
2450 
2451 fail:
2452     g_free(new_reftable);
2453     return ret;
2454 }
2455 
2456 static int qcow2_make_empty(BlockDriverState *bs)
2457 {
2458     BDRVQcowState *s = bs->opaque;
2459     uint64_t start_sector;
2460     int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
2461     int l1_clusters, ret = 0;
2462 
2463     l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
2464 
2465     if (s->qcow_version >= 3 && !s->snapshots &&
2466         3 + l1_clusters <= s->refcount_block_size) {
2467         /* The following function only works for qcow2 v3 images (it requires
2468          * the dirty flag) and only as long as there are no snapshots (because
2469          * it completely empties the image). Furthermore, the L1 table and three
2470          * additional clusters (image header, refcount table, one refcount
2471          * block) have to fit inside one refcount block. */
2472         return make_completely_empty(bs);
2473     }
2474 
2475     /* This fallback code simply discards every active cluster; this is slow,
2476      * but works in all cases */
2477     for (start_sector = 0; start_sector < bs->total_sectors;
2478          start_sector += sector_step)
2479     {
2480         /* As this function is generally used after committing an external
2481          * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
2482          * default action for this kind of discard is to pass the discard,
2483          * which will ideally result in an actually smaller image file, as
2484          * is probably desired. */
2485         ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
2486                                      MIN(sector_step,
2487                                          bs->total_sectors - start_sector),
2488                                      QCOW2_DISCARD_SNAPSHOT, true);
2489         if (ret < 0) {
2490             break;
2491         }
2492     }
2493 
2494     return ret;
2495 }
2496 
2497 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
2498 {
2499     BDRVQcowState *s = bs->opaque;
2500     int ret;
2501 
2502     qemu_co_mutex_lock(&s->lock);
2503     ret = qcow2_cache_flush(bs, s->l2_table_cache);
2504     if (ret < 0) {
2505         qemu_co_mutex_unlock(&s->lock);
2506         return ret;
2507     }
2508 
2509     if (qcow2_need_accurate_refcounts(s)) {
2510         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
2511         if (ret < 0) {
2512             qemu_co_mutex_unlock(&s->lock);
2513             return ret;
2514         }
2515     }
2516     qemu_co_mutex_unlock(&s->lock);
2517 
2518     return 0;
2519 }
2520 
2521 static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2522 {
2523     BDRVQcowState *s = bs->opaque;
2524     bdi->unallocated_blocks_are_zero = true;
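    /* explicit zero clusters, and thus zero writes with unmap, require the
     * version 3 L2 entry format */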
2525     bdi->can_write_zeroes_with_unmap = (s->qcow_version >= 3);
2526     bdi->cluster_size = s->cluster_size;
2527     bdi->vm_state_offset = qcow2_vm_state_offset(s);
2528     return 0;
2529 }
2530 
2531 static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs)
2532 {
2533     BDRVQcowState *s = bs->opaque;
2534     ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1);
2535 
2536     *spec_info = (ImageInfoSpecific){
2537         .kind  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
2538         {
2539             .qcow2 = g_new(ImageInfoSpecificQCow2, 1),
2540         },
2541     };
2542     if (s->qcow_version == 2) {
2543         *spec_info->qcow2 = (ImageInfoSpecificQCow2){
2544             .compat             = g_strdup("0.10"),
2545             .refcount_bits      = s->refcount_bits,
2546         };
2547     } else if (s->qcow_version == 3) {
2548         *spec_info->qcow2 = (ImageInfoSpecificQCow2){
2549             .compat             = g_strdup("1.1"),
2550             .lazy_refcounts     = s->compatible_features &
2551                                   QCOW2_COMPAT_LAZY_REFCOUNTS,
2552             .has_lazy_refcounts = true,
2553             .corrupt            = s->incompatible_features &
2554                                   QCOW2_INCOMPAT_CORRUPT,
2555             .has_corrupt        = true,
2556             .refcount_bits      = s->refcount_bits,
2557         };
2558     }
2559 
2560     return spec_info;
2561 }
2562 
2563 #if 0
2564 static void dump_refcounts(BlockDriverState *bs)
2565 {
2566     BDRVQcowState *s = bs->opaque;
2567     int64_t nb_clusters, k, k1, size;
2568     int refcount;
2569 
2570     size = bdrv_getlength(bs->file);
2571     nb_clusters = size_to_clusters(s, size);
2572     for(k = 0; k < nb_clusters;) {
2573         k1 = k;
2574         refcount = get_refcount(bs, k);
2575         k++;
2576         while (k < nb_clusters && get_refcount(bs, k) == refcount)
2577             k++;
2578         printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount,
2579                k - k1);
2580     }
2581 }
2582 #endif
2583 
2584 static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
2585                               int64_t pos)
2586 {
2587     BDRVQcowState *s = bs->opaque;
2588     int64_t total_sectors = bs->total_sectors;
2589     bool zero_beyond_eof = bs->zero_beyond_eof;
2590     int ret;
2591 
2592     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
2593     bs->zero_beyond_eof = false;
2594     ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
2595     bs->zero_beyond_eof = zero_beyond_eof;
2596 
2597     /* bdrv_co_do_writev will have increased the total_sectors value to include
2598      * the VM state - the VM state is, however, not an actual part of the block
2599      * device, so we need to restore the old value. */
2600     bs->total_sectors = total_sectors;
2601 
2602     return ret;
2603 }
2604 
2605 static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2606                               int64_t pos, int size)
2607 {
2608     BDRVQcowState *s = bs->opaque;
2609     bool zero_beyond_eof = bs->zero_beyond_eof;
2610     int ret;
2611 
2612     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
2613     bs->zero_beyond_eof = false;
2614     ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size);
2615     bs->zero_beyond_eof = zero_beyond_eof;
2616 
2617     return ret;
2618 }
2619 
2620 /*
2621  * Downgrades an image's version. To achieve this, any incompatible features
2622  * have to be removed.
2623  */
2624 static int qcow2_downgrade(BlockDriverState *bs, int target_version,
2625                            BlockDriverAmendStatusCB *status_cb)
2626 {
2627     BDRVQcowState *s = bs->opaque;
2628     int current_version = s->qcow_version;
2629     int ret;
2630 
2631     if (target_version == current_version) {
2632         return 0;
2633     } else if (target_version > current_version) {
2634         return -EINVAL;
2635     } else if (target_version != 2) {
2636         return -EINVAL;
2637     }
2638 
2639     if (s->refcount_order != 4) {
2640         /* we would have to convert the image to a refcount_order == 4 image
2641          * here; however, since qemu (at the time of writing this) does not
2642          * support anything other than 4 anyway, there is no point in doing
2643          * so right now; but we should still error out in case qemu gains such
2644          * support in the future and this code has not been adapted */
2645         error_report("qcow2_downgrade: Image refcount orders other than 4 are "
2646                      "currently not supported.");
2647         return -ENOTSUP;
2648     }
2649 
2650     /* clear incompatible features */
2651     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
2652         ret = qcow2_mark_clean(bs);
2653         if (ret < 0) {
2654             return ret;
2655         }
2656     }
2657 
2658     /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in
2659      * the first place; if that happens nonetheless, returning -ENOTSUP is the
2660      * best thing to do anyway */
2661 
2662     if (s->incompatible_features) {
2663         return -ENOTSUP;
2664     }
2665 
2666     /* since we can ignore compatible features, we can set them to 0 as well */
2667     s->compatible_features = 0;
2668     /* if lazy refcounts have been used, they have already been fixed through
2669      * clearing the dirty flag */
2670 
2671     /* clearing autoclear features is trivial */
2672     s->autoclear_features = 0;
2673 
2674     ret = qcow2_expand_zero_clusters(bs, status_cb);
2675     if (ret < 0) {
2676         return ret;
2677     }
2678 
2679     s->qcow_version = target_version;
2680     ret = qcow2_update_header(bs);
2681     if (ret < 0) {
2682         s->qcow_version = current_version;
2683         return ret;
2684     }
2685     return 0;
2686 }
2687 
2688 static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
2689                                BlockDriverAmendStatusCB *status_cb)
2690 {
2691     BDRVQcowState *s = bs->opaque;
2692     int old_version = s->qcow_version, new_version = old_version;
2693     uint64_t new_size = 0;
2694     const char *backing_file = NULL, *backing_format = NULL;
2695     bool lazy_refcounts = s->use_lazy_refcounts;
2696     const char *compat = NULL;
2697     uint64_t cluster_size = s->cluster_size;
2698     bool encrypt;
2699     int ret;
2700     QemuOptDesc *desc = opts->list->desc;
2701 
2702     while (desc && desc->name) {
2703         if (!qemu_opt_find(opts, desc->name)) {
2704             /* only change explicitly defined options */
2705             desc++;
2706             continue;
2707         }
2708 
2709         if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) {
2710             compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL);
2711             if (!compat) {
2712                 /* preserve default */
2713             } else if (!strcmp(compat, "0.10")) {
2714                 new_version = 2;
2715             } else if (!strcmp(compat, "1.1")) {
2716                 new_version = 3;
2717             } else {
2718                 fprintf(stderr, "Unknown compatibility level %s.\n", compat);
2719                 return -EINVAL;
2720             }
2721         } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) {
2722             fprintf(stderr, "Cannot change preallocation mode.\n");
2723             return -ENOTSUP;
2724         } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
2725             new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
2726         } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) {
2727             backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
2728         } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) {
2729             backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
2730         } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) {
2731             encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT,
2732                                         s->crypt_method);
2733             if (encrypt != !!s->crypt_method) {
2734                 fprintf(stderr, "Changing the encryption flag is not "
2735                         "supported.\n");
2736                 return -ENOTSUP;
2737             }
2738         } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) {
2739             cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE,
2740                                              cluster_size);
2741             if (cluster_size != s->cluster_size) {
2742                 fprintf(stderr, "Changing the cluster size is not "
2743                         "supported.\n");
2744                 return -ENOTSUP;
2745             }
2746         } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
2747             lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
2748                                                lazy_refcounts);
2749         } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
2750             error_report("Cannot change refcount entry width");
2751             return -ENOTSUP;
2752         } else {
2753             /* if this assertion fails, this probably means a new option was
2754              * added without having it covered here */
2755             assert(false);
2756         }
2757 
2758         desc++;
2759     }
2760 
2761     if (new_version != old_version) {
2762         if (new_version > old_version) {
2763             /* Upgrade */
2764             s->qcow_version = new_version;
2765             ret = qcow2_update_header(bs);
2766             if (ret < 0) {
2767                 s->qcow_version = old_version;
2768                 return ret;
2769             }
2770         } else {
2771             ret = qcow2_downgrade(bs, new_version, status_cb);
2772             if (ret < 0) {
2773                 return ret;
2774             }
2775         }
2776     }
2777 
2778     if (backing_file || backing_format) {
2779         ret = qcow2_change_backing_file(bs,
2780                     backing_file ?: s->image_backing_file,
2781                     backing_format ?: s->image_backing_format);
2782         if (ret < 0) {
2783             return ret;
2784         }
2785     }
2786 
2787     if (s->use_lazy_refcounts != lazy_refcounts) {
2788         if (lazy_refcounts) {
2789             if (s->qcow_version < 3) {
2790                 fprintf(stderr, "Lazy refcounts only supported with compatibility "
2791                         "level 1.1 and above (use compat=1.1 or greater)\n");
2792                 return -EINVAL;
2793             }
2794             s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
2795             ret = qcow2_update_header(bs);
2796             if (ret < 0) {
2797                 s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
2798                 return ret;
2799             }
2800             s->use_lazy_refcounts = true;
2801         } else {
2802             /* make image clean first */
2803             ret = qcow2_mark_clean(bs);
2804             if (ret < 0) {
2805                 return ret;
2806             }
2807             /* now disallow lazy refcounts */
2808             s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
2809             ret = qcow2_update_header(bs);
2810             if (ret < 0) {
2811                 s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
2812                 return ret;
2813             }
2814             s->use_lazy_refcounts = false;
2815         }
2816     }
2817 
2818     if (new_size) {
2819         ret = bdrv_truncate(bs, new_size);
2820         if (ret < 0) {
2821             return ret;
2822         }
2823     }
2824 
2825     return 0;
2826 }
2827 
2828 /*
2829  * If offset or size is negative, the respective field will not be included in
2830  * the emitted BLOCK_IMAGE_CORRUPTED event.
2831  * fatal will be ignored for read-only BDS; corruptions found there will always
2832  * be considered non-fatal.
2833  */
2834 void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
2835                              int64_t size, const char *message_format, ...)
2836 {
2837     BDRVQcowState *s = bs->opaque;
2838     const char *node_name;
2839     char *message;
2840     va_list ap;
2841 
2842     fatal = fatal && !bs->read_only;
2843 
2844     if (s->signaled_corruption &&
2845         (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
2846     {
2847         return;
2848     }
2849 
2850     va_start(ap, message_format);
2851     message = g_strdup_vprintf(message_format, ap);
2852     va_end(ap);
2853 
2854     if (fatal) {
2855         fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
2856                 "corruption events will be suppressed\n", message);
2857     } else {
2858         fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
2859                 "corruption events will be suppressed\n", message);
2860     }
2861 
2862     node_name = bdrv_get_node_name(bs);
2863     qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
2864                                           *node_name != '\0', node_name,
2865                                           message, offset >= 0, offset,
2866                                           size >= 0, size,
2867                                           fatal, &error_abort);
2868     g_free(message);
2869 
2870     if (fatal) {
2871         qcow2_mark_corrupt(bs);
2872         bs->drv = NULL; /* make BDS unusable */
2873     }
2874 
2875     s->signaled_corruption = true;
2876 }
2877 
2878 static QemuOptsList qcow2_create_opts = {
2879     .name = "qcow2-create-opts",
2880     .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
2881     .desc = {
2882         {
2883             .name = BLOCK_OPT_SIZE,
2884             .type = QEMU_OPT_SIZE,
2885             .help = "Virtual disk size"
2886         },
2887         {
2888             .name = BLOCK_OPT_COMPAT_LEVEL,
2889             .type = QEMU_OPT_STRING,
2890             .help = "Compatibility level (0.10 or 1.1)"
2891         },
2892         {
2893             .name = BLOCK_OPT_BACKING_FILE,
2894             .type = QEMU_OPT_STRING,
2895             .help = "File name of a base image"
2896         },
2897         {
2898             .name = BLOCK_OPT_BACKING_FMT,
2899             .type = QEMU_OPT_STRING,
2900             .help = "Image format of the base image"
2901         },
2902         {
2903             .name = BLOCK_OPT_ENCRYPT,
2904             .type = QEMU_OPT_BOOL,
2905             .help = "Encrypt the image",
2906             .def_value_str = "off"
2907         },
2908         {
2909             .name = BLOCK_OPT_CLUSTER_SIZE,
2910             .type = QEMU_OPT_SIZE,
2911             .help = "qcow2 cluster size",
2912             .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)
2913         },
2914         {
2915             .name = BLOCK_OPT_PREALLOC,
2916             .type = QEMU_OPT_STRING,
2917             .help = "Preallocation mode (allowed values: off, metadata, "
2918                     "falloc, full)"
2919         },
2920         {
2921             .name = BLOCK_OPT_LAZY_REFCOUNTS,
2922             .type = QEMU_OPT_BOOL,
2923             .help = "Postpone refcount updates",
2924             .def_value_str = "off"
2925         },
2926         {
2927             .name = BLOCK_OPT_REFCOUNT_BITS,
2928             .type = QEMU_OPT_NUMBER,
2929             .help = "Width of a reference count entry in bits",
2930             .def_value_str = "16"
2931         },
2932         { /* end of list */ }
2933     }
2934 };
2935 
2936 BlockDriver bdrv_qcow2 = {
2937     .format_name        = "qcow2",
2938     .instance_size      = sizeof(BDRVQcowState),
2939     .bdrv_probe         = qcow2_probe,
2940     .bdrv_open          = qcow2_open,
2941     .bdrv_close         = qcow2_close,
2942     .bdrv_reopen_prepare  = qcow2_reopen_prepare,
2943     .bdrv_create        = qcow2_create,
2944     .bdrv_has_zero_init = bdrv_has_zero_init_1,
2945     .bdrv_co_get_block_status = qcow2_co_get_block_status,
2946     .bdrv_set_key       = qcow2_set_key,
2947 
2948     .bdrv_co_readv          = qcow2_co_readv,
2949     .bdrv_co_writev         = qcow2_co_writev,
2950     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
2951 
2952     .bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
2953     .bdrv_co_discard        = qcow2_co_discard,
2954     .bdrv_truncate          = qcow2_truncate,
2955     .bdrv_write_compressed  = qcow2_write_compressed,
2956     .bdrv_make_empty        = qcow2_make_empty,
2957 
2958     .bdrv_snapshot_create   = qcow2_snapshot_create,
2959     .bdrv_snapshot_goto     = qcow2_snapshot_goto,
2960     .bdrv_snapshot_delete   = qcow2_snapshot_delete,
2961     .bdrv_snapshot_list     = qcow2_snapshot_list,
2962     .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
2963     .bdrv_get_info          = qcow2_get_info,
2964     .bdrv_get_specific_info = qcow2_get_specific_info,
2965 
2966     .bdrv_save_vmstate    = qcow2_save_vmstate,
2967     .bdrv_load_vmstate    = qcow2_load_vmstate,
2968 
2969     .supports_backing           = true,
2970     .bdrv_change_backing_file   = qcow2_change_backing_file,
2971 
2972     .bdrv_refresh_limits        = qcow2_refresh_limits,
2973     .bdrv_invalidate_cache      = qcow2_invalidate_cache,
2974 
2975     .create_opts         = &qcow2_create_opts,
2976     .bdrv_check          = qcow2_check,
2977     .bdrv_amend_options  = qcow2_amend_options,
2978 };
2979 
2980 static void bdrv_qcow2_init(void)
2981 {
2982     bdrv_register(&bdrv_qcow2);
2983 }
2984 
2985 block_init(bdrv_qcow2_init);
2986