xref: /qemu/block/qcow2.c (revision 2bfb10df)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 
27 #include "block/qdict.h"
28 #include "sysemu/block-backend.h"
29 #include "qemu/main-loop.h"
30 #include "qemu/module.h"
31 #include "qcow2.h"
32 #include "qemu/error-report.h"
33 #include "qapi/error.h"
34 #include "qapi/qapi-events-block-core.h"
35 #include "qapi/qmp/qdict.h"
36 #include "qapi/qmp/qstring.h"
37 #include "trace.h"
38 #include "qemu/option_int.h"
39 #include "qemu/cutils.h"
40 #include "qemu/bswap.h"
41 #include "qemu/memalign.h"
42 #include "qapi/qobject-input-visitor.h"
43 #include "qapi/qapi-visit-block-core.h"
44 #include "crypto.h"
45 #include "block/aio_task.h"
46 #include "block/dirty-bitmap.h"
47 
48 /*
49   Differences with QCOW:
50 
51   - Support for multiple incremental snapshots.
52   - Memory management by reference counts.
53   - Clusters which have a reference count of one have the bit
54     QCOW_OFLAG_COPIED to optimize write performance.
55   - Size of compressed clusters is stored in sectors to reduce bit usage
56     in the cluster offsets.
57   - Support for storing additional data (such as the VM state) in the
58     snapshots.
59   - If a backing store is used, the cluster size is not constrained
60     (could be backported to QCOW).
61   - L2 tables have always a size of one cluster.
62 */
63 
64 
/*
 * Fixed-size prefix of a header extension as it is read from the image
 * file.  qcow2_read_extensions() reads one of these at a time and converts
 * both fields from big-endian before using them.
 */
typedef struct {
    /* Extension type; compared against the QCOW2_EXT_MAGIC_* values */
    uint32_t magic;
    /* Number of payload bytes that follow this prefix in the file */
    uint32_t len;
} QEMU_PACKED QCowExtension;
69 
/*
 * Header extension type identifiers dispatched on by
 * qcow2_read_extensions().  Any other value is kept as an "unknown"
 * extension.
 */
/* Terminates the extension list; parsing stops successfully */
#define  QCOW2_EXT_MAGIC_END 0
/* Payload is the backing file format name */
#define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xe2792aca
/* Payload is a table of Qcow2Feature entries */
#define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
/* Payload is a Qcow2CryptoHeaderExtension (offset/length of crypto header) */
#define  QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77
/* Payload is a Qcow2BitmapHeaderExt describing the bitmap directory */
#define  QCOW2_EXT_MAGIC_BITMAPS 0x23852875
/* Payload is the external data file name */
#define  QCOW2_EXT_MAGIC_DATA_FILE 0x44415441
76 
/*
 * Forward declaration.  The function has internal linkage, so its
 * definition is expected further down in this file.
 */
static int coroutine_fn
qcow2_co_preadv_compressed(BlockDriverState *bs,
                           uint64_t l2_entry,
                           uint64_t offset,
                           uint64_t bytes,
                           QEMUIOVector *qiov,
                           size_t qiov_offset);
84 
85 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
86 {
87     const QCowHeader *cow_header = (const void *)buf;
88 
89     if (buf_size >= sizeof(QCowHeader) &&
90         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
91         be32_to_cpu(cow_header->version) >= 2)
92         return 100;
93     else
94         return 0;
95 }
96 
97 
98 static int qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
99                                       uint8_t *buf, size_t buflen,
100                                       void *opaque, Error **errp)
101 {
102     BlockDriverState *bs = opaque;
103     BDRVQcow2State *s = bs->opaque;
104     ssize_t ret;
105 
106     if ((offset + buflen) > s->crypto_header.length) {
107         error_setg(errp, "Request for data outside of extension header");
108         return -1;
109     }
110 
111     ret = bdrv_pread(bs->file, s->crypto_header.offset + offset, buflen, buf,
112                      0);
113     if (ret < 0) {
114         error_setg_errno(errp, -ret, "Could not read encryption header");
115         return -1;
116     }
117     return 0;
118 }
119 
120 
121 static int coroutine_fn GRAPH_RDLOCK
122 qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen, void *opaque,
123                            Error **errp)
124 {
125     BlockDriverState *bs = opaque;
126     BDRVQcow2State *s = bs->opaque;
127     int64_t ret;
128     int64_t clusterlen;
129 
130     ret = qcow2_alloc_clusters(bs, headerlen);
131     if (ret < 0) {
132         error_setg_errno(errp, -ret,
133                          "Cannot allocate cluster for LUKS header size %zu",
134                          headerlen);
135         return -1;
136     }
137 
138     s->crypto_header.length = headerlen;
139     s->crypto_header.offset = ret;
140 
141     /*
142      * Zero fill all space in cluster so it has predictable
143      * content, as we may not initialize some regions of the
144      * header (eg only 1 out of 8 key slots will be initialized)
145      */
146     clusterlen = size_to_clusters(s, headerlen) * s->cluster_size;
147     assert(qcow2_pre_write_overlap_check(bs, 0, ret, clusterlen, false) == 0);
148     ret = bdrv_co_pwrite_zeroes(bs->file, ret, clusterlen, 0);
149     if (ret < 0) {
150         error_setg_errno(errp, -ret, "Could not zero fill encryption header");
151         return -1;
152     }
153 
154     return 0;
155 }
156 
157 
158 /* The graph lock must be held when called in coroutine context */
159 static int coroutine_mixed_fn
160 qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
161                             const uint8_t *buf, size_t buflen,
162                             void *opaque, Error **errp)
163 {
164     BlockDriverState *bs = opaque;
165     BDRVQcow2State *s = bs->opaque;
166     ssize_t ret;
167 
168     if ((offset + buflen) > s->crypto_header.length) {
169         error_setg(errp, "Request for data outside of extension header");
170         return -1;
171     }
172 
173     ret = bdrv_pwrite(bs->file, s->crypto_header.offset + offset, buflen, buf,
174                       0);
175     if (ret < 0) {
176         error_setg_errno(errp, -ret, "Could not read encryption header");
177         return -1;
178     }
179     return 0;
180 }
181 
182 static QDict*
183 qcow2_extract_crypto_opts(QemuOpts *opts, const char *fmt, Error **errp)
184 {
185     QDict *cryptoopts_qdict;
186     QDict *opts_qdict;
187 
188     /* Extract "encrypt." options into a qdict */
189     opts_qdict = qemu_opts_to_qdict(opts, NULL);
190     qdict_extract_subqdict(opts_qdict, &cryptoopts_qdict, "encrypt.");
191     qobject_unref(opts_qdict);
192     qdict_put_str(cryptoopts_qdict, "format", fmt);
193     return cryptoopts_qdict;
194 }
195 
196 /*
197  * read qcow2 extension and fill bs
198  * start reading from start_offset
199  * finish reading upon magic of value 0 or when end_offset reached
200  * unknown magic is skipped (future extension this version knows nothing about)
201  * return 0 upon success, non-0 otherwise
202  */
203 static int coroutine_fn GRAPH_RDLOCK
204 qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
205                       uint64_t end_offset, void **p_feature_table,
206                       int flags, bool *need_update_header, Error **errp)
207 {
208     BDRVQcow2State *s = bs->opaque;
209     QCowExtension ext;
210     uint64_t offset;
211     int ret;
212     Qcow2BitmapHeaderExt bitmaps_ext;
213 
214     if (need_update_header != NULL) {
215         *need_update_header = false;
216     }
217 
218 #ifdef DEBUG_EXT
219     printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
220 #endif
221     offset = start_offset;
222     while (offset < end_offset) {
223 
224 #ifdef DEBUG_EXT
225         /* Sanity check */
226         if (offset > s->cluster_size)
227             printf("qcow2_read_extension: suspicious offset %lu\n", offset);
228 
229         printf("attempting to read extended header in offset %lu\n", offset);
230 #endif
231 
232         ret = bdrv_co_pread(bs->file, offset, sizeof(ext), &ext, 0);
233         if (ret < 0) {
234             error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
235                              "pread fail from offset %" PRIu64, offset);
236             return 1;
237         }
238         ext.magic = be32_to_cpu(ext.magic);
239         ext.len = be32_to_cpu(ext.len);
240         offset += sizeof(ext);
241 #ifdef DEBUG_EXT
242         printf("ext.magic = 0x%x\n", ext.magic);
243 #endif
244         if (offset > end_offset || ext.len > end_offset - offset) {
245             error_setg(errp, "Header extension too large");
246             return -EINVAL;
247         }
248 
249         switch (ext.magic) {
250         case QCOW2_EXT_MAGIC_END:
251             return 0;
252 
253         case QCOW2_EXT_MAGIC_BACKING_FORMAT:
254             if (ext.len >= sizeof(bs->backing_format)) {
255                 error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32
256                            " too large (>=%zu)", ext.len,
257                            sizeof(bs->backing_format));
258                 return 2;
259             }
260             ret = bdrv_co_pread(bs->file, offset, ext.len, bs->backing_format, 0);
261             if (ret < 0) {
262                 error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
263                                  "Could not read format name");
264                 return 3;
265             }
266             bs->backing_format[ext.len] = '\0';
267             s->image_backing_format = g_strdup(bs->backing_format);
268 #ifdef DEBUG_EXT
269             printf("Qcow2: Got format extension %s\n", bs->backing_format);
270 #endif
271             break;
272 
273         case QCOW2_EXT_MAGIC_FEATURE_TABLE:
274             if (p_feature_table != NULL) {
275                 void *feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
276                 ret = bdrv_co_pread(bs->file, offset, ext.len, feature_table, 0);
277                 if (ret < 0) {
278                     error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
279                                      "Could not read table");
280                     g_free(feature_table);
281                     return ret;
282                 }
283 
284                 *p_feature_table = feature_table;
285             }
286             break;
287 
288         case QCOW2_EXT_MAGIC_CRYPTO_HEADER: {
289             unsigned int cflags = 0;
290             if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
291                 error_setg(errp, "CRYPTO header extension only "
292                            "expected with LUKS encryption method");
293                 return -EINVAL;
294             }
295             if (ext.len != sizeof(Qcow2CryptoHeaderExtension)) {
296                 error_setg(errp, "CRYPTO header extension size %u, "
297                            "but expected size %zu", ext.len,
298                            sizeof(Qcow2CryptoHeaderExtension));
299                 return -EINVAL;
300             }
301 
302             ret = bdrv_co_pread(bs->file, offset, ext.len, &s->crypto_header, 0);
303             if (ret < 0) {
304                 error_setg_errno(errp, -ret,
305                                  "Unable to read CRYPTO header extension");
306                 return ret;
307             }
308             s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
309             s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
310 
311             if ((s->crypto_header.offset % s->cluster_size) != 0) {
312                 error_setg(errp, "Encryption header offset '%" PRIu64 "' is "
313                            "not a multiple of cluster size '%u'",
314                            s->crypto_header.offset, s->cluster_size);
315                 return -EINVAL;
316             }
317 
318             if (flags & BDRV_O_NO_IO) {
319                 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
320             }
321             s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
322                                            qcow2_crypto_hdr_read_func,
323                                            bs, cflags, QCOW2_MAX_THREADS, errp);
324             if (!s->crypto) {
325                 return -EINVAL;
326             }
327         }   break;
328 
329         case QCOW2_EXT_MAGIC_BITMAPS:
330             if (ext.len != sizeof(bitmaps_ext)) {
331                 error_setg_errno(errp, -ret, "bitmaps_ext: "
332                                  "Invalid extension length");
333                 return -EINVAL;
334             }
335 
336             if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) {
337                 if (s->qcow_version < 3) {
338                     /* Let's be a bit more specific */
339                     warn_report("This qcow2 v2 image contains bitmaps, but "
340                                 "they may have been modified by a program "
341                                 "without persistent bitmap support; so now "
342                                 "they must all be considered inconsistent");
343                 } else {
344                     warn_report("a program lacking bitmap support "
345                                 "modified this file, so all bitmaps are now "
346                                 "considered inconsistent");
347                 }
348                 error_printf("Some clusters may be leaked, "
349                              "run 'qemu-img check -r' on the image "
350                              "file to fix.");
351                 if (need_update_header != NULL) {
352                     /* Updating is needed to drop invalid bitmap extension. */
353                     *need_update_header = true;
354                 }
355                 break;
356             }
357 
358             ret = bdrv_co_pread(bs->file, offset, ext.len, &bitmaps_ext, 0);
359             if (ret < 0) {
360                 error_setg_errno(errp, -ret, "bitmaps_ext: "
361                                  "Could not read ext header");
362                 return ret;
363             }
364 
365             if (bitmaps_ext.reserved32 != 0) {
366                 error_setg_errno(errp, -ret, "bitmaps_ext: "
367                                  "Reserved field is not zero");
368                 return -EINVAL;
369             }
370 
371             bitmaps_ext.nb_bitmaps = be32_to_cpu(bitmaps_ext.nb_bitmaps);
372             bitmaps_ext.bitmap_directory_size =
373                 be64_to_cpu(bitmaps_ext.bitmap_directory_size);
374             bitmaps_ext.bitmap_directory_offset =
375                 be64_to_cpu(bitmaps_ext.bitmap_directory_offset);
376 
377             if (bitmaps_ext.nb_bitmaps > QCOW2_MAX_BITMAPS) {
378                 error_setg(errp,
379                            "bitmaps_ext: Image has %" PRIu32 " bitmaps, "
380                            "exceeding the QEMU supported maximum of %d",
381                            bitmaps_ext.nb_bitmaps, QCOW2_MAX_BITMAPS);
382                 return -EINVAL;
383             }
384 
385             if (bitmaps_ext.nb_bitmaps == 0) {
386                 error_setg(errp, "found bitmaps extension with zero bitmaps");
387                 return -EINVAL;
388             }
389 
390             if (offset_into_cluster(s, bitmaps_ext.bitmap_directory_offset)) {
391                 error_setg(errp, "bitmaps_ext: "
392                                  "invalid bitmap directory offset");
393                 return -EINVAL;
394             }
395 
396             if (bitmaps_ext.bitmap_directory_size >
397                 QCOW2_MAX_BITMAP_DIRECTORY_SIZE) {
398                 error_setg(errp, "bitmaps_ext: "
399                                  "bitmap directory size (%" PRIu64 ") exceeds "
400                                  "the maximum supported size (%d)",
401                                  bitmaps_ext.bitmap_directory_size,
402                                  QCOW2_MAX_BITMAP_DIRECTORY_SIZE);
403                 return -EINVAL;
404             }
405 
406             s->nb_bitmaps = bitmaps_ext.nb_bitmaps;
407             s->bitmap_directory_offset =
408                     bitmaps_ext.bitmap_directory_offset;
409             s->bitmap_directory_size =
410                     bitmaps_ext.bitmap_directory_size;
411 
412 #ifdef DEBUG_EXT
413             printf("Qcow2: Got bitmaps extension: "
414                    "offset=%" PRIu64 " nb_bitmaps=%" PRIu32 "\n",
415                    s->bitmap_directory_offset, s->nb_bitmaps);
416 #endif
417             break;
418 
419         case QCOW2_EXT_MAGIC_DATA_FILE:
420         {
421             s->image_data_file = g_malloc0(ext.len + 1);
422             ret = bdrv_co_pread(bs->file, offset, ext.len, s->image_data_file, 0);
423             if (ret < 0) {
424                 error_setg_errno(errp, -ret,
425                                  "ERROR: Could not read data file name");
426                 return ret;
427             }
428 #ifdef DEBUG_EXT
429             printf("Qcow2: Got external data file %s\n", s->image_data_file);
430 #endif
431             break;
432         }
433 
434         default:
435             /* unknown magic - save it in case we need to rewrite the header */
436             /* If you add a new feature, make sure to also update the fast
437              * path of qcow2_make_empty() to deal with it. */
438             {
439                 Qcow2UnknownHeaderExtension *uext;
440 
441                 uext = g_malloc0(sizeof(*uext)  + ext.len);
442                 uext->magic = ext.magic;
443                 uext->len = ext.len;
444                 QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
445 
446                 ret = bdrv_co_pread(bs->file, offset, uext->len, uext->data, 0);
447                 if (ret < 0) {
448                     error_setg_errno(errp, -ret, "ERROR: unknown extension: "
449                                      "Could not read data");
450                     return ret;
451                 }
452             }
453             break;
454         }
455 
456         offset += ((ext.len + 7) & ~7);
457     }
458 
459     return 0;
460 }
461 
462 static void cleanup_unknown_header_ext(BlockDriverState *bs)
463 {
464     BDRVQcow2State *s = bs->opaque;
465     Qcow2UnknownHeaderExtension *uext, *next;
466 
467     QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) {
468         QLIST_REMOVE(uext, next);
469         g_free(uext);
470     }
471 }
472 
473 static void report_unsupported_feature(Error **errp, Qcow2Feature *table,
474                                        uint64_t mask)
475 {
476     g_autoptr(GString) features = g_string_sized_new(60);
477 
478     while (table && table->name[0] != '\0') {
479         if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) {
480             if (mask & (1ULL << table->bit)) {
481                 if (features->len > 0) {
482                     g_string_append(features, ", ");
483                 }
484                 g_string_append_printf(features, "%.46s", table->name);
485                 mask &= ~(1ULL << table->bit);
486             }
487         }
488         table++;
489     }
490 
491     if (mask) {
492         if (features->len > 0) {
493             g_string_append(features, ", ");
494         }
495         g_string_append_printf(features,
496                                "Unknown incompatible feature: %" PRIx64, mask);
497     }
498 
499     error_setg(errp, "Unsupported qcow2 feature(s): %s", features->str);
500 }
501 
502 /*
503  * Sets the dirty bit and flushes afterwards if necessary.
504  *
505  * The incompatible_features bit is only set if the image file header was
506  * updated successfully.  Therefore it is not required to check the return
507  * value of this function.
508  */
509 int qcow2_mark_dirty(BlockDriverState *bs)
510 {
511     BDRVQcow2State *s = bs->opaque;
512     uint64_t val;
513     int ret;
514 
515     assert(s->qcow_version >= 3);
516 
517     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
518         return 0; /* already dirty */
519     }
520 
521     val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
522     ret = bdrv_pwrite_sync(bs->file,
523                            offsetof(QCowHeader, incompatible_features),
524                            sizeof(val), &val, 0);
525     if (ret < 0) {
526         return ret;
527     }
528 
529     /* Only treat image as dirty if the header was updated successfully */
530     s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
531     return 0;
532 }
533 
534 /*
535  * Clears the dirty bit and flushes before if necessary.  Only call this
536  * function when there are no pending requests, it does not guard against
537  * concurrent requests dirtying the image.
538  */
539 static int qcow2_mark_clean(BlockDriverState *bs)
540 {
541     BDRVQcow2State *s = bs->opaque;
542 
543     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
544         int ret;
545 
546         s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
547 
548         ret = qcow2_flush_caches(bs);
549         if (ret < 0) {
550             return ret;
551         }
552 
553         return qcow2_update_header(bs);
554     }
555     return 0;
556 }
557 
558 /*
559  * Marks the image as corrupt.
560  */
561 int qcow2_mark_corrupt(BlockDriverState *bs)
562 {
563     BDRVQcow2State *s = bs->opaque;
564 
565     s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
566     return qcow2_update_header(bs);
567 }
568 
569 /*
570  * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
571  * before if necessary.
572  */
573 int qcow2_mark_consistent(BlockDriverState *bs)
574 {
575     BDRVQcow2State *s = bs->opaque;
576 
577     if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
578         int ret = qcow2_flush_caches(bs);
579         if (ret < 0) {
580             return ret;
581         }
582 
583         s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
584         return qcow2_update_header(bs);
585     }
586     return 0;
587 }
588 
589 static void qcow2_add_check_result(BdrvCheckResult *out,
590                                    const BdrvCheckResult *src,
591                                    bool set_allocation_info)
592 {
593     out->corruptions += src->corruptions;
594     out->leaks += src->leaks;
595     out->check_errors += src->check_errors;
596     out->corruptions_fixed += src->corruptions_fixed;
597     out->leaks_fixed += src->leaks_fixed;
598 
599     if (set_allocation_info) {
600         out->image_end_offset = src->image_end_offset;
601         out->bfi = src->bfi;
602     }
603 }
604 
605 static int coroutine_fn GRAPH_RDLOCK
606 qcow2_co_check_locked(BlockDriverState *bs, BdrvCheckResult *result,
607                       BdrvCheckMode fix)
608 {
609     BdrvCheckResult snapshot_res = {};
610     BdrvCheckResult refcount_res = {};
611     int ret;
612 
613     memset(result, 0, sizeof(*result));
614 
615     ret = qcow2_check_read_snapshot_table(bs, &snapshot_res, fix);
616     if (ret < 0) {
617         qcow2_add_check_result(result, &snapshot_res, false);
618         return ret;
619     }
620 
621     ret = qcow2_check_refcounts(bs, &refcount_res, fix);
622     qcow2_add_check_result(result, &refcount_res, true);
623     if (ret < 0) {
624         qcow2_add_check_result(result, &snapshot_res, false);
625         return ret;
626     }
627 
628     ret = qcow2_check_fix_snapshot_table(bs, &snapshot_res, fix);
629     qcow2_add_check_result(result, &snapshot_res, false);
630     if (ret < 0) {
631         return ret;
632     }
633 
634     if (fix && result->check_errors == 0 && result->corruptions == 0) {
635         ret = qcow2_mark_clean(bs);
636         if (ret < 0) {
637             return ret;
638         }
639         return qcow2_mark_consistent(bs);
640     }
641     return ret;
642 }
643 
644 static int coroutine_fn GRAPH_RDLOCK
645 qcow2_co_check(BlockDriverState *bs, BdrvCheckResult *result,
646                BdrvCheckMode fix)
647 {
648     BDRVQcow2State *s = bs->opaque;
649     int ret;
650 
651     qemu_co_mutex_lock(&s->lock);
652     ret = qcow2_co_check_locked(bs, result, fix);
653     qemu_co_mutex_unlock(&s->lock);
654     return ret;
655 }
656 
657 int qcow2_validate_table(BlockDriverState *bs, uint64_t offset,
658                          uint64_t entries, size_t entry_len,
659                          int64_t max_size_bytes, const char *table_name,
660                          Error **errp)
661 {
662     BDRVQcow2State *s = bs->opaque;
663 
664     if (entries > max_size_bytes / entry_len) {
665         error_setg(errp, "%s too large", table_name);
666         return -EFBIG;
667     }
668 
669     /* Use signed INT64_MAX as the maximum even for uint64_t header fields,
670      * because values will be passed to qemu functions taking int64_t. */
671     if ((INT64_MAX - entries * entry_len < offset) ||
672         (offset_into_cluster(s, offset) != 0)) {
673         error_setg(errp, "%s offset invalid", table_name);
674         return -EINVAL;
675     }
676 
677     return 0;
678 }
679 
/*
 * NULL-terminated list of option names.  Every name listed here also has an
 * entry in qcow2_runtime_opts.
 * NOTE(review): judging by the name these are the options that may be
 * changed after the image has been opened - confirm against the user of
 * this array.
 */
static const char *const mutable_opts[] = {
    QCOW2_OPT_LAZY_REFCOUNTS,
    QCOW2_OPT_DISCARD_REQUEST,
    QCOW2_OPT_DISCARD_SNAPSHOT,
    QCOW2_OPT_DISCARD_OTHER,
    QCOW2_OPT_OVERLAP,
    QCOW2_OPT_OVERLAP_TEMPLATE,
    QCOW2_OPT_OVERLAP_MAIN_HEADER,
    QCOW2_OPT_OVERLAP_ACTIVE_L1,
    QCOW2_OPT_OVERLAP_ACTIVE_L2,
    QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    QCOW2_OPT_OVERLAP_INACTIVE_L1,
    QCOW2_OPT_OVERLAP_INACTIVE_L2,
    QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
    QCOW2_OPT_CACHE_SIZE,
    QCOW2_OPT_L2_CACHE_SIZE,
    QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
    QCOW2_OPT_REFCOUNT_CACHE_SIZE,
    QCOW2_OPT_CACHE_CLEAN_INTERVAL,
    NULL /* terminator */
};
703 
/*
 * Description of the options accepted under the "qcow2" option group: for
 * each option its name, value type and help text.  The options fall into
 * these groups: lazy refcounts, discard behaviour, metadata overlap checks,
 * metadata cache sizing/cleaning, and the encryption key secret.
 */
static QemuOptsList qcow2_runtime_opts = {
    .name = "qcow2",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
    .desc = {
        {
            .name = QCOW2_OPT_LAZY_REFCOUNTS,
            .type = QEMU_OPT_BOOL,
            .help = "Postpone refcount updates",
        },
        /* Discard behaviour */
        {
            .name = QCOW2_OPT_DISCARD_REQUEST,
            .type = QEMU_OPT_BOOL,
            .help = "Pass guest discard requests to the layer below",
        },
        {
            .name = QCOW2_OPT_DISCARD_SNAPSHOT,
            .type = QEMU_OPT_BOOL,
            .help = "Generate discard requests when snapshot related space "
                    "is freed",
        },
        {
            .name = QCOW2_OPT_DISCARD_OTHER,
            .type = QEMU_OPT_BOOL,
            .help = "Generate discard requests when other clusters are freed",
        },
        /* Overlap checks: two template selectors with identical help text */
        {
            .name = QCOW2_OPT_OVERLAP,
            .type = QEMU_OPT_STRING,
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
        {
            .name = QCOW2_OPT_OVERLAP_TEMPLATE,
            .type = QEMU_OPT_STRING,
            .help = "Selects which overlap checks to perform from a range of "
                    "templates (none, constant, cached, all)",
        },
        /* Overlap checks: one boolean per kind of metadata structure */
        {
            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the main qcow2 header",
        },
        {
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L1,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the active L1 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L2,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an active L2 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the refcount table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into a refcount block",
        },
        {
            .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the snapshot table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L1,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L1 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L2,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into an inactive L2 table",
        },
        {
            .name = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
            .type = QEMU_OPT_BOOL,
            .help = "Check for unintended writes into the bitmap directory",
        },
        /* Metadata caches */
        {
            .name = QCOW2_OPT_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum combined metadata (L2 tables and refcount blocks) "
                    "cache size",
        },
        {
            .name = QCOW2_OPT_L2_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum L2 table cache size",
        },
        {
            .name = QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Size of each entry in the L2 cache",
        },
        {
            .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Maximum refcount block cache size",
        },
        {
            .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL,
            .type = QEMU_OPT_NUMBER,
            .help = "Clean unused cache entries after this time (in seconds)",
        },
        /* Encryption key secret, under the "encrypt." prefix */
        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
            "ID of secret providing qcow2 AES key or LUKS passphrase"),
        { /* end of list */ }
    },
};
817 
/*
 * Maps each overlap-check bit number (QCOW2_OL_*_BITNR) to the name of the
 * boolean option that controls it.  The array has QCOW2_OL_MAX_BITNR slots;
 * any slot without an initializer below is NULL.
 */
static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]      = QCOW2_OPT_OVERLAP_MAIN_HEADER,
    [QCOW2_OL_ACTIVE_L1_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L1,
    [QCOW2_OL_ACTIVE_L2_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L2,
    [QCOW2_OL_REFCOUNT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    [QCOW2_OL_INACTIVE_L1_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L1,
    [QCOW2_OL_INACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L2,
    [QCOW2_OL_BITMAP_DIRECTORY_BITNR] = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
};
829 
830 static void cache_clean_timer_cb(void *opaque)
831 {
832     BlockDriverState *bs = opaque;
833     BDRVQcow2State *s = bs->opaque;
834     qcow2_cache_clean_unused(s->l2_table_cache);
835     qcow2_cache_clean_unused(s->refcount_block_cache);
836     timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
837               (int64_t) s->cache_clean_interval * 1000);
838 }
839 
840 static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context)
841 {
842     BDRVQcow2State *s = bs->opaque;
843     if (s->cache_clean_interval > 0) {
844         s->cache_clean_timer =
845             aio_timer_new_with_attrs(context, QEMU_CLOCK_VIRTUAL,
846                                      SCALE_MS, QEMU_TIMER_ATTR_EXTERNAL,
847                                      cache_clean_timer_cb, bs);
848         timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
849                   (int64_t) s->cache_clean_interval * 1000);
850     }
851 }
852 
853 static void cache_clean_timer_del(BlockDriverState *bs)
854 {
855     BDRVQcow2State *s = bs->opaque;
856     if (s->cache_clean_timer) {
857         timer_free(s->cache_clean_timer);
858         s->cache_clean_timer = NULL;
859     }
860 }
861 
862 static void qcow2_detach_aio_context(BlockDriverState *bs)
863 {
864     cache_clean_timer_del(bs);
865 }
866 
867 static void qcow2_attach_aio_context(BlockDriverState *bs,
868                                      AioContext *new_context)
869 {
870     cache_clean_timer_init(bs, new_context);
871 }
872 
873 static bool read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
874                              uint64_t *l2_cache_size,
875                              uint64_t *l2_cache_entry_size,
876                              uint64_t *refcount_cache_size, Error **errp)
877 {
878     BDRVQcow2State *s = bs->opaque;
879     uint64_t combined_cache_size, l2_cache_max_setting;
880     bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
881     bool l2_cache_entry_size_set;
882     int min_refcount_cache = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
883     uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
884     uint64_t max_l2_entries = DIV_ROUND_UP(virtual_disk_size, s->cluster_size);
885     /* An L2 table is always one cluster in size so the max cache size
886      * should be a multiple of the cluster size. */
887     uint64_t max_l2_cache = ROUND_UP(max_l2_entries * l2_entry_size(s),
888                                      s->cluster_size);
889 
890     combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
891     l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
892     refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
893     l2_cache_entry_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE);
894 
895     combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
896     l2_cache_max_setting = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE,
897                                              DEFAULT_L2_CACHE_MAX_SIZE);
898     *refcount_cache_size = qemu_opt_get_size(opts,
899                                              QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
900 
901     *l2_cache_entry_size = qemu_opt_get_size(
902         opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE, s->cluster_size);
903 
904     *l2_cache_size = MIN(max_l2_cache, l2_cache_max_setting);
905 
906     if (combined_cache_size_set) {
907         if (l2_cache_size_set && refcount_cache_size_set) {
908             error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
909                        " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
910                        "at the same time");
911             return false;
912         } else if (l2_cache_size_set &&
913                    (l2_cache_max_setting > combined_cache_size)) {
914             error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
915                        QCOW2_OPT_CACHE_SIZE);
916             return false;
917         } else if (*refcount_cache_size > combined_cache_size) {
918             error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
919                        QCOW2_OPT_CACHE_SIZE);
920             return false;
921         }
922 
923         if (l2_cache_size_set) {
924             *refcount_cache_size = combined_cache_size - *l2_cache_size;
925         } else if (refcount_cache_size_set) {
926             *l2_cache_size = combined_cache_size - *refcount_cache_size;
927         } else {
928             /* Assign as much memory as possible to the L2 cache, and
929              * use the remainder for the refcount cache */
930             if (combined_cache_size >= max_l2_cache + min_refcount_cache) {
931                 *l2_cache_size = max_l2_cache;
932                 *refcount_cache_size = combined_cache_size - *l2_cache_size;
933             } else {
934                 *refcount_cache_size =
935                     MIN(combined_cache_size, min_refcount_cache);
936                 *l2_cache_size = combined_cache_size - *refcount_cache_size;
937             }
938         }
939     }
940 
941     /*
942      * If the L2 cache is not enough to cover the whole disk then
943      * default to 4KB entries. Smaller entries reduce the cost of
944      * loads and evictions and increase I/O performance.
945      */
946     if (*l2_cache_size < max_l2_cache && !l2_cache_entry_size_set) {
947         *l2_cache_entry_size = MIN(s->cluster_size, 4096);
948     }
949 
950     /* l2_cache_size and refcount_cache_size are ensured to have at least
951      * their minimum values in qcow2_update_options_prepare() */
952 
953     if (*l2_cache_entry_size < (1 << MIN_CLUSTER_BITS) ||
954         *l2_cache_entry_size > s->cluster_size ||
955         !is_power_of_2(*l2_cache_entry_size)) {
956         error_setg(errp, "L2 cache entry size must be a power of two "
957                    "between %d and the cluster size (%d)",
958                    1 << MIN_CLUSTER_BITS, s->cluster_size);
959         return false;
960     }
961 
962     return true;
963 }
964 
965 typedef struct Qcow2ReopenState {
966     Qcow2Cache *l2_table_cache;
967     Qcow2Cache *refcount_block_cache;
968     int l2_slice_size; /* Number of entries in a slice of the L2 table */
969     bool use_lazy_refcounts;
970     int overlap_check;
971     bool discard_passthrough[QCOW2_DISCARD_MAX];
972     uint64_t cache_clean_interval;
973     QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
974 } Qcow2ReopenState;
975 
976 static int qcow2_update_options_prepare(BlockDriverState *bs,
977                                         Qcow2ReopenState *r,
978                                         QDict *options, int flags,
979                                         Error **errp)
980 {
981     BDRVQcow2State *s = bs->opaque;
982     QemuOpts *opts = NULL;
983     const char *opt_overlap_check, *opt_overlap_check_template;
984     int overlap_check_template = 0;
985     uint64_t l2_cache_size, l2_cache_entry_size, refcount_cache_size;
986     int i;
987     const char *encryptfmt;
988     QDict *encryptopts = NULL;
989     int ret;
990 
991     qdict_extract_subqdict(options, &encryptopts, "encrypt.");
992     encryptfmt = qdict_get_try_str(encryptopts, "format");
993 
994     opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
995     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
996         ret = -EINVAL;
997         goto fail;
998     }
999 
1000     /* get L2 table/refcount block cache size from command line options */
1001     if (!read_cache_sizes(bs, opts, &l2_cache_size, &l2_cache_entry_size,
1002                           &refcount_cache_size, errp)) {
1003         ret = -EINVAL;
1004         goto fail;
1005     }
1006 
1007     l2_cache_size /= l2_cache_entry_size;
1008     if (l2_cache_size < MIN_L2_CACHE_SIZE) {
1009         l2_cache_size = MIN_L2_CACHE_SIZE;
1010     }
1011     if (l2_cache_size > INT_MAX) {
1012         error_setg(errp, "L2 cache size too big");
1013         ret = -EINVAL;
1014         goto fail;
1015     }
1016 
1017     refcount_cache_size /= s->cluster_size;
1018     if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
1019         refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
1020     }
1021     if (refcount_cache_size > INT_MAX) {
1022         error_setg(errp, "Refcount cache size too big");
1023         ret = -EINVAL;
1024         goto fail;
1025     }
1026 
1027     /* alloc new L2 table/refcount block cache, flush old one */
1028     if (s->l2_table_cache) {
1029         ret = qcow2_cache_flush(bs, s->l2_table_cache);
1030         if (ret) {
1031             error_setg_errno(errp, -ret, "Failed to flush the L2 table cache");
1032             goto fail;
1033         }
1034     }
1035 
1036     if (s->refcount_block_cache) {
1037         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
1038         if (ret) {
1039             error_setg_errno(errp, -ret,
1040                              "Failed to flush the refcount block cache");
1041             goto fail;
1042         }
1043     }
1044 
1045     r->l2_slice_size = l2_cache_entry_size / l2_entry_size(s);
1046     r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size,
1047                                            l2_cache_entry_size);
1048     r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size,
1049                                                  s->cluster_size);
1050     if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
1051         error_setg(errp, "Could not allocate metadata caches");
1052         ret = -ENOMEM;
1053         goto fail;
1054     }
1055 
1056     /* New interval for cache cleanup timer */
1057     r->cache_clean_interval =
1058         qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL,
1059                             DEFAULT_CACHE_CLEAN_INTERVAL);
1060 #ifndef CONFIG_LINUX
1061     if (r->cache_clean_interval != 0) {
1062         error_setg(errp, QCOW2_OPT_CACHE_CLEAN_INTERVAL
1063                    " not supported on this host");
1064         ret = -EINVAL;
1065         goto fail;
1066     }
1067 #endif
1068     if (r->cache_clean_interval > UINT_MAX) {
1069         error_setg(errp, "Cache clean interval too big");
1070         ret = -EINVAL;
1071         goto fail;
1072     }
1073 
1074     /* lazy-refcounts; flush if going from enabled to disabled */
1075     r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
1076         (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
1077     if (r->use_lazy_refcounts && s->qcow_version < 3) {
1078         error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
1079                    "qemu 1.1 compatibility level");
1080         ret = -EINVAL;
1081         goto fail;
1082     }
1083 
1084     if (s->use_lazy_refcounts && !r->use_lazy_refcounts) {
1085         ret = qcow2_mark_clean(bs);
1086         if (ret < 0) {
1087             error_setg_errno(errp, -ret, "Failed to disable lazy refcounts");
1088             goto fail;
1089         }
1090     }
1091 
1092     /* Overlap check options */
1093     opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
1094     opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
1095     if (opt_overlap_check_template && opt_overlap_check &&
1096         strcmp(opt_overlap_check_template, opt_overlap_check))
1097     {
1098         error_setg(errp, "Conflicting values for qcow2 options '"
1099                    QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
1100                    "' ('%s')", opt_overlap_check, opt_overlap_check_template);
1101         ret = -EINVAL;
1102         goto fail;
1103     }
1104     if (!opt_overlap_check) {
1105         opt_overlap_check = opt_overlap_check_template ?: "cached";
1106     }
1107 
1108     if (!strcmp(opt_overlap_check, "none")) {
1109         overlap_check_template = 0;
1110     } else if (!strcmp(opt_overlap_check, "constant")) {
1111         overlap_check_template = QCOW2_OL_CONSTANT;
1112     } else if (!strcmp(opt_overlap_check, "cached")) {
1113         overlap_check_template = QCOW2_OL_CACHED;
1114     } else if (!strcmp(opt_overlap_check, "all")) {
1115         overlap_check_template = QCOW2_OL_ALL;
1116     } else {
1117         error_setg(errp, "Unsupported value '%s' for qcow2 option "
1118                    "'overlap-check'. Allowed are any of the following: "
1119                    "none, constant, cached, all", opt_overlap_check);
1120         ret = -EINVAL;
1121         goto fail;
1122     }
1123 
1124     r->overlap_check = 0;
1125     for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
1126         /* overlap-check defines a template bitmask, but every flag may be
1127          * overwritten through the associated boolean option */
1128         r->overlap_check |=
1129             qemu_opt_get_bool(opts, overlap_bool_option_names[i],
1130                               overlap_check_template & (1 << i)) << i;
1131     }
1132 
1133     r->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
1134     r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
1135     r->discard_passthrough[QCOW2_DISCARD_REQUEST] =
1136         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
1137                           flags & BDRV_O_UNMAP);
1138     r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
1139         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
1140     r->discard_passthrough[QCOW2_DISCARD_OTHER] =
1141         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
1142 
1143     switch (s->crypt_method_header) {
1144     case QCOW_CRYPT_NONE:
1145         if (encryptfmt) {
1146             error_setg(errp, "No encryption in image header, but options "
1147                        "specified format '%s'", encryptfmt);
1148             ret = -EINVAL;
1149             goto fail;
1150         }
1151         break;
1152 
1153     case QCOW_CRYPT_AES:
1154         if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
1155             error_setg(errp,
1156                        "Header reported 'aes' encryption format but "
1157                        "options specify '%s'", encryptfmt);
1158             ret = -EINVAL;
1159             goto fail;
1160         }
1161         qdict_put_str(encryptopts, "format", "qcow");
1162         r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
1163         if (!r->crypto_opts) {
1164             ret = -EINVAL;
1165             goto fail;
1166         }
1167         break;
1168 
1169     case QCOW_CRYPT_LUKS:
1170         if (encryptfmt && !g_str_equal(encryptfmt, "luks")) {
1171             error_setg(errp,
1172                        "Header reported 'luks' encryption format but "
1173                        "options specify '%s'", encryptfmt);
1174             ret = -EINVAL;
1175             goto fail;
1176         }
1177         qdict_put_str(encryptopts, "format", "luks");
1178         r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
1179         if (!r->crypto_opts) {
1180             ret = -EINVAL;
1181             goto fail;
1182         }
1183         break;
1184 
1185     default:
1186         error_setg(errp, "Unsupported encryption method %d",
1187                    s->crypt_method_header);
1188         ret = -EINVAL;
1189         goto fail;
1190     }
1191 
1192     ret = 0;
1193 fail:
1194     qobject_unref(encryptopts);
1195     qemu_opts_del(opts);
1196     opts = NULL;
1197     return ret;
1198 }
1199 
1200 static void qcow2_update_options_commit(BlockDriverState *bs,
1201                                         Qcow2ReopenState *r)
1202 {
1203     BDRVQcow2State *s = bs->opaque;
1204     int i;
1205 
1206     if (s->l2_table_cache) {
1207         qcow2_cache_destroy(s->l2_table_cache);
1208     }
1209     if (s->refcount_block_cache) {
1210         qcow2_cache_destroy(s->refcount_block_cache);
1211     }
1212     s->l2_table_cache = r->l2_table_cache;
1213     s->refcount_block_cache = r->refcount_block_cache;
1214     s->l2_slice_size = r->l2_slice_size;
1215 
1216     s->overlap_check = r->overlap_check;
1217     s->use_lazy_refcounts = r->use_lazy_refcounts;
1218 
1219     for (i = 0; i < QCOW2_DISCARD_MAX; i++) {
1220         s->discard_passthrough[i] = r->discard_passthrough[i];
1221     }
1222 
1223     if (s->cache_clean_interval != r->cache_clean_interval) {
1224         cache_clean_timer_del(bs);
1225         s->cache_clean_interval = r->cache_clean_interval;
1226         cache_clean_timer_init(bs, bdrv_get_aio_context(bs));
1227     }
1228 
1229     qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
1230     s->crypto_opts = r->crypto_opts;
1231 }
1232 
1233 static void qcow2_update_options_abort(BlockDriverState *bs,
1234                                        Qcow2ReopenState *r)
1235 {
1236     if (r->l2_table_cache) {
1237         qcow2_cache_destroy(r->l2_table_cache);
1238     }
1239     if (r->refcount_block_cache) {
1240         qcow2_cache_destroy(r->refcount_block_cache);
1241     }
1242     qapi_free_QCryptoBlockOpenOptions(r->crypto_opts);
1243 }
1244 
1245 static int coroutine_fn
1246 qcow2_update_options(BlockDriverState *bs, QDict *options, int flags,
1247                      Error **errp)
1248 {
1249     Qcow2ReopenState r = {};
1250     int ret;
1251 
1252     ret = qcow2_update_options_prepare(bs, &r, options, flags, errp);
1253     if (ret >= 0) {
1254         qcow2_update_options_commit(bs, &r);
1255     } else {
1256         qcow2_update_options_abort(bs, &r);
1257     }
1258 
1259     return ret;
1260 }
1261 
1262 static int validate_compression_type(BDRVQcow2State *s, Error **errp)
1263 {
1264     switch (s->compression_type) {
1265     case QCOW2_COMPRESSION_TYPE_ZLIB:
1266 #ifdef CONFIG_ZSTD
1267     case QCOW2_COMPRESSION_TYPE_ZSTD:
1268 #endif
1269         break;
1270 
1271     default:
1272         error_setg(errp, "qcow2: unknown compression type: %u",
1273                    s->compression_type);
1274         return -ENOTSUP;
1275     }
1276 
1277     /*
1278      * if the compression type differs from QCOW2_COMPRESSION_TYPE_ZLIB
1279      * the incompatible feature flag must be set
1280      */
1281     if (s->compression_type == QCOW2_COMPRESSION_TYPE_ZLIB) {
1282         if (s->incompatible_features & QCOW2_INCOMPAT_COMPRESSION) {
1283             error_setg(errp, "qcow2: Compression type incompatible feature "
1284                              "bit must not be set");
1285             return -EINVAL;
1286         }
1287     } else {
1288         if (!(s->incompatible_features & QCOW2_INCOMPAT_COMPRESSION)) {
1289             error_setg(errp, "qcow2: Compression type incompatible feature "
1290                              "bit must be set");
1291             return -EINVAL;
1292         }
1293     }
1294 
1295     return 0;
1296 }
1297 
1298 /* Called with s->lock held.  */
1299 static int coroutine_fn GRAPH_RDLOCK
1300 qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
1301               bool open_data_file, Error **errp)
1302 {
1303     ERRP_GUARD();
1304     BDRVQcow2State *s = bs->opaque;
1305     unsigned int len, i;
1306     int ret = 0;
1307     QCowHeader header;
1308     uint64_t ext_end;
1309     uint64_t l1_vm_state_index;
1310     bool update_header = false;
1311 
1312     ret = bdrv_co_pread(bs->file, 0, sizeof(header), &header, 0);
1313     if (ret < 0) {
1314         error_setg_errno(errp, -ret, "Could not read qcow2 header");
1315         goto fail;
1316     }
1317     header.magic = be32_to_cpu(header.magic);
1318     header.version = be32_to_cpu(header.version);
1319     header.backing_file_offset = be64_to_cpu(header.backing_file_offset);
1320     header.backing_file_size = be32_to_cpu(header.backing_file_size);
1321     header.size = be64_to_cpu(header.size);
1322     header.cluster_bits = be32_to_cpu(header.cluster_bits);
1323     header.crypt_method = be32_to_cpu(header.crypt_method);
1324     header.l1_table_offset = be64_to_cpu(header.l1_table_offset);
1325     header.l1_size = be32_to_cpu(header.l1_size);
1326     header.refcount_table_offset = be64_to_cpu(header.refcount_table_offset);
1327     header.refcount_table_clusters =
1328         be32_to_cpu(header.refcount_table_clusters);
1329     header.snapshots_offset = be64_to_cpu(header.snapshots_offset);
1330     header.nb_snapshots = be32_to_cpu(header.nb_snapshots);
1331 
1332     if (header.magic != QCOW_MAGIC) {
1333         error_setg(errp, "Image is not in qcow2 format");
1334         ret = -EINVAL;
1335         goto fail;
1336     }
1337     if (header.version < 2 || header.version > 3) {
1338         error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version);
1339         ret = -ENOTSUP;
1340         goto fail;
1341     }
1342 
1343     s->qcow_version = header.version;
1344 
1345     /* Initialise cluster size */
1346     if (header.cluster_bits < MIN_CLUSTER_BITS ||
1347         header.cluster_bits > MAX_CLUSTER_BITS) {
1348         error_setg(errp, "Unsupported cluster size: 2^%" PRIu32,
1349                    header.cluster_bits);
1350         ret = -EINVAL;
1351         goto fail;
1352     }
1353 
1354     s->cluster_bits = header.cluster_bits;
1355     s->cluster_size = 1 << s->cluster_bits;
1356 
1357     /* Initialise version 3 header fields */
1358     if (header.version == 2) {
1359         header.incompatible_features    = 0;
1360         header.compatible_features      = 0;
1361         header.autoclear_features       = 0;
1362         header.refcount_order           = 4;
1363         header.header_length            = 72;
1364     } else {
1365         header.incompatible_features =
1366             be64_to_cpu(header.incompatible_features);
1367         header.compatible_features = be64_to_cpu(header.compatible_features);
1368         header.autoclear_features = be64_to_cpu(header.autoclear_features);
1369         header.refcount_order = be32_to_cpu(header.refcount_order);
1370         header.header_length = be32_to_cpu(header.header_length);
1371 
1372         if (header.header_length < 104) {
1373             error_setg(errp, "qcow2 header too short");
1374             ret = -EINVAL;
1375             goto fail;
1376         }
1377     }
1378 
1379     if (header.header_length > s->cluster_size) {
1380         error_setg(errp, "qcow2 header exceeds cluster size");
1381         ret = -EINVAL;
1382         goto fail;
1383     }
1384 
1385     if (header.header_length > sizeof(header)) {
1386         s->unknown_header_fields_size = header.header_length - sizeof(header);
1387         s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
1388         ret = bdrv_co_pread(bs->file, sizeof(header),
1389                             s->unknown_header_fields_size,
1390                             s->unknown_header_fields, 0);
1391         if (ret < 0) {
1392             error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
1393                              "fields");
1394             goto fail;
1395         }
1396     }
1397 
1398     if (header.backing_file_offset > s->cluster_size) {
1399         error_setg(errp, "Invalid backing file offset");
1400         ret = -EINVAL;
1401         goto fail;
1402     }
1403 
1404     if (header.backing_file_offset) {
1405         ext_end = header.backing_file_offset;
1406     } else {
1407         ext_end = 1 << header.cluster_bits;
1408     }
1409 
1410     /* Handle feature bits */
1411     s->incompatible_features    = header.incompatible_features;
1412     s->compatible_features      = header.compatible_features;
1413     s->autoclear_features       = header.autoclear_features;
1414 
1415     /*
1416      * Handle compression type
1417      * Older qcow2 images don't contain the compression type header.
1418      * Distinguish them by the header length and use
1419      * the only valid (default) compression type in that case
1420      */
1421     if (header.header_length > offsetof(QCowHeader, compression_type)) {
1422         s->compression_type = header.compression_type;
1423     } else {
1424         s->compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
1425     }
1426 
1427     ret = validate_compression_type(s, errp);
1428     if (ret) {
1429         goto fail;
1430     }
1431 
1432     if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
1433         void *feature_table = NULL;
1434         qcow2_read_extensions(bs, header.header_length, ext_end,
1435                               &feature_table, flags, NULL, NULL);
1436         report_unsupported_feature(errp, feature_table,
1437                                    s->incompatible_features &
1438                                    ~QCOW2_INCOMPAT_MASK);
1439         ret = -ENOTSUP;
1440         g_free(feature_table);
1441         goto fail;
1442     }
1443 
1444     if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
1445         /* Corrupt images may not be written to unless they are being repaired
1446          */
1447         if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
1448             error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
1449                        "read/write");
1450             ret = -EACCES;
1451             goto fail;
1452         }
1453     }
1454 
1455     s->subclusters_per_cluster =
1456         has_subclusters(s) ? QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER : 1;
1457     s->subcluster_size = s->cluster_size / s->subclusters_per_cluster;
1458     s->subcluster_bits = ctz32(s->subcluster_size);
1459 
1460     if (s->subcluster_size < (1 << MIN_CLUSTER_BITS)) {
1461         error_setg(errp, "Unsupported subcluster size: %d", s->subcluster_size);
1462         ret = -EINVAL;
1463         goto fail;
1464     }
1465 
1466     /* Check support for various header values */
1467     if (header.refcount_order > 6) {
1468         error_setg(errp, "Reference count entry width too large; may not "
1469                    "exceed 64 bits");
1470         ret = -EINVAL;
1471         goto fail;
1472     }
1473     s->refcount_order = header.refcount_order;
1474     s->refcount_bits = 1 << s->refcount_order;
1475     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
1476     s->refcount_max += s->refcount_max - 1;
1477 
1478     s->crypt_method_header = header.crypt_method;
1479     if (s->crypt_method_header) {
1480         if (bdrv_uses_whitelist() &&
1481             s->crypt_method_header == QCOW_CRYPT_AES) {
1482             error_setg(errp,
1483                        "Use of AES-CBC encrypted qcow2 images is no longer "
1484                        "supported in system emulators");
1485             error_append_hint(errp,
1486                               "You can use 'qemu-img convert' to convert your "
1487                               "image to an alternative supported format, such "
1488                               "as unencrypted qcow2, or raw with the LUKS "
1489                               "format instead.\n");
1490             ret = -ENOSYS;
1491             goto fail;
1492         }
1493 
1494         if (s->crypt_method_header == QCOW_CRYPT_AES) {
1495             s->crypt_physical_offset = false;
1496         } else {
1497             /* Assuming LUKS and any future crypt methods we
1498              * add will all use physical offsets, due to the
1499              * fact that the alternative is insecure...  */
1500             s->crypt_physical_offset = true;
1501         }
1502 
1503         bs->encrypted = true;
1504     }
1505 
1506     s->l2_bits = s->cluster_bits - ctz32(l2_entry_size(s));
1507     s->l2_size = 1 << s->l2_bits;
1508     /* 2^(s->refcount_order - 3) is the refcount width in bytes */
1509     s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
1510     s->refcount_block_size = 1 << s->refcount_block_bits;
1511     bs->total_sectors = header.size / BDRV_SECTOR_SIZE;
1512     s->csize_shift = (62 - (s->cluster_bits - 8));
1513     s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
1514     s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
1515 
1516     s->refcount_table_offset = header.refcount_table_offset;
1517     s->refcount_table_size =
1518         header.refcount_table_clusters << (s->cluster_bits - 3);
1519 
1520     if (header.refcount_table_clusters == 0 && !(flags & BDRV_O_CHECK)) {
1521         error_setg(errp, "Image does not contain a reference count table");
1522         ret = -EINVAL;
1523         goto fail;
1524     }
1525 
1526     ret = qcow2_validate_table(bs, s->refcount_table_offset,
1527                                header.refcount_table_clusters,
1528                                s->cluster_size, QCOW_MAX_REFTABLE_SIZE,
1529                                "Reference count table", errp);
1530     if (ret < 0) {
1531         goto fail;
1532     }
1533 
1534     if (!(flags & BDRV_O_CHECK)) {
1535         /*
1536          * The total size in bytes of the snapshot table is checked in
1537          * qcow2_read_snapshots() because the size of each snapshot is
1538          * variable and we don't know it yet.
1539          * Here we only check the offset and number of snapshots.
1540          */
1541         ret = qcow2_validate_table(bs, header.snapshots_offset,
1542                                    header.nb_snapshots,
1543                                    sizeof(QCowSnapshotHeader),
1544                                    sizeof(QCowSnapshotHeader) *
1545                                        QCOW_MAX_SNAPSHOTS,
1546                                    "Snapshot table", errp);
1547         if (ret < 0) {
1548             goto fail;
1549         }
1550     }
1551 
1552     /* read the level 1 table */
1553     ret = qcow2_validate_table(bs, header.l1_table_offset,
1554                                header.l1_size, L1E_SIZE,
1555                                QCOW_MAX_L1_SIZE, "Active L1 table", errp);
1556     if (ret < 0) {
1557         goto fail;
1558     }
1559     s->l1_size = header.l1_size;
1560     s->l1_table_offset = header.l1_table_offset;
1561 
1562     l1_vm_state_index = size_to_l1(s, header.size);
1563     if (l1_vm_state_index > INT_MAX) {
1564         error_setg(errp, "Image is too big");
1565         ret = -EFBIG;
1566         goto fail;
1567     }
1568     s->l1_vm_state_index = l1_vm_state_index;
1569 
1570     /* the L1 table must contain at least enough entries to put
1571        header.size bytes */
1572     if (s->l1_size < s->l1_vm_state_index) {
1573         error_setg(errp, "L1 table is too small");
1574         ret = -EINVAL;
1575         goto fail;
1576     }
1577 
1578     if (s->l1_size > 0) {
1579         s->l1_table = qemu_try_blockalign(bs->file->bs, s->l1_size * L1E_SIZE);
1580         if (s->l1_table == NULL) {
1581             error_setg(errp, "Could not allocate L1 table");
1582             ret = -ENOMEM;
1583             goto fail;
1584         }
1585         ret = bdrv_co_pread(bs->file, s->l1_table_offset, s->l1_size * L1E_SIZE,
1586                             s->l1_table, 0);
1587         if (ret < 0) {
1588             error_setg_errno(errp, -ret, "Could not read L1 table");
1589             goto fail;
1590         }
1591         for(i = 0;i < s->l1_size; i++) {
1592             s->l1_table[i] = be64_to_cpu(s->l1_table[i]);
1593         }
1594     }
1595 
1596     /* Parse driver-specific options */
1597     ret = qcow2_update_options(bs, options, flags, errp);
1598     if (ret < 0) {
1599         goto fail;
1600     }
1601 
1602     s->flags = flags;
1603 
1604     ret = qcow2_refcount_init(bs);
1605     if (ret != 0) {
1606         error_setg_errno(errp, -ret, "Could not initialize refcount handling");
1607         goto fail;
1608     }
1609 
1610     QLIST_INIT(&s->cluster_allocs);
1611     QTAILQ_INIT(&s->discards);
1612 
1613     /* read qcow2 extensions */
1614     if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
1615                               flags, &update_header, errp)) {
1616         ret = -EINVAL;
1617         goto fail;
1618     }
1619 
1620     if (open_data_file) {
1621         /* Open external data file */
1622         bdrv_graph_co_rdunlock();
1623         s->data_file = bdrv_co_open_child(NULL, options, "data-file", bs,
1624                                           &child_of_bds, BDRV_CHILD_DATA,
1625                                           true, errp);
1626         bdrv_graph_co_rdlock();
1627         if (*errp) {
1628             ret = -EINVAL;
1629             goto fail;
1630         }
1631 
1632         if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) {
1633             if (!s->data_file && s->image_data_file) {
1634                 bdrv_graph_co_rdunlock();
1635                 s->data_file = bdrv_co_open_child(s->image_data_file, options,
1636                                                   "data-file", bs,
1637                                                   &child_of_bds,
1638                                                   BDRV_CHILD_DATA, false, errp);
1639                 bdrv_graph_co_rdlock();
1640                 if (!s->data_file) {
1641                     ret = -EINVAL;
1642                     goto fail;
1643                 }
1644             }
1645             if (!s->data_file) {
1646                 error_setg(errp, "'data-file' is required for this image");
1647                 ret = -EINVAL;
1648                 goto fail;
1649             }
1650 
1651             /* No data here */
1652             bs->file->role &= ~BDRV_CHILD_DATA;
1653 
1654             /* Must succeed because we have given up permissions if anything */
1655             bdrv_child_refresh_perms(bs, bs->file, &error_abort);
1656         } else {
1657             if (s->data_file) {
1658                 error_setg(errp, "'data-file' can only be set for images with "
1659                                  "an external data file");
1660                 ret = -EINVAL;
1661                 goto fail;
1662             }
1663 
1664             s->data_file = bs->file;
1665 
1666             if (data_file_is_raw(bs)) {
1667                 error_setg(errp, "data-file-raw requires a data file");
1668                 ret = -EINVAL;
1669                 goto fail;
1670             }
1671         }
1672     }
1673 
1674     /* qcow2_read_extension may have set up the crypto context
1675      * if the crypt method needs a header region, some methods
1676      * don't need header extensions, so must check here
1677      */
1678     if (s->crypt_method_header && !s->crypto) {
1679         if (s->crypt_method_header == QCOW_CRYPT_AES) {
1680             unsigned int cflags = 0;
1681             if (flags & BDRV_O_NO_IO) {
1682                 cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
1683             }
1684             s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
1685                                            NULL, NULL, cflags,
1686                                            QCOW2_MAX_THREADS, errp);
1687             if (!s->crypto) {
1688                 ret = -EINVAL;
1689                 goto fail;
1690             }
1691         } else if (!(flags & BDRV_O_NO_IO)) {
1692             error_setg(errp, "Missing CRYPTO header for crypt method %d",
1693                        s->crypt_method_header);
1694             ret = -EINVAL;
1695             goto fail;
1696         }
1697     }
1698 
1699     /* read the backing file name */
1700     if (header.backing_file_offset != 0) {
1701         len = header.backing_file_size;
1702         if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
1703             len >= sizeof(bs->backing_file)) {
1704             error_setg(errp, "Backing file name too long");
1705             ret = -EINVAL;
1706             goto fail;
1707         }
1708 
1709         s->image_backing_file = g_malloc(len + 1);
1710         ret = bdrv_co_pread(bs->file, header.backing_file_offset, len,
1711                             s->image_backing_file, 0);
1712         if (ret < 0) {
1713             error_setg_errno(errp, -ret, "Could not read backing file name");
1714             goto fail;
1715         }
1716         s->image_backing_file[len] = '\0';
1717 
1718         /*
1719          * Update only when something has changed.  This function is called by
1720          * qcow2_co_invalidate_cache(), and we do not want to reset
1721          * auto_backing_file unless necessary.
1722          */
1723         if (!g_str_equal(s->image_backing_file, bs->backing_file)) {
1724             pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1725                     s->image_backing_file);
1726             pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
1727                     s->image_backing_file);
1728         }
1729     }
1730 
1731     /*
1732      * Internal snapshots; skip reading them in check mode, because
1733      * we do not need them then, and we do not want to abort because
1734      * of a broken table.
1735      */
1736     if (!(flags & BDRV_O_CHECK)) {
1737         s->snapshots_offset = header.snapshots_offset;
1738         s->nb_snapshots = header.nb_snapshots;
1739 
1740         ret = qcow2_read_snapshots(bs, errp);
1741         if (ret < 0) {
1742             goto fail;
1743         }
1744     }
1745 
1746     /* Clear unknown autoclear feature bits */
1747     update_header |= s->autoclear_features & ~QCOW2_AUTOCLEAR_MASK;
1748     update_header = update_header && bdrv_is_writable(bs);
1749     if (update_header) {
1750         s->autoclear_features &= QCOW2_AUTOCLEAR_MASK;
1751     }
1752 
1753     /* == Handle persistent dirty bitmaps ==
1754      *
1755      * We want to load dirty bitmaps in three cases:
1756      *
1757      * 1. Normal open of the disk in active mode, not related to invalidation
1758      *    after migration.
1759      *
1760      * 2. Invalidation of the target vm after pre-copy phase of migration, if
1761      *    bitmaps are _not_ migrating through migration channel, i.e.
1762      *    'dirty-bitmaps' capability is disabled.
1763      *
1764      * 3. Invalidation of source vm after failed or canceled migration.
1765      *    This is a very interesting case. There are two possible types of
1766      *    bitmaps:
1767      *
1768      *    A. Stored on inactivation and removed. They should be loaded from the
1769      *       image.
1770      *
1771      *    B. Not stored: not-persistent bitmaps and bitmaps, migrated through
1772      *       the migration channel (with dirty-bitmaps capability).
1773      *
1774      *    On the other hand, there are two possible sub-cases:
1775      *
1776      *    3.1 disk was changed by somebody else while we were inactive. In this
1777      *        case all in-RAM dirty bitmaps (both persistent and not) are
1778      *        definitely invalid. And we don't have any method to determine
1779      *        this.
1780      *
1781      *        Simple and safe thing is to just drop all the bitmaps of type B on
1782      *        inactivation. But in this case we lose bitmaps in valid 3.2 case.
1783      *
1784      *        On the other hand, resuming source vm, if disk was already changed
1785      *        is a bad thing anyway: not only bitmaps, the whole vm state is
1786      *        out of sync with disk.
1787      *
1788      *        This means, that user or management tool, who for some reason
1789      *        decided to resume source vm, after disk was already changed by
1790      *        target vm, should at least drop all dirty bitmaps by hand.
1791      *
1792      *        So, we can ignore this case for now, but TODO: "generation"
1793      *        extension for qcow2, to determine, that image was changed after
1794      *        last inactivation. And if it is changed, we will drop (or at least
1795      *        mark as 'invalid' all the bitmaps of type B, both persistent
1796      *        and not).
1797      *
1798      *    3.2 disk was _not_ changed while we were inactive. Bitmaps may be saved
1799      *        to disk ('dirty-bitmaps' capability disabled), or not saved
1800      *        ('dirty-bitmaps' capability enabled), but we don't need to care
1801      *        of: let's load bitmaps as always: stored bitmaps will be loaded,
1802      *        and not stored has flag IN_USE=1 in the image and will be skipped
1803      *        on loading.
1804      *
1805      * One remaining possible case when we don't want to load bitmaps:
1806      *
1807      * 4. Open disk in inactive mode in target vm (bitmaps are migrating or
1808      *    will be loaded on invalidation, no need to try loading them before)
1809      */
1810 
1811     if (!(bdrv_get_flags(bs) & BDRV_O_INACTIVE)) {
1812         /* It's case 1, 2 or 3.2. Or 3.1 which is BUG in management layer. */
1813         bool header_updated;
1814         if (!qcow2_load_dirty_bitmaps(bs, &header_updated, errp)) {
1815             ret = -EINVAL;
1816             goto fail;
1817         }
1818 
1819         update_header = update_header && !header_updated;
1820     }
1821 
1822     if (update_header) {
1823         ret = qcow2_update_header(bs);
1824         if (ret < 0) {
1825             error_setg_errno(errp, -ret, "Could not update qcow2 header");
1826             goto fail;
1827         }
1828     }
1829 
1830     bs->supported_zero_flags = header.version >= 3 ?
1831                                BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK : 0;
1832     bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
1833 
1834     /* Repair image if dirty */
1835     if (!(flags & BDRV_O_CHECK) && bdrv_is_writable(bs) &&
1836         (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
1837         BdrvCheckResult result = {0};
1838 
1839         ret = qcow2_co_check_locked(bs, &result,
1840                                     BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
1841         if (ret < 0 || result.check_errors) {
1842             if (ret >= 0) {
1843                 ret = -EIO;
1844             }
1845             error_setg_errno(errp, -ret, "Could not repair dirty image");
1846             goto fail;
1847         }
1848     }
1849 
1850 #ifdef DEBUG_ALLOC
1851     {
1852         BdrvCheckResult result = {0};
1853         qcow2_check_refcounts(bs, &result, 0);
1854     }
1855 #endif
1856 
1857     qemu_co_queue_init(&s->thread_task_queue);
1858 
1859     return ret;
1860 
1861  fail:
1862     g_free(s->image_data_file);
1863     if (open_data_file && has_data_file(bs)) {
1864         bdrv_graph_co_rdunlock();
1865         bdrv_unref_child(bs, s->data_file);
1866         bdrv_graph_co_rdlock();
1867         s->data_file = NULL;
1868     }
1869     g_free(s->unknown_header_fields);
1870     cleanup_unknown_header_ext(bs);
1871     qcow2_free_snapshots(bs);
1872     qcow2_refcount_close(bs);
1873     qemu_vfree(s->l1_table);
1874     /* else pre-write overlap checks in cache_destroy may crash */
1875     s->l1_table = NULL;
1876     cache_clean_timer_del(bs);
1877     if (s->l2_table_cache) {
1878         qcow2_cache_destroy(s->l2_table_cache);
1879     }
1880     if (s->refcount_block_cache) {
1881         qcow2_cache_destroy(s->refcount_block_cache);
1882     }
1883     qcrypto_block_free(s->crypto);
1884     qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
1885     return ret;
1886 }
1887 
1888 typedef struct QCow2OpenCo {
1889     BlockDriverState *bs;
1890     QDict *options;
1891     int flags;
1892     Error **errp;
1893     int ret;
1894 } QCow2OpenCo;
1895 
1896 static void coroutine_fn qcow2_open_entry(void *opaque)
1897 {
1898     QCow2OpenCo *qoc = opaque;
1899     BDRVQcow2State *s = qoc->bs->opaque;
1900 
1901     GRAPH_RDLOCK_GUARD();
1902 
1903     qemu_co_mutex_lock(&s->lock);
1904     qoc->ret = qcow2_do_open(qoc->bs, qoc->options, qoc->flags, true,
1905                              qoc->errp);
1906     qemu_co_mutex_unlock(&s->lock);
1907 }
1908 
1909 static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
1910                       Error **errp)
1911 {
1912     BDRVQcow2State *s = bs->opaque;
1913     QCow2OpenCo qoc = {
1914         .bs = bs,
1915         .options = options,
1916         .flags = flags,
1917         .errp = errp,
1918         .ret = -EINPROGRESS
1919     };
1920     int ret;
1921 
1922     ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
1923     if (ret < 0) {
1924         return ret;
1925     }
1926 
1927     /* Initialise locks */
1928     qemu_co_mutex_init(&s->lock);
1929 
1930     assert(!qemu_in_coroutine());
1931     assert(qemu_get_current_aio_context() == qemu_get_aio_context());
1932     qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc));
1933     BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
1934 
1935     return qoc.ret;
1936 }
1937 
1938 static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
1939 {
1940     BDRVQcow2State *s = bs->opaque;
1941 
1942     if (bs->encrypted) {
1943         /* Encryption works on a sector granularity */
1944         bs->bl.request_alignment = qcrypto_block_get_sector_size(s->crypto);
1945     }
1946     bs->bl.pwrite_zeroes_alignment = s->subcluster_size;
1947     bs->bl.pdiscard_alignment = s->cluster_size;
1948 }
1949 
1950 static int qcow2_reopen_prepare(BDRVReopenState *state,
1951                                 BlockReopenQueue *queue, Error **errp)
1952 {
1953     BDRVQcow2State *s = state->bs->opaque;
1954     Qcow2ReopenState *r;
1955     int ret;
1956 
1957     r = g_new0(Qcow2ReopenState, 1);
1958     state->opaque = r;
1959 
1960     ret = qcow2_update_options_prepare(state->bs, r, state->options,
1961                                        state->flags, errp);
1962     if (ret < 0) {
1963         goto fail;
1964     }
1965 
1966     /* We need to write out any unwritten data if we reopen read-only. */
1967     if ((state->flags & BDRV_O_RDWR) == 0) {
1968         ret = qcow2_reopen_bitmaps_ro(state->bs, errp);
1969         if (ret < 0) {
1970             goto fail;
1971         }
1972 
1973         ret = bdrv_flush(state->bs);
1974         if (ret < 0) {
1975             goto fail;
1976         }
1977 
1978         ret = qcow2_mark_clean(state->bs);
1979         if (ret < 0) {
1980             goto fail;
1981         }
1982     }
1983 
1984     /*
1985      * Without an external data file, s->data_file points to the same BdrvChild
1986      * as bs->file. It needs to be resynced after reopen because bs->file may
1987      * be changed. We can't use it in the meantime.
1988      */
1989     if (!has_data_file(state->bs)) {
1990         assert(s->data_file == state->bs->file);
1991         s->data_file = NULL;
1992     }
1993 
1994     return 0;
1995 
1996 fail:
1997     qcow2_update_options_abort(state->bs, r);
1998     g_free(r);
1999     return ret;
2000 }
2001 
2002 static void qcow2_reopen_commit(BDRVReopenState *state)
2003 {
2004     BDRVQcow2State *s = state->bs->opaque;
2005 
2006     qcow2_update_options_commit(state->bs, state->opaque);
2007     if (!s->data_file) {
2008         /*
2009          * If we don't have an external data file, s->data_file was cleared by
2010          * qcow2_reopen_prepare() and needs to be updated.
2011          */
2012         s->data_file = state->bs->file;
2013     }
2014     g_free(state->opaque);
2015 }
2016 
2017 static void qcow2_reopen_commit_post(BDRVReopenState *state)
2018 {
2019     if (state->flags & BDRV_O_RDWR) {
2020         Error *local_err = NULL;
2021 
2022         if (qcow2_reopen_bitmaps_rw(state->bs, &local_err) < 0) {
2023             /*
2024              * This is not fatal, bitmaps just left read-only, so all following
2025              * writes will fail. User can remove read-only bitmaps to unblock
2026              * writes or retry reopen.
2027              */
2028             error_reportf_err(local_err,
2029                               "%s: Failed to make dirty bitmaps writable: ",
2030                               bdrv_get_node_name(state->bs));
2031         }
2032     }
2033 }
2034 
2035 static void qcow2_reopen_abort(BDRVReopenState *state)
2036 {
2037     BDRVQcow2State *s = state->bs->opaque;
2038 
2039     if (!s->data_file) {
2040         /*
2041          * If we don't have an external data file, s->data_file was cleared by
2042          * qcow2_reopen_prepare() and needs to be restored.
2043          */
2044         s->data_file = state->bs->file;
2045     }
2046     qcow2_update_options_abort(state->bs, state->opaque);
2047     g_free(state->opaque);
2048 }
2049 
2050 static void qcow2_join_options(QDict *options, QDict *old_options)
2051 {
2052     bool has_new_overlap_template =
2053         qdict_haskey(options, QCOW2_OPT_OVERLAP) ||
2054         qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE);
2055     bool has_new_total_cache_size =
2056         qdict_haskey(options, QCOW2_OPT_CACHE_SIZE);
2057     bool has_all_cache_options;
2058 
2059     /* New overlap template overrides all old overlap options */
2060     if (has_new_overlap_template) {
2061         qdict_del(old_options, QCOW2_OPT_OVERLAP);
2062         qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE);
2063         qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER);
2064         qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1);
2065         qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2);
2066         qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE);
2067         qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK);
2068         qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE);
2069         qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1);
2070         qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2);
2071     }
2072 
2073     /* New total cache size overrides all old options */
2074     if (qdict_haskey(options, QCOW2_OPT_CACHE_SIZE)) {
2075         qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE);
2076         qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
2077     }
2078 
2079     qdict_join(options, old_options, false);
2080 
2081     /*
2082      * If after merging all cache size options are set, an old total size is
2083      * overwritten. Do keep all options, however, if all three are new. The
2084      * resulting error message is what we want to happen.
2085      */
2086     has_all_cache_options =
2087         qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) ||
2088         qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) ||
2089         qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
2090 
2091     if (has_all_cache_options && !has_new_total_cache_size) {
2092         qdict_del(options, QCOW2_OPT_CACHE_SIZE);
2093     }
2094 }
2095 
2096 static int coroutine_fn GRAPH_RDLOCK
2097 qcow2_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
2098                       int64_t count, int64_t *pnum, int64_t *map,
2099                       BlockDriverState **file)
2100 {
2101     BDRVQcow2State *s = bs->opaque;
2102     uint64_t host_offset;
2103     unsigned int bytes;
2104     QCow2SubclusterType type;
2105     int ret, status = 0;
2106 
2107     qemu_co_mutex_lock(&s->lock);
2108 
2109     if (!s->metadata_preallocation_checked) {
2110         ret = qcow2_detect_metadata_preallocation(bs);
2111         s->metadata_preallocation = (ret == 1);
2112         s->metadata_preallocation_checked = true;
2113     }
2114 
2115     bytes = MIN(INT_MAX, count);
2116     ret = qcow2_get_host_offset(bs, offset, &bytes, &host_offset, &type);
2117     qemu_co_mutex_unlock(&s->lock);
2118     if (ret < 0) {
2119         return ret;
2120     }
2121 
2122     *pnum = bytes;
2123 
2124     if ((type == QCOW2_SUBCLUSTER_NORMAL ||
2125          type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
2126          type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC) && !s->crypto) {
2127         *map = host_offset;
2128         *file = s->data_file->bs;
2129         status |= BDRV_BLOCK_OFFSET_VALID;
2130     }
2131     if (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
2132         type == QCOW2_SUBCLUSTER_ZERO_ALLOC) {
2133         status |= BDRV_BLOCK_ZERO;
2134     } else if (type != QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN &&
2135                type != QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC) {
2136         status |= BDRV_BLOCK_DATA;
2137     }
2138     if (s->metadata_preallocation && (status & BDRV_BLOCK_DATA) &&
2139         (status & BDRV_BLOCK_OFFSET_VALID))
2140     {
2141         status |= BDRV_BLOCK_RECURSE;
2142     }
2143     return status;
2144 }
2145 
2146 static int coroutine_fn GRAPH_RDLOCK
2147 qcow2_handle_l2meta(BlockDriverState *bs, QCowL2Meta **pl2meta, bool link_l2)
2148 {
2149     int ret = 0;
2150     QCowL2Meta *l2meta = *pl2meta;
2151 
2152     while (l2meta != NULL) {
2153         QCowL2Meta *next;
2154 
2155         if (link_l2) {
2156             ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
2157             if (ret) {
2158                 goto out;
2159             }
2160         } else {
2161             qcow2_alloc_cluster_abort(bs, l2meta);
2162         }
2163 
2164         /* Take the request off the list of running requests */
2165         QLIST_REMOVE(l2meta, next_in_flight);
2166 
2167         qemu_co_queue_restart_all(&l2meta->dependent_requests);
2168 
2169         next = l2meta->next;
2170         g_free(l2meta);
2171         l2meta = next;
2172     }
2173 out:
2174     *pl2meta = l2meta;
2175     return ret;
2176 }
2177 
2178 static int coroutine_fn GRAPH_RDLOCK
2179 qcow2_co_preadv_encrypted(BlockDriverState *bs,
2180                            uint64_t host_offset,
2181                            uint64_t offset,
2182                            uint64_t bytes,
2183                            QEMUIOVector *qiov,
2184                            uint64_t qiov_offset)
2185 {
2186     int ret;
2187     BDRVQcow2State *s = bs->opaque;
2188     uint8_t *buf;
2189 
2190     assert(bs->encrypted && s->crypto);
2191     assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
2192 
2193     /*
2194      * For encrypted images, read everything into a temporary
2195      * contiguous buffer on which the AES functions can work.
2196      * Also, decryption in a separate buffer is better as it
2197      * prevents the guest from learning information about the
2198      * encrypted nature of the virtual disk.
2199      */
2200 
2201     buf = qemu_try_blockalign(s->data_file->bs, bytes);
2202     if (buf == NULL) {
2203         return -ENOMEM;
2204     }
2205 
2206     BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
2207     ret = bdrv_co_pread(s->data_file, host_offset, bytes, buf, 0);
2208     if (ret < 0) {
2209         goto fail;
2210     }
2211 
2212     if (qcow2_co_decrypt(bs, host_offset, offset, buf, bytes) < 0)
2213     {
2214         ret = -EIO;
2215         goto fail;
2216     }
2217     qemu_iovec_from_buf(qiov, qiov_offset, buf, bytes);
2218 
2219 fail:
2220     qemu_vfree(buf);
2221 
2222     return ret;
2223 }
2224 
2225 typedef struct Qcow2AioTask {
2226     AioTask task;
2227 
2228     BlockDriverState *bs;
2229     QCow2SubclusterType subcluster_type; /* only for read */
2230     uint64_t host_offset; /* or l2_entry for compressed read */
2231     uint64_t offset;
2232     uint64_t bytes;
2233     QEMUIOVector *qiov;
2234     uint64_t qiov_offset;
2235     QCowL2Meta *l2meta; /* only for write */
2236 } Qcow2AioTask;
2237 
2238 static coroutine_fn int qcow2_co_preadv_task_entry(AioTask *task);
2239 static coroutine_fn int qcow2_add_task(BlockDriverState *bs,
2240                                        AioTaskPool *pool,
2241                                        AioTaskFunc func,
2242                                        QCow2SubclusterType subcluster_type,
2243                                        uint64_t host_offset,
2244                                        uint64_t offset,
2245                                        uint64_t bytes,
2246                                        QEMUIOVector *qiov,
2247                                        size_t qiov_offset,
2248                                        QCowL2Meta *l2meta)
2249 {
2250     Qcow2AioTask local_task;
2251     Qcow2AioTask *task = pool ? g_new(Qcow2AioTask, 1) : &local_task;
2252 
2253     *task = (Qcow2AioTask) {
2254         .task.func = func,
2255         .bs = bs,
2256         .subcluster_type = subcluster_type,
2257         .qiov = qiov,
2258         .host_offset = host_offset,
2259         .offset = offset,
2260         .bytes = bytes,
2261         .qiov_offset = qiov_offset,
2262         .l2meta = l2meta,
2263     };
2264 
2265     trace_qcow2_add_task(qemu_coroutine_self(), bs, pool,
2266                          func == qcow2_co_preadv_task_entry ? "read" : "write",
2267                          subcluster_type, host_offset, offset, bytes,
2268                          qiov, qiov_offset);
2269 
2270     if (!pool) {
2271         return func(&task->task);
2272     }
2273 
2274     aio_task_pool_start_task(pool, &task->task);
2275 
2276     return 0;
2277 }
2278 
2279 static int coroutine_fn GRAPH_RDLOCK
2280 qcow2_co_preadv_task(BlockDriverState *bs, QCow2SubclusterType subc_type,
2281                      uint64_t host_offset, uint64_t offset, uint64_t bytes,
2282                      QEMUIOVector *qiov, size_t qiov_offset)
2283 {
2284     BDRVQcow2State *s = bs->opaque;
2285 
2286     switch (subc_type) {
2287     case QCOW2_SUBCLUSTER_ZERO_PLAIN:
2288     case QCOW2_SUBCLUSTER_ZERO_ALLOC:
2289         /* Both zero types are handled in qcow2_co_preadv_part */
2290         g_assert_not_reached();
2291 
2292     case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
2293     case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
2294         assert(bs->backing); /* otherwise handled in qcow2_co_preadv_part */
2295 
2296         BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
2297         return bdrv_co_preadv_part(bs->backing, offset, bytes,
2298                                    qiov, qiov_offset, 0);
2299 
2300     case QCOW2_SUBCLUSTER_COMPRESSED:
2301         return qcow2_co_preadv_compressed(bs, host_offset,
2302                                           offset, bytes, qiov, qiov_offset);
2303 
2304     case QCOW2_SUBCLUSTER_NORMAL:
2305         if (bs->encrypted) {
2306             return qcow2_co_preadv_encrypted(bs, host_offset,
2307                                              offset, bytes, qiov, qiov_offset);
2308         }
2309 
2310         BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
2311         return bdrv_co_preadv_part(s->data_file, host_offset,
2312                                    bytes, qiov, qiov_offset, 0);
2313 
2314     default:
2315         g_assert_not_reached();
2316     }
2317 
2318     g_assert_not_reached();
2319 }
2320 
2321 /*
2322  * This function can count as GRAPH_RDLOCK because qcow2_co_preadv_part() holds
2323  * the graph lock and keeps it until this coroutine has terminated.
2324  */
2325 static int coroutine_fn GRAPH_RDLOCK qcow2_co_preadv_task_entry(AioTask *task)
2326 {
2327     Qcow2AioTask *t = container_of(task, Qcow2AioTask, task);
2328 
2329     assert(!t->l2meta);
2330 
2331     return qcow2_co_preadv_task(t->bs, t->subcluster_type,
2332                                 t->host_offset, t->offset, t->bytes,
2333                                 t->qiov, t->qiov_offset);
2334 }
2335 
2336 static int coroutine_fn GRAPH_RDLOCK
2337 qcow2_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
2338                      QEMUIOVector *qiov, size_t qiov_offset,
2339                      BdrvRequestFlags flags)
2340 {
2341     BDRVQcow2State *s = bs->opaque;
2342     int ret = 0;
2343     unsigned int cur_bytes; /* number of bytes in current iteration */
2344     uint64_t host_offset = 0;
2345     QCow2SubclusterType type;
2346     AioTaskPool *aio = NULL;
2347 
2348     while (bytes != 0 && aio_task_pool_status(aio) == 0) {
2349         /* prepare next request */
2350         cur_bytes = MIN(bytes, INT_MAX);
2351         if (s->crypto) {
2352             cur_bytes = MIN(cur_bytes,
2353                             QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
2354         }
2355 
2356         qemu_co_mutex_lock(&s->lock);
2357         ret = qcow2_get_host_offset(bs, offset, &cur_bytes,
2358                                     &host_offset, &type);
2359         qemu_co_mutex_unlock(&s->lock);
2360         if (ret < 0) {
2361             goto out;
2362         }
2363 
2364         if (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
2365             type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
2366             (type == QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN && !bs->backing) ||
2367             (type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC && !bs->backing))
2368         {
2369             qemu_iovec_memset(qiov, qiov_offset, 0, cur_bytes);
2370         } else {
2371             if (!aio && cur_bytes != bytes) {
2372                 aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
2373             }
2374             ret = qcow2_add_task(bs, aio, qcow2_co_preadv_task_entry, type,
2375                                  host_offset, offset, cur_bytes,
2376                                  qiov, qiov_offset, NULL);
2377             if (ret < 0) {
2378                 goto out;
2379             }
2380         }
2381 
2382         bytes -= cur_bytes;
2383         offset += cur_bytes;
2384         qiov_offset += cur_bytes;
2385     }
2386 
2387 out:
2388     if (aio) {
2389         aio_task_pool_wait_all(aio);
2390         if (ret == 0) {
2391             ret = aio_task_pool_status(aio);
2392         }
2393         g_free(aio);
2394     }
2395 
2396     return ret;
2397 }
2398 
2399 /* Check if it's possible to merge a write request with the writing of
2400  * the data from the COW regions */
2401 static bool merge_cow(uint64_t offset, unsigned bytes,
2402                       QEMUIOVector *qiov, size_t qiov_offset,
2403                       QCowL2Meta *l2meta)
2404 {
2405     QCowL2Meta *m;
2406 
2407     for (m = l2meta; m != NULL; m = m->next) {
2408         /* If both COW regions are empty then there's nothing to merge */
2409         if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
2410             continue;
2411         }
2412 
2413         /* If COW regions are handled already, skip this too */
2414         if (m->skip_cow) {
2415             continue;
2416         }
2417 
2418         /*
2419          * The write request should start immediately after the first
2420          * COW region. This does not always happen because the area
2421          * touched by the request can be larger than the one defined
2422          * by @m (a single request can span an area consisting of a
2423          * mix of previously unallocated and allocated clusters, that
2424          * is why @l2meta is a list).
2425          */
2426         if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
2427             /* In this case the request starts before this region */
2428             assert(offset < l2meta_cow_start(m));
2429             assert(m->cow_start.nb_bytes == 0);
2430             continue;
2431         }
2432 
2433         /* The write request should end immediately before the second
2434          * COW region (see above for why it does not always happen) */
2435         if (m->offset + m->cow_end.offset != offset + bytes) {
2436             assert(offset + bytes > m->offset + m->cow_end.offset);
2437             assert(m->cow_end.nb_bytes == 0);
2438             continue;
2439         }
2440 
2441         /* Make sure that adding both COW regions to the QEMUIOVector
2442          * does not exceed IOV_MAX */
2443         if (qemu_iovec_subvec_niov(qiov, qiov_offset, bytes) > IOV_MAX - 2) {
2444             continue;
2445         }
2446 
2447         m->data_qiov = qiov;
2448         m->data_qiov_offset = qiov_offset;
2449         return true;
2450     }
2451 
2452     return false;
2453 }
2454 
2455 /*
2456  * Return 1 if the COW regions read as zeroes, 0 if not, < 0 on error.
2457  * Note that returning 0 does not guarantee non-zero data.
2458  */
2459 static int coroutine_fn GRAPH_RDLOCK
2460 is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
2461 {
2462     /*
2463      * This check is designed for optimization shortcut so it must be
2464      * efficient.
2465      * Instead of is_zero(), use bdrv_co_is_zero_fast() as it is
2466      * faster (but not as accurate and can result in false negatives).
2467      */
2468     int ret = bdrv_co_is_zero_fast(bs, m->offset + m->cow_start.offset,
2469                                    m->cow_start.nb_bytes);
2470     if (ret <= 0) {
2471         return ret;
2472     }
2473 
2474     return bdrv_co_is_zero_fast(bs, m->offset + m->cow_end.offset,
2475                                 m->cow_end.nb_bytes);
2476 }
2477 
2478 static int coroutine_fn GRAPH_RDLOCK
2479 handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
2480 {
2481     BDRVQcow2State *s = bs->opaque;
2482     QCowL2Meta *m;
2483 
2484     if (!(s->data_file->bs->supported_zero_flags & BDRV_REQ_NO_FALLBACK)) {
2485         return 0;
2486     }
2487 
2488     if (bs->encrypted) {
2489         return 0;
2490     }
2491 
2492     for (m = l2meta; m != NULL; m = m->next) {
2493         int ret;
2494         uint64_t start_offset = m->alloc_offset + m->cow_start.offset;
2495         unsigned nb_bytes = m->cow_end.offset + m->cow_end.nb_bytes -
2496             m->cow_start.offset;
2497 
2498         if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
2499             continue;
2500         }
2501 
2502         ret = is_zero_cow(bs, m);
2503         if (ret < 0) {
2504             return ret;
2505         } else if (ret == 0) {
2506             continue;
2507         }
2508 
2509         /*
2510          * instead of writing zero COW buffers,
2511          * efficiently zero out the whole clusters
2512          */
2513 
2514         ret = qcow2_pre_write_overlap_check(bs, 0, start_offset, nb_bytes,
2515                                             true);
2516         if (ret < 0) {
2517             return ret;
2518         }
2519 
2520         BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
2521         ret = bdrv_co_pwrite_zeroes(s->data_file, start_offset, nb_bytes,
2522                                     BDRV_REQ_NO_FALLBACK);
2523         if (ret < 0) {
2524             if (ret != -ENOTSUP && ret != -EAGAIN) {
2525                 return ret;
2526             }
2527             continue;
2528         }
2529 
2530         trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters);
2531         m->skip_cow = true;
2532     }
2533     return 0;
2534 }
2535 
2536 /*
2537  * qcow2_co_pwritev_task
2538  * Called with s->lock unlocked
2539  * l2meta  - if not NULL, qcow2_co_pwritev_task() will consume it. Caller must
2540  *           not use it somehow after qcow2_co_pwritev_task() call
2541  */
2542 static coroutine_fn GRAPH_RDLOCK
2543 int qcow2_co_pwritev_task(BlockDriverState *bs, uint64_t host_offset,
2544                           uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
2545                           uint64_t qiov_offset, QCowL2Meta *l2meta)
2546 {
2547     int ret;
2548     BDRVQcow2State *s = bs->opaque;
2549     void *crypt_buf = NULL;
2550     QEMUIOVector encrypted_qiov;
2551 
2552     if (bs->encrypted) {
2553         assert(s->crypto);
2554         assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
2555         crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
2556         if (crypt_buf == NULL) {
2557             ret = -ENOMEM;
2558             goto out_unlocked;
2559         }
2560         qemu_iovec_to_buf(qiov, qiov_offset, crypt_buf, bytes);
2561 
2562         if (qcow2_co_encrypt(bs, host_offset, offset, crypt_buf, bytes) < 0) {
2563             ret = -EIO;
2564             goto out_unlocked;
2565         }
2566 
2567         qemu_iovec_init_buf(&encrypted_qiov, crypt_buf, bytes);
2568         qiov = &encrypted_qiov;
2569         qiov_offset = 0;
2570     }
2571 
2572     /* Try to efficiently initialize the physical space with zeroes */
2573     ret = handle_alloc_space(bs, l2meta);
2574     if (ret < 0) {
2575         goto out_unlocked;
2576     }
2577 
2578     /*
2579      * If we need to do COW, check if it's possible to merge the
2580      * writing of the guest data together with that of the COW regions.
2581      * If it's not possible (or not necessary) then write the
2582      * guest data now.
2583      */
2584     if (!merge_cow(offset, bytes, qiov, qiov_offset, l2meta)) {
2585         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
2586         trace_qcow2_writev_data(qemu_coroutine_self(), host_offset);
2587         ret = bdrv_co_pwritev_part(s->data_file, host_offset,
2588                                    bytes, qiov, qiov_offset, 0);
2589         if (ret < 0) {
2590             goto out_unlocked;
2591         }
2592     }
2593 
2594     qemu_co_mutex_lock(&s->lock);
2595 
2596     ret = qcow2_handle_l2meta(bs, &l2meta, true);
2597     goto out_locked;
2598 
2599 out_unlocked:
2600     qemu_co_mutex_lock(&s->lock);
2601 
2602 out_locked:
2603     qcow2_handle_l2meta(bs, &l2meta, false);
2604     qemu_co_mutex_unlock(&s->lock);
2605 
2606     qemu_vfree(crypt_buf);
2607 
2608     return ret;
2609 }
2610 
2611 /*
2612  * This function can count as GRAPH_RDLOCK because qcow2_co_pwritev_part() holds
2613  * the graph lock and keeps it until this coroutine has terminated.
2614  */
2615 static coroutine_fn GRAPH_RDLOCK int qcow2_co_pwritev_task_entry(AioTask *task)
2616 {
2617     Qcow2AioTask *t = container_of(task, Qcow2AioTask, task);
2618 
2619     assert(!t->subcluster_type);
2620 
2621     return qcow2_co_pwritev_task(t->bs, t->host_offset,
2622                                  t->offset, t->bytes, t->qiov, t->qiov_offset,
2623                                  t->l2meta);
2624 }
2625 
2626 static int coroutine_fn GRAPH_RDLOCK
2627 qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
2628                       QEMUIOVector *qiov, size_t qiov_offset,
2629                       BdrvRequestFlags flags)
2630 {
2631     BDRVQcow2State *s = bs->opaque;
2632     int offset_in_cluster;
2633     int ret;
2634     unsigned int cur_bytes; /* number of sectors in current iteration */
2635     uint64_t host_offset;
2636     QCowL2Meta *l2meta = NULL;
2637     AioTaskPool *aio = NULL;
2638 
2639     trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);
2640 
2641     while (bytes != 0 && aio_task_pool_status(aio) == 0) {
2642 
2643         l2meta = NULL;
2644 
2645         trace_qcow2_writev_start_part(qemu_coroutine_self());
2646         offset_in_cluster = offset_into_cluster(s, offset);
2647         cur_bytes = MIN(bytes, INT_MAX);
2648         if (bs->encrypted) {
2649             cur_bytes = MIN(cur_bytes,
2650                             QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
2651                             - offset_in_cluster);
2652         }
2653 
2654         qemu_co_mutex_lock(&s->lock);
2655 
2656         ret = qcow2_alloc_host_offset(bs, offset, &cur_bytes,
2657                                       &host_offset, &l2meta);
2658         if (ret < 0) {
2659             goto out_locked;
2660         }
2661 
2662         ret = qcow2_pre_write_overlap_check(bs, 0, host_offset,
2663                                             cur_bytes, true);
2664         if (ret < 0) {
2665             goto out_locked;
2666         }
2667 
2668         qemu_co_mutex_unlock(&s->lock);
2669 
2670         if (!aio && cur_bytes != bytes) {
2671             aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
2672         }
2673         ret = qcow2_add_task(bs, aio, qcow2_co_pwritev_task_entry, 0,
2674                              host_offset, offset,
2675                              cur_bytes, qiov, qiov_offset, l2meta);
2676         l2meta = NULL; /* l2meta is consumed by qcow2_co_pwritev_task() */
2677         if (ret < 0) {
2678             goto fail_nometa;
2679         }
2680 
2681         bytes -= cur_bytes;
2682         offset += cur_bytes;
2683         qiov_offset += cur_bytes;
2684         trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
2685     }
2686     ret = 0;
2687 
2688     qemu_co_mutex_lock(&s->lock);
2689 
2690 out_locked:
2691     qcow2_handle_l2meta(bs, &l2meta, false);
2692 
2693     qemu_co_mutex_unlock(&s->lock);
2694 
2695 fail_nometa:
2696     if (aio) {
2697         aio_task_pool_wait_all(aio);
2698         if (ret == 0) {
2699             ret = aio_task_pool_status(aio);
2700         }
2701         g_free(aio);
2702     }
2703 
2704     trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
2705 
2706     return ret;
2707 }
2708 
2709 static int qcow2_inactivate(BlockDriverState *bs)
2710 {
2711     BDRVQcow2State *s = bs->opaque;
2712     int ret, result = 0;
2713     Error *local_err = NULL;
2714 
2715     qcow2_store_persistent_dirty_bitmaps(bs, true, &local_err);
2716     if (local_err != NULL) {
2717         result = -EINVAL;
2718         error_reportf_err(local_err, "Lost persistent bitmaps during "
2719                           "inactivation of node '%s': ",
2720                           bdrv_get_device_or_node_name(bs));
2721     }
2722 
2723     ret = qcow2_cache_flush(bs, s->l2_table_cache);
2724     if (ret) {
2725         result = ret;
2726         error_report("Failed to flush the L2 table cache: %s",
2727                      strerror(-ret));
2728     }
2729 
2730     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
2731     if (ret) {
2732         result = ret;
2733         error_report("Failed to flush the refcount block cache: %s",
2734                      strerror(-ret));
2735     }
2736 
2737     if (result == 0) {
2738         qcow2_mark_clean(bs);
2739     }
2740 
2741     return result;
2742 }
2743 
2744 static void qcow2_do_close(BlockDriverState *bs, bool close_data_file)
2745 {
2746     BDRVQcow2State *s = bs->opaque;
2747     qemu_vfree(s->l1_table);
2748     /* else pre-write overlap checks in cache_destroy may crash */
2749     s->l1_table = NULL;
2750 
2751     if (!(s->flags & BDRV_O_INACTIVE)) {
2752         qcow2_inactivate(bs);
2753     }
2754 
2755     cache_clean_timer_del(bs);
2756     qcow2_cache_destroy(s->l2_table_cache);
2757     qcow2_cache_destroy(s->refcount_block_cache);
2758 
2759     qcrypto_block_free(s->crypto);
2760     s->crypto = NULL;
2761     qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
2762 
2763     g_free(s->unknown_header_fields);
2764     cleanup_unknown_header_ext(bs);
2765 
2766     g_free(s->image_data_file);
2767     g_free(s->image_backing_file);
2768     g_free(s->image_backing_format);
2769 
2770     if (close_data_file && has_data_file(bs)) {
2771         bdrv_unref_child(bs, s->data_file);
2772         s->data_file = NULL;
2773     }
2774 
2775     qcow2_refcount_close(bs);
2776     qcow2_free_snapshots(bs);
2777 }
2778 
2779 static void qcow2_close(BlockDriverState *bs)
2780 {
2781     qcow2_do_close(bs, true);
2782 }
2783 
2784 static void coroutine_fn GRAPH_RDLOCK
2785 qcow2_co_invalidate_cache(BlockDriverState *bs, Error **errp)
2786 {
2787     ERRP_GUARD();
2788     BDRVQcow2State *s = bs->opaque;
2789     BdrvChild *data_file;
2790     int flags = s->flags;
2791     QCryptoBlock *crypto = NULL;
2792     QDict *options;
2793     int ret;
2794 
2795     /*
2796      * Backing files are read-only which makes all of their metadata immutable,
2797      * that means we don't have to worry about reopening them here.
2798      */
2799 
2800     crypto = s->crypto;
2801     s->crypto = NULL;
2802 
2803     /*
2804      * Do not reopen s->data_file (i.e., have qcow2_do_close() not close it,
2805      * and then prevent qcow2_do_open() from opening it), because this function
2806      * runs in the I/O path and as such we must not invoke global-state
2807      * functions like bdrv_unref_child() and bdrv_open_child().
2808      */
2809 
2810     qcow2_do_close(bs, false);
2811 
2812     data_file = s->data_file;
2813     memset(s, 0, sizeof(BDRVQcow2State));
2814     s->data_file = data_file;
2815 
2816     options = qdict_clone_shallow(bs->options);
2817 
2818     flags &= ~BDRV_O_INACTIVE;
2819     qemu_co_mutex_lock(&s->lock);
2820     ret = qcow2_do_open(bs, options, flags, false, errp);
2821     qemu_co_mutex_unlock(&s->lock);
2822     qobject_unref(options);
2823     if (ret < 0) {
2824         error_prepend(errp, "Could not reopen qcow2 layer: ");
2825         bs->drv = NULL;
2826         return;
2827     }
2828 
2829     s->crypto = crypto;
2830 }
2831 
2832 static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
2833     size_t len, size_t buflen)
2834 {
2835     QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
2836     size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
2837 
2838     if (buflen < ext_len) {
2839         return -ENOSPC;
2840     }
2841 
2842     *ext_backing_fmt = (QCowExtension) {
2843         .magic  = cpu_to_be32(magic),
2844         .len    = cpu_to_be32(len),
2845     };
2846 
2847     if (len) {
2848         memcpy(buf + sizeof(QCowExtension), s, len);
2849     }
2850 
2851     return ext_len;
2852 }
2853 
2854 /*
2855  * Updates the qcow2 header, including the variable length parts of it, i.e.
2856  * the backing file name and all extensions. qcow2 was not designed to allow
2857  * such changes, so if we run out of space (we can only use the first cluster)
2858  * this function may fail.
2859  *
2860  * Returns 0 on success, -errno in error cases.
2861  */
2862 int qcow2_update_header(BlockDriverState *bs)
2863 {
2864     BDRVQcow2State *s = bs->opaque;
2865     QCowHeader *header;
2866     char *buf;
2867     size_t buflen = s->cluster_size;
2868     int ret;
2869     uint64_t total_size;
2870     uint32_t refcount_table_clusters;
2871     size_t header_length;
2872     Qcow2UnknownHeaderExtension *uext;
2873 
2874     buf = qemu_blockalign(bs, buflen);
2875 
2876     /* Header structure */
2877     header = (QCowHeader*) buf;
2878 
2879     if (buflen < sizeof(*header)) {
2880         ret = -ENOSPC;
2881         goto fail;
2882     }
2883 
2884     header_length = sizeof(*header) + s->unknown_header_fields_size;
2885     total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
2886     refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
2887 
2888     ret = validate_compression_type(s, NULL);
2889     if (ret) {
2890         goto fail;
2891     }
2892 
2893     *header = (QCowHeader) {
2894         /* Version 2 fields */
2895         .magic                  = cpu_to_be32(QCOW_MAGIC),
2896         .version                = cpu_to_be32(s->qcow_version),
2897         .backing_file_offset    = 0,
2898         .backing_file_size      = 0,
2899         .cluster_bits           = cpu_to_be32(s->cluster_bits),
2900         .size                   = cpu_to_be64(total_size),
2901         .crypt_method           = cpu_to_be32(s->crypt_method_header),
2902         .l1_size                = cpu_to_be32(s->l1_size),
2903         .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
2904         .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
2905         .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
2906         .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
2907         .snapshots_offset       = cpu_to_be64(s->snapshots_offset),
2908 
2909         /* Version 3 fields */
2910         .incompatible_features  = cpu_to_be64(s->incompatible_features),
2911         .compatible_features    = cpu_to_be64(s->compatible_features),
2912         .autoclear_features     = cpu_to_be64(s->autoclear_features),
2913         .refcount_order         = cpu_to_be32(s->refcount_order),
2914         .header_length          = cpu_to_be32(header_length),
2915         .compression_type       = s->compression_type,
2916     };
2917 
2918     /* For older versions, write a shorter header */
2919     switch (s->qcow_version) {
2920     case 2:
2921         ret = offsetof(QCowHeader, incompatible_features);
2922         break;
2923     case 3:
2924         ret = sizeof(*header);
2925         break;
2926     default:
2927         ret = -EINVAL;
2928         goto fail;
2929     }
2930 
2931     buf += ret;
2932     buflen -= ret;
2933     memset(buf, 0, buflen);
2934 
2935     /* Preserve any unknown field in the header */
2936     if (s->unknown_header_fields_size) {
2937         if (buflen < s->unknown_header_fields_size) {
2938             ret = -ENOSPC;
2939             goto fail;
2940         }
2941 
2942         memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
2943         buf += s->unknown_header_fields_size;
2944         buflen -= s->unknown_header_fields_size;
2945     }
2946 
2947     /* Backing file format header extension */
2948     if (s->image_backing_format) {
2949         ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
2950                              s->image_backing_format,
2951                              strlen(s->image_backing_format),
2952                              buflen);
2953         if (ret < 0) {
2954             goto fail;
2955         }
2956 
2957         buf += ret;
2958         buflen -= ret;
2959     }
2960 
2961     /* External data file header extension */
2962     if (has_data_file(bs) && s->image_data_file) {
2963         ret = header_ext_add(buf, QCOW2_EXT_MAGIC_DATA_FILE,
2964                              s->image_data_file, strlen(s->image_data_file),
2965                              buflen);
2966         if (ret < 0) {
2967             goto fail;
2968         }
2969 
2970         buf += ret;
2971         buflen -= ret;
2972     }
2973 
2974     /* Full disk encryption header pointer extension */
2975     if (s->crypto_header.offset != 0) {
2976         s->crypto_header.offset = cpu_to_be64(s->crypto_header.offset);
2977         s->crypto_header.length = cpu_to_be64(s->crypto_header.length);
2978         ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CRYPTO_HEADER,
2979                              &s->crypto_header, sizeof(s->crypto_header),
2980                              buflen);
2981         s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
2982         s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
2983         if (ret < 0) {
2984             goto fail;
2985         }
2986         buf += ret;
2987         buflen -= ret;
2988     }
2989 
2990     /*
2991      * Feature table.  A mere 8 feature names occupies 392 bytes, and
2992      * when coupled with the v3 minimum header of 104 bytes plus the
2993      * 8-byte end-of-extension marker, that would leave only 8 bytes
2994      * for a backing file name in an image with 512-byte clusters.
2995      * Thus, we choose to omit this header for cluster sizes 4k and
2996      * smaller.
2997      */
2998     if (s->qcow_version >= 3 && s->cluster_size > 4096) {
2999         static const Qcow2Feature features[] = {
3000             {
3001                 .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
3002                 .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
3003                 .name = "dirty bit",
3004             },
3005             {
3006                 .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
3007                 .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
3008                 .name = "corrupt bit",
3009             },
3010             {
3011                 .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
3012                 .bit  = QCOW2_INCOMPAT_DATA_FILE_BITNR,
3013                 .name = "external data file",
3014             },
3015             {
3016                 .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
3017                 .bit  = QCOW2_INCOMPAT_COMPRESSION_BITNR,
3018                 .name = "compression type",
3019             },
3020             {
3021                 .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
3022                 .bit  = QCOW2_INCOMPAT_EXTL2_BITNR,
3023                 .name = "extended L2 entries",
3024             },
3025             {
3026                 .type = QCOW2_FEAT_TYPE_COMPATIBLE,
3027                 .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
3028                 .name = "lazy refcounts",
3029             },
3030             {
3031                 .type = QCOW2_FEAT_TYPE_AUTOCLEAR,
3032                 .bit  = QCOW2_AUTOCLEAR_BITMAPS_BITNR,
3033                 .name = "bitmaps",
3034             },
3035             {
3036                 .type = QCOW2_FEAT_TYPE_AUTOCLEAR,
3037                 .bit  = QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR,
3038                 .name = "raw external data",
3039             },
3040         };
3041 
3042         ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
3043                              features, sizeof(features), buflen);
3044         if (ret < 0) {
3045             goto fail;
3046         }
3047         buf += ret;
3048         buflen -= ret;
3049     }
3050 
3051     /* Bitmap extension */
3052     if (s->nb_bitmaps > 0) {
3053         Qcow2BitmapHeaderExt bitmaps_header = {
3054             .nb_bitmaps = cpu_to_be32(s->nb_bitmaps),
3055             .bitmap_directory_size =
3056                     cpu_to_be64(s->bitmap_directory_size),
3057             .bitmap_directory_offset =
3058                     cpu_to_be64(s->bitmap_directory_offset)
3059         };
3060         ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BITMAPS,
3061                              &bitmaps_header, sizeof(bitmaps_header),
3062                              buflen);
3063         if (ret < 0) {
3064             goto fail;
3065         }
3066         buf += ret;
3067         buflen -= ret;
3068     }
3069 
3070     /* Keep unknown header extensions */
3071     QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
3072         ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
3073         if (ret < 0) {
3074             goto fail;
3075         }
3076 
3077         buf += ret;
3078         buflen -= ret;
3079     }
3080 
3081     /* End of header extensions */
3082     ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
3083     if (ret < 0) {
3084         goto fail;
3085     }
3086 
3087     buf += ret;
3088     buflen -= ret;
3089 
3090     /* Backing file name */
3091     if (s->image_backing_file) {
3092         size_t backing_file_len = strlen(s->image_backing_file);
3093 
3094         if (buflen < backing_file_len) {
3095             ret = -ENOSPC;
3096             goto fail;
3097         }
3098 
3099         /* Using strncpy is ok here, since buf is not NUL-terminated. */
3100         strncpy(buf, s->image_backing_file, buflen);
3101 
3102         header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
3103         header->backing_file_size   = cpu_to_be32(backing_file_len);
3104     }
3105 
3106     /* Write the new header */
3107     ret = bdrv_pwrite(bs->file, 0, s->cluster_size, header, 0);
3108     if (ret < 0) {
3109         goto fail;
3110     }
3111 
3112     ret = 0;
3113 fail:
3114     qemu_vfree(header);
3115     return ret;
3116 }
3117 
3118 static int qcow2_change_backing_file(BlockDriverState *bs,
3119     const char *backing_file, const char *backing_fmt)
3120 {
3121     BDRVQcow2State *s = bs->opaque;
3122 
3123     /* Adding a backing file means that the external data file alone won't be
3124      * enough to make sense of the content */
3125     if (backing_file && data_file_is_raw(bs)) {
3126         return -EINVAL;
3127     }
3128 
3129     if (backing_file && strlen(backing_file) > 1023) {
3130         return -EINVAL;
3131     }
3132 
3133     pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
3134             backing_file ?: "");
3135     pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
3136     pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
3137 
3138     g_free(s->image_backing_file);
3139     g_free(s->image_backing_format);
3140 
3141     s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL;
3142     s->image_backing_format = backing_fmt ? g_strdup(bs->backing_format) : NULL;
3143 
3144     return qcow2_update_header(bs);
3145 }
3146 
3147 static int coroutine_fn GRAPH_RDLOCK
3148 qcow2_set_up_encryption(BlockDriverState *bs,
3149                         QCryptoBlockCreateOptions *cryptoopts,
3150                         Error **errp)
3151 {
3152     BDRVQcow2State *s = bs->opaque;
3153     QCryptoBlock *crypto = NULL;
3154     int fmt, ret;
3155 
3156     switch (cryptoopts->format) {
3157     case Q_CRYPTO_BLOCK_FORMAT_LUKS:
3158         fmt = QCOW_CRYPT_LUKS;
3159         break;
3160     case Q_CRYPTO_BLOCK_FORMAT_QCOW:
3161         fmt = QCOW_CRYPT_AES;
3162         break;
3163     default:
3164         error_setg(errp, "Crypto format not supported in qcow2");
3165         return -EINVAL;
3166     }
3167 
3168     s->crypt_method_header = fmt;
3169 
3170     crypto = qcrypto_block_create(cryptoopts, "encrypt.",
3171                                   qcow2_crypto_hdr_init_func,
3172                                   qcow2_crypto_hdr_write_func,
3173                                   bs, errp);
3174     if (!crypto) {
3175         return -EINVAL;
3176     }
3177 
3178     ret = qcow2_update_header(bs);
3179     if (ret < 0) {
3180         error_setg_errno(errp, -ret, "Could not write encryption header");
3181         goto out;
3182     }
3183 
3184     ret = 0;
3185  out:
3186     qcrypto_block_free(crypto);
3187     return ret;
3188 }
3189 
3190 /**
3191  * Preallocates metadata structures for data clusters between @offset (in the
3192  * guest disk) and @new_length (which is thus generally the new guest disk
3193  * size).
3194  *
3195  * Returns: 0 on success, -errno on failure.
3196  */
3197 static int coroutine_fn GRAPH_RDLOCK
3198 preallocate_co(BlockDriverState *bs, uint64_t offset, uint64_t new_length,
3199                PreallocMode mode, Error **errp)
3200 {
3201     BDRVQcow2State *s = bs->opaque;
3202     uint64_t bytes;
3203     uint64_t host_offset = 0;
3204     int64_t file_length;
3205     unsigned int cur_bytes;
3206     int ret;
3207     QCowL2Meta *meta = NULL, *m;
3208 
3209     assert(offset <= new_length);
3210     bytes = new_length - offset;
3211 
3212     while (bytes) {
3213         cur_bytes = MIN(bytes, QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size));
3214         ret = qcow2_alloc_host_offset(bs, offset, &cur_bytes,
3215                                       &host_offset, &meta);
3216         if (ret < 0) {
3217             error_setg_errno(errp, -ret, "Allocating clusters failed");
3218             goto out;
3219         }
3220 
3221         for (m = meta; m != NULL; m = m->next) {
3222             m->prealloc = true;
3223         }
3224 
3225         ret = qcow2_handle_l2meta(bs, &meta, true);
3226         if (ret < 0) {
3227             error_setg_errno(errp, -ret, "Mapping clusters failed");
3228             goto out;
3229         }
3230 
3231         /* TODO Preallocate data if requested */
3232 
3233         bytes -= cur_bytes;
3234         offset += cur_bytes;
3235     }
3236 
3237     /*
3238      * It is expected that the image file is large enough to actually contain
3239      * all of the allocated clusters (otherwise we get failing reads after
3240      * EOF). Extend the image to the last allocated sector.
3241      */
3242     file_length = bdrv_co_getlength(s->data_file->bs);
3243     if (file_length < 0) {
3244         error_setg_errno(errp, -file_length, "Could not get file size");
3245         ret = file_length;
3246         goto out;
3247     }
3248 
3249     if (host_offset + cur_bytes > file_length) {
3250         if (mode == PREALLOC_MODE_METADATA) {
3251             mode = PREALLOC_MODE_OFF;
3252         }
3253         ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false,
3254                                mode, 0, errp);
3255         if (ret < 0) {
3256             goto out;
3257         }
3258     }
3259 
3260     ret = 0;
3261 
3262 out:
3263     qcow2_handle_l2meta(bs, &meta, false);
3264     return ret;
3265 }
3266 
3267 /* qcow2_refcount_metadata_size:
3268  * @clusters: number of clusters to refcount (including data and L1/L2 tables)
3269  * @cluster_size: size of a cluster, in bytes
3270  * @refcount_order: refcount bits power-of-2 exponent
3271  * @generous_increase: allow for the refcount table to be 1.5x as large as it
3272  *                     needs to be
3273  *
3274  * Returns: Number of bytes required for refcount blocks and table metadata.
3275  */
3276 int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
3277                                      int refcount_order, bool generous_increase,
3278                                      uint64_t *refblock_count)
3279 {
3280     /*
3281      * Every host cluster is reference-counted, including metadata (even
3282      * refcount metadata is recursively included).
3283      *
3284      * An accurate formula for the size of refcount metadata size is difficult
3285      * to derive.  An easier method of calculation is finding the fixed point
3286      * where no further refcount blocks or table clusters are required to
3287      * reference count every cluster.
3288      */
3289     int64_t blocks_per_table_cluster = cluster_size / REFTABLE_ENTRY_SIZE;
3290     int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order);
3291     int64_t table = 0;  /* number of refcount table clusters */
3292     int64_t blocks = 0; /* number of refcount block clusters */
3293     int64_t last;
3294     int64_t n = 0;
3295 
3296     do {
3297         last = n;
3298         blocks = DIV_ROUND_UP(clusters + table + blocks, refcounts_per_block);
3299         table = DIV_ROUND_UP(blocks, blocks_per_table_cluster);
3300         n = clusters + blocks + table;
3301 
3302         if (n == last && generous_increase) {
3303             clusters += DIV_ROUND_UP(table, 2);
3304             n = 0; /* force another loop */
3305             generous_increase = false;
3306         }
3307     } while (n != last);
3308 
3309     if (refblock_count) {
3310         *refblock_count = blocks;
3311     }
3312 
3313     return (blocks + table) * cluster_size;
3314 }
3315 
3316 /**
3317  * qcow2_calc_prealloc_size:
3318  * @total_size: virtual disk size in bytes
3319  * @cluster_size: cluster size in bytes
3320  * @refcount_order: refcount bits power-of-2 exponent
3321  * @extended_l2: true if the image has extended L2 entries
3322  *
3323  * Returns: Total number of bytes required for the fully allocated image
3324  * (including metadata).
3325  */
3326 static int64_t qcow2_calc_prealloc_size(int64_t total_size,
3327                                         size_t cluster_size,
3328                                         int refcount_order,
3329                                         bool extended_l2)
3330 {
3331     int64_t meta_size = 0;
3332     uint64_t nl1e, nl2e;
3333     int64_t aligned_total_size = ROUND_UP(total_size, cluster_size);
3334     size_t l2e_size = extended_l2 ? L2E_SIZE_EXTENDED : L2E_SIZE_NORMAL;
3335 
3336     /* header: 1 cluster */
3337     meta_size += cluster_size;
3338 
3339     /* total size of L2 tables */
3340     nl2e = aligned_total_size / cluster_size;
3341     nl2e = ROUND_UP(nl2e, cluster_size / l2e_size);
3342     meta_size += nl2e * l2e_size;
3343 
3344     /* total size of L1 tables */
3345     nl1e = nl2e * l2e_size / cluster_size;
3346     nl1e = ROUND_UP(nl1e, cluster_size / L1E_SIZE);
3347     meta_size += nl1e * L1E_SIZE;
3348 
3349     /* total size of refcount table and blocks */
3350     meta_size += qcow2_refcount_metadata_size(
3351             (meta_size + aligned_total_size) / cluster_size,
3352             cluster_size, refcount_order, false, NULL);
3353 
3354     return meta_size + aligned_total_size;
3355 }
3356 
3357 static bool validate_cluster_size(size_t cluster_size, bool extended_l2,
3358                                   Error **errp)
3359 {
3360     int cluster_bits = ctz32(cluster_size);
3361     if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
3362         (1 << cluster_bits) != cluster_size)
3363     {
3364         error_setg(errp, "Cluster size must be a power of two between %d and "
3365                    "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
3366         return false;
3367     }
3368 
3369     if (extended_l2) {
3370         unsigned min_cluster_size =
3371             (1 << MIN_CLUSTER_BITS) * QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER;
3372         if (cluster_size < min_cluster_size) {
3373             error_setg(errp, "Extended L2 entries are only supported with "
3374                        "cluster sizes of at least %u bytes", min_cluster_size);
3375             return false;
3376         }
3377     }
3378 
3379     return true;
3380 }
3381 
3382 static size_t qcow2_opt_get_cluster_size_del(QemuOpts *opts, bool extended_l2,
3383                                              Error **errp)
3384 {
3385     size_t cluster_size;
3386 
3387     cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
3388                                          DEFAULT_CLUSTER_SIZE);
3389     if (!validate_cluster_size(cluster_size, extended_l2, errp)) {
3390         return 0;
3391     }
3392     return cluster_size;
3393 }
3394 
3395 static int qcow2_opt_get_version_del(QemuOpts *opts, Error **errp)
3396 {
3397     char *buf;
3398     int ret;
3399 
3400     buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL);
3401     if (!buf) {
3402         ret = 3; /* default */
3403     } else if (!strcmp(buf, "0.10")) {
3404         ret = 2;
3405     } else if (!strcmp(buf, "1.1")) {
3406         ret = 3;
3407     } else {
3408         error_setg(errp, "Invalid compatibility level: '%s'", buf);
3409         ret = -EINVAL;
3410     }
3411     g_free(buf);
3412     return ret;
3413 }
3414 
3415 static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version,
3416                                                 Error **errp)
3417 {
3418     uint64_t refcount_bits;
3419 
3420     refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16);
3421     if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) {
3422         error_setg(errp, "Refcount width must be a power of two and may not "
3423                    "exceed 64 bits");
3424         return 0;
3425     }
3426 
3427     if (version < 3 && refcount_bits != 16) {
3428         error_setg(errp, "Different refcount widths than 16 bits require "
3429                    "compatibility level 1.1 or above (use compat=1.1 or "
3430                    "greater)");
3431         return 0;
3432     }
3433 
3434     return refcount_bits;
3435 }
3436 
/*
 * Create a qcow2 image on the node referenced by
 * create_options->u.qcow2.file.
 *
 * The function proceeds in stages:
 *  1. validate the creation options and fill in defaults for the ones
 *     that were not given;
 *  2. write a minimal header cluster and a refcount table with a single
 *     refcount block directly to the file node;
 *  3. reopen the file through the qcow2 driver, allocate the three
 *     clusters written in step 2, and write a full header;
 *  4. truncate the image to the requested size (with the requested
 *     preallocation mode), then set the backing file and encryption if
 *     they were requested;
 *  5. reopen the image once more without BDRV_O_NO_FLUSH.
 *
 * Returns 0 on success and a negative errno value on failure.  All
 * references taken here (blk, bs, data_bs) are dropped at the 'out' label.
 */
static int coroutine_fn GRAPH_UNLOCKED
qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
{
    BlockdevCreateOptionsQcow2 *qcow2_opts;
    QDict *options;

    /*
     * Open the image file and write a minimal qcow2 header.
     *
     * We keep things simple and start with a zero-sized image. We also
     * do without refcount blocks or a L1 table for now. We'll fix the
     * inconsistency later.
     *
     * We do need a refcount table because growing the refcount table means
     * allocating two new refcount blocks - the second of which would be at
     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
     * size for any qcow2 image.
     */
    BlockBackend *blk = NULL;
    BlockDriverState *bs = NULL;
    BlockDriverState *data_bs = NULL;
    QCowHeader *header;
    size_t cluster_size;
    int version;
    int refcount_order;
    uint64_t *refcount_table;
    int ret;
    uint8_t compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;

    assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
    qcow2_opts = &create_options->u.qcow2;

    bs = bdrv_co_open_blockdev_ref(qcow2_opts->file, errp);
    if (bs == NULL) {
        return -EIO;
    }

    /* Validate options and set default values */
    if (!QEMU_IS_ALIGNED(qcow2_opts->size, BDRV_SECTOR_SIZE)) {
        error_setg(errp, "Image size must be a multiple of %u bytes",
                   (unsigned) BDRV_SECTOR_SIZE);
        ret = -EINVAL;
        goto out;
    }

    /* Map the QAPI version enum to the numeric on-disk version (default 3) */
    if (qcow2_opts->has_version) {
        switch (qcow2_opts->version) {
        case BLOCKDEV_QCOW2_VERSION_V2:
            version = 2;
            break;
        case BLOCKDEV_QCOW2_VERSION_V3:
            version = 3;
            break;
        default:
            g_assert_not_reached();
        }
    } else {
        version = 3;
    }

    if (qcow2_opts->has_cluster_size) {
        cluster_size = qcow2_opts->cluster_size;
    } else {
        cluster_size = DEFAULT_CLUSTER_SIZE;
    }

    if (!qcow2_opts->has_extended_l2) {
        qcow2_opts->extended_l2 = false;
    }
    if (qcow2_opts->extended_l2) {
        if (version < 3) {
            error_setg(errp, "Extended L2 entries are only supported with "
                       "compatibility level 1.1 and above (use version=v3 or "
                       "greater)");
            ret = -EINVAL;
            goto out;
        }
    }

    if (!validate_cluster_size(cluster_size, qcow2_opts->extended_l2, errp)) {
        ret = -EINVAL;
        goto out;
    }

    if (!qcow2_opts->has_preallocation) {
        qcow2_opts->preallocation = PREALLOC_MODE_OFF;
    }
    if (qcow2_opts->backing_file &&
        qcow2_opts->preallocation != PREALLOC_MODE_OFF &&
        !qcow2_opts->extended_l2)
    {
        error_setg(errp, "Backing file and preallocation can only be used at "
                   "the same time if extended_l2 is on");
        ret = -EINVAL;
        goto out;
    }
    if (qcow2_opts->has_backing_fmt && !qcow2_opts->backing_file) {
        error_setg(errp, "Backing format cannot be used without backing file");
        ret = -EINVAL;
        goto out;
    }

    if (!qcow2_opts->has_lazy_refcounts) {
        qcow2_opts->lazy_refcounts = false;
    }
    if (version < 3 && qcow2_opts->lazy_refcounts) {
        error_setg(errp, "Lazy refcounts only supported with compatibility "
                   "level 1.1 and above (use version=v3 or greater)");
        ret = -EINVAL;
        goto out;
    }

    if (!qcow2_opts->has_refcount_bits) {
        qcow2_opts->refcount_bits = 16;
    }
    if (qcow2_opts->refcount_bits > 64 ||
        !is_power_of_2(qcow2_opts->refcount_bits))
    {
        error_setg(errp, "Refcount width must be a power of two and may not "
                   "exceed 64 bits");
        ret = -EINVAL;
        goto out;
    }
    if (version < 3 && qcow2_opts->refcount_bits != 16) {
        error_setg(errp, "Different refcount widths than 16 bits require "
                   "compatibility level 1.1 or above (use version=v3 or "
                   "greater)");
        ret = -EINVAL;
        goto out;
    }
    /* The header stores the refcount width as a power-of-two exponent */
    refcount_order = ctz32(qcow2_opts->refcount_bits);

    if (qcow2_opts->data_file_raw && !qcow2_opts->data_file) {
        error_setg(errp, "data-file-raw requires data-file");
        ret = -EINVAL;
        goto out;
    }
    if (qcow2_opts->data_file_raw && qcow2_opts->backing_file) {
        error_setg(errp, "Backing file and data-file-raw cannot be used at "
                   "the same time");
        ret = -EINVAL;
        goto out;
    }
    if (qcow2_opts->data_file_raw &&
        qcow2_opts->preallocation == PREALLOC_MODE_OFF)
    {
        /*
         * data-file-raw means that "the external data file can be
         * read as a consistent standalone raw image without looking
         * at the qcow2 metadata."  It does not say that the metadata
         * must be ignored, though (and the qcow2 driver in fact does
         * not ignore it), so the L1/L2 tables must be present and
         * give a 1:1 mapping, so you get the same result regardless
         * of whether you look at the metadata or whether you ignore
         * it.
         */
        qcow2_opts->preallocation = PREALLOC_MODE_METADATA;

        /*
         * Cannot use preallocation with backing files, but giving a
         * backing file when specifying data_file_raw is an error
         * anyway.
         */
        assert(!qcow2_opts->backing_file);
    }

    if (qcow2_opts->data_file) {
        if (version < 3) {
            error_setg(errp, "External data files are only supported with "
                       "compatibility level 1.1 and above (use version=v3 or "
                       "greater)");
            ret = -EINVAL;
            goto out;
        }
        data_bs = bdrv_co_open_blockdev_ref(qcow2_opts->data_file, errp);
        if (data_bs == NULL) {
            ret = -EIO;
            goto out;
        }
    }

    if (qcow2_opts->has_compression_type &&
        qcow2_opts->compression_type != QCOW2_COMPRESSION_TYPE_ZLIB) {

        /* Both failure paths below report -EINVAL */
        ret = -EINVAL;

        if (version < 3) {
            error_setg(errp, "Non-zlib compression type is only supported with "
                       "compatibility level 1.1 and above (use version=v3 or "
                       "greater)");
            goto out;
        }

        /* zstd is only accepted when support for it was compiled in */
        switch (qcow2_opts->compression_type) {
#ifdef CONFIG_ZSTD
        case QCOW2_COMPRESSION_TYPE_ZSTD:
            break;
#endif
        default:
            error_setg(errp, "Unknown compression type");
            goto out;
        }

        compression_type = qcow2_opts->compression_type;
    }

    /* Create BlockBackend to write to the image */
    blk = blk_co_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
                             errp);
    if (!blk) {
        ret = -EPERM;
        goto out;
    }
    blk_set_allow_write_beyond_eof(blk, true);

    /* Write the header */
    QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
    /* A whole zeroed cluster is written; the header fills its beginning */
    header = g_malloc0(cluster_size);
    *header = (QCowHeader) {
        .magic                      = cpu_to_be32(QCOW_MAGIC),
        .version                    = cpu_to_be32(version),
        .cluster_bits               = cpu_to_be32(ctz32(cluster_size)),
        .size                       = cpu_to_be64(0),
        .l1_table_offset            = cpu_to_be64(0),
        .l1_size                    = cpu_to_be32(0),
        .refcount_table_offset      = cpu_to_be64(cluster_size),
        .refcount_table_clusters    = cpu_to_be32(1),
        .refcount_order             = cpu_to_be32(refcount_order),
        /* don't deal with endianness since compression_type is 1 byte long */
        .compression_type           = compression_type,
        .header_length              = cpu_to_be32(sizeof(*header)),
    };

    /* We'll update this to correct value later */
    header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);

    if (qcow2_opts->lazy_refcounts) {
        header->compatible_features |=
            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
    }
    if (data_bs) {
        header->incompatible_features |=
            cpu_to_be64(QCOW2_INCOMPAT_DATA_FILE);
    }
    if (qcow2_opts->data_file_raw) {
        header->autoclear_features |=
            cpu_to_be64(QCOW2_AUTOCLEAR_DATA_FILE_RAW);
    }
    if (compression_type != QCOW2_COMPRESSION_TYPE_ZLIB) {
        header->incompatible_features |=
            cpu_to_be64(QCOW2_INCOMPAT_COMPRESSION);
    }

    if (qcow2_opts->extended_l2) {
        header->incompatible_features |=
            cpu_to_be64(QCOW2_INCOMPAT_EXTL2);
    }

    ret = blk_co_pwrite(blk, 0, cluster_size, header, 0);
    /* The buffer is released before the result of the write is examined */
    g_free(header);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write qcow2 header");
        goto out;
    }

    /* Write a refcount table with one refcount block */
    /*
     * Two zeroed clusters are written at offset cluster_size: the refcount
     * table, whose first entry points at offset 2 * cluster_size, followed
     * by the (still all-zero) refcount block at that offset.
     */
    refcount_table = g_malloc0(2 * cluster_size);
    refcount_table[0] = cpu_to_be64(2 * cluster_size);
    ret = blk_co_pwrite(blk, cluster_size, 2 * cluster_size, refcount_table, 0);
    g_free(refcount_table);

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not write refcount table");
        goto out;
    }

    /*
     * Drop the BlockBackend used for the raw writes.  blk is reset to NULL
     * so that the cleanup at 'out' does not unreference it a second time.
     */
    blk_co_unref(blk);
    blk = NULL;

    /*
     * And now open the image and make it consistent first (i.e. increase the
     * refcount of the cluster that is occupied by the header and the refcount
     * table)
     */
    options = qdict_new();
    qdict_put_str(options, "driver", "qcow2");
    qdict_put_str(options, "file", bs->node_name);
    if (data_bs) {
        qdict_put_str(options, "data-file", data_bs->node_name);
    }
    blk = blk_co_new_open(NULL, NULL, options,
                          BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH,
                          errp);
    if (blk == NULL) {
        ret = -EIO;
        goto out;
    }

    /*
     * The graph read lock is held from here until after
     * qcow2_update_header(); every exit path in between must release it.
     */
    bdrv_graph_co_rdlock();
    /* Account for the header, refcount table and refcount block clusters */
    ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size);
    if (ret < 0) {
        bdrv_graph_co_rdunlock();
        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
                         "header and refcount table");
        goto out;

    } else if (ret != 0) {
        /* Any non-zero result here is treated as a fatal inconsistency */
        error_report("Huh, first cluster in empty image is already in use?");
        abort();
    }

    /* Set the external data file if necessary */
    if (data_bs) {
        BDRVQcow2State *s = blk_bs(blk)->opaque;
        s->image_data_file = g_strdup(data_bs->filename);
    }

    /* Create a full header (including things like feature table) */
    ret = qcow2_update_header(blk_bs(blk));
    bdrv_graph_co_rdunlock();

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Could not update qcow2 header");
        goto out;
    }

    /* Okay, now that we have a valid image, let's give it the right size */
    ret = blk_co_truncate(blk, qcow2_opts->size, false,
                          qcow2_opts->preallocation, 0, errp);
    if (ret < 0) {
        error_prepend(errp, "Could not resize image: ");
        goto out;
    }

    /* Want a backing file? There you go. */
    if (qcow2_opts->backing_file) {
        /* Stays NULL when no backing format was specified */
        const char *backing_format = NULL;

        if (qcow2_opts->has_backing_fmt) {
            backing_format = BlockdevDriver_str(qcow2_opts->backing_fmt);
        }

        ret = bdrv_change_backing_file(blk_bs(blk), qcow2_opts->backing_file,
                                       backing_format, false);
        if (ret < 0) {
            /* NOTE(review): backing_format may be NULL when passed to %s */
            error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
                             "with format '%s'", qcow2_opts->backing_file,
                             backing_format);
            goto out;
        }
    }

    /* Want encryption? There you go. */
    if (qcow2_opts->encrypt) {
        bdrv_graph_co_rdlock();
        ret = qcow2_set_up_encryption(blk_bs(blk), qcow2_opts->encrypt, errp);
        bdrv_graph_co_rdunlock();

        if (ret < 0) {
            goto out;
        }
    }

    blk_co_unref(blk);
    blk = NULL;

    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning.
     * Using BDRV_O_NO_IO, since encryption is now setup we don't want to
     * have to setup decryption context. We're not doing any I/O on the top
     * level BlockDriverState, only lower layers, where BDRV_O_NO_IO does
     * not have effect.
     */
    options = qdict_new();
    qdict_put_str(options, "driver", "qcow2");
    qdict_put_str(options, "file", bs->node_name);
    if (data_bs) {
        qdict_put_str(options, "data-file", data_bs->node_name);
    }
    blk = blk_co_new_open(NULL, NULL, options,
                          BDRV_O_RDWR | BDRV_O_NO_BACKING | BDRV_O_NO_IO,
                          errp);
    if (blk == NULL) {
        ret = -EIO;
        goto out;
    }

    ret = 0;
out:
    /*
     * Common exit path for success and failure.  blk and data_bs may still
     * be NULL when this point is reached.
     */
    blk_co_unref(blk);
    bdrv_co_unref(bs);
    bdrv_co_unref(data_bs);
    return ret;
}
3830 
/*
 * Legacy (QemuOpts based) image creation entry point.
 *
 * Converts @opts into a BlockdevCreateOptions object: legacy option values
 * and key names are rewritten to their QMP equivalents, the protocol-level
 * file @filename (and, if requested, an external data file) is created and
 * opened, and the resulting QDict is run through the QAPI visitor.  The
 * actual image is then created by qcow2_co_create().
 *
 * Returns 0 on success and a negative errno value on failure.  On failure,
 * deletion of the files opened here is attempted.
 */
static int coroutine_fn GRAPH_UNLOCKED
qcow2_co_create_opts(BlockDriver *drv, const char *filename, QemuOpts *opts,
                     Error **errp)
{
    BlockdevCreateOptions *create_options = NULL;
    QDict *qdict;
    Visitor *v;
    BlockDriverState *bs = NULL;
    BlockDriverState *data_bs = NULL;
    const char *val;
    int ret;

    /* Only the keyval visitor supports the dotted syntax needed for
     * encryption, so go through a QDict before getting a QAPI type. Ignore
     * options meant for the protocol layer so that the visitor doesn't
     * complain. */
    qdict = qemu_opts_to_qdict_filtered(opts, NULL, bdrv_qcow2.create_opts,
                                        true);

    /* Handle encryption options */
    /* Legacy boolean form: "on" selects the "qcow" format, "off" drops it */
    val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT);
    if (val && !strcmp(val, "on")) {
        qdict_put_str(qdict, BLOCK_OPT_ENCRYPT, "qcow");
    } else if (val && !strcmp(val, "off")) {
        qdict_del(qdict, BLOCK_OPT_ENCRYPT);
    }

    /* "aes" is accepted as another spelling of the "qcow" format */
    val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT);
    if (val && !strcmp(val, "aes")) {
        qdict_put_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT, "qcow");
    }

    /* Convert compat=0.10/1.1 into compat=v2/v3, to be renamed into
     * version=v2/v3 below. */
    val = qdict_get_try_str(qdict, BLOCK_OPT_COMPAT_LEVEL);
    if (val && !strcmp(val, "0.10")) {
        qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v2");
    } else if (val && !strcmp(val, "1.1")) {
        qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v3");
    }

    /* Change legacy command line options into QMP ones */
    static const QDictRenames opt_renames[] = {
        { BLOCK_OPT_BACKING_FILE,       "backing-file" },
        { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
        { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
        { BLOCK_OPT_LAZY_REFCOUNTS,     "lazy-refcounts" },
        { BLOCK_OPT_EXTL2,              "extended-l2" },
        { BLOCK_OPT_REFCOUNT_BITS,      "refcount-bits" },
        { BLOCK_OPT_ENCRYPT,            BLOCK_OPT_ENCRYPT_FORMAT },
        { BLOCK_OPT_COMPAT_LEVEL,       "version" },
        { BLOCK_OPT_DATA_FILE_RAW,      "data-file-raw" },
        { BLOCK_OPT_COMPRESSION_TYPE,   "compression-type" },
        { NULL, NULL },
    };

    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
        ret = -EINVAL;
        goto finish;
    }

    /* Create and open the file (protocol layer) */
    ret = bdrv_co_create_file(filename, opts, errp);
    if (ret < 0) {
        goto finish;
    }

    bs = bdrv_co_open(filename, NULL, NULL,
                      BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
    if (bs == NULL) {
        ret = -EIO;
        goto finish;
    }

    /* Create and open an external data file (protocol layer) */
    val = qdict_get_try_str(qdict, BLOCK_OPT_DATA_FILE);
    if (val) {
        ret = bdrv_co_create_file(val, opts, errp);
        if (ret < 0) {
            goto finish;
        }

        data_bs = bdrv_co_open(val, NULL, NULL,
                               BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
                               errp);
        if (data_bs == NULL) {
            ret = -EIO;
            goto finish;
        }

        /* Replace the file name with the node name of the opened data file */
        qdict_del(qdict, BLOCK_OPT_DATA_FILE);
        qdict_put_str(qdict, "data-file", data_bs->node_name);
    }

    /* Set 'driver' and 'node' options */
    qdict_put_str(qdict, "driver", "qcow2");
    qdict_put_str(qdict, "file", bs->node_name);

    /* Now get the QAPI type BlockdevCreateOptions */
    v = qobject_input_visitor_new_flat_confused(qdict, errp);
    if (!v) {
        ret = -EINVAL;
        goto finish;
    }

    visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
    visit_free(v);
    /* A NULL create_options after the visit is treated as a failure */
    if (!create_options) {
        ret = -EINVAL;
        goto finish;
    }

    /* Silently round up size */
    create_options->u.qcow2.size = ROUND_UP(create_options->u.qcow2.size,
                                            BDRV_SECTOR_SIZE);

    /* Create the qcow2 image (format layer) */
    ret = qcow2_co_create(create_options, errp);
finish:
    /*
     * On failure, try to delete the files opened above (bs and data_bs may
     * still be NULL here).  On success, any non-negative result is
     * normalized to 0.
     */
    if (ret < 0) {
        bdrv_graph_co_rdlock();
        bdrv_co_delete_file_noerr(bs);
        bdrv_co_delete_file_noerr(data_bs);
        bdrv_graph_co_rdunlock();
    } else {
        ret = 0;
    }

    qobject_unref(qdict);
    bdrv_co_unref(bs);
    bdrv_co_unref(data_bs);
    qapi_free_BlockdevCreateOptions(create_options);
    return ret;
}
3965 
3966 
3967 static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
3968 {
3969     int64_t nr;
3970     int res;
3971 
3972     /* Clamp to image length, before checking status of underlying sectors */
3973     if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
3974         bytes = bs->total_sectors * BDRV_SECTOR_SIZE - offset;
3975     }
3976 
3977     if (!bytes) {
3978         return true;
3979     }
3980 
3981     /*
3982      * bdrv_block_status_above doesn't merge different types of zeros, for
3983      * example, zeros which come from the region which is unallocated in
3984      * the whole backing chain, and zeros which come because of a short
3985      * backing file. So, we need a loop.
3986      */
3987     do {
3988         res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
3989         offset += nr;
3990         bytes -= nr;
3991     } while (res >= 0 && (res & BDRV_BLOCK_ZERO) && nr && bytes);
3992 
3993     return res >= 0 && (res & BDRV_BLOCK_ZERO) && bytes == 0;
3994 }
3995 
3996 static int coroutine_fn GRAPH_RDLOCK
3997 qcow2_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
3998                        BdrvRequestFlags flags)
3999 {
4000     int ret;
4001     BDRVQcow2State *s = bs->opaque;
4002 
4003     uint32_t head = offset_into_subcluster(s, offset);
4004     uint32_t tail = ROUND_UP(offset + bytes, s->subcluster_size) -
4005         (offset + bytes);
4006 
4007     trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, bytes);
4008     if (offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE) {
4009         tail = 0;
4010     }
4011 
4012     if (head || tail) {
4013         uint64_t off;
4014         unsigned int nr;
4015         QCow2SubclusterType type;
4016 
4017         assert(head + bytes + tail <= s->subcluster_size);
4018 
4019         /* check whether remainder of cluster already reads as zero */
4020         if (!(is_zero(bs, offset - head, head) &&
4021               is_zero(bs, offset + bytes, tail))) {
4022             return -ENOTSUP;
4023         }
4024 
4025         qemu_co_mutex_lock(&s->lock);
4026         /* We can have new write after previous check */
4027         offset -= head;
4028         bytes = s->subcluster_size;
4029         nr = s->subcluster_size;
4030         ret = qcow2_get_host_offset(bs, offset, &nr, &off, &type);
4031         if (ret < 0 ||
4032             (type != QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN &&
4033              type != QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC &&
4034              type != QCOW2_SUBCLUSTER_ZERO_PLAIN &&
4035              type != QCOW2_SUBCLUSTER_ZERO_ALLOC)) {
4036             qemu_co_mutex_unlock(&s->lock);
4037             return ret < 0 ? ret : -ENOTSUP;
4038         }
4039     } else {
4040         qemu_co_mutex_lock(&s->lock);
4041     }
4042 
4043     trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, bytes);
4044 
4045     /* Whatever is left can use real zero subclusters */
4046     ret = qcow2_subcluster_zeroize(bs, offset, bytes, flags);
4047     qemu_co_mutex_unlock(&s->lock);
4048 
4049     return ret;
4050 }
4051 
4052 static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
4053                                           int64_t offset, int64_t bytes)
4054 {
4055     int ret;
4056     BDRVQcow2State *s = bs->opaque;
4057 
4058     /* If the image does not support QCOW_OFLAG_ZERO then discarding
4059      * clusters could expose stale data from the backing file. */
4060     if (s->qcow_version < 3 && bs->backing) {
4061         return -ENOTSUP;
4062     }
4063 
4064     if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) {
4065         assert(bytes < s->cluster_size);
4066         /* Ignore partial clusters, except for the special case of the
4067          * complete partial cluster at the end of an unaligned file */
4068         if (!QEMU_IS_ALIGNED(offset, s->cluster_size) ||
4069             offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) {
4070             return -ENOTSUP;
4071         }
4072     }
4073 
4074     qemu_co_mutex_lock(&s->lock);
4075     ret = qcow2_cluster_discard(bs, offset, bytes, QCOW2_DISCARD_REQUEST,
4076                                 false);
4077     qemu_co_mutex_unlock(&s->lock);
4078     return ret;
4079 }
4080 
4081 static int coroutine_fn GRAPH_RDLOCK
4082 qcow2_co_copy_range_from(BlockDriverState *bs,
4083                          BdrvChild *src, int64_t src_offset,
4084                          BdrvChild *dst, int64_t dst_offset,
4085                          int64_t bytes, BdrvRequestFlags read_flags,
4086                          BdrvRequestFlags write_flags)
4087 {
4088     BDRVQcow2State *s = bs->opaque;
4089     int ret;
4090     unsigned int cur_bytes; /* number of bytes in current iteration */
4091     BdrvChild *child = NULL;
4092     BdrvRequestFlags cur_write_flags;
4093 
4094     assert(!bs->encrypted);
4095     qemu_co_mutex_lock(&s->lock);
4096 
4097     while (bytes != 0) {
4098         uint64_t copy_offset = 0;
4099         QCow2SubclusterType type;
4100         /* prepare next request */
4101         cur_bytes = MIN(bytes, INT_MAX);
4102         cur_write_flags = write_flags;
4103 
4104         ret = qcow2_get_host_offset(bs, src_offset, &cur_bytes,
4105                                     &copy_offset, &type);
4106         if (ret < 0) {
4107             goto out;
4108         }
4109 
4110         switch (type) {
4111         case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
4112         case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
4113             if (bs->backing && bs->backing->bs) {
4114                 int64_t backing_length = bdrv_co_getlength(bs->backing->bs);
4115                 if (src_offset >= backing_length) {
4116                     cur_write_flags |= BDRV_REQ_ZERO_WRITE;
4117                 } else {
4118                     child = bs->backing;
4119                     cur_bytes = MIN(cur_bytes, backing_length - src_offset);
4120                     copy_offset = src_offset;
4121                 }
4122             } else {
4123                 cur_write_flags |= BDRV_REQ_ZERO_WRITE;
4124             }
4125             break;
4126 
4127         case QCOW2_SUBCLUSTER_ZERO_PLAIN:
4128         case QCOW2_SUBCLUSTER_ZERO_ALLOC:
4129             cur_write_flags |= BDRV_REQ_ZERO_WRITE;
4130             break;
4131 
4132         case QCOW2_SUBCLUSTER_COMPRESSED:
4133             ret = -ENOTSUP;
4134             goto out;
4135 
4136         case QCOW2_SUBCLUSTER_NORMAL:
4137             child = s->data_file;
4138             break;
4139 
4140         default:
4141             abort();
4142         }
4143         qemu_co_mutex_unlock(&s->lock);
4144         ret = bdrv_co_copy_range_from(child,
4145                                       copy_offset,
4146                                       dst, dst_offset,
4147                                       cur_bytes, read_flags, cur_write_flags);
4148         qemu_co_mutex_lock(&s->lock);
4149         if (ret < 0) {
4150             goto out;
4151         }
4152 
4153         bytes -= cur_bytes;
4154         src_offset += cur_bytes;
4155         dst_offset += cur_bytes;
4156     }
4157     ret = 0;
4158 
4159 out:
4160     qemu_co_mutex_unlock(&s->lock);
4161     return ret;
4162 }
4163 
4164 static int coroutine_fn GRAPH_RDLOCK
4165 qcow2_co_copy_range_to(BlockDriverState *bs,
4166                        BdrvChild *src, int64_t src_offset,
4167                        BdrvChild *dst, int64_t dst_offset,
4168                        int64_t bytes, BdrvRequestFlags read_flags,
4169                        BdrvRequestFlags write_flags)
4170 {
4171     BDRVQcow2State *s = bs->opaque;
4172     int ret;
4173     unsigned int cur_bytes; /* number of sectors in current iteration */
4174     uint64_t host_offset;
4175     QCowL2Meta *l2meta = NULL;
4176 
4177     assert(!bs->encrypted);
4178 
4179     qemu_co_mutex_lock(&s->lock);
4180 
4181     while (bytes != 0) {
4182 
4183         l2meta = NULL;
4184 
4185         cur_bytes = MIN(bytes, INT_MAX);
4186 
4187         /* TODO:
4188          * If src->bs == dst->bs, we could simply copy by incrementing
4189          * the refcnt, without copying user data.
4190          * Or if src->bs == dst->bs->backing->bs, we could copy by discarding. */
4191         ret = qcow2_alloc_host_offset(bs, dst_offset, &cur_bytes,
4192                                       &host_offset, &l2meta);
4193         if (ret < 0) {
4194             goto fail;
4195         }
4196 
4197         ret = qcow2_pre_write_overlap_check(bs, 0, host_offset, cur_bytes,
4198                                             true);
4199         if (ret < 0) {
4200             goto fail;
4201         }
4202 
4203         qemu_co_mutex_unlock(&s->lock);
4204         ret = bdrv_co_copy_range_to(src, src_offset, s->data_file, host_offset,
4205                                     cur_bytes, read_flags, write_flags);
4206         qemu_co_mutex_lock(&s->lock);
4207         if (ret < 0) {
4208             goto fail;
4209         }
4210 
4211         ret = qcow2_handle_l2meta(bs, &l2meta, true);
4212         if (ret) {
4213             goto fail;
4214         }
4215 
4216         bytes -= cur_bytes;
4217         src_offset += cur_bytes;
4218         dst_offset += cur_bytes;
4219     }
4220     ret = 0;
4221 
4222 fail:
4223     qcow2_handle_l2meta(bs, &l2meta, false);
4224 
4225     qemu_co_mutex_unlock(&s->lock);
4226 
4227     trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
4228 
4229     return ret;
4230 }
4231 
/*
 * Resize the guest-visible size of the qcow2 image @bs to @offset bytes.
 *
 * @offset:   new virtual disk size; must be a multiple of BDRV_SECTOR_SIZE.
 * @exact:    only forwarded to the truncate of an external data file in the
 *            PREALLOC_MODE_OFF case; the image file itself is always
 *            truncated with exact=false.
 * @prealloc: preallocation mode for the added area.  Only OFF, METADATA,
 *            FALLOC and FULL are accepted, and only OFF is accepted when
 *            the image shrinks.
 * @flags:    if BDRV_REQ_ZERO_WRITE is set and the image grows, the added
 *            area is zeroed (by the protocol driver if it can, otherwise
 *            through zero subclusters plus an explicit write for an
 *            unaligned head).
 * @errp:     receives a description of the failure.
 *
 * s->lock is taken for the whole operation; it is only dropped around the
 * explicit zero write of the unaligned head.
 *
 * Returns 0 on success and a negative errno value on failure.
 */
4232 static int coroutine_fn GRAPH_RDLOCK
4233 qcow2_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
4234                   PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
4235 {
4236     BDRVQcow2State *s = bs->opaque;
4237     uint64_t old_length;
4238     int64_t new_l1_size;
4239     int ret;
4240     QDict *options;
4241 
4242     if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA &&
4243         prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL)
4244     {
4245         error_setg(errp, "Unsupported preallocation mode '%s'",
4246                    PreallocMode_str(prealloc));
4247         return -ENOTSUP;
4248     }
4249 
4250     if (!QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)) {
4251         error_setg(errp, "The new size must be a multiple of %u",
4252                    (unsigned) BDRV_SECTOR_SIZE);
4253         return -EINVAL;
4254     }
4255 
4256     qemu_co_mutex_lock(&s->lock);
4257 
4258     /*
4259      * Even though we store snapshot size for all images, it was not
4260      * required until v3, so it is not safe to proceed for v2.
4261      */
4262     if (s->nb_snapshots && s->qcow_version < 3) {
4263         error_setg(errp, "Can't resize a v2 image which has snapshots");
4264         ret = -ENOTSUP;
4265         goto fail;
4266     }
4267 
4268     /* See qcow2-bitmap.c for which bitmap scenarios prevent a resize. */
4269     if (qcow2_truncate_bitmaps_check(bs, errp)) {
4270         ret = -ENOTSUP;
4271         goto fail;
4272     }
4273 
4274     old_length = bs->total_sectors * BDRV_SECTOR_SIZE;
4275     new_l1_size = size_to_l1(s, offset);
4276 
         /*
          * Shrinking: discard the cropped clusters, shrink the L1 table and
          * the reftable, then try to cut off the unused tail of the file.
          */
4277     if (offset < old_length) {
4278         int64_t last_cluster, old_file_size;
4279         if (prealloc != PREALLOC_MODE_OFF) {
4280             error_setg(errp,
4281                        "Preallocation can't be used for shrinking an image");
4282             ret = -EINVAL;
4283             goto fail;
4284         }
4285 
4286         ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
4287                                     old_length - ROUND_UP(offset,
4288                                                           s->cluster_size),
4289                                     QCOW2_DISCARD_ALWAYS, true);
4290         if (ret < 0) {
4291             error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
4292             goto fail;
4293         }
4294 
4295         ret = qcow2_shrink_l1_table(bs, new_l1_size);
4296         if (ret < 0) {
4297             error_setg_errno(errp, -ret,
4298                              "Failed to reduce the number of L2 tables");
4299             goto fail;
4300         }
4301 
4302         ret = qcow2_shrink_reftable(bs);
4303         if (ret < 0) {
4304             error_setg_errno(errp, -ret,
4305                              "Failed to discard unused refblocks");
4306             goto fail;
4307         }
4308 
4309         old_file_size = bdrv_co_getlength(bs->file->bs);
4310         if (old_file_size < 0) {
4311             error_setg_errno(errp, -old_file_size,
4312                              "Failed to inquire current file length");
4313             ret = old_file_size;
4314             goto fail;
4315         }
4316         last_cluster = qcow2_get_last_cluster(bs, old_file_size);
4317         if (last_cluster < 0) {
4318             error_setg_errno(errp, -last_cluster,
4319                              "Failed to find the last cluster");
4320             ret = last_cluster;
4321             goto fail;
4322         }
4323         if ((last_cluster + 1) * s->cluster_size < old_file_size) {
4324             Error *local_err = NULL;
4325 
4326             /*
4327              * Do not pass @exact here: It will not help the user if
4328              * we get an error here just because they wanted to shrink
4329              * their qcow2 image (on a block device) with qemu-img.
4330              * (And on the qcow2 layer, the @exact requirement is
4331              * always fulfilled, so there is no need to pass it on.)
4332              */
                 /* Best effort: a failure here only produces a warning */
4333             bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
4334                              false, PREALLOC_MODE_OFF, 0, &local_err);
4335             if (local_err) {
4336                 warn_reportf_err(local_err,
4337                                  "Failed to truncate the tail of the image: ");
4338             }
4339         }
4340     } else {
             /* Growing (or unchanged size): the L1 table must cover @offset */
4341         ret = qcow2_grow_l1_table(bs, new_l1_size, true);
4342         if (ret < 0) {
4343             error_setg_errno(errp, -ret, "Failed to grow the L1 table");
4344             goto fail;
4345         }
4346 
4347         if (data_file_is_raw(bs) && prealloc == PREALLOC_MODE_OFF) {
4348             /*
4349              * When creating a qcow2 image with data-file-raw, we enforce
4350              * at least prealloc=metadata, so that the L1/L2 tables are
4351              * fully allocated and reading from the data file will return
4352              * the same data as reading from the qcow2 image.  When the
4353              * image is grown, we must consequently preallocate the
4354              * metadata structures to cover the added area.
4355              */
4356             prealloc = PREALLOC_MODE_METADATA;
4357         }
4358     }
4359 
4360     switch (prealloc) {
4361     case PREALLOC_MODE_OFF:
4362         if (has_data_file(bs)) {
4363             /*
4364              * If the caller wants an exact resize, the external data
4365              * file should be resized to the exact target size, too,
4366              * so we pass @exact here.
4367              */
4368             ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, 0,
4369                                    errp);
4370             if (ret < 0) {
4371                 goto fail;
4372             }
4373         }
4374         break;
4375 
4376     case PREALLOC_MODE_METADATA:
4377         ret = preallocate_co(bs, old_length, offset, prealloc, errp);
4378         if (ret < 0) {
4379             goto fail;
4380         }
4381         break;
4382 
4383     case PREALLOC_MODE_FALLOC:
4384     case PREALLOC_MODE_FULL:
4385     {
4386         int64_t allocation_start, host_offset, guest_offset;
4387         int64_t clusters_allocated;
4388         int64_t old_file_size, last_cluster, new_file_size;
4389         uint64_t nb_new_data_clusters, nb_new_l2_tables;
4390         bool subclusters_need_allocation = false;
4391 
4392         /* With a data file, preallocation means just allocating the metadata
4393          * and forwarding the truncate request to the data file */
4394         if (has_data_file(bs)) {
4395             ret = preallocate_co(bs, old_length, offset, prealloc, errp);
4396             if (ret < 0) {
4397                 goto fail;
4398             }
4399             break;
4400         }
4401 
4402         old_file_size = bdrv_co_getlength(bs->file->bs);
4403         if (old_file_size < 0) {
4404             error_setg_errno(errp, -old_file_size,
4405                              "Failed to inquire current file length");
4406             ret = old_file_size;
4407             goto fail;
4408         }
4409 
             /*
              * Start the new area right behind the last cluster found by
              * qcow2_get_last_cluster(); if that lookup fails, fall back to
              * the cluster-aligned current file length.
              */
4410         last_cluster = qcow2_get_last_cluster(bs, old_file_size);
4411         if (last_cluster >= 0) {
4412             old_file_size = (last_cluster + 1) * s->cluster_size;
4413         } else {
4414             old_file_size = ROUND_UP(old_file_size, s->cluster_size);
4415         }
4416 
4417         nb_new_data_clusters = (ROUND_UP(offset, s->cluster_size) -
4418             start_of_cluster(s, old_length)) >> s->cluster_bits;
4419 
4420         /* This is an overestimation; we will not actually allocate space for
4421          * these in the file but just make sure the new refcount structures are
4422          * able to cover them so we will not have to allocate new refblocks
4423          * while entering the data blocks in the potentially new L2 tables.
4424          * (We do not actually care where the L2 tables are placed. Maybe they
4425          *  are already allocated or they can be placed somewhere before
4426          *  @old_file_size. It does not matter because they will be fully
4427          *  allocated automatically, so they do not need to be covered by the
4428          *  preallocation. All that matters is that we will not have to allocate
4429          *  new refcount structures for them.) */
4430         nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters,
4431                                         s->cluster_size / l2_entry_size(s));
4432         /* The cluster range may not be aligned to L2 boundaries, so add one L2
4433          * table for a potential head/tail */
4434         nb_new_l2_tables++;
4435 
4436         allocation_start = qcow2_refcount_area(bs, old_file_size,
4437                                                nb_new_data_clusters +
4438                                                nb_new_l2_tables,
4439                                                true, 0, 0);
4440         if (allocation_start < 0) {
4441             error_setg_errno(errp, -allocation_start,
4442                              "Failed to resize refcount structures");
4443             ret = allocation_start;
4444             goto fail;
4445         }
4446 
4447         clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start,
4448                                                      nb_new_data_clusters);
4449         if (clusters_allocated < 0) {
4450             error_setg_errno(errp, -clusters_allocated,
4451                              "Failed to allocate data clusters");
4452             ret = clusters_allocated;
4453             goto fail;
4454         }
4455 
4456         assert(clusters_allocated == nb_new_data_clusters);
4457 
4458         /* Allocate the data area */
4459         new_file_size = allocation_start +
4460                         nb_new_data_clusters * s->cluster_size;
4461         /*
4462          * Image file grows, so @exact does not matter.
4463          *
4464          * If we need to zero out the new area, try first whether the protocol
4465          * driver can already take care of this.
4466          */
4467         if (flags & BDRV_REQ_ZERO_WRITE) {
4468             ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc,
4469                                    BDRV_REQ_ZERO_WRITE, NULL);
4470             if (ret >= 0) {
4471                 flags &= ~BDRV_REQ_ZERO_WRITE;
4472                 /* Ensure that we read zeroes and not backing file data */
4473                 subclusters_need_allocation = true;
4474             }
4475         } else {
                 /* No zero-write attempt was made; force the plain truncate */
4476             ret = -1;
4477         }
4478         if (ret < 0) {
4479             ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0,
4480                                    errp);
4481         }
4482         if (ret < 0) {
4483             error_prepend(errp, "Failed to resize underlying file: ");
                 /* Give back the data clusters allocated above */
4484             qcow2_free_clusters(bs, allocation_start,
4485                                 nb_new_data_clusters * s->cluster_size,
4486                                 QCOW2_DISCARD_OTHER);
4487             goto fail;
4488         }
4489 
4490         /* Create the necessary L2 entries */
4491         host_offset = allocation_start;
4492         guest_offset = old_length;
             /* Each iteration links at most the rest of one L2 slice */
4493         while (nb_new_data_clusters) {
4494             int64_t nb_clusters = MIN(
4495                 nb_new_data_clusters,
4496                 s->l2_slice_size - offset_to_l2_slice_index(s, guest_offset));
4497             unsigned cow_start_length = offset_into_cluster(s, guest_offset);
4498             QCowL2Meta allocation;
4499             guest_offset = start_of_cluster(s, guest_offset);
4500             allocation = (QCowL2Meta) {
4501                 .offset       = guest_offset,
4502                 .alloc_offset = host_offset,
4503                 .nb_clusters  = nb_clusters,
4504                 .cow_start    = {
4505                     .offset       = 0,
4506                     .nb_bytes     = cow_start_length,
4507                 },
4508                 .cow_end      = {
4509                     .offset       = nb_clusters << s->cluster_bits,
4510                     .nb_bytes     = 0,
4511                 },
4512                 .prealloc     = !subclusters_need_allocation,
4513             };
4514             qemu_co_queue_init(&allocation.dependent_requests);
4515 
4516             ret = qcow2_alloc_cluster_link_l2(bs, &allocation);
4517             if (ret < 0) {
4518                 error_setg_errno(errp, -ret, "Failed to update L2 tables");
                     /* Free the clusters that have not been linked yet */
4519                 qcow2_free_clusters(bs, host_offset,
4520                                     nb_new_data_clusters * s->cluster_size,
4521                                     QCOW2_DISCARD_OTHER);
4522                 goto fail;
4523             }
4524 
4525             guest_offset += nb_clusters * s->cluster_size;
4526             host_offset += nb_clusters * s->cluster_size;
4527             nb_new_data_clusters -= nb_clusters;
4528         }
4529         break;
4530     }
4531 
4532     default:
4533         g_assert_not_reached();
4534     }
4535 
         /*
          * BDRV_REQ_ZERO_WRITE is still set here unless the protocol driver
          * already zeroed the new area in the FALLOC/FULL case above.
          */
4536     if ((flags & BDRV_REQ_ZERO_WRITE) && offset > old_length) {
4537         uint64_t zero_start = QEMU_ALIGN_UP(old_length, s->subcluster_size);
4538 
4539         /*
4540          * Use zero clusters as much as we can. qcow2_subcluster_zeroize()
4541          * requires a subcluster-aligned start. The end may be unaligned if
4542          * it is at the end of the image (which it is here).
4543          */
4544         if (offset > zero_start) {
4545             ret = qcow2_subcluster_zeroize(bs, zero_start, offset - zero_start,
4546                                            0);
4547             if (ret < 0) {
4548                 error_setg_errno(errp, -ret, "Failed to zero out new clusters");
4549                 goto fail;
4550             }
4551         }
4552 
4553         /* Write explicit zeros for the unaligned head */
4554         if (zero_start > old_length) {
4555             uint64_t len = MIN(zero_start, offset) - old_length;
4556             uint8_t *buf = qemu_blockalign0(bs, len);
4557             QEMUIOVector qiov;
4558             qemu_iovec_init_buf(&qiov, buf, len);
4559 
                 /* The write is issued with s->lock released */
4560             qemu_co_mutex_unlock(&s->lock);
4561             ret = qcow2_co_pwritev_part(bs, old_length, len, &qiov, 0, 0);
4562             qemu_co_mutex_lock(&s->lock);
4563 
4564             qemu_vfree(buf);
4565             if (ret < 0) {
4566                 error_setg_errno(errp, -ret, "Failed to zero out the new area");
4567                 goto fail;
4568             }
4569         }
4570     }
4571 
4572     if (prealloc != PREALLOC_MODE_OFF) {
4573         /* Flush metadata before actually changing the image size */
4574         ret = qcow2_write_caches(bs);
4575         if (ret < 0) {
4576             error_setg_errno(errp, -ret,
4577                              "Failed to flush the preallocated area to disk");
4578             goto fail;
4579         }
4580     }
4581 
4582     bs->total_sectors = offset / BDRV_SECTOR_SIZE;
4583 
4584     /* write updated header.size */
         /* From here on @offset holds the big-endian on-disk value */
4585     offset = cpu_to_be64(offset);
4586     ret = bdrv_co_pwrite_sync(bs->file, offsetof(QCowHeader, size),
4587                               sizeof(offset), &offset, 0);
4588     if (ret < 0) {
4589         error_setg_errno(errp, -ret, "Failed to update the image size");
4590         goto fail;
4591     }
4592 
4593     s->l1_vm_state_index = new_l1_size;
4594 
4595     /* Update cache sizes */
4596     options = qdict_clone_shallow(bs->options);
4597     ret = qcow2_update_options(bs, options, s->flags, errp);
4598     qobject_unref(options);
4599     if (ret < 0) {
4600         goto fail;
4601     }
4602     ret = 0;
         /* Common exit for success and failure: only the lock is released */
4603 fail:
4604     qemu_co_mutex_unlock(&s->lock);
4605     return ret;
4606 }
4607 
4608 static int coroutine_fn GRAPH_RDLOCK
4609 qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
4610                                  uint64_t offset, uint64_t bytes,
4611                                  QEMUIOVector *qiov, size_t qiov_offset)
4612 {
4613     BDRVQcow2State *s = bs->opaque;
4614     int ret;
4615     ssize_t out_len;
4616     uint8_t *buf, *out_buf;
4617     uint64_t cluster_offset;
4618 
4619     assert(bytes == s->cluster_size || (bytes < s->cluster_size &&
4620            (offset + bytes == bs->total_sectors << BDRV_SECTOR_BITS)));
4621 
4622     buf = qemu_blockalign(bs, s->cluster_size);
4623     if (bytes < s->cluster_size) {
4624         /* Zero-pad last write if image size is not cluster aligned */
4625         memset(buf + bytes, 0, s->cluster_size - bytes);
4626     }
4627     qemu_iovec_to_buf(qiov, qiov_offset, buf, bytes);
4628 
4629     out_buf = g_malloc(s->cluster_size);
4630 
4631     out_len = qcow2_co_compress(bs, out_buf, s->cluster_size - 1,
4632                                 buf, s->cluster_size);
4633     if (out_len == -ENOMEM) {
4634         /* could not compress: write normal cluster */
4635         ret = qcow2_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, 0);
4636         if (ret < 0) {
4637             goto fail;
4638         }
4639         goto success;
4640     } else if (out_len < 0) {
4641         ret = -EINVAL;
4642         goto fail;
4643     }
4644 
4645     qemu_co_mutex_lock(&s->lock);
4646     ret = qcow2_alloc_compressed_cluster_offset(bs, offset, out_len,
4647                                                 &cluster_offset);
4648     if (ret < 0) {
4649         qemu_co_mutex_unlock(&s->lock);
4650         goto fail;
4651     }
4652 
4653     ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len, true);
4654     qemu_co_mutex_unlock(&s->lock);
4655     if (ret < 0) {
4656         goto fail;
4657     }
4658 
4659     BLKDBG_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
4660     ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
4661     if (ret < 0) {
4662         goto fail;
4663     }
4664 success:
4665     ret = 0;
4666 fail:
4667     qemu_vfree(buf);
4668     g_free(out_buf);
4669     return ret;
4670 }
4671 
4672 /*
4673  * This function can count as GRAPH_RDLOCK because
4674  * qcow2_co_pwritev_compressed_part() holds the graph lock and keeps it until
4675  * this coroutine has terminated.
4676  */
4677 static int coroutine_fn GRAPH_RDLOCK
4678 qcow2_co_pwritev_compressed_task_entry(AioTask *task)
4679 {
4680     Qcow2AioTask *t = container_of(task, Qcow2AioTask, task);
4681 
4682     assert(!t->subcluster_type && !t->l2meta);
4683 
4684     return qcow2_co_pwritev_compressed_task(t->bs, t->offset, t->bytes, t->qiov,
4685                                             t->qiov_offset);
4686 }
4687 
4688 /*
4689  * XXX: put compressed sectors first, then all the cluster aligned
4690  * tables to avoid losing bytes in alignment
4691  */
4692 static int coroutine_fn GRAPH_RDLOCK
4693 qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
4694                                  int64_t offset, int64_t bytes,
4695                                  QEMUIOVector *qiov, size_t qiov_offset)
4696 {
4697     BDRVQcow2State *s = bs->opaque;
4698     AioTaskPool *aio = NULL;
4699     int ret = 0;
4700 
4701     if (has_data_file(bs)) {
4702         return -ENOTSUP;
4703     }
4704 
4705     if (bytes == 0) {
4706         /*
4707          * align end of file to a sector boundary to ease reading with
4708          * sector based I/Os
4709          */
4710         int64_t len = bdrv_co_getlength(bs->file->bs);
4711         if (len < 0) {
4712             return len;
4713         }
4714         return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, 0,
4715                                 NULL);
4716     }
4717 
4718     if (offset_into_cluster(s, offset)) {
4719         return -EINVAL;
4720     }
4721 
4722     if (offset_into_cluster(s, bytes) &&
4723         (offset + bytes) != (bs->total_sectors << BDRV_SECTOR_BITS)) {
4724         return -EINVAL;
4725     }
4726 
4727     while (bytes && aio_task_pool_status(aio) == 0) {
4728         uint64_t chunk_size = MIN(bytes, s->cluster_size);
4729 
4730         if (!aio && chunk_size != bytes) {
4731             aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
4732         }
4733 
4734         ret = qcow2_add_task(bs, aio, qcow2_co_pwritev_compressed_task_entry,
4735                              0, 0, offset, chunk_size, qiov, qiov_offset, NULL);
4736         if (ret < 0) {
4737             break;
4738         }
4739         qiov_offset += chunk_size;
4740         offset += chunk_size;
4741         bytes -= chunk_size;
4742     }
4743 
4744     if (aio) {
4745         aio_task_pool_wait_all(aio);
4746         if (ret == 0) {
4747             ret = aio_task_pool_status(aio);
4748         }
4749         g_free(aio);
4750     }
4751 
4752     return ret;
4753 }
4754 
4755 static int coroutine_fn GRAPH_RDLOCK
4756 qcow2_co_preadv_compressed(BlockDriverState *bs,
4757                            uint64_t l2_entry,
4758                            uint64_t offset,
4759                            uint64_t bytes,
4760                            QEMUIOVector *qiov,
4761                            size_t qiov_offset)
4762 {
4763     BDRVQcow2State *s = bs->opaque;
4764     int ret = 0, csize;
4765     uint64_t coffset;
4766     uint8_t *buf, *out_buf;
4767     int offset_in_cluster = offset_into_cluster(s, offset);
4768 
4769     qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
4770 
4771     buf = g_try_malloc(csize);
4772     if (!buf) {
4773         return -ENOMEM;
4774     }
4775 
4776     out_buf = qemu_blockalign(bs, s->cluster_size);
4777 
4778     BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
4779     ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
4780     if (ret < 0) {
4781         goto fail;
4782     }
4783 
4784     if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) {
4785         ret = -EIO;
4786         goto fail;
4787     }
4788 
4789     qemu_iovec_from_buf(qiov, qiov_offset, out_buf + offset_in_cluster, bytes);
4790 
4791 fail:
4792     qemu_vfree(out_buf);
4793     g_free(buf);
4794 
4795     return ret;
4796 }
4797 
/*
 * Empty the image by rebuilding its metadata from scratch.
 *
 * The resulting layout is: the image header in cluster 0, an empty
 * one-cluster refcount table in cluster 1, the first refcount block in
 * cluster 2 and the zeroed L1 table starting at cluster 3.  The image file is
 * finally truncated to (3 + l1_clusters) clusters.
 *
 * The image is marked dirty while the on-disk and in-memory refcount
 * information disagree and marked clean again once they are consistent.
 *
 * Returns 0 on success and a negative errno value on failure.  Failures
 * that go through fail_broken_refcounts additionally set bs->drv to NULL.
 */
4798 static int make_completely_empty(BlockDriverState *bs)
4799 {
4800     BDRVQcow2State *s = bs->opaque;
4801     Error *local_err = NULL;
4802     int ret, l1_clusters;
4803     int64_t offset;
4804     uint64_t *new_reftable = NULL;
4805     uint64_t rt_entry, l1_size2;
         /*
          * Packed image of three header fields, written in one go starting
          * at offsetof(QCowHeader, l1_table_offset).
          */
4806     struct {
4807         uint64_t l1_offset;
4808         uint64_t reftable_offset;
4809         uint32_t reftable_clusters;
4810     } QEMU_PACKED l1_ofs_rt_ofs_cls;
4811 
4812     ret = qcow2_cache_empty(bs, s->l2_table_cache);
4813     if (ret < 0) {
4814         goto fail;
4815     }
4816 
4817     ret = qcow2_cache_empty(bs, s->refcount_block_cache);
4818     if (ret < 0) {
4819         goto fail;
4820     }
4821 
4822     /* Refcounts will be broken utterly */
4823     ret = qcow2_mark_dirty(bs);
4824     if (ret < 0) {
4825         goto fail;
4826     }
4827 
4828     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
4829 
         /* L1 table size in clusters and in bytes */
4830     l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / L1E_SIZE);
4831     l1_size2 = (uint64_t)s->l1_size * L1E_SIZE;
4832 
4833     /* After this call, neither the in-memory nor the on-disk refcount
4834      * information accurately describe the actual references */
4835 
4836     ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset,
4837                              l1_clusters * s->cluster_size, 0);
4838     if (ret < 0) {
4839         goto fail_broken_refcounts;
4840     }
4841     memset(s->l1_table, 0, l1_size2);
4842 
4843     BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
4844 
4845     /* Overwrite enough clusters at the beginning of the sectors to place
4846      * the refcount table, a refcount block and the L1 table in; this may
4847      * overwrite parts of the existing refcount and L1 table, which is not
4848      * an issue because the dirty flag is set, complete data loss is in fact
4849      * desired and partial data loss is consequently fine as well */
4850     ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size,
4851                              (2 + l1_clusters) * s->cluster_size, 0);
4852     /* This call (even if it failed overall) may have overwritten on-disk
4853      * refcount structures; in that case, the in-memory refcount information
4854      * will probably differ from the on-disk information which makes the BDS
4855      * unusable */
4856     if (ret < 0) {
4857         goto fail_broken_refcounts;
4858     }
4859 
4860     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
4861     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
4862 
4863     /* "Create" an empty reftable (one cluster) directly after the image
4864      * header and an empty L1 table three clusters after the image header;
4865      * the cluster between those two will be used as the first refblock */
4866     l1_ofs_rt_ofs_cls.l1_offset = cpu_to_be64(3 * s->cluster_size);
4867     l1_ofs_rt_ofs_cls.reftable_offset = cpu_to_be64(s->cluster_size);
4868     l1_ofs_rt_ofs_cls.reftable_clusters = cpu_to_be32(1);
4869     ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
4870                            sizeof(l1_ofs_rt_ofs_cls), &l1_ofs_rt_ofs_cls, 0);
4871     if (ret < 0) {
4872         goto fail_broken_refcounts;
4873     }
4874 
4875     s->l1_table_offset = 3 * s->cluster_size;
4876 
4877     new_reftable = g_try_new0(uint64_t, s->cluster_size / REFTABLE_ENTRY_SIZE);
4878     if (!new_reftable) {
4879         ret = -ENOMEM;
4880         goto fail_broken_refcounts;
4881     }
4882 
4883     s->refcount_table_offset = s->cluster_size;
4884     s->refcount_table_size   = s->cluster_size / REFTABLE_ENTRY_SIZE;
4885     s->max_refcount_table_index = 0;
4886 
4887     g_free(s->refcount_table);
4888     s->refcount_table = new_reftable;
         /* Now owned by s->refcount_table; the fail path must not free it */
4889     new_reftable = NULL;
4890 
4891     /* Now the in-memory refcount information again corresponds to the on-disk
4892      * information (reftable is empty and no refblocks (the refblock cache is
4893      * empty)); however, this means some clusters (e.g. the image header) are
4894      * referenced, but not refcounted, but the normal qcow2 code assumes that
4895      * the in-memory information is always correct */
4896 
4897     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
4898 
4899     /* Enter the first refblock into the reftable */
4900     rt_entry = cpu_to_be64(2 * s->cluster_size);
4901     ret = bdrv_pwrite_sync(bs->file, s->cluster_size, sizeof(rt_entry),
4902                            &rt_entry, 0);
4903     if (ret < 0) {
4904         goto fail_broken_refcounts;
4905     }
4906     s->refcount_table[0] = 2 * s->cluster_size;
4907 
         /*
          * Allocate three clusters plus the L1 table, searching from cluster
          * 0; the allocation is expected to land at offset 0, anything else
          * aborts.
          */
4908     s->free_cluster_index = 0;
4909     assert(3 + l1_clusters <= s->refcount_block_size);
4910     offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2);
4911     if (offset < 0) {
4912         ret = offset;
4913         goto fail_broken_refcounts;
4914     } else if (offset > 0) {
4915         error_report("First cluster in emptied image is in use");
4916         abort();
4917     }
4918 
4919     /* Now finally the in-memory information corresponds to the on-disk
4920      * structures and is correct */
4921     ret = qcow2_mark_clean(bs);
4922     if (ret < 0) {
4923         goto fail;
4924     }
4925 
         /* Cut the file down to the clusters that are still in use */
4926     ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false,
4927                         PREALLOC_MODE_OFF, 0, &local_err);
4928     if (ret < 0) {
4929         error_report_err(local_err);
4930         goto fail;
4931     }
4932 
4933     return 0;
4934 
4935 fail_broken_refcounts:
4936     /* The BDS is unusable at this point. If we wanted to make it usable, we
4937      * would have to call qcow2_refcount_close(), qcow2_refcount_init(),
4938      * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init()
4939      * again. However, because the functions which could have caused this error
4940      * path to be taken are used by those functions as well, it's very likely
4941      * that that sequence will fail as well. Therefore, just eject the BDS. */
4942     bs->drv = NULL;
4943 
4944 fail:
4945     g_free(new_reftable);
4946     return ret;
4947 }
4948 
4949 static int qcow2_make_empty(BlockDriverState *bs)
4950 {
4951     BDRVQcow2State *s = bs->opaque;
4952     uint64_t offset, end_offset;
4953     int step = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size);
4954     int l1_clusters, ret = 0;
4955 
4956     l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / L1E_SIZE);
4957 
4958     if (s->qcow_version >= 3 && !s->snapshots && !s->nb_bitmaps &&
4959         3 + l1_clusters <= s->refcount_block_size &&
4960         s->crypt_method_header != QCOW_CRYPT_LUKS &&
4961         !has_data_file(bs)) {
4962         /* The following function only works for qcow2 v3 images (it
4963          * requires the dirty flag) and only as long as there are no
4964          * features that reserve extra clusters (such as snapshots,
4965          * LUKS header, or persistent bitmaps), because it completely
4966          * empties the image.  Furthermore, the L1 table and three
4967          * additional clusters (image header, refcount table, one
4968          * refcount block) have to fit inside one refcount block. It
4969          * only resets the image file, i.e. does not work with an
4970          * external data file. */
4971         return make_completely_empty(bs);
4972     }
4973 
4974     /* This fallback code simply discards every active cluster; this is slow,
4975      * but works in all cases */
4976     end_offset = bs->total_sectors * BDRV_SECTOR_SIZE;
4977     for (offset = 0; offset < end_offset; offset += step) {
4978         /* As this function is generally used after committing an external
4979          * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
4980          * default action for this kind of discard is to pass the discard,
4981          * which will ideally result in an actually smaller image file, as
4982          * is probably desired. */
4983         ret = qcow2_cluster_discard(bs, offset, MIN(step, end_offset - offset),
4984                                     QCOW2_DISCARD_SNAPSHOT, true);
4985         if (ret < 0) {
4986             break;
4987         }
4988     }
4989 
4990     return ret;
4991 }
4992 
4993 static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
4994 {
4995     BDRVQcow2State *s = bs->opaque;
4996     int ret;
4997 
4998     qemu_co_mutex_lock(&s->lock);
4999     ret = qcow2_write_caches(bs);
5000     qemu_co_mutex_unlock(&s->lock);
5001 
5002     return ret;
5003 }
5004 
5005 static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
5006                                        Error **errp)
5007 {
5008     Error *local_err = NULL;
5009     BlockMeasureInfo *info;
5010     uint64_t required = 0; /* bytes that contribute to required size */
5011     uint64_t virtual_size; /* disk size as seen by guest */
5012     uint64_t refcount_bits;
5013     uint64_t l2_tables;
5014     uint64_t luks_payload_size = 0;
5015     size_t cluster_size;
5016     int version;
5017     char *optstr;
5018     PreallocMode prealloc;
5019     bool has_backing_file;
5020     bool has_luks;
5021     bool extended_l2;
5022     size_t l2e_size;
5023 
5024     /* Parse image creation options */
5025     extended_l2 = qemu_opt_get_bool_del(opts, BLOCK_OPT_EXTL2, false);
5026 
5027     cluster_size = qcow2_opt_get_cluster_size_del(opts, extended_l2,
5028                                                   &local_err);
5029     if (local_err) {
5030         goto err;
5031     }
5032 
5033     version = qcow2_opt_get_version_del(opts, &local_err);
5034     if (local_err) {
5035         goto err;
5036     }
5037 
5038     refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err);
5039     if (local_err) {
5040         goto err;
5041     }
5042 
5043     optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
5044     prealloc = qapi_enum_parse(&PreallocMode_lookup, optstr,
5045                                PREALLOC_MODE_OFF, &local_err);
5046     g_free(optstr);
5047     if (local_err) {
5048         goto err;
5049     }
5050 
5051     optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
5052     has_backing_file = !!optstr;
5053     g_free(optstr);
5054 
5055     optstr = qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT_FORMAT);
5056     has_luks = optstr && strcmp(optstr, "luks") == 0;
5057     g_free(optstr);
5058 
5059     if (has_luks) {
5060         g_autoptr(QCryptoBlockCreateOptions) create_opts = NULL;
5061         QDict *cryptoopts = qcow2_extract_crypto_opts(opts, "luks", errp);
5062         size_t headerlen;
5063 
5064         create_opts = block_crypto_create_opts_init(cryptoopts, errp);
5065         qobject_unref(cryptoopts);
5066         if (!create_opts) {
5067             goto err;
5068         }
5069 
5070         if (!qcrypto_block_calculate_payload_offset(create_opts,
5071                                                     "encrypt.",
5072                                                     &headerlen,
5073                                                     &local_err)) {
5074             goto err;
5075         }
5076 
5077         luks_payload_size = ROUND_UP(headerlen, cluster_size);
5078     }
5079 
5080     virtual_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
5081     virtual_size = ROUND_UP(virtual_size, cluster_size);
5082 
5083     /* Check that virtual disk size is valid */
5084     l2e_size = extended_l2 ? L2E_SIZE_EXTENDED : L2E_SIZE_NORMAL;
5085     l2_tables = DIV_ROUND_UP(virtual_size / cluster_size,
5086                              cluster_size / l2e_size);
5087     if (l2_tables * L1E_SIZE > QCOW_MAX_L1_SIZE) {
5088         error_setg(&local_err, "The image size is too large "
5089                                "(try using a larger cluster size)");
5090         goto err;
5091     }
5092 
5093     /* Account for input image */
5094     if (in_bs) {
5095         int64_t ssize = bdrv_getlength(in_bs);
5096         if (ssize < 0) {
5097             error_setg_errno(&local_err, -ssize,
5098                              "Unable to get image virtual_size");
5099             goto err;
5100         }
5101 
5102         virtual_size = ROUND_UP(ssize, cluster_size);
5103 
5104         if (has_backing_file) {
5105             /* We don't how much of the backing chain is shared by the input
5106              * image and the new image file.  In the worst case the new image's
5107              * backing file has nothing in common with the input image.  Be
5108              * conservative and assume all clusters need to be written.
5109              */
5110             required = virtual_size;
5111         } else {
5112             int64_t offset;
5113             int64_t pnum = 0;
5114 
5115             for (offset = 0; offset < ssize; offset += pnum) {
5116                 int ret;
5117 
5118                 ret = bdrv_block_status_above(in_bs, NULL, offset,
5119                                               ssize - offset, &pnum, NULL,
5120                                               NULL);
5121                 if (ret < 0) {
5122                     error_setg_errno(&local_err, -ret,
5123                                      "Unable to get block status");
5124                     goto err;
5125                 }
5126 
5127                 if (ret & BDRV_BLOCK_ZERO) {
5128                     /* Skip zero regions (safe with no backing file) */
5129                 } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) ==
5130                            (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) {
5131                     /* Extend pnum to end of cluster for next iteration */
5132                     pnum = ROUND_UP(offset + pnum, cluster_size) - offset;
5133 
5134                     /* Count clusters we've seen */
5135                     required += offset % cluster_size + pnum;
5136                 }
5137             }
5138         }
5139     }
5140 
5141     /* Take into account preallocation.  Nothing special is needed for
5142      * PREALLOC_MODE_METADATA since metadata is always counted.
5143      */
5144     if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
5145         required = virtual_size;
5146     }
5147 
5148     info = g_new0(BlockMeasureInfo, 1);
5149     info->fully_allocated = luks_payload_size +
5150         qcow2_calc_prealloc_size(virtual_size, cluster_size,
5151                                  ctz32(refcount_bits), extended_l2);
5152 
5153     /*
5154      * Remove data clusters that are not required.  This overestimates the
5155      * required size because metadata needed for the fully allocated file is
5156      * still counted.  Show bitmaps only if both source and destination
5157      * would support them.
5158      */
5159     info->required = info->fully_allocated - virtual_size + required;
5160     info->has_bitmaps = version >= 3 && in_bs &&
5161         bdrv_supports_persistent_dirty_bitmap(in_bs);
5162     if (info->has_bitmaps) {
5163         info->bitmaps = qcow2_get_persistent_dirty_bitmap_size(in_bs,
5164                                                                cluster_size);
5165     }
5166     return info;
5167 
5168 err:
5169     error_propagate(errp, local_err);
5170     return NULL;
5171 }
5172 
5173 static int coroutine_fn
5174 qcow2_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
5175 {
5176     BDRVQcow2State *s = bs->opaque;
5177     bdi->cluster_size = s->cluster_size;
5178     bdi->vm_state_offset = qcow2_vm_state_offset(s);
5179     bdi->is_dirty = s->incompatible_features & QCOW2_INCOMPAT_DIRTY;
5180     return 0;
5181 }
5182 
5183 static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs,
5184                                                   Error **errp)
5185 {
5186     BDRVQcow2State *s = bs->opaque;
5187     ImageInfoSpecific *spec_info;
5188     QCryptoBlockInfo *encrypt_info = NULL;
5189 
5190     if (s->crypto != NULL) {
5191         encrypt_info = qcrypto_block_get_info(s->crypto, errp);
5192         if (!encrypt_info) {
5193             return NULL;
5194         }
5195     }
5196 
5197     spec_info = g_new(ImageInfoSpecific, 1);
5198     *spec_info = (ImageInfoSpecific){
5199         .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
5200         .u.qcow2.data = g_new0(ImageInfoSpecificQCow2, 1),
5201     };
5202     if (s->qcow_version == 2) {
5203         *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
5204             .compat             = g_strdup("0.10"),
5205             .refcount_bits      = s->refcount_bits,
5206         };
5207     } else if (s->qcow_version == 3) {
5208         Qcow2BitmapInfoList *bitmaps;
5209         if (!qcow2_get_bitmap_info_list(bs, &bitmaps, errp)) {
5210             qapi_free_ImageInfoSpecific(spec_info);
5211             qapi_free_QCryptoBlockInfo(encrypt_info);
5212             return NULL;
5213         }
5214         *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
5215             .compat             = g_strdup("1.1"),
5216             .lazy_refcounts     = s->compatible_features &
5217                                   QCOW2_COMPAT_LAZY_REFCOUNTS,
5218             .has_lazy_refcounts = true,
5219             .corrupt            = s->incompatible_features &
5220                                   QCOW2_INCOMPAT_CORRUPT,
5221             .has_corrupt        = true,
5222             .has_extended_l2    = true,
5223             .extended_l2        = has_subclusters(s),
5224             .refcount_bits      = s->refcount_bits,
5225             .has_bitmaps        = !!bitmaps,
5226             .bitmaps            = bitmaps,
5227             .data_file          = g_strdup(s->image_data_file),
5228             .has_data_file_raw  = has_data_file(bs),
5229             .data_file_raw      = data_file_is_raw(bs),
5230             .compression_type   = s->compression_type,
5231         };
5232     } else {
5233         /* if this assertion fails, this probably means a new version was
5234          * added without having it covered here */
5235         assert(false);
5236     }
5237 
5238     if (encrypt_info) {
5239         ImageInfoSpecificQCow2Encryption *qencrypt =
5240             g_new(ImageInfoSpecificQCow2Encryption, 1);
5241         switch (encrypt_info->format) {
5242         case Q_CRYPTO_BLOCK_FORMAT_QCOW:
5243             qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES;
5244             break;
5245         case Q_CRYPTO_BLOCK_FORMAT_LUKS:
5246             qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS;
5247             qencrypt->u.luks = encrypt_info->u.luks;
5248             break;
5249         default:
5250             abort();
5251         }
5252         /* Since we did shallow copy above, erase any pointers
5253          * in the original info */
5254         memset(&encrypt_info->u, 0, sizeof(encrypt_info->u));
5255         qapi_free_QCryptoBlockInfo(encrypt_info);
5256 
5257         spec_info->u.qcow2.data->encrypt = qencrypt;
5258     }
5259 
5260     return spec_info;
5261 }
5262 
5263 static int qcow2_has_zero_init(BlockDriverState *bs)
5264 {
5265     BDRVQcow2State *s = bs->opaque;
5266     bool preallocated;
5267 
5268     if (qemu_in_coroutine()) {
5269         qemu_co_mutex_lock(&s->lock);
5270     }
5271     /*
5272      * Check preallocation status: Preallocated images have all L2
5273      * tables allocated, nonpreallocated images have none.  It is
5274      * therefore enough to check the first one.
5275      */
5276     preallocated = s->l1_size > 0 && s->l1_table[0] != 0;
5277     if (qemu_in_coroutine()) {
5278         qemu_co_mutex_unlock(&s->lock);
5279     }
5280 
5281     if (!preallocated) {
5282         return 1;
5283     } else if (bs->encrypted) {
5284         return 0;
5285     } else {
5286         return bdrv_has_zero_init(s->data_file->bs);
5287     }
5288 }
5289 
5290 /*
5291  * Check the request to vmstate. On success return
5292  *      qcow2_vm_state_offset(bs) + @pos
5293  */
5294 static int64_t qcow2_check_vmstate_request(BlockDriverState *bs,
5295                                            QEMUIOVector *qiov, int64_t pos)
5296 {
5297     BDRVQcow2State *s = bs->opaque;
5298     int64_t vmstate_offset = qcow2_vm_state_offset(s);
5299     int ret;
5300 
5301     /* Incoming requests must be OK */
5302     bdrv_check_qiov_request(pos, qiov->size, qiov, 0, &error_abort);
5303 
5304     if (INT64_MAX - pos < vmstate_offset) {
5305         return -EIO;
5306     }
5307 
5308     pos += vmstate_offset;
5309     ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
5310     if (ret < 0) {
5311         return ret;
5312     }
5313 
5314     return pos;
5315 }
5316 
5317 static int coroutine_fn GRAPH_RDLOCK
5318 qcow2_co_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
5319 {
5320     int64_t offset = qcow2_check_vmstate_request(bs, qiov, pos);
5321     if (offset < 0) {
5322         return offset;
5323     }
5324 
5325     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
5326     return bs->drv->bdrv_co_pwritev_part(bs, offset, qiov->size, qiov, 0, 0);
5327 }
5328 
5329 static int coroutine_fn GRAPH_RDLOCK
5330 qcow2_co_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
5331 {
5332     int64_t offset = qcow2_check_vmstate_request(bs, qiov, pos);
5333     if (offset < 0) {
5334         return offset;
5335     }
5336 
5337     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
5338     return bs->drv->bdrv_co_preadv_part(bs, offset, qiov->size, qiov, 0, 0);
5339 }
5340 
5341 static int qcow2_has_compressed_clusters(BlockDriverState *bs)
5342 {
5343     int64_t offset = 0;
5344     int64_t bytes = bdrv_getlength(bs);
5345 
5346     if (bytes < 0) {
5347         return bytes;
5348     }
5349 
5350     while (bytes != 0) {
5351         int ret;
5352         QCow2SubclusterType type;
5353         unsigned int cur_bytes = MIN(INT_MAX, bytes);
5354         uint64_t host_offset;
5355 
5356         ret = qcow2_get_host_offset(bs, offset, &cur_bytes, &host_offset,
5357                                     &type);
5358         if (ret < 0) {
5359             return ret;
5360         }
5361 
5362         if (type == QCOW2_SUBCLUSTER_COMPRESSED) {
5363             return 1;
5364         }
5365 
5366         offset += cur_bytes;
5367         bytes -= cur_bytes;
5368     }
5369 
5370     return 0;
5371 }
5372 
5373 /*
5374  * Downgrades an image's version. To achieve this, any incompatible features
5375  * have to be removed.
5376  */
5377 static int qcow2_downgrade(BlockDriverState *bs, int target_version,
5378                            BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
5379                            Error **errp)
5380 {
5381     BDRVQcow2State *s = bs->opaque;
5382     int current_version = s->qcow_version;
5383     int ret;
5384     int i;
5385 
5386     /* This is qcow2_downgrade(), not qcow2_upgrade() */
5387     assert(target_version < current_version);
5388 
5389     /* There are no other versions (now) that you can downgrade to */
5390     assert(target_version == 2);
5391 
5392     if (s->refcount_order != 4) {
5393         error_setg(errp, "compat=0.10 requires refcount_bits=16");
5394         return -ENOTSUP;
5395     }
5396 
5397     if (has_data_file(bs)) {
5398         error_setg(errp, "Cannot downgrade an image with a data file");
5399         return -ENOTSUP;
5400     }
5401 
5402     /*
5403      * If any internal snapshot has a different size than the current
5404      * image size, or VM state size that exceeds 32 bits, downgrading
5405      * is unsafe.  Even though we would still use v3-compliant output
5406      * to preserve that data, other v2 programs might not realize
5407      * those optional fields are important.
5408      */
5409     for (i = 0; i < s->nb_snapshots; i++) {
5410         if (s->snapshots[i].vm_state_size > UINT32_MAX ||
5411             s->snapshots[i].disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) {
5412             error_setg(errp, "Internal snapshots prevent downgrade of image");
5413             return -ENOTSUP;
5414         }
5415     }
5416 
5417     /* clear incompatible features */
5418     if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
5419         ret = qcow2_mark_clean(bs);
5420         if (ret < 0) {
5421             error_setg_errno(errp, -ret, "Failed to make the image clean");
5422             return ret;
5423         }
5424     }
5425 
5426     /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in
5427      * the first place; if that happens nonetheless, returning -ENOTSUP is the
5428      * best thing to do anyway */
5429 
5430     if (s->incompatible_features & ~QCOW2_INCOMPAT_COMPRESSION) {
5431         error_setg(errp, "Cannot downgrade an image with incompatible features "
5432                    "0x%" PRIx64 " set",
5433                    s->incompatible_features & ~QCOW2_INCOMPAT_COMPRESSION);
5434         return -ENOTSUP;
5435     }
5436 
5437     /* since we can ignore compatible features, we can set them to 0 as well */
5438     s->compatible_features = 0;
5439     /* if lazy refcounts have been used, they have already been fixed through
5440      * clearing the dirty flag */
5441 
5442     /* clearing autoclear features is trivial */
5443     s->autoclear_features = 0;
5444 
5445     ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque);
5446     if (ret < 0) {
5447         error_setg_errno(errp, -ret, "Failed to turn zero into data clusters");
5448         return ret;
5449     }
5450 
5451     if (s->incompatible_features & QCOW2_INCOMPAT_COMPRESSION) {
5452         ret = qcow2_has_compressed_clusters(bs);
5453         if (ret < 0) {
5454             error_setg(errp, "Failed to check block status");
5455             return -EINVAL;
5456         }
5457         if (ret) {
5458             error_setg(errp, "Cannot downgrade an image with zstd compression "
5459                        "type and existing compressed clusters");
5460             return -ENOTSUP;
5461         }
5462         /*
5463          * No compressed clusters for now, so just chose default zlib
5464          * compression.
5465          */
5466         s->incompatible_features &= ~QCOW2_INCOMPAT_COMPRESSION;
5467         s->compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
5468     }
5469 
5470     assert(s->incompatible_features == 0);
5471 
5472     s->qcow_version = target_version;
5473     ret = qcow2_update_header(bs);
5474     if (ret < 0) {
5475         s->qcow_version = current_version;
5476         error_setg_errno(errp, -ret, "Failed to update the image header");
5477         return ret;
5478     }
5479     return 0;
5480 }
5481 
5482 /*
5483  * Upgrades an image's version.  While newer versions encompass all
5484  * features of older versions, some things may have to be presented
5485  * differently.
5486  */
5487 static int qcow2_upgrade(BlockDriverState *bs, int target_version,
5488                          BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
5489                          Error **errp)
5490 {
5491     BDRVQcow2State *s = bs->opaque;
5492     bool need_snapshot_update;
5493     int current_version = s->qcow_version;
5494     int i;
5495     int ret;
5496 
5497     /* This is qcow2_upgrade(), not qcow2_downgrade() */
5498     assert(target_version > current_version);
5499 
5500     /* There are no other versions (yet) that you can upgrade to */
5501     assert(target_version == 3);
5502 
5503     status_cb(bs, 0, 2, cb_opaque);
5504 
5505     /*
5506      * In v2, snapshots do not need to have extra data.  v3 requires
5507      * the 64-bit VM state size and the virtual disk size to be
5508      * present.
5509      * qcow2_write_snapshots() will always write the list in the
5510      * v3-compliant format.
5511      */
5512     need_snapshot_update = false;
5513     for (i = 0; i < s->nb_snapshots; i++) {
5514         if (s->snapshots[i].extra_data_size <
5515             sizeof_field(QCowSnapshotExtraData, vm_state_size_large) +
5516             sizeof_field(QCowSnapshotExtraData, disk_size))
5517         {
5518             need_snapshot_update = true;
5519             break;
5520         }
5521     }
5522     if (need_snapshot_update) {
5523         ret = qcow2_write_snapshots(bs);
5524         if (ret < 0) {
5525             error_setg_errno(errp, -ret, "Failed to update the snapshot table");
5526             return ret;
5527         }
5528     }
5529     status_cb(bs, 1, 2, cb_opaque);
5530 
5531     s->qcow_version = target_version;
5532     ret = qcow2_update_header(bs);
5533     if (ret < 0) {
5534         s->qcow_version = current_version;
5535         error_setg_errno(errp, -ret, "Failed to update the image header");
5536         return ret;
5537     }
5538     status_cb(bs, 2, 2, cb_opaque);
5539 
5540     return 0;
5541 }
5542 
/*
 * Identifies which amend step is currently running, so that
 * qcow2_amend_helper_cb() can tell when one step ends and the next begins.
 */
typedef enum Qcow2AmendOperation {
    /* This is the value Qcow2AmendHelperCBInfo::last_operation will be
     * statically initialized to so that the helper CB can discern the first
     * invocation from an operation change */
    QCOW2_NO_OPERATION = 0,

    /* One value per step; qcow2_amend_options() stores the matching value in
     * Qcow2AmendHelperCBInfo::current_operation before starting the step */
    QCOW2_UPGRADING,
    QCOW2_UPDATING_ENCRYPTION,
    QCOW2_CHANGING_REFCOUNT_ORDER,
    QCOW2_DOWNGRADING,
} Qcow2AmendOperation;
5554 
/*
 * State shared between qcow2_amend_options() and qcow2_amend_helper_cb(),
 * used to fold the progress of several amend steps into a single progress
 * report for the caller's status callback.
 */
typedef struct Qcow2AmendHelperCBInfo {
    /* The code coordinating the amend operations should only modify
     * these four fields; the rest will be managed by the CB */

    /* Caller-supplied callback and its opaque argument, invoked by the CB */
    BlockDriverAmendStatusCB *original_status_cb;
    void *original_cb_opaque;

    /* Step that is about to run or is running */
    Qcow2AmendOperation current_operation;

    /* Total number of operations to perform (only set once) */
    int total_operations;

    /* The following fields are managed by the CB */

    /* Number of operations completed */
    int operations_completed;

    /* Cumulative offset of all completed operations */
    int64_t offset_completed;

    /* Operation seen on the previous CB invocation and the work size it
     * reported then */
    Qcow2AmendOperation last_operation;
    int64_t last_work_size;
} Qcow2AmendHelperCBInfo;
5577 
5578 static void qcow2_amend_helper_cb(BlockDriverState *bs,
5579                                   int64_t operation_offset,
5580                                   int64_t operation_work_size, void *opaque)
5581 {
5582     Qcow2AmendHelperCBInfo *info = opaque;
5583     int64_t current_work_size;
5584     int64_t projected_work_size;
5585 
5586     if (info->current_operation != info->last_operation) {
5587         if (info->last_operation != QCOW2_NO_OPERATION) {
5588             info->offset_completed += info->last_work_size;
5589             info->operations_completed++;
5590         }
5591 
5592         info->last_operation = info->current_operation;
5593     }
5594 
5595     assert(info->total_operations > 0);
5596     assert(info->operations_completed < info->total_operations);
5597 
5598     info->last_work_size = operation_work_size;
5599 
5600     current_work_size = info->offset_completed + operation_work_size;
5601 
5602     /* current_work_size is the total work size for (operations_completed + 1)
5603      * operations (which includes this one), so multiply it by the number of
5604      * operations not covered and divide it by the number of operations
5605      * covered to get a projection for the operations not covered */
5606     projected_work_size = current_work_size * (info->total_operations -
5607                                                info->operations_completed - 1)
5608                                             / (info->operations_completed + 1);
5609 
5610     info->original_status_cb(bs, info->offset_completed + operation_offset,
5611                              current_work_size + projected_work_size,
5612                              info->original_cb_opaque);
5613 }
5614 
/*
 * Amend the options of an existing qcow2 image.
 *
 * Only options that are explicitly present in @opts are considered; all
 * others keep their current value.  The steps run in a fixed order:
 * version upgrade, encryption update, refcount width change, data file
 * settings plus header update, backing file check, lazy refcounts,
 * resize, and finally version downgrade.
 *
 * @bs:        the qcow2 node to amend
 * @opts:      the options to apply
 * @status_cb: progress callback; the upgrade, refcount width and downgrade
 *             steps report through it via qcow2_amend_helper_cb()
 * @cb_opaque: opaque value for @status_cb
 * @force:     passed on to qcrypto_block_amend_options()
 * @errp:      set on failure
 *
 * Returns 0 on success and a negative errno on failure.  A failing step
 * returns immediately; steps that already completed are not undone here.
 */
static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
                               BlockDriverAmendStatusCB *status_cb,
                               void *cb_opaque,
                               bool force,
                               Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    /* Every target value starts out as the current one */
    int old_version = s->qcow_version, new_version = old_version;
    uint64_t new_size = 0;
    const char *backing_file = NULL, *backing_format = NULL, *data_file = NULL;
    bool lazy_refcounts = s->use_lazy_refcounts;
    bool data_file_raw = data_file_is_raw(bs);
    const char *compat = NULL;
    int refcount_bits = s->refcount_bits;
    int ret;
    QemuOptDesc *desc = opts->list->desc;
    Qcow2AmendHelperCBInfo helper_cb_info;
    bool encryption_update = false;

    /* First pass: collect and validate the requested values; nothing is
     * modified yet */
    while (desc && desc->name) {
        if (!qemu_opt_find(opts, desc->name)) {
            /* only change explicitly defined options */
            desc++;
            continue;
        }

        if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) {
            compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL);
            if (!compat) {
                /* preserve default */
            } else if (!strcmp(compat, "0.10") || !strcmp(compat, "v2")) {
                new_version = 2;
            } else if (!strcmp(compat, "1.1") || !strcmp(compat, "v3")) {
                new_version = 3;
            } else {
                error_setg(errp, "Unknown compatibility level %s", compat);
                return -EINVAL;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
            new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) {
            backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) {
            backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
        } else if (g_str_has_prefix(desc->name, "encrypt.")) {
            if (!s->crypto) {
                error_setg(errp,
                           "Can't amend encryption options - encryption not present");
                return -EINVAL;
            }
            if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
                error_setg(errp,
                           "Only LUKS encryption options can be amended");
                return -ENOTSUP;
            }
            /* The actual values are extracted from @opts further down */
            encryption_update = true;
        } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
            lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
                                               lazy_refcounts);
        } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
            refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS,
                                                refcount_bits);

            if (refcount_bits <= 0 || refcount_bits > 64 ||
                !is_power_of_2(refcount_bits))
            {
                error_setg(errp, "Refcount width must be a power of two and "
                           "may not exceed 64 bits");
                return -EINVAL;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_DATA_FILE)) {
            data_file = qemu_opt_get(opts, BLOCK_OPT_DATA_FILE);
            if (data_file && !has_data_file(bs)) {
                error_setg(errp, "data-file can only be set for images that "
                                 "use an external data file");
                return -EINVAL;
            }
        } else if (!strcmp(desc->name, BLOCK_OPT_DATA_FILE_RAW)) {
            data_file_raw = qemu_opt_get_bool(opts, BLOCK_OPT_DATA_FILE_RAW,
                                              data_file_raw);
            /* The flag may be kept or cleared, but not newly set */
            if (data_file_raw && !data_file_is_raw(bs)) {
                error_setg(errp, "data-file-raw cannot be set on existing "
                                 "images");
                return -EINVAL;
            }
        } else {
            /* if this point is reached, this probably means a new option was
             * added without having it covered here */
            abort();
        }

        desc++;
    }

    /* Each condition contributes 0 or 1; fields not named here (including
     * last_operation) are zero-initialized, i.e. QCOW2_NO_OPERATION */
    helper_cb_info = (Qcow2AmendHelperCBInfo){
        .original_status_cb = status_cb,
        .original_cb_opaque = cb_opaque,
        .total_operations = (new_version != old_version)
                          + (s->refcount_bits != refcount_bits) +
                            (encryption_update == true)
    };

    /* Upgrade first (some features may require compat=1.1) */
    if (new_version > old_version) {
        helper_cb_info.current_operation = QCOW2_UPGRADING;
        ret = qcow2_upgrade(bs, new_version, &qcow2_amend_helper_cb,
                            &helper_cb_info, errp);
        if (ret < 0) {
            return ret;
        }
    }

    if (encryption_update) {
        QDict *amend_opts_dict;
        QCryptoBlockAmendOptions *amend_opts;

        helper_cb_info.current_operation = QCOW2_UPDATING_ENCRYPTION;
        amend_opts_dict = qcow2_extract_crypto_opts(opts, "luks", errp);
        if (!amend_opts_dict) {
            return -EINVAL;
        }
        amend_opts = block_crypto_amend_opts_init(amend_opts_dict, errp);
        qobject_unref(amend_opts_dict);
        if (!amend_opts) {
            return -EINVAL;
        }
        ret = qcrypto_block_amend_options(s->crypto,
                                          qcow2_crypto_hdr_read_func,
                                          qcow2_crypto_hdr_write_func,
                                          bs,
                                          amend_opts,
                                          force,
                                          errp);
        qapi_free_QCryptoBlockAmendOptions(amend_opts);
        if (ret < 0) {
            return ret;
        }
    }

    if (s->refcount_bits != refcount_bits) {
        /* refcount_bits is a power of two here, so this yields log2 of it */
        int refcount_order = ctz32(refcount_bits);

        if (new_version < 3 && refcount_bits != 16) {
            error_setg(errp, "Refcount widths other than 16 bits require "
                       "compatibility level 1.1 or above (use compat=1.1 or "
                       "greater)");
            return -EINVAL;
        }

        helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER;
        ret = qcow2_change_refcount_order(bs, refcount_order,
                                          &qcow2_amend_helper_cb,
                                          &helper_cb_info, errp);
        if (ret < 0) {
            return ret;
        }
    }

    /* data-file-raw blocks backing files, so clear it first if requested */
    if (data_file_raw) {
        s->autoclear_features |= QCOW2_AUTOCLEAR_DATA_FILE_RAW;
    } else {
        s->autoclear_features &= ~QCOW2_AUTOCLEAR_DATA_FILE_RAW;
    }

    if (data_file) {
        /* An empty string drops the stored data file name */
        g_free(s->image_data_file);
        s->image_data_file = *data_file ? g_strdup(data_file) : NULL;
    }

    /* Persist the data file settings changed above */
    ret = qcow2_update_header(bs);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to update the image header");
        return ret;
    }

    /* The backing file options are accepted only if they match what the
     * image already has; they cannot be changed through amend */
    if (backing_file || backing_format) {
        if (g_strcmp0(backing_file, s->image_backing_file) ||
            g_strcmp0(backing_format, s->image_backing_format)) {
            error_setg(errp, "Cannot amend the backing file");
            error_append_hint(errp,
                              "You can use 'qemu-img rebase' instead.\n");
            return -EINVAL;
        }
    }

    if (s->use_lazy_refcounts != lazy_refcounts) {
        if (lazy_refcounts) {
            if (new_version < 3) {
                error_setg(errp, "Lazy refcounts only supported with "
                           "compatibility level 1.1 and above (use compat=1.1 "
                           "or greater)");
                return -EINVAL;
            }
            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                /* Roll back the in-memory flag, the header was not updated */
                s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
                error_setg_errno(errp, -ret, "Failed to update the image header");
                return ret;
            }
            s->use_lazy_refcounts = true;
        } else {
            /* make image clean first */
            ret = qcow2_mark_clean(bs);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to make the image clean");
                return ret;
            }
            /* now disallow lazy refcounts */
            s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
            ret = qcow2_update_header(bs);
            if (ret < 0) {
                /* Roll back the in-memory flag, the header was not updated */
                s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
                error_setg_errno(errp, -ret, "Failed to update the image header");
                return ret;
            }
            s->use_lazy_refcounts = false;
        }
    }

    /* A size of 0 (also the value when the option is absent) means that
     * no resize takes place */
    if (new_size) {
        BlockBackend *blk = blk_new_with_bs(bs, BLK_PERM_RESIZE, BLK_PERM_ALL,
                                            errp);
        if (!blk) {
            return -EPERM;
        }

        /*
         * Amending image options should ensure that the image has
         * exactly the given new values, so pass exact=true here.
         */
        ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, 0, errp);
        blk_unref(blk);
        if (ret < 0) {
            return ret;
        }
    }

    /* Downgrade last (so unsupported features can be removed before) */
    if (new_version < old_version) {
        helper_cb_info.current_operation = QCOW2_DOWNGRADING;
        ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb,
                              &helper_cb_info, errp);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}
5866 
5867 static int coroutine_fn qcow2_co_amend(BlockDriverState *bs,
5868                                        BlockdevAmendOptions *opts,
5869                                        bool force,
5870                                        Error **errp)
5871 {
5872     BlockdevAmendOptionsQcow2 *qopts = &opts->u.qcow2;
5873     BDRVQcow2State *s = bs->opaque;
5874     int ret = 0;
5875 
5876     if (qopts->encrypt) {
5877         if (!s->crypto) {
5878             error_setg(errp, "image is not encrypted, can't amend");
5879             return -EOPNOTSUPP;
5880         }
5881 
5882         if (qopts->encrypt->format != Q_CRYPTO_BLOCK_FORMAT_LUKS) {
5883             error_setg(errp,
5884                        "Amend can't be used to change the qcow2 encryption format");
5885             return -EOPNOTSUPP;
5886         }
5887 
5888         if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
5889             error_setg(errp,
5890                        "Only LUKS encryption options can be amended for qcow2 with blockdev-amend");
5891             return -EOPNOTSUPP;
5892         }
5893 
5894         ret = qcrypto_block_amend_options(s->crypto,
5895                                           qcow2_crypto_hdr_read_func,
5896                                           qcow2_crypto_hdr_write_func,
5897                                           bs,
5898                                           qopts->encrypt,
5899                                           force,
5900                                           errp);
5901     }
5902     return ret;
5903 }
5904 
5905 /*
5906  * If offset or size are negative, respectively, they will not be included in
5907  * the BLOCK_IMAGE_CORRUPTED event emitted.
5908  * fatal will be ignored for read-only BDS; corruptions found there will always
5909  * be considered non-fatal.
5910  */
5911 void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
5912                              int64_t size, const char *message_format, ...)
5913 {
5914     BDRVQcow2State *s = bs->opaque;
5915     const char *node_name;
5916     char *message;
5917     va_list ap;
5918 
5919     fatal = fatal && bdrv_is_writable(bs);
5920 
5921     if (s->signaled_corruption &&
5922         (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
5923     {
5924         return;
5925     }
5926 
5927     va_start(ap, message_format);
5928     message = g_strdup_vprintf(message_format, ap);
5929     va_end(ap);
5930 
5931     if (fatal) {
5932         fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
5933                 "corruption events will be suppressed\n", message);
5934     } else {
5935         fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
5936                 "corruption events will be suppressed\n", message);
5937     }
5938 
5939     node_name = bdrv_get_node_name(bs);
5940     qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
5941                                           *node_name ? node_name : NULL,
5942                                           message, offset >= 0, offset,
5943                                           size >= 0, size,
5944                                           fatal);
5945     g_free(message);
5946 
5947     if (fatal) {
5948         qcow2_mark_corrupt(bs);
5949         bs->drv = NULL; /* make BDS unusable */
5950     }
5951 
5952     s->signaled_corruption = true;
5953 }
5954 
/*
 * Option descriptors used in the .desc arrays of both qcow2_create_opts
 * and qcow2_amend_opts: virtual size, compatibility level, backing file
 * and backing format, external data file (and its raw flag), lazy
 * refcounts and refcount entry width.
 *
 * Expands to a comma-separated run of initializers with no trailing comma,
 * so the user has to add one before the next array element.
 */
#define QCOW_COMMON_OPTIONS                                         \
    {                                                               \
        .name = BLOCK_OPT_SIZE,                                     \
        .type = QEMU_OPT_SIZE,                                      \
        .help = "Virtual disk size"                                 \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_COMPAT_LEVEL,                             \
        .type = QEMU_OPT_STRING,                                    \
        .help = "Compatibility level (v2 [0.10] or v3 [1.1])"       \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_BACKING_FILE,                             \
        .type = QEMU_OPT_STRING,                                    \
        .help = "File name of a base image"                         \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_BACKING_FMT,                              \
        .type = QEMU_OPT_STRING,                                    \
        .help = "Image format of the base image"                    \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_DATA_FILE,                                \
        .type = QEMU_OPT_STRING,                                    \
        .help = "File name of an external data file"                \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_DATA_FILE_RAW,                            \
        .type = QEMU_OPT_BOOL,                                      \
        .help = "The external data file must stay valid "           \
                "as a raw image"                                    \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_LAZY_REFCOUNTS,                           \
        .type = QEMU_OPT_BOOL,                                      \
        .help = "Postpone refcount updates",                        \
        .def_value_str = "off"                                      \
    },                                                              \
    {                                                               \
        .name = BLOCK_OPT_REFCOUNT_BITS,                            \
        .type = QEMU_OPT_NUMBER,                                    \
        .help = "Width of a reference count entry in bits",         \
        .def_value_str = "16"                                       \
    }
5999 
6000 static QemuOptsList qcow2_create_opts = {
6001     .name = "qcow2-create-opts",
6002     .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
6003     .desc = {
6004         {                                                               \
6005             .name = BLOCK_OPT_ENCRYPT,                                  \
6006             .type = QEMU_OPT_BOOL,                                      \
6007             .help = "Encrypt the image with format 'aes'. (Deprecated " \
6008                     "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",    \
6009         },                                                              \
6010         {                                                               \
6011             .name = BLOCK_OPT_ENCRYPT_FORMAT,                           \
6012             .type = QEMU_OPT_STRING,                                    \
6013             .help = "Encrypt the image, format choices: 'aes', 'luks'", \
6014         },                                                              \
6015         BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",                     \
6016             "ID of secret providing qcow AES key or LUKS passphrase"),  \
6017         BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG("encrypt."),               \
6018         BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE("encrypt."),              \
6019         BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG("encrypt."),                \
6020         BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG("encrypt."),           \
6021         BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG("encrypt."),                 \
6022         BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),                \
6023         {                                                               \
6024             .name = BLOCK_OPT_CLUSTER_SIZE,                             \
6025             .type = QEMU_OPT_SIZE,                                      \
6026             .help = "qcow2 cluster size",                               \
6027             .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)            \
6028         },                                                              \
6029         {                                                               \
6030             .name = BLOCK_OPT_EXTL2,                                    \
6031             .type = QEMU_OPT_BOOL,                                      \
6032             .help = "Extended L2 tables",                               \
6033             .def_value_str = "off"                                      \
6034         },                                                              \
6035         {                                                               \
6036             .name = BLOCK_OPT_PREALLOC,                                 \
6037             .type = QEMU_OPT_STRING,                                    \
6038             .help = "Preallocation mode (allowed values: off, "         \
6039                     "metadata, falloc, full)"                           \
6040         },                                                              \
6041         {                                                               \
6042             .name = BLOCK_OPT_COMPRESSION_TYPE,                         \
6043             .type = QEMU_OPT_STRING,                                    \
6044             .help = "Compression method used for image cluster "        \
6045                     "compression",                                      \
6046             .def_value_str = "zlib"                                     \
6047         },
6048         QCOW_COMMON_OPTIONS,
6049         { /* end of list */ }
6050     }
6051 };
6052 
/*
 * Options understood when amending an existing qcow2 image (referenced
 * from bdrv_qcow2.amend_opts).
 *
 * Contains LUKS option definitions under the "encrypt." prefix (state,
 * keyslot, old secret, new secret, iteration time) followed by
 * QCOW_COMMON_OPTIONS.
 */
static QemuOptsList qcow2_amend_opts = {
    .name = "qcow2-amend-opts",
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_amend_opts.head),
    .desc = {
        BLOCK_CRYPTO_OPT_DEF_LUKS_STATE("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_KEYSLOT("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_OLD_SECRET("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_NEW_SECRET("encrypt."),
        BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),
        QCOW_COMMON_OPTIONS,
        { /* end of list */ }
    }
};
6066 
/*
 * NULL-terminated list of option names handed to the block layer through
 * bdrv_qcow2.strong_runtime_opts.  The only entry is the encryption key
 * secret under the "encrypt." prefix.
 *
 * NOTE(review): the exact meaning of "strong" runtime options is defined
 * by the generic block layer, not here -- confirm there before adding
 * entries.
 */
static const char *const qcow2_strong_runtime_opts[] = {
    "encrypt." BLOCK_CRYPTO_OPT_QCOW_KEY_SECRET,

    NULL
};
6072 
/*
 * Driver description for the "qcow2" format: wires the qcow2_* callbacks
 * of this file into the generic block layer's BlockDriver interface.
 * Registered by bdrv_qcow2_init().
 */
BlockDriver bdrv_qcow2 = {
    .format_name        = "qcow2",
    .instance_size      = sizeof(BDRVQcow2State),
    .bdrv_probe         = qcow2_probe,
    .bdrv_open          = qcow2_open,
    .bdrv_close         = qcow2_close,
    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
    .bdrv_reopen_commit   = qcow2_reopen_commit,
    .bdrv_reopen_commit_post = qcow2_reopen_commit_post,
    .bdrv_reopen_abort    = qcow2_reopen_abort,
    .bdrv_join_options    = qcow2_join_options,
    .bdrv_child_perm      = bdrv_default_perms,
    .bdrv_co_create_opts  = qcow2_co_create_opts,
    .bdrv_co_create       = qcow2_co_create,
    .bdrv_has_zero_init   = qcow2_has_zero_init,
    .bdrv_co_block_status = qcow2_co_block_status,

    /* Read, write and flush entry points */
    .bdrv_co_preadv_part    = qcow2_co_preadv_part,
    .bdrv_co_pwritev_part   = qcow2_co_pwritev_part,
    .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,

    /* Zero-write, discard, copy-range, truncate, compressed write */
    .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
    .bdrv_co_pdiscard       = qcow2_co_pdiscard,
    .bdrv_co_copy_range_from = qcow2_co_copy_range_from,
    .bdrv_co_copy_range_to  = qcow2_co_copy_range_to,
    .bdrv_co_truncate       = qcow2_co_truncate,
    .bdrv_co_pwritev_compressed_part = qcow2_co_pwritev_compressed_part,
    .bdrv_make_empty        = qcow2_make_empty,

    /* Snapshot operations, size measurement and image information */
    .bdrv_snapshot_create   = qcow2_snapshot_create,
    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
    .bdrv_snapshot_list     = qcow2_snapshot_list,
    .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp,
    .bdrv_measure           = qcow2_measure,
    .bdrv_co_get_info       = qcow2_co_get_info,
    .bdrv_get_specific_info = qcow2_get_specific_info,

    /* VM state save/load */
    .bdrv_co_save_vmstate   = qcow2_co_save_vmstate,
    .bdrv_co_load_vmstate   = qcow2_co_load_vmstate,

    .is_format                  = true,
    .supports_backing           = true,
    .bdrv_change_backing_file   = qcow2_change_backing_file,

    .bdrv_refresh_limits        = qcow2_refresh_limits,
    .bdrv_co_invalidate_cache   = qcow2_co_invalidate_cache,
    .bdrv_inactivate            = qcow2_inactivate,

    /* Option tables plus the check and amend entry points */
    .create_opts         = &qcow2_create_opts,
    .amend_opts          = &qcow2_amend_opts,
    .strong_runtime_opts = qcow2_strong_runtime_opts,
    .mutable_opts        = mutable_opts,
    .bdrv_co_check       = qcow2_co_check,
    .bdrv_amend_options  = qcow2_amend_options,
    .bdrv_co_amend       = qcow2_co_amend,

    .bdrv_detach_aio_context  = qcow2_detach_aio_context,
    .bdrv_attach_aio_context  = qcow2_attach_aio_context,

    /* Persistent dirty bitmap support */
    .bdrv_supports_persistent_dirty_bitmap =
            qcow2_supports_persistent_dirty_bitmap,
    .bdrv_co_can_store_new_dirty_bitmap = qcow2_co_can_store_new_dirty_bitmap,
    .bdrv_co_remove_persistent_dirty_bitmap =
            qcow2_co_remove_persistent_dirty_bitmap,
};
6139 
/*
 * Register the qcow2 driver (bdrv_qcow2) with the block layer.
 */
static void bdrv_qcow2_init(void)
{
    bdrv_register(&bdrv_qcow2);
}

/*
 * NOTE(review): block_init() presumably schedules bdrv_qcow2_init() to run
 * during block module initialisation -- confirm against qemu/module.h.
 */
block_init(bdrv_qcow2_init);
6146