xref: /qemu/block/qcow2-refcount.c (revision 85aad98a)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "block/qcow2.h"
30 #include "qemu/range.h"
31 #include "qemu/bswap.h"
32 
33 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
34 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
35                             int64_t offset, int64_t length, uint64_t addend,
36                             bool decrease, enum qcow2_discard_type type);
37 
38 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
39 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
40 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
41 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
42 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
43 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
44 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
45 
46 static void set_refcount_ro0(void *refcount_array, uint64_t index,
47                              uint64_t value);
48 static void set_refcount_ro1(void *refcount_array, uint64_t index,
49                              uint64_t value);
50 static void set_refcount_ro2(void *refcount_array, uint64_t index,
51                              uint64_t value);
52 static void set_refcount_ro3(void *refcount_array, uint64_t index,
53                              uint64_t value);
54 static void set_refcount_ro4(void *refcount_array, uint64_t index,
55                              uint64_t value);
56 static void set_refcount_ro5(void *refcount_array, uint64_t index,
57                              uint64_t value);
58 static void set_refcount_ro6(void *refcount_array, uint64_t index,
59                              uint64_t value);
60 
61 
/* Refcount-entry readers indexed by refcount_order (0..6, i.e. entry widths
 * of 1 to 64 bits); qcow2_refcount_init() selects the one matching
 * s->refcount_order. */
static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};
71 
/* Refcount-entry writers indexed by refcount_order; must stay in sync with
 * get_refcount_funcs above. */
static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};
81 
82 
83 /*********************************************************/
84 /* refcount handling */
85 
/*
 * Reads the refcount table from the image file into s->refcount_table,
 * converting the entries to host byte order, and installs the refcount
 * accessor functions matching the image's refcount entry width.
 *
 * Returns 0 on success, -errno on failure.
 */
int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    /* Only refcount orders 0..6 (1- to 64-bit entries) are defined */
    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    /* Guards the byte-size multiplication below against overflow */
    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        /* On-disk reftable entries are big-endian; convert in place */
        for(i = 0; i < s->refcount_table_size; i++)
            be64_to_cpus(&s->refcount_table[i]);
    }
    return 0;
 fail:
    return ret;
}
119 
120 void qcow2_refcount_close(BlockDriverState *bs)
121 {
122     BDRVQcow2State *s = bs->opaque;
123     g_free(s->refcount_table);
124 }
125 
126 
/*
 * Reads a 1-bit refcount entry (refcount_order = 0): eight entries are
 * packed into each byte, lowest-numbered entry in the least significant bit.
 */
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;
    unsigned int shift = index % 8;

    return (array[index / 8] >> shift) & 0x1;
}
131 
/*
 * Stores a 1-bit refcount entry (refcount_order = 0). @value must fit in a
 * single bit; the other seven entries sharing the byte are left untouched.
 */
static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;
    unsigned int shift = index % 8;

    assert(!(value >> 1));
    array[index / 8] = (array[index / 8] & ~(0x1 << shift)) | (value << shift);
}
139 
/*
 * Reads a 2-bit refcount entry (refcount_order = 1): four entries per byte,
 * lowest-numbered entry in the least significant bits.
 */
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;
    unsigned int shift = 2 * (index % 4);

    return (array[index / 4] >> shift) & 0x3;
}
145 
/*
 * Stores a 2-bit refcount entry (refcount_order = 1). @value must fit in
 * two bits; the other entries sharing the byte are preserved.
 */
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;
    unsigned int shift = 2 * (index % 4);

    assert(!(value >> 2));
    array[index / 4] = (array[index / 4] & ~(0x3 << shift)) | (value << shift);
}
153 
/*
 * Reads a 4-bit refcount entry (refcount_order = 2): two entries per byte,
 * even index in the low nibble, odd index in the high nibble.
 */
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;
    unsigned int shift = 4 * (index % 2);

    return (array[index / 2] >> shift) & 0xf;
}
159 
/*
 * Stores a 4-bit refcount entry (refcount_order = 2). @value must fit in a
 * nibble; the neighbouring nibble of the shared byte is preserved.
 */
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;
    unsigned int shift = 4 * (index % 2);

    assert(!(value >> 4));
    array[index / 2] = (array[index / 2] & ~(0xf << shift)) | (value << shift);
}
167 
/* Reads an 8-bit refcount entry (refcount_order = 3): one byte per entry. */
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    const uint8_t *array = refcount_array;

    return array[index];
}
172 
/*
 * Stores an 8-bit refcount entry (refcount_order = 3). @value must fit in
 * one byte.
 */
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *array = refcount_array;

    assert(!(value >> 8));
    array[index] = value;
}
179 
/* Reads a 16-bit refcount entry (refcount_order = 4); stored big-endian. */
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}
184 
/* Stores a 16-bit refcount entry (refcount_order = 4) in big-endian form;
 * @value must fit in 16 bits. */
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}
191 
/* Reads a 32-bit refcount entry (refcount_order = 5); stored big-endian. */
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}
196 
/* Stores a 32-bit refcount entry (refcount_order = 5) in big-endian form;
 * @value must fit in 32 bits. */
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}
203 
/* Reads a 64-bit refcount entry (refcount_order = 6); stored big-endian. */
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}
208 
/* Stores a 64-bit refcount entry (refcount_order = 6) in big-endian form;
 * no range assertion needed because any uint64_t value fits. */
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}
214 
215 
/*
 * Fetches the refcount block at @refcount_block_offset through the refcount
 * block cache and returns it in *refcount_block. The caller must release it
 * with qcow2_cache_put() when done (as e.g. qcow2_get_refcount() does).
 *
 * Returns 0 on success, -errno on failure.
 */
static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}
226 
/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 *
 * Clusters outside the current reftable, or covered by a refcount block that
 * has not been allocated, report an implicit refcount of 0.
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        /* Beyond the reftable: implicitly free */
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        /* Refcount block not allocated: implicitly free */
        *refcount = 0;
        return 0;
    }

    /* A misaligned refblock offset indicates image corruption */
    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);

    return 0;
}
272 
273 /*
274  * Rounds the refcount table size up to avoid growing the table for each single
275  * refcount block that is allocated.
276  */
277 static unsigned int next_refcount_table_size(BDRVQcow2State *s,
278     unsigned int min_size)
279 {
280     unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
281     unsigned int refcount_table_clusters =
282         MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
283 
284     while (min_clusters > refcount_table_clusters) {
285         refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
286     }
287 
288     return refcount_table_clusters << (s->cluster_bits - 3);
289 }
290 
291 
292 /* Checks if two offsets are described by the same refcount block */
293 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
294     uint64_t offset_b)
295 {
296     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
297     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
298 
299     return (block_a == block_b);
300 }
301 
/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case; may also return -EAGAIN
 * after allocating metadata to tell the caller to restart its search for
 * free clusters (see the comments below).
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

             return load_refcount_block(bs, refcount_block_offset,
                                        refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail_block;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail_block;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail_block;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail_block;
        }

        s->refcount_table[refcount_table_index] = new_block;

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* The resulting reftable would exceed the maximum allowed size */
    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
        return -EFBIG;
    }

    /* And now we need at least one block more for the new metadata */
    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
    uint64_t last_table_size;
    uint64_t blocks_clusters;
    /* Fixed-point iteration: growing the table may itself require more
     * refcount blocks, so recompute until the size is stable */
    do {
        uint64_t table_clusters =
            size_to_clusters(s, table_size * sizeof(uint64_t));
        blocks_clusters = 1 +
            DIV_ROUND_UP(table_clusters, s->refcount_block_size);
        uint64_t meta_clusters = table_clusters + blocks_clusters;

        last_table_size = table_size;
        table_size = next_refcount_table_size(s, blocks_used +
            DIV_ROUND_UP(meta_clusters, s->refcount_block_size));

    } while (last_table_size != table_size);

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
        s->refcount_table_size, table_size);
#endif

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;
    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
    uint64_t *new_table = g_try_new0(uint64_t, table_size);
    void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);

    assert(table_size > 0 && blocks_clusters > 0);
    if (new_table == NULL || new_blocks == NULL) {
        ret = -ENOMEM;
        goto fail_table;
    }

    /* Fill the new refcount table */
    memcpy(new_table, s->refcount_table,
        s->refcount_table_size * sizeof(uint64_t));
    new_table[refcount_table_index] = new_block;

    int i;
    for (i = 0; i < blocks_clusters; i++) {
        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
    }

    /* Fill the refcount blocks: every cluster of the new metadata (table
     * plus blocks) gets a refcount of 1 */
    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
    int block = 0;
    for (i = 0; i < table_clusters + blocks_clusters; i++) {
        s->set_refcount(new_blocks, block++, 1);
    }

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
        blocks_clusters * s->cluster_size);
    g_free(new_blocks);
    new_blocks = NULL;
    if (ret < 0) {
        goto fail_table;
    }

    /* Write refcount table to disk */
    for(i = 0; i < table_size; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail_table;
    }

    /* Convert back to host byte order since new_table is kept in memory */
    for(i = 0; i < table_size; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail_table;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail_table:
    g_free(new_blocks);
    g_free(new_table);
fail_block:
    if (*refcount_block != NULL) {
        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
    }
    return ret;
}
607 
/*
 * Issues all queued discard requests against the underlying file and empties
 * the queue. If @ret is negative (a previous operation failed), the queued
 * regions are dropped without being discarded.
 */
void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            bdrv_discard(bs->file->bs,
                         d->offset >> BDRV_SECTOR_BITS,
                         d->bytes >> BDRV_SECTOR_BITS);
        }

        g_free(d);
    }
}
626 
/*
 * Queues the byte range [offset, offset + length) on s->discards, merging it
 * with an adjacent queued region if possible. The queued requests are issued
 * later by qcow2_process_discards().
 */
static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    /* First try to extend an existing region that the new range touches */
    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    /* No adjacent region found; queue a new one */
    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
676 
/* XXX: cache several refcount block clusters ? */
/*
 * Adjusts the refcounts of all clusters in [offset, offset + length).
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * Returns 0 on success, -errno on failure. On failure, an attempt is made to
 * undo the partial update (see the tail of the function).
 */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                   int64_t offset,
                                                   int64_t length,
                                                   uint64_t addend,
                                                   bool decrease,
                                                   enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    /* When decreasing refcounts, make the refcount block cache depend on the
     * L2 table cache so L2 updates reach the disk first */
    if (decrease) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    /* Walk the affected range cluster by cluster */
    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                     refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
        /* Reject underflow below zero as well as overflow beyond the
         * maximum representable refcount for this entry width */
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        /* Remember the lowest freed cluster so the allocator can reuse it */
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0 && s->discard_passthrough[type]) {
            update_refcount_discard(bs, cluster_offset, s->cluster_size);
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try do undo any updates if an error is returned (This may succeed in
     * some cases like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}
784 
785 /*
786  * Increases or decreases the refcount of a given cluster.
787  *
788  * @addend is the absolute value of the addend; if @decrease is set, @addend
789  * will be subtracted from the current refcount, otherwise it will be added.
790  *
791  * On success 0 is returned; on failure -errno is returned.
792  */
793 int qcow2_update_cluster_refcount(BlockDriverState *bs,
794                                   int64_t cluster_index,
795                                   uint64_t addend, bool decrease,
796                                   enum qcow2_discard_type type)
797 {
798     BDRVQcow2State *s = bs->opaque;
799     int ret;
800 
801     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
802                           decrease, type);
803     if (ret < 0) {
804         return ret;
805     }
806 
807     return 0;
808 }
809 
810 
811 
812 /*********************************************************/
813 /* cluster allocation functions */
814 
815 
816 
/*
 * Finds a run of contiguous free clusters large enough for @size bytes,
 * without touching any refcounts, and returns the offset of its first
 * cluster. Returns < 0 (-errno) on error.
 */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
retry:
    /* Scan forward from free_cluster_index; any used cluster restarts the
     * search for a contiguous run of nb_clusters free clusters */
    for(i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in an int64_t */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}
857 
858 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
859 {
860     int64_t offset;
861     int ret;
862 
863     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
864     do {
865         offset = alloc_clusters_noref(bs, size);
866         if (offset < 0) {
867             return offset;
868         }
869 
870         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
871     } while (ret == -EAGAIN);
872 
873     if (ret < 0) {
874         return ret;
875     }
876 
877     return offset;
878 }
879 
/*
 * Tries to allocate up to @nb_clusters contiguous clusters at the fixed host
 * position @offset, stopping early at the first cluster that is already in
 * use.
 *
 * Returns the number of clusters actually allocated (possibly fewer than
 * @nb_clusters, possibly 0), or -errno on error.
 */
int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for(i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                /* Cluster already in use: only allocate the first i */
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
        /* On -EAGAIN the free range must be re-checked before retrying */
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}
916 
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size

   Sub-cluster allocations are packed back to back: s->free_byte_offset
   remembers where the last allocation ended inside a partially used
   cluster, so consecutive calls share clusters (each share bumping the
   cluster's refcount by one).

   Returns the host offset of the allocated bytes, or -errno on error. */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    /* free_byte_offset is either unset (0) or strictly inside a cluster */
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        /* If the refcount can't be bumped again, abandon this cluster and
         * force a fresh allocation below */
        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            /* Need (at least the start of) a new cluster */
            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                /* The new cluster is not adjacent to the current tail;
                 * start over at its beginning */
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                /* Adjacent cluster: extend the current run */
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            /* Retry from scratch on -EAGAIN; any other error is returned */
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    /* Remember where the next sub-cluster allocation may continue; reset to
     * "unset" when this allocation exactly filled the cluster */
    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
981 
982 void qcow2_free_clusters(BlockDriverState *bs,
983                           int64_t offset, int64_t size,
984                           enum qcow2_discard_type type)
985 {
986     int ret;
987 
988     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
989     ret = update_refcount(bs, offset, size, 1, true, type);
990     if (ret < 0) {
991         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
992         /* TODO Remember the clusters to free them later and avoid leaking */
993     }
994 }
995 
996 /*
997  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
998  * normal cluster, compressed cluster, etc.)
999  */
1000 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1001                              int nb_clusters, enum qcow2_discard_type type)
1002 {
1003     BDRVQcow2State *s = bs->opaque;
1004 
1005     switch (qcow2_get_cluster_type(l2_entry)) {
1006     case QCOW2_CLUSTER_COMPRESSED:
1007         {
1008             int nb_csectors;
1009             nb_csectors = ((l2_entry >> s->csize_shift) &
1010                            s->csize_mask) + 1;
1011             qcow2_free_clusters(bs,
1012                 (l2_entry & s->cluster_offset_mask) & ~511,
1013                 nb_csectors * 512, type);
1014         }
1015         break;
1016     case QCOW2_CLUSTER_NORMAL:
1017     case QCOW2_CLUSTER_ZERO:
1018         if (l2_entry & L2E_OFFSET_MASK) {
1019             if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1020                 qcow2_signal_corruption(bs, false, -1, -1,
1021                                         "Cannot free unaligned cluster %#llx",
1022                                         l2_entry & L2E_OFFSET_MASK);
1023             } else {
1024                 qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1025                                     nb_clusters << s->cluster_bits, type);
1026             }
1027         }
1028         break;
1029     case QCOW2_CLUSTER_UNALLOCATED:
1030         break;
1031     default:
1032         abort();
1033     }
1034 }
1035 
1036 
1037 
1038 /*********************************************************/
1039 /* snapshots and image creation */
1040 
1041 
1042 
/*
 * update the refcounts of snapshots and the copied flag
 *
 * Walks the L1 table at @l1_table_offset (with @l1_size entries) and all
 * L2 tables it references, applying @addend (-1, 0 or +1) to the refcount
 * of every referenced cluster, and recomputing QCOW_OFLAG_COPIED on each
 * L1/L2 entry from the resulting refcount.
 *
 * Returns 0 on success, -errno on failure.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_offset, old_l2_offset;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    /* Batch up discards until the end of the walk */
    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        /* Inactive L1 table: read a temporary copy from disk */
        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for(i = 0;i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    } else {
        /* Active L1 table: operate on the in-memory copy directly */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                (void**) &l2_table);
            if (ret < 0) {
                goto fail;
            }

            for(j = 0; j < s->l2_size; j++) {
                uint64_t cluster_index;

                offset = be64_to_cpu(l2_table[j]);
                old_offset = offset;
                /* COPIED is recomputed below from the updated refcount */
                offset &= ~QCOW_OFLAG_COPIED;

                switch (qcow2_get_cluster_type(offset)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0) {
                            ret = update_refcount(bs,
                                (offset & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO:
                        if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
                            qcow2_signal_corruption(bs, true, -1, -1, "Data "
                                                    "cluster offset %#llx "
                                                    "unaligned (L2 offset: %#"
                                                    PRIx64 ", L2 index: %#x)",
                                                    offset & L2E_OFFSET_MASK,
                                                    l2_offset, j);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
                        if (!cluster_index) {
                            /* unallocated */
                            refcount = 0;
                            break;
                        }
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(bs,
                                    cluster_index, abs(addend), addend < 0,
                                    QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                }

                /* QCOW_OFLAG_COPIED is set iff the new refcount is 1 */
                if (refcount == 1) {
                    offset |= QCOW_OFLAG_COPIED;
                }
                if (offset != old_offset) {
                    if (addend > 0) {
                        /* Refcount blocks must reach the disk before the
                         * L2 entry that relies on them */
                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                            s->refcount_block_cache);
                    }
                    l2_table[j] = cpu_to_be64(offset);
                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
                                                 l2_table);
                }
            }

            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

            /* Update the refcount of the L2 table itself and recompute the
             * COPIED flag of its L1 entry */
            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_table) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        /* Convert to big-endian for the on-disk write ... */
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        /* ... and back, since the active table may still be in use */
        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated)
        g_free(l1_table);
    return ret;
}
1237 
1238 
1239 
1240 
1241 /*********************************************************/
1242 /* refcount checking functions */
1243 
1244 
1245 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1246 {
1247     /* This assertion holds because there is no way we can address more than
1248      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1249      * offsets have to be representable in bytes); due to every cluster
1250      * corresponding to one refcount entry, we are well below that limit */
1251     assert(entries < (UINT64_C(1) << (64 - 9)));
1252 
1253     /* Thanks to the assertion this will not overflow, because
1254      * s->refcount_order < 7.
1255      * (note: x << s->refcount_order == x * s->refcount_bits) */
1256     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1257 }
1258 
1259 /**
1260  * Reallocates *array so that it can hold new_size entries. *size must contain
1261  * the current number of entries in *array. If the reallocation fails, *array
1262  * and *size will not be modified and -errno will be returned. If the
1263  * reallocation is successful, *array will be set to the new buffer, *size
1264  * will be set to new_size and 0 will be returned. The size of the reallocated
1265  * refcount array buffer will be aligned to a cluster boundary, and the newly
1266  * allocated area will be zeroed.
1267  */
1268 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1269                                   int64_t *size, int64_t new_size)
1270 {
1271     int64_t old_byte_size, new_byte_size;
1272     void *new_ptr;
1273 
1274     /* Round to clusters so the array can be directly written to disk */
1275     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1276                     * s->cluster_size;
1277     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1278                     * s->cluster_size;
1279 
1280     if (new_byte_size == old_byte_size) {
1281         *size = new_size;
1282         return 0;
1283     }
1284 
1285     assert(new_byte_size > 0);
1286 
1287     if (new_byte_size > SIZE_MAX) {
1288         return -ENOMEM;
1289     }
1290 
1291     new_ptr = g_try_realloc(*array, new_byte_size);
1292     if (!new_ptr) {
1293         return -ENOMEM;
1294     }
1295 
1296     if (new_byte_size > old_byte_size) {
1297         memset((char *)new_ptr + old_byte_size, 0,
1298                new_byte_size - old_byte_size);
1299     }
1300 
1301     *array = new_ptr;
1302     *size  = new_size;
1303 
1304     return 0;
1305 }
1306 
1307 /*
1308  * Increases the refcount for a range of clusters in a given refcount table.
1309  * This is used to construct a temporary refcount table out of L1 and L2 tables
1310  * which can be compared to the refcount table saved in the image.
1311  *
1312  * Modifies the number of errors in res.
1313  */
1314 static int inc_refcounts(BlockDriverState *bs,
1315                          BdrvCheckResult *res,
1316                          void **refcount_table,
1317                          int64_t *refcount_table_size,
1318                          int64_t offset, int64_t size)
1319 {
1320     BDRVQcow2State *s = bs->opaque;
1321     uint64_t start, last, cluster_offset, k, refcount;
1322     int ret;
1323 
1324     if (size <= 0) {
1325         return 0;
1326     }
1327 
1328     start = start_of_cluster(s, offset);
1329     last = start_of_cluster(s, offset + size - 1);
1330     for(cluster_offset = start; cluster_offset <= last;
1331         cluster_offset += s->cluster_size) {
1332         k = cluster_offset >> s->cluster_bits;
1333         if (k >= *refcount_table_size) {
1334             ret = realloc_refcount_array(s, refcount_table,
1335                                          refcount_table_size, k + 1);
1336             if (ret < 0) {
1337                 res->check_errors++;
1338                 return ret;
1339             }
1340         }
1341 
1342         refcount = s->get_refcount(*refcount_table, k);
1343         if (refcount == s->refcount_max) {
1344             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1345                     "\n", cluster_offset);
1346             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1347                     "width or qemu-img convert to create a clean copy if the "
1348                     "image cannot be opened for writing\n");
1349             res->corruptions++;
1350             continue;
1351         }
1352         s->set_refcount(*refcount_table, k, refcount + 1);
1353     }
1354 
1355     return 0;
1356 }
1357 
/* Flags for check_refcounts_l1() and check_refcounts_l2() */
enum {
    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters in res->bfi */
};
1362 
1363 /*
1364  * Increases the refcount in the given refcount table for the all clusters
1365  * referenced in the L2 table. While doing so, performs some checks on L2
1366  * entries.
1367  *
1368  * Returns the number of errors found by the checks or -errno if an internal
1369  * error occurred.
1370  */
1371 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1372                               void **refcount_table,
1373                               int64_t *refcount_table_size, int64_t l2_offset,
1374                               int flags)
1375 {
1376     BDRVQcow2State *s = bs->opaque;
1377     uint64_t *l2_table, l2_entry;
1378     uint64_t next_contiguous_offset = 0;
1379     int i, l2_size, nb_csectors, ret;
1380 
1381     /* Read L2 table from disk */
1382     l2_size = s->l2_size * sizeof(uint64_t);
1383     l2_table = g_malloc(l2_size);
1384 
1385     ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1386     if (ret < 0) {
1387         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1388         res->check_errors++;
1389         goto fail;
1390     }
1391 
1392     /* Do the actual checks */
1393     for(i = 0; i < s->l2_size; i++) {
1394         l2_entry = be64_to_cpu(l2_table[i]);
1395 
1396         switch (qcow2_get_cluster_type(l2_entry)) {
1397         case QCOW2_CLUSTER_COMPRESSED:
1398             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1399             if (l2_entry & QCOW_OFLAG_COPIED) {
1400                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1401                     "copied flag must never be set for compressed "
1402                     "clusters\n", l2_entry >> s->cluster_bits);
1403                 l2_entry &= ~QCOW_OFLAG_COPIED;
1404                 res->corruptions++;
1405             }
1406 
1407             /* Mark cluster as used */
1408             nb_csectors = ((l2_entry >> s->csize_shift) &
1409                            s->csize_mask) + 1;
1410             l2_entry &= s->cluster_offset_mask;
1411             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1412                                 l2_entry & ~511, nb_csectors * 512);
1413             if (ret < 0) {
1414                 goto fail;
1415             }
1416 
1417             if (flags & CHECK_FRAG_INFO) {
1418                 res->bfi.allocated_clusters++;
1419                 res->bfi.compressed_clusters++;
1420 
1421                 /* Compressed clusters are fragmented by nature.  Since they
1422                  * take up sub-sector space but we only have sector granularity
1423                  * I/O we need to re-read the same sectors even for adjacent
1424                  * compressed clusters.
1425                  */
1426                 res->bfi.fragmented_clusters++;
1427             }
1428             break;
1429 
1430         case QCOW2_CLUSTER_ZERO:
1431             if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1432                 break;
1433             }
1434             /* fall through */
1435 
1436         case QCOW2_CLUSTER_NORMAL:
1437         {
1438             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1439 
1440             if (flags & CHECK_FRAG_INFO) {
1441                 res->bfi.allocated_clusters++;
1442                 if (next_contiguous_offset &&
1443                     offset != next_contiguous_offset) {
1444                     res->bfi.fragmented_clusters++;
1445                 }
1446                 next_contiguous_offset = offset + s->cluster_size;
1447             }
1448 
1449             /* Mark cluster as used */
1450             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1451                                 offset, s->cluster_size);
1452             if (ret < 0) {
1453                 goto fail;
1454             }
1455 
1456             /* Correct offsets are cluster aligned */
1457             if (offset_into_cluster(s, offset)) {
1458                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1459                     "properly aligned; L2 entry corrupted.\n", offset);
1460                 res->corruptions++;
1461             }
1462             break;
1463         }
1464 
1465         case QCOW2_CLUSTER_UNALLOCATED:
1466             break;
1467 
1468         default:
1469             abort();
1470         }
1471     }
1472 
1473     g_free(l2_table);
1474     return 0;
1475 
1476 fail:
1477     g_free(l2_table);
1478     return ret;
1479 }
1480 
1481 /*
1482  * Increases the refcount for the L1 table, its L2 tables and all referenced
1483  * clusters in the given refcount table. While doing so, performs some checks
1484  * on L1 and L2 entries.
1485  *
1486  * Returns the number of errors found by the checks or -errno if an internal
1487  * error occurred.
1488  */
1489 static int check_refcounts_l1(BlockDriverState *bs,
1490                               BdrvCheckResult *res,
1491                               void **refcount_table,
1492                               int64_t *refcount_table_size,
1493                               int64_t l1_table_offset, int l1_size,
1494                               int flags)
1495 {
1496     BDRVQcow2State *s = bs->opaque;
1497     uint64_t *l1_table = NULL, l2_offset, l1_size2;
1498     int i, ret;
1499 
1500     l1_size2 = l1_size * sizeof(uint64_t);
1501 
1502     /* Mark L1 table as used */
1503     ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1504                         l1_table_offset, l1_size2);
1505     if (ret < 0) {
1506         goto fail;
1507     }
1508 
1509     /* Read L1 table entries from disk */
1510     if (l1_size2 > 0) {
1511         l1_table = g_try_malloc(l1_size2);
1512         if (l1_table == NULL) {
1513             ret = -ENOMEM;
1514             res->check_errors++;
1515             goto fail;
1516         }
1517         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1518         if (ret < 0) {
1519             fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1520             res->check_errors++;
1521             goto fail;
1522         }
1523         for(i = 0;i < l1_size; i++)
1524             be64_to_cpus(&l1_table[i]);
1525     }
1526 
1527     /* Do the actual checks */
1528     for(i = 0; i < l1_size; i++) {
1529         l2_offset = l1_table[i];
1530         if (l2_offset) {
1531             /* Mark L2 table as used */
1532             l2_offset &= L1E_OFFSET_MASK;
1533             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1534                                 l2_offset, s->cluster_size);
1535             if (ret < 0) {
1536                 goto fail;
1537             }
1538 
1539             /* L2 tables are cluster aligned */
1540             if (offset_into_cluster(s, l2_offset)) {
1541                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1542                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1543                 res->corruptions++;
1544             }
1545 
1546             /* Process and check L2 entries */
1547             ret = check_refcounts_l2(bs, res, refcount_table,
1548                                      refcount_table_size, l2_offset, flags);
1549             if (ret < 0) {
1550                 goto fail;
1551             }
1552         }
1553     }
1554     g_free(l1_table);
1555     return 0;
1556 
1557 fail:
1558     g_free(l1_table);
1559     return ret;
1560 }
1561 
1562 /*
1563  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1564  *
1565  * This function does not print an error message nor does it increment
1566  * check_errors if qcow2_get_refcount fails (this is because such an error will
1567  * have been already detected and sufficiently signaled by the calling function
1568  * (qcow2_check_refcounts) by the time this function is called).
1569  */
1570 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1571                               BdrvCheckMode fix)
1572 {
1573     BDRVQcow2State *s = bs->opaque;
1574     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1575     int ret;
1576     uint64_t refcount;
1577     int i, j;
1578 
1579     for (i = 0; i < s->l1_size; i++) {
1580         uint64_t l1_entry = s->l1_table[i];
1581         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1582         bool l2_dirty = false;
1583 
1584         if (!l2_offset) {
1585             continue;
1586         }
1587 
1588         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1589                                  &refcount);
1590         if (ret < 0) {
1591             /* don't print message nor increment check_errors */
1592             continue;
1593         }
1594         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1595             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1596                     "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1597                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1598                                             "ERROR",
1599                     i, l1_entry, refcount);
1600             if (fix & BDRV_FIX_ERRORS) {
1601                 s->l1_table[i] = refcount == 1
1602                                ? l1_entry |  QCOW_OFLAG_COPIED
1603                                : l1_entry & ~QCOW_OFLAG_COPIED;
1604                 ret = qcow2_write_l1_entry(bs, i);
1605                 if (ret < 0) {
1606                     res->check_errors++;
1607                     goto fail;
1608                 }
1609                 res->corruptions_fixed++;
1610             } else {
1611                 res->corruptions++;
1612             }
1613         }
1614 
1615         ret = bdrv_pread(bs->file, l2_offset, l2_table,
1616                          s->l2_size * sizeof(uint64_t));
1617         if (ret < 0) {
1618             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1619                     strerror(-ret));
1620             res->check_errors++;
1621             goto fail;
1622         }
1623 
1624         for (j = 0; j < s->l2_size; j++) {
1625             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1626             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1627             int cluster_type = qcow2_get_cluster_type(l2_entry);
1628 
1629             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1630                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1631                 ret = qcow2_get_refcount(bs,
1632                                          data_offset >> s->cluster_bits,
1633                                          &refcount);
1634                 if (ret < 0) {
1635                     /* don't print message nor increment check_errors */
1636                     continue;
1637                 }
1638                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1639                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1640                             "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1641                             fix & BDRV_FIX_ERRORS ? "Repairing" :
1642                                                     "ERROR",
1643                             l2_entry, refcount);
1644                     if (fix & BDRV_FIX_ERRORS) {
1645                         l2_table[j] = cpu_to_be64(refcount == 1
1646                                     ? l2_entry |  QCOW_OFLAG_COPIED
1647                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1648                         l2_dirty = true;
1649                         res->corruptions_fixed++;
1650                     } else {
1651                         res->corruptions++;
1652                     }
1653                 }
1654             }
1655         }
1656 
1657         if (l2_dirty) {
1658             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1659                                                 l2_offset, s->cluster_size);
1660             if (ret < 0) {
1661                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1662                         "overlap check failed: %s\n", strerror(-ret));
1663                 res->check_errors++;
1664                 goto fail;
1665             }
1666 
1667             ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
1668                               s->cluster_size);
1669             if (ret < 0) {
1670                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1671                         strerror(-ret));
1672                 res->check_errors++;
1673                 goto fail;
1674             }
1675         }
1676     }
1677 
1678     ret = 0;
1679 
1680 fail:
1681     qemu_vfree(l2_table);
1682     return ret;
1683 }
1684 
1685 /*
1686  * Checks consistency of refblocks and accounts for each refblock in
1687  * *refcount_table.
1688  */
1689 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1690                            BdrvCheckMode fix, bool *rebuild,
1691                            void **refcount_table, int64_t *nb_clusters)
1692 {
1693     BDRVQcow2State *s = bs->opaque;
1694     int64_t i, size;
1695     int ret;
1696 
1697     for(i = 0; i < s->refcount_table_size; i++) {
1698         uint64_t offset, cluster;
1699         offset = s->refcount_table[i];
1700         cluster = offset >> s->cluster_bits;
1701 
1702         /* Refcount blocks are cluster aligned */
1703         if (offset_into_cluster(s, offset)) {
1704             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1705                 "cluster aligned; refcount table entry corrupted\n", i);
1706             res->corruptions++;
1707             *rebuild = true;
1708             continue;
1709         }
1710 
1711         if (cluster >= *nb_clusters) {
1712             fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1713                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1714 
1715             if (fix & BDRV_FIX_ERRORS) {
1716                 int64_t new_nb_clusters;
1717 
1718                 if (offset > INT64_MAX - s->cluster_size) {
1719                     ret = -EINVAL;
1720                     goto resize_fail;
1721                 }
1722 
1723                 ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size);
1724                 if (ret < 0) {
1725                     goto resize_fail;
1726                 }
1727                 size = bdrv_getlength(bs->file->bs);
1728                 if (size < 0) {
1729                     ret = size;
1730                     goto resize_fail;
1731                 }
1732 
1733                 new_nb_clusters = size_to_clusters(s, size);
1734                 assert(new_nb_clusters >= *nb_clusters);
1735 
1736                 ret = realloc_refcount_array(s, refcount_table,
1737                                              nb_clusters, new_nb_clusters);
1738                 if (ret < 0) {
1739                     res->check_errors++;
1740                     return ret;
1741                 }
1742 
1743                 if (cluster >= *nb_clusters) {
1744                     ret = -EINVAL;
1745                     goto resize_fail;
1746                 }
1747 
1748                 res->corruptions_fixed++;
1749                 ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1750                                     offset, s->cluster_size);
1751                 if (ret < 0) {
1752                     return ret;
1753                 }
1754                 /* No need to check whether the refcount is now greater than 1:
1755                  * This area was just allocated and zeroed, so it can only be
1756                  * exactly 1 after inc_refcounts() */
1757                 continue;
1758 
1759 resize_fail:
1760                 res->corruptions++;
1761                 *rebuild = true;
1762                 fprintf(stderr, "ERROR could not resize image: %s\n",
1763                         strerror(-ret));
1764             } else {
1765                 res->corruptions++;
1766             }
1767             continue;
1768         }
1769 
1770         if (offset != 0) {
1771             ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1772                                 offset, s->cluster_size);
1773             if (ret < 0) {
1774                 return ret;
1775             }
1776             if (s->get_refcount(*refcount_table, cluster) != 1) {
1777                 fprintf(stderr, "ERROR refcount block %" PRId64
1778                         " refcount=%" PRIu64 "\n", i,
1779                         s->get_refcount(*refcount_table, cluster));
1780                 res->corruptions++;
1781                 *rebuild = true;
1782             }
1783         }
1784     }
1785 
1786     return 0;
1787 }
1788 
1789 /*
1790  * Calculates an in-memory refcount table.
1791  */
1792 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1793                                BdrvCheckMode fix, bool *rebuild,
1794                                void **refcount_table, int64_t *nb_clusters)
1795 {
1796     BDRVQcow2State *s = bs->opaque;
1797     int64_t i;
1798     QCowSnapshot *sn;
1799     int ret;
1800 
1801     if (!*refcount_table) {
1802         int64_t old_size = 0;
1803         ret = realloc_refcount_array(s, refcount_table,
1804                                      &old_size, *nb_clusters);
1805         if (ret < 0) {
1806             res->check_errors++;
1807             return ret;
1808         }
1809     }
1810 
1811     /* header */
1812     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1813                         0, s->cluster_size);
1814     if (ret < 0) {
1815         return ret;
1816     }
1817 
1818     /* current L1 table */
1819     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1820                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1821     if (ret < 0) {
1822         return ret;
1823     }
1824 
1825     /* snapshots */
1826     for (i = 0; i < s->nb_snapshots; i++) {
1827         sn = s->snapshots + i;
1828         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1829                                  sn->l1_table_offset, sn->l1_size, 0);
1830         if (ret < 0) {
1831             return ret;
1832         }
1833     }
1834     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1835                         s->snapshots_offset, s->snapshots_size);
1836     if (ret < 0) {
1837         return ret;
1838     }
1839 
1840     /* refcount data */
1841     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1842                         s->refcount_table_offset,
1843                         s->refcount_table_size * sizeof(uint64_t));
1844     if (ret < 0) {
1845         return ret;
1846     }
1847 
1848     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
1849 }
1850 
1851 /*
1852  * Compares the actual reference count for each cluster in the image against the
1853  * refcount as reported by the refcount structures on-disk.
1854  */
1855 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1856                               BdrvCheckMode fix, bool *rebuild,
1857                               int64_t *highest_cluster,
1858                               void *refcount_table, int64_t nb_clusters)
1859 {
1860     BDRVQcow2State *s = bs->opaque;
1861     int64_t i;
1862     uint64_t refcount1, refcount2;
1863     int ret;
1864 
1865     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
1866         ret = qcow2_get_refcount(bs, i, &refcount1);
1867         if (ret < 0) {
1868             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1869                     i, strerror(-ret));
1870             res->check_errors++;
1871             continue;
1872         }
1873 
1874         refcount2 = s->get_refcount(refcount_table, i);
1875 
1876         if (refcount1 > 0 || refcount2 > 0) {
1877             *highest_cluster = i;
1878         }
1879 
1880         if (refcount1 != refcount2) {
1881             /* Check if we're allowed to fix the mismatch */
1882             int *num_fixed = NULL;
1883             if (refcount1 == 0) {
1884                 *rebuild = true;
1885             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1886                 num_fixed = &res->leaks_fixed;
1887             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1888                 num_fixed = &res->corruptions_fixed;
1889             }
1890 
1891             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
1892                     " reference=%" PRIu64 "\n",
1893                    num_fixed != NULL     ? "Repairing" :
1894                    refcount1 < refcount2 ? "ERROR" :
1895                                            "Leaked",
1896                    i, refcount1, refcount2);
1897 
1898             if (num_fixed) {
1899                 ret = update_refcount(bs, i << s->cluster_bits, 1,
1900                                       refcount_diff(refcount1, refcount2),
1901                                       refcount1 > refcount2,
1902                                       QCOW2_DISCARD_ALWAYS);
1903                 if (ret >= 0) {
1904                     (*num_fixed)++;
1905                     continue;
1906                 }
1907             }
1908 
1909             /* And if we couldn't, print an error */
1910             if (refcount1 < refcount2) {
1911                 res->corruptions++;
1912             } else {
1913                 res->leaks++;
1914             }
1915         }
1916     }
1917 }
1918 
1919 /*
1920  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
1921  * the on-disk refcount structures.
1922  *
1923  * On input, *first_free_cluster tells where to start looking, and need not
1924  * actually be a free cluster; the returned offset will not be before that
1925  * cluster.  On output, *first_free_cluster points to the first gap found, even
1926  * if that gap was too small to be used as the returned offset.
1927  *
1928  * Note that *first_free_cluster is a cluster index whereas the return value is
1929  * an offset.
1930  */
1931 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
1932                                    int cluster_count,
1933                                    void **refcount_table,
1934                                    int64_t *imrt_nb_clusters,
1935                                    int64_t *first_free_cluster)
1936 {
1937     BDRVQcow2State *s = bs->opaque;
1938     int64_t cluster = *first_free_cluster, i;
1939     bool first_gap = true;
1940     int contiguous_free_clusters;
1941     int ret;
1942 
1943     /* Starting at *first_free_cluster, find a range of at least cluster_count
1944      * continuously free clusters */
1945     for (contiguous_free_clusters = 0;
1946          cluster < *imrt_nb_clusters &&
1947          contiguous_free_clusters < cluster_count;
1948          cluster++)
1949     {
1950         if (!s->get_refcount(*refcount_table, cluster)) {
1951             contiguous_free_clusters++;
1952             if (first_gap) {
1953                 /* If this is the first free cluster found, update
1954                  * *first_free_cluster accordingly */
1955                 *first_free_cluster = cluster;
1956                 first_gap = false;
1957             }
1958         } else if (contiguous_free_clusters) {
1959             contiguous_free_clusters = 0;
1960         }
1961     }
1962 
1963     /* If contiguous_free_clusters is greater than zero, it contains the number
1964      * of continuously free clusters until the current cluster; the first free
1965      * cluster in the current "gap" is therefore
1966      * cluster - contiguous_free_clusters */
1967 
1968     /* If no such range could be found, grow the in-memory refcount table
1969      * accordingly to append free clusters at the end of the image */
1970     if (contiguous_free_clusters < cluster_count) {
1971         /* contiguous_free_clusters clusters are already empty at the image end;
1972          * we need cluster_count clusters; therefore, we have to allocate
1973          * cluster_count - contiguous_free_clusters new clusters at the end of
1974          * the image (which is the current value of cluster; note that cluster
1975          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
1976          * the image end) */
1977         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
1978                                      cluster + cluster_count
1979                                      - contiguous_free_clusters);
1980         if (ret < 0) {
1981             return ret;
1982         }
1983     }
1984 
1985     /* Go back to the first free cluster */
1986     cluster -= contiguous_free_clusters;
1987     for (i = 0; i < cluster_count; i++) {
1988         s->set_refcount(*refcount_table, cluster + i, 1);
1989     }
1990 
1991     return cluster << s->cluster_bits;
1992 }
1993 
1994 /*
1995  * Creates a new refcount structure based solely on the in-memory information
1996  * given through *refcount_table. All necessary allocations will be reflected
1997  * in that array.
1998  *
1999  * On success, the old refcount structure is leaked (it will be covered by the
2000  * new refcount structure).
2001  */
2002 static int rebuild_refcount_structure(BlockDriverState *bs,
2003                                       BdrvCheckResult *res,
2004                                       void **refcount_table,
2005                                       int64_t *nb_clusters)
2006 {
2007     BDRVQcow2State *s = bs->opaque;
2008     int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2009     int64_t refblock_offset, refblock_start, refblock_index;
2010     uint32_t reftable_size = 0;
2011     uint64_t *on_disk_reftable = NULL;
2012     void *on_disk_refblock;
2013     int ret = 0;
2014     struct {
2015         uint64_t reftable_offset;
2016         uint32_t reftable_clusters;
2017     } QEMU_PACKED reftable_offset_and_clusters;
2018 
2019     qcow2_cache_empty(bs, s->refcount_block_cache);
2020 
2021 write_refblocks:
2022     for (; cluster < *nb_clusters; cluster++) {
2023         if (!s->get_refcount(*refcount_table, cluster)) {
2024             continue;
2025         }
2026 
2027         refblock_index = cluster >> s->refcount_block_bits;
2028         refblock_start = refblock_index << s->refcount_block_bits;
2029 
2030         /* Don't allocate a cluster in a refblock already written to disk */
2031         if (first_free_cluster < refblock_start) {
2032             first_free_cluster = refblock_start;
2033         }
2034         refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2035                                               nb_clusters, &first_free_cluster);
2036         if (refblock_offset < 0) {
2037             fprintf(stderr, "ERROR allocating refblock: %s\n",
2038                     strerror(-refblock_offset));
2039             res->check_errors++;
2040             ret = refblock_offset;
2041             goto fail;
2042         }
2043 
2044         if (reftable_size <= refblock_index) {
2045             uint32_t old_reftable_size = reftable_size;
2046             uint64_t *new_on_disk_reftable;
2047 
2048             reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2049                                      s->cluster_size) / sizeof(uint64_t);
2050             new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2051                                                  reftable_size *
2052                                                  sizeof(uint64_t));
2053             if (!new_on_disk_reftable) {
2054                 res->check_errors++;
2055                 ret = -ENOMEM;
2056                 goto fail;
2057             }
2058             on_disk_reftable = new_on_disk_reftable;
2059 
2060             memset(on_disk_reftable + old_reftable_size, 0,
2061                    (reftable_size - old_reftable_size) * sizeof(uint64_t));
2062 
2063             /* The offset we have for the reftable is now no longer valid;
2064              * this will leak that range, but we can easily fix that by running
2065              * a leak-fixing check after this rebuild operation */
2066             reftable_offset = -1;
2067         }
2068         on_disk_reftable[refblock_index] = refblock_offset;
2069 
2070         /* If this is apparently the last refblock (for now), try to squeeze the
2071          * reftable in */
2072         if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2073             reftable_offset < 0)
2074         {
2075             uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2076                                                           sizeof(uint64_t));
2077             reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2078                                                   refcount_table, nb_clusters,
2079                                                   &first_free_cluster);
2080             if (reftable_offset < 0) {
2081                 fprintf(stderr, "ERROR allocating reftable: %s\n",
2082                         strerror(-reftable_offset));
2083                 res->check_errors++;
2084                 ret = reftable_offset;
2085                 goto fail;
2086             }
2087         }
2088 
2089         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2090                                             s->cluster_size);
2091         if (ret < 0) {
2092             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2093             goto fail;
2094         }
2095 
2096         /* The size of *refcount_table is always cluster-aligned, therefore the
2097          * write operation will not overflow */
2098         on_disk_refblock = (void *)((char *) *refcount_table +
2099                                     refblock_index * s->cluster_size);
2100 
2101         ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
2102                          on_disk_refblock, s->cluster_sectors);
2103         if (ret < 0) {
2104             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2105             goto fail;
2106         }
2107 
2108         /* Go to the end of this refblock */
2109         cluster = refblock_start + s->refcount_block_size - 1;
2110     }
2111 
2112     if (reftable_offset < 0) {
2113         uint64_t post_refblock_start, reftable_clusters;
2114 
2115         post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2116         reftable_clusters = size_to_clusters(s,
2117                                              reftable_size * sizeof(uint64_t));
2118         /* Not pretty but simple */
2119         if (first_free_cluster < post_refblock_start) {
2120             first_free_cluster = post_refblock_start;
2121         }
2122         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2123                                               refcount_table, nb_clusters,
2124                                               &first_free_cluster);
2125         if (reftable_offset < 0) {
2126             fprintf(stderr, "ERROR allocating reftable: %s\n",
2127                     strerror(-reftable_offset));
2128             res->check_errors++;
2129             ret = reftable_offset;
2130             goto fail;
2131         }
2132 
2133         goto write_refblocks;
2134     }
2135 
2136     assert(on_disk_reftable);
2137 
2138     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2139         cpu_to_be64s(&on_disk_reftable[refblock_index]);
2140     }
2141 
2142     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2143                                         reftable_size * sizeof(uint64_t));
2144     if (ret < 0) {
2145         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2146         goto fail;
2147     }
2148 
2149     assert(reftable_size < INT_MAX / sizeof(uint64_t));
2150     ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
2151                       reftable_size * sizeof(uint64_t));
2152     if (ret < 0) {
2153         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2154         goto fail;
2155     }
2156 
2157     /* Enter new reftable into the image header */
2158     reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2159     reftable_offset_and_clusters.reftable_clusters =
2160         cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2161     ret = bdrv_pwrite_sync(bs->file,
2162                            offsetof(QCowHeader, refcount_table_offset),
2163                            &reftable_offset_and_clusters,
2164                            sizeof(reftable_offset_and_clusters));
2165     if (ret < 0) {
2166         fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2167         goto fail;
2168     }
2169 
2170     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2171         be64_to_cpus(&on_disk_reftable[refblock_index]);
2172     }
2173     s->refcount_table = on_disk_reftable;
2174     s->refcount_table_offset = reftable_offset;
2175     s->refcount_table_size = reftable_size;
2176 
2177     return 0;
2178 
2179 fail:
2180     g_free(on_disk_reftable);
2181     return ret;
2182 }
2183 
2184 /*
2185  * Checks an image for refcount consistency.
2186  *
2187  * Returns 0 if no errors are found, the number of errors in case the image is
2188  * detected as corrupted, and -errno when an internal error occurred.
2189  */
2190 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2191                           BdrvCheckMode fix)
2192 {
2193     BDRVQcow2State *s = bs->opaque;
2194     BdrvCheckResult pre_compare_res;
2195     int64_t size, highest_cluster, nb_clusters;
2196     void *refcount_table = NULL;
2197     bool rebuild = false;
2198     int ret;
2199 
2200     size = bdrv_getlength(bs->file->bs);
2201     if (size < 0) {
2202         res->check_errors++;
2203         return size;
2204     }
2205 
2206     nb_clusters = size_to_clusters(s, size);
2207     if (nb_clusters > INT_MAX) {
2208         res->check_errors++;
2209         return -EFBIG;
2210     }
2211 
2212     res->bfi.total_clusters =
2213         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2214 
2215     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2216                               &nb_clusters);
2217     if (ret < 0) {
2218         goto fail;
2219     }
2220 
2221     /* In case we don't need to rebuild the refcount structure (but want to fix
2222      * something), this function is immediately called again, in which case the
2223      * result should be ignored */
2224     pre_compare_res = *res;
2225     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2226                       nb_clusters);
2227 
2228     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2229         BdrvCheckResult old_res = *res;
2230         int fresh_leaks = 0;
2231 
2232         fprintf(stderr, "Rebuilding refcount structure\n");
2233         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2234                                          &nb_clusters);
2235         if (ret < 0) {
2236             goto fail;
2237         }
2238 
2239         res->corruptions = 0;
2240         res->leaks = 0;
2241 
2242         /* Because the old reftable has been exchanged for a new one the
2243          * references have to be recalculated */
2244         rebuild = false;
2245         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2246         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2247                                   &nb_clusters);
2248         if (ret < 0) {
2249             goto fail;
2250         }
2251 
2252         if (fix & BDRV_FIX_LEAKS) {
2253             /* The old refcount structures are now leaked, fix it; the result
2254              * can be ignored, aside from leaks which were introduced by
2255              * rebuild_refcount_structure() that could not be fixed */
2256             BdrvCheckResult saved_res = *res;
2257             *res = (BdrvCheckResult){ 0 };
2258 
2259             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2260                               &highest_cluster, refcount_table, nb_clusters);
2261             if (rebuild) {
2262                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2263                         "broken\n");
2264             }
2265 
2266             /* Any leaks accounted for here were introduced by
2267              * rebuild_refcount_structure() because that function has created a
2268              * new refcount structure from scratch */
2269             fresh_leaks = res->leaks;
2270             *res = saved_res;
2271         }
2272 
2273         if (res->corruptions < old_res.corruptions) {
2274             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2275         }
2276         if (res->leaks < old_res.leaks) {
2277             res->leaks_fixed += old_res.leaks - res->leaks;
2278         }
2279         res->leaks += fresh_leaks;
2280     } else if (fix) {
2281         if (rebuild) {
2282             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2283             res->check_errors++;
2284             ret = -EIO;
2285             goto fail;
2286         }
2287 
2288         if (res->leaks || res->corruptions) {
2289             *res = pre_compare_res;
2290             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2291                               refcount_table, nb_clusters);
2292         }
2293     }
2294 
2295     /* check OFLAG_COPIED */
2296     ret = check_oflag_copied(bs, res, fix);
2297     if (ret < 0) {
2298         goto fail;
2299     }
2300 
2301     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2302     ret = 0;
2303 
2304 fail:
2305     g_free(refcount_table);
2306 
2307     return ret;
2308 }
2309 
/* Helper for qcow2_check_metadata_overlap() below; deliberately captures the
 * local variables `offset` and `size` from the enclosing function scope */
#define overlaps_with(ofs, sz) \
    ranges_overlap(offset, size, ofs, sz)
2312 
2313 /*
2314  * Checks if the given offset into the image file is actually free to use by
2315  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2316  * i.e. a sanity check without relying on the refcount tables.
2317  *
2318  * The ign parameter specifies what checks not to perform (being a bitmask of
2319  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2320  *
2321  * Returns:
2322  * - 0 if writing to this offset will not affect the mentioned metadata
2323  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2324  * - a negative value (-errno) indicating an error while performing a check,
2325  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2326  */
2327 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2328                                  int64_t size)
2329 {
2330     BDRVQcow2State *s = bs->opaque;
2331     int chk = s->overlap_check & ~ign;
2332     int i, j;
2333 
2334     if (!size) {
2335         return 0;
2336     }
2337 
2338     if (chk & QCOW2_OL_MAIN_HEADER) {
2339         if (offset < s->cluster_size) {
2340             return QCOW2_OL_MAIN_HEADER;
2341         }
2342     }
2343 
2344     /* align range to test to cluster boundaries */
2345     size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
2346     offset = start_of_cluster(s, offset);
2347 
2348     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2349         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2350             return QCOW2_OL_ACTIVE_L1;
2351         }
2352     }
2353 
2354     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2355         if (overlaps_with(s->refcount_table_offset,
2356             s->refcount_table_size * sizeof(uint64_t))) {
2357             return QCOW2_OL_REFCOUNT_TABLE;
2358         }
2359     }
2360 
2361     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2362         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2363             return QCOW2_OL_SNAPSHOT_TABLE;
2364         }
2365     }
2366 
2367     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2368         for (i = 0; i < s->nb_snapshots; i++) {
2369             if (s->snapshots[i].l1_size &&
2370                 overlaps_with(s->snapshots[i].l1_table_offset,
2371                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
2372                 return QCOW2_OL_INACTIVE_L1;
2373             }
2374         }
2375     }
2376 
2377     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2378         for (i = 0; i < s->l1_size; i++) {
2379             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2380                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2381                 s->cluster_size)) {
2382                 return QCOW2_OL_ACTIVE_L2;
2383             }
2384         }
2385     }
2386 
2387     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2388         for (i = 0; i < s->refcount_table_size; i++) {
2389             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2390                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2391                 s->cluster_size)) {
2392                 return QCOW2_OL_REFCOUNT_BLOCK;
2393             }
2394         }
2395     }
2396 
2397     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2398         for (i = 0; i < s->nb_snapshots; i++) {
2399             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2400             uint32_t l1_sz  = s->snapshots[i].l1_size;
2401             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2402             uint64_t *l1 = g_try_malloc(l1_sz2);
2403             int ret;
2404 
2405             if (l1_sz2 && l1 == NULL) {
2406                 return -ENOMEM;
2407             }
2408 
2409             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2410             if (ret < 0) {
2411                 g_free(l1);
2412                 return ret;
2413             }
2414 
2415             for (j = 0; j < l1_sz; j++) {
2416                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2417                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2418                     g_free(l1);
2419                     return QCOW2_OL_INACTIVE_L2;
2420                 }
2421             }
2422 
2423             g_free(l1);
2424         }
2425     }
2426 
2427     return 0;
2428 }
2429 
/* Human-readable names for the metadata sections, indexed by the
 * QCOW2_OL_*_BITNR bit numbers; used in corruption messages */
static const char *metadata_ol_names[] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
    [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
    [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
    [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
    [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
    [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
};
2440 
2441 /*
2442  * First performs a check for metadata overlaps (through
2443  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2444  * while performing a check), that value is returned. If an impending overlap
2445  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2446  * and -EIO returned.
2447  *
2448  * Returns 0 if there were neither overlaps nor errors while checking for
2449  * overlaps; or a negative value (-errno) on error.
2450  */
2451 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2452                                   int64_t size)
2453 {
2454     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2455 
2456     if (ret < 0) {
2457         return ret;
2458     } else if (ret > 0) {
2459         int metadata_ol_bitnr = ctz32(ret);
2460         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2461 
2462         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2463                                 "write on metadata (overlaps with %s)",
2464                                 metadata_ol_names[metadata_ol_bitnr]);
2465         return -EIO;
2466     }
2467 
2468     return 0;
2469 }
2470 
/* A pointer to a function of this type is given to walk_over_reftable(). That
 * function will create refblocks and pass them to a RefblockFinishOp once they
 * are completed (@refblock). @refblock_empty is set if the refblock is
 * completely empty.
 *
 * Along with the refblock, a corresponding reftable entry is passed, in the
 * reftable @reftable (which may be reallocated) at @reftable_index.
 * @reftable_size is the reftable's current size in entries and may be updated
 * if the operation grows the reftable.
 *
 * @allocated should be set to true if a new cluster has been allocated.
 *
 * Returns 0 on success, or a negative errno (with @errp set) on failure.
 */
typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
                               uint64_t reftable_index, uint64_t *reftable_size,
                               void *refblock, bool refblock_empty,
                               bool *allocated, Error **errp);
2485 
2486 /**
2487  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2488  * it is not empty) and inserts its offset into the new reftable. The size of
2489  * this new reftable is increased as required.
2490  */
2491 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2492                           uint64_t reftable_index, uint64_t *reftable_size,
2493                           void *refblock, bool refblock_empty, bool *allocated,
2494                           Error **errp)
2495 {
2496     BDRVQcow2State *s = bs->opaque;
2497     int64_t offset;
2498 
2499     if (!refblock_empty && reftable_index >= *reftable_size) {
2500         uint64_t *new_reftable;
2501         uint64_t new_reftable_size;
2502 
2503         new_reftable_size = ROUND_UP(reftable_index + 1,
2504                                      s->cluster_size / sizeof(uint64_t));
2505         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2506             error_setg(errp,
2507                        "This operation would make the refcount table grow "
2508                        "beyond the maximum size supported by QEMU, aborting");
2509             return -ENOTSUP;
2510         }
2511 
2512         new_reftable = g_try_realloc(*reftable, new_reftable_size *
2513                                                 sizeof(uint64_t));
2514         if (!new_reftable) {
2515             error_setg(errp, "Failed to increase reftable buffer size");
2516             return -ENOMEM;
2517         }
2518 
2519         memset(new_reftable + *reftable_size, 0,
2520                (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2521 
2522         *reftable      = new_reftable;
2523         *reftable_size = new_reftable_size;
2524     }
2525 
2526     if (!refblock_empty && !(*reftable)[reftable_index]) {
2527         offset = qcow2_alloc_clusters(bs, s->cluster_size);
2528         if (offset < 0) {
2529             error_setg_errno(errp, -offset, "Failed to allocate refblock");
2530             return offset;
2531         }
2532         (*reftable)[reftable_index] = offset;
2533         *allocated = true;
2534     }
2535 
2536     return 0;
2537 }
2538 
2539 /**
2540  * This "operation" for walk_over_reftable() writes the refblock to disk at the
2541  * offset specified by the new reftable's entry. It does not modify the new
2542  * reftable or change any refcounts.
2543  */
2544 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2545                           uint64_t reftable_index, uint64_t *reftable_size,
2546                           void *refblock, bool refblock_empty, bool *allocated,
2547                           Error **errp)
2548 {
2549     BDRVQcow2State *s = bs->opaque;
2550     int64_t offset;
2551     int ret;
2552 
2553     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2554         offset = (*reftable)[reftable_index];
2555 
2556         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2557         if (ret < 0) {
2558             error_setg_errno(errp, -ret, "Overlap check failed");
2559             return ret;
2560         }
2561 
2562         ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2563         if (ret < 0) {
2564             error_setg_errno(errp, -ret, "Failed to write refblock");
2565             return ret;
2566         }
2567     } else {
2568         assert(refblock_empty);
2569     }
2570 
2571     return 0;
2572 }
2573 
2574 /**
2575  * This function walks over the existing reftable and every referenced refblock;
2576  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2577  * create an equal new entry in the passed @new_refblock. Once that
2578  * @new_refblock is completely filled, @operation will be called.
2579  *
2580  * @status_cb and @cb_opaque are used for the amend operation's status callback.
2581  * @index is the index of the walk_over_reftable() calls and @total is the total
2582  * number of walk_over_reftable() calls per amend operation. Both are used for
2583  * calculating the parameters for the status callback.
2584  *
2585  * @allocated is set to true if a new cluster has been allocated.
2586  */
2587 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2588                               uint64_t *new_reftable_index,
2589                               uint64_t *new_reftable_size,
2590                               void *new_refblock, int new_refblock_size,
2591                               int new_refcount_bits,
2592                               RefblockFinishOp *operation, bool *allocated,
2593                               Qcow2SetRefcountFunc *new_set_refcount,
2594                               BlockDriverAmendStatusCB *status_cb,
2595                               void *cb_opaque, int index, int total,
2596                               Error **errp)
2597 {
2598     BDRVQcow2State *s = bs->opaque;
2599     uint64_t reftable_index;
2600     bool new_refblock_empty = true;
2601     int refblock_index;
2602     int new_refblock_index = 0;
2603     int ret;
2604 
2605     for (reftable_index = 0; reftable_index < s->refcount_table_size;
2606          reftable_index++)
2607     {
2608         uint64_t refblock_offset = s->refcount_table[reftable_index]
2609                                  & REFT_OFFSET_MASK;
2610 
2611         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2612                   (uint64_t)total * s->refcount_table_size, cb_opaque);
2613 
2614         if (refblock_offset) {
2615             void *refblock;
2616 
2617             if (offset_into_cluster(s, refblock_offset)) {
2618                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2619                                         PRIx64 " unaligned (reftable index: %#"
2620                                         PRIx64 ")", refblock_offset,
2621                                         reftable_index);
2622                 error_setg(errp,
2623                            "Image is corrupt (unaligned refblock offset)");
2624                 return -EIO;
2625             }
2626 
2627             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2628                                   &refblock);
2629             if (ret < 0) {
2630                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2631                 return ret;
2632             }
2633 
2634             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2635                  refblock_index++)
2636             {
2637                 uint64_t refcount;
2638 
2639                 if (new_refblock_index >= new_refblock_size) {
2640                     /* new_refblock is now complete */
2641                     ret = operation(bs, new_reftable, *new_reftable_index,
2642                                     new_reftable_size, new_refblock,
2643                                     new_refblock_empty, allocated, errp);
2644                     if (ret < 0) {
2645                         qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2646                         return ret;
2647                     }
2648 
2649                     (*new_reftable_index)++;
2650                     new_refblock_index = 0;
2651                     new_refblock_empty = true;
2652                 }
2653 
2654                 refcount = s->get_refcount(refblock, refblock_index);
2655                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2656                     uint64_t offset;
2657 
2658                     qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2659 
2660                     offset = ((reftable_index << s->refcount_block_bits)
2661                               + refblock_index) << s->cluster_bits;
2662 
2663                     error_setg(errp, "Cannot decrease refcount entry width to "
2664                                "%i bits: Cluster at offset %#" PRIx64 " has a "
2665                                "refcount of %" PRIu64, new_refcount_bits,
2666                                offset, refcount);
2667                     return -EINVAL;
2668                 }
2669 
2670                 if (new_set_refcount) {
2671                     new_set_refcount(new_refblock, new_refblock_index++,
2672                                      refcount);
2673                 } else {
2674                     new_refblock_index++;
2675                 }
2676                 new_refblock_empty = new_refblock_empty && refcount == 0;
2677             }
2678 
2679             qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2680         } else {
2681             /* No refblock means every refcount is 0 */
2682             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2683                  refblock_index++)
2684             {
2685                 if (new_refblock_index >= new_refblock_size) {
2686                     /* new_refblock is now complete */
2687                     ret = operation(bs, new_reftable, *new_reftable_index,
2688                                     new_reftable_size, new_refblock,
2689                                     new_refblock_empty, allocated, errp);
2690                     if (ret < 0) {
2691                         return ret;
2692                     }
2693 
2694                     (*new_reftable_index)++;
2695                     new_refblock_index = 0;
2696                     new_refblock_empty = true;
2697                 }
2698 
2699                 if (new_set_refcount) {
2700                     new_set_refcount(new_refblock, new_refblock_index++, 0);
2701                 } else {
2702                     new_refblock_index++;
2703                 }
2704             }
2705         }
2706     }
2707 
2708     if (new_refblock_index > 0) {
2709         /* Complete the potentially existing partially filled final refblock */
2710         if (new_set_refcount) {
2711             for (; new_refblock_index < new_refblock_size;
2712                  new_refblock_index++)
2713             {
2714                 new_set_refcount(new_refblock, new_refblock_index, 0);
2715             }
2716         }
2717 
2718         ret = operation(bs, new_reftable, *new_reftable_index,
2719                         new_reftable_size, new_refblock, new_refblock_empty,
2720                         allocated, errp);
2721         if (ret < 0) {
2722             return ret;
2723         }
2724 
2725         (*new_reftable_index)++;
2726     }
2727 
2728     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
2729               (uint64_t)total * s->refcount_table_size, cb_opaque);
2730 
2731     return 0;
2732 }
2733 
/*
 * Changes the refcount entry width of an opened v3 image in place; the new
 * width is 1 << @refcount_order bits (refcount_order must be in [0, 6]).
 *
 * This works in two phases built on walk_over_reftable(): first, new
 * refblocks (and a new reftable) are allocated in a loop until a walk makes
 * no further allocations; then the new refblocks and the new reftable are
 * written and the image header is updated to point at them. On success the
 * old refcount structures are freed; on failure the newly allocated ones are
 * freed and the in-memory state remains as before.
 *
 * @status_cb and @cb_opaque are used for the amend operation's status
 * callback.
 *
 * Returns 0 on success, or a negative errno (with @errp set) on failure.
 */
int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                BlockDriverAmendStatusCB *status_cb,
                                void *cb_opaque, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2GetRefcountFunc *new_get_refcount;
    Qcow2SetRefcountFunc *new_set_refcount;
    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
    uint64_t *new_reftable = NULL, new_reftable_size = 0;
    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
    uint64_t new_reftable_index = 0;
    uint64_t i;
    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
    int old_refcount_order;
    int walk_index = 0;
    int ret;
    bool new_allocation;

    assert(s->qcow_version >= 3);
    assert(refcount_order >= 0 && refcount_order <= 6);

    /* see qcow2_open() */
    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));

    new_get_refcount = get_refcount_funcs[refcount_order];
    new_set_refcount = set_refcount_funcs[refcount_order];


    do {
        int total_walks;

        new_allocation = false;

        /* At least we have to do this walk and the one which writes the
         * refblocks; also, this loop normally runs at least twice: first to
         * do the allocations, and a second time to determine that everything
         * is correctly allocated; this then makes three walks in total */
        total_walks = MAX(walk_index + 2, 3);

        /* First, allocate the structures so they are present in the refcount
         * structures */
        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                                 &new_reftable_size, NULL, new_refblock_size,
                                 new_refcount_bits, &alloc_refblock,
                                 &new_allocation, NULL, status_cb, cb_opaque,
                                 walk_index++, total_walks, errp);
        if (ret < 0) {
            goto done;
        }

        new_reftable_index = 0;

        if (new_allocation) {
            /* The walk allocated something, so the previously allocated
             * reftable (if any) may be too small; free it and allocate a new
             * one matching the current reftable size */
            if (new_reftable_offset) {
                qcow2_free_clusters(bs, new_reftable_offset,
                                    allocated_reftable_size * sizeof(uint64_t),
                                    QCOW2_DISCARD_NEVER);
            }

            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
                                                           sizeof(uint64_t));
            if (new_reftable_offset < 0) {
                error_setg_errno(errp, -new_reftable_offset,
                                 "Failed to allocate the new reftable");
                ret = new_reftable_offset;
                goto done;
            }
            allocated_reftable_size = new_reftable_size;
        }
    } while (new_allocation);

    /* Second, write the new refblocks */
    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                             &new_reftable_size, new_refblock,
                             new_refblock_size, new_refcount_bits,
                             &flush_refblock, &new_allocation, new_set_refcount,
                             status_cb, cb_opaque, walk_index, walk_index + 1,
                             errp);
    if (ret < 0) {
        goto done;
    }
    /* The allocation loop above must have converged, so this walk cannot
     * have allocated anything new */
    assert(!new_allocation);


    /* Write the new reftable */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
                                        new_reftable_size * sizeof(uint64_t));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Overlap check failed");
        goto done;
    }

    /* On-disk reftable entries are big-endian */
    for (i = 0; i < new_reftable_size; i++) {
        cpu_to_be64s(&new_reftable[i]);
    }

    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
                      new_reftable_size * sizeof(uint64_t));

    /* Convert back to host byte order; the in-memory copy is still needed
     * below (and for the cleanup path) */
    for (i = 0; i < new_reftable_size; i++) {
        be64_to_cpus(&new_reftable[i]);
    }

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to write the new reftable");
        goto done;
    }


    /* Empty the refcount cache */
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
        goto done;
    }

    /* Update the image header to point to the new reftable; this only updates
     * the fields which are relevant to qcow2_update_header(); other fields
     * such as s->refcount_table or s->refcount_bits stay stale for now
     * (because we have to restore everything if qcow2_update_header() fails) */
    old_refcount_order  = s->refcount_order;
    old_reftable_size   = s->refcount_table_size;
    old_reftable_offset = s->refcount_table_offset;

    s->refcount_order        = refcount_order;
    s->refcount_table_size   = new_reftable_size;
    s->refcount_table_offset = new_reftable_offset;

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        /* Roll back the header-relevant fields so the image state stays
         * consistent with what is on disk */
        s->refcount_order        = old_refcount_order;
        s->refcount_table_size   = old_reftable_size;
        s->refcount_table_offset = old_reftable_offset;
        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
        goto done;
    }

    /* Now update the rest of the in-memory information */
    old_reftable = s->refcount_table;
    s->refcount_table = new_reftable;

    s->refcount_bits = 1 << refcount_order;
    /* Compute 2^refcount_bits - 1 in two steps; a single shift by
     * refcount_bits would be undefined for 64-bit refcounts */
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;

    s->get_refcount = new_get_refcount;
    s->set_refcount = new_set_refcount;

    /* For cleaning up all old refblocks and the old reftable below the "done"
     * label */
    new_reftable        = old_reftable;
    new_reftable_size   = old_reftable_size;
    new_reftable_offset = old_reftable_offset;

done:
    if (new_reftable) {
        /* On success, new_reftable actually points to the old reftable (and
         * new_reftable_size is the old reftable's size); but that is just
         * fine */
        for (i = 0; i < new_reftable_size; i++) {
            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
            if (offset) {
                qcow2_free_clusters(bs, offset, s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
            }
        }
        g_free(new_reftable);

        if (new_reftable_offset > 0) {
            qcow2_free_clusters(bs, new_reftable_offset,
                                new_reftable_size * sizeof(uint64_t),
                                QCOW2_DISCARD_OTHER);
        }
    }

    qemu_vfree(new_refblock);
    return ret;
}
2917