/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/range.h"
#include "qemu/bswap.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length, uint64_t addend,
                            bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);


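/*
 * A refcount entry is (1 << refcount_order) bits wide and stored big-endian;
 * the tables below map s->refcount_order (0..6) to the matching accessor, so
 * order 0 means 1-bit refcounts and order 6 means 64-bit refcounts.
 */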
static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};


/*********************************************************/
/* refcount handling */

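/*
 * Selects the refcount accessors matching the image's refcount order, reads
 * the refcount table from the image file and converts it to host byte order.
 * Returns 0 on success and -errno on failure.
 */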
int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file->bs, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
    }
    return 0;
 fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}


static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}
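
/* Example for the sub-byte packing above: with 1-bit refcounts, the entry for
 * index 11 lives in byte 11 / 8 = 1, at bit position 11 % 8 = 3. */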

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}

static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}


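/*
 * Loads the refcount block at @refcount_block_offset via the refcount block
 * cache; the caller must release it again with qcow2_cache_put().
 */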
static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}

/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);

    return 0;
}

/*
 * Rounds the refcount table size up to avoid growing the table for each single
 * refcount block that is allocated.
 */
static unsigned int next_refcount_table_size(BDRVQcow2State *s,
    unsigned int min_size)
{
    unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
    unsigned int refcount_table_clusters =
        MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));

    while (min_clusters > refcount_table_clusters) {
        refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
    }

    return refcount_table_clusters << (s->cluster_bits - 3);
}

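/* Example: with 64 kB clusters (cluster_bits = 16), one reftable cluster
 * holds 8192 entries, and each growth step multiplies the cluster count by
 * roughly 1.5, so the table is reallocated only O(log n) times. */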

/* Checks if two offsets are described by the same refcount block */
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
    uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}

/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. That something is at
     * least a cluster for the new refcount block. It may also include a new
     * refcount table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in such a way
     *   that they can describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount()
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref() and qcow2_free_clusters() may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail_block;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail_block;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail_block;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file->bs,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail_block;
        }

        s->refcount_table[refcount_table_index] = new_block;

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space, and we can't simply allocate that with
     * qcow2_alloc_clusters() without risking endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
        return -EFBIG;
    }

    /* And now we need at least one block more for the new metadata */
    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
    uint64_t last_table_size;
    uint64_t blocks_clusters;
    do {
        uint64_t table_clusters =
            size_to_clusters(s, table_size * sizeof(uint64_t));
        blocks_clusters = 1 +
            DIV_ROUND_UP(table_clusters, s->refcount_block_size);
        uint64_t meta_clusters = table_clusters + blocks_clusters;

        last_table_size = table_size;
        table_size = next_refcount_table_size(s, blocks_used +
            DIV_ROUND_UP(meta_clusters, s->refcount_block_size));

    } while (last_table_size != table_size);

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
        s->refcount_table_size, table_size);
#endif

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;
    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
    uint64_t *new_table = g_try_new0(uint64_t, table_size);
    void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);

    assert(table_size > 0 && blocks_clusters > 0);
    if (new_table == NULL || new_blocks == NULL) {
        ret = -ENOMEM;
        goto fail_table;
    }

    /* Fill the new refcount table */
    memcpy(new_table, s->refcount_table,
        s->refcount_table_size * sizeof(uint64_t));
    new_table[refcount_table_index] = new_block;

    int i;
    for (i = 0; i < blocks_clusters; i++) {
        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
    }

    /* Fill the refcount blocks */
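    /* (They describe the very area they occupy: meta_offset is aligned to
     * s->refcount_block_size clusters, so the first table_clusters +
     * blocks_clusters entries cover the new metadata itself.) */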
    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
    int block = 0;
    for (i = 0; i < table_clusters + blocks_clusters; i++) {
        s->set_refcount(new_blocks, block++, 1);
    }

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = bdrv_pwrite_sync(bs->file->bs, meta_offset, new_blocks,
        blocks_clusters * s->cluster_size);
    g_free(new_blocks);
    new_blocks = NULL;
    if (ret < 0) {
        goto fail_table;
    }

    /* Write refcount table to disk */
    for (i = 0; i < table_size; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file->bs, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail_table;
    }

    for (i = 0; i < table_size; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    cpu_to_be64w(&data.d64, table_offset);
    cpu_to_be32w(&data.d32, table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file->bs,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail_table;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail_table:
    g_free(new_blocks);
    g_free(new_table);
fail_block:
    if (*refcount_block != NULL) {
        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
    }
    return ret;
}

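/*
 * Passes all discard requests queued in s->discards on to the underlying
 * file (unless @ret indicates an earlier error) and empties the queue.
 */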
void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            bdrv_discard(bs->file->bs,
                         d->offset >> BDRV_SECTOR_BITS,
                         d->bytes >> BDRV_SECTOR_BITS);
        }

        g_free(d);
    }
}

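/*
 * Queues a discard request for the given range, extending an already queued
 * region if the new range is contiguous with it.
 */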
static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}

/* XXX: cache several refcount block clusters ? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                   int64_t offset,
                                                   int64_t length,
                                                   uint64_t addend,
                                                   bool decrease,
                                                   enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for (cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                     refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
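        /* Refuse updates that would wrap around below zero on decrease or
         * exceed the maximum representable refcount on increase */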
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0 && s->discard_passthrough[type]) {
            update_refcount_discard(bs, cluster_offset, s->cluster_size);
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (this may succeed in
     * some cases, like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}

/*
 * Increases or decreases the refcount of a given cluster.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * On success 0 is returned; on failure -errno is returned.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  uint64_t addend, bool decrease,
                                  enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          decrease, type);
    if (ret < 0) {
        return ret;
    }

    return 0;
}



/*********************************************************/
/* cluster allocation functions */



/*
 * Finds a run of contiguous free clusters of at least @size bytes and returns
 * its offset, without updating any refcounts. Returns the offset on success
 * or -errno on error.
 */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
retry:
    for (i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in an int64_t */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
    do {
        offset = alloc_clusters_noref(bs, size);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

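/*
 * Tries to allocate up to @nb_clusters contiguous clusters starting at
 * @offset. Returns the number of clusters that could actually be allocated
 * (which may be less if one of them is already in use), or -errno.
 */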
int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for (i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}

/* Only used to allocate compressed sectors; we try to allocate contiguous
 * sectors. @size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
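        /* If the remaining space in the current cluster is too small, grab a
         * new cluster; if it directly follows the current one, just extend
         * the free area instead of starting over. */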
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}

void qcow2_free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size,
                          enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, 1, true, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}

/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
                             int nb_clusters, enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;

    switch (qcow2_get_cluster_type(l2_entry)) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            int nb_csectors;
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            qcow2_free_clusters(bs,
                (l2_entry & s->cluster_offset_mask) & ~511,
                nb_csectors * 512, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO:
        if (l2_entry & L2E_OFFSET_MASK) {
            if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
                qcow2_signal_corruption(bs, false, -1, -1,
                                        "Cannot free unaligned cluster %#llx",
                                        l2_entry & L2E_OFFSET_MASK);
            } else {
                qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                    nb_clusters << s->cluster_bits, type);
            }
        }
        break;
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        abort();
    }
}



/*********************************************************/
/* snapshots and image creation */



/*
 * Updates the refcounts of all clusters referenced by the L1 table at
 * @l1_table_offset (adding or subtracting 1 as given by @addend, which must
 * be -1, 0 or 1) and recalculates the QCOW_OFLAG_COPIED flag of the L1 and
 * L2 entries.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_offset, old_l2_offset;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                (void**) &l2_table);
            if (ret < 0) {
                goto fail;
            }

            for (j = 0; j < s->l2_size; j++) {
                uint64_t cluster_index;

                offset = be64_to_cpu(l2_table[j]);
                old_offset = offset;
                offset &= ~QCOW_OFLAG_COPIED;

                switch (qcow2_get_cluster_type(offset)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0) {
                            ret = update_refcount(bs,
                                (offset & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO:
                        if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
                            qcow2_signal_corruption(bs, true, -1, -1, "Data "
                                                    "cluster offset %#llx "
                                                    "unaligned (L2 offset: %#"
                                                    PRIx64 ", L2 index: %#x)",
                                                    offset & L2E_OFFSET_MASK,
                                                    l2_offset, j);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
                        if (!cluster_index) {
                            /* unallocated */
                            refcount = 0;
                            break;
                        }
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(bs,
                                    cluster_index, abs(addend), addend < 0,
                                    QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                }

                if (refcount == 1) {
                    offset |= QCOW_OFLAG_COPIED;
                }
                if (offset != old_offset) {
                    if (addend > 0) {
                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                            s->refcount_block_cache);
                    }
                    l2_table[j] = cpu_to_be64(offset);
                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
                                                 l2_table);
                }
            }

            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_table) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file->bs, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated) {
        g_free(l1_table);
    }
    return ret;
}



/*********************************************************/
/* refcount checking functions */


static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
{
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
     * offsets have to be representable in bytes); due to every cluster
     * corresponding to one refcount entry, we are well below that limit */
    assert(entries < (UINT64_C(1) << (64 - 9)));

    /* Thanks to the assertion this will not overflow, because
     * s->refcount_order < 7.
     * (note: x << s->refcount_order == x * s->refcount_bits) */
    return DIV_ROUND_UP(entries << s->refcount_order, 8);
}
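
/* Example: 1000 entries with refcount_order = 4 (16-bit refcounts) take
 * DIV_ROUND_UP(1000 << 4, 8) = 2000 bytes. */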

/**
 * Reallocates *array so that it can hold new_size entries. *size must contain
 * the current number of entries in *array. If the reallocation fails, *array
 * and *size will not be modified and -errno will be returned. If the
 * reallocation is successful, *array will be set to the new buffer, *size
 * will be set to new_size and 0 will be returned. The size of the reallocated
 * refcount array buffer will be aligned to a cluster boundary, and the newly
 * allocated area will be zeroed.
 */
static int realloc_refcount_array(BDRVQcow2State *s, void **array,
                                  int64_t *size, int64_t new_size)
{
    int64_t old_byte_size, new_byte_size;
    void *new_ptr;

    /* Round to clusters so the array can be directly written to disk */
    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
                    * s->cluster_size;
    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
                    * s->cluster_size;

    if (new_byte_size == old_byte_size) {
        *size = new_size;
        return 0;
    }

    assert(new_byte_size > 0);

    if (new_byte_size > SIZE_MAX) {
        return -ENOMEM;
    }

    new_ptr = g_try_realloc(*array, new_byte_size);
    if (!new_ptr) {
        return -ENOMEM;
    }

    if (new_byte_size > old_byte_size) {
        memset((char *)new_ptr + old_byte_size, 0,
               new_byte_size - old_byte_size);
    }

    *array = new_ptr;
    *size  = new_size;

    return 0;
}

/*
 * Increases the refcount for a range of clusters in a given refcount table.
 * This is used to construct a temporary refcount table out of L1 and L2 tables
 * which can be compared to the refcount table saved in the image.
 *
 * Modifies the number of errors in res.
 */
static int inc_refcounts(BlockDriverState *bs,
                         BdrvCheckResult *res,
                         void **refcount_table,
                         int64_t *refcount_table_size,
                         int64_t offset, int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, last, cluster_offset, k, refcount;
    int ret;

    if (size <= 0) {
        return 0;
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + size - 1);
    for (cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size) {
        k = cluster_offset >> s->cluster_bits;
        if (k >= *refcount_table_size) {
            ret = realloc_refcount_array(s, refcount_table,
                                         refcount_table_size, k + 1);
            if (ret < 0) {
                res->check_errors++;
                return ret;
            }
        }

        refcount = s->get_refcount(*refcount_table, k);
        if (refcount == s->refcount_max) {
            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
                    "\n", cluster_offset);
            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
                    "width or qemu-img convert to create a clean copy if the "
                    "image cannot be opened for writing\n");
            res->corruptions++;
            continue;
        }
        s->set_refcount(*refcount_table, k, refcount + 1);
    }

    return 0;
}

1357 
1358 /* Flags for check_refcounts_l1() and check_refcounts_l2() */
1359 enum {
1360     CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1361 };
1362 
1363 /*
1364  * Increases the refcount in the given refcount table for the all clusters
1365  * referenced in the L2 table. While doing so, performs some checks on L2
1366  * entries.
1367  *
1368  * Returns the number of errors found by the checks or -errno if an internal
1369  * error occurred.
1370  */
1371 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1372                               void **refcount_table,
1373                               int64_t *refcount_table_size, int64_t l2_offset,
1374                               int flags)
1375 {
1376     BDRVQcow2State *s = bs->opaque;
1377     uint64_t *l2_table, l2_entry;
1378     uint64_t next_contiguous_offset = 0;
1379     int i, l2_size, nb_csectors, ret;
1380 
1381     /* Read L2 table from disk */
1382     l2_size = s->l2_size * sizeof(uint64_t);
1383     l2_table = g_malloc(l2_size);
1384 
1385     ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, l2_size);
1386     if (ret < 0) {
1387         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1388         res->check_errors++;
1389         goto fail;
1390     }
1391 
1392     /* Do the actual checks */
1393     for(i = 0; i < s->l2_size; i++) {
1394         l2_entry = be64_to_cpu(l2_table[i]);
1395 
1396         switch (qcow2_get_cluster_type(l2_entry)) {
1397         case QCOW2_CLUSTER_COMPRESSED:
1398             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1399             if (l2_entry & QCOW_OFLAG_COPIED) {
1400                 fprintf(stderr, "ERROR: cluster %" PRId64 ": "
1401                     "copied flag must never be set for compressed "
1402                     "clusters\n", l2_entry >> s->cluster_bits);
1403                 l2_entry &= ~QCOW_OFLAG_COPIED;
1404                 res->corruptions++;
1405             }
1406 
1407             /* Mark cluster as used */
1408             nb_csectors = ((l2_entry >> s->csize_shift) &
1409                            s->csize_mask) + 1;
1410             l2_entry &= s->cluster_offset_mask;
1411             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1412                                 l2_entry & ~511, nb_csectors * 512);
1413             if (ret < 0) {
1414                 goto fail;
1415             }
1416 
1417             if (flags & CHECK_FRAG_INFO) {
1418                 res->bfi.allocated_clusters++;
1419                 res->bfi.compressed_clusters++;
1420 
1421                 /* Compressed clusters are fragmented by nature.  Since they
1422                  * take up sub-sector space but we only have sector granularity
1423                  * I/O we need to re-read the same sectors even for adjacent
1424                  * compressed clusters.
1425                  */
1426                 res->bfi.fragmented_clusters++;
1427             }
1428             break;
1429 
1430         case QCOW2_CLUSTER_ZERO:
1431             if ((l2_entry & L2E_OFFSET_MASK) == 0) {
1432                 break;
1433             }
1434             /* fall through */
1435 
1436         case QCOW2_CLUSTER_NORMAL:
1437         {
1438             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1439 
1440             if (flags & CHECK_FRAG_INFO) {
1441                 res->bfi.allocated_clusters++;
1442                 if (next_contiguous_offset &&
1443                     offset != next_contiguous_offset) {
1444                     res->bfi.fragmented_clusters++;
1445                 }
1446                 next_contiguous_offset = offset + s->cluster_size;
1447             }
1448 
1449             /* Mark cluster as used */
1450             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1451                                 offset, s->cluster_size);
1452             if (ret < 0) {
1453                 goto fail;
1454             }
1455 
1456             /* Correct offsets are cluster aligned */
1457             if (offset_into_cluster(s, offset)) {
1458                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
1459                     "properly aligned; L2 entry corrupted.\n", offset);
1460                 res->corruptions++;
1461             }
1462             break;
1463         }
1464 
1465         case QCOW2_CLUSTER_UNALLOCATED:
1466             break;
1467 
1468         default:
1469             abort();
1470         }
1471     }
1472 
1473     g_free(l2_table);
1474     return 0;
1475 
1476 fail:
1477     g_free(l2_table);
1478     return ret;
1479 }
1480 
1481 /*
1482  * Increases the refcount for the L1 table, its L2 tables and all referenced
1483  * clusters in the given refcount table. While doing so, performs some checks
1484  * on L1 and L2 entries.
1485  *
1486  * Returns the number of errors found by the checks or -errno if an internal
1487  * error occurred.
1488  */
1489 static int check_refcounts_l1(BlockDriverState *bs,
1490                               BdrvCheckResult *res,
1491                               void **refcount_table,
1492                               int64_t *refcount_table_size,
1493                               int64_t l1_table_offset, int l1_size,
1494                               int flags)
1495 {
1496     BDRVQcow2State *s = bs->opaque;
1497     uint64_t *l1_table = NULL, l2_offset, l1_size2;
1498     int i, ret;
1499 
1500     l1_size2 = l1_size * sizeof(uint64_t);
1501 
1502     /* Mark L1 table as used */
1503     ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1504                         l1_table_offset, l1_size2);
1505     if (ret < 0) {
1506         goto fail;
1507     }
1508 
1509     /* Read L1 table entries from disk */
1510     if (l1_size2 > 0) {
1511         l1_table = g_try_malloc(l1_size2);
1512         if (l1_table == NULL) {
1513             ret = -ENOMEM;
1514             res->check_errors++;
1515             goto fail;
1516         }
1517         ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
1518         if (ret < 0) {
1519             fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1520             res->check_errors++;
1521             goto fail;
1522         }
1523         for (i = 0; i < l1_size; i++)
1524             be64_to_cpus(&l1_table[i]);
1525     }
1526 
1527     /* Do the actual checks */
1528     for (i = 0; i < l1_size; i++) {
1529         l2_offset = l1_table[i];
1530         if (l2_offset) {
1531             /* Mark L2 table as used */
1532             l2_offset &= L1E_OFFSET_MASK;
1533             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1534                                 l2_offset, s->cluster_size);
1535             if (ret < 0) {
1536                 goto fail;
1537             }
1538 
1539             /* L2 tables are cluster aligned */
1540             if (offset_into_cluster(s, l2_offset)) {
1541                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1542                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1543                 res->corruptions++;
1544             }
1545 
1546             /* Process and check L2 entries */
1547             ret = check_refcounts_l2(bs, res, refcount_table,
1548                                      refcount_table_size, l2_offset, flags);
1549             if (ret < 0) {
1550                 goto fail;
1551             }
1552         }
1553     }
1554     g_free(l1_table);
1555     return 0;
1556 
1557 fail:
1558     g_free(l1_table);
1559     return ret;
1560 }
1561 
1562 /*
1563  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1564  *
1565  * This function does not print an error message nor does it increment
1566  * check_errors if qcow2_get_refcount fails (this is because such an error
1567  * will already have been detected and sufficiently signaled by the calling
1568  * function (qcow2_check_refcounts) by the time this function is called).
1569  */
1570 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1571                               BdrvCheckMode fix)
1572 {
1573     BDRVQcow2State *s = bs->opaque;
1574     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1575     int ret;
1576     uint64_t refcount;
1577     int i, j;
1578 
1579     for (i = 0; i < s->l1_size; i++) {
1580         uint64_t l1_entry = s->l1_table[i];
1581         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1582         bool l2_dirty = false;
1583 
1584         if (!l2_offset) {
1585             continue;
1586         }
1587 
1588         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1589                                  &refcount);
1590         if (ret < 0) {
1591             /* don't print message nor increment check_errors */
1592             continue;
1593         }
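        /* OFLAG_COPIED must be set in an L1 entry if and only if the
         * refcount of the cluster holding its L2 table is exactly 1; the
         * test below is the negation of that equivalence, so it fires
         * whenever the flag and the refcount disagree. */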
1594         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1595             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1596                     "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1597                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1598                                             "ERROR",
1599                     i, l1_entry, refcount);
1600             if (fix & BDRV_FIX_ERRORS) {
1601                 s->l1_table[i] = refcount == 1
1602                                ? l1_entry |  QCOW_OFLAG_COPIED
1603                                : l1_entry & ~QCOW_OFLAG_COPIED;
1604                 ret = qcow2_write_l1_entry(bs, i);
1605                 if (ret < 0) {
1606                     res->check_errors++;
1607                     goto fail;
1608                 }
1609                 res->corruptions_fixed++;
1610             } else {
1611                 res->corruptions++;
1612             }
1613         }
1614 
1615         ret = bdrv_pread(bs->file->bs, l2_offset, l2_table,
1616                          s->l2_size * sizeof(uint64_t));
1617         if (ret < 0) {
1618             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1619                     strerror(-ret));
1620             res->check_errors++;
1621             goto fail;
1622         }
1623 
1624         for (j = 0; j < s->l2_size; j++) {
1625             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1626             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1627             int cluster_type = qcow2_get_cluster_type(l2_entry);
1628 
1629             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1630                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1631                 ret = qcow2_get_refcount(bs,
1632                                          data_offset >> s->cluster_bits,
1633                                          &refcount);
1634                 if (ret < 0) {
1635                     /* don't print message nor increment check_errors */
1636                     continue;
1637                 }
1638                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1639                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1640                             "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1641                             fix & BDRV_FIX_ERRORS ? "Repairing" :
1642                                                     "ERROR",
1643                             l2_entry, refcount);
1644                     if (fix & BDRV_FIX_ERRORS) {
1645                         l2_table[j] = cpu_to_be64(refcount == 1
1646                                     ? l2_entry |  QCOW_OFLAG_COPIED
1647                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1648                         l2_dirty = true;
1649                         res->corruptions_fixed++;
1650                     } else {
1651                         res->corruptions++;
1652                     }
1653                 }
1654             }
1655         }
1656 
1657         if (l2_dirty) {
1658             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1659                                                 l2_offset, s->cluster_size);
1660             if (ret < 0) {
1661                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1662                         "overlap check failed: %s\n", strerror(-ret));
1663                 res->check_errors++;
1664                 goto fail;
1665             }
1666 
1667             ret = bdrv_pwrite(bs->file->bs, l2_offset, l2_table,
1668                               s->cluster_size);
1669             if (ret < 0) {
1670                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1671                         strerror(-ret));
1672                 res->check_errors++;
1673                 goto fail;
1674             }
1675         }
1676     }
1677 
1678     ret = 0;
1679 
1680 fail:
1681     qemu_vfree(l2_table);
1682     return ret;
1683 }
1684 
1685 /*
1686  * Checks consistency of refblocks and accounts for each refblock in
1687  * *refcount_table.
1688  */
1689 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1690                            BdrvCheckMode fix, bool *rebuild,
1691                            void **refcount_table, int64_t *nb_clusters)
1692 {
1693     BDRVQcow2State *s = bs->opaque;
1694     int64_t i, size;
1695     int ret;
1696 
1697     for (i = 0; i < s->refcount_table_size; i++) {
1698         uint64_t offset, cluster;
1699         offset = s->refcount_table[i];
1700         cluster = offset >> s->cluster_bits;
1701 
1702         /* Refcount blocks are cluster aligned */
1703         if (offset_into_cluster(s, offset)) {
1704             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1705                 "cluster aligned; refcount table entry corrupted\n", i);
1706             res->corruptions++;
1707             *rebuild = true;
1708             continue;
1709         }
1710 
1711         if (cluster >= *nb_clusters) {
1712             fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1713                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1714 
1715             if (fix & BDRV_FIX_ERRORS) {
1716                 int64_t new_nb_clusters;
1717 
1718                 if (offset > INT64_MAX - s->cluster_size) {
1719                     ret = -EINVAL;
1720                     goto resize_fail;
1721                 }
1722 
1723                 ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size);
1724                 if (ret < 0) {
1725                     goto resize_fail;
1726                 }
1727                 size = bdrv_getlength(bs->file->bs);
1728                 if (size < 0) {
1729                     ret = size;
1730                     goto resize_fail;
1731                 }
1732 
1733                 new_nb_clusters = size_to_clusters(s, size);
1734                 assert(new_nb_clusters >= *nb_clusters);
1735 
1736                 ret = realloc_refcount_array(s, refcount_table,
1737                                              nb_clusters, new_nb_clusters);
1738                 if (ret < 0) {
1739                     res->check_errors++;
1740                     return ret;
1741                 }
1742 
1743                 if (cluster >= *nb_clusters) {
1744                     ret = -EINVAL;
1745                     goto resize_fail;
1746                 }
1747 
1748                 res->corruptions_fixed++;
1749                 ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1750                                     offset, s->cluster_size);
1751                 if (ret < 0) {
1752                     return ret;
1753                 }
1754                 /* No need to check whether the refcount is now greater than 1:
1755                  * This area was just allocated and zeroed, so it can only be
1756                  * exactly 1 after inc_refcounts() */
1757                 continue;
1758 
1759 resize_fail:
1760                 res->corruptions++;
1761                 *rebuild = true;
1762                 fprintf(stderr, "ERROR could not resize image: %s\n",
1763                         strerror(-ret));
1764             } else {
1765                 res->corruptions++;
1766             }
1767             continue;
1768         }
1769 
1770         if (offset != 0) {
1771             ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1772                                 offset, s->cluster_size);
1773             if (ret < 0) {
1774                 return ret;
1775             }
1776             if (s->get_refcount(*refcount_table, cluster) != 1) {
1777                 fprintf(stderr, "ERROR refcount block %" PRId64
1778                         " refcount=%" PRIu64 "\n", i,
1779                         s->get_refcount(*refcount_table, cluster));
1780                 res->corruptions++;
1781                 *rebuild = true;
1782             }
1783         }
1784     }
1785 
1786     return 0;
1787 }
1788 
1789 /*
1790  * Calculates an in-memory refcount table.
1791  */
1792 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1793                                BdrvCheckMode fix, bool *rebuild,
1794                                void **refcount_table, int64_t *nb_clusters)
1795 {
1796     BDRVQcow2State *s = bs->opaque;
1797     int64_t i;
1798     QCowSnapshot *sn;
1799     int ret;
1800 
1801     if (!*refcount_table) {
1802         int64_t old_size = 0;
1803         ret = realloc_refcount_array(s, refcount_table,
1804                                      &old_size, *nb_clusters);
1805         if (ret < 0) {
1806             res->check_errors++;
1807             return ret;
1808         }
1809     }
1810 
1811     /* header */
1812     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1813                         0, s->cluster_size);
1814     if (ret < 0) {
1815         return ret;
1816     }
1817 
1818     /* current L1 table */
1819     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1820                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1821     if (ret < 0) {
1822         return ret;
1823     }
1824 
1825     /* snapshots */
1826     for (i = 0; i < s->nb_snapshots; i++) {
1827         sn = s->snapshots + i;
1828         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1829                                  sn->l1_table_offset, sn->l1_size, 0);
1830         if (ret < 0) {
1831             return ret;
1832         }
1833     }
1834     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1835                         s->snapshots_offset, s->snapshots_size);
1836     if (ret < 0) {
1837         return ret;
1838     }
1839 
1840     /* refcount data */
1841     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1842                         s->refcount_table_offset,
1843                         s->refcount_table_size * sizeof(uint64_t));
1844     if (ret < 0) {
1845         return ret;
1846     }
1847 
1848     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
1849 }
1850 
1851 /*
1852  * Compares the actual reference count of each cluster against the refcount
1853  * reported by the on-disk refcount structures.
1854  */
1855 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1856                               BdrvCheckMode fix, bool *rebuild,
1857                               int64_t *highest_cluster,
1858                               void *refcount_table, int64_t nb_clusters)
1859 {
1860     BDRVQcow2State *s = bs->opaque;
1861     int64_t i;
1862     uint64_t refcount1, refcount2;
1863     int ret;
1864 
1865     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
1866         ret = qcow2_get_refcount(bs, i, &refcount1);
1867         if (ret < 0) {
1868             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1869                     i, strerror(-ret));
1870             res->check_errors++;
1871             continue;
1872         }
1873 
1874         refcount2 = s->get_refcount(refcount_table, i);
1875 
1876         if (refcount1 > 0 || refcount2 > 0) {
1877             *highest_cluster = i;
1878         }
1879 
1880         if (refcount1 != refcount2) {
1881             /* Check if we're allowed to fix the mismatch */
1882             int *num_fixed = NULL;
1883             if (refcount1 == 0) {
1884                 *rebuild = true;
1885             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1886                 num_fixed = &res->leaks_fixed;
1887             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1888                 num_fixed = &res->corruptions_fixed;
1889             }
1890 
1891             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
1892                     " reference=%" PRIu64 "\n",
1893                    num_fixed != NULL     ? "Repairing" :
1894                    refcount1 < refcount2 ? "ERROR" :
1895                                            "Leaked",
1896                    i, refcount1, refcount2);
1897 
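            /* A repair applies the absolute difference between the two
             * values to the on-disk structures: the refcount is increased if
             * the image undercounts (refcount1 < refcount2) and decreased if
             * it overcounts, i.e. leaks (refcount1 > refcount2). */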
1898             if (num_fixed) {
1899                 ret = update_refcount(bs, i << s->cluster_bits, 1,
1900                                       refcount_diff(refcount1, refcount2),
1901                                       refcount1 > refcount2,
1902                                       QCOW2_DISCARD_ALWAYS);
1903                 if (ret >= 0) {
1904                     (*num_fixed)++;
1905                     continue;
1906                 }
1907             }
1908 
1909             /* And if we couldn't, print an error */
1910             if (refcount1 < refcount2) {
1911                 res->corruptions++;
1912             } else {
1913                 res->leaks++;
1914             }
1915         }
1916     }
1917 }
1918 
1919 /*
1920  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
1921  * the on-disk refcount structures.
1922  *
1923  * On input, *first_free_cluster tells where to start looking, and need not
1924  * actually be a free cluster; the returned offset will not be before that
1925  * cluster.  On output, *first_free_cluster points to the first gap found, even
1926  * if that gap was too small to be used as the returned offset.
1927  *
1928  * Note that *first_free_cluster is a cluster index whereas the return value is
1929  * an offset.
1930  */
1931 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
1932                                    int cluster_count,
1933                                    void **refcount_table,
1934                                    int64_t *imrt_nb_clusters,
1935                                    int64_t *first_free_cluster)
1936 {
1937     BDRVQcow2State *s = bs->opaque;
1938     int64_t cluster = *first_free_cluster, i;
1939     bool first_gap = true;
1940     int contiguous_free_clusters;
1941     int ret;
1942 
1943     /* Starting at *first_free_cluster, find a range of at least cluster_count
1944      * contiguous free clusters */
1945     for (contiguous_free_clusters = 0;
1946          cluster < *imrt_nb_clusters &&
1947          contiguous_free_clusters < cluster_count;
1948          cluster++)
1949     {
1950         if (!s->get_refcount(*refcount_table, cluster)) {
1951             contiguous_free_clusters++;
1952             if (first_gap) {
1953                 /* If this is the first free cluster found, update
1954                  * *first_free_cluster accordingly */
1955                 *first_free_cluster = cluster;
1956                 first_gap = false;
1957             }
1958         } else if (contiguous_free_clusters) {
1959             contiguous_free_clusters = 0;
1960         }
1961     }
1962 
1963     /* If contiguous_free_clusters is greater than zero, it contains the number
1964      * of contiguous free clusters up to the current cluster; the first free
1965      * cluster in the current "gap" is therefore
1966      * cluster - contiguous_free_clusters */
1967 
1968     /* If no such range could be found, grow the in-memory refcount table
1969      * so that free clusters can be appended at the end of the image */
1970     if (contiguous_free_clusters < cluster_count) {
1971         /* contiguous_free_clusters clusters are already empty at the image end;
1972          * we need cluster_count clusters; therefore, we have to allocate
1973          * cluster_count - contiguous_free_clusters new clusters at the end of
1974          * the image (which is the current value of cluster; note that cluster
1975      * may exceed the old value of *imrt_nb_clusters if *first_free_cluster
1976      * pointed beyond the image end) */
1977         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
1978                                      cluster + cluster_count
1979                                      - contiguous_free_clusters);
1980         if (ret < 0) {
1981             return ret;
1982         }
1983     }
1984 
1985     /* Go back to the first free cluster */
1986     cluster -= contiguous_free_clusters;
1987     for (i = 0; i < cluster_count; i++) {
1988         s->set_refcount(*refcount_table, cluster + i, 1);
1989     }
1990 
1991     return cluster << s->cluster_bits;
1992 }
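
/* A minimal usage sketch (illustrative only; the surrounding variables are
 * hypothetical and error handling is elided):
 *
 *     int64_t first_free_cluster = 0;
 *     int64_t offset = alloc_clusters_imrt(bs, 1, &refcount_table,
 *                                          &nb_clusters,
 *                                          &first_free_cluster);
 *     if (offset >= 0) {
 *         ... cluster index (offset >> s->cluster_bits) now has an IMRT
 *             refcount of 1 ...
 *     }
 */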
1993 
1994 /*
1995  * Creates a new refcount structure based solely on the in-memory information
1996  * given through *refcount_table. All necessary allocations will be reflected
1997  * in that array.
1998  *
1999  * On success, the old refcount structure is leaked (it will be covered by the
2000  * new refcount structure).
2001  */
2002 static int rebuild_refcount_structure(BlockDriverState *bs,
2003                                       BdrvCheckResult *res,
2004                                       void **refcount_table,
2005                                       int64_t *nb_clusters)
2006 {
2007     BDRVQcow2State *s = bs->opaque;
2008     int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2009     int64_t refblock_offset, refblock_start, refblock_index;
2010     uint32_t reftable_size = 0;
2011     uint64_t *on_disk_reftable = NULL;
2012     void *on_disk_refblock;
2013     int ret = 0;
2014     struct {
2015         uint64_t reftable_offset;
2016         uint32_t reftable_clusters;
2017     } QEMU_PACKED reftable_offset_and_clusters;
2018 
2019     qcow2_cache_empty(bs, s->refcount_block_cache);
2020 
2021 write_refblocks:
2022     for (; cluster < *nb_clusters; cluster++) {
2023         if (!s->get_refcount(*refcount_table, cluster)) {
2024             continue;
2025         }
2026 
2027         refblock_index = cluster >> s->refcount_block_bits;
2028         refblock_start = refblock_index << s->refcount_block_bits;
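        /* For example (a sketch assuming the defaults of 64 KiB clusters and
         * 16-bit refcount entries): refcount_block_bits is 16 - (4 - 3) = 15,
         * so every refblock covers 32768 clusters and refblock_start is the
         * index of the first cluster covered by this refblock. */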
2029 
2030         /* Don't allocate a cluster in a refblock already written to disk */
2031         if (first_free_cluster < refblock_start) {
2032             first_free_cluster = refblock_start;
2033         }
2034         refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2035                                               nb_clusters, &first_free_cluster);
2036         if (refblock_offset < 0) {
2037             fprintf(stderr, "ERROR allocating refblock: %s\n",
2038                     strerror(-refblock_offset));
2039             res->check_errors++;
2040             ret = refblock_offset;
2041             goto fail;
2042         }
2043 
2044         if (reftable_size <= refblock_index) {
2045             uint32_t old_reftable_size = reftable_size;
2046             uint64_t *new_on_disk_reftable;
2047 
2048             reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2049                                      s->cluster_size) / sizeof(uint64_t);
2050             new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2051                                                  reftable_size *
2052                                                  sizeof(uint64_t));
2053             if (!new_on_disk_reftable) {
2054                 res->check_errors++;
2055                 ret = -ENOMEM;
2056                 goto fail;
2057             }
2058             on_disk_reftable = new_on_disk_reftable;
2059 
2060             memset(on_disk_reftable + old_reftable_size, 0,
2061                    (reftable_size - old_reftable_size) * sizeof(uint64_t));
2062 
2063             /* The offset we have for the reftable is now no longer valid;
2064              * this will leak that range, but we can easily fix that by running
2065              * a leak-fixing check after this rebuild operation */
2066             reftable_offset = -1;
2067         }
2068         on_disk_reftable[refblock_index] = refblock_offset;
2069 
2070         /* If this is apparently the last refblock (for now), try to squeeze the
2071          * reftable in */
2072         if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2073             reftable_offset < 0)
2074         {
2075             uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2076                                                           sizeof(uint64_t));
2077             reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2078                                                   refcount_table, nb_clusters,
2079                                                   &first_free_cluster);
2080             if (reftable_offset < 0) {
2081                 fprintf(stderr, "ERROR allocating reftable: %s\n",
2082                         strerror(-reftable_offset));
2083                 res->check_errors++;
2084                 ret = reftable_offset;
2085                 goto fail;
2086             }
2087         }
2088 
2089         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2090                                             s->cluster_size);
2091         if (ret < 0) {
2092             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2093             goto fail;
2094         }
2095 
2096         /* The size of *refcount_table is always cluster-aligned, therefore the
2097          * write operation will not overflow */
2098         on_disk_refblock = (void *)((char *) *refcount_table +
2099                                     refblock_index * s->cluster_size);
2100 
2101         ret = bdrv_write(bs->file->bs, refblock_offset / BDRV_SECTOR_SIZE,
2102                          on_disk_refblock, s->cluster_sectors);
2103         if (ret < 0) {
2104             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2105             goto fail;
2106         }
2107 
2108         /* Go to the end of this refblock */
2109         cluster = refblock_start + s->refcount_block_size - 1;
2110     }
2111 
2112     if (reftable_offset < 0) {
2113         uint64_t post_refblock_start, reftable_clusters;
2114 
2115         post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2116         reftable_clusters = size_to_clusters(s,
2117                                              reftable_size * sizeof(uint64_t));
2118         /* Not pretty but simple */
2119         if (first_free_cluster < post_refblock_start) {
2120             first_free_cluster = post_refblock_start;
2121         }
2122         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2123                                               refcount_table, nb_clusters,
2124                                               &first_free_cluster);
2125         if (reftable_offset < 0) {
2126             fprintf(stderr, "ERROR allocating reftable: %s\n",
2127                     strerror(-reftable_offset));
2128             res->check_errors++;
2129             ret = reftable_offset;
2130             goto fail;
2131         }
2132 
2133         goto write_refblocks;
2134     }
2135 
2136     assert(on_disk_reftable);
2137 
2138     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2139         cpu_to_be64s(&on_disk_reftable[refblock_index]);
2140     }
2141 
2142     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2143                                         reftable_size * sizeof(uint64_t));
2144     if (ret < 0) {
2145         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2146         goto fail;
2147     }
2148 
2149     assert(reftable_size < INT_MAX / sizeof(uint64_t));
2150     ret = bdrv_pwrite(bs->file->bs, reftable_offset, on_disk_reftable,
2151                       reftable_size * sizeof(uint64_t));
2152     if (ret < 0) {
2153         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2154         goto fail;
2155     }
2156 
2157     /* Enter new reftable into the image header */
2158     cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset,
2159                  reftable_offset);
2160     cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters,
2161                  size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2162     ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader,
2163                                                   refcount_table_offset),
2164                            &reftable_offset_and_clusters,
2165                            sizeof(reftable_offset_and_clusters));
2166     if (ret < 0) {
2167         fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2168         goto fail;
2169     }
2170 
2171     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2172         be64_to_cpus(&on_disk_reftable[refblock_index]);
2173     }
2174     s->refcount_table = on_disk_reftable;
2175     s->refcount_table_offset = reftable_offset;
2176     s->refcount_table_size = reftable_size;
2177 
2178     return 0;
2179 
2180 fail:
2181     g_free(on_disk_reftable);
2182     return ret;
2183 }
2184 
2185 /*
2186  * Checks an image for refcount consistency.
2187  *
2188  * Returns 0 if the check has been performed (any corruptions and leaks found
2189  * are reported through *res), and -errno when an internal error occurred.
2190  */
2191 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2192                           BdrvCheckMode fix)
2193 {
2194     BDRVQcow2State *s = bs->opaque;
2195     BdrvCheckResult pre_compare_res;
2196     int64_t size, highest_cluster, nb_clusters;
2197     void *refcount_table = NULL;
2198     bool rebuild = false;
2199     int ret;
2200 
2201     size = bdrv_getlength(bs->file->bs);
2202     if (size < 0) {
2203         res->check_errors++;
2204         return size;
2205     }
2206 
2207     nb_clusters = size_to_clusters(s, size);
2208     if (nb_clusters > INT_MAX) {
2209         res->check_errors++;
2210         return -EFBIG;
2211     }
2212 
2213     res->bfi.total_clusters =
2214         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2215 
2216     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2217                               &nb_clusters);
2218     if (ret < 0) {
2219         goto fail;
2220     }
2221 
2222     /* In case we don't need to rebuild the refcount structure (but want to fix
2223      * something), compare_refcounts() is called again further down, in which
2224      * case this result should be ignored */
2225     pre_compare_res = *res;
2226     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2227                       nb_clusters);
2228 
2229     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2230         BdrvCheckResult old_res = *res;
2231         int fresh_leaks = 0;
2232 
2233         fprintf(stderr, "Rebuilding refcount structure\n");
2234         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2235                                          &nb_clusters);
2236         if (ret < 0) {
2237             goto fail;
2238         }
2239 
2240         res->corruptions = 0;
2241         res->leaks = 0;
2242 
2243         /* Because the old reftable has been exchanged for a new one, the
2244          * references have to be recalculated */
2245         rebuild = false;
2246         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2247         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2248                                   &nb_clusters);
2249         if (ret < 0) {
2250             goto fail;
2251         }
2252 
2253         if (fix & BDRV_FIX_LEAKS) {
2254             /* The old refcount structures are now leaked; fix that. The result
2255              * can be ignored, apart from leaks that were introduced by
2256              * rebuild_refcount_structure() and could not be fixed */
2257             BdrvCheckResult saved_res = *res;
2258             *res = (BdrvCheckResult){ 0 };
2259 
2260             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2261                               &highest_cluster, refcount_table, nb_clusters);
2262             if (rebuild) {
2263                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2264                         "broken\n");
2265             }
2266 
2267             /* Any leaks accounted for here were introduced by
2268              * rebuild_refcount_structure() because that function has created a
2269              * new refcount structure from scratch */
2270             fresh_leaks = res->leaks;
2271             *res = saved_res;
2272         }
2273 
2274         if (res->corruptions < old_res.corruptions) {
2275             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2276         }
2277         if (res->leaks < old_res.leaks) {
2278             res->leaks_fixed += old_res.leaks - res->leaks;
2279         }
2280         res->leaks += fresh_leaks;
2281     } else if (fix) {
2282         if (rebuild) {
2283             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2284             res->check_errors++;
2285             ret = -EIO;
2286             goto fail;
2287         }
2288 
2289         if (res->leaks || res->corruptions) {
2290             *res = pre_compare_res;
2291             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2292                               refcount_table, nb_clusters);
2293         }
2294     }
2295 
2296     /* check OFLAG_COPIED */
2297     ret = check_oflag_copied(bs, res, fix);
2298     if (ret < 0) {
2299         goto fail;
2300     }
2301 
2302     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2303     ret = 0;
2304 
2305 fail:
2306     g_free(refcount_table);
2307 
2308     return ret;
2309 }
2310 
2311 #define overlaps_with(ofs, sz) \
2312     ranges_overlap(offset, size, ofs, sz)
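
/* Note that overlaps_with() expands in a context where the local variables
 * offset and size are in scope; ranges_overlap() (from qemu/range.h) returns
 * true iff the ranges [offset, offset + size) and [ofs, ofs + sz)
 * intersect. */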
2313 
2314 /*
2315  * Checks if the given offset into the image file is actually free to use by
2316  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2317  * i.e. a sanity check without relying on the refcount tables.
2318  *
2319  * The ign parameter specifies what checks not to perform (being a bitmask of
2320  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2321  *
2322  * Returns:
2323  * - 0 if writing to this offset will not affect the mentioned metadata
2324  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2325  * - a negative value (-errno) indicating an error while performing a check,
2326  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2327  */
2328 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2329                                  int64_t size)
2330 {
2331     BDRVQcow2State *s = bs->opaque;
2332     int chk = s->overlap_check & ~ign;
2333     int i, j;
2334 
2335     if (!size) {
2336         return 0;
2337     }
2338 
2339     if (chk & QCOW2_OL_MAIN_HEADER) {
2340         if (offset < s->cluster_size) {
2341             return QCOW2_OL_MAIN_HEADER;
2342         }
2343     }
2344 
2345     /* align range to test to cluster boundaries */
2346     size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
2347     offset = start_of_cluster(s, offset);
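    /* For example (a sketch assuming 64 KiB clusters): offset = 0x11200 and
     * size = 0x400 are widened to offset = 0x10000 and size = 0x10000, i.e.
     * exactly the one cluster the original range touches; a range straddling
     * a cluster boundary is widened to cover all touched clusters. */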
2348 
2349     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2350         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2351             return QCOW2_OL_ACTIVE_L1;
2352         }
2353     }
2354 
2355     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2356         if (overlaps_with(s->refcount_table_offset,
2357             s->refcount_table_size * sizeof(uint64_t))) {
2358             return QCOW2_OL_REFCOUNT_TABLE;
2359         }
2360     }
2361 
2362     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2363         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2364             return QCOW2_OL_SNAPSHOT_TABLE;
2365         }
2366     }
2367 
2368     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2369         for (i = 0; i < s->nb_snapshots; i++) {
2370             if (s->snapshots[i].l1_size &&
2371                 overlaps_with(s->snapshots[i].l1_table_offset,
2372                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
2373                 return QCOW2_OL_INACTIVE_L1;
2374             }
2375         }
2376     }
2377 
2378     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2379         for (i = 0; i < s->l1_size; i++) {
2380             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2381                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2382                 s->cluster_size)) {
2383                 return QCOW2_OL_ACTIVE_L2;
2384             }
2385         }
2386     }
2387 
2388     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2389         for (i = 0; i < s->refcount_table_size; i++) {
2390             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2391                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2392                 s->cluster_size)) {
2393                 return QCOW2_OL_REFCOUNT_BLOCK;
2394             }
2395         }
2396     }
2397 
2398     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2399         for (i = 0; i < s->nb_snapshots; i++) {
2400             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2401             uint32_t l1_sz  = s->snapshots[i].l1_size;
2402             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2403             uint64_t *l1 = g_try_malloc(l1_sz2);
2404             int ret;
2405 
2406             if (l1_sz2 && l1 == NULL) {
2407                 return -ENOMEM;
2408             }
2409 
2410             ret = bdrv_pread(bs->file->bs, l1_ofs, l1, l1_sz2);
2411             if (ret < 0) {
2412                 g_free(l1);
2413                 return ret;
2414             }
2415 
2416             for (j = 0; j < l1_sz; j++) {
2417                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2418                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2419                     g_free(l1);
2420                     return QCOW2_OL_INACTIVE_L2;
2421                 }
2422             }
2423 
2424             g_free(l1);
2425         }
2426     }
2427 
2428     return 0;
2429 }
2430 
2431 static const char *metadata_ol_names[] = {
2432     [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
2433     [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
2434     [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
2435     [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
2436     [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
2437     [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
2438     [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
2439     [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
2440 };
2441 
2442 /*
2443  * First performs a check for metadata overlaps (through
2444  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2445  * while performing a check), that value is returned. If an impending overlap
2446  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2447  * and -EIO returned.
2448  *
2449  * Returns 0 if there were neither overlaps nor errors while checking for
2450  * overlaps; or a negative value (-errno) on error.
2451  */
2452 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2453                                   int64_t size)
2454 {
2455     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2456 
2457     if (ret < 0) {
2458         return ret;
2459     } else if (ret > 0) {
2460         int metadata_ol_bitnr = ctz32(ret);
2461         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2462 
2463         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2464                                 "write on metadata (overlaps with %s)",
2465                                 metadata_ol_names[metadata_ol_bitnr]);
2466         return -EIO;
2467     }
2468 
2469     return 0;
2470 }
2471 
2472 /* A pointer to a function of this type is given to walk_over_reftable(). That
2473  * function will create refblocks and pass them to a RefblockFinishOp once they
2474  * are completed (@refblock). @refblock_empty is set if the refblock is
2475  * completely empty.
2476  *
2477  * Along with the refblock, a corresponding reftable entry is passed, in the
2478  * reftable @reftable (which may be reallocated) at @reftable_index.
2479  *
2480  * @allocated should be set to true if a new cluster has been allocated.
2481  */
2482 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2483                                uint64_t reftable_index, uint64_t *reftable_size,
2484                                void *refblock, bool refblock_empty,
2485                                bool *allocated, Error **errp);
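
/* A minimal sketch of such an operation (hypothetical, for illustration
 * only): it merely reports whether the finished refblock was empty, and
 * neither grows the reftable nor allocates anything.
 *
 *     static int report_refblock(BlockDriverState *bs, uint64_t **reftable,
 *                                uint64_t reftable_index,
 *                                uint64_t *reftable_size, void *refblock,
 *                                bool refblock_empty, bool *allocated,
 *                                Error **errp)
 *     {
 *         fprintf(stderr, "refblock %" PRIu64 ": empty=%d\n",
 *                 reftable_index, refblock_empty);
 *         return 0;
 *     }
 */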
2486 
2487 /**
2488  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2489  * it is not empty) and inserts its offset into the new reftable. The size of
2490  * this new reftable is increased as required.
2491  */
2492 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2493                           uint64_t reftable_index, uint64_t *reftable_size,
2494                           void *refblock, bool refblock_empty, bool *allocated,
2495                           Error **errp)
2496 {
2497     BDRVQcow2State *s = bs->opaque;
2498     int64_t offset;
2499 
2500     if (!refblock_empty && reftable_index >= *reftable_size) {
2501         uint64_t *new_reftable;
2502         uint64_t new_reftable_size;
2503 
2504         new_reftable_size = ROUND_UP(reftable_index + 1,
2505                                      s->cluster_size / sizeof(uint64_t));
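        /* E.g. (assuming 64 KiB clusters): one cluster holds 8192 reftable
         * entries, so reftable_index = 10000 rounds new_reftable_size up to
         * 16384; the new reftable thus always fills whole clusters. */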
2506         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2507             error_setg(errp,
2508                        "This operation would make the refcount table grow "
2509                        "beyond the maximum size supported by QEMU, aborting");
2510             return -ENOTSUP;
2511         }
2512 
2513         new_reftable = g_try_realloc(*reftable, new_reftable_size *
2514                                                 sizeof(uint64_t));
2515         if (!new_reftable) {
2516             error_setg(errp, "Failed to increase reftable buffer size");
2517             return -ENOMEM;
2518         }
2519 
2520         memset(new_reftable + *reftable_size, 0,
2521                (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2522 
2523         *reftable      = new_reftable;
2524         *reftable_size = new_reftable_size;
2525     }
2526 
2527     if (!refblock_empty && !(*reftable)[reftable_index]) {
2528         offset = qcow2_alloc_clusters(bs, s->cluster_size);
2529         if (offset < 0) {
2530             error_setg_errno(errp, -offset, "Failed to allocate refblock");
2531             return offset;
2532         }
2533         (*reftable)[reftable_index] = offset;
2534         *allocated = true;
2535     }
2536 
2537     return 0;
2538 }
2539 
2540 /**
2541  * This "operation" for walk_over_reftable() writes the refblock to disk at the
2542  * offset specified by the new reftable's entry. It does not modify the new
2543  * reftable or change any refcounts.
2544  */
2545 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2546                           uint64_t reftable_index, uint64_t *reftable_size,
2547                           void *refblock, bool refblock_empty, bool *allocated,
2548                           Error **errp)
2549 {
2550     BDRVQcow2State *s = bs->opaque;
2551     int64_t offset;
2552     int ret;
2553 
2554     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2555         offset = (*reftable)[reftable_index];
2556 
2557         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2558         if (ret < 0) {
2559             error_setg_errno(errp, -ret, "Overlap check failed");
2560             return ret;
2561         }
2562 
2563         ret = bdrv_pwrite(bs->file->bs, offset, refblock, s->cluster_size);
2564         if (ret < 0) {
2565             error_setg_errno(errp, -ret, "Failed to write refblock");
2566             return ret;
2567         }
2568     } else {
2569         assert(refblock_empty);
2570     }
2571 
2572     return 0;
2573 }
2574 
2575 /**
2576  * This function walks over the existing reftable and every referenced refblock;
2577  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2578  * create an equal new entry in the passed @new_refblock. Once that
2579  * @new_refblock is completely filled, @operation will be called.
2580  *
2581  * @status_cb and @cb_opaque are used for the amend operation's status callback.
2582  * @index is the index of the walk_over_reftable() calls and @total is the total
2583  * number of walk_over_reftable() calls per amend operation. Both are used for
2584  * calculating the parameters for the status callback.
2585  *
2586  * @allocated is set to true if a new cluster has been allocated.
2587  */
2588 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2589                               uint64_t *new_reftable_index,
2590                               uint64_t *new_reftable_size,
2591                               void *new_refblock, int new_refblock_size,
2592                               int new_refcount_bits,
2593                               RefblockFinishOp *operation, bool *allocated,
2594                               Qcow2SetRefcountFunc *new_set_refcount,
2595                               BlockDriverAmendStatusCB *status_cb,
2596                               void *cb_opaque, int index, int total,
2597                               Error **errp)
2598 {
2599     BDRVQcow2State *s = bs->opaque;
2600     uint64_t reftable_index;
2601     bool new_refblock_empty = true;
2602     int refblock_index;
2603     int new_refblock_index = 0;
2604     int ret;
2605 
2606     for (reftable_index = 0; reftable_index < s->refcount_table_size;
2607          reftable_index++)
2608     {
2609         uint64_t refblock_offset = s->refcount_table[reftable_index]
2610                                  & REFT_OFFSET_MASK;
2611 
2612         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2613                   (uint64_t)total * s->refcount_table_size, cb_opaque);
2614 
2615         if (refblock_offset) {
2616             void *refblock;
2617 
2618             if (offset_into_cluster(s, refblock_offset)) {
2619                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2620                                         PRIx64 " unaligned (reftable index: %#"
2621                                         PRIx64 ")", refblock_offset,
2622                                         reftable_index);
2623                 error_setg(errp,
2624                            "Image is corrupt (unaligned refblock offset)");
2625                 return -EIO;
2626             }
2627 
2628             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2629                                   &refblock);
2630             if (ret < 0) {
2631                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2632                 return ret;
2633             }
2634 
2635             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2636                  refblock_index++)
2637             {
2638                 uint64_t refcount;
2639 
2640                 if (new_refblock_index >= new_refblock_size) {
2641                     /* new_refblock is now complete */
2642                     ret = operation(bs, new_reftable, *new_reftable_index,
2643                                     new_reftable_size, new_refblock,
2644                                     new_refblock_empty, allocated, errp);
2645                     if (ret < 0) {
2646                         qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2647                         return ret;
2648                     }
2649 
2650                     (*new_reftable_index)++;
2651                     new_refblock_index = 0;
2652                     new_refblock_empty = true;
2653                 }
2654 
2655                 refcount = s->get_refcount(refblock, refblock_index);
2656                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2657                     uint64_t offset;
2658 
2659                     qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2660 
2661                     offset = ((reftable_index << s->refcount_block_bits)
2662                               + refblock_index) << s->cluster_bits;
2663 
2664                     error_setg(errp, "Cannot decrease refcount entry width to "
2665                                "%i bits: Cluster at offset %#" PRIx64 " has a "
2666                                "refcount of %" PRIu64, new_refcount_bits,
2667                                offset, refcount);
2668                     return -EINVAL;
2669                 }
2670 
2671                 if (new_set_refcount) {
2672                     new_set_refcount(new_refblock, new_refblock_index++,
2673                                      refcount);
2674                 } else {
2675                     new_refblock_index++;
2676                 }
2677                 new_refblock_empty = new_refblock_empty && refcount == 0;
2678             }
2679 
2680             qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2681         } else {
2682             /* No refblock means every refcount is 0 */
2683             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2684                  refblock_index++)
2685             {
2686                 if (new_refblock_index >= new_refblock_size) {
2687                     /* new_refblock is now complete */
2688                     ret = operation(bs, new_reftable, *new_reftable_index,
2689                                     new_reftable_size, new_refblock,
2690                                     new_refblock_empty, allocated, errp);
2691                     if (ret < 0) {
2692                         return ret;
2693                     }
2694 
2695                     (*new_reftable_index)++;
2696                     new_refblock_index = 0;
2697                     new_refblock_empty = true;
2698                 }
2699 
2700                 if (new_set_refcount) {
2701                     new_set_refcount(new_refblock, new_refblock_index++, 0);
2702                 } else {
2703                     new_refblock_index++;
2704                 }
2705             }
2706         }
2707     }
2708 
2709     if (new_refblock_index > 0) {
2710         /* Complete the potentially existing partially filled final refblock */
2711         if (new_set_refcount) {
2712             for (; new_refblock_index < new_refblock_size;
2713                  new_refblock_index++)
2714             {
2715                 new_set_refcount(new_refblock, new_refblock_index, 0);
2716             }
2717         }
2718 
2719         ret = operation(bs, new_reftable, *new_reftable_index,
2720                         new_reftable_size, new_refblock, new_refblock_empty,
2721                         allocated, errp);
2722         if (ret < 0) {
2723             return ret;
2724         }
2725 
2726         (*new_reftable_index)++;
2727     }
2728 
2729     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
2730               (uint64_t)total * s->refcount_table_size, cb_opaque);
2731 
2732     return 0;
2733 }
2734 
2735 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
2736                                 BlockDriverAmendStatusCB *status_cb,
2737                                 void *cb_opaque, Error **errp)
2738 {
2739     BDRVQcow2State *s = bs->opaque;
2740     Qcow2GetRefcountFunc *new_get_refcount;
2741     Qcow2SetRefcountFunc *new_set_refcount;
2742     void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
2743     uint64_t *new_reftable = NULL, new_reftable_size = 0;
2744     uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
2745     uint64_t new_reftable_index = 0;
2746     uint64_t i;
2747     int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
2748     int new_refblock_size, new_refcount_bits = 1 << refcount_order;
2749     int old_refcount_order;
2750     int walk_index = 0;
2751     int ret;
2752     bool new_allocation;
2753 
2754     assert(s->qcow_version >= 3);
2755     assert(refcount_order >= 0 && refcount_order <= 6);
2756 
2757     /* see qcow2_open() */
2758     new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
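    /* E.g. widening to 64-bit refcounts (refcount_order = 6) with 64 KiB
     * clusters yields 1 << (16 - 3) = 8192 refcount entries per refblock. */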
2759 
2760     new_get_refcount = get_refcount_funcs[refcount_order];
2761     new_set_refcount = set_refcount_funcs[refcount_order];
2762 
2763 
2764     do {
2765         int total_walks;
2766 
2767         new_allocation = false;
2768 
2769         /* We have to do at least this walk plus the one which writes the
2770          * refblocks; in addition, this loop normally has to run at least
2771          * twice, first to do the allocations, and a second time to verify
2772          * that everything has been allocated correctly, which makes three
2773          * walks in total */
2774         total_walks = MAX(walk_index + 2, 3);
2775 
2776         /* First, allocate the structures so they are present in the refcount
2777          * structures */
2778         ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2779                                  &new_reftable_size, NULL, new_refblock_size,
2780                                  new_refcount_bits, &alloc_refblock,
2781                                  &new_allocation, NULL, status_cb, cb_opaque,
2782                                  walk_index++, total_walks, errp);
2783         if (ret < 0) {
2784             goto done;
2785         }
2786 
2787         new_reftable_index = 0;
2788 
2789         if (new_allocation) {
2790             if (new_reftable_offset) {
2791                 qcow2_free_clusters(bs, new_reftable_offset,
2792                                     allocated_reftable_size * sizeof(uint64_t),
2793                                     QCOW2_DISCARD_NEVER);
2794             }
2795 
2796             new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
2797                                                            sizeof(uint64_t));
2798             if (new_reftable_offset < 0) {
2799                 error_setg_errno(errp, -new_reftable_offset,
2800                                  "Failed to allocate the new reftable");
2801                 ret = new_reftable_offset;
2802                 goto done;
2803             }
2804             allocated_reftable_size = new_reftable_size;
2805         }
2806     } while (new_allocation);
2807 
2808     /* Second, write the new refblocks */
2809     ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2810                              &new_reftable_size, new_refblock,
2811                              new_refblock_size, new_refcount_bits,
2812                              &flush_refblock, &new_allocation, new_set_refcount,
2813                              status_cb, cb_opaque, walk_index, walk_index + 1,
2814                              errp);
2815     if (ret < 0) {
2816         goto done;
2817     }
2818     assert(!new_allocation);
2819 
2820 
2821     /* Write the new reftable */
2822     ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
2823                                         new_reftable_size * sizeof(uint64_t));
2824     if (ret < 0) {
2825         error_setg_errno(errp, -ret, "Overlap check failed");
2826         goto done;
2827     }
2828 
2829     for (i = 0; i < new_reftable_size; i++) {
2830         cpu_to_be64s(&new_reftable[i]);
2831     }
2832 
2833     ret = bdrv_pwrite(bs->file->bs, new_reftable_offset, new_reftable,
2834                       new_reftable_size * sizeof(uint64_t));
2835 
2836     for (i = 0; i < new_reftable_size; i++) {
2837         be64_to_cpus(&new_reftable[i]);
2838     }
2839 
2840     if (ret < 0) {
2841         error_setg_errno(errp, -ret, "Failed to write the new reftable");
2842         goto done;
2843     }
2844 
2845 
2846     /* Empty the refcount cache */
2847     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
2848     if (ret < 0) {
2849         error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
2850         goto done;
2851     }
2852 
2853     /* Update the image header to point to the new reftable; this only updates
2854      * the fields which are relevant to qcow2_update_header(); other fields
2855      * such as s->refcount_table or s->refcount_bits stay stale for now
2856      * (because we have to restore everything if qcow2_update_header() fails) */
2857     old_refcount_order  = s->refcount_order;
2858     old_reftable_size   = s->refcount_table_size;
2859     old_reftable_offset = s->refcount_table_offset;
2860 
2861     s->refcount_order        = refcount_order;
2862     s->refcount_table_size   = new_reftable_size;
2863     s->refcount_table_offset = new_reftable_offset;
2864 
2865     ret = qcow2_update_header(bs);
2866     if (ret < 0) {
2867         s->refcount_order        = old_refcount_order;
2868         s->refcount_table_size   = old_reftable_size;
2869         s->refcount_table_offset = old_reftable_offset;
2870         error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
2871         goto done;
2872     }
2873 
2874     /* Now update the rest of the in-memory information */
2875     old_reftable = s->refcount_table;
2876     s->refcount_table = new_reftable;
2877 
2878     s->refcount_bits = 1 << refcount_order;
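    /* Compute 2^refcount_bits - 1 in two steps so that the shift amount
     * stays below 64 even for refcount_bits == 64 (where a plain
     * 1 << refcount_bits would be undefined behavior). */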
2879     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
2880     s->refcount_max += s->refcount_max - 1;
2881 
2882     s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
2883     s->refcount_block_size = 1 << s->refcount_block_bits;
2884 
2885     s->get_refcount = new_get_refcount;
2886     s->set_refcount = new_set_refcount;
2887 
2888     /* For cleaning up all old refblocks and the old reftable below the "done"
2889      * label */
2890     new_reftable        = old_reftable;
2891     new_reftable_size   = old_reftable_size;
2892     new_reftable_offset = old_reftable_offset;
2893 
2894 done:
2895     if (new_reftable) {
2896         /* On success, new_reftable actually points to the old reftable (and
2897          * new_reftable_size is the old reftable's size); but that is just
2898          * fine */
2899         for (i = 0; i < new_reftable_size; i++) {
2900             uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
2901             if (offset) {
2902                 qcow2_free_clusters(bs, offset, s->cluster_size,
2903                                     QCOW2_DISCARD_OTHER);
2904             }
2905         }
2906         g_free(new_reftable);
2907 
2908         if (new_reftable_offset > 0) {
2909             qcow2_free_clusters(bs, new_reftable_offset,
2910                                 new_reftable_size * sizeof(uint64_t),
2911                                 QCOW2_DISCARD_OTHER);
2912         }
2913     }
2914 
2915     qemu_vfree(new_refblock);
2916     return ret;
2917 }
2918