/* xref: /qemu/block/qcow2-refcount.c (revision 79854b95) */
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qcow2.h"
28 #include "qemu/range.h"
29 #include "qemu/bswap.h"
30 #include "qemu/cutils.h"
31 #include "qemu/memalign.h"
32 #include "trace.h"
33 
34 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
35                                     uint64_t max);
36 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
37                             int64_t offset, int64_t length, uint64_t addend,
38                             bool decrease, enum qcow2_discard_type type);
39 
40 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
41 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
42 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
43 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
44 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
45 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
46 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
47 
48 static void set_refcount_ro0(void *refcount_array, uint64_t index,
49                              uint64_t value);
50 static void set_refcount_ro1(void *refcount_array, uint64_t index,
51                              uint64_t value);
52 static void set_refcount_ro2(void *refcount_array, uint64_t index,
53                              uint64_t value);
54 static void set_refcount_ro3(void *refcount_array, uint64_t index,
55                              uint64_t value);
56 static void set_refcount_ro4(void *refcount_array, uint64_t index,
57                              uint64_t value);
58 static void set_refcount_ro5(void *refcount_array, uint64_t index,
59                              uint64_t value);
60 static void set_refcount_ro6(void *refcount_array, uint64_t index,
61                              uint64_t value);
62 
63 
/*
 * Accessor tables indexed by the image's refcount_order (0..6): entry N
 * handles refcounts that are (1 << N) bits wide (ro0 = 1 bit ... ro6 = 64
 * bits).  Selected once in qcow2_refcount_init().
 */
static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

/* Matching setters, same indexing scheme as get_refcount_funcs */
static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};
83 
84 
85 /*********************************************************/
86 /* refcount handling */
87 
88 static void update_max_refcount_table_index(BDRVQcow2State *s)
89 {
90     unsigned i = s->refcount_table_size - 1;
91     while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
92         i--;
93     }
94     /* Set s->max_refcount_table_index to the index of the last used entry */
95     s->max_refcount_table_index = i;
96 }
97 
98 int qcow2_refcount_init(BlockDriverState *bs)
99 {
100     BDRVQcow2State *s = bs->opaque;
101     unsigned int refcount_table_size2, i;
102     int ret;
103 
104     assert(s->refcount_order >= 0 && s->refcount_order <= 6);
105 
106     s->get_refcount = get_refcount_funcs[s->refcount_order];
107     s->set_refcount = set_refcount_funcs[s->refcount_order];
108 
109     assert(s->refcount_table_size <= INT_MAX / REFTABLE_ENTRY_SIZE);
110     refcount_table_size2 = s->refcount_table_size * REFTABLE_ENTRY_SIZE;
111     s->refcount_table = g_try_malloc(refcount_table_size2);
112 
113     if (s->refcount_table_size > 0) {
114         if (s->refcount_table == NULL) {
115             ret = -ENOMEM;
116             goto fail;
117         }
118         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
119         ret = bdrv_pread(bs->file, s->refcount_table_offset,
120                          s->refcount_table, refcount_table_size2);
121         if (ret < 0) {
122             goto fail;
123         }
124         for(i = 0; i < s->refcount_table_size; i++)
125             be64_to_cpus(&s->refcount_table[i]);
126         update_max_refcount_table_index(s);
127     }
128     return 0;
129  fail:
130     return ret;
131 }
132 
/* Free the in-memory refcount table allocated by qcow2_refcount_init(). */
void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}
138 
139 
/* Read a 1-bit refcount: 8 entries are packed into each byte. */
static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;

    return (bytes[index / 8] >> (index % 8)) & 0x1;
}
144 
/* Write a 1-bit refcount; @value must fit in one bit. */
static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;
    uint64_t byte = index / 8;
    unsigned shift = index % 8;

    assert(!(value >> 1));
    bytes[byte] = (bytes[byte] & ~(0x1 << shift)) | (value << shift);
}
152 
/* Read a 2-bit refcount: 4 entries are packed into each byte. */
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;
    unsigned shift = 2 * (index % 4);

    return (bytes[index / 4] >> shift) & 0x3;
}
158 
/* Write a 2-bit refcount; @value must fit in two bits. */
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;
    uint64_t byte = index / 4;
    unsigned shift = 2 * (index % 4);

    assert(!(value >> 2));
    bytes[byte] = (bytes[byte] & ~(0x3 << shift)) | (value << shift);
}
166 
/* Read a 4-bit refcount: 2 entries are packed into each byte. */
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;
    unsigned shift = 4 * (index % 2);

    return (bytes[index / 2] >> shift) & 0xf;
}
172 
/* Write a 4-bit refcount; @value must fit in one nibble. */
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;
    uint64_t byte = index / 2;
    unsigned shift = 4 * (index % 2);

    assert(!(value >> 4));
    bytes[byte] = (bytes[byte] & ~(0xf << shift)) | (value << shift);
}
180 
/* Read an 8-bit refcount: one entry per byte, no packing needed. */
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    const uint8_t *bytes = refcount_array;

    return bytes[index];
}
185 
/* Write an 8-bit refcount; @value must fit in one byte. */
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint8_t *bytes = refcount_array;

    assert(!(value >> 8));
    bytes[index] = value;
}
192 
/* Read a 16-bit refcount; entries are stored big-endian. */
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    const uint16_t *entries = refcount_array;

    return be16_to_cpu(entries[index]);
}
197 
/* Write a 16-bit refcount (big-endian); @value must fit in 16 bits. */
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint16_t *entries = refcount_array;

    assert(!(value >> 16));
    entries[index] = cpu_to_be16(value);
}
204 
/* Read a 32-bit refcount; entries are stored big-endian. */
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    const uint32_t *entries = refcount_array;

    return be32_to_cpu(entries[index]);
}
209 
/* Write a 32-bit refcount (big-endian); @value must fit in 32 bits. */
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint32_t *entries = refcount_array;

    assert(!(value >> 32));
    entries[index] = cpu_to_be32(value);
}
216 
/* Read a 64-bit refcount; entries are stored big-endian. */
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    const uint64_t *entries = refcount_array;

    return be64_to_cpu(entries[index]);
}
221 
/* Write a 64-bit refcount (big-endian); any uint64_t value fits. */
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    uint64_t *entries = refcount_array;

    entries[index] = cpu_to_be64(value);
}
227 
228 
/*
 * Fetch the refcount block at @refcount_block_offset through the refblock
 * cache; on success *refcount_block points at the cached block data.
 *
 * Returns the qcow2_cache_get() result (0 on success, -errno on failure).
 */
static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                           refcount_block);
}
239 
240 /*
241  * Retrieves the refcount of the cluster given by its index and stores it in
242  * *refcount. Returns 0 on success and -errno on failure.
243  */
244 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
245                        uint64_t *refcount)
246 {
247     BDRVQcow2State *s = bs->opaque;
248     uint64_t refcount_table_index, block_index;
249     int64_t refcount_block_offset;
250     int ret;
251     void *refcount_block;
252 
253     refcount_table_index = cluster_index >> s->refcount_block_bits;
254     if (refcount_table_index >= s->refcount_table_size) {
255         *refcount = 0;
256         return 0;
257     }
258     refcount_block_offset =
259         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
260     if (!refcount_block_offset) {
261         *refcount = 0;
262         return 0;
263     }
264 
265     if (offset_into_cluster(s, refcount_block_offset)) {
266         qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
267                                 " unaligned (reftable index: %#" PRIx64 ")",
268                                 refcount_block_offset, refcount_table_index);
269         return -EIO;
270     }
271 
272     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
273                           &refcount_block);
274     if (ret < 0) {
275         return ret;
276     }
277 
278     block_index = cluster_index & (s->refcount_block_size - 1);
279     *refcount = s->get_refcount(refcount_block, block_index);
280 
281     qcow2_cache_put(s->refcount_block_cache, &refcount_block);
282 
283     return 0;
284 }
285 
286 /* Checks if two offsets are described by the same refcount block */
287 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
288     uint64_t offset_b)
289 {
290     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
291     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
292 
293     return (block_a == block_b);
294 }
295 
/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case.
 * May also return -EAGAIN after successfully allocating metadata clusters:
 * the caller must then restart its search for free clusters, because the
 * new metadata may occupy clusters the caller intended to use.
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int64_t ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

             return load_refcount_block(bs, refcount_block_offset,
                                        refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
    if (new_block < 0) {
        return new_block;
    }

    /* The offset must fit in the offset field of the refcount table entry */
    assert((new_block & REFT_OFFSET_MASK) == new_block);

    /* If we're allocating the block at offset 0 then something is wrong */
    if (new_block == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of refcount block at offset 0");
        return -EIO;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
                               refcount_table_index * REFTABLE_ENTRY_SIZE,
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail;
        }

        s->refcount_table[refcount_table_index] = new_block;
        /* If there's a hole in s->refcount_table then it can happen
         * that refcount_table_index < s->max_refcount_table_index */
        s->max_refcount_table_index =
            MAX(s->max_refcount_table_index, refcount_table_index);

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;

    ret = qcow2_refcount_area(bs, meta_offset, 0, false,
                              refcount_table_index, new_block);
    if (ret < 0) {
        return ret;
    }

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail:
    /* *refcount_block is only non-NULL if a cache entry was acquired above */
    if (*refcount_block != NULL) {
        qcow2_cache_put(s->refcount_block_cache, refcount_block);
    }
    return ret;
}
512 
/*
 * Starting at @start_offset, this function creates new self-covering refcount
 * structures: A new refcount table and refcount blocks which cover all of
 * themselves, and a number of @additional_clusters beyond their end.
 * @start_offset must be at the end of the image file, that is, there must be
 * only empty space beyond it.
 * If @exact_size is false, the refcount table will have 50 % more entries than
 * necessary so it will not need to grow again soon.
 * If @new_refblock_offset is not zero, it contains the offset of a refcount
 * block that should be entered into the new refcount table at index
 * @new_refblock_index.
 *
 * Returns: The offset after the new refcount structures (i.e. where the
 *          @additional_clusters may be placed) on success, -errno on error.
 */
int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
                            uint64_t additional_clusters, bool exact_size,
                            int new_refblock_index,
                            uint64_t new_refblock_offset)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t total_refblock_count_u64, additional_refblock_count;
    int total_refblock_count, table_size, area_reftable_index, table_clusters;
    int i;
    uint64_t table_offset, block_offset, end_offset;
    int ret;
    uint64_t *new_table;

    assert(!(start_offset % s->cluster_size));

    /* How many refblocks are needed to cover the image up to and including
     * the new area (plus @additional_clusters beyond it)? */
    qcow2_refcount_metadata_size(start_offset / s->cluster_size +
                                 additional_clusters,
                                 s->cluster_size, s->refcount_order,
                                 !exact_size, &total_refblock_count_u64);
    if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }
    total_refblock_count = total_refblock_count_u64;

    /* Index in the refcount table of the first refcount block to cover the area
     * of refcount structures we are about to create; we know that
     * @total_refblock_count can cover @start_offset, so this will definitely
     * fit into an int. */
    area_reftable_index = (start_offset / s->cluster_size) /
                          s->refcount_block_size;

    if (exact_size) {
        table_size = total_refblock_count;
    } else {
        /* 50 % headroom so the reftable does not need to grow again soon */
        table_size = total_refblock_count +
                     DIV_ROUND_UP(total_refblock_count, 2);
    }
    /* The qcow2 file can only store the reftable size in number of clusters */
    table_size = ROUND_UP(table_size, s->cluster_size / REFTABLE_ENTRY_SIZE);
    table_clusters = (table_size * REFTABLE_ENTRY_SIZE) / s->cluster_size;

    if (table_size > QCOW_MAX_REFTABLE_SIZE) {
        return -EFBIG;
    }

    new_table = g_try_new0(uint64_t, table_size);

    assert(table_size > 0);
    if (new_table == NULL) {
        ret = -ENOMEM;
        goto fail;
    }

    /* Fill the new refcount table */
    if (table_size > s->max_refcount_table_index) {
        /* We're actually growing the reftable */
        memcpy(new_table, s->refcount_table,
               (s->max_refcount_table_index + 1) * REFTABLE_ENTRY_SIZE);
    } else {
        /* Improbable case: We're shrinking the reftable. However, the caller
         * has assured us that there is only empty space beyond @start_offset,
         * so we can simply drop all of the refblocks that won't fit into the
         * new reftable. */
        memcpy(new_table, s->refcount_table, table_size * REFTABLE_ENTRY_SIZE);
    }

    if (new_refblock_offset) {
        assert(new_refblock_index < total_refblock_count);
        new_table[new_refblock_index] = new_refblock_offset;
    }

    /* Count how many new refblocks we have to create */
    additional_refblock_count = 0;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        if (!new_table[i]) {
            additional_refblock_count++;
        }
    }

    /* Layout: new refblocks at @start_offset, then the new reftable */
    table_offset = start_offset + additional_refblock_count * s->cluster_size;
    end_offset = table_offset + table_clusters * s->cluster_size;

    /* Fill the refcount blocks, and create new ones, if necessary */
    block_offset = start_offset;
    for (i = area_reftable_index; i < total_refblock_count; i++) {
        void *refblock_data;
        uint64_t first_offset_covered;

        /* Reuse an existing refblock if possible, create a new one otherwise */
        if (new_table[i]) {
            ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
                                  &refblock_data);
            if (ret < 0) {
                goto fail;
            }
        } else {
            ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
                                        block_offset, &refblock_data);
            if (ret < 0) {
                goto fail;
            }
            memset(refblock_data, 0, s->cluster_size);
            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);

            new_table[i] = block_offset;
            block_offset += s->cluster_size;
        }

        /* First host offset covered by this refblock */
        first_offset_covered = (uint64_t)i * s->refcount_block_size *
                               s->cluster_size;
        if (first_offset_covered < end_offset) {
            int j, end_index;

            /* Set the refcount of all of the new refcount structures to 1 */

            if (first_offset_covered < start_offset) {
                assert(i == area_reftable_index);
                j = (start_offset - first_offset_covered) / s->cluster_size;
                assert(j < s->refcount_block_size);
            } else {
                j = 0;
            }

            end_index = MIN((end_offset - first_offset_covered) /
                            s->cluster_size,
                            s->refcount_block_size);

            for (; j < end_index; j++) {
                /* The caller guaranteed us this space would be empty */
                assert(s->get_refcount(refblock_data, j) == 0);
                s->set_refcount(refblock_data, j, 1);
            }

            qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
                                         refblock_data);
        }

        qcow2_cache_put(s->refcount_block_cache, &refblock_data);
    }

    assert(block_offset == table_offset);

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* Write refcount table to disk */
    for (i = 0; i < total_refblock_count; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
        table_size * REFTABLE_ENTRY_SIZE);
    if (ret < 0) {
        goto fail;
    }

    /* Convert the entries back for the in-memory copy */
    for (i = 0; i < total_refblock_count; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    data.d64 = cpu_to_be64(table_offset);
    data.d32 = cpu_to_be32(table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;
    update_max_refcount_table_index(s);

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset,
                        old_table_size * REFTABLE_ENTRY_SIZE,
                        QCOW2_DISCARD_OTHER);

    return end_offset;

fail:
    g_free(new_table);
    return ret;
}
731 
/*
 * Execute (or drop) all queued discard requests.
 *
 * If @ret < 0 (i.e. the operation that queued the discards failed), the
 * queued regions are freed without being discarded.  Discarding itself is
 * best-effort: per-region failures are only traced, never propagated.
 */
void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
            if (r2 < 0) {
                trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
                                                           r2);
            }
        }

        g_free(d);
    }
}
752 
/*
 * Queue the range [@offset, @offset + @length) for later discarding by
 * qcow2_process_discards().
 *
 * The new range is merged into an existing queued region when the two are
 * contiguous; afterwards, any other regions that have become adjacent to the
 * result are merged as well.  Overlaps are impossible here because every
 * queued range has just dropped to refcount 0 and can only be freed once
 * (enforced by the asserts below).
 */
static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        /* Merged span no larger than the sum => ranges touch or overlap */
        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    /* Not adjacent to anything queued yet: append a new region */
    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}
802 
/* XXX: cache several refcount block clusters ? */
/*
 * Adjusts the refcount of every cluster touched by the byte range
 * [@offset, @offset + @length).
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * @type selects which discard setting decides whether clusters whose
 * refcount drops to zero are passed through to the protocol layer.
 *
 * Returns 0 on success, -errno on failure; on failure an attempt is made
 * to roll back any refcount changes already applied.
 */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                   int64_t offset,
                                                   int64_t length,
                                                   uint64_t addend,
                                                   bool decrease,
                                                   enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;   /* forces a load on the first iteration */
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        /* A freed cluster may be reused; make sure the L2 tables that stop
         * referencing it reach the disk before the refcount update does */
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            /* If the caller needs to restart the search for free clusters,
             * try the same ones first to see if they're still free. */
            if (ret == -EAGAIN) {
                if (s->free_cluster_index > (start >> s->cluster_bits)) {
                    s->free_cluster_index = (start >> s->cluster_bits);
                }
            }
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
        /* Reject underflow on decrease, and overflow past refcount_max (or
         * plain integer wraparound) on increase */
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        /* Remember the lowest freed cluster for the next allocation scan */
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0) {
            void *table;

            /* The freed cluster may itself be a cached refcount block or L2
             * table; drop any stale cache entry so it is not written back.
             * NOTE(review): this passes @offset (start of the whole updated
             * range) rather than cluster_offset, so for multi-cluster frees
             * only the first cluster is matched — confirm this is intended. */
            table = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                                offset);
            if (table != NULL) {
                /* Release our reference before discarding the entry */
                qcow2_cache_put(s->refcount_block_cache, &refcount_block);
                old_table_index = -1;
                qcow2_cache_discard(s->refcount_block_cache, table);
            }

            table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
            if (table != NULL) {
                qcow2_cache_discard(s->l2_table_cache, table);
            }

            if (s->discard_passthrough[type]) {
                update_refcount_discard(bs, cluster_offset, s->cluster_size);
            }
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (This may succeed in
     * some cases like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        /* Only the clusters before cluster_offset were updated; revert by
         * applying the opposite addend to exactly that prefix */
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}
933 
934 /*
935  * Increases or decreases the refcount of a given cluster.
936  *
937  * @addend is the absolute value of the addend; if @decrease is set, @addend
938  * will be subtracted from the current refcount, otherwise it will be added.
939  *
940  * On success 0 is returned; on failure -errno is returned.
941  */
942 int qcow2_update_cluster_refcount(BlockDriverState *bs,
943                                   int64_t cluster_index,
944                                   uint64_t addend, bool decrease,
945                                   enum qcow2_discard_type type)
946 {
947     BDRVQcow2State *s = bs->opaque;
948     int ret;
949 
950     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
951                           decrease, type);
952     if (ret < 0) {
953         return ret;
954     }
955 
956     return 0;
957 }
958 
959 
960 
961 /*********************************************************/
962 /* cluster allocation functions */
963 
964 
965 
966 /* return < 0 if error */
967 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
968                                     uint64_t max)
969 {
970     BDRVQcow2State *s = bs->opaque;
971     uint64_t i, nb_clusters, refcount;
972     int ret;
973 
974     /* We can't allocate clusters if they may still be queued for discard. */
975     if (s->cache_discards) {
976         qcow2_process_discards(bs, 0);
977     }
978 
979     nb_clusters = size_to_clusters(s, size);
980 retry:
981     for(i = 0; i < nb_clusters; i++) {
982         uint64_t next_cluster_index = s->free_cluster_index++;
983         ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
984 
985         if (ret < 0) {
986             return ret;
987         } else if (refcount != 0) {
988             goto retry;
989         }
990     }
991 
992     /* Make sure that all offsets in the "allocated" range are representable
993      * in the requested max */
994     if (s->free_cluster_index > 0 &&
995         s->free_cluster_index - 1 > (max >> s->cluster_bits))
996     {
997         return -EFBIG;
998     }
999 
1000 #ifdef DEBUG_ALLOC2
1001     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
1002             size,
1003             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
1004 #endif
1005     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
1006 }
1007 
1008 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
1009 {
1010     int64_t offset;
1011     int ret;
1012 
1013     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
1014     do {
1015         offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
1016         if (offset < 0) {
1017             return offset;
1018         }
1019 
1020         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1021     } while (ret == -EAGAIN);
1022 
1023     if (ret < 0) {
1024         return ret;
1025     }
1026 
1027     return offset;
1028 }
1029 
/*
 * Tries to allocate up to @nb_clusters contiguous clusters at the fixed
 * position @offset, stopping early at the first cluster that is already in
 * use.  The refcounts of the allocated clusters are incremented.
 *
 * Returns the number of clusters actually allocated (which may be 0 or less
 * than @nb_clusters), or -errno on failure.
 */
int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    /* On -EAGAIN the refcounts must be re-checked from scratch, since the
     * clusters may have been taken by a refcount block allocation */
    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for(i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}
1066 
/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
/*
 * Sub-cluster byte allocator: hands out @size bytes, preferably from the
 * partially used cluster remembered in s->free_byte_offset so consecutive
 * compressed clusters end up contiguous.  Each allocation increments the
 * refcount of the cluster(s) it touches, so a cluster shared by several
 * compressed clusters has a correspondingly higher refcount.
 *
 * Returns the byte offset of the allocation, or -errno on failure.
 */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    /* free_byte_offset == 0 means "no partial cluster"; otherwise it must
     * point strictly inside a cluster */
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        /* If another allocation here would overflow the refcount, abandon
         * the partial cluster and start a fresh one */
        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster;

            /* Compressed cluster descriptors can only address
             * s->cluster_offset_mask, hence the tighter limit */
            new_cluster = alloc_clusters_noref(bs, s->cluster_size,
                                               MIN(s->cluster_offset_mask,
                                                   QCOW_MAX_CLUSTER_OFFSET));
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (new_cluster == 0) {
                qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                        "allocation of compressed cluster "
                                        "at offset 0");
                return -EIO;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                /* New cluster is not adjacent to the partial one: the
                 * allocation starts there instead */
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                /* New cluster directly follows the partial one, so the
                 * allocation may straddle the boundary */
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            /* Force a restart from a newly allocated cluster on -EAGAIN */
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    /* Remember the leftover space for the next call; a fully used cluster
     * resets the hint */
    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}
1142 
1143 void qcow2_free_clusters(BlockDriverState *bs,
1144                           int64_t offset, int64_t size,
1145                           enum qcow2_discard_type type)
1146 {
1147     int ret;
1148 
1149     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1150     ret = update_refcount(bs, offset, size, 1, true, type);
1151     if (ret < 0) {
1152         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1153         /* TODO Remember the clusters to free them later and avoid leaking */
1154     }
1155 }
1156 
1157 /*
1158  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1159  * normal cluster, compressed cluster, etc.)
1160  */
1161 void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
1162                             enum qcow2_discard_type type)
1163 {
1164     BDRVQcow2State *s = bs->opaque;
1165     QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry);
1166 
1167     if (has_data_file(bs)) {
1168         if (s->discard_passthrough[type] &&
1169             (ctype == QCOW2_CLUSTER_NORMAL ||
1170              ctype == QCOW2_CLUSTER_ZERO_ALLOC))
1171         {
1172             bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK,
1173                           s->cluster_size);
1174         }
1175         return;
1176     }
1177 
1178     switch (ctype) {
1179     case QCOW2_CLUSTER_COMPRESSED:
1180         {
1181             uint64_t coffset;
1182             int csize;
1183 
1184             qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
1185             qcow2_free_clusters(bs, coffset, csize, type);
1186         }
1187         break;
1188     case QCOW2_CLUSTER_NORMAL:
1189     case QCOW2_CLUSTER_ZERO_ALLOC:
1190         if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1191             qcow2_signal_corruption(bs, false, -1, -1,
1192                                     "Cannot free unaligned cluster %#llx",
1193                                     l2_entry & L2E_OFFSET_MASK);
1194         } else {
1195             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1196                                 s->cluster_size, type);
1197         }
1198         break;
1199     case QCOW2_CLUSTER_ZERO_PLAIN:
1200     case QCOW2_CLUSTER_UNALLOCATED:
1201         break;
1202     default:
1203         abort();
1204     }
1205 }
1206 
1207 int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
1208 {
1209     BDRVQcow2State *s = bs->opaque;
1210     int ret;
1211 
1212     ret = qcow2_cache_write(bs, s->l2_table_cache);
1213     if (ret < 0) {
1214         return ret;
1215     }
1216 
1217     if (qcow2_need_accurate_refcounts(s)) {
1218         ret = qcow2_cache_write(bs, s->refcount_block_cache);
1219         if (ret < 0) {
1220             return ret;
1221         }
1222     }
1223 
1224     return 0;
1225 }
1226 
1227 int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
1228 {
1229     int ret = qcow2_write_caches(bs);
1230     if (ret < 0) {
1231         return ret;
1232     }
1233 
1234     return bdrv_flush(bs->file->bs);
1235 }
1236 
1237 /*********************************************************/
1238 /* snapshots and image creation */
1239 
1240 
1241 
/* update the refcounts of snapshots and the copied flag */
/*
 * Walks the L1/L2 tables at @l1_table_offset and applies @addend (-1, 0 or
 * +1) to the refcount of every cluster they reference, updating the
 * QCOW_OFLAG_COPIED flags in L1/L2 entries to match the resulting refcounts
 * (COPIED is set exactly when the refcount is 1).  @addend == 0 only
 * recomputes the COPIED flags.
 *
 * Returns 0 on success, -errno on failure.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_entry, old_l2_offset;
    unsigned slice, slice_size2, n_slices;
    int i, j, l1_modified = 0;
    int ret;

    assert(addend >= -1 && addend <= 1);

    l2_slice = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * L1E_SIZE;
    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

    /* Batch all discards until the very end to avoid freeing clusters that
     * are still being walked */
    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        /* A snapshot's L1 table: read it from disk and byteswap it */
        l1_table = g_try_malloc0(l1_size2);
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        /* The active L1 table is already in memory (native byte order) */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            /* Process the L2 table one cached slice at a time */
            for (slice = 0; slice < n_slices; slice++) {
                ret = qcow2_cache_get(bs, s->l2_table_cache,
                                      l2_offset + slice * slice_size2,
                                      (void **) &l2_slice);
                if (ret < 0) {
                    goto fail;
                }

                for (j = 0; j < s->l2_slice_size; j++) {
                    uint64_t cluster_index;
                    uint64_t offset;

                    entry = get_l2_entry(s, l2_slice, j);
                    old_entry = entry;
                    /* COPIED is recomputed below from the new refcount */
                    entry &= ~QCOW_OFLAG_COPIED;
                    offset = entry & L2E_OFFSET_MASK;

                    switch (qcow2_get_cluster_type(bs, entry)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        if (addend != 0) {
                            uint64_t coffset;
                            int csize;

                            qcow2_parse_compressed_l2_entry(bs, entry,
                                                            &coffset, &csize);
                            ret = update_refcount(
                                bs, coffset, csize,
                                abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO_ALLOC:
                        if (offset_into_cluster(s, offset)) {
                            /* Here l2_index means table (not slice) index */
                            int l2_index = slice * s->l2_slice_size + j;
                            qcow2_signal_corruption(
                                bs, true, -1, -1, "Cluster "
                                "allocation offset %#" PRIx64
                                " unaligned (L2 offset: %#"
                                PRIx64 ", L2 index: %#x)",
                                offset, l2_offset, l2_index);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = offset >> s->cluster_bits;
                        assert(cluster_index);
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(
                                bs, cluster_index, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        /* Re-read the refcount to decide about COPIED */
                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_ZERO_PLAIN:
                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                    }

                    if (refcount == 1) {
                        entry |= QCOW_OFLAG_COPIED;
                    }
                    if (entry != old_entry) {
                        if (addend > 0) {
                            /* New references: refcounts must hit the disk
                             * before the L2 entries that rely on them */
                            qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                                       s->refcount_block_cache);
                        }
                        set_l2_entry(s, l2_slice, j, entry);
                        qcow2_cache_entry_mark_dirty(s->l2_table_cache,
                                                     l2_slice);
                    }
                }

                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            }

            /* Also adjust the refcount of the L2 table cluster itself */
            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_slice) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    /* Now it is safe to actually issue the batched discards */
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
                               l1_table, l1_size2);

        /* Restore native byte order in case the caller keeps using the
         * (possibly shared) in-memory table */
        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated)
        g_free(l1_table);
    return ret;
}
1447 
1448 
1449 
1450 
1451 /*********************************************************/
1452 /* refcount checking functions */
1453 
1454 
1455 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1456 {
1457     /* This assertion holds because there is no way we can address more than
1458      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1459      * offsets have to be representable in bytes); due to every cluster
1460      * corresponding to one refcount entry, we are well below that limit */
1461     assert(entries < (UINT64_C(1) << (64 - 9)));
1462 
1463     /* Thanks to the assertion this will not overflow, because
1464      * s->refcount_order < 7.
1465      * (note: x << s->refcount_order == x * s->refcount_bits) */
1466     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1467 }
1468 
1469 /**
1470  * Reallocates *array so that it can hold new_size entries. *size must contain
1471  * the current number of entries in *array. If the reallocation fails, *array
1472  * and *size will not be modified and -errno will be returned. If the
1473  * reallocation is successful, *array will be set to the new buffer, *size
1474  * will be set to new_size and 0 will be returned. The size of the reallocated
1475  * refcount array buffer will be aligned to a cluster boundary, and the newly
1476  * allocated area will be zeroed.
1477  */
1478 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1479                                   int64_t *size, int64_t new_size)
1480 {
1481     int64_t old_byte_size, new_byte_size;
1482     void *new_ptr;
1483 
1484     /* Round to clusters so the array can be directly written to disk */
1485     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1486                     * s->cluster_size;
1487     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1488                     * s->cluster_size;
1489 
1490     if (new_byte_size == old_byte_size) {
1491         *size = new_size;
1492         return 0;
1493     }
1494 
1495     assert(new_byte_size > 0);
1496 
1497     if (new_byte_size > SIZE_MAX) {
1498         return -ENOMEM;
1499     }
1500 
1501     new_ptr = g_try_realloc(*array, new_byte_size);
1502     if (!new_ptr) {
1503         return -ENOMEM;
1504     }
1505 
1506     if (new_byte_size > old_byte_size) {
1507         memset((char *)new_ptr + old_byte_size, 0,
1508                new_byte_size - old_byte_size);
1509     }
1510 
1511     *array = new_ptr;
1512     *size  = new_size;
1513 
1514     return 0;
1515 }
1516 
1517 /*
1518  * Increases the refcount for a range of clusters in a given refcount table.
1519  * This is used to construct a temporary refcount table out of L1 and L2 tables
1520  * which can be compared to the refcount table saved in the image.
1521  *
1522  * Modifies the number of errors in res.
1523  */
int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
                             void **refcount_table,
                             int64_t *refcount_table_size,
                             int64_t offset, int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, last, cluster_offset, k, refcount;
    int64_t file_len;
    int ret;

    if (size <= 0) {
        return 0;
    }

    file_len = bdrv_getlength(bs->file->bs);
    if (file_len < 0) {
        return file_len;
    }

    /*
     * Last cluster of qcow2 image may be semi-allocated, so it may be OK to
     * reference some space after file end but it should be less than one
     * cluster.
     */
    if (offset + size - file_len >= s->cluster_size) {
        fprintf(stderr, "ERROR: counting reference for region exceeding the "
                "end of the file by one cluster or more: offset 0x%" PRIx64
                " size 0x%" PRIx64 "\n", offset, size);
        res->corruptions++;
        return 0;
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + size - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size) {
        k = cluster_offset >> s->cluster_bits;
        /* Grow the in-memory refcount table on demand */
        if (k >= *refcount_table_size) {
            ret = realloc_refcount_array(s, refcount_table,
                                         refcount_table_size, k + 1);
            if (ret < 0) {
                res->check_errors++;
                return ret;
            }
        }

        refcount = s->get_refcount(*refcount_table, k);
        /* A saturated refcount is a corruption, not a hard error: report it
         * and keep checking the remaining clusters */
        if (refcount == s->refcount_max) {
            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
                    "\n", cluster_offset);
            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
                    "width or qemu-img convert to create a clean copy if the "
                    "image cannot be opened for writing\n");
            res->corruptions++;
            continue;
        }
        s->set_refcount(*refcount_table, k, refcount + 1);
    }

    return 0;
}
1585 
/* Flags for check_refcounts_l1() and check_refcounts_l2() (bitmask values) */
enum {
    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
};
1590 
1591 /*
1592  * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present
1593  * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN).
1594  *
1595  * This function decrements res->corruptions on success, so the caller is
1596  * responsible to increment res->corruptions prior to the call.
1597  *
1598  * On failure in-memory @l2_table may be modified.
1599  */
static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
                                uint64_t l2_offset,
                                uint64_t *l2_table, int l2_index, bool active,
                                bool *metadata_overlap)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;
    /* Index into l2_table in uint64_t units (entries may span two words
     * when the image has subclusters) */
    int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t));
    /* On-disk byte position of the entry being rewritten */
    uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s);
    /* Skip the overlap check against the table we are writing into */
    int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;

    if (has_subclusters(s)) {
        uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index);

        /* Allocated subclusters become zero */
        l2_bitmap |= l2_bitmap << 32;
        l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES;

        set_l2_bitmap(s, l2_table, l2_index, l2_bitmap);
        set_l2_entry(s, l2_table, l2_index, 0);
    } else {
        set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO);
    }

    /* Make sure the rewritten entry does not land on other metadata */
    ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s),
                                        false);
    if (metadata_overlap) {
        *metadata_overlap = ret < 0;
    }
    if (ret < 0) {
        fprintf(stderr, "ERROR: Overlap check failed\n");
        goto fail;
    }

    ret = bdrv_pwrite_sync(bs->file, l2e_offset, &l2_table[idx],
                           l2_entry_size(s));
    if (ret < 0) {
        fprintf(stderr, "ERROR: Failed to overwrite L2 "
                "table entry: %s\n", strerror(-ret));
        goto fail;
    }

    /* The corruption was counted by the caller; now it is fixed */
    res->corruptions--;
    res->corruptions_fixed++;
    return 0;

fail:
    res->check_errors++;
    return ret;
}
1650 
1651 /*
1652  * Increases the refcount in the given refcount table for the all clusters
1653  * referenced in the L2 table. While doing so, performs some checks on L2
1654  * entries.
1655  *
1656  * Returns the number of errors found by the checks or -errno if an internal
1657  * error occurred.
1658  */
static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size, int64_t l2_offset,
                              int flags, BdrvCheckMode fix, bool active)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t l2_entry, l2_bitmap;
    uint64_t next_contiguous_offset = 0;
    int i, ret;
    size_t l2_size_bytes = s->l2_size * l2_entry_size(s);
    g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes);
    /* Only valid after fix_l2_entry_by_zero() has been called below */
    bool metadata_overlap;

    /* Read L2 table from disk */
    ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size_bytes);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
        res->check_errors++;
        return ret;
    }

    /* Do the actual checks */
    for (i = 0; i < s->l2_size; i++) {
        uint64_t coffset;
        int csize;
        QCow2ClusterType type;

        l2_entry = get_l2_entry(s, l2_table, i);
        l2_bitmap = get_l2_bitmap(s, l2_table, i);
        type = qcow2_get_cluster_type(bs, l2_entry);

        if (type != QCOW2_CLUSTER_COMPRESSED) {
            /* Check reserved bits of Standard Cluster Descriptor */
            if (l2_entry & L2E_STD_RESERVED_MASK) {
                fprintf(stderr, "ERROR found l2 entry with reserved bits set: "
                        "%" PRIx64 "\n", l2_entry);
                res->corruptions++;
            }
        }

        switch (type) {
        case QCOW2_CLUSTER_COMPRESSED:
            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
            if (l2_entry & QCOW_OFLAG_COPIED) {
                fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
                    "copied flag must never be set for compressed "
                    "clusters\n", l2_entry & s->cluster_offset_mask);
                l2_entry &= ~QCOW_OFLAG_COPIED;
                res->corruptions++;
            }

            /* Compressed clusters cannot be stored in an external data file */
            if (has_data_file(bs)) {
                fprintf(stderr, "ERROR compressed cluster %d with data file, "
                        "entry=0x%" PRIx64 "\n", i, l2_entry);
                res->corruptions++;
                break;
            }

            /*
             * A compressed cluster has no subclusters, so its allocation
             * bitmap must be all zeroes.
             */
            if (l2_bitmap) {
                fprintf(stderr, "ERROR compressed cluster %d with non-zero "
                        "subcluster allocation bitmap, entry=0x%" PRIx64 "\n",
                        i, l2_entry);
                res->corruptions++;
                break;
            }

            /* Mark cluster as used */
            qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
            ret = qcow2_inc_refcounts_imrt(
                bs, res, refcount_table, refcount_table_size, coffset, csize);
            if (ret < 0) {
                return ret;
            }

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                res->bfi.compressed_clusters++;

                /*
                 * Compressed clusters are fragmented by nature.  Since they
                 * take up sub-sector space but we only have sector granularity
                 * I/O we need to re-read the same sectors even for adjacent
                 * compressed clusters.
                 */
                res->bfi.fragmented_clusters++;
            }
            break;

        case QCOW2_CLUSTER_ZERO_ALLOC:
        case QCOW2_CLUSTER_NORMAL:
        {
            uint64_t offset = l2_entry & L2E_OFFSET_MASK;

            /*
             * No subcluster may have both its "allocated" (low 32) and
             * "reads as zero" (high 32) bits set at the same time.
             */
            if ((l2_bitmap >> 32) & l2_bitmap) {
                res->corruptions++;
                fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated "
                        "cluster has corrupted subcluster allocation bitmap\n",
                        offset);
            }

            /* Correct offsets are cluster aligned */
            if (offset_into_cluster(s, offset)) {
                bool contains_data;
                res->corruptions++;

                /*
                 * A misaligned cluster that carries no guest data (only
                 * preallocated/zeroed) can be safely repaired by zeroing
                 * the L2 entry; one that contains data cannot.
                 */
                if (has_subclusters(s)) {
                    contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
                } else {
                    contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
                }

                if (!contains_data) {
                    fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated "
                            "cluster is not properly aligned; L2 entry "
                            "corrupted.\n",
                            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
                            offset);
                    if (fix & BDRV_FIX_ERRORS) {
                        ret = fix_l2_entry_by_zero(bs, res, l2_offset,
                                                   l2_table, i, active,
                                                   &metadata_overlap);
                        if (metadata_overlap) {
                            /*
                             * Something is seriously wrong, so abort checking
                             * this L2 table.
                             */
                            return ret;
                        }

                        if (ret == 0) {
                            /*
                             * Skip marking the cluster as used
                             * (it is unused now).
                             */
                            continue;
                        }

                        /*
                         * Failed to fix.
                         * Do not abort, continue checking the rest of this
                         * L2 table's entries.
                         */
                    }
                } else {
                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
                        "not properly aligned; L2 entry corrupted.\n", offset);
                }
            }

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                /* Not adjacent to the previous allocated cluster? */
                if (next_contiguous_offset &&
                    offset != next_contiguous_offset) {
                    res->bfi.fragmented_clusters++;
                }
                next_contiguous_offset = offset + s->cluster_size;
            }

            /* Mark cluster as used (data clusters in an external data file
             * are not refcounted in this image) */
            if (!has_data_file(bs)) {
                ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
                                               refcount_table_size,
                                               offset, s->cluster_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }

        case QCOW2_CLUSTER_ZERO_PLAIN:
            /* Impossible when image has subclusters */
            assert(!l2_bitmap);
            break;

        case QCOW2_CLUSTER_UNALLOCATED:
            if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
                res->corruptions++;
                fprintf(stderr, "ERROR: Unallocated "
                        "cluster has non-zero subcluster allocation map\n");
            }
            break;

        default:
            abort();
        }
    }

    return 0;
}
1849 
1850 /*
1851  * Increases the refcount for the L1 table, its L2 tables and all referenced
1852  * clusters in the given refcount table. While doing so, performs some checks
1853  * on L1 and L2 entries.
1854  *
1855  * Returns the number of errors found by the checks or -errno if an internal
1856  * error occurred.
1857  */
1858 static int check_refcounts_l1(BlockDriverState *bs,
1859                               BdrvCheckResult *res,
1860                               void **refcount_table,
1861                               int64_t *refcount_table_size,
1862                               int64_t l1_table_offset, int l1_size,
1863                               int flags, BdrvCheckMode fix, bool active)
1864 {
1865     BDRVQcow2State *s = bs->opaque;
1866     size_t l1_size_bytes = l1_size * L1E_SIZE;
1867     g_autofree uint64_t *l1_table = NULL;
1868     uint64_t l2_offset;
1869     int i, ret;
1870 
1871     if (!l1_size) {
1872         return 0;
1873     }
1874 
1875     /* Mark L1 table as used */
1876     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1877                                    l1_table_offset, l1_size_bytes);
1878     if (ret < 0) {
1879         return ret;
1880     }
1881 
1882     l1_table = g_try_malloc(l1_size_bytes);
1883     if (l1_table == NULL) {
1884         res->check_errors++;
1885         return -ENOMEM;
1886     }
1887 
1888     /* Read L1 table entries from disk */
1889     ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size_bytes);
1890     if (ret < 0) {
1891         fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1892         res->check_errors++;
1893         return ret;
1894     }
1895 
1896     for (i = 0; i < l1_size; i++) {
1897         be64_to_cpus(&l1_table[i]);
1898     }
1899 
1900     /* Do the actual checks */
1901     for (i = 0; i < l1_size; i++) {
1902         if (!l1_table[i]) {
1903             continue;
1904         }
1905 
1906         if (l1_table[i] & L1E_RESERVED_MASK) {
1907             fprintf(stderr, "ERROR found L1 entry with reserved bits set: "
1908                     "%" PRIx64 "\n", l1_table[i]);
1909             res->corruptions++;
1910         }
1911 
1912         l2_offset = l1_table[i] & L1E_OFFSET_MASK;
1913 
1914         /* Mark L2 table as used */
1915         ret = qcow2_inc_refcounts_imrt(bs, res,
1916                                        refcount_table, refcount_table_size,
1917                                        l2_offset, s->cluster_size);
1918         if (ret < 0) {
1919             return ret;
1920         }
1921 
1922         /* L2 tables are cluster aligned */
1923         if (offset_into_cluster(s, l2_offset)) {
1924             fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1925                 "cluster aligned; L1 entry corrupted\n", l2_offset);
1926             res->corruptions++;
1927         }
1928 
1929         /* Process and check L2 entries */
1930         ret = check_refcounts_l2(bs, res, refcount_table,
1931                                  refcount_table_size, l2_offset, flags,
1932                                  fix, active);
1933         if (ret < 0) {
1934             return ret;
1935         }
1936     }
1937 
1938     return 0;
1939 }
1940 
1941 /*
1942  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1943  *
1944  * This function does not print an error message nor does it increment
1945  * check_errors if qcow2_get_refcount fails (this is because such an error will
1946  * have been already detected and sufficiently signaled by the calling function
1947  * (qcow2_check_refcounts) by the time this function is called).
1948  */
static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
                              BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
    int ret;
    uint64_t refcount;
    int i, j;
    bool repair;

    if (fix & BDRV_FIX_ERRORS) {
        /* Always repair */
        repair = true;
    } else if (fix & BDRV_FIX_LEAKS) {
        /* Repair only if that seems safe: This function is always
         * called after the refcounts have been fixed, so the refcount
         * is accurate if that repair was successful */
        repair = !res->check_errors && !res->corruptions && !res->leaks;
    } else {
        repair = false;
    }

    /* Walk the active L1 table and every L2 table it references */
    for (i = 0; i < s->l1_size; i++) {
        uint64_t l1_entry = s->l1_table[i];
        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
        /* Number of L2 entries repaired in memory and pending write-back */
        int l2_dirty = 0;

        if (!l2_offset) {
            continue;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &refcount);
        if (ret < 0) {
            /* don't print message nor increment check_errors */
            continue;
        }
        /* OFLAG_COPIED must be set if and only if the refcount is 1 */
        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
            res->corruptions++;
            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
                    "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                    repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
            if (repair) {
                /* Flip the flag so that it matches the actual refcount */
                s->l1_table[i] = refcount == 1
                               ? l1_entry |  QCOW_OFLAG_COPIED
                               : l1_entry & ~QCOW_OFLAG_COPIED;
                ret = qcow2_write_l1_entry(bs, i);
                if (ret < 0) {
                    res->check_errors++;
                    goto fail;
                }
                res->corruptions--;
                res->corruptions_fixed++;
            }
        }

        /* Load the L2 table to check its data cluster entries as well */
        ret = bdrv_pread(bs->file, l2_offset, l2_table,
                         s->l2_size * l2_entry_size(s));
        if (ret < 0) {
            fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
                    strerror(-ret));
            res->check_errors++;
            goto fail;
        }

        for (j = 0; j < s->l2_size; j++) {
            uint64_t l2_entry = get_l2_entry(s, l2_table, j);
            uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
            QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry);

            /* Only allocated clusters carry a meaningful OFLAG_COPIED */
            if (cluster_type == QCOW2_CLUSTER_NORMAL ||
                cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
                if (has_data_file(bs)) {
                    /* External data file clusters always count as refcount 1 */
                    refcount = 1;
                } else {
                    ret = qcow2_get_refcount(bs,
                                             data_offset >> s->cluster_bits,
                                             &refcount);
                    if (ret < 0) {
                        /* don't print message nor increment check_errors */
                        continue;
                    }
                }
                /* Same invariant as above: COPIED <=> refcount == 1 */
                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
                    res->corruptions++;
                    fprintf(stderr, "%s OFLAG_COPIED data cluster: "
                            "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
                            repair ? "Repairing" : "ERROR", l2_entry, refcount);
                    if (repair) {
                        /* Fix in memory; written back once per L2 table */
                        set_l2_entry(s, l2_table, j,
                                     refcount == 1 ?
                                     l2_entry |  QCOW_OFLAG_COPIED :
                                     l2_entry & ~QCOW_OFLAG_COPIED);
                        l2_dirty++;
                    }
                }
            }
        }

        /* Write the repaired L2 table back to disk, if anything changed */
        if (l2_dirty > 0) {
            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
                                                l2_offset, s->cluster_size,
                                                false);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table; metadata "
                        "overlap check failed: %s\n", strerror(-ret));
                res->check_errors++;
                goto fail;
            }

            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
                              s->cluster_size);
            if (ret < 0) {
                fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
                        strerror(-ret));
                res->check_errors++;
                goto fail;
            }
            res->corruptions -= l2_dirty;
            res->corruptions_fixed += l2_dirty;
        }
    }

    ret = 0;

fail:
    qemu_vfree(l2_table);
    return ret;
}
2078 
2079 /*
2080  * Checks consistency of refblocks and accounts for each refblock in
2081  * *refcount_table.
2082  */
static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
                           BdrvCheckMode fix, bool *rebuild,
                           void **refcount_table, int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t i, size;
    int ret;

    for(i = 0; i < s->refcount_table_size; i++) {
        uint64_t offset, cluster;
        offset = s->refcount_table[i] & REFT_OFFSET_MASK;
        cluster = offset >> s->cluster_bits;

        /* Reserved bits in a reftable entry make it untrustworthy */
        if (s->refcount_table[i] & REFT_RESERVED_MASK) {
            fprintf(stderr, "ERROR refcount table entry %" PRId64 " has "
                    "reserved bits set\n", i);
            res->corruptions++;
            *rebuild = true;
            continue;
        }

        /* Refcount blocks are cluster aligned */
        if (offset_into_cluster(s, offset)) {
            fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
                "cluster aligned; refcount table entry corrupted\n", i);
            res->corruptions++;
            *rebuild = true;
            continue;
        }

        /* Refblock lies beyond the end of the image file */
        if (cluster >= *nb_clusters) {
            res->corruptions++;
            fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);

            if (fix & BDRV_FIX_ERRORS) {
                int64_t new_nb_clusters;
                Error *local_err = NULL;

                /* Guard the truncate size computation against overflow */
                if (offset > INT64_MAX - s->cluster_size) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                /* Grow the image file so that it covers the refblock */
                ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
                                    PREALLOC_MODE_OFF, 0, &local_err);
                if (ret < 0) {
                    error_report_err(local_err);
                    goto resize_fail;
                }
                size = bdrv_getlength(bs->file->bs);
                if (size < 0) {
                    ret = size;
                    goto resize_fail;
                }

                new_nb_clusters = size_to_clusters(s, size);
                assert(new_nb_clusters >= *nb_clusters);

                /* Extend the in-memory refcount table accordingly */
                ret = realloc_refcount_array(s, refcount_table,
                                             nb_clusters, new_nb_clusters);
                if (ret < 0) {
                    res->check_errors++;
                    return ret;
                }

                /* The refblock must now fit inside the (grown) image */
                if (cluster >= *nb_clusters) {
                    ret = -EINVAL;
                    goto resize_fail;
                }

                res->corruptions--;
                res->corruptions_fixed++;
                ret = qcow2_inc_refcounts_imrt(bs, res,
                                               refcount_table, nb_clusters,
                                               offset, s->cluster_size);
                if (ret < 0) {
                    return ret;
                }
                /* No need to check whether the refcount is now greater than 1:
                 * This area was just allocated and zeroed, so it can only be
                 * exactly 1 after qcow2_inc_refcounts_imrt() */
                continue;

resize_fail:
                /* Could not repair; a full refcount rebuild is required */
                *rebuild = true;
                fprintf(stderr, "ERROR could not resize image: %s\n",
                        strerror(-ret));
            }
            continue;
        }

        /* Account for the refblock itself; a refblock cluster must have
         * refcount exactly 1 */
        if (offset != 0) {
            ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
                                           offset, s->cluster_size);
            if (ret < 0) {
                return ret;
            }
            if (s->get_refcount(*refcount_table, cluster) != 1) {
                fprintf(stderr, "ERROR refcount block %" PRId64
                        " refcount=%" PRIu64 "\n", i,
                        s->get_refcount(*refcount_table, cluster));
                res->corruptions++;
                *rebuild = true;
            }
        }
    }

    return 0;
}
2193 
2194 /*
2195  * Calculates an in-memory refcount table.
2196  */
2197 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2198                                BdrvCheckMode fix, bool *rebuild,
2199                                void **refcount_table, int64_t *nb_clusters)
2200 {
2201     BDRVQcow2State *s = bs->opaque;
2202     int64_t i;
2203     QCowSnapshot *sn;
2204     int ret;
2205 
2206     if (!*refcount_table) {
2207         int64_t old_size = 0;
2208         ret = realloc_refcount_array(s, refcount_table,
2209                                      &old_size, *nb_clusters);
2210         if (ret < 0) {
2211             res->check_errors++;
2212             return ret;
2213         }
2214     }
2215 
2216     /* header */
2217     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2218                                    0, s->cluster_size);
2219     if (ret < 0) {
2220         return ret;
2221     }
2222 
2223     /* current L1 table */
2224     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2225                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2226                              fix, true);
2227     if (ret < 0) {
2228         return ret;
2229     }
2230 
2231     /* snapshots */
2232     if (has_data_file(bs) && s->nb_snapshots) {
2233         fprintf(stderr, "ERROR %d snapshots in image with data file\n",
2234                 s->nb_snapshots);
2235         res->corruptions++;
2236     }
2237 
2238     for (i = 0; i < s->nb_snapshots; i++) {
2239         sn = s->snapshots + i;
2240         if (offset_into_cluster(s, sn->l1_table_offset)) {
2241             fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2242                     "L1 table is not cluster aligned; snapshot table entry "
2243                     "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2244             res->corruptions++;
2245             continue;
2246         }
2247         if (sn->l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
2248             fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2249                     "L1 table is too large; snapshot table entry corrupted\n",
2250                     sn->id_str, sn->name, sn->l1_size);
2251             res->corruptions++;
2252             continue;
2253         }
2254         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2255                                  sn->l1_table_offset, sn->l1_size, 0, fix,
2256                                  false);
2257         if (ret < 0) {
2258             return ret;
2259         }
2260     }
2261     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2262                                    s->snapshots_offset, s->snapshots_size);
2263     if (ret < 0) {
2264         return ret;
2265     }
2266 
2267     /* refcount data */
2268     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2269                                    s->refcount_table_offset,
2270                                    s->refcount_table_size *
2271                                    REFTABLE_ENTRY_SIZE);
2272     if (ret < 0) {
2273         return ret;
2274     }
2275 
2276     /* encryption */
2277     if (s->crypto_header.length) {
2278         ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2279                                        s->crypto_header.offset,
2280                                        s->crypto_header.length);
2281         if (ret < 0) {
2282             return ret;
2283         }
2284     }
2285 
2286     /* bitmaps */
2287     ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2288     if (ret < 0) {
2289         return ret;
2290     }
2291 
2292     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2293 }
2294 
2295 /*
2296  * Compares the actual reference count for each cluster in the image against the
2297  * refcount as reported by the refcount structures on-disk.
2298  */
2299 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2300                               BdrvCheckMode fix, bool *rebuild,
2301                               int64_t *highest_cluster,
2302                               void *refcount_table, int64_t nb_clusters)
2303 {
2304     BDRVQcow2State *s = bs->opaque;
2305     int64_t i;
2306     uint64_t refcount1, refcount2;
2307     int ret;
2308 
2309     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2310         ret = qcow2_get_refcount(bs, i, &refcount1);
2311         if (ret < 0) {
2312             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2313                     i, strerror(-ret));
2314             res->check_errors++;
2315             continue;
2316         }
2317 
2318         refcount2 = s->get_refcount(refcount_table, i);
2319 
2320         if (refcount1 > 0 || refcount2 > 0) {
2321             *highest_cluster = i;
2322         }
2323 
2324         if (refcount1 != refcount2) {
2325             /* Check if we're allowed to fix the mismatch */
2326             int *num_fixed = NULL;
2327             if (refcount1 == 0) {
2328                 *rebuild = true;
2329             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2330                 num_fixed = &res->leaks_fixed;
2331             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2332                 num_fixed = &res->corruptions_fixed;
2333             }
2334 
2335             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2336                     " reference=%" PRIu64 "\n",
2337                    num_fixed != NULL     ? "Repairing" :
2338                    refcount1 < refcount2 ? "ERROR" :
2339                                            "Leaked",
2340                    i, refcount1, refcount2);
2341 
2342             if (num_fixed) {
2343                 ret = update_refcount(bs, i << s->cluster_bits, 1,
2344                                       refcount_diff(refcount1, refcount2),
2345                                       refcount1 > refcount2,
2346                                       QCOW2_DISCARD_ALWAYS);
2347                 if (ret >= 0) {
2348                     (*num_fixed)++;
2349                     continue;
2350                 }
2351             }
2352 
2353             /* And if we couldn't, print an error */
2354             if (refcount1 < refcount2) {
2355                 res->corruptions++;
2356             } else {
2357                 res->leaks++;
2358             }
2359         }
2360     }
2361 }
2362 
2363 /*
2364  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2365  * the on-disk refcount structures.
2366  *
2367  * On input, *first_free_cluster tells where to start looking, and need not
2368  * actually be a free cluster; the returned offset will not be before that
2369  * cluster.  On output, *first_free_cluster points to the first gap found, even
2370  * if that gap was too small to be used as the returned offset.
2371  *
2372  * Note that *first_free_cluster is a cluster index whereas the return value is
2373  * an offset.
2374  */
static int64_t alloc_clusters_imrt(BlockDriverState *bs,
                                   int cluster_count,
                                   void **refcount_table,
                                   int64_t *imrt_nb_clusters,
                                   int64_t *first_free_cluster)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t cluster = *first_free_cluster, i;
    /* True until the first free cluster has been reported via
     * *first_free_cluster */
    bool first_gap = true;
    int contiguous_free_clusters;
    int ret;

    /* Starting at *first_free_cluster, find a range of at least cluster_count
     * continuously free clusters */
    for (contiguous_free_clusters = 0;
         cluster < *imrt_nb_clusters &&
         contiguous_free_clusters < cluster_count;
         cluster++)
    {
        if (!s->get_refcount(*refcount_table, cluster)) {
            contiguous_free_clusters++;
            if (first_gap) {
                /* If this is the first free cluster found, update
                 * *first_free_cluster accordingly */
                *first_free_cluster = cluster;
                first_gap = false;
            }
        } else if (contiguous_free_clusters) {
            /* Run interrupted by an allocated cluster; start over */
            contiguous_free_clusters = 0;
        }
    }

    /* If contiguous_free_clusters is greater than zero, it contains the number
     * of continuously free clusters until the current cluster; the first free
     * cluster in the current "gap" is therefore
     * cluster - contiguous_free_clusters */

    /* If no such range could be found, grow the in-memory refcount table
     * accordingly to append free clusters at the end of the image */
    if (contiguous_free_clusters < cluster_count) {
        /* contiguous_free_clusters clusters are already empty at the image end;
         * we need cluster_count clusters; therefore, we have to allocate
         * cluster_count - contiguous_free_clusters new clusters at the end of
         * the image (which is the current value of cluster; note that cluster
         * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
         * the image end) */
        ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
                                     cluster + cluster_count
                                     - contiguous_free_clusters);
        if (ret < 0) {
            return ret;
        }
    }

    /* Go back to the first free cluster */
    cluster -= contiguous_free_clusters;
    /* Mark the newly allocated range as used in the IMRT */
    for (i = 0; i < cluster_count; i++) {
        s->set_refcount(*refcount_table, cluster + i, 1);
    }

    /* Return a byte offset, not a cluster index */
    return cluster << s->cluster_bits;
}
2437 
/*
 * Creates a new refcount structure based solely on the in-memory information
 * given through *refcount_table. All necessary allocations will be reflected
 * in that array.
 *
 * On success, the old refcount structure is leaked (it will be covered by the
 * new refcount structure).
 *
 * Returns 0 on success, -errno on error; on error, *res->check_errors is
 * incremented for problems detected here.
 */
static int rebuild_refcount_structure(BlockDriverState *bs,
                                      BdrvCheckResult *res,
                                      void **refcount_table,
                                      int64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    /* reftable_offset < 0 means "no valid on-disk reftable location yet" */
    int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
    int64_t refblock_offset, refblock_start, refblock_index;
    uint32_t reftable_size = 0;
    uint64_t *on_disk_reftable = NULL;
    void *on_disk_refblock;
    int ret = 0;
    /* Mirrors the refcount_table_offset/refcount_table_clusters fields of
     * the qcow2 header; written in one bdrv_pwrite_sync() below */
    struct {
        uint64_t reftable_offset;
        uint32_t reftable_clusters;
    } QEMU_PACKED reftable_offset_and_clusters;

    /* Drop any cached refblocks belonging to the old (broken) structure */
    qcow2_cache_empty(bs, s->refcount_block_cache);

write_refblocks:
    for (; cluster < *nb_clusters; cluster++) {
        /* Clusters with refcount 0 need no refblock coverage */
        if (!s->get_refcount(*refcount_table, cluster)) {
            continue;
        }

        refblock_index = cluster >> s->refcount_block_bits;
        refblock_start = refblock_index << s->refcount_block_bits;

        /* Don't allocate a cluster in a refblock already written to disk */
        if (first_free_cluster < refblock_start) {
            first_free_cluster = refblock_start;
        }
        refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
                                              nb_clusters, &first_free_cluster);
        if (refblock_offset < 0) {
            fprintf(stderr, "ERROR allocating refblock: %s\n",
                    strerror(-refblock_offset));
            res->check_errors++;
            ret = refblock_offset;
            goto fail;
        }

        if (reftable_size <= refblock_index) {
            uint32_t old_reftable_size = reftable_size;
            uint64_t *new_on_disk_reftable;

            /* Grow the in-memory reftable to a cluster-aligned entry count
             * so it can later be written out without padding logic */
            reftable_size = ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
                                     s->cluster_size) / REFTABLE_ENTRY_SIZE;
            new_on_disk_reftable = g_try_realloc(on_disk_reftable,
                                                 reftable_size *
                                                 REFTABLE_ENTRY_SIZE);
            if (!new_on_disk_reftable) {
                res->check_errors++;
                ret = -ENOMEM;
                goto fail;
            }
            on_disk_reftable = new_on_disk_reftable;

            /* Zero-fill the newly appended reftable entries */
            memset(on_disk_reftable + old_reftable_size, 0,
                   (reftable_size - old_reftable_size) * REFTABLE_ENTRY_SIZE);

            /* The offset we have for the reftable is now no longer valid;
             * this will leak that range, but we can easily fix that by running
             * a leak-fixing check after this rebuild operation */
            reftable_offset = -1;
        } else {
            assert(on_disk_reftable);
        }
        on_disk_reftable[refblock_index] = refblock_offset;

        /* If this is apparently the last refblock (for now), try to squeeze the
         * reftable in */
        if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
            reftable_offset < 0)
        {
            uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
                                                          REFTABLE_ENTRY_SIZE);
            reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                                  refcount_table, nb_clusters,
                                                  &first_free_cluster);
            if (reftable_offset < 0) {
                fprintf(stderr, "ERROR allocating reftable: %s\n",
                        strerror(-reftable_offset));
                res->check_errors++;
                ret = reftable_offset;
                goto fail;
            }
        }

        ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
                                            s->cluster_size, false);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* The size of *refcount_table is always cluster-aligned, therefore the
         * write operation will not overflow */
        on_disk_refblock = (void *)((char *) *refcount_table +
                                    refblock_index * s->cluster_size);

        ret = bdrv_pwrite(bs->file, refblock_offset, on_disk_refblock,
                          s->cluster_size);
        if (ret < 0) {
            fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
            goto fail;
        }

        /* Go to the end of this refblock */
        cluster = refblock_start + s->refcount_block_size - 1;
    }

    if (reftable_offset < 0) {
        /* No reftable location was picked inside the loop; allocate one
         * past all clusters already covered by written refblocks */
        uint64_t post_refblock_start, reftable_clusters;

        post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
        reftable_clusters =
            size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE);
        /* Not pretty but simple */
        if (first_free_cluster < post_refblock_start) {
            first_free_cluster = post_refblock_start;
        }
        reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
                                              refcount_table, nb_clusters,
                                              &first_free_cluster);
        if (reftable_offset < 0) {
            fprintf(stderr, "ERROR allocating reftable: %s\n",
                    strerror(-reftable_offset));
            res->check_errors++;
            ret = reftable_offset;
            goto fail;
        }

        /* The reftable allocation changed refcounts in clusters that may
         * not yet be covered by written refblocks; loop again to write
         * refblocks for them (cluster keeps its current value, so already
         * written refblocks are not rewritten) */
        goto write_refblocks;
    }

    /* Convert the reftable entries to big-endian for the on-disk write */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        cpu_to_be64s(&on_disk_reftable[refblock_index]);
    }

    ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
                                        reftable_size * REFTABLE_ENTRY_SIZE,
                                        false);
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    assert(reftable_size < INT_MAX / REFTABLE_ENTRY_SIZE);
    ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
                      reftable_size * REFTABLE_ENTRY_SIZE);
    if (ret < 0) {
        fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Enter new reftable into the image header */
    reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
    reftable_offset_and_clusters.reftable_clusters =
        cpu_to_be32(size_to_clusters(s, reftable_size * REFTABLE_ENTRY_SIZE));
    ret = bdrv_pwrite_sync(bs->file,
                           offsetof(QCowHeader, refcount_table_offset),
                           &reftable_offset_and_clusters,
                           sizeof(reftable_offset_and_clusters));
    if (ret < 0) {
        fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
        goto fail;
    }

    /* Convert back to host byte order; the array now becomes the live
     * in-memory reftable */
    for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
        be64_to_cpus(&on_disk_reftable[refblock_index]);
    }
    s->refcount_table = on_disk_reftable;
    s->refcount_table_offset = reftable_offset;
    s->refcount_table_size = reftable_size;
    update_max_refcount_table_index(s);

    return 0;

fail:
    g_free(on_disk_reftable);
    return ret;
}
2629 
/*
 * Checks an image for refcount consistency.
 *
 * Returns 0 if no errors are found, the number of errors in case the image is
 * detected as corrupted, and -errno when an internal error occurred.
 */
int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                          BdrvCheckMode fix)
{
    BDRVQcow2State *s = bs->opaque;
    BdrvCheckResult pre_compare_res;
    int64_t size, highest_cluster, nb_clusters;
    void *refcount_table = NULL;
    bool rebuild = false;
    int ret;

    size = bdrv_getlength(bs->file->bs);
    if (size < 0) {
        res->check_errors++;
        return size;
    }

    nb_clusters = size_to_clusters(s, size);
    if (nb_clusters > INT_MAX) {
        res->check_errors++;
        return -EFBIG;
    }

    res->bfi.total_clusters =
        size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);

    /* Build an in-memory refcount table describing what the refcounts
     * *should* be according to the image's metadata */
    ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
                              &nb_clusters);
    if (ret < 0) {
        goto fail;
    }

    /* In case we don't need to rebuild the refcount structure (but want to fix
     * something), this function is immediately called again, in which case the
     * result should be ignored */
    pre_compare_res = *res;
    compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
                      nb_clusters);

    if (rebuild && (fix & BDRV_FIX_ERRORS)) {
        BdrvCheckResult old_res = *res;
        int fresh_leaks = 0;

        fprintf(stderr, "Rebuilding refcount structure\n");
        ret = rebuild_refcount_structure(bs, res, &refcount_table,
                                         &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        res->corruptions = 0;
        res->leaks = 0;

        /* Because the old reftable has been exchanged for a new one the
         * references have to be recalculated */
        rebuild = false;
        memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
        ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
                                  &nb_clusters);
        if (ret < 0) {
            goto fail;
        }

        if (fix & BDRV_FIX_LEAKS) {
            /* The old refcount structures are now leaked, fix it; the result
             * can be ignored, aside from leaks which were introduced by
             * rebuild_refcount_structure() that could not be fixed */
            BdrvCheckResult saved_res = *res;
            *res = (BdrvCheckResult){ 0 };

            compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
                              &highest_cluster, refcount_table, nb_clusters);
            if (rebuild) {
                fprintf(stderr, "ERROR rebuilt refcount structure is still "
                        "broken\n");
            }

            /* Any leaks accounted for here were introduced by
             * rebuild_refcount_structure() because that function has created a
             * new refcount structure from scratch */
            fresh_leaks = res->leaks;
            *res = saved_res;
        }

        /* Account for everything the rebuild managed to repair */
        if (res->corruptions < old_res.corruptions) {
            res->corruptions_fixed += old_res.corruptions - res->corruptions;
        }
        if (res->leaks < old_res.leaks) {
            res->leaks_fixed += old_res.leaks - res->leaks;
        }
        res->leaks += fresh_leaks;
    } else if (fix) {
        if (rebuild) {
            /* A rebuild is required but BDRV_FIX_ERRORS was not given */
            fprintf(stderr, "ERROR need to rebuild refcount structures\n");
            res->check_errors++;
            ret = -EIO;
            goto fail;
        }

        if (res->leaks || res->corruptions) {
            /* Re-run the comparison, this time actually applying fixes;
             * restore the pre-comparison counters first so problems are
             * not counted twice */
            *res = pre_compare_res;
            compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
                              refcount_table, nb_clusters);
        }
    }

    /* check OFLAG_COPIED */
    ret = check_oflag_copied(bs, res, fix);
    if (ret < 0) {
        goto fail;
    }

    res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
    ret = 0;

fail:
    g_free(refcount_table);

    return ret;
}
2755 
/* Does [ofs, ofs + sz) intersect the range being tested? NOTE: relies on
 * the local variables 'offset' and 'size' of the enclosing function
 * (qcow2_check_metadata_overlap() below). */
#define overlaps_with(ofs, sz) \
    ranges_overlap(offset, size, ofs, sz)
2758 
/*
 * Checks if the given offset into the image file is actually free to use by
 * looking for overlaps with important metadata sections (L1/L2 tables etc.),
 * i.e. a sanity check without relying on the refcount tables.
 *
 * The ign parameter specifies what checks not to perform (being a bitmask of
 * QCow2MetadataOverlap values), i.e., what sections to ignore.
 *
 * Returns:
 * - 0 if writing to this offset will not affect the mentioned metadata
 * - a positive QCow2MetadataOverlap value indicating one overlapping section
 * - a negative value (-errno) indicating an error while performing a check,
 *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
 */
int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
                                 int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    /* Only perform the checks that are both enabled for this image and not
     * explicitly ignored by the caller */
    int chk = s->overlap_check & ~ign;
    int i, j;

    if (!size) {
        return 0;
    }

    /* Checked before cluster alignment below: the header only occupies the
     * start of the first cluster */
    if (chk & QCOW2_OL_MAIN_HEADER) {
        if (offset < s->cluster_size) {
            return QCOW2_OL_MAIN_HEADER;
        }
    }

    /* align range to test to cluster boundaries */
    size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
    offset = start_of_cluster(s, offset);

    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
        if (overlaps_with(s->l1_table_offset, s->l1_size * L1E_SIZE)) {
            return QCOW2_OL_ACTIVE_L1;
        }
    }

    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
        if (overlaps_with(s->refcount_table_offset,
            s->refcount_table_size * REFTABLE_ENTRY_SIZE)) {
            return QCOW2_OL_REFCOUNT_TABLE;
        }
    }

    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
            return QCOW2_OL_SNAPSHOT_TABLE;
        }
    }

    /* Each snapshot's L1 table is checked via its cached offset/size */
    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
        for (i = 0; i < s->nb_snapshots; i++) {
            if (s->snapshots[i].l1_size &&
                overlaps_with(s->snapshots[i].l1_table_offset,
                s->snapshots[i].l1_size * L1E_SIZE)) {
                return QCOW2_OL_INACTIVE_L1;
            }
        }
    }

    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
        for (i = 0; i < s->l1_size; i++) {
            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
                s->cluster_size)) {
                return QCOW2_OL_ACTIVE_L2;
            }
        }
    }

    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
        /* Entries past max_refcount_table_index are known to be zero, so
         * they need not be scanned */
        unsigned last_entry = s->max_refcount_table_index;
        assert(last_entry < s->refcount_table_size);
        assert(last_entry + 1 == s->refcount_table_size ||
               (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
        for (i = 0; i <= last_entry; i++) {
            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
                s->cluster_size)) {
                return QCOW2_OL_REFCOUNT_BLOCK;
            }
        }
    }

    /* The most expensive check: inactive L2 tables can only be found by
     * reading each snapshot's L1 table from disk */
    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
        for (i = 0; i < s->nb_snapshots; i++) {
            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
            uint32_t l1_sz  = s->snapshots[i].l1_size;
            uint64_t l1_sz2 = l1_sz * L1E_SIZE;
            uint64_t *l1;
            int ret;

            ret = qcow2_validate_table(bs, l1_ofs, l1_sz, L1E_SIZE,
                                       QCOW_MAX_L1_SIZE, "", NULL);
            if (ret < 0) {
                return ret;
            }

            l1 = g_try_malloc(l1_sz2);

            /* g_try_malloc(0) may legitimately return NULL */
            if (l1_sz2 && l1 == NULL) {
                return -ENOMEM;
            }

            ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
            if (ret < 0) {
                g_free(l1);
                return ret;
            }

            for (j = 0; j < l1_sz; j++) {
                /* On-disk L1 entries are big-endian */
                uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
                if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
                    g_free(l1);
                    return QCOW2_OL_INACTIVE_L2;
                }
            }

            g_free(l1);
        }
    }

    if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
        (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
    {
        if (overlaps_with(s->bitmap_directory_offset,
                          s->bitmap_directory_size))
        {
            return QCOW2_OL_BITMAP_DIRECTORY;
        }
    }

    return 0;
}
2897 
/* Human-readable names for the QCow2MetadataOverlap sections, indexed by bit
 * number; used by qcow2_pre_write_overlap_check() for its corruption
 * message */
static const char *metadata_ol_names[] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
    [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
    [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
    [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
    [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
    [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
    [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
};
/* Ensure every defined overlap bit has a name */
QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
2910 
2911 /*
2912  * First performs a check for metadata overlaps (through
2913  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2914  * while performing a check), that value is returned. If an impending overlap
2915  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2916  * and -EIO returned.
2917  *
2918  * Returns 0 if there were neither overlaps nor errors while checking for
2919  * overlaps; or a negative value (-errno) on error.
2920  */
2921 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2922                                   int64_t size, bool data_file)
2923 {
2924     int ret;
2925 
2926     if (data_file && has_data_file(bs)) {
2927         return 0;
2928     }
2929 
2930     ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2931     if (ret < 0) {
2932         return ret;
2933     } else if (ret > 0) {
2934         int metadata_ol_bitnr = ctz32(ret);
2935         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2936 
2937         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2938                                 "write on metadata (overlaps with %s)",
2939                                 metadata_ol_names[metadata_ol_bitnr]);
2940         return -EIO;
2941     }
2942 
2943     return 0;
2944 }
2945 
/* A pointer to a function of this type is given to walk_over_reftable(). That
 * function will create refblocks and pass them to a RefblockFinishOp once they
 * are completed (@refblock). @refblock_empty is set if the refblock is
 * completely empty.
 *
 * Along with the refblock, a corresponding reftable entry is passed, in the
 * reftable @reftable (which may be reallocated) at @reftable_index.
 *
 * @allocated should be set to true if a new cluster has been allocated.
 *
 * Returns 0 on success, or a negative errno value (setting @errp) on
 * failure. */
typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
                               uint64_t reftable_index, uint64_t *reftable_size,
                               void *refblock, bool refblock_empty,
                               bool *allocated, Error **errp);
2960 
2961 /**
2962  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2963  * it is not empty) and inserts its offset into the new reftable. The size of
2964  * this new reftable is increased as required.
2965  */
2966 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2967                           uint64_t reftable_index, uint64_t *reftable_size,
2968                           void *refblock, bool refblock_empty, bool *allocated,
2969                           Error **errp)
2970 {
2971     BDRVQcow2State *s = bs->opaque;
2972     int64_t offset;
2973 
2974     if (!refblock_empty && reftable_index >= *reftable_size) {
2975         uint64_t *new_reftable;
2976         uint64_t new_reftable_size;
2977 
2978         new_reftable_size = ROUND_UP(reftable_index + 1,
2979                                      s->cluster_size / REFTABLE_ENTRY_SIZE);
2980         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / REFTABLE_ENTRY_SIZE) {
2981             error_setg(errp,
2982                        "This operation would make the refcount table grow "
2983                        "beyond the maximum size supported by QEMU, aborting");
2984             return -ENOTSUP;
2985         }
2986 
2987         new_reftable = g_try_realloc(*reftable, new_reftable_size *
2988                                                 REFTABLE_ENTRY_SIZE);
2989         if (!new_reftable) {
2990             error_setg(errp, "Failed to increase reftable buffer size");
2991             return -ENOMEM;
2992         }
2993 
2994         memset(new_reftable + *reftable_size, 0,
2995                (new_reftable_size - *reftable_size) * REFTABLE_ENTRY_SIZE);
2996 
2997         *reftable      = new_reftable;
2998         *reftable_size = new_reftable_size;
2999     }
3000 
3001     if (!refblock_empty && !(*reftable)[reftable_index]) {
3002         offset = qcow2_alloc_clusters(bs, s->cluster_size);
3003         if (offset < 0) {
3004             error_setg_errno(errp, -offset, "Failed to allocate refblock");
3005             return offset;
3006         }
3007         (*reftable)[reftable_index] = offset;
3008         *allocated = true;
3009     }
3010 
3011     return 0;
3012 }
3013 
3014 /**
3015  * This "operation" for walk_over_reftable() writes the refblock to disk at the
3016  * offset specified by the new reftable's entry. It does not modify the new
3017  * reftable or change any refcounts.
3018  */
3019 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
3020                           uint64_t reftable_index, uint64_t *reftable_size,
3021                           void *refblock, bool refblock_empty, bool *allocated,
3022                           Error **errp)
3023 {
3024     BDRVQcow2State *s = bs->opaque;
3025     int64_t offset;
3026     int ret;
3027 
3028     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
3029         offset = (*reftable)[reftable_index];
3030 
3031         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size,
3032                                             false);
3033         if (ret < 0) {
3034             error_setg_errno(errp, -ret, "Overlap check failed");
3035             return ret;
3036         }
3037 
3038         ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
3039         if (ret < 0) {
3040             error_setg_errno(errp, -ret, "Failed to write refblock");
3041             return ret;
3042         }
3043     } else {
3044         assert(refblock_empty);
3045     }
3046 
3047     return 0;
3048 }
3049 
/**
 * This function walks over the existing reftable and every referenced refblock;
 * if @new_set_refcount is non-NULL, it is called for every refcount entry to
 * create an equal new entry in the passed @new_refblock. Once that
 * @new_refblock is completely filled, @operation will be called.
 *
 * @status_cb and @cb_opaque are used for the amend operation's status callback.
 * @index is the index of the walk_over_reftable() calls and @total is the total
 * number of walk_over_reftable() calls per amend operation. Both are used for
 * calculating the parameters for the status callback.
 *
 * @allocated is set to true if a new cluster has been allocated.
 *
 * Returns 0 on success, -errno (setting @errp) on failure.
 */
static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
                              uint64_t *new_reftable_index,
                              uint64_t *new_reftable_size,
                              void *new_refblock, int new_refblock_size,
                              int new_refcount_bits,
                              RefblockFinishOp *operation, bool *allocated,
                              Qcow2SetRefcountFunc *new_set_refcount,
                              BlockDriverAmendStatusCB *status_cb,
                              void *cb_opaque, int index, int total,
                              Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t reftable_index;
    bool new_refblock_empty = true;
    int refblock_index;
    int new_refblock_index = 0;
    int ret;

    for (reftable_index = 0; reftable_index < s->refcount_table_size;
         reftable_index++)
    {
        uint64_t refblock_offset = s->refcount_table[reftable_index]
                                 & REFT_OFFSET_MASK;

        /* Report progress: one unit per old reftable entry processed */
        status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
                  (uint64_t)total * s->refcount_table_size, cb_opaque);

        if (refblock_offset) {
            void *refblock;

            if (offset_into_cluster(s, refblock_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: %#"
                                        PRIx64 ")", refblock_offset,
                                        reftable_index);
                error_setg(errp,
                           "Image is corrupt (unaligned refblock offset)");
                return -EIO;
            }

            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
                                  &refblock);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "Failed to retrieve refblock");
                return ret;
            }

            /* Copy every old refcount into the new refblock, completing
             * (and handing off) new refblocks as they fill up */
            for (refblock_index = 0; refblock_index < s->refcount_block_size;
                 refblock_index++)
            {
                uint64_t refcount;

                if (new_refblock_index >= new_refblock_size) {
                    /* new_refblock is now complete */
                    ret = operation(bs, new_reftable, *new_reftable_index,
                                    new_reftable_size, new_refblock,
                                    new_refblock_empty, allocated, errp);
                    if (ret < 0) {
                        /* Release the cache reference before bailing out */
                        qcow2_cache_put(s->refcount_block_cache, &refblock);
                        return ret;
                    }

                    (*new_reftable_index)++;
                    new_refblock_index = 0;
                    new_refblock_empty = true;
                }

                refcount = s->get_refcount(refblock, refblock_index);
                /* Refuse to narrow the refcount width if any existing
                 * refcount would not fit into the new entry size */
                if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
                    uint64_t offset;

                    qcow2_cache_put(s->refcount_block_cache, &refblock);

                    offset = ((reftable_index << s->refcount_block_bits)
                              + refblock_index) << s->cluster_bits;

                    error_setg(errp, "Cannot decrease refcount entry width to "
                               "%i bits: Cluster at offset %#" PRIx64 " has a "
                               "refcount of %" PRIu64, new_refcount_bits,
                               offset, refcount);
                    return -EINVAL;
                }

                if (new_set_refcount) {
                    new_set_refcount(new_refblock, new_refblock_index++,
                                     refcount);
                } else {
                    new_refblock_index++;
                }
                new_refblock_empty = new_refblock_empty && refcount == 0;
            }

            qcow2_cache_put(s->refcount_block_cache, &refblock);
        } else {
            /* No refblock means every refcount is 0 */
            for (refblock_index = 0; refblock_index < s->refcount_block_size;
                 refblock_index++)
            {
                if (new_refblock_index >= new_refblock_size) {
                    /* new_refblock is now complete */
                    ret = operation(bs, new_reftable, *new_reftable_index,
                                    new_reftable_size, new_refblock,
                                    new_refblock_empty, allocated, errp);
                    if (ret < 0) {
                        return ret;
                    }

                    (*new_reftable_index)++;
                    new_refblock_index = 0;
                    new_refblock_empty = true;
                }

                if (new_set_refcount) {
                    new_set_refcount(new_refblock, new_refblock_index++, 0);
                } else {
                    new_refblock_index++;
                }
            }
        }
    }

    if (new_refblock_index > 0) {
        /* Complete the potentially existing partially filled final refblock */
        if (new_set_refcount) {
            /* Pad the remainder of the final refblock with zero refcounts */
            for (; new_refblock_index < new_refblock_size;
                 new_refblock_index++)
            {
                new_set_refcount(new_refblock, new_refblock_index, 0);
            }
        }

        ret = operation(bs, new_reftable, *new_reftable_index,
                        new_reftable_size, new_refblock, new_refblock_empty,
                        allocated, errp);
        if (ret < 0) {
            return ret;
        }

        (*new_reftable_index)++;
    }

    /* Final progress update: this walk is fully done */
    status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
              (uint64_t)total * s->refcount_table_size, cb_opaque);

    return 0;
}
3209 
/*
 * Convert the image to a refcount width of 2^refcount_order bits per entry
 * by building an entirely new set of refcount structures (reftable plus
 * refblocks) and then switching the image header and the in-memory state
 * over to them.  The old structures are freed on success; on failure the
 * new (partially built) structures are freed and the image keeps its old
 * refcount width.
 *
 * @refcount_order: desired refcount order, 0..6 (i.e. 1 to 64 bits per
 *                  refcount); only valid for v3 images
 * @status_cb:      progress callback, invoked from the reftable walks
 * @cb_opaque:      opaque value passed through to @status_cb
 *
 * Returns 0 on success, -errno on failure (with @errp set).
 */
int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                BlockDriverAmendStatusCB *status_cb,
                                void *cb_opaque, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2GetRefcountFunc *new_get_refcount;
    Qcow2SetRefcountFunc *new_set_refcount;
    void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
    uint64_t *new_reftable = NULL, new_reftable_size = 0;
    uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
    uint64_t new_reftable_index = 0;
    uint64_t i;
    int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
    int new_refblock_size, new_refcount_bits = 1 << refcount_order;
    int old_refcount_order;
    int walk_index = 0;
    int ret;
    bool new_allocation;

    assert(s->qcow_version >= 3);
    assert(refcount_order >= 0 && refcount_order <= 6);

    /* see qcow2_open() */
    new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));

    new_get_refcount = get_refcount_funcs[refcount_order];
    new_set_refcount = set_refcount_funcs[refcount_order];


    /* Allocating the new refblocks (and the new reftable) consumes clusters,
     * which in turn changes what the new refcount structures must cover;
     * therefore, repeat the allocation walk until one pass completes without
     * allocating anything new. */
    do {
        int total_walks;

        new_allocation = false;

        /* At least we have to do this walk and the one which writes the
         * refblocks; also, at least we have to do this loop here at least
         * twice (normally), first to do the allocations, and second to
         * determine that everything is correctly allocated, this then makes
         * three walks in total */
        total_walks = MAX(walk_index + 2, 3);

        /* First, allocate the structures so they are present in the refcount
         * structures */
        ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                                 &new_reftable_size, NULL, new_refblock_size,
                                 new_refcount_bits, &alloc_refblock,
                                 &new_allocation, NULL, status_cb, cb_opaque,
                                 walk_index++, total_walks, errp);
        if (ret < 0) {
            goto done;
        }

        new_reftable_index = 0;

        if (new_allocation) {
            /* The reftable may have grown; drop the previous allocation (if
             * any) and allocate clusters for the current size */
            if (new_reftable_offset) {
                qcow2_free_clusters(
                    bs, new_reftable_offset,
                    allocated_reftable_size * REFTABLE_ENTRY_SIZE,
                    QCOW2_DISCARD_NEVER);
            }

            new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
                                                           REFTABLE_ENTRY_SIZE);
            if (new_reftable_offset < 0) {
                error_setg_errno(errp, -new_reftable_offset,
                                 "Failed to allocate the new reftable");
                ret = new_reftable_offset;
                goto done;
            }
            allocated_reftable_size = new_reftable_size;
        }
    } while (new_allocation);

    /* Second, write the new refblocks */
    ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
                             &new_reftable_size, new_refblock,
                             new_refblock_size, new_refcount_bits,
                             &flush_refblock, &new_allocation, new_set_refcount,
                             status_cb, cb_opaque, walk_index, walk_index + 1,
                             errp);
    if (ret < 0) {
        goto done;
    }
    /* The allocation loop above only terminates once a full walk makes no
     * new allocations, so this write pass must not allocate either */
    assert(!new_allocation);


    /* Write the new reftable */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
                                        new_reftable_size * REFTABLE_ENTRY_SIZE,
                                        false);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Overlap check failed");
        goto done;
    }

    /* Convert to big endian in place for the write, then back, so the
     * in-memory table stays usable afterwards */
    for (i = 0; i < new_reftable_size; i++) {
        cpu_to_be64s(&new_reftable[i]);
    }

    ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
                      new_reftable_size * REFTABLE_ENTRY_SIZE);

    for (i = 0; i < new_reftable_size; i++) {
        be64_to_cpus(&new_reftable[i]);
    }

    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to write the new reftable");
        goto done;
    }


    /* Empty the refcount cache */
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
        goto done;
    }

    /* Update the image header to point to the new reftable; this only updates
     * the fields which are relevant to qcow2_update_header(); other fields
     * such as s->refcount_table or s->refcount_bits stay stale for now
     * (because we have to restore everything if qcow2_update_header() fails) */
    old_refcount_order  = s->refcount_order;
    old_reftable_size   = s->refcount_table_size;
    old_reftable_offset = s->refcount_table_offset;

    s->refcount_order        = refcount_order;
    s->refcount_table_size   = new_reftable_size;
    s->refcount_table_offset = new_reftable_offset;

    ret = qcow2_update_header(bs);
    if (ret < 0) {
        /* Roll back so the in-memory state matches the on-disk header */
        s->refcount_order        = old_refcount_order;
        s->refcount_table_size   = old_reftable_size;
        s->refcount_table_offset = old_reftable_offset;
        error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
        goto done;
    }

    /* Now update the rest of the in-memory information */
    old_reftable = s->refcount_table;
    s->refcount_table = new_reftable;
    update_max_refcount_table_index(s);

    s->refcount_bits = 1 << refcount_order;
    /* Maximum representable refcount value: 2^refcount_bits - 1, computed in
     * two steps to avoid shifting by the full width of uint64_t */
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
    s->refcount_max += s->refcount_max - 1;

    s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
    s->refcount_block_size = 1 << s->refcount_block_bits;

    s->get_refcount = new_get_refcount;
    s->set_refcount = new_set_refcount;

    /* For cleaning up all old refblocks and the old reftable below the "done"
     * label */
    new_reftable        = old_reftable;
    new_reftable_size   = old_reftable_size;
    new_reftable_offset = old_reftable_offset;

done:
    if (new_reftable) {
        /* On success, new_reftable actually points to the old reftable (and
         * new_reftable_size is the old reftable's size); but that is just
         * fine */
        for (i = 0; i < new_reftable_size; i++) {
            uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
            if (offset) {
                qcow2_free_clusters(bs, offset, s->cluster_size,
                                    QCOW2_DISCARD_OTHER);
            }
        }
        g_free(new_reftable);

        if (new_reftable_offset > 0) {
            qcow2_free_clusters(bs, new_reftable_offset,
                                new_reftable_size * REFTABLE_ENTRY_SIZE,
                                QCOW2_DISCARD_OTHER);
        }
    }

    qemu_vfree(new_refblock);
    return ret;
}
3396 
3397 static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3398 {
3399     BDRVQcow2State *s = bs->opaque;
3400     uint32_t index = offset_to_reftable_index(s, offset);
3401     int64_t covering_refblock_offset = 0;
3402 
3403     if (index < s->refcount_table_size) {
3404         covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3405     }
3406     if (!covering_refblock_offset) {
3407         qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3408                                 "not covered by the refcount structures",
3409                                 offset);
3410         return -EIO;
3411     }
3412 
3413     return covering_refblock_offset;
3414 }
3415 
/*
 * Free the refcount block located at @discard_block_offs: its refcount
 * (tracked in the refblock covering that cluster) must currently be exactly
 * 1; it is set to 0 and the cluster is handed to the discard machinery.
 *
 * Returns 0 on success; -EINVAL (after signalling corruption) if the
 * refcount is not 1, or another -errno on cache errors.
 */
static int qcow2_discard_refcount_block(BlockDriverState *bs,
                                        uint64_t discard_block_offs)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t refblock_offs;
    uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
    /* Index of the discarded cluster within its covering refblock */
    uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
    void *refblock;
    int ret;

    /* Find the refblock that holds the refcount of discard_block_offs */
    refblock_offs = get_refblock_offset(bs, discard_block_offs);
    if (refblock_offs < 0) {
        return refblock_offs;
    }

    assert(discard_block_offs != 0);

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
                          &refblock);
    if (ret < 0) {
        return ret;
    }

    if (s->get_refcount(refblock, block_index) != 1) {
        /* The caller expects this cluster to be referenced exactly once;
         * anything else means the refcount structures are inconsistent */
        qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
                                " refblock offset %#" PRIx64
                                ", reftable index %u"
                                ", block offset %#" PRIx64
                                ", refcount %#" PRIx64,
                                refblock_offs,
                                offset_to_reftable_index(s, discard_block_offs),
                                discard_block_offs,
                                s->get_refcount(refblock, block_index));
        qcow2_cache_put(s->refcount_block_cache, &refblock);
        return -EINVAL;
    }
    s->set_refcount(refblock, block_index, 0);

    /* Make sure the zeroed refcount reaches the disk */
    qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);

    qcow2_cache_put(s->refcount_block_cache, &refblock);

    /* The cluster is free now, so future allocations may reuse it */
    if (cluster_index < s->free_cluster_index) {
        s->free_cluster_index = cluster_index;
    }

    refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
                                           discard_block_offs);
    if (refblock) {
        /* discard refblock from the cache if refblock is cached */
        qcow2_cache_discard(s->refcount_block_cache, refblock);
    }
    update_refcount_discard(bs, discard_block_offs, s->cluster_size);

    return 0;
}
3472 
/*
 * Drop all refcount blocks that track no allocated clusters: a copy of the
 * reftable with the entries of such unused refblocks zeroed out is written
 * to disk first, and only then are the refblock clusters themselves freed
 * and the in-memory reftable entries cleared.
 *
 * Returns 0 on success, -errno on failure.  Note that if writing the
 * reftable fails, the in-memory entries are still cleared to stay
 * consistent with whatever partial state may have reached the disk.
 */
int qcow2_shrink_reftable(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    /* Scratch copy of the reftable, built in big-endian ready for writing */
    uint64_t *reftable_tmp =
        g_malloc(s->refcount_table_size * REFTABLE_ENTRY_SIZE);
    int i, ret;

    for (i = 0; i < s->refcount_table_size; i++) {
        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
        void *refblock;
        bool unused_block;

        if (refblock_offs == 0) {
            reftable_tmp[i] = 0;
            continue;
        }
        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
                              &refblock);
        if (ret < 0) {
            goto out;
        }

        /* the refblock has own reference */
        if (i == offset_to_reftable_index(s, refblock_offs)) {
            /* This refblock covers its own cluster, so it always contains at
             * least its own refcount; temporarily zero that entry to test
             * whether the rest of the block is empty, then restore it */
            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
                                   (s->refcount_block_size - 1);
            uint64_t refcount = s->get_refcount(refblock, block_index);

            s->set_refcount(refblock, block_index, 0);

            unused_block = buffer_is_zero(refblock, s->cluster_size);

            s->set_refcount(refblock, block_index, refcount);
        } else {
            unused_block = buffer_is_zero(refblock, s->cluster_size);
        }
        qcow2_cache_put(s->refcount_block_cache, &refblock);

        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
    }

    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
                           s->refcount_table_size * REFTABLE_ENTRY_SIZE);
    /*
     * If the write in the reftable failed the image may contain a partially
     * overwritten reftable. In this case it would be better to clear the
     * reftable in memory to avoid possible image corruption.
     */
    for (i = 0; i < s->refcount_table_size; i++) {
        if (s->refcount_table[i] && !reftable_tmp[i]) {
            /* Entry was zeroed above, so the refblock is unused: free it.
             * Skip the actual discard once an earlier step failed, but still
             * clear the in-memory entry */
            if (ret == 0) {
                ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
                                                       REFT_OFFSET_MASK);
            }
            s->refcount_table[i] = 0;
        }
    }

    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

out:
    g_free(reftable_tmp);
    return ret;
}
3539 
3540 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3541 {
3542     BDRVQcow2State *s = bs->opaque;
3543     int64_t i;
3544 
3545     for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3546         uint64_t refcount;
3547         int ret = qcow2_get_refcount(bs, i, &refcount);
3548         if (ret < 0) {
3549             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3550                     i, strerror(-ret));
3551             return ret;
3552         }
3553         if (refcount > 0) {
3554             return i;
3555         }
3556     }
3557     qcow2_signal_corruption(bs, true, -1, -1,
3558                             "There are no references in the refcount table.");
3559     return -EIO;
3560 }
3561 
3562 int qcow2_detect_metadata_preallocation(BlockDriverState *bs)
3563 {
3564     BDRVQcow2State *s = bs->opaque;
3565     int64_t i, end_cluster, cluster_count = 0, threshold;
3566     int64_t file_length, real_allocation, real_clusters;
3567 
3568     qemu_co_mutex_assert_locked(&s->lock);
3569 
3570     file_length = bdrv_getlength(bs->file->bs);
3571     if (file_length < 0) {
3572         return file_length;
3573     }
3574 
3575     real_allocation = bdrv_get_allocated_file_size(bs->file->bs);
3576     if (real_allocation < 0) {
3577         return real_allocation;
3578     }
3579 
3580     real_clusters = real_allocation / s->cluster_size;
3581     threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
3582 
3583     end_cluster = size_to_clusters(s, file_length);
3584     for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
3585         uint64_t refcount;
3586         int ret = qcow2_get_refcount(bs, i, &refcount);
3587         if (ret < 0) {
3588             return ret;
3589         }
3590         cluster_count += !!refcount;
3591     }
3592 
3593     return cluster_count >= threshold;
3594 }
3595