xref: /qemu/block/qcow2-refcount.c (revision dc03272d)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "qapi/error.h"
27 #include "qemu-common.h"
28 #include "block/block_int.h"
29 #include "qcow2.h"
30 #include "qemu/range.h"
31 #include "qemu/bswap.h"
32 #include "qemu/cutils.h"
33 
34 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
35 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
36                             int64_t offset, int64_t length, uint64_t addend,
37                             bool decrease, enum qcow2_discard_type type);
38 
39 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
40 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
41 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
42 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
43 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
44 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
45 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
46 
47 static void set_refcount_ro0(void *refcount_array, uint64_t index,
48                              uint64_t value);
49 static void set_refcount_ro1(void *refcount_array, uint64_t index,
50                              uint64_t value);
51 static void set_refcount_ro2(void *refcount_array, uint64_t index,
52                              uint64_t value);
53 static void set_refcount_ro3(void *refcount_array, uint64_t index,
54                              uint64_t value);
55 static void set_refcount_ro4(void *refcount_array, uint64_t index,
56                              uint64_t value);
57 static void set_refcount_ro5(void *refcount_array, uint64_t index,
58                              uint64_t value);
59 static void set_refcount_ro6(void *refcount_array, uint64_t index,
60                              uint64_t value);
61 
62 
63 static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
64     &get_refcount_ro0,
65     &get_refcount_ro1,
66     &get_refcount_ro2,
67     &get_refcount_ro3,
68     &get_refcount_ro4,
69     &get_refcount_ro5,
70     &get_refcount_ro6
71 };
72 
73 static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
74     &set_refcount_ro0,
75     &set_refcount_ro1,
76     &set_refcount_ro2,
77     &set_refcount_ro3,
78     &set_refcount_ro4,
79     &set_refcount_ro5,
80     &set_refcount_ro6
81 };
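
/*
 * A note on the two tables above: an image's refcount_order selects one
 * accessor pair, and entry N handles refcounts that are 2^N bits wide.
 * The qcow2 default of refcount_order = 4 therefore uses
 * {get,set}_refcount_ro4 (big-endian 16-bit refcounts), while
 * refcount_order = 0 packs eight 1-bit refcounts into every byte.
 */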
82 
83 
84 /*********************************************************/
85 /* refcount handling */
86 
87 static void update_max_refcount_table_index(BDRVQcow2State *s)
88 {
89     unsigned i = s->refcount_table_size - 1;
90     while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
91         i--;
92     }
93     /* Set s->max_refcount_table_index to the index of the last used entry */
94     s->max_refcount_table_index = i;
95 }
96 
97 int qcow2_refcount_init(BlockDriverState *bs)
98 {
99     BDRVQcow2State *s = bs->opaque;
100     unsigned int refcount_table_size2, i;
101     int ret;
102 
103     assert(s->refcount_order >= 0 && s->refcount_order <= 6);
104 
105     s->get_refcount = get_refcount_funcs[s->refcount_order];
106     s->set_refcount = set_refcount_funcs[s->refcount_order];
107 
108     assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
109     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
110     s->refcount_table = g_try_malloc(refcount_table_size2);
111 
112     if (s->refcount_table_size > 0) {
113         if (s->refcount_table == NULL) {
114             ret = -ENOMEM;
115             goto fail;
116         }
117         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
118         ret = bdrv_pread(bs->file, s->refcount_table_offset,
119                          s->refcount_table, refcount_table_size2);
120         if (ret < 0) {
121             goto fail;
122         }
123         for (i = 0; i < s->refcount_table_size; i++)
124             be64_to_cpus(&s->refcount_table[i]);
125         update_max_refcount_table_index(s);
126     }
127     return 0;
128  fail:
129     return ret;
130 }
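
/*
 * Note that qcow2_refcount_init() keeps the whole refcount table (the
 * top-level array of refblock offsets) in memory for the lifetime of the
 * image, while the refcount blocks themselves are only loaded on demand
 * through s->refcount_block_cache.
 */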
131 
132 void qcow2_refcount_close(BlockDriverState *bs)
133 {
134     BDRVQcow2State *s = bs->opaque;
135     g_free(s->refcount_table);
136 }
137 
138 
139 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
140 {
141     return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
142 }
143 
144 static void set_refcount_ro0(void *refcount_array, uint64_t index,
145                              uint64_t value)
146 {
147     assert(!(value >> 1));
148     ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
149     ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
150 }
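
/*
 * Worked example for the 1-bit accessors above: for index = 11 the
 * refcount lives in byte 11 / 8 = 1 at bit position 11 % 8 = 3.
 * set_refcount_ro0() clears that bit with ~(0x1 << 3) and ORs in the new
 * value shifted left by 3; get_refcount_ro0() shifts the byte right by 3
 * and masks with 0x1 to read it back.
 */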
151 
152 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
153 {
154     return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
155            & 0x3;
156 }
157 
158 static void set_refcount_ro1(void *refcount_array, uint64_t index,
159                              uint64_t value)
160 {
161     assert(!(value >> 2));
162     ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
163     ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
164 }
165 
166 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
167 {
168     return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
169            & 0xf;
170 }
171 
172 static void set_refcount_ro2(void *refcount_array, uint64_t index,
173                              uint64_t value)
174 {
175     assert(!(value >> 4));
176     ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
177     ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
178 }
179 
180 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
181 {
182     return ((const uint8_t *)refcount_array)[index];
183 }
184 
185 static void set_refcount_ro3(void *refcount_array, uint64_t index,
186                              uint64_t value)
187 {
188     assert(!(value >> 8));
189     ((uint8_t *)refcount_array)[index] = value;
190 }
191 
192 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
193 {
194     return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
195 }
196 
197 static void set_refcount_ro4(void *refcount_array, uint64_t index,
198                              uint64_t value)
199 {
200     assert(!(value >> 16));
201     ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
202 }
203 
204 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
205 {
206     return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
207 }
208 
209 static void set_refcount_ro5(void *refcount_array, uint64_t index,
210                              uint64_t value)
211 {
212     assert(!(value >> 32));
213     ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
214 }
215 
216 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
217 {
218     return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
219 }
220 
221 static void set_refcount_ro6(void *refcount_array, uint64_t index,
222                              uint64_t value)
223 {
224     ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
225 }
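
/*
 * All multi-byte variants above store refcounts big-endian on disk, hence
 * the be*_to_cpu()/cpu_to_be*() conversions. Only set_refcount_ro6() needs
 * no overflow assertion, since any uint64_t value fits in 64 bits.
 */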
226 
227 
228 static int load_refcount_block(BlockDriverState *bs,
229                                int64_t refcount_block_offset,
230                                void **refcount_block)
231 {
232     BDRVQcow2State *s = bs->opaque;
233 
234     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
235     return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
236                            refcount_block);
237 }
238 
239 /*
240  * Retrieves the refcount of the cluster given by its index and stores it in
241  * *refcount. Returns 0 on success and -errno on failure.
242  */
243 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
244                        uint64_t *refcount)
245 {
246     BDRVQcow2State *s = bs->opaque;
247     uint64_t refcount_table_index, block_index;
248     int64_t refcount_block_offset;
249     int ret;
250     void *refcount_block;
251 
252     refcount_table_index = cluster_index >> s->refcount_block_bits;
253     if (refcount_table_index >= s->refcount_table_size) {
254         *refcount = 0;
255         return 0;
256     }
257     refcount_block_offset =
258         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
259     if (!refcount_block_offset) {
260         *refcount = 0;
261         return 0;
262     }
263 
264     if (offset_into_cluster(s, refcount_block_offset)) {
265         qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
266                                 " unaligned (reftable index: %#" PRIx64 ")",
267                                 refcount_block_offset, refcount_table_index);
268         return -EIO;
269     }
270 
271     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
272                           &refcount_block);
273     if (ret < 0) {
274         return ret;
275     }
276 
277     block_index = cluster_index & (s->refcount_block_size - 1);
278     *refcount = s->get_refcount(refcount_block, block_index);
279 
280     qcow2_cache_put(s->refcount_block_cache, &refcount_block);
281 
282     return 0;
283 }
284 
285 /* Checks if two offsets are described by the same refcount block */
286 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
287     uint64_t offset_b)
288 {
289     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
290     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
291 
292     return (block_a == block_b);
293 }
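
/*
 * For example, with the default 64 KiB clusters (cluster_bits = 16) and
 * 16-bit refcounts, a refblock holds 65536 * 8 / 16 = 32768 entries
 * (refcount_block_bits = 15), so two host offsets share a refblock iff
 * they lie in the same 2^(16 + 15) byte = 2 GiB aligned area.
 */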
294 
295 /*
296  * Loads a refcount block. If it doesn't exist yet, it is allocated first
297  * (including growing the refcount table if needed).
298  *
299  * Returns 0 on success or -errno in error case
300  * Returns 0 on success or -errno on error
301 static int alloc_refcount_block(BlockDriverState *bs,
302                                 int64_t cluster_index, void **refcount_block)
303 {
304     BDRVQcow2State *s = bs->opaque;
305     unsigned int refcount_table_index;
306     int64_t ret;
307 
308     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
309 
310     /* Find the refcount block for the given cluster */
311     refcount_table_index = cluster_index >> s->refcount_block_bits;
312 
313     if (refcount_table_index < s->refcount_table_size) {
314 
315         uint64_t refcount_block_offset =
316             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
317 
318         /* If it's already there, we're done */
319         if (refcount_block_offset) {
320             if (offset_into_cluster(s, refcount_block_offset)) {
321                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
322                                         PRIx64 " unaligned (reftable index: "
323                                         "%#x)", refcount_block_offset,
324                                         refcount_table_index);
325                 return -EIO;
326             }
327 
328              return load_refcount_block(bs, refcount_block_offset,
329                                         refcount_block);
330         }
331     }
332 
333     /*
334      * If we came here, we need to allocate something. Something is at least
335      * a cluster for the new refcount block. It may also include a new refcount
336      * table if the old refcount table is too small.
337      *
338      * Note that allocating clusters here needs some special care:
339      *
340      * - We can't use the normal qcow2_alloc_clusters(), it would try to
341      *   increase the refcount and very likely we would end up with an endless
342      *   recursion. Instead we must place the refcount blocks in a way that
343  *   they can describe themselves.
344  *
345  * - We need to consider that at this point we are inside update_refcount()
346      *   and potentially doing an initial refcount increase. This means that
347      *   some clusters have already been allocated by the caller, but their
348      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
349      *   need to return -EAGAIN to signal the caller that it needs to restart
350      *   the search for free clusters.
351      *
352      * - alloc_clusters_noref and qcow2_free_clusters may load a different
353      *   refcount block into the cache
354      */
355 
356     *refcount_block = NULL;
357 
358     /* We write to the refcount table, so we might depend on L2 tables */
359     ret = qcow2_cache_flush(bs, s->l2_table_cache);
360     if (ret < 0) {
361         return ret;
362     }
363 
364     /* Allocate the refcount block itself and mark it as used */
365     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
366     if (new_block < 0) {
367         return new_block;
368     }
369 
370     /* If we're allocating the block at offset 0 then something is wrong */
371     if (new_block == 0) {
372         qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
373                                 "allocation of refcount block at offset 0");
374         return -EIO;
375     }
376 
377 #ifdef DEBUG_ALLOC2
378     fprintf(stderr, "qcow2: Allocate refcount block %u for %" PRIx64
379         " at %" PRIx64 "\n",
380         refcount_table_index, cluster_index << s->cluster_bits, new_block);
381 #endif
382 
383     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
384         /* Zero the new refcount block before updating it */
385         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
386                                     refcount_block);
387         if (ret < 0) {
388             goto fail;
389         }
390 
391         memset(*refcount_block, 0, s->cluster_size);
392 
393         /* The block describes itself, need to update the cache */
394         int block_index = (new_block >> s->cluster_bits) &
395             (s->refcount_block_size - 1);
396         s->set_refcount(*refcount_block, block_index, 1);
397     } else {
398         /* Described somewhere else. This can recurse at most twice before we
399          * arrive at a block that describes itself. */
400         ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
401                               QCOW2_DISCARD_NEVER);
402         if (ret < 0) {
403             goto fail;
404         }
405 
406         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
407         if (ret < 0) {
408             goto fail;
409         }
410 
411         /* Initialize the new refcount block only after updating its refcount,
412          * update_refcount uses the refcount cache itself */
413         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
414                                     refcount_block);
415         if (ret < 0) {
416             goto fail;
417         }
418 
419         memset(*refcount_block, 0, s->cluster_size);
420     }
421 
422     /* Now the new refcount block needs to be written to disk */
423     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
424     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
425     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
426     if (ret < 0) {
427         goto fail;
428     }
429 
430     /* If the refcount table is big enough, just hook the block up there */
431     if (refcount_table_index < s->refcount_table_size) {
432         uint64_t data64 = cpu_to_be64(new_block);
433         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
434         ret = bdrv_pwrite_sync(bs->file,
435             s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
436             &data64, sizeof(data64));
437         if (ret < 0) {
438             goto fail;
439         }
440 
441         s->refcount_table[refcount_table_index] = new_block;
442         /* If there's a hole in s->refcount_table then it can happen
443          * that refcount_table_index < s->max_refcount_table_index */
444         s->max_refcount_table_index =
445             MAX(s->max_refcount_table_index, refcount_table_index);
446 
447         /* The new refcount block may be where the caller intended to put its
448          * data, so let it restart the search. */
449         return -EAGAIN;
450     }
451 
452     qcow2_cache_put(s->refcount_block_cache, refcount_block);
453 
454     /*
455      * If we come here, we need to grow the refcount table. Again, a new
456      * refcount table needs some space, and we can't simply allocate it with
457      * qcow2_alloc_clusters() without risking endless recursion.
458      *
459      * Therefore let's grab new refcount blocks at the end of the image, which
460      * will describe themselves and the new refcount table. This way we can
461      * reference them only in the new table and do the switch to the new
462      * refcount table at once without producing an inconsistent state in
463      * between.
464      */
465     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
466 
467     /* Calculate the number of refcount blocks needed so far; this will be the
468      * basis for calculating the index of the first cluster used for the
469      * self-describing refcount structures which we are about to create.
470      *
471      * Because we reached this point, there cannot be any refcount entries for
472      * cluster_index or higher indices yet. However, because new_block has been
473      * allocated to describe that cluster (and it will assume this role later
474      * on), we cannot use that index; also, new_block may actually have a higher
475      * cluster index than cluster_index, so it needs to be taken into account
476      * here (and 1 needs to be added to its value because that cluster is used).
477      */
478     uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
479                                             (new_block >> s->cluster_bits) + 1),
480                                         s->refcount_block_size);
481 
482     /* Create the new refcount table and blocks */
483     uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
484         s->cluster_size;
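
    /*
     * Example, assuming the default 64 KiB clusters and 32768-entry
     * refblocks: if blocks_used comes out as 3, the existing refblocks can
     * describe clusters 0..98303, so meta_offset points at host offset
     * 3 * 32768 * 65536 = 6 GiB, where the new self-describing refcount
     * area will be placed.
     */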
485 
486     ret = qcow2_refcount_area(bs, meta_offset, 0, false,
487                               refcount_table_index, new_block);
488     if (ret < 0) {
489         return ret;
490     }
491 
492     ret = load_refcount_block(bs, new_block, refcount_block);
493     if (ret < 0) {
494         return ret;
495     }
496 
497     /* If we were trying to do the initial refcount update for some cluster
498      * allocation, we might have used the same clusters to store newly
499      * allocated metadata. Make the caller search some new space. */
500     return -EAGAIN;
501 
502 fail:
503     if (*refcount_block != NULL) {
504         qcow2_cache_put(s->refcount_block_cache, refcount_block);
505     }
506     return ret;
507 }
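
/*
 * A note on the -EAGAIN contract above: callers treat -EAGAIN from
 * update_refcount()/alloc_refcount_block() as "metadata may have been
 * written where your clusters were about to go, restart the free-cluster
 * search", which is why qcow2_alloc_clusters() and friends loop until the
 * result is something other than -EAGAIN.
 */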
508 
509 /*
510  * Starting at @start_offset, this function creates new self-covering refcount
511  * structures: A new refcount table and refcount blocks which cover all of
512  * themselves, and a number of @additional_clusters beyond their end.
513  * @start_offset must be at the end of the image file, that is, there must be
514  * only empty space beyond it.
515  * If @exact_size is false, the refcount table will have 50% more entries than
516  * necessary so it will not need to grow again soon.
517  * If @new_refblock_offset is not zero, it contains the offset of a refcount
518  * block that should be entered into the new refcount table at index
519  * @new_refblock_index.
520  *
521  * Returns: The offset after the new refcount structures (i.e. where the
522  *          @additional_clusters may be placed) on success, -errno on error.
523  */
524 int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
525                             uint64_t additional_clusters, bool exact_size,
526                             int new_refblock_index,
527                             uint64_t new_refblock_offset)
528 {
529     BDRVQcow2State *s = bs->opaque;
530     uint64_t total_refblock_count_u64, additional_refblock_count;
531     int total_refblock_count, table_size, area_reftable_index, table_clusters;
532     int i;
533     uint64_t table_offset, block_offset, end_offset;
534     int ret;
535     uint64_t *new_table;
536 
537     assert(!(start_offset % s->cluster_size));
538 
539     qcow2_refcount_metadata_size(start_offset / s->cluster_size +
540                                  additional_clusters,
541                                  s->cluster_size, s->refcount_order,
542                                  !exact_size, &total_refblock_count_u64);
543     if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
544         return -EFBIG;
545     }
546     total_refblock_count = total_refblock_count_u64;
547 
548     /* Index in the refcount table of the first refcount block to cover the area
549      * of refcount structures we are about to create; we know that
550      * @total_refblock_count can cover @start_offset, so this will definitely
551      * fit into an int. */
552     area_reftable_index = (start_offset / s->cluster_size) /
553                           s->refcount_block_size;
554 
555     if (exact_size) {
556         table_size = total_refblock_count;
557     } else {
558         table_size = total_refblock_count +
559                      DIV_ROUND_UP(total_refblock_count, 2);
560     }
561     /* The qcow2 file can only store the reftable size in number of clusters */
562     table_size = ROUND_UP(table_size, s->cluster_size / sizeof(uint64_t));
563     table_clusters = (table_size * sizeof(uint64_t)) / s->cluster_size;
564 
565     if (table_size > QCOW_MAX_REFTABLE_SIZE) {
566         return -EFBIG;
567     }
568 
569     new_table = g_try_new0(uint64_t, table_size);
570 
571     assert(table_size > 0);
572     if (new_table == NULL) {
573         ret = -ENOMEM;
574         goto fail;
575     }
576 
577     /* Fill the new refcount table */
578     if (table_size > s->max_refcount_table_index) {
579         /* We're actually growing the reftable */
580         memcpy(new_table, s->refcount_table,
581                (s->max_refcount_table_index + 1) * sizeof(uint64_t));
582     } else {
583         /* Improbable case: We're shrinking the reftable. However, the caller
584          * has assured us that there is only empty space beyond @start_offset,
585          * so we can simply drop all of the refblocks that won't fit into the
586          * new reftable. */
587         memcpy(new_table, s->refcount_table, table_size * sizeof(uint64_t));
588     }
589 
590     if (new_refblock_offset) {
591         assert(new_refblock_index < total_refblock_count);
592         new_table[new_refblock_index] = new_refblock_offset;
593     }
594 
595     /* Count how many new refblocks we have to create */
596     additional_refblock_count = 0;
597     for (i = area_reftable_index; i < total_refblock_count; i++) {
598         if (!new_table[i]) {
599             additional_refblock_count++;
600         }
601     }
602 
603     table_offset = start_offset + additional_refblock_count * s->cluster_size;
604     end_offset = table_offset + table_clusters * s->cluster_size;
605 
606     /* Fill the refcount blocks, and create new ones, if necessary */
607     block_offset = start_offset;
608     for (i = area_reftable_index; i < total_refblock_count; i++) {
609         void *refblock_data;
610         uint64_t first_offset_covered;
611 
612         /* Reuse an existing refblock if possible, create a new one otherwise */
613         if (new_table[i]) {
614             ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
615                                   &refblock_data);
616             if (ret < 0) {
617                 goto fail;
618             }
619         } else {
620             ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
621                                         block_offset, &refblock_data);
622             if (ret < 0) {
623                 goto fail;
624             }
625             memset(refblock_data, 0, s->cluster_size);
626             qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
627                                          refblock_data);
628 
629             new_table[i] = block_offset;
630             block_offset += s->cluster_size;
631         }
632 
633         /* First host offset covered by this refblock */
634         first_offset_covered = (uint64_t)i * s->refcount_block_size *
635                                s->cluster_size;
636         if (first_offset_covered < end_offset) {
637             int j, end_index;
638 
639             /* Set the refcount of all of the new refcount structures to 1 */
640 
641             if (first_offset_covered < start_offset) {
642                 assert(i == area_reftable_index);
643                 j = (start_offset - first_offset_covered) / s->cluster_size;
644                 assert(j < s->refcount_block_size);
645             } else {
646                 j = 0;
647             }
648 
649             end_index = MIN((end_offset - first_offset_covered) /
650                             s->cluster_size,
651                             s->refcount_block_size);
652 
653             for (; j < end_index; j++) {
654                 /* The caller guaranteed us this space would be empty */
655                 assert(s->get_refcount(refblock_data, j) == 0);
656                 s->set_refcount(refblock_data, j, 1);
657             }
658 
659             qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
660                                          refblock_data);
661         }
662 
663         qcow2_cache_put(s->refcount_block_cache, &refblock_data);
664     }
665 
666     assert(block_offset == table_offset);
667 
668     /* Write refcount blocks to disk */
669     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
670     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
671     if (ret < 0) {
672         goto fail;
673     }
674 
675     /* Write refcount table to disk */
676     for (i = 0; i < total_refblock_count; i++) {
677         cpu_to_be64s(&new_table[i]);
678     }
679 
680     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
681     ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
682         table_size * sizeof(uint64_t));
683     if (ret < 0) {
684         goto fail;
685     }
686 
687     for (i = 0; i < total_refblock_count; i++) {
688         be64_to_cpus(&new_table[i]);
689     }
690 
691     /* Hook up the new refcount table in the qcow2 header */
692     struct QEMU_PACKED {
693         uint64_t d64;
694         uint32_t d32;
695     } data;
696     data.d64 = cpu_to_be64(table_offset);
697     data.d32 = cpu_to_be32(table_clusters);
698     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
699     ret = bdrv_pwrite_sync(bs->file,
700                            offsetof(QCowHeader, refcount_table_offset),
701                            &data, sizeof(data));
702     if (ret < 0) {
703         goto fail;
704     }
705 
706     /* And switch it in memory */
707     uint64_t old_table_offset = s->refcount_table_offset;
708     uint64_t old_table_size = s->refcount_table_size;
709 
710     g_free(s->refcount_table);
711     s->refcount_table = new_table;
712     s->refcount_table_size = table_size;
713     s->refcount_table_offset = table_offset;
714     update_max_refcount_table_index(s);
715 
716     /* Free old table. */
717     qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
718                         QCOW2_DISCARD_OTHER);
719 
720     return end_offset;
721 
722 fail:
723     g_free(new_table);
724     return ret;
725 }
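
/*
 * The ordering above matters for crash consistency: the new refblocks are
 * flushed first, then the new reftable is written, and only then is the
 * header updated with bdrv_pwrite_sync() to point at it. A crash before
 * the header update leaves the old reftable in place and fully valid.
 */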
726 
727 void qcow2_process_discards(BlockDriverState *bs, int ret)
728 {
729     BDRVQcow2State *s = bs->opaque;
730     Qcow2DiscardRegion *d, *next;
731 
732     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
733         QTAILQ_REMOVE(&s->discards, d, next);
734 
735         /* Discard is optional, ignore the return value */
736         if (ret >= 0) {
737             bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
738         }
739 
740         g_free(d);
741     }
742 }
743 
744 static void update_refcount_discard(BlockDriverState *bs,
745                                     uint64_t offset, uint64_t length)
746 {
747     BDRVQcow2State *s = bs->opaque;
748     Qcow2DiscardRegion *d, *p, *next;
749 
750     QTAILQ_FOREACH(d, &s->discards, next) {
751         uint64_t new_start = MIN(offset, d->offset);
752         uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
753 
754         if (new_end - new_start <= length + d->bytes) {
755             /* There can't be any overlap; areas ending up here have no
756              * references any more and therefore shouldn't get freed a
757              * second time. */
758             assert(d->bytes + length == new_end - new_start);
759             d->offset = new_start;
760             d->bytes = new_end - new_start;
761             goto found;
762         }
763     }
764 
765     d = g_malloc(sizeof(*d));
766     *d = (Qcow2DiscardRegion) {
767         .bs     = bs,
768         .offset = offset,
769         .bytes  = length,
770     };
771     QTAILQ_INSERT_TAIL(&s->discards, d, next);
772 
773 found:
774     /* Merge discard requests if they are adjacent now */
775     QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
776         if (p == d
777             || p->offset > d->offset + d->bytes
778             || d->offset > p->offset + p->bytes)
779         {
780             continue;
781         }
782 
783         /* Still no overlap possible */
784         assert(p->offset == d->offset + d->bytes
785             || d->offset == p->offset + p->bytes);
786 
787         QTAILQ_REMOVE(&s->discards, p, next);
788         d->offset = MIN(d->offset, p->offset);
789         d->bytes += p->bytes;
790         g_free(p);
791     }
792 }
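
/*
 * For example, queueing [64 KiB, +64 KiB) while [128 KiB, +64 KiB) is
 * already pending extends the existing region to [64 KiB, +128 KiB) in the
 * first loop; the second loop then absorbs any other region that has
 * become adjacent as a result.
 */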
793 
794 /* XXX: cache several refcount block clusters? */
795 /* @addend is the absolute value of the addend; if @decrease is set, @addend
796  * will be subtracted from the current refcount, otherwise it will be added */
797 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
798                                                    int64_t offset,
799                                                    int64_t length,
800                                                    uint64_t addend,
801                                                    bool decrease,
802                                                    enum qcow2_discard_type type)
803 {
804     BDRVQcow2State *s = bs->opaque;
805     int64_t start, last, cluster_offset;
806     void *refcount_block = NULL;
807     int64_t old_table_index = -1;
808     int ret;
809 
810 #ifdef DEBUG_ALLOC2
811     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
812             " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
813             addend);
814 #endif
815     if (length < 0) {
816         return -EINVAL;
817     } else if (length == 0) {
818         return 0;
819     }
820 
821     if (decrease) {
822         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
823             s->l2_table_cache);
824     }
825 
826     start = start_of_cluster(s, offset);
827     last = start_of_cluster(s, offset + length - 1);
828     for (cluster_offset = start; cluster_offset <= last;
829         cluster_offset += s->cluster_size)
830     {
831         int block_index;
832         uint64_t refcount;
833         int64_t cluster_index = cluster_offset >> s->cluster_bits;
834         int64_t table_index = cluster_index >> s->refcount_block_bits;
835 
836         /* Load the refcount block and allocate it if needed */
837         if (table_index != old_table_index) {
838             if (refcount_block) {
839                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
840             }
841             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
842             /* If the caller needs to restart the search for free clusters,
843              * try the same ones first to see if they're still free. */
844             if (ret == -EAGAIN) {
845                 if (s->free_cluster_index > (start >> s->cluster_bits)) {
846                     s->free_cluster_index = (start >> s->cluster_bits);
847                 }
848             }
849             if (ret < 0) {
850                 goto fail;
851             }
852         }
853         old_table_index = table_index;
854 
855         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
856 
857         /* we can update the count and save it */
858         block_index = cluster_index & (s->refcount_block_size - 1);
859 
860         refcount = s->get_refcount(refcount_block, block_index);
861         if (decrease ? (refcount - addend > refcount)
862                      : (refcount + addend < refcount ||
863                         refcount + addend > s->refcount_max))
864         {
865             ret = -EINVAL;
866             goto fail;
867         }
868         if (decrease) {
869             refcount -= addend;
870         } else {
871             refcount += addend;
872         }
873         if (refcount == 0 && cluster_index < s->free_cluster_index) {
874             s->free_cluster_index = cluster_index;
875         }
876         s->set_refcount(refcount_block, block_index, refcount);
877 
878         if (refcount == 0) {
879             void *table;
880 
881             table = qcow2_cache_is_table_offset(s->refcount_block_cache,
882                                                 offset);
883             if (table != NULL) {
884                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
885                 qcow2_cache_discard(s->refcount_block_cache, table);
886             }
887 
888             table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
889             if (table != NULL) {
890                 qcow2_cache_discard(s->l2_table_cache, table);
891             }
892 
893             if (s->discard_passthrough[type]) {
894                 update_refcount_discard(bs, cluster_offset, s->cluster_size);
895             }
896         }
897     }
898 
899     ret = 0;
900 fail:
901     if (!s->cache_discards) {
902         qcow2_process_discards(bs, ret);
903     }
904 
905     /* Write last changed block to disk */
906     if (refcount_block) {
907         qcow2_cache_put(s->refcount_block_cache, &refcount_block);
908     }
909 
910     /*
911      * Try to undo any updates if an error is returned (this may succeed in
912      * some cases, e.g. ENOSPC when allocating a new refcount block)
913      */
914     if (ret < 0) {
915         int dummy;
916         dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
917                                 !decrease, QCOW2_DISCARD_NEVER);
918         (void)dummy;
919     }
920 
921     return ret;
922 }
923 
924 /*
925  * Increases or decreases the refcount of a given cluster.
926  *
927  * @addend is the absolute value of the addend; if @decrease is set, @addend
928  * will be subtracted from the current refcount, otherwise it will be added.
929  *
930  * On success 0 is returned; on failure -errno is returned.
931  */
932 int qcow2_update_cluster_refcount(BlockDriverState *bs,
933                                   int64_t cluster_index,
934                                   uint64_t addend, bool decrease,
935                                   enum qcow2_discard_type type)
936 {
937     BDRVQcow2State *s = bs->opaque;
938     int ret;
939 
940     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
941                           decrease, type);
942     if (ret < 0) {
943         return ret;
944     }
945 
946     return 0;
947 }
948 
949 
950 
951 /*********************************************************/
952 /* cluster allocation functions */
953 
954 
955 
956 /* return < 0 if error */
957 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
958 {
959     BDRVQcow2State *s = bs->opaque;
960     uint64_t i, nb_clusters, refcount;
961     int ret;
962 
963     /* We can't allocate clusters if they may still be queued for discard. */
964     if (s->cache_discards) {
965         qcow2_process_discards(bs, 0);
966     }
967 
968     nb_clusters = size_to_clusters(s, size);
969 retry:
970     for (i = 0; i < nb_clusters; i++) {
971         uint64_t next_cluster_index = s->free_cluster_index++;
972         ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
973 
974         if (ret < 0) {
975             return ret;
976         } else if (refcount != 0) {
977             goto retry;
978         }
979     }
980 
981     /* Make sure that all offsets in the "allocated" range are representable
982      * in an int64_t */
983     if (s->free_cluster_index > 0 &&
984         s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
985     {
986         return -EFBIG;
987     }
988 
989 #ifdef DEBUG_ALLOC2
990     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
991             size,
992             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
993 #endif
994     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
995 }
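
/*
 * Note that s->free_cluster_index persists across calls, so the scan above
 * resumes where the previous allocation left off; when a used cluster
 * interrupts a candidate run, the retry label restarts the search for
 * nb_clusters contiguous free clusters from the interrupting position.
 */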
996 
997 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
998 {
999     int64_t offset;
1000     int ret;
1001 
1002     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
1003     do {
1004         offset = alloc_clusters_noref(bs, size);
1005         if (offset < 0) {
1006             return offset;
1007         }
1008 
1009         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1010     } while (ret == -EAGAIN);
1011 
1012     if (ret < 0) {
1013         return ret;
1014     }
1015 
1016     return offset;
1017 }
1018 
1019 int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
1020                                 int64_t nb_clusters)
1021 {
1022     BDRVQcow2State *s = bs->opaque;
1023     uint64_t cluster_index, refcount;
1024     uint64_t i;
1025     int ret;
1026 
1027     assert(nb_clusters >= 0);
1028     if (nb_clusters == 0) {
1029         return 0;
1030     }
1031 
1032     do {
1033         /* Check how many clusters there are free */
1034         cluster_index = offset >> s->cluster_bits;
1035         for (i = 0; i < nb_clusters; i++) {
1036             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
1037             if (ret < 0) {
1038                 return ret;
1039             } else if (refcount != 0) {
1040                 break;
1041             }
1042         }
1043 
1044         /* And then allocate them */
1045         ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
1046                               QCOW2_DISCARD_NEVER);
1047     } while (ret == -EAGAIN);
1048 
1049     if (ret < 0) {
1050         return ret;
1051     }
1052 
1053     return i;
1054 }
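
/*
 * Note the return value above: qcow2_alloc_clusters_at() returns how many
 * contiguous clusters were actually free (and thus allocated) at @offset,
 * which may be fewer than @nb_clusters; callers have to cope with short
 * allocations.
 */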
1055 
1056 /* Only used to allocate compressed sectors. We try to allocate
1057    contiguous sectors. size must be <= cluster_size */
1058 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
1059 {
1060     BDRVQcow2State *s = bs->opaque;
1061     int64_t offset;
1062     size_t free_in_cluster;
1063     int ret;
1064 
1065     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
1066     assert(size > 0 && size <= s->cluster_size);
1067     assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
1068 
1069     offset = s->free_byte_offset;
1070 
1071     if (offset) {
1072         uint64_t refcount;
1073         ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
1074         if (ret < 0) {
1075             return ret;
1076         }
1077 
1078         if (refcount == s->refcount_max) {
1079             offset = 0;
1080         }
1081     }
1082 
1083     free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
1084     do {
1085         if (!offset || free_in_cluster < size) {
1086             int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
1087             if (new_cluster < 0) {
1088                 return new_cluster;
1089             }
1090 
1091             if (new_cluster == 0) {
1092                 qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
1093                                         "allocation of compressed cluster "
1094                                         "at offset 0");
1095                 return -EIO;
1096             }
1097 
1098             if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
1099                 offset = new_cluster;
1100                 free_in_cluster = s->cluster_size;
1101             } else {
1102                 free_in_cluster += s->cluster_size;
1103             }
1104         }
1105 
1106         assert(offset);
1107         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1108         if (ret < 0) {
1109             offset = 0;
1110         }
1111     } while (ret == -EAGAIN);
1112     if (ret < 0) {
1113         return ret;
1114     }
1115 
1116     /* The cluster refcount was incremented; refcount blocks must be flushed
1117      * before the caller's L2 table updates. */
1118     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
1119 
1120     s->free_byte_offset = offset + size;
1121     if (!offset_into_cluster(s, s->free_byte_offset)) {
1122         s->free_byte_offset = 0;
1123     }
1124 
1125     return offset;
1126 }
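
/*
 * In other words, qcow2_alloc_bytes() sub-allocates compressed clusters
 * from the tail of the last partially filled host cluster, tracked by
 * s->free_byte_offset. Every allocation bumps the refcount of each host
 * cluster it touches, so a cluster shared by several compressed clusters
 * ends up with a refcount equal to the number of allocations within it.
 */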
1127 
1128 void qcow2_free_clusters(BlockDriverState *bs,
1129                           int64_t offset, int64_t size,
1130                           enum qcow2_discard_type type)
1131 {
1132     int ret;
1133 
1134     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1135     ret = update_refcount(bs, offset, size, 1, true, type);
1136     if (ret < 0) {
1137         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1138         /* TODO Remember the clusters to free them later and avoid leaking */
1139     }
1140 }
1141 
1142 /*
1143  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1144  * normal cluster, compressed cluster, etc.)
1145  */
1146 void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
1147                              int nb_clusters, enum qcow2_discard_type type)
1148 {
1149     BDRVQcow2State *s = bs->opaque;
1150 
1151     switch (qcow2_get_cluster_type(l2_entry)) {
1152     case QCOW2_CLUSTER_COMPRESSED:
1153         {
1154             int nb_csectors;
1155             nb_csectors = ((l2_entry >> s->csize_shift) &
1156                            s->csize_mask) + 1;
1157             qcow2_free_clusters(bs,
1158                 (l2_entry & s->cluster_offset_mask) & ~511,
1159                 nb_csectors * 512, type);
1160         }
1161         break;
1162     case QCOW2_CLUSTER_NORMAL:
1163     case QCOW2_CLUSTER_ZERO_ALLOC:
1164         if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1165             qcow2_signal_corruption(bs, false, -1, -1,
1166                                     "Cannot free unaligned cluster %#" PRIx64,
1167                                     l2_entry & L2E_OFFSET_MASK);
1168         } else {
1169             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1170                                 nb_clusters << s->cluster_bits, type);
1171         }
1172         break;
1173     case QCOW2_CLUSTER_ZERO_PLAIN:
1174     case QCOW2_CLUSTER_UNALLOCATED:
1175         break;
1176     default:
1177         abort();
1178     }
1179 }
1180 
1181 int coroutine_fn qcow2_write_caches(BlockDriverState *bs)
1182 {
1183     BDRVQcow2State *s = bs->opaque;
1184     int ret;
1185 
1186     ret = qcow2_cache_write(bs, s->l2_table_cache);
1187     if (ret < 0) {
1188         return ret;
1189     }
1190 
1191     if (qcow2_need_accurate_refcounts(s)) {
1192         ret = qcow2_cache_write(bs, s->refcount_block_cache);
1193         if (ret < 0) {
1194             return ret;
1195         }
1196     }
1197 
1198     return 0;
1199 }
1200 
1201 int coroutine_fn qcow2_flush_caches(BlockDriverState *bs)
1202 {
1203     int ret = qcow2_write_caches(bs);
1204     if (ret < 0) {
1205         return ret;
1206     }
1207 
1208     return bdrv_flush(bs->file->bs);
1209 }
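
/*
 * The distinction between the two functions above: qcow2_write_caches()
 * only writes the dirty L2 and refcount cache entries into bs->file,
 * while qcow2_flush_caches() additionally flushes bs->file itself so the
 * metadata is stable on disk.
 */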
1210 
1211 /*********************************************************/
1212 /* snapshots and image creation */
1213 
1214 
1215 
1216 /* update the refcounts of snapshots and the copied flag */
1217 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1218     int64_t l1_table_offset, int l1_size, int addend)
1219 {
1220     BDRVQcow2State *s = bs->opaque;
1221     uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
1222     bool l1_allocated = false;
1223     int64_t old_entry, old_l2_offset;
1224     unsigned slice, slice_size2, n_slices;
1225     int i, j, l1_modified = 0, nb_csectors;
1226     int ret;
1227 
1228     assert(addend >= -1 && addend <= 1);
1229 
1230     l2_slice = NULL;
1231     l1_table = NULL;
1232     l1_size2 = l1_size * sizeof(uint64_t);
1233     slice_size2 = s->l2_slice_size * sizeof(uint64_t);
1234     n_slices = s->cluster_size / slice_size2;
1235 
1236     s->cache_discards = true;
1237 
1238     /* WARNING: qcow2_snapshot_goto relies on this function not using the
1239      * l1_table_offset when it is the current s->l1_table_offset! Be careful
1240      * when changing this! */
1241     if (l1_table_offset != s->l1_table_offset) {
1242         l1_table = g_try_malloc0(ROUND_UP(l1_size2, 512));
1243         if (l1_size2 && l1_table == NULL) {
1244             ret = -ENOMEM;
1245             goto fail;
1246         }
1247         l1_allocated = true;
1248 
1249         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1250         if (ret < 0) {
1251             goto fail;
1252         }
1253 
1254         for (i = 0; i < l1_size; i++) {
1255             be64_to_cpus(&l1_table[i]);
1256         }
1257     } else {
1258         assert(l1_size == s->l1_size);
1259         l1_table = s->l1_table;
1260         l1_allocated = false;
1261     }
1262 
1263     for (i = 0; i < l1_size; i++) {
1264         l2_offset = l1_table[i];
1265         if (l2_offset) {
1266             old_l2_offset = l2_offset;
1267             l2_offset &= L1E_OFFSET_MASK;
1268 
1269             if (offset_into_cluster(s, l2_offset)) {
1270                 qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1271                                         PRIx64 " unaligned (L1 index: %#x)",
1272                                         l2_offset, i);
1273                 ret = -EIO;
1274                 goto fail;
1275             }
1276 
1277             for (slice = 0; slice < n_slices; slice++) {
1278                 ret = qcow2_cache_get(bs, s->l2_table_cache,
1279                                       l2_offset + slice * slice_size2,
1280                                       (void **) &l2_slice);
1281                 if (ret < 0) {
1282                     goto fail;
1283                 }
1284 
1285                 for (j = 0; j < s->l2_slice_size; j++) {
1286                     uint64_t cluster_index;
1287                     uint64_t offset;
1288 
1289                     entry = be64_to_cpu(l2_slice[j]);
1290                     old_entry = entry;
1291                     entry &= ~QCOW_OFLAG_COPIED;
1292                     offset = entry & L2E_OFFSET_MASK;
1293 
1294                     switch (qcow2_get_cluster_type(entry)) {
1295                     case QCOW2_CLUSTER_COMPRESSED:
1296                         nb_csectors = ((entry >> s->csize_shift) &
1297                                        s->csize_mask) + 1;
1298                         if (addend != 0) {
1299                             ret = update_refcount(
1300                                 bs, (entry & s->cluster_offset_mask) & ~511,
1301                                 nb_csectors * 512, abs(addend), addend < 0,
1302                                 QCOW2_DISCARD_SNAPSHOT);
1303                             if (ret < 0) {
1304                                 goto fail;
1305                             }
1306                         }
1307                         /* compressed clusters are never modified */
1308                         refcount = 2;
1309                         break;
1310 
1311                     case QCOW2_CLUSTER_NORMAL:
1312                     case QCOW2_CLUSTER_ZERO_ALLOC:
1313                         if (offset_into_cluster(s, offset)) {
1314                             /* Here l2_index means table (not slice) index */
1315                             int l2_index = slice * s->l2_slice_size + j;
1316                             qcow2_signal_corruption(
1317                                 bs, true, -1, -1, "Cluster "
1318                                 "allocation offset %#" PRIx64
1319                                 " unaligned (L2 offset: %#"
1320                                 PRIx64 ", L2 index: %#x)",
1321                                 offset, l2_offset, l2_index);
1322                             ret = -EIO;
1323                             goto fail;
1324                         }
1325 
1326                         cluster_index = offset >> s->cluster_bits;
1327                         assert(cluster_index);
1328                         if (addend != 0) {
1329                             ret = qcow2_update_cluster_refcount(
1330                                 bs, cluster_index, abs(addend), addend < 0,
1331                                 QCOW2_DISCARD_SNAPSHOT);
1332                             if (ret < 0) {
1333                                 goto fail;
1334                             }
1335                         }
1336 
1337                         ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1338                         if (ret < 0) {
1339                             goto fail;
1340                         }
1341                         break;
1342 
1343                     case QCOW2_CLUSTER_ZERO_PLAIN:
1344                     case QCOW2_CLUSTER_UNALLOCATED:
1345                         refcount = 0;
1346                         break;
1347 
1348                     default:
1349                         abort();
1350                     }
1351 
1352                     if (refcount == 1) {
1353                         entry |= QCOW_OFLAG_COPIED;
1354                     }
1355                     if (entry != old_entry) {
1356                         if (addend > 0) {
1357                             qcow2_cache_set_dependency(bs, s->l2_table_cache,
1358                                                        s->refcount_block_cache);
1359                         }
1360                         l2_slice[j] = cpu_to_be64(entry);
1361                         qcow2_cache_entry_mark_dirty(s->l2_table_cache,
1362                                                      l2_slice);
1363                     }
1364                 }
1365 
1366                 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1367             }
1368 
1369             if (addend != 0) {
1370                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1371                                                         s->cluster_bits,
1372                                                     abs(addend), addend < 0,
1373                                                     QCOW2_DISCARD_SNAPSHOT);
1374                 if (ret < 0) {
1375                     goto fail;
1376                 }
1377             }
1378             ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1379                                      &refcount);
1380             if (ret < 0) {
1381                 goto fail;
1382             } else if (refcount == 1) {
1383                 l2_offset |= QCOW_OFLAG_COPIED;
1384             }
1385             if (l2_offset != old_l2_offset) {
1386                 l1_table[i] = l2_offset;
1387                 l1_modified = 1;
1388             }
1389         }
1390     }
1391 
1392     ret = bdrv_flush(bs);
1393 fail:
1394     if (l2_slice) {
1395         qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1396     }
1397 
1398     s->cache_discards = false;
1399     qcow2_process_discards(bs, ret);
1400 
1401     /* Update L1 only if it isn't deleted anyway (addend = -1) */
1402     if (ret == 0 && addend >= 0 && l1_modified) {
1403         for (i = 0; i < l1_size; i++) {
1404             cpu_to_be64s(&l1_table[i]);
1405         }
1406 
1407         ret = bdrv_pwrite_sync(bs->file, l1_table_offset,
1408                                l1_table, l1_size2);
1409 
1410         for (i = 0; i < l1_size; i++) {
1411             be64_to_cpus(&l1_table[i]);
1412         }
1413     }
1414     if (l1_allocated)
1415         g_free(l1_table);
1416     return ret;
1417 }
1418 
1419 
1420 
1421 
1422 /*********************************************************/
1423 /* refcount checking functions */
1424 
1425 
1426 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1427 {
1428     /* This assertion holds because there is no way we can address more than
1429      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1430      * offsets have to be representable in bytes); due to every cluster
1431      * corresponding to one refcount entry, we are well below that limit */
1432     assert(entries < (UINT64_C(1) << (64 - 9)));
1433 
1434     /* Thanks to the assertion this will not overflow, because
1435      * s->refcount_order < 7.
1436      * (note: x << s->refcount_order == x * s->refcount_bits) */
1437     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1438 }
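
/*
 * Quick example: for entries = 1000 and refcount_order = 4, this yields
 * DIV_ROUND_UP(1000 << 4, 8) = DIV_ROUND_UP(16000, 8) = 2000 bytes, i.e.
 * two bytes per 16-bit refcount entry, as expected.
 */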
1439 
1440 /**
1441  * Reallocates *array so that it can hold new_size entries. *size must contain
1442  * the current number of entries in *array. If the reallocation fails, *array
1443  * and *size will not be modified and -errno will be returned. If the
1444  * reallocation is successful, *array will be set to the new buffer, *size
1445  * will be set to new_size and 0 will be returned. The size of the reallocated
1446  * refcount array buffer will be aligned to a cluster boundary, and the newly
1447  * allocated area will be zeroed.
1448  */
1449 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1450                                   int64_t *size, int64_t new_size)
1451 {
1452     int64_t old_byte_size, new_byte_size;
1453     void *new_ptr;
1454 
1455     /* Round to clusters so the array can be directly written to disk */
1456     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1457                     * s->cluster_size;
1458     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1459                     * s->cluster_size;
1460 
1461     if (new_byte_size == old_byte_size) {
1462         *size = new_size;
1463         return 0;
1464     }
1465 
1466     assert(new_byte_size > 0);
1467 
1468     if (new_byte_size > SIZE_MAX) {
1469         return -ENOMEM;
1470     }
1471 
1472     new_ptr = g_try_realloc(*array, new_byte_size);
1473     if (!new_ptr) {
1474         return -ENOMEM;
1475     }
1476 
1477     if (new_byte_size > old_byte_size) {
1478         memset((char *)new_ptr + old_byte_size, 0,
1479                new_byte_size - old_byte_size);
1480     }
1481 
1482     *array = new_ptr;
1483     *size  = new_size;
1484 
1485     return 0;
1486 }
1487 
1488 /*
1489  * Increases the refcount for a range of clusters in a given refcount table.
1490  * This is used to construct a temporary refcount table out of L1 and L2 tables
1491  * which can be compared to the refcount table saved in the image.
1492  *
1493  * Modifies the number of errors in res.
1494  */
1495 int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1496                              void **refcount_table,
1497                              int64_t *refcount_table_size,
1498                              int64_t offset, int64_t size)
1499 {
1500     BDRVQcow2State *s = bs->opaque;
1501     uint64_t start, last, cluster_offset, k, refcount;
1502     int ret;
1503 
1504     if (size <= 0) {
1505         return 0;
1506     }
1507 
1508     start = start_of_cluster(s, offset);
1509     last = start_of_cluster(s, offset + size - 1);
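    /* E.g. with 64 kB clusters, offset == 0x10000 and size == 0x10001 yield
     * start == 0x10000 and last == 0x20000, so two clusters are counted. */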
1510     for (cluster_offset = start; cluster_offset <= last;
1511         cluster_offset += s->cluster_size) {
1512         k = cluster_offset >> s->cluster_bits;
1513         if (k >= *refcount_table_size) {
1514             ret = realloc_refcount_array(s, refcount_table,
1515                                          refcount_table_size, k + 1);
1516             if (ret < 0) {
1517                 res->check_errors++;
1518                 return ret;
1519             }
1520         }
1521 
1522         refcount = s->get_refcount(*refcount_table, k);
1523         if (refcount == s->refcount_max) {
1524             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1525                     "\n", cluster_offset);
1526             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1527                     "width or qemu-img convert to create a clean copy if the "
1528                     "image cannot be opened for writing\n");
1529             res->corruptions++;
1530             continue;
1531         }
1532         s->set_refcount(*refcount_table, k, refcount + 1);
1533     }
1534 
1535     return 0;
1536 }
1537 
1538 /* Flags for check_refcounts_l1() and check_refcounts_l2() */
1539 enum {
1540     CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1541 };
1542 
1543 /*
1544  * Increases the refcount in the given refcount table for all clusters
1545  * referenced in the L2 table. While doing so, performs some checks on L2
1546  * entries.
1547  *
1548  * Returns 0 on success (any errors found by the checks are accounted for in
1549  * res), or -errno if an internal error occurred.
1550  */
1551 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1552                               void **refcount_table,
1553                               int64_t *refcount_table_size, int64_t l2_offset,
1554                               int flags, BdrvCheckMode fix)
1555 {
1556     BDRVQcow2State *s = bs->opaque;
1557     uint64_t *l2_table, l2_entry;
1558     uint64_t next_contiguous_offset = 0;
1559     int i, l2_size, nb_csectors, ret;
1560 
1561     /* Read L2 table from disk */
1562     l2_size = s->l2_size * sizeof(uint64_t);
1563     l2_table = g_malloc(l2_size);
1564 
1565     ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size);
1566     if (ret < 0) {
1567         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1568         res->check_errors++;
1569         goto fail;
1570     }
1571 
1572     /* Do the actual checks */
1573     for (i = 0; i < s->l2_size; i++) {
1574         l2_entry = be64_to_cpu(l2_table[i]);
1575 
1576         switch (qcow2_get_cluster_type(l2_entry)) {
1577         case QCOW2_CLUSTER_COMPRESSED:
1578             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1579             if (l2_entry & QCOW_OFLAG_COPIED) {
1580                 fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
1581                     "copied flag must never be set for compressed "
1582                     "clusters\n", l2_entry & s->cluster_offset_mask);
1583                 l2_entry &= ~QCOW_OFLAG_COPIED;
1584                 res->corruptions++;
1585             }
1586 
1587             /* Mark cluster as used */
1588             nb_csectors = ((l2_entry >> s->csize_shift) &
1589                            s->csize_mask) + 1;
1590             l2_entry &= s->cluster_offset_mask;
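            /* Compressed data need not be cluster aligned; it is refcounted
             * here at 512-byte sector granularity, rounding the start down
             * and counting nb_csectors whole sectors. */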
1591             ret = qcow2_inc_refcounts_imrt(bs, res,
1592                                            refcount_table, refcount_table_size,
1593                                            l2_entry & ~511, nb_csectors * 512);
1594             if (ret < 0) {
1595                 goto fail;
1596             }
1597 
1598             if (flags & CHECK_FRAG_INFO) {
1599                 res->bfi.allocated_clusters++;
1600                 res->bfi.compressed_clusters++;
1601 
1602                 /* Compressed clusters are fragmented by nature.  Since they
1603                  * take up sub-sector space but we only have sector granularity
1604                  * I/O, we need to re-read the same sectors even for adjacent
1605                  * compressed clusters.
1606                  */
1607                 res->bfi.fragmented_clusters++;
1608             }
1609             break;
1610 
1611         case QCOW2_CLUSTER_ZERO_ALLOC:
1612         case QCOW2_CLUSTER_NORMAL:
1613         {
1614             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1615 
1616             if (flags & CHECK_FRAG_INFO) {
1617                 res->bfi.allocated_clusters++;
1618                 if (next_contiguous_offset &&
1619                     offset != next_contiguous_offset) {
1620                     res->bfi.fragmented_clusters++;
1621                 }
1622                 next_contiguous_offset = offset + s->cluster_size;
1623             }
1624 
1625             /* Correct offsets are cluster aligned */
1626             if (offset_into_cluster(s, offset)) {
1627                 if (qcow2_get_cluster_type(l2_entry) ==
1628                     QCOW2_CLUSTER_ZERO_ALLOC)
1629                 {
1630                     fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated zero "
1631                             "cluster is not properly aligned; L2 entry "
1632                             "corrupted.\n",
1633                             fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1634                             offset);
1635                     if (fix & BDRV_FIX_ERRORS) {
1636                         uint64_t l2e_offset =
1637                             l2_offset + (uint64_t)i * sizeof(uint64_t);
1638 
1639                         l2_entry = QCOW_OFLAG_ZERO;
1640                         l2_table[i] = cpu_to_be64(l2_entry);
1641                         ret = qcow2_pre_write_overlap_check(bs,
1642                                 QCOW2_OL_ACTIVE_L2 | QCOW2_OL_INACTIVE_L2,
1643                                 l2e_offset, sizeof(uint64_t));
1644                         if (ret < 0) {
1645                             fprintf(stderr, "ERROR: Overlap check failed\n");
1646                             res->check_errors++;
1647                             /* Something is seriously wrong, so abort checking
1648                              * this L2 table */
1649                             goto fail;
1650                         }
1651 
1652                         ret = bdrv_pwrite_sync(bs->file, l2e_offset,
1653                                                &l2_table[i], sizeof(uint64_t));
1654                         if (ret < 0) {
1655                             fprintf(stderr, "ERROR: Failed to overwrite L2 "
1656                                     "table entry: %s\n", strerror(-ret));
1657                             res->check_errors++;
1658                             /* Do not abort, continue checking the rest of this
1659                              * L2 table's entries */
1660                         } else {
1661                             res->corruptions_fixed++;
1662                             /* Skip marking the cluster as used
1663                              * (it is unused now) */
1664                             continue;
1665                         }
1666                     } else {
1667                         res->corruptions++;
1668                     }
1669                 } else {
1670                     fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1671                         "not properly aligned; L2 entry corrupted.\n", offset);
1672                     res->corruptions++;
1673                 }
1674             }
1675 
1676             /* Mark cluster as used */
1677             ret = qcow2_inc_refcounts_imrt(bs, res,
1678                                            refcount_table, refcount_table_size,
1679                                            offset, s->cluster_size);
1680             if (ret < 0) {
1681                 goto fail;
1682             }
1683             break;
1684         }
1685 
1686         case QCOW2_CLUSTER_ZERO_PLAIN:
1687         case QCOW2_CLUSTER_UNALLOCATED:
1688             break;
1689 
1690         default:
1691             abort();
1692         }
1693     }
1694 
1695     g_free(l2_table);
1696     return 0;
1697 
1698 fail:
1699     g_free(l2_table);
1700     return ret;
1701 }
1702 
1703 /*
1704  * Increases the refcount for the L1 table, its L2 tables and all referenced
1705  * clusters in the given refcount table. While doing so, performs some checks
1706  * on L1 and L2 entries.
1707  *
1708  * Returns 0 on success (any errors found by the checks are accounted for in
1709  * res), or -errno if an internal error occurred.
1710  */
1711 static int check_refcounts_l1(BlockDriverState *bs,
1712                               BdrvCheckResult *res,
1713                               void **refcount_table,
1714                               int64_t *refcount_table_size,
1715                               int64_t l1_table_offset, int l1_size,
1716                               int flags, BdrvCheckMode fix)
1717 {
1718     BDRVQcow2State *s = bs->opaque;
1719     uint64_t *l1_table = NULL, l2_offset, l1_size2;
1720     int i, ret;
1721 
1722     l1_size2 = l1_size * sizeof(uint64_t);
1723 
1724     /* Mark L1 table as used */
1725     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1726                                    l1_table_offset, l1_size2);
1727     if (ret < 0) {
1728         goto fail;
1729     }
1730 
1731     /* Read L1 table entries from disk */
1732     if (l1_size2 > 0) {
1733         l1_table = g_try_malloc(l1_size2);
1734         if (l1_table == NULL) {
1735             ret = -ENOMEM;
1736             res->check_errors++;
1737             goto fail;
1738         }
1739         ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2);
1740         if (ret < 0) {
1741             fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1742             res->check_errors++;
1743             goto fail;
1744         }
1745         for (i = 0; i < l1_size; i++)
1746             be64_to_cpus(&l1_table[i]);
1747     }
1748 
1749     /* Do the actual checks */
1750     for (i = 0; i < l1_size; i++) {
1751         l2_offset = l1_table[i];
1752         if (l2_offset) {
1753             /* Mark L2 table as used */
1754             l2_offset &= L1E_OFFSET_MASK;
1755             ret = qcow2_inc_refcounts_imrt(bs, res,
1756                                            refcount_table, refcount_table_size,
1757                                            l2_offset, s->cluster_size);
1758             if (ret < 0) {
1759                 goto fail;
1760             }
1761 
1762             /* L2 tables are cluster aligned */
1763             if (offset_into_cluster(s, l2_offset)) {
1764                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1765                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1766                 res->corruptions++;
1767             }
1768 
1769             /* Process and check L2 entries */
1770             ret = check_refcounts_l2(bs, res, refcount_table,
1771                                      refcount_table_size, l2_offset, flags,
1772                                      fix);
1773             if (ret < 0) {
1774                 goto fail;
1775             }
1776         }
1777     }
1778     g_free(l1_table);
1779     return 0;
1780 
1781 fail:
1782     g_free(l1_table);
1783     return ret;
1784 }
1785 
1786 /*
1787  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1788  *
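 * An entry must have QCOW_OFLAG_COPIED set if and only if the refcount of
 * the cluster it points to is exactly 1, i.e. the cluster is not shared
 * (e.g. with a snapshot) and may therefore be written to in place.
 *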
1789  * This function does not print an error message nor does it increment
1790  * check_errors if qcow2_get_refcount fails (this is because such an error will
1791  * already have been detected and sufficiently signaled by the calling function
1792  * (qcow2_check_refcounts) by the time this function is called).
1793  */
1794 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1795                               BdrvCheckMode fix)
1796 {
1797     BDRVQcow2State *s = bs->opaque;
1798     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1799     int ret;
1800     uint64_t refcount;
1801     int i, j;
1802 
1803     for (i = 0; i < s->l1_size; i++) {
1804         uint64_t l1_entry = s->l1_table[i];
1805         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1806         bool l2_dirty = false;
1807 
1808         if (!l2_offset) {
1809             continue;
1810         }
1811 
1812         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1813                                  &refcount);
1814         if (ret < 0) {
1815             /* don't print message nor increment check_errors */
1816             continue;
1817         }
1818         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1819             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1820                     "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1821                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1822                                             "ERROR",
1823                     i, l1_entry, refcount);
1824             if (fix & BDRV_FIX_ERRORS) {
1825                 s->l1_table[i] = refcount == 1
1826                                ? l1_entry |  QCOW_OFLAG_COPIED
1827                                : l1_entry & ~QCOW_OFLAG_COPIED;
1828                 ret = qcow2_write_l1_entry(bs, i);
1829                 if (ret < 0) {
1830                     res->check_errors++;
1831                     goto fail;
1832                 }
1833                 res->corruptions_fixed++;
1834             } else {
1835                 res->corruptions++;
1836             }
1837         }
1838 
1839         ret = bdrv_pread(bs->file, l2_offset, l2_table,
1840                          s->l2_size * sizeof(uint64_t));
1841         if (ret < 0) {
1842             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1843                     strerror(-ret));
1844             res->check_errors++;
1845             goto fail;
1846         }
1847 
1848         for (j = 0; j < s->l2_size; j++) {
1849             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1850             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1851             QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
1852 
1853             if (cluster_type == QCOW2_CLUSTER_NORMAL ||
1854                 cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
1855                 ret = qcow2_get_refcount(bs,
1856                                          data_offset >> s->cluster_bits,
1857                                          &refcount);
1858                 if (ret < 0) {
1859                     /* don't print message nor increment check_errors */
1860                     continue;
1861                 }
1862                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1863                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1864                             "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1865                             fix & BDRV_FIX_ERRORS ? "Repairing" :
1866                                                     "ERROR",
1867                             l2_entry, refcount);
1868                     if (fix & BDRV_FIX_ERRORS) {
1869                         l2_table[j] = cpu_to_be64(refcount == 1
1870                                     ? l2_entry |  QCOW_OFLAG_COPIED
1871                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1872                         l2_dirty = true;
1873                         res->corruptions_fixed++;
1874                     } else {
1875                         res->corruptions++;
1876                     }
1877                 }
1878             }
1879         }
1880 
1881         if (l2_dirty) {
1882             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1883                                                 l2_offset, s->cluster_size);
1884             if (ret < 0) {
1885                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1886                         "overlap check failed: %s\n", strerror(-ret));
1887                 res->check_errors++;
1888                 goto fail;
1889             }
1890 
1891             ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
1892                               s->cluster_size);
1893             if (ret < 0) {
1894                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1895                         strerror(-ret));
1896                 res->check_errors++;
1897                 goto fail;
1898             }
1899         }
1900     }
1901 
1902     ret = 0;
1903 
1904 fail:
1905     qemu_vfree(l2_table);
1906     return ret;
1907 }
1908 
1909 /*
1910  * Checks consistency of refblocks and accounts for each refblock in
1911  * *refcount_table.
1912  */
1913 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1914                            BdrvCheckMode fix, bool *rebuild,
1915                            void **refcount_table, int64_t *nb_clusters)
1916 {
1917     BDRVQcow2State *s = bs->opaque;
1918     int64_t i, size;
1919     int ret;
1920 
1921     for (i = 0; i < s->refcount_table_size; i++) {
1922         uint64_t offset, cluster;
1923         offset = s->refcount_table[i];
1924         cluster = offset >> s->cluster_bits;
1925 
1926         /* Refcount blocks are cluster aligned */
1927         if (offset_into_cluster(s, offset)) {
1928             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1929                 "cluster aligned; refcount table entry corrupted\n", i);
1930             res->corruptions++;
1931             *rebuild = true;
1932             continue;
1933         }
1934 
1935         if (cluster >= *nb_clusters) {
1936             fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1937                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1938 
1939             if (fix & BDRV_FIX_ERRORS) {
1940                 int64_t new_nb_clusters;
1941                 Error *local_err = NULL;
1942 
1943                 if (offset > INT64_MAX - s->cluster_size) {
1944                     ret = -EINVAL;
1945                     goto resize_fail;
1946                 }
1947 
1948                 ret = bdrv_truncate(bs->file, offset + s->cluster_size,
1949                                     PREALLOC_MODE_OFF, &local_err);
1950                 if (ret < 0) {
1951                     error_report_err(local_err);
1952                     goto resize_fail;
1953                 }
1954                 size = bdrv_getlength(bs->file->bs);
1955                 if (size < 0) {
1956                     ret = size;
1957                     goto resize_fail;
1958                 }
1959 
1960                 new_nb_clusters = size_to_clusters(s, size);
1961                 assert(new_nb_clusters >= *nb_clusters);
1962 
1963                 ret = realloc_refcount_array(s, refcount_table,
1964                                              nb_clusters, new_nb_clusters);
1965                 if (ret < 0) {
1966                     res->check_errors++;
1967                     return ret;
1968                 }
1969 
1970                 if (cluster >= *nb_clusters) {
1971                     ret = -EINVAL;
1972                     goto resize_fail;
1973                 }
1974 
1975                 res->corruptions_fixed++;
1976                 ret = qcow2_inc_refcounts_imrt(bs, res,
1977                                                refcount_table, nb_clusters,
1978                                                offset, s->cluster_size);
1979                 if (ret < 0) {
1980                     return ret;
1981                 }
1982                 /* No need to check whether the refcount is now greater than 1:
1983                  * This area was just allocated and zeroed, so it can only be
1984                  * exactly 1 after qcow2_inc_refcounts_imrt() */
1985                 continue;
1986 
1987 resize_fail:
1988                 res->corruptions++;
1989                 *rebuild = true;
1990                 fprintf(stderr, "ERROR could not resize image: %s\n",
1991                         strerror(-ret));
1992             } else {
1993                 res->corruptions++;
1994             }
1995             continue;
1996         }
1997 
1998         if (offset != 0) {
1999             ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2000                                            offset, s->cluster_size);
2001             if (ret < 0) {
2002                 return ret;
2003             }
2004             if (s->get_refcount(*refcount_table, cluster) != 1) {
2005                 fprintf(stderr, "ERROR refcount block %" PRId64
2006                         " refcount=%" PRIu64 "\n", i,
2007                         s->get_refcount(*refcount_table, cluster));
2008                 res->corruptions++;
2009                 *rebuild = true;
2010             }
2011         }
2012     }
2013 
2014     return 0;
2015 }
2016 
2017 /*
2018  * Calculates an in-memory refcount table.
2019  */
2020 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2021                                BdrvCheckMode fix, bool *rebuild,
2022                                void **refcount_table, int64_t *nb_clusters)
2023 {
2024     BDRVQcow2State *s = bs->opaque;
2025     int64_t i;
2026     QCowSnapshot *sn;
2027     int ret;
2028 
2029     if (!*refcount_table) {
2030         int64_t old_size = 0;
2031         ret = realloc_refcount_array(s, refcount_table,
2032                                      &old_size, *nb_clusters);
2033         if (ret < 0) {
2034             res->check_errors++;
2035             return ret;
2036         }
2037     }
2038 
2039     /* header */
2040     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2041                                    0, s->cluster_size);
2042     if (ret < 0) {
2043         return ret;
2044     }
2045 
2046     /* current L1 table */
2047     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2048                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2049                              fix);
2050     if (ret < 0) {
2051         return ret;
2052     }
2053 
2054     /* snapshots */
2055     for (i = 0; i < s->nb_snapshots; i++) {
2056         sn = s->snapshots + i;
2057         if (offset_into_cluster(s, sn->l1_table_offset)) {
2058             fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2059                     "L1 table is not cluster aligned; snapshot table entry "
2060                     "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2061             res->corruptions++;
2062             continue;
2063         }
2064         if (sn->l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
2065             fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2066                     "L1 table is too large; snapshot table entry corrupted\n",
2067                     sn->id_str, sn->name, sn->l1_size);
2068             res->corruptions++;
2069             continue;
2070         }
2071         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2072                                  sn->l1_table_offset, sn->l1_size, 0, fix);
2073         if (ret < 0) {
2074             return ret;
2075         }
2076     }
2077     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2078                                    s->snapshots_offset, s->snapshots_size);
2079     if (ret < 0) {
2080         return ret;
2081     }
2082 
2083     /* refcount data */
2084     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2085                                    s->refcount_table_offset,
2086                                    s->refcount_table_size * sizeof(uint64_t));
2087     if (ret < 0) {
2088         return ret;
2089     }
2090 
2091     /* encryption */
2092     if (s->crypto_header.length) {
2093         ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2094                                        s->crypto_header.offset,
2095                                        s->crypto_header.length);
2096         if (ret < 0) {
2097             return ret;
2098         }
2099     }
2100 
2101     /* bitmaps */
2102     ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2103     if (ret < 0) {
2104         return ret;
2105     }
2106 
2107     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2108 }
2109 
2110 /*
2111  * Compares the actual reference count for each cluster in the image against the
2112  * refcount as reported by the refcount structures on-disk.
2113  */
2114 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2115                               BdrvCheckMode fix, bool *rebuild,
2116                               int64_t *highest_cluster,
2117                               void *refcount_table, int64_t nb_clusters)
2118 {
2119     BDRVQcow2State *s = bs->opaque;
2120     int64_t i;
2121     uint64_t refcount1, refcount2;
2122     int ret;
2123 
2124     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2125         ret = qcow2_get_refcount(bs, i, &refcount1);
2126         if (ret < 0) {
2127             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2128                     i, strerror(-ret));
2129             res->check_errors++;
2130             continue;
2131         }
2132 
2133         refcount2 = s->get_refcount(refcount_table, i);
2134 
2135         if (refcount1 > 0 || refcount2 > 0) {
2136             *highest_cluster = i;
2137         }
2138 
2139         if (refcount1 != refcount2) {
2140             /* Check if we're allowed to fix the mismatch */
2141             int *num_fixed = NULL;
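            /* If the on-disk refcount is zero although the cluster is in
             * use, the on-disk refcount structures cannot be trusted and
             * must be rebuilt; otherwise, a too-high on-disk refcount is a
             * (fixable) leak and a too-low one a corruption. */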
2142             if (refcount1 == 0) {
2143                 *rebuild = true;
2144             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2145                 num_fixed = &res->leaks_fixed;
2146             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2147                 num_fixed = &res->corruptions_fixed;
2148             }
2149 
2150             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2151                     " reference=%" PRIu64 "\n",
2152                    num_fixed != NULL     ? "Repairing" :
2153                    refcount1 < refcount2 ? "ERROR" :
2154                                            "Leaked",
2155                    i, refcount1, refcount2);
2156 
2157             if (num_fixed) {
2158                 ret = update_refcount(bs, i << s->cluster_bits, 1,
2159                                       refcount_diff(refcount1, refcount2),
2160                                       refcount1 > refcount2,
2161                                       QCOW2_DISCARD_ALWAYS);
2162                 if (ret >= 0) {
2163                     (*num_fixed)++;
2164                     continue;
2165                 }
2166             }
2167 
2168             /* And if we couldn't, print an error */
2169             if (refcount1 < refcount2) {
2170                 res->corruptions++;
2171             } else {
2172                 res->leaks++;
2173             }
2174         }
2175     }
2176 }
2177 
2178 /*
2179  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2180  * the on-disk refcount structures.
2181  *
2182  * On input, *first_free_cluster tells where to start looking, and need not
2183  * actually be a free cluster; the returned offset will not be before that
2184  * cluster.  On output, *first_free_cluster points to the first gap found, even
2185  * if that gap was too small to be used as the returned offset.
2186  *
2187  * Note that *first_free_cluster is a cluster index whereas the return value is
2188  * an offset.
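 *
 * For example (illustrative): with cluster_count == 2 and
 * *first_free_cluster == 10, if clusters 10, 12 and 13 are free but
 * cluster 11 is in use, the returned offset is 12 << cluster_bits and
 * *first_free_cluster is left at 10 (the first gap found).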
2189  */
2190 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2191                                    int cluster_count,
2192                                    void **refcount_table,
2193                                    int64_t *imrt_nb_clusters,
2194                                    int64_t *first_free_cluster)
2195 {
2196     BDRVQcow2State *s = bs->opaque;
2197     int64_t cluster = *first_free_cluster, i;
2198     bool first_gap = true;
2199     int contiguous_free_clusters;
2200     int ret;
2201 
2202     /* Starting at *first_free_cluster, find a range of at least cluster_count
2203      * continuously free clusters */
2204     for (contiguous_free_clusters = 0;
2205          cluster < *imrt_nb_clusters &&
2206          contiguous_free_clusters < cluster_count;
2207          cluster++)
2208     {
2209         if (!s->get_refcount(*refcount_table, cluster)) {
2210             contiguous_free_clusters++;
2211             if (first_gap) {
2212                 /* If this is the first free cluster found, update
2213                  * *first_free_cluster accordingly */
2214                 *first_free_cluster = cluster;
2215                 first_gap = false;
2216             }
2217         } else if (contiguous_free_clusters) {
2218             contiguous_free_clusters = 0;
2219         }
2220     }
2221 
2222     /* If contiguous_free_clusters is greater than zero, it contains the number
2223      * of continuously free clusters until the current cluster; the first free
2224      * cluster in the current "gap" is therefore
2225      * cluster - contiguous_free_clusters */
2226 
2227     /* If no such range could be found, grow the in-memory refcount table
2228      * accordingly to append free clusters at the end of the image */
2229     if (contiguous_free_clusters < cluster_count) {
2230         /* contiguous_free_clusters clusters are already empty at the image end;
2231          * we need cluster_count clusters; therefore, we have to allocate
2232          * cluster_count - contiguous_free_clusters new clusters at the end of
2233          * the image (which is the current value of cluster; note that cluster
2234          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2235          * the image end) */
2236         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2237                                      cluster + cluster_count
2238                                      - contiguous_free_clusters);
2239         if (ret < 0) {
2240             return ret;
2241         }
2242     }
2243 
2244     /* Go back to the first free cluster */
2245     cluster -= contiguous_free_clusters;
2246     for (i = 0; i < cluster_count; i++) {
2247         s->set_refcount(*refcount_table, cluster + i, 1);
2248     }
2249 
2250     return cluster << s->cluster_bits;
2251 }
2252 
2253 /*
2254  * Creates a new refcount structure based solely on the in-memory information
2255  * given through *refcount_table. All necessary allocations will be reflected
2256  * in that array.
2257  *
2258  * On success, the old refcount structure is leaked (it will be covered by the
2259  * new refcount structure).
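 *
 * The new refblocks and the new reftable are themselves allocated through
 * the in-memory refcount table (alloc_clusters_imrt()), so they are
 * accounted for in *refcount_table; if the reftable ends up in clusters
 * not yet covered by a written refblock, the refblock loop is resumed
 * (write_refblocks) so that those clusters are refcounted as well.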
2260  */
2261 static int rebuild_refcount_structure(BlockDriverState *bs,
2262                                       BdrvCheckResult *res,
2263                                       void **refcount_table,
2264                                       int64_t *nb_clusters)
2265 {
2266     BDRVQcow2State *s = bs->opaque;
2267     int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2268     int64_t refblock_offset, refblock_start, refblock_index;
2269     uint32_t reftable_size = 0;
2270     uint64_t *on_disk_reftable = NULL;
2271     void *on_disk_refblock;
2272     int ret = 0;
2273     struct {
2274         uint64_t reftable_offset;
2275         uint32_t reftable_clusters;
2276     } QEMU_PACKED reftable_offset_and_clusters;
2277 
2278     qcow2_cache_empty(bs, s->refcount_block_cache);
2279 
2280 write_refblocks:
2281     for (; cluster < *nb_clusters; cluster++) {
2282         if (!s->get_refcount(*refcount_table, cluster)) {
2283             continue;
2284         }
2285 
2286         refblock_index = cluster >> s->refcount_block_bits;
2287         refblock_start = refblock_index << s->refcount_block_bits;
2288 
2289         /* Don't allocate a cluster in a refblock already written to disk */
2290         if (first_free_cluster < refblock_start) {
2291             first_free_cluster = refblock_start;
2292         }
2293         refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2294                                               nb_clusters, &first_free_cluster);
2295         if (refblock_offset < 0) {
2296             fprintf(stderr, "ERROR allocating refblock: %s\n",
2297                     strerror(-refblock_offset));
2298             res->check_errors++;
2299             ret = refblock_offset;
2300             goto fail;
2301         }
2302 
2303         if (reftable_size <= refblock_index) {
2304             uint32_t old_reftable_size = reftable_size;
2305             uint64_t *new_on_disk_reftable;
2306 
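            /* Round the entry count up to a whole cluster's worth of
             * entries; the reftable is allocated and written in whole
             * clusters anyway */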
2307             reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2308                                      s->cluster_size) / sizeof(uint64_t);
2309             new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2310                                                  reftable_size *
2311                                                  sizeof(uint64_t));
2312             if (!new_on_disk_reftable) {
2313                 res->check_errors++;
2314                 ret = -ENOMEM;
2315                 goto fail;
2316             }
2317             on_disk_reftable = new_on_disk_reftable;
2318 
2319             memset(on_disk_reftable + old_reftable_size, 0,
2320                    (reftable_size - old_reftable_size) * sizeof(uint64_t));
2321 
2322             /* The offset we have for the reftable is now no longer valid;
2323              * this will leak that range, but we can easily fix that by running
2324              * a leak-fixing check after this rebuild operation */
2325             reftable_offset = -1;
2326         } else {
2327             assert(on_disk_reftable);
2328         }
2329         on_disk_reftable[refblock_index] = refblock_offset;
2330 
2331         /* If this is apparently the last refblock (for now), try to squeeze the
2332          * reftable in */
2333         if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2334             reftable_offset < 0)
2335         {
2336             uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2337                                                           sizeof(uint64_t));
2338             reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2339                                                   refcount_table, nb_clusters,
2340                                                   &first_free_cluster);
2341             if (reftable_offset < 0) {
2342                 fprintf(stderr, "ERROR allocating reftable: %s\n",
2343                         strerror(-reftable_offset));
2344                 res->check_errors++;
2345                 ret = reftable_offset;
2346                 goto fail;
2347             }
2348         }
2349 
2350         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2351                                             s->cluster_size);
2352         if (ret < 0) {
2353             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2354             goto fail;
2355         }
2356 
2357         /* The size of *refcount_table is always cluster-aligned, therefore the
2358          * write operation will not overflow */
2359         on_disk_refblock = (void *)((char *) *refcount_table +
2360                                     refblock_index * s->cluster_size);
2361 
2362         ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE,
2363                          on_disk_refblock, s->cluster_sectors);
2364         if (ret < 0) {
2365             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2366             goto fail;
2367         }
2368 
2369         /* Go to the end of this refblock */
2370         cluster = refblock_start + s->refcount_block_size - 1;
2371     }
2372 
2373     if (reftable_offset < 0) {
2374         uint64_t post_refblock_start, reftable_clusters;
2375 
2376         post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2377         reftable_clusters = size_to_clusters(s,
2378                                              reftable_size * sizeof(uint64_t));
2379         /* Not pretty but simple */
2380         if (first_free_cluster < post_refblock_start) {
2381             first_free_cluster = post_refblock_start;
2382         }
2383         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2384                                               refcount_table, nb_clusters,
2385                                               &first_free_cluster);
2386         if (reftable_offset < 0) {
2387             fprintf(stderr, "ERROR allocating reftable: %s\n",
2388                     strerror(-reftable_offset));
2389             res->check_errors++;
2390             ret = reftable_offset;
2391             goto fail;
2392         }
2393 
2394         goto write_refblocks;
2395     }
2396 
2397     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2398         cpu_to_be64s(&on_disk_reftable[refblock_index]);
2399     }
2400 
2401     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2402                                         reftable_size * sizeof(uint64_t));
2403     if (ret < 0) {
2404         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2405         goto fail;
2406     }
2407 
2408     assert(reftable_size < INT_MAX / sizeof(uint64_t));
2409     ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable,
2410                       reftable_size * sizeof(uint64_t));
2411     if (ret < 0) {
2412         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2413         goto fail;
2414     }
2415 
2416     /* Enter new reftable into the image header */
2417     reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2418     reftable_offset_and_clusters.reftable_clusters =
2419         cpu_to_be32(size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2420     ret = bdrv_pwrite_sync(bs->file,
2421                            offsetof(QCowHeader, refcount_table_offset),
2422                            &reftable_offset_and_clusters,
2423                            sizeof(reftable_offset_and_clusters));
2424     if (ret < 0) {
2425         fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2426         goto fail;
2427     }
2428 
2429     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2430         be64_to_cpus(&on_disk_reftable[refblock_index]);
2431     }
2432     s->refcount_table = on_disk_reftable;
2433     s->refcount_table_offset = reftable_offset;
2434     s->refcount_table_size = reftable_size;
2435     update_max_refcount_table_index(s);
2436 
2437     return 0;
2438 
2439 fail:
2440     g_free(on_disk_reftable);
2441     return ret;
2442 }
2443 
2444 /*
2445  * Checks an image for refcount consistency.
2446  *
2447  * Returns 0 on success (the number of errors found is reported in res), and
2448  * -errno if the check could not be performed or an internal error occurred.
2449  */
2450 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2451                           BdrvCheckMode fix)
2452 {
2453     BDRVQcow2State *s = bs->opaque;
2454     BdrvCheckResult pre_compare_res;
2455     int64_t size, highest_cluster, nb_clusters;
2456     void *refcount_table = NULL;
2457     bool rebuild = false;
2458     int ret;
2459 
2460     size = bdrv_getlength(bs->file->bs);
2461     if (size < 0) {
2462         res->check_errors++;
2463         return size;
2464     }
2465 
2466     nb_clusters = size_to_clusters(s, size);
2467     if (nb_clusters > INT_MAX) {
2468         res->check_errors++;
2469         return -EFBIG;
2470     }
2471 
2472     res->bfi.total_clusters =
2473         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2474 
2475     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2476                               &nb_clusters);
2477     if (ret < 0) {
2478         goto fail;
2479     }
2480 
2481     /* In case we don't need to rebuild the refcount structure (but want to fix
2482      * something), compare_refcounts() is invoked again below with fix set, in
2483      * which case this first result should be ignored */
2484     pre_compare_res = *res;
2485     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2486                       nb_clusters);
2487 
2488     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2489         BdrvCheckResult old_res = *res;
2490         int fresh_leaks = 0;
2491 
2492         fprintf(stderr, "Rebuilding refcount structure\n");
2493         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2494                                          &nb_clusters);
2495         if (ret < 0) {
2496             goto fail;
2497         }
2498 
2499         res->corruptions = 0;
2500         res->leaks = 0;
2501 
2502         /* Because the old reftable has been exchanged for a new one the
2503          * references have to be recalculated */
2504         rebuild = false;
2505         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2506         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2507                                   &nb_clusters);
2508         if (ret < 0) {
2509             goto fail;
2510         }
2511 
2512         if (fix & BDRV_FIX_LEAKS) {
2513             /* The old refcount structures are now leaked, fix it; the result
2514              * can be ignored, aside from leaks which were introduced by
2515              * rebuild_refcount_structure() that could not be fixed */
2516             BdrvCheckResult saved_res = *res;
2517             *res = (BdrvCheckResult){ 0 };
2518 
2519             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2520                               &highest_cluster, refcount_table, nb_clusters);
2521             if (rebuild) {
2522                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2523                         "broken\n");
2524             }
2525 
2526             /* Any leaks accounted for here were introduced by
2527              * rebuild_refcount_structure() because that function has created a
2528              * new refcount structure from scratch */
2529             fresh_leaks = res->leaks;
2530             *res = saved_res;
2531         }
2532 
2533         if (res->corruptions < old_res.corruptions) {
2534             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2535         }
2536         if (res->leaks < old_res.leaks) {
2537             res->leaks_fixed += old_res.leaks - res->leaks;
2538         }
2539         res->leaks += fresh_leaks;
2540     } else if (fix) {
2541         if (rebuild) {
2542             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2543             res->check_errors++;
2544             ret = -EIO;
2545             goto fail;
2546         }
2547 
2548         if (res->leaks || res->corruptions) {
2549             *res = pre_compare_res;
2550             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2551                               refcount_table, nb_clusters);
2552         }
2553     }
2554 
2555     /* check OFLAG_COPIED */
2556     ret = check_oflag_copied(bs, res, fix);
2557     if (ret < 0) {
2558         goto fail;
2559     }
2560 
2561     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2562     ret = 0;
2563 
2564 fail:
2565     g_free(refcount_table);
2566 
2567     return ret;
2568 }
2569 
2570 #define overlaps_with(ofs, sz) \
2571     ranges_overlap(offset, size, ofs, sz)
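/* Note that overlaps_with() deliberately captures the local variables
 * @offset and @size of its user, qcow2_check_metadata_overlap() below. */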
2572 
2573 /*
2574  * Checks if the given offset into the image file is actually free to use by
2575  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2576  * i.e. a sanity check without relying on the refcount tables.
2577  *
2578  * The ign parameter specifies what checks not to perform (being a bitmask of
2579  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2580  *
2581  * Returns:
2582  * - 0 if writing to this offset will not affect the mentioned metadata
2583  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2584  * - a negative value (-errno) indicating an error while performing a check,
2585  *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
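 *
 * Each overlap value is a single bit, so a positive return value can be
 * mapped to a human-readable name via ctz32() (as done in
 * qcow2_pre_write_overlap_check() with metadata_ol_names[]).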
2586  */
2587 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2588                                  int64_t size)
2589 {
2590     BDRVQcow2State *s = bs->opaque;
2591     int chk = s->overlap_check & ~ign;
2592     int i, j;
2593 
2594     if (!size) {
2595         return 0;
2596     }
2597 
2598     if (chk & QCOW2_OL_MAIN_HEADER) {
2599         if (offset < s->cluster_size) {
2600             return QCOW2_OL_MAIN_HEADER;
2601         }
2602     }
2603 
2604     /* align range to test to cluster boundaries */
2605     size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2606     offset = start_of_cluster(s, offset);
2607 
2608     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2609         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2610             return QCOW2_OL_ACTIVE_L1;
2611         }
2612     }
2613 
2614     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2615         if (overlaps_with(s->refcount_table_offset,
2616             s->refcount_table_size * sizeof(uint64_t))) {
2617             return QCOW2_OL_REFCOUNT_TABLE;
2618         }
2619     }
2620 
2621     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2622         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2623             return QCOW2_OL_SNAPSHOT_TABLE;
2624         }
2625     }
2626 
2627     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2628         for (i = 0; i < s->nb_snapshots; i++) {
2629             if (s->snapshots[i].l1_size &&
2630                 overlaps_with(s->snapshots[i].l1_table_offset,
2631                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
2632                 return QCOW2_OL_INACTIVE_L1;
2633             }
2634         }
2635     }
2636 
2637     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2638         for (i = 0; i < s->l1_size; i++) {
2639             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2640                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2641                 s->cluster_size)) {
2642                 return QCOW2_OL_ACTIVE_L2;
2643             }
2644         }
2645     }
2646 
2647     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2648         unsigned last_entry = s->max_refcount_table_index;
2649         assert(last_entry < s->refcount_table_size);
2650         assert(last_entry + 1 == s->refcount_table_size ||
2651                (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2652         for (i = 0; i <= last_entry; i++) {
2653             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2654                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2655                 s->cluster_size)) {
2656                 return QCOW2_OL_REFCOUNT_BLOCK;
2657             }
2658         }
2659     }
2660 
2661     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2662         for (i = 0; i < s->nb_snapshots; i++) {
2663             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2664             uint32_t l1_sz  = s->snapshots[i].l1_size;
2665             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2666             uint64_t *l1;
2667             int ret;
2668 
2669             ret = qcow2_validate_table(bs, l1_ofs, l1_sz, sizeof(uint64_t),
2670                                        QCOW_MAX_L1_SIZE, "", NULL);
2671             if (ret < 0) {
2672                 return ret;
2673             }
2674 
2675             l1 = g_try_malloc(l1_sz2);
2676 
2677             if (l1_sz2 && l1 == NULL) {
2678                 return -ENOMEM;
2679             }
2680 
2681             ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2);
2682             if (ret < 0) {
2683                 g_free(l1);
2684                 return ret;
2685             }
2686 
2687             for (j = 0; j < l1_sz; j++) {
2688                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2689                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2690                     g_free(l1);
2691                     return QCOW2_OL_INACTIVE_L2;
2692                 }
2693             }
2694 
2695             g_free(l1);
2696         }
2697     }
2698 
2699     return 0;
2700 }
2701 
2702 static const char *metadata_ol_names[] = {
2703     [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
2704     [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
2705     [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
2706     [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
2707     [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
2708     [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
2709     [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
2710     [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
2711 };
2712 
2713 /*
2714  * First performs a check for metadata overlaps (through
2715  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2716  * while performing a check), that value is returned. If an impending overlap
2717  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2718  * and -EIO returned.
2719  *
2720  * Returns 0 if there were neither overlaps nor errors while checking for
2721  * overlaps; or a negative value (-errno) on error.
2722  */
2723 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2724                                   int64_t size)
2725 {
2726     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2727 
2728     if (ret < 0) {
2729         return ret;
2730     } else if (ret > 0) {
2731         int metadata_ol_bitnr = ctz32(ret);
2732         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2733 
2734         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2735                                 "write on metadata (overlaps with %s)",
2736                                 metadata_ol_names[metadata_ol_bitnr]);
2737         return -EIO;
2738     }
2739 
2740     return 0;
2741 }
2742 
2743 /* A pointer to a function of this type is given to walk_over_reftable(). That
2744  * function will create refblocks and pass them to a RefblockFinishOp once they
2745  * are completed (@refblock). @refblock_empty is set if the refblock is
2746  * completely empty.
2747  *
2748  * Along with the refblock, a corresponding reftable entry is passed, in the
2749  * reftable @reftable (which may be reallocated) at @reftable_index.
2750  *
2751  * @allocated should be set to true if a new cluster has been allocated.
2752  */
2753 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2754                                uint64_t reftable_index, uint64_t *reftable_size,
2755                                void *refblock, bool refblock_empty,
2756                                bool *allocated, Error **errp);
2757 
2758 /**
2759  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2760  * it is not empty) and inserts its offset into the new reftable. The size of
2761  * this new reftable is increased as required.
2762  */
2763 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2764                           uint64_t reftable_index, uint64_t *reftable_size,
2765                           void *refblock, bool refblock_empty, bool *allocated,
2766                           Error **errp)
2767 {
2768     BDRVQcow2State *s = bs->opaque;
2769     int64_t offset;
2770 
2771     if (!refblock_empty && reftable_index >= *reftable_size) {
2772         uint64_t *new_reftable;
2773         uint64_t new_reftable_size;
2774 
2775         new_reftable_size = ROUND_UP(reftable_index + 1,
2776                                      s->cluster_size / sizeof(uint64_t));
2777         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2778             error_setg(errp,
2779                        "This operation would make the refcount table grow "
2780                        "beyond the maximum size supported by QEMU, aborting");
2781             return -ENOTSUP;
2782         }
2783 
2784         new_reftable = g_try_realloc(*reftable, new_reftable_size *
2785                                                 sizeof(uint64_t));
2786         if (!new_reftable) {
2787             error_setg(errp, "Failed to increase reftable buffer size");
2788             return -ENOMEM;
2789         }
2790 
2791         memset(new_reftable + *reftable_size, 0,
2792                (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2793 
2794         *reftable      = new_reftable;
2795         *reftable_size = new_reftable_size;
2796     }
2797 
2798     if (!refblock_empty && !(*reftable)[reftable_index]) {
2799         offset = qcow2_alloc_clusters(bs, s->cluster_size);
2800         if (offset < 0) {
2801             error_setg_errno(errp, -offset, "Failed to allocate refblock");
2802             return offset;
2803         }
2804         (*reftable)[reftable_index] = offset;
2805         *allocated = true;
2806     }
2807 
2808     return 0;
2809 }
2810 
2811 /**
2812  * This "operation" for walk_over_reftable() writes the refblock to disk at the
2813  * offset specified by the new reftable's entry. It does not modify the new
2814  * reftable or change any refcounts.
2815  */
2816 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2817                           uint64_t reftable_index, uint64_t *reftable_size,
2818                           void *refblock, bool refblock_empty, bool *allocated,
2819                           Error **errp)
2820 {
2821     BDRVQcow2State *s = bs->opaque;
2822     int64_t offset;
2823     int ret;
2824 
2825     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2826         offset = (*reftable)[reftable_index];
2827 
2828         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2829         if (ret < 0) {
2830             error_setg_errno(errp, -ret, "Overlap check failed");
2831             return ret;
2832         }
2833 
2834         ret = bdrv_pwrite(bs->file, offset, refblock, s->cluster_size);
2835         if (ret < 0) {
2836             error_setg_errno(errp, -ret, "Failed to write refblock");
2837             return ret;
2838         }
2839     } else {
2840         assert(refblock_empty);
2841     }
2842 
2843     return 0;
2844 }
2845 
2846 /**
2847  * This function walks over the existing reftable and every referenced refblock;
2848  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2849  * create an equal new entry in the passed @new_refblock. Once that
2850  * @new_refblock is completely filled, @operation will be called.
2851  *
2852  * @status_cb and @cb_opaque are used for the amend operation's status callback.
2853  * @index is the index of this walk_over_reftable() call and @total is the total
2854  * number of walk_over_reftable() calls per amend operation. Both are used for
2855  * calculating the parameters for the status callback.
2856  *
2857  * @allocated is set to true if a new cluster has been allocated.
2858  */
2859 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2860                               uint64_t *new_reftable_index,
2861                               uint64_t *new_reftable_size,
2862                               void *new_refblock, int new_refblock_size,
2863                               int new_refcount_bits,
2864                               RefblockFinishOp *operation, bool *allocated,
2865                               Qcow2SetRefcountFunc *new_set_refcount,
2866                               BlockDriverAmendStatusCB *status_cb,
2867                               void *cb_opaque, int index, int total,
2868                               Error **errp)
2869 {
2870     BDRVQcow2State *s = bs->opaque;
2871     uint64_t reftable_index;
2872     bool new_refblock_empty = true;
2873     int refblock_index;
2874     int new_refblock_index = 0;
2875     int ret;
2876 
2877     for (reftable_index = 0; reftable_index < s->refcount_table_size;
2878          reftable_index++)
2879     {
2880         uint64_t refblock_offset = s->refcount_table[reftable_index]
2881                                  & REFT_OFFSET_MASK;
2882 
2883         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2884                   (uint64_t)total * s->refcount_table_size, cb_opaque);
2885 
2886         if (refblock_offset) {
2887             void *refblock;
2888 
2889             if (offset_into_cluster(s, refblock_offset)) {
2890                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2891                                         PRIx64 " unaligned (reftable index: %#"
2892                                         PRIx64 ")", refblock_offset,
2893                                         reftable_index);
2894                 error_setg(errp,
2895                            "Image is corrupt (unaligned refblock offset)");
2896                 return -EIO;
2897             }
2898 
2899             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2900                                   &refblock);
2901             if (ret < 0) {
2902                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2903                 return ret;
2904             }
2905 
2906             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2907                  refblock_index++)
2908             {
2909                 uint64_t refcount;
2910 
2911                 if (new_refblock_index >= new_refblock_size) {
2912                     /* new_refblock is now complete */
2913                     ret = operation(bs, new_reftable, *new_reftable_index,
2914                                     new_reftable_size, new_refblock,
2915                                     new_refblock_empty, allocated, errp);
2916                     if (ret < 0) {
2917                         qcow2_cache_put(s->refcount_block_cache, &refblock);
2918                         return ret;
2919                     }
2920 
2921                     (*new_reftable_index)++;
2922                     new_refblock_index = 0;
2923                     new_refblock_empty = true;
2924                 }
2925 
2926                 refcount = s->get_refcount(refblock, refblock_index);
2927                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2928                     uint64_t offset;
2929 
2930                     qcow2_cache_put(s->refcount_block_cache, &refblock);
2931 
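                    /* Reconstruct the offset of the cluster this refcount
                     * entry describes: the reftable index selects the
                     * refblock, the refblock index the entry within it */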
2932                     offset = ((reftable_index << s->refcount_block_bits)
2933                               + refblock_index) << s->cluster_bits;
2934 
2935                     error_setg(errp, "Cannot decrease refcount entry width to "
2936                                "%i bits: Cluster at offset %#" PRIx64 " has a "
2937                                "refcount of %" PRIu64, new_refcount_bits,
2938                                offset, refcount);
2939                     return -EINVAL;
2940                 }
2941 
2942                 if (new_set_refcount) {
2943                     new_set_refcount(new_refblock, new_refblock_index++,
2944                                      refcount);
2945                 } else {
2946                     new_refblock_index++;
2947                 }
2948                 new_refblock_empty = new_refblock_empty && refcount == 0;
2949             }
2950 
2951             qcow2_cache_put(s->refcount_block_cache, &refblock);
2952         } else {
2953             /* No refblock means every refcount is 0 */
2954             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2955                  refblock_index++)
2956             {
2957                 if (new_refblock_index >= new_refblock_size) {
2958                     /* new_refblock is now complete */
2959                     ret = operation(bs, new_reftable, *new_reftable_index,
2960                                     new_reftable_size, new_refblock,
2961                                     new_refblock_empty, allocated, errp);
2962                     if (ret < 0) {
2963                         return ret;
2964                     }
2965 
2966                     (*new_reftable_index)++;
2967                     new_refblock_index = 0;
2968                     new_refblock_empty = true;
2969                 }
2970 
2971                 if (new_set_refcount) {
2972                     new_set_refcount(new_refblock, new_refblock_index++, 0);
2973                 } else {
2974                     new_refblock_index++;
2975                 }
2976             }
2977         }
2978     }
2979 
2980     if (new_refblock_index > 0) {
2981         /* Complete the final refblock, which may be only partially filled */
2982         if (new_set_refcount) {
2983             for (; new_refblock_index < new_refblock_size;
2984                  new_refblock_index++)
2985             {
2986                 new_set_refcount(new_refblock, new_refblock_index, 0);
2987             }
2988         }
2989 
2990         ret = operation(bs, new_reftable, *new_reftable_index,
2991                         new_reftable_size, new_refblock, new_refblock_empty,
2992                         allocated, errp);
2993         if (ret < 0) {
2994             return ret;
2995         }
2996 
2997         (*new_reftable_index)++;
2998     }
2999 
3000     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3001               (uint64_t)total * s->refcount_table_size, cb_opaque);
3002 
3003     return 0;
3004 }
3005 
3006 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3007                                 BlockDriverAmendStatusCB *status_cb,
3008                                 void *cb_opaque, Error **errp)
3009 {
3010     BDRVQcow2State *s = bs->opaque;
3011     Qcow2GetRefcountFunc *new_get_refcount;
3012     Qcow2SetRefcountFunc *new_set_refcount;
3013     void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3014     uint64_t *new_reftable = NULL, new_reftable_size = 0;
3015     uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3016     uint64_t new_reftable_index = 0;
3017     uint64_t i;
3018     int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3019     int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3020     int old_refcount_order;
3021     int walk_index = 0;
3022     int ret;
3023     bool new_allocation;
3024 
3025     assert(s->qcow_version >= 3);
3026     assert(refcount_order >= 0 && refcount_order <= 6);
3027 
3028     /* see qcow2_open() */
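    /* Each entry takes 2^refcount_order bits, so a cluster of 2^cluster_bits
     * bytes holds 2^(cluster_bits + 3 - refcount_order) entries; e.g. 64 kB
     * clusters with refcount_order = 4 (16-bit entries) give 32768 entries
     * per refblock */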
3029     new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
3030 
3031     new_get_refcount = get_refcount_funcs[refcount_order];
3032     new_set_refcount = set_refcount_funcs[refcount_order];
3033 
3034 
3035     do {
3036         int total_walks;
3037 
3038         new_allocation = false;
3039 
3040         /* We always need at least this walk plus the final one that
3041          * writes the refblocks. Normally, this loop runs exactly twice:
3042          * once to perform the allocations, and once more to confirm that
3043          * no further allocation was necessary; together with the write
3044          * pass, that makes three walks in total. */
3045         total_walks = MAX(walk_index + 2, 3);
3046 
3047         /* First, allocate the new refblocks, so that the clusters they
3048          * occupy are themselves covered by the refcount structures */
3049         ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3050                                  &new_reftable_size, NULL, new_refblock_size,
3051                                  new_refcount_bits, &alloc_refblock,
3052                                  &new_allocation, NULL, status_cb, cb_opaque,
3053                                  walk_index++, total_walks, errp);
3054         if (ret < 0) {
3055             goto done;
3056         }
3057 
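        /* The next walk starts over from the beginning of the new reftable */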
3058         new_reftable_index = 0;
3059 
3060         if (new_allocation) {
3061             if (new_reftable_offset) {
3062                 qcow2_free_clusters(bs, new_reftable_offset,
3063                                     allocated_reftable_size * sizeof(uint64_t),
3064                                     QCOW2_DISCARD_NEVER);
3065             }
3066 
3067             new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3068                                                            sizeof(uint64_t));
3069             if (new_reftable_offset < 0) {
3070                 error_setg_errno(errp, -new_reftable_offset,
3071                                  "Failed to allocate the new reftable");
3072                 ret = new_reftable_offset;
3073                 goto done;
3074             }
3075             allocated_reftable_size = new_reftable_size;
3076         }
3077     } while (new_allocation);
3078 
3079     /* Second, write the new refblocks */
3080     ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3081                              &new_reftable_size, new_refblock,
3082                              new_refblock_size, new_refcount_bits,
3083                              &flush_refblock, &new_allocation, new_set_refcount,
3084                              status_cb, cb_opaque, walk_index, walk_index + 1,
3085                              errp);
3086     if (ret < 0) {
3087         goto done;
3088     }
3089     assert(!new_allocation);
3090 
3091 
3092     /* Write the new reftable */
3093     ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
3094                                         new_reftable_size * sizeof(uint64_t));
3095     if (ret < 0) {
3096         error_setg_errno(errp, -ret, "Overlap check failed");
3097         goto done;
3098     }
3099 
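    /* The reftable is stored in big-endian on disk; convert the entries in
     * place for writing and convert them back afterwards */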
3100     for (i = 0; i < new_reftable_size; i++) {
3101         cpu_to_be64s(&new_reftable[i]);
3102     }
3103 
3104     ret = bdrv_pwrite(bs->file, new_reftable_offset, new_reftable,
3105                       new_reftable_size * sizeof(uint64_t));
3106 
3107     for (i = 0; i < new_reftable_size; i++) {
3108         be64_to_cpus(&new_reftable[i]);
3109     }
3110 
3111     if (ret < 0) {
3112         error_setg_errno(errp, -ret, "Failed to write the new reftable");
3113         goto done;
3114     }
3115 
3116 
3117     /* Write back all dirty refblocks in the refcount block cache */
3118     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
3119     if (ret < 0) {
3120         error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
3121         goto done;
3122     }
3123 
3124     /* Update the image header to point to the new reftable; this only updates
3125      * the fields which are relevant to qcow2_update_header(); other fields
3126      * such as s->refcount_table or s->refcount_bits stay stale for now
3127      * (because we have to restore everything if qcow2_update_header() fails) */
3128     old_refcount_order  = s->refcount_order;
3129     old_reftable_size   = s->refcount_table_size;
3130     old_reftable_offset = s->refcount_table_offset;
3131 
3132     s->refcount_order        = refcount_order;
3133     s->refcount_table_size   = new_reftable_size;
3134     s->refcount_table_offset = new_reftable_offset;
3135 
3136     ret = qcow2_update_header(bs);
3137     if (ret < 0) {
3138         s->refcount_order        = old_refcount_order;
3139         s->refcount_table_size   = old_reftable_size;
3140         s->refcount_table_offset = old_reftable_offset;
3141         error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
3142         goto done;
3143     }
3144 
3145     /* Now update the rest of the in-memory information */
3146     old_reftable = s->refcount_table;
3147     s->refcount_table = new_reftable;
3148     update_max_refcount_table_index(s);
3149 
3150     s->refcount_bits = 1 << refcount_order;
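    /* s->refcount_max = 2^refcount_bits - 1, computed in two steps so that
     * the shift amount stays below 64 even for 64-bit refcounts */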
3151     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
3152     s->refcount_max += s->refcount_max - 1;
3153 
3154     s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
3155     s->refcount_block_size = 1 << s->refcount_block_bits;
3156 
3157     s->get_refcount = new_get_refcount;
3158     s->set_refcount = new_set_refcount;
3159 
3160     /* For cleaning up all old refblocks and the old reftable below the "done"
3161      * label */
3162     new_reftable        = old_reftable;
3163     new_reftable_size   = old_reftable_size;
3164     new_reftable_offset = old_reftable_offset;
3165 
3166 done:
3167     if (new_reftable) {
3168         /* On success, new_reftable actually points to the old reftable (and
3169          * new_reftable_size is the old reftable's size); but that is just
3170          * fine */
3171         for (i = 0; i < new_reftable_size; i++) {
3172             uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
3173             if (offset) {
3174                 qcow2_free_clusters(bs, offset, s->cluster_size,
3175                                     QCOW2_DISCARD_OTHER);
3176             }
3177         }
3178         g_free(new_reftable);
3179 
3180         if (new_reftable_offset > 0) {
3181             qcow2_free_clusters(bs, new_reftable_offset,
3182                                 new_reftable_size * sizeof(uint64_t),
3183                                 QCOW2_DISCARD_OTHER);
3184         }
3185     }
3186 
3187     qemu_vfree(new_refblock);
3188     return ret;
3189 }
3190 
3191 static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3192 {
3193     BDRVQcow2State *s = bs->opaque;
3194     uint32_t index = offset_to_reftable_index(s, offset);
3195     int64_t covering_refblock_offset = 0;
3196 
3197     if (index < s->refcount_table_size) {
3198         covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3199     }
3200     if (!covering_refblock_offset) {
3201         qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3202                                 "not covered by the refcount structures",
3203                                 offset);
3204         return -EIO;
3205     }
3206 
3207     return covering_refblock_offset;
3208 }
3209 
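/**
 * Discards the refblock cluster at @discard_block_offs: Its refcount (which
 * must be exactly 1) is set to zero, the refblock is dropped from the cache
 * if it is cached there, and the cluster is queued for discard.
 */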
3210 static int qcow2_discard_refcount_block(BlockDriverState *bs,
3211                                         uint64_t discard_block_offs)
3212 {
3213     BDRVQcow2State *s = bs->opaque;
3214     int64_t refblock_offs;
3215     uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
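    /* Index of the discarded cluster's refcount entry within its refblock */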
3216     uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
3217     void *refblock;
3218     int ret;
3219 
3220     refblock_offs = get_refblock_offset(bs, discard_block_offs);
3221     if (refblock_offs < 0) {
3222         return refblock_offs;
3223     }
3224 
3225     assert(discard_block_offs != 0);
3226 
3227     ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3228                           &refblock);
3229     if (ret < 0) {
3230         return ret;
3231     }
3232 
3233     if (s->get_refcount(refblock, block_index) != 1) {
3234         qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3235                                 " refblock offset %#" PRIx64
3236                                 ", reftable index %u"
3237                                 ", block offset %#" PRIx64
3238                                 ", refcount %#" PRIx64,
3239                                 refblock_offs,
3240                                 offset_to_reftable_index(s, discard_block_offs),
3241                                 discard_block_offs,
3242                                 s->get_refcount(refblock, block_index));
3243         qcow2_cache_put(s->refcount_block_cache, &refblock);
3244         return -EINVAL;
3245     }
3246     s->set_refcount(refblock, block_index, 0);
3247 
3248     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3249 
3250     qcow2_cache_put(s->refcount_block_cache, &refblock);
3251 
3252     if (cluster_index < s->free_cluster_index) {
3253         s->free_cluster_index = cluster_index;
3254     }
3255 
3256     refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3257                                            discard_block_offs);
3258     if (refblock) {
3259         /* discard the refblock from the cache if it is currently cached */
3260         qcow2_cache_discard(s->refcount_block_cache, refblock);
3261     }
3262     update_refcount_discard(bs, discard_block_offs, s->cluster_size);
3263 
3264     return 0;
3265 }
3266 
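/**
 * Writes a copy of the reftable to disk in which all entries for refblocks
 * without any used clusters are zeroed, then clears those entries in memory
 * and frees the corresponding refblocks. The reftable itself keeps its size.
 */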
3267 int qcow2_shrink_reftable(BlockDriverState *bs)
3268 {
3269     BDRVQcow2State *s = bs->opaque;
3270     uint64_t *reftable_tmp =
3271         g_malloc(s->refcount_table_size * sizeof(uint64_t));
3272     int i, ret;
3273 
3274     for (i = 0; i < s->refcount_table_size; i++) {
3275         int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3276         void *refblock;
3277         bool unused_block;
3278 
3279         if (refblock_offs == 0) {
3280             reftable_tmp[i] = 0;
3281             continue;
3282         }
3283         ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3284                               &refblock);
3285         if (ret < 0) {
3286             goto out;
3287         }
3288 
3289         /* This refblock covers its own cluster and thus holds its own
              * refcount entry; mask that self-reference out before testing
              * whether the rest of the block is unused */
3290         if (i == offset_to_reftable_index(s, refblock_offs)) {
3291             uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3292                                    (s->refcount_block_size - 1);
3293             uint64_t refcount = s->get_refcount(refblock, block_index);
3294 
3295             s->set_refcount(refblock, block_index, 0);
3296 
3297             unused_block = buffer_is_zero(refblock, s->cluster_size);
3298 
3299             s->set_refcount(refblock, block_index, refcount);
3300         } else {
3301             unused_block = buffer_is_zero(refblock, s->cluster_size);
3302         }
3303         qcow2_cache_put(s->refcount_block_cache, &refblock);
3304 
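        /* reftable_tmp is written to disk as-is, so store the entry in
         * big-endian byte order */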
3305         reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
3306     }
3307 
3308     ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
3309                            s->refcount_table_size * sizeof(uint64_t));
3310     /*
3311      * If the write to the reftable failed, the image may contain a partially
3312      * overwritten reftable. In that case, also clear the affected entries in
3313      * the in-memory reftable to avoid further image corruption.
3314      */
3315     for (i = 0; i < s->refcount_table_size; i++) {
3316         if (s->refcount_table[i] && !reftable_tmp[i]) {
3317             if (ret == 0) {
3318                 ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3319                                                        REFT_OFFSET_MASK);
3320             }
3321             s->refcount_table[i] = 0;
3322         }
3323     }
3324 
3325     if (!s->cache_discards) {
3326         qcow2_process_discards(bs, ret);
3327     }
3328 
3329 out:
3330     g_free(reftable_tmp);
3331     return ret;
3332 }
3333 
3334 int64_t qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3335 {
3336     BDRVQcow2State *s = bs->opaque;
3337     int64_t i;
3338 
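    /* Scan backwards from the cluster containing the last byte of @size for
     * the last cluster that is still in use */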
3339     for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3340         uint64_t refcount;
3341         int ret = qcow2_get_refcount(bs, i, &refcount);
3342         if (ret < 0) {
3343             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3344                     i, strerror(-ret));
3345             return ret;
3346         }
3347         if (refcount > 0) {
3348             return i;
3349         }
3350     }
3351     qcow2_signal_corruption(bs, true, -1, -1,
3352                             "There are no references in the refcount table.");
3353     return -EIO;
3354 }
3355