xref: /qemu/block/qcow2-refcount.c (revision ab930e80)
1 /*
2  * Block driver for the QCOW version 2 format
3  *
4  * Copyright (c) 2004-2006 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 
25 #include "qemu/osdep.h"
26 #include "block/block-io.h"
27 #include "qapi/error.h"
28 #include "qcow2.h"
29 #include "qemu/range.h"
30 #include "qemu/bswap.h"
31 #include "qemu/cutils.h"
32 #include "qemu/memalign.h"
33 #include "trace.h"
34 
35 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
36                                     uint64_t max);
37 
38 G_GNUC_WARN_UNUSED_RESULT
39 static int update_refcount(BlockDriverState *bs,
40                            int64_t offset, int64_t length, uint64_t addend,
41                            bool decrease, enum qcow2_discard_type type);
42 
43 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
44 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
45 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
46 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
47 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
48 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
49 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);
50 
51 static void set_refcount_ro0(void *refcount_array, uint64_t index,
52                              uint64_t value);
53 static void set_refcount_ro1(void *refcount_array, uint64_t index,
54                              uint64_t value);
55 static void set_refcount_ro2(void *refcount_array, uint64_t index,
56                              uint64_t value);
57 static void set_refcount_ro3(void *refcount_array, uint64_t index,
58                              uint64_t value);
59 static void set_refcount_ro4(void *refcount_array, uint64_t index,
60                              uint64_t value);
61 static void set_refcount_ro5(void *refcount_array, uint64_t index,
62                              uint64_t value);
63 static void set_refcount_ro6(void *refcount_array, uint64_t index,
64                              uint64_t value);
65 
66 
67 static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
68     &get_refcount_ro0,
69     &get_refcount_ro1,
70     &get_refcount_ro2,
71     &get_refcount_ro3,
72     &get_refcount_ro4,
73     &get_refcount_ro5,
74     &get_refcount_ro6
75 };
76 
77 static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
78     &set_refcount_ro0,
79     &set_refcount_ro1,
80     &set_refcount_ro2,
81     &set_refcount_ro3,
82     &set_refcount_ro4,
83     &set_refcount_ro5,
84     &set_refcount_ro6
85 };
86 
87 
88 /*********************************************************/
89 /* refcount handling */
90 
91 static void update_max_refcount_table_index(BDRVQcow2State *s)
92 {
93     unsigned i = s->refcount_table_size - 1;
94     while (i > 0 && (s->refcount_table[i] & REFT_OFFSET_MASK) == 0) {
95         i--;
96     }
97     /* Set s->max_refcount_table_index to the index of the last used entry */
98     s->max_refcount_table_index = i;
99 }
100 
101 int coroutine_fn qcow2_refcount_init(BlockDriverState *bs)
102 {
103     BDRVQcow2State *s = bs->opaque;
104     unsigned int refcount_table_size2, i;
105     int ret;
106 
107     assert(s->refcount_order >= 0 && s->refcount_order <= 6);
108 
109     s->get_refcount = get_refcount_funcs[s->refcount_order];
110     s->set_refcount = set_refcount_funcs[s->refcount_order];
111 
112     assert(s->refcount_table_size <= INT_MAX / REFTABLE_ENTRY_SIZE);
113     refcount_table_size2 = s->refcount_table_size * REFTABLE_ENTRY_SIZE;
114     s->refcount_table = g_try_malloc(refcount_table_size2);
115 
116     if (s->refcount_table_size > 0) {
117         if (s->refcount_table == NULL) {
118             ret = -ENOMEM;
119             goto fail;
120         }
121         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
122         ret = bdrv_co_pread(bs->file, s->refcount_table_offset,
123                             refcount_table_size2, s->refcount_table, 0);
124         if (ret < 0) {
125             goto fail;
126         }
127         for(i = 0; i < s->refcount_table_size; i++)
128             be64_to_cpus(&s->refcount_table[i]);
129         update_max_refcount_table_index(s);
130     }
131     return 0;
132  fail:
133     return ret;
134 }
135 
136 void qcow2_refcount_close(BlockDriverState *bs)
137 {
138     BDRVQcow2State *s = bs->opaque;
139     g_free(s->refcount_table);
140 }
141 
142 
143 static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
144 {
145     return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
146 }
147 
148 static void set_refcount_ro0(void *refcount_array, uint64_t index,
149                              uint64_t value)
150 {
151     assert(!(value >> 1));
152     ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
153     ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
154 }
155 
156 static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
157 {
158     return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
159            & 0x3;
160 }
161 
162 static void set_refcount_ro1(void *refcount_array, uint64_t index,
163                              uint64_t value)
164 {
165     assert(!(value >> 2));
166     ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
167     ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
168 }
169 
170 static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
171 {
172     return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
173            & 0xf;
174 }
175 
176 static void set_refcount_ro2(void *refcount_array, uint64_t index,
177                              uint64_t value)
178 {
179     assert(!(value >> 4));
180     ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
181     ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
182 }
183 
184 static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
185 {
186     return ((const uint8_t *)refcount_array)[index];
187 }
188 
189 static void set_refcount_ro3(void *refcount_array, uint64_t index,
190                              uint64_t value)
191 {
192     assert(!(value >> 8));
193     ((uint8_t *)refcount_array)[index] = value;
194 }
195 
196 static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
197 {
198     return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
199 }
200 
201 static void set_refcount_ro4(void *refcount_array, uint64_t index,
202                              uint64_t value)
203 {
204     assert(!(value >> 16));
205     ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
206 }
207 
208 static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
209 {
210     return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
211 }
212 
213 static void set_refcount_ro5(void *refcount_array, uint64_t index,
214                              uint64_t value)
215 {
216     assert(!(value >> 32));
217     ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
218 }
219 
220 static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
221 {
222     return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
223 }
224 
225 static void set_refcount_ro6(void *refcount_array, uint64_t index,
226                              uint64_t value)
227 {
228     ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
229 }
230 
231 
232 static int load_refcount_block(BlockDriverState *bs,
233                                int64_t refcount_block_offset,
234                                void **refcount_block)
235 {
236     BDRVQcow2State *s = bs->opaque;
237 
238     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
239     return qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
240                            refcount_block);
241 }
242 
243 /*
244  * Retrieves the refcount of the cluster given by its index and stores it in
245  * *refcount. Returns 0 on success and -errno on failure.
246  */
247 int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
248                        uint64_t *refcount)
249 {
250     BDRVQcow2State *s = bs->opaque;
251     uint64_t refcount_table_index, block_index;
252     int64_t refcount_block_offset;
253     int ret;
254     void *refcount_block;
255 
256     refcount_table_index = cluster_index >> s->refcount_block_bits;
257     if (refcount_table_index >= s->refcount_table_size) {
258         *refcount = 0;
259         return 0;
260     }
261     refcount_block_offset =
262         s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
263     if (!refcount_block_offset) {
264         *refcount = 0;
265         return 0;
266     }
267 
268     if (offset_into_cluster(s, refcount_block_offset)) {
269         qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
270                                 " unaligned (reftable index: %#" PRIx64 ")",
271                                 refcount_block_offset, refcount_table_index);
272         return -EIO;
273     }
274 
275     ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
276                           &refcount_block);
277     if (ret < 0) {
278         return ret;
279     }
280 
281     block_index = cluster_index & (s->refcount_block_size - 1);
282     *refcount = s->get_refcount(refcount_block, block_index);
283 
284     qcow2_cache_put(s->refcount_block_cache, &refcount_block);
285 
286     return 0;
287 }
288 
289 /* Checks if two offsets are described by the same refcount block */
290 static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
291     uint64_t offset_b)
292 {
293     uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
294     uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);
295 
296     return (block_a == block_b);
297 }
298 
299 /*
300  * Loads a refcount block. If it doesn't exist yet, it is allocated first
301  * (including growing the refcount table if needed).
302  *
303  * Returns 0 on success or -errno in error case
304  */
305 static int alloc_refcount_block(BlockDriverState *bs,
306                                 int64_t cluster_index, void **refcount_block)
307 {
308     BDRVQcow2State *s = bs->opaque;
309     unsigned int refcount_table_index;
310     int64_t ret;
311 
312     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
313 
314     /* Find the refcount block for the given cluster */
315     refcount_table_index = cluster_index >> s->refcount_block_bits;
316 
317     if (refcount_table_index < s->refcount_table_size) {
318 
319         uint64_t refcount_block_offset =
320             s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
321 
322         /* If it's already there, we're done */
323         if (refcount_block_offset) {
324             if (offset_into_cluster(s, refcount_block_offset)) {
325                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
326                                         PRIx64 " unaligned (reftable index: "
327                                         "%#x)", refcount_block_offset,
328                                         refcount_table_index);
329                 return -EIO;
330             }
331 
332              return load_refcount_block(bs, refcount_block_offset,
333                                         refcount_block);
334         }
335     }
336 
337     /*
338      * If we came here, we need to allocate something. Something is at least
339      * a cluster for the new refcount block. It may also include a new refcount
340      * table if the old refcount table is too small.
341      *
342      * Note that allocating clusters here needs some special care:
343      *
344      * - We can't use the normal qcow2_alloc_clusters(), it would try to
345      *   increase the refcount and very likely we would end up with an endless
346      *   recursion. Instead we must place the refcount blocks in a way that
347      *   they can describe them themselves.
348      *
349      * - We need to consider that at this point we are inside update_refcounts
350      *   and potentially doing an initial refcount increase. This means that
351      *   some clusters have already been allocated by the caller, but their
352      *   refcount isn't accurate yet. If we allocate clusters for metadata, we
353      *   need to return -EAGAIN to signal the caller that it needs to restart
354      *   the search for free clusters.
355      *
356      * - alloc_clusters_noref and qcow2_free_clusters may load a different
357      *   refcount block into the cache
358      */
359 
360     *refcount_block = NULL;
361 
362     /* We write to the refcount table, so we might depend on L2 tables */
363     ret = qcow2_cache_flush(bs, s->l2_table_cache);
364     if (ret < 0) {
365         return ret;
366     }
367 
368     /* Allocate the refcount block itself and mark it as used */
369     int64_t new_block = alloc_clusters_noref(bs, s->cluster_size, INT64_MAX);
370     if (new_block < 0) {
371         return new_block;
372     }
373 
374     /* The offset must fit in the offset field of the refcount table entry */
375     assert((new_block & REFT_OFFSET_MASK) == new_block);
376 
377     /* If we're allocating the block at offset 0 then something is wrong */
378     if (new_block == 0) {
379         qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
380                                 "allocation of refcount block at offset 0");
381         return -EIO;
382     }
383 
384 #ifdef DEBUG_ALLOC2
385     fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
386         " at %" PRIx64 "\n",
387         refcount_table_index, cluster_index << s->cluster_bits, new_block);
388 #endif
389 
390     if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
391         /* Zero the new refcount block before updating it */
392         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
393                                     refcount_block);
394         if (ret < 0) {
395             goto fail;
396         }
397 
398         memset(*refcount_block, 0, s->cluster_size);
399 
400         /* The block describes itself, need to update the cache */
401         int block_index = (new_block >> s->cluster_bits) &
402             (s->refcount_block_size - 1);
403         s->set_refcount(*refcount_block, block_index, 1);
404     } else {
405         /* Described somewhere else. This can recurse at most twice before we
406          * arrive at a block that describes itself. */
407         ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
408                               QCOW2_DISCARD_NEVER);
409         if (ret < 0) {
410             goto fail;
411         }
412 
413         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
414         if (ret < 0) {
415             goto fail;
416         }
417 
418         /* Initialize the new refcount block only after updating its refcount,
419          * update_refcount uses the refcount cache itself */
420         ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
421                                     refcount_block);
422         if (ret < 0) {
423             goto fail;
424         }
425 
426         memset(*refcount_block, 0, s->cluster_size);
427     }
428 
429     /* Now the new refcount block needs to be written to disk */
430     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
431     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, *refcount_block);
432     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
433     if (ret < 0) {
434         goto fail;
435     }
436 
437     /* If the refcount table is big enough, just hook the block up there */
438     if (refcount_table_index < s->refcount_table_size) {
439         uint64_t data64 = cpu_to_be64(new_block);
440         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
441         ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
442                                refcount_table_index * REFTABLE_ENTRY_SIZE,
443             sizeof(data64), &data64, 0);
444         if (ret < 0) {
445             goto fail;
446         }
447 
448         s->refcount_table[refcount_table_index] = new_block;
449         /* If there's a hole in s->refcount_table then it can happen
450          * that refcount_table_index < s->max_refcount_table_index */
451         s->max_refcount_table_index =
452             MAX(s->max_refcount_table_index, refcount_table_index);
453 
454         /* The new refcount block may be where the caller intended to put its
455          * data, so let it restart the search. */
456         return -EAGAIN;
457     }
458 
459     qcow2_cache_put(s->refcount_block_cache, refcount_block);
460 
461     /*
462      * If we come here, we need to grow the refcount table. Again, a new
463      * refcount table needs some space and we can't simply allocate to avoid
464      * endless recursion.
465      *
466      * Therefore let's grab new refcount blocks at the end of the image, which
467      * will describe themselves and the new refcount table. This way we can
468      * reference them only in the new table and do the switch to the new
469      * refcount table at once without producing an inconsistent state in
470      * between.
471      */
472     BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);
473 
474     /* Calculate the number of refcount blocks needed so far; this will be the
475      * basis for calculating the index of the first cluster used for the
476      * self-describing refcount structures which we are about to create.
477      *
478      * Because we reached this point, there cannot be any refcount entries for
479      * cluster_index or higher indices yet. However, because new_block has been
480      * allocated to describe that cluster (and it will assume this role later
481      * on), we cannot use that index; also, new_block may actually have a higher
482      * cluster index than cluster_index, so it needs to be taken into account
483      * here (and 1 needs to be added to its value because that cluster is used).
484      */
485     uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
486                                             (new_block >> s->cluster_bits) + 1),
487                                         s->refcount_block_size);
488 
489     /* Create the new refcount table and blocks */
490     uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
491         s->cluster_size;
492 
493     ret = qcow2_refcount_area(bs, meta_offset, 0, false,
494                               refcount_table_index, new_block);
495     if (ret < 0) {
496         return ret;
497     }
498 
499     ret = load_refcount_block(bs, new_block, refcount_block);
500     if (ret < 0) {
501         return ret;
502     }
503 
504     /* If we were trying to do the initial refcount update for some cluster
505      * allocation, we might have used the same clusters to store newly
506      * allocated metadata. Make the caller search some new space. */
507     return -EAGAIN;
508 
509 fail:
510     if (*refcount_block != NULL) {
511         qcow2_cache_put(s->refcount_block_cache, refcount_block);
512     }
513     return ret;
514 }
515 
516 /*
517  * Starting at @start_offset, this function creates new self-covering refcount
518  * structures: A new refcount table and refcount blocks which cover all of
519  * themselves, and a number of @additional_clusters beyond their end.
520  * @start_offset must be at the end of the image file, that is, there must be
521  * only empty space beyond it.
522  * If @exact_size is false, the refcount table will have 50 % more entries than
523  * necessary so it will not need to grow again soon.
524  * If @new_refblock_offset is not zero, it contains the offset of a refcount
525  * block that should be entered into the new refcount table at index
526  * @new_refblock_index.
527  *
528  * Returns: The offset after the new refcount structures (i.e. where the
529  *          @additional_clusters may be placed) on success, -errno on error.
530  */
531 int64_t qcow2_refcount_area(BlockDriverState *bs, uint64_t start_offset,
532                             uint64_t additional_clusters, bool exact_size,
533                             int new_refblock_index,
534                             uint64_t new_refblock_offset)
535 {
536     BDRVQcow2State *s = bs->opaque;
537     uint64_t total_refblock_count_u64, additional_refblock_count;
538     int total_refblock_count, table_size, area_reftable_index, table_clusters;
539     int i;
540     uint64_t table_offset, block_offset, end_offset;
541     int ret;
542     uint64_t *new_table;
543 
544     assert(!(start_offset % s->cluster_size));
545 
546     qcow2_refcount_metadata_size(start_offset / s->cluster_size +
547                                  additional_clusters,
548                                  s->cluster_size, s->refcount_order,
549                                  !exact_size, &total_refblock_count_u64);
550     if (total_refblock_count_u64 > QCOW_MAX_REFTABLE_SIZE) {
551         return -EFBIG;
552     }
553     total_refblock_count = total_refblock_count_u64;
554 
555     /* Index in the refcount table of the first refcount block to cover the area
556      * of refcount structures we are about to create; we know that
557      * @total_refblock_count can cover @start_offset, so this will definitely
558      * fit into an int. */
559     area_reftable_index = (start_offset / s->cluster_size) /
560                           s->refcount_block_size;
561 
562     if (exact_size) {
563         table_size = total_refblock_count;
564     } else {
565         table_size = total_refblock_count +
566                      DIV_ROUND_UP(total_refblock_count, 2);
567     }
568     /* The qcow2 file can only store the reftable size in number of clusters */
569     table_size = ROUND_UP(table_size, s->cluster_size / REFTABLE_ENTRY_SIZE);
570     table_clusters = (table_size * REFTABLE_ENTRY_SIZE) / s->cluster_size;
571 
572     if (table_size > QCOW_MAX_REFTABLE_SIZE) {
573         return -EFBIG;
574     }
575 
576     new_table = g_try_new0(uint64_t, table_size);
577 
578     assert(table_size > 0);
579     if (new_table == NULL) {
580         ret = -ENOMEM;
581         goto fail;
582     }
583 
584     /* Fill the new refcount table */
585     if (table_size > s->max_refcount_table_index) {
586         /* We're actually growing the reftable */
587         memcpy(new_table, s->refcount_table,
588                (s->max_refcount_table_index + 1) * REFTABLE_ENTRY_SIZE);
589     } else {
590         /* Improbable case: We're shrinking the reftable. However, the caller
591          * has assured us that there is only empty space beyond @start_offset,
592          * so we can simply drop all of the refblocks that won't fit into the
593          * new reftable. */
594         memcpy(new_table, s->refcount_table, table_size * REFTABLE_ENTRY_SIZE);
595     }
596 
597     if (new_refblock_offset) {
598         assert(new_refblock_index < total_refblock_count);
599         new_table[new_refblock_index] = new_refblock_offset;
600     }
601 
602     /* Count how many new refblocks we have to create */
603     additional_refblock_count = 0;
604     for (i = area_reftable_index; i < total_refblock_count; i++) {
605         if (!new_table[i]) {
606             additional_refblock_count++;
607         }
608     }
609 
610     table_offset = start_offset + additional_refblock_count * s->cluster_size;
611     end_offset = table_offset + table_clusters * s->cluster_size;
612 
613     /* Fill the refcount blocks, and create new ones, if necessary */
614     block_offset = start_offset;
615     for (i = area_reftable_index; i < total_refblock_count; i++) {
616         void *refblock_data;
617         uint64_t first_offset_covered;
618 
619         /* Reuse an existing refblock if possible, create a new one otherwise */
620         if (new_table[i]) {
621             ret = qcow2_cache_get(bs, s->refcount_block_cache, new_table[i],
622                                   &refblock_data);
623             if (ret < 0) {
624                 goto fail;
625             }
626         } else {
627             ret = qcow2_cache_get_empty(bs, s->refcount_block_cache,
628                                         block_offset, &refblock_data);
629             if (ret < 0) {
630                 goto fail;
631             }
632             memset(refblock_data, 0, s->cluster_size);
633             qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
634                                          refblock_data);
635 
636             new_table[i] = block_offset;
637             block_offset += s->cluster_size;
638         }
639 
640         /* First host offset covered by this refblock */
641         first_offset_covered = (uint64_t)i * s->refcount_block_size *
642                                s->cluster_size;
643         if (first_offset_covered < end_offset) {
644             int j, end_index;
645 
646             /* Set the refcount of all of the new refcount structures to 1 */
647 
648             if (first_offset_covered < start_offset) {
649                 assert(i == area_reftable_index);
650                 j = (start_offset - first_offset_covered) / s->cluster_size;
651                 assert(j < s->refcount_block_size);
652             } else {
653                 j = 0;
654             }
655 
656             end_index = MIN((end_offset - first_offset_covered) /
657                             s->cluster_size,
658                             s->refcount_block_size);
659 
660             for (; j < end_index; j++) {
661                 /* The caller guaranteed us this space would be empty */
662                 assert(s->get_refcount(refblock_data, j) == 0);
663                 s->set_refcount(refblock_data, j, 1);
664             }
665 
666             qcow2_cache_entry_mark_dirty(s->refcount_block_cache,
667                                          refblock_data);
668         }
669 
670         qcow2_cache_put(s->refcount_block_cache, &refblock_data);
671     }
672 
673     assert(block_offset == table_offset);
674 
675     /* Write refcount blocks to disk */
676     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
677     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
678     if (ret < 0) {
679         goto fail;
680     }
681 
682     /* Write refcount table to disk */
683     for (i = 0; i < total_refblock_count; i++) {
684         cpu_to_be64s(&new_table[i]);
685     }
686 
687     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
688     ret = bdrv_pwrite_sync(bs->file, table_offset,
689                            table_size * REFTABLE_ENTRY_SIZE, new_table, 0);
690     if (ret < 0) {
691         goto fail;
692     }
693 
694     for (i = 0; i < total_refblock_count; i++) {
695         be64_to_cpus(&new_table[i]);
696     }
697 
698     /* Hook up the new refcount table in the qcow2 header */
699     struct QEMU_PACKED {
700         uint64_t d64;
701         uint32_t d32;
702     } data;
703     data.d64 = cpu_to_be64(table_offset);
704     data.d32 = cpu_to_be32(table_clusters);
705     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
706     ret = bdrv_pwrite_sync(bs->file,
707                            offsetof(QCowHeader, refcount_table_offset),
708                            sizeof(data), &data, 0);
709     if (ret < 0) {
710         goto fail;
711     }
712 
713     /* And switch it in memory */
714     uint64_t old_table_offset = s->refcount_table_offset;
715     uint64_t old_table_size = s->refcount_table_size;
716 
717     g_free(s->refcount_table);
718     s->refcount_table = new_table;
719     s->refcount_table_size = table_size;
720     s->refcount_table_offset = table_offset;
721     update_max_refcount_table_index(s);
722 
723     /* Free old table. */
724     qcow2_free_clusters(bs, old_table_offset,
725                         old_table_size * REFTABLE_ENTRY_SIZE,
726                         QCOW2_DISCARD_OTHER);
727 
728     return end_offset;
729 
730 fail:
731     g_free(new_table);
732     return ret;
733 }
734 
735 void qcow2_process_discards(BlockDriverState *bs, int ret)
736 {
737     BDRVQcow2State *s = bs->opaque;
738     Qcow2DiscardRegion *d, *next;
739 
740     QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
741         QTAILQ_REMOVE(&s->discards, d, next);
742 
743         /* Discard is optional, ignore the return value */
744         if (ret >= 0) {
745             int r2 = bdrv_pdiscard(bs->file, d->offset, d->bytes);
746             if (r2 < 0) {
747                 trace_qcow2_process_discards_failed_region(d->offset, d->bytes,
748                                                            r2);
749             }
750         }
751 
752         g_free(d);
753     }
754 }
755 
756 static void update_refcount_discard(BlockDriverState *bs,
757                                     uint64_t offset, uint64_t length)
758 {
759     BDRVQcow2State *s = bs->opaque;
760     Qcow2DiscardRegion *d, *p, *next;
761 
762     QTAILQ_FOREACH(d, &s->discards, next) {
763         uint64_t new_start = MIN(offset, d->offset);
764         uint64_t new_end = MAX(offset + length, d->offset + d->bytes);
765 
766         if (new_end - new_start <= length + d->bytes) {
767             /* There can't be any overlap, areas ending up here have no
768              * references any more and therefore shouldn't get freed another
769              * time. */
770             assert(d->bytes + length == new_end - new_start);
771             d->offset = new_start;
772             d->bytes = new_end - new_start;
773             goto found;
774         }
775     }
776 
777     d = g_malloc(sizeof(*d));
778     *d = (Qcow2DiscardRegion) {
779         .bs     = bs,
780         .offset = offset,
781         .bytes  = length,
782     };
783     QTAILQ_INSERT_TAIL(&s->discards, d, next);
784 
785 found:
786     /* Merge discard requests if they are adjacent now */
787     QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
788         if (p == d
789             || p->offset > d->offset + d->bytes
790             || d->offset > p->offset + p->bytes)
791         {
792             continue;
793         }
794 
795         /* Still no overlap possible */
796         assert(p->offset == d->offset + d->bytes
797             || d->offset == p->offset + p->bytes);
798 
799         QTAILQ_REMOVE(&s->discards, p, next);
800         d->offset = MIN(d->offset, p->offset);
801         d->bytes += p->bytes;
802         g_free(p);
803     }
804 }
805 
806 /* XXX: cache several refcount block clusters ? */
807 /* @addend is the absolute value of the addend; if @decrease is set, @addend
808  * will be subtracted from the current refcount, otherwise it will be added */
809 static int update_refcount(BlockDriverState *bs,
810                            int64_t offset,
811                            int64_t length,
812                            uint64_t addend,
813                            bool decrease,
814                            enum qcow2_discard_type type)
815 {
816     BDRVQcow2State *s = bs->opaque;
817     int64_t start, last, cluster_offset;
818     void *refcount_block = NULL;
819     int64_t old_table_index = -1;
820     int ret;
821 
822 #ifdef DEBUG_ALLOC2
823     fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
824             " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
825             addend);
826 #endif
827     if (length < 0) {
828         return -EINVAL;
829     } else if (length == 0) {
830         return 0;
831     }
832 
833     if (decrease) {
834         qcow2_cache_set_dependency(bs, s->refcount_block_cache,
835             s->l2_table_cache);
836     }
837 
838     start = start_of_cluster(s, offset);
839     last = start_of_cluster(s, offset + length - 1);
840     for(cluster_offset = start; cluster_offset <= last;
841         cluster_offset += s->cluster_size)
842     {
843         int block_index;
844         uint64_t refcount;
845         int64_t cluster_index = cluster_offset >> s->cluster_bits;
846         int64_t table_index = cluster_index >> s->refcount_block_bits;
847 
848         /* Load the refcount block and allocate it if needed */
849         if (table_index != old_table_index) {
850             if (refcount_block) {
851                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
852             }
853             ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
854             /* If the caller needs to restart the search for free clusters,
855              * try the same ones first to see if they're still free. */
856             if (ret == -EAGAIN) {
857                 if (s->free_cluster_index > (start >> s->cluster_bits)) {
858                     s->free_cluster_index = (start >> s->cluster_bits);
859                 }
860             }
861             if (ret < 0) {
862                 goto fail;
863             }
864         }
865         old_table_index = table_index;
866 
867         qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refcount_block);
868 
869         /* we can update the count and save it */
870         block_index = cluster_index & (s->refcount_block_size - 1);
871 
872         refcount = s->get_refcount(refcount_block, block_index);
873         if (decrease ? (refcount - addend > refcount)
874                      : (refcount + addend < refcount ||
875                         refcount + addend > s->refcount_max))
876         {
877             ret = -EINVAL;
878             goto fail;
879         }
880         if (decrease) {
881             refcount -= addend;
882         } else {
883             refcount += addend;
884         }
885         if (refcount == 0 && cluster_index < s->free_cluster_index) {
886             s->free_cluster_index = cluster_index;
887         }
888         s->set_refcount(refcount_block, block_index, refcount);
889 
890         if (refcount == 0) {
891             void *table;
892 
893             table = qcow2_cache_is_table_offset(s->refcount_block_cache,
894                                                 offset);
895             if (table != NULL) {
896                 qcow2_cache_put(s->refcount_block_cache, &refcount_block);
897                 old_table_index = -1;
898                 qcow2_cache_discard(s->refcount_block_cache, table);
899             }
900 
901             table = qcow2_cache_is_table_offset(s->l2_table_cache, offset);
902             if (table != NULL) {
903                 qcow2_cache_discard(s->l2_table_cache, table);
904             }
905 
906             if (s->discard_passthrough[type]) {
907                 update_refcount_discard(bs, cluster_offset, s->cluster_size);
908             }
909         }
910     }
911 
912     ret = 0;
913 fail:
914     if (!s->cache_discards) {
915         qcow2_process_discards(bs, ret);
916     }
917 
918     /* Write last changed block to disk */
919     if (refcount_block) {
920         qcow2_cache_put(s->refcount_block_cache, &refcount_block);
921     }
922 
923     /*
924      * Try do undo any updates if an error is returned (This may succeed in
925      * some cases like ENOSPC for allocating a new refcount block)
926      */
927     if (ret < 0) {
928         int dummy;
929         dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
930                                 !decrease, QCOW2_DISCARD_NEVER);
931         (void)dummy;
932     }
933 
934     return ret;
935 }
936 
937 /*
938  * Increases or decreases the refcount of a given cluster.
939  *
940  * @addend is the absolute value of the addend; if @decrease is set, @addend
941  * will be subtracted from the current refcount, otherwise it will be added.
942  *
943  * On success 0 is returned; on failure -errno is returned.
944  */
945 int qcow2_update_cluster_refcount(BlockDriverState *bs,
946                                   int64_t cluster_index,
947                                   uint64_t addend, bool decrease,
948                                   enum qcow2_discard_type type)
949 {
950     BDRVQcow2State *s = bs->opaque;
951     int ret;
952 
953     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
954                           decrease, type);
955     if (ret < 0) {
956         return ret;
957     }
958 
959     return 0;
960 }
961 
962 
963 
964 /*********************************************************/
965 /* cluster allocation functions */
966 
967 
968 
969 /* return < 0 if error */
970 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size,
971                                     uint64_t max)
972 {
973     BDRVQcow2State *s = bs->opaque;
974     uint64_t i, nb_clusters, refcount;
975     int ret;
976 
977     /* We can't allocate clusters if they may still be queued for discard. */
978     if (s->cache_discards) {
979         qcow2_process_discards(bs, 0);
980     }
981 
982     nb_clusters = size_to_clusters(s, size);
983 retry:
984     for(i = 0; i < nb_clusters; i++) {
985         uint64_t next_cluster_index = s->free_cluster_index++;
986         ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);
987 
988         if (ret < 0) {
989             return ret;
990         } else if (refcount != 0) {
991             goto retry;
992         }
993     }
994 
995     /* Make sure that all offsets in the "allocated" range are representable
996      * in the requested max */
997     if (s->free_cluster_index > 0 &&
998         s->free_cluster_index - 1 > (max >> s->cluster_bits))
999     {
1000         return -EFBIG;
1001     }
1002 
1003 #ifdef DEBUG_ALLOC2
1004     fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
1005             size,
1006             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
1007 #endif
1008     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
1009 }
1010 
1011 int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
1012 {
1013     int64_t offset;
1014     int ret;
1015 
1016     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
1017     do {
1018         offset = alloc_clusters_noref(bs, size, QCOW_MAX_CLUSTER_OFFSET);
1019         if (offset < 0) {
1020             return offset;
1021         }
1022 
1023         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1024     } while (ret == -EAGAIN);
1025 
1026     if (ret < 0) {
1027         return ret;
1028     }
1029 
1030     return offset;
1031 }
1032 
1033 int64_t coroutine_fn qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
1034                                              int64_t nb_clusters)
1035 {
1036     BDRVQcow2State *s = bs->opaque;
1037     uint64_t cluster_index, refcount;
1038     uint64_t i;
1039     int ret;
1040 
1041     assert(nb_clusters >= 0);
1042     if (nb_clusters == 0) {
1043         return 0;
1044     }
1045 
1046     do {
1047         /* Check how many clusters there are free */
1048         cluster_index = offset >> s->cluster_bits;
1049         for(i = 0; i < nb_clusters; i++) {
1050             ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
1051             if (ret < 0) {
1052                 return ret;
1053             } else if (refcount != 0) {
1054                 break;
1055             }
1056         }
1057 
1058         /* And then allocate them */
1059         ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
1060                               QCOW2_DISCARD_NEVER);
1061     } while (ret == -EAGAIN);
1062 
1063     if (ret < 0) {
1064         return ret;
1065     }
1066 
1067     return i;
1068 }
1069 
1070 /* only used to allocate compressed sectors. We try to allocate
1071    contiguous sectors. size must be <= cluster_size */
1072 int64_t coroutine_fn qcow2_alloc_bytes(BlockDriverState *bs, int size)
1073 {
1074     BDRVQcow2State *s = bs->opaque;
1075     int64_t offset;
1076     size_t free_in_cluster;
1077     int ret;
1078 
1079     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
1080     assert(size > 0 && size <= s->cluster_size);
1081     assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
1082 
1083     offset = s->free_byte_offset;
1084 
1085     if (offset) {
1086         uint64_t refcount;
1087         ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
1088         if (ret < 0) {
1089             return ret;
1090         }
1091 
1092         if (refcount == s->refcount_max) {
1093             offset = 0;
1094         }
1095     }
1096 
1097     free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
1098     do {
1099         if (!offset || free_in_cluster < size) {
1100             int64_t new_cluster;
1101 
1102             new_cluster = alloc_clusters_noref(bs, s->cluster_size,
1103                                                MIN(s->cluster_offset_mask,
1104                                                    QCOW_MAX_CLUSTER_OFFSET));
1105             if (new_cluster < 0) {
1106                 return new_cluster;
1107             }
1108 
1109             if (new_cluster == 0) {
1110                 qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
1111                                         "allocation of compressed cluster "
1112                                         "at offset 0");
1113                 return -EIO;
1114             }
1115 
1116             if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
1117                 offset = new_cluster;
1118                 free_in_cluster = s->cluster_size;
1119             } else {
1120                 free_in_cluster += s->cluster_size;
1121             }
1122         }
1123 
1124         assert(offset);
1125         ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
1126         if (ret < 0) {
1127             offset = 0;
1128         }
1129     } while (ret == -EAGAIN);
1130     if (ret < 0) {
1131         return ret;
1132     }
1133 
1134     /* The cluster refcount was incremented; refcount blocks must be flushed
1135      * before the caller's L2 table updates. */
1136     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
1137 
1138     s->free_byte_offset = offset + size;
1139     if (!offset_into_cluster(s, s->free_byte_offset)) {
1140         s->free_byte_offset = 0;
1141     }
1142 
1143     return offset;
1144 }
1145 
1146 void qcow2_free_clusters(BlockDriverState *bs,
1147                           int64_t offset, int64_t size,
1148                           enum qcow2_discard_type type)
1149 {
1150     int ret;
1151 
1152     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
1153     ret = update_refcount(bs, offset, size, 1, true, type);
1154     if (ret < 0) {
1155         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
1156         /* TODO Remember the clusters to free them later and avoid leaking */
1157     }
1158 }
1159 
1160 /*
1161  * Free a cluster using its L2 entry (handles clusters of all types, e.g.
1162  * normal cluster, compressed cluster, etc.)
1163  */
1164 void qcow2_free_any_cluster(BlockDriverState *bs, uint64_t l2_entry,
1165                             enum qcow2_discard_type type)
1166 {
1167     BDRVQcow2State *s = bs->opaque;
1168     QCow2ClusterType ctype = qcow2_get_cluster_type(bs, l2_entry);
1169 
1170     if (has_data_file(bs)) {
1171         if (s->discard_passthrough[type] &&
1172             (ctype == QCOW2_CLUSTER_NORMAL ||
1173              ctype == QCOW2_CLUSTER_ZERO_ALLOC))
1174         {
1175             bdrv_pdiscard(s->data_file, l2_entry & L2E_OFFSET_MASK,
1176                           s->cluster_size);
1177         }
1178         return;
1179     }
1180 
1181     switch (ctype) {
1182     case QCOW2_CLUSTER_COMPRESSED:
1183         {
1184             uint64_t coffset;
1185             int csize;
1186 
1187             qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
1188             qcow2_free_clusters(bs, coffset, csize, type);
1189         }
1190         break;
1191     case QCOW2_CLUSTER_NORMAL:
1192     case QCOW2_CLUSTER_ZERO_ALLOC:
1193         if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
1194             qcow2_signal_corruption(bs, false, -1, -1,
1195                                     "Cannot free unaligned cluster %#llx",
1196                                     l2_entry & L2E_OFFSET_MASK);
1197         } else {
1198             qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
1199                                 s->cluster_size, type);
1200         }
1201         break;
1202     case QCOW2_CLUSTER_ZERO_PLAIN:
1203     case QCOW2_CLUSTER_UNALLOCATED:
1204         break;
1205     default:
1206         abort();
1207     }
1208 }
1209 
1210 int qcow2_write_caches(BlockDriverState *bs)
1211 {
1212     BDRVQcow2State *s = bs->opaque;
1213     int ret;
1214 
1215     ret = qcow2_cache_write(bs, s->l2_table_cache);
1216     if (ret < 0) {
1217         return ret;
1218     }
1219 
1220     if (qcow2_need_accurate_refcounts(s)) {
1221         ret = qcow2_cache_write(bs, s->refcount_block_cache);
1222         if (ret < 0) {
1223             return ret;
1224         }
1225     }
1226 
1227     return 0;
1228 }
1229 
1230 int qcow2_flush_caches(BlockDriverState *bs)
1231 {
1232     int ret = qcow2_write_caches(bs);
1233     if (ret < 0) {
1234         return ret;
1235     }
1236 
1237     return bdrv_flush(bs->file->bs);
1238 }
1239 
1240 /*********************************************************/
1241 /* snapshots and image creation */
1242 
1243 
1244 
1245 /* update the refcounts of snapshots and the copied flag */
1246 int qcow2_update_snapshot_refcount(BlockDriverState *bs,
1247     int64_t l1_table_offset, int l1_size, int addend)
1248 {
1249     BDRVQcow2State *s = bs->opaque;
1250     uint64_t *l1_table, *l2_slice, l2_offset, entry, l1_size2, refcount;
1251     bool l1_allocated = false;
1252     int64_t old_entry, old_l2_offset;
1253     unsigned slice, slice_size2, n_slices;
1254     int i, j, l1_modified = 0;
1255     int ret;
1256 
1257     assert(addend >= -1 && addend <= 1);
1258 
1259     l2_slice = NULL;
1260     l1_table = NULL;
1261     l1_size2 = l1_size * L1E_SIZE;
1262     slice_size2 = s->l2_slice_size * l2_entry_size(s);
1263     n_slices = s->cluster_size / slice_size2;
1264 
1265     s->cache_discards = true;
1266 
1267     /* WARNING: qcow2_snapshot_goto relies on this function not using the
1268      * l1_table_offset when it is the current s->l1_table_offset! Be careful
1269      * when changing this! */
1270     if (l1_table_offset != s->l1_table_offset) {
1271         l1_table = g_try_malloc0(l1_size2);
1272         if (l1_size2 && l1_table == NULL) {
1273             ret = -ENOMEM;
1274             goto fail;
1275         }
1276         l1_allocated = true;
1277 
1278         ret = bdrv_pread(bs->file, l1_table_offset, l1_size2, l1_table, 0);
1279         if (ret < 0) {
1280             goto fail;
1281         }
1282 
1283         for (i = 0; i < l1_size; i++) {
1284             be64_to_cpus(&l1_table[i]);
1285         }
1286     } else {
1287         assert(l1_size == s->l1_size);
1288         l1_table = s->l1_table;
1289         l1_allocated = false;
1290     }
1291 
1292     for (i = 0; i < l1_size; i++) {
1293         l2_offset = l1_table[i];
1294         if (l2_offset) {
1295             old_l2_offset = l2_offset;
1296             l2_offset &= L1E_OFFSET_MASK;
1297 
1298             if (offset_into_cluster(s, l2_offset)) {
1299                 qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
1300                                         PRIx64 " unaligned (L1 index: %#x)",
1301                                         l2_offset, i);
1302                 ret = -EIO;
1303                 goto fail;
1304             }
1305 
1306             for (slice = 0; slice < n_slices; slice++) {
1307                 ret = qcow2_cache_get(bs, s->l2_table_cache,
1308                                       l2_offset + slice * slice_size2,
1309                                       (void **) &l2_slice);
1310                 if (ret < 0) {
1311                     goto fail;
1312                 }
1313 
1314                 for (j = 0; j < s->l2_slice_size; j++) {
1315                     uint64_t cluster_index;
1316                     uint64_t offset;
1317 
1318                     entry = get_l2_entry(s, l2_slice, j);
1319                     old_entry = entry;
1320                     entry &= ~QCOW_OFLAG_COPIED;
1321                     offset = entry & L2E_OFFSET_MASK;
1322 
1323                     switch (qcow2_get_cluster_type(bs, entry)) {
1324                     case QCOW2_CLUSTER_COMPRESSED:
1325                         if (addend != 0) {
1326                             uint64_t coffset;
1327                             int csize;
1328 
1329                             qcow2_parse_compressed_l2_entry(bs, entry,
1330                                                             &coffset, &csize);
1331                             ret = update_refcount(
1332                                 bs, coffset, csize,
1333                                 abs(addend), addend < 0,
1334                                 QCOW2_DISCARD_SNAPSHOT);
1335                             if (ret < 0) {
1336                                 goto fail;
1337                             }
1338                         }
1339                         /* compressed clusters are never modified */
1340                         refcount = 2;
1341                         break;
1342 
1343                     case QCOW2_CLUSTER_NORMAL:
1344                     case QCOW2_CLUSTER_ZERO_ALLOC:
1345                         if (offset_into_cluster(s, offset)) {
1346                             /* Here l2_index means table (not slice) index */
1347                             int l2_index = slice * s->l2_slice_size + j;
1348                             qcow2_signal_corruption(
1349                                 bs, true, -1, -1, "Cluster "
1350                                 "allocation offset %#" PRIx64
1351                                 " unaligned (L2 offset: %#"
1352                                 PRIx64 ", L2 index: %#x)",
1353                                 offset, l2_offset, l2_index);
1354                             ret = -EIO;
1355                             goto fail;
1356                         }
1357 
1358                         cluster_index = offset >> s->cluster_bits;
1359                         assert(cluster_index);
1360                         if (addend != 0) {
1361                             ret = qcow2_update_cluster_refcount(
1362                                 bs, cluster_index, abs(addend), addend < 0,
1363                                 QCOW2_DISCARD_SNAPSHOT);
1364                             if (ret < 0) {
1365                                 goto fail;
1366                             }
1367                         }
1368 
1369                         ret = qcow2_get_refcount(bs, cluster_index, &refcount);
1370                         if (ret < 0) {
1371                             goto fail;
1372                         }
1373                         break;
1374 
1375                     case QCOW2_CLUSTER_ZERO_PLAIN:
1376                     case QCOW2_CLUSTER_UNALLOCATED:
1377                         refcount = 0;
1378                         break;
1379 
1380                     default:
1381                         abort();
1382                     }
1383 
1384                     if (refcount == 1) {
1385                         entry |= QCOW_OFLAG_COPIED;
1386                     }
1387                     if (entry != old_entry) {
1388                         if (addend > 0) {
1389                             qcow2_cache_set_dependency(bs, s->l2_table_cache,
1390                                                        s->refcount_block_cache);
1391                         }
1392                         set_l2_entry(s, l2_slice, j, entry);
1393                         qcow2_cache_entry_mark_dirty(s->l2_table_cache,
1394                                                      l2_slice);
1395                     }
1396                 }
1397 
1398                 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1399             }
1400 
1401             if (addend != 0) {
1402                 ret = qcow2_update_cluster_refcount(bs, l2_offset >>
1403                                                         s->cluster_bits,
1404                                                     abs(addend), addend < 0,
1405                                                     QCOW2_DISCARD_SNAPSHOT);
1406                 if (ret < 0) {
1407                     goto fail;
1408                 }
1409             }
1410             ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1411                                      &refcount);
1412             if (ret < 0) {
1413                 goto fail;
1414             } else if (refcount == 1) {
1415                 l2_offset |= QCOW_OFLAG_COPIED;
1416             }
1417             if (l2_offset != old_l2_offset) {
1418                 l1_table[i] = l2_offset;
1419                 l1_modified = 1;
1420             }
1421         }
1422     }
1423 
1424     ret = bdrv_flush(bs);
1425 fail:
1426     if (l2_slice) {
1427         qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1428     }
1429 
1430     s->cache_discards = false;
1431     qcow2_process_discards(bs, ret);
1432 
1433     /* Update L1 only if it isn't deleted anyway (addend = -1) */
1434     if (ret == 0 && addend >= 0 && l1_modified) {
1435         for (i = 0; i < l1_size; i++) {
1436             cpu_to_be64s(&l1_table[i]);
1437         }
1438 
1439         ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_size2, l1_table,
1440                                0);
1441 
1442         for (i = 0; i < l1_size; i++) {
1443             be64_to_cpus(&l1_table[i]);
1444         }
1445     }
1446     if (l1_allocated)
1447         g_free(l1_table);
1448     return ret;
1449 }
1450 
1451 
1452 
1453 
1454 /*********************************************************/
1455 /* refcount checking functions */
1456 
1457 
1458 static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
1459 {
1460     /* This assertion holds because there is no way we can address more than
1461      * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
1462      * offsets have to be representable in bytes); due to every cluster
1463      * corresponding to one refcount entry, we are well below that limit */
1464     assert(entries < (UINT64_C(1) << (64 - 9)));
1465 
1466     /* Thanks to the assertion this will not overflow, because
1467      * s->refcount_order < 7.
1468      * (note: x << s->refcount_order == x * s->refcount_bits) */
1469     return DIV_ROUND_UP(entries << s->refcount_order, 8);
1470 }
1471 
1472 /**
1473  * Reallocates *array so that it can hold new_size entries. *size must contain
1474  * the current number of entries in *array. If the reallocation fails, *array
1475  * and *size will not be modified and -errno will be returned. If the
1476  * reallocation is successful, *array will be set to the new buffer, *size
1477  * will be set to new_size and 0 will be returned. The size of the reallocated
1478  * refcount array buffer will be aligned to a cluster boundary, and the newly
1479  * allocated area will be zeroed.
1480  */
1481 static int realloc_refcount_array(BDRVQcow2State *s, void **array,
1482                                   int64_t *size, int64_t new_size)
1483 {
1484     int64_t old_byte_size, new_byte_size;
1485     void *new_ptr;
1486 
1487     /* Round to clusters so the array can be directly written to disk */
1488     old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
1489                     * s->cluster_size;
1490     new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
1491                     * s->cluster_size;
1492 
1493     if (new_byte_size == old_byte_size) {
1494         *size = new_size;
1495         return 0;
1496     }
1497 
1498     assert(new_byte_size > 0);
1499 
1500     if (new_byte_size > SIZE_MAX) {
1501         return -ENOMEM;
1502     }
1503 
1504     new_ptr = g_try_realloc(*array, new_byte_size);
1505     if (!new_ptr) {
1506         return -ENOMEM;
1507     }
1508 
1509     if (new_byte_size > old_byte_size) {
1510         memset((char *)new_ptr + old_byte_size, 0,
1511                new_byte_size - old_byte_size);
1512     }
1513 
1514     *array = new_ptr;
1515     *size  = new_size;
1516 
1517     return 0;
1518 }
1519 
1520 /*
1521  * Increases the refcount for a range of clusters in a given refcount table.
1522  * This is used to construct a temporary refcount table out of L1 and L2 tables
1523  * which can be compared to the refcount table saved in the image.
1524  *
1525  * Modifies the number of errors in res.
1526  */
1527 int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
1528                              void **refcount_table,
1529                              int64_t *refcount_table_size,
1530                              int64_t offset, int64_t size)
1531 {
1532     BDRVQcow2State *s = bs->opaque;
1533     uint64_t start, last, cluster_offset, k, refcount;
1534     int64_t file_len;
1535     int ret;
1536 
1537     if (size <= 0) {
1538         return 0;
1539     }
1540 
1541     file_len = bdrv_getlength(bs->file->bs);
1542     if (file_len < 0) {
1543         return file_len;
1544     }
1545 
1546     /*
1547      * Last cluster of qcow2 image may be semi-allocated, so it may be OK to
1548      * reference some space after file end but it should be less than one
1549      * cluster.
1550      */
1551     if (offset + size - file_len >= s->cluster_size) {
1552         fprintf(stderr, "ERROR: counting reference for region exceeding the "
1553                 "end of the file by one cluster or more: offset 0x%" PRIx64
1554                 " size 0x%" PRIx64 "\n", offset, size);
1555         res->corruptions++;
1556         return 0;
1557     }
1558 
1559     start = start_of_cluster(s, offset);
1560     last = start_of_cluster(s, offset + size - 1);
1561     for(cluster_offset = start; cluster_offset <= last;
1562         cluster_offset += s->cluster_size) {
1563         k = cluster_offset >> s->cluster_bits;
1564         if (k >= *refcount_table_size) {
1565             ret = realloc_refcount_array(s, refcount_table,
1566                                          refcount_table_size, k + 1);
1567             if (ret < 0) {
1568                 res->check_errors++;
1569                 return ret;
1570             }
1571         }
1572 
1573         refcount = s->get_refcount(*refcount_table, k);
1574         if (refcount == s->refcount_max) {
1575             fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
1576                     "\n", cluster_offset);
1577             fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
1578                     "width or qemu-img convert to create a clean copy if the "
1579                     "image cannot be opened for writing\n");
1580             res->corruptions++;
1581             continue;
1582         }
1583         s->set_refcount(*refcount_table, k, refcount + 1);
1584     }
1585 
1586     return 0;
1587 }
1588 
1589 /* Flags for check_refcounts_l1() and check_refcounts_l2() */
1590 enum {
1591     CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
1592 };
1593 
1594 /*
1595  * Fix L2 entry by making it QCOW2_CLUSTER_ZERO_PLAIN (or making all its present
1596  * subclusters QCOW2_SUBCLUSTER_ZERO_PLAIN).
1597  *
1598  * This function decrements res->corruptions on success, so the caller is
1599  * responsible to increment res->corruptions prior to the call.
1600  *
1601  * On failure in-memory @l2_table may be modified.
1602  */
1603 static int fix_l2_entry_by_zero(BlockDriverState *bs, BdrvCheckResult *res,
1604                                 uint64_t l2_offset,
1605                                 uint64_t *l2_table, int l2_index, bool active,
1606                                 bool *metadata_overlap)
1607 {
1608     BDRVQcow2State *s = bs->opaque;
1609     int ret;
1610     int idx = l2_index * (l2_entry_size(s) / sizeof(uint64_t));
1611     uint64_t l2e_offset = l2_offset + (uint64_t)l2_index * l2_entry_size(s);
1612     int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;
1613 
1614     if (has_subclusters(s)) {
1615         uint64_t l2_bitmap = get_l2_bitmap(s, l2_table, l2_index);
1616 
1617         /* Allocated subclusters become zero */
1618         l2_bitmap |= l2_bitmap << 32;
1619         l2_bitmap &= QCOW_L2_BITMAP_ALL_ZEROES;
1620 
1621         set_l2_bitmap(s, l2_table, l2_index, l2_bitmap);
1622         set_l2_entry(s, l2_table, l2_index, 0);
1623     } else {
1624         set_l2_entry(s, l2_table, l2_index, QCOW_OFLAG_ZERO);
1625     }
1626 
1627     ret = qcow2_pre_write_overlap_check(bs, ign, l2e_offset, l2_entry_size(s),
1628                                         false);
1629     if (metadata_overlap) {
1630         *metadata_overlap = ret < 0;
1631     }
1632     if (ret < 0) {
1633         fprintf(stderr, "ERROR: Overlap check failed\n");
1634         goto fail;
1635     }
1636 
1637     ret = bdrv_pwrite_sync(bs->file, l2e_offset, l2_entry_size(s),
1638                            &l2_table[idx], 0);
1639     if (ret < 0) {
1640         fprintf(stderr, "ERROR: Failed to overwrite L2 "
1641                 "table entry: %s\n", strerror(-ret));
1642         goto fail;
1643     }
1644 
1645     res->corruptions--;
1646     res->corruptions_fixed++;
1647     return 0;
1648 
1649 fail:
1650     res->check_errors++;
1651     return ret;
1652 }
1653 
1654 /*
1655  * Increases the refcount in the given refcount table for the all clusters
1656  * referenced in the L2 table. While doing so, performs some checks on L2
1657  * entries.
1658  *
1659  * Returns the number of errors found by the checks or -errno if an internal
1660  * error occurred.
1661  */
1662 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
1663                               void **refcount_table,
1664                               int64_t *refcount_table_size, int64_t l2_offset,
1665                               int flags, BdrvCheckMode fix, bool active)
1666 {
1667     BDRVQcow2State *s = bs->opaque;
1668     uint64_t l2_entry, l2_bitmap;
1669     uint64_t next_contiguous_offset = 0;
1670     int i, ret;
1671     size_t l2_size_bytes = s->l2_size * l2_entry_size(s);
1672     g_autofree uint64_t *l2_table = g_malloc(l2_size_bytes);
1673     bool metadata_overlap;
1674 
1675     /* Read L2 table from disk */
1676     ret = bdrv_pread(bs->file, l2_offset, l2_size_bytes, l2_table, 0);
1677     if (ret < 0) {
1678         fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
1679         res->check_errors++;
1680         return ret;
1681     }
1682 
1683     /* Do the actual checks */
1684     for (i = 0; i < s->l2_size; i++) {
1685         uint64_t coffset;
1686         int csize;
1687         QCow2ClusterType type;
1688 
1689         l2_entry = get_l2_entry(s, l2_table, i);
1690         l2_bitmap = get_l2_bitmap(s, l2_table, i);
1691         type = qcow2_get_cluster_type(bs, l2_entry);
1692 
1693         if (type != QCOW2_CLUSTER_COMPRESSED) {
1694             /* Check reserved bits of Standard Cluster Descriptor */
1695             if (l2_entry & L2E_STD_RESERVED_MASK) {
1696                 fprintf(stderr, "ERROR found l2 entry with reserved bits set: "
1697                         "%" PRIx64 "\n", l2_entry);
1698                 res->corruptions++;
1699             }
1700         }
1701 
1702         switch (type) {
1703         case QCOW2_CLUSTER_COMPRESSED:
1704             /* Compressed clusters don't have QCOW_OFLAG_COPIED */
1705             if (l2_entry & QCOW_OFLAG_COPIED) {
1706                 fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
1707                     "copied flag must never be set for compressed "
1708                     "clusters\n", l2_entry & s->cluster_offset_mask);
1709                 l2_entry &= ~QCOW_OFLAG_COPIED;
1710                 res->corruptions++;
1711             }
1712 
1713             if (has_data_file(bs)) {
1714                 fprintf(stderr, "ERROR compressed cluster %d with data file, "
1715                         "entry=0x%" PRIx64 "\n", i, l2_entry);
1716                 res->corruptions++;
1717                 break;
1718             }
1719 
1720             if (l2_bitmap) {
1721                 fprintf(stderr, "ERROR compressed cluster %d with non-zero "
1722                         "subcluster allocation bitmap, entry=0x%" PRIx64 "\n",
1723                         i, l2_entry);
1724                 res->corruptions++;
1725                 break;
1726             }
1727 
1728             /* Mark cluster as used */
1729             qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
1730             ret = qcow2_inc_refcounts_imrt(
1731                 bs, res, refcount_table, refcount_table_size, coffset, csize);
1732             if (ret < 0) {
1733                 return ret;
1734             }
1735 
1736             if (flags & CHECK_FRAG_INFO) {
1737                 res->bfi.allocated_clusters++;
1738                 res->bfi.compressed_clusters++;
1739 
1740                 /*
1741                  * Compressed clusters are fragmented by nature.  Since they
1742                  * take up sub-sector space but we only have sector granularity
1743                  * I/O we need to re-read the same sectors even for adjacent
1744                  * compressed clusters.
1745                  */
1746                 res->bfi.fragmented_clusters++;
1747             }
1748             break;
1749 
1750         case QCOW2_CLUSTER_ZERO_ALLOC:
1751         case QCOW2_CLUSTER_NORMAL:
1752         {
1753             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
1754 
1755             if ((l2_bitmap >> 32) & l2_bitmap) {
1756                 res->corruptions++;
1757                 fprintf(stderr, "ERROR offset=%" PRIx64 ": Allocated "
1758                         "cluster has corrupted subcluster allocation bitmap\n",
1759                         offset);
1760             }
1761 
1762             /* Correct offsets are cluster aligned */
1763             if (offset_into_cluster(s, offset)) {
1764                 bool contains_data;
1765                 res->corruptions++;
1766 
1767                 if (has_subclusters(s)) {
1768                     contains_data = (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC);
1769                 } else {
1770                     contains_data = !(l2_entry & QCOW_OFLAG_ZERO);
1771                 }
1772 
1773                 if (!contains_data) {
1774                     fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated "
1775                             "cluster is not properly aligned; L2 entry "
1776                             "corrupted.\n",
1777                             fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
1778                             offset);
1779                     if (fix & BDRV_FIX_ERRORS) {
1780                         ret = fix_l2_entry_by_zero(bs, res, l2_offset,
1781                                                    l2_table, i, active,
1782                                                    &metadata_overlap);
1783                         if (metadata_overlap) {
1784                             /*
1785                              * Something is seriously wrong, so abort checking
1786                              * this L2 table.
1787                              */
1788                             return ret;
1789                         }
1790 
1791                         if (ret == 0) {
1792                             /*
1793                              * Skip marking the cluster as used
1794                              * (it is unused now).
1795                              */
1796                             continue;
1797                         }
1798 
1799                         /*
1800                          * Failed to fix.
1801                          * Do not abort, continue checking the rest of this
1802                          * L2 table's entries.
1803                          */
1804                     }
1805                 } else {
1806                     fprintf(stderr, "ERROR offset=%" PRIx64 ": Data cluster is "
1807                         "not properly aligned; L2 entry corrupted.\n", offset);
1808                 }
1809             }
1810 
1811             if (flags & CHECK_FRAG_INFO) {
1812                 res->bfi.allocated_clusters++;
1813                 if (next_contiguous_offset &&
1814                     offset != next_contiguous_offset) {
1815                     res->bfi.fragmented_clusters++;
1816                 }
1817                 next_contiguous_offset = offset + s->cluster_size;
1818             }
1819 
1820             /* Mark cluster as used */
1821             if (!has_data_file(bs)) {
1822                 ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table,
1823                                                refcount_table_size,
1824                                                offset, s->cluster_size);
1825                 if (ret < 0) {
1826                     return ret;
1827                 }
1828             }
1829             break;
1830         }
1831 
1832         case QCOW2_CLUSTER_ZERO_PLAIN:
1833             /* Impossible when image has subclusters */
1834             assert(!l2_bitmap);
1835             break;
1836 
1837         case QCOW2_CLUSTER_UNALLOCATED:
1838             if (l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC) {
1839                 res->corruptions++;
1840                 fprintf(stderr, "ERROR: Unallocated "
1841                         "cluster has non-zero subcluster allocation map\n");
1842             }
1843             break;
1844 
1845         default:
1846             abort();
1847         }
1848     }
1849 
1850     return 0;
1851 }
1852 
1853 /*
1854  * Increases the refcount for the L1 table, its L2 tables and all referenced
1855  * clusters in the given refcount table. While doing so, performs some checks
1856  * on L1 and L2 entries.
1857  *
1858  * Returns the number of errors found by the checks or -errno if an internal
1859  * error occurred.
1860  */
1861 static int check_refcounts_l1(BlockDriverState *bs,
1862                               BdrvCheckResult *res,
1863                               void **refcount_table,
1864                               int64_t *refcount_table_size,
1865                               int64_t l1_table_offset, int l1_size,
1866                               int flags, BdrvCheckMode fix, bool active)
1867 {
1868     BDRVQcow2State *s = bs->opaque;
1869     size_t l1_size_bytes = l1_size * L1E_SIZE;
1870     g_autofree uint64_t *l1_table = NULL;
1871     uint64_t l2_offset;
1872     int i, ret;
1873 
1874     if (!l1_size) {
1875         return 0;
1876     }
1877 
1878     /* Mark L1 table as used */
1879     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, refcount_table_size,
1880                                    l1_table_offset, l1_size_bytes);
1881     if (ret < 0) {
1882         return ret;
1883     }
1884 
1885     l1_table = g_try_malloc(l1_size_bytes);
1886     if (l1_table == NULL) {
1887         res->check_errors++;
1888         return -ENOMEM;
1889     }
1890 
1891     /* Read L1 table entries from disk */
1892     ret = bdrv_pread(bs->file, l1_table_offset, l1_size_bytes, l1_table, 0);
1893     if (ret < 0) {
1894         fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1895         res->check_errors++;
1896         return ret;
1897     }
1898 
1899     for (i = 0; i < l1_size; i++) {
1900         be64_to_cpus(&l1_table[i]);
1901     }
1902 
1903     /* Do the actual checks */
1904     for (i = 0; i < l1_size; i++) {
1905         if (!l1_table[i]) {
1906             continue;
1907         }
1908 
1909         if (l1_table[i] & L1E_RESERVED_MASK) {
1910             fprintf(stderr, "ERROR found L1 entry with reserved bits set: "
1911                     "%" PRIx64 "\n", l1_table[i]);
1912             res->corruptions++;
1913         }
1914 
1915         l2_offset = l1_table[i] & L1E_OFFSET_MASK;
1916 
1917         /* Mark L2 table as used */
1918         ret = qcow2_inc_refcounts_imrt(bs, res,
1919                                        refcount_table, refcount_table_size,
1920                                        l2_offset, s->cluster_size);
1921         if (ret < 0) {
1922             return ret;
1923         }
1924 
1925         /* L2 tables are cluster aligned */
1926         if (offset_into_cluster(s, l2_offset)) {
1927             fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1928                 "cluster aligned; L1 entry corrupted\n", l2_offset);
1929             res->corruptions++;
1930         }
1931 
1932         /* Process and check L2 entries */
1933         ret = check_refcounts_l2(bs, res, refcount_table,
1934                                  refcount_table_size, l2_offset, flags,
1935                                  fix, active);
1936         if (ret < 0) {
1937             return ret;
1938         }
1939     }
1940 
1941     return 0;
1942 }
1943 
1944 /*
1945  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1946  *
1947  * This function does not print an error message nor does it increment
1948  * check_errors if qcow2_get_refcount fails (this is because such an error will
1949  * have been already detected and sufficiently signaled by the calling function
1950  * (qcow2_check_refcounts) by the time this function is called).
1951  */
1952 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1953                               BdrvCheckMode fix)
1954 {
1955     BDRVQcow2State *s = bs->opaque;
1956     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1957     int ret;
1958     uint64_t refcount;
1959     int i, j;
1960     bool repair;
1961 
1962     if (fix & BDRV_FIX_ERRORS) {
1963         /* Always repair */
1964         repair = true;
1965     } else if (fix & BDRV_FIX_LEAKS) {
1966         /* Repair only if that seems safe: This function is always
1967          * called after the refcounts have been fixed, so the refcount
1968          * is accurate if that repair was successful */
1969         repair = !res->check_errors && !res->corruptions && !res->leaks;
1970     } else {
1971         repair = false;
1972     }
1973 
1974     for (i = 0; i < s->l1_size; i++) {
1975         uint64_t l1_entry = s->l1_table[i];
1976         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1977         int l2_dirty = 0;
1978 
1979         if (!l2_offset) {
1980             continue;
1981         }
1982 
1983         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1984                                  &refcount);
1985         if (ret < 0) {
1986             /* don't print message nor increment check_errors */
1987             continue;
1988         }
1989         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1990             res->corruptions++;
1991             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1992                     "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1993                     repair ? "Repairing" : "ERROR", i, l1_entry, refcount);
1994             if (repair) {
1995                 s->l1_table[i] = refcount == 1
1996                                ? l1_entry |  QCOW_OFLAG_COPIED
1997                                : l1_entry & ~QCOW_OFLAG_COPIED;
1998                 ret = qcow2_write_l1_entry(bs, i);
1999                 if (ret < 0) {
2000                     res->check_errors++;
2001                     goto fail;
2002                 }
2003                 res->corruptions--;
2004                 res->corruptions_fixed++;
2005             }
2006         }
2007 
2008         ret = bdrv_pread(bs->file, l2_offset, s->l2_size * l2_entry_size(s),
2009                          l2_table, 0);
2010         if (ret < 0) {
2011             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
2012                     strerror(-ret));
2013             res->check_errors++;
2014             goto fail;
2015         }
2016 
2017         for (j = 0; j < s->l2_size; j++) {
2018             uint64_t l2_entry = get_l2_entry(s, l2_table, j);
2019             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
2020             QCow2ClusterType cluster_type = qcow2_get_cluster_type(bs, l2_entry);
2021 
2022             if (cluster_type == QCOW2_CLUSTER_NORMAL ||
2023                 cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
2024                 if (has_data_file(bs)) {
2025                     refcount = 1;
2026                 } else {
2027                     ret = qcow2_get_refcount(bs,
2028                                              data_offset >> s->cluster_bits,
2029                                              &refcount);
2030                     if (ret < 0) {
2031                         /* don't print message nor increment check_errors */
2032                         continue;
2033                     }
2034                 }
2035                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
2036                     res->corruptions++;
2037                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
2038                             "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
2039                             repair ? "Repairing" : "ERROR", l2_entry, refcount);
2040                     if (repair) {
2041                         set_l2_entry(s, l2_table, j,
2042                                      refcount == 1 ?
2043                                      l2_entry |  QCOW_OFLAG_COPIED :
2044                                      l2_entry & ~QCOW_OFLAG_COPIED);
2045                         l2_dirty++;
2046                     }
2047                 }
2048             }
2049         }
2050 
2051         if (l2_dirty > 0) {
2052             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
2053                                                 l2_offset, s->cluster_size,
2054                                                 false);
2055             if (ret < 0) {
2056                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
2057                         "overlap check failed: %s\n", strerror(-ret));
2058                 res->check_errors++;
2059                 goto fail;
2060             }
2061 
2062             ret = bdrv_pwrite(bs->file, l2_offset, s->cluster_size, l2_table,
2063                               0);
2064             if (ret < 0) {
2065                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
2066                         strerror(-ret));
2067                 res->check_errors++;
2068                 goto fail;
2069             }
2070             res->corruptions -= l2_dirty;
2071             res->corruptions_fixed += l2_dirty;
2072         }
2073     }
2074 
2075     ret = 0;
2076 
2077 fail:
2078     qemu_vfree(l2_table);
2079     return ret;
2080 }
2081 
2082 /*
2083  * Checks consistency of refblocks and accounts for each refblock in
2084  * *refcount_table.
2085  */
2086 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
2087                            BdrvCheckMode fix, bool *rebuild,
2088                            void **refcount_table, int64_t *nb_clusters)
2089 {
2090     BDRVQcow2State *s = bs->opaque;
2091     int64_t i, size;
2092     int ret;
2093 
2094     for(i = 0; i < s->refcount_table_size; i++) {
2095         uint64_t offset, cluster;
2096         offset = s->refcount_table[i] & REFT_OFFSET_MASK;
2097         cluster = offset >> s->cluster_bits;
2098 
2099         if (s->refcount_table[i] & REFT_RESERVED_MASK) {
2100             fprintf(stderr, "ERROR refcount table entry %" PRId64 " has "
2101                     "reserved bits set\n", i);
2102             res->corruptions++;
2103             *rebuild = true;
2104             continue;
2105         }
2106 
2107         /* Refcount blocks are cluster aligned */
2108         if (offset_into_cluster(s, offset)) {
2109             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
2110                 "cluster aligned; refcount table entry corrupted\n", i);
2111             res->corruptions++;
2112             *rebuild = true;
2113             continue;
2114         }
2115 
2116         if (cluster >= *nb_clusters) {
2117             res->corruptions++;
2118             fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
2119                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
2120 
2121             if (fix & BDRV_FIX_ERRORS) {
2122                 int64_t new_nb_clusters;
2123                 Error *local_err = NULL;
2124 
2125                 if (offset > INT64_MAX - s->cluster_size) {
2126                     ret = -EINVAL;
2127                     goto resize_fail;
2128                 }
2129 
2130                 ret = bdrv_truncate(bs->file, offset + s->cluster_size, false,
2131                                     PREALLOC_MODE_OFF, 0, &local_err);
2132                 if (ret < 0) {
2133                     error_report_err(local_err);
2134                     goto resize_fail;
2135                 }
2136                 size = bdrv_getlength(bs->file->bs);
2137                 if (size < 0) {
2138                     ret = size;
2139                     goto resize_fail;
2140                 }
2141 
2142                 new_nb_clusters = size_to_clusters(s, size);
2143                 assert(new_nb_clusters >= *nb_clusters);
2144 
2145                 ret = realloc_refcount_array(s, refcount_table,
2146                                              nb_clusters, new_nb_clusters);
2147                 if (ret < 0) {
2148                     res->check_errors++;
2149                     return ret;
2150                 }
2151 
2152                 if (cluster >= *nb_clusters) {
2153                     ret = -EINVAL;
2154                     goto resize_fail;
2155                 }
2156 
2157                 res->corruptions--;
2158                 res->corruptions_fixed++;
2159                 ret = qcow2_inc_refcounts_imrt(bs, res,
2160                                                refcount_table, nb_clusters,
2161                                                offset, s->cluster_size);
2162                 if (ret < 0) {
2163                     return ret;
2164                 }
2165                 /* No need to check whether the refcount is now greater than 1:
2166                  * This area was just allocated and zeroed, so it can only be
2167                  * exactly 1 after qcow2_inc_refcounts_imrt() */
2168                 continue;
2169 
2170 resize_fail:
2171                 *rebuild = true;
2172                 fprintf(stderr, "ERROR could not resize image: %s\n",
2173                         strerror(-ret));
2174             }
2175             continue;
2176         }
2177 
2178         if (offset != 0) {
2179             ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2180                                            offset, s->cluster_size);
2181             if (ret < 0) {
2182                 return ret;
2183             }
2184             if (s->get_refcount(*refcount_table, cluster) != 1) {
2185                 fprintf(stderr, "ERROR refcount block %" PRId64
2186                         " refcount=%" PRIu64 "\n", i,
2187                         s->get_refcount(*refcount_table, cluster));
2188                 res->corruptions++;
2189                 *rebuild = true;
2190             }
2191         }
2192     }
2193 
2194     return 0;
2195 }
2196 
2197 /*
2198  * Calculates an in-memory refcount table.
2199  */
2200 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2201                                BdrvCheckMode fix, bool *rebuild,
2202                                void **refcount_table, int64_t *nb_clusters)
2203 {
2204     BDRVQcow2State *s = bs->opaque;
2205     int64_t i;
2206     QCowSnapshot *sn;
2207     int ret;
2208 
2209     if (!*refcount_table) {
2210         int64_t old_size = 0;
2211         ret = realloc_refcount_array(s, refcount_table,
2212                                      &old_size, *nb_clusters);
2213         if (ret < 0) {
2214             res->check_errors++;
2215             return ret;
2216         }
2217     }
2218 
2219     /* header */
2220     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2221                                    0, s->cluster_size);
2222     if (ret < 0) {
2223         return ret;
2224     }
2225 
2226     /* current L1 table */
2227     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2228                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
2229                              fix, true);
2230     if (ret < 0) {
2231         return ret;
2232     }
2233 
2234     /* snapshots */
2235     if (has_data_file(bs) && s->nb_snapshots) {
2236         fprintf(stderr, "ERROR %d snapshots in image with data file\n",
2237                 s->nb_snapshots);
2238         res->corruptions++;
2239     }
2240 
2241     for (i = 0; i < s->nb_snapshots; i++) {
2242         sn = s->snapshots + i;
2243         if (offset_into_cluster(s, sn->l1_table_offset)) {
2244             fprintf(stderr, "ERROR snapshot %s (%s) l1_offset=%#" PRIx64 ": "
2245                     "L1 table is not cluster aligned; snapshot table entry "
2246                     "corrupted\n", sn->id_str, sn->name, sn->l1_table_offset);
2247             res->corruptions++;
2248             continue;
2249         }
2250         if (sn->l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
2251             fprintf(stderr, "ERROR snapshot %s (%s) l1_size=%#" PRIx32 ": "
2252                     "L1 table is too large; snapshot table entry corrupted\n",
2253                     sn->id_str, sn->name, sn->l1_size);
2254             res->corruptions++;
2255             continue;
2256         }
2257         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
2258                                  sn->l1_table_offset, sn->l1_size, 0, fix,
2259                                  false);
2260         if (ret < 0) {
2261             return ret;
2262         }
2263     }
2264     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2265                                    s->snapshots_offset, s->snapshots_size);
2266     if (ret < 0) {
2267         return ret;
2268     }
2269 
2270     /* refcount data */
2271     ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2272                                    s->refcount_table_offset,
2273                                    s->refcount_table_size *
2274                                    REFTABLE_ENTRY_SIZE);
2275     if (ret < 0) {
2276         return ret;
2277     }
2278 
2279     /* encryption */
2280     if (s->crypto_header.length) {
2281         ret = qcow2_inc_refcounts_imrt(bs, res, refcount_table, nb_clusters,
2282                                        s->crypto_header.offset,
2283                                        s->crypto_header.length);
2284         if (ret < 0) {
2285             return ret;
2286         }
2287     }
2288 
2289     /* bitmaps */
2290     ret = qcow2_check_bitmaps_refcounts(bs, res, refcount_table, nb_clusters);
2291     if (ret < 0) {
2292         return ret;
2293     }
2294 
2295     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
2296 }
2297 
2298 /*
2299  * Compares the actual reference count for each cluster in the image against the
2300  * refcount as reported by the refcount structures on-disk.
2301  */
2302 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2303                               BdrvCheckMode fix, bool *rebuild,
2304                               int64_t *highest_cluster,
2305                               void *refcount_table, int64_t nb_clusters)
2306 {
2307     BDRVQcow2State *s = bs->opaque;
2308     int64_t i;
2309     uint64_t refcount1, refcount2;
2310     int ret;
2311 
2312     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
2313         ret = qcow2_get_refcount(bs, i, &refcount1);
2314         if (ret < 0) {
2315             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
2316                     i, strerror(-ret));
2317             res->check_errors++;
2318             continue;
2319         }
2320 
2321         refcount2 = s->get_refcount(refcount_table, i);
2322 
2323         if (refcount1 > 0 || refcount2 > 0) {
2324             *highest_cluster = i;
2325         }
2326 
2327         if (refcount1 != refcount2) {
2328             /* Check if we're allowed to fix the mismatch */
2329             int *num_fixed = NULL;
2330             if (refcount1 == 0) {
2331                 *rebuild = true;
2332             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
2333                 num_fixed = &res->leaks_fixed;
2334             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
2335                 num_fixed = &res->corruptions_fixed;
2336             }
2337 
2338             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
2339                     " reference=%" PRIu64 "\n",
2340                    num_fixed != NULL     ? "Repairing" :
2341                    refcount1 < refcount2 ? "ERROR" :
2342                                            "Leaked",
2343                    i, refcount1, refcount2);
2344 
2345             if (num_fixed) {
2346                 ret = update_refcount(bs, i << s->cluster_bits, 1,
2347                                       refcount_diff(refcount1, refcount2),
2348                                       refcount1 > refcount2,
2349                                       QCOW2_DISCARD_ALWAYS);
2350                 if (ret >= 0) {
2351                     (*num_fixed)++;
2352                     continue;
2353                 }
2354             }
2355 
2356             /* And if we couldn't, print an error */
2357             if (refcount1 < refcount2) {
2358                 res->corruptions++;
2359             } else {
2360                 res->leaks++;
2361             }
2362         }
2363     }
2364 }
2365 
2366 /*
2367  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
2368  * the on-disk refcount structures.
2369  *
2370  * On input, *first_free_cluster tells where to start looking, and need not
2371  * actually be a free cluster; the returned offset will not be before that
2372  * cluster.  On output, *first_free_cluster points to the first gap found, even
2373  * if that gap was too small to be used as the returned offset.
2374  *
2375  * Note that *first_free_cluster is a cluster index whereas the return value is
2376  * an offset.
2377  */
2378 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
2379                                    int cluster_count,
2380                                    void **refcount_table,
2381                                    int64_t *imrt_nb_clusters,
2382                                    int64_t *first_free_cluster)
2383 {
2384     BDRVQcow2State *s = bs->opaque;
2385     int64_t cluster = *first_free_cluster, i;
2386     bool first_gap = true;
2387     int contiguous_free_clusters;
2388     int ret;
2389 
2390     /* Starting at *first_free_cluster, find a range of at least cluster_count
2391      * continuously free clusters */
2392     for (contiguous_free_clusters = 0;
2393          cluster < *imrt_nb_clusters &&
2394          contiguous_free_clusters < cluster_count;
2395          cluster++)
2396     {
2397         if (!s->get_refcount(*refcount_table, cluster)) {
2398             contiguous_free_clusters++;
2399             if (first_gap) {
2400                 /* If this is the first free cluster found, update
2401                  * *first_free_cluster accordingly */
2402                 *first_free_cluster = cluster;
2403                 first_gap = false;
2404             }
2405         } else if (contiguous_free_clusters) {
2406             contiguous_free_clusters = 0;
2407         }
2408     }
2409 
2410     /* If contiguous_free_clusters is greater than zero, it contains the number
2411      * of continuously free clusters until the current cluster; the first free
2412      * cluster in the current "gap" is therefore
2413      * cluster - contiguous_free_clusters */
2414 
2415     /* If no such range could be found, grow the in-memory refcount table
2416      * accordingly to append free clusters at the end of the image */
2417     if (contiguous_free_clusters < cluster_count) {
2418         /* contiguous_free_clusters clusters are already empty at the image end;
2419          * we need cluster_count clusters; therefore, we have to allocate
2420          * cluster_count - contiguous_free_clusters new clusters at the end of
2421          * the image (which is the current value of cluster; note that cluster
2422          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
2423          * the image end) */
2424         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
2425                                      cluster + cluster_count
2426                                      - contiguous_free_clusters);
2427         if (ret < 0) {
2428             return ret;
2429         }
2430     }
2431 
2432     /* Go back to the first free cluster */
2433     cluster -= contiguous_free_clusters;
2434     for (i = 0; i < cluster_count; i++) {
2435         s->set_refcount(*refcount_table, cluster + i, 1);
2436     }
2437 
2438     return cluster << s->cluster_bits;
2439 }
2440 
2441 /*
2442  * Helper function for rebuild_refcount_structure().
2443  *
2444  * Scan the range of clusters [first_cluster, end_cluster) for allocated
2445  * clusters and write all corresponding refblocks to disk.  The refblock
2446  * and allocation data is taken from the in-memory refcount table
2447  * *refcount_table[] (of size *nb_clusters), which is basically one big
2448  * (unlimited size) refblock for the whole image.
2449  *
2450  * For these refblocks, clusters are allocated using said in-memory
2451  * refcount table.  Care is taken that these allocations are reflected
2452  * in the refblocks written to disk.
2453  *
2454  * The refblocks' offsets are written into a reftable, which is
2455  * *on_disk_reftable_ptr[] (of size *on_disk_reftable_entries_ptr).  If
2456  * that reftable is of insufficient size, it will be resized to fit.
2457  * This reftable is not written to disk.
2458  *
2459  * (If *on_disk_reftable_ptr is not NULL, the entries within are assumed
2460  * to point to existing valid refblocks that do not need to be allocated
2461  * again.)
2462  *
2463  * Return whether the on-disk reftable array was resized (true/false),
2464  * or -errno on error.
2465  */
2466 static int rebuild_refcounts_write_refblocks(
2467         BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters,
2468         int64_t first_cluster, int64_t end_cluster,
2469         uint64_t **on_disk_reftable_ptr, uint32_t *on_disk_reftable_entries_ptr,
2470         Error **errp
2471     )
2472 {
2473     BDRVQcow2State *s = bs->opaque;
2474     int64_t cluster;
2475     int64_t refblock_offset, refblock_start, refblock_index;
2476     int64_t first_free_cluster = 0;
2477     uint64_t *on_disk_reftable = *on_disk_reftable_ptr;
2478     uint32_t on_disk_reftable_entries = *on_disk_reftable_entries_ptr;
2479     void *on_disk_refblock;
2480     bool reftable_grown = false;
2481     int ret;
2482 
2483     for (cluster = first_cluster; cluster < end_cluster; cluster++) {
2484         /* Check all clusters to find refblocks that contain non-zero entries */
2485         if (!s->get_refcount(*refcount_table, cluster)) {
2486             continue;
2487         }
2488 
2489         /*
2490          * This cluster is allocated, so we need to create a refblock
2491          * for it.  The data we will write to disk is just the
2492          * respective slice from *refcount_table, so it will contain
2493          * accurate refcounts for all clusters belonging to this
2494          * refblock.  After we have written it, we will therefore skip
2495          * all remaining clusters in this refblock.
2496          */
2497 
2498         refblock_index = cluster >> s->refcount_block_bits;
2499         refblock_start = refblock_index << s->refcount_block_bits;
2500 
2501         if (on_disk_reftable_entries > refblock_index &&
2502             on_disk_reftable[refblock_index])
2503         {
2504             /*
2505              * We can get here after a `goto write_refblocks`: We have a
2506              * reftable from a previous run, and the refblock is already
2507              * allocated.  No need to allocate it again.
2508              */
2509             refblock_offset = on_disk_reftable[refblock_index];
2510         } else {
2511             int64_t refblock_cluster_index;
2512 
2513             /* Don't allocate a cluster in a refblock already written to disk */
2514             if (first_free_cluster < refblock_start) {
2515                 first_free_cluster = refblock_start;
2516             }
2517             refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2518                                                   nb_clusters,
2519                                                   &first_free_cluster);
2520             if (refblock_offset < 0) {
2521                 error_setg_errno(errp, -refblock_offset,
2522                                  "ERROR allocating refblock");
2523                 return refblock_offset;
2524             }
2525 
2526             refblock_cluster_index = refblock_offset / s->cluster_size;
2527             if (refblock_cluster_index >= end_cluster) {
2528                 /*
2529                  * We must write the refblock that holds this refblock's
2530                  * refcount
2531                  */
2532                 end_cluster = refblock_cluster_index + 1;
2533             }
2534 
2535             if (on_disk_reftable_entries <= refblock_index) {
2536                 on_disk_reftable_entries =
2537                     ROUND_UP((refblock_index + 1) * REFTABLE_ENTRY_SIZE,
2538                              s->cluster_size) / REFTABLE_ENTRY_SIZE;
2539                 on_disk_reftable =
2540                     g_try_realloc(on_disk_reftable,
2541                                   on_disk_reftable_entries *
2542                                   REFTABLE_ENTRY_SIZE);
2543                 if (!on_disk_reftable) {
2544                     error_setg(errp, "ERROR allocating reftable memory");
2545                     return -ENOMEM;
2546                 }
2547 
2548                 memset(on_disk_reftable + *on_disk_reftable_entries_ptr, 0,
2549                        (on_disk_reftable_entries -
2550                         *on_disk_reftable_entries_ptr) *
2551                        REFTABLE_ENTRY_SIZE);
2552 
2553                 *on_disk_reftable_ptr = on_disk_reftable;
2554                 *on_disk_reftable_entries_ptr = on_disk_reftable_entries;
2555 
2556                 reftable_grown = true;
2557             } else {
2558                 assert(on_disk_reftable);
2559             }
2560             on_disk_reftable[refblock_index] = refblock_offset;
2561         }
2562 
2563         /* Refblock is allocated, write it to disk */
2564 
2565         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2566                                             s->cluster_size, false);
2567         if (ret < 0) {
2568             error_setg_errno(errp, -ret, "ERROR writing refblock");
2569             return ret;
2570         }
2571 
2572         /*
2573          * The refblock is simply a slice of *refcount_table.
2574          * Note that the size of *refcount_table is always aligned to
2575          * whole clusters, so the write operation will not result in
2576          * out-of-bounds accesses.
2577          */
2578         on_disk_refblock = (void *)((char *) *refcount_table +
2579                                     refblock_index * s->cluster_size);
2580 
2581         ret = bdrv_pwrite(bs->file, refblock_offset, s->cluster_size,
2582                           on_disk_refblock, 0);
2583         if (ret < 0) {
2584             error_setg_errno(errp, -ret, "ERROR writing refblock");
2585             return ret;
2586         }
2587 
2588         /* This refblock is done, skip to its end */
2589         cluster = refblock_start + s->refcount_block_size - 1;
2590     }
2591 
2592     return reftable_grown;
2593 }
2594 
2595 /*
2596  * Creates a new refcount structure based solely on the in-memory information
2597  * given through *refcount_table (this in-memory information is basically just
2598  * the concatenation of all refblocks).  All necessary allocations will be
2599  * reflected in that array.
2600  *
2601  * On success, the old refcount structure is leaked (it will be covered by the
2602  * new refcount structure).
2603  */
2604 static int rebuild_refcount_structure(BlockDriverState *bs,
2605                                       BdrvCheckResult *res,
2606                                       void **refcount_table,
2607                                       int64_t *nb_clusters,
2608                                       Error **errp)
2609 {
2610     BDRVQcow2State *s = bs->opaque;
2611     int64_t reftable_offset = -1;
2612     int64_t reftable_length = 0;
2613     int64_t reftable_clusters;
2614     int64_t refblock_index;
2615     uint32_t on_disk_reftable_entries = 0;
2616     uint64_t *on_disk_reftable = NULL;
2617     int ret = 0;
2618     int reftable_size_changed = 0;
2619     struct {
2620         uint64_t reftable_offset;
2621         uint32_t reftable_clusters;
2622     } QEMU_PACKED reftable_offset_and_clusters;
2623 
2624     qcow2_cache_empty(bs, s->refcount_block_cache);
2625 
2626     /*
2627      * For each refblock containing entries, we try to allocate a
2628      * cluster (in the in-memory refcount table) and write its offset
2629      * into on_disk_reftable[].  We then write the whole refblock to
2630      * disk (as a slice of the in-memory refcount table).
2631      * This is done by rebuild_refcounts_write_refblocks().
2632      *
2633      * Once we have scanned all clusters, we try to find space for the
2634      * reftable.  This will dirty the in-memory refcount table (i.e.
2635      * make it differ from the refblocks we have already written), so we
2636      * need to run rebuild_refcounts_write_refblocks() again for the
2637      * range of clusters where the reftable has been allocated.
2638      *
2639      * This second run might make the reftable grow again, in which case
2640      * we will need to allocate another space for it, which is why we
2641      * repeat all this until the reftable stops growing.
2642      *
2643      * (This loop will terminate, because with every cluster the
2644      * reftable grows, it can accomodate a multitude of more refcounts,
2645      * so that at some point this must be able to cover the reftable
2646      * and all refblocks describing it.)
2647      *
2648      * We then convert the reftable to big-endian and write it to disk.
2649      *
2650      * Note that we never free any reftable allocations.  Doing so would
2651      * needlessly complicate the algorithm: The eventual second check
2652      * run we do will clean up all leaks we have caused.
2653      */
2654 
2655     reftable_size_changed =
2656         rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2657                                           0, *nb_clusters,
2658                                           &on_disk_reftable,
2659                                           &on_disk_reftable_entries, errp);
2660     if (reftable_size_changed < 0) {
2661         res->check_errors++;
2662         ret = reftable_size_changed;
2663         goto fail;
2664     }
2665 
2666     /*
2667      * There was no reftable before, so rebuild_refcounts_write_refblocks()
2668      * must have increased its size (from 0 to something).
2669      */
2670     assert(reftable_size_changed);
2671 
2672     do {
2673         int64_t reftable_start_cluster, reftable_end_cluster;
2674         int64_t first_free_cluster = 0;
2675 
2676         reftable_length = on_disk_reftable_entries * REFTABLE_ENTRY_SIZE;
2677         reftable_clusters = size_to_clusters(s, reftable_length);
2678 
2679         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2680                                               refcount_table, nb_clusters,
2681                                               &first_free_cluster);
2682         if (reftable_offset < 0) {
2683             error_setg_errno(errp, -reftable_offset,
2684                              "ERROR allocating reftable");
2685             res->check_errors++;
2686             ret = reftable_offset;
2687             goto fail;
2688         }
2689 
2690         /*
2691          * We need to update the affected refblocks, so re-run the
2692          * write_refblocks loop for the reftable's range of clusters.
2693          */
2694         assert(offset_into_cluster(s, reftable_offset) == 0);
2695         reftable_start_cluster = reftable_offset / s->cluster_size;
2696         reftable_end_cluster = reftable_start_cluster + reftable_clusters;
2697         reftable_size_changed =
2698             rebuild_refcounts_write_refblocks(bs, refcount_table, nb_clusters,
2699                                               reftable_start_cluster,
2700                                               reftable_end_cluster,
2701                                               &on_disk_reftable,
2702                                               &on_disk_reftable_entries, errp);
2703         if (reftable_size_changed < 0) {
2704             res->check_errors++;
2705             ret = reftable_size_changed;
2706             goto fail;
2707         }
2708 
2709         /*
2710          * If the reftable size has changed, we will need to find a new
2711          * allocation, repeating the loop.
2712          */
2713     } while (reftable_size_changed);
2714 
2715     /* The above loop must have run at least once */
2716     assert(reftable_offset >= 0);
2717 
2718     /*
2719      * All allocations are done, all refblocks are written, convert the
2720      * reftable to big-endian and write it to disk.
2721      */
2722 
2723     for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2724          refblock_index++)
2725     {
2726         cpu_to_be64s(&on_disk_reftable[refblock_index]);
2727     }
2728 
2729     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, reftable_length,
2730                                         false);
2731     if (ret < 0) {
2732         error_setg_errno(errp, -ret, "ERROR writing reftable");
2733         goto fail;
2734     }
2735 
2736     assert(reftable_length < INT_MAX);
2737     ret = bdrv_pwrite(bs->file, reftable_offset, reftable_length,
2738                       on_disk_reftable, 0);
2739     if (ret < 0) {
2740         error_setg_errno(errp, -ret, "ERROR writing reftable");
2741         goto fail;
2742     }
2743 
2744     /* Enter new reftable into the image header */
2745     reftable_offset_and_clusters.reftable_offset = cpu_to_be64(reftable_offset);
2746     reftable_offset_and_clusters.reftable_clusters =
2747         cpu_to_be32(reftable_clusters);
2748     ret = bdrv_pwrite_sync(bs->file,
2749                            offsetof(QCowHeader, refcount_table_offset),
2750                            sizeof(reftable_offset_and_clusters),
2751                            &reftable_offset_and_clusters, 0);
2752     if (ret < 0) {
2753         error_setg_errno(errp, -ret, "ERROR setting reftable");
2754         goto fail;
2755     }
2756 
2757     for (refblock_index = 0; refblock_index < on_disk_reftable_entries;
2758          refblock_index++)
2759     {
2760         be64_to_cpus(&on_disk_reftable[refblock_index]);
2761     }
2762     s->refcount_table = on_disk_reftable;
2763     s->refcount_table_offset = reftable_offset;
2764     s->refcount_table_size = on_disk_reftable_entries;
2765     update_max_refcount_table_index(s);
2766 
2767     return 0;
2768 
2769 fail:
2770     g_free(on_disk_reftable);
2771     return ret;
2772 }
2773 
2774 /*
2775  * Checks an image for refcount consistency.
2776  *
2777  * Returns 0 if no errors are found, the number of errors in case the image is
2778  * detected as corrupted, and -errno when an internal error occurred.
2779  */
2780 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2781                           BdrvCheckMode fix)
2782 {
2783     BDRVQcow2State *s = bs->opaque;
2784     BdrvCheckResult pre_compare_res;
2785     int64_t size, highest_cluster, nb_clusters;
2786     void *refcount_table = NULL;
2787     bool rebuild = false;
2788     int ret;
2789 
2790     size = bdrv_getlength(bs->file->bs);
2791     if (size < 0) {
2792         res->check_errors++;
2793         return size;
2794     }
2795 
2796     nb_clusters = size_to_clusters(s, size);
2797     if (nb_clusters > INT_MAX) {
2798         res->check_errors++;
2799         return -EFBIG;
2800     }
2801 
2802     res->bfi.total_clusters =
2803         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2804 
2805     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2806                               &nb_clusters);
2807     if (ret < 0) {
2808         goto fail;
2809     }
2810 
2811     /* In case we don't need to rebuild the refcount structure (but want to fix
2812      * something), this function is immediately called again, in which case the
2813      * result should be ignored */
2814     pre_compare_res = *res;
2815     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2816                       nb_clusters);
2817 
2818     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2819         BdrvCheckResult old_res = *res;
2820         int fresh_leaks = 0;
2821         Error *local_err = NULL;
2822 
2823         fprintf(stderr, "Rebuilding refcount structure\n");
2824         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2825                                          &nb_clusters, &local_err);
2826         if (ret < 0) {
2827             error_report_err(local_err);
2828             goto fail;
2829         }
2830 
2831         res->corruptions = 0;
2832         res->leaks = 0;
2833 
2834         /* Because the old reftable has been exchanged for a new one the
2835          * references have to be recalculated */
2836         rebuild = false;
2837         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2838         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2839                                   &nb_clusters);
2840         if (ret < 0) {
2841             goto fail;
2842         }
2843 
2844         if (fix & BDRV_FIX_LEAKS) {
2845             /* The old refcount structures are now leaked, fix it; the result
2846              * can be ignored, aside from leaks which were introduced by
2847              * rebuild_refcount_structure() that could not be fixed */
2848             BdrvCheckResult saved_res = *res;
2849             *res = (BdrvCheckResult){ 0 };
2850 
2851             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2852                               &highest_cluster, refcount_table, nb_clusters);
2853             if (rebuild) {
2854                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2855                         "broken\n");
2856             }
2857 
2858             /* Any leaks accounted for here were introduced by
2859              * rebuild_refcount_structure() because that function has created a
2860              * new refcount structure from scratch */
2861             fresh_leaks = res->leaks;
2862             *res = saved_res;
2863         }
2864 
2865         if (res->corruptions < old_res.corruptions) {
2866             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2867         }
2868         if (res->leaks < old_res.leaks) {
2869             res->leaks_fixed += old_res.leaks - res->leaks;
2870         }
2871         res->leaks += fresh_leaks;
2872     } else if (fix) {
2873         if (rebuild) {
2874             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2875             res->check_errors++;
2876             ret = -EIO;
2877             goto fail;
2878         }
2879 
2880         if (res->leaks || res->corruptions) {
2881             *res = pre_compare_res;
2882             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2883                               refcount_table, nb_clusters);
2884         }
2885     }
2886 
2887     /* check OFLAG_COPIED */
2888     ret = check_oflag_copied(bs, res, fix);
2889     if (ret < 0) {
2890         goto fail;
2891     }
2892 
2893     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2894     ret = 0;
2895 
2896 fail:
2897     g_free(refcount_table);
2898 
2899     return ret;
2900 }
2901 
2902 #define overlaps_with(ofs, sz) \
2903     ranges_overlap(offset, size, ofs, sz)
2904 
2905 /*
2906  * Checks if the given offset into the image file is actually free to use by
2907  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2908  * i.e. a sanity check without relying on the refcount tables.
2909  *
2910  * The ign parameter specifies what checks not to perform (being a bitmask of
2911  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2912  *
2913  * Returns:
2914  * - 0 if writing to this offset will not affect the mentioned metadata
2915  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2916  * - a negative value (-errno) indicating an error while performing a check,
2917  *   e.g. when bdrv_pread failed on QCOW2_OL_INACTIVE_L2
2918  */
2919 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2920                                  int64_t size)
2921 {
2922     BDRVQcow2State *s = bs->opaque;
2923     int chk = s->overlap_check & ~ign;
2924     int i, j;
2925 
2926     if (!size) {
2927         return 0;
2928     }
2929 
2930     if (chk & QCOW2_OL_MAIN_HEADER) {
2931         if (offset < s->cluster_size) {
2932             return QCOW2_OL_MAIN_HEADER;
2933         }
2934     }
2935 
2936     /* align range to test to cluster boundaries */
2937     size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
2938     offset = start_of_cluster(s, offset);
2939 
2940     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2941         if (overlaps_with(s->l1_table_offset, s->l1_size * L1E_SIZE)) {
2942             return QCOW2_OL_ACTIVE_L1;
2943         }
2944     }
2945 
2946     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2947         if (overlaps_with(s->refcount_table_offset,
2948             s->refcount_table_size * REFTABLE_ENTRY_SIZE)) {
2949             return QCOW2_OL_REFCOUNT_TABLE;
2950         }
2951     }
2952 
2953     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2954         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2955             return QCOW2_OL_SNAPSHOT_TABLE;
2956         }
2957     }
2958 
2959     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2960         for (i = 0; i < s->nb_snapshots; i++) {
2961             if (s->snapshots[i].l1_size &&
2962                 overlaps_with(s->snapshots[i].l1_table_offset,
2963                 s->snapshots[i].l1_size * L1E_SIZE)) {
2964                 return QCOW2_OL_INACTIVE_L1;
2965             }
2966         }
2967     }
2968 
2969     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2970         for (i = 0; i < s->l1_size; i++) {
2971             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2972                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2973                 s->cluster_size)) {
2974                 return QCOW2_OL_ACTIVE_L2;
2975             }
2976         }
2977     }
2978 
2979     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2980         unsigned last_entry = s->max_refcount_table_index;
2981         assert(last_entry < s->refcount_table_size);
2982         assert(last_entry + 1 == s->refcount_table_size ||
2983                (s->refcount_table[last_entry + 1] & REFT_OFFSET_MASK) == 0);
2984         for (i = 0; i <= last_entry; i++) {
2985             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2986                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2987                 s->cluster_size)) {
2988                 return QCOW2_OL_REFCOUNT_BLOCK;
2989             }
2990         }
2991     }
2992 
2993     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2994         for (i = 0; i < s->nb_snapshots; i++) {
2995             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2996             uint32_t l1_sz  = s->snapshots[i].l1_size;
2997             uint64_t l1_sz2 = l1_sz * L1E_SIZE;
2998             uint64_t *l1;
2999             int ret;
3000 
3001             ret = qcow2_validate_table(bs, l1_ofs, l1_sz, L1E_SIZE,
3002                                        QCOW_MAX_L1_SIZE, "", NULL);
3003             if (ret < 0) {
3004                 return ret;
3005             }
3006 
3007             l1 = g_try_malloc(l1_sz2);
3008 
3009             if (l1_sz2 && l1 == NULL) {
3010                 return -ENOMEM;
3011             }
3012 
3013             ret = bdrv_pread(bs->file, l1_ofs, l1_sz2, l1, 0);
3014             if (ret < 0) {
3015                 g_free(l1);
3016                 return ret;
3017             }
3018 
3019             for (j = 0; j < l1_sz; j++) {
3020                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
3021                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
3022                     g_free(l1);
3023                     return QCOW2_OL_INACTIVE_L2;
3024                 }
3025             }
3026 
3027             g_free(l1);
3028         }
3029     }
3030 
3031     if ((chk & QCOW2_OL_BITMAP_DIRECTORY) &&
3032         (s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS))
3033     {
3034         if (overlaps_with(s->bitmap_directory_offset,
3035                           s->bitmap_directory_size))
3036         {
3037             return QCOW2_OL_BITMAP_DIRECTORY;
3038         }
3039     }
3040 
3041     return 0;
3042 }
3043 
3044 static const char *metadata_ol_names[] = {
3045     [QCOW2_OL_MAIN_HEADER_BITNR]        = "qcow2_header",
3046     [QCOW2_OL_ACTIVE_L1_BITNR]          = "active L1 table",
3047     [QCOW2_OL_ACTIVE_L2_BITNR]          = "active L2 table",
3048     [QCOW2_OL_REFCOUNT_TABLE_BITNR]     = "refcount table",
3049     [QCOW2_OL_REFCOUNT_BLOCK_BITNR]     = "refcount block",
3050     [QCOW2_OL_SNAPSHOT_TABLE_BITNR]     = "snapshot table",
3051     [QCOW2_OL_INACTIVE_L1_BITNR]        = "inactive L1 table",
3052     [QCOW2_OL_INACTIVE_L2_BITNR]        = "inactive L2 table",
3053     [QCOW2_OL_BITMAP_DIRECTORY_BITNR]   = "bitmap directory",
3054 };
3055 QEMU_BUILD_BUG_ON(QCOW2_OL_MAX_BITNR != ARRAY_SIZE(metadata_ol_names));
3056 
3057 /*
3058  * First performs a check for metadata overlaps (through
3059  * qcow2_check_metadata_overlap); if that fails with a negative value (error
3060  * while performing a check), that value is returned. If an impending overlap
3061  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
3062  * and -EIO returned.
3063  *
3064  * Returns 0 if there were neither overlaps nor errors while checking for
3065  * overlaps; or a negative value (-errno) on error.
3066  */
3067 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
3068                                   int64_t size, bool data_file)
3069 {
3070     int ret;
3071 
3072     if (data_file && has_data_file(bs)) {
3073         return 0;
3074     }
3075 
3076     ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
3077     if (ret < 0) {
3078         return ret;
3079     } else if (ret > 0) {
3080         int metadata_ol_bitnr = ctz32(ret);
3081         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
3082 
3083         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
3084                                 "write on metadata (overlaps with %s)",
3085                                 metadata_ol_names[metadata_ol_bitnr]);
3086         return -EIO;
3087     }
3088 
3089     return 0;
3090 }
3091 
3092 /* A pointer to a function of this type is given to walk_over_reftable(). That
3093  * function will create refblocks and pass them to a RefblockFinishOp once they
3094  * are completed (@refblock). @refblock_empty is set if the refblock is
3095  * completely empty.
3096  *
3097  * Along with the refblock, a corresponding reftable entry is passed, in the
3098  * reftable @reftable (which may be reallocated) at @reftable_index.
3099  *
3100  * @allocated should be set to true if a new cluster has been allocated.
3101  */
3102 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
3103                                uint64_t reftable_index, uint64_t *reftable_size,
3104                                void *refblock, bool refblock_empty,
3105                                bool *allocated, Error **errp);
3106 
3107 /**
3108  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
3109  * it is not empty) and inserts its offset into the new reftable. The size of
3110  * this new reftable is increased as required.
3111  */
3112 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
3113                           uint64_t reftable_index, uint64_t *reftable_size,
3114                           void *refblock, bool refblock_empty, bool *allocated,
3115                           Error **errp)
3116 {
3117     BDRVQcow2State *s = bs->opaque;
3118     int64_t offset;
3119 
3120     if (!refblock_empty && reftable_index >= *reftable_size) {
3121         uint64_t *new_reftable;
3122         uint64_t new_reftable_size;
3123 
3124         new_reftable_size = ROUND_UP(reftable_index + 1,
3125                                      s->cluster_size / REFTABLE_ENTRY_SIZE);
3126         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / REFTABLE_ENTRY_SIZE) {
3127             error_setg(errp,
3128                        "This operation would make the refcount table grow "
3129                        "beyond the maximum size supported by QEMU, aborting");
3130             return -ENOTSUP;
3131         }
3132 
3133         new_reftable = g_try_realloc(*reftable, new_reftable_size *
3134                                                 REFTABLE_ENTRY_SIZE);
3135         if (!new_reftable) {
3136             error_setg(errp, "Failed to increase reftable buffer size");
3137             return -ENOMEM;
3138         }
3139 
3140         memset(new_reftable + *reftable_size, 0,
3141                (new_reftable_size - *reftable_size) * REFTABLE_ENTRY_SIZE);
3142 
3143         *reftable      = new_reftable;
3144         *reftable_size = new_reftable_size;
3145     }
3146 
3147     if (!refblock_empty && !(*reftable)[reftable_index]) {
3148         offset = qcow2_alloc_clusters(bs, s->cluster_size);
3149         if (offset < 0) {
3150             error_setg_errno(errp, -offset, "Failed to allocate refblock");
3151             return offset;
3152         }
3153         (*reftable)[reftable_index] = offset;
3154         *allocated = true;
3155     }
3156 
3157     return 0;
3158 }
3159 
3160 /**
3161  * This "operation" for walk_over_reftable() writes the refblock to disk at the
3162  * offset specified by the new reftable's entry. It does not modify the new
3163  * reftable or change any refcounts.
3164  */
3165 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
3166                           uint64_t reftable_index, uint64_t *reftable_size,
3167                           void *refblock, bool refblock_empty, bool *allocated,
3168                           Error **errp)
3169 {
3170     BDRVQcow2State *s = bs->opaque;
3171     int64_t offset;
3172     int ret;
3173 
3174     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
3175         offset = (*reftable)[reftable_index];
3176 
3177         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size,
3178                                             false);
3179         if (ret < 0) {
3180             error_setg_errno(errp, -ret, "Overlap check failed");
3181             return ret;
3182         }
3183 
3184         ret = bdrv_pwrite(bs->file, offset, s->cluster_size, refblock, 0);
3185         if (ret < 0) {
3186             error_setg_errno(errp, -ret, "Failed to write refblock");
3187             return ret;
3188         }
3189     } else {
3190         assert(refblock_empty);
3191     }
3192 
3193     return 0;
3194 }
3195 
3196 /**
3197  * This function walks over the existing reftable and every referenced refblock;
3198  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
3199  * create an equal new entry in the passed @new_refblock. Once that
3200  * @new_refblock is completely filled, @operation will be called.
3201  *
3202  * @status_cb and @cb_opaque are used for the amend operation's status callback.
3203  * @index is the index of the walk_over_reftable() calls and @total is the total
3204  * number of walk_over_reftable() calls per amend operation. Both are used for
3205  * calculating the parameters for the status callback.
3206  *
3207  * @allocated is set to true if a new cluster has been allocated.
3208  */
3209 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
3210                               uint64_t *new_reftable_index,
3211                               uint64_t *new_reftable_size,
3212                               void *new_refblock, int new_refblock_size,
3213                               int new_refcount_bits,
3214                               RefblockFinishOp *operation, bool *allocated,
3215                               Qcow2SetRefcountFunc *new_set_refcount,
3216                               BlockDriverAmendStatusCB *status_cb,
3217                               void *cb_opaque, int index, int total,
3218                               Error **errp)
3219 {
3220     BDRVQcow2State *s = bs->opaque;
3221     uint64_t reftable_index;
3222     bool new_refblock_empty = true;
3223     int refblock_index;
3224     int new_refblock_index = 0;
3225     int ret;
3226 
3227     for (reftable_index = 0; reftable_index < s->refcount_table_size;
3228          reftable_index++)
3229     {
3230         uint64_t refblock_offset = s->refcount_table[reftable_index]
3231                                  & REFT_OFFSET_MASK;
3232 
3233         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
3234                   (uint64_t)total * s->refcount_table_size, cb_opaque);
3235 
3236         if (refblock_offset) {
3237             void *refblock;
3238 
3239             if (offset_into_cluster(s, refblock_offset)) {
3240                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
3241                                         PRIx64 " unaligned (reftable index: %#"
3242                                         PRIx64 ")", refblock_offset,
3243                                         reftable_index);
3244                 error_setg(errp,
3245                            "Image is corrupt (unaligned refblock offset)");
3246                 return -EIO;
3247             }
3248 
3249             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
3250                                   &refblock);
3251             if (ret < 0) {
3252                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
3253                 return ret;
3254             }
3255 
3256             for (refblock_index = 0; refblock_index < s->refcount_block_size;
3257                  refblock_index++)
3258             {
3259                 uint64_t refcount;
3260 
3261                 if (new_refblock_index >= new_refblock_size) {
3262                     /* new_refblock is now complete */
3263                     ret = operation(bs, new_reftable, *new_reftable_index,
3264                                     new_reftable_size, new_refblock,
3265                                     new_refblock_empty, allocated, errp);
3266                     if (ret < 0) {
3267                         qcow2_cache_put(s->refcount_block_cache, &refblock);
3268                         return ret;
3269                     }
3270 
3271                     (*new_reftable_index)++;
3272                     new_refblock_index = 0;
3273                     new_refblock_empty = true;
3274                 }
3275 
3276                 refcount = s->get_refcount(refblock, refblock_index);
3277                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
3278                     uint64_t offset;
3279 
3280                     qcow2_cache_put(s->refcount_block_cache, &refblock);
3281 
3282                     offset = ((reftable_index << s->refcount_block_bits)
3283                               + refblock_index) << s->cluster_bits;
3284 
3285                     error_setg(errp, "Cannot decrease refcount entry width to "
3286                                "%i bits: Cluster at offset %#" PRIx64 " has a "
3287                                "refcount of %" PRIu64, new_refcount_bits,
3288                                offset, refcount);
3289                     return -EINVAL;
3290                 }
3291 
3292                 if (new_set_refcount) {
3293                     new_set_refcount(new_refblock, new_refblock_index++,
3294                                      refcount);
3295                 } else {
3296                     new_refblock_index++;
3297                 }
3298                 new_refblock_empty = new_refblock_empty && refcount == 0;
3299             }
3300 
3301             qcow2_cache_put(s->refcount_block_cache, &refblock);
3302         } else {
3303             /* No refblock means every refcount is 0 */
3304             for (refblock_index = 0; refblock_index < s->refcount_block_size;
3305                  refblock_index++)
3306             {
3307                 if (new_refblock_index >= new_refblock_size) {
3308                     /* new_refblock is now complete */
3309                     ret = operation(bs, new_reftable, *new_reftable_index,
3310                                     new_reftable_size, new_refblock,
3311                                     new_refblock_empty, allocated, errp);
3312                     if (ret < 0) {
3313                         return ret;
3314                     }
3315 
3316                     (*new_reftable_index)++;
3317                     new_refblock_index = 0;
3318                     new_refblock_empty = true;
3319                 }
3320 
3321                 if (new_set_refcount) {
3322                     new_set_refcount(new_refblock, new_refblock_index++, 0);
3323                 } else {
3324                     new_refblock_index++;
3325                 }
3326             }
3327         }
3328     }
3329 
3330     if (new_refblock_index > 0) {
3331         /* Complete the potentially existing partially filled final refblock */
3332         if (new_set_refcount) {
3333             for (; new_refblock_index < new_refblock_size;
3334                  new_refblock_index++)
3335             {
3336                 new_set_refcount(new_refblock, new_refblock_index, 0);
3337             }
3338         }
3339 
3340         ret = operation(bs, new_reftable, *new_reftable_index,
3341                         new_reftable_size, new_refblock, new_refblock_empty,
3342                         allocated, errp);
3343         if (ret < 0) {
3344             return ret;
3345         }
3346 
3347         (*new_reftable_index)++;
3348     }
3349 
3350     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
3351               (uint64_t)total * s->refcount_table_size, cb_opaque);
3352 
3353     return 0;
3354 }
3355 
3356 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
3357                                 BlockDriverAmendStatusCB *status_cb,
3358                                 void *cb_opaque, Error **errp)
3359 {
3360     BDRVQcow2State *s = bs->opaque;
3361     Qcow2GetRefcountFunc *new_get_refcount;
3362     Qcow2SetRefcountFunc *new_set_refcount;
3363     void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
3364     uint64_t *new_reftable = NULL, new_reftable_size = 0;
3365     uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
3366     uint64_t new_reftable_index = 0;
3367     uint64_t i;
3368     int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
3369     int new_refblock_size, new_refcount_bits = 1 << refcount_order;
3370     int old_refcount_order;
3371     int walk_index = 0;
3372     int ret;
3373     bool new_allocation;
3374 
3375     assert(s->qcow_version >= 3);
3376     assert(refcount_order >= 0 && refcount_order <= 6);
3377 
3378     /* see qcow2_open() */
3379     new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
3380 
3381     new_get_refcount = get_refcount_funcs[refcount_order];
3382     new_set_refcount = set_refcount_funcs[refcount_order];
3383 
3384 
3385     do {
3386         int total_walks;
3387 
3388         new_allocation = false;
3389 
3390         /* At least we have to do this walk and the one which writes the
3391          * refblocks; also, at least we have to do this loop here at least
3392          * twice (normally), first to do the allocations, and second to
3393          * determine that everything is correctly allocated, this then makes
3394          * three walks in total */
3395         total_walks = MAX(walk_index + 2, 3);
3396 
3397         /* First, allocate the structures so they are present in the refcount
3398          * structures */
3399         ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3400                                  &new_reftable_size, NULL, new_refblock_size,
3401                                  new_refcount_bits, &alloc_refblock,
3402                                  &new_allocation, NULL, status_cb, cb_opaque,
3403                                  walk_index++, total_walks, errp);
3404         if (ret < 0) {
3405             goto done;
3406         }
3407 
3408         new_reftable_index = 0;
3409 
3410         if (new_allocation) {
3411             if (new_reftable_offset) {
3412                 qcow2_free_clusters(
3413                     bs, new_reftable_offset,
3414                     allocated_reftable_size * REFTABLE_ENTRY_SIZE,
3415                     QCOW2_DISCARD_NEVER);
3416             }
3417 
3418             new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
3419                                                            REFTABLE_ENTRY_SIZE);
3420             if (new_reftable_offset < 0) {
3421                 error_setg_errno(errp, -new_reftable_offset,
3422                                  "Failed to allocate the new reftable");
3423                 ret = new_reftable_offset;
3424                 goto done;
3425             }
3426             allocated_reftable_size = new_reftable_size;
3427         }
3428     } while (new_allocation);
3429 
3430     /* Second, write the new refblocks */
3431     ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
3432                              &new_reftable_size, new_refblock,
3433                              new_refblock_size, new_refcount_bits,
3434                              &flush_refblock, &new_allocation, new_set_refcount,
3435                              status_cb, cb_opaque, walk_index, walk_index + 1,
3436                              errp);
3437     if (ret < 0) {
3438         goto done;
3439     }
3440     assert(!new_allocation);
3441 
3442 
3443     /* Write the new reftable */
3444     ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
3445                                         new_reftable_size * REFTABLE_ENTRY_SIZE,
3446                                         false);
3447     if (ret < 0) {
3448         error_setg_errno(errp, -ret, "Overlap check failed");
3449         goto done;
3450     }
3451 
3452     for (i = 0; i < new_reftable_size; i++) {
3453         cpu_to_be64s(&new_reftable[i]);
3454     }
3455 
3456     ret = bdrv_pwrite(bs->file, new_reftable_offset,
3457                       new_reftable_size * REFTABLE_ENTRY_SIZE, new_reftable,
3458                       0);
3459 
3460     for (i = 0; i < new_reftable_size; i++) {
3461         be64_to_cpus(&new_reftable[i]);
3462     }
3463 
3464     if (ret < 0) {
3465         error_setg_errno(errp, -ret, "Failed to write the new reftable");
3466         goto done;
3467     }
3468 
3469 
3470     /* Empty the refcount cache */
3471     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
3472     if (ret < 0) {
3473         error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
3474         goto done;
3475     }
3476 
3477     /* Update the image header to point to the new reftable; this only updates
3478      * the fields which are relevant to qcow2_update_header(); other fields
3479      * such as s->refcount_table or s->refcount_bits stay stale for now
3480      * (because we have to restore everything if qcow2_update_header() fails) */
3481     old_refcount_order  = s->refcount_order;
3482     old_reftable_size   = s->refcount_table_size;
3483     old_reftable_offset = s->refcount_table_offset;
3484 
3485     s->refcount_order        = refcount_order;
3486     s->refcount_table_size   = new_reftable_size;
3487     s->refcount_table_offset = new_reftable_offset;
3488 
3489     ret = qcow2_update_header(bs);
3490     if (ret < 0) {
3491         s->refcount_order        = old_refcount_order;
3492         s->refcount_table_size   = old_reftable_size;
3493         s->refcount_table_offset = old_reftable_offset;
3494         error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
3495         goto done;
3496     }
3497 
3498     /* Now update the rest of the in-memory information */
3499     old_reftable = s->refcount_table;
3500     s->refcount_table = new_reftable;
3501     update_max_refcount_table_index(s);
3502 
3503     s->refcount_bits = 1 << refcount_order;
3504     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
3505     s->refcount_max += s->refcount_max - 1;
3506 
3507     s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
3508     s->refcount_block_size = 1 << s->refcount_block_bits;
3509 
3510     s->get_refcount = new_get_refcount;
3511     s->set_refcount = new_set_refcount;
3512 
3513     /* For cleaning up all old refblocks and the old reftable below the "done"
3514      * label */
3515     new_reftable        = old_reftable;
3516     new_reftable_size   = old_reftable_size;
3517     new_reftable_offset = old_reftable_offset;
3518 
3519 done:
3520     if (new_reftable) {
3521         /* On success, new_reftable actually points to the old reftable (and
3522          * new_reftable_size is the old reftable's size); but that is just
3523          * fine */
3524         for (i = 0; i < new_reftable_size; i++) {
3525             uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
3526             if (offset) {
3527                 qcow2_free_clusters(bs, offset, s->cluster_size,
3528                                     QCOW2_DISCARD_OTHER);
3529             }
3530         }
3531         g_free(new_reftable);
3532 
3533         if (new_reftable_offset > 0) {
3534             qcow2_free_clusters(bs, new_reftable_offset,
3535                                 new_reftable_size * REFTABLE_ENTRY_SIZE,
3536                                 QCOW2_DISCARD_OTHER);
3537         }
3538     }
3539 
3540     qemu_vfree(new_refblock);
3541     return ret;
3542 }
3543 
3544 static int64_t get_refblock_offset(BlockDriverState *bs, uint64_t offset)
3545 {
3546     BDRVQcow2State *s = bs->opaque;
3547     uint32_t index = offset_to_reftable_index(s, offset);
3548     int64_t covering_refblock_offset = 0;
3549 
3550     if (index < s->refcount_table_size) {
3551         covering_refblock_offset = s->refcount_table[index] & REFT_OFFSET_MASK;
3552     }
3553     if (!covering_refblock_offset) {
3554         qcow2_signal_corruption(bs, true, -1, -1, "Refblock at %#" PRIx64 " is "
3555                                 "not covered by the refcount structures",
3556                                 offset);
3557         return -EIO;
3558     }
3559 
3560     return covering_refblock_offset;
3561 }
3562 
3563 static int coroutine_fn
3564 qcow2_discard_refcount_block(BlockDriverState *bs, uint64_t discard_block_offs)
3565 {
3566     BDRVQcow2State *s = bs->opaque;
3567     int64_t refblock_offs;
3568     uint64_t cluster_index = discard_block_offs >> s->cluster_bits;
3569     uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
3570     void *refblock;
3571     int ret;
3572 
3573     refblock_offs = get_refblock_offset(bs, discard_block_offs);
3574     if (refblock_offs < 0) {
3575         return refblock_offs;
3576     }
3577 
3578     assert(discard_block_offs != 0);
3579 
3580     ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3581                           &refblock);
3582     if (ret < 0) {
3583         return ret;
3584     }
3585 
3586     if (s->get_refcount(refblock, block_index) != 1) {
3587         qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
3588                                 " refblock offset %#" PRIx64
3589                                 ", reftable index %u"
3590                                 ", block offset %#" PRIx64
3591                                 ", refcount %#" PRIx64,
3592                                 refblock_offs,
3593                                 offset_to_reftable_index(s, discard_block_offs),
3594                                 discard_block_offs,
3595                                 s->get_refcount(refblock, block_index));
3596         qcow2_cache_put(s->refcount_block_cache, &refblock);
3597         return -EINVAL;
3598     }
3599     s->set_refcount(refblock, block_index, 0);
3600 
3601     qcow2_cache_entry_mark_dirty(s->refcount_block_cache, refblock);
3602 
3603     qcow2_cache_put(s->refcount_block_cache, &refblock);
3604 
3605     if (cluster_index < s->free_cluster_index) {
3606         s->free_cluster_index = cluster_index;
3607     }
3608 
3609     refblock = qcow2_cache_is_table_offset(s->refcount_block_cache,
3610                                            discard_block_offs);
3611     if (refblock) {
3612         /* discard refblock from the cache if refblock is cached */
3613         qcow2_cache_discard(s->refcount_block_cache, refblock);
3614     }
3615     update_refcount_discard(bs, discard_block_offs, s->cluster_size);
3616 
3617     return 0;
3618 }
3619 
3620 int coroutine_fn qcow2_shrink_reftable(BlockDriverState *bs)
3621 {
3622     BDRVQcow2State *s = bs->opaque;
3623     uint64_t *reftable_tmp =
3624         g_malloc(s->refcount_table_size * REFTABLE_ENTRY_SIZE);
3625     int i, ret;
3626 
3627     for (i = 0; i < s->refcount_table_size; i++) {
3628         int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
3629         void *refblock;
3630         bool unused_block;
3631 
3632         if (refblock_offs == 0) {
3633             reftable_tmp[i] = 0;
3634             continue;
3635         }
3636         ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
3637                               &refblock);
3638         if (ret < 0) {
3639             goto out;
3640         }
3641 
3642         /* the refblock has own reference */
3643         if (i == offset_to_reftable_index(s, refblock_offs)) {
3644             uint64_t block_index = (refblock_offs >> s->cluster_bits) &
3645                                    (s->refcount_block_size - 1);
3646             uint64_t refcount = s->get_refcount(refblock, block_index);
3647 
3648             s->set_refcount(refblock, block_index, 0);
3649 
3650             unused_block = buffer_is_zero(refblock, s->cluster_size);
3651 
3652             s->set_refcount(refblock, block_index, refcount);
3653         } else {
3654             unused_block = buffer_is_zero(refblock, s->cluster_size);
3655         }
3656         qcow2_cache_put(s->refcount_block_cache, &refblock);
3657 
3658         reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
3659     }
3660 
3661     ret = bdrv_co_pwrite_sync(bs->file, s->refcount_table_offset,
3662                               s->refcount_table_size * REFTABLE_ENTRY_SIZE,
3663                               reftable_tmp, 0);
3664     /*
3665      * If the write in the reftable failed the image may contain a partially
3666      * overwritten reftable. In this case it would be better to clear the
3667      * reftable in memory to avoid possible image corruption.
3668      */
3669     for (i = 0; i < s->refcount_table_size; i++) {
3670         if (s->refcount_table[i] && !reftable_tmp[i]) {
3671             if (ret == 0) {
3672                 ret = qcow2_discard_refcount_block(bs, s->refcount_table[i] &
3673                                                        REFT_OFFSET_MASK);
3674             }
3675             s->refcount_table[i] = 0;
3676         }
3677     }
3678 
3679     if (!s->cache_discards) {
3680         qcow2_process_discards(bs, ret);
3681     }
3682 
3683 out:
3684     g_free(reftable_tmp);
3685     return ret;
3686 }
3687 
3688 int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t size)
3689 {
3690     BDRVQcow2State *s = bs->opaque;
3691     int64_t i;
3692 
3693     for (i = size_to_clusters(s, size) - 1; i >= 0; i--) {
3694         uint64_t refcount;
3695         int ret = qcow2_get_refcount(bs, i, &refcount);
3696         if (ret < 0) {
3697             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
3698                     i, strerror(-ret));
3699             return ret;
3700         }
3701         if (refcount > 0) {
3702             return i;
3703         }
3704     }
3705     qcow2_signal_corruption(bs, true, -1, -1,
3706                             "There are no references in the refcount table.");
3707     return -EIO;
3708 }
3709 
3710 int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs)
3711 {
3712     BDRVQcow2State *s = bs->opaque;
3713     int64_t i, end_cluster, cluster_count = 0, threshold;
3714     int64_t file_length, real_allocation, real_clusters;
3715 
3716     qemu_co_mutex_assert_locked(&s->lock);
3717 
3718     file_length = bdrv_co_getlength(bs->file->bs);
3719     if (file_length < 0) {
3720         return file_length;
3721     }
3722 
3723     real_allocation = bdrv_co_get_allocated_file_size(bs->file->bs);
3724     if (real_allocation < 0) {
3725         return real_allocation;
3726     }
3727 
3728     real_clusters = real_allocation / s->cluster_size;
3729     threshold = MAX(real_clusters * 10 / 9, real_clusters + 2);
3730 
3731     end_cluster = size_to_clusters(s, file_length);
3732     for (i = 0; i < end_cluster && cluster_count < threshold; i++) {
3733         uint64_t refcount;
3734         int ret = qcow2_get_refcount(bs, i, &refcount);
3735         if (ret < 0) {
3736             return ret;
3737         }
3738         cluster_count += !!refcount;
3739     }
3740 
3741     return cluster_count >= threshold;
3742 }
3743