xref: /qemu/block/qcow2-cluster.c (revision b9be6fae)
145aba42fSKevin Wolf /*
245aba42fSKevin Wolf  * Block driver for the QCOW version 2 format
345aba42fSKevin Wolf  *
445aba42fSKevin Wolf  * Copyright (c) 2004-2006 Fabrice Bellard
545aba42fSKevin Wolf  *
645aba42fSKevin Wolf  * Permission is hereby granted, free of charge, to any person obtaining a copy
745aba42fSKevin Wolf  * of this software and associated documentation files (the "Software"), to deal
845aba42fSKevin Wolf  * in the Software without restriction, including without limitation the rights
945aba42fSKevin Wolf  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1045aba42fSKevin Wolf  * copies of the Software, and to permit persons to whom the Software is
1145aba42fSKevin Wolf  * furnished to do so, subject to the following conditions:
1245aba42fSKevin Wolf  *
1345aba42fSKevin Wolf  * The above copyright notice and this permission notice shall be included in
1445aba42fSKevin Wolf  * all copies or substantial portions of the Software.
1545aba42fSKevin Wolf  *
1645aba42fSKevin Wolf  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1745aba42fSKevin Wolf  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1845aba42fSKevin Wolf  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
1945aba42fSKevin Wolf  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2045aba42fSKevin Wolf  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
2145aba42fSKevin Wolf  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
2245aba42fSKevin Wolf  * THE SOFTWARE.
2345aba42fSKevin Wolf  */
2445aba42fSKevin Wolf 
2580c71a24SPeter Maydell #include "qemu/osdep.h"
2645aba42fSKevin Wolf #include <zlib.h>
2745aba42fSKevin Wolf 
28c9a442e4SAlberto Garcia #include "qapi/error.h"
290d8c41daSMichael S. Tsirkin #include "qcow2.h"
3058369e22SPaolo Bonzini #include "qemu/bswap.h"
313cce16f4SKevin Wolf #include "trace.h"
3245aba42fSKevin Wolf 
/*
 * qcow2_shrink_l1_table
 *
 * Shrink the active L1 table to @exact_size entries and free the L2
 * tables referenced by the dropped entries.  s->l1_size itself is not
 * modified here; the caller is responsible for updating it.
 *
 * Returns 0 on success, -errno on failure.  On failure the in-memory
 * tail of the L1 table is cleared because the on-disk table may have
 * been partially overwritten already.
 */
int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size, i, ret;

    /* Nothing to do if the table is not actually getting smaller */
    if (exact_size >= s->l1_size) {
        return 0;
    }

    new_l1_size = exact_size;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
#endif

    /*
     * First zero the tail of the on-disk L1 table, so that no stale L1
     * entries point at the L2 tables we are about to free.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
                                       new_l1_size * sizeof(uint64_t),
                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
    if (ret < 0) {
        goto fail;
    }

    /* Make sure the zeroed tail hits the disk before freeing clusters */
    ret = bdrv_flush(bs->file->bs);
    if (ret < 0) {
        goto fail;
    }

    /*
     * Only now is it safe to release the L2 tables of the dropped
     * entries; iterate from the end down to the new table size.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
            continue;
        }
        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
        s->l1_table[i] = 0;
    }
    return 0;

fail:
    /*
     * If the write in the l1_table failed the image may contain a partially
     * overwritten l1_table. In this case it would be better to clear the
     * l1_table in memory to avoid possible image corruption.
     */
    memset(s->l1_table + new_l1_size, 0,
           (s->l1_size - new_l1_size) * sizeof(uint64_t));
    return ret;
}
8246b732cdSPavel Butsykin 
/*
 * qcow2_grow_l1_table
 *
 * Grow the active L1 table so it holds at least @min_size entries.
 * If @exact_size is true the table is grown to exactly @min_size
 * entries; otherwise the size is rounded up (x1.5 growth) to reduce
 * the number of future grow operations.
 *
 * The new table is written to freshly allocated clusters, the image
 * header is updated to point at it, and only then is the old table
 * freed — so a crash at any point leaves a consistent image.
 *
 * Returns 0 on success, -errno on failure.
 */
int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                        bool exact_size)
{
    BDRVQcow2State *s = bs->opaque;
    int new_l1_size2, ret, i;
    uint64_t *new_l1_table;
    int64_t old_l1_table_offset, old_l1_size;
    int64_t new_l1_table_offset, new_l1_size;
    uint8_t data[12];

    if (min_size <= s->l1_size)
        return 0;

    /* Do a sanity check on min_size before trying to calculate new_l1_size
     * (this prevents overflows during the while loop for the calculation of
     * new_l1_size) */
    if (min_size > INT_MAX / sizeof(uint64_t)) {
        return -EFBIG;
    }

    if (exact_size) {
        new_l1_size = min_size;
    } else {
        /* Bump size up to reduce the number of times we have to grow */
        new_l1_size = s->l1_size;
        if (new_l1_size == 0) {
            new_l1_size = 1;
        }
        while (min_size > new_l1_size) {
            new_l1_size = DIV_ROUND_UP(new_l1_size * 3, 2);
        }
    }

    QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX);
    if (new_l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
            s->l1_size, new_l1_size);
#endif

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_try_blockalign(bs->file->bs, new_l1_size2);
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
    memset(new_l1_table, 0, new_l1_size2);

    if (s->l1_size) {
        /* Copy the existing entries; the tail stays zero (unallocated) */
        memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
    }

    /* write new table (align to cluster) */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
    if (new_l1_table_offset < 0) {
        qemu_vfree(new_l1_table);
        return new_l1_table_offset;
    }

    /* Flush refcounts so the allocation above is stable on disk */
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* the L1 position has not yet been updated, so these clusters must
     * indeed be completely free */
    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
                                        new_l1_size2, false);
    if (ret < 0) {
        goto fail;
    }

    /*
     * Byteswap the table in place for the write, then swap it back so
     * the in-memory copy stays in host byte order.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
                           new_l1_table, new_l1_size2);
    if (ret < 0)
        goto fail;
    for(i = 0; i < s->l1_size; i++)
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);

    /* set new table */
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
    /* Header fields l1_size (4 bytes) and l1_table_offset (8 bytes) are
     * adjacent, so both are updated with a single 12-byte write. */
    stl_be_p(data, new_l1_size);
    stq_be_p(data + 4, new_l1_table_offset);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
                           data, sizeof(data));
    if (ret < 0) {
        goto fail;
    }
    /* The header now points at the new table; retire the old one */
    qemu_vfree(s->l1_table);
    old_l1_table_offset = s->l1_table_offset;
    s->l1_table_offset = new_l1_table_offset;
    s->l1_table = new_l1_table;
    old_l1_size = s->l1_size;
    s->l1_size = new_l1_size;
    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);
    return 0;
 fail:
    qemu_vfree(new_l1_table);
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
                        QCOW2_DISCARD_OTHER);
    return ret;
}
19245aba42fSKevin Wolf 
19345aba42fSKevin Wolf /*
19445aba42fSKevin Wolf  * l2_load
19545aba42fSKevin Wolf  *
196e2b5713eSAlberto Garcia  * @bs: The BlockDriverState
197e2b5713eSAlberto Garcia  * @offset: A guest offset, used to calculate what slice of the L2
198e2b5713eSAlberto Garcia  *          table to load.
199e2b5713eSAlberto Garcia  * @l2_offset: Offset to the L2 table in the image file.
200e2b5713eSAlberto Garcia  * @l2_slice: Location to store the pointer to the L2 slice.
20145aba42fSKevin Wolf  *
202e2b5713eSAlberto Garcia  * Loads a L2 slice into memory (L2 slices are the parts of L2 tables
203e2b5713eSAlberto Garcia  * that are loaded by the qcow2 cache). If the slice is in the cache,
204e2b5713eSAlberto Garcia  * the cache is used; otherwise the L2 slice is loaded from the image
205e2b5713eSAlberto Garcia  * file.
20645aba42fSKevin Wolf  */
207e2b5713eSAlberto Garcia static int l2_load(BlockDriverState *bs, uint64_t offset,
208e2b5713eSAlberto Garcia                    uint64_t l2_offset, uint64_t **l2_slice)
20945aba42fSKevin Wolf {
210ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
211c8fd8554SAlberto Garcia     int start_of_slice = l2_entry_size(s) *
212e2b5713eSAlberto Garcia         (offset_to_l2_index(s, offset) - offset_to_l2_slice_index(s, offset));
21345aba42fSKevin Wolf 
214e2b5713eSAlberto Garcia     return qcow2_cache_get(bs, s->l2_table_cache, l2_offset + start_of_slice,
215e2b5713eSAlberto Garcia                            (void **)l2_slice);
21655c17e98SKevin Wolf }
21755c17e98SKevin Wolf 
21845aba42fSKevin Wolf /*
219da86f8cbSAlberto Garcia  * Writes an L1 entry to disk (note that depending on the alignment
220da86f8cbSAlberto Garcia  * requirements this function may write more that just one entry in
221da86f8cbSAlberto Garcia  * order to prevent bdrv_pwrite from performing a read-modify-write)
2226583e3c7SKevin Wolf  */
223e23e400eSMax Reitz int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
2246583e3c7SKevin Wolf {
225ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
2266583e3c7SKevin Wolf     int l1_start_index;
227f7defcb6SKevin Wolf     int i, ret;
228da86f8cbSAlberto Garcia     int bufsize = MAX(sizeof(uint64_t),
229da86f8cbSAlberto Garcia                       MIN(bs->file->bs->bl.request_alignment, s->cluster_size));
230da86f8cbSAlberto Garcia     int nentries = bufsize / sizeof(uint64_t);
231da86f8cbSAlberto Garcia     g_autofree uint64_t *buf = g_try_new0(uint64_t, nentries);
2326583e3c7SKevin Wolf 
233da86f8cbSAlberto Garcia     if (buf == NULL) {
234da86f8cbSAlberto Garcia         return -ENOMEM;
235da86f8cbSAlberto Garcia     }
236da86f8cbSAlberto Garcia 
237da86f8cbSAlberto Garcia     l1_start_index = QEMU_ALIGN_DOWN(l1_index, nentries);
238da86f8cbSAlberto Garcia     for (i = 0; i < MIN(nentries, s->l1_size - l1_start_index); i++) {
2396583e3c7SKevin Wolf         buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
2406583e3c7SKevin Wolf     }
2416583e3c7SKevin Wolf 
242231bb267SMax Reitz     ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
243da86f8cbSAlberto Garcia             s->l1_table_offset + 8 * l1_start_index, bufsize, false);
244cf93980eSMax Reitz     if (ret < 0) {
245cf93980eSMax Reitz         return ret;
246cf93980eSMax Reitz     }
247cf93980eSMax Reitz 
24866f82ceeSKevin Wolf     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
249d9ca2ea2SKevin Wolf     ret = bdrv_pwrite_sync(bs->file,
2509a4f4c31SKevin Wolf                            s->l1_table_offset + 8 * l1_start_index,
251da86f8cbSAlberto Garcia                            buf, bufsize);
252f7defcb6SKevin Wolf     if (ret < 0) {
253f7defcb6SKevin Wolf         return ret;
2546583e3c7SKevin Wolf     }
2556583e3c7SKevin Wolf 
2566583e3c7SKevin Wolf     return 0;
2576583e3c7SKevin Wolf }
2586583e3c7SKevin Wolf 
/*
 * l2_allocate
 *
 * Allocate a new l2 entry in the file. If l1_index points to an already
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
 * table) copy the contents of the old L2 table into the newly allocated one.
 * Otherwise the new table is initialized with zeros.
 *
 * Returns 0 on success, -errno on failure.  On failure the L1 entry is
 * restored to its previous value and the newly allocated clusters are
 * freed again.
 */

static int l2_allocate(BlockDriverState *bs, int l1_index)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t old_l2_offset;
    uint64_t *l2_slice = NULL;
    unsigned slice, slice_size2, n_slices;
    int64_t l2_offset;
    int ret;

    old_l2_offset = s->l1_table[l1_index];

    trace_qcow2_l2_allocate(bs, l1_index);

    /* allocate a new l2 entry */

    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * l2_entry_size(s));
    if (l2_offset < 0) {
        ret = l2_offset;
        goto fail;
    }

    /* The offset must fit in the offset field of the L1 table entry */
    assert((l2_offset & L1E_OFFSET_MASK) == l2_offset);

    /* If we're allocating the table at offset 0 then something is wrong */
    if (l2_offset == 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
                                "allocation of L2 table at offset 0");
        ret = -EIO;
        goto fail;
    }

    /* Make sure the refcount of the new clusters is stable on disk
     * before any L2 data referencing them is written */
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail;
    }

    /* allocate a new entry in the l2 cache */

    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

    /* Initialize the new L2 table slice by slice, either zeroed or as a
     * copy of the old table (COW of the L2 table itself) */
    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
    for (slice = 0; slice < n_slices; slice++) {
        ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
                                    l2_offset + slice * slice_size2,
                                    (void **) &l2_slice);
        if (ret < 0) {
            goto fail;
        }

        if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
            /* if there was no old l2 table, clear the new slice */
            memset(l2_slice, 0, slice_size2);
        } else {
            uint64_t *old_slice;
            uint64_t old_l2_slice_offset =
                (old_l2_offset & L1E_OFFSET_MASK) + slice * slice_size2;

            /* if there was an old l2 table, read a slice from the disk */
            BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
            ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_slice_offset,
                                  (void **) &old_slice);
            if (ret < 0) {
                goto fail;
            }

            memcpy(l2_slice, old_slice, slice_size2);

            qcow2_cache_put(s->l2_table_cache, (void **) &old_slice);
        }

        /* write the l2 slice to the file */
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);

        trace_qcow2_l2_allocate_write_l2(bs, l1_index);
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }

    /* Flush the whole new table to disk before pointing the L1 at it */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    /* update the L1 entry */
    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
    ret = qcow2_write_l1_entry(bs, l1_index);
    if (ret < 0) {
        goto fail;
    }

    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
    return 0;

fail:
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
    if (l2_slice != NULL) {
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    }
    /* Roll back the in-memory L1 entry and release the new clusters */
    s->l1_table[l1_index] = old_l2_offset;
    if (l2_offset > 0) {
        qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s),
                            QCOW2_DISCARD_ALWAYS);
    }
    return ret;
}
37745aba42fSKevin Wolf 
3782bfcc4a0SKevin Wolf /*
37970d1cbaeSAlberto Garcia  * For a given L2 entry, count the number of contiguous subclusters of
38070d1cbaeSAlberto Garcia  * the same type starting from @sc_from. Compressed clusters are
38170d1cbaeSAlberto Garcia  * treated as if they were divided into subclusters of size
38270d1cbaeSAlberto Garcia  * s->subcluster_size.
38370d1cbaeSAlberto Garcia  *
38470d1cbaeSAlberto Garcia  * Return the number of contiguous subclusters and set @type to the
38570d1cbaeSAlberto Garcia  * subcluster type.
38670d1cbaeSAlberto Garcia  *
38770d1cbaeSAlberto Garcia  * If the L2 entry is invalid return -errno and set @type to
38870d1cbaeSAlberto Garcia  * QCOW2_SUBCLUSTER_INVALID.
38970d1cbaeSAlberto Garcia  */
39070d1cbaeSAlberto Garcia static int qcow2_get_subcluster_range_type(BlockDriverState *bs,
39170d1cbaeSAlberto Garcia                                            uint64_t l2_entry,
39270d1cbaeSAlberto Garcia                                            uint64_t l2_bitmap,
39370d1cbaeSAlberto Garcia                                            unsigned sc_from,
39470d1cbaeSAlberto Garcia                                            QCow2SubclusterType *type)
39570d1cbaeSAlberto Garcia {
39670d1cbaeSAlberto Garcia     BDRVQcow2State *s = bs->opaque;
39770d1cbaeSAlberto Garcia     uint32_t val;
39870d1cbaeSAlberto Garcia 
39970d1cbaeSAlberto Garcia     *type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_from);
40070d1cbaeSAlberto Garcia 
40170d1cbaeSAlberto Garcia     if (*type == QCOW2_SUBCLUSTER_INVALID) {
40270d1cbaeSAlberto Garcia         return -EINVAL;
40370d1cbaeSAlberto Garcia     } else if (!has_subclusters(s) || *type == QCOW2_SUBCLUSTER_COMPRESSED) {
40470d1cbaeSAlberto Garcia         return s->subclusters_per_cluster - sc_from;
40570d1cbaeSAlberto Garcia     }
40670d1cbaeSAlberto Garcia 
40770d1cbaeSAlberto Garcia     switch (*type) {
40870d1cbaeSAlberto Garcia     case QCOW2_SUBCLUSTER_NORMAL:
40970d1cbaeSAlberto Garcia         val = l2_bitmap | QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from);
41070d1cbaeSAlberto Garcia         return cto32(val) - sc_from;
41170d1cbaeSAlberto Garcia 
41270d1cbaeSAlberto Garcia     case QCOW2_SUBCLUSTER_ZERO_PLAIN:
41370d1cbaeSAlberto Garcia     case QCOW2_SUBCLUSTER_ZERO_ALLOC:
41470d1cbaeSAlberto Garcia         val = (l2_bitmap | QCOW_OFLAG_SUB_ZERO_RANGE(0, sc_from)) >> 32;
41570d1cbaeSAlberto Garcia         return cto32(val) - sc_from;
41670d1cbaeSAlberto Garcia 
41770d1cbaeSAlberto Garcia     case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
41870d1cbaeSAlberto Garcia     case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
41970d1cbaeSAlberto Garcia         val = ((l2_bitmap >> 32) | l2_bitmap)
42070d1cbaeSAlberto Garcia             & ~QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from);
42170d1cbaeSAlberto Garcia         return ctz32(val) - sc_from;
42270d1cbaeSAlberto Garcia 
42370d1cbaeSAlberto Garcia     default:
42470d1cbaeSAlberto Garcia         g_assert_not_reached();
42570d1cbaeSAlberto Garcia     }
42670d1cbaeSAlberto Garcia }
42770d1cbaeSAlberto Garcia 
/*
 * Return the number of contiguous subclusters of the exact same type
 * in a given L2 slice, starting from cluster @l2_index, subcluster
 * @sc_index. Allocated subclusters are required to be contiguous in
 * the image file.
 * At most @nb_clusters are checked (note that this means clusters,
 * not subclusters).
 * Compressed clusters are always processed one by one but for the
 * purpose of this count they are treated as if they were divided into
 * subclusters of size s->subcluster_size.
 * On failure return -errno and update @l2_index to point to the
 * invalid entry.
 */
static int count_contiguous_subclusters(BlockDriverState *bs, int nb_clusters,
                                        unsigned sc_index, uint64_t *l2_slice,
                                        unsigned *l2_index)
{
    BDRVQcow2State *s = bs->opaque;
    int i, count = 0;
    /* check_offset is true for subcluster types with backing storage;
     * then each cluster's host offset must follow the previous one. */
    bool check_offset = false;
    uint64_t expected_offset = 0;
    QCow2SubclusterType expected_type = QCOW2_SUBCLUSTER_NORMAL, type;

    assert(*l2_index + nb_clusters <= s->l2_slice_size);

    for (i = 0; i < nb_clusters; i++) {
        /* Only the first cluster starts at sc_index; the rest start at 0 */
        unsigned first_sc = (i == 0) ? sc_index : 0;
        uint64_t l2_entry = get_l2_entry(s, l2_slice, *l2_index + i);
        uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, *l2_index + i);
        int ret = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap,
                                                  first_sc, &type);
        if (ret < 0) {
            *l2_index += i; /* Point to the invalid entry */
            return -EIO;
        }
        if (i == 0) {
            if (type == QCOW2_SUBCLUSTER_COMPRESSED) {
                /* Compressed clusters are always processed one by one */
                return ret;
            }
            /* The first cluster fixes the type (and host offset, if any)
             * that all following clusters must match */
            expected_type = type;
            expected_offset = l2_entry & L2E_OFFSET_MASK;
            check_offset = (type == QCOW2_SUBCLUSTER_NORMAL ||
                            type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
                            type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC);
        } else if (type != expected_type) {
            break;
        } else if (check_offset) {
            expected_offset += s->cluster_size;
            if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
                break;
            }
        }
        count += ret;
        /* Stop if there are type changes before the end of the cluster */
        if (first_sc + ret < s->subclusters_per_cluster) {
            break;
        }
    }

    return count;
}
49045aba42fSKevin Wolf 
491672f0f2cSAlberto Garcia static int coroutine_fn do_perform_cow_read(BlockDriverState *bs,
492aaa4d20bSKevin Wolf                                             uint64_t src_cluster_offset,
493e034f5bcSAlberto Garcia                                             unsigned offset_in_cluster,
49486b862c4SAlberto Garcia                                             QEMUIOVector *qiov)
49545aba42fSKevin Wolf {
496aaa4d20bSKevin Wolf     int ret;
4971b9f1491SKevin Wolf 
49886b862c4SAlberto Garcia     if (qiov->size == 0) {
49999450c6fSAlberto Garcia         return 0;
50099450c6fSAlberto Garcia     }
50199450c6fSAlberto Garcia 
50266f82ceeSKevin Wolf     BLKDBG_EVENT(bs->file, BLKDBG_COW_READ);
503aef4acb6SStefan Hajnoczi 
504dba28555SMax Reitz     if (!bs->drv) {
505672f0f2cSAlberto Garcia         return -ENOMEDIUM;
506dba28555SMax Reitz     }
507dba28555SMax Reitz 
508aef4acb6SStefan Hajnoczi     /* Call .bdrv_co_readv() directly instead of using the public block-layer
509aef4acb6SStefan Hajnoczi      * interface.  This avoids double I/O throttling and request tracking,
510aef4acb6SStefan Hajnoczi      * which can lead to deadlock when block layer copy-on-read is enabled.
511aef4acb6SStefan Hajnoczi      */
512df893d25SVladimir Sementsov-Ogievskiy     ret = bs->drv->bdrv_co_preadv_part(bs,
513df893d25SVladimir Sementsov-Ogievskiy                                        src_cluster_offset + offset_in_cluster,
514df893d25SVladimir Sementsov-Ogievskiy                                        qiov->size, qiov, 0, 0);
5151b9f1491SKevin Wolf     if (ret < 0) {
516672f0f2cSAlberto Garcia         return ret;
5171b9f1491SKevin Wolf     }
5181b9f1491SKevin Wolf 
519672f0f2cSAlberto Garcia     return 0;
520672f0f2cSAlberto Garcia }
521672f0f2cSAlberto Garcia 
522672f0f2cSAlberto Garcia static int coroutine_fn do_perform_cow_write(BlockDriverState *bs,
523672f0f2cSAlberto Garcia                                              uint64_t cluster_offset,
524672f0f2cSAlberto Garcia                                              unsigned offset_in_cluster,
52586b862c4SAlberto Garcia                                              QEMUIOVector *qiov)
526672f0f2cSAlberto Garcia {
527966b000fSKevin Wolf     BDRVQcow2State *s = bs->opaque;
528672f0f2cSAlberto Garcia     int ret;
529672f0f2cSAlberto Garcia 
53086b862c4SAlberto Garcia     if (qiov->size == 0) {
531672f0f2cSAlberto Garcia         return 0;
532672f0f2cSAlberto Garcia     }
533672f0f2cSAlberto Garcia 
534231bb267SMax Reitz     ret = qcow2_pre_write_overlap_check(bs, 0,
535966b000fSKevin Wolf             cluster_offset + offset_in_cluster, qiov->size, true);
536cf93980eSMax Reitz     if (ret < 0) {
537672f0f2cSAlberto Garcia         return ret;
538cf93980eSMax Reitz     }
539cf93980eSMax Reitz 
54066f82ceeSKevin Wolf     BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
541966b000fSKevin Wolf     ret = bdrv_co_pwritev(s->data_file, cluster_offset + offset_in_cluster,
54286b862c4SAlberto Garcia                           qiov->size, qiov, 0);
5431b9f1491SKevin Wolf     if (ret < 0) {
544672f0f2cSAlberto Garcia         return ret;
5451b9f1491SKevin Wolf     }
5461b9f1491SKevin Wolf 
547672f0f2cSAlberto Garcia     return 0;
54845aba42fSKevin Wolf }
54945aba42fSKevin Wolf 
55045aba42fSKevin Wolf 
/*
 * qcow2_get_host_offset
 *
 * For a given offset of the virtual disk find the equivalent host
 * offset in the qcow2 file and store it in *host_offset. Neither
 * offset needs to be aligned to a cluster boundary.
 *
 * If the cluster is unallocated then *host_offset will be 0.
 * If the cluster is compressed then *host_offset will contain the
 * complete compressed cluster descriptor.
 *
 * On entry, *bytes is the maximum number of contiguous bytes starting at
 * offset that we are interested in.
 *
 * On exit, *bytes is the number of bytes starting at offset that have the same
 * subcluster type and (if applicable) are stored contiguously in the image
 * file. The subcluster type is stored in *subcluster_type.
 * Compressed clusters are always processed one by one.
 *
 * On-disk inconsistencies (unaligned L2/cluster offsets, zero clusters in
 * pre-v3 images, compressed clusters with an external data file, invalid
 * entries) are reported via qcow2_signal_corruption() and fail with -EIO.
 *
 * Returns 0 on success, -errno in error cases.
 */
int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset,
                          unsigned int *bytes, uint64_t *host_offset,
                          QCow2SubclusterType *subcluster_type)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index, sc_index;
    uint64_t l1_index, l2_offset, *l2_slice, l2_entry, l2_bitmap;
    int sc;
    unsigned int offset_in_cluster;
    uint64_t bytes_available, bytes_needed, nb_clusters;
    QCow2SubclusterType type;
    int ret;

    offset_in_cluster = offset_into_cluster(s, offset);
    bytes_needed = (uint64_t) *bytes + offset_in_cluster;

    /* compute how many bytes there are between the start of the cluster
     * containing offset and the end of the l2 slice that contains
     * the entry pointing to it */
    bytes_available =
        ((uint64_t) (s->l2_slice_size - offset_to_l2_slice_index(s, offset)))
        << s->cluster_bits;

    /* We never look beyond one L2 slice in a single call */
    if (bytes_needed > bytes_available) {
        bytes_needed = bytes_available;
    }

    *host_offset = 0;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset_to_l1_index(s, offset);
    if (l1_index >= s->l1_size) {
        /* Beyond the end of the L1 table: trivially unallocated */
        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
        goto out;
    }

    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    if (!l2_offset) {
        /* No L2 table for this range: unallocated */
        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
        goto out;
    }

    /* Corruption check: L2 tables must be cluster-aligned on disk */
    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* load the l2 slice in memory */

    ret = l2_load(bs, offset, l2_offset, &l2_slice);
    if (ret < 0) {
        /* Nothing acquired yet, so we can return directly (no cache put) */
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = offset_to_l2_slice_index(s, offset);
    sc_index = offset_to_sc_index(s, offset);
    l2_entry = get_l2_entry(s, l2_slice, l2_index);
    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);

    nb_clusters = size_to_clusters(s, bytes_needed);
    /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
     * integers; the minimum cluster size is 512, so this assertion is always
     * true */
    assert(nb_clusters <= INT_MAX);

    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
    /* Zero clusters only exist since qcow2 v3; finding one earlier means
     * the image is corrupted */
    if (s->qcow_version < 3 && (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
                                type == QCOW2_SUBCLUSTER_ZERO_ALLOC)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
                                " in pre-v3 image (L2 offset: %#" PRIx64
                                ", L2 index: %#x)", l2_offset, l2_index);
        ret = -EIO;
        goto fail;
    }
    switch (type) {
    case QCOW2_SUBCLUSTER_INVALID:
        break; /* This is handled by count_contiguous_subclusters() below */
    case QCOW2_SUBCLUSTER_COMPRESSED:
        /* Compressed clusters are incompatible with external data files */
        if (has_data_file(bs)) {
            qcow2_signal_corruption(bs, true, -1, -1, "Compressed cluster "
                                    "entry found in image with external data "
                                    "file (L2 offset: %#" PRIx64 ", L2 index: "
                                    "%#x)", l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        /* Return the full compressed cluster descriptor, not a byte offset */
        *host_offset = l2_entry & L2E_COMPRESSED_OFFSET_SIZE_MASK;
        break;
    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
        /* No host data backing these; *host_offset stays 0 */
        break;
    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
    case QCOW2_SUBCLUSTER_NORMAL:
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: {
        uint64_t host_cluster_offset = l2_entry & L2E_OFFSET_MASK;
        *host_offset = host_cluster_offset + offset_in_cluster;
        /* Corruption check: data clusters must be cluster-aligned */
        if (offset_into_cluster(s, host_cluster_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1,
                                    "Cluster allocation offset %#"
                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
                                    ", L2 index: %#x)", host_cluster_offset,
                                    l2_offset, l2_index);
            ret = -EIO;
            goto fail;
        }
        /* With an external data file, guest and host offsets must match 1:1 */
        if (has_data_file(bs) && *host_offset != offset) {
            qcow2_signal_corruption(bs, true, -1, -1,
                                    "External data file host cluster offset %#"
                                    PRIx64 " does not match guest cluster "
                                    "offset: %#" PRIx64
                                    ", L2 index: %#x)", host_cluster_offset,
                                    offset - offset_in_cluster, l2_index);
            ret = -EIO;
            goto fail;
        }
        break;
    }
    default:
        abort();
    }

    sc = count_contiguous_subclusters(bs, nb_clusters, sc_index,
                                      l2_slice, &l2_index);
    if (sc < 0) {
        qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster entry found "
                                " (L2 offset: %#" PRIx64 ", L2 index: %#x)",
                                l2_offset, l2_index);
        ret = -EIO;
        goto fail;
    }
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    /* Contiguous extent measured from the start of the first subcluster */
    bytes_available = ((int64_t)sc + sc_index) << s->subcluster_bits;

out:
    if (bytes_available > bytes_needed) {
        bytes_available = bytes_needed;
    }

    /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster;
     * subtracting offset_in_cluster will therefore definitely yield something
     * not exceeding UINT_MAX */
    assert(bytes_available - offset_in_cluster <= UINT_MAX);
    *bytes = bytes_available - offset_in_cluster;

    *subcluster_type = type;

    return 0;

fail:
    qcow2_cache_put(s->l2_table_cache, (void **)&l2_slice);
    return ret;
}
73045aba42fSKevin Wolf 
/*
 * get_cluster_table
 *
 * For a given disk offset, load (and allocate if needed)
 * the appropriate slice of its L2 table.
 *
 * Grows the L1 table and allocates a fresh L2 table (doing COW of the old
 * one) as required, so on success the returned slice is always writable.
 * The cluster index in the L2 slice is given to the caller through
 * *new_l2_index; the slice itself through *new_l2_slice (the caller must
 * release it with qcow2_cache_put() when done).
 *
 * Returns 0 on success, -errno in failure case
 */
static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                             uint64_t **new_l2_slice,
                             int *new_l2_index)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int l2_index;
    uint64_t l1_index, l2_offset;
    uint64_t *l2_slice = NULL;
    int ret;

    /* seek to the l2 offset in the l1 table */

    l1_index = offset_to_l1_index(s, offset);
    if (l1_index >= s->l1_size) {
        /* The L1 table is too small for this offset; enlarge it first */
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
        if (ret < 0) {
            return ret;
        }
    }

    assert(l1_index < s->l1_size);
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
    /* Corruption check: L2 tables must be cluster-aligned on disk */
    if (offset_into_cluster(s, l2_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
                                " unaligned (L1 index: %#" PRIx64 ")",
                                l2_offset, l1_index);
        return -EIO;
    }

    /* Without OFLAG_COPIED the L2 table may be shared (refcount > 1) and
     * must not be written to in place */
    if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) {
        /* First allocate a new L2 table (and do COW if needed) */
        ret = l2_allocate(bs, l1_index);
        if (ret < 0) {
            return ret;
        }

        /* Then decrease the refcount of the old table */
        if (l2_offset) {
            qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s),
                                QCOW2_DISCARD_OTHER);
        }

        /* Get the offset of the newly-allocated l2 table */
        l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
        assert(offset_into_cluster(s, l2_offset) == 0);
    }

    /* load the l2 slice in memory */
    ret = l2_load(bs, offset, l2_offset, &l2_slice);
    if (ret < 0) {
        return ret;
    }

    /* find the cluster offset for the given disk offset */

    l2_index = offset_to_l2_slice_index(s, offset);

    *new_l2_slice = l2_slice;
    *new_l2_index = l2_index;

    return 0;
}
80345aba42fSKevin Wolf 
80445aba42fSKevin Wolf /*
80545aba42fSKevin Wolf  * alloc_compressed_cluster_offset
80645aba42fSKevin Wolf  *
80777e023ffSKevin Wolf  * For a given offset on the virtual disk, allocate a new compressed cluster
80877e023ffSKevin Wolf  * and put the host offset of the cluster into *host_offset. If a cluster is
80977e023ffSKevin Wolf  * already allocated at the offset, return an error.
81045aba42fSKevin Wolf  *
81177e023ffSKevin Wolf  * Return 0 on success and -errno in error cases
81245aba42fSKevin Wolf  */
81377e023ffSKevin Wolf int qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
81445aba42fSKevin Wolf                                           uint64_t offset,
81577e023ffSKevin Wolf                                           int compressed_size,
81677e023ffSKevin Wolf                                           uint64_t *host_offset)
81745aba42fSKevin Wolf {
818ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
81945aba42fSKevin Wolf     int l2_index, ret;
820e4e72548SAlberto Garcia     uint64_t *l2_slice;
821f4f0d391SKevin Wolf     int64_t cluster_offset;
82245aba42fSKevin Wolf     int nb_csectors;
82345aba42fSKevin Wolf 
824966b000fSKevin Wolf     if (has_data_file(bs)) {
825966b000fSKevin Wolf         return 0;
826966b000fSKevin Wolf     }
827966b000fSKevin Wolf 
828e4e72548SAlberto Garcia     ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
8291e3e8f1aSKevin Wolf     if (ret < 0) {
83077e023ffSKevin Wolf         return ret;
8311e3e8f1aSKevin Wolf     }
83245aba42fSKevin Wolf 
833b0b6862eSKevin Wolf     /* Compression can't overwrite anything. Fail if the cluster was already
834b0b6862eSKevin Wolf      * allocated. */
83512c6aebeSAlberto Garcia     cluster_offset = get_l2_entry(s, l2_slice, l2_index);
836b0b6862eSKevin Wolf     if (cluster_offset & L2E_OFFSET_MASK) {
837e4e72548SAlberto Garcia         qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
83877e023ffSKevin Wolf         return -EIO;
8398f1efd00SKevin Wolf     }
84045aba42fSKevin Wolf 
841ed6ccf0fSKevin Wolf     cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
8425d757b56SKevin Wolf     if (cluster_offset < 0) {
843e4e72548SAlberto Garcia         qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
84477e023ffSKevin Wolf         return cluster_offset;
8455d757b56SKevin Wolf     }
8465d757b56SKevin Wolf 
847b6c24694SAlberto Garcia     nb_csectors =
848b6c24694SAlberto Garcia         (cluster_offset + compressed_size - 1) / QCOW2_COMPRESSED_SECTOR_SIZE -
849b6c24694SAlberto Garcia         (cluster_offset / QCOW2_COMPRESSED_SECTOR_SIZE);
85045aba42fSKevin Wolf 
8513a75a870SAlberto Garcia     /* The offset and size must fit in their fields of the L2 table entry */
8523a75a870SAlberto Garcia     assert((cluster_offset & s->cluster_offset_mask) == cluster_offset);
8533a75a870SAlberto Garcia     assert((nb_csectors & s->csize_mask) == nb_csectors);
8543a75a870SAlberto Garcia 
85545aba42fSKevin Wolf     cluster_offset |= QCOW_OFLAG_COMPRESSED |
85645aba42fSKevin Wolf                       ((uint64_t)nb_csectors << s->csize_shift);
85745aba42fSKevin Wolf 
85845aba42fSKevin Wolf     /* update L2 table */
85945aba42fSKevin Wolf 
86045aba42fSKevin Wolf     /* compressed clusters never have the copied flag */
86145aba42fSKevin Wolf 
86266f82ceeSKevin Wolf     BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
863e4e72548SAlberto Garcia     qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
86412c6aebeSAlberto Garcia     set_l2_entry(s, l2_slice, l2_index, cluster_offset);
865ff4cdec7SAlberto Garcia     if (has_subclusters(s)) {
866ff4cdec7SAlberto Garcia         set_l2_bitmap(s, l2_slice, l2_index, 0);
867ff4cdec7SAlberto Garcia     }
868e4e72548SAlberto Garcia     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
86945aba42fSKevin Wolf 
87077e023ffSKevin Wolf     *host_offset = cluster_offset & s->cluster_offset_mask;
87177e023ffSKevin Wolf     return 0;
87245aba42fSKevin Wolf }
87345aba42fSKevin Wolf 
/*
 * Perform the copy-on-write reads and writes for a cluster allocation.
 *
 * m describes the allocation; m->cow_start and m->cow_end are the two
 * regions of the old cluster that the guest did not overwrite and that
 * therefore must be copied into the new cluster. If m->data_qiov is set,
 * the guest data between the two regions is written out in the same
 * operation. Encrypts the COW data first when the image is encrypted.
 *
 * Called with s->lock held; the lock is dropped around the actual I/O and
 * reacquired before returning.
 *
 * Returns 0 on success, -errno on failure.
 */
static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2COWRegion *start = &m->cow_start;
    Qcow2COWRegion *end = &m->cow_end;
    unsigned buffer_size;
    /* Guest-written bytes between the two COW regions */
    unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
    bool merge_reads;
    uint8_t *start_buffer, *end_buffer;
    QEMUIOVector qiov;
    int ret;

    /* Guard the unsigned arithmetic below against overflow */
    assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
    assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
    assert(start->offset + start->nb_bytes <= end->offset);

    /* Nothing to copy, or the caller determined COW can be skipped */
    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
        return 0;
    }

    /* If we have to read both the start and end COW regions and the
     * middle region is not too large then perform just one read
     * operation */
    merge_reads = start->nb_bytes && end->nb_bytes && data_bytes <= 16384;
    if (merge_reads) {
        buffer_size = start->nb_bytes + data_bytes + end->nb_bytes;
    } else {
        /* If we have to do two reads, add some padding in the middle
         * if necessary to make sure that the end region is optimally
         * aligned. */
        size_t align = bdrv_opt_mem_align(bs);
        assert(align > 0 && align <= UINT_MAX);
        assert(QEMU_ALIGN_UP(start->nb_bytes, align) <=
               UINT_MAX - end->nb_bytes);
        buffer_size = QEMU_ALIGN_UP(start->nb_bytes, align) + end->nb_bytes;
    }

    /* Reserve a buffer large enough to store all the data that we're
     * going to read */
    start_buffer = qemu_try_blockalign(bs, buffer_size);
    if (start_buffer == NULL) {
        return -ENOMEM;
    }
    /* The part of the buffer where the end region is located */
    end_buffer = start_buffer + buffer_size - end->nb_bytes;

    /* Worst case: two COW regions plus the subvector of the guest qiov */
    qemu_iovec_init(&qiov, 2 + (m->data_qiov ?
                                qemu_iovec_subvec_niov(m->data_qiov,
                                                       m->data_qiov_offset,
                                                       data_bytes)
                                : 0));

    /* Drop the lock for the duration of the I/O; concurrent requests may
     * run while we read/write (reacquired at 'fail' below) */
    qemu_co_mutex_unlock(&s->lock);
    /* First we read the existing data from both COW regions. We
     * either read the whole region in one go, or the start and end
     * regions separately. */
    if (merge_reads) {
        qemu_iovec_add(&qiov, start_buffer, buffer_size);
        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
    } else {
        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
        if (ret < 0) {
            goto fail;
        }

        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov);
    }
    if (ret < 0) {
        goto fail;
    }

    /* Encrypt the data if necessary before writing it */
    if (bs->encrypted) {
        ret = qcow2_co_encrypt(bs,
                               m->alloc_offset + start->offset,
                               m->offset + start->offset,
                               start_buffer, start->nb_bytes);
        if (ret < 0) {
            goto fail;
        }

        ret = qcow2_co_encrypt(bs,
                               m->alloc_offset + end->offset,
                               m->offset + end->offset,
                               end_buffer, end->nb_bytes);
        if (ret < 0) {
            goto fail;
        }
    }

    /* And now we can write everything. If we have the guest data we
     * can write everything in one single operation */
    if (m->data_qiov) {
        qemu_iovec_reset(&qiov);
        if (start->nb_bytes) {
            qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        }
        qemu_iovec_concat(&qiov, m->data_qiov, m->data_qiov_offset, data_bytes);
        if (end->nb_bytes) {
            qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        }
        /* NOTE: we have a write_aio blkdebug event here followed by
         * a cow_write one in do_perform_cow_write(), but there's only
         * one single I/O operation */
        BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
    } else {
        /* If there's no guest data then write both COW regions separately */
        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
        if (ret < 0) {
            goto fail;
        }

        qemu_iovec_reset(&qiov);
        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
        ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
    }

fail:
    /* Also reached on success: reacquire the lock in either case */
    qemu_co_mutex_lock(&s->lock);

    /*
     * Before we update the L2 table to actually point to the new cluster, we
     * need to be sure that the refcounts have been increased and COW was
     * handled.
     */
    if (ret == 0) {
        qcow2_cache_depends_on_flush(s->l2_table_cache);
    }

    qemu_vfree(start_buffer);
    qemu_iovec_destroy(&qiov);
    return ret;
}
1013593fb83cSKevin Wolf 
/*
 * Link newly-allocated data clusters into the L2 table.
 *
 * Performs the COW of the unmodified parts (via perform_cow()), points the
 * L2 entries of the allocation described by m at the new clusters (setting
 * QCOW_OFLAG_COPIED), updates the subcluster allocation bitmap if the image
 * has subclusters, and finally releases any clusters that were previously
 * mapped at these guest offsets (unless m->keep_old_clusters is set).
 *
 * Returns 0 on success, -errno on failure.
 */
int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
{
    BDRVQcow2State *s = bs->opaque;
    int i, j = 0, l2_index, ret;
    uint64_t *old_cluster, *l2_slice;
    uint64_t cluster_offset = m->alloc_offset;

    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
    assert(m->nb_clusters > 0);

    /* Collects the previous L2 entries so they can be freed afterwards */
    old_cluster = g_try_new(uint64_t, m->nb_clusters);
    if (old_cluster == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    /* copy content of unmodified sectors */
    ret = perform_cow(bs, m);
    if (ret < 0) {
        goto err;
    }

    /* Update L2 table. */
    if (s->use_lazy_refcounts) {
        qcow2_mark_dirty(bs);
    }
    /* The L2 update must not hit the disk before the refcount update did */
    if (qcow2_need_accurate_refcounts(s)) {
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                                   s->refcount_block_cache);
    }

    ret = get_cluster_table(bs, m->offset, &l2_slice, &l2_index);
    if (ret < 0) {
        goto err;
    }
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);

    /* The whole allocation fits in one L2 slice (guaranteed by the caller) */
    assert(l2_index + m->nb_clusters <= s->l2_slice_size);
    for (i = 0; i < m->nb_clusters; i++) {
        uint64_t offset = cluster_offset + ((uint64_t)i << s->cluster_bits);
        /* if two concurrent writes happen to the same unallocated cluster
         * each write allocates separate cluster and writes data concurrently.
         * The first one to complete updates l2 table with pointer to its
         * cluster the second one has to do RMW (which is done above by
         * perform_cow()), update l2 table with its cluster pointer and free
         * old cluster. This is what this loop does */
        if (get_l2_entry(s, l2_slice, l2_index + i) != 0) {
            old_cluster[j++] = get_l2_entry(s, l2_slice, l2_index + i);
        }

        /* The offset must fit in the offset field of the L2 table entry */
        assert((offset & L2E_OFFSET_MASK) == offset);

        set_l2_entry(s, l2_slice, l2_index + i, offset | QCOW_OFLAG_COPIED);

        /* Update bitmap with the subclusters that were just written */
        if (has_subclusters(s) && !m->prealloc) {
            uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
            unsigned written_from = m->cow_start.offset;
            /* GNU ?: — if the end COW region is empty (offset + nb_bytes
             * == 0) the whole allocation was written */
            unsigned written_to = m->cow_end.offset + m->cow_end.nb_bytes ?:
                m->nb_clusters << s->cluster_bits;
            int first_sc, last_sc;
            /* Narrow written_from and written_to down to the current cluster */
            written_from = MAX(written_from, i << s->cluster_bits);
            written_to   = MIN(written_to, (i + 1) << s->cluster_bits);
            assert(written_from < written_to);
            first_sc = offset_to_sc_index(s, written_from);
            last_sc  = offset_to_sc_index(s, written_to - 1);
            /* Written subclusters are now allocated and no longer zero */
            l2_bitmap |= QCOW_OFLAG_SUB_ALLOC_RANGE(first_sc, last_sc + 1);
            l2_bitmap &= ~QCOW_OFLAG_SUB_ZERO_RANGE(first_sc, last_sc + 1);
            set_l2_bitmap(s, l2_slice, l2_index + i, l2_bitmap);
        }
     }


    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    /*
     * If this was a COW, we need to decrease the refcount of the old cluster.
     *
     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
     * clusters), the next write will reuse them anyway.
     */
    if (!m->keep_old_clusters && j != 0) {
        for (i = 0; i < j; i++) {
            qcow2_free_any_clusters(bs, old_cluster[i], 1, QCOW2_DISCARD_NEVER);
        }
    }

    ret = 0;
err:
    g_free(old_cluster);
    return ret;
 }
110845aba42fSKevin Wolf 
11098b24cd14SKevin Wolf /**
11108b24cd14SKevin Wolf  * Frees the allocated clusters because the request failed and they won't
11118b24cd14SKevin Wolf  * actually be linked.
11128b24cd14SKevin Wolf  */
11138b24cd14SKevin Wolf void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
11148b24cd14SKevin Wolf {
11158b24cd14SKevin Wolf     BDRVQcow2State *s = bs->opaque;
11163ede935fSMax Reitz     if (!has_data_file(bs) && !m->keep_old_clusters) {
1117c3b6658cSKevin Wolf         qcow2_free_clusters(bs, m->alloc_offset,
1118c3b6658cSKevin Wolf                             m->nb_clusters << s->cluster_bits,
11198b24cd14SKevin Wolf                             QCOW2_DISCARD_NEVER);
11208b24cd14SKevin Wolf     }
1121c3b6658cSKevin Wolf }
11228b24cd14SKevin Wolf 
112345aba42fSKevin Wolf /*
11248f91d690SAlberto Garcia  * For a given write request, create a new QCowL2Meta structure, add
112557538c86SAlberto Garcia  * it to @m and the BDRVQcow2State.cluster_allocs list. If the write
112657538c86SAlberto Garcia  * request does not need copy-on-write or changes to the L2 metadata
112757538c86SAlberto Garcia  * then this function does nothing.
11288f91d690SAlberto Garcia  *
11298f91d690SAlberto Garcia  * @host_cluster_offset points to the beginning of the first cluster.
11308f91d690SAlberto Garcia  *
11318f91d690SAlberto Garcia  * @guest_offset and @bytes indicate the offset and length of the
11328f91d690SAlberto Garcia  * request.
11338f91d690SAlberto Garcia  *
113457538c86SAlberto Garcia  * @l2_slice contains the L2 entries of all clusters involved in this
113557538c86SAlberto Garcia  * write request.
113657538c86SAlberto Garcia  *
11378f91d690SAlberto Garcia  * If @keep_old is true it means that the clusters were already
11388f91d690SAlberto Garcia  * allocated and will be overwritten. If false then the clusters are
11398f91d690SAlberto Garcia  * new and we have to decrease the reference count of the old ones.
1140d53ec3d8SAlberto Garcia  *
1141d53ec3d8SAlberto Garcia  * Returns 0 on success, -errno on failure.
11428f91d690SAlberto Garcia  */
1143d53ec3d8SAlberto Garcia static int calculate_l2_meta(BlockDriverState *bs, uint64_t host_cluster_offset,
11448f91d690SAlberto Garcia                              uint64_t guest_offset, unsigned bytes,
114557538c86SAlberto Garcia                              uint64_t *l2_slice, QCowL2Meta **m, bool keep_old)
11468f91d690SAlberto Garcia {
11478f91d690SAlberto Garcia     BDRVQcow2State *s = bs->opaque;
1148d53ec3d8SAlberto Garcia     int sc_index, l2_index = offset_to_l2_slice_index(s, guest_offset);
1149d53ec3d8SAlberto Garcia     uint64_t l2_entry, l2_bitmap;
115057538c86SAlberto Garcia     unsigned cow_start_from, cow_end_to;
11518f91d690SAlberto Garcia     unsigned cow_start_to = offset_into_cluster(s, guest_offset);
11528f91d690SAlberto Garcia     unsigned cow_end_from = cow_start_to + bytes;
11538f91d690SAlberto Garcia     unsigned nb_clusters = size_to_clusters(s, cow_end_from);
11548f91d690SAlberto Garcia     QCowL2Meta *old_m = *m;
1155d53ec3d8SAlberto Garcia     QCow2SubclusterType type;
1156d53ec3d8SAlberto Garcia     int i;
1157d53ec3d8SAlberto Garcia     bool skip_cow = keep_old;
115857538c86SAlberto Garcia 
115957538c86SAlberto Garcia     assert(nb_clusters <= s->l2_slice_size - l2_index);
116057538c86SAlberto Garcia 
1161d53ec3d8SAlberto Garcia     /* Check the type of all affected subclusters */
116257538c86SAlberto Garcia     for (i = 0; i < nb_clusters; i++) {
116312c6aebeSAlberto Garcia         l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
1164d53ec3d8SAlberto Garcia         l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
1165d53ec3d8SAlberto Garcia         if (skip_cow) {
1166d53ec3d8SAlberto Garcia             unsigned write_from = MAX(cow_start_to, i << s->cluster_bits);
1167d53ec3d8SAlberto Garcia             unsigned write_to = MIN(cow_end_from, (i + 1) << s->cluster_bits);
1168d53ec3d8SAlberto Garcia             int first_sc = offset_to_sc_index(s, write_from);
1169d53ec3d8SAlberto Garcia             int last_sc = offset_to_sc_index(s, write_to - 1);
1170d53ec3d8SAlberto Garcia             int cnt = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap,
1171d53ec3d8SAlberto Garcia                                                       first_sc, &type);
1172d53ec3d8SAlberto Garcia             /* Is any of the subclusters of type != QCOW2_SUBCLUSTER_NORMAL ? */
1173d53ec3d8SAlberto Garcia             if (type != QCOW2_SUBCLUSTER_NORMAL || first_sc + cnt <= last_sc) {
1174d53ec3d8SAlberto Garcia                 skip_cow = false;
1175d53ec3d8SAlberto Garcia             }
1176d53ec3d8SAlberto Garcia         } else {
1177d53ec3d8SAlberto Garcia             /* If we can't skip the cow we can still look for invalid entries */
1178d53ec3d8SAlberto Garcia             type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, 0);
1179d53ec3d8SAlberto Garcia         }
1180d53ec3d8SAlberto Garcia         if (type == QCOW2_SUBCLUSTER_INVALID) {
1181d53ec3d8SAlberto Garcia             int l1_index = offset_to_l1_index(s, guest_offset);
1182d53ec3d8SAlberto Garcia             uint64_t l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
1183d53ec3d8SAlberto Garcia             qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster "
1184d53ec3d8SAlberto Garcia                                     "entry found (L2 offset: %#" PRIx64
1185d53ec3d8SAlberto Garcia                                     ", L2 index: %#x)",
1186d53ec3d8SAlberto Garcia                                     l2_offset, l2_index + i);
1187d53ec3d8SAlberto Garcia             return -EIO;
118857538c86SAlberto Garcia         }
118957538c86SAlberto Garcia     }
1190d53ec3d8SAlberto Garcia 
1191d53ec3d8SAlberto Garcia     if (skip_cow) {
1192d53ec3d8SAlberto Garcia         return 0;
119357538c86SAlberto Garcia     }
119457538c86SAlberto Garcia 
119557538c86SAlberto Garcia     /* Get the L2 entry of the first cluster */
119612c6aebeSAlberto Garcia     l2_entry = get_l2_entry(s, l2_slice, l2_index);
1197d53ec3d8SAlberto Garcia     l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
1198d53ec3d8SAlberto Garcia     sc_index = offset_to_sc_index(s, guest_offset);
1199d53ec3d8SAlberto Garcia     type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
120057538c86SAlberto Garcia 
1201d53ec3d8SAlberto Garcia     if (!keep_old) {
1202d53ec3d8SAlberto Garcia         switch (type) {
1203d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_COMPRESSED:
1204d53ec3d8SAlberto Garcia             cow_start_from = 0;
1205d53ec3d8SAlberto Garcia             break;
1206d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_NORMAL:
1207d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1208d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1209d53ec3d8SAlberto Garcia             if (has_subclusters(s)) {
1210d53ec3d8SAlberto Garcia                 /* Skip all leading zero and unallocated subclusters */
1211d53ec3d8SAlberto Garcia                 uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC;
1212d53ec3d8SAlberto Garcia                 cow_start_from =
1213d53ec3d8SAlberto Garcia                     MIN(sc_index, ctz32(alloc_bitmap)) << s->subcluster_bits;
121457538c86SAlberto Garcia             } else {
121557538c86SAlberto Garcia                 cow_start_from = 0;
121657538c86SAlberto Garcia             }
1217d53ec3d8SAlberto Garcia             break;
1218d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_PLAIN:
1219d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
1220d53ec3d8SAlberto Garcia             cow_start_from = sc_index << s->subcluster_bits;
1221d53ec3d8SAlberto Garcia             break;
1222d53ec3d8SAlberto Garcia         default:
1223d53ec3d8SAlberto Garcia             g_assert_not_reached();
1224d53ec3d8SAlberto Garcia         }
1225d53ec3d8SAlberto Garcia     } else {
1226d53ec3d8SAlberto Garcia         switch (type) {
1227d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_NORMAL:
1228d53ec3d8SAlberto Garcia             cow_start_from = cow_start_to;
1229d53ec3d8SAlberto Garcia             break;
1230d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1231d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1232d53ec3d8SAlberto Garcia             cow_start_from = sc_index << s->subcluster_bits;
1233d53ec3d8SAlberto Garcia             break;
1234d53ec3d8SAlberto Garcia         default:
1235d53ec3d8SAlberto Garcia             g_assert_not_reached();
1236d53ec3d8SAlberto Garcia         }
1237d53ec3d8SAlberto Garcia     }
123857538c86SAlberto Garcia 
123957538c86SAlberto Garcia     /* Get the L2 entry of the last cluster */
1240d53ec3d8SAlberto Garcia     l2_index += nb_clusters - 1;
1241d53ec3d8SAlberto Garcia     l2_entry = get_l2_entry(s, l2_slice, l2_index);
1242d53ec3d8SAlberto Garcia     l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
1243d53ec3d8SAlberto Garcia     sc_index = offset_to_sc_index(s, guest_offset + bytes - 1);
1244d53ec3d8SAlberto Garcia     type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
124557538c86SAlberto Garcia 
1246d53ec3d8SAlberto Garcia     if (!keep_old) {
1247d53ec3d8SAlberto Garcia         switch (type) {
1248d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_COMPRESSED:
124957538c86SAlberto Garcia             cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
1250d53ec3d8SAlberto Garcia             break;
1251d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_NORMAL:
1252d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1253d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1254d53ec3d8SAlberto Garcia             cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
1255d53ec3d8SAlberto Garcia             if (has_subclusters(s)) {
1256d53ec3d8SAlberto Garcia                 /* Skip all trailing zero and unallocated subclusters */
1257d53ec3d8SAlberto Garcia                 uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC;
1258d53ec3d8SAlberto Garcia                 cow_end_to -=
1259d53ec3d8SAlberto Garcia                     MIN(s->subclusters_per_cluster - sc_index - 1,
1260d53ec3d8SAlberto Garcia                         clz32(alloc_bitmap)) << s->subcluster_bits;
1261d53ec3d8SAlberto Garcia             }
1262d53ec3d8SAlberto Garcia             break;
1263d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_PLAIN:
1264d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
1265d53ec3d8SAlberto Garcia             cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
1266d53ec3d8SAlberto Garcia             break;
1267d53ec3d8SAlberto Garcia         default:
1268d53ec3d8SAlberto Garcia             g_assert_not_reached();
1269d53ec3d8SAlberto Garcia         }
1270d53ec3d8SAlberto Garcia     } else {
1271d53ec3d8SAlberto Garcia         switch (type) {
1272d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_NORMAL:
1273d53ec3d8SAlberto Garcia             cow_end_to = cow_end_from;
1274d53ec3d8SAlberto Garcia             break;
1275d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1276d53ec3d8SAlberto Garcia         case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1277d53ec3d8SAlberto Garcia             cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
1278d53ec3d8SAlberto Garcia             break;
1279d53ec3d8SAlberto Garcia         default:
1280d53ec3d8SAlberto Garcia             g_assert_not_reached();
1281d53ec3d8SAlberto Garcia         }
128257538c86SAlberto Garcia     }
12838f91d690SAlberto Garcia 
12848f91d690SAlberto Garcia     *m = g_malloc0(sizeof(**m));
12858f91d690SAlberto Garcia     **m = (QCowL2Meta) {
12868f91d690SAlberto Garcia         .next           = old_m,
12878f91d690SAlberto Garcia 
12888f91d690SAlberto Garcia         .alloc_offset   = host_cluster_offset,
12898f91d690SAlberto Garcia         .offset         = start_of_cluster(s, guest_offset),
12908f91d690SAlberto Garcia         .nb_clusters    = nb_clusters,
12918f91d690SAlberto Garcia 
12928f91d690SAlberto Garcia         .keep_old_clusters = keep_old,
12938f91d690SAlberto Garcia 
12948f91d690SAlberto Garcia         .cow_start = {
12958f91d690SAlberto Garcia             .offset     = cow_start_from,
12968f91d690SAlberto Garcia             .nb_bytes   = cow_start_to - cow_start_from,
12978f91d690SAlberto Garcia         },
12988f91d690SAlberto Garcia         .cow_end = {
12998f91d690SAlberto Garcia             .offset     = cow_end_from,
13008f91d690SAlberto Garcia             .nb_bytes   = cow_end_to - cow_end_from,
13018f91d690SAlberto Garcia         },
13028f91d690SAlberto Garcia     };
13038f91d690SAlberto Garcia 
13048f91d690SAlberto Garcia     qemu_co_queue_init(&(*m)->dependent_requests);
13058f91d690SAlberto Garcia     QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
1306d53ec3d8SAlberto Garcia 
1307d53ec3d8SAlberto Garcia     return 0;
13088f91d690SAlberto Garcia }
13098f91d690SAlberto Garcia 
131057538c86SAlberto Garcia /*
131157538c86SAlberto Garcia  * Returns true if writing to the cluster pointed to by @l2_entry
131257538c86SAlberto Garcia  * requires a new allocation (that is, if the cluster is unallocated
131357538c86SAlberto Garcia  * or has refcount > 1 and therefore cannot be written in-place).
131457538c86SAlberto Garcia  */
131557538c86SAlberto Garcia static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry)
1316c1587d87SAlberto Garcia {
1317c1587d87SAlberto Garcia     switch (qcow2_get_cluster_type(bs, l2_entry)) {
1318c1587d87SAlberto Garcia     case QCOW2_CLUSTER_NORMAL:
131957538c86SAlberto Garcia     case QCOW2_CLUSTER_ZERO_ALLOC:
1320c1587d87SAlberto Garcia         if (l2_entry & QCOW_OFLAG_COPIED) {
1321c1587d87SAlberto Garcia             return false;
1322c1587d87SAlberto Garcia         }
1323*b9be6faeSThomas Huth         /* fallthrough */
1324c1587d87SAlberto Garcia     case QCOW2_CLUSTER_UNALLOCATED:
1325c1587d87SAlberto Garcia     case QCOW2_CLUSTER_COMPRESSED:
1326c1587d87SAlberto Garcia     case QCOW2_CLUSTER_ZERO_PLAIN:
1327c1587d87SAlberto Garcia         return true;
1328c1587d87SAlberto Garcia     default:
1329c1587d87SAlberto Garcia         abort();
1330c1587d87SAlberto Garcia     }
1331c1587d87SAlberto Garcia }
1332c1587d87SAlberto Garcia 
13338f91d690SAlberto Garcia /*
133457538c86SAlberto Garcia  * Returns the number of contiguous clusters that can be written to
133557538c86SAlberto Garcia  * using one single write request, starting from @l2_index.
133657538c86SAlberto Garcia  * At most @nb_clusters are checked.
133757538c86SAlberto Garcia  *
133857538c86SAlberto Garcia  * If @new_alloc is true this counts clusters that are either
133957538c86SAlberto Garcia  * unallocated, or allocated but with refcount > 1 (so they need to be
134057538c86SAlberto Garcia  * newly allocated and COWed).
134157538c86SAlberto Garcia  *
134257538c86SAlberto Garcia  * If @new_alloc is false this counts clusters that are already
134357538c86SAlberto Garcia  * allocated and can be overwritten in-place (this includes clusters
134457538c86SAlberto Garcia  * of type QCOW2_CLUSTER_ZERO_ALLOC).
1345bf319eceSKevin Wolf  */
134657538c86SAlberto Garcia static int count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
134757538c86SAlberto Garcia                                        uint64_t *l2_slice, int l2_index,
134857538c86SAlberto Garcia                                        bool new_alloc)
1349bf319eceSKevin Wolf {
135057538c86SAlberto Garcia     BDRVQcow2State *s = bs->opaque;
135112c6aebeSAlberto Garcia     uint64_t l2_entry = get_l2_entry(s, l2_slice, l2_index);
135257538c86SAlberto Garcia     uint64_t expected_offset = l2_entry & L2E_OFFSET_MASK;
1353143550a8SKevin Wolf     int i;
1354bf319eceSKevin Wolf 
1355143550a8SKevin Wolf     for (i = 0; i < nb_clusters; i++) {
135612c6aebeSAlberto Garcia         l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
135757538c86SAlberto Garcia         if (cluster_needs_new_alloc(bs, l2_entry) != new_alloc) {
1358bf319eceSKevin Wolf             break;
1359143550a8SKevin Wolf         }
136057538c86SAlberto Garcia         if (!new_alloc) {
136157538c86SAlberto Garcia             if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
136257538c86SAlberto Garcia                 break;
136357538c86SAlberto Garcia             }
136457538c86SAlberto Garcia             expected_offset += s->cluster_size;
136557538c86SAlberto Garcia         }
1366bf319eceSKevin Wolf     }
1367bf319eceSKevin Wolf 
1368bf319eceSKevin Wolf     assert(i <= nb_clusters);
1369bf319eceSKevin Wolf     return i;
1370bf319eceSKevin Wolf }
1371bf319eceSKevin Wolf 
/*
 * Check if there already is an AIO write request in flight which allocates
 * the same cluster. In this case we need to wait until the previous
 * request has completed and updated the L2 table accordingly.
 *
 * Returns:
 *   0       if there was no dependency. *cur_bytes indicates the number of
 *           bytes from guest_offset that can be read before the next
 *           dependency must be processed (or the request is complete)
 *
 *   -EAGAIN if we had to wait for another request, previously gathered
 *           information on cluster allocation may be invalid now. The caller
 *           must start over anyway, so consider *cur_bytes undefined.
 */
static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *cur_bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    QCowL2Meta *old_alloc;
    uint64_t bytes = *cur_bytes;

    /* Scan all in-flight allocations for an overlap with [start, end) */
    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {

        uint64_t start = guest_offset;
        uint64_t end = start + bytes;
        /* The conflicting range is the old allocation's COW area, widened
         * to cluster boundaries */
        uint64_t old_start = start_of_cluster(s, l2meta_cow_start(old_alloc));
        uint64_t old_end = ROUND_UP(l2meta_cow_end(old_alloc), s->cluster_size);

        if (end <= old_start || start >= old_end) {
            /* No intersection */
        } else {
            if (start < old_start) {
                /* Stop at the start of a running allocation */
                bytes = old_start - start;
            } else {
                /* Request begins inside the conflicting area: nothing can
                 * proceed until the old allocation finishes */
                bytes = 0;
            }

            /* Stop if already an l2meta exists. After yielding, it wouldn't
             * be valid any more, so we'd have to clean up the old L2Metas
             * and deal with requests depending on them before starting to
             * gather new ones. Not worth the trouble. */
            if (bytes == 0 && *m) {
                *cur_bytes = 0;
                return 0;
            }

            if (bytes == 0) {
                /* Wait for the dependency to complete. We need to recheck
                 * the free/allocated clusters when we continue. */
                qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
                return -EAGAIN;
            }
        }
    }

    /* Make sure that existing clusters and new allocations are only used up to
     * the next dependency if we shortened the request above */
    *cur_bytes = bytes;

    return 0;
}
1434226c3c26SKevin Wolf 
1435226c3c26SKevin Wolf /*
143657538c86SAlberto Garcia  * Checks how many already allocated clusters that don't require a new
143757538c86SAlberto Garcia  * allocation there are at the given guest_offset (up to *bytes).
143857538c86SAlberto Garcia  * If *host_offset is not INV_OFFSET, only physically contiguous clusters
143957538c86SAlberto Garcia  * beginning at this host offset are counted.
14400af729ecSKevin Wolf  *
1441411d62b0SKevin Wolf  * Note that guest_offset may not be cluster aligned. In this case, the
1442411d62b0SKevin Wolf  * returned *host_offset points to exact byte referenced by guest_offset and
1443411d62b0SKevin Wolf  * therefore isn't cluster aligned as well.
14440af729ecSKevin Wolf  *
14450af729ecSKevin Wolf  * Returns:
14460af729ecSKevin Wolf  *   0:     if no allocated clusters are available at the given offset.
14470af729ecSKevin Wolf  *          *bytes is normally unchanged. It is set to 0 if the cluster
144857538c86SAlberto Garcia  *          is allocated and can be overwritten in-place but doesn't have
144957538c86SAlberto Garcia  *          the right physical offset.
14500af729ecSKevin Wolf  *
145157538c86SAlberto Garcia  *   1:     if allocated clusters that can be overwritten in place are
145257538c86SAlberto Garcia  *          available at the requested offset. *bytes may have decreased
145357538c86SAlberto Garcia  *          and describes the length of the area that can be written to.
14540af729ecSKevin Wolf  *
14550af729ecSKevin Wolf  *  -errno: in error cases
14560af729ecSKevin Wolf  */
14570af729ecSKevin Wolf static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
1458c53ede9fSKevin Wolf     uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
14590af729ecSKevin Wolf {
1460ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
14610af729ecSKevin Wolf     int l2_index;
146257538c86SAlberto Garcia     uint64_t l2_entry, cluster_offset;
1463cde91766SAlberto Garcia     uint64_t *l2_slice;
1464b6d36defSMax Reitz     uint64_t nb_clusters;
1465c53ede9fSKevin Wolf     unsigned int keep_clusters;
1466a3f1afb4SAlberto Garcia     int ret;
14670af729ecSKevin Wolf 
14680af729ecSKevin Wolf     trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
14690af729ecSKevin Wolf                               *bytes);
14700af729ecSKevin Wolf 
1471c6d619ccSKevin Wolf     assert(*host_offset == INV_OFFSET || offset_into_cluster(s, guest_offset)
1472411d62b0SKevin Wolf                                       == offset_into_cluster(s, *host_offset));
1473411d62b0SKevin Wolf 
1474acb0467fSKevin Wolf     /*
1475cde91766SAlberto Garcia      * Calculate the number of clusters to look for. We stop at L2 slice
1476acb0467fSKevin Wolf      * boundaries to keep things simple.
1477acb0467fSKevin Wolf      */
1478acb0467fSKevin Wolf     nb_clusters =
1479acb0467fSKevin Wolf         size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);
1480acb0467fSKevin Wolf 
1481cde91766SAlberto Garcia     l2_index = offset_to_l2_slice_index(s, guest_offset);
1482cde91766SAlberto Garcia     nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
148357538c86SAlberto Garcia     /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */
148457538c86SAlberto Garcia     nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
1485acb0467fSKevin Wolf 
14860af729ecSKevin Wolf     /* Find L2 entry for the first involved cluster */
1487cde91766SAlberto Garcia     ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
14880af729ecSKevin Wolf     if (ret < 0) {
14890af729ecSKevin Wolf         return ret;
14900af729ecSKevin Wolf     }
14910af729ecSKevin Wolf 
149212c6aebeSAlberto Garcia     l2_entry = get_l2_entry(s, l2_slice, l2_index);
149357538c86SAlberto Garcia     cluster_offset = l2_entry & L2E_OFFSET_MASK;
14940af729ecSKevin Wolf 
149557538c86SAlberto Garcia     if (!cluster_needs_new_alloc(bs, l2_entry)) {
149657538c86SAlberto Garcia         if (offset_into_cluster(s, cluster_offset)) {
149757538c86SAlberto Garcia             qcow2_signal_corruption(bs, true, -1, -1, "%s cluster offset "
149857538c86SAlberto Garcia                                     "%#" PRIx64 " unaligned (guest offset: %#"
149957538c86SAlberto Garcia                                     PRIx64 ")", l2_entry & QCOW_OFLAG_ZERO ?
150057538c86SAlberto Garcia                                     "Preallocated zero" : "Data",
150157538c86SAlberto Garcia                                     cluster_offset, guest_offset);
1502a97c67eeSMax Reitz             ret = -EIO;
1503a97c67eeSMax Reitz             goto out;
1504a97c67eeSMax Reitz         }
1505a97c67eeSMax Reitz 
150657538c86SAlberto Garcia         /* If a specific host_offset is required, check it */
150757538c86SAlberto Garcia         if (*host_offset != INV_OFFSET && cluster_offset != *host_offset) {
1508e62daaf6SKevin Wolf             *bytes = 0;
1509e62daaf6SKevin Wolf             ret = 0;
1510e62daaf6SKevin Wolf             goto out;
1511e62daaf6SKevin Wolf         }
1512e62daaf6SKevin Wolf 
15130af729ecSKevin Wolf         /* We keep all QCOW_OFLAG_COPIED clusters */
151457538c86SAlberto Garcia         keep_clusters = count_single_write_clusters(bs, nb_clusters, l2_slice,
151557538c86SAlberto Garcia                                                     l2_index, false);
1516c53ede9fSKevin Wolf         assert(keep_clusters <= nb_clusters);
1517c53ede9fSKevin Wolf 
1518c53ede9fSKevin Wolf         *bytes = MIN(*bytes,
1519c53ede9fSKevin Wolf                  keep_clusters * s->cluster_size
1520c53ede9fSKevin Wolf                  - offset_into_cluster(s, guest_offset));
152157538c86SAlberto Garcia         assert(*bytes != 0);
152257538c86SAlberto Garcia 
1523d53ec3d8SAlberto Garcia         ret = calculate_l2_meta(bs, cluster_offset, guest_offset,
152457538c86SAlberto Garcia                                 *bytes, l2_slice, m, true);
1525d53ec3d8SAlberto Garcia         if (ret < 0) {
1526d53ec3d8SAlberto Garcia             goto out;
1527d53ec3d8SAlberto Garcia         }
15280af729ecSKevin Wolf 
15290af729ecSKevin Wolf         ret = 1;
15300af729ecSKevin Wolf     } else {
15310af729ecSKevin Wolf         ret = 0;
15320af729ecSKevin Wolf     }
15330af729ecSKevin Wolf 
15340af729ecSKevin Wolf     /* Cleanup */
1535e62daaf6SKevin Wolf out:
1536cde91766SAlberto Garcia     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
15370af729ecSKevin Wolf 
1538e62daaf6SKevin Wolf     /* Only return a host offset if we actually made progress. Otherwise we
1539e62daaf6SKevin Wolf      * would make requirements for handle_alloc() that it can't fulfill */
1540a97c67eeSMax Reitz     if (ret > 0) {
154157538c86SAlberto Garcia         *host_offset = cluster_offset + offset_into_cluster(s, guest_offset);
1542e62daaf6SKevin Wolf     }
1543e62daaf6SKevin Wolf 
15440af729ecSKevin Wolf     return ret;
15450af729ecSKevin Wolf }
15460af729ecSKevin Wolf 
15470af729ecSKevin Wolf /*
1548226c3c26SKevin Wolf  * Allocates new clusters for the given guest_offset.
1549226c3c26SKevin Wolf  *
1550226c3c26SKevin Wolf  * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
1551226c3c26SKevin Wolf  * contain the number of clusters that have been allocated and are contiguous
1552226c3c26SKevin Wolf  * in the image file.
1553226c3c26SKevin Wolf  *
1554c6d619ccSKevin Wolf  * If *host_offset is not INV_OFFSET, it specifies the offset in the image file
1555c6d619ccSKevin Wolf  * at which the new clusters must start. *nb_clusters can be 0 on return in
1556c6d619ccSKevin Wolf  * this case if the cluster at host_offset is already in use. If *host_offset
1557c6d619ccSKevin Wolf  * is INV_OFFSET, the clusters can be allocated anywhere in the image file.
1558226c3c26SKevin Wolf  *
1559226c3c26SKevin Wolf  * *host_offset is updated to contain the offset into the image file at which
1560226c3c26SKevin Wolf  * the first allocated cluster starts.
1561226c3c26SKevin Wolf  *
1562226c3c26SKevin Wolf  * Return 0 on success and -errno in error cases. -EAGAIN means that the
1563226c3c26SKevin Wolf  * function has been waiting for another request and the allocation must be
1564226c3c26SKevin Wolf  * restarted, but the whole request should not be failed.
1565226c3c26SKevin Wolf  */
1566226c3c26SKevin Wolf static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
1567b6d36defSMax Reitz                                    uint64_t *host_offset, uint64_t *nb_clusters)
1568226c3c26SKevin Wolf {
1569ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
1570226c3c26SKevin Wolf 
1571226c3c26SKevin Wolf     trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
1572226c3c26SKevin Wolf                                          *host_offset, *nb_clusters);
1573226c3c26SKevin Wolf 
1574966b000fSKevin Wolf     if (has_data_file(bs)) {
1575966b000fSKevin Wolf         assert(*host_offset == INV_OFFSET ||
1576966b000fSKevin Wolf                *host_offset == start_of_cluster(s, guest_offset));
1577966b000fSKevin Wolf         *host_offset = start_of_cluster(s, guest_offset);
1578966b000fSKevin Wolf         return 0;
1579966b000fSKevin Wolf     }
1580966b000fSKevin Wolf 
1581250196f1SKevin Wolf     /* Allocate new clusters */
1582250196f1SKevin Wolf     trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
1583c6d619ccSKevin Wolf     if (*host_offset == INV_OFFSET) {
1584df021791SKevin Wolf         int64_t cluster_offset =
1585df021791SKevin Wolf             qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
1586250196f1SKevin Wolf         if (cluster_offset < 0) {
1587250196f1SKevin Wolf             return cluster_offset;
1588250196f1SKevin Wolf         }
1589250196f1SKevin Wolf         *host_offset = cluster_offset;
1590250196f1SKevin Wolf         return 0;
1591df021791SKevin Wolf     } else {
1592b6d36defSMax Reitz         int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
1593df021791SKevin Wolf         if (ret < 0) {
1594df021791SKevin Wolf             return ret;
1595df021791SKevin Wolf         }
1596df021791SKevin Wolf         *nb_clusters = ret;
1597df021791SKevin Wolf         return 0;
1598df021791SKevin Wolf     }
1599250196f1SKevin Wolf }
1600250196f1SKevin Wolf 
1601250196f1SKevin Wolf /*
160257538c86SAlberto Garcia  * Allocates new clusters for an area that is either still unallocated or
160357538c86SAlberto Garcia  * cannot be overwritten in-place. If *host_offset is not INV_OFFSET,
160457538c86SAlberto Garcia  * clusters are only allocated if the new allocation can match the specified
160557538c86SAlberto Garcia  * host offset.
160610f0ed8bSKevin Wolf  *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
161010f0ed8bSKevin Wolf  *
161110f0ed8bSKevin Wolf  * Returns:
161210f0ed8bSKevin Wolf  *   0:     if no clusters could be allocated. *bytes is set to 0,
161310f0ed8bSKevin Wolf  *          *host_offset is left unchanged.
161410f0ed8bSKevin Wolf  *
161510f0ed8bSKevin Wolf  *   1:     if new clusters were allocated. *bytes may be decreased if the
161610f0ed8bSKevin Wolf  *          new allocation doesn't cover all of the requested area.
161710f0ed8bSKevin Wolf  *          *host_offset is updated to contain the host offset of the first
161810f0ed8bSKevin Wolf  *          newly allocated cluster.
161910f0ed8bSKevin Wolf  *
162010f0ed8bSKevin Wolf  *  -errno: in error cases
162110f0ed8bSKevin Wolf  */
static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
    uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t *l2_slice;
    uint64_t nb_clusters;
    int ret;

    uint64_t alloc_cluster_offset;

    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
                             *bytes);
    assert(*bytes > 0);

    /*
     * Calculate the number of clusters to look for. We stop at L2 slice
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_slice_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
    /* Limit total allocation byte count to BDRV_REQUEST_MAX_BYTES */
    nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);

    /* Find L2 entry for the first involved cluster */
    /* Note: takes a reference on the L2 slice from the cache; it must be
     * released via qcow2_cache_put() on every exit path (see 'out' below). */
    ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Shorten the run to clusters that can all be written with one single
     * new allocation (unallocated or to-be-COWed clusters). */
    nb_clusters = count_single_write_clusters(bs, nb_clusters,
                                              l2_slice, l2_index, true);

    /* This function is only called when there were no non-COW clusters, so if
     * we can't find any unallocated or COW clusters either, something is
     * wrong with our code. */
    assert(nb_clusters > 0);

    /* Allocate at a given offset in the image file */
    /* Only constrain the allocation if the caller passed a host offset;
     * INV_OFFSET lets do_alloc_cluster_offset() pick any location. */
    alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET :
        start_of_cluster(s, *host_offset);
    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
                                  &nb_clusters);
    if (ret < 0) {
        goto out;
    }

    /* Can't extend contiguous allocation */
    if (nb_clusters == 0) {
        *bytes = 0;
        ret = 0;
        goto out;
    }

    assert(alloc_cluster_offset != INV_OFFSET);

    /*
     * Save info needed for meta data update.
     *
     * requested_bytes: Number of bytes from the start of the first
     * newly allocated cluster to the end of the (possibly shortened
     * before) write request.
     *
     * avail_bytes: Number of bytes from the start of the first
     * newly allocated to the end of the last newly allocated cluster.
     *
     * nb_bytes: The number of bytes from the start of the first
     * newly allocated cluster to the end of the area that the write
     * request actually writes to (excluding COW at the end)
     */
    uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
    int avail_bytes = nb_clusters << s->cluster_bits;
    int nb_bytes = MIN(requested_bytes, avail_bytes);

    /* Translate back to the (possibly unaligned) guest view: *host_offset
     * points at the exact byte corresponding to guest_offset, and *bytes is
     * trimmed to what this allocation actually covers. */
    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
    *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
    assert(*bytes != 0);

    /* Queue the L2Meta describing this allocation (COW regions, in-flight
     * list) for the eventual L2 table update. */
    ret = calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes,
                            l2_slice, m, false);
    if (ret < 0) {
        goto out;
    }

    ret = 1;

out:
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    /* On error, drop a partially set up L2Meta from the in-flight list so
     * other requests don't wait on an allocation that will never finish. */
    if (ret < 0 && *m && (*m)->nb_clusters > 0) {
        QLIST_REMOVE(*m, next_in_flight);
    }
    return ret;
}
171810f0ed8bSKevin Wolf 
171910f0ed8bSKevin Wolf /*
172045aba42fSKevin Wolf  * alloc_cluster_offset
172145aba42fSKevin Wolf  *
1722250196f1SKevin Wolf  * For a given offset on the virtual disk, find the cluster offset in qcow2
1723250196f1SKevin Wolf  * file. If the offset is not found, allocate a new cluster.
172445aba42fSKevin Wolf  *
1725250196f1SKevin Wolf  * If the cluster was already allocated, m->nb_clusters is set to 0 and
1726a7912369SFrediano Ziglio  * other fields in m are meaningless.
172745aba42fSKevin Wolf  *
1728148da7eaSKevin Wolf  * If the cluster is newly allocated, m->nb_clusters is set to the number of
172968d100e9SKevin Wolf  * contiguous clusters that have been allocated. In this case, the other
173068d100e9SKevin Wolf  * fields of m are valid and contain information about the first allocated
173168d100e9SKevin Wolf  * cluster.
1732148da7eaSKevin Wolf  *
173368d100e9SKevin Wolf  * If the request conflicts with another write request in flight, the coroutine
173468d100e9SKevin Wolf  * is queued and will be reentered when the dependency has completed.
1735148da7eaSKevin Wolf  *
1736148da7eaSKevin Wolf  * Return 0 on success and -errno in error cases
173745aba42fSKevin Wolf  */
int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
                               unsigned int *bytes, uint64_t *host_offset,
                               QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, remaining;
    uint64_t cluster_offset;
    uint64_t cur_bytes;
    int ret;

    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);

/* Restart point: we come back here (with all state reset) after having
 * yielded on a conflicting in-flight allocation. */
again:
    start = offset;
    remaining = *bytes;
    cluster_offset = INV_OFFSET;
    *host_offset = INV_OFFSET;
    cur_bytes = 0;
    *m = NULL;

    while (true) {

        /* The first iteration that found a mapping determines the host
         * offset returned to the caller. */
        if (*host_offset == INV_OFFSET && cluster_offset != INV_OFFSET) {
            *host_offset = start_of_cluster(s, cluster_offset);
        }

        assert(remaining >= cur_bytes);

        /* Account for the bytes gathered by the previous iteration */
        start           += cur_bytes;
        remaining       -= cur_bytes;

        if (cluster_offset != INV_OFFSET) {
            cluster_offset += cur_bytes;
        }

        if (remaining == 0) {
            break;
        }

        cur_bytes = remaining;

        /*
         * Now start gathering as many contiguous clusters as possible:
         *
         * 1. Check for overlaps with in-flight allocations
         *
         *      a) Overlap not in the first cluster -> shorten this request and
         *         let the caller handle the rest in its next loop iteration.
         *
         *      b) Real overlaps of two requests. Yield and restart the search
         *         for contiguous clusters (the situation could have changed
         *         while we were sleeping)
         *
         *      c) TODO: Request starts in the same cluster as the in-flight
         *         allocation ends. Shorten the COW of the in-flight allocation,
         *         set cluster_offset to write to the same cluster and set up
         *         the right synchronisation between the in-flight request and
         *         the new one.
         */
        ret = handle_dependencies(bs, start, &cur_bytes, m);
        if (ret == -EAGAIN) {
            /* Currently handle_dependencies() doesn't yield if we already had
             * an allocation. If it did, we would have to clean up the L2Meta
             * structs before starting over. */
            assert(*m == NULL);
            goto again;
        } else if (ret < 0) {
            return ret;
        } else if (cur_bytes == 0) {
            break;
        } else {
            /* handle_dependencies() may have decreased cur_bytes (shortened
             * the allocations below) so that the next dependency is processed
             * correctly during the next loop iteration. */
        }

        /*
         * 2. Count contiguous COPIED clusters.
         */
        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else if (cur_bytes == 0) {
            break;
        }

        /*
         * 3. If the request still hasn't completed, allocate new clusters,
         *    considering any cluster_offset of steps 1c or 2.
         */
        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else {
            assert(cur_bytes == 0);
            break;
        }
    }

    /* The request may have been shortened (e.g. at an in-flight allocation
     * or when no more contiguous clusters were available), but some progress
     * must always have been made. */
    *bytes -= remaining;
    assert(*bytes > 0);
    assert(*host_offset != INV_OFFSET);

    return 0;
}
184745aba42fSKevin Wolf 
18485ea929e3SKevin Wolf /*
18495ea929e3SKevin Wolf  * This discards as many clusters of nb_clusters as possible at once (i.e.
185021ab3addSAlberto Garcia  * all clusters in the same L2 slice) and returns the number of discarded
18515ea929e3SKevin Wolf  * clusters.
18525ea929e3SKevin Wolf  */
static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
                               uint64_t nb_clusters,
                               enum qcow2_discard_type type, bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_slice;
    int l2_index;
    int ret;
    int i;

    /* Takes a reference on the L2 slice; released before returning */
    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 slice */
    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
        uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
        uint64_t new_l2_entry = old_l2_entry;
        uint64_t new_l2_bitmap = old_l2_bitmap;
        QCow2ClusterType cluster_type =
            qcow2_get_cluster_type(bs, old_l2_entry);

        /*
         * If full_discard is true, the cluster should not read back as zeroes,
         * but rather fall through to the backing file.
         *
         * If full_discard is false, make sure that a discarded area reads back
         * as zeroes for v3 images (we cannot do it for v2 without actually
         * writing a zero-filled buffer). We can skip the operation if the
         * cluster is already marked as zero, or if it's unallocated and we
         * don't have a backing file.
         *
         * TODO We might want to use bdrv_block_status(bs) here, but we're
         * holding s->lock, so that doesn't work today.
         */
        if (full_discard) {
            new_l2_entry = new_l2_bitmap = 0;
        } else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) {
            if (has_subclusters(s)) {
                /* Keep the entry unallocated but mark every subcluster as
                 * reading back zeroes via the bitmap */
                new_l2_entry = 0;
                new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
            } else {
                /* Without subclusters, only v3 images have a zero flag */
                new_l2_entry = s->qcow_version >= 3 ? QCOW_OFLAG_ZERO : 0;
            }
        }

        /* Nothing to change for this cluster: skip the metadata update */
        if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
            continue;
        }

        /* First remove L2 entries */
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
        set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
        if (has_subclusters(s)) {
            set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
        }
        /* Then decrease the refcount */
        qcow2_free_any_clusters(bs, old_l2_entry, 1, type);
    }

    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    return nb_clusters;
}
19225ea929e3SKevin Wolf 
1923d2cb36afSEric Blake int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
1924d2cb36afSEric Blake                           uint64_t bytes, enum qcow2_discard_type type,
1925d2cb36afSEric Blake                           bool full_discard)
19265ea929e3SKevin Wolf {
1927ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
1928d2cb36afSEric Blake     uint64_t end_offset = offset + bytes;
1929b6d36defSMax Reitz     uint64_t nb_clusters;
1930d2cb36afSEric Blake     int64_t cleared;
19315ea929e3SKevin Wolf     int ret;
19325ea929e3SKevin Wolf 
1933f10ee139SEric Blake     /* Caller must pass aligned values, except at image end */
19340c1bd469SEric Blake     assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
1935f10ee139SEric Blake     assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
1936f10ee139SEric Blake            end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
19375ea929e3SKevin Wolf 
1938d2cb36afSEric Blake     nb_clusters = size_to_clusters(s, bytes);
19395ea929e3SKevin Wolf 
19400b919faeSKevin Wolf     s->cache_discards = true;
19410b919faeSKevin Wolf 
194221ab3addSAlberto Garcia     /* Each L2 slice is handled by its own loop iteration */
19435ea929e3SKevin Wolf     while (nb_clusters > 0) {
194421ab3addSAlberto Garcia         cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
1945d2cb36afSEric Blake                                       full_discard);
1946d2cb36afSEric Blake         if (cleared < 0) {
1947d2cb36afSEric Blake             ret = cleared;
19480b919faeSKevin Wolf             goto fail;
19495ea929e3SKevin Wolf         }
19505ea929e3SKevin Wolf 
1951d2cb36afSEric Blake         nb_clusters -= cleared;
1952d2cb36afSEric Blake         offset += (cleared * s->cluster_size);
19535ea929e3SKevin Wolf     }
19545ea929e3SKevin Wolf 
19550b919faeSKevin Wolf     ret = 0;
19560b919faeSKevin Wolf fail:
19570b919faeSKevin Wolf     s->cache_discards = false;
19580b919faeSKevin Wolf     qcow2_process_discards(bs, ret);
19590b919faeSKevin Wolf 
19600b919faeSKevin Wolf     return ret;
19615ea929e3SKevin Wolf }
1962621f0589SKevin Wolf 
1963621f0589SKevin Wolf /*
1964621f0589SKevin Wolf  * This zeroes as many clusters of nb_clusters as possible at once (i.e.
1965a9a9f8f0SAlberto Garcia  * all clusters in the same L2 slice) and returns the number of zeroed
1966621f0589SKevin Wolf  * clusters.
1967621f0589SKevin Wolf  */
static int zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
                            uint64_t nb_clusters, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_slice;
    int l2_index;
    int ret;
    int i;

    /* Takes a reference on the L2 slice; released before returning */
    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 slice */
    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
        uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
        QCow2ClusterType type = qcow2_get_cluster_type(bs, old_l2_entry);
        /* Compressed clusters cannot be zeroed in place, so their data is
         * always released; for other allocated clusters only if the caller
         * allows unmapping */
        bool unmap = (type == QCOW2_CLUSTER_COMPRESSED) ||
            ((flags & BDRV_REQ_MAY_UNMAP) && qcow2_cluster_is_allocated(type));
        uint64_t new_l2_entry = unmap ? 0 : old_l2_entry;
        uint64_t new_l2_bitmap = old_l2_bitmap;

        /* Mark the cluster as reading back zeroes: via the subcluster bitmap
         * if the image has one, otherwise via the legacy zero flag */
        if (has_subclusters(s)) {
            new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
        } else {
            new_l2_entry |= QCOW_OFLAG_ZERO;
        }

        /* Nothing to change for this cluster: skip the metadata update */
        if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
            continue;
        }

        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
        /* Free the old mapping (if requested) before writing the new entry */
        if (unmap) {
            qcow2_free_any_clusters(bs, old_l2_entry, 1, QCOW2_DISCARD_REQUEST);
        }
        set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
        if (has_subclusters(s)) {
            set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
        }
    }

    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    return nb_clusters;
}
2019621f0589SKevin Wolf 
2020a6841a2dSAlberto Garcia static int zero_l2_subclusters(BlockDriverState *bs, uint64_t offset,
2021a6841a2dSAlberto Garcia                                unsigned nb_subclusters)
2022a6841a2dSAlberto Garcia {
2023a6841a2dSAlberto Garcia     BDRVQcow2State *s = bs->opaque;
2024a6841a2dSAlberto Garcia     uint64_t *l2_slice;
2025a6841a2dSAlberto Garcia     uint64_t old_l2_bitmap, l2_bitmap;
2026a6841a2dSAlberto Garcia     int l2_index, ret, sc = offset_to_sc_index(s, offset);
2027a6841a2dSAlberto Garcia 
2028a6841a2dSAlberto Garcia     /* For full clusters use zero_in_l2_slice() instead */
2029a6841a2dSAlberto Garcia     assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster);
2030a6841a2dSAlberto Garcia     assert(sc + nb_subclusters <= s->subclusters_per_cluster);
2031a6841a2dSAlberto Garcia     assert(offset_into_subcluster(s, offset) == 0);
2032a6841a2dSAlberto Garcia 
2033a6841a2dSAlberto Garcia     ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
2034a6841a2dSAlberto Garcia     if (ret < 0) {
2035a6841a2dSAlberto Garcia         return ret;
2036a6841a2dSAlberto Garcia     }
2037a6841a2dSAlberto Garcia 
2038a6841a2dSAlberto Garcia     switch (qcow2_get_cluster_type(bs, get_l2_entry(s, l2_slice, l2_index))) {
2039a6841a2dSAlberto Garcia     case QCOW2_CLUSTER_COMPRESSED:
2040a6841a2dSAlberto Garcia         ret = -ENOTSUP; /* We cannot partially zeroize compressed clusters */
2041a6841a2dSAlberto Garcia         goto out;
2042a6841a2dSAlberto Garcia     case QCOW2_CLUSTER_NORMAL:
2043a6841a2dSAlberto Garcia     case QCOW2_CLUSTER_UNALLOCATED:
2044a6841a2dSAlberto Garcia         break;
2045a6841a2dSAlberto Garcia     default:
2046a6841a2dSAlberto Garcia         g_assert_not_reached();
2047a6841a2dSAlberto Garcia     }
2048a6841a2dSAlberto Garcia 
2049a6841a2dSAlberto Garcia     old_l2_bitmap = l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
2050a6841a2dSAlberto Garcia 
2051a6841a2dSAlberto Garcia     l2_bitmap |=  QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
2052a6841a2dSAlberto Garcia     l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);
2053a6841a2dSAlberto Garcia 
2054a6841a2dSAlberto Garcia     if (old_l2_bitmap != l2_bitmap) {
2055a6841a2dSAlberto Garcia         set_l2_bitmap(s, l2_slice, l2_index, l2_bitmap);
2056a6841a2dSAlberto Garcia         qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
2057a6841a2dSAlberto Garcia     }
2058a6841a2dSAlberto Garcia 
2059a6841a2dSAlberto Garcia     ret = 0;
2060a6841a2dSAlberto Garcia out:
2061a6841a2dSAlberto Garcia     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
2062a6841a2dSAlberto Garcia 
2063a6841a2dSAlberto Garcia     return ret;
2064a6841a2dSAlberto Garcia }
2065a6841a2dSAlberto Garcia 
/*
 * Marks the given (subcluster-aligned, except at image end) range as reading
 * back zeroes. Whole clusters are handled per L2 slice by zero_in_l2_slice();
 * a partial cluster at the head and/or tail of the range is handled at
 * subcluster granularity by zero_l2_subclusters().
 *
 * Returns 0 on success, -ENOTSUP if zeroing is not possible for this image
 * (v2 with a backing file), or another -errno on failure.
 */
int qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
                             uint64_t bytes, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t end_offset = offset + bytes;
    uint64_t nb_clusters;
    unsigned head, tail;
    int64_t cleared;
    int ret;

    /* If we have to stay in sync with an external data file, zero out
     * s->data_file first. */
    if (data_file_is_raw(bs)) {
        assert(has_data_file(bs));
        ret = bdrv_co_pwrite_zeroes(s->data_file, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
    }

    /* Caller must pass aligned values, except at image end */
    assert(offset_into_subcluster(s, offset) == 0);
    assert(offset_into_subcluster(s, end_offset) == 0 ||
           end_offset >= bs->total_sectors << BDRV_SECTOR_BITS);

    /*
     * The zero flag is only supported by version 3 and newer. However, if we
     * have no backing file, we can resort to discard in version 2.
     */
    if (s->qcow_version < 3) {
        if (!bs->backing) {
            return qcow2_cluster_discard(bs, offset, bytes,
                                         QCOW2_DISCARD_REQUEST, false);
        }
        return -ENOTSUP;
    }

    /* head: bytes before the first cluster boundary (0 if offset is
     * cluster-aligned or the whole request fits in one cluster's head) */
    head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
    offset += head;

    /* tail: bytes after the last cluster boundary; no tail is needed when
     * the range reaches the end of the image */
    tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
        end_offset - MAX(offset, start_of_cluster(s, end_offset));
    end_offset -= tail;

    /* Batch the refcount updates; flushed by qcow2_process_discards() */
    s->cache_discards = true;

    if (head) {
        ret = zero_l2_subclusters(bs, offset - head,
                                  size_to_subclusters(s, head));
        if (ret < 0) {
            goto fail;
        }
    }

    /* Each L2 slice is handled by its own loop iteration */
    nb_clusters = size_to_clusters(s, end_offset - offset);

    while (nb_clusters > 0) {
        cleared = zero_in_l2_slice(bs, offset, nb_clusters, flags);
        if (cleared < 0) {
            ret = cleared;
            goto fail;
        }

        nb_clusters -= cleared;
        offset += (cleared * s->cluster_size);
    }

    if (tail) {
        ret = zero_l2_subclusters(bs, end_offset, size_to_subclusters(s, tail));
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}
214832b6444dSMax Reitz 
/*
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
 * l1_entries and *visited_l1_entries are used to keep track of progress for
 * status_cb(). l1_entries contains the total number of L1 entries and
 * *visited_l1_entries counts all visited L1 entries.
 *
 * If l1_table is the image's active L1 table, L2 tables are accessed through
 * the L2 table cache; otherwise (an inactive/snapshot L1 table) they are read
 * from and written back to disk directly, bypassing the cache.
 *
 * Returns 0 on success, -errno on failure.  On failure the image may have
 * been partially modified (clusters expanded so far stay expanded).
 */
static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                                      int l1_size, int64_t *visited_l1_entries,
                                      int64_t l1_entries,
                                      BlockDriverAmendStatusCB *status_cb,
                                      void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
    uint64_t *l2_slice = NULL;
    unsigned slice, slice_size2, n_slices;
    int ret;
    int i, j;

    /* qcow2_downgrade() is not allowed in images with subclusters */
    assert(!has_subclusters(s));

    /* An L2 table spans one cluster and is processed one slice at a time */
    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
        l2_slice = qemu_try_blockalign(bs->file->bs, slice_size2);
        if (l2_slice == NULL) {
            return -ENOMEM;
        }
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        uint64_t l2_refcount;

        if (!l2_offset) {
            /* unallocated */
            (*visited_l1_entries)++;
            if (status_cb) {
                status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
            }
            continue;
        }

        /* L2 tables must be cluster-aligned; anything else is corruption */
        if (offset_into_cluster(s, l2_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                    PRIx64 " unaligned (L1 index: %#x)",
                                    l2_offset, i);
            ret = -EIO;
            goto fail;
        }

        /* Needed below to decide whether the L2 table is shared */
        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &l2_refcount);
        if (ret < 0) {
            goto fail;
        }

        for (slice = 0; slice < n_slices; slice++) {
            uint64_t slice_offset = l2_offset + slice * slice_size2;
            bool l2_dirty = false;
            if (is_active_l1) {
                /* get active L2 tables from cache */
                ret = qcow2_cache_get(bs, s->l2_table_cache, slice_offset,
                                      (void **)&l2_slice);
            } else {
                /* load inactive L2 tables from disk */
                ret = bdrv_pread(bs->file, slice_offset, l2_slice, slice_size2);
            }
            if (ret < 0) {
                goto fail;
            }

            for (j = 0; j < s->l2_slice_size; j++) {
                uint64_t l2_entry = get_l2_entry(s, l2_slice, j);
                int64_t offset = l2_entry & L2E_OFFSET_MASK;
                QCow2ClusterType cluster_type =
                    qcow2_get_cluster_type(bs, l2_entry);

                /* Only zero clusters need any work */
                if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
                    cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
                    continue;
                }

                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                    if (!bs->backing) {
                        /*
                         * not backed; therefore we can simply deallocate the
                         * cluster. No need to call set_l2_bitmap(), this
                         * function doesn't support images with subclusters.
                         */
                        set_l2_entry(s, l2_slice, j, 0);
                        l2_dirty = true;
                        continue;
                    }

                    /* Plain zero cluster with a backing file: allocate a
                     * data cluster that will be filled with zeroes below */
                    offset = qcow2_alloc_clusters(bs, s->cluster_size);
                    if (offset < 0) {
                        ret = offset;
                        goto fail;
                    }

                    /* The offset must fit in the offset field */
                    assert((offset & L2E_OFFSET_MASK) == offset);

                    if (l2_refcount > 1) {
                        /* For shared L2 tables, set the refcount accordingly
                         * (it is already 1 and needs to be l2_refcount) */
                        ret = qcow2_update_cluster_refcount(
                            bs, offset >> s->cluster_bits,
                            refcount_diff(1, l2_refcount), false,
                            QCOW2_DISCARD_OTHER);
                        if (ret < 0) {
                            qcow2_free_clusters(bs, offset, s->cluster_size,
                                                QCOW2_DISCARD_OTHER);
                            goto fail;
                        }
                    }
                }

                if (offset_into_cluster(s, offset)) {
                    int l2_index = slice * s->l2_slice_size + j;
                    qcow2_signal_corruption(
                        bs, true, -1, -1,
                        "Cluster allocation offset "
                        "%#" PRIx64 " unaligned (L2 offset: %#"
                        PRIx64 ", L2 index: %#x)", offset,
                        l2_offset, l2_index);
                    /* Only free a cluster we allocated ourselves above */
                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_ALWAYS);
                    }
                    ret = -EIO;
                    goto fail;
                }

                ret = qcow2_pre_write_overlap_check(bs, 0, offset,
                                                    s->cluster_size, true);
                if (ret < 0) {
                    /* Only free a cluster we allocated ourselves above */
                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_ALWAYS);
                    }
                    goto fail;
                }

                /* Materialize the zero contents in the data cluster */
                ret = bdrv_pwrite_zeroes(s->data_file, offset,
                                         s->cluster_size, 0);
                if (ret < 0) {
                    /* Only free a cluster we allocated ourselves above */
                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_ALWAYS);
                    }
                    goto fail;
                }

                /* Only set QCOW_OFLAG_COPIED when the L2 table is not
                 * shared (refcount == 1) */
                if (l2_refcount == 1) {
                    set_l2_entry(s, l2_slice, j, offset | QCOW_OFLAG_COPIED);
                } else {
                    set_l2_entry(s, l2_slice, j, offset);
                }
                /*
                 * No need to call set_l2_bitmap() after set_l2_entry() because
                 * this function doesn't support images with subclusters.
                 */
                l2_dirty = true;
            }

            if (is_active_l1) {
                if (l2_dirty) {
                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
                    qcow2_cache_depends_on_flush(s->l2_table_cache);
                }
                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            } else {
                if (l2_dirty) {
                    /* Inactive L2 tables are written back to disk directly */
                    ret = qcow2_pre_write_overlap_check(
                        bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2,
                        slice_offset, slice_size2, false);
                    if (ret < 0) {
                        goto fail;
                    }

                    ret = bdrv_pwrite(bs->file, slice_offset,
                                      l2_slice, slice_size2);
                    if (ret < 0) {
                        goto fail;
                    }
                }
            }
        }

        (*visited_l1_entries)++;
        if (status_cb) {
            status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
        }
    }

    ret = 0;

fail:
    if (l2_slice) {
        if (!is_active_l1) {
            qemu_vfree(l2_slice);
        } else {
            /* For active L1 the slice is still held by the cache here */
            qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        }
    }
    return ret;
}
236432b6444dSMax Reitz 
236532b6444dSMax Reitz /*
236632b6444dSMax Reitz  * For backed images, expands all zero clusters on the image. For non-backed
236732b6444dSMax Reitz  * images, deallocates all non-pre-allocated zero clusters (and claims the
236832b6444dSMax Reitz  * allocation for pre-allocated ones). This is important for downgrading to a
236932b6444dSMax Reitz  * qcow2 version which doesn't yet support metadata zero clusters.
237032b6444dSMax Reitz  */
23714057a2b2SMax Reitz int qcow2_expand_zero_clusters(BlockDriverState *bs,
23728b13976dSMax Reitz                                BlockDriverAmendStatusCB *status_cb,
23738b13976dSMax Reitz                                void *cb_opaque)
237432b6444dSMax Reitz {
2375ff99129aSKevin Wolf     BDRVQcow2State *s = bs->opaque;
237632b6444dSMax Reitz     uint64_t *l1_table = NULL;
23774057a2b2SMax Reitz     int64_t l1_entries = 0, visited_l1_entries = 0;
237832b6444dSMax Reitz     int ret;
237932b6444dSMax Reitz     int i, j;
238032b6444dSMax Reitz 
23814057a2b2SMax Reitz     if (status_cb) {
23824057a2b2SMax Reitz         l1_entries = s->l1_size;
23834057a2b2SMax Reitz         for (i = 0; i < s->nb_snapshots; i++) {
23844057a2b2SMax Reitz             l1_entries += s->snapshots[i].l1_size;
23854057a2b2SMax Reitz         }
23864057a2b2SMax Reitz     }
23874057a2b2SMax Reitz 
238832b6444dSMax Reitz     ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
23894057a2b2SMax Reitz                                      &visited_l1_entries, l1_entries,
23908b13976dSMax Reitz                                      status_cb, cb_opaque);
239132b6444dSMax Reitz     if (ret < 0) {
239232b6444dSMax Reitz         goto fail;
239332b6444dSMax Reitz     }
239432b6444dSMax Reitz 
239532b6444dSMax Reitz     /* Inactive L1 tables may point to active L2 tables - therefore it is
239632b6444dSMax Reitz      * necessary to flush the L2 table cache before trying to access the L2
239732b6444dSMax Reitz      * tables pointed to by inactive L1 entries (else we might try to expand
239832b6444dSMax Reitz      * zero clusters that have already been expanded); furthermore, it is also
239932b6444dSMax Reitz      * necessary to empty the L2 table cache, since it may contain tables which
240032b6444dSMax Reitz      * are now going to be modified directly on disk, bypassing the cache.
240132b6444dSMax Reitz      * qcow2_cache_empty() does both for us. */
240232b6444dSMax Reitz     ret = qcow2_cache_empty(bs, s->l2_table_cache);
240332b6444dSMax Reitz     if (ret < 0) {
240432b6444dSMax Reitz         goto fail;
240532b6444dSMax Reitz     }
240632b6444dSMax Reitz 
240732b6444dSMax Reitz     for (i = 0; i < s->nb_snapshots; i++) {
2408c9a442e4SAlberto Garcia         int l1_size2;
2409c9a442e4SAlberto Garcia         uint64_t *new_l1_table;
2410c9a442e4SAlberto Garcia         Error *local_err = NULL;
241132b6444dSMax Reitz 
2412c9a442e4SAlberto Garcia         ret = qcow2_validate_table(bs, s->snapshots[i].l1_table_offset,
2413c9a442e4SAlberto Garcia                                    s->snapshots[i].l1_size, sizeof(uint64_t),
2414c9a442e4SAlberto Garcia                                    QCOW_MAX_L1_SIZE, "Snapshot L1 table",
2415c9a442e4SAlberto Garcia                                    &local_err);
2416c9a442e4SAlberto Garcia         if (ret < 0) {
2417c9a442e4SAlberto Garcia             error_report_err(local_err);
2418c9a442e4SAlberto Garcia             goto fail;
2419c9a442e4SAlberto Garcia         }
2420c9a442e4SAlberto Garcia 
2421c9a442e4SAlberto Garcia         l1_size2 = s->snapshots[i].l1_size * sizeof(uint64_t);
2422c9a442e4SAlberto Garcia         new_l1_table = g_try_realloc(l1_table, l1_size2);
2423de7269d2SAlberto Garcia 
2424de7269d2SAlberto Garcia         if (!new_l1_table) {
2425de7269d2SAlberto Garcia             ret = -ENOMEM;
2426de7269d2SAlberto Garcia             goto fail;
2427de7269d2SAlberto Garcia         }
2428de7269d2SAlberto Garcia 
2429de7269d2SAlberto Garcia         l1_table = new_l1_table;
243032b6444dSMax Reitz 
2431c9a442e4SAlberto Garcia         ret = bdrv_pread(bs->file, s->snapshots[i].l1_table_offset,
2432c9a442e4SAlberto Garcia                          l1_table, l1_size2);
243332b6444dSMax Reitz         if (ret < 0) {
243432b6444dSMax Reitz             goto fail;
243532b6444dSMax Reitz         }
243632b6444dSMax Reitz 
243732b6444dSMax Reitz         for (j = 0; j < s->snapshots[i].l1_size; j++) {
243832b6444dSMax Reitz             be64_to_cpus(&l1_table[j]);
243932b6444dSMax Reitz         }
244032b6444dSMax Reitz 
244132b6444dSMax Reitz         ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
24424057a2b2SMax Reitz                                          &visited_l1_entries, l1_entries,
24438b13976dSMax Reitz                                          status_cb, cb_opaque);
244432b6444dSMax Reitz         if (ret < 0) {
244532b6444dSMax Reitz             goto fail;
244632b6444dSMax Reitz         }
244732b6444dSMax Reitz     }
244832b6444dSMax Reitz 
244932b6444dSMax Reitz     ret = 0;
245032b6444dSMax Reitz 
245132b6444dSMax Reitz fail:
245232b6444dSMax Reitz     g_free(l1_table);
245332b6444dSMax Reitz     return ret;
245432b6444dSMax Reitz }
2455